From baf80dcc1687e1821360ce81354e61820cf54e75 Mon Sep 17 00:00:00 2001 From: bartzbeielstein <32470350+bartzbeielstein@users.noreply.github.com> Date: Sat, 13 Jun 2026 00:33:37 +0200 Subject: [PATCH 1/2] style: fix pre-existing black/isort drift in 6 files Flagged by black 25 / isort 8; no functional change. Co-Authored-By: Claude Fable 5 --- src/spotforecast2_safe/downloader/__init__.py | 5 +---- src/spotforecast2_safe/downloader/entsoe.py | 4 +++- .../preprocessing/coverage.py | 4 +--- .../configurator/test_config_entsoe_params.py | 4 +--- tests/test_config_multi.py | 19 +++++++++++++------ tests/test_downloader_entsoe_collect_mode.py | 6 +++--- 6 files changed, 22 insertions(+), 20 deletions(-) diff --git a/src/spotforecast2_safe/downloader/__init__.py b/src/spotforecast2_safe/downloader/__init__.py index 1255bc9e..55ae7f8f 100644 --- a/src/spotforecast2_safe/downloader/__init__.py +++ b/src/spotforecast2_safe/downloader/__init__.py @@ -1,10 +1,7 @@ # SPDX-FileCopyrightText: 2026 bartzbeielstein # SPDX-License-Identifier: AGPL-3.0-or-later -from .entsoe import ( - download_new_data, - merge_build_manual, -) +from .entsoe import download_new_data, merge_build_manual __all__ = [ "download_new_data", diff --git a/src/spotforecast2_safe/downloader/entsoe.py b/src/spotforecast2_safe/downloader/entsoe.py index 6b5630db..03637b92 100644 --- a/src/spotforecast2_safe/downloader/entsoe.py +++ b/src/spotforecast2_safe/downloader/entsoe.py @@ -1052,7 +1052,9 @@ def query(client, cc, s, e, _col=column, _fc=forecast_col): # type: ignore[no-u error=None, interim_path=interim_path, ) - except Exception as exc: # noqa: BLE001 — limited to download-layer failures after argument validation + except ( + Exception + ) as exc: # noqa: BLE001 — limited to download-layer failures after argument validation logger.warning( "Zone %s (%s) download failed in collect mode: %s", column, diff --git a/src/spotforecast2_safe/preprocessing/coverage.py 
b/src/spotforecast2_safe/preprocessing/coverage.py index f325f87b..e09a0ce0 100644 --- a/src/spotforecast2_safe/preprocessing/coverage.py +++ b/src/spotforecast2_safe/preprocessing/coverage.py @@ -299,9 +299,7 @@ def last_complete_hour( else: modes = clean.index.to_series().diff().mode() if modes.empty: - raise ValueError( - "actual has only one non-NaN row; cannot infer cadence." - ) + raise ValueError("actual has only one non-NaN row; cannot infer cadence.") cadence = modes.iloc[0] sph = int(pd.Timedelta(hours=1) / cadence) diff --git a/tests/configurator/test_config_entsoe_params.py b/tests/configurator/test_config_entsoe_params.py index 1c999f98..cce67f78 100644 --- a/tests/configurator/test_config_entsoe_params.py +++ b/tests/configurator/test_config_entsoe_params.py @@ -28,9 +28,7 @@ def test_config_entsoe_get_params(): def test_config_entsoe_warm_start_lags(): """warm_start_lags defaults to the package seed list and round-trips.""" - from spotforecast2_safe.configurator.config_multi import ( - DEFAULT_WARM_START_LAGS, - ) + from spotforecast2_safe.configurator.config_multi import DEFAULT_WARM_START_LAGS default = ConfigEntsoe() assert default.warm_start_lags == DEFAULT_WARM_START_LAGS diff --git a/tests/test_config_multi.py b/tests/test_config_multi.py index 3199e1de..a2205128 100644 --- a/tests/test_config_multi.py +++ b/tests/test_config_multi.py @@ -124,13 +124,22 @@ def test_custom_lags_consider(self): assert cfg.lags_consider == lags def test_warm_start_lags_default_and_override(self): - from spotforecast2_safe.configurator.config_multi import ( - DEFAULT_WARM_START_LAGS, - ) + from spotforecast2_safe.configurator.config_multi import DEFAULT_WARM_START_LAGS assert ConfigMulti().warm_start_lags == DEFAULT_WARM_START_LAGS assert DEFAULT_WARM_START_LAGS == [ - 1, 2, 3, 23, 24, 25, 47, 48, 167, 168, 169, 336, + 1, + 2, + 3, + 23, + 24, + 25, + 47, + 48, + 167, + 168, + 169, + 336, ] cfg = ConfigMulti(warm_start_lags=[1, 24, 168]) assert cfg.warm_start_lags 
== [1, 24, 168] @@ -1052,5 +1061,3 @@ def test_config_multi_imputation_window_size_in_get_params(): """imputation_window_size is exposed via get_params().""" params = ConfigMulti(imputation_window_size=6).get_params() assert params["imputation_window_size"] == 6 - - diff --git a/tests/test_downloader_entsoe_collect_mode.py b/tests/test_downloader_entsoe_collect_mode.py index 0ecf7aed..9a7b4857 100644 --- a/tests/test_downloader_entsoe_collect_mode.py +++ b/tests/test_downloader_entsoe_collect_mode.py @@ -336,9 +336,9 @@ def test_build_zone_qc_frame_explicit_data_home_no_env_var(tmp_path, monkeypatch interim = tmp_path / "interim" interim.mkdir(parents=True) for col, actual, forecast in [("load_a", 100.0, 105.0), ("load_b", 50.0, 52.0)]: - pd.DataFrame( - {col: actual, f"{col}_forecast": forecast}, index=idx - ).rename_axis("Time (UTC)").to_csv(interim / f"zone_{col}.csv") + pd.DataFrame({col: actual, f"{col}_forecast": forecast}, index=idx).rename_axis( + "Time (UTC)" + ).to_csv(interim / f"zone_{col}.csv") qc = build_zone_qc_frame(zones=zones, data_home=tmp_path) From 39cfa8489432146f4d5b51c29c79921c704dd873 Mon Sep 17 00:00:00 2001 From: bartzbeielstein <32470350+bartzbeielstein@users.noreply.github.com> Date: Sat, 13 Jun 2026 00:33:37 +0200 Subject: [PATCH 2/2] feat(data): add ENTSO-E interim-CSV loaders (ported from spotforecast2.tasks) entsoe_data_loader and entsoe_test_data_loader move verbatim from spotforecast2.tasks.task_entsoe ahead of that subpackage's removal; they depend only on pandas and get_data_home, so the safe package is their natural home. Exported from spotforecast2_safe.data alongside load_actual_combined; documented under the Data quartodoc section; loader behavior covered by tests/test_entsoe_loader.py. 
Co-Authored-By: Claude Fable 5 --- .../execute-results/html.json | 12 ++ _quarto.yml | 3 + docs/reference/data.entsoe_loader.qmd | 136 +++++++++++++++++ docs/reference/index.qmd | 1 + src/spotforecast2_safe/data/__init__.py | 6 + src/spotforecast2_safe/data/entsoe_loader.py | 138 ++++++++++++++++++ tests/test_entsoe_loader.py | 103 +++++++++++++ 7 files changed, 399 insertions(+) create mode 100644 _freeze/docs/reference/data.entsoe_loader/execute-results/html.json create mode 100644 docs/reference/data.entsoe_loader.qmd create mode 100644 src/spotforecast2_safe/data/entsoe_loader.py create mode 100644 tests/test_entsoe_loader.py diff --git a/_freeze/docs/reference/data.entsoe_loader/execute-results/html.json b/_freeze/docs/reference/data.entsoe_loader/execute-results/html.json new file mode 100644 index 00000000..944936ae --- /dev/null +++ b/_freeze/docs/reference/data.entsoe_loader/execute-results/html.json @@ -0,0 +1,12 @@ +{ + "hash": "a34fd4bbba62bdccf2b6da5436257635", + "result": { + "engine": "jupyter", + "markdown": "---\ntitle: data.entsoe_loader\n---\n\n\n\n`data.entsoe_loader`\n\nENTSO-E interim-CSV data loaders.\n\nConfig-driven loaders for the merged ENTSO-E interim CSV, suitable for the\n``data_loader`` / ``test_data_loader`` hooks on `ConfigEntsoe`. Ported from\n``spotforecast2.tasks.task_entsoe`` ahead of that subpackage's removal.\n\n## Functions\n\n| Name | Description |\n| --- | --- |\n| [entsoe_data_loader](#spotforecast2_safe.data.entsoe_loader.entsoe_data_loader) | Read the merged interim ENTSO-E CSV that ``config.data_filename`` points at. |\n| [entsoe_test_data_loader](#spotforecast2_safe.data.entsoe_loader.entsoe_test_data_loader) | Return the merged ENTSO-E CSV sliced to the forecast horizon. 
|\n\n### entsoe_data_loader { #spotforecast2_safe.data.entsoe_loader.entsoe_data_loader }\n\n```python\ndata.entsoe_loader.entsoe_data_loader(config)\n```\n\nRead the merged interim ENTSO-E CSV that ``config.data_filename`` points at.\n\n#### Parameters {.doc-section .doc-section-parameters}\n\n| Name | Type | Description | Default |\n|--------|----------------------------------------------------------------|---------------------------------------------------------------------------------------------------------------------------------------|------------|\n| config | [ConfigEntsoe](`spotforecast2_safe.configurator.ConfigEntsoe`) | A `ConfigEntsoe` with ``data_filename`` set. Relative paths are resolved against `spotforecast2_safe.data.fetch_data.get_data_home`. | _required_ |\n\n#### Returns {.doc-section .doc-section-returns}\n\n| Name | Type | Description |\n|--------|------------------------------------------------|--------------------------------------------------------------------|\n| | [pd](`pandas`).[DataFrame](`pandas.DataFrame`) | DataFrame indexed by the ENTSO-E timestamp column (``Time (UTC)``) |\n| | [pd](`pandas`).[DataFrame](`pandas.DataFrame`) | with the load columns as data columns. |\n\n#### Raises {.doc-section .doc-section-raises}\n\n| Name | Type | Description |\n|--------|------------------------------------------|-----------------------------------------------------------------------------------------------|\n| | [FileNotFoundError](`FileNotFoundError`) | If the merged CSV does not exist. Run ``spotforecast2-entsoe download`` and ``merge`` first. 
|\n\n#### Examples {.doc-section .doc-section-examples}\n\n\n::: {#3872eaae .cell execution_count=1}\n``` {.python .cell-code}\nimport os\nimport tempfile\n\nimport pandas as pd\nfrom spotforecast2_safe.configurator import ConfigEntsoe\nfrom spotforecast2_safe.data.entsoe_loader import entsoe_data_loader\n\n# Build a tiny synthetic interim CSV in a temp directory.\ntmp = tempfile.mkdtemp()\ncsv_path = os.path.join(tmp, \"energy_load.csv\")\nidx = pd.date_range(\n \"2025-01-01\", periods=48, freq=\"h\", tz=\"UTC\", name=\"Time (UTC)\"\n)\npd.DataFrame({\"Actual Load\": range(48)}, index=idx).to_csv(csv_path)\n\n# Absolute path bypasses get_data_home; loader returns the full frame.\nconfig = ConfigEntsoe()\nconfig.data_filename = csv_path\ndf = entsoe_data_loader(config)\n\nprint(df.shape)\nassert df.shape == (48, 1)\nassert df.index.name == \"Time (UTC)\"\n```\n\n::: {.cell-output .cell-output-stdout}\n```\n(48, 1)\n```\n:::\n:::\n\n\n### entsoe_test_data_loader { #spotforecast2_safe.data.entsoe_loader.entsoe_test_data_loader }\n\n```python\ndata.entsoe_loader.entsoe_test_data_loader(config)\n```\n\nReturn the merged ENTSO-E CSV sliced to the forecast horizon.\n\nThe slice spans ``(end_train, end_train + predict_size * 1 h]`` so that\n``build_prediction_package``'s ``test_actual = ts.reindex(future_pred.index)``\nmatches the hourly forecast row-for-row. ``end_train`` is taken from\n``config.end_train_default`` (treated as the *inclusive* last training\ntimestamp, the same convention the forecaster uses), and the step is\nassumed to be 1 h after the pipeline's hourly resampling.\n\nFor the live ENTSO-E exemplar with ``end_train_default = D-2 23:00 UTC``\nand ``predict_size = 24``, this returns the rows for\n``[D-1 00:00, D 00:00)`` — i.e., ``y_{-1}``. 
For backtests at an arbitrary\n``end_train_default``, it returns the post-cutoff window the model is\nactually predicting, rather than always \"yesterday in wall-clock UTC\".\n\n#### Parameters {.doc-section .doc-section-parameters}\n\n| Name | Type | Description | Default |\n|--------|----------------------------------------------------------------|-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|------------|\n| config | [ConfigEntsoe](`spotforecast2_safe.configurator.ConfigEntsoe`) | A `ConfigEntsoe` with ``data_filename``, ``end_train_default``, and ``predict_size`` set; the merged interim CSV must already contain data covering the forecast horizon (run ``spotforecast2-entsoe download`` first). | _required_ |\n\n#### Returns {.doc-section .doc-section-returns}\n\n| Name | Type | Description |\n|--------|------------------------------------------------|------------------------------------------------------------------------|\n| | [pd](`pandas`).[DataFrame](`pandas.DataFrame`) | DataFrame indexed by ``Time (UTC)`` with the rows the forecast will be |\n| | [pd](`pandas`).[DataFrame](`pandas.DataFrame`) | scored against. 
|\n\n#### Examples {.doc-section .doc-section-examples}\n\n::: {#f5e3a200 .cell execution_count=2}\n``` {.python .cell-code}\nimport os\nimport tempfile\n\nimport pandas as pd\nfrom spotforecast2_safe.configurator import ConfigEntsoe\nfrom spotforecast2_safe.data.entsoe_loader import entsoe_test_data_loader\n\n# Synthetic interim CSV spanning the forecast window.\ntmp = tempfile.mkdtemp()\ncsv_path = os.path.join(tmp, \"energy_load.csv\")\nidx = pd.date_range(\n \"2025-12-29 00:00\", periods=120, freq=\"h\", tz=\"UTC\", name=\"Time (UTC)\"\n)\npd.DataFrame({\"Actual Load\": range(120)}, index=idx).to_csv(csv_path)\n\nconfig = ConfigEntsoe()\nconfig.data_filename = csv_path\nconfig.end_train_default = \"2025-12-31 00:00+00:00\"\nconfig.predict_size = 24\n\ntest_df = entsoe_test_data_loader(config)\n\n# The slice covers exactly predict_size hourly steps after end_train.\nprint(test_df.shape)\nassert test_df.shape == (24, 1)\nassert test_df.index[0] == pd.Timestamp(\"2025-12-31 01:00\", tz=\"UTC\")\n```\n\n::: {.cell-output .cell-output-stdout}\n```\n(24, 1)\n```\n:::\n:::\n\n\n", + "supporting": [ + "data.entsoe_loader_files" + ], + "filters": [], + "includes": {} + } +} \ No newline at end of file diff --git a/_quarto.yml b/_quarto.yml index e7f44f32..534c5f4c 100644 --- a/_quarto.yml +++ b/_quarto.yml @@ -126,6 +126,8 @@ website: file: docs/reference/data.fetch_data.load_timeseries_forecast.qmd - text: "demo_loader" file: docs/reference/data.demo_loader.qmd + - text: "entsoe_loader" + file: docs/reference/data.entsoe_loader.qmd - section: "Preprocessing" contents: @@ -587,6 +589,7 @@ quartodoc: - data.fetch_data.load_day_ahead_price - data.data_classes - data.demo_loader + - data.entsoe_loader # ── Preprocessing ───────────────────────────────────────────────────────── - title: "Preprocessing" diff --git a/docs/reference/data.entsoe_loader.qmd b/docs/reference/data.entsoe_loader.qmd new file mode 100644 index 00000000..6a8ddb78 --- /dev/null +++ 
b/docs/reference/data.entsoe_loader.qmd @@ -0,0 +1,136 @@ +# data.entsoe_loader { #spotforecast2_safe.data.entsoe_loader } + +`data.entsoe_loader` + +ENTSO-E interim-CSV data loaders. + +Config-driven loaders for the merged ENTSO-E interim CSV, suitable for the +``data_loader`` / ``test_data_loader`` hooks on `ConfigEntsoe`. Ported from +``spotforecast2.tasks.task_entsoe`` ahead of that subpackage's removal. + +## Functions + +| Name | Description | +| --- | --- | +| [entsoe_data_loader](#spotforecast2_safe.data.entsoe_loader.entsoe_data_loader) | Read the merged interim ENTSO-E CSV that ``config.data_filename`` points at. | +| [entsoe_test_data_loader](#spotforecast2_safe.data.entsoe_loader.entsoe_test_data_loader) | Return the merged ENTSO-E CSV sliced to the forecast horizon. | + +### entsoe_data_loader { #spotforecast2_safe.data.entsoe_loader.entsoe_data_loader } + +```python +data.entsoe_loader.entsoe_data_loader(config) +``` + +Read the merged interim ENTSO-E CSV that ``config.data_filename`` points at. + +#### Parameters {.doc-section .doc-section-parameters} + +| Name | Type | Description | Default | +|--------|----------------------------------------------------------------|---------------------------------------------------------------------------------------------------------------------------------------|------------| +| config | [ConfigEntsoe](`spotforecast2_safe.configurator.ConfigEntsoe`) | A `ConfigEntsoe` with ``data_filename`` set. Relative paths are resolved against `spotforecast2_safe.data.fetch_data.get_data_home`. 
| _required_ | + +#### Returns {.doc-section .doc-section-returns} + +| Name | Type | Description | +|--------|------------------------------------------------|--------------------------------------------------------------------| +| | [pd](`pandas`).[DataFrame](`pandas.DataFrame`) | DataFrame indexed by the ENTSO-E timestamp column (``Time (UTC)``) | +| | [pd](`pandas`).[DataFrame](`pandas.DataFrame`) | with the load columns as data columns. | + +#### Raises {.doc-section .doc-section-raises} + +| Name | Type | Description | +|--------|------------------------------------------|-----------------------------------------------------------------------------------------------| +| | [FileNotFoundError](`FileNotFoundError`) | If the merged CSV does not exist. Run ``spotforecast2-entsoe download`` and ``merge`` first. | + +#### Examples {.doc-section .doc-section-examples} + +```{python} +import os +import tempfile + +import pandas as pd +from spotforecast2_safe.configurator import ConfigEntsoe +from spotforecast2_safe.data.entsoe_loader import entsoe_data_loader + +# Build a tiny synthetic interim CSV in a temp directory. +tmp = tempfile.mkdtemp() +csv_path = os.path.join(tmp, "energy_load.csv") +idx = pd.date_range( + "2025-01-01", periods=48, freq="h", tz="UTC", name="Time (UTC)" +) +pd.DataFrame({"Actual Load": range(48)}, index=idx).to_csv(csv_path) + +# Absolute path bypasses get_data_home; loader returns the full frame. +config = ConfigEntsoe() +config.data_filename = csv_path +df = entsoe_data_loader(config) + +print(df.shape) +assert df.shape == (48, 1) +assert df.index.name == "Time (UTC)" +``` + +### entsoe_test_data_loader { #spotforecast2_safe.data.entsoe_loader.entsoe_test_data_loader } + +```python +data.entsoe_loader.entsoe_test_data_loader(config) +``` + +Return the merged ENTSO-E CSV sliced to the forecast horizon. 
+ +The slice spans ``(end_train, end_train + predict_size * 1 h]`` so that +``build_prediction_package``'s ``test_actual = ts.reindex(future_pred.index)`` +matches the hourly forecast row-for-row. ``end_train`` is taken from +``config.end_train_default`` (treated as the *inclusive* last training +timestamp, the same convention the forecaster uses), and the step is +assumed to be 1 h after the pipeline's hourly resampling. + +For the live ENTSO-E exemplar with ``end_train_default = D-2 23:00 UTC`` +and ``predict_size = 24``, this returns the rows for +``[D-1 00:00, D 00:00)`` — i.e., ``y_{-1}``. For backtests at an arbitrary +``end_train_default``, it returns the post-cutoff window the model is +actually predicting, rather than always "yesterday in wall-clock UTC". + +#### Parameters {.doc-section .doc-section-parameters} + +| Name | Type | Description | Default | +|--------|----------------------------------------------------------------|-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|------------| +| config | [ConfigEntsoe](`spotforecast2_safe.configurator.ConfigEntsoe`) | A `ConfigEntsoe` with ``data_filename``, ``end_train_default``, and ``predict_size`` set; the merged interim CSV must already contain data covering the forecast horizon (run ``spotforecast2-entsoe download`` first). | _required_ | + +#### Returns {.doc-section .doc-section-returns} + +| Name | Type | Description | +|--------|------------------------------------------------|------------------------------------------------------------------------| +| | [pd](`pandas`).[DataFrame](`pandas.DataFrame`) | DataFrame indexed by ``Time (UTC)`` with the rows the forecast will be | +| | [pd](`pandas`).[DataFrame](`pandas.DataFrame`) | scored against. 
| + +#### Examples {.doc-section .doc-section-examples} + +```{python} +import os +import tempfile + +import pandas as pd +from spotforecast2_safe.configurator import ConfigEntsoe +from spotforecast2_safe.data.entsoe_loader import entsoe_test_data_loader + +# Synthetic interim CSV spanning the forecast window. +tmp = tempfile.mkdtemp() +csv_path = os.path.join(tmp, "energy_load.csv") +idx = pd.date_range( + "2025-12-29 00:00", periods=120, freq="h", tz="UTC", name="Time (UTC)" +) +pd.DataFrame({"Actual Load": range(120)}, index=idx).to_csv(csv_path) + +config = ConfigEntsoe() +config.data_filename = csv_path +config.end_train_default = "2025-12-31 00:00+00:00" +config.predict_size = 24 + +test_df = entsoe_test_data_loader(config) + +# The slice covers exactly predict_size hourly steps after end_train. +print(test_df.shape) +assert test_df.shape == (24, 1) +assert test_df.index[0] == pd.Timestamp("2025-12-31 01:00", tz="UTC") +``` \ No newline at end of file diff --git a/docs/reference/index.qmd b/docs/reference/index.qmd index 7213ad1a..ac18dbc0 100644 --- a/docs/reference/index.qmd +++ b/docs/reference/index.qmd @@ -19,6 +19,7 @@ Utilities for fetching and loading time series, weather, and holiday data. | [data.fetch_data.load_day_ahead_price](data.fetch_data.load_day_ahead_price.qmd#spotforecast2_safe.data.fetch_data.load_day_ahead_price) | Load the ENTSO-E day-ahead spot price (DE/LU) as an hourly series. | | [data.data_classes](data.data_classes.qmd#spotforecast2_safe.data.data_classes) | Data structures for input and processed data. | | [data.demo_loader](data.demo_loader.qmd#spotforecast2_safe.data.demo_loader) | Demo data loader for safety-critical forecasting tasks. | +| [data.entsoe_loader](data.entsoe_loader.qmd#spotforecast2_safe.data.entsoe_loader) | ENTSO-E interim-CSV data loaders. 
| ## Preprocessing diff --git a/src/spotforecast2_safe/data/__init__.py b/src/spotforecast2_safe/data/__init__.py index ee7c0cc9..5bc4b5eb 100644 --- a/src/spotforecast2_safe/data/__init__.py +++ b/src/spotforecast2_safe/data/__init__.py @@ -3,6 +3,10 @@ from spotforecast2_safe.data.data_classes import Data, Period from spotforecast2_safe.data.demo_loader import load_actual_combined +from spotforecast2_safe.data.entsoe_loader import ( + entsoe_data_loader, + entsoe_test_data_loader, +) from spotforecast2_safe.data.fetch_data import ( fetch_data, fetch_holiday_data, @@ -14,6 +18,8 @@ __all__ = [ "Data", "Period", + "entsoe_data_loader", + "entsoe_test_data_loader", "fetch_data", "fetch_holiday_data", "fetch_weather_data", diff --git a/src/spotforecast2_safe/data/entsoe_loader.py b/src/spotforecast2_safe/data/entsoe_loader.py new file mode 100644 index 00000000..265357b5 --- /dev/null +++ b/src/spotforecast2_safe/data/entsoe_loader.py @@ -0,0 +1,138 @@ +# SPDX-FileCopyrightText: 2026 bartzbeielstein +# SPDX-License-Identifier: AGPL-3.0-or-later + +"""ENTSO-E interim-CSV data loaders. + +Config-driven loaders for the merged ENTSO-E interim CSV, suitable for the +``data_loader`` / ``test_data_loader`` hooks on `ConfigEntsoe`. Ported from +``spotforecast2.tasks.task_entsoe`` ahead of that subpackage's removal. +""" + +from pathlib import Path + +import pandas as pd + +from spotforecast2_safe.configurator import ConfigEntsoe +from spotforecast2_safe.data.fetch_data import get_data_home + + +def entsoe_data_loader(config: ConfigEntsoe) -> pd.DataFrame: + """Read the merged interim ENTSO-E CSV that ``config.data_filename`` points at. + + Args: + config: A `ConfigEntsoe` with ``data_filename`` set. Relative paths + are resolved against `spotforecast2_safe.data.fetch_data.get_data_home`. + + Returns: + DataFrame indexed by the ENTSO-E timestamp column (``Time (UTC)``) + with the load columns as data columns. + + Raises: + FileNotFoundError: If the merged CSV does not exist. 
Run + ``spotforecast2-entsoe download`` and ``merge`` first. + + Examples: + ```{python} + import os + import tempfile + + import pandas as pd + from spotforecast2_safe.configurator import ConfigEntsoe + from spotforecast2_safe.data.entsoe_loader import entsoe_data_loader + + # Build a tiny synthetic interim CSV in a temp directory. + tmp = tempfile.mkdtemp() + csv_path = os.path.join(tmp, "energy_load.csv") + idx = pd.date_range( + "2025-01-01", periods=48, freq="h", tz="UTC", name="Time (UTC)" + ) + pd.DataFrame({"Actual Load": range(48)}, index=idx).to_csv(csv_path) + + # Absolute path bypasses get_data_home; loader returns the full frame. + config = ConfigEntsoe() + config.data_filename = csv_path + df = entsoe_data_loader(config) + + print(df.shape) + assert df.shape == (48, 1) + assert df.index.name == "Time (UTC)" + ``` + """ + path = Path(config.data_filename) + if not path.is_absolute(): + path = get_data_home() / path + if not path.exists(): + raise FileNotFoundError( + f"ENTSO-E merged CSV not found at {path}. Run " + "`spotforecast2-entsoe download` and `merge` first." + ) + return pd.read_csv(path, index_col=0, parse_dates=True) + + +def entsoe_test_data_loader(config: ConfigEntsoe) -> pd.DataFrame: + """Return the merged ENTSO-E CSV sliced to the forecast horizon. + + The slice spans ``(end_train, end_train + predict_size * 1 h]`` so that + ``build_prediction_package``'s ``test_actual = ts.reindex(future_pred.index)`` + matches the hourly forecast row-for-row. ``end_train`` is taken from + ``config.end_train_default`` (treated as the *inclusive* last training + timestamp, the same convention the forecaster uses), and the step is + assumed to be 1 h after the pipeline's hourly resampling. + + For the live ENTSO-E exemplar with ``end_train_default = D-2 23:00 UTC`` + and ``predict_size = 24``, this returns the rows for + ``[D-1 00:00, D 00:00)`` — i.e., ``y_{-1}``. 
For backtests at an arbitrary + ``end_train_default``, it returns the post-cutoff window the model is + actually predicting, rather than always "yesterday in wall-clock UTC". + + Args: + config: A `ConfigEntsoe` with ``data_filename``, ``end_train_default``, + and ``predict_size`` set; the merged interim CSV must already + contain data covering the forecast horizon (run + ``spotforecast2-entsoe download`` first). + + Returns: + DataFrame indexed by ``Time (UTC)`` with the rows the forecast will be + scored against. + + Examples: + ```{python} + import os + import tempfile + + import pandas as pd + from spotforecast2_safe.configurator import ConfigEntsoe + from spotforecast2_safe.data.entsoe_loader import entsoe_test_data_loader + + # Synthetic interim CSV spanning the forecast window. + tmp = tempfile.mkdtemp() + csv_path = os.path.join(tmp, "energy_load.csv") + idx = pd.date_range( + "2025-12-29 00:00", periods=120, freq="h", tz="UTC", name="Time (UTC)" + ) + pd.DataFrame({"Actual Load": range(120)}, index=idx).to_csv(csv_path) + + config = ConfigEntsoe() + config.data_filename = csv_path + config.end_train_default = "2025-12-31 00:00+00:00" + config.predict_size = 24 + + test_df = entsoe_test_data_loader(config) + + # The slice covers exactly predict_size hourly steps after end_train. 
+ print(test_df.shape) + assert test_df.shape == (24, 1) + assert test_df.index[0] == pd.Timestamp("2025-12-31 01:00", tz="UTC") + ``` + """ + df = entsoe_data_loader(config) + end_train = pd.Timestamp(config.end_train_default) + if end_train.tzinfo is None: + end_train = end_train.tz_localize("UTC") + step = pd.Timedelta(hours=1) # post-resample assumption + start = end_train + step # first forecast step + end = start + config.predict_size * step # exclusive upper bound + if df.index.tz is None: + start = start.tz_convert("UTC").tz_localize(None) # index is naive UTC + end = end.tz_convert("UTC").tz_localize(None) + return df.loc[(df.index >= start) & (df.index < end)] diff --git a/tests/test_entsoe_loader.py b/tests/test_entsoe_loader.py new file mode 100644 index 00000000..35e1f28e --- /dev/null +++ b/tests/test_entsoe_loader.py @@ -0,0 +1,103 @@ +# SPDX-FileCopyrightText: 2026 bartzbeielstein +# SPDX-License-Identifier: AGPL-3.0-or-later + +"""Tests for the ENTSO-E interim-CSV loaders in ``data.entsoe_loader``.""" + +import pandas as pd +import pytest + +import spotforecast2_safe.data.entsoe_loader as entsoe_loader +from spotforecast2_safe.configurator import ConfigEntsoe +from spotforecast2_safe.data.entsoe_loader import ( + entsoe_data_loader, + entsoe_test_data_loader, +) + + +def _write_interim_csv(path, start: str, periods: int, tz: str | None = "UTC"): + """Write a synthetic merged ENTSO-E CSV and return its DataFrame.""" + idx = pd.date_range(start, periods=periods, freq="h", tz=tz, name="Time (UTC)") + df = pd.DataFrame({"Actual Load": range(periods)}, index=idx) + df.to_csv(path) + return df + + +class TestEntsoeDataLoader: + def test_absolute_path_loads_full_frame(self, tmp_path): + csv_path = tmp_path / "energy_load.csv" + _write_interim_csv(csv_path, "2025-01-01", 48) + + config = ConfigEntsoe() + config.data_filename = str(csv_path) + df = entsoe_data_loader(config) + + assert df.shape == (48, 1) + assert df.index.name == "Time (UTC)" + assert isinstance(df.index, pd.DatetimeIndex) + + def 
test_relative_path_resolves_against_data_home(self, tmp_path, monkeypatch): + _write_interim_csv(tmp_path / "energy_load.csv", "2025-01-01", 24) + monkeypatch.setattr(entsoe_loader, "get_data_home", lambda: tmp_path) + + config = ConfigEntsoe() + config.data_filename = "energy_load.csv" + df = entsoe_data_loader(config) + + assert df.shape == (24, 1) + + def test_missing_file_raises_with_cli_hint(self, tmp_path): + config = ConfigEntsoe() + config.data_filename = str(tmp_path / "does_not_exist.csv") + + with pytest.raises(FileNotFoundError, match="spotforecast2-entsoe"): + entsoe_data_loader(config) + + +class TestEntsoeTestDataLoader: + def _config(self, csv_path, end_train: str, predict_size: int = 24): + config = ConfigEntsoe() + config.data_filename = str(csv_path) + config.end_train_default = end_train + config.predict_size = predict_size + return config + + def test_slices_predict_size_steps_after_end_train(self, tmp_path): + csv_path = tmp_path / "energy_load.csv" + _write_interim_csv(csv_path, "2025-12-29 00:00", 120) + config = self._config(csv_path, "2025-12-31 00:00+00:00") + + test_df = entsoe_test_data_loader(config) + + assert test_df.shape == (24, 1) + assert test_df.index[0] == pd.Timestamp("2025-12-31 01:00", tz="UTC") + assert test_df.index[-1] == pd.Timestamp("2026-01-01 00:00", tz="UTC") + + def test_naive_end_train_is_localized_to_utc(self, tmp_path): + csv_path = tmp_path / "energy_load.csv" + _write_interim_csv(csv_path, "2025-12-29 00:00", 120) + config = self._config(csv_path, "2025-12-31 00:00") # no tz marker + + test_df = entsoe_test_data_loader(config) + + assert test_df.shape == (24, 1) + assert test_df.index[0] == pd.Timestamp("2025-12-31 01:00", tz="UTC") + + def test_naive_csv_index_is_supported(self, tmp_path): + csv_path = tmp_path / "energy_load.csv" + _write_interim_csv(csv_path, "2025-12-29 00:00", 120, tz=None) + config = self._config(csv_path, "2025-12-31 00:00+00:00") + + test_df = entsoe_test_data_loader(config) + + assert 
test_df.shape == (24, 1) + assert test_df.index[0] == pd.Timestamp("2025-12-31 01:00") + assert test_df.index.tz is None + + def test_window_shorter_when_data_runs_out(self, tmp_path): + csv_path = tmp_path / "energy_load.csv" + _write_interim_csv(csv_path, "2025-12-29 00:00", 60) # ends 12-31 11:00 + config = self._config(csv_path, "2025-12-31 00:00+00:00") + + test_df = entsoe_test_data_loader(config) + + assert len(test_df) == 11 # only the rows that exist after the cutoff