From 848027fa7be0ee072bf6fd976a2016e61f4c41d6 Mon Sep 17 00:00:00 2001 From: bartzbeielstein <32470350+bartzbeielstein@users.noreply.github.com> Date: Sun, 14 Jun 2026 00:39:26 +0200 Subject: [PATCH] feat(downloader): ENTSO-E gap repair + resume/cooldown fixes; default country_code DE Reconciles the long-lived fix/entsoe-gap-repair work onto current develop (develop had independently reworked download_new_data, so this is a re-apply, not a mechanical rebase). Field incident: ENTSO-E published no data for 2026-06-01/02, leaving an interior hole in interim/energy_load.csv that the pipeline could neither detect nor heal. New API (spotforecast2_safe.downloader): - repair_data_gaps(): detect interior gaps, heal from already-downloaded raw CSVs first, then issue targeted downloads for intervals still missing; never invents values, raises by default (on_unavailable='use_existing' opts into proceeding with gapped data). - find_missing_intervals(): report interior gaps of a DatetimeIndex (mode-based step, so hourly and 15-min data both work). - download_new_data(..., on_unavailable=...): opt-in to keep operating on the existing interim data when ENTSO-E stays unreachable after the retry budget. Fixes in download_new_data() (kept develop's _MAX_BACKFILL_DAYS heal + timeout): - Resume (start=None) now reads the interim file via fetch_data(filename=...). The bare fetch_data() call raised ValueError ("filename must be specified") unconditionally, so resume silently fell into the 7-day fallback on every incremental run -- which also made develop's backfill heal dead code. Now both work. - Cooldown is keyed on the recency of the newest raw entsoe_load_* file, not the WIDTH of the requested window (the old check silently skipped sub-24h backfills). A window overlapping a known interim gap bypasses the cooldown. - end=None now means "now" instead of "today 00:00 UTC" (the latter made incremental runs no-ops for most of the day once resume works). - end <= start with both bounds explicit raises ValueError; a derived empty window is an "already up to date" no-op. - An empty/all-NaN ENTSO-E response is logged at WARNING and writes no raw file instead of masking the gap as a successful download. - default country_code "FR" -> "DE", matching download_renewable_forecast (DE) / download_day_ahead_price (DE_LU) and the package's German-energy focus. Tests: new tests/test_entsoe_gap_repair.py (375 lines); the two width-semantics cooldown tests updated to the recency model. quartodoc reference + sidebar regenerated for the two new public functions. Full suite 2665 passed. Co-Authored-By: Claude Opus 4.8 (1M context) --- _quarto.yml | 6 + .../downloader.entsoe.download_new_data.qmd | 34 +- ...wnloader.entsoe.find_missing_intervals.qmd | 44 ++ .../downloader.entsoe.repair_data_gaps.qmd | 73 ++++ docs/reference/index.qmd | 2 + src/spotforecast2_safe/downloader/__init__.py | 9 +- src/spotforecast2_safe/downloader/entsoe.py | 402 +++++++++++++++++- tests/test_downloader_entsoe.py | 20 +- tests/test_entsoe_gap_repair.py | 375 ++++++++++++++++ tests/test_entsoe_integration.py | 13 +- 10 files changed, 935 insertions(+), 43 deletions(-) create mode 100644 docs/reference/downloader.entsoe.find_missing_intervals.qmd create mode 100644 docs/reference/downloader.entsoe.repair_data_gaps.qmd create mode 100644 tests/test_entsoe_gap_repair.py diff --git a/_quarto.yml b/_quarto.yml index 534c5f4cd..5334f5a07 100644 --- a/_quarto.yml +++ b/_quarto.yml @@ -445,6 +445,10 @@ website: file: docs/reference/downloader.entsoe.download_day_ahead_price.qmd - text: "merge_build_manual" file: docs/reference/downloader.entsoe.merge_build_manual.qmd + - text: "repair_data_gaps" + file: docs/reference/downloader.entsoe.repair_data_gaps.qmd + - text: "find_missing_intervals" + file: docs/reference/downloader.entsoe.find_missing_intervals.qmd - text: "download_zone_loads" file: docs/reference/downloader.entsoe.download_zone_loads.qmd - text: "assemble_zone_loads" @@ -832,6 +836,8 @@ quartodoc: - downloader.entsoe.download_renewable_forecast - downloader.entsoe.download_day_ahead_price - downloader.entsoe.merge_build_manual + - downloader.entsoe.repair_data_gaps + - downloader.entsoe.find_missing_intervals - downloader.entsoe.download_zone_loads - downloader.entsoe.assemble_zone_loads - downloader.entsoe.ZoneResult diff --git a/docs/reference/downloader.entsoe.download_new_data.qmd b/docs/reference/downloader.entsoe.download_new_data.qmd index 8b65e6084..e30b18f22 100644 --- a/docs/reference/downloader.entsoe.download_new_data.qmd +++ b/docs/reference/downloader.entsoe.download_new_data.qmd @@ -3,12 +3,13 @@ ```python downloader.entsoe.download_new_data( api_key, - country_code='FR', + country_code='DE', start=None, end=None, force=False, keep_forecast_future=False, timeout=60.0, + on_unavailable='raise', ) ``` @@ -25,23 +26,24 @@ automatically on the next incremental download. ## Parameters {.doc-section .doc-section-parameters} -| Name | Type | Description | Default | -|----------------------|---------------------------------------------------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|------------| -| api_key | [str](`str`) | The ENTSO-E API key. | _required_ | -| country_code | [str](`str`) | The country code to query (e.g., 'FR', 'DE'). Defaults to "FR". | `'FR'` | -| start | [str](`str`) \| None | Start date in 'YYYYMMDDHH00' format. | `None` | -| end | [str](`str`) \| None | End date in 'YYYYMMDDHH00' format. | `None` | -| force | [bool](`bool`) | If True, bypass the 24h cooldown check. | `False` | -| keep_forecast_future | [bool](`bool`) | If True, retain rows after the current UTC moment when building the interim file, preserving ENTSO-E's day-ahead `Forecasted Load` for tomorrow. Defaults to False (future rows are dropped, the leakage-free input for training). The flag only keeps rows the query window already covers, so pass an ``end`` that reaches into the target day (e.g. tomorrow) as well. See `merge_build_manual` for details. | `False` | -| timeout | [Optional](`typing.Optional`)\[[float](`float`)\] | Per-socket-operation read timeout in seconds passed to the ENTSO-E client. Kills stalled connections (raises ``requests.exceptions.Timeout``, caught by the existing retry loop, then ``RuntimeError`` after retries) without bounding long live transfers. ``None`` disables the timeout. Defaults to ``60.0``. | `60.0` | +| Name | Type | Description | Default | +|----------------------|------------------------------------------------------------------------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|------------| +| api_key | [str](`str`) | The ENTSO-E API key. | _required_ | +| country_code | [str](`str`) | The country code to query (e.g., 'DE', 'FR'). Defaults to "DE". | `'DE'` | +| start | [str](`str`) \| [pd](`pandas`).[Timestamp](`pandas.Timestamp`) \| None | Start of the query window ('YYYYMMDDHH00' or a timestamp). If None, resumes from the last observed timestamp in the interim file, extending back to heal still-missing actuals (bounded by ``_MAX_BACKFILL_DAYS``) and falling back to seven days ago when no prior data exists. | `None` | +| end | [str](`str`) \| [pd](`pandas`).[Timestamp](`pandas.Timestamp`) \| None | End of the query window ('YYYYMMDDHH00' or a timestamp). If None, the current UTC moment is used. | `None` | +| force | [bool](`bool`) | If True, bypass the cooldown that skips a download when the last successful download is younger than ``_COOLDOWN_HOURS`` (24) hours. The cooldown is bypassed automatically when the requested window overlaps a known gap in the interim data, so gap backfills are never silently skipped. | `False` | +| keep_forecast_future | [bool](`bool`) | If True, retain rows after the current UTC moment when building the interim file, preserving ENTSO-E's day-ahead `Forecasted Load` for tomorrow. Defaults to False (future rows are dropped, the leakage-free input for training). The flag only keeps rows the query window already covers, so pass an ``end`` that reaches into the target day (e.g. tomorrow) as well. See `merge_build_manual` for details. | `False` | +| timeout | [Optional](`typing.Optional`)\[[float](`float`)\] | Per-socket-operation read timeout in seconds passed to the ENTSO-E client. Kills stalled connections (raises ``requests.exceptions.Timeout``, caught by the existing retry loop, then ``RuntimeError`` after retries) without bounding long live transfers. ``None`` disables the timeout. Defaults to ``60.0``. | `60.0` | +| on_unavailable | [OnUnavailable](`spotforecast2_safe.downloader.entsoe.OnUnavailable`) | What to do when ENTSO-E stays unreachable after the retry budget is exhausted. ``"raise"`` (default) raises ``RuntimeError``; ``"use_existing"`` logs how stale the interim data is and returns so the caller can continue on existing data (requires an existing interim file, else ``RuntimeError``). | `'raise'` | ## Raises {.doc-section .doc-section-raises} -| Name | Type | Description | -|--------|--------------------------------|----------------------------------------------------------------| -| | [ImportError](`ImportError`) | If the Python package 'entsoe-py' is not installed. | -| | [ValueError](`ValueError`) | If ``start`` or ``end`` cannot be parsed as a valid timestamp. | -| | [RuntimeError](`RuntimeError`) | If data fetching fails after ``_MAX_RETRIES`` attempts. | +| Name | Type | Description | +|--------|--------------------------------|-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| | [ImportError](`ImportError`) | If the Python package 'entsoe-py' is not installed. | +| | [ValueError](`ValueError`) | If ``start`` or ``end`` cannot be parsed as a valid timestamp, if both are given explicitly but ``end`` is not after ``start``, or if ``on_unavailable`` is not a recognized value. | +| | [RuntimeError](`RuntimeError`) | If data fetching fails after ``_MAX_RETRIES`` attempts and ``on_unavailable='raise'``. | ## Notes {.doc-section .doc-section-notes} @@ -79,7 +81,7 @@ download_new_data( ) # Incremental download (automatically resumes from last data point) -download_new_data(api_key="YOUR_API_KEY", country_code="FR") +download_new_data(api_key="YOUR_API_KEY", country_code="DE") # Forced download bypassing the 24-hour cooldown check download_new_data( diff --git a/docs/reference/downloader.entsoe.find_missing_intervals.qmd b/docs/reference/downloader.entsoe.find_missing_intervals.qmd new file mode 100644 index 000000000..724dd7bd8 --- /dev/null +++ b/docs/reference/downloader.entsoe.find_missing_intervals.qmd @@ -0,0 +1,44 @@ +# downloader.entsoe.find_missing_intervals { #spotforecast2_safe.downloader.entsoe.find_missing_intervals } + +```python +downloader.entsoe.find_missing_intervals(index) +``` + +Find interior gaps in a datetime index. + +Compares the index against the complete date range spanning its first +and last timestamp -- the same completeness notion as +`spotforecast2_safe.preprocessing.curate_data.basic_ts_checks`, which +raises on an incomplete index; this function reports the gaps instead +so callers can repair them. The expected spacing is the most common +step between consecutive timestamps, so the function works for hourly +as well as 15-minute ENTSO-E data. + +## Parameters {.doc-section .doc-section-parameters} + +| Name | Type | Description | Default | +|--------|--------------------------------------------------------|-------------------------------------------------------------------------------------------------------------------------------------------------------|------------| +| index | [pd](`pandas`).[DatetimeIndex](`pandas.DatetimeIndex`) | Sorted, duplicate-free datetime index to inspect. Indexes with fewer than 3 entries cannot contain a detectable interior gap and yield an empty list. | _required_ | + +## Returns {.doc-section .doc-section-returns} + +| Name | Type | Description | +|--------|--------------------------------------------------------------------------------------------------------------------------------------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| | [list](`list`)\[[tuple](`tuple`)\[[pd](`pandas`).[Timestamp](`pandas.Timestamp`), [pd](`pandas`).[Timestamp](`pandas.Timestamp`)\]\] | list[tuple[pd.Timestamp, pd.Timestamp]]: One ``(first_missing, last_missing)`` pair per contiguous gap, both bounds inclusive. Empty list when the index is complete. | + +## Raises {.doc-section .doc-section-raises} + +| Name | Type | Description | +|--------|----------------------------|-------------------------------------------------| +| | [ValueError](`ValueError`) | If ``index`` is not sorted in increasing order. | + +## Examples {.doc-section .doc-section-examples} + +```{python} +import pandas as pd +from spotforecast2_safe.downloader.entsoe import find_missing_intervals + +full = pd.date_range("2026-06-01", periods=96, freq="h", tz="UTC") +gappy = full.delete(list(range(48, 72))) # June 3rd is missing +print(find_missing_intervals(gappy)) +``` \ No newline at end of file diff --git a/docs/reference/downloader.entsoe.repair_data_gaps.qmd b/docs/reference/downloader.entsoe.repair_data_gaps.qmd new file mode 100644 index 000000000..4dd42f0b0 --- /dev/null +++ b/docs/reference/downloader.entsoe.repair_data_gaps.qmd @@ -0,0 +1,73 @@ +# downloader.entsoe.repair_data_gaps { #spotforecast2_safe.downloader.entsoe.repair_data_gaps } + +```python +downloader.entsoe.repair_data_gaps( + api_key, + country_code='DE', + keep_forecast_future=False, + on_unavailable='raise', +) +``` + +Detect and repair interior gaps in the interim energy-load file. + +Repairs prefer data that is already on disk over the network: + +1. Re-merge every raw CSV under ``get_data_home() / "raw"`` via + `merge_build_manual()`. When a previously downloaded file already + covers a hole, this fixes the interim file without any network + access. +2. For every interval still missing, issue a targeted + `download_new_data()` call restricted to that interval (padded by + one hour on each side; overlaps deduplicate during the merge). + ENTSO-E sometimes publishes data late, so a gap that existed + yesterday may be fillable today. + +Values are never invented: a gap that ENTSO-E still cannot fill stays +a gap, and the function raises by default so the caller decides +explicitly. Downstream imputation (e.g. +`spotforecast2_safe.preprocessing.LinearlyInterpolateTS`) remains a +separate, opt-in step. + +## Parameters {.doc-section .doc-section-parameters} + +| Name | Type | Description | Default | +|----------------------|----------------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|------------| +| api_key | [str](`str`) | The ENTSO-E API key. | _required_ | +| country_code | [str](`str`) | The country code to query (e.g., 'DE', 'FR'). Defaults to "DE". | `'DE'` | +| keep_forecast_future | [bool](`bool`) | Forwarded to `merge_build_manual()` and `download_new_data()`; see their docstrings. Defaults to False. | `False` | +| on_unavailable | [str](`str`) | What to do when gaps remain after both repair steps. Options: * "raise": Raise ``ValueError`` listing the remaining gaps (fail-safe default). * "use_existing": Log a prominent warning and return the remaining gaps so the caller can explicitly continue with the data already on disk. Defaults to "raise". | `'raise'` | + +## Returns {.doc-section .doc-section-returns} + +| Name | Type | Description | +|--------|--------------------------------------------------------------------------------------------------------------------------------------|-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| | [list](`list`)\[[tuple](`tuple`)\[[pd](`pandas`).[Timestamp](`pandas.Timestamp`), [pd](`pandas`).[Timestamp](`pandas.Timestamp`)\]\] | list[tuple[pd.Timestamp, pd.Timestamp]]: Gaps that could not be repaired, as ``(first_missing, last_missing)`` pairs with both bounds inclusive. An empty list means the observed part of the interim file is complete. | + +## Raises {.doc-section .doc-section-raises} + +| Name | Type | Description | +|--------|------------------------------------------|----------------------------------------------------------------------------------------------------------------------| +| | [FileNotFoundError](`FileNotFoundError`) | If no interim file exists even after merging the raw directory (nothing to repair; run `download_new_data()` first). | +| | [ValueError](`ValueError`) | If gaps remain and ``on_unavailable='raise'``, or if ``on_unavailable`` is not a recognized value. | +| | [ImportError](`ImportError`) | If gaps require a download and the Python package 'entsoe-py' is not installed. | + +## Examples {.doc-section .doc-section-examples} + +```{python} +#| eval: false +from spotforecast2_safe.downloader.entsoe import repair_data_gaps + +# Repair a hole like the June 1st-2nd 2026 incident: re-merge +# local raw files first, then download only the still-missing +# interval from ENTSO-E. +unrepaired = repair_data_gaps(api_key="YOUR_API_KEY", country_code="DE") +print(unrepaired) # [] when everything could be filled + +# Keep operating on stale data when ENTSO-E is down (logged loudly) +unrepaired = repair_data_gaps( + api_key="YOUR_API_KEY", + country_code="DE", + on_unavailable="use_existing", +) +``` \ No newline at end of file diff --git a/docs/reference/index.qmd b/docs/reference/index.qmd index ac18dbc03..849c800f8 100644 --- a/docs/reference/index.qmd +++ b/docs/reference/index.qmd @@ -290,6 +290,8 @@ Data downloaders for external data sources (e.g. ENTSO-E). | [downloader.entsoe.download_renewable_forecast](downloader.entsoe.download_renewable_forecast.qmd#spotforecast2_safe.downloader.entsoe.download_renewable_forecast) | Download the ENTSO-E day-ahead wind/solar generation forecast. | | [downloader.entsoe.download_day_ahead_price](downloader.entsoe.download_day_ahead_price.qmd#spotforecast2_safe.downloader.entsoe.download_day_ahead_price) | Download the ENTSO-E day-ahead spot price (DE/LU). | | [downloader.entsoe.merge_build_manual](downloader.entsoe.merge_build_manual.qmd#spotforecast2_safe.downloader.entsoe.merge_build_manual) | Merge all raw CSV files from the 'raw' directory into a single interim file. | +| [downloader.entsoe.repair_data_gaps](downloader.entsoe.repair_data_gaps.qmd#spotforecast2_safe.downloader.entsoe.repair_data_gaps) | Detect and repair interior gaps in the interim energy-load file. | +| [downloader.entsoe.find_missing_intervals](downloader.entsoe.find_missing_intervals.qmd#spotforecast2_safe.downloader.entsoe.find_missing_intervals) | Find interior gaps in a datetime index. | | [downloader.entsoe.download_zone_loads](downloader.entsoe.download_zone_loads.qmd#spotforecast2_safe.downloader.entsoe.download_zone_loads) | Download Actual Total Load separately for each German TSO control area. | | [downloader.entsoe.assemble_zone_loads](downloader.entsoe.assemble_zone_loads.qmd#spotforecast2_safe.downloader.entsoe.assemble_zone_loads) | Join the per-zone interim load files into one aligned, validated frame. | | [downloader.entsoe.ZoneResult](downloader.entsoe.ZoneResult.qmd#spotforecast2_safe.downloader.entsoe.ZoneResult) | Structured result record for one zone in a ``download_zone_loads`` collect run. | diff --git a/src/spotforecast2_safe/downloader/__init__.py b/src/spotforecast2_safe/downloader/__init__.py index 55ae7f8fd..9f3954edf 100644 --- a/src/spotforecast2_safe/downloader/__init__.py +++ b/src/spotforecast2_safe/downloader/__init__.py @@ -1,9 +1,16 @@ # SPDX-FileCopyrightText: 2026 bartzbeielstein # SPDX-License-Identifier: AGPL-3.0-or-later -from .entsoe import download_new_data, merge_build_manual +from .entsoe import ( + download_new_data, + find_missing_intervals, + merge_build_manual, + repair_data_gaps, +) __all__ = [ "download_new_data", + "find_missing_intervals", "merge_build_manual", + "repair_data_gaps", ] diff --git a/src/spotforecast2_safe/downloader/entsoe.py b/src/spotforecast2_safe/downloader/entsoe.py index 03637b92e..a73b6edc6 100644 --- a/src/spotforecast2_safe/downloader/entsoe.py +++ b/src/spotforecast2_safe/downloader/entsoe.py @@ -202,6 +202,86 @@ def _make_client(client_cls: type, api_key: str, timeout: Optional[float]) -> ob # transparency-platform outages). Bounds the heal window so a structurally # empty column cannot trigger an unbounded re-pull. _MAX_BACKFILL_DAYS = 7 +_INTERIM_FILENAME = "energy_load.csv" + +OnUnavailable = Literal["raise", "use_existing"] + + +def _interim_path() -> Path: + """Return the canonical interim energy-load file path.""" + return get_data_home() / "interim" / _INTERIM_FILENAME + + +def _validate_on_unavailable(on_unavailable: str) -> None: + """Reject unknown ``on_unavailable`` values (fail-safe contract).""" + if on_unavailable not in ("raise", "use_existing"): + raise ValueError( + f"on_unavailable must be 'raise' or 'use_existing'; " + f"got {on_unavailable!r}." + ) + + +def _hours_since_last_download(raw_dir: Path) -> float | None: + """Return hours since the newest raw ENTSO-E file was written. + + Recency is derived from the modification time of the newest + ``entsoe_load_*.csv`` file in ``raw_dir``, i.e. only files written by + `download_new_data()` count as downloads. Returns None when no such + file exists (no download has happened yet). + """ + if not raw_dir.exists(): + return None + mtimes = [f.stat().st_mtime for f in raw_dir.glob("entsoe_load_*.csv")] + if not mtimes: + return None + now_epoch = pd.Timestamp.now(tz="UTC").timestamp() + return max(0.0, (now_epoch - max(mtimes)) / 3600.0) + + +def _interim_gaps() -> list[tuple[pd.Timestamp, pd.Timestamp]]: + """Return interior gaps of the observed part of the interim file. + + A timestamp counts as missing when its row is absent from the interim + file or when its ``Actual Load`` value is NaN, restricted to rows at + or before the current UTC moment: future day-ahead forecast rows + carry no actuals by design (see `merge_build_manual`) and are not + gaps. + + Raises: + FileNotFoundError: If the interim file does not exist yet. + """ + data = fetch_data(filename=_interim_path()) + observed = data.loc[data.index <= pd.Timestamp.now(tz="UTC")] + if "Actual Load" in observed.columns: + observed = observed.loc[observed["Actual Load"].notna()] + return find_missing_intervals(observed.index) + + +def _range_overlaps_gap(start_date: pd.Timestamp, end_date: pd.Timestamp) -> bool: + """Whether ``[start_date, end_date]`` intersects a known interior gap. + + Used only as a cooldown-bypass heuristic; when the gaps cannot be + determined (no interim file yet, unreadable index), the answer is + False and the regular cooldown decision applies. + """ + try: + gaps = _interim_gaps() + except (FileNotFoundError, ValueError, IndexError): + return False + return any(gs <= end_date and ge >= start_date for gs, ge in gaps) + + +def _staleness_hours() -> float | None: + """Hours between now and the last observed interim timestamp, if known.""" + try: + data = fetch_data(filename=_interim_path()) + now = pd.Timestamp.now(tz="UTC") + observed = data.loc[data.index <= now] + if observed.empty: + return None + return (now - observed.index[-1]).total_seconds() / 3600.0 + except (FileNotFoundError, ValueError, IndexError): + return None def merge_build_manual( @@ -356,12 +436,13 @@ def merge_build_manual( def download_new_data( api_key: str, - country_code: str = "FR", - start: str | None = None, - end: str | None = None, + country_code: str = "DE", + start: str | pd.Timestamp | None = None, + end: str | pd.Timestamp | None = None, force: bool = False, keep_forecast_future: bool = False, timeout: Optional[float] = 60.0, + on_unavailable: OnUnavailable = "raise", ) -> None: """ Download new load and forecast data from ENTSO-E. @@ -377,11 +458,20 @@ def download_new_data( Args: api_key: The ENTSO-E API key. - country_code: The country code to query (e.g., 'FR', 'DE'). - Defaults to "FR". - start: Start date in 'YYYYMMDDHH00' format. - end: End date in 'YYYYMMDDHH00' format. - force: If True, bypass the 24h cooldown check. + country_code: The country code to query (e.g., 'DE', 'FR'). + Defaults to "DE". + start: Start of the query window ('YYYYMMDDHH00' or a timestamp). If + None, resumes from the last observed timestamp in the interim file, + extending back to heal still-missing actuals (bounded by + ``_MAX_BACKFILL_DAYS``) and falling back to seven days ago when no + prior data exists. + end: End of the query window ('YYYYMMDDHH00' or a timestamp). If None, + the current UTC moment is used. + force: If True, bypass the cooldown that skips a download when the last + successful download is younger than ``_COOLDOWN_HOURS`` (24) hours. + The cooldown is bypassed automatically when the requested window + overlaps a known gap in the interim data, so gap backfills are + never silently skipped. keep_forecast_future: If True, retain rows after the current UTC moment when building the interim file, preserving ENTSO-E's day-ahead `Forecasted Load` for tomorrow. Defaults to False (future @@ -394,14 +484,22 @@ def download_new_data( ``requests.exceptions.Timeout``, caught by the existing retry loop, then ``RuntimeError`` after retries) without bounding long live transfers. ``None`` disables the timeout. Defaults to ``60.0``. + on_unavailable: What to do when ENTSO-E stays unreachable after the + retry budget is exhausted. ``"raise"`` (default) raises + ``RuntimeError``; ``"use_existing"`` logs how stale the interim + data is and returns so the caller can continue on existing data + (requires an existing interim file, else ``RuntimeError``). Raises: ImportError: If the Python package 'entsoe-py' is not installed. ValueError: - If ``start`` or ``end`` cannot be parsed as a valid timestamp. + If ``start`` or ``end`` cannot be parsed as a valid timestamp, if + both are given explicitly but ``end`` is not after ``start``, or if + ``on_unavailable`` is not a recognized value. RuntimeError: - If data fetching fails after ``_MAX_RETRIES`` attempts. + If data fetching fails after ``_MAX_RETRIES`` attempts and + ``on_unavailable='raise'``. Notes: Logging information can be selected by setting the log level for the @@ -437,7 +535,7 @@ def download_new_data( ) # Incremental download (automatically resumes from last data point) - download_new_data(api_key="YOUR_API_KEY", country_code="FR") + download_new_data(api_key="YOUR_API_KEY", country_code="DE") # Forced download bypassing the 24-hour cooldown check download_new_data( @@ -459,6 +557,8 @@ def download_new_data( ``` """ + _validate_on_unavailable(on_unavailable) + try: from entsoe import EntsoePandasClient except ImportError as e: @@ -475,7 +575,11 @@ def download_new_data( # Determine start date if start is None: try: - current_data = fetch_data() # This might look at interim or a specific file + # Resume from the interim file. (A bare ``fetch_data()`` call here + # used to raise ValueError unconditionally -- "filename must be + # specified" -- so resume silently fell into the 7-day fallback on + # every incremental run, and the backfill heal below was dead code.) + current_data = fetch_data(filename=_interim_path()) # Resume from the last row at or before now, not the raw index max: # with keep_forecast_future=True the interim can carry future # forecast rows, and resuming from those would skip real history. @@ -531,23 +635,55 @@ def download_new_data( # Determine end date if end is None: - end_date = pd.Timestamp.now(tz="UTC").floor("D") - logger.info("No end date provided. Using current date: %s", end_date) + end_date = pd.Timestamp.now(tz="UTC") + logger.info("No end date provided. Using current time: %s", end_date) else: end_date = pd.to_datetime(end, utc=True, errors="coerce") if pd.isna(end_date): raise ValueError(f"end={end!r} did not parse to a valid timestamp") logger.info("Using provided end date: %s", end_date) - # Safety check: avoid redundant small downloads - hours_diff = (end_date - start_date).total_seconds() / 3600 - if hours_diff < _COOLDOWN_HOURS and not force: + # An empty window means there is nothing new to fetch. When both bounds + # were given explicitly this is invalid input (fail-safe: raise); when at + # least one bound was derived it is the normal "already up to date" outcome + # of an incremental run. + if end_date <= start_date: + if start is not None and end is not None: + raise ValueError( + f"end={end!r} is not after start={start!r}; an empty download " + "window is invalid when both bounds are given explicitly." + ) logger.info( - "Last download was too recent (%.1f hours ago). Skipping.", - _COOLDOWN_HOURS - hours_diff, + "Nothing to download: requested window is empty (start %s >= end %s).", + start_date, + end_date, ) return + # Cooldown: skip only when a successful download happened within the last + # _COOLDOWN_HOURS. Recency comes from the newest raw file written by this + # function, NOT from the width of the requested window (the old check + # silently skipped small backfills). A window that overlaps a known gap in + # the interim data always bypasses the cooldown so gaps repair immediately. + if not force: + hours_since = _hours_since_last_download(get_data_home() / "raw") + if hours_since is not None and hours_since < _COOLDOWN_HOURS: + if _range_overlaps_gap(start_date, end_date): + logger.info( + "Cooldown bypassed: requested window %s to %s overlaps a " + "known gap in the interim data.", + start_date, + end_date, + ) + else: + logger.info( + "Last successful download was %.1f hours ago (< %d h " + "cooldown). Skipping. Pass force=True to override.", + hours_since, + _COOLDOWN_HOURS, + ) + return + client = _make_client(EntsoePandasClient, api_key=api_key, timeout=timeout) # Retry loop @@ -579,10 +715,40 @@ def download_new_data( time.sleep(_RETRY_BACKOFF_SECONDS) if not success or downloaded_df is None: + if on_unavailable == "use_existing" and _interim_path().exists(): + staleness = _staleness_hours() + staleness_msg = ( + f"last observed data is {staleness:.1f} hours old" + if staleness is not None + else "staleness unknown" + ) + logger.warning( + "ENTSO-E unavailable after %d attempts. Proceeding with the " + "existing interim data (%s) because on_unavailable=" + "'use_existing'. Downstream results are based on stale data.", + _MAX_RETRIES, + staleness_msg, + ) + return raise RuntimeError( f"Failed to download data from ENTSO-E after {_MAX_RETRIES} attempts." ) + # An empty response (or one that only carries fully-NaN rows) means + # ENTSO-E has no data for the window yet. Writing a raw file for it would + # hide the gap behind an apparently successful download, so the condition + # is logged loudly and nothing is saved. + downloaded_df = downloaded_df.dropna(how="all") + if downloaded_df.empty: + logger.warning( + "ENTSO-E returned no data for %s to %s (%s). Nothing was saved; " + "the requested interval is still missing upstream.", + start_date, + end_date, + country_code, + ) + return + # Save to raw data_home = get_data_home() raw_dir = data_home / "raw" @@ -600,6 +766,204 @@ def download_new_data( merge_build_manual(keep_forecast_future=keep_forecast_future) +def find_missing_intervals( + index: pd.DatetimeIndex, +) -> list[tuple[pd.Timestamp, pd.Timestamp]]: + """Find interior gaps in a datetime index. + + Compares the index against the complete date range spanning its first + and last timestamp -- the same completeness notion as + `spotforecast2_safe.preprocessing.curate_data.basic_ts_checks`, which + raises on an incomplete index; this function reports the gaps instead + so callers can repair them. The expected spacing is the most common + step between consecutive timestamps, so the function works for hourly + as well as 15-minute ENTSO-E data. + + Args: + index (pd.DatetimeIndex): Sorted, duplicate-free datetime index to + inspect. Indexes with fewer than 3 entries cannot contain a + detectable interior gap and yield an empty list. + + Returns: + list[tuple[pd.Timestamp, pd.Timestamp]]: One ``(first_missing, + last_missing)`` pair per contiguous gap, both bounds + inclusive. Empty list when the index is complete. + + Raises: + ValueError: If ``index`` is not sorted in increasing order. + + Examples: + ```{python} + import pandas as pd + from spotforecast2_safe.downloader.entsoe import find_missing_intervals + + full = pd.date_range("2026-06-01", periods=96, freq="h", tz="UTC") + gappy = full.delete(list(range(48, 72))) # June 3rd is missing + print(find_missing_intervals(gappy)) + ``` + """ + if len(index) < 3: + return [] + if not index.is_monotonic_increasing: + raise ValueError("index must be sorted in increasing order.") + deltas = index.to_series().diff().dropna() + deltas = deltas[deltas > pd.Timedelta(0)] + if deltas.empty: + return [] + step = deltas.mode().iloc[0] + expected = pd.date_range(start=index.min(), end=index.max(), freq=step) + missing = expected.difference(index) + + gaps: list[tuple[pd.Timestamp, pd.Timestamp]] = [] + gap_start: pd.Timestamp | None = None + gap_end: pd.Timestamp | None = None + for ts in missing: + if gap_end is not None and ts - gap_end == step: + gap_end = ts + else: + if gap_start is not None and gap_end is not None: + gaps.append((gap_start, gap_end)) + gap_start = gap_end = ts + if gap_start is not None and gap_end is not None: + gaps.append((gap_start, gap_end)) + return gaps + + +def repair_data_gaps( + api_key: str, + country_code: str = "DE", + keep_forecast_future: bool = False, + on_unavailable: OnUnavailable = "raise", +) -> list[tuple[pd.Timestamp, pd.Timestamp]]: + """Detect and repair interior gaps in the interim energy-load file. + + Repairs prefer data that is already on disk over the network: + + 1. Re-merge every raw CSV under ``get_data_home() / "raw"`` via + `merge_build_manual()`. When a previously downloaded file already + covers a hole, this fixes the interim file without any network + access. + 2. For every interval still missing, issue a targeted + `download_new_data()` call restricted to that interval (padded by + one hour on each side; overlaps deduplicate during the merge). + ENTSO-E sometimes publishes data late, so a gap that existed + yesterday may be fillable today. + + Values are never invented: a gap that ENTSO-E still cannot fill stays + a gap, and the function raises by default so the caller decides + explicitly. Downstream imputation (e.g. + `spotforecast2_safe.preprocessing.LinearlyInterpolateTS`) remains a + separate, opt-in step. + + Args: + api_key (str): The ENTSO-E API key. + country_code (str, optional): The country code to query (e.g., + 'DE', 'FR'). Defaults to "DE". + keep_forecast_future (bool, optional): Forwarded to + `merge_build_manual()` and `download_new_data()`; see their + docstrings. Defaults to False. + on_unavailable (str, optional): What to do when gaps remain after + both repair steps. Options: + * "raise": Raise ``ValueError`` listing the remaining gaps + (fail-safe default). + * "use_existing": Log a prominent warning and return the + remaining gaps so the caller can explicitly continue with + the data already on disk. + Defaults to "raise". + + Returns: + list[tuple[pd.Timestamp, pd.Timestamp]]: Gaps that could not be + repaired, as ``(first_missing, last_missing)`` pairs with both + bounds inclusive. An empty list means the observed part of the + interim file is complete. + + Raises: + FileNotFoundError: If no interim file exists even after merging + the raw directory (nothing to repair; run + `download_new_data()` first). + ValueError: If gaps remain and ``on_unavailable='raise'``, or if + ``on_unavailable`` is not a recognized value. + ImportError: If gaps require a download and the Python package + 'entsoe-py' is not installed. + + Examples: + ```{python} + #| eval: false + from spotforecast2_safe.downloader.entsoe import repair_data_gaps + + # Repair a hole like the June 1st-2nd 2026 incident: re-merge + # local raw files first, then download only the still-missing + # interval from ENTSO-E. + unrepaired = repair_data_gaps(api_key="YOUR_API_KEY", country_code="DE") + print(unrepaired) # [] when everything could be filled + + # Keep operating on stale data when ENTSO-E is down (logged loudly) + unrepaired = repair_data_gaps( + api_key="YOUR_API_KEY", + country_code="DE", + on_unavailable="use_existing", + ) + ``` + """ + _validate_on_unavailable(on_unavailable) + + # Step 1: repair from data that is already on disk. + merge_build_manual(keep_forecast_future=keep_forecast_future) + gaps = _interim_gaps() + if not gaps: + logger.info("No gaps detected in the interim file.") + return [] + logger.info( + "Detected %d gap(s) in the interim file after re-merging raw files: %s", + len(gaps), + "; ".join(f"{gs} -> {ge}" for gs, ge in gaps), + ) + + # Step 2: targeted downloads for the intervals still missing. + for gap_start, gap_end in gaps: + try: + download_new_data( + api_key=api_key, + country_code=country_code, + start=gap_start - pd.Timedelta(hours=1), + end=gap_end + pd.Timedelta(hours=1), + force=True, + keep_forecast_future=keep_forecast_future, + ) + except RuntimeError as exc: + logger.warning( + "Targeted download for gap %s to %s failed: %s", + gap_start, + gap_end, + exc, + ) + + remaining = _interim_gaps() + if not remaining: + logger.info("All %d gap(s) repaired.", len(gaps)) + return [] + + preview = "; ".join(f"{gs} -> {ge}" for gs, ge in remaining[:5]) + more = f" (+{len(remaining) - 5} more)" if len(remaining) > 5 else "" + if on_unavailable == "raise": + raise ValueError( + f"{len(remaining)} gap(s) in {_interim_path()} could not be " + f"repaired from raw files or ENTSO-E: [{preview}]{more}. " + "ENTSO-E has not published data for these intervals yet. Pass " + "on_unavailable='use_existing' to proceed with the gapped " + "data, or handle the gaps explicitly downstream (e.g. " + "preprocessing.LinearlyInterpolateTS)." + ) + logger.warning( + "Proceeding despite %d unrepaired gap(s): [%s]%s " + "(on_unavailable='use_existing').", + len(remaining), + preview, + more, + ) + return remaining + + def _download_entsoe_table( api_key: str, country_code: str, diff --git a/tests/test_downloader_entsoe.py b/tests/test_downloader_entsoe.py index ff785b974..692f2f30f 100644 --- a/tests/test_downloader_entsoe.py +++ b/tests/test_downloader_entsoe.py @@ -98,12 +98,24 @@ def test_download_new_data_success(self, mock_fetch, mock_get_home): @patch("spotforecast2_safe.downloader.entsoe.get_data_home") @patch("spotforecast2_safe.downloader.entsoe.fetch_data") def test_download_new_data_cooldown(self, mock_fetch, mock_get_home): - """Test that download is skipped if too recent.""" + """A download younger than 24h skips a force=False call. + + Recency comes from the newest ``entsoe_load_*.csv`` raw-file mtime -- + NOT from the width of the requested window (the old, buggy semantics + that silently skipped small backfills). + """ mock_get_home.return_value = self.test_dir - # Last index is very recent - now = pd.Timestamp.now(tz="UTC") - mock_fetch.return_value = pd.DataFrame(index=[now - pd.Timedelta(hours=2)]) + # A freshly written raw file marks a recent successful download. + (self.raw_dir / "entsoe_load_202601010000_202601020000.csv").write_text( + "Time (UTC),Actual Load\n2026-01-01 00:00,1.0\n" + ) + # Contiguous interim data: no gap-based cooldown bypass applies. + now = pd.Timestamp.now(tz="UTC").floor("h") + mock_fetch.return_value = pd.DataFrame( + {"Actual Load": [1.0, 2.0, 3.0]}, + index=pd.DatetimeIndex([now - pd.Timedelta(hours=h) for h in (5, 4, 3)]), + ) import sys from unittest.mock import MagicMock diff --git a/tests/test_entsoe_gap_repair.py b/tests/test_entsoe_gap_repair.py new file mode 100644 index 000000000..008640393 --- /dev/null +++ b/tests/test_entsoe_gap_repair.py @@ -0,0 +1,375 @@ +# SPDX-FileCopyrightText: 2026 bartzbeielstein +# SPDX-License-Identifier: AGPL-3.0-or-later + +"""Tests for ENTSO-E gap detection, gap repair, and unavailability handling. + +Covers the failure mode shown by the June 1st-2nd 2026 incident: ENTSO-E +publishes nothing for an interval, the interim file ends up with an +interior hole, and the pipeline must be able to repair it from already +downloaded raw files or a targeted re-download -- without ever inventing +values. Dates are shifted to January 2026 (safely in the past) so the +default future-row filter in ``merge_build_manual`` never interferes. +""" + +import os +import sys +from unittest.mock import MagicMock, patch + +import pandas as pd +import pytest + +from spotforecast2_safe.downloader.entsoe import ( + download_new_data, + find_missing_intervals, + merge_build_manual, + repair_data_gaps, +) + + +def _write_raw(raw_dir, name, start, periods, value=50000.0): + """Write an hourly raw CSV covering ``periods`` hours from ``start``.""" + idx = pd.date_range(start, periods=periods, freq="h", tz="UTC") + pd.DataFrame({"Time (UTC)": idx.astype(str), "Actual Load": value}).to_csv( + raw_dir / name, index=False + ) + return idx + + +def _mock_entsoe(response=None, side_effect=None): + """Inject a mocked ``entsoe`` module; return (module, client) mocks.""" + mod = MagicMock() + client = mod.EntsoePandasClient.return_value + if side_effect is not None: + client.query_load_and_forecast.side_effect = side_effect + else: + client.query_load_and_forecast.return_value = response + sys.modules["entsoe"] = mod + return mod, client + + +@pytest.fixture +def gap_env(tmp_path): + """Data home with raw files for Jan 1 and Jan 3 but nothing for Jan 2. + + Merging therefore leaves an interior 24h hole on 2026-01-02 -- the + June-1st/2nd pattern from the field incident. + """ + raw_dir = tmp_path / "raw" + raw_dir.mkdir() + (tmp_path / "interim").mkdir() + _write_raw(raw_dir, "jan01.csv", "2026-01-01 00:00", 24) + _write_raw(raw_dir, "jan03.csv", "2026-01-03 00:00", 24) + with patch( + "spotforecast2_safe.downloader.entsoe.get_data_home", return_value=tmp_path + ): + yield tmp_path + + +GAP = ( + pd.Timestamp("2026-01-02 00:00", tz="UTC"), + pd.Timestamp("2026-01-02 23:00", tz="UTC"), +) + + +class TestFindMissingIntervals: + """Unit tests for the index-completeness gap finder.""" + + def test_complete_index_has_no_gaps(self): + idx = pd.date_range("2026-01-01", periods=48, freq="h", tz="UTC") + assert find_missing_intervals(idx) == [] + + def test_detects_single_interior_gap(self): + full = pd.date_range("2026-01-01", periods=72, freq="h", tz="UTC") + gappy = full.delete(list(range(24, 48))) # Jan 2nd missing + assert find_missing_intervals(gappy) == [GAP] + + def test_detects_multiple_gaps_and_15min_resolution(self): + full = pd.date_range("2026-01-01", periods=96 * 3, freq="15min", tz="UTC") + gappy = full.delete(list(range(8, 12)) + list(range(200, 204))) + gaps = find_missing_intervals(gappy) + assert gaps == [(full[8], full[11]), (full[200], full[203])] + + def test_short_index_yields_no_gaps(self): + idx = pd.DatetimeIndex( + [ + pd.Timestamp("2026-01-01", tz="UTC"), + pd.Timestamp("2026-01-03", tz="UTC"), + ] + ) + assert find_missing_intervals(idx) == [] + + def test_unsorted_index_raises(self): + idx = pd.to_datetime(["2026-01-02", "2026-01-01", "2026-01-03"], utc=True) + with pytest.raises(ValueError, match="sorted"): + find_missing_intervals(idx) + + +class TestEmptyEntsoeResponse: + """ENTSO-E is reachable but has no data for the requested interval.""" + + def test_empty_dataframe_writes_nothing_and_warns(self, gap_env, caplog): + _, client = _mock_entsoe(response=pd.DataFrame()) + with caplog.at_level( + "WARNING", logger="spotforecast2_safe.downloader.entsoe" + ): + download_new_data( + api_key="k", start="202601020000", end="202601030000", force=True + ) + assert client.query_load_and_forecast.call_count == 1 + assert not list((gap_env / "raw").glob("entsoe_load_*.csv")) + assert any("returned no data" in m for m in caplog.messages) + + def test_all_nan_rows_write_nothing_and_warn(self, gap_env, caplog): + idx = pd.date_range("2026-01-02", periods=24, freq="h", tz="UTC") + all_nan = pd.DataFrame( + { + "Actual Load": [float("nan")] * 24, + "Forecasted Load": [float("nan")] * 24, + }, + index=idx, + ) + _mock_entsoe(response=all_nan) + with caplog.at_level( + "WARNING", logger="spotforecast2_safe.downloader.entsoe" + ): + download_new_data( + api_key="k", start="202601020000", end="202601030000", force=True + ) + assert not list((gap_env / "raw").glob("entsoe_load_*.csv")) + assert any("returned no data" in m for m in caplog.messages) + + +class TestRepairDataGaps: + """repair_data_gaps: disk first, targeted download second, never invent.""" + + def test_repair_from_raw_files_already_on_disk(self, gap_env): + """A stale interim file is healed by re-merging raw coverage; no network.""" + # Raw coverage of the hole arrived later (e.g. manual export). + _write_raw(gap_env / "raw", "jan02.csv", "2026-01-02 00:00", 24) + # Interim still carries the hole: built only from Jan 1 + Jan 3. + idx = pd.date_range("2026-01-01", periods=24, freq="h", tz="UTC").append( + pd.date_range("2026-01-03", periods=24, freq="h", tz="UTC") + ) + pd.DataFrame({"Actual Load": 50000.0}, index=idx).rename_axis( + "Time (UTC)" + ).to_csv(gap_env / "interim" / "energy_load.csv") + + _, client = _mock_entsoe(response=pd.DataFrame()) + + remaining = repair_data_gaps(api_key="k") + + assert remaining == [] + client.query_load_and_forecast.assert_not_called() + merged = pd.read_csv(gap_env / "interim" / "energy_load.csv", index_col=0) + assert pd.Timestamp("2026-01-02 12:00", tz="UTC") in pd.to_datetime( + merged.index, utc=True + ) + + def test_targeted_download_fetches_exactly_the_missing_range(self, gap_env): + """A gap with no raw coverage triggers one bounded ENTSO-E query.""" + gap_data = pd.DataFrame( + {"Actual Load": 50000.0}, + index=pd.date_range("2026-01-02 00:00", periods=24, freq="h", tz="UTC"), + ) + _, client = _mock_entsoe(response=gap_data) + + remaining = repair_data_gaps(api_key="k", country_code="DE") + + assert remaining == [] + client.query_load_and_forecast.assert_called_once() + _, kwargs = client.query_load_and_forecast.call_args + assert kwargs["country_code"] == "DE" + # Window = gap padded by 1h on each side. + assert kwargs["start"] == pd.Timestamp("2026-01-01 23:00", tz="UTC") + assert kwargs["end"] == pd.Timestamp("2026-01-03 00:00", tz="UTC") + merged = pd.read_csv(gap_env / "interim" / "energy_load.csv", index_col=0) + assert pd.Timestamp("2026-01-02 12:00", tz="UTC") in pd.to_datetime( + merged.index, utc=True + ) + + def test_unfillable_gap_raises_by_default(self, gap_env): + """ENTSO-E has nothing for the interval: fail loudly, never invent.""" + _mock_entsoe(response=pd.DataFrame()) + with pytest.raises(ValueError, match="could not be repaired"): + repair_data_gaps(api_key="k") + + def test_use_existing_returns_remaining_gaps_and_warns(self, gap_env, caplog): + _mock_entsoe(response=pd.DataFrame()) + with caplog.at_level( + "WARNING", logger="spotforecast2_safe.downloader.entsoe" + ): + remaining = repair_data_gaps(api_key="k", on_unavailable="use_existing") + assert remaining == [GAP] + assert any("unrepaired gap" in m for m in caplog.messages) + + def test_no_gaps_is_a_clean_no_op(self, gap_env): + _write_raw(gap_env / "raw", "jan02.csv", "2026-01-02 00:00", 24) + _, client = _mock_entsoe(response=pd.DataFrame()) + assert repair_data_gaps(api_key="k") == [] + client.query_load_and_forecast.assert_not_called() + + def test_invalid_on_unavailable_rejected(self, gap_env): + with pytest.raises(ValueError, match="on_unavailable"): + repair_data_gaps(api_key="k", on_unavailable="banana") + + def test_entsoe_down_keeps_gap_and_raises_by_default(self, gap_env): + """Persistent network failure during targeted download: gap survives.""" + _mock_entsoe(side_effect=RuntimeError("api down")) + with patch( + "spotforecast2_safe.downloader.entsoe.time.sleep", lambda _s: None + ): + with pytest.raises(ValueError, match="could not be repaired"): + repair_data_gaps(api_key="k") + + +class TestCooldownSemantics: + """force=False means 'recent successful download', never 'small window'.""" + + def test_force_false_backfill_of_gap_proceeds_despite_recent_download( + self, gap_env + ): + """The June-1st regression: force=False must not skip a gap backfill.""" + merge_build_manual() # interim now carries the Jan 2 hole + # A freshly written raw file marks a recent successful download. + (gap_env / "raw" / "entsoe_load_202601030000_202601040000.csv").write_text( + "Time (UTC),Actual Load\n2026-01-03 00:00,1.0\n" + ) + + gap_data = pd.DataFrame( + {"Actual Load": 50000.0}, + index=pd.date_range("2026-01-02 00:00", periods=24, freq="h", tz="UTC"), + ) + _, client = _mock_entsoe(response=gap_data) + + # Sub-24h window inside the hole, force=False: the old window-width + # cooldown silently skipped exactly this call. + download_new_data( + api_key="k", start="202601020000", end="202601021200", force=False + ) + + client.query_load_and_forecast.assert_called_once() + + def test_force_false_skips_when_recent_and_no_gap(self, gap_env): + # Heal the hole first so no gap bypass applies. + _write_raw(gap_env / "raw", "jan02.csv", "2026-01-02 00:00", 24) + merge_build_manual() + (gap_env / "raw" / "entsoe_load_202601030000_202601040000.csv").write_text( + "Time (UTC),Actual Load\n2026-01-03 00:00,1.0\n" + ) + + _, client = _mock_entsoe(response=pd.DataFrame()) + download_new_data( + api_key="k", start="202601040000", end="202601050000", force=False + ) + client.query_load_and_forecast.assert_not_called() + + def test_force_false_proceeds_when_last_download_is_stale(self, gap_env): + _write_raw(gap_env / "raw", "jan02.csv", "2026-01-02 00:00", 24) + merge_build_manual() + stale = gap_env / "raw" / "entsoe_load_202601030000_202601040000.csv" + stale.write_text("Time (UTC),Actual Load\n2026-01-03 00:00,1.0\n") + two_days_ago = pd.Timestamp.now(tz="UTC").timestamp() - 48 * 3600 + os.utime(stale, (two_days_ago, two_days_ago)) + + new_data = pd.DataFrame( + {"Actual Load": [1.0]}, + index=[pd.Timestamp("2026-01-04 00:00", tz="UTC")], + ) + _, client = _mock_entsoe(response=new_data) + download_new_data( + api_key="k", start="202601040000", end="202601050000", force=False + ) + client.query_load_and_forecast.assert_called_once() + + +class TestDownloadOnUnavailable: + """Opt-in degradation when ENTSO-E is unreachable after all retries.""" + + def test_use_existing_logs_and_returns_when_interim_exists(self, gap_env, caplog): + merge_build_manual() # interim now exists + _mock_entsoe(side_effect=RuntimeError("api down")) + with ( + patch( + "spotforecast2_safe.downloader.entsoe.time.sleep", lambda _s: None + ), + caplog.at_level("WARNING", logger="spotforecast2_safe.downloader.entsoe"), + ): + download_new_data( + api_key="k", + start="202601040000", + end="202601050000", + force=True, + on_unavailable="use_existing", + ) + assert any( + "Proceeding with the existing interim data" in m for m in caplog.messages + ) + + def test_use_existing_without_interim_still_raises(self, tmp_path): + """Degradation needs data to degrade onto; with none, fail loudly.""" + (tmp_path / "raw").mkdir() + (tmp_path / "interim").mkdir() + _mock_entsoe(side_effect=RuntimeError("api down")) + with ( + patch( + "spotforecast2_safe.downloader.entsoe.get_data_home", + return_value=tmp_path, + ), + patch( + "spotforecast2_safe.downloader.entsoe.time.sleep", lambda _s: None + ), + ): + with pytest.raises(RuntimeError, match="5 attempts"): + download_new_data( + api_key="k", + start="202601040000", + end="202601050000", + force=True, + on_unavailable="use_existing", + ) + + def test_default_raise_preserved(self, gap_env): + _mock_entsoe(side_effect=RuntimeError("api down")) + with patch( + "spotforecast2_safe.downloader.entsoe.time.sleep", lambda _s: None + ): + with pytest.raises(RuntimeError, match="5 attempts"): + download_new_data( + api_key="k", start="202601040000", end="202601050000", force=True + ) + + def test_invalid_on_unavailable_rejected(self, gap_env): + with pytest.raises(ValueError, match="on_unavailable"): + download_new_data(api_key="k", on_unavailable="nope") + + +class TestEmptyWindowGuard: + """end <= start: invalid input when explicit, clean no-op when derived.""" + + def test_explicit_inverted_window_raises(self, gap_env): + _mock_entsoe(response=pd.DataFrame()) + with pytest.raises(ValueError, match="not after"): + download_new_data( + api_key="k", start="202601050000", end="202601040000", force=True + ) + + def test_incremental_up_to_date_returns_quietly(self, tmp_path, caplog): + """A resume landing past ``end`` is 'already up to date', not an error.""" + (tmp_path / "raw").mkdir() + (tmp_path / "interim").mkdir() + now = pd.Timestamp.now(tz="UTC").floor("h") + idx = pd.date_range(now - pd.Timedelta(hours=5), now, freq="h") + pd.DataFrame({"Actual Load": 1.0}, index=idx).rename_axis( + "Time (UTC)" + ).to_csv(tmp_path / "interim" / "energy_load.csv") + _, client = _mock_entsoe(response=pd.DataFrame()) + with ( + patch( + "spotforecast2_safe.downloader.entsoe.get_data_home", + return_value=tmp_path, + ), + caplog.at_level("INFO", logger="spotforecast2_safe.downloader.entsoe"), + ): + download_new_data(api_key="k", force=True) + client.query_load_and_forecast.assert_not_called() + assert any("Nothing to download" in m for m in caplog.messages) diff --git a/tests/test_entsoe_integration.py b/tests/test_entsoe_integration.py index 848874c4e..29eaf0289 100644 --- a/tests/test_entsoe_integration.py +++ b/tests/test_entsoe_integration.py @@ -220,19 +220,26 @@ def test_download_new_data_resume_skips_future_rows(tmp_path): def test_download_new_data_cooldown_skips(tmp_path): - # Patch get_data_home, fetch_data, and sys.modules["entsoe"] + # A recent successful download (newest entsoe_load_* mtime) skips a + # force=False call; recency, not window width, drives the cooldown. raw_dir = tmp_path / "raw" raw_dir.mkdir() interim_dir = tmp_path / "interim" interim_dir.mkdir() + (raw_dir / "entsoe_load_202601010000_202601020000.csv").write_text( + "Time (UTC),Actual Load\n2026-01-01 00:00,1.0\n" + ) with ( patch( "spotforecast2_safe.downloader.entsoe.get_data_home", return_value=tmp_path ), patch("spotforecast2_safe.downloader.entsoe.fetch_data") as mock_fetch, ): - now = pd.Timestamp.now(tz="UTC") - mock_fetch.return_value = pd.DataFrame(index=[now - pd.Timedelta(hours=2)]) + now = pd.Timestamp.now(tz="UTC").floor("h") + mock_fetch.return_value = pd.DataFrame( + {"Actual Load": [1.0, 2.0, 3.0]}, + index=pd.DatetimeIndex([now - pd.Timedelta(hours=h) for h in (5, 4, 3)]), + ) import sys mock_entsoe_mod = MagicMock()