diff --git a/_freeze/docs/tasks/task_multi/execute-results/html.json b/_freeze/docs/tasks/task_multi/execute-results/html.json index 22db59750..da719314c 100644 --- a/_freeze/docs/tasks/task_multi/execute-results/html.json +++ b/_freeze/docs/tasks/task_multi/execute-results/html.json @@ -1,8 +1,8 @@ { - "hash": "805f9a9e46255e27fc62cc8c3251106c", + "hash": "22d2d1fff17907c923df9f768befd71d", "result": { "engine": "jupyter", - "markdown": "---\ntitle: \"task_multi: Config-Driven Multi-Target Forecasting with MultiTask\"\ndescription: \"What spotforecast2_safe.multitask provides, how ConfigMulti drives it, and a complete runnable example.\"\n---\n\n`spotforecast2_safe.multitask` is the config-driven orchestrator for\nmulti-target time-series forecasting. It owns the complete pipeline —\ndata preparation, outlier handling, imputation, exogenous features,\ntraining, prediction, persistence — and is driven by a single\n`ConfigMulti` object. The unrestricted sibling package `spotforecast2`\ninherits this pipeline and adds only what is deliberately excluded here:\nhyperparameter tuning (Optuna, SpotOptim) and interactive plotting.\n\nBefore version 16.0.0 this pipeline existed twice: once in the sibling\npackage and once, in procedural form, behind the n-to-1 task's 18 keyword\narguments and a hard-coded weight list. Both paths are now one\nimplementation in this package, and the dependency between the siblings\nis strictly one-way: `spotforecast2` imports from `spotforecast2-safe`,\nnever the reverse.\n\n## Vocabulary\n\n::: {#def-multitask}\n\n## MultiTask\n\nThe task dispatcher of the `multitask` package. 
A `MultiTask` instance is\nconstructed from a `ConfigMulti` and a DataFrame, prepared with the four\npipeline stages (`prepare_data`, `detect_outliers`, `impute`,\n`build_exogenous_features`), and executed with `run(task=...)`, where\n`task` selects one of the available task modes.\n\n:::\n\n::: {#def-n-to-1-aggregation}\n\n## N-to-1 aggregation\n\nThe reduction of per-target forecasts to a single series as a weighted\nsum, with weights taken from `ConfigMulti.agg_weights` in target order.\nEqual weights are used when `agg_weights` is `None`.\n\n:::\n\n## Task modes\n\nThe safe package ships four task modes:\n\n| Mode | What it does |\n| --- | --- |\n| `lazy` | Fit one `ForecasterRecursive` per target with LightGBM defaults, applying cached tuning results when present. |\n| `defaults` | Same fit, but ignoring any tuning cache — fully deterministic baseline. |\n| `predict` | Load previously saved models and predict without retraining. |\n| `clean` | Remove the pipeline's cache directory (models, tuning results, logs). 
|\n\nThe tuning modes `optuna` and `spotoptim` exist only in `spotforecast2`;\nrequesting them here raises an explicit `ValueError` (see\n[Fail-safe behaviour](#fail-safe-behaviour)).\n\n## A complete worked example\n\n::: {#exm-synthetic-config}\n\n## Synthetic data and a minimal configuration\n\nTwo hourly target series over four weeks, a named `DatetimeIndex`\nmatching `ConfigMulti.index_name` (default `\"DateTime\"`), and a\nconfiguration with the expensive options disabled so the example runs in\nseconds and offline.\n\n::: {#09d15d78 .cell execution_count=1}\n``` {.python .cell-code}\nimport tempfile\nimport warnings\n\nimport numpy as np\nimport pandas as pd\n\nfrom spotforecast2_safe.configurator.config_multi import ConfigMulti\n\nwarnings.filterwarnings(\"ignore\")\n\nrng = np.random.default_rng(0)\nn = 24 * 28 # 4 weeks, hourly\nidx = pd.date_range(\"2023-01-01\", periods=n, freq=\"h\", tz=\"UTC\")\nidx.name = \"DateTime\"\ndf = pd.DataFrame(\n {\n \"a\": 100 + 10 * np.sin(np.arange(n) * 2 * np.pi / 24) + rng.normal(0, 2, n),\n \"b\": 200 + 20 * np.cos(np.arange(n) * 2 * np.pi / 24) + rng.normal(0, 4, n),\n },\n index=idx,\n)\n\ncache = tempfile.mkdtemp()\ncfg = ConfigMulti(\n predict_size=6, # forecast horizon: 6 hours\n agg_weights=[1.0, -1.0], # n-to-1 combination: a - b\n use_exogenous_features=False, # offline example: no weather/calendar\n use_outlier_detection=False,\n auto_save_models=True, # persist models for the predict mode below\n number_folds=2,\n random_state=42,\n verbose=False,\n)\ndf.tail(3)\n```\n\n::: {.cell-output .cell-output-display execution_count=1}\n```{=html}\n
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
ab
DateTime
2023-01-28 21:00:00+00:0093.591621219.025208
2023-01-28 22:00:00+00:0094.698887220.480717
2023-01-28 23:00:00+00:0097.692551224.620083
\n
\n```\n:::\n:::\n\n\n:::\n\n::: {#exm-pipeline-run}\n\n## Running the pipeline\n\nThe four stages chain (each returns the task), then `run` fits and\npredicts. The returned aggregated package carries the combined forecast\nunder `\"future_pred\"`; the per-target packages live in `task.results`.\n\n::: {#04c3f34a .cell execution_count=2}\n``` {.python .cell-code}\nfrom spotforecast2_safe.multitask import MultiTask\n\nmt = MultiTask(cfg, dataframe=df, cache_home=cache)\nresult = (\n mt.prepare_data()\n .detect_outliers()\n .impute()\n .build_exogenous_features()\n .run(task=\"defaults\")\n)\nresult[\"future_pred\"]\n```\n\n::: {.cell-output .cell-output-display execution_count=2}\n```\n2023-01-29 00:00:00+00:00 -117.612059\n2023-01-29 01:00:00+00:00 -120.726869\n2023-01-29 02:00:00+00:00 -117.511861\n2023-01-29 03:00:00+00:00 -107.689754\n2023-01-29 04:00:00+00:00 -103.317560\n2023-01-29 05:00:00+00:00 -95.314405\nFreq: h, dtype: float64\n```\n:::\n:::\n\n\n:::\n\n::: {#exm-weighted-aggregation}\n\n## The aggregation is exactly the configured weighted sum\n\n@def-n-to-1-aggregation can be verified directly against the per-target\nforecasts:\n\n::: {#29207c2c .cell execution_count=3}\n``` {.python .cell-code}\npred_a = mt.results[\"defaults\"][\"a\"][\"future_pred\"]\npred_b = mt.results[\"defaults\"][\"b\"][\"future_pred\"]\nmanual = 1.0 * pred_a + (-1.0) * pred_b\n\nprint(\"max |aggregated - manual| =\", float((result[\"future_pred\"] - manual).abs().max()))\n```\n\n::: {.cell-output .cell-output-stdout}\n```\nmax |aggregated - manual| = 0.0\n```\n:::\n:::\n\n\n:::\n\n::: {#exm-train-then-predict}\n\n## Train once, predict many times\n\nWith `auto_save_models=True` the fitted forecasters were persisted under\n`cache_home`. 
A later `predict` run loads them instead of retraining —\nthe production pattern for scheduled forecasts:\n\n::: {#962303e8 .cell execution_count=4}\n``` {.python .cell-code}\nmt2 = MultiTask(cfg, dataframe=df, cache_home=cache)\nmt2.prepare_data().detect_outliers().impute().build_exogenous_features()\nreloaded = mt2.run(task=\"predict\")\nreloaded[\"future_pred\"]\n```\n\n::: {.cell-output .cell-output-display execution_count=4}\n```\n2023-01-29 00:00:00+00:00 -117.612059\n2023-01-29 01:00:00+00:00 -120.726869\n2023-01-29 02:00:00+00:00 -117.511861\n2023-01-29 03:00:00+00:00 -107.689754\n2023-01-29 04:00:00+00:00 -103.317560\n2023-01-29 05:00:00+00:00 -95.314405\nFreq: h, dtype: float64\n```\n:::\n:::\n\n\n:::\n\n::: {#exm-determinism}\n\n## Determinism\n\nSame input, same configuration, bit-identical output — a hard requirement\nof this package, enforced by the test suite and demonstrable here with a\nfresh instance in a fresh cache directory:\n\n::: {#50011930 .cell execution_count=5}\n``` {.python .cell-code}\nmt3 = MultiTask(cfg, dataframe=df.copy(), cache_home=tempfile.mkdtemp())\nrerun = (\n mt3.prepare_data()\n .detect_outliers()\n .impute()\n .build_exogenous_features()\n .run(task=\"defaults\")\n)\npd.testing.assert_series_equal(result[\"future_pred\"], rerun[\"future_pred\"], check_exact=True)\nprint(\"bit-identical:\", result[\"future_pred\"].equals(rerun[\"future_pred\"]))\n```\n\n::: {.cell-output .cell-output-stdout}\n```\nbit-identical: True\n```\n:::\n:::\n\n\n:::\n\n## Fail-safe behaviour\n\nInvalid requests raise immediately instead of degrading silently.\nRequesting a tuning mode in the safe package names the package that\nprovides it:\n\n::: {#9f75dc65 .cell execution_count=6}\n``` {.python .cell-code}\ntry:\n mt.run(task=\"spotoptim\")\nexcept ValueError as err:\n print(err)\n```\n\n::: {.cell-output .cell-output-stdout}\n```\nTask 'spotoptim' requires auto-tuning, which is not available in spotforecast2-safe. 
Use the spotforecast2 package, or task='lazy'/'defaults'.\n```\n:::\n:::\n\n\nThe same policy applies to unexpected keyword arguments (`TypeError`\ninstead of silent dropping) and to plotting:\n`MultiTask.plot_with_outliers()` raises `NotImplementedError` because no\nplotting library is permitted in this package.\n\n## The n-to-1 task entry point\n\n[`run_pipeline`](../reference/tasks.task_safe_n_to_1_with_covariates_and_dataframe.qmd) from\n`task_safe_n_to_1_with_covariates_and_dataframe`\nwraps exactly this pipeline with `task=\"lazy\"` — one call from config and\nDataFrame to combined forecast:\n\n::: {#1fbfa109 .cell execution_count=7}\n``` {.python .cell-code}\nfrom spotforecast2_safe.tasks.task_safe_n_to_1_with_covariates_and_dataframe import (\n run_pipeline,\n)\n\nforecast = run_pipeline(config=cfg, dataframe=df, cache_home=cache)\nforecast.head(3)\n```\n\n::: {.cell-output .cell-output-display execution_count=7}\n```{=html}\n
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
forecast
2023-01-29 00:00:00+00:00-117.612059
2023-01-29 01:00:00+00:00-120.726869
2023-01-29 02:00:00+00:00-117.511861
\n
\n```\n:::\n:::\n\n\nThe matching console script accepts the same knobs as flags:\n\n```bash\nuv run spotforecast-safe-n2o1-cov-df --forecast_horizon 24 --lags 24 \\\n --include_holiday_features true\n```\n\n## Scaling up from the toy example\n\nFor a real run, switch the feature machinery on instead of off:\n`use_exogenous_features=True` with `include_holiday_features`,\n`include_holiday_adjacency_features` (bridge days), and\n`include_weather_windows` adds calendar, holiday, day/night, weather, and\npolynomial-interaction covariates before training. Weather features\nrequire network access; `on_weather_failure` keeps its fail-safe default\n`\"raise\"` unless you explicitly opt into `\"skip\"`.\n\n## Upgrade path: the same config in spotforecast2\n\nThe unrestricted sibling subclasses this pipeline and re-adds tuning and\nplotting. The configuration object travels unchanged:\n\n```python\n# spotforecast2 (not installable here — one-way dependency)\nfrom spotforecast2.multitask import MultiTask\n\nmt = MultiTask(cfg, dataframe=df, task=\"spotoptim\")\nmt.prepare_data().detect_outliers().impute().build_exogenous_features()\nmt.run(show=True) # hyperparameter search + interactive figures\n```\n\n::: {.callout-note}\nThe dependency between the packages is strictly one-way:\n`spotforecast2` imports from `spotforecast2-safe`, never the reverse.\nThat is why the cell above is a listing rather than executed code — this\ndocumentation builds in an environment where `spotforecast2` is, by\ndesign, absent.\n:::\n\n## Where to go next\n\n- API reference: [`run_pipeline`](../reference/tasks.task_safe_n_to_1_with_covariates_and_dataframe.qmd) — the CLI-facing wrapper around this pipeline.\n- API reference: [`MultiTask`](../reference/multitask.multi.MultiTask.qmd), [`BaseTask`](../reference/multitask.base.BaseTask.qmd), [`ConfigMulti`](../reference/configurator.config_multi.ConfigMulti.qmd), [`runner.run`](../reference/multitask.runner.run.qmd).\n\n", + "markdown": "---\ntitle: 
\"task_multi: Config-Driven Multi-Target Forecasting with MultiTask\"\ndescription: \"What spotforecast2_safe.multitask provides, how ConfigMulti drives it, and a complete runnable example.\"\n---\n\n`spotforecast2_safe.multitask` is the config-driven orchestrator for\nmulti-target time-series forecasting. It owns the complete pipeline —\ndata preparation, outlier handling, imputation, exogenous features,\ntraining, prediction, persistence — and is driven by a single\n`ConfigMulti` object. The unrestricted sibling package `spotforecast2`\ninherits this pipeline and adds only what is deliberately excluded here:\nhyperparameter tuning (Optuna, SpotOptim) and interactive plotting.\n\nBefore version 16.0.0 this pipeline existed twice: once in the sibling\npackage and once, in procedural form, behind the n-to-1 task's 18 keyword\narguments and a hard-coded weight list. Both paths are now one\nimplementation in this package, and the dependency between the siblings\nis strictly one-way: `spotforecast2` imports from `spotforecast2-safe`,\nnever the reverse.\n\n## Vocabulary\n\n::: {#def-multitask}\n\n## MultiTask\n\nThe task dispatcher of the `multitask` package. A `MultiTask` instance is\nconstructed from a `ConfigMulti` and a DataFrame, prepared with the four\npipeline stages (`prepare_data`, `detect_outliers`, `impute`,\n`build_exogenous_features`), and executed with `run(task=...)`, where\n`task` selects one of the available task modes.\n\n:::\n\n::: {#def-n-to-1-aggregation}\n\n## N-to-1 aggregation\n\nThe reduction of per-target forecasts to a single series as a weighted\nsum, with weights taken from `ConfigMulti.agg_weights` in target order.\nEqual weights are used when `agg_weights` is `None`.\n\n:::\n\n## Task modes\n\nThe safe package ships four task modes:\n\n| Mode | What it does |\n| --- | --- |\n| `lazy` | Fit one `ForecasterRecursive` per target with LightGBM defaults, applying cached tuning results when present. 
|\n| `defaults` | Same fit, but ignoring any tuning cache — fully deterministic baseline. |\n| `predict` | Load previously saved models and predict without retraining. |\n| `clean` | Remove the pipeline's cache directory (models, tuning results, logs). |\n\nThe tuning modes `optuna` and `spotoptim` exist only in `spotforecast2`;\nrequesting them here raises an explicit `ValueError` (see\n[Fail-safe behaviour](#fail-safe-behaviour)).\n\n## A complete worked example\n\n::: {#exm-synthetic-config}\n\n## Synthetic data and a minimal configuration\n\nTwo hourly target series over four weeks, a named `DatetimeIndex`\nmatching `ConfigMulti.index_name` (default `\"DateTime\"`), and a\nconfiguration with the expensive options disabled so the example runs in\nseconds and offline.\n\n::: {#d1445518 .cell execution_count=1}\n``` {.python .cell-code}\nimport tempfile\nimport warnings\n\nimport numpy as np\nimport pandas as pd\n\nfrom spotforecast2_safe.configurator.config_multi import ConfigMulti\n\nwarnings.filterwarnings(\"ignore\")\n\nrng = np.random.default_rng(0)\nn = 24 * 28 # 4 weeks, hourly\nidx = pd.date_range(\"2023-01-01\", periods=n, freq=\"h\", tz=\"UTC\")\nidx.name = \"DateTime\"\ndf = pd.DataFrame(\n {\n \"a\": 100 + 10 * np.sin(np.arange(n) * 2 * np.pi / 24) + rng.normal(0, 2, n),\n \"b\": 200 + 20 * np.cos(np.arange(n) * 2 * np.pi / 24) + rng.normal(0, 4, n),\n },\n index=idx,\n)\n\ncache = tempfile.mkdtemp()\ncfg = ConfigMulti(\n predict_size=6, # forecast horizon: 6 hours\n agg_weights=[1.0, -1.0], # n-to-1 combination: a - b\n use_exogenous_features=False, # offline example: no weather/calendar\n use_outlier_detection=False,\n auto_save_models=True, # persist models for the predict mode below\n number_folds=2,\n random_state=42,\n verbose=False,\n)\ndf.tail(3)\n```\n\n::: {.cell-output .cell-output-display execution_count=1}\n```{=html}\n
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
ab
DateTime
2023-01-28 21:00:00+00:0093.591621219.025208
2023-01-28 22:00:00+00:0094.698887220.480717
2023-01-28 23:00:00+00:0097.692551224.620083
\n
\n```\n:::\n:::\n\n\n:::\n\n::: {#exm-pipeline-run}\n\n## Running the pipeline\n\nThe four stages chain (each returns the task), then `run` fits and\npredicts. The returned aggregated package carries the combined forecast\nunder `\"future_pred\"`; the per-target packages live in `task.results`.\n\n::: {#52262fa4 .cell execution_count=2}\n``` {.python .cell-code}\nfrom spotforecast2_safe.multitask import MultiTask\n\nmt = MultiTask(cfg, dataframe=df, cache_home=cache)\nresult = (\n mt.prepare_data()\n .detect_outliers()\n .impute()\n .build_exogenous_features()\n .run(task=\"defaults\")\n)\nresult[\"future_pred\"]\n```\n\n::: {.cell-output .cell-output-display execution_count=2}\n```\n2023-01-29 00:00:00+00:00 -117.612059\n2023-01-29 01:00:00+00:00 -120.726869\n2023-01-29 02:00:00+00:00 -117.511861\n2023-01-29 03:00:00+00:00 -107.689754\n2023-01-29 04:00:00+00:00 -103.317560\n2023-01-29 05:00:00+00:00 -95.314405\nFreq: h, dtype: float64\n```\n:::\n:::\n\n\n:::\n\n::: {#exm-weighted-aggregation}\n\n## The aggregation is exactly the configured weighted sum\n\n@def-n-to-1-aggregation can be verified directly against the per-target\nforecasts:\n\n::: {#253c1e10 .cell execution_count=3}\n``` {.python .cell-code}\npred_a = mt.results[\"defaults\"][\"a\"][\"future_pred\"]\npred_b = mt.results[\"defaults\"][\"b\"][\"future_pred\"]\nmanual = 1.0 * pred_a + (-1.0) * pred_b\n\nprint(\"max |aggregated - manual| =\", float((result[\"future_pred\"] - manual).abs().max()))\n```\n\n::: {.cell-output .cell-output-stdout}\n```\nmax |aggregated - manual| = 0.0\n```\n:::\n:::\n\n\n:::\n\n::: {#exm-train-then-predict}\n\n## Train once, predict many times\n\nWith `auto_save_models=True` the fitted forecasters were persisted under\n`cache_home`. 
A later `predict` run loads them instead of retraining —\nthe production pattern for scheduled forecasts:\n\n::: {#dd8dcc17 .cell execution_count=4}\n``` {.python .cell-code}\nmt2 = MultiTask(cfg, dataframe=df, cache_home=cache)\nmt2.prepare_data().detect_outliers().impute().build_exogenous_features()\nreloaded = mt2.run(task=\"predict\")\nreloaded[\"future_pred\"]\n```\n\n::: {.cell-output .cell-output-display execution_count=4}\n```\n2023-01-29 00:00:00+00:00 -117.612059\n2023-01-29 01:00:00+00:00 -120.726869\n2023-01-29 02:00:00+00:00 -117.511861\n2023-01-29 03:00:00+00:00 -107.689754\n2023-01-29 04:00:00+00:00 -103.317560\n2023-01-29 05:00:00+00:00 -95.314405\nFreq: h, dtype: float64\n```\n:::\n:::\n\n\n:::\n\n::: {#exm-determinism}\n\n## Determinism\n\nSame input, same configuration, bit-identical output — a hard requirement\nof this package, enforced by the test suite and demonstrable here with a\nfresh instance in a fresh cache directory:\n\n::: {#bdf7e22f .cell execution_count=5}\n``` {.python .cell-code}\nmt3 = MultiTask(cfg, dataframe=df.copy(), cache_home=tempfile.mkdtemp())\nrerun = (\n mt3.prepare_data()\n .detect_outliers()\n .impute()\n .build_exogenous_features()\n .run(task=\"defaults\")\n)\npd.testing.assert_series_equal(result[\"future_pred\"], rerun[\"future_pred\"], check_exact=True)\nprint(\"bit-identical:\", result[\"future_pred\"].equals(rerun[\"future_pred\"]))\n```\n\n::: {.cell-output .cell-output-stdout}\n```\nbit-identical: True\n```\n:::\n:::\n\n\n:::\n\n## Fail-safe behaviour\n\nInvalid requests raise immediately instead of degrading silently.\nRequesting a tuning mode in the safe package names the package that\nprovides it:\n\n::: {#0861e84e .cell execution_count=6}\n``` {.python .cell-code}\ntry:\n mt.run(task=\"spotoptim\")\nexcept ValueError as err:\n print(err)\n```\n\n::: {.cell-output .cell-output-stdout}\n```\nTask 'spotoptim' requires auto-tuning, which is not available in spotforecast2-safe. 
Use the spotforecast2 package, or task='lazy'/'defaults'.\n```\n:::\n:::\n\n\nThe same policy applies to unexpected keyword arguments (`TypeError`\ninstead of silent dropping) and to plotting:\n`MultiTask.plot_with_outliers()` raises `NotImplementedError` because no\nplotting library is permitted in this package.\n\n## Scaling up from the toy example\n\nFor a real run, switch the feature machinery on instead of off:\n`use_exogenous_features=True` with `include_holiday_features`,\n`include_holiday_adjacency_features` (bridge days), and\n`include_weather_windows` adds calendar, holiday, day/night, weather, and\npolynomial-interaction covariates before training. Weather features\nrequire network access; `on_weather_failure` keeps its fail-safe default\n`\"raise\"` unless you explicitly opt into `\"skip\"`.\n\n## Upgrade path: the same config in spotforecast2\n\nThe unrestricted sibling subclasses this pipeline and re-adds tuning and\nplotting. The configuration object travels unchanged:\n\n```python\n# spotforecast2 (not installable here — one-way dependency)\nfrom spotforecast2.multitask import MultiTask\n\nmt = MultiTask(cfg, dataframe=df, task=\"spotoptim\")\nmt.prepare_data().detect_outliers().impute().build_exogenous_features()\nmt.run(show=True) # hyperparameter search + interactive figures\n```\n\n::: {.callout-note}\nThe dependency between the packages is strictly one-way:\n`spotforecast2` imports from `spotforecast2-safe`, never the reverse.\nThat is why the cell above is a listing rather than executed code — this\ndocumentation builds in an environment where `spotforecast2` is, by\ndesign, absent.\n:::\n\n## Where to go next\n\n- API reference: [`MultiTask`](../reference/multitask.multi.MultiTask.qmd), [`BaseTask`](../reference/multitask.base.BaseTask.qmd), [`ConfigMulti`](../reference/configurator.config_multi.ConfigMulti.qmd), [`runner.run`](../reference/multitask.runner.run.qmd).\n\n", "supporting": [ "task_multi_files/figure-html" ], diff --git 
a/_freeze/docs/tutorials/n2n_predict_with_covariates_demo10/execute-results/html.json b/_freeze/docs/tutorials/n2n_predict_with_covariates_demo10/execute-results/html.json index 4a3432628..8e5863a9a 100644 --- a/_freeze/docs/tutorials/n2n_predict_with_covariates_demo10/execute-results/html.json +++ b/_freeze/docs/tutorials/n2n_predict_with_covariates_demo10/execute-results/html.json @@ -1,15 +1,15 @@ { - "hash": "bb144a70993a7911209f9510a954522d", + "hash": "f926f981266aeb204419b4a14cbc98f5", "result": { "engine": "jupyter", - "markdown": "---\ntitle: \"n2n_predict_with_covariates on demo10: Step by Step on Real Data\"\ndescription: \"Sibling walkthrough that executes each pipeline stage from n2n_predict_with_covariates.py against the bundled demo10.csv, one helper at a time.\"\nexecute:\n freeze: true\n warning: false\n---\n\n## What this page does\n\nThis is the executable companion to\n[*n2n_predict_with_covariates*: A Beginner's Walkthrough](n2n_predict_with_covariates_explained.qmd).\nThe sibling page introduces the vocabulary and explains the *why* of each\nstage with small synthetic examples. This page does the opposite: every code\ncell executes one real stage of\n`spotforecast2_safe.processing.n2n_predict_with_covariates` against the\nbundled `demo10.csv` dataset, prints the intermediate state, and then\ndiscusses what just happened. Refer back to the sibling page for the\nvocabulary — `@def-time-series`, `@def-lag`, `@def-forecast-horizon`,\n`@def-recursive-forecaster`, `@def-train-val-test`, `@def-outlier`,\n`@def-imputation`, `@def-sample-weight`, `@def-cyclical-encoding`,\n`@def-persistence` — all definitions are reused unchanged.\n\nThe closing section calls `n2n_predict_with_covariates` end-to-end on the\nsame input as a sanity check that the per-stage breakdown above is\nequivalent to the orchestrator.\n\n## About `demo10.csv`\n\n`demo10.csv` is bundled in the wheel at\n`spotforecast2_safe/datasets/csv/demo10.csv`. 
It carries eleven numeric\ncolumns A–K, indexed by hourly UTC timestamps from December 2019 through\nDecember 2021 — about 18 000 rows, roughly two years. Column C comes\nonline late (mid-May 2021) and starts with a long run of `NaN` covering\nmost of the window, which makes it a useful exercise for the imputation\nstep in Stage 3.\n\n`demo10.csv` is the compact sibling of the much larger `demo100.csv`\n(about 96 000 rows spanning 2010–2021). Its two-year span already covers\nevery helper faithfully — the 7-day rolling weather window in Stage 4 and\nthe 80/20 temporal split in Stage 8 both have ample data — while keeping\nrender times short, so no row slicing is needed before Stage 1 begins.\n\n## Setup\n\n::: {#exm-setup}\n\n## Loading demo10\n\n::: {#ef81f756 .cell execution_count=1}\n``` {.python .cell-code}\nfrom pathlib import Path\n\nimport pandas as pd\n\nfrom spotforecast2_safe.data.fetch_data import fetch_data, get_package_data_home\n\ndata_demo = fetch_data(\n filename=get_package_data_home() / \"demo10.csv\",\n timezone=\"UTC\",\n)\n\ncache_home = Path.home() / \".spotforecast2_cache\"\n\nprint(\"shape :\", data_demo.shape)\nprint(\"window :\", data_demo.index[0], \"→\", data_demo.index[-1])\nprint(\"NaN per col :\")\nprint(data_demo.isna().sum())\n```\n\n::: {.cell-output .cell-output-stdout}\n```\nshape : (18118, 11)\nwindow : 2019-12-01 00:00:00+00:00 → 2021-12-24 21:00:00+00:00\nNaN per col :\nA 0\nB 0\nC 12707\nD 0\nE 0\nF 0\nG 0\nH 0\nI 0\nJ 0\nK 0\ndtype: int64\n```\n:::\n:::\n\n\n:::\n\n`demo10` is loaded whole — its two-year span is already small enough to\nrender quickly, so no row slicing is required and every stage below sees\nthe full series. 
`cache_home` points at the same directory that\n`get_cache_home()` would resolve to (`~/.spotforecast2_cache`); reusing it\nmeans subsequent renders read the weather parquet cache instead of\nrefetching from Open-Meteo.\n\n## Stage 1 — Loading and preparing the target series\n\nA forecasting model needs a clean, regularly spaced time series with a\nknown time zone. Stage 1 turns whatever the caller supplied into exactly\nthat. `get_start_end` returns four boundary timestamps: `start` and `end`\ndelimit the historical window used for training, and `cov_start` /\n`cov_end` are the same window extended forward by `forecast_horizon`\nsteps — that extension is the future window for which covariates must be\nconstructed in Stage 4.\n\n::: {#exm-s1-fetch}\n\n## Boundary timestamps, hourly resample\n\n::: {#7bbe5375 .cell execution_count=2}\n``` {.python .cell-code}\nfrom spotforecast2_safe.preprocessing.curate_data import (\n agg_and_resample_data,\n basic_ts_checks,\n get_start_end,\n)\n\nforecast_horizon = 24\n\ndata = data_demo\nstart, end, cov_start, cov_end = get_start_end(\n data=data,\n forecast_horizon=forecast_horizon,\n verbose=False,\n)\n\nbasic_ts_checks(data, verbose=False)\ndata = agg_and_resample_data(data, verbose=False)\n\ntarget_columns = data.columns.tolist()\n\nprint(\"start :\", start)\nprint(\"end :\", end)\nprint(\"cov_start :\", cov_start)\nprint(\"cov_end :\", cov_end)\nprint(\"targets :\", target_columns)\nprint(\"shape :\", data.shape)\ndata.head(2)\n```\n\n::: {.cell-output .cell-output-stdout}\n```\nstart : 2019-12-01T00:00\nend : 2021-12-24T21:00\ncov_start : 2019-12-01T00:00\ncov_end : 2021-12-25T21:00\ntargets : ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K']\nshape : (18118, 11)\n```\n:::\n\n::: {.cell-output .cell-output-display execution_count=2}\n```{=html}\n
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
ABCDEFGHIJK
DateTime
2019-12-01 00:00:00+00:00-3559.5040863362.121912NaN190.0749613021.163333-51.98141413.8232065721.520176-151.096582291.088031-211.710112
2019-12-01 01:00:00+00:00-4847.3736513545.130408NaN-594.8348222382.217897197.43058313.0968482824.63207390.163097247.983128-274.333392
\n
\n```\n:::\n:::\n\n\n:::\n\n`cov_end` lies exactly `forecast_horizon` hours past `end` — that is the\nwindow for which Stage 4 will assemble covariates and Stage 10 will emit\npredictions. `basic_ts_checks` confirms the index is a strict, gap-free,\ntimezone-aware `DatetimeIndex`; if it were not, `agg_and_resample_data`\ncould not safely enforce the hourly grid.\n\n::: {.callout-note}\n## Why a strict, gap-free hourly index matters\n\nA recursive forecaster (see `@def-recursive-forecaster` in the sibling\npage) looks up lag 1, lag 24, etc. by *positional* offset. A skipped hour\nwould silently turn \"lag 24\" into \"25 wall-clock hours ago\", which is the\nkind of error that compounds for hundreds of forecast steps before it is\nnoticed.\n:::\n\n## Stage 2 — Outlier detection and removal\n\nStage 2 replaces likely sensor glitches with `NaN` so Stage 3 can treat\nthem the same way it treats genuine gaps. An Isolation Forest is applied\nper column with `contamination=0.01`, meaning about one percent of rows\nper column should be flagged. 
The `random_state=1234` is fixed so two\nruns on identical input produce identical outlier flags.\n\n::: {#exm-s2-outliers}\n\n## Marking outliers with Isolation Forest\n\n::: {#45c663cc .cell execution_count=3}\n``` {.python .cell-code}\nfrom spotforecast2_safe.preprocessing.outlier import mark_outliers\n\ndata, outliers = mark_outliers(\n data,\n contamination=0.01,\n random_state=1234,\n verbose=False,\n)\n\nprint(\"outliers flagged :\", int((outliers == -1).sum()) if hasattr(outliers, \"__iter__\") else \"—\")\nprint(\"NaN per column after outlier removal:\")\nprint(data.isna().sum())\n```\n\n::: {.cell-output .cell-output-stdout}\n```\noutliers flagged : 181\nNaN per column after outlier removal:\nA 182\nB 181\nC 12889\nD 181\nE 177\nF 172\nG 182\nH 177\nI 182\nJ 181\nK 181\ndtype: int64\n```\n:::\n:::\n\n\n:::\n\nThe per-column `NaN` counts now combine pre-existing gaps (notably the\nlong leading run in column C) with the freshly removed outlier rows.\nStage 3 will not distinguish between the two: imputation closes the gaps,\nand sample weighting penalises the model for paying attention to either\nsource.\n\n## Stage 3 — Imputation and sample weighting\n\n`get_missing_weights` forward- and backward-fills every `NaN`, then\nbuilds a per-row weight series: rows whose `window_size=72` neighbourhood\ntouched an imputed cell receive weight `0`; all other rows receive `1`.\nThe weights are wrapped in a `WeightFunction`, which is a picklable class\nthat the forecaster can carry into a `joblib` dump alongside the model\nitself — a closure would survive `pickle.dumps` but not a `pickle.load`\nin a fresh process.\n\n::: {#exm-s3-weights}\n\n## Imputation count and the weight distribution\n\n::: {#5fc46053 .cell execution_count=4}\n``` {.python .cell-code}\nfrom spotforecast2_safe.preprocessing import WeightFunction\nfrom spotforecast2_safe.preprocessing.imputation import get_missing_weights\n\nimputed_data, weights_series = get_missing_weights(\n data,\n 
window_size=72,\n verbose=False,\n)\n\nweight_func = WeightFunction(weights_series)\n\nprint(\"imputed NaN remaining :\", int(imputed_data.isna().sum().sum()))\nprint(\"weight distribution :\")\nprint(weights_series.value_counts(dropna=False).sort_index())\nprint(\"weight_func on first 5 timestamps :\", weight_func(imputed_data.index[:5]))\n```\n\n::: {.cell-output .cell-output-stdout}\n```\nimputed NaN remaining : 0\nweight distribution :\n0.0 18055\n1.0 63\nName: count, dtype: int64\nweight_func on first 5 timestamps : None\n```\n:::\n:::\n\n\n:::\n\nThe `imputed NaN remaining` count is zero — every gap is closed. The\nweight distribution shows how many rows the forecaster will effectively\nignore; the bulk of the zero-weighted rows are concentrated at the start\nof the window where column C had its leading `NaN` run.\n\n## Stage 4 — Exogenous feature engineering\n\nStage 4 builds four feature DataFrames, each indexed on the extended\ntimeline `[start, cov_end]` so the feature matrix is defined both over\nthe training window and over the future prediction window. The\nsub-stages are independent and can run in any order.\n\n### Stage 4a — Calendar features\n\n::: {#exm-s4a-calendar}\n\n## Calendar features from the index alone\n\n::: {#ef110308 .cell execution_count=5}\n``` {.python .cell-code}\nfrom spotforecast2_safe.calendar import get_calendar_features\n\ncalendar_features = get_calendar_features(\n start=start,\n cov_end=cov_end,\n freq=\"h\",\n timezone=\"UTC\",\n)\n\nprint(\"shape :\", calendar_features.shape)\nprint(\"columns :\", calendar_features.columns.tolist())\ncalendar_features.head(2)\n```\n\n::: {.cell-output .cell-output-stdout}\n```\nshape : (18142, 4)\ncolumns : ['month', 'week', 'day_of_week', 'hour']\n```\n:::\n\n::: {.cell-output .cell-output-display execution_count=5}\n```{=html}\n
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
monthweekday_of_weekhour
2019-12-01 00:00:00+00:00124860
2019-12-01 01:00:00+00:00124861
\n
\n```\n:::\n:::\n\n\n:::\n\nCalendar features are derived from the index alone, so they are always\ncomplete by construction — no imputation is needed and the missing-value\ncheck at the end of Stage 5 cannot fail on these columns.\n\n### Stage 4b — Day/night features\n\n::: {#exm-s4b-daynight}\n\n## Sunrise, sunset, and the daylight flag\n\n::: {#c0c87d44 .cell execution_count=6}\n``` {.python .cell-code}\nfrom astral import LocationInfo\n\nfrom spotforecast2_safe.calendar import get_day_night_features\n\nlocation = LocationInfo(\n latitude=51.5136,\n longitude=7.4653,\n timezone=\"UTC\",\n)\n\nsun_light_features = get_day_night_features(\n start=start,\n cov_end=cov_end,\n location=location,\n freq=\"h\",\n timezone=\"UTC\",\n)\n\nprint(\"shape :\", sun_light_features.shape)\nprint(\"is_daylight mean:\", round(float(sun_light_features[\"is_daylight\"].mean()), 3))\nsun_light_features.head(2)\n```\n\n::: {.cell-output .cell-output-stdout}\n```\nshape : (18142, 4)\nis_daylight mean: 0.505\n```\n:::\n\n::: {.cell-output .cell-output-display execution_count=6}\n```{=html}\n
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
sunrise_hoursunset_hourdaylight_hoursis_daylight
2019-12-01 00:00:00+00:0071580
2019-12-01 01:00:00+00:0071580
\n
\n```\n:::\n:::\n\n\n:::\n\n`is_daylight.mean()` averaged over two years at 51° N should land close\nto `0.5`. Per-day sunrise and sunset values are cached internally so the\nsolar position is not recomputed for every hourly row.\n\n### Stage 4c — Weather features\n\n::: {.callout-warning}\n## First render needs network\n\nStage 4c reaches Open-Meteo. The helper defaults to\n`fallback_on_failure=True`, so renders without network will still\nsucceed (with degraded weather data); but the page is most informative\nwhen run locally once with `cache_home` pointed at a writable directory.\n:::\n\n::: {#exm-s4c-weather}\n\n## Open-Meteo fetch and rolling weather windows\n\n::: {#d5668b70 .cell execution_count=7}\n``` {.python .cell-code}\nfrom spotforecast2_safe.weather import get_weather_features\n\nweather_features, weather_aligned = get_weather_features(\n data=imputed_data,\n start=start,\n cov_end=cov_end,\n forecast_horizon=forecast_horizon,\n latitude=51.5136,\n longitude=7.4653,\n timezone=\"UTC\",\n freq=\"h\",\n cache_home=cache_home,\n verbose=False,\n)\n\nprint(\"weather_features shape :\", weather_features.shape)\nprint(\"weather_aligned shape :\", weather_aligned.shape)\nprint(\"aligned columns :\", weather_aligned.columns.tolist())\nweather_aligned.head(2)\n```\n\n::: {.cell-output .cell-output-stdout}\n```\nweather_features shape : (18142, 105)\nweather_aligned shape : (18142, 15)\naligned columns : ['temperature_2m', 'relative_humidity_2m', 'precipitation', 'rain', 'snowfall', 'weather_code', 'pressure_msl', 'surface_pressure', 'cloud_cover', 'cloud_cover_low', 'cloud_cover_mid', 'cloud_cover_high', 'wind_speed_10m', 'wind_direction_10m', 'wind_gusts_10m']\n```\n:::\n\n::: {.cell-output .cell-output-display execution_count=7}\n```{=html}\n
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
temperature_2mrelative_humidity_2mprecipitationrainsnowfallweather_codepressure_mslsurface_pressurecloud_covercloud_cover_lowcloud_cover_midcloud_cover_highwind_speed_10mwind_direction_10mwind_gusts_10m
2019-12-01 00:00:00+00:00-1.5920.00.00.011023.11009.92000968.39513.3
2019-12-01 01:00:00+00:00-2.0920.00.00.031022.51009.39900996.59614.4
\n
\n```\n:::\n:::\n\n\n:::\n\n`weather_aligned` carries the raw weather columns aligned to the\nextended timeline. `weather_features` carries the same plus rolling\none-day and seven-day mean/min/max windows for each numeric weather\ncolumn. Whether the windows survive into the final feature matrix is\ndecided by the `include_weather_windows` flag in Stage 6.\n\n### Stage 4d — Holiday features\n\n::: {#exm-s4d-holidays}\n\n## German public holidays for North Rhine-Westphalia\n\n::: {#16604171 .cell execution_count=8}\n``` {.python .cell-code}\nfrom spotforecast2_safe.calendar import get_holiday_features\n\nholiday_features = get_holiday_features(\n data=imputed_data,\n start=start,\n cov_end=cov_end,\n forecast_horizon=forecast_horizon,\n tz=\"UTC\",\n freq=\"h\",\n country_code=\"DE\",\n state=\"NW\",\n)\n\nprint(\"shape :\", holiday_features.shape)\nprint(\"flagged holiday hours:\", int(holiday_features[\"is_holiday\"].sum()))\nholiday_features.head(2)\n```\n\n::: {.cell-output .cell-output-stdout}\n```\nshape : (18142, 1)\nflagged holiday hours: 550\n```\n:::\n\n::: {.cell-output .cell-output-display execution_count=8}\n```{=html}\n
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
is_holiday
2019-12-01 00:00:00+00:000
2019-12-01 01:00:00+00:000
\n
\n```\n:::\n:::\n\n\n:::\n\nFor two years of NRW the expected count is roughly eleven holidays per\nyear × twenty-four hours ≈ five hundred flagged hours. The model sees a\nclean binary column with no `NaN`.\n\n## Stage 5 — Combining and encoding exogenous features\n\nThe four feature DataFrames are concatenated column-wise in a fixed\norder: calendar, day/night, weather, holidays. Any `NaN` that survived\ninto the concatenated matrix would be a bug — Stage 5 raises before any\nfurther work happens. Then `apply_cyclical_encoding` replaces every\nperiodic integer column (`month`, `week`, `day_of_week`, `hour`,\n`sunrise_hour`, `sunset_hour`) with its sine and cosine on a unit circle,\nkeeping the original integers alongside, and `create_interaction_features`\nemits pairwise products of cyclical, weather, and holiday columns\nprefixed with `poly_`.\n\n::: {#exm-s5-concat-encode}\n\n## Concat, NaN guard, cyclical encoding, interactions\n\n::: {#fec924f3 .cell execution_count=9}\n``` {.python .cell-code}\nimport pandas as pd\n\nfrom spotforecast2_safe.manager.features import (\n apply_cyclical_encoding,\n create_interaction_features,\n)\n\nexogenous_features = pd.concat(\n [calendar_features, sun_light_features, weather_features, holiday_features],\n axis=1,\n)\n\nmissing_count = int(exogenous_features.isnull().sum().sum())\nprint(\"missing entries in concat :\", missing_count)\nprint(\"shape after concat :\", exogenous_features.shape)\n\nexogenous_features = apply_cyclical_encoding(\n data=exogenous_features,\n drop_original=False,\n)\nsin_cos_cols = [c for c in exogenous_features.columns if c.endswith(\"_sin\") or c.endswith(\"_cos\")]\nprint(\"shape after cyclical enc :\", exogenous_features.shape)\nprint(\"number of sin/cos columns :\", len(sin_cos_cols))\n\nexogenous_features = create_interaction_features(\n exogenous_features=exogenous_features,\n weather_aligned=weather_aligned,\n)\npoly_cols = [c for c in exogenous_features.columns if 
c.startswith(\"poly_\")]\nprint(\"shape after interactions :\", exogenous_features.shape)\nprint(\"number of poly_ columns :\", len(poly_cols))\n```\n\n::: {.cell-output .cell-output-stdout}\n```\nmissing entries in concat : 0\nshape after concat : (18142, 114)\nshape after cyclical enc : (18142, 126)\nnumber of sin/cos columns : 12\nshape after interactions : (18142, 126)\nnumber of poly_ columns : 0\n```\n:::\n:::\n\n\n:::\n\n::: {.callout-note}\n## Why sine and cosine\n\nFeeding the integer hour 0–23 directly to a tree-based model makes hour\n23 and hour 0 look very far apart numerically, even though they are\nneighbours on a clock. Sine and cosine wrap the integer onto a unit\ncircle, so adjacent hours stay adjacent in the encoded space.\nSee `@def-cyclical-encoding` in the sibling page.\n:::\n\n## Stage 6 — Feature selection\n\n`select_exogenous_features` returns the final list of column names that\nwill be passed to the forecaster as `exog`. With the three include flags\nset to `False` (the package defaults), the selection keeps only the\ncyclical sine/cosine columns and the raw weather columns; rolling weather\nwindows, the holiday column, and the polynomial interactions are\nfiltered out.\n\n::: {#exm-s6-select}\n\n## Final exogenous column list\n\n::: {#3281de6a .cell execution_count=10}\n``` {.python .cell-code}\nfrom spotforecast2_safe.manager.features import select_exogenous_features\n\nexog_features = select_exogenous_features(\n exogenous_features=exogenous_features,\n weather_aligned=weather_aligned,\n include_weather_windows=False,\n include_holiday_features=False,\n poly_features_degree=1,\n)\n\nprint(\"number of exog features :\", len(exog_features))\nprint(\"first 6 :\", exog_features[:6])\nprint(\"last 3 :\", exog_features[-3:])\n```\n\n::: {.cell-output .cell-output-stdout}\n```\nnumber of exog features : 27\nfirst 6 : ['month_sin', 'month_cos', 'week_sin', 'week_cos', 'day_of_week_sin', 'day_of_week_cos']\nlast 3 : ['wind_speed_10m', 
'wind_direction_10m', 'wind_gusts_10m']\n```\n:::\n:::\n\n\n:::\n\nTo experiment with the other flags, re-run this cell with one or more of\nthe `include_*` arguments set to `True` and observe how\n`len(exog_features)` grows.\n\n## Stage 7 — Merging target and exogenous data\n\n`merge_data_and_covariates` performs an inner join of the target columns\nand the selected exogenous columns over `[start, end]`, casts every\ncolumn to `float32` to halve the memory footprint of the lag matrix, and\nalso returns the slice of the exogenous matrix that covers the future\nprediction window `(end, cov_end]`.\n\n::: {#exm-s7-merge}\n\n## Merged training matrix and the future-window slice\n\n::: {#bf092f6b .cell execution_count=11}\n``` {.python .cell-code}\nfrom spotforecast2_safe.manager.features import merge_data_and_covariates\n\ndata_with_exog, exo_tmp, exo_pred = merge_data_and_covariates(\n data=imputed_data,\n exogenous_features=exogenous_features,\n target_columns=target_columns,\n exog_features=exog_features,\n start=start,\n end=end,\n cov_end=cov_end,\n forecast_horizon=forecast_horizon,\n cast_dtype=\"float32\",\n)\n\nprint(\"data_with_exog shape :\", data_with_exog.shape)\nprint(\"exo_pred shape :\", exo_pred.shape)\nprint(\"dtypes count :\")\nprint(data_with_exog.dtypes.value_counts())\ndata_with_exog.head(2).iloc[:, :4]\n```\n\n::: {.cell-output .cell-output-stdout}\n```\ndata_with_exog shape : (18118, 38)\nexo_pred shape : (24, 126)\ndtypes count :\nfloat32 38\nName: count, dtype: int64\n```\n:::\n\n::: {.cell-output .cell-output-display execution_count=11}\n```{=html}\n
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
ABCD
DateTime
2019-12-01 00:00:00+00:00-3559.5041503362.121826240.210419190.074966
2019-12-01 01:00:00+00:00-4847.3735353545.130371240.210419190.074966
\n
\n```\n:::\n:::\n\n\n:::\n\n`exo_pred` has exactly `forecast_horizon` rows; it is the matrix passed\nto every forecaster at prediction time. Every column of `data_with_exog`\nis `float32`, including the eleven target columns.\n\n## Stage 8 — Train / validation / test split\n\nThe split is purely temporal: the first `train_ratio=0.8` fraction of\nrows is training data, and the remaining twenty percent goes to validation.\nThe test segment is empty because `perc_val = 1 - train_ratio` consumes\nall remaining rows. `end_validation` is the cutoff timestamp used by\n`forecaster.fit(...)`.\n\n::: {#exm-s8-split}\n\n## Temporal split and the validation cutoff\n\n::: {#2e57dc54 .cell execution_count=12}\n``` {.python .cell-code}\nfrom spotforecast2_safe.splitter.split import split_rel_train_val_test\n\ntrain_ratio = 0.8\ndata_train, data_val, data_test = split_rel_train_val_test(\n data_with_exog,\n perc_train=train_ratio,\n perc_val=1.0 - train_ratio,\n verbose=False,\n)\nend_validation = pd.concat([data_train, data_val]).index[-1]\n\nprint(\"train rows :\", len(data_train))\nprint(\"val rows :\", len(data_val))\nprint(\"test rows :\", len(data_test))\nprint(\"end_validation:\", end_validation)\n```\n\n::: {.cell-output .cell-output-stdout}\n```\ntrain rows : 14494\nval rows : 3624\ntest rows : 0\nend_validation: 2021-12-24 21:00:00+00:00\n```\n:::\n:::\n\n\n:::\n\n::: {.callout-note}\n## Why temporal, not random\n\nRandom shuffling before a time-series split lets the model \"see the\nfuture\": its training set could include hour 14 of a day on which the\ntest set asks it to predict hour 13. A temporal split forces a real\nfuture-from-past forecast. 
See `@def-train-val-test` in the sibling\npage.\n:::\n\n## Stage 9 — Training recursive forecasters\n\nFor each of the eleven target columns Stage 9 builds a fresh\n`ForecasterRecursive`, configured identically: `lags=24`,\n`RollingFeatures(stats=[\"mean\"], window_sizes=72)`, the shared\n`weight_func`, and an `LGBMRegressor` with `random_state=1234`. Every\nfit uses data up to and including `end_validation`, so the test segment\nis held out for any downstream evaluation step.\n\nThis is the most expensive cell on the page — eleven LightGBM models on\nroughly fourteen thousand training rows each. Expect a few minutes on a\nmodern laptop.\n\n::: {#exm-s9-train}\n\n## Eleven forecasters, one shared configuration\n\n::: {#572ed0f6 .cell execution_count=13}\n``` {.python .cell-code}\nfrom lightgbm import LGBMRegressor\n\nfrom spotforecast2_safe.forecaster.recursive import ForecasterRecursive\nfrom spotforecast2_safe.preprocessing import RollingFeatures\n\nestimator = LGBMRegressor(random_state=1234, verbose=-1)\nwindow_features = RollingFeatures(stats=[\"mean\"], window_sizes=72)\n\nrecursive_forecasters = {}\nfor target in target_columns:\n forecaster = ForecasterRecursive(\n estimator=estimator,\n lags=24,\n window_features=window_features,\n weight_func=weight_func,\n )\n forecaster.fit(\n y=data_with_exog[target].loc[:end_validation].squeeze(),\n exog=data_with_exog[exog_features].loc[:end_validation],\n )\n recursive_forecasters[target] = forecaster\n\nfeature_counts = {t: recursive_forecasters[t].estimator.n_features_in_ for t in target_columns}\nprint(\"forecasters trained :\", len(recursive_forecasters))\nprint(\"estimator type :\", type(recursive_forecasters[target_columns[0]].estimator).__name__)\nprint(\"n_features_in_ across all :\", set(feature_counts.values()))\n```\n\n::: {.cell-output .cell-output-stdout}\n```\nforecasters trained : 11\nestimator type : LGBMRegressor\nn_features_in_ across all : {52}\n```\n:::\n:::\n\n\n:::\n\nAll eleven forecasters 
share the same hyperparameters and\n`random_state=1234`, so two renders on identical input produce byte-\nidentical models — the determinism guarantee underpins reproducible\nreporting.\n\n## Stage 10 — Prediction\n\n`predict_multivariate` iterates over the trained forecasters and calls\n`.predict(steps=forecast_horizon, exog=exo_pred[exog_features])` on each.\nThe result is a single DataFrame with one column per target and\n`forecast_horizon` rows.\n\n::: {#exm-s10-predict}\n\n## Twenty-four-hour multi-target forecast\n\n::: {#c2f2dc45 .cell execution_count=14}\n``` {.python .cell-code}\nfrom spotforecast2_safe.forecaster.utils import predict_multivariate\n\npredictions = predict_multivariate(\n recursive_forecasters,\n steps_ahead=forecast_horizon,\n exog=exo_pred[exog_features],\n show_progress=False,\n)\n\nprint(\"predictions shape :\", predictions.shape)\nprint(\"first timestamp :\", predictions.index[0])\nprint(\"last timestamp :\", predictions.index[-1])\npredictions.head(3).round(2)\n```\n\n::: {.cell-output .cell-output-stdout}\n```\npredictions shape : (24, 11)\nfirst timestamp : 2021-12-24 22:00:00+00:00\nlast timestamp : 2021-12-25 21:00:00+00:00\n```\n:::\n\n::: {.cell-output .cell-output-display execution_count=14}\n```{=html}\n
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
ABCDEFGHIJK
2021-12-24 22:00:00+00:00-783.344188.96-497.0472.881460.07-35.5044.408543.76-108.32-400.02-226.03
2021-12-24 23:00:00+00:00-1911.233427.85-91.57-381.161314.43129.4539.647877.67-178.98-398.70-498.15
2021-12-25 00:00:00+00:00-1907.704048.50-403.56-378.691345.64122.5347.887864.84-68.93-398.86-499.13
\n
\n```\n:::\n:::\n\n\n:::\n\n`predictions.shape` is `(24, 11)`: twenty-four forecast hours, eleven\ntarget columns. The index is exactly `exo_pred.index`, which is exactly\n`forecast_horizon` hours starting one step after `end_validation`.\n\n## Aggregation\n\nStage 10 produced one forecast column per target. The production task closes\nthe pipeline with a weighted aggregation: `agg_predict(predictions,\nweights=weights)` reduces the eleven columns to a single Series sharing the\n`DatetimeIndex` of `exo_pred`. The conceptual sibling page explains the\nhelper's `list` / `np.ndarray` / `dict` accepted forms and the signed-weights\nconvention in its\n[Aggregation section](n2n_predict_with_covariates_explained.qmd#aggregation).\n\n::: {#exm-aggregation}\n\n## Aggregating the eleven per-target forecasts\n\n::: {#eddf2d16 .cell execution_count=15}\n``` {.python .cell-code}\nfrom spotforecast2_safe.processing.agg_predict import agg_predict\n\nweights = [1.0, 1.0, -1.0, -1.0, 1.0, -1.0, 1.0, 1.0, 1.0, -1.0, 1.0]\ncombined_prediction = agg_predict(predictions, weights=weights)\n\nprint(\"combined shape :\", combined_prediction.shape)\nprint(\"first timestamp :\", combined_prediction.index[0])\nprint(\"last timestamp :\", combined_prediction.index[-1])\ncombined_prediction.head(3).round(4)\n```\n\n::: {.cell-output .cell-output-stdout}\n```\ncombined shape : (24,)\nfirst timestamp : 2021-12-24 22:00:00+00:00\nlast timestamp : 2021-12-25 21:00:00+00:00\n```\n:::\n\n::: {.cell-output .cell-output-display execution_count=15}\n```\n2021-12-24 22:00:00+00:00 13979.1714\n2021-12-24 23:00:00+00:00 10813.2073\n2021-12-25 00:00:00+00:00 11889.6782\nFreq: h, dtype: float64\n```\n:::\n:::\n\n\n:::\n\nThe sign pattern of `weights` is identical to the module-level\n`DEFAULT_WEIGHTS` constant in\n`tasks/task_safe_n_to_1_with_covariates_and_dataframe.py`. 
With this signed\nchoice the aggregate behaves as a net position — the first, second, fifth,\nseventh, eighth, ninth, and eleventh columns are added and the remainder are\nsubtracted — and the call mirrors what the production task\n[`docs/tasks/task_safe_n21_cov_df.qmd`](../tasks/task_safe_n21_cov_df.qmd)\nexecutes after its own Stage 10.\n\n## Metadata\n\nThe orchestrator emits a metadata dictionary that records every\nconfiguration knob and every shape it just produced — a self-contained\naudit record. Building the same dictionary by hand is a useful integration\ncheck that nothing was lost along the way.\n\n::: {#exm-meta}\n\n## Reconstructing the orchestrator's metadata\n\n::: {#e920bc43 .cell execution_count=16}\n``` {.python .cell-code}\nmetadata = {\n \"forecast_horizon\": forecast_horizon,\n \"target_columns\": target_columns,\n \"exog_features\": exog_features,\n \"n_exog_features\": len(exog_features),\n \"train_size\": len(data_train),\n \"val_size\": len(data_val),\n \"test_size\": len(data_test),\n \"data_shape_original\": data_demo.shape,\n \"data_shape_merged\": data_with_exog.shape,\n \"training_end\": end_validation,\n \"prediction_start\": exo_pred.index[0],\n \"prediction_end\": exo_pred.index[-1],\n \"lags\": 24,\n \"window_size\": 72,\n \"contamination\": 0.01,\n \"n_outliers\": int((outliers == -1).sum()) if hasattr(outliers, \"__iter__\") else 0,\n}\n\nprint(\"metadata keys :\", sorted(metadata.keys()))\nprint(\"training_end :\", metadata[\"training_end\"])\nprint(\"prediction :\", metadata[\"prediction_start\"], \"→\", metadata[\"prediction_end\"])\nprint(\"n_exog_features:\", metadata[\"n_exog_features\"])\n```\n\n::: {.cell-output .cell-output-stdout}\n```\nmetadata keys : ['contamination', 'data_shape_merged', 'data_shape_original', 'exog_features', 'forecast_horizon', 'lags', 'n_exog_features', 'n_outliers', 'prediction_end', 'prediction_start', 'target_columns', 'test_size', 'train_size', 'training_end', 'val_size', 
'window_size']\ntraining_end : 2021-12-24 21:00:00+00:00\nprediction : 2021-12-24 22:00:00+00:00 → 2021-12-25 21:00:00+00:00\nn_exog_features: 27\n```\n:::\n:::\n\n\n:::\n\n## Cross-check against the orchestrator\n\nThe pipeline above is exactly what `n2n_predict_with_covariates` does\ninternally. Calling the orchestrator on the same data with the\nsame parameters should yield the same shapes and (modulo any change in\nOpen-Meteo's historical data between the two calls) the same numerical\npredictions.\n\n::: {#exm-end-to-end}\n\n## Single-call equivalence\n\n::: {#5abe2386 .cell execution_count=17}\n``` {.python .cell-code}\nfrom spotforecast2_safe.processing.n2n_predict_with_covariates import (\n n2n_predict_with_covariates,\n)\n\npredictions_full, metadata_full, forecasters_full = n2n_predict_with_covariates(\n data=data_demo,\n forecast_horizon=24,\n lags=24,\n window_size=72,\n contamination=0.01,\n train_ratio=0.8,\n latitude=51.5136,\n longitude=7.4653,\n timezone=\"UTC\",\n country_code=\"DE\",\n state=\"NW\",\n force_train=True,\n model_dir=str(cache_home / \"demo10_forecasters\"),\n verbose=False,\n show_progress=False,\n)\n\nassert predictions_full.shape == predictions.shape\n\nprint(\"step-by-step shape :\", predictions.shape)\nprint(\"orchestrator shape :\", predictions_full.shape)\nprint(\"n_exog (manual) :\", metadata[\"n_exog_features\"])\nprint(\"n_exog (orch) :\", metadata_full[\"n_exog_features\"])\npredictions_full.head(3).round(2)\n```\n\n::: {.cell-output .cell-output-stdout}\n```\nstep-by-step shape : (24, 11)\norchestrator shape : (24, 11)\nn_exog (manual) : 27\nn_exog (orch) : 27\n```\n:::\n\n::: {.cell-output .cell-output-display execution_count=17}\n```{=html}\n
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
ABCDEFGHIJK
2021-12-24 22:00:00+00:00-783.344188.96-497.0472.881460.07-35.5044.408543.76-108.32-400.02-226.03
2021-12-24 23:00:00+00:00-1911.233427.85-91.57-381.161314.43129.4539.647877.67-178.98-398.70-498.15
2021-12-25 00:00:00+00:00-1907.704048.50-403.56-378.691345.64122.5347.887864.84-68.93-398.86-499.13
\n
\n```\n:::\n:::\n\n\n:::\n\nNumerical equality of the two prediction matrices depends on Open-Meteo\nreturning identical historical values between the two render passes;\nsharing the same `cache_home` is what makes that the common case rather\nthan a rare one.\n\n## What to take away\n\n- Each stage of `n2n_predict_with_covariates` produces a checkable\n intermediate artefact: the boundary timestamps, the outlier mask, the\n weight series, the four exogenous DataFrames, the merged matrix, the\n three split DataFrames, the dictionary of fitted forecasters, and\n finally the predictions DataFrame.\n- The orchestrator is exactly the composition of these calls. There is\n no hidden state — the cross-check cell above proves it.\n- The only network-dependent stage is 4c. `freeze: true` keeps local\n iteration cheap, and `cache_home` keeps the network round-trip\n amortised across renders.\n- The same configuration on the same input yields byte-identical\n forecasters, predictions, and metadata. That determinism is the\n foundation that makes `n2n_predict_with_covariates` safe to embed in\n an automated batch job.\n\n", + "markdown": "---\ntitle: \"n2n_predict_with_covariates on demo10: Step by Step on Real Data\"\ndescription: \"Sibling walkthrough that executes each pipeline stage from n2n_predict_with_covariates.py against the bundled demo10.csv, one helper at a time.\"\nexecute:\n freeze: true\n warning: false\n---\n\n## What this page does\n\nThis is the executable companion to\n[*n2n_predict_with_covariates*: A Beginner's Walkthrough](n2n_predict_with_covariates_explained.qmd).\nThe sibling page introduces the vocabulary and explains the *why* of each\nstage with small synthetic examples. This page does the opposite: every code\ncell executes one real stage of\n`spotforecast2_safe.processing.n2n_predict_with_covariates` against the\nbundled `demo10.csv` dataset, prints the intermediate state, and then\ndiscusses what just happened. 
Refer back to the sibling page for the\nvocabulary — `@def-time-series`, `@def-lag`, `@def-forecast-horizon`,\n`@def-recursive-forecaster`, `@def-train-val-test`, `@def-outlier`,\n`@def-imputation`, `@def-sample-weight`, `@def-cyclical-encoding`,\n`@def-persistence` — all definitions are reused unchanged.\n\nThe closing section calls `n2n_predict_with_covariates` end-to-end on the\nsame input as a sanity check that the per-stage breakdown above is\nequivalent to the orchestrator.\n\n## About `demo10.csv`\n\n`demo10.csv` is bundled in the wheel at\n`spotforecast2_safe/datasets/csv/demo10.csv`. It carries eleven numeric\ncolumns A–K, indexed by hourly UTC timestamps from December 2019 through\nDecember 2021 — about 18 000 rows, roughly two years. Column C comes\nonline late (mid-May 2021) and starts with a long run of `NaN` covering\nmost of the window, which makes it a useful exercise for the imputation\nstep in Stage 3.\n\n`demo10.csv` is the compact sibling of the much larger `demo100.csv`\n(about 96 000 rows spanning 2010–2021). 
Its two-year span already covers\nevery helper faithfully — the 7-day rolling weather window in Stage 4 and\nthe 80/20 temporal split in Stage 8 both have ample data — while keeping\nrender times short, so no row slicing is needed before Stage 1 begins.\n\n## Setup\n\n::: {#exm-setup}\n\n## Loading demo10\n\n::: {#17795aa9 .cell execution_count=1}\n``` {.python .cell-code}\nfrom pathlib import Path\n\nimport pandas as pd\n\nfrom spotforecast2_safe.data.fetch_data import fetch_data, get_package_data_home\n\ndata_demo = fetch_data(\n filename=get_package_data_home() / \"demo10.csv\",\n timezone=\"UTC\",\n)\n\ncache_home = Path.home() / \".spotforecast2_cache\"\n\nprint(\"shape :\", data_demo.shape)\nprint(\"window :\", data_demo.index[0], \"→\", data_demo.index[-1])\nprint(\"NaN per col :\")\nprint(data_demo.isna().sum())\n```\n\n::: {.cell-output .cell-output-stdout}\n```\nshape : (18118, 11)\nwindow : 2019-12-01 00:00:00+00:00 → 2021-12-24 21:00:00+00:00\nNaN per col :\nA 0\nB 0\nC 12707\nD 0\nE 0\nF 0\nG 0\nH 0\nI 0\nJ 0\nK 0\ndtype: int64\n```\n:::\n:::\n\n\n:::\n\n`demo10` is loaded whole — its two-year span is already small enough to\nrender quickly, so no row slicing is required and every stage below sees\nthe full series. `cache_home` points at the same directory that\n`get_cache_home()` would resolve to (`~/.spotforecast2_cache`); reusing it\nmeans subsequent renders read the weather parquet cache instead of\nrefetching from Open-Meteo.\n\n## Stage 1 — Loading and preparing the target series\n\nA forecasting model needs a clean, regularly spaced time series with a\nknown time zone. Stage 1 turns whatever the caller supplied into exactly\nthat. 
`get_start_end` returns four boundary timestamps: `start` and `end`\ndelimit the historical window used for training, and `cov_start` /\n`cov_end` are the same window extended forward by `forecast_horizon`\nsteps — that extension is the future window for which covariates must be\nconstructed in Stage 4.\n\n::: {#exm-s1-fetch}\n\n## Boundary timestamps, hourly resample\n\n::: {#fa8311cc .cell execution_count=2}\n``` {.python .cell-code}\nfrom spotforecast2_safe.preprocessing.curate_data import (\n agg_and_resample_data,\n basic_ts_checks,\n get_start_end,\n)\n\nforecast_horizon = 24\n\ndata = data_demo\nstart, end, cov_start, cov_end = get_start_end(\n data=data,\n forecast_horizon=forecast_horizon,\n verbose=False,\n)\n\nbasic_ts_checks(data, verbose=False)\ndata = agg_and_resample_data(data, verbose=False)\n\ntarget_columns = data.columns.tolist()\n\nprint(\"start :\", start)\nprint(\"end :\", end)\nprint(\"cov_start :\", cov_start)\nprint(\"cov_end :\", cov_end)\nprint(\"targets :\", target_columns)\nprint(\"shape :\", data.shape)\ndata.head(2)\n```\n\n::: {.cell-output .cell-output-stdout}\n```\nstart : 2019-12-01T00:00\nend : 2021-12-24T21:00\ncov_start : 2019-12-01T00:00\ncov_end : 2021-12-25T21:00\ntargets : ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K']\nshape : (18118, 11)\n```\n:::\n\n::: {.cell-output .cell-output-display execution_count=2}\n```{=html}\n
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
ABCDEFGHIJK
DateTime
2019-12-01 00:00:00+00:00-3559.5040863362.121912NaN190.0749613021.163333-51.98141413.8232065721.520176-151.096582291.088031-211.710112
2019-12-01 01:00:00+00:00-4847.3736513545.130408NaN-594.8348222382.217897197.43058313.0968482824.63207390.163097247.983128-274.333392
\n
\n```\n:::\n:::\n\n\n:::\n\n`cov_end` lies exactly `forecast_horizon` hours past `end` — that is the\nwindow for which Stage 4 will assemble covariates and Stage 10 will emit\npredictions. `basic_ts_checks` confirms the index is a strict, gap-free,\ntimezone-aware `DatetimeIndex`; if it were not, `agg_and_resample_data`\ncould not safely enforce the hourly grid.\n\n::: {.callout-note}\n## Why a strict, gap-free hourly index matters\n\nA recursive forecaster (see `@def-recursive-forecaster` in the sibling\npage) looks up lag 1, lag 24, etc. by *positional* offset. A skipped hour\nwould silently turn \"lag 24\" into \"25 wall-clock hours ago\", which is the\nkind of error that compounds for hundreds of forecast steps before it is\nnoticed.\n:::\n\n## Stage 2 — Outlier detection and removal\n\nStage 2 replaces likely sensor glitches with `NaN` so Stage 3 can treat\nthem the same way it treats genuine gaps. An Isolation Forest is applied\nper column with `contamination=0.01`, meaning about one percent of rows\nper column should be flagged. 
The `random_state=1234` is fixed so two\nruns on identical input produce identical outlier flags.\n\n::: {#exm-s2-outliers}\n\n## Marking outliers with Isolation Forest\n\n::: {#ff797e0a .cell execution_count=3}\n``` {.python .cell-code}\nfrom spotforecast2_safe.preprocessing.outlier import mark_outliers\n\ndata, outliers = mark_outliers(\n data,\n contamination=0.01,\n random_state=1234,\n verbose=False,\n)\n\nprint(\"outliers flagged :\", int((outliers == -1).sum()) if hasattr(outliers, \"__iter__\") else \"—\")\nprint(\"NaN per column after outlier removal:\")\nprint(data.isna().sum())\n```\n\n::: {.cell-output .cell-output-stdout}\n```\noutliers flagged : 181\nNaN per column after outlier removal:\nA 182\nB 181\nC 12707\nD 181\nE 177\nF 172\nG 182\nH 177\nI 182\nJ 181\nK 181\ndtype: int64\n```\n:::\n:::\n\n\n:::\n\nThe per-column `NaN` counts now combine pre-existing gaps (notably the\nlong leading run in column C) with the freshly removed outlier rows.\nStage 3 will not distinguish between the two: imputation closes the gaps,\nand sample weighting penalises the model for paying attention to either\nsource.\n\n## Stage 3 — Imputation and sample weighting\n\n`get_missing_weights` forward- and backward-fills every `NaN`, then\nbuilds a per-row weight series: rows whose `window_size=72` neighbourhood\ntouched an imputed cell receive weight `0`; all other rows receive `1`.\nThe weights are wrapped in a `WeightFunction`, which is a picklable class\nthat the forecaster can carry into a `joblib` dump alongside the model\nitself — a closure would survive `pickle.dumps` but not a `pickle.load`\nin a fresh process.\n\n::: {#exm-s3-weights}\n\n## Imputation count and the weight distribution\n\n::: {#34422ea3 .cell execution_count=4}\n``` {.python .cell-code}\nfrom spotforecast2_safe.preprocessing import WeightFunction\nfrom spotforecast2_safe.preprocessing.imputation import get_missing_weights\n\nimputed_data, weights_series = get_missing_weights(\n data,\n 
window_size=72,\n verbose=False,\n)\n\nweight_func = WeightFunction(weights_series)\n\nprint(\"imputed NaN remaining :\", int(imputed_data.isna().sum().sum()))\nprint(\"weight distribution :\")\nprint(weights_series.value_counts(dropna=False).sort_index())\nprint(\"weight_func on first 5 timestamps :\", weight_func(imputed_data.index[:5]))\n```\n\n::: {.cell-output .cell-output-stdout}\n```\nimputed NaN remaining : 0\nweight distribution :\n0.0 17716\n1.0 402\nName: count, dtype: int64\nweight_func on first 5 timestamps : None\n```\n:::\n:::\n\n\n:::\n\nThe `imputed NaN remaining` count is zero — every gap is closed. The\nweight distribution shows how many rows the forecaster will effectively\nignore; the bulk of the zero-weighted rows are concentrated at the start\nof the window where column C had its leading `NaN` run.\n\n## Stage 4 — Exogenous feature engineering\n\nStage 4 builds four feature DataFrames, each indexed on the extended\ntimeline `[start, cov_end]` so the feature matrix is defined both over\nthe training window and over the future prediction window. The\nsub-stages are independent and can run in any order.\n\n### Stage 4a — Calendar features\n\n::: {#exm-s4a-calendar}\n\n## Calendar features from the index alone\n\n::: {#2753ba47 .cell execution_count=5}\n``` {.python .cell-code}\nfrom spotforecast2_safe.calendar import get_calendar_features\n\ncalendar_features = get_calendar_features(\n start=start,\n cov_end=cov_end,\n freq=\"h\",\n timezone=\"UTC\",\n)\n\nprint(\"shape :\", calendar_features.shape)\nprint(\"columns :\", calendar_features.columns.tolist())\ncalendar_features.head(2)\n```\n\n::: {.cell-output .cell-output-stdout}\n```\nshape : (18142, 4)\ncolumns : ['month', 'week', 'day_of_week', 'hour']\n```\n:::\n\n::: {.cell-output .cell-output-display execution_count=5}\n```{=html}\n
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
monthweekday_of_weekhour
2019-12-01 00:00:00+00:00124860
2019-12-01 01:00:00+00:00124861
\n
\n```\n:::\n:::\n\n\n:::\n\nCalendar features are derived from the index alone, so they are always\ncomplete by construction — no imputation is needed and the missing-value\ncheck at the start of Stage 5 cannot fail on these columns.\n\n### Stage 4b — Day/night features\n\n::: {#exm-s4b-daynight}\n\n## Sunrise, sunset, and the daylight flag\n\n::: {#25917fa1 .cell execution_count=6}\n``` {.python .cell-code}\nfrom astral import LocationInfo\n\nfrom spotforecast2_safe.calendar import get_day_night_features\n\nlocation = LocationInfo(\n latitude=51.5136,\n longitude=7.4653,\n timezone=\"UTC\",\n)\n\nsun_light_features = get_day_night_features(\n start=start,\n cov_end=cov_end,\n location=location,\n freq=\"h\",\n timezone=\"UTC\",\n)\n\nprint(\"shape :\", sun_light_features.shape)\nprint(\"is_daylight mean:\", round(float(sun_light_features[\"is_daylight\"].mean()), 3))\nsun_light_features.head(2)\n```\n\n::: {.cell-output .cell-output-stdout}\n```\nshape : (18142, 4)\nis_daylight mean: 0.505\n```\n:::\n\n::: {.cell-output .cell-output-display execution_count=6}\n```{=html}\n
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
sunrise_hoursunset_hourdaylight_hoursis_daylight
2019-12-01 00:00:00+00:0071580
2019-12-01 01:00:00+00:0071580
\n
\n```\n:::\n:::\n\n\n:::\n\n`is_daylight.mean()` averaged over two years at 51° N should land close\nto `0.5`. Per-day sunrise and sunset values are cached internally so the\nsolar position is not recomputed for every hourly row.\n\n### Stage 4c — Weather features\n\n::: {.callout-warning}\n## First render needs network\n\nStage 4c reaches Open-Meteo. The helper defaults to\n`fallback_on_failure=True`, so renders without network will still\nsucceed (with degraded weather data); but the page is most informative\nwhen run locally once with `cache_home` pointed at a writable directory.\n:::\n\n::: {#exm-s4c-weather}\n\n## Open-Meteo fetch and rolling weather windows\n\n::: {#ec49df4d .cell execution_count=7}\n``` {.python .cell-code}\nfrom spotforecast2_safe.weather import get_weather_features\n\nweather_features, weather_aligned = get_weather_features(\n data=imputed_data,\n start=start,\n cov_end=cov_end,\n forecast_horizon=forecast_horizon,\n latitude=51.5136,\n longitude=7.4653,\n timezone=\"UTC\",\n freq=\"h\",\n cache_home=cache_home,\n verbose=False,\n)\n\nprint(\"weather_features shape :\", weather_features.shape)\nprint(\"weather_aligned shape :\", weather_aligned.shape)\nprint(\"aligned columns :\", weather_aligned.columns.tolist())\nweather_aligned.head(2)\n```\n\n::: {.cell-output .cell-output-stdout}\n```\nweather_features shape : (18142, 105)\nweather_aligned shape : (18142, 15)\naligned columns : ['temperature_2m', 'relative_humidity_2m', 'precipitation', 'rain', 'snowfall', 'weather_code', 'pressure_msl', 'surface_pressure', 'cloud_cover', 'cloud_cover_low', 'cloud_cover_mid', 'cloud_cover_high', 'wind_speed_10m', 'wind_direction_10m', 'wind_gusts_10m']\n```\n:::\n\n::: {.cell-output .cell-output-display execution_count=7}\n```{=html}\n
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
temperature_2mrelative_humidity_2mprecipitationrainsnowfallweather_codepressure_mslsurface_pressurecloud_covercloud_cover_lowcloud_cover_midcloud_cover_highwind_speed_10mwind_direction_10mwind_gusts_10m
2019-12-01 00:00:00+00:00-1.5920.00.00.011023.11009.92000968.39513.3
2019-12-01 01:00:00+00:00-2.0920.00.00.031022.51009.39900996.59614.4
\n
\n```\n:::\n:::\n\n\n:::\n\n`weather_aligned` carries the raw weather columns aligned to the\nextended timeline. `weather_features` carries the same plus rolling\none-day and seven-day mean/min/max windows for each numeric weather\ncolumn. Whether the windows survive into the final feature matrix is\ndecided by the `include_weather_windows` flag in Stage 6.\n\n### Stage 4d — Holiday features\n\n::: {#exm-s4d-holidays}\n\n## German public holidays for North Rhine-Westphalia\n\n::: {#486fcaa1 .cell execution_count=8}\n``` {.python .cell-code}\nfrom spotforecast2_safe.calendar import get_holiday_features\n\nholiday_features = get_holiday_features(\n data=imputed_data,\n start=start,\n cov_end=cov_end,\n forecast_horizon=forecast_horizon,\n tz=\"UTC\",\n freq=\"h\",\n country_code=\"DE\",\n state=\"NW\",\n)\n\nprint(\"shape :\", holiday_features.shape)\nprint(\"flagged holiday hours:\", int(holiday_features[\"is_holiday\"].sum()))\nholiday_features.head(2)\n```\n\n::: {.cell-output .cell-output-stdout}\n```\nshape : (18142, 1)\nflagged holiday hours: 550\n```\n:::\n\n::: {.cell-output .cell-output-display execution_count=8}\n```{=html}\n
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
is_holiday
2019-12-01 00:00:00+00:000
2019-12-01 01:00:00+00:000
\n
\n```\n:::\n:::\n\n\n:::\n\nFor two years of NRW the expected count is roughly two years × eleven\nholidays per year × twenty-four hours ≈ five hundred flagged hours. The model sees a\nclean binary column with no `NaN`.\n\n## Stage 5 — Combining and encoding exogenous features\n\nThe four feature DataFrames are concatenated column-wise in a fixed\norder: calendar, day/night, weather, holidays. Any `NaN` that survived\ninto the concatenated matrix would be a bug — Stage 5 raises before any\nfurther work happens. Then `apply_cyclical_encoding` replaces every\nperiodic integer column (`month`, `week`, `day_of_week`, `hour`,\n`sunrise_hour`, `sunset_hour`) with its sine and cosine on a unit circle,\nkeeping the original integers alongside, and `create_interaction_features`\nemits pairwise products of cyclical, weather, and holiday columns\nprefixed with `poly_`.\n\n::: {#exm-s5-concat-encode}\n\n## Concat, NaN guard, cyclical encoding, interactions\n\n::: {#b223763c .cell execution_count=9}\n``` {.python .cell-code}\nimport pandas as pd\n\nfrom spotforecast2_safe.manager.features import (\n apply_cyclical_encoding,\n create_interaction_features,\n)\n\nexogenous_features = pd.concat(\n [calendar_features, sun_light_features, weather_features, holiday_features],\n axis=1,\n)\n\nmissing_count = int(exogenous_features.isnull().sum().sum())\nprint(\"missing entries in concat :\", missing_count)\nprint(\"shape after concat :\", exogenous_features.shape)\n\nexogenous_features = apply_cyclical_encoding(\n data=exogenous_features,\n drop_original=False,\n)\nsin_cos_cols = [c for c in exogenous_features.columns if c.endswith(\"_sin\") or c.endswith(\"_cos\")]\nprint(\"shape after cyclical enc :\", exogenous_features.shape)\nprint(\"number of sin/cos columns :\", len(sin_cos_cols))\n\nexogenous_features = create_interaction_features(\n exogenous_features=exogenous_features,\n weather_aligned=weather_aligned,\n)\npoly_cols = [c for c in exogenous_features.columns if 
c.startswith(\"poly_\")]\nprint(\"shape after interactions :\", exogenous_features.shape)\nprint(\"number of poly_ columns :\", len(poly_cols))\n```\n\n::: {.cell-output .cell-output-stdout}\n```\nmissing entries in concat : 0\nshape after concat : (18142, 114)\nshape after cyclical enc : (18142, 126)\nnumber of sin/cos columns : 12\nshape after interactions : (18142, 126)\nnumber of poly_ columns : 0\n```\n:::\n:::\n\n\n:::\n\n::: {.callout-note}\n## Why sine and cosine\n\nFeeding the integer hour 0–23 directly to a tree-based model makes hour\n23 and hour 0 look very far apart numerically, even though they are\nneighbours on a clock. Sine and cosine wrap the integer onto a unit\ncircle, so adjacent hours stay adjacent in the encoded space.\nSee `@def-cyclical-encoding` in the sibling page.\n:::\n\n## Stage 6 — Feature selection\n\n`select_exogenous_features` returns the final list of column names that\nwill be passed to the forecaster as `exog`. With the three include flags\nset to `False` (the package defaults), the selection keeps only the\ncyclical sine/cosine columns and the raw weather columns; rolling weather\nwindows, the holiday column, and the polynomial interactions are\nfiltered out.\n\n::: {#exm-s6-select}\n\n## Final exogenous column list\n\n::: {#cc9a3e8f .cell execution_count=10}\n``` {.python .cell-code}\nfrom spotforecast2_safe.manager.features import select_exogenous_features\n\nexog_features = select_exogenous_features(\n exogenous_features=exogenous_features,\n weather_aligned=weather_aligned,\n include_weather_windows=False,\n include_holiday_features=False,\n poly_features_degree=1,\n)\n\nprint(\"number of exog features :\", len(exog_features))\nprint(\"first 6 :\", exog_features[:6])\nprint(\"last 3 :\", exog_features[-3:])\n```\n\n::: {.cell-output .cell-output-stdout}\n```\nnumber of exog features : 27\nfirst 6 : ['month_sin', 'month_cos', 'week_sin', 'week_cos', 'day_of_week_sin', 'day_of_week_cos']\nlast 3 : ['wind_speed_10m', 
'wind_direction_10m', 'wind_gusts_10m']\n```\n:::\n:::\n\n\n:::\n\nTo experiment with the other flags, re-run this cell with one or more of\nthe `include_*` arguments set to `True` and observe how\n`len(exog_features)` grows.\n\n## Stage 7 — Merging target and exogenous data\n\n`merge_data_and_covariates` performs an inner join of the target columns\nand the selected exogenous columns over `[start, end]`, casts every\ncolumn to `float32` to halve the memory footprint of the lag matrix, and\nalso returns the slice of the exogenous matrix that covers the future\nprediction window `(end, cov_end]`.\n\n::: {#exm-s7-merge}\n\n## Merged training matrix and the future-window slice\n\n::: {#217276a1 .cell execution_count=11}\n``` {.python .cell-code}\nfrom spotforecast2_safe.manager.features import merge_data_and_covariates\n\ndata_with_exog, exo_tmp, exo_pred = merge_data_and_covariates(\n data=imputed_data,\n exogenous_features=exogenous_features,\n target_columns=target_columns,\n exog_features=exog_features,\n start=start,\n end=end,\n cov_end=cov_end,\n forecast_horizon=forecast_horizon,\n cast_dtype=\"float32\",\n)\n\nprint(\"data_with_exog shape :\", data_with_exog.shape)\nprint(\"exo_pred shape :\", exo_pred.shape)\nprint(\"dtypes count :\")\nprint(data_with_exog.dtypes.value_counts())\ndata_with_exog.head(2).iloc[:, :4]\n```\n\n::: {.cell-output .cell-output-stdout}\n```\ndata_with_exog shape : (18118, 38)\nexo_pred shape : (24, 126)\ndtypes count :\nfloat32 38\nName: count, dtype: int64\n```\n:::\n\n::: {.cell-output .cell-output-display execution_count=11}\n```{=html}\n
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
ABCD
DateTime
2019-12-01 00:00:00+00:00-3559.5041503362.121826240.210419190.074966
2019-12-01 01:00:00+00:00-4847.3735353545.130371240.210419190.074966
\n
\n```\n:::\n:::\n\n\n:::\n\n`exo_pred` has exactly `forecast_horizon` rows; it is the matrix passed\nto every forecaster at prediction time. Every column of `data_with_exog`\nis `float32`, including the eleven target columns.\n\n## Stage 8 — Train / validation / test split\n\nThe split is purely temporal: the first `train_ratio=0.8` fraction of\nrows is training data, and the remaining twenty percent goes to validation.\nThe test segment is empty because `perc_val = 1 - train_ratio` consumes\nall remaining rows. `end_validation` is the cutoff timestamp used by\n`forecaster.fit(...)`.\n\n::: {#exm-s8-split}\n\n## Temporal split and the validation cutoff\n\n::: {#1cd6279e .cell execution_count=12}\n``` {.python .cell-code}\nfrom spotforecast2_safe.splitter.split import split_rel_train_val_test\n\ntrain_ratio = 0.8\ndata_train, data_val, data_test = split_rel_train_val_test(\n data_with_exog,\n perc_train=train_ratio,\n perc_val=1.0 - train_ratio,\n verbose=False,\n)\nend_validation = pd.concat([data_train, data_val]).index[-1]\n\nprint(\"train rows :\", len(data_train))\nprint(\"val rows :\", len(data_val))\nprint(\"test rows :\", len(data_test))\nprint(\"end_validation:\", end_validation)\n```\n\n::: {.cell-output .cell-output-stdout}\n```\ntrain rows : 14494\nval rows : 3624\ntest rows : 0\nend_validation: 2021-12-24 21:00:00+00:00\n```\n:::\n:::\n\n\n:::\n\n::: {.callout-note}\n## Why temporal, not random\n\nRandom shuffling before a time-series split lets the model \"see the\nfuture\": its training set could include hour 14 of a day on which the\ntest set asks it to predict hour 13. A temporal split forces a real\nfuture-from-past forecast. 
See `@def-train-val-test` in the sibling\npage.\n:::\n\n## Stage 9 — Training recursive forecasters\n\nFor each of the eleven target columns Stage 9 builds a fresh\n`ForecasterRecursive`, configured identically: `lags=24`,\n`RollingFeatures(stats=[\"mean\"], window_sizes=72)`, the shared\n`weight_func`, and an `LGBMRegressor` with `random_state=1234`. Every\nfit uses data up to and including `end_validation`, so the test segment\nis held out for any downstream evaluation step.\n\nThis is the most expensive cell on the page — eleven LightGBM models on\nroughly eighteen thousand rows each (training plus validation). Expect a few minutes on a\nmodern laptop.\n\n::: {#exm-s9-train}\n\n## Eleven forecasters, one shared configuration\n\n::: {#cb510e29 .cell execution_count=13}\n``` {.python .cell-code}\nfrom lightgbm import LGBMRegressor\n\nfrom spotforecast2_safe.forecaster.recursive import ForecasterRecursive\nfrom spotforecast2_safe.preprocessing import RollingFeatures\n\nestimator = LGBMRegressor(random_state=1234, verbose=-1)\nwindow_features = RollingFeatures(stats=[\"mean\"], window_sizes=72)\n\nrecursive_forecasters = {}\nfor target in target_columns:\n forecaster = ForecasterRecursive(\n estimator=estimator,\n lags=24,\n window_features=window_features,\n weight_func=weight_func,\n )\n forecaster.fit(\n y=data_with_exog[target].loc[:end_validation].squeeze(),\n exog=data_with_exog[exog_features].loc[:end_validation],\n )\n recursive_forecasters[target] = forecaster\n\nfeature_counts = {t: recursive_forecasters[t].estimator.n_features_in_ for t in target_columns}\nprint(\"forecasters trained :\", len(recursive_forecasters))\nprint(\"estimator type :\", type(recursive_forecasters[target_columns[0]].estimator).__name__)\nprint(\"n_features_in_ across all :\", set(feature_counts.values()))\n```\n\n::: {.cell-output .cell-output-stdout}\n```\nforecasters trained : 11\nestimator type : LGBMRegressor\nn_features_in_ across all : {52}\n```\n:::\n:::\n\n\n:::\n\nAll eleven forecasters 
share the same hyperparameters and\n`random_state=1234`, so two renders on identical input produce\nbyte-identical models — the determinism guarantee underpins reproducible\nreporting.\n\n## Stage 10 — Prediction\n\n`predict_multivariate` iterates over the trained forecasters and calls\n`.predict(steps=forecast_horizon, exog=exo_pred[exog_features])` on each.\nThe result is a single DataFrame with one column per target and\n`forecast_horizon` rows.\n\n::: {#exm-s10-predict}\n\n## Twenty-four-hour multi-target forecast\n\n::: {#ad6176f9 .cell execution_count=14}\n``` {.python .cell-code}\nfrom spotforecast2_safe.forecaster.utils import predict_multivariate\n\npredictions = predict_multivariate(\n recursive_forecasters,\n steps_ahead=forecast_horizon,\n exog=exo_pred[exog_features],\n show_progress=False,\n)\n\nprint(\"predictions shape :\", predictions.shape)\nprint(\"first timestamp :\", predictions.index[0])\nprint(\"last timestamp :\", predictions.index[-1])\npredictions.head(3).round(2)\n```\n\n::: {.cell-output .cell-output-stdout}\n```\npredictions shape : (24, 11)\nfirst timestamp : 2021-12-24 22:00:00+00:00\nlast timestamp : 2021-12-25 21:00:00+00:00\n```\n:::\n\n::: {.cell-output .cell-output-display execution_count=14}\n```{=html}\n
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
ABCDEFGHIJK
2021-12-24 22:00:00+00:00-78.243993.65433.11-88.18661.0376.4341.2412262.686.9579.70317.47
2021-12-24 23:00:00+00:00-1825.815367.6623.66-56.85672.4083.0739.0211685.21-70.6893.93503.77
2021-12-25 00:00:00+00:00-1437.094491.435.1035.37648.93-2.5040.8910864.76-1.84-204.60490.65
\n
\n```\n:::\n:::\n\n\n:::\n\n`predictions.shape` is `(24, 11)`: twenty-four forecast hours, eleven\ntarget columns. The index is exactly `exo_pred.index`, which is exactly\n`forecast_horizon` hours starting one step after `end_validation`.\n\n## Aggregation\n\nStage 10 produced one forecast column per target. The production task closes\nthe pipeline with a weighted aggregation: `agg_predict(predictions,\nweights=weights)` reduces the eleven columns to a single Series sharing the\n`DatetimeIndex` of `exo_pred`. The conceptual sibling page explains the\nhelper's `list` / `np.ndarray` / `dict` accepted forms and the signed-weights\nconvention in its\n[Aggregation section](n2n_predict_with_covariates_explained.qmd#aggregation).\n\n::: {#exm-aggregation}\n\n## Aggregating the eleven per-target forecasts\n\n::: {#091aea7d .cell execution_count=15}\n``` {.python .cell-code}\nfrom spotforecast2_safe.processing.agg_predict import agg_predict\n\nweights = [1.0, 1.0, -1.0, -1.0, 1.0, -1.0, 1.0, 1.0, 1.0, -1.0, 1.0]\ncombined_prediction = agg_predict(predictions, weights=weights)\n\nprint(\"combined shape :\", combined_prediction.shape)\nprint(\"first timestamp :\", combined_prediction.index[0])\nprint(\"last timestamp :\", combined_prediction.index[-1])\ncombined_prediction.head(3).round(4)\n```\n\n::: {.cell-output .cell-output-stdout}\n```\ncombined shape : (24,)\nfirst timestamp : 2021-12-24 22:00:00+00:00\nlast timestamp : 2021-12-25 21:00:00+00:00\n```\n:::\n\n::: {.cell-output .cell-output-display execution_count=15}\n```\n2021-12-24 22:00:00+00:00 16703.7137\n2021-12-24 23:00:00+00:00 16227.7490\n2021-12-25 00:00:00+00:00 15264.3525\nFreq: h, dtype: float64\n```\n:::\n:::\n\n\n:::\n\nThe sign pattern of `weights` expresses the same net-position convention used\nby the production task — the first, second, fifth, seventh, eighth, ninth, and\neleventh columns are added and the remainder are subtracted. 
The n2n pipeline\ndemonstrated here is also used by `task_safe_demo`; the n-to-1 task\n(`tasks/task_safe_n_to_1_with_covariates_and_dataframe.py`)\nnow runs on `spotforecast2_safe.multitask` with the same weights carried by\n`ConfigMulti.agg_weights`.\n\n## Metadata\n\nThe orchestrator emits a metadata dictionary that records every\nconfiguration knob and every shape it just produced — a self-contained\naudit record. Building the same dictionary by hand is a useful integration\ncheck that nothing was lost along the way.\n\n::: {#exm-meta}\n\n## Reconstructing the orchestrator's metadata\n\n::: {#8f72bea0 .cell execution_count=16}\n``` {.python .cell-code}\nmetadata = {\n \"forecast_horizon\": forecast_horizon,\n \"target_columns\": target_columns,\n \"exog_features\": exog_features,\n \"n_exog_features\": len(exog_features),\n \"train_size\": len(data_train),\n \"val_size\": len(data_val),\n \"test_size\": len(data_test),\n \"data_shape_original\": data_demo.shape,\n \"data_shape_merged\": data_with_exog.shape,\n \"training_end\": end_validation,\n \"prediction_start\": exo_pred.index[0],\n \"prediction_end\": exo_pred.index[-1],\n \"lags\": 24,\n \"window_size\": 72,\n \"contamination\": 0.01,\n \"n_outliers\": int((outliers == -1).sum()) if hasattr(outliers, \"__iter__\") else 0,\n}\n\nprint(\"metadata keys :\", sorted(metadata.keys()))\nprint(\"training_end :\", metadata[\"training_end\"])\nprint(\"prediction :\", metadata[\"prediction_start\"], \"→\", metadata[\"prediction_end\"])\nprint(\"n_exog_features:\", metadata[\"n_exog_features\"])\n```\n\n::: {.cell-output .cell-output-stdout}\n```\nmetadata keys : ['contamination', 'data_shape_merged', 'data_shape_original', 'exog_features', 'forecast_horizon', 'lags', 'n_exog_features', 'n_outliers', 'prediction_end', 'prediction_start', 'target_columns', 'test_size', 'train_size', 'training_end', 'val_size', 'window_size']\ntraining_end : 2021-12-24 21:00:00+00:00\nprediction : 2021-12-24 22:00:00+00:00 → 2021-12-25 
21:00:00+00:00\nn_exog_features: 27\n```\n:::\n:::\n\n\n:::\n\n## Cross-check against the orchestrator\n\nThe pipeline above is exactly what `n2n_predict_with_covariates` does\ninternally. Calling the orchestrator on the same data with the\nsame parameters should yield the same shapes and (modulo any change in\nOpen-Meteo's historical data between the two calls) the same numerical\npredictions.\n\n::: {#exm-end-to-end}\n\n## Single-call equivalence\n\n::: {#58e5c9af .cell execution_count=17}\n``` {.python .cell-code}\nfrom spotforecast2_safe.processing.n2n_predict_with_covariates import (\n n2n_predict_with_covariates,\n)\n\npredictions_full, metadata_full, forecasters_full = n2n_predict_with_covariates(\n data=data_demo,\n forecast_horizon=24,\n lags=24,\n window_size=72,\n contamination=0.01,\n train_ratio=0.8,\n latitude=51.5136,\n longitude=7.4653,\n timezone=\"UTC\",\n country_code=\"DE\",\n state=\"NW\",\n force_train=True,\n model_dir=str(cache_home / \"demo10_forecasters\"),\n verbose=False,\n show_progress=False,\n)\n\nassert predictions_full.shape == predictions.shape\n\nprint(\"step-by-step shape :\", predictions.shape)\nprint(\"orchestrator shape :\", predictions_full.shape)\nprint(\"n_exog (manual) :\", metadata[\"n_exog_features\"])\nprint(\"n_exog (orch) :\", metadata_full[\"n_exog_features\"])\npredictions_full.head(3).round(2)\n```\n\n::: {.cell-output .cell-output-stdout}\n```\nstep-by-step shape : (24, 11)\norchestrator shape : (24, 11)\nn_exog (manual) : 27\nn_exog (orch) : 27\n```\n:::\n\n::: {.cell-output .cell-output-display execution_count=17}\n```{=html}\n
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
ABCDEFGHIJK
2021-12-24 22:00:00+00:00-78.243993.65433.11-88.18661.0376.4341.2412262.686.9579.70317.47
2021-12-24 23:00:00+00:00-1825.815367.6623.66-56.85672.4083.0739.0211685.21-70.6893.93503.77
2021-12-25 00:00:00+00:00-1437.094491.435.1035.37648.93-2.5040.8910864.76-1.84-204.60490.65
\n
\n```\n:::\n:::\n\n\n:::\n\nNumerical equality of the two prediction matrices depends on Open-Meteo\nreturning identical historical values between the two render passes;\nsharing the same `cache_home` is what makes that the common case rather\nthan a rare one.\n\n## What to take away\n\n- Each stage of `n2n_predict_with_covariates` produces a checkable\n intermediate artefact: the boundary timestamps, the outlier mask, the\n weight series, the four exogenous DataFrames, the merged matrix, the\n three split DataFrames, the dictionary of fitted forecasters, and\n finally the predictions DataFrame.\n- The orchestrator is exactly the composition of these calls. There is\n no hidden state — the cross-check cell above proves it.\n- The only network-dependent stage is 4c. `freeze: true` keeps local\n iteration cheap, and `cache_home` keeps the network round-trip\n amortised across renders.\n- The same configuration on the same input yields byte-identical\n forecasters, predictions, and metadata. 
That determinism is the\n foundation that makes `n2n_predict_with_covariates` safe to embed in\n an automated batch job.\n\n", "supporting": [ "n2n_predict_with_covariates_demo10_files" ], "filters": [], "includes": { "include-in-header": [ - "\n\n\n" + "\n\n\n" ] } } diff --git a/_freeze/docs/tutorials/n2n_predict_with_covariates_demo10/figure-html/cell-17-output-1.png b/_freeze/docs/tutorials/n2n_predict_with_covariates_demo10/figure-html/cell-17-output-1.png deleted file mode 100644 index ebb7b6923..000000000 Binary files a/_freeze/docs/tutorials/n2n_predict_with_covariates_demo10/figure-html/cell-17-output-1.png and /dev/null differ diff --git a/_freeze/docs/tutorials/n2n_predict_with_covariates_demo10/figure-html/cell-20-output-1.png b/_freeze/docs/tutorials/n2n_predict_with_covariates_demo10/figure-html/cell-20-output-1.png deleted file mode 100644 index 9236fa277..000000000 Binary files a/_freeze/docs/tutorials/n2n_predict_with_covariates_demo10/figure-html/cell-20-output-1.png and /dev/null differ diff --git a/docs/tasks/task_multi.qmd b/docs/tasks/task_multi.qmd index bc23305e9..725687376 100644 --- a/docs/tasks/task_multi.qmd +++ b/docs/tasks/task_multi.qmd @@ -206,29 +206,6 @@ instead of silent dropping) and to plotting: `MultiTask.plot_with_outliers()` raises `NotImplementedError` because no plotting library is permitted in this package. 
-## The n-to-1 task entry point - -[`run_pipeline`](../reference/tasks.task_safe_n_to_1_with_covariates_and_dataframe.qmd) from -`task_safe_n_to_1_with_covariates_and_dataframe` -wraps exactly this pipeline with `task="lazy"` — one call from config and -DataFrame to combined forecast: - -```{python} -from spotforecast2_safe.tasks.task_safe_n_to_1_with_covariates_and_dataframe import ( - run_pipeline, -) - -forecast = run_pipeline(config=cfg, dataframe=df, cache_home=cache) -forecast.head(3) -``` - -The matching console script accepts the same knobs as flags: - -```bash -uv run spotforecast-safe-n2o1-cov-df --forecast_horizon 24 --lags 24 \ - --include_holiday_features true -``` - ## Scaling up from the toy example For a real run, switch the feature machinery on instead of off: @@ -263,5 +240,4 @@ design, absent. ## Where to go next -- API reference: [`run_pipeline`](../reference/tasks.task_safe_n_to_1_with_covariates_and_dataframe.qmd) — the CLI-facing wrapper around this pipeline. - API reference: [`MultiTask`](../reference/multitask.multi.MultiTask.qmd), [`BaseTask`](../reference/multitask.base.BaseTask.qmd), [`ConfigMulti`](../reference/configurator.config_multi.ConfigMulti.qmd), [`runner.run`](../reference/multitask.runner.run.qmd). diff --git a/tests/preprocessing/test_target_corruption.py b/tests/preprocessing/test_target_corruption.py index 911e3bc12..aa0e07ab7 100644 --- a/tests/preprocessing/test_target_corruption.py +++ b/tests/preprocessing/test_target_corruption.py @@ -650,14 +650,11 @@ def test_fall_back_no_raise(self): ) vals = [BASE_MW] * len(idx) df = pd.DataFrame({"load": vals}, index=idx) - try: - mask = detect_target_corruption( - df, targets=["load"], range_mw=5_000, step_mw=8_000, window_days=7 - ) - except Exception as exc: - pytest.fail( - f"detect_target_corruption raised on fall-back DST index: {exc}" - ) + # A raise here fails the test (with a full traceback); that is exactly + # the "must not raise" guarantee this case is asserting. 
+ mask = detect_target_corruption( + df, targets=["load"], range_mw=5_000, step_mw=8_000, window_days=7 + ) assert not mask.any(), "Clean DST week must produce no flags." def test_fall_back_dropout_is_flagged(self): @@ -694,14 +691,11 @@ def test_spring_forward_no_raise(self): ) vals = [BASE_MW] * len(idx) df = pd.DataFrame({"load": vals}, index=idx) - try: - mask = detect_target_corruption( - df, targets=["load"], range_mw=5_000, step_mw=8_000, window_days=7 - ) - except Exception as exc: - pytest.fail( - f"detect_target_corruption raised on spring-forward DST index: {exc}" - ) + # A raise here fails the test (with a full traceback); that is exactly + # the "must not raise" guarantee this case is asserting. + mask = detect_target_corruption( + df, targets=["load"], range_mw=5_000, step_mw=8_000, window_days=7 + ) assert not mask.any(), "Clean spring-forward DST week must produce no flags." diff --git a/tests/test_entsoe_loader.py b/tests/test_entsoe_loader.py index 35e1f28e1..6b91d8b7c 100644 --- a/tests/test_entsoe_loader.py +++ b/tests/test_entsoe_loader.py @@ -8,10 +8,6 @@ import spotforecast2_safe.data.entsoe_loader as entsoe_loader from spotforecast2_safe.configurator import ConfigEntsoe -from spotforecast2_safe.data.entsoe_loader import ( - entsoe_data_loader, - entsoe_test_data_loader, -) def _write_interim_csv(path, start: str, periods: int, tz: str | None = "UTC"): @@ -29,7 +25,7 @@ def test_absolute_path_loads_full_frame(self, tmp_path): config = ConfigEntsoe() config.data_filename = str(csv_path) - df = entsoe_data_loader(config) + df = entsoe_loader.entsoe_data_loader(config) assert df.shape == (48, 1) assert df.index.name == "Time (UTC)" @@ -41,7 +37,7 @@ def test_relative_path_resolves_against_data_home(self, tmp_path, monkeypatch): config = ConfigEntsoe() config.data_filename = "energy_load.csv" - df = entsoe_data_loader(config) + df = entsoe_loader.entsoe_data_loader(config) assert df.shape == (24, 1) @@ -50,7 +46,7 @@ def 
test_missing_file_raises_with_cli_hint(self, tmp_path): config.data_filename = str(tmp_path / "does_not_exist.csv") with pytest.raises(FileNotFoundError, match="spotforecast2-entsoe"): - entsoe_data_loader(config) + entsoe_loader.entsoe_data_loader(config) class TestEntsoeTestDataLoader: @@ -66,7 +62,7 @@ def test_slices_predict_size_steps_after_end_train(self, tmp_path): _write_interim_csv(csv_path, "2025-12-29 00:00", 120) config = self._config(csv_path, "2025-12-31 00:00+00:00") - test_df = entsoe_test_data_loader(config) + test_df = entsoe_loader.entsoe_test_data_loader(config) assert test_df.shape == (24, 1) assert test_df.index[0] == pd.Timestamp("2025-12-31 01:00", tz="UTC") @@ -77,7 +73,7 @@ def test_naive_end_train_is_localized_to_utc(self, tmp_path): _write_interim_csv(csv_path, "2025-12-29 00:00", 120) config = self._config(csv_path, "2025-12-31 00:00") # no tz marker - test_df = entsoe_test_data_loader(config) + test_df = entsoe_loader.entsoe_test_data_loader(config) assert test_df.shape == (24, 1) assert test_df.index[0] == pd.Timestamp("2025-12-31 01:00", tz="UTC") @@ -87,7 +83,7 @@ def test_naive_csv_index_is_supported(self, tmp_path): _write_interim_csv(csv_path, "2025-12-29 00:00", 120, tz=None) config = self._config(csv_path, "2025-12-31 00:00+00:00") - test_df = entsoe_test_data_loader(config) + test_df = entsoe_loader.entsoe_test_data_loader(config) assert test_df.shape == (24, 1) assert test_df.index[0] == pd.Timestamp("2025-12-31 01:00") @@ -98,6 +94,6 @@ def test_window_shorter_when_data_runs_out(self, tmp_path): _write_interim_csv(csv_path, "2025-12-29 00:00", 60) # ends 12-31 11:00 config = self._config(csv_path, "2025-12-31 00:00+00:00") - test_df = entsoe_test_data_loader(config) + test_df = entsoe_loader.entsoe_test_data_loader(config) assert len(test_df) == 11 # only the rows that exist after the cutoff