diff --git a/LICENSES/ODbL-1.0.txt b/LICENSES/ODbL-1.0.txt new file mode 100644 index 000000000..8ca51c358 --- /dev/null +++ b/LICENSES/ODbL-1.0.txt @@ -0,0 +1,540 @@ +## Open Data Commons Open Database License (ODbL) + +### Preamble + +The Open Database License (ODbL) is a license agreement intended to +allow users to freely share, modify, and use this Database while +maintaining this same freedom for others. Many databases are covered by +copyright, and therefore this document licenses these rights. Some +jurisdictions, mainly in the European Union, have specific rights that +cover databases, and so the ODbL addresses these rights, too. Finally, +the ODbL is also an agreement in contract for users of this Database to +act in certain ways in return for accessing this Database. + +Databases can contain a wide variety of types of content (images, +audiovisual material, and sounds all in the same database, for example), +and so the ODbL only governs the rights over the Database, and not the +contents of the Database individually. Licensors should use the ODbL +together with another license for the contents, if the contents have a +single set of rights that uniformly covers all of the contents. If the +contents have multiple sets of different rights, Licensors should +describe what rights govern what contents together in the individual +record or in some other way that clarifies what rights apply. + +Sometimes the contents of a database, or the database itself, can be +covered by other rights not addressed here (such as private contracts, +trade mark over the name, or privacy rights / data protection rights +over information in the contents), and so you are advised that you may +have to consult other documents or clear other rights before doing +activities not covered by this License. 
+ +------ + +The Licensor (as defined below) + +and + +You (as defined below) + +agree as follows: + +### 1.0 Definitions of Capitalised Words + +"Collective Database" – Means this Database in unmodified form as part +of a collection of independent databases in themselves that together are +assembled into a collective whole. A work that constitutes a Collective +Database will not be considered a Derivative Database. + +"Convey" – As a verb, means Using the Database, a Derivative Database, +or the Database as part of a Collective Database in any way that enables +a Person to make or receive copies of the Database or a Derivative +Database. Conveying does not include interaction with a user through a +computer network, or creating and Using a Produced Work, where no +transfer of a copy of the Database or a Derivative Database occurs. +"Contents" – The contents of this Database, which includes the +information, independent works, or other material collected into the +Database. For example, the contents of the Database could be factual +data or works such as images, audiovisual material, text, or sounds. + +"Database" – A collection of material (the Contents) arranged in a +systematic or methodical way and individually accessible by electronic +or other means offered under the terms of this License. + +"Database Directive" – Means Directive 96/9/EC of the European +Parliament and of the Council of 11 March 1996 on the legal protection +of databases, as amended or succeeded. + +"Database Right" – Means rights resulting from the Chapter III ("sui +generis") rights in the Database Directive (as amended and as transposed +by member states), which includes the Extraction and Re-utilisation of +the whole or a Substantial part of the Contents, as well as any similar +rights available in the relevant jurisdiction under Section 10.4. 
+ +"Derivative Database" – Means a database based upon the Database, and +includes any translation, adaptation, arrangement, modification, or any +other alteration of the Database or of a Substantial part of the +Contents. This includes, but is not limited to, Extracting or +Re-utilising the whole or a Substantial part of the Contents in a new +Database. + +"Extraction" – Means the permanent or temporary transfer of all or a +Substantial part of the Contents to another medium by any means or in +any form. + +"License" – Means this license agreement and is both a license of rights +such as copyright and Database Rights and an agreement in contract. + +"Licensor" – Means the Person that offers the Database under the terms +of this License. + +"Person" – Means a natural or legal person or a body of persons +corporate or incorporate. + +"Produced Work" – a work (such as an image, audiovisual material, text, +or sounds) resulting from using the whole or a Substantial part of the +Contents (via a search or other query) from this Database, a Derivative +Database, or this Database as part of a Collective Database. + +"Publicly" – means to Persons other than You or under Your control by +either more than 50% ownership or by the power to direct their +activities (such as contracting with an independent consultant). + +"Re-utilisation" – means any form of making available to the public all +or a Substantial part of the Contents by the distribution of copies, by +renting, by online or other forms of transmission. + +"Substantial" – Means substantial in terms of quantity or quality or a +combination of both. The repeated and systematic Extraction or +Re-utilisation of insubstantial parts of the Contents may amount to the +Extraction or Re-utilisation of a Substantial part of the Contents. 
+ +"Use" – As a verb, means doing any act that is restricted by copyright +or Database Rights whether in the original medium or any other; and +includes without limitation distributing, copying, publicly performing, +publicly displaying, and preparing derivative works of the Database, as +well as modifying the Database as may be technically necessary to use it +in a different mode or format. + +"You" – Means a Person exercising rights under this License who has not +previously violated the terms of this License with respect to the +Database, or who has received express permission from the Licensor to +exercise rights under this License despite a previous violation. + +Words in the singular include the plural and vice versa. + +### 2.0 What this License covers + +2.1. Legal effect of this document. This License is: + + a. A license of applicable copyright and neighbouring rights; + + b. A license of the Database Right; and + + c. An agreement in contract between You and the Licensor. + +2.2 Legal rights covered. This License covers the legal rights in the +Database, including: + + a. Copyright. Any copyright or neighbouring rights in the Database. + The copyright licensed includes any individual elements of the + Database, but does not cover the copyright over the Contents + independent of this Database. See Section 2.4 for details. Copyright + law varies between jurisdictions, but is likely to cover: the Database + model or schema, which is the structure, arrangement, and organisation + of the Database, and can also include the Database tables and table + indexes; the data entry and output sheets; and the Field names of + Contents stored in the Database; + + b. Database Rights. Database Rights only extend to the Extraction and + Re-utilisation of the whole or a Substantial part of the Contents. + Database Rights can apply even when there is no copyright over the + Database. 
Database Rights can also apply when the Contents are removed + from the Database and are selected and arranged in a way that would + not infringe any applicable copyright; and + + c. Contract. This is an agreement between You and the Licensor for + access to the Database. In return you agree to certain conditions of + use on this access as outlined in this License. + +2.3 Rights not covered. + + a. This License does not apply to computer programs used in the making + or operation of the Database; + + b. This License does not cover any patents over the Contents or the + Database; and + + c. This License does not cover any trademarks associated with the + Database. + +2.4 Relationship to Contents in the Database. The individual items of +the Contents contained in this Database may be covered by other rights, +including copyright, patent, data protection, privacy, or personality +rights, and this License does not cover any rights (other than Database +Rights or in contract) in individual Contents contained in the Database. +For example, if used on a Database of images (the Contents), this +License would not apply to copyright over individual images, which could +have their own separate licenses, or one single license covering all of +the rights over the images. + +### 3.0 Rights granted + +3.1 Subject to the terms and conditions of this License, the Licensor +grants to You a worldwide, royalty-free, non-exclusive, terminable (but +only under Section 9) license to Use the Database for the duration of +any applicable copyright and Database Rights. These rights explicitly +include commercial use, and do not exclude any field of endeavour. To +the extent possible in the relevant jurisdiction, these rights may be +exercised in all media and formats whether now known or created in the +future. + +The rights granted cover, for example: + + a. Extraction and Re-utilisation of the whole or a Substantial part of + the Contents; + + b. Creation of Derivative Databases; + + c. 
Creation of Collective Databases; + + d. Creation of temporary or permanent reproductions by any means and + in any form, in whole or in part, including of any Derivative + Databases or as a part of Collective Databases; and + + e. Distribution, communication, display, lending, making available, or + performance to the public by any means and in any form, in whole or in + part, including of any Derivative Database or as a part of Collective + Databases. + +3.2 Compulsory license schemes. For the avoidance of doubt: + + a. Non-waivable compulsory license schemes. In those jurisdictions in + which the right to collect royalties through any statutory or + compulsory licensing scheme cannot be waived, the Licensor reserves + the exclusive right to collect such royalties for any exercise by You + of the rights granted under this License; + + b. Waivable compulsory license schemes. In those jurisdictions in + which the right to collect royalties through any statutory or + compulsory licensing scheme can be waived, the Licensor waives the + exclusive right to collect such royalties for any exercise by You of + the rights granted under this License; and, + + c. Voluntary license schemes. The Licensor waives the right to collect + royalties, whether individually or, in the event that the Licensor is + a member of a collecting society that administers voluntary licensing + schemes, via that society, from any exercise by You of the rights + granted under this License. + +3.3 The right to release the Database under different terms, or to stop +distributing or making available the Database, is reserved. Note that +this Database may be multiple-licensed, and so You may have the choice +of using alternative licenses for this Database. Subject to Section +10.4, all other rights not expressly granted by Licensor are reserved. + +### 4.0 Conditions of Use + +4.1 The rights granted in Section 3 above are expressly made subject to +Your complying with the following conditions of use. 
These are important +conditions of this License, and if You fail to follow them, You will be +in material breach of its terms. + +4.2 Notices. If You Publicly Convey this Database, any Derivative +Database, or the Database as part of a Collective Database, then You +must: + + a. Do so only under the terms of this License or another license + permitted under Section 4.4; + + b. Include a copy of this License (or, as applicable, a license + permitted under Section 4.4) or its Uniform Resource Identifier (URI) + with the Database or Derivative Database, including both in the + Database or Derivative Database and in any relevant documentation; and + + c. Keep intact any copyright or Database Right notices and notices + that refer to this License. + + d. If it is not possible to put the required notices in a particular + file due to its structure, then You must include the notices in a + location (such as a relevant directory) where users would be likely to + look for it. + +4.3 Notice for using output (Contents). Creating and Using a Produced +Work does not require the notice in Section 4.2. However, if you +Publicly Use a Produced Work, You must include a notice associated with +the Produced Work reasonably calculated to make any Person that uses, +views, accesses, interacts with, or is otherwise exposed to the Produced +Work aware that Content was obtained from the Database, Derivative +Database, or the Database as part of a Collective Database, and that it +is available under this License. + + a. Example notice. The following text will satisfy notice under + Section 4.3: + + Contains information from DATABASE NAME, which is made available + here under the Open Database License (ODbL). + +DATABASE NAME should be replaced with the name of the Database and a +hyperlink to the URI of the Database. "Open Database License" should +contain a hyperlink to the URI of the text of this License. 
If +hyperlinks are not possible, You should include the plain text of the +required URIs with the above notice. + +4.4 Share alike. + + a. Any Derivative Database that You Publicly Use must be only under + the terms of: + + i. This License; + + ii. A later version of this License similar in spirit to this + License; or + + iii. A compatible license. + + If You license the Derivative Database under one of the licenses + mentioned in (iii), You must comply with the terms of that license. + + b. For the avoidance of doubt, Extraction or Re-utilisation of the + whole or a Substantial part of the Contents into a new database is a + Derivative Database and must comply with Section 4.4. + + c. Derivative Databases and Produced Works. A Derivative Database is + Publicly Used and so must comply with Section 4.4 if a Produced Work + created from the Derivative Database is Publicly Used. + + d. Share Alike and additional Contents. For the avoidance of doubt, + You must not add Contents to Derivative Databases under Section 4.4 a + that are incompatible with the rights granted under this License. + + e. Compatible licenses. Licensors may authorise a proxy to determine + compatible licenses under Section 4.4 a iii. If they do so, the + authorised proxy's public statement of acceptance of a compatible + license grants You permission to use the compatible license. + + +4.5 Limits of Share Alike. The requirements of Section 4.4 do not apply +in the following: + + a. For the avoidance of doubt, You are not required to license + Collective Databases under this License if You incorporate this + Database or a Derivative Database in the collection, but this License + still applies to this Database or a Derivative Database as a part of + the Collective Database; + + b. Using this Database, a Derivative Database, or this Database as + part of a Collective Database to create a Produced Work does not + create a Derivative Database for purposes of Section 4.4; and + + c. 
Use of a Derivative Database internally within an organisation is + not to the public and therefore does not fall under the requirements + of Section 4.4. + +4.6 Access to Derivative Databases. If You Publicly Use a Derivative +Database or a Produced Work from a Derivative Database, You must also +offer to recipients of the Derivative Database or Produced Work a copy +in a machine readable form of: + + a. The entire Derivative Database; or + + b. A file containing all of the alterations made to the Database or + the method of making the alterations to the Database (such as an + algorithm), including any additional Contents, that make up all the + differences between the Database and the Derivative Database. + +The Derivative Database (under a.) or alteration file (under b.) must be +available at no more than a reasonable production cost for physical +distributions and free of charge if distributed over the internet. + +4.7 Technological measures and additional terms + + a. This License does not allow You to impose (except subject to + Section 4.7 b.) any terms or any technological measures on the + Database, a Derivative Database, or the whole or a Substantial part of + the Contents that alter or restrict the terms of this License, or any + rights granted under it, or have the effect or intent of restricting + the ability of any person to exercise those rights. + + b. Parallel distribution. You may impose terms or technological + measures on the Database, a Derivative Database, or the whole or a + Substantial part of the Contents (a "Restricted Database") in + contravention of Section 4.7 a. only if You also make a copy of the + Database or a Derivative Database available to the recipient of the + Restricted Database: + + i. That is available without additional fee; + + ii. 
That is available in a medium that does not alter or restrict + the terms of this License, or any rights granted under it, or have + the effect or intent of restricting the ability of any person to + exercise those rights (an "Unrestricted Database"); and + + iii. The Unrestricted Database is at least as accessible to the + recipient as a practical matter as the Restricted Database. + + c. For the avoidance of doubt, You may place this Database or a + Derivative Database in an authenticated environment, behind a + password, or within a similar access control scheme provided that You + do not alter or restrict the terms of this License or any rights + granted under it or have the effect or intent of restricting the + ability of any person to exercise those rights. + +4.8 Licensing of others. You may not sublicense the Database. Each time +You communicate the Database, the whole or Substantial part of the +Contents, or any Derivative Database to anyone else in any way, the +Licensor offers to the recipient a license to the Database on the same +terms and conditions as this License. You are not responsible for +enforcing compliance by third parties with this License, but You may +enforce any rights that You have over a Derivative Database. You are +solely responsible for any modifications of a Derivative Database made +by You or another Person at Your direction. You may not impose any +further restrictions on the exercise of the rights granted or affirmed +under this License. + +### 5.0 Moral rights + +5.1 Moral rights. This section covers moral rights, including any rights +to be identified as the author of the Database or to object to treatment +that would otherwise prejudice the author's honour and reputation, or +any other derogatory treatment: + + a. 
For jurisdictions allowing waiver of moral rights, Licensor waives + all moral rights that Licensor may have in the Database to the fullest + extent possible by the law of the relevant jurisdiction under Section + 10.4; + + b. If waiver of moral rights under Section 5.1 a in the relevant + jurisdiction is not possible, Licensor agrees not to assert any moral + rights over the Database and waives all claims in moral rights to the + fullest extent possible by the law of the relevant jurisdiction under + Section 10.4; and + + c. For jurisdictions not allowing waiver or an agreement not to assert + moral rights under Section 5.1 a and b, the author may retain their + moral rights over certain aspects of the Database. + +Please note that some jurisdictions do not allow for the waiver of moral +rights, and so moral rights may still subsist over the Database in some +jurisdictions. + +### 6.0 Fair dealing, Database exceptions, and other rights not affected + +6.1 This License does not affect any rights that You or anyone else may +independently have under any applicable law to make any use of this +Database, including without limitation: + + a. Exceptions to the Database Right including: Extraction of Contents + from non-electronic Databases for private purposes, Extraction for + purposes of illustration for teaching or scientific research, and + Extraction or Re-utilisation for public security or an administrative + or judicial procedure. + + b. Fair dealing, fair use, or any other legally recognised limitation + or exception to infringement of copyright or other applicable laws. + +6.2 This License does not affect any rights of lawful users to Extract +and Re-utilise insubstantial parts of the Contents, evaluated +quantitatively or qualitatively, for any purposes whatsoever, including +creating a Derivative Database (subject to other rights over the +Contents, see Section 2.4). 
The repeated and systematic Extraction or +Re-utilisation of insubstantial parts of the Contents may however amount +to the Extraction or Re-utilisation of a Substantial part of the +Contents. + +### 7.0 Warranties and Disclaimer + +7.1 The Database is licensed by the Licensor "as is" and without any +warranty of any kind, either express, implied, or arising by statute, +custom, course of dealing, or trade usage. Licensor specifically +disclaims any and all implied warranties or conditions of title, +non-infringement, accuracy or completeness, the presence or absence of +errors, fitness for a particular purpose, merchantability, or otherwise. +Some jurisdictions do not allow the exclusion of implied warranties, so +this exclusion may not apply to You. + +### 8.0 Limitation of liability + +8.1 Subject to any liability that may not be excluded or limited by law, +the Licensor is not liable for, and expressly excludes, all liability +for loss or damage however and whenever caused to anyone by any use +under this License, whether by You or by anyone else, and whether caused +by any fault on the part of the Licensor or not. This exclusion of +liability includes, but is not limited to, any special, incidental, +consequential, punitive, or exemplary damages such as loss of revenue, +data, anticipated profits, and lost business. This exclusion applies +even if the Licensor has been advised of the possibility of such +damages. + +8.2 If liability may not be excluded by law, it is limited to actual and +direct financial loss to the extent it is caused by proved negligence on +the part of the Licensor. + +### 9.0 Termination of Your rights under this License + +9.1 Any breach by You of the terms and conditions of this License +automatically terminates this License with immediate effect and without +notice to You. 
For the avoidance of doubt, Persons who have received the +Database, the whole or a Substantial part of the Contents, Derivative +Databases, or the Database as part of a Collective Database from You +under this License will not have their licenses terminated provided +their use is in full compliance with this License or a license granted +under Section 4.8 of this License. Sections 1, 2, 7, 8, 9 and 10 will +survive any termination of this License. + +9.2 If You are not in breach of the terms of this License, the Licensor +will not terminate Your rights under it. + +9.3 Unless terminated under Section 9.1, this License is granted to You +for the duration of applicable rights in the Database. + +9.4 Reinstatement of rights. If you cease any breach of the terms and +conditions of this License, then your full rights under this License +will be reinstated: + + a. Provisionally and subject to permanent termination until the 60th + day after cessation of breach; + + b. Permanently on the 60th day after cessation of breach unless + otherwise reasonably notified by the Licensor; or + + c. Permanently if reasonably notified by the Licensor of the + violation, this is the first time You have received notice of + violation of this License from the Licensor, and You cure the + violation prior to 30 days after your receipt of the notice. + +Persons subject to permanent termination of rights are not eligible to +be a recipient and receive a license under Section 4.8. + +9.5 Notwithstanding the above, Licensor reserves the right to release +the Database under different license terms or to stop distributing or +making available the Database. Releasing the Database under different +license terms or stopping the distribution of the Database will not +withdraw this License (or any other license that has been, or is +required to be, granted under the terms of this License), and this +License will continue in full force and effect unless terminated as +stated above. 
+ +### 10.0 General + +10.1 If any provision of this License is held to be invalid or +unenforceable, that must not affect the validity or enforceability of +the remainder of the terms and conditions of this License and each +remaining provision of this License shall be valid and enforced to the +fullest extent permitted by law. + +10.2 This License is the entire agreement between the parties with +respect to the rights granted here over the Database. It replaces any +earlier understandings, agreements or representations with respect to +the Database. + +10.3 If You are in breach of the terms of this License, You will not be +entitled to rely on the terms of this License or to complain of any +breach by the Licensor. + +10.4 Choice of law. This License takes effect in and will be governed by +the laws of the relevant jurisdiction in which the License terms are +sought to be enforced. If the standard suite of rights granted under +applicable copyright law and Database Rights in the relevant +jurisdiction includes additional rights not granted under this License, +these additional rights are granted in this License in order to meet the +terms of this License. 
diff --git a/_freeze/docs/reference/calendar.holiday.create_school_holiday_df/execute-results/html.json b/_freeze/docs/reference/calendar.holiday.create_school_holiday_df/execute-results/html.json new file mode 100644 index 000000000..cf25326ff --- /dev/null +++ b/_freeze/docs/reference/calendar.holiday.create_school_holiday_df/execute-results/html.json @@ -0,0 +1,12 @@ +{ + "hash": "2ec09d8e46e065b2f395a0504ddf62b9", + "result": { + "engine": "jupyter", + "markdown": "---\ntitle: calendar.holiday.create_school_holiday_df\n---\n\n\n\n```python\ncalendar.holiday.create_school_holiday_df(\n start,\n end,\n tz='UTC',\n freq='h',\n country_code='DE',\n state='NW',\n)\n```\n\nCreate a DataFrame with a binary school-holiday indicator for a German state.\n\nBuilds a tz-aware time grid over ``[start, end]`` at *freq* and marks\nevery timestamp that falls within a school-holiday period of the requested\nBundesland as ``1``; all others are ``0``. Both edges of each interval\nare inclusive.\n\nData source: OpenHolidays API (https://openholidaysapi.org), ODbL-1.0.\nCoverage: 2022-01-01 to 2027-12-31 for all 16 German Bundesländer.\n\nOnly ``country_code=\"DE\"`` is supported. Requests whose span extends\nbeyond the covered range at either edge raise ``ValueError`` — there is\nno fill or extrapolation.\n\n## Parameters {.doc-section .doc-section-parameters}\n\n| Name | Type | Description | Default |\n|--------------|----------------------------------------------------------------|------------------------------------------------------------------------------------------------------------------------------------------|------------|\n| start | [str](`str`) \\| [pd](`pandas`).[Timestamp](`pandas.Timestamp`) | Start date/datetime of the requested grid. | _required_ |\n| end | [str](`str`) \\| [pd](`pandas`).[Timestamp](`pandas.Timestamp`) | End date/datetime of the requested grid (inclusive). | _required_ |\n| tz | [str](`str`) | Timezone for the resulting index. 
Ignored when *start* or *end* is already a tz-aware ``pd.Timestamp``. | `'UTC'` |\n| freq | [str](`str`) | Pandas-compatible frequency string. Defaults to ``\"h\"`` (hourly). | `'h'` |\n| country_code | [str](`str`) | Must be ``\"DE\"`` (Germany). Any other value raises ``ValueError``. | `'DE'` |\n| state | [str](`str`) | ISO 3166-2 subdivision short code for the Bundesland, e.g. ``\"NW\"`` (North Rhine-Westphalia), ``\"BY\"`` (Bavaria). Defaults to ``\"NW\"``. | `'NW'` |\n\n## Returns {.doc-section .doc-section-returns}\n\n| Name | Type | Description |\n|--------|------------------------------------------------|----------------------------------------------------------------------|\n| | [pd](`pandas`).[DataFrame](`pandas.DataFrame`) | pd.DataFrame: Single integer column ``is_school_holiday`` (values in |\n| | [pd](`pandas`).[DataFrame](`pandas.DataFrame`) | ``{0, 1}``; no NaNs) with a tz-aware `DatetimeIndex` at *freq*. |\n\n## Raises {.doc-section .doc-section-raises}\n\n| Name | Type | Description |\n|--------|----------------------------|-----------------------------------------------------------------------------------------------------------------------|\n| | [ValueError](`ValueError`) | If *country_code* is not ``\"DE\"``, or if the requested span extends beyond the dataset validity range at either edge. 
|\n\n## Examples {.doc-section .doc-section-examples}\n\n\n::: {#e831a63e .cell execution_count=1}\n``` {.python .cell-code}\nfrom spotforecast2_safe.calendar import create_school_holiday_df\n\n# NW Sommerferien 2024: 2024-07-08 → 2024-08-20 (inclusive).\n# Day before (2024-07-07) must be 0; first day (2024-07-08) must be 1.\ndf = create_school_holiday_df(\n \"2024-07-06\", \"2024-07-10\", freq=\"D\", state=\"NW\"\n)\nprint(df)\nassert df.loc[\"2024-07-07\", \"is_school_holiday\"] == 0\nassert df.loc[\"2024-07-08\", \"is_school_holiday\"] == 1\nassert df.loc[\"2024-07-09\", \"is_school_holiday\"] == 1\n```\n\n::: {.cell-output .cell-output-stdout}\n```\n is_school_holiday\n2024-07-06 00:00:00+00:00 0\n2024-07-07 00:00:00+00:00 0\n2024-07-08 00:00:00+00:00 1\n2024-07-09 00:00:00+00:00 1\n2024-07-10 00:00:00+00:00 1\n```\n:::\n:::\n\n\n", + "supporting": [ + "calendar.holiday.create_school_holiday_df_files/figure-html" + ], + "filters": [], + "includes": {} + } +} \ No newline at end of file diff --git a/_freeze/docs/reference/calendar.holiday.get_school_holiday_features/execute-results/html.json b/_freeze/docs/reference/calendar.holiday.get_school_holiday_features/execute-results/html.json new file mode 100644 index 000000000..40eabace0 --- /dev/null +++ b/_freeze/docs/reference/calendar.holiday.get_school_holiday_features/execute-results/html.json @@ -0,0 +1,12 @@ +{ + "hash": "9f715b6a7ab541ee038cd8c9340d50b4", + "result": { + "engine": "jupyter", + "markdown": "---\ntitle: calendar.holiday.get_school_holiday_features\n---\n\n\n\n```python\ncalendar.holiday.get_school_holiday_features(\n data,\n start,\n cov_end,\n forecast_horizon,\n tz='UTC',\n freq='h',\n country_code='DE',\n state='NW',\n)\n```\n\nBuild per-Bundesland school-holiday indicators and align them to a forecast grid.\n\nGenerates the ``is_school_holiday`` binary indicator via\n`create_school_holiday_df()`, validates temporal coverage with\n`curate_holidays()`, and reindexes onto the full ``[start, 
cov_end]``\ngrid with ``fill_value=0``.\n\nThe requested span ``[start, cov_end]`` must lie entirely within the\ndataset validity range 2022-01-01 to 2027-12-31. If either edge falls\noutside this range a ``ValueError`` is raised immediately — there is no\nfill or extrapolation.\n\nOnly ``country_code=\"DE\"`` is supported; passing any other value raises\n``ValueError``.\n\n## Parameters {.doc-section .doc-section-parameters}\n\n| Name | Type | Description | Default |\n|------------------|-----------------------------------------------------------------------------------------|---------------------------------------------------------------------------------------------------------|------------|\n| data | [pd](`pandas`).[DataFrame](`pandas.DataFrame`) | Reference time series DataFrame used for temporal coverage validation inside `curate_holidays()`. | _required_ |\n| start | [Union](`typing.Union`)\\[[str](`str`), [pd](`pandas`).[Timestamp](`pandas.Timestamp`)\\] | Start timestamp. String values are parsed with ``utc=True``. | _required_ |\n| cov_end | [Union](`typing.Union`)\\[[str](`str`), [pd](`pandas`).[Timestamp](`pandas.Timestamp`)\\] | Inclusive end timestamp covering the full forecast horizon. String values are parsed with ``utc=True``. | _required_ |\n| forecast_horizon | [int](`int`) | Number of forecast steps ahead; passed to `curate_holidays()`. | _required_ |\n| tz | [str](`str`) | Timezone applied to the generated index. Defaults to ``\"UTC\"``. | `'UTC'` |\n| freq | [str](`str`) | Pandas-compatible frequency string. Defaults to ``\"h\"``. | `'h'` |\n| country_code | [str](`str`) | Must be ``\"DE\"``. Any other value raises ``ValueError``. | `'DE'` |\n| state | [str](`str`) | ISO 3166-2 subdivision short code for the Bundesland. Defaults to ``\"NW\"`` (North Rhine-Westphalia). 
| `'NW'` |\n\n## Returns {.doc-section .doc-section-returns}\n\n| Name | Type | Description |\n|--------|------------------------------------------------|-----------------------------------------------------------------------|\n| | [pd](`pandas`).[DataFrame](`pandas.DataFrame`) | pd.DataFrame: Single integer column ``is_school_holiday`` (values in |\n| | [pd](`pandas`).[DataFrame](`pandas.DataFrame`) | ``{0, 1}``; no NaNs). The index is a tz-aware `DatetimeIndex` with |\n| | [pd](`pandas`).[DataFrame](`pandas.DataFrame`) | the requested *freq* and shape ``(len(data) + forecast_horizon, 1)``. |\n\n## Raises {.doc-section .doc-section-raises}\n\n| Name | Type | Description |\n|--------|----------------------------|-------------------------------------------------------------------------------------------------------------------------------------|\n| | [ValueError](`ValueError`) | If *country_code* is not ``\"DE\"``, or if the requested span extends beyond the dataset validity range ``[2022-01-01, 2027-12-31]``. 
|\n\n## Examples {.doc-section .doc-section-examples}\n\n\n::: {#313d9f34 .cell execution_count=1}\n``` {.python .cell-code}\nimport pandas as pd\nfrom spotforecast2_safe.calendar import get_school_holiday_features\n\nforecast_horizon = 24\nn_data = 48\ndata = pd.DataFrame(\n {\"load\": range(n_data)},\n index=pd.date_range(\"2024-07-06\", periods=n_data, freq=\"h\", tz=\"UTC\"),\n)\nstart = data.index[0]\ncov_end = start + pd.Timedelta(hours=(n_data + forecast_horizon - 1))\n\nfeats = get_school_holiday_features(\n data=data,\n start=start,\n cov_end=cov_end,\n forecast_horizon=forecast_horizon,\n state=\"NW\",\n)\nprint(\"shape:\", feats.shape)\nprint(\"columns:\", feats.columns.tolist())\n# NW Sommerferien 2024: 2024-07-08 is a school holiday (is_school_holiday=1).\nprint(\"2024-07-07 00:00 UTC:\", feats.loc[\"2024-07-07 00:00:00+00:00\", \"is_school_holiday\"])\nprint(\"2024-07-08 00:00 UTC:\", feats.loc[\"2024-07-08 00:00:00+00:00\", \"is_school_holiday\"])\nassert feats.shape == (n_data + forecast_horizon, 1)\nassert feats.loc[\"2024-07-07 00:00:00+00:00\", \"is_school_holiday\"] == 0\nassert feats.loc[\"2024-07-08 00:00:00+00:00\", \"is_school_holiday\"] == 1\n```\n\n::: {.cell-output .cell-output-stdout}\n```\nshape: (72, 1)\ncolumns: ['is_school_holiday']\n2024-07-07 00:00 UTC: 0\n2024-07-08 00:00 UTC: 1\n```\n:::\n:::\n\n\n", + "supporting": [ + "calendar.holiday.get_school_holiday_features_files" + ], + "filters": [], + "includes": {} + } +} \ No newline at end of file diff --git a/_freeze/docs/reference/configurator.config_entsoe.ConfigEntsoe/execute-results/html.json b/_freeze/docs/reference/configurator.config_entsoe.ConfigEntsoe/execute-results/html.json index 146cadeae..32585e82e 100644 --- a/_freeze/docs/reference/configurator.config_entsoe.ConfigEntsoe/execute-results/html.json +++ b/_freeze/docs/reference/configurator.config_entsoe.ConfigEntsoe/execute-results/html.json @@ -1,8 +1,8 @@ { - "hash": "2fbb360eae25c462bcf0a312ddd5429d", + "hash": 
"47a9b1ca24dce8da35870ad9813f4ad7", "result": { "engine": "jupyter", - "markdown": "---\ntitle: configurator.config_entsoe.ConfigEntsoe\n---\n\n\n\n```python\nconfigurator.config_entsoe.ConfigEntsoe(\n country_code='DE',\n periods=default_periods(),\n lags_consider=(lambda: list(range(1, 24)))(),\n train_size=(lambda: pd.Timedelta(days=(3 * 365)))(),\n end_train_default='2025-12-31 00:00+00:00',\n delta_val=(lambda: pd.Timedelta(hours=(24 * 7 * 10)))(),\n predict_size=24,\n cv_block_size=None,\n refit_size=7,\n random_state=314159,\n n_hyperparameters_trials=20,\n data_filename='interim/energy_load.csv',\n targets=None,\n use_outlier_detection=True,\n contamination=0.01,\n imputation_method='weighted',\n window_size=72,\n imputation_window_size=None,\n use_exogenous_features=True,\n latitude=51.5136,\n longitude=7.4653,\n timezone='UTC',\n state='NW',\n include_weather_windows=False,\n include_holiday_features=False,\n include_holiday_adjacency_features=False,\n use_population_weighted_weather=False,\n include_degree_hours=False,\n include_apparent_temperature=False,\n degree_hours_base_heating=15.0,\n degree_hours_base_cooling=22.0,\n include_ephemeris_features=False,\n include_day_type_features=False,\n poly_features_degree=1,\n max_poly_features=10,\n poly_mi_n_jobs=-1,\n poly_mi_sample_size=4000,\n include_covid_infection_rate=False,\n include_entsoe_forecast_load=False,\n include_entsoe_renewable_forecast=False,\n include_entsoe_net_load=False,\n include_entsoe_day_ahead_price=False,\n index_name='Time (UTC)',\n bounds=None,\n verbose=False,\n cache_home=None,\n n_trials_optuna=15,\n n_trials_spotoptim=10,\n n_initial_spotoptim=5,\n max_time_spotoptim=None,\n warm_start_lags=(lambda: list(DEFAULT_WARM_START_LAGS))(),\n task='lazy',\n agg_weights=None,\n forecaster_factory=None,\n data_loader=None,\n test_data_loader=None,\n auto_save_models=True,\n data_frame_name='default',\n number_folds=10,\n on_weather_failure='raise',\n on_exog_provider_failure='raise',\n 
exog_max_gap_hours=0,\n exog_max_tail_gap_hours=0,\n exog_provider_window='full',\n target_qc_range_mw=None,\n target_qc_step_mw=None,\n target_qc_window_days=None,\n target_corruption_policy='abort',\n target_max_heal_hours=0,\n target_anchor_zone_hours=168,\n target_qc_deviation_mw=None,\n target_qc_deviation_ref=None,\n target_qc_deviation_slots=2,\n retrain_max_age=(lambda: pd.Timedelta(days=7))(),\n)\n```\n\nConfiguration for the ENTSO-E forecasting pipeline.\n\nSingle-target counterpart to `ConfigMulti`, used by the ENTSO-E CLI\n(``spotforecast2.tasks.task_entsoe``) and any single-target pipeline routed\nthrough ``spotforecast2.multitask.runner.run(config_cls=ConfigEntsoe)``.\n\n``ConfigEntsoe`` **inherits every field and method of `ConfigMulti`** — so any\nfeature flag added to ``ConfigMulti`` is available here automatically (this\nis what closes the historical feature-flag parity gap structurally, rather\nthan via a hand-maintained mirror). It differs from ``ConfigMulti`` in\nexactly two ways:\n\n- ``index_name`` defaults to ``\"Time (UTC)\"`` (the ENTSO-E CSV time column)\n instead of ``\"DateTime\"``.\n- it adds ``retrain_max_age`` — the maximum age of a previously trained model\n before retraining is required (consumed by\n `spotforecast2_safe.manager.trainer.should_retrain`).\n\nSee `ConfigMulti` for the full field reference (training/validation windows,\nfeature toggles, exogenous-provider flags, target-corruption knobs, …).\n\n## Parameters {.doc-section .doc-section-parameters}\n\n| Name | Type | Description | Default |\n|-----------------|------------------------------------------------|--------------------------------------------------------------------------------------------------|------------------------------------|\n| index_name | [str](`str`) | Datetime column name used when resetting the index. Defaults to ``\"Time (UTC)\"``. 
| `'Time (UTC)'` |\n| retrain_max_age | [pd](`pandas`).[Timedelta](`pandas.Timedelta`) | Maximum age of a trained model before a retrain is forced. Defaults to ``pd.Timedelta(days=7)``. | `(lambda: pd.Timedelta(days=7))()` |\n\n## Examples {.doc-section .doc-section-examples}\n\n\n::: {#765bb827 .cell execution_count=1}\n``` {.python .cell-code}\nimport pandas as pd\nfrom spotforecast2_safe.configurator.config_entsoe import ConfigEntsoe\nfrom spotforecast2_safe.configurator.config_multi import ConfigMulti\n\nconfig = ConfigEntsoe(country_code=\"DE\")\n# ENTSO-E-specific defaults:\nprint(\"index_name:\", config.index_name)\nprint(\"retrain_max_age:\", config.retrain_max_age)\nassert config.index_name == \"Time (UTC)\"\nassert config.retrain_max_age == pd.Timedelta(days=7)\n\n# Inherits the full ConfigMulti surface, incl. the opt-in feature flags:\nassert isinstance(config, ConfigMulti)\nconfig = ConfigEntsoe(\n include_ephemeris_features=True,\n include_day_type_features=True,\n include_degree_hours=True,\n)\nprint(\"ephemeris:\", config.include_ephemeris_features)\nprint(\"predict_size:\", config.predict_size)\n```\n\n::: {.cell-output .cell-output-stdout}\n```\nindex_name: Time (UTC)\nretrain_max_age: 7 days 00:00:00\nephemeris: True\npredict_size: 24\n```\n:::\n:::\n\n\n", + "markdown": "---\ntitle: configurator.config_entsoe.ConfigEntsoe\n---\n\n\n\n```python\nconfigurator.config_entsoe.ConfigEntsoe(\n country_code='DE',\n periods=default_periods(),\n lags_consider=(lambda: list(range(1, 24)))(),\n train_size=(lambda: pd.Timedelta(days=(3 * 365)))(),\n end_train_default='2025-12-31 00:00+00:00',\n delta_val=(lambda: pd.Timedelta(hours=(24 * 7 * 10)))(),\n predict_size=24,\n cv_block_size=None,\n refit_size=7,\n random_state=314159,\n n_hyperparameters_trials=20,\n data_filename='interim/energy_load.csv',\n targets=None,\n use_outlier_detection=True,\n contamination=0.01,\n imputation_method='weighted',\n window_size=72,\n imputation_window_size=None,\n 
use_exogenous_features=True,\n latitude=51.5136,\n longitude=7.4653,\n timezone='UTC',\n state='NW',\n include_weather_windows=False,\n include_holiday_features=False,\n include_holiday_adjacency_features=False,\n use_population_weighted_weather=False,\n include_degree_hours=False,\n include_apparent_temperature=False,\n degree_hours_base_heating=15.0,\n degree_hours_base_cooling=22.0,\n include_ephemeris_features=False,\n include_day_type_features=False,\n include_school_holiday_features=False,\n poly_features_degree=1,\n max_poly_features=10,\n poly_mi_n_jobs=-1,\n poly_mi_sample_size=4000,\n include_covid_infection_rate=False,\n include_entsoe_forecast_load=False,\n include_entsoe_renewable_forecast=False,\n include_entsoe_net_load=False,\n include_entsoe_day_ahead_price=False,\n index_name='Time (UTC)',\n bounds=None,\n verbose=False,\n cache_home=None,\n n_trials_optuna=15,\n n_trials_spotoptim=10,\n n_initial_spotoptim=5,\n max_time_spotoptim=None,\n warm_start_lags=(lambda: list(DEFAULT_WARM_START_LAGS))(),\n task='lazy',\n agg_weights=None,\n forecaster_factory=None,\n data_loader=None,\n test_data_loader=None,\n auto_save_models=True,\n data_frame_name='default',\n number_folds=10,\n on_weather_failure='raise',\n on_exog_provider_failure='raise',\n exog_max_gap_hours=0,\n exog_max_tail_gap_hours=0,\n exog_provider_window='full',\n target_qc_range_mw=None,\n target_qc_step_mw=None,\n target_qc_window_days=None,\n target_corruption_policy='abort',\n target_max_heal_hours=0,\n target_anchor_zone_hours=168,\n target_qc_deviation_mw=None,\n target_qc_deviation_ref=None,\n target_qc_deviation_slots=2,\n retrain_max_age=(lambda: pd.Timedelta(days=7))(),\n)\n```\n\nConfiguration for the ENTSO-E forecasting pipeline.\n\nSingle-target counterpart to `ConfigMulti`, used by the ENTSO-E CLI\n(``spotforecast2.tasks.task_entsoe``) and any single-target pipeline routed\nthrough ``spotforecast2.multitask.runner.run(config_cls=ConfigEntsoe)``.\n\n``ConfigEntsoe`` **inherits 
every field and method of `ConfigMulti`** — so any\nfeature flag added to ``ConfigMulti`` is available here automatically (this\nis what closes the historical feature-flag parity gap structurally, rather\nthan via a hand-maintained mirror). It differs from ``ConfigMulti`` in\nexactly two ways:\n\n- ``index_name`` defaults to ``\"Time (UTC)\"`` (the ENTSO-E CSV time column)\n instead of ``\"DateTime\"``.\n- it adds ``retrain_max_age`` — the maximum age of a previously trained model\n before retraining is required (consumed by\n `spotforecast2_safe.manager.trainer.should_retrain`).\n\nSee `ConfigMulti` for the full field reference (training/validation windows,\nfeature toggles, exogenous-provider flags, target-corruption knobs, …).\n\n## Parameters {.doc-section .doc-section-parameters}\n\n| Name | Type | Description | Default |\n|-----------------|------------------------------------------------|--------------------------------------------------------------------------------------------------|------------------------------------|\n| index_name | [str](`str`) | Datetime column name used when resetting the index. Defaults to ``\"Time (UTC)\"``. | `'Time (UTC)'` |\n| retrain_max_age | [pd](`pandas`).[Timedelta](`pandas.Timedelta`) | Maximum age of a trained model before a retrain is forced. Defaults to ``pd.Timedelta(days=7)``. | `(lambda: pd.Timedelta(days=7))()` |\n\n## Examples {.doc-section .doc-section-examples}\n\n\n::: {#2e85db0e .cell execution_count=1}\n``` {.python .cell-code}\nimport pandas as pd\nfrom spotforecast2_safe.configurator.config_entsoe import ConfigEntsoe\nfrom spotforecast2_safe.configurator.config_multi import ConfigMulti\n\nconfig = ConfigEntsoe(country_code=\"DE\")\n# ENTSO-E-specific defaults:\nprint(\"index_name:\", config.index_name)\nprint(\"retrain_max_age:\", config.retrain_max_age)\nassert config.index_name == \"Time (UTC)\"\nassert config.retrain_max_age == pd.Timedelta(days=7)\n\n# Inherits the full ConfigMulti surface, incl. 
the opt-in feature flags:\nassert isinstance(config, ConfigMulti)\nconfig = ConfigEntsoe(\n include_ephemeris_features=True,\n include_day_type_features=True,\n include_degree_hours=True,\n)\nprint(\"ephemeris:\", config.include_ephemeris_features)\nprint(\"predict_size:\", config.predict_size)\n```\n\n::: {.cell-output .cell-output-stdout}\n```\nindex_name: Time (UTC)\nretrain_max_age: 7 days 00:00:00\nephemeris: True\npredict_size: 24\n```\n:::\n:::\n\n\n", "supporting": [ "configurator.config_entsoe.ConfigEntsoe_files/figure-html" ], diff --git a/_freeze/docs/reference/configurator.config_multi.ConfigMulti/execute-results/html.json b/_freeze/docs/reference/configurator.config_multi.ConfigMulti/execute-results/html.json index b4feec4f9..48731e80d 100644 --- a/_freeze/docs/reference/configurator.config_multi.ConfigMulti/execute-results/html.json +++ b/_freeze/docs/reference/configurator.config_multi.ConfigMulti/execute-results/html.json @@ -1,8 +1,8 @@ { - "hash": "a479065f5131600e0487867356e527c2", + "hash": "0a579d03556e46270609ebf0a3718e9b", "result": { "engine": "jupyter", - "markdown": "---\ntitle: configurator.config_multi.ConfigMulti\n---\n\n\n\n```python\nconfigurator.config_multi.ConfigMulti(\n country_code='DE',\n periods=default_periods(),\n lags_consider=(lambda: list(range(1, 24)))(),\n train_size=(lambda: pd.Timedelta(days=(3 * 365)))(),\n end_train_default='2025-12-31 00:00+00:00',\n delta_val=(lambda: pd.Timedelta(hours=(24 * 7 * 10)))(),\n predict_size=24,\n cv_block_size=None,\n refit_size=7,\n random_state=314159,\n n_hyperparameters_trials=20,\n data_filename='interim/energy_load.csv',\n targets=None,\n use_outlier_detection=True,\n contamination=0.01,\n imputation_method='weighted',\n window_size=72,\n imputation_window_size=None,\n use_exogenous_features=True,\n latitude=51.5136,\n longitude=7.4653,\n timezone='UTC',\n state='NW',\n include_weather_windows=False,\n include_holiday_features=False,\n include_holiday_adjacency_features=False,\n 
use_population_weighted_weather=False,\n include_degree_hours=False,\n include_apparent_temperature=False,\n degree_hours_base_heating=15.0,\n degree_hours_base_cooling=22.0,\n include_ephemeris_features=False,\n include_day_type_features=False,\n poly_features_degree=1,\n max_poly_features=10,\n poly_mi_n_jobs=-1,\n poly_mi_sample_size=4000,\n include_covid_infection_rate=False,\n include_entsoe_forecast_load=False,\n include_entsoe_renewable_forecast=False,\n include_entsoe_net_load=False,\n include_entsoe_day_ahead_price=False,\n index_name='DateTime',\n bounds=None,\n verbose=False,\n cache_home=None,\n n_trials_optuna=15,\n n_trials_spotoptim=10,\n n_initial_spotoptim=5,\n max_time_spotoptim=None,\n warm_start_lags=(lambda: list(DEFAULT_WARM_START_LAGS))(),\n task='lazy',\n agg_weights=None,\n forecaster_factory=None,\n data_loader=None,\n test_data_loader=None,\n auto_save_models=True,\n data_frame_name='default',\n number_folds=10,\n on_weather_failure='raise',\n on_exog_provider_failure='raise',\n exog_max_gap_hours=0,\n exog_max_tail_gap_hours=0,\n exog_provider_window='full',\n target_qc_range_mw=None,\n target_qc_step_mw=None,\n target_qc_window_days=None,\n target_corruption_policy='abort',\n target_max_heal_hours=0,\n target_anchor_zone_hours=168,\n target_qc_deviation_mw=None,\n target_qc_deviation_ref=None,\n target_qc_deviation_slots=2,\n)\n```\n\nConfiguration for the multi-input forecasting pipeline.\n\nThis class manages all configuration parameters for the multi-input task,\nincluding training/prediction intervals, data sources, and feature\nengineering specifications. 
All parameters can be customized during\ninitialization or used with sensible defaults.\n\n``country_code`` serves as the single ISO country code used for both\nAPI queries and holiday feature generation.\n\n## Parameters {.doc-section .doc-section-parameters}\n\n| Name | Type | Description | Default |\n|------------------------------------|------------------------------------------------------------------------------------------------------|------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|-------------------------------------------------|\n| country_code | [str](`str`) | ISO 3166-1 alpha-2 country code (e.g. ``\"DE\"``). Used for both API queries and holiday feature generation. | `'DE'` |\n| periods | [Optional](`typing.Optional`)\\[[List](`typing.List`)\\[[Period](`spotforecast2_safe.data.Period`)\\]\\] | List of Period objects defining cyclical feature encodings. | `default_periods()` |\n| lags_consider | [Optional](`typing.Optional`)\\[[List](`typing.List`)\\[[int](`int`)\\]\\] | List of lag values to consider for feature selection. | `(lambda: list(range(1, 24)))()` |\n| train_size | [Optional](`typing.Optional`)\\[[pd](`pandas`).[Timedelta](`pandas.Timedelta`)\\] | Time window for training data. | `(lambda: pd.Timedelta(days=(3 * 365)))()` |\n| end_train_default | [str](`str`) | Default end date for training period (ISO format with timezone). 
| `'2025-12-31 00:00+00:00'` |\n| delta_val | [Optional](`typing.Optional`)\\[[pd](`pandas`).[Timedelta](`pandas.Timedelta`)\\] | Validation window size. | `(lambda: pd.Timedelta(hours=(24 * 7 * 10)))()` |\n| predict_size | [int](`int`) | Number of hours to predict ahead. | `24` |\n| cv_block_size | [int](`int`) \\| None | Cross-validation test-block width in hours. Defaults to ``None``, meaning the CV uses ``predict_size``. Set to a fixed value (e.g. ``24``) to decouple the cross-validation horizon from a render-dependent live ``predict_size``. | `None` |\n| refit_size | [int](`int`) | Number of days between model refits. | `7` |\n| random_state | [int](`int`) | Random seed for reproducibility. | `314159` |\n| n_hyperparameters_trials | [int](`int`) | Number of trials for hyperparameter optimization. | `20` |\n| data_filename | [str](`str`) | Path to the interim merged data file. | `'interim/energy_load.csv'` |\n| targets | [Optional](`typing.Optional`)\\[[List](`typing.List`)\\[[str](`str`)\\]\\] | List of target column names to train models for. When ``None`` (default), no targets are pre-selected; set this attribute after loading the dataset (e.g. ``config.targets = df.columns.tolist()``). Replaces standalone ``TARGETS`` and ``target_columns`` variables in pipeline scripts, providing a single source of truth for the active target set. | `None` |\n| use_outlier_detection | [bool](`bool`) | If True, apply IsolationForest-based outlier removal. | `True` |\n| contamination | [float](`float`) | Proportion of outliers for IsolationForest (0 < contamination < 0.5). | `0.01` |\n| imputation_method | [str](`str`) | Gap-filling strategy — ``\"weighted\"`` (n2n-style rolling weights) or ``\"linear\"`` (linear interpolation). | `'weighted'` |\n| window_size | [int](`int`) | Rolling window size in hours for gap detection (weighted imputation). | `72` |\n| use_exogenous_features | [bool](`bool`) | If True, build weather/calendar/day-night/holiday features. 
| `True` |\n| latitude | [float](`float`) | Latitude of the target location in decimal degrees. | `51.5136` |\n| longitude | [float](`float`) | Longitude of the target location in decimal degrees. | `7.4653` |\n| timezone | [str](`str`) | IANA timezone string for the target location (e.g. ``\"Europe/Berlin\"``). | `'UTC'` |\n| state | [str](`str`) | ISO 3166-2 subdivision code for regional holidays (e.g. ``\"NW\"``). | `'NW'` |\n| include_weather_windows | [bool](`bool`) | If True, include rolling weather-window features. | `False` |\n| include_holiday_features | [bool](`bool`) | If True, include public-holiday indicator features. | `False` |\n| include_holiday_adjacency_features | [bool](`bool`) | If True, include Brückentag and before/after-holiday indicators (``is_brueckentag``, ``is_before_holiday``, ``is_after_holiday``). Defaults to ``False``. | `False` |\n| poly_features_degree | [int](`int`) | Polynomial-interaction degree. ``1`` (default) generates no interactions; ``2`` adds pairwise bilinear terms; ``3+`` higher order. | `1` |\n| max_poly_features | [int](`int`) | Cap on polynomial interaction columns; only the top ``max_poly_features`` ranked by mutual information with the target are kept (``<= 0`` disables). Defaults to ``10``. | `10` |\n| poly_mi_n_jobs | [Optional](`typing.Optional`)\\[[int](`int`)\\] | Parallel jobs for the mutual-information ranking that enforces ``max_poly_features``. ``-1`` (default) uses all cores; ``None`` runs single-threaded. Parallelism does not change the selection. | `-1` |\n| poly_mi_sample_size | [Optional](`typing.Optional`)\\[[int](`int`)\\] | Row cap for that ranking; longer series are scored on a reproducible random subsample of this size (seeded by ``random_state``), which can change which borderline columns make the top K. ``None`` scores every row (the pre-15.8 behaviour). Defaults to ``4000``. | `4000` |\n| index_name | [str](`str`) | Name assigned to the datetime column when the index is reset. 
Defaults to ``\"DateTime\"``. | `'DateTime'` |\n| bounds | [Optional](`typing.Optional`)\\[[List](`typing.List`)\\[[tuple](`tuple`)\\]\\] | Per-column outlier bounds as a list of ``(lower, upper)`` tuples, one entry per target column. ``None`` until set. | `None` |\n| verbose | [bool](`bool`) | If ``True``, enable verbose output for pipeline steps. Defaults to ``False``. | `False` |\n| cache_home | [Optional](`typing.Optional`)\\[[Any](`typing.Any`)\\] | Path to the cache directory. ``None`` means the library default (``~/spotforecast2_cache/``) is used. | `None` |\n| n_trials_optuna | [int](`int`) | Number of Optuna Bayesian-search trials for hyperparameter optimization (task 3). Defaults to ``15``. | `15` |\n| n_trials_spotoptim | [int](`int`) | Number of SpotOptim surrogate-search trials (task 4). Defaults to ``10``. | `10` |\n| n_initial_spotoptim | [int](`int`) | Number of initial random evaluations for SpotOptim (task 4). Defaults to ``5``. | `5` |\n| max_time_spotoptim | [Optional](`typing.Optional`)\\[[float](`float`)\\] | Wall-clock budget for the SpotOptim search in minutes (task 4). The search stops when either ``n_trials_spotoptim`` evaluations or this time limit is reached, whichever comes first. ``None`` (the default) disables the limit. | `None` |\n| warm_start_lags | [Optional](`typing.Optional`)\\[[List](`typing.List`)\\[[int](`int`)\\]\\] | Lag set the SpotOptim task injects as a search-space candidate and uses to seed the optimizer's first evaluation. Defaults to ``DEFAULT_WARM_START_LAGS`` (``[1, 2, 3, 23, 24, 25, 47, 48, 167, 168, 169, 336]``). ``None`` or an empty list disables the warm start. | `(lambda: list(DEFAULT_WARM_START_LAGS))()` |\n| task | [str](`str`) | Active prediction task — one of ``\"lazy\"``, ``\"training\"``, ``\"optuna\"``, or ``\"spotoptim\"``. Defaults to ``\"lazy\"``. 
| `'lazy'` |\n| agg_weights | [Optional](`typing.Optional`)\\[[List](`typing.List`)\\[[float](`float`)\\]\\] | Per-target aggregation weights used when combining individual target forecasts into a single weighted sum. The list must contain one weight per entry in ``targets`` (in the same order). Positive values add the target's contribution; negative values invert it. Slice the list to ``agg_weights[:len(targets)]`` when only a subset of targets is active. Defaults to ``None`` (no weights pre-defined; set after loading the dataset). | `None` |\n| auto_save_models | [bool](`bool`) | Whether ``BaseTask._run_strategy`` should persist fitted forecasters to ``/models/`` after every training run. Defaults to ``True`` so that saved models are immediately available for ``PredictTask`` without an explicit ``save_models()`` call. | `True` |\n| data_frame_name | [str](`str`) | Identifier for the active dataset. Used by ``BaseTask`` to name cache subdirectories, model files, and the per-dataset log file. Defaults to ``\"default\"``. | `'default'` |\n| on_weather_failure | [Literal](`typing.Literal`)\\[\\'raise\\', \\'skip\\'\\] | Policy for handling Open-Meteo fetch failures inside ``BaseTask.build_exogenous_features``. ``\"raise\"`` (default) aborts the pipeline with a ``WeatherFetchError`` and preserves the safety-critical fail-safe semantics. ``\"skip\"`` logs a warning and continues with empty weather features so the rest of the pipeline can run without the Open-Meteo dependency. | `'raise'` |\n| exog_max_gap_hours | [int](`int`) | Maximum length, in hours, of a contiguous run of missing exogenous-provider values healed before the provider is rejected. Interior gaps are time-interpolated; leading/trailing edge gaps are back-/forward-filled. ``0`` (default) keeps the strict fail-safe (any gap raises). Healed runs are logged with count and span. Only already-published day-ahead vintages are involved, so healing is leakage-clean (CR-3). 
| `0` |\n| exog_max_tail_gap_hours | [int](`int`) | Extended healing budget, in hours, applied exclusively to the trailing-edge NaN run (the run containing the last index timestamp). The effective tail budget is ``max(exog_max_gap_hours, exog_max_tail_gap_hours)``. The canonical use case is the ENTSO-E day-ahead publication frontier: the last published vintage is zero-order-held forward to the forecast horizon without touching interior gaps (CR-3-clean). When ``exog_max_tail_gap_hours <= exog_max_gap_hours`` the parameter is inert (the interior budget already covers the tail) and a warning is logged. Defaults to ``0``. | `0` |\n| exog_provider_window | [Literal](`typing.Literal`)\\[\\'full\\', \\'train\\'\\] | Span the exogenous providers are validated against. ``\"full\"`` (default) requires coverage of the entire ``data_start``→``cov_end`` request, matching prior behaviour. ``\"train\"`` validates only the consumed window ``[start_train_ts, cov_end]``, tolerating missing values before the training window. Honoured by the MultiTask pipeline; the forecaster-wrapper path currently always validates the full span. | `'full'` |\n\n## Attributes {.doc-section .doc-section-attributes}\n\n| Name | Type | Description |\n|------------------------------------|----------------------------------------------------------------------------|------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|\n| country_code | [str](`str`) | ISO country code for API queries and holiday generation. |\n| periods | [List](`typing.List`)\\[[Period](`spotforecast2_safe.data.Period`)\\] | Cyclical feature encoding specifications. |\n| lags_consider | [List](`typing.List`)\\[[int](`int`)\\] | Lag values for autoregressive features. |\n| train_size | [pd](`pandas`).[Timedelta](`pandas.Timedelta`) | Training data window. 
|\n| end_train_default | [str](`str`) | Default training end date. |\n| delta_val | [pd](`pandas`).[Timedelta](`pandas.Timedelta`) | Validation window. |\n| predict_size | [int](`int`) | Prediction horizon in hours. |\n| refit_size | [int](`int`) | Refit interval in days. |\n| random_state | [int](`int`) | Random seed. |\n| n_hyperparameters_trials | [int](`int`) | Hyperparameter tuning trials. |\n| targets | [Optional](`typing.Optional`)\\[[List](`typing.List`)\\[[str](`str`)\\]\\] | Active target column names. ``None`` until explicitly set from the loaded dataset. |\n| use_outlier_detection | [bool](`bool`) | IsolationForest outlier removal toggle. |\n| contamination | [float](`float`) | IsolationForest contamination fraction. |\n| imputation_method | [str](`str`) | Gap-filling strategy (``\"weighted\"`` or ``\"linear\"``). |\n| window_size | [int](`int`) | Rolling window size for weighted imputation. |\n| use_exogenous_features | [bool](`bool`) | Exogenous feature construction toggle. |\n| latitude | [float](`float`) | Location latitude. |\n| longitude | [float](`float`) | Location longitude. |\n| timezone | [str](`str`) | IANA timezone string. |\n| state | [str](`str`) | Subdivision code for regional holidays. |\n| include_weather_windows | [bool](`bool`) | Weather-window feature toggle. |\n| include_holiday_features | [bool](`bool`) | Holiday feature toggle. |\n| include_holiday_adjacency_features | [bool](`bool`) | Brückentag and before/after-holiday indicator toggle. Defaults to ``False``. |\n| poly_features_degree | [int](`int`) | Polynomial-interaction degree (1 = off). |\n| max_poly_features | [int](`int`) | Cap on kept ``poly_*`` columns (top-K by MI). |\n| poly_mi_n_jobs | [Optional](`typing.Optional`)\\[[int](`int`)\\] | Parallel jobs for the MI ranking (``-1`` = all cores; selection-invariant). |\n| poly_mi_sample_size | [Optional](`typing.Optional`)\\[[int](`int`)\\] | Row cap for the MI ranking (``None`` = score every row). 
|\n| include_covid_infection_rate | [bool](`bool`) | Append the bundled RKI German national COVID-19 7-day incidence as an exogenous regressor. |\n| include_entsoe_forecast_load | [bool](`bool`) | Append the ENTSO-E day-ahead Forecasted Load as a near-oracle exogenous prior. |\n| include_entsoe_renewable_forecast | [bool](`bool`) | Append the ENTSO-E day-ahead wind/solar generation forecast. |\n| include_entsoe_net_load | [bool](`bool`) | Append the ENTSO-E day-ahead net load (Forecasted Load minus wind/solar forecast). |\n| include_entsoe_day_ahead_price | [bool](`bool`) | Append the ENTSO-E day-ahead spot price (DE/LU). |\n| index_name | [str](`str`) | Datetime column name used when resetting the index. |\n| bounds | [Optional](`typing.Optional`)\\[[List](`typing.List`)\\[[tuple](`tuple`)\\]\\] | Per-column outlier bounds ``(lower, upper)``. |\n| verbose | [bool](`bool`) | Verbose output toggle. |\n| cache_home | [Optional](`typing.Optional`)\\[[Any](`typing.Any`)\\] | Path to the cache directory. |\n| n_trials_optuna | [int](`int`) | Number of Optuna hyperparameter-search trials. |\n| n_trials_spotoptim | [int](`int`) | Number of SpotOptim search trials. |\n| n_initial_spotoptim | [int](`int`) | Number of initial SpotOptim evaluations. |\n| max_time_spotoptim | [Optional](`typing.Optional`)\\[[float](`float`)\\] | Wall-clock budget for the SpotOptim search in minutes; ``None`` disables the limit. |\n| warm_start_lags | [Optional](`typing.Optional`)\\[[List](`typing.List`)\\[[int](`int`)\\]\\] | Seed lag set for the SpotOptim search; ``None`` or empty disables the warm start. |\n| task | [str](`str`) | Active prediction task (``\"lazy\"``, ``\"training\"``, ``\"optuna\"``, or ``\"spotoptim\"``). |\n| agg_weights | [Optional](`typing.Optional`)\\[[List](`typing.List`)\\[[float](`float`)\\]\\] | Per-target aggregation weights. One weight per entry in ``targets``; positive values add, negative values invert the target's contribution. ``None`` until set. 
|\n| auto_save_models | [bool](`bool`) | Whether to auto-persist fitted forecasters after each training run. |\n| data_frame_name | [str](`str`) | Active-dataset identifier used for cache and log-file naming. |\n| number_folds | [int](`int`) | Cross-validation fold count for tuning tasks. |\n| on_weather_failure | [Literal](`typing.Literal`)\\[\\'raise\\', \\'skip\\'\\] | Open-Meteo fetch-failure policy: ``\"raise\"`` aborts, ``\"skip\"`` continues without weather. |\n| on_exog_provider_failure | [Literal](`typing.Literal`)\\[\\'raise\\', \\'skip\\'\\] | Exog-provider failure policy in ``ExogBuilder.build``: ``\"raise\"`` (default) propagates the ``ExogProviderError``; ``\"skip\"`` logs and omits the failing provider's columns. |\n| exog_max_gap_hours | [int](`int`) | Maximum contiguous gap in hours that providers will heal before raising (0 = strict fail-safe). |\n| exog_provider_window | [Literal](`typing.Literal`)\\[\\'full\\', \\'train\\'\\] | Validation window for exog providers: ``\"full\"`` (default) or ``\"train\"``. 
|\n\n## Notes {.doc-section .doc-section-notes}\n\nThe default period configurations use specific `n_periods` to balance resolution and smoothing:\n- **Daily**: `n_periods=12` (24h) provides ~2h resolution, smoothing hourly noise and halving dimensionality.\n- **Weekly**: `n_periods` typically matches range (1:1) to distinguish day-of-week patterns.\n- **Yearly**: `n_periods=12` (365d) provides ~1 month resolution, capturing broad seasonal trends without overfitting.\n\nSee `docs/PERIOD_CONFIGURATION_RATIONALE.md` for a detailed analysis.\n\n## Examples {.doc-section .doc-section-examples}\n\n\n::: {#a936b1a9 .cell execution_count=1}\n``` {.python .cell-code}\nimport pandas as pd\nfrom spotforecast2_safe.configurator.config_multi import ConfigMulti\nconfig = ConfigMulti()\nprint(f\"country_code: {config.country_code}\")\nprint(f\"Predict size: {config.predict_size}\")\nprint(f\"Random state: {config.random_state}\")\nprint(f\"Targets (default): {config.targets}\")\nprint(f\"agg_weights (default): {config.agg_weights}\")\nprint(f\"index_name: {config.index_name}\")\nprint(f\"bounds: {config.bounds}\")\n\n# Set targets and bounds (user input that stays on the config)\nconfig.targets = [\"A\", \"B\", \"C\"]\nconfig.bounds = [(-2500, 4500), (-10, 3000)]\nprint(f\"Targets (after setting): {config.targets}\")\nprint(f\"bounds: {config.bounds}\")\n\n# Create custom configuration — country_code serves both API and holiday purposes\ncustom_config = ConfigMulti(\n country_code='FR',\n predict_size=48,\n random_state=42,\n targets=[\"A\", \"B\"],\n index_name=\"DateTime\",\n)\nprint(f\"country_code: {custom_config.country_code}\")\nprint(f\"Predict size: {custom_config.predict_size}\")\nprint(f\"Random state: {custom_config.random_state}\")\nprint(f\"Targets: {custom_config.targets}\")\n\n# Verify training window\nprint(f\"Training window: {config.train_size == pd.Timedelta(days=3 * 365)}\")\n\n# Check default periods\nprint(f\"Number of periods: 
{len(config.periods)}\")\nprint(f\"First period name: {config.periods[0].name}\")\n```\n\n::: {.cell-output .cell-output-stdout}\n```\ncountry_code: DE\nPredict size: 24\nRandom state: 314159\nTargets (default): None\nagg_weights (default): None\nindex_name: DateTime\nbounds: None\nTargets (after setting): ['A', 'B', 'C']\nbounds: [(-2500, 4500), (-10, 3000)]\ncountry_code: FR\nPredict size: 48\nRandom state: 42\nTargets: ['A', 'B']\nTraining window: True\nNumber of periods: 5\nFirst period name: daily\n```\n:::\n:::\n\n\n## Methods\n\n| Name | Description |\n| --- | --- |\n| [get_params](#spotforecast2_safe.configurator.config_multi.ConfigMulti.get_params) | Get parameters for this configuration object. |\n| [set_params](#spotforecast2_safe.configurator.config_multi.ConfigMulti.set_params) | Set the parameters of this configuration object. |\n\n### get_params { #spotforecast2_safe.configurator.config_multi.ConfigMulti.get_params }\n\n```python\nconfigurator.config_multi.ConfigMulti.get_params(deep=True)\n```\n\nGet parameters for this configuration object.\n\n#### Parameters {.doc-section .doc-section-parameters}\n\n| Name | Type | Description | Default |\n|--------|----------------|-----------------------------------------------------------------------------------------------------------|-----------|\n| deep | [bool](`bool`) | If True, will return the parameters for this configuration and contained sub-objects that are estimators. | `True` |\n\n#### Returns {.doc-section .doc-section-returns}\n\n| Name | Type | Description |\n|--------|-----------------------------------------------------------|-------------------------------------------------------|\n| params | [Dict](`typing.Dict`)\\[[str](`str`), [object](`object`)\\] | Dictionary of parameter names mapped to their values. 
|\n\n#### Examples {.doc-section .doc-section-examples}\n\n::: {#1e690578 .cell execution_count=2}\n``` {.python .cell-code}\nfrom spotforecast2_safe.configurator.config_multi import ConfigMulti\nconfig = ConfigMulti(country_code=\"FR\")\np = config.get_params()\nprint(f\"country_code: {p['country_code']}\")\nprint(f\"Predict size: {p['predict_size']}\")\nprint(f\"Random state: {p['random_state']}\")\nprint(f\"index_name: {p['index_name']}\")\nprint(f\"bounds: {p['bounds']}\")\nprint(f\"agg_weights: {p['agg_weights']}\")\n```\n\n::: {.cell-output .cell-output-stdout}\n```\ncountry_code: FR\nPredict size: 24\nRandom state: 314159\nindex_name: DateTime\nbounds: None\nagg_weights: None\n```\n:::\n:::\n\n\n### set_params { #spotforecast2_safe.configurator.config_multi.ConfigMulti.set_params }\n\n```python\nconfigurator.config_multi.ConfigMulti.set_params(params=None, **kwargs)\n```\n\nSet the parameters of this configuration object.\n\n#### Parameters {.doc-section .doc-section-parameters}\n\n| Name | Type | Description | Default |\n|----------|-----------------------------------------------------------|-------------------------------------------------------------------------------------------------------------------------------------------------------|-----------|\n| params | [Dict](`typing.Dict`)\\[[str](`str`), [object](`object`)\\] | Optional dictionary of parameter names mapped to their new values. | `None` |\n| **kwargs | [object](`object`) | Additional parameter names mapped to their new values. It supports configuring nested 'Period' objects using the `periods____` notation. 
| `{}` |\n\n#### Returns {.doc-section .doc-section-returns}\n\n| Name | Type | Description |\n|-------------|---------------------------------------------------------------------------|--------------------------------------------------------------------------------|\n| ConfigMulti | [ConfigMulti](`spotforecast2_safe.configurator.config_multi.ConfigMulti`) | The configuration instance with updated parameters (supports method chaining). |\n\n#### Examples {.doc-section .doc-section-examples}\n\n::: {#dd9e8c82 .cell execution_count=3}\n``` {.python .cell-code}\nfrom spotforecast2_safe.configurator.config_multi import ConfigMulti\nconfig = ConfigMulti()\n_ = config.set_params(country_code=\"FR\", predict_size=48)\nprint(f\"country_code: {config.country_code}\")\nprint(f\"Predict size: {config.predict_size}\")\nprint(f\"Random state: {config.random_state}\")\n\n# Deep parameter setting\n_ = config.set_params(periods__daily__n_periods=24)\nprint(next(p.n_periods for p in config.periods if p.name == \"daily\"))\n```\n\n::: {.cell-output .cell-output-stdout}\n```\ncountry_code: FR\nPredict size: 48\nRandom state: 314159\n24\n```\n:::\n:::\n\n\n", + "markdown": "---\ntitle: configurator.config_multi.ConfigMulti\n---\n\n\n\n```python\nconfigurator.config_multi.ConfigMulti(\n country_code='DE',\n periods=default_periods(),\n lags_consider=(lambda: list(range(1, 24)))(),\n train_size=(lambda: pd.Timedelta(days=(3 * 365)))(),\n end_train_default='2025-12-31 00:00+00:00',\n delta_val=(lambda: pd.Timedelta(hours=(24 * 7 * 10)))(),\n predict_size=24,\n cv_block_size=None,\n refit_size=7,\n random_state=314159,\n n_hyperparameters_trials=20,\n data_filename='interim/energy_load.csv',\n targets=None,\n use_outlier_detection=True,\n contamination=0.01,\n imputation_method='weighted',\n window_size=72,\n imputation_window_size=None,\n use_exogenous_features=True,\n latitude=51.5136,\n longitude=7.4653,\n timezone='UTC',\n state='NW',\n include_weather_windows=False,\n 
include_holiday_features=False,\n include_holiday_adjacency_features=False,\n use_population_weighted_weather=False,\n include_degree_hours=False,\n include_apparent_temperature=False,\n degree_hours_base_heating=15.0,\n degree_hours_base_cooling=22.0,\n include_ephemeris_features=False,\n include_day_type_features=False,\n include_school_holiday_features=False,\n poly_features_degree=1,\n max_poly_features=10,\n poly_mi_n_jobs=-1,\n poly_mi_sample_size=4000,\n include_covid_infection_rate=False,\n include_entsoe_forecast_load=False,\n include_entsoe_renewable_forecast=False,\n include_entsoe_net_load=False,\n include_entsoe_day_ahead_price=False,\n index_name='DateTime',\n bounds=None,\n verbose=False,\n cache_home=None,\n n_trials_optuna=15,\n n_trials_spotoptim=10,\n n_initial_spotoptim=5,\n max_time_spotoptim=None,\n warm_start_lags=(lambda: list(DEFAULT_WARM_START_LAGS))(),\n task='lazy',\n agg_weights=None,\n forecaster_factory=None,\n data_loader=None,\n test_data_loader=None,\n auto_save_models=True,\n data_frame_name='default',\n number_folds=10,\n on_weather_failure='raise',\n on_exog_provider_failure='raise',\n exog_max_gap_hours=0,\n exog_max_tail_gap_hours=0,\n exog_provider_window='full',\n target_qc_range_mw=None,\n target_qc_step_mw=None,\n target_qc_window_days=None,\n target_corruption_policy='abort',\n target_max_heal_hours=0,\n target_anchor_zone_hours=168,\n target_qc_deviation_mw=None,\n target_qc_deviation_ref=None,\n target_qc_deviation_slots=2,\n)\n```\n\nConfiguration for the multi-input forecasting pipeline.\n\nThis class manages all configuration parameters for the multi-input task,\nincluding training/prediction intervals, data sources, and feature\nengineering specifications. 
All parameters can be customized during\ninitialization or used with sensible defaults.\n\n``country_code`` serves as the single ISO country code used for both\nAPI queries and holiday feature generation.\n\n## Parameters {.doc-section .doc-section-parameters}\n\n| Name | Type | Description | Default |\n|------------------------------------|------------------------------------------------------------------------------------------------------|------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|-------------------------------------------------|\n| country_code | [str](`str`) | ISO 3166-1 alpha-2 country code (e.g. ``\"DE\"``). Used for both API queries and holiday feature generation. | `'DE'` |\n| periods | [Optional](`typing.Optional`)\\[[List](`typing.List`)\\[[Period](`spotforecast2_safe.data.Period`)\\]\\] | List of Period objects defining cyclical feature encodings. | `default_periods()` |\n| lags_consider | [Optional](`typing.Optional`)\\[[List](`typing.List`)\\[[int](`int`)\\]\\] | List of lag values to consider for feature selection. | `(lambda: list(range(1, 24)))()` |\n| train_size | [Optional](`typing.Optional`)\\[[pd](`pandas`).[Timedelta](`pandas.Timedelta`)\\] | Time window for training data. | `(lambda: pd.Timedelta(days=(3 * 365)))()` |\n| end_train_default | [str](`str`) | Default end date for training period (ISO format with timezone). 
| `'2025-12-31 00:00+00:00'` |\n| delta_val | [Optional](`typing.Optional`)\\[[pd](`pandas`).[Timedelta](`pandas.Timedelta`)\\] | Validation window size. | `(lambda: pd.Timedelta(hours=(24 * 7 * 10)))()` |\n| predict_size | [int](`int`) | Number of hours to predict ahead. | `24` |\n| cv_block_size | [int](`int`) \\| None | Cross-validation test-block width in hours. Defaults to ``None``, meaning the CV uses ``predict_size``. Set to a fixed value (e.g. ``24``) to decouple the cross-validation horizon from a render-dependent live ``predict_size``. | `None` |\n| refit_size | [int](`int`) | Number of days between model refits. | `7` |\n| random_state | [int](`int`) | Random seed for reproducibility. | `314159` |\n| n_hyperparameters_trials | [int](`int`) | Number of trials for hyperparameter optimization. | `20` |\n| data_filename | [str](`str`) | Path to the interim merged data file. | `'interim/energy_load.csv'` |\n| targets | [Optional](`typing.Optional`)\\[[List](`typing.List`)\\[[str](`str`)\\]\\] | List of target column names to train models for. When ``None`` (default), no targets are pre-selected; set this attribute after loading the dataset (e.g. ``config.targets = df.columns.tolist()``). Replaces standalone ``TARGETS`` and ``target_columns`` variables in pipeline scripts, providing a single source of truth for the active target set. | `None` |\n| use_outlier_detection | [bool](`bool`) | If True, apply IsolationForest-based outlier removal. | `True` |\n| contamination | [float](`float`) | Proportion of outliers for IsolationForest (0 < contamination < 0.5). | `0.01` |\n| imputation_method | [str](`str`) | Gap-filling strategy — ``\"weighted\"`` (n2n-style rolling weights) or ``\"linear\"`` (linear interpolation). | `'weighted'` |\n| window_size | [int](`int`) | Rolling window size in hours for gap detection (weighted imputation). | `72` |\n| use_exogenous_features | [bool](`bool`) | If True, build weather/calendar/day-night/holiday features. 
| `True` |\n| latitude | [float](`float`) | Latitude of the target location in decimal degrees. | `51.5136` |\n| longitude | [float](`float`) | Longitude of the target location in decimal degrees. | `7.4653` |\n| timezone | [str](`str`) | IANA timezone string for the target location (e.g. ``\"Europe/Berlin\"``). | `'UTC'` |\n| state | [str](`str`) | ISO 3166-2 subdivision code for regional holidays (e.g. ``\"NW\"``). | `'NW'` |\n| include_weather_windows | [bool](`bool`) | If True, include rolling weather-window features. | `False` |\n| include_holiday_features | [bool](`bool`) | If True, include public-holiday indicator features. | `False` |\n| include_holiday_adjacency_features | [bool](`bool`) | If True, include Brückentag and before/after-holiday indicators (``is_brueckentag``, ``is_before_holiday``, ``is_after_holiday``). Defaults to ``False``. | `False` |\n| include_ephemeris_features | [bool](`bool`) | If True, include solar-elevation and daylight-duration features. Defaults to ``False``. | `False` |\n| include_day_type_features | [bool](`bool`) | If True, include working-day and day-type class features (``is_workday``, ``day_type``). Defaults to ``False``. | `False` |\n| include_school_holiday_features | [bool](`bool`) | Append the ``is_school_holiday`` binary indicator from the bundled OpenHolidays API dataset (ODbL-1.0). Coverage 2022-01-01 to 2027-12-31 for all 16 German Bundesländer. Only ``country_code=\"DE\"`` is supported. Defaults to ``False``. | `False` |\n| poly_features_degree | [int](`int`) | Polynomial-interaction degree. ``1`` (default) generates no interactions; ``2`` adds pairwise bilinear terms; ``3+`` higher order. | `1` |\n| max_poly_features | [int](`int`) | Cap on polynomial interaction columns; only the top ``max_poly_features`` ranked by mutual information with the target are kept (``<= 0`` disables). Defaults to ``10``. 
| `10` |\n| poly_mi_n_jobs | [Optional](`typing.Optional`)\\[[int](`int`)\\] | Parallel jobs for the mutual-information ranking that enforces ``max_poly_features``. ``-1`` (default) uses all cores; ``None`` runs single-threaded. Parallelism does not change the selection. | `-1` |\n| poly_mi_sample_size | [Optional](`typing.Optional`)\\[[int](`int`)\\] | Row cap for that ranking; longer series are scored on a reproducible random subsample of this size (seeded by ``random_state``), which can change which borderline columns make the top K. ``None`` scores every row (the pre-15.8 behaviour). Defaults to ``4000``. | `4000` |\n| index_name | [str](`str`) | Name assigned to the datetime column when the index is reset. Defaults to ``\"DateTime\"``. | `'DateTime'` |\n| bounds | [Optional](`typing.Optional`)\\[[List](`typing.List`)\\[[tuple](`tuple`)\\]\\] | Per-column outlier bounds as a list of ``(lower, upper)`` tuples, one entry per target column. ``None`` until set. | `None` |\n| verbose | [bool](`bool`) | If ``True``, enable verbose output for pipeline steps. Defaults to ``False``. | `False` |\n| cache_home | [Optional](`typing.Optional`)\\[[Any](`typing.Any`)\\] | Path to the cache directory. ``None`` means the library default (``~/spotforecast2_cache/``) is used. | `None` |\n| n_trials_optuna | [int](`int`) | Number of Optuna Bayesian-search trials for hyperparameter optimization (task 3). Defaults to ``15``. | `15` |\n| n_trials_spotoptim | [int](`int`) | Number of SpotOptim surrogate-search trials (task 4). Defaults to ``10``. | `10` |\n| n_initial_spotoptim | [int](`int`) | Number of initial random evaluations for SpotOptim (task 4). Defaults to ``5``. | `5` |\n| max_time_spotoptim | [Optional](`typing.Optional`)\\[[float](`float`)\\] | Wall-clock budget for the SpotOptim search in minutes (task 4). The search stops when either ``n_trials_spotoptim`` evaluations or this time limit is reached, whichever comes first. ``None`` (the default) disables the limit. 
| `None` |\n| warm_start_lags | [Optional](`typing.Optional`)\\[[List](`typing.List`)\\[[int](`int`)\\]\\] | Lag set the SpotOptim task injects as a search-space candidate and uses to seed the optimizer's first evaluation. Defaults to ``DEFAULT_WARM_START_LAGS`` (``[1, 2, 3, 23, 24, 25, 47, 48, 167, 168, 169, 336]``). ``None`` or an empty list disables the warm start. | `(lambda: list(DEFAULT_WARM_START_LAGS))()` |\n| task | [str](`str`) | Active prediction task — one of ``\"lazy\"``, ``\"training\"``, ``\"optuna\"``, or ``\"spotoptim\"``. Defaults to ``\"lazy\"``. | `'lazy'` |\n| agg_weights | [Optional](`typing.Optional`)\\[[List](`typing.List`)\\[[float](`float`)\\]\\] | Per-target aggregation weights used when combining individual target forecasts into a single weighted sum. The list must contain one weight per entry in ``targets`` (in the same order). Positive values add the target's contribution; negative values invert it. Slice the list to ``agg_weights[:len(targets)]`` when only a subset of targets is active. Defaults to ``None`` (no weights pre-defined; set after loading the dataset). | `None` |\n| auto_save_models | [bool](`bool`) | Whether ``BaseTask._run_strategy`` should persist fitted forecasters to ``/models/`` after every training run. Defaults to ``True`` so that saved models are immediately available for ``PredictTask`` without an explicit ``save_models()`` call. | `True` |\n| data_frame_name | [str](`str`) | Identifier for the active dataset. Used by ``BaseTask`` to name cache subdirectories, model files, and the per-dataset log file. Defaults to ``\"default\"``. | `'default'` |\n| on_weather_failure | [Literal](`typing.Literal`)\\[\\'raise\\', \\'skip\\'\\] | Policy for handling Open-Meteo fetch failures inside ``BaseTask.build_exogenous_features``. ``\"raise\"`` (default) aborts the pipeline with a ``WeatherFetchError`` and preserves the safety-critical fail-safe semantics. 
``\"skip\"`` logs a warning and continues with empty weather features so the rest of the pipeline can run without the Open-Meteo dependency. | `'raise'` |\n| exog_max_gap_hours | [int](`int`) | Maximum length, in hours, of a contiguous run of missing exogenous-provider values healed before the provider is rejected. Interior gaps are time-interpolated; leading/trailing edge gaps are back-/forward-filled. ``0`` (default) keeps the strict fail-safe (any gap raises). Healed runs are logged with count and span. Only already-published day-ahead vintages are involved, so healing is leakage-clean (CR-3). | `0` |\n| exog_max_tail_gap_hours | [int](`int`) | Extended healing budget, in hours, applied exclusively to the trailing-edge NaN run (the run containing the last index timestamp). The effective tail budget is ``max(exog_max_gap_hours, exog_max_tail_gap_hours)``. The canonical use case is the ENTSO-E day-ahead publication frontier: the last published vintage is zero-order-held forward to the forecast horizon without touching interior gaps (CR-3-clean). When ``exog_max_tail_gap_hours <= exog_max_gap_hours`` the parameter is inert (the interior budget already covers the tail) and a warning is logged. Defaults to ``0``. | `0` |\n| exog_provider_window | [Literal](`typing.Literal`)\\[\\'full\\', \\'train\\'\\] | Span the exogenous providers are validated against. ``\"full\"`` (default) requires coverage of the entire ``data_start``→``cov_end`` request, matching prior behaviour. ``\"train\"`` validates only the consumed window ``[start_train_ts, cov_end]``, tolerating missing values before the training window. Honoured by the MultiTask pipeline; the forecaster-wrapper path currently always validates the full span. 
| `'full'` |\n\n## Attributes {.doc-section .doc-section-attributes}\n\n| Name | Type | Description |\n|------------------------------------|----------------------------------------------------------------------------|------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|\n| country_code | [str](`str`) | ISO country code for API queries and holiday generation. |\n| periods | [List](`typing.List`)\\[[Period](`spotforecast2_safe.data.Period`)\\] | Cyclical feature encoding specifications. |\n| lags_consider | [List](`typing.List`)\\[[int](`int`)\\] | Lag values for autoregressive features. |\n| train_size | [pd](`pandas`).[Timedelta](`pandas.Timedelta`) | Training data window. |\n| end_train_default | [str](`str`) | Default training end date. |\n| delta_val | [pd](`pandas`).[Timedelta](`pandas.Timedelta`) | Validation window. |\n| predict_size | [int](`int`) | Prediction horizon in hours. |\n| refit_size | [int](`int`) | Refit interval in days. |\n| random_state | [int](`int`) | Random seed. |\n| n_hyperparameters_trials | [int](`int`) | Hyperparameter tuning trials. |\n| targets | [Optional](`typing.Optional`)\\[[List](`typing.List`)\\[[str](`str`)\\]\\] | Active target column names. ``None`` until explicitly set from the loaded dataset. |\n| use_outlier_detection | [bool](`bool`) | IsolationForest outlier removal toggle. |\n| contamination | [float](`float`) | IsolationForest contamination fraction. |\n| imputation_method | [str](`str`) | Gap-filling strategy (``\"weighted\"`` or ``\"linear\"``). |\n| window_size | [int](`int`) | Rolling window size for weighted imputation. |\n| use_exogenous_features | [bool](`bool`) | Exogenous feature construction toggle. |\n| latitude | [float](`float`) | Location latitude. |\n| longitude | [float](`float`) | Location longitude. |\n| timezone | [str](`str`) | IANA timezone string. 
|\n| state | [str](`str`) | Subdivision code for regional holidays. |\n| include_weather_windows | [bool](`bool`) | Weather-window feature toggle. |\n| include_holiday_features | [bool](`bool`) | Holiday feature toggle. |\n| include_holiday_adjacency_features | [bool](`bool`) | Brückentag and before/after-holiday indicator toggle. Defaults to ``False``. |\n| include_ephemeris_features | [bool](`bool`) | Solar-elevation and daylight-duration feature toggle. Defaults to ``False``. |\n| include_day_type_features | [bool](`bool`) | Working-day / day-type class feature toggle. Defaults to ``False``. |\n| include_school_holiday_features | [bool](`bool`) | Per-Bundesland school-holiday indicator toggle. Defaults to ``False``. |\n| poly_features_degree | [int](`int`) | Polynomial-interaction degree (1 = off). |\n| max_poly_features | [int](`int`) | Cap on kept ``poly_*`` columns (top-K by MI). |\n| poly_mi_n_jobs | [Optional](`typing.Optional`)\\[[int](`int`)\\] | Parallel jobs for the MI ranking (``-1`` = all cores; selection-invariant). |\n| poly_mi_sample_size | [Optional](`typing.Optional`)\\[[int](`int`)\\] | Row cap for the MI ranking (``None`` = score every row). |\n| include_covid_infection_rate | [bool](`bool`) | Append the bundled RKI German national COVID-19 7-day incidence as an exogenous regressor. |\n| include_entsoe_forecast_load | [bool](`bool`) | Append the ENTSO-E day-ahead Forecasted Load as a near-oracle exogenous prior. |\n| include_entsoe_renewable_forecast | [bool](`bool`) | Append the ENTSO-E day-ahead wind/solar generation forecast. |\n| include_entsoe_net_load | [bool](`bool`) | Append the ENTSO-E day-ahead net load (Forecasted Load minus wind/solar forecast). |\n| include_entsoe_day_ahead_price | [bool](`bool`) | Append the ENTSO-E day-ahead spot price (DE/LU). |\n| index_name | [str](`str`) | Datetime column name used when resetting the index. 
|\n| bounds | [Optional](`typing.Optional`)\\[[List](`typing.List`)\\[[tuple](`tuple`)\\]\\] | Per-column outlier bounds ``(lower, upper)``. |\n| verbose | [bool](`bool`) | Verbose output toggle. |\n| cache_home | [Optional](`typing.Optional`)\\[[Any](`typing.Any`)\\] | Path to the cache directory. |\n| n_trials_optuna | [int](`int`) | Number of Optuna hyperparameter-search trials. |\n| n_trials_spotoptim | [int](`int`) | Number of SpotOptim search trials. |\n| n_initial_spotoptim | [int](`int`) | Number of initial SpotOptim evaluations. |\n| max_time_spotoptim | [Optional](`typing.Optional`)\\[[float](`float`)\\] | Wall-clock budget for the SpotOptim search in minutes; ``None`` disables the limit. |\n| warm_start_lags | [Optional](`typing.Optional`)\\[[List](`typing.List`)\\[[int](`int`)\\]\\] | Seed lag set for the SpotOptim search; ``None`` or empty disables the warm start. |\n| task | [str](`str`) | Active prediction task (``\"lazy\"``, ``\"training\"``, ``\"optuna\"``, or ``\"spotoptim\"``). |\n| agg_weights | [Optional](`typing.Optional`)\\[[List](`typing.List`)\\[[float](`float`)\\]\\] | Per-target aggregation weights. One weight per entry in ``targets``; positive values add, negative values invert the target's contribution. ``None`` until set. |\n| auto_save_models | [bool](`bool`) | Whether to auto-persist fitted forecasters after each training run. |\n| data_frame_name | [str](`str`) | Active-dataset identifier used for cache and log-file naming. |\n| number_folds | [int](`int`) | Cross-validation fold count for tuning tasks. |\n| on_weather_failure | [Literal](`typing.Literal`)\\[\\'raise\\', \\'skip\\'\\] | Open-Meteo fetch-failure policy: ``\"raise\"`` aborts, ``\"skip\"`` continues without weather. 
|\n| on_exog_provider_failure | [Literal](`typing.Literal`)\\[\\'raise\\', \\'skip\\'\\] | Exog-provider failure policy in ``ExogBuilder.build``: ``\"raise\"`` (default) propagates the ``ExogProviderError``; ``\"skip\"`` logs and omits the failing provider's columns. |\n| exog_max_gap_hours | [int](`int`) | Maximum contiguous gap in hours that providers will heal before raising (0 = strict fail-safe). |\n| exog_provider_window | [Literal](`typing.Literal`)\\[\\'full\\', \\'train\\'\\] | Validation window for exog providers: ``\"full\"`` (default) or ``\"train\"``. |\n\n## Notes {.doc-section .doc-section-notes}\n\nThe default period configurations use specific `n_periods` to balance resolution and smoothing:\n- **Daily**: `n_periods=12` (24h) provides ~2h resolution, smoothing hourly noise and halving dimensionality.\n- **Weekly**: `n_periods` typically matches range (1:1) to distinguish day-of-week patterns.\n- **Yearly**: `n_periods=12` (365d) provides ~1 month resolution, capturing broad seasonal trends without overfitting.\n\nSee `docs/PERIOD_CONFIGURATION_RATIONALE.md` for a detailed analysis.\n\n## Examples {.doc-section .doc-section-examples}\n\n\n::: {#6a30c976 .cell execution_count=1}\n``` {.python .cell-code}\nimport pandas as pd\nfrom spotforecast2_safe.configurator.config_multi import ConfigMulti\nconfig = ConfigMulti()\nprint(f\"country_code: {config.country_code}\")\nprint(f\"Predict size: {config.predict_size}\")\nprint(f\"Random state: {config.random_state}\")\nprint(f\"Targets (default): {config.targets}\")\nprint(f\"agg_weights (default): {config.agg_weights}\")\nprint(f\"index_name: {config.index_name}\")\nprint(f\"bounds: {config.bounds}\")\n\n# Set targets and bounds (user input that stays on the config)\nconfig.targets = [\"A\", \"B\", \"C\"]\nconfig.bounds = [(-2500, 4500), (-10, 3000)]\nprint(f\"Targets (after setting): {config.targets}\")\nprint(f\"bounds: {config.bounds}\")\n\n# Create custom configuration — country_code serves both API 
and holiday purposes\ncustom_config = ConfigMulti(\n country_code='FR',\n predict_size=48,\n random_state=42,\n targets=[\"A\", \"B\"],\n index_name=\"DateTime\",\n)\nprint(f\"country_code: {custom_config.country_code}\")\nprint(f\"Predict size: {custom_config.predict_size}\")\nprint(f\"Random state: {custom_config.random_state}\")\nprint(f\"Targets: {custom_config.targets}\")\n\n# Verify training window\nprint(f\"Training window: {config.train_size == pd.Timedelta(days=3 * 365)}\")\n\n# Check default periods\nprint(f\"Number of periods: {len(config.periods)}\")\nprint(f\"First period name: {config.periods[0].name}\")\n```\n\n::: {.cell-output .cell-output-stdout}\n```\ncountry_code: DE\nPredict size: 24\nRandom state: 314159\nTargets (default): None\nagg_weights (default): None\nindex_name: DateTime\nbounds: None\nTargets (after setting): ['A', 'B', 'C']\nbounds: [(-2500, 4500), (-10, 3000)]\ncountry_code: FR\nPredict size: 48\nRandom state: 42\nTargets: ['A', 'B']\nTraining window: True\nNumber of periods: 5\nFirst period name: daily\n```\n:::\n:::\n\n\n## Methods\n\n| Name | Description |\n| --- | --- |\n| [get_params](#spotforecast2_safe.configurator.config_multi.ConfigMulti.get_params) | Get parameters for this configuration object. |\n| [set_params](#spotforecast2_safe.configurator.config_multi.ConfigMulti.set_params) | Set the parameters of this configuration object. 
|\n\n### get_params { #spotforecast2_safe.configurator.config_multi.ConfigMulti.get_params }\n\n```python\nconfigurator.config_multi.ConfigMulti.get_params(deep=True)\n```\n\nGet parameters for this configuration object.\n\n#### Parameters {.doc-section .doc-section-parameters}\n\n| Name | Type | Description | Default |\n|--------|----------------|-----------------------------------------------------------------------------------------------------------|-----------|\n| deep | [bool](`bool`) | If True, will return the parameters for this configuration and contained sub-objects that are estimators. | `True` |\n\n#### Returns {.doc-section .doc-section-returns}\n\n| Name | Type | Description |\n|--------|-----------------------------------------------------------|-------------------------------------------------------|\n| params | [Dict](`typing.Dict`)\\[[str](`str`), [object](`object`)\\] | Dictionary of parameter names mapped to their values. |\n\n#### Examples {.doc-section .doc-section-examples}\n\n::: {#2135a404 .cell execution_count=2}\n``` {.python .cell-code}\nfrom spotforecast2_safe.configurator.config_multi import ConfigMulti\nconfig = ConfigMulti(country_code=\"FR\")\np = config.get_params()\nprint(f\"country_code: {p['country_code']}\")\nprint(f\"Predict size: {p['predict_size']}\")\nprint(f\"Random state: {p['random_state']}\")\nprint(f\"index_name: {p['index_name']}\")\nprint(f\"bounds: {p['bounds']}\")\nprint(f\"agg_weights: {p['agg_weights']}\")\n```\n\n::: {.cell-output .cell-output-stdout}\n```\ncountry_code: FR\nPredict size: 24\nRandom state: 314159\nindex_name: DateTime\nbounds: None\nagg_weights: None\n```\n:::\n:::\n\n\n### set_params { #spotforecast2_safe.configurator.config_multi.ConfigMulti.set_params }\n\n```python\nconfigurator.config_multi.ConfigMulti.set_params(params=None, **kwargs)\n```\n\nSet the parameters of this configuration object.\n\n#### Parameters {.doc-section .doc-section-parameters}\n\n| Name | Type | Description | Default 
|\n|----------|-----------------------------------------------------------|-------------------------------------------------------------------------------------------------------------------------------------------------------|-----------|\n| params | [Dict](`typing.Dict`)\\[[str](`str`), [object](`object`)\\] | Optional dictionary of parameter names mapped to their new values. | `None` |\n| **kwargs | [object](`object`) | Additional parameter names mapped to their new values. It supports configuring nested 'Period' objects using the `periods__<name>__<param>` notation (for example, `periods__daily__n_periods=24`). | `{}` |\n\n#### Returns {.doc-section .doc-section-returns}\n\n| Name | Type | Description |\n|-------------|---------------------------------------------------------------------------|--------------------------------------------------------------------------------|\n| ConfigMulti | [ConfigMulti](`spotforecast2_safe.configurator.config_multi.ConfigMulti`) | The configuration instance with updated parameters (supports method chaining). 
|\n\n#### Examples {.doc-section .doc-section-examples}\n\n::: {#1ef65d7b .cell execution_count=3}\n``` {.python .cell-code}\nfrom spotforecast2_safe.configurator.config_multi import ConfigMulti\nconfig = ConfigMulti()\n_ = config.set_params(country_code=\"FR\", predict_size=48)\nprint(f\"country_code: {config.country_code}\")\nprint(f\"Predict size: {config.predict_size}\")\nprint(f\"Random state: {config.random_state}\")\n\n# Deep parameter setting\n_ = config.set_params(periods__daily__n_periods=24)\nprint(next(p.n_periods for p in config.periods if p.name == \"daily\"))\n```\n\n::: {.cell-output .cell-output-stdout}\n```\ncountry_code: FR\nPredict size: 48\nRandom state: 314159\n24\n```\n:::\n:::\n\n\n", "supporting": [ "configurator.config_multi.ConfigMulti_files/figure-html" ], diff --git a/_freeze/docs/reference/manager.features.select_exogenous_features/execute-results/html.json b/_freeze/docs/reference/manager.features.select_exogenous_features/execute-results/html.json index d4cc0d8ac..f8cb1f3ef 100644 --- a/_freeze/docs/reference/manager.features.select_exogenous_features/execute-results/html.json +++ b/_freeze/docs/reference/manager.features.select_exogenous_features/execute-results/html.json @@ -1,8 +1,8 @@ { - "hash": "73a295c7d4d403b91029e1a499f0964d", + "hash": "9542cdfe51005cf8d252e2b032e0e0e8", "result": { "engine": "jupyter", - "markdown": "---\ntitle: manager.features.select_exogenous_features\n---\n\n\n\n```python\nmanager.features.select_exogenous_features(\n exogenous_features,\n weather_aligned,\n cyclical_regex='_sin$|_cos$',\n include_weather_windows=False,\n include_holiday_features=False,\n include_holiday_adjacency_features=False,\n poly_features_degree=1,\n)\n```\n\nSelect and deduplicate exogenous feature columns for model training.\n\nBuilds a prioritised, deduplicated list of column names from\n*exogenous_features* suitable for passing as ``exog`` to a recursive\nforecaster. The selection order is:\n\n1. 
Cyclical sine/cosine columns (always included).\n2. Weather rolling-window columns (optional, ``include_weather_windows``).\n3. Raw weather columns shared with *weather_aligned*.\n4. Holiday-related columns: ``is_holiday`` plus any column starting\n with ``\"holiday\"`` (optional, ``include_holiday_features``).\n5. Holiday-adjacency columns: ``is_brueckentag``, ``is_before_holiday``,\n ``is_after_holiday`` (optional, ``include_holiday_adjacency_features``).\n6. Polynomial interaction columns starting with ``\"poly_\"`` (included\n when ``poly_features_degree >= 2``).\n\nDuplicates are removed while preserving insertion order.\n\n## Parameters {.doc-section .doc-section-parameters}\n\n| Name | Type | Description | Default |\n|------------------------------------|------------------------------------------------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|-----------------|\n| exogenous_features | [pd](`pandas`).[DataFrame](`pandas.DataFrame`) | DataFrame containing the full set of candidate feature columns. | _required_ |\n| weather_aligned | [pd](`pandas`).[DataFrame](`pandas.DataFrame`) | DataFrame whose column names identify the raw ( non-window, non-polynomial) weather variables. | _required_ |\n| cyclical_regex | [str](`str`) | Regular expression matched against column names to detect cyclical sine/cosine features. Defaults to ``\"_sin$\\|_cos$\"``. | `'_sin$|_cos$'` |\n| include_weather_windows | [bool](`bool`) | If ``True``, include rolling-window weather columns (those containing ``\"_window_\"`` plus ``\"_mean\"``, ``\"_min\"``, or ``\"_max\"``). Defaults to ``False``. | `False` |\n| include_holiday_features | [bool](`bool`) | If ``True``, include the ``is_holiday`` column and any column whose name starts with ``\"holiday\"``. Defaults to ``False``. 
| `False` |\n| include_holiday_adjacency_features | [bool](`bool`) | If ``True``, include the three adjacency columns ``is_brueckentag``, ``is_before_holiday``, and ``is_after_holiday`` when present in *exogenous_features*. Defaults to ``False``. | `False` |\n| poly_features_degree | [int](`int`) | Polynomial-interaction degree. Interaction columns (names starting with ``\"poly_\"``) are included only when this is ``>= 2``; at ``1`` no interactions exist. Defaults to ``1``. | `1` |\n\n## Returns {.doc-section .doc-section-returns}\n\n| Name | Type | Description |\n|--------|---------------------------------------|-------------------------------------------------------------------|\n| | [List](`typing.List`)\\[[str](`str`)\\] | List[str]: Deduplicated list of selected column names in priority |\n| | [List](`typing.List`)\\[[str](`str`)\\] | order. |\n\n## Examples {.doc-section .doc-section-examples}\n\nSelect cyclical and raw weather columns from a feature matrix:\n\n\n::: {#a170d5c1 .cell execution_count=1}\n``` {.python .cell-code}\nimport numpy as np\nimport pandas as pd\nfrom spotforecast2_safe.manager.features import select_exogenous_features\n\nrng = np.random.default_rng(1)\nidx = pd.date_range(\"2024-01-01\", periods=24, freq=\"h\", tz=\"UTC\")\n\nweather = pd.DataFrame({\"wind_speed\": rng.uniform(0, 10, 24)}, index=idx)\nexog = pd.DataFrame(\n {\n \"hour_sin\": np.sin(2 * np.pi * idx.hour / 24),\n \"hour_cos\": np.cos(2 * np.pi * idx.hour / 24),\n \"wind_speed\": weather[\"wind_speed\"],\n \"holiday_flag\": 0,\n },\n index=idx,\n)\n\nselected = select_exogenous_features(\n exogenous_features=exog,\n weather_aligned=weather,\n include_holiday_features=False,\n)\nprint(\"selected:\", selected)\n```\n\n::: {.cell-output .cell-output-stdout}\n```\nselected: ['hour_sin', 'hour_cos', 'wind_speed']\n```\n:::\n:::\n\n\n", + "markdown": "---\ntitle: manager.features.select_exogenous_features\n---\n\n\n\n```python\nmanager.features.select_exogenous_features(\n 
exogenous_features,\n weather_aligned,\n cyclical_regex='_sin$|_cos$',\n include_weather_windows=False,\n include_holiday_features=False,\n include_holiday_adjacency_features=False,\n include_school_holiday_features=False,\n poly_features_degree=1,\n)\n```\n\nSelect and deduplicate exogenous feature columns for model training.\n\nBuilds a prioritised, deduplicated list of column names from\n*exogenous_features* suitable for passing as ``exog`` to a recursive\nforecaster. The selection order is:\n\n1. Cyclical sine/cosine columns (always included).\n2. Weather rolling-window columns (optional, ``include_weather_windows``).\n3. Raw weather columns shared with *weather_aligned*.\n4. Holiday-related columns: ``is_holiday`` plus any column starting\n with ``\"holiday\"`` (optional, ``include_holiday_features``).\n5. Holiday-adjacency columns: ``is_brueckentag``, ``is_before_holiday``,\n ``is_after_holiday`` (optional, ``include_holiday_adjacency_features``).\n6. School-holiday column: ``is_school_holiday`` (optional,\n ``include_school_holiday_features``).\n7. Polynomial interaction columns starting with ``\"poly_\"`` (included\n when ``poly_features_degree >= 2``).\n\nDuplicates are removed while preserving insertion order.\n\n## Parameters {.doc-section .doc-section-parameters}\n\n| Name | Type | Description | Default |\n|------------------------------------|------------------------------------------------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|-----------------|\n| exogenous_features | [pd](`pandas`).[DataFrame](`pandas.DataFrame`) | DataFrame containing the full set of candidate feature columns. | _required_ |\n| weather_aligned | [pd](`pandas`).[DataFrame](`pandas.DataFrame`) | DataFrame whose column names identify the raw ( non-window, non-polynomial) weather variables. 
| _required_ |\n| cyclical_regex | [str](`str`) | Regular expression matched against column names to detect cyclical sine/cosine features. Defaults to ``\"_sin$\\|_cos$\"``. | `'_sin$|_cos$'` |\n| include_weather_windows | [bool](`bool`) | If ``True``, include rolling-window weather columns (those containing ``\"_window_\"`` plus ``\"_mean\"``, ``\"_min\"``, or ``\"_max\"``). Defaults to ``False``. | `False` |\n| include_holiday_features | [bool](`bool`) | If ``True``, include the ``is_holiday`` column and any column whose name starts with ``\"holiday\"``. Defaults to ``False``. | `False` |\n| include_holiday_adjacency_features | [bool](`bool`) | If ``True``, include the three adjacency columns ``is_brueckentag``, ``is_before_holiday``, and ``is_after_holiday`` when present in *exogenous_features*. Defaults to ``False``. | `False` |\n| include_school_holiday_features | [bool](`bool`) | If ``True``, include the ``is_school_holiday`` column when present in *exogenous_features*. Defaults to ``False``. | `False` |\n| poly_features_degree | [int](`int`) | Polynomial-interaction degree. Interaction columns (names starting with ``\"poly_\"``) are included only when this is ``>= 2``; at ``1`` no interactions exist. Defaults to ``1``. | `1` |\n\n## Returns {.doc-section .doc-section-returns}\n\n| Name | Type | Description |\n|--------|---------------------------------------|-------------------------------------------------------------------|\n| | [List](`typing.List`)\\[[str](`str`)\\] | List[str]: Deduplicated list of selected column names in priority |\n| | [List](`typing.List`)\\[[str](`str`)\\] | order. 
|\n\n## Examples {.doc-section .doc-section-examples}\n\nSelect cyclical and raw weather columns from a feature matrix:\n\n\n::: {#1b3ed187 .cell execution_count=1}\n``` {.python .cell-code}\nimport numpy as np\nimport pandas as pd\nfrom spotforecast2_safe.manager.features import select_exogenous_features\n\nrng = np.random.default_rng(1)\nidx = pd.date_range(\"2024-01-01\", periods=24, freq=\"h\", tz=\"UTC\")\n\nweather = pd.DataFrame({\"wind_speed\": rng.uniform(0, 10, 24)}, index=idx)\nexog = pd.DataFrame(\n {\n \"hour_sin\": np.sin(2 * np.pi * idx.hour / 24),\n \"hour_cos\": np.cos(2 * np.pi * idx.hour / 24),\n \"wind_speed\": weather[\"wind_speed\"],\n \"holiday_flag\": 0,\n },\n index=idx,\n)\n\nselected = select_exogenous_features(\n exogenous_features=exog,\n weather_aligned=weather,\n include_holiday_features=False,\n)\nprint(\"selected:\", selected)\n```\n\n::: {.cell-output .cell-output-stdout}\n```\nselected: ['hour_sin', 'hour_cos', 'wind_speed']\n```\n:::\n:::\n\n\n", "supporting": [ "manager.features.select_exogenous_features_files/figure-html" ], diff --git a/_freeze/docs/reference/weather.features.get_weather_features/execute-results/html.json b/_freeze/docs/reference/weather.features.get_weather_features/execute-results/html.json index 90f47377c..93de3092b 100644 --- a/_freeze/docs/reference/weather.features.get_weather_features/execute-results/html.json +++ b/_freeze/docs/reference/weather.features.get_weather_features/execute-results/html.json @@ -1,10 +1,10 @@ { - "hash": "3ff4e5fa9dff997c3b5bc04a124c39d1", + "hash": "2da9c725ad34e9cf80065ead07ac58d2", "result": { "engine": "jupyter", - "markdown": "---\ntitle: weather.features.get_weather_features\n---\n\n\n\n```python\nweather.features.get_weather_features(\n data,\n start,\n cov_end,\n forecast_horizon,\n latitude=51.5136,\n longitude=7.4653,\n timezone='UTC',\n freq='h',\n window_periods=None,\n window_functions=None,\n fallback_on_failure=True,\n cache_home=None,\n 
verbose=False,\n)\n```\n\nFetch weather data and compute rolling-window features.\n\nDownloads weather observations/forecasts for the requested period,\naligns them to a regular ``freq`` grid, and applies\n`WindowFeatures` to\nproduce rolling-mean, -max, and -min features over configurable\nwindows.\n\n## Parameters {.doc-section .doc-section-parameters}\n\n| Name | Type | Description | Default |\n|---------------------|--------------------------------------------------------------------------------------------------|----------------------------------------------------------------------------------------------------------------------------------------------------------------------------|------------|\n| data | [pd](`pandas`).[DataFrame](`pandas.DataFrame`) | Reference time series DataFrame used only for validation (shape / temporal coverage checks via `curate_weather()`). | _required_ |\n| start | [Union](`typing.Union`)\\[[str](`str`), [pd](`pandas`).[Timestamp](`pandas.Timestamp`)\\] | Start of the feature window. String values are parsed with ``utc=True``. | _required_ |\n| cov_end | [Union](`typing.Union`)\\[[str](`str`), [pd](`pandas`).[Timestamp](`pandas.Timestamp`)\\] | Inclusive end of the feature window (must cover the full forecast horizon beyond ``end``). String values are parsed with ``utc=True``. | _required_ |\n| forecast_horizon | [int](`int`) | Number of forecast steps; passed to `curate_weather()` for validation. | _required_ |\n| latitude | [float](`float`) | Latitude of the target location in decimal degrees. Defaults to ``51.5136`` (Dortmund, Germany). | `51.5136` |\n| longitude | [float](`float`) | Longitude of the target location in decimal degrees. Defaults to ``7.4653`` (Dortmund, Germany). | `7.4653` |\n| timezone | [str](`str`) | Timezone label applied to the generated index. Defaults to ``\"UTC\"``. | `'UTC'` |\n| freq | [str](`str`) | Pandas-compatible frequency string for the output index. Defaults to ``\"h\"`` (hourly). 
| `'h'` |\n| window_periods | [Optional](`typing.Optional`)\\[[List](`typing.List`)\\[[str](`str`)\\]\\] | Rolling window sizes passed to `WindowFeatures`. Defaults to ``[\"1D\", \"7D\"]``. | `None` |\n| window_functions | [Optional](`typing.Optional`)\\[[List](`typing.List`)\\[[str](`str`)\\]\\] | Aggregation functions applied over each window. Defaults to ``[\"mean\", \"max\", \"min\"]``. | `None` |\n| fallback_on_failure | [bool](`bool`) | If ``True``, use locally cached fallback data when the weather API is unavailable. Defaults to ``True``. | `True` |\n| cache_home | [Optional](`typing.Optional`)\\[[Union](`typing.Union`)\\[[str](`str`), [Path](`pathlib.Path`)\\]\\] | Optional path to cache directory. When provided, fetched weather data is cached in ``/weather_cache.parquet``. When None (default), no caching is performed. | `None` |\n| verbose | [bool](`bool`) | If ``True``, print progress messages to stdout. Defaults to ``False``. | `False` |\n\n## Returns {.doc-section .doc-section-returns}\n\n| Name | Type | Description |\n|--------|---------------------------------------------------------------------------------------------------------------------------|------------------------------------------------------------------------------------------------------------------------|\n| | [pd](`pandas`).[DataFrame](`pandas.DataFrame`) | tuple[pd.DataFrame, pd.DataFrame]: A two-element tuple: |\n| | [pd](`pandas`).[DataFrame](`pandas.DataFrame`) | - **weather_features** – DataFrame with rolling-window weather features aligned to the ``[start, cov_end]`` index. |\n| | [Tuple](`typing.Tuple`)\\[[pd](`pandas`).[DataFrame](`pandas.DataFrame`), [pd](`pandas`).[DataFrame](`pandas.DataFrame`)\\] | - **weather_aligned** – Raw weather DataFrame reindexed to the same ``[start, cov_end]`` hourly grid (forward-filled). 
|\n\n## Raises {.doc-section .doc-section-raises}\n\n| Name | Type | Description |\n|--------|----------------------------|------------------------------------------------------------------------------------------------|\n| | [ValueError](`ValueError`) | If no numeric weather columns are found, or if missing values cannot be filled after fetching. |\n\n## Examples {.doc-section .doc-section-examples}\n\n\n::: {#8fc918bd .cell execution_count=1}\n``` {.python .cell-code}\nimport tempfile\n\nimport pandas as pd\n\nfrom spotforecast2_safe.weather import get_weather_features\n\n# Build a minimal synthetic reference DataFrame whose row count is\n# consistent with the requested weather window so curate_weather\n# validation passes without warnings.\nforecast_horizon = 2\nstart = pd.Timestamp(\"2020-06-01\", tz=\"UTC\")\ncov_end = pd.Timestamp(\"2020-06-03\", tz=\"UTC\")\ndata_end = cov_end - pd.Timedelta(hours=forecast_horizon)\ndata_idx = pd.date_range(start=start, end=data_end, freq=\"h\", tz=\"UTC\")\ndata = pd.DataFrame({\"load\": range(len(data_idx))}, index=data_idx)\n\ncache_home = tempfile.mkdtemp()\nweather_features, weather_aligned = get_weather_features(\n data=data,\n start=start,\n cov_end=cov_end,\n forecast_horizon=forecast_horizon,\n cache_home=cache_home,\n verbose=False,\n)\nprint(\"weather_features shape:\", weather_features.shape)\nprint(\"weather_aligned shape:\", weather_aligned.shape)\nprint(\"weather_aligned columns (first 3):\", list(weather_aligned.columns)[:3])\nassert weather_features.shape[0] > 0\nassert weather_aligned.shape[0] > 0\nassert \"temperature_2m\" in weather_aligned.columns\n# Rolling-window transformer adds more columns than raw aligned data\nassert weather_features.shape[1] > weather_aligned.shape[1]\n```\n\n::: {.cell-output .cell-output-stdout}\n```\nweather_features shape: (49, 105)\nweather_aligned shape: (49, 15)\nweather_aligned columns (first 3): ['temperature_2m', 'relative_humidity_2m', 
'precipitation']\n```\n:::\n:::\n\n\n", + "markdown": "---\ntitle: weather.features.get_weather_features\n---\n\n\n\n```python\nweather.features.get_weather_features(\n data,\n start,\n cov_end,\n forecast_horizon,\n latitude=51.5136,\n longitude=7.4653,\n timezone='UTC',\n freq='h',\n window_periods=None,\n window_functions=None,\n fallback_on_failure=True,\n cache_home=None,\n verbose=False,\n locations=None,\n location_weights=None,\n derived_features=None,\n hdh_base=DEFAULT_HDH_BASE_C,\n cdh_base=DEFAULT_CDH_BASE_C,\n wind_speed_unit='kmh',\n)\n```\n\nFetch weather data and compute rolling-window features.\n\nDownloads weather observations/forecasts for the requested period,\naligns them to a regular ``freq`` grid, and applies\n`WindowFeatures` to\nproduce rolling-mean, -max, and -min features over configurable\nwindows.\n\n## Parameters {.doc-section .doc-section-parameters}\n\n| Name | Type | Description | Default |\n|---------------------|---------------------------------------------------------------------------------------------------------------------------------|------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|----------------------|\n| data | [pd](`pandas`).[DataFrame](`pandas.DataFrame`) | Reference time series DataFrame used only for validation (shape / temporal coverage checks via `curate_weather()`). | _required_ |\n| start | [Union](`typing.Union`)\\[[str](`str`), [pd](`pandas`).[Timestamp](`pandas.Timestamp`)\\] | Start of the feature window. String values are parsed with ``utc=True``. 
| _required_ |\n| cov_end | [Union](`typing.Union`)\\[[str](`str`), [pd](`pandas`).[Timestamp](`pandas.Timestamp`)\\] | Inclusive end of the feature window (must cover the full forecast horizon beyond ``end``). String values are parsed with ``utc=True``. | _required_ |\n| forecast_horizon | [int](`int`) | Number of forecast steps; passed to `curate_weather()` for validation. | _required_ |\n| latitude | [float](`float`) | Latitude of the target location in decimal degrees. Defaults to ``51.5136`` (Dortmund, Germany). | `51.5136` |\n| longitude | [float](`float`) | Longitude of the target location in decimal degrees. Defaults to ``7.4653`` (Dortmund, Germany). | `7.4653` |\n| timezone | [str](`str`) | Timezone label applied to the generated index. Defaults to ``\"UTC\"``. | `'UTC'` |\n| freq | [str](`str`) | Pandas-compatible frequency string for the output index. Defaults to ``\"h\"`` (hourly). | `'h'` |\n| window_periods | [Optional](`typing.Optional`)\\[[List](`typing.List`)\\[[str](`str`)\\]\\] | Rolling window sizes passed to `WindowFeatures`. Defaults to ``[\"1D\", \"7D\"]``. | `None` |\n| window_functions | [Optional](`typing.Optional`)\\[[List](`typing.List`)\\[[str](`str`)\\]\\] | Aggregation functions applied over each window. Defaults to ``[\"mean\", \"max\", \"min\"]``. | `None` |\n| fallback_on_failure | [bool](`bool`) | If ``True``, use locally cached fallback data when the weather API is unavailable. Defaults to ``True``. | `True` |\n| cache_home | [Optional](`typing.Optional`)\\[[Union](`typing.Union`)\\[[str](`str`), [Path](`pathlib.Path`)\\]\\] | Optional path to cache directory. When provided, fetched weather data is cached in ``<cache_home>/weather_cache.parquet``. When None (default), no caching is performed. | `None` |\n| verbose | [bool](`bool`) | If ``True``, print progress messages to stdout. Defaults to ``False``. 
| `False` |\n| locations | [Optional](`typing.Optional`)\\[[Sequence](`typing.Sequence`)\\[[Tuple](`typing.Tuple`)\\[[float](`float`), [float](`float`)\\]\\]\\] | Optional sequence of ``(latitude, longitude)`` pairs for a **population-weighted multi-city** weather index. When ``None`` (default) the single ``latitude``/``longitude`` point is used, preserving prior behaviour exactly. When given, each location is fetched and the raw frames are combined via `population_weighted_average` using *location_weights*. See `spotforecast2_safe.weather.locations`. | `None` |\n| location_weights | [Optional](`typing.Optional`)\\[[Sequence](`typing.Sequence`)\\[[float](`float`)\\]\\] | Non-negative weight per entry in *locations* (e.g. city population). Required when *locations* is given; normalised internally. | `None` |\n| derived_features | [Optional](`typing.Optional`)\\[[Sequence](`typing.Sequence`)\\[[str](`str`)\\]\\] | Optional subset of ``{\"hdh\", \"cdh\", \"apparent_temperature\", \"dew_point\"}``. When given, those columns are derived from the (weighted) weather and rolled up alongside the raw fields. ``None`` (default) adds nothing. See `add_derived_weather_features`. | `None` |\n| hdh_base | [float](`float`) | Heating base temperature (°C) for ``hdh``. Defaults to ``15.0``. | `DEFAULT_HDH_BASE_C` |\n| cdh_base | [float](`float`) | Cooling base temperature (°C) for ``cdh``. Defaults to ``22.0``. | `DEFAULT_CDH_BASE_C` |\n| wind_speed_unit | [str](`str`) | Unit of the fetched ``wind_speed_10m`` column for apparent-temperature, ``\"ms\"`` or ``\"kmh\"``. Defaults to ``\"kmh\"`` (the Open-Meteo default). 
| `'kmh'` |\n\n## Returns {.doc-section .doc-section-returns}\n\n| Name | Type | Description |\n|--------|---------------------------------------------------------------------------------------------------------------------------|------------------------------------------------------------------------------------------------------------------------|\n| | [pd](`pandas`).[DataFrame](`pandas.DataFrame`) | tuple[pd.DataFrame, pd.DataFrame]: A two-element tuple: |\n| | [pd](`pandas`).[DataFrame](`pandas.DataFrame`) | - **weather_features** – DataFrame with rolling-window weather features aligned to the ``[start, cov_end]`` index. |\n| | [Tuple](`typing.Tuple`)\\[[pd](`pandas`).[DataFrame](`pandas.DataFrame`), [pd](`pandas`).[DataFrame](`pandas.DataFrame`)\\] | - **weather_aligned** – Raw weather DataFrame reindexed to the same ``[start, cov_end]`` hourly grid (forward-filled). |\n\n## Raises {.doc-section .doc-section-raises}\n\n| Name | Type | Description |\n|--------|----------------------------|------------------------------------------------------------------------------------------------|\n| | [ValueError](`ValueError`) | If no numeric weather columns are found, or if missing values cannot be filled after fetching. 
|\n\n## Examples {.doc-section .doc-section-examples}\n\n\n::: {#a4d01aaf .cell execution_count=1}\n``` {.python .cell-code}\nimport tempfile\n\nimport pandas as pd\n\nfrom spotforecast2_safe.weather import get_weather_features\n\n# Build a minimal synthetic reference DataFrame whose row count is\n# consistent with the requested weather window so curate_weather\n# validation passes without warnings.\nforecast_horizon = 2\nstart = pd.Timestamp(\"2020-06-01\", tz=\"UTC\")\ncov_end = pd.Timestamp(\"2020-06-03\", tz=\"UTC\")\ndata_end = cov_end - pd.Timedelta(hours=forecast_horizon)\ndata_idx = pd.date_range(start=start, end=data_end, freq=\"h\", tz=\"UTC\")\ndata = pd.DataFrame({\"load\": range(len(data_idx))}, index=data_idx)\n\ncache_home = tempfile.mkdtemp()\nweather_features, weather_aligned = get_weather_features(\n data=data,\n start=start,\n cov_end=cov_end,\n forecast_horizon=forecast_horizon,\n cache_home=cache_home,\n verbose=False,\n)\nprint(\"weather_features shape:\", weather_features.shape)\nprint(\"weather_aligned shape:\", weather_aligned.shape)\nprint(\"weather_aligned columns (first 3):\", list(weather_aligned.columns)[:3])\nassert weather_features.shape[0] > 0\nassert weather_aligned.shape[0] > 0\nassert \"temperature_2m\" in weather_aligned.columns\n# Rolling-window transformer adds more columns than raw aligned data\nassert weather_features.shape[1] > weather_aligned.shape[1]\n```\n\n::: {.cell-output .cell-output-stdout}\n```\nweather_features shape: (49, 105)\nweather_aligned shape: (49, 15)\nweather_aligned columns (first 3): ['temperature_2m', 'relative_humidity_2m', 'precipitation']\n```\n:::\n:::\n\n\n", "supporting": [ - "weather.features.get_weather_features_files" + "weather.features.get_weather_features_files/figure-html" ], "filters": [], "includes": {} diff --git a/_freeze/docs/tutorials/n2n_predict_with_covariates_explained/execute-results/html.json 
b/_freeze/docs/tutorials/n2n_predict_with_covariates_explained/execute-results/html.json index 4377f8fe3..57d9870c3 100644 --- a/_freeze/docs/tutorials/n2n_predict_with_covariates_explained/execute-results/html.json +++ b/_freeze/docs/tutorials/n2n_predict_with_covariates_explained/execute-results/html.json @@ -1,8 +1,8 @@ { - "hash": "c96762a73c7dd13f8eabccf50558f5ef", + "hash": "f20d6c3eec04bd53bdfaa53873eea2d9", "result": { "engine": "jupyter", - "markdown": "---\ntitle: \"n2n_predict_with_covariates: A Beginner's Walkthrough\"\ndescription: \"A step-by-step explanation of the N-to-N covariate forecasting pipeline for readers new to time-series analysis.\"\n---\n\n## What this pipeline does, in one paragraph\n\nThe function `n2n_predict_with_covariates` takes a table of historical\nmeasurements (one or more numeric columns indexed by time), augments it with\nside information that is known both for the past and for the near future\n(calendar effects, daylight, weather forecasts, public holidays), and then\ntrains one machine-learning forecaster per target column. Each forecaster is\nasked to predict the next `forecast_horizon` time steps. The function returns\nthree things: a DataFrame of predictions, a dictionary of metadata that\nrecords *how* the run was configured, and the fitted forecaster objects\nthemselves (so a caller can re-use them later without retraining). The\nfunction is deliberately one long pipeline rather than many small calls,\nbecause every stage depends tightly on the previous one — and a single entry\npoint makes the whole pipeline easy to embed in a daily batch job.\n\n## Vocabulary {#sec-vocabulary}\n\nThe walkthrough below uses a handful of technical terms. Their definitions\nare kept brief here so that the rest of the page can stay focused on the\nmechanics of the pipeline. 
Each definition is cross-referenceable — later\nsections link back with `@def-…` where it helps the reader.\n\n::: {#def-time-series}\n\n## Time series\n\nA sequence of numeric measurements indexed by timestamps at a regular\ninterval (here: one row per hour).\n\n:::\n\n::: {#def-target-variable}\n\n## Target variable\n\nThe quantity we want to predict — in this pipeline, any column of the\ninput DataFrame.\n\n:::\n\n::: {#def-exogenous-variable}\n\n## Exogenous variable (covariate)\n\nAny other feature known at the same timestamps as the target *and*\navailable for the future window — for example, the hour of the day or a\nweather forecast.\n\n:::\n\n::: {#def-lag}\n\n## Lag\n\nA past value of the same series, e.g. *lag 1* = \"the value one hour ago\",\n*lag 24* = \"the value at the same hour yesterday\".\n\n:::\n\n::: {#def-forecast-horizon}\n\n## Forecast horizon\n\nHow many time steps ahead the model has to predict (e.g.\n`forecast_horizon=24` means \"predict the next 24 hours\").\n\n:::\n\n::: {#def-recursive-forecaster}\n\n## Recursive multi-step forecaster\n\nA model that predicts one step ahead, then feeds its own prediction back\nin as the most recent lag, and repeats. 
We use it because we want long\nhorizons (24+ steps) without training a separate model per step.\n\n:::\n\n::: {#def-train-val-test}\n\n## Train / validation / test split\n\nThe historical data is sliced into three temporally ordered chunks:\ntraining (used to fit the model), validation (used to compare\nconfigurations), and test (held back as unseen data for the final check).\n\n:::\n\n::: {#def-outlier}\n\n## Outlier\n\nA value that is much larger or smaller than the rest of the series —\noften a sensor glitch or a one-off event we do not want the model to\nlearn from.\n\n:::\n\n::: {#def-imputation}\n\n## Imputation\n\nFilling in missing entries so the model sees a complete series.\n\n:::\n\n::: {#def-sample-weight}\n\n## Sample weight\n\nA non-negative number attached to each training row that tells the model\nhow much that row should contribute to the loss. Weight `0` effectively\nignores the row.\n\n:::\n\n::: {#def-cyclical-encoding}\n\n## Cyclical encoding\n\nReplacing a periodic integer like \"hour of day\" by its sine and cosine on\na unit circle, so the model sees that hour 23 and hour 0 are neighbors.\n\n:::\n\n::: {#def-persistence}\n\n## Persistence (serialization)\n\nSaving a trained model to disk so the next run can load it instead of\nretraining.\n\n:::\n\n## Input validation\n\nBefore any computation begins, `n2n_predict_with_covariates` checks that the\nfive numerical parameters make sense:\n\n| Parameter | Allowed range | Error if violated |\n| ------------------- | --------------------- | ------------------------------------------------------------------- |\n| `forecast_horizon` | `> 0` | `ValueError: forecast_horizon must be positive, got …` |\n| `contamination` | `0 ≤ x ≤ 0.5` | `ValueError: contamination must be between 0 and 0.5, got …` |\n| `window_size` | `> 0` | `ValueError: window_size must be positive, got …` |\n| `lags` | `> 0` | `ValueError: lags must be positive, got …` |\n| `train_ratio` | `0 < x < 1` | `ValueError: train_ratio 
must be between 0 and 1, got …` |\n\nThe pipeline follows a *fail-fast* philosophy: rather than silently coercing\na bad value (and producing wrong predictions hours later), it stops at the\nfront door with an informative message. The most consequential check is the\none on `forecast_horizon`: a non-positive value would cause downstream\narray-slicing to return empty results, which the model would happily train\non without complaint.\n\n## Stage 1 — Loading and preparing the target series\n\n*Why this step exists.* A forecasting model can only learn from a clean,\nregularly spaced time series with a known time zone. The first stage's job\nis to produce exactly that, regardless of whether the caller passed a\nDataFrame or relied on the default demo data set.\n\nWhen the caller supplies a `data` argument, the series is forwarded to\n`fetch_data(dataframe=data, timezone=timezone)`. When no data is supplied,\n`fetch_data` reads the bundled demonstration file `demo10.csv` (located via\n`get_package_data_home()`). Either way, the result is a DataFrame with a\ntimezone-aware `DatetimeIndex` and numeric columns for each target. The\ncolumn names are immediately stored in `target_columns`, because they will\ndetermine the names of the saved model files and the columns of the\nprediction DataFrame.\n\nNext, `get_start_end(data, forecast_horizon)` returns four boundary\ntimestamps:\n\n- `start`, `end` — the historical window used for training,\n- `cov_start`, `cov_end` — the same window *extended* forward by\n `forecast_horizon` steps, so that covariates can be aligned with the\n prediction period.\n\n`basic_ts_checks(data)` then verifies that the index is a `DatetimeIndex`,\nstrictly monotonically increasing, and free of gaps. 
Finally,\n`agg_and_resample_data(data)` enforces an hourly grid: sub-hourly data is\naggregated by mean over each hour bin, and the result is a uniformly spaced\ntable that the lag-based forecaster can consume safely.\n\n::: {.callout-note}\n## Why a strict, gap-free hourly index matters\n\nA recursive forecaster reads lag 1, lag 2, lag 24, etc. by *positional*\nlookup. If the index skips an hour, \"lag 24\" no longer means \"24 hours\nago\" — it means \"24 rows ago\", which might be 25 wall-clock hours. The\nfail-fast checks here prevent that silent error.\n:::\n\n## Stage 2 — Outlier detection and removal\n\n*Why this step exists.* Real-world sensors occasionally produce values that\nare not signal but noise (a spike from a calibration error, a dip from a\nbrief power loss). Letting the model train on these outliers (see\n@def-outlier) contaminates its idea of \"normal\" behaviour. We replace\nthem with `NaN` so the next stage can treat them the same way it treats\ngenuine gaps.\n\n`mark_outliers(data, contamination=contamination, random_state=1234)`\napplies an [Isolation Forest](https://en.wikipedia.org/wiki/Isolation_forest):\nthe algorithm builds random partitioning trees, and points that are isolated\nin only a few splits are flagged as anomalous. The `contamination` parameter\nis the expected fraction of anomalies per column — `0.01` means \"about 1 %\nof rows should be marked\". The function returns a modified DataFrame\n(outlier positions set to `NaN`) and an outlier-label array (Isolation\nForest's convention: `-1` for outliers, `+1` for inliers). 
The label array\nis kept aside so the metadata dictionary at the end of the pipeline can\nreport the number of detected outliers.\n\nThe fixed `random_state=1234` ensures that two runs on identical input\nproduce identical outlier flags — a core requirement of the reproducible\nartefact design.\n\n::: {#exm-isolation-forest}\n\n## Isolation Forest on injected spikes\n\n::: {#68440c09 .cell execution_count=1}\n``` {.python .cell-code}\nimport numpy as np\nimport pandas as pd\nfrom sklearn.ensemble import IsolationForest\n\nrng = np.random.default_rng(0)\nn = 200\ny = pd.Series(rng.standard_normal(n))\ny.iloc[50] = 15.0 # inject a spike\ny.iloc[100] = -12.0 # inject a dip\n\niso = IsolationForest(contamination=0.01, random_state=1234)\nlabels = iso.fit_predict(y.to_numpy().reshape(-1, 1))\nflagged_positions = np.where(labels == -1)[0].tolist()\n\nprint(f\"Series length : {n}\")\nprint(f\"Injected positions : [50, 100]\")\nprint(f\"Flagged positions : {flagged_positions}\")\n```\n\n::: {.cell-output .cell-output-stdout}\n```\nSeries length : 200\nInjected positions : [50, 100]\nFlagged positions : [50, 100]\n```\n:::\n:::\n\n\n:::\n\n::: {.callout-note}\n## What is an outlier in a time series?\n\nTime-series outliers come in flavours: a one-off spike (instrumentation\nerror), a short burst of \"wrong\" values (a sensor reboot), or a sustained\nshift (a real but rare event like a heat wave). The Isolation Forest\ndefaults used here are tuned to catch the first kind. Treat the\n`contamination` parameter as a guess at how rare such spikes are — if the\ndata is famously clean, set it low; if it is famously messy, set it\nhigher.\n:::\n\n## Stage 3 — Imputation and sample weighting\n\n*Why this step exists.* After stage 2 the series has `NaN`s wherever it\noriginally had missing rows *or* where stage 2 just removed an outlier.\nLag-based models cannot accept `NaN` in the input. 
We fill the gaps via\nimputation (see @def-imputation), but we also remember which rows were\nimputed so the loss function can pay less attention to them by way of\nsample weights (see @def-sample-weight).\n\n`get_missing_weights(data, window_size=window_size)` does two things:\n\n1. It fills the gaps using both forward-fill and backward-fill so no\n `NaN` survives.\n2. It builds a weight series: a binary contamination mask (`1` where the\n value was imputed, `0` otherwise) is propagated forward by `window_size`\n positions with a rolling maximum. Any training row whose lag window\n *touches* an imputed value receives weight `0`. All other rows receive\n weight `1`. The downstream forecaster will then see the filled-in values\n but down-weight their gradient contribution to zero.\n\nThe resulting `weights_series` is wrapped in a `WeightFunction` object,\n*not* passed as a plain array. This indirection solves three problems at\nonce:\n\n- The training matrix produced by\n `ForecasterRecursive._create_train_X_y` is shorter than the original\n series by `max(lags)` rows. 
A positional array would misalign.\n- A *callable* that receives `X_train.index` does a label-based lookup\n into the Series and is immune to that offset regardless of `lags`.\n- `WeightFunction` is a plain class, so it is picklable and gets saved to\n disk together with the forecaster (a closure-based weight function\n would not be).\n\n::: {#exm-weight-function}\n\n## Label-based weight lookup\n\n::: {#df7be445 .cell execution_count=2}\n``` {.python .cell-code}\nimport pandas as pd\nimport numpy as np\n\nweights = pd.Series([1, 1, 0, 0, 0, 1, 1, 1], dtype=float,\n index=pd.date_range(\"2020-01-01\", periods=8, freq=\"h\"))\n\nclass WeightFunction:\n    def __init__(self, weights_series):\n        self.weights_series = weights_series\n    def __call__(self, index):\n        return self.weights_series.reindex(index).fillna(1.0).values\n\nwf = WeightFunction(weights)\nquery_idx = weights.index[1:5]\nprint(\"Queried timestamps :\", query_idx.tolist())\nprint(\"Returned weights :\", wf(query_idx).tolist())\n```\n\n::: {.cell-output .cell-output-stdout}\n```\nQueried timestamps : [Timestamp('2020-01-01 01:00:00'), Timestamp('2020-01-01 02:00:00'), Timestamp('2020-01-01 03:00:00'), Timestamp('2020-01-01 04:00:00')]\nReturned weights : [1.0, 0.0, 0.0, 0.0]\n```\n:::\n:::\n\n\n:::\n\n::: {.callout-note}\n## Why we weight instead of drop\n\nWe *could* delete every row near a gap, but that breaks the\n\"regular hourly grid\" invariant the recursive forecaster needs. By keeping\nthe rows but giving them weight `0`, the time axis stays uniform and the\nloss function still ignores the bad data.\n:::\n\n## Stage 4 — Exogenous feature engineering\n\n*Why this step exists.* A target series alone often does not contain enough\nsignal to forecast accurately — but combining it with calendar and weather\ncontext typically does. 
Stage 4 produces four feature DataFrames, each\nindexed on the *extended* timeline `[start, cov_end]` so that the feature\nmatrix is defined both over the training window and over the prediction\nwindow.\n\nThe four feature categories, in the order they will be concatenated in\nstage 5, are:\n\n1. Calendar features — `get_calendar_features(start, cov_end, freq=\"h\",\n timezone=timezone)` uses `feature_engine.DatetimeFeatures` to extract\n `month`, `week`, `day_of_week`, and `hour` from the index alone. Because\n these are derived from the index, they are always complete — no\n imputation needed.\n2. Day/night features — `get_day_night_features(start, cov_end, location,\n ...)` computes sunrise and sunset times for the configured geographic\n location using the `astral` library. Sunrise and sunset hours are\n rounded to the nearest hour, and an `is_daylight` flag equals `1`\n during the hours between sunrise and sunset. Per-day sunrise/sunset\n values are cached so multi-year series do not recompute the solar\n position for every hourly row.\n3. Weather features — `get_weather_features(...)` fetches weather data\n for the configured latitude/longitude and applies a sliding window\n transform that computes rolling mean, max, and min over one-day and\n seven-day windows for each numeric weather column. The helper itself\n resolves missing values (backward fill + curate step) before returning,\n so callers can treat the result as gap-free.\n4. Holiday features — `get_holiday_features(...)` calls the `holidays`\n library for the configured `country_code` and `state`, validates the\n returned dates with an internal curate step, and reindexes the binary\n flag to the hourly grid with `fill_value=0`. 
The model sees a clean\n `0 / 1` column it can split on directly.\n\n::: {#exm-day-night}\n\n## Daylight flag for a June day\n\n::: {#9ea17010 .cell execution_count=3}\n``` {.python .cell-code}\nimport pandas as pd\nimport numpy as np\n\ndates = pd.date_range(\"2020-06-01\", periods=24, freq=\"h\", tz=\"UTC\")\nsunrise_hour = 4 # approximate June sunrise at 51°N\nsunset_hour = 21 # approximate June sunset at 51°N\n\nis_daylight = np.where(\n (dates.hour >= sunrise_hour) & (dates.hour < sunset_hour), 1, 0\n)\ndf_sun = pd.DataFrame({\"hour\": dates.hour, \"is_daylight\": is_daylight}, index=dates)\nprint(df_sun.groupby(\"is_daylight\")[\"hour\"].apply(list).to_string())\n```\n\n::: {.cell-output .cell-output-stdout}\n```\nis_daylight\n0 [0, 1, 2, 3, 21, 22, 23]\n1 [4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,...\n```\n:::\n:::\n\n\n:::\n\n## Stage 5 — Combining and encoding exogenous features\n\n*Why this step exists.* The four DataFrames from stage 4 need to be joined\ninto one matrix and then transformed so the model can learn from the\nperiodic structure of calendar variables.\n\nThe pipeline concatenates the four feature DataFrames column-wise in this\nexact order:\n\n```python\nexogenous_features = pd.concat(\n [calendar_features, sun_light_features, weather_features, holiday_features],\n axis=1,\n)\n```\n\nIt then runs a *single* hard check: if any `NaN` survived into\n`exogenous_features`, a `ValueError` is raised reporting the count of\nmissing entries. This guard is the catch-all for any silent gap left by a\nfeature helper.\n\nTwo transformations follow:\n\n1. Cyclical encoding — `apply_cyclical_encoding(data,\n drop_original=False)` replaces each periodic integer column (`month`,\n `week`, `day_of_week`, `hour`, `sunrise_hour`, `sunset_hour`) by its\n sine and cosine on a unit circle scaled to the natural period of the\n column. 
Because `drop_original=False`, the original integer columns are\n kept alongside the new sine/cosine columns — the model can use either\n representation depending on which is more informative.\n2. Interaction terms — `create_interaction_features(exogenous_features,\n weather_aligned)` uses `sklearn.preprocessing.PolynomialFeatures` with\n `interaction_only=True` to produce pairwise products of the calendar\n cyclical columns with weather-window columns, raw weather columns, and\n (if present) the holiday column. These columns are prefixed with\n `poly_` and capture joint effects such as \"cold morning *and* weekday\n peak\" that neither feature can describe on its own.\n\n::: {#exm-cyclical-month}\n\n## Cyclical encoding of the month\n\n::: {#78ae47f5 .cell execution_count=4}\n``` {.python .cell-code}\nimport numpy as np\nimport pandas as pd\n\ndates = pd.date_range(\"2020-01-01\", periods=12, freq=\"ME\")\nmonth = dates.month\nmonth_sin = np.sin(2 * np.pi * month / 12)\nmonth_cos = np.cos(2 * np.pi * month / 12)\n\npd.DataFrame(\n {\"month\": month, \"sin\": month_sin.round(4), \"cos\": month_cos.round(4)},\n index=dates,\n)\n```\n\n::: {.cell-output .cell-output-display execution_count=4}\n```{=html}\n
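<div>\n<table border=\"1\" class=\"dataframe\">\n  <thead>\n    <tr style=\"text-align: right;\"><th></th><th>month</th><th>sin</th><th>cos</th></tr>\n  </thead>\n  <tbody>\n    <tr><th>2020-01-31</th><td>1</td><td>0.500</td><td>0.866</td></tr>\n    <tr><th>2020-02-29</th><td>2</td><td>0.866</td><td>0.500</td></tr>\n    <tr><th>2020-03-31</th><td>3</td><td>1.000</td><td>0.000</td></tr>\n    <tr><th>2020-04-30</th><td>4</td><td>0.866</td><td>-0.500</td></tr>\n    <tr><th>2020-05-31</th><td>5</td><td>0.500</td><td>-0.866</td></tr>\n    <tr><th>2020-06-30</th><td>6</td><td>0.000</td><td>-1.000</td></tr>\n    <tr><th>2020-07-31</th><td>7</td><td>-0.500</td><td>-0.866</td></tr>\n    <tr><th>2020-08-31</th><td>8</td><td>-0.866</td><td>-0.500</td></tr>\n    <tr><th>2020-09-30</th><td>9</td><td>-1.000</td><td>-0.000</td></tr>\n    <tr><th>2020-10-31</th><td>10</td><td>-0.866</td><td>0.500</td></tr>\n    <tr><th>2020-11-30</th><td>11</td><td>-0.500</td><td>0.866</td></tr>\n    <tr><th>2020-12-31</th><td>12</td><td>-0.000</td><td>1.000</td></tr>\n  </tbody>\n</table>\n</div>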
\n```\n:::\n:::\n\n\n:::\n\n::: {.callout-note}\n## Why sine and cosine instead of just the hour number?\n\nIf you feed an integer hour 0–23 directly to a tree-based model, hour 23\nand hour 0 look *very* far apart numerically — even though they are\nneighbours on a clock. Sine and cosine wrap the integer onto a unit\ncircle, so adjacent hours stay adjacent in the encoded space and the model\ndoes not learn a spurious discontinuity at midnight.\n:::\n\n## Stage 6 — Feature selection\n\n*Why this step exists.* The combined matrix from stage 5 contains many\ncolumns the caller may not want. Stage 6 builds the final list of column\nnames that will be passed as `exog` to the forecaster.\n\n`select_exogenous_features(exogenous_features, weather_aligned,\ninclude_weather_windows, include_holiday_features, poly_features_degree)`\napplies these rules:\n\n- Always included: the cyclical sine/cosine columns (matched by the\n regular expression `_sin$|_cos$`) and the raw weather columns (which\n are exactly the columns of `weather_aligned`).\n- Included only if `include_weather_windows=True`: the rolling-window\n weather features.\n- Included only if `include_holiday_features=True`: the binary\n `is_holiday` column.\n- Included only if `poly_features_degree >= 2`: the `poly_`-prefixed\n interaction features (capped to `max_poly_features` by mutual\n information).\n\nThe final list is deduplicated with the order-preserving idiom\n`list(dict.fromkeys(exog_features))` so that no column name appears twice\n(a duplicate column would crash the forecaster's lag-matrix construction).\n\n## Stage 7 — Merging target and exogenous data\n\n*Why this step exists.* The forecaster expects one DataFrame with the\ntarget column and a separate DataFrame of exogenous features, both indexed\non the same training window — and a separate exogenous slice covering the\nfuture prediction window.\n\n`merge_data_and_covariates(data, exogenous_features, target_columns,\nexog_features, start, end, 
cov_end, forecast_horizon, cast_dtype=\"float32\")`\nreturns three DataFrames:\n\n- `data_with_exog` — an *inner join* of the target columns and the\n selected exogenous columns over `[start, end]`. The inner join is the\n belt-and-braces guarantee that only rows where both sides are defined\n survive; any residual misalignment is dropped here.\n- `exo_tmp` — the slice of the exogenous feature matrix that covers\n `[start, end]`. Useful for inspection.\n- `exo_pred` — the slice that covers `(end, cov_end]`, the future window\n used at prediction time.\n\nEvery column of `data_with_exog` is cast to `float32` to halve memory\nconsumption during lag-matrix construction without measurable loss in\npredictive accuracy for this class of model.\n\n## Stage 8 — Train / validation / test split\n\n*Why this step exists.* The model must be evaluated on data it has never\nseen. Random shuffling is dangerous in time series — it lets the model\n\"learn from the future\". The split here is purely temporal — see\n@def-train-val-test.\n\n`split_rel_train_val_test(data_with_exog, perc_train=train_ratio,\nperc_val=1.0 - train_ratio)` slices the rows in order: the first\n`train_ratio` fraction goes to training, and the remainder goes to\nvalidation. 
With the default `train_ratio=0.8`, that is 80 % training and\n20 % validation — the test set is empty because `perc_val = 1 - train_ratio`\nconsumes all remaining rows.\n\nThe boundary timestamp `end_validation` — the last index of the combined\ntrain + validation segment — is the cutoff used for `forecaster.fit(...)`.\n\n::: {#exm-split-ratios}\n\n## Computing train/val/test sizes\n\n::: {#c3468997 .cell execution_count=5}\n``` {.python .cell-code}\nimport pandas as pd\nimport numpy as np\n\nn = 100\nidx = pd.date_range(\"2020-01-01\", periods=n, freq=\"h\")\ny = pd.Series(np.arange(n, dtype=float), index=idx)\n\ntrain_ratio = 0.8\nperc_val = 1.0 - train_ratio\nn_train = int(round(n * train_ratio))\nn_val = int(round(n * perc_val))\nn_test = n - n_train - n_val\n\nprint(f\"Total rows : {n}\")\nprint(f\"Train rows : {n_train} ({train_ratio:.0%})\")\nprint(f\"Val rows : {n_val} ({perc_val:.0%})\")\nprint(f\"Test rows : {n_test}\")\nprint(f\"Train ends : {y.index[n_train - 1]}\")\n```\n\n::: {.cell-output .cell-output-stdout}\n```\nTotal rows : 100\nTrain rows : 80 (80%)\nVal rows : 20 (20%)\nTest rows : 0\nTrain ends : 2020-01-04 07:00:00\n```\n:::\n:::\n\n\n:::\n\n::: {.callout-note}\n## Why temporal, not random?\n\nIf you shuffled rows before splitting, the model might be trained on hour\n14 of 1 March 2024 and tested on hour 13 of the same day — i.e. it would\nknow the \"answer\" sits one step before the \"question\". A temporal split\nforces the model to forecast a real future from a real past.\n:::\n\n## Stage 9 — Training or loading recursive forecasters\n\n*Why this step exists.* Retraining a forecaster takes time. For repeated\ncalls with the same configuration (a typical operational pattern), we want\nto load previously trained models from disk instead.\n\nThe decision is governed by two parameters:\n\n- `force_train` (default `True`): if `True`, all forecasters are\n retrained from scratch and any cached models are overwritten. 
Set\n `force_train=False` to skip training when cached models exist.\n- `model_dir`: directory where the joblib `.pkl` files live. If `None`\n (the default), the pipeline uses\n `get_cache_home() / \"forecasters\"` — i.e.\n `~/spotforecast2_cache/forecasters` unless overridden by the\n `SPOTFORECAST2_CACHE` environment variable.\n\nWhen `force_train=False` *and* the model directory exists,\n`load_forecasters(target_columns, model_dir)` attempts to deserialise one\n`ForecasterRecursive` object per target column. Targets whose model file\nis missing are collected into `targets_to_train` and trained below; the\nothers are reused.\n\nWhen training is required, a `ForecasterRecursive` is constructed for\neach target in `targets_to_train` (see @def-recursive-forecaster for the\nunderlying idea):\n\n```python\nforecaster = ForecasterRecursive(\n estimator=estimator, # default: LGBMRegressor(random_state=1234, verbose=-1)\n lags=lags, # how many past values to use as features\n window_features=RollingFeatures(\n stats=[\"mean\"], window_sizes=window_size,\n ), # adds a rolling-mean feature alongside the lags\n weight_func=weight_func, # the WeightFunction from stage 3\n)\nforecaster.fit(\n y=data_with_exog[target].loc[:end_validation].squeeze(),\n exog=data_with_exog[exog_features].loc[:end_validation],\n)\n```\n\nNote that the fit uses `:end_validation`, not the end of the full dataset\n— any test portion (empty with the split used in Stage 8, where\n`perc_val = 1 - train_ratio`) is held out so a downstream evaluation step\ncan score the model on truly unseen data.\n\nAfter training, `save_forecasters(forecasters, model_dir)` serialises\neach fitted object with joblib, one file per target column. 
Because\n`WeightFunction` is a regular class (not a closure), it survives the\npickle/unpickle round trip together with the rest of the model.\n\nA `show_progress=True` flag wraps the per-target loop in a `tqdm`\nprogress bar — useful for runs with many target columns.\n\n::: {#a76ebf5b .cell execution_count=6}\n``` {.python .cell-code}\nfrom lightgbm import LGBMRegressor\n\nestimator = LGBMRegressor(random_state=1234, verbose=-1)\nprint(f\"random_state : {estimator.random_state}\")\nprint(f\"verbose : {estimator.verbose}\")\nprint(f\"n_estimators : {estimator.n_estimators}\")\n```\n\n::: {.cell-output .cell-output-stdout}\n```\nrandom_state : 1234\nverbose : -1\nn_estimators : 100\n```\n:::\n:::\n\n\n::: {.callout-note}\n## What a recursive forecaster does, one step at a time\n\nAt step 1, the forecaster sees the most recent `lags` values of the\ntarget plus the exogenous features for step 1 and predicts the value for\nstep 1. At step 2, it shifts the lag window by one — using the\n*just-predicted* step-1 value as the most recent lag — and predicts step\n2. It keeps doing this for `forecast_horizon` steps. That is why the\nexogenous matrix for the future window must be fully known *before*\nprediction starts: the forecaster cannot ask \"what will the temperature\nbe at step 5?\" mid-way through.\n:::\n\n## Stage 10 — Prediction\n\n*Why this step exists.* This is the payoff: turn the trained forecasters\nand the future exogenous matrix into actual forecasts.\n\n`predict_multivariate(recursive_forecasters, steps_ahead=forecast_horizon, exog=exo_pred[exog_features], show_progress=show_progress)` iterates over\nthe trained forecasters dictionary and calls `.predict(steps=horizon, exog=...)` on each. The exogenous matrix passed to every forecaster is the\nsame slice `exo_pred[exog_features]` covering exactly `forecast_horizon`\nsteps after `end_validation`. 
The result is assembled into a `predictions`\nDataFrame with one column per target and `forecast_horizon` rows.\n\n::: {#c4031d4e .cell execution_count=7}\n``` {.python .cell-code}\nimport numpy as np\nimport pandas as pd\n\nforecast_horizon = 24\nn_targets = 11\nrng = np.random.default_rng(42)\n\npred_index = pd.date_range(\"2023-01-02\", periods=forecast_horizon, freq=\"h\")\npredictions = pd.DataFrame(\n rng.standard_normal((forecast_horizon, n_targets)),\n index=pred_index,\n columns=[f\"col_{i+1}\" for i in range(n_targets)],\n)\n\nprint(f\"Predictions shape : {predictions.shape}\")\nprint(f\"First timestamp : {predictions.index[0]}\")\nprint(f\"Last timestamp : {predictions.index[-1]}\")\npredictions.head(3).round(4)\n```\n\n::: {.cell-output .cell-output-stdout}\n```\nPredictions shape : (24, 11)\nFirst timestamp : 2023-01-02 00:00:00\nLast timestamp : 2023-01-02 23:00:00\n```\n:::\n\n::: {.cell-output .cell-output-display execution_count=7}\n```{=html}\n
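<div>\n<table border=\"1\" class=\"dataframe\">\n  <thead>\n    <tr style=\"text-align: right;\"><th></th><th>col_1</th><th>col_2</th><th>col_3</th><th>col_4</th><th>col_5</th><th>col_6</th><th>col_7</th><th>col_8</th><th>col_9</th><th>col_10</th><th>col_11</th></tr>\n  </thead>\n  <tbody>\n    <tr><th>2023-01-02 00:00:00</th><td>0.3047</td><td>-1.0400</td><td>0.7505</td><td>0.9406</td><td>-1.9510</td><td>-1.3022</td><td>0.1278</td><td>-0.3162</td><td>-0.0168</td><td>-0.8530</td><td>0.8794</td></tr>\n    <tr><th>2023-01-02 01:00:00</th><td>0.7778</td><td>0.0660</td><td>1.1272</td><td>0.4675</td><td>-0.8593</td><td>0.3688</td><td>-0.9589</td><td>0.8785</td><td>-0.0499</td><td>-0.1849</td><td>-0.6809</td></tr>\n    <tr><th>2023-01-02 02:00:00</th><td>1.2225</td><td>-0.1545</td><td>-0.4283</td><td>-0.3521</td><td>0.5323</td><td>0.3654</td><td>0.4127</td><td>0.4308</td><td>2.1416</td><td>-0.4064</td><td>-0.5122</td></tr>\n  </tbody>\n</table>\n</div>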
\n```\n:::\n:::\n\n\n## Metadata and return values\n\nThe function returns a three-element tuple. The first element is the\n`predictions` DataFrame described above. The second element is a\n`metadata` dictionary that records every parameter and intermediate shape\nof the run:\n\n- `forecast_horizon`\n- `target_columns`\n- `exog_features`\n- `n_exog_features`\n- `train_size`, `val_size`, `test_size`\n- `data_shape_original`, `data_shape_merged`\n- `training_end`\n- `prediction_start`, `prediction_end`\n- `lags`, `window_size`, `contamination`\n- `n_outliers` (computed as `outliers.sum()` for the Series case, or\n `len(outliers)` otherwise)\n\nThis dictionary is a self-contained audit record — it lets a future reader\nreconstruct the run's configuration without rerunning the pipeline.\n\nThe third element is the `recursive_forecasters` dictionary keyed by\ntarget column name. Returning the fitted objects lets the caller inspect\ninternal state (feature names, lag matrices, fitted estimator parameters)\nor call `.predict` again with a different horizon, without retraining.\n\n## Aggregation\n\nStage 10 returns one forecast Series per target column — eleven independent\ntrajectories for the bundled demo data. Operational consumers often need a\nsingle scalar trajectory instead: a regional net position, a portfolio-level\nload forecast, or a balancing-zone aggregate. 
The helper\n`agg_predict(predictions, weights=weights)` performs exactly that reduction.\nIt takes the prediction DataFrame from Stage 10 and returns a Series with the\nsame `DatetimeIndex`, computed as the weighted sum of the per-column\nforecasts.\n\nThe `weights` argument accepts three forms:\n\n- a `list` or `numpy.ndarray` matched *positionally* against the columns of\n `predictions` (length must equal the column count),\n- a `dict` of `{column_name: weight}` matched *by name*, useful when the\n caller wants to be explicit about which column carries which weight,\n- `None`, which falls back to the package default.\n\nA list or array entry may be negative. A positive weight adds that column's\nforecast to the aggregate; a negative weight subtracts it. This signed\nconvention is used with the eleven-element vector\n`[+1, +1, -1, -1, +1, -1, +1, +1, +1, -1, +1]` to express a net-position\naggregation in which the first, second, fifth, seventh, eighth, ninth, and\neleventh columns are added and the remaining columns are subtracted.\n\nThe n2n pipeline remains available and is used by `task_safe_demo`. 
The n-to-1\ntask (`tasks/task_safe_n_to_1_with_covariates_and_dataframe.py`) now runs on\n`spotforecast2_safe.multitask`; see the\n[API reference](../reference/tasks.task_safe_n_to_1_with_covariates_and_dataframe.qmd)\nfor its `ConfigMulti`-driven workflow and `agg_weights` configuration.\n\n::: {#e62379e0 .cell execution_count=8}\n``` {.python .cell-code}\nfrom spotforecast2_safe.processing.agg_predict import agg_predict\n\nweights = [1.0, 1.0, -1.0, -1.0, 1.0, -1.0, 1.0, 1.0, 1.0, -1.0, 1.0]\ncombined_prediction = agg_predict(predictions, weights=weights)\n\nprint(f\"combined shape : {combined_prediction.shape}\")\nprint(f\"first timestamp : {combined_prediction.index[0]}\")\nprint(f\"last timestamp : {combined_prediction.index[-1]}\")\ncombined_prediction.head(3).round(4)\n```\n\n::: {.cell-output .cell-output-stdout}\n```\ncombined shape : (24,)\nfirst timestamp : 2023-01-02 00:00:00\nlast timestamp : 2023-01-02 23:00:00\n```\n:::\n\n::: {.cell-output .cell-output-display execution_count=8}\n```\n2023-01-02 00:00:00 -1.5479\n2023-01-02 01:00:00 -2.6054\n2023-01-02 02:00:00 4.8947\nFreq: h, dtype: float64\n```\n:::\n:::\n\n\n## End-to-end example {#sec-end-to-end}\n\nThe cell below runs the entire pipeline with very small parameters\n(`forecast_horizon=2`, `lags=4`, `window_size=8`), using\n`tempfile.mkdtemp()` as the model directory so the run does not pollute\nthe user's persistent cache.\n\n::: {#exm-end-to-end}\n\n## Full pipeline on small parameters\n\n::: {#c8fbcddb .cell execution_count=9}\n``` {.python .cell-code}\nimport tempfile\n\nfrom spotforecast2_safe.processing.agg_predict import agg_predict\nfrom spotforecast2_safe.processing.n2n_predict_with_covariates import (\n n2n_predict_with_covariates,\n)\n\npredictions, metadata, forecasters = n2n_predict_with_covariates(\n forecast_horizon=2,\n lags=4,\n window_size=8,\n force_train=True,\n model_dir=tempfile.mkdtemp(),\n verbose=False,\n)\n\nprint(f\"predictions shape : 
{predictions.shape}\")\nprint(f\"target columns : {metadata['target_columns'][:3]} ...\")\nprint(f\"# exog features : {metadata['n_exog_features']}\")\nprint(f\"training_end : {metadata['training_end']}\")\nprint(f\"prediction window : {metadata['prediction_start']} → {metadata['prediction_end']}\")\npredictions.head().round(4)\n\nweights = [1.0, 1.0, -1.0, -1.0, 1.0, -1.0, 1.0, 1.0, 1.0, -1.0, 1.0]\ncombined_prediction = agg_predict(predictions, weights=weights)\n\nprint(f\"combined shape : {combined_prediction.shape}\")\ncombined_prediction.head(3).round(4)\n```\n\n::: {.cell-output .cell-output-stdout}\n```\npredictions shape : (2, 11)\ntarget columns : ['A', 'B', 'C'] ...\n# exog features : 27\ntraining_end : 2021-12-24 21:00:00+00:00\nprediction window : 2021-12-24 22:00:00+00:00 → 2021-12-24 23:00:00+00:00\ncombined shape : (2,)\n```\n:::\n\n::: {.cell-output .cell-output-display execution_count=9}\n```\n2021-12-24 22:00:00+00:00 19116.1863\n2021-12-24 23:00:00+00:00 15714.5288\nFreq: h, dtype: float64\n```\n:::\n:::\n\n\n:::\n\nThe same call with `force_train=False` and the same `model_dir` would\nload the just-trained forecasters from disk instead of retraining — the\ncore caching mechanism behind the pipeline's speedup on repeated runs.\n\n## The complete execution flow\n\nA single call to `n2n_predict_with_covariates` follows this invariant\nsequence:\n\n1. Input validation runs first.\n2. The target data is loaded and cleaned (Stage 1).\n3. Outlier positions are replaced with `NaN` (Stage 2).\n4. Missing values are imputed and sample weights are computed (Stage 3).\n5. Four categories of exogenous features are constructed over the full\n extended time window (Stage 4).\n6. The features are concatenated, validated for completeness, cyclically\n encoded, and augmented with interaction terms (Stage 5).\n7. A feature-selection step reduces the matrix to the columns relevant\n to the configured pipeline variant (Stage 6).\n8. 
The selected features are merged with the target data over the\n historical window (Stage 7).\n9. The merged data is split temporally into train, validation, and test\n segments (Stage 8).\n10. Forecasters are either loaded from disk or trained from scratch and\n then persisted (Stage 9).\n11. Predictions are generated for the `forecast_horizon` steps beyond the\n training end (Stage 10).\n\nEvery public parameter is validated at entry, every exogenous feature\nmatrix is checked for completeness before it enters the model, and every\nfitted object is serialised in a form that survives process boundaries.\nThese invariants make the pipeline safe to embed in automated batch jobs\nwhere a silent failure would not be discovered until long after the\nprediction window has closed.\n\n## sf2-safe API surface used in this walkthrough\n\nThe first table below lists every `spotforecast2_safe` symbol referenced\n**directly** by the orchestrator or named in this page's prose. The\nsecond table extends that set with the **transitive** symbols that the\ndirect dependencies themselves import from `spotforecast2_safe` — these\nare the classes and helpers a reader will encounter as soon as they\nopen one of the level-1 modules. 
Third-party dependencies (pandas,\nnumpy, scikit-learn, lightgbm, astral, holidays, feature-engine,\njoblib, tqdm) are deliberately omitted from both tables.\n\n### Direct (orchestrator + prose)\n\n| Symbol | Kind | sf2-safe module | Stage |\n| --- | --- | --- | --- |\n| [`n2n_predict_with_covariates`](`spotforecast2_safe.processing.n2n_predict_with_covariates.n2n_predict_with_covariates`) | function | `spotforecast2_safe.processing.n2n_predict_with_covariates` | orchestrator |\n| [`fetch_data`](`spotforecast2_safe.data.fetch_data.fetch_data`) | function | `spotforecast2_safe.data.fetch_data` | Stage 1 |\n| [`get_package_data_home`](`spotforecast2_safe.data.fetch_data.get_package_data_home`) | function | `spotforecast2_safe.data.fetch_data` | Stage 1 |\n| [`get_cache_home`](`spotforecast2_safe.data.fetch_data.get_cache_home`) | function | `spotforecast2_safe.data.fetch_data` | Stage 9 |\n| `get_start_end` | function | `spotforecast2_safe.preprocessing.curate_data` | Stage 1 |\n| `basic_ts_checks` | function | `spotforecast2_safe.preprocessing.curate_data` | Stage 1 |\n| `agg_and_resample_data` | function | `spotforecast2_safe.preprocessing.curate_data` | Stage 1 |\n| `mark_outliers` | function | `spotforecast2_safe.preprocessing.outlier` | Stage 2 |\n| `get_missing_weights` | function | `spotforecast2_safe.preprocessing.imputation` | Stage 3 |\n| `WeightFunction` | class | `spotforecast2_safe.preprocessing` (re-export of `.imputation.WeightFunction`) | Stage 3 |\n| `get_calendar_features` | function | `spotforecast2_safe.calendar` | Stage 4 |\n| `get_day_night_features` | function | `spotforecast2_safe.calendar` | Stage 4 |\n| `get_holiday_features` | function | `spotforecast2_safe.calendar` | Stage 4 |\n| `get_weather_features` | function | `spotforecast2_safe.weather` | Stage 4 |\n| `apply_cyclical_encoding` | function | `spotforecast2_safe.manager.features` | Stage 5 |\n| `create_interaction_features` | function | `spotforecast2_safe.manager.features` | Stage 5 
|\n| `select_exogenous_features` | function | `spotforecast2_safe.manager.features` | Stage 6 |\n| `merge_data_and_covariates` | function | `spotforecast2_safe.manager.features` | Stage 7 |\n| `split_rel_train_val_test` | function | `spotforecast2_safe.splitter.split` | Stage 8 |\n| [`ForecasterRecursive`](`spotforecast2_safe.forecaster.recursive.ForecasterRecursive`) | class | `spotforecast2_safe.forecaster.recursive` | Stage 9 |\n| `ForecasterRecursive._create_train_X_y` | method | `spotforecast2_safe.forecaster.recursive.ForecasterRecursive` | Stage 3 (referenced) |\n| `RollingFeatures` | class | `spotforecast2_safe.preprocessing` (re-export of `.rolling.RollingFeatures`) | Stage 9 |\n| `load_forecasters` | function | `spotforecast2_safe.manager.persistence` | Stage 9 |\n| `save_forecasters` | function | `spotforecast2_safe.manager.persistence` | Stage 9 |\n| `predict_multivariate` | function | `spotforecast2_safe.forecaster.utils` | Stage 10 |\n\n### Transitive (imported by the direct dependencies)\n\nThese symbols are not invoked from the orchestrator or named in the\nprose above, but the direct dependencies pull them in from\n`spotforecast2_safe`. 
A reader who opens any of the level-1 modules\nwill land on these next, so they are part of the same internal\ncontract.\n\n| Symbol | Kind | sf2-safe module | Surfaces in |\n| --- | --- | --- | --- |\n| `QuantileBinner` | class | `spotforecast2_safe.preprocessing` (re-export of `._binner.QuantileBinner`) | `ForecasterRecursive` internals (residual-binning) |\n| `TimeSeriesDifferentiator` | class | `spotforecast2_safe.preprocessing` (re-export of `._differentiator.TimeSeriesDifferentiator`) | `ForecasterRecursive` internals (differentiation) |\n| `LinearlyInterpolateTS` | class | `spotforecast2_safe.preprocessing.linearly_interpolate_ts` | `get_missing_weights` imputation path |\n| `WeatherService` | class | `spotforecast2_safe.weather` | `fetch_data` weather-fetch helper |\n| `create_holiday_df` | function | `spotforecast2_safe.calendar` (re-export of `.holiday.create_holiday_df`) | `fetch_data.fetch_holiday_data` |\n| `curate_weather` | function | `spotforecast2_safe.preprocessing.curate_data` | `get_weather_features` post-fetch curation |\n| `convert_to_utc` | function | `spotforecast2_safe.utils.convert_to_utc` | `fetch_data` index normalisation |\n| `to_utc_timestamp` | function | `spotforecast2_safe.utils.convert_to_utc` | calendar + weather range-boundary normalisation |\n| `ForecasterBase` | class | `spotforecast2_safe.forecaster.base` | base class of `ForecasterRecursive` |\n| `check_y`, `check_exog`, `check_exog_dtypes`, `check_interval`, `check_predict_input`, `check_residuals_input`, `get_exog_dtypes`, `set_cpu_gpu_device` | functions | `spotforecast2_safe.preprocessing.checking` | fail-safe input validation in `ForecasterRecursive.fit` / `predict` and in `forecaster.utils` |\n| `expand_index`, `input_to_frame`, `transform_dataframe`, `date_to_index_position` | functions | `spotforecast2_safe.preprocessing.data_transform` | index / frame coercion in `forecaster.utils` and `ForecasterRecursive` |\n| `check_select_fit_kwargs`, `initialize_lags`, 
`initialize_weights` | functions | `spotforecast2_safe.preprocessing.forecaster_config` | constructor + fit-time configuration for `ForecasterRecursive` |\n| `check_extract_values_and_index`, `get_style_repr_html`, `initialize_estimator`, `initialize_window_features`, `transform_numpy` | functions | `spotforecast2_safe.forecaster.utils` | helpers consumed by `ForecasterRecursive` (already exported by the module table above) |\n| `DataTransformationWarning`, `NotFittedError`, `ResidualsUsageWarning`, `IgnoredArgumentWarning`, `InputTypeWarning`, `MissingValuesWarning`, `UnknownLevelWarning`, `set_skforecast_warnings` | exceptions / warnings | `spotforecast2_safe.exceptions` | raised across `ForecasterRecursive`, `forecaster.utils`, and the preprocessing layer |\n\n## sf2-safe internal dependencies\n\nThe walkthrough touches the following `spotforecast2_safe` subpackages.\nEach line collapses the symbols listed above into the module they live\nin, so the dependency footprint of this page is visible at a glance.\n\n- `spotforecast2_safe.processing.n2n_predict_with_covariates` — the\n orchestrator entry point exercised by the end-to-end cell.\n- `spotforecast2_safe.data.fetch_data` — `fetch_data`,\n `get_package_data_home`, `get_cache_home`.\n- `spotforecast2_safe.preprocessing.curate_data` — `get_start_end`,\n `basic_ts_checks`, `agg_and_resample_data`.\n- `spotforecast2_safe.preprocessing.outlier` — `mark_outliers`.\n- `spotforecast2_safe.preprocessing.imputation` — `get_missing_weights`,\n `WeightFunction` (re-exported one level up at\n `spotforecast2_safe.preprocessing.WeightFunction`).\n- `spotforecast2_safe.preprocessing.rolling` — `RollingFeatures`\n (re-exported one level up at\n `spotforecast2_safe.preprocessing.RollingFeatures`).\n- `spotforecast2_safe.calendar` — `get_calendar_features`,\n `get_day_night_features`, `get_holiday_features`.\n- `spotforecast2_safe.weather` — `get_weather_features`.\n- `spotforecast2_safe.manager.features` — 
`apply_cyclical_encoding`,\n `create_interaction_features`, `select_exogenous_features`,\n `merge_data_and_covariates`.\n- `spotforecast2_safe.manager.persistence` — `load_forecasters`,\n `save_forecasters` (and the orchestrator additionally calls\n `model_directory_exists`).\n- `spotforecast2_safe.splitter.split` — `split_rel_train_val_test`.\n- `spotforecast2_safe.forecaster.recursive` — `ForecasterRecursive`\n (and its internal `_create_train_X_y` referenced in Stage 3).\n- `spotforecast2_safe.forecaster.utils` — `predict_multivariate` (and\n the helpers `check_extract_values_and_index`, `get_style_repr_html`,\n `initialize_estimator`, `initialize_window_features`, `transform_numpy`\n consumed by `ForecasterRecursive`).\n\nThe following modules are not imported by the orchestrator directly,\nbut the level-1 dependencies above pull them in. Any change to a\npublic symbol in one of these modules can therefore propagate up into\nthis walkthrough's pipeline.\n\n- `spotforecast2_safe.forecaster.base` — `ForecasterBase` (base class\n of `ForecasterRecursive`).\n- `spotforecast2_safe.preprocessing._binner` — `QuantileBinner`\n (re-exported at `spotforecast2_safe.preprocessing.QuantileBinner`,\n used by `ForecasterRecursive` for residual binning).\n- `spotforecast2_safe.preprocessing._differentiator` —\n `TimeSeriesDifferentiator` (re-exported at\n `spotforecast2_safe.preprocessing.TimeSeriesDifferentiator`, used by\n `ForecasterRecursive` for differentiation).\n- `spotforecast2_safe.preprocessing.linearly_interpolate_ts` —\n `LinearlyInterpolateTS`, used by the imputation path of\n `get_missing_weights`.\n- `spotforecast2_safe.preprocessing.checking` — `check_y`, `check_exog`,\n `check_exog_dtypes`, `check_interval`, `check_predict_input`,\n `check_residuals_input`, `get_exog_dtypes`, `set_cpu_gpu_device`\n (fail-safe input validation throughout `ForecasterRecursive` and\n `forecaster.utils`).\n- `spotforecast2_safe.preprocessing.data_transform` — `expand_index`,\n 
`input_to_frame`, `transform_dataframe`, `date_to_index_position`\n (index and frame coercion).\n- `spotforecast2_safe.preprocessing.forecaster_config` —\n `check_select_fit_kwargs`, `initialize_lags`, `initialize_weights`\n (constructor and fit-time configuration).\n- `spotforecast2_safe.utils.convert_to_utc` — `convert_to_utc`,\n `to_utc_timestamp` (timezone normalisation used by `fetch_data`,\n `calendar.*`, and `weather.features`).\n- `spotforecast2_safe.exceptions` — `DataTransformationWarning`,\n `NotFittedError`, `ResidualsUsageWarning`, `IgnoredArgumentWarning`,\n `InputTypeWarning`, `MissingValuesWarning`, `UnknownLevelWarning`,\n `set_skforecast_warnings` (raised across the pipeline).\n\n", + "markdown": "---\ntitle: \"n2n_predict_with_covariates: A Beginner's Walkthrough\"\ndescription: \"A step-by-step explanation of the N-to-N covariate forecasting pipeline for readers new to time-series analysis.\"\n---\n\n## What this pipeline does, in one paragraph\n\nThe function `n2n_predict_with_covariates` takes a table of historical\nmeasurements (one or more numeric columns indexed by time), augments it with\nside information that is known both for the past and for the near future\n(calendar effects, daylight, weather forecasts, public holidays), and then\ntrains one machine-learning forecaster per target column. Each forecaster is\nasked to predict the next `forecast_horizon` time steps. The function returns\nthree things: a DataFrame of predictions, a dictionary of metadata that\nrecords *how* the run was configured, and the fitted forecaster objects\nthemselves (so a caller can re-use them later without retraining). The\nfunction is deliberately one long pipeline rather than many small calls,\nbecause every stage depends tightly on the previous one — and a single entry\npoint makes the whole pipeline easy to embed in a daily batch job.\n\n## Vocabulary {#sec-vocabulary}\n\nThe walkthrough below uses a handful of technical terms. 
Their definitions\nare kept brief here so that the rest of the page can stay focused on the\nmechanics of the pipeline. Each definition is cross-referenceable — later\nsections link back with `@def-…` where it helps the reader.\n\n::: {#def-time-series}\n\n## Time series\n\nA sequence of numeric measurements indexed by timestamps at a regular\ninterval (here: one row per hour).\n\n:::\n\n::: {#def-target-variable}\n\n## Target variable\n\nThe quantity we want to predict — in this pipeline, any column of the\ninput DataFrame.\n\n:::\n\n::: {#def-exogenous-variable}\n\n## Exogenous variable (covariate)\n\nAny other feature known at the same timestamps as the target *and*\navailable for the future window — for example, the hour of the day or a\nweather forecast.\n\n:::\n\n::: {#def-lag}\n\n## Lag\n\nA past value of the same series, e.g. *lag 1* = \"the value one hour ago\",\n*lag 24* = \"the value at the same hour yesterday\".\n\n:::\n\n::: {#def-forecast-horizon}\n\n## Forecast horizon\n\nHow many time steps ahead the model has to predict (e.g.\n`forecast_horizon=24` means \"predict the next 24 hours\").\n\n:::\n\n::: {#def-recursive-forecaster}\n\n## Recursive multi-step forecaster\n\nA model that predicts one step ahead, then feeds its own prediction back\nin as the most recent lag, and repeats. 
We use it because we want long\nhorizons (24+ steps) without training a separate model per step.\n\n:::\n\n::: {#def-train-val-test}\n\n## Train / validation / test split\n\nThe historical data is sliced into three temporally ordered chunks:\ntraining (used to fit the model), validation (used to compare\nconfigurations), and test (held back as unseen data for the final check).\n\n:::\n\n::: {#def-outlier}\n\n## Outlier\n\nA value that is much larger or smaller than the rest of the series —\noften a sensor glitch or a one-off event we do not want the model to\nlearn from.\n\n:::\n\n::: {#def-imputation}\n\n## Imputation\n\nFilling in missing entries so the model sees a complete series.\n\n:::\n\n::: {#def-sample-weight}\n\n## Sample weight\n\nA non-negative number attached to each training row that tells the model\nhow much that row should contribute to the loss. Weight `0` effectively\nignores the row.\n\n:::\n\n::: {#def-cyclical-encoding}\n\n## Cyclical encoding\n\nReplacing a periodic integer like \"hour of day\" by its sine and cosine on\na unit circle, so the model sees that hour 23 and hour 0 are neighbors.\n\n:::\n\n::: {#def-persistence}\n\n## Persistence (serialization)\n\nSaving a trained model to disk so the next run can load it instead of\nretraining.\n\n:::\n\n## Input validation\n\nBefore any computation begins, `n2n_predict_with_covariates` checks that the\nfive numerical parameters make sense:\n\n| Parameter | Allowed range | Error if violated |\n| ------------------- | --------------------- | ------------------------------------------------------------------- |\n| `forecast_horizon` | `> 0` | `ValueError: forecast_horizon must be positive, got …` |\n| `contamination` | `0 ≤ x ≤ 0.5` | `ValueError: contamination must be between 0 and 0.5, got …` |\n| `window_size` | `> 0` | `ValueError: window_size must be positive, got …` |\n| `lags` | `> 0` | `ValueError: lags must be positive, got …` |\n| `train_ratio` | `0 < x < 1` | `ValueError: train_ratio 
must be between 0 and 1, got …` |\n\nThe pipeline follows a *fail-fast* philosophy: rather than silently coercing\na bad value (and producing wrong predictions hours later), it stops at the\nfront door with an informative message. The most consequential check is the\none on `forecast_horizon`: a non-positive value would cause downstream\narray-slicing to return empty results, which the model would happily train\non without complaint.\n\n## Stage 1 — Loading and preparing the target series\n\n*Why this step exists.* A forecasting model can only learn from a clean,\nregularly spaced time series with a known time zone. The first stage's job\nis to produce exactly that, regardless of whether the caller passed a\nDataFrame or relied on the default demo data set.\n\nWhen the caller supplies a `data` argument, the series is forwarded to\n`fetch_data(dataframe=data, timezone=timezone)`. When no data is supplied,\n`fetch_data` reads the bundled demonstration file `demo10.csv` (located via\n`get_package_data_home()`). Either way, the result is a DataFrame with a\ntimezone-aware `DatetimeIndex` and numeric columns for each target. The\ncolumn names are immediately stored in `target_columns`, because they will\ndetermine the names of the saved model files and the columns of the\nprediction DataFrame.\n\nNext, `get_start_end(data, forecast_horizon)` returns four boundary\ntimestamps:\n\n- `start`, `end` — the historical window used for training,\n- `cov_start`, `cov_end` — the same window *extended* forward by\n `forecast_horizon` steps, so that covariates can be aligned with the\n prediction period.\n\n`basic_ts_checks(data)` then verifies that the index is a `DatetimeIndex`,\nstrictly monotonically increasing, and free of gaps. 
Finally,\n`agg_and_resample_data(data)` enforces an hourly grid: sub-hourly data is\naggregated by mean over each hour bin, and the result is a uniformly spaced\ntable that the lag-based forecaster can consume safely.\n\n::: {.callout-note}\n## Why a strict, gap-free hourly index matters\n\nA recursive forecaster retrieves lag 1, lag 2, lag 24, etc. by *positional*\nlookup. If the index skips an hour, \"lag 24\" no longer means \"24 hours\nago\" — it means \"24 rows ago\", which might be 25 wall-clock hours. The\nfail-fast checks here prevent that silent error.\n:::\n\n## Stage 2 — Outlier detection and removal\n\n*Why this step exists.* Real-world sensors occasionally produce values that\nare not signal but noise (a spike from a calibration error, a dip from a\nbrief power loss). Letting the model train on these outliers (see\n@def-outlier) contaminates its idea of \"normal\" behaviour. We replace\nthem with `NaN` so the next stage can treat them the same way it treats\ngenuine gaps.\n\n`mark_outliers(data, contamination=contamination, random_state=1234)`\napplies an [Isolation Forest](https://en.wikipedia.org/wiki/Isolation_forest):\nthe algorithm builds random partitioning trees, and points that are isolated\nin only a few splits are flagged as anomalous. The `contamination` parameter\nis the expected fraction of anomalies per column — `0.01` means \"about 1 %\nof rows should be marked\". The function returns a modified DataFrame\n(outlier positions set to `NaN`) and an outlier-label array (Isolation\nForest's convention: `-1` for outliers, `+1` for inliers). 
The label array\nis kept aside so the metadata dictionary at the end of the pipeline can\nreport the number of detected outliers.\n\nThe fixed `random_state=1234` ensures that two runs on identical input\nproduce identical outlier flags — a core requirement of the reproducible\nartefact design.\n\n::: {#exm-isolation-forest}\n\n## Isolation Forest on injected spikes\n\n::: {#7d15152f .cell execution_count=1}\n``` {.python .cell-code}\nimport numpy as np\nimport pandas as pd\nfrom sklearn.ensemble import IsolationForest\n\nrng = np.random.default_rng(0)\nn = 200\ny = pd.Series(rng.standard_normal(n))\ny.iloc[50] = 15.0 # inject a spike\ny.iloc[100] = -12.0 # inject a dip\n\niso = IsolationForest(contamination=0.01, random_state=1234)\nlabels = iso.fit_predict(y.to_numpy().reshape(-1, 1))\nflagged_positions = np.where(labels == -1)[0].tolist()\n\nprint(f\"Series length : {n}\")\nprint(f\"Injected positions : [50, 100]\")\nprint(f\"Flagged positions : {flagged_positions}\")\n```\n\n::: {.cell-output .cell-output-stdout}\n```\nSeries length : 200\nInjected positions : [50, 100]\nFlagged positions : [50, 100]\n```\n:::\n:::\n\n\n:::\n\n::: {.callout-note}\n## What is an outlier in a time series?\n\nTime-series outliers come in flavours: a one-off spike (instrumentation\nerror), a short burst of \"wrong\" values (a sensor reboot), or a sustained\nshift (a real but rare event like a heat wave). The Isolation Forest\ndefaults used here are tuned to catch the first kind. Treat the\n`contamination` parameter as a guess at how rare such spikes are — if the\ndata is famously clean, set it low; if it is famously messy, set it\nhigher.\n:::\n\n## Stage 3 — Imputation and sample weighting\n\n*Why this step exists.* After stage 2 the series has `NaN`s wherever it\noriginally had missing rows *or* where stage 2 just removed an outlier.\nLag-based models cannot accept `NaN` in the input. 
We fill the gaps via\nimputation (see @def-imputation), but we also remember which rows were\nimputed so the loss function can pay less attention to them by way of\nsample weights (see @def-sample-weight).\n\n`get_missing_weights(data, window_size=window_size)` does two things:\n\n1. It fills the gaps using both forward-fill and backward-fill so no\n `NaN` survives.\n2. It builds a weight series: a binary contamination mask (`1` where the\n value was imputed, `0` otherwise) is propagated forward by `window_size`\n positions with a rolling maximum. Any training row whose lag window\n *touches* an imputed value receives weight `0`. All other rows receive\n weight `1`. The downstream forecaster will then see the filled-in values\n but down-weight their gradient contribution to zero.\n\nThe resulting `weights_series` is wrapped in a `WeightFunction` object,\n*not* passed as a plain array. This indirection solves three problems at\nonce:\n\n- The training matrix produced by\n `ForecasterRecursive._create_train_X_y` is shorter than the original\n series by `max(lags)` rows. 
A positional array would misalign.\n- A *callable* that receives `X_train.index` does a label-based lookup\n into the Series and is immune to that offset regardless of `lags`.\n- `WeightFunction` is a plain class, so it is picklable and gets saved to\n disk together with the forecaster (a closure-based weight function\n would not be).\n\n::: {#exm-weight-function}\n\n## Label-based weight lookup\n\n::: {#37414055 .cell execution_count=2}\n``` {.python .cell-code}\nimport pandas as pd\nimport numpy as np\n\nweights = pd.Series([1, 1, 0, 0, 0, 1, 1, 1], dtype=float,\n index=pd.date_range(\"2020-01-01\", periods=8, freq=\"h\"))\n\nclass WeightFunction:\n def __init__(self, weights_series):\n self.weights_series = weights_series\n def __call__(self, index):\n return self.weights_series.reindex(index).fillna(1.0).values\n\nwf = WeightFunction(weights)\nquery_idx = weights.index[1:5]\nprint(\"Queried timestamps :\", query_idx.tolist())\nprint(\"Returned weights :\", wf(query_idx).tolist())\n```\n\n::: {.cell-output .cell-output-stdout}\n```\nQueried timestamps : [Timestamp('2020-01-01 01:00:00'), Timestamp('2020-01-01 02:00:00'), Timestamp('2020-01-01 03:00:00'), Timestamp('2020-01-01 04:00:00')]\nReturned weights : [1.0, 0.0, 0.0, 0.0]\n```\n:::\n:::\n\n\n:::\n\n::: {.callout-note}\n## Why we weight instead of drop\n\nWe *could* delete every row near a gap, but that breaks the\n\"regular hourly grid\" invariant the recursive forecaster needs. By keeping\nthe rows but giving them weight `0`, the time axis stays uniform and the\nloss function still ignores the bad data.\n:::\n\n## Stage 4 — Exogenous feature engineering\n\n*Why this step exists.* A target series alone often does not contain enough\nsignal to forecast accurately — but combining it with calendar and weather\ncontext typically does. 
Stage 4 produces four feature DataFrames, each\nindexed on the *extended* timeline `[start, cov_end]` so that the feature\nmatrix is defined both over the training window and over the prediction\nwindow.\n\nThe four feature categories, in the order they will be concatenated in\nstage 5, are:\n\n1. Calendar features — `get_calendar_features(start, cov_end, freq=\"h\",\n timezone=timezone)` uses `feature_engine.DatetimeFeatures` to extract\n `month`, `week`, `day_of_week`, and `hour` from the index alone. Because\n these are derived from the index, they are always complete — no\n imputation needed.\n2. Day/night features — `get_day_night_features(start, cov_end, location,\n ...)` computes sunrise and sunset times for the configured geographic\n location using the `astral` library. Sunrise and sunset hours are\n rounded to the nearest hour, and an `is_daylight` flag equals `1`\n during the hours between sunrise and sunset. Per-day sunrise/sunset\n values are cached so multi-year series do not recompute the solar\n position for every hourly row.\n3. Weather features — `get_weather_features(...)` fetches weather data\n for the configured latitude/longitude and applies a sliding window\n transform that computes rolling mean, max, and min over one-day and\n seven-day windows for each numeric weather column. The helper itself\n resolves missing values (backward fill + curate step) before returning,\n so callers can treat the result as gap-free.\n4. Holiday features — `get_holiday_features(...)` calls the `holidays`\n library for the configured `country_code` and `state`, validates the\n returned dates with an internal curate step, and reindexes the binary\n flag to the hourly grid with `fill_value=0`. 
The model sees a clean\n `0 / 1` column it can split on directly.\n\n::: {#exm-day-night}\n\n## Daylight flag for a June day\n\n::: {#a88f572c .cell execution_count=3}\n``` {.python .cell-code}\nimport pandas as pd\nimport numpy as np\n\ndates = pd.date_range(\"2020-06-01\", periods=24, freq=\"h\", tz=\"UTC\")\nsunrise_hour = 4 # approximate June sunrise at 51°N\nsunset_hour = 21 # approximate June sunset at 51°N\n\nis_daylight = np.where(\n (dates.hour >= sunrise_hour) & (dates.hour < sunset_hour), 1, 0\n)\ndf_sun = pd.DataFrame({\"hour\": dates.hour, \"is_daylight\": is_daylight}, index=dates)\nprint(df_sun.groupby(\"is_daylight\")[\"hour\"].apply(list).to_string())\n```\n\n::: {.cell-output .cell-output-stdout}\n```\nis_daylight\n0 [0, 1, 2, 3, 21, 22, 23]\n1 [4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,...\n```\n:::\n:::\n\n\n:::\n\n## Stage 5 — Combining and encoding exogenous features\n\n*Why this step exists.* The four DataFrames from stage 4 need to be joined\ninto one matrix and then transformed so the model can learn from the\nperiodic structure of calendar variables.\n\nThe pipeline concatenates the four feature DataFrames column-wise in this\nexact order:\n\n```python\nexogenous_features = pd.concat(\n [calendar_features, sun_light_features, weather_features, holiday_features],\n axis=1,\n)\n```\n\nIt then runs a *single* hard check: if any `NaN` survived into\n`exogenous_features`, a `ValueError` is raised reporting the count of\nmissing entries. This guard is the catch-all for any silent gap left by a\nfeature helper.\n\nTwo transformations follow:\n\n1. Cyclical encoding — `apply_cyclical_encoding(data,\n drop_original=False)` replaces each periodic integer column (`month`,\n `week`, `day_of_week`, `hour`, `sunrise_hour`, `sunset_hour`) by its\n sine and cosine on a unit circle scaled to the natural period of the\n column. 
Because `drop_original=False`, the original integer columns are\n kept alongside the new sine/cosine columns — the model can use either\n representation depending on which is more informative.\n2. Interaction terms — `create_interaction_features(exogenous_features,\n weather_aligned)` uses `sklearn.preprocessing.PolynomialFeatures` with\n `interaction_only=True` to produce pairwise products of the calendar\n cyclical columns with weather-window columns, raw weather columns, and\n (if present) the holiday column. These columns are prefixed with\n `poly_` and capture joint effects such as \"cold morning *and* weekday\n peak\" that neither feature can describe on its own.\n\n::: {#exm-cyclical-month}\n\n## Cyclical encoding of the month\n\n::: {#337cbe31 .cell execution_count=4}\n``` {.python .cell-code}\nimport numpy as np\nimport pandas as pd\n\ndates = pd.date_range(\"2020-01-01\", periods=12, freq=\"ME\")\nmonth = dates.month\nmonth_sin = np.sin(2 * np.pi * month / 12)\nmonth_cos = np.cos(2 * np.pi * month / 12)\n\npd.DataFrame(\n {\"month\": month, \"sin\": month_sin.round(4), \"cos\": month_cos.round(4)},\n index=dates,\n)\n```\n\n::: {.cell-output .cell-output-display execution_count=4}\n```{=html}\n
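<table border=\"1\" class=\"dataframe\">\n<thead>\n<tr style=\"text-align: right;\"><th></th><th>month</th><th>sin</th><th>cos</th></tr>\n</thead>\n<tbody>\n<tr><th>2020-01-31</th><td>1</td><td>0.500</td><td>0.866</td></tr>\n<tr><th>2020-02-29</th><td>2</td><td>0.866</td><td>0.500</td></tr>\n<tr><th>2020-03-31</th><td>3</td><td>1.000</td><td>0.000</td></tr>\n<tr><th>2020-04-30</th><td>4</td><td>0.866</td><td>-0.500</td></tr>\n<tr><th>2020-05-31</th><td>5</td><td>0.500</td><td>-0.866</td></tr>\n<tr><th>2020-06-30</th><td>6</td><td>0.000</td><td>-1.000</td></tr>\n<tr><th>2020-07-31</th><td>7</td><td>-0.500</td><td>-0.866</td></tr>\n<tr><th>2020-08-31</th><td>8</td><td>-0.866</td><td>-0.500</td></tr>\n<tr><th>2020-09-30</th><td>9</td><td>-1.000</td><td>-0.000</td></tr>\n<tr><th>2020-10-31</th><td>10</td><td>-0.866</td><td>0.500</td></tr>\n<tr><th>2020-11-30</th><td>11</td><td>-0.500</td><td>0.866</td></tr>\n<tr><th>2020-12-31</th><td>12</td><td>-0.000</td><td>1.000</td></tr>\n</tbody>\n</table>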
\n```\n:::\n:::\n\n\n:::\n\n::: {.callout-note}\n## Why sine and cosine instead of just the hour number?\n\nIf you feed an integer hour 0–23 directly to a tree-based model, hour 23\nand hour 0 look *very* far apart numerically — even though they are\nneighbours on a clock. Sine and cosine wrap the integer onto a unit\ncircle, so adjacent hours stay adjacent in the encoded space and the model\ndoes not learn a spurious discontinuity at midnight.\n:::\n\n## Stage 6 — Feature selection\n\n*Why this step exists.* The combined matrix from stage 5 contains many\ncolumns the caller may not want. Stage 6 builds the final list of column\nnames that will be passed as `exog` to the forecaster.\n\n`select_exogenous_features(exogenous_features, weather_aligned,\ninclude_weather_windows, include_holiday_features, poly_features_degree)`\napplies these rules:\n\n- Always included: the cyclical sine/cosine columns (matched by the\n regular expression `_sin$|_cos$`) and the raw weather columns (which\n are exactly the columns of `weather_aligned`).\n- Included only if `include_weather_windows=True`: the rolling-window\n weather features.\n- Included only if `include_holiday_features=True`: the binary\n `is_holiday` column.\n- Included only if `poly_features_degree >= 2`: the `poly_`-prefixed\n interaction features (capped to `max_poly_features` by mutual\n information).\n\nThe final list is deduplicated with the order-preserving idiom\n`list(dict.fromkeys(exog_features))` so that no column name appears twice\n(a duplicate column would crash the forecaster's lag-matrix construction).\n\n## Stage 7 — Merging target and exogenous data\n\n*Why this step exists.* The forecaster expects one DataFrame with the\ntarget column and a separate DataFrame of exogenous features, both indexed\non the same training window — and a separate exogenous slice covering the\nfuture prediction window.\n\n`merge_data_and_covariates(data, exogenous_features, target_columns,\nexog_features, start, end, 
cov_end, forecast_horizon, cast_dtype=\"float32\")`\nreturns three DataFrames:\n\n- `data_with_exog` — an *inner join* of the target columns and the\n selected exogenous columns over `[start, end]`. The inner join is the\n belt-and-braces guarantee that only rows where both sides are defined\n survive; any residual misalignment is dropped here.\n- `exo_tmp` — the slice of the exogenous feature matrix that covers\n `[start, end]`. Useful for inspection.\n- `exo_pred` — the slice that covers `(end, cov_end]`, the future window\n used at prediction time.\n\nEvery column of `data_with_exog` is cast to `float32` to halve memory\nconsumption during lag-matrix construction without measurable loss in\npredictive accuracy for this class of model.\n\n## Stage 8 — Train / validation / test split\n\n*Why this step exists.* The model must be evaluated on data it has never\nseen. Random shuffling is dangerous in time series — it lets the model\n\"learn from the future\". The split here is purely temporal — see\n@def-train-val-test.\n\n`split_rel_train_val_test(data_with_exog, perc_train=train_ratio,\nperc_val=1.0 - train_ratio)` slices the rows in order: the first\n`train_ratio` fraction goes to training, and the remainder goes to\nvalidation. 
With the default `train_ratio=0.8`, that is 80 % training and\n20 % validation — the test set is empty because `perc_val = 1 - train_ratio`\nconsumes all remaining rows.\n\nThe boundary timestamp `end_validation` — the last index of the combined\ntrain + validation segment — is the cutoff used for `forecaster.fit(...)`.\n\n::: {#exm-split-ratios}\n\n## Computing train/val/test sizes\n\n::: {#cf701f36 .cell execution_count=5}\n``` {.python .cell-code}\nimport pandas as pd\nimport numpy as np\n\nn = 100\nidx = pd.date_range(\"2020-01-01\", periods=n, freq=\"h\")\ny = pd.Series(np.arange(n, dtype=float), index=idx)\n\ntrain_ratio = 0.8\nperc_val = 1.0 - train_ratio\nn_train = int(round(n * train_ratio))\nn_val = int(round(n * perc_val))\nn_test = n - n_train - n_val\n\nprint(f\"Total rows : {n}\")\nprint(f\"Train rows : {n_train} ({train_ratio:.0%})\")\nprint(f\"Val rows : {n_val} ({perc_val:.0%})\")\nprint(f\"Test rows : {n_test}\")\nprint(f\"Train ends : {y.index[n_train - 1]}\")\n```\n\n::: {.cell-output .cell-output-stdout}\n```\nTotal rows : 100\nTrain rows : 80 (80%)\nVal rows : 20 (20%)\nTest rows : 0\nTrain ends : 2020-01-04 07:00:00\n```\n:::\n:::\n\n\n:::\n\n::: {.callout-note}\n## Why temporal, not random?\n\nIf you shuffled rows before splitting, the model might be trained on hour\n14 of 1 March 2024 and tested on hour 13 of the same day — i.e. it would\nknow the \"answer\" sits one step before the \"question\". A temporal split\nforces the model to forecast a real future from a real past.\n:::\n\n## Stage 9 — Training or loading recursive forecasters\n\n*Why this step exists.* Retraining a forecaster takes time. For repeated\ncalls with the same configuration (a typical operational pattern), we want\nto load previously trained models from disk instead.\n\nThe decision is governed by two parameters:\n\n- `force_train` (default `True`): if `True`, all forecasters are\n retrained from scratch and any cached models are overwritten. 
Set\n `force_train=False` to skip training when cached models exist.\n- `model_dir`: directory where the joblib `.pkl` files live. If `None`\n (the default), the pipeline uses\n `get_cache_home() / \"forecasters\"` — i.e.\n `~/spotforecast2_cache/forecasters` unless overridden by the\n `SPOTFORECAST2_CACHE` environment variable.\n\nWhen `force_train=False` *and* the model directory exists,\n`load_forecasters(target_columns, model_dir)` attempts to deserialise one\n`ForecasterRecursive` object per target column. Targets whose model file\nis missing are collected into `targets_to_train` and trained below; the\nothers are reused.\n\nWhen training is required, a `ForecasterRecursive` is constructed for\neach target in `targets_to_train` (see @def-recursive-forecaster for the\nunderlying idea):\n\n```python\nforecaster = ForecasterRecursive(\n estimator=estimator, # default: LGBMRegressor(random_state=1234, verbose=-1)\n lags=lags, # how many past values to use as features\n window_features=RollingFeatures(\n stats=[\"mean\"], window_sizes=window_size,\n ), # adds a rolling-mean feature alongside the lags\n weight_func=weight_func, # the WeightFunction from stage 3\n)\nforecaster.fit(\n y=data_with_exog[target].loc[:end_validation].squeeze(),\n exog=data_with_exog[exog_features].loc[:end_validation],\n)\n```\n\nNote that the fit uses `:end_validation`, not the end of the full dataset\n— the test portion is held out so a downstream evaluation step can score\nthe model on truly unseen data. With the split from Stage 8\n(`perc_val = 1 - train_ratio`) that test portion is empty, so\n`end_validation` coincides with the last row of `data_with_exog` and the\nfit in practice sees the whole merged history.\n\nAfter training, `save_forecasters(forecasters, model_dir)` serialises\neach fitted object with joblib, one file per target column. Because
Because\n`WeightFunction` is a regular class (not a closure), it survives the\npickle/unpickle round trip together with the rest of the model.\n\nA `show_progress=True` flag wraps the per-target loop in a `tqdm`\nprogress bar — useful for runs with many target columns.\n\n::: {#fa87622e .cell execution_count=6}\n``` {.python .cell-code}\nfrom lightgbm import LGBMRegressor\n\nestimator = LGBMRegressor(random_state=1234, verbose=-1)\nprint(f\"random_state : {estimator.random_state}\")\nprint(f\"verbose : {estimator.verbose}\")\nprint(f\"n_estimators : {estimator.n_estimators}\")\n```\n\n::: {.cell-output .cell-output-stdout}\n```\nrandom_state : 1234\nverbose : -1\nn_estimators : 100\n```\n:::\n:::\n\n\n::: {.callout-note}\n## What a recursive forecaster does, one step at a time\n\nAt step 1, the forecaster sees the most recent `lags` values of the\ntarget plus the exogenous features for step 1 and predicts the value for\nstep 1. At step 2, it shifts the lag window by one — using the\n*just-predicted* step-1 value as the most recent lag — and predicts step\n2. It keeps doing this for `forecast_horizon` steps. That is why the\nexogenous matrix for the future window must be fully known *before*\nprediction starts: the forecaster cannot ask \"what will the temperature\nbe at step 5?\" mid-way through.\n:::\n\n## Stage 10 — Prediction\n\n*Why this step exists.* This is the payoff: turn the trained forecasters\nand the future exogenous matrix into actual forecasts.\n\n`predict_multivariate(recursive_forecasters, steps_ahead=forecast_horizon,exog=exo_pred[exog_features],show_progress=show_progress)` iterates over\nthe trained forecasters dictionary and calls `.predict(steps=horizon,exog=...)` on each. The exogenous matrix passed to every forecaster is the\nsame slice `exo_pred[exog_features]` covering exactly `forecast_horizon`\nsteps after `end_validation`. 
The result is assembled into a `predictions`\nDataFrame with one column per target and `forecast_horizon` rows.\n\n::: {#9dff7c85 .cell execution_count=7}\n``` {.python .cell-code}\nimport numpy as np\nimport pandas as pd\n\nforecast_horizon = 24\nn_targets = 11\nrng = np.random.default_rng(42)\n\npred_index = pd.date_range(\"2023-01-02\", periods=forecast_horizon, freq=\"h\")\npredictions = pd.DataFrame(\n rng.standard_normal((forecast_horizon, n_targets)),\n index=pred_index,\n columns=[f\"col_{i+1}\" for i in range(n_targets)],\n)\n\nprint(f\"Predictions shape : {predictions.shape}\")\nprint(f\"First timestamp : {predictions.index[0]}\")\nprint(f\"Last timestamp : {predictions.index[-1]}\")\npredictions.head(3).round(4)\n```\n\n::: {.cell-output .cell-output-stdout}\n```\nPredictions shape : (24, 11)\nFirst timestamp : 2023-01-02 00:00:00\nLast timestamp : 2023-01-02 23:00:00\n```\n:::\n\n::: {.cell-output .cell-output-display execution_count=7}\n```{=html}\n
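<div>\n<table border=\"1\" class=\"dataframe\">\n<thead>\n<tr style=\"text-align: right;\">\n<th></th>\n
<th>col_1</th>\n<th>col_2</th>\n<th>col_3</th>\n<th>col_4</th>\n<th>col_5</th>\n<th>col_6</th>\n<th>col_7</th>\n<th>col_8</th>\n<th>col_9</th>\n<th>col_10</th>\n<th>col_11</th>\n</tr>\n</thead>\n<tbody>\n
<tr>\n<th>2023-01-02 00:00:00</th>\n<td>0.3047</td>\n<td>-1.0400</td>\n<td>0.7505</td>\n<td>0.9406</td>\n<td>-1.9510</td>\n<td>-1.3022</td>\n<td>0.1278</td>\n<td>-0.3162</td>\n<td>-0.0168</td>\n<td>-0.8530</td>\n<td>0.8794</td>\n</tr>\n
<tr>\n<th>2023-01-02 01:00:00</th>\n<td>0.7778</td>\n<td>0.0660</td>\n<td>1.1272</td>\n<td>0.4675</td>\n<td>-0.8593</td>\n<td>0.3688</td>\n<td>-0.9589</td>\n<td>0.8785</td>\n<td>-0.0499</td>\n<td>-0.1849</td>\n<td>-0.6809</td>\n</tr>\n
<tr>\n<th>2023-01-02 02:00:00</th>\n<td>1.2225</td>\n<td>-0.1545</td>\n<td>-0.4283</td>\n<td>-0.3521</td>\n<td>0.5323</td>\n<td>0.3654</td>\n<td>0.4127</td>\n<td>0.4308</td>\n<td>2.1416</td>\n<td>-0.4064</td>\n<td>-0.5122</td>\n</tr>\n
</tbody>\n</table>\n</div>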
\n```\n:::\n:::\n\n\n## Metadata and return values\n\nThe function returns a three-element tuple. The first element is the\n`predictions` DataFrame described above. The second element is a\n`metadata` dictionary that records every parameter and intermediate shape\nof the run:\n\n- `forecast_horizon`\n- `target_columns`\n- `exog_features`\n- `n_exog_features`\n- `train_size`, `val_size`, `test_size`\n- `data_shape_original`, `data_shape_merged`\n- `training_end`\n- `prediction_start`, `prediction_end`\n- `lags`, `window_size`, `contamination`\n- `n_outliers` (computed as `outliers.sum()` for the Series case, or\n `len(outliers)` otherwise)\n\nThis dictionary is a self-contained audit record — it lets a future reader\nreconstruct the run's configuration without rerunning the pipeline.\n\nThe third element is the `recursive_forecasters` dictionary keyed by\ntarget column name. Returning the fitted objects lets the caller inspect\ninternal state (feature names, lag matrices, fitted estimator parameters)\nor call `.predict` again with a different horizon, without retraining.\n\n## Aggregation\n\nStage 10 returns one forecast Series per target column — eleven independent\ntrajectories for the bundled demo data. Operational consumers often need a\nsingle scalar trajectory instead: a regional net position, a portfolio-level\nload forecast, or a balancing-zone aggregate. 
The helper\n`agg_predict(predictions, weights=weights)` performs exactly that reduction.\nIt takes the prediction DataFrame from Stage 10 and returns a Series with the\nsame `DatetimeIndex`, computed as the weighted sum of the per-column\nforecasts.\n\nThe `weights` argument accepts three forms:\n\n- a `list` or `numpy.ndarray` matched *positionally* against the columns of\n `predictions` (length must equal the column count),\n- a `dict` of `{column_name: weight}` matched *by name*, useful when the\n caller wants to be explicit about which column carries which weight,\n- `None`, which falls back to the package default.\n\nA list or array entry may be negative. A positive weight adds that column's\nforecast to the aggregate; a negative weight subtracts it. This signed\nconvention is used with the eleven-element vector\n`[+1, +1, -1, -1, +1, -1, +1, +1, +1, -1, +1]` to express a net-position\naggregation in which the first, second, fifth, seventh, eighth, ninth, and\neleventh columns are added and the remaining columns are subtracted.\n\nThe n2n pipeline remains available and is used by `task_safe_demo`. 
The\n`ConfigMulti`-driven, `agg_weights`-configurable n-to-1 workflow it describes now\nruns directly on `spotforecast2_safe.multitask` (`MultiTask` /\n`multitask.runner.run`); the former `spotforecast-safe-n2o1-cov-df` console task\nwas removed in 20.0.0.\n\n::: {#d07a59b0 .cell execution_count=8}\n``` {.python .cell-code}\nfrom spotforecast2_safe.processing.agg_predict import agg_predict\n\nweights = [1.0, 1.0, -1.0, -1.0, 1.0, -1.0, 1.0, 1.0, 1.0, -1.0, 1.0]\ncombined_prediction = agg_predict(predictions, weights=weights)\n\nprint(f\"combined shape : {combined_prediction.shape}\")\nprint(f\"first timestamp : {combined_prediction.index[0]}\")\nprint(f\"last timestamp : {combined_prediction.index[-1]}\")\ncombined_prediction.head(3).round(4)\n```\n\n::: {.cell-output .cell-output-stdout}\n```\ncombined shape : (24,)\nfirst timestamp : 2023-01-02 00:00:00\nlast timestamp : 2023-01-02 23:00:00\n```\n:::\n\n::: {.cell-output .cell-output-display execution_count=8}\n```\n2023-01-02 00:00:00 -1.5479\n2023-01-02 01:00:00 -2.6054\n2023-01-02 02:00:00 4.8947\nFreq: h, dtype: float64\n```\n:::\n:::\n\n\n## End-to-end example {#sec-end-to-end}\n\nThe cell below runs the entire pipeline with very small parameters\n(`forecast_horizon=2`, `lags=4`, `window_size=8`), using\n`tempfile.mkdtemp()` as the model directory so the run does not pollute\nthe user's persistent cache.\n\n::: {#exm-end-to-end}\n\n## Full pipeline on small parameters\n\n::: {#3ba4103b .cell execution_count=9}\n``` {.python .cell-code}\nimport tempfile\n\nfrom spotforecast2_safe.processing.agg_predict import agg_predict\nfrom spotforecast2_safe.processing.n2n_predict_with_covariates import (\n n2n_predict_with_covariates,\n)\n\npredictions, metadata, forecasters = n2n_predict_with_covariates(\n forecast_horizon=2,\n lags=4,\n window_size=8,\n force_train=True,\n model_dir=tempfile.mkdtemp(),\n verbose=False,\n)\n\nprint(f\"predictions shape : {predictions.shape}\")\nprint(f\"target columns : 
{metadata['target_columns'][:3]} ...\")\nprint(f\"# exog features : {metadata['n_exog_features']}\")\nprint(f\"training_end : {metadata['training_end']}\")\nprint(f\"prediction window : {metadata['prediction_start']} → {metadata['prediction_end']}\")\npredictions.head().round(4)\n\nweights = [1.0, 1.0, -1.0, -1.0, 1.0, -1.0, 1.0, 1.0, 1.0, -1.0, 1.0]\ncombined_prediction = agg_predict(predictions, weights=weights)\n\nprint(f\"combined shape : {combined_prediction.shape}\")\ncombined_prediction.head(3).round(4)\n```\n\n::: {.cell-output .cell-output-stdout}\n```\npredictions shape : (2, 11)\ntarget columns : ['A', 'B', 'C'] ...\n# exog features : 27\ntraining_end : 2021-12-24 21:00:00+00:00\nprediction window : 2021-12-24 22:00:00+00:00 → 2021-12-24 23:00:00+00:00\ncombined shape : (2,)\n```\n:::\n\n::: {.cell-output .cell-output-display execution_count=9}\n```\n2021-12-24 22:00:00+00:00 19116.1863\n2021-12-24 23:00:00+00:00 15714.5288\nFreq: h, dtype: float64\n```\n:::\n:::\n\n\n:::\n\nThe same call with `force_train=False` and the same `model_dir` would\nload the just-trained forecasters from disk instead of retraining — the\ncore caching mechanism behind the pipeline's speedup on repeated runs.\n\n## The complete execution flow\n\nA single call to `n2n_predict_with_covariates` follows this invariant\nsequence:\n\n1. Input validation runs first.\n2. The target data is loaded and cleaned (Stage 1).\n3. Outlier positions are replaced with `NaN` (Stage 2).\n4. Missing values are imputed and sample weights are computed (Stage 3).\n5. Four categories of exogenous features are constructed over the full\n extended time window (Stage 4).\n6. The features are concatenated, validated for completeness, cyclically\n encoded, and augmented with interaction terms (Stage 5).\n7. A feature-selection step reduces the matrix to the columns relevant\n to the configured pipeline variant (Stage 6).\n8. 
The selected features are merged with the target data over the\n historical window (Stage 7).\n9. The merged data is split temporally into train, validation, and test\n segments (Stage 8).\n10. Forecasters are either loaded from disk or trained from scratch and\n then persisted (Stage 9).\n11. Predictions are generated for the `forecast_horizon` steps beyond the\n training end (Stage 10).\n\nEvery public parameter is validated at entry, every exogenous feature\nmatrix is checked for completeness before it enters the model, and every\nfitted object is serialised in a form that survives process boundaries.\nThese invariants make the pipeline safe to embed in automated batch jobs\nwhere a silent failure would not be discovered until long after the\nprediction window has closed.\n\n## sf2-safe API surface used in this walkthrough\n\nThe first table below lists every `spotforecast2_safe` symbol referenced\n**directly** by the orchestrator or named in this page's prose. The\nsecond table extends that set with the **transitive** symbols that the\ndirect dependencies themselves import from `spotforecast2_safe` — these\nare the classes and helpers a reader will encounter as soon as they\nopen one of the level-1 modules. 
Third-party dependencies (pandas,\nnumpy, scikit-learn, lightgbm, astral, holidays, feature-engine,\njoblib, tqdm) are deliberately omitted from both tables.\n\n### Direct (orchestrator + prose)\n\n| Symbol | Kind | sf2-safe module | Stage |\n| --- | --- | --- | --- |\n| [`n2n_predict_with_covariates`](`spotforecast2_safe.processing.n2n_predict_with_covariates.n2n_predict_with_covariates`) | function | `spotforecast2_safe.processing.n2n_predict_with_covariates` | orchestrator |\n| [`fetch_data`](`spotforecast2_safe.data.fetch_data.fetch_data`) | function | `spotforecast2_safe.data.fetch_data` | Stage 1 |\n| [`get_package_data_home`](`spotforecast2_safe.data.fetch_data.get_package_data_home`) | function | `spotforecast2_safe.data.fetch_data` | Stage 1 |\n| [`get_cache_home`](`spotforecast2_safe.data.fetch_data.get_cache_home`) | function | `spotforecast2_safe.data.fetch_data` | Stage 9 |\n| `get_start_end` | function | `spotforecast2_safe.preprocessing.curate_data` | Stage 1 |\n| `basic_ts_checks` | function | `spotforecast2_safe.preprocessing.curate_data` | Stage 1 |\n| `agg_and_resample_data` | function | `spotforecast2_safe.preprocessing.curate_data` | Stage 1 |\n| `mark_outliers` | function | `spotforecast2_safe.preprocessing.outlier` | Stage 2 |\n| `get_missing_weights` | function | `spotforecast2_safe.preprocessing.imputation` | Stage 3 |\n| `WeightFunction` | class | `spotforecast2_safe.preprocessing` (re-export of `.imputation.WeightFunction`) | Stage 3 |\n| `get_calendar_features` | function | `spotforecast2_safe.calendar` | Stage 4 |\n| `get_day_night_features` | function | `spotforecast2_safe.calendar` | Stage 4 |\n| `get_holiday_features` | function | `spotforecast2_safe.calendar` | Stage 4 |\n| `get_weather_features` | function | `spotforecast2_safe.weather` | Stage 4 |\n| `apply_cyclical_encoding` | function | `spotforecast2_safe.manager.features` | Stage 5 |\n| `create_interaction_features` | function | `spotforecast2_safe.manager.features` | Stage 5 
|\n| `select_exogenous_features` | function | `spotforecast2_safe.manager.features` | Stage 6 |\n| `merge_data_and_covariates` | function | `spotforecast2_safe.manager.features` | Stage 7 |\n| `split_rel_train_val_test` | function | `spotforecast2_safe.splitter.split` | Stage 8 |\n| [`ForecasterRecursive`](`spotforecast2_safe.forecaster.recursive.ForecasterRecursive`) | class | `spotforecast2_safe.forecaster.recursive` | Stage 9 |\n| `ForecasterRecursive._create_train_X_y` | method | `spotforecast2_safe.forecaster.recursive.ForecasterRecursive` | Stage 3 (referenced) |\n| `RollingFeatures` | class | `spotforecast2_safe.preprocessing` (re-export of `.rolling.RollingFeatures`) | Stage 9 |\n| `load_forecasters` | function | `spotforecast2_safe.manager.persistence` | Stage 9 |\n| `save_forecasters` | function | `spotforecast2_safe.manager.persistence` | Stage 9 |\n| `predict_multivariate` | function | `spotforecast2_safe.forecaster.utils` | Stage 10 |\n\n### Transitive (imported by the direct dependencies)\n\nThese symbols are not invoked from the orchestrator or named in the\nprose above, but the direct dependencies pull them in from\n`spotforecast2_safe`. 
A reader who opens any of the level-1 modules\nwill land on these next, so they are part of the same internal\ncontract.\n\n| Symbol | Kind | sf2-safe module | Surfaces in |\n| --- | --- | --- | --- |\n| `QuantileBinner` | class | `spotforecast2_safe.preprocessing` (re-export of `._binner.QuantileBinner`) | `ForecasterRecursive` internals (residual-binning) |\n| `TimeSeriesDifferentiator` | class | `spotforecast2_safe.preprocessing` (re-export of `._differentiator.TimeSeriesDifferentiator`) | `ForecasterRecursive` internals (differentiation) |\n| `LinearlyInterpolateTS` | class | `spotforecast2_safe.preprocessing.linearly_interpolate_ts` | `get_missing_weights` imputation path |\n| `WeatherService` | class | `spotforecast2_safe.weather` | `fetch_data` weather-fetch helper |\n| `create_holiday_df` | function | `spotforecast2_safe.calendar` (re-export of `.holiday.create_holiday_df`) | `fetch_data.fetch_holiday_data` |\n| `curate_weather` | function | `spotforecast2_safe.preprocessing.curate_data` | `get_weather_features` post-fetch curation |\n| `convert_to_utc` | function | `spotforecast2_safe.utils.convert_to_utc` | `fetch_data` index normalisation |\n| `to_utc_timestamp` | function | `spotforecast2_safe.utils.convert_to_utc` | calendar + weather range-boundary normalisation |\n| `ForecasterBase` | class | `spotforecast2_safe.forecaster.base` | base class of `ForecasterRecursive` |\n| `check_y`, `check_exog`, `check_exog_dtypes`, `check_interval`, `check_predict_input`, `check_residuals_input`, `get_exog_dtypes`, `set_cpu_gpu_device` | functions | `spotforecast2_safe.preprocessing.checking` | fail-safe input validation in `ForecasterRecursive.fit` / `predict` and in `forecaster.utils` |\n| `expand_index`, `input_to_frame`, `transform_dataframe`, `date_to_index_position` | functions | `spotforecast2_safe.preprocessing.data_transform` | index / frame coercion in `forecaster.utils` and `ForecasterRecursive` |\n| `check_select_fit_kwargs`, `initialize_lags`, 
`initialize_weights` | functions | `spotforecast2_safe.preprocessing.forecaster_config` | constructor + fit-time configuration for `ForecasterRecursive` |\n| `check_extract_values_and_index`, `get_style_repr_html`, `initialize_estimator`, `initialize_window_features`, `transform_numpy` | functions | `spotforecast2_safe.forecaster.utils` | helpers consumed by `ForecasterRecursive` (already exported by the module table above) |\n| `DataTransformationWarning`, `NotFittedError`, `ResidualsUsageWarning`, `IgnoredArgumentWarning`, `InputTypeWarning`, `MissingValuesWarning`, `UnknownLevelWarning`, `set_skforecast_warnings` | exceptions / warnings | `spotforecast2_safe.exceptions` | raised across `ForecasterRecursive`, `forecaster.utils`, and the preprocessing layer |\n\n## sf2-safe internal dependencies\n\nThe walkthrough touches the following `spotforecast2_safe` subpackages.\nEach line collapses the symbols listed above into the module they live\nin, so the dependency footprint of this page is visible at a glance.\n\n- `spotforecast2_safe.processing.n2n_predict_with_covariates` — the\n orchestrator entry point exercised by the end-to-end cell.\n- `spotforecast2_safe.data.fetch_data` — `fetch_data`,\n `get_package_data_home`, `get_cache_home`.\n- `spotforecast2_safe.preprocessing.curate_data` — `get_start_end`,\n `basic_ts_checks`, `agg_and_resample_data`.\n- `spotforecast2_safe.preprocessing.outlier` — `mark_outliers`.\n- `spotforecast2_safe.preprocessing.imputation` — `get_missing_weights`,\n `WeightFunction` (re-exported one level up at\n `spotforecast2_safe.preprocessing.WeightFunction`).\n- `spotforecast2_safe.preprocessing.rolling` — `RollingFeatures`\n (re-exported one level up at\n `spotforecast2_safe.preprocessing.RollingFeatures`).\n- `spotforecast2_safe.calendar` — `get_calendar_features`,\n `get_day_night_features`, `get_holiday_features`.\n- `spotforecast2_safe.weather` — `get_weather_features`.\n- `spotforecast2_safe.manager.features` — 
`apply_cyclical_encoding`,\n `create_interaction_features`, `select_exogenous_features`,\n `merge_data_and_covariates`.\n- `spotforecast2_safe.manager.persistence` — `load_forecasters`,\n `save_forecasters` (and the orchestrator additionally calls\n `model_directory_exists`).\n- `spotforecast2_safe.splitter.split` — `split_rel_train_val_test`.\n- `spotforecast2_safe.forecaster.recursive` — `ForecasterRecursive`\n (and its internal `_create_train_X_y` referenced in Stage 3).\n- `spotforecast2_safe.forecaster.utils` — `predict_multivariate` (and\n the helpers `check_extract_values_and_index`, `get_style_repr_html`,\n `initialize_estimator`, `initialize_window_features`, `transform_numpy`\n consumed by `ForecasterRecursive`).\n\nThe following modules are not imported by the orchestrator directly,\nbut the level-1 dependencies above pull them in. Any change to a\npublic symbol in one of these modules can therefore propagate up into\nthis walkthrough's pipeline.\n\n- `spotforecast2_safe.forecaster.base` — `ForecasterBase` (base class\n of `ForecasterRecursive`).\n- `spotforecast2_safe.preprocessing._binner` — `QuantileBinner`\n (re-exported at `spotforecast2_safe.preprocessing.QuantileBinner`,\n used by `ForecasterRecursive` for residual binning).\n- `spotforecast2_safe.preprocessing._differentiator` —\n `TimeSeriesDifferentiator` (re-exported at\n `spotforecast2_safe.preprocessing.TimeSeriesDifferentiator`, used by\n `ForecasterRecursive` for differentiation).\n- `spotforecast2_safe.preprocessing.linearly_interpolate_ts` —\n `LinearlyInterpolateTS`, used by the imputation path of\n `get_missing_weights`.\n- `spotforecast2_safe.preprocessing.checking` — `check_y`, `check_exog`,\n `check_exog_dtypes`, `check_interval`, `check_predict_input`,\n `check_residuals_input`, `get_exog_dtypes`, `set_cpu_gpu_device`\n (fail-safe input validation throughout `ForecasterRecursive` and\n `forecaster.utils`).\n- `spotforecast2_safe.preprocessing.data_transform` — `expand_index`,\n 
`input_to_frame`, `transform_dataframe`, `date_to_index_position`\n (index and frame coercion).\n- `spotforecast2_safe.preprocessing.forecaster_config` —\n `check_select_fit_kwargs`, `initialize_lags`, `initialize_weights`\n (constructor and fit-time configuration).\n- `spotforecast2_safe.utils.convert_to_utc` — `convert_to_utc`,\n `to_utc_timestamp` (timezone normalisation used by `fetch_data`,\n `calendar.*`, and `weather.features`).\n- `spotforecast2_safe.exceptions` — `DataTransformationWarning`,\n `NotFittedError`, `ResidualsUsageWarning`, `IgnoredArgumentWarning`,\n `InputTypeWarning`, `MissingValuesWarning`, `UnknownLevelWarning`,\n `set_skforecast_warnings` (raised across the pipeline).\n\n", "supporting": [ "n2n_predict_with_covariates_explained_files/figure-html" ], diff --git a/_quarto.yml b/_quarto.yml index 014153254..6beb307af 100644 --- a/_quarto.yml +++ b/_quarto.yml @@ -387,6 +387,14 @@ website: file: docs/reference/calendar.holiday.create_holiday_adjacency_df.qmd - text: "get_holiday_adjacency_features" file: docs/reference/calendar.holiday.get_holiday_adjacency_features.qmd + - text: "create_day_type_df" + file: docs/reference/calendar.holiday.create_day_type_df.qmd + - text: "get_day_type_features" + file: docs/reference/calendar.holiday.get_day_type_features.qmd + - text: "create_school_holiday_df" + file: docs/reference/calendar.holiday.create_school_holiday_df.qmd + - text: "get_school_holiday_features" + file: docs/reference/calendar.holiday.get_school_holiday_features.qmd - text: "get_calendar_features" file: docs/reference/calendar.features.get_calendar_features.qmd - text: "get_day_night_features" @@ -747,6 +755,8 @@ quartodoc: - calendar.holiday.get_holiday_adjacency_features - calendar.holiday.create_day_type_df - calendar.holiday.get_day_type_features + - calendar.holiday.create_school_holiday_df + - calendar.holiday.get_school_holiday_features - calendar.features.get_calendar_features - calendar.features.get_day_night_features - 
calendar.features.get_ephemeris_features diff --git a/docs/reference/calendar.holiday.create_school_holiday_df.qmd b/docs/reference/calendar.holiday.create_school_holiday_df.qmd new file mode 100644 index 000000000..6a28e34c6 --- /dev/null +++ b/docs/reference/calendar.holiday.create_school_holiday_df.qmd @@ -0,0 +1,66 @@ +# calendar.holiday.create_school_holiday_df { #spotforecast2_safe.calendar.holiday.create_school_holiday_df } + +```python +calendar.holiday.create_school_holiday_df( + start, + end, + tz='UTC', + freq='h', + country_code='DE', + state='NW', +) +``` + +Create a DataFrame with a binary school-holiday indicator for a German state. + +Builds a tz-aware time grid over ``[start, end]`` at *freq* and marks +every timestamp that falls within a school-holiday period of the requested +Bundesland as ``1``; all others are ``0``. Both edges of each interval +are inclusive. + +Data source: OpenHolidays API (https://openholidaysapi.org), ODbL-1.0. +Coverage: 2022-01-01 to 2027-12-31 for all 16 German Bundesländer. + +Only ``country_code="DE"`` is supported. Requests whose span extends +beyond the covered range at either edge raise ``ValueError`` — there is +no fill or extrapolation. + +## Parameters {.doc-section .doc-section-parameters} + +| Name | Type | Description | Default | +|--------------|----------------------------------------------------------------|------------------------------------------------------------------------------------------------------------------------------------------|------------| +| start | [str](`str`) \| [pd](`pandas`).[Timestamp](`pandas.Timestamp`) | Start date/datetime of the requested grid. | _required_ | +| end | [str](`str`) \| [pd](`pandas`).[Timestamp](`pandas.Timestamp`) | End date/datetime of the requested grid (inclusive). | _required_ | +| tz | [str](`str`) | Timezone for the resulting index. Ignored when *start* or *end* is already a tz-aware ``pd.Timestamp``. 
| `'UTC'` | +| freq | [str](`str`) | Pandas-compatible frequency string. Defaults to ``"h"`` (hourly). | `'h'` | +| country_code | [str](`str`) | Must be ``"DE"`` (Germany). Any other value raises ``ValueError``. | `'DE'` | +| state | [str](`str`) | ISO 3166-2 subdivision short code for the Bundesland, e.g. ``"NW"`` (North Rhine-Westphalia), ``"BY"`` (Bavaria). Defaults to ``"NW"``. | `'NW'` | + +## Returns {.doc-section .doc-section-returns} + +| Name | Type | Description | +|--------|------------------------------------------------|----------------------------------------------------------------------| +| | [pd](`pandas`).[DataFrame](`pandas.DataFrame`) | pd.DataFrame: Single integer column ``is_school_holiday`` (values in | +| | [pd](`pandas`).[DataFrame](`pandas.DataFrame`) | ``{0, 1}``; no NaNs) with a tz-aware `DatetimeIndex` at *freq*. | + +## Raises {.doc-section .doc-section-raises} + +| Name | Type | Description | +|--------|----------------------------|-----------------------------------------------------------------------------------------------------------------------| +| | [ValueError](`ValueError`) | If *country_code* is not ``"DE"``, or if the requested span extends beyond the dataset validity range at either edge. | + +## Examples {.doc-section .doc-section-examples} + +```{python} +from spotforecast2_safe.calendar import create_school_holiday_df + +# NW Sommerferien 2024: 2024-07-08 → 2024-08-20 (inclusive). +# Day before (2024-07-07) must be 0; first day (2024-07-08) must be 1. 
+df = create_school_holiday_df( + "2024-07-06", "2024-07-10", freq="D", state="NW" +) +print(df) +assert df.loc["2024-07-07", "is_school_holiday"] == 0 +assert df.loc["2024-07-08", "is_school_holiday"] == 1 +assert df.loc["2024-07-09", "is_school_holiday"] == 1 +``` \ No newline at end of file diff --git a/docs/reference/calendar.holiday.get_school_holiday_features.qmd b/docs/reference/calendar.holiday.get_school_holiday_features.qmd new file mode 100644 index 000000000..0d7484b78 --- /dev/null +++ b/docs/reference/calendar.holiday.get_school_holiday_features.qmd @@ -0,0 +1,88 @@ +# calendar.holiday.get_school_holiday_features { #spotforecast2_safe.calendar.holiday.get_school_holiday_features } + +```python +calendar.holiday.get_school_holiday_features( + data, + start, + cov_end, + forecast_horizon, + tz='UTC', + freq='h', + country_code='DE', + state='NW', +) +``` + +Build per-Bundesland school-holiday indicators and align them to a forecast grid. + +Generates the ``is_school_holiday`` binary indicator via +`create_school_holiday_df()`, validates temporal coverage with +`curate_holidays()`, and reindexes onto the full ``[start, cov_end]`` +grid with ``fill_value=0``. + +The requested span ``[start, cov_end]`` must lie entirely within the +dataset validity range 2022-01-01 to 2027-12-31. If either edge falls +outside this range a ``ValueError`` is raised immediately — there is no +fill or extrapolation. + +Only ``country_code="DE"`` is supported; passing any other value raises +``ValueError``. 
+ +## Parameters {.doc-section .doc-section-parameters} + +| Name | Type | Description | Default | +|------------------|-----------------------------------------------------------------------------------------|---------------------------------------------------------------------------------------------------------|------------| +| data | [pd](`pandas`).[DataFrame](`pandas.DataFrame`) | Reference time series DataFrame used for temporal coverage validation inside `curate_holidays()`. | _required_ | +| start | [Union](`typing.Union`)\[[str](`str`), [pd](`pandas`).[Timestamp](`pandas.Timestamp`)\] | Start timestamp. String values are parsed with ``utc=True``. | _required_ | +| cov_end | [Union](`typing.Union`)\[[str](`str`), [pd](`pandas`).[Timestamp](`pandas.Timestamp`)\] | Inclusive end timestamp covering the full forecast horizon. String values are parsed with ``utc=True``. | _required_ | +| forecast_horizon | [int](`int`) | Number of forecast steps ahead; passed to `curate_holidays()`. | _required_ | +| tz | [str](`str`) | Timezone applied to the generated index. Defaults to ``"UTC"``. | `'UTC'` | +| freq | [str](`str`) | Pandas-compatible frequency string. Defaults to ``"h"``. | `'h'` | +| country_code | [str](`str`) | Must be ``"DE"``. Any other value raises ``ValueError``. | `'DE'` | +| state | [str](`str`) | ISO 3166-2 subdivision short code for the Bundesland. Defaults to ``"NW"`` (North Rhine-Westphalia). | `'NW'` | + +## Returns {.doc-section .doc-section-returns} + +| Name | Type | Description | +|--------|------------------------------------------------|-----------------------------------------------------------------------| +| | [pd](`pandas`).[DataFrame](`pandas.DataFrame`) | pd.DataFrame: Single integer column ``is_school_holiday`` (values in | +| | [pd](`pandas`).[DataFrame](`pandas.DataFrame`) | ``{0, 1}``; no NaNs). 
The index is a tz-aware `DatetimeIndex` with | +| | [pd](`pandas`).[DataFrame](`pandas.DataFrame`) | the requested *freq* and shape ``(len(data) + forecast_horizon, 1)``. | + +## Raises {.doc-section .doc-section-raises} + +| Name | Type | Description | +|--------|----------------------------|-------------------------------------------------------------------------------------------------------------------------------------| +| | [ValueError](`ValueError`) | If *country_code* is not ``"DE"``, or if the requested span extends beyond the dataset validity range ``[2022-01-01, 2027-12-31]``. | + +## Examples {.doc-section .doc-section-examples} + +```{python} +import pandas as pd +from spotforecast2_safe.calendar import get_school_holiday_features + +forecast_horizon = 24 +n_data = 48 +data = pd.DataFrame( + {"load": range(n_data)}, + index=pd.date_range("2024-07-06", periods=n_data, freq="h", tz="UTC"), +) +start = data.index[0] +cov_end = start + pd.Timedelta(hours=(n_data + forecast_horizon - 1)) + +feats = get_school_holiday_features( + data=data, + start=start, + cov_end=cov_end, + forecast_horizon=forecast_horizon, + state="NW", +) +print("shape:", feats.shape) +print("columns:", feats.columns.tolist()) +# NW Sommerferien 2024: 2024-07-08 is a school holiday (is_school_holiday=1). 
+print("2024-07-07 00:00 UTC:", feats.loc["2024-07-07 00:00:00+00:00", "is_school_holiday"]) +print("2024-07-08 00:00 UTC:", feats.loc["2024-07-08 00:00:00+00:00", "is_school_holiday"]) +assert feats.shape == (n_data + forecast_horizon, 1) +assert feats.loc["2024-07-07 00:00:00+00:00", "is_school_holiday"] == 0 +assert feats.loc["2024-07-08 00:00:00+00:00", "is_school_holiday"] == 1 +``` \ No newline at end of file diff --git a/docs/reference/configurator.config_entsoe.ConfigEntsoe.qmd b/docs/reference/configurator.config_entsoe.ConfigEntsoe.qmd index 374d0b87a..81ad9aba1 100644 --- a/docs/reference/configurator.config_entsoe.ConfigEntsoe.qmd +++ b/docs/reference/configurator.config_entsoe.ConfigEntsoe.qmd @@ -35,6 +35,7 @@ configurator.config_entsoe.ConfigEntsoe( degree_hours_base_cooling=22.0, include_ephemeris_features=False, include_day_type_features=False, + include_school_holiday_features=False, poly_features_degree=1, max_poly_features=10, poly_mi_n_jobs=-1, diff --git a/docs/reference/configurator.config_multi.ConfigMulti.qmd b/docs/reference/configurator.config_multi.ConfigMulti.qmd index a7e3c300d..91319d10e 100644 --- a/docs/reference/configurator.config_multi.ConfigMulti.qmd +++ b/docs/reference/configurator.config_multi.ConfigMulti.qmd @@ -35,6 +35,7 @@ configurator.config_multi.ConfigMulti( degree_hours_base_cooling=22.0, include_ephemeris_features=False, include_day_type_features=False, + include_school_holiday_features=False, poly_features_degree=1, max_poly_features=10, poly_mi_n_jobs=-1, @@ -117,6 +118,9 @@ API queries and holiday feature generation. | include_weather_windows | [bool](`bool`) | If True, include rolling weather-window features. | `False` | | include_holiday_features | [bool](`bool`) | If True, include public-holiday indicator features. | `False` | | include_holiday_adjacency_features | [bool](`bool`) | If True, include Brückentag and before/after-holiday indicators (``is_brueckentag``, ``is_before_holiday``, ``is_after_holiday``). 
Defaults to ``False``. | `False` | +| include_ephemeris_features | [bool](`bool`) | If True, include solar-elevation and daylight-duration features. Defaults to ``False``. | `False` | +| include_day_type_features | [bool](`bool`) | If True, include working-day and day-type class features (``is_workday``, ``day_type``). Defaults to ``False``. | `False` | +| include_school_holiday_features | [bool](`bool`) | Append the ``is_school_holiday`` binary indicator from the bundled OpenHolidays API dataset (ODbL-1.0). Coverage 2022-01-01 to 2027-12-31 for all 16 German Bundesländer. Only ``country_code="DE"`` is supported. Defaults to ``False``. | `False` | | poly_features_degree | [int](`int`) | Polynomial-interaction degree. ``1`` (default) generates no interactions; ``2`` adds pairwise bilinear terms; ``3+`` higher order. | `1` | | max_poly_features | [int](`int`) | Cap on polynomial interaction columns; only the top ``max_poly_features`` ranked by mutual information with the target are kept (``<= 0`` disables). Defaults to ``10``. | `10` | | poly_mi_n_jobs | [Optional](`typing.Optional`)\[[int](`int`)\] | Parallel jobs for the mutual-information ranking that enforces ``max_poly_features``. ``-1`` (default) uses all cores; ``None`` runs single-threaded. Parallelism does not change the selection. | `-1` | @@ -166,6 +170,9 @@ API queries and holiday feature generation. | include_weather_windows | [bool](`bool`) | Weather-window feature toggle. | | include_holiday_features | [bool](`bool`) | Holiday feature toggle. | | include_holiday_adjacency_features | [bool](`bool`) | Brückentag and before/after-holiday indicator toggle. Defaults to ``False``. | +| include_ephemeris_features | [bool](`bool`) | Solar-elevation and daylight-duration feature toggle. Defaults to ``False``. | +| include_day_type_features | [bool](`bool`) | Working-day / day-type class feature toggle. Defaults to ``False``. 
| +| include_school_holiday_features | [bool](`bool`) | Per-Bundesland school-holiday indicator toggle. Defaults to ``False``. | | poly_features_degree | [int](`int`) | Polynomial-interaction degree (1 = off). | | max_poly_features | [int](`int`) | Cap on kept ``poly_*`` columns (top-K by MI). | | poly_mi_n_jobs | [Optional](`typing.Optional`)\[[int](`int`)\] | Parallel jobs for the MI ranking (``-1`` = all cores; selection-invariant). | diff --git a/docs/reference/index.qmd b/docs/reference/index.qmd index e7bd97652..3d9d74ca3 100644 --- a/docs/reference/index.qmd +++ b/docs/reference/index.qmd @@ -237,6 +237,8 @@ construction. | [calendar.holiday.get_holiday_adjacency_features](calendar.holiday.get_holiday_adjacency_features.qmd#spotforecast2_safe.calendar.holiday.get_holiday_adjacency_features) | Build holiday-adjacency indicators and align them to a regular time grid. | | [calendar.holiday.create_day_type_df](calendar.holiday.create_day_type_df.qmd#spotforecast2_safe.calendar.holiday.create_day_type_df) | Create a day-type refinement of the public-holiday column. | | [calendar.holiday.get_day_type_features](calendar.holiday.get_day_type_features.qmd#spotforecast2_safe.calendar.holiday.get_day_type_features) | Build day-type indicators and align them to a regular time grid. | +| [calendar.holiday.create_school_holiday_df](calendar.holiday.create_school_holiday_df.qmd#spotforecast2_safe.calendar.holiday.create_school_holiday_df) | Create a DataFrame with a binary school-holiday indicator for a German state. | +| [calendar.holiday.get_school_holiday_features](calendar.holiday.get_school_holiday_features.qmd#spotforecast2_safe.calendar.holiday.get_school_holiday_features) | Build per-Bundesland school-holiday indicators and align them to a forecast grid. | | [calendar.features.get_calendar_features](calendar.features.get_calendar_features.qmd#spotforecast2_safe.calendar.features.get_calendar_features) | Create calendar-based features for a contiguous time range. 
| | [calendar.features.get_day_night_features](calendar.features.get_day_night_features.qmd#spotforecast2_safe.calendar.features.get_day_night_features) | Create day/night features using astronomical sunrise and sunset times. | | [calendar.features.get_ephemeris_features](calendar.features.get_ephemeris_features.qmd#spotforecast2_safe.calendar.features.get_ephemeris_features) | Create continuous solar-geometry features from the ephemeris. | diff --git a/docs/reference/manager.features.select_exogenous_features.qmd b/docs/reference/manager.features.select_exogenous_features.qmd index be4f99cb1..f3f5feef3 100644 --- a/docs/reference/manager.features.select_exogenous_features.qmd +++ b/docs/reference/manager.features.select_exogenous_features.qmd @@ -8,6 +8,7 @@ manager.features.select_exogenous_features( include_weather_windows=False, include_holiday_features=False, include_holiday_adjacency_features=False, + include_school_holiday_features=False, poly_features_degree=1, ) ``` @@ -25,7 +26,9 @@ forecaster. The selection order is: with ``"holiday"`` (optional, ``include_holiday_features``). 5. Holiday-adjacency columns: ``is_brueckentag``, ``is_before_holiday``, ``is_after_holiday`` (optional, ``include_holiday_adjacency_features``). -6. Polynomial interaction columns starting with ``"poly_"`` (included +6. School-holiday column: ``is_school_holiday`` (optional, + ``include_school_holiday_features``). +7. Polynomial interaction columns starting with ``"poly_"`` (included when ``poly_features_degree >= 2``). Duplicates are removed while preserving insertion order. @@ -40,6 +43,7 @@ Duplicates are removed while preserving insertion order. | include_weather_windows | [bool](`bool`) | If ``True``, include rolling-window weather columns (those containing ``"_window_"`` plus ``"_mean"``, ``"_min"``, or ``"_max"``). Defaults to ``False``. 
| `False` | | include_holiday_features | [bool](`bool`) | If ``True``, include the ``is_holiday`` column and any column whose name starts with ``"holiday"``. Defaults to ``False``. | `False` | | include_holiday_adjacency_features | [bool](`bool`) | If ``True``, include the three adjacency columns ``is_brueckentag``, ``is_before_holiday``, and ``is_after_holiday`` when present in *exogenous_features*. Defaults to ``False``. | `False` | +| include_school_holiday_features | [bool](`bool`) | If ``True``, include the ``is_school_holiday`` column when present in *exogenous_features*. Defaults to ``False``. | `False` | | poly_features_degree | [int](`int`) | Polynomial-interaction degree. Interaction columns (names starting with ``"poly_"``) are included only when this is ``>= 2``; at ``1`` no interactions exist. Defaults to ``1``. | `1` | ## Returns {.doc-section .doc-section-returns} diff --git a/src/spotforecast2_safe/calendar/__init__.py b/src/spotforecast2_safe/calendar/__init__.py index 0e86b9478..2f2e1bff0 100644 --- a/src/spotforecast2_safe/calendar/__init__.py +++ b/src/spotforecast2_safe/calendar/__init__.py @@ -19,19 +19,23 @@ create_day_type_df, create_holiday_adjacency_df, create_holiday_df, + create_school_holiday_df, get_day_type_features, get_holiday_adjacency_features, get_holiday_features, + get_school_holiday_features, ) __all__ = [ "create_day_type_df", "create_holiday_adjacency_df", "create_holiday_df", + "create_school_holiday_df", "get_calendar_features", "get_day_night_features", "get_day_type_features", "get_ephemeris_features", "get_holiday_adjacency_features", "get_holiday_features", + "get_school_holiday_features", ] diff --git a/src/spotforecast2_safe/calendar/holiday.py b/src/spotforecast2_safe/calendar/holiday.py index 12e214991..69834c677 100644 --- a/src/spotforecast2_safe/calendar/holiday.py +++ b/src/spotforecast2_safe/calendar/holiday.py @@ -20,6 +20,11 @@ of the holiday column: a binary working-day indicator and an integer day-type 
class (working day / Saturday / Sunday / public holiday), derived purely from the weekday and the public-holiday calendar. +- `create_school_holiday_df()` / `get_school_holiday_features()` — per-Bundesland + school-holiday binary indicator (``is_school_holiday``) built from the bundled + OpenHolidays API dataset (ODbL-1.0), coverage 2022-01-01 to 2027-12-31. + Only ``country_code="DE"`` is supported; 16 German Bundesländer are available. + Requests outside the covered date range raise ``ValueError`` (fail-safe). """ from typing import Union @@ -607,3 +612,248 @@ def get_day_type_features( extended_index = pd.date_range(start=start, end=cov_end, freq=freq, tz=tz) return day_type_df.reindex(extended_index, fill_value=0).astype(int) + + +# --------------------------------------------------------------------------- +# School-holiday features (per-Bundesland, Germany only) +# --------------------------------------------------------------------------- + +_SCHOOL_HOLIDAY_COUNTRY_SUPPORTED = ("DE",) + + +def create_school_holiday_df( + start: str | pd.Timestamp, + end: str | pd.Timestamp, + tz: str = "UTC", + freq: str = "h", + country_code: str = "DE", + state: str = "NW", +) -> pd.DataFrame: + """Create a DataFrame with a binary school-holiday indicator for a German state. + + Builds a tz-aware time grid over ``[start, end]`` at *freq* and marks + every timestamp that falls within a school-holiday period of the requested + Bundesland as ``1``; all others are ``0``. Both edges of each interval + are inclusive. + + Data source: OpenHolidays API (https://openholidaysapi.org), ODbL-1.0. + Coverage: 2022-01-01 to 2027-12-31 for all 16 German Bundesländer. + + Only ``country_code="DE"`` is supported. Requests whose span extends + beyond the covered range at either edge raise ``ValueError`` — there is + no fill or extrapolation. + + Args: + start: Start date/datetime of the requested grid. + end: End date/datetime of the requested grid (inclusive). 
+ tz: Timezone for the resulting index. Ignored when *start* or *end* + is already a tz-aware ``pd.Timestamp``. + freq: Pandas-compatible frequency string. Defaults to ``"h"`` + (hourly). + country_code: Must be ``"DE"`` (Germany). Any other value raises + ``ValueError``. + state: ISO 3166-2 subdivision short code for the Bundesland, e.g. + ``"NW"`` (North Rhine-Westphalia), ``"BY"`` (Bavaria). Defaults + to ``"NW"``. + + Returns: + pd.DataFrame: Single integer column ``is_school_holiday`` (values in + ``{0, 1}``; no NaNs) with a tz-aware `DatetimeIndex` at *freq*. + + Raises: + ValueError: If *country_code* is not ``"DE"``, or if the requested + span extends beyond the dataset validity range at either edge. + + Examples: + ```{python} + from spotforecast2_safe.calendar import create_school_holiday_df + + # NW Sommerferien 2024: 2024-07-08 → 2024-08-20 (inclusive). + # Day before (2024-07-07) must be 0; first day (2024-07-08) must be 1. + df = create_school_holiday_df( + "2024-07-06", "2024-07-10", freq="D", state="NW" + ) + print(df) + assert df.loc["2024-07-07", "is_school_holiday"] == 0 + assert df.loc["2024-07-08", "is_school_holiday"] == 1 + assert df.loc["2024-07-09", "is_school_holiday"] == 1 + ``` + """ + if country_code not in _SCHOOL_HOLIDAY_COUNTRY_SUPPORTED: + raise ValueError( + f"country_code={country_code!r} is not supported for school-holiday " + f"features. Only {_SCHOOL_HOLIDAY_COUNTRY_SUPPORTED} is available." + ) + + from spotforecast2_safe.data.fetch_data import load_school_holidays_de + + intervals_df, valid_from, valid_to = load_school_holidays_de() + + # Normalise start/end to timezone-naive dates for range check. 
+ start_ts = pd.Timestamp(start) + end_ts = pd.Timestamp(end) + start_date_only = ( + start_ts.normalize().tz_localize(None) + if start_ts.tz is None + else start_ts.tz_convert(None).normalize() + ) + end_date_only = ( + end_ts.normalize().tz_localize(None) + if end_ts.tz is None + else end_ts.tz_convert(None).normalize() + ) + + if start_date_only < valid_from or end_date_only > valid_to: + raise ValueError( + f"Requested span [{start_date_only.date()}, {end_date_only.date()}] " + f"extends beyond the school-holiday dataset validity range " + f"[{valid_from.date()}, {valid_to.date()}]. " + "There is no fill or extrapolation for out-of-coverage dates." + ) + + # Build the time grid. + # When either endpoint is already tz-aware, normalise both to a consistent + # tz-aware form so pd.date_range gets two compatible endpoints. + inferred_tz = None + if isinstance(start, pd.Timestamp) and start.tz is not None: + inferred_tz = str(start.tz) + elif isinstance(end, pd.Timestamp) and end.tz is not None: + inferred_tz = str(end.tz) + + if inferred_tz is not None: + start_grid = ( + pd.Timestamp(start).tz_localize(inferred_tz) + if pd.Timestamp(start).tz is None + else pd.Timestamp(start) + ) + end_grid = ( + pd.Timestamp(end).tz_localize(inferred_tz) + if pd.Timestamp(end).tz is None + else pd.Timestamp(end) + ) + full_index = pd.date_range(start=start_grid, end=end_grid, freq=freq) + else: + full_index = pd.date_range(start=start, end=end, freq=freq, tz=tz) + + # Filter intervals to the requested state. + state_intervals = intervals_df[intervals_df["state"] == state].copy() + + # Map each normalised calendar day to 0/1. 
+ unique_days = pd.DatetimeIndex(full_index.normalize().unique()) + + day_flags: dict = {} + for d in unique_days: + d_naive = d.tz_localize(None) if d.tz is not None else d + d_date = d_naive.normalize() + in_holiday = ( + (state_intervals["start_date"] <= d_date) + & (d_date <= state_intervals["end_date"]) + ).any() + day_flags[d] = int(in_holiday) + + flag_series = pd.Series(day_flags) + df_full = pd.DataFrame(index=full_index) + df_full["is_school_holiday"] = ( + full_index.normalize().map(flag_series).fillna(0).astype(int) + ) + return df_full + + +def get_school_holiday_features( + data: pd.DataFrame, + start: Union[str, pd.Timestamp], + cov_end: Union[str, pd.Timestamp], + forecast_horizon: int, + tz: str = "UTC", + freq: str = "h", + country_code: str = "DE", + state: str = "NW", +) -> pd.DataFrame: + """Build per-Bundesland school-holiday indicators and align them to a forecast grid. + + Generates the ``is_school_holiday`` binary indicator via + `create_school_holiday_df()`, validates temporal coverage with + `curate_holidays()`, and reindexes onto the full ``[start, cov_end]`` + grid with ``fill_value=0``. + + The requested span ``[start, cov_end]`` must lie entirely within the + dataset validity range 2022-01-01 to 2027-12-31. If either edge falls + outside this range a ``ValueError`` is raised immediately — there is no + fill or extrapolation. + + Only ``country_code="DE"`` is supported; passing any other value raises + ``ValueError``. + + Args: + data: Reference time series DataFrame used for temporal coverage + validation inside `curate_holidays()`. + start: Start timestamp. String values are parsed with ``utc=True``. + cov_end: Inclusive end timestamp covering the full forecast horizon. + String values are parsed with ``utc=True``. + forecast_horizon: Number of forecast steps ahead; passed to + `curate_holidays()`. + tz: Timezone applied to the generated index. Defaults to ``"UTC"``. + freq: Pandas-compatible frequency string. Defaults to ``"h"``. 
+ country_code: Must be ``"DE"``. Any other value raises ``ValueError``. + state: ISO 3166-2 subdivision short code for the Bundesland. + Defaults to ``"NW"`` (North Rhine-Westphalia). + + Returns: + pd.DataFrame: Single integer column ``is_school_holiday`` (values in + ``{0, 1}``; no NaNs). The index is a tz-aware `DatetimeIndex` with + the requested *freq* and shape ``(len(data) + forecast_horizon, 1)``. + + Raises: + ValueError: If *country_code* is not ``"DE"``, or if the requested + span extends beyond the dataset validity range + ``[2022-01-01, 2027-12-31]``. + + Examples: + ```{python} + import pandas as pd + from spotforecast2_safe.calendar import get_school_holiday_features + + forecast_horizon = 24 + n_data = 48 + data = pd.DataFrame( + {"load": range(n_data)}, + index=pd.date_range("2024-07-06", periods=n_data, freq="h", tz="UTC"), + ) + start = data.index[0] + cov_end = start + pd.Timedelta(hours=(n_data + forecast_horizon - 1)) + + feats = get_school_holiday_features( + data=data, + start=start, + cov_end=cov_end, + forecast_horizon=forecast_horizon, + state="NW", + ) + print("shape:", feats.shape) + print("columns:", feats.columns.tolist()) + # NW Sommerferien 2024: 2024-07-08 is a school holiday (is_school_holiday=1). 
+ print("2024-07-07 00:00 UTC:", feats.loc["2024-07-07 00:00:00+00:00", "is_school_holiday"]) + print("2024-07-08 00:00 UTC:", feats.loc["2024-07-08 00:00:00+00:00", "is_school_holiday"]) + assert feats.shape == (n_data + forecast_horizon, 1) + assert feats.loc["2024-07-07 00:00:00+00:00", "is_school_holiday"] == 0 + assert feats.loc["2024-07-08 00:00:00+00:00", "is_school_holiday"] == 1 + ``` + """ + from spotforecast2_safe.preprocessing.curate_data import curate_holidays + + start = to_utc_timestamp(start) + cov_end = to_utc_timestamp(cov_end) + + school_holiday_df = create_school_holiday_df( + start=start, + end=cov_end, + tz=tz, + freq=freq, + country_code=country_code, + state=state, + ) + + curate_holidays(school_holiday_df, data, forecast_horizon=forecast_horizon) + + extended_index = pd.date_range(start=start, end=cov_end, freq=freq, tz=tz) + return school_holiday_df.reindex(extended_index, fill_value=0).astype(int) diff --git a/src/spotforecast2_safe/configurator/config_multi.py b/src/spotforecast2_safe/configurator/config_multi.py index 3674cfa55..7ec038f04 100644 --- a/src/spotforecast2_safe/configurator/config_multi.py +++ b/src/spotforecast2_safe/configurator/config_multi.py @@ -20,7 +20,18 @@ # around-daily (23-25 h), two-day (47-48 h), around-weekly (167-169 h), and # two-week (336 h) structure of hourly load series. DEFAULT_WARM_START_LAGS: List[int] = [ - 1, 2, 3, 23, 24, 25, 47, 48, 167, 168, 169, 336, + 1, + 2, + 3, + 23, + 24, + 25, + 47, + 48, + 167, + 168, + 169, + 336, ] @@ -74,6 +85,14 @@ class ConfigMulti: include_holiday_adjacency_features (bool): If True, include Brückentag and before/after-holiday indicators (``is_brueckentag``, ``is_before_holiday``, ``is_after_holiday``). Defaults to ``False``. + include_ephemeris_features (bool): If True, include solar-elevation and + daylight-duration features. Defaults to ``False``. 
+ include_day_type_features (bool): If True, include working-day and day-type + class features (``is_workday``, ``day_type``). Defaults to ``False``. + include_school_holiday_features (bool): Append the ``is_school_holiday`` + binary indicator from the bundled OpenHolidays API dataset (ODbL-1.0). + Coverage 2022-01-01 to 2027-12-31 for all 16 German Bundesländer. + Only ``country_code="DE"`` is supported. Defaults to ``False``. poly_features_degree (int): Polynomial-interaction degree. ``1`` (default) generates no interactions; ``2`` adds pairwise bilinear terms; ``3+`` higher order. @@ -188,6 +207,12 @@ class ConfigMulti: include_holiday_features (bool): Holiday feature toggle. include_holiday_adjacency_features (bool): Brückentag and before/after-holiday indicator toggle. Defaults to ``False``. + include_ephemeris_features (bool): Solar-elevation and daylight-duration + feature toggle. Defaults to ``False``. + include_day_type_features (bool): Working-day / day-type class feature + toggle. Defaults to ``False``. + include_school_holiday_features (bool): Per-Bundesland school-holiday + indicator toggle. Defaults to ``False``. poly_features_degree (int): Polynomial-interaction degree (1 = off). max_poly_features (int): Cap on kept ``poly_*`` columns (top-K by MI). poly_mi_n_jobs (Optional[int]): Parallel jobs for the MI ranking @@ -342,6 +367,7 @@ class ConfigMulti: # is_workday + day_type (workday/Saturday/Sunday/holiday class). include_ephemeris_features: bool = False include_day_type_features: bool = False + include_school_holiday_features: bool = False poly_features_degree: int = 1 max_poly_features: int = 10 poly_mi_n_jobs: Optional[int] = -1 diff --git a/src/spotforecast2_safe/data/fetch_data.py b/src/spotforecast2_safe/data/fetch_data.py index 656eb62e9..0c82991ee 100644 --- a/src/spotforecast2_safe/data/fetch_data.py +++ b/src/spotforecast2_safe/data/fetch_data.py @@ -805,3 +805,77 @@ def load_day_ahead_price( f"Available columns: {list(df.columns)}." 
) return _apply_on_missing(df[column], on_missing, column, csv_path) + + +def load_school_holidays_de() -> tuple[pd.DataFrame, pd.Timestamp, pd.Timestamp]: + """Load the bundled German school-holiday interval table. + + Reads ``datasets/csv/school_holidays_de.csv`` (ODbL-1.0) from the package + data directory and the companion validity-range metadata from + ``school_holidays_de_meta.csv``. No download is needed; both files ship + with the package. + + The CSV has four columns: ``state`` (ISO 3166-2 subdivision short code, + e.g. ``"NW"``), ``name`` (German name of the holiday period), ``start_date`` + and ``end_date`` (both inclusive, parsed as datetime64 (resolution depends + on the pandas version)). Coverage is 2022-01-01 to 2027-12-31 (all 16 + German Bundesländer). + + Data provenance: OpenHolidays API (https://openholidaysapi.org), database + https://github.com/openpotato/openholidaysapi.data, ODC Open Database + License (ODbL-1.0). + + Regeneration command (requires network access): + + ```text + for code in BW BY BE BB HB HH HE MV NI NW RP SL SN ST SH TH: + GET https://openholidaysapi.org/SchoolHolidays?countryIsoCode=DE + &subdivisionCode=DE-{code}&validFrom=2022-01-01&validTo=2024-12-31 + &languageIsoCode=DE (split into two 3-year windows to respect the + 1095-day API limit; second window: validFrom=2025-01-01, + validTo=2027-12-31). Keep every record whose startDate falls within + [valid_from, valid_to]; endDate may extend beyond valid_to and is + kept verbatim (queries past valid_to raise). + ``` + + Returns: + tuple: A three-tuple ``(df, valid_from, valid_to)`` where: + + - **df** — DataFrame with columns ``state``, ``name``, + ``start_date`` (datetime64, resolution depends on the pandas version), + ``end_date`` (datetime64, resolution depends on the pandas version), + sorted by ``(state, start_date)``. + - **valid_from** — `pd.Timestamp` for the first covered day + (``2022-01-01``). 
+ - **valid_to** — `pd.Timestamp` for the last covered day + (``2027-12-31``). + + Examples: + ```{python} + from spotforecast2_safe.data.fetch_data import load_school_holidays_de + + df, valid_from, valid_to = load_school_holidays_de() + print("states:", sorted(df["state"].unique())) + print("valid_from:", valid_from.date()) + print("valid_to:", valid_to.date()) + print("shape:", df.shape) + assert len(df["state"].unique()) == 16 + assert str(valid_from.date()) == "2022-01-01" + assert str(valid_to.date()) == "2027-12-31" + ``` + """ + pkg_dir = get_package_data_home() + csv_path = pkg_dir / "school_holidays_de.csv" + meta_path = pkg_dir / "school_holidays_de_meta.csv" + + df = pd.read_csv( + csv_path, + parse_dates=["start_date", "end_date"], + ) + df = df.sort_values(["state", "start_date"]).reset_index(drop=True) + + meta = pd.read_csv(meta_path, parse_dates=["valid_from", "valid_to"]) + valid_from = pd.Timestamp(meta.at[0, "valid_from"]) + valid_to = pd.Timestamp(meta.at[0, "valid_to"]) + + return df, valid_from, valid_to diff --git a/src/spotforecast2_safe/datasets/csv/school_holidays_de.csv b/src/spotforecast2_safe/datasets/csv/school_holidays_de.csv new file mode 100644 index 000000000..a8e9a50a7 --- /dev/null +++ b/src/spotforecast2_safe/datasets/csv/school_holidays_de.csv @@ -0,0 +1,627 @@ +state,name,start_date,end_date +BB,Winterferien,2022-01-31,2022-02-05 +BB,Osterferien,2022-04-11,2022-04-24 +BB,Variabler Ferientag,2022-05-27,2022-05-27 +BB,Sommerferien,2022-07-07,2022-08-20 +BB,Herbstferien,2022-10-24,2022-11-05 +BB,Weihnachtsferien,2022-12-22,2023-01-03 +BB,Winterferien,2023-01-30,2023-02-03 +BB,Osterferien,2023-04-03,2023-04-14 +BB,Variabler Ferientag,2023-05-19,2023-05-19 +BB,Sommerferien,2023-07-13,2023-08-26 +BB,Variabler Ferientag,2023-10-02,2023-10-02 +BB,Herbstferien,2023-10-23,2023-11-04 +BB,Weihnachtsferien,2023-12-23,2024-01-05 +BB,Winterferien,2024-02-05,2024-02-09 +BB,Osterferien,2024-03-25,2024-04-05 +BB,Variabler
Ferientag,2024-05-10,2024-05-10 +BB,Sommerferien,2024-07-18,2024-08-31 +BB,Variabler Ferientag,2024-10-04,2024-10-04 +BB,Herbstferien,2024-10-21,2024-11-02 +BB,Weihnachtsferien,2024-12-23,2024-12-31 +BB,Winterferien,2025-02-03,2025-02-08 +BB,Osterferien,2025-04-14,2025-04-25 +BB,Variabler Ferientag,2025-05-02,2025-05-02 +BB,Variabler Ferientag,2025-05-30,2025-05-30 +BB,Pfingstferien,2025-06-10,2025-06-10 +BB,Sommerferien,2025-07-24,2025-09-06 +BB,Herbstferien,2025-10-20,2025-11-01 +BB,Weihnachtsferien,2025-12-22,2026-01-02 +BB,Winterferien,2026-02-02,2026-02-07 +BB,Osterferien,2026-03-30,2026-04-10 +BB,Variabler Ferientag,2026-05-15,2026-05-15 +BB,Pfingstferien,2026-05-26,2026-05-26 +BB,Sommerferien,2026-07-09,2026-08-22 +BB,Herbstferien,2026-10-19,2026-10-30 +BB,Weihnachtsferien,2026-12-23,2027-01-02 +BB,Winterferien,2027-02-01,2027-02-06 +BB,Osterferien,2027-03-22,2027-04-03 +BB,Variabler Ferientag,2027-05-07,2027-05-07 +BB,Pfingstferien,2027-05-18,2027-05-18 +BB,Sommerferien,2027-07-01,2027-08-14 +BB,Herbstferien,2027-10-11,2027-10-23 +BB,Weihnachtsferien,2027-12-23,2027-12-31 +BE,Winterferien,2022-01-30,2022-02-04 +BE,Osterferien,2022-04-11,2022-04-23 +BE,Unterrichtsfreier Tag,2022-05-27,2022-05-27 +BE,Pfingstferien,2022-06-07,2022-06-07 +BE,Sommerferien,2022-07-07,2022-08-18 +BE,Herbstferien,2022-10-24,2022-11-05 +BE,Weihnachtsferien,2022-12-22,2023-01-02 +BE,Winterferien,2023-01-30,2023-02-04 +BE,Osterferien,2023-04-03,2023-04-14 +BE,Unterrichtsfreier Tag,2023-05-19,2023-05-19 +BE,Pfingstferien,2023-05-30,2023-05-30 +BE,Sommerferien,2023-07-13,2023-08-25 +BE,Unterrichtsfreier Tag,2023-10-02,2023-10-02 +BE,Herbstferien,2023-10-23,2023-11-04 +BE,Weihnachtsferien,2023-12-23,2024-01-05 +BE,Winterferien,2024-02-05,2024-02-10 +BE,Osterferien,2024-03-25,2024-04-05 +BE,Unterrichtsfreier Tag,2024-05-10,2024-05-10 +BE,Sommerferien,2024-07-18,2024-08-30 +BE,Unterrichtsfreier Tag,2024-10-04,2024-10-04 +BE,Herbstferien,2024-10-21,2024-11-02 
+BE,Weihnachtsferien,2024-12-23,2024-12-31 +BE,Winterferien,2025-02-03,2025-02-08 +BE,Osterferien,2025-04-14,2025-04-25 +BE,Unterrichtsfreier Tag,2025-05-02,2025-05-02 +BE,Unterrichtsfreier Tag,2025-05-30,2025-05-30 +BE,Pfingstferien,2025-06-10,2025-06-10 +BE,Sommerferien,2025-07-24,2025-09-06 +BE,Herbstferien,2025-10-20,2025-11-01 +BE,Weihnachtsferien,2025-12-22,2026-01-02 +BE,Winterferien,2026-02-02,2026-02-07 +BE,Osterferien,2026-03-30,2026-04-10 +BE,Unterrichtsfreier Tag,2026-05-15,2026-05-15 +BE,Pfingstferien,2026-05-26,2026-05-26 +BE,Sommerferien,2026-07-09,2026-08-22 +BE,Herbstferien,2026-10-19,2026-10-31 +BE,Weihnachtsferien,2026-12-23,2027-01-02 +BE,Winterferien,2027-02-01,2027-02-06 +BE,Osterferien,2027-03-22,2027-04-02 +BE,Unterrichtsfreier Tag,2027-05-07,2027-05-07 +BE,Pfingstferien,2027-05-18,2027-05-19 +BE,Sommerferien,2027-07-01,2027-08-14 +BE,Herbstferien,2027-10-11,2027-10-23 +BE,Weihnachtsferien,2027-12-22,2027-12-31 +BW,Osterferien,2022-04-19,2022-04-23 +BW,Pfingstferien,2022-06-07,2022-06-18 +BW,Sommerferien,2022-07-28,2022-09-10 +BW,Reformationsfest,2022-10-31,2022-10-31 +BW,Herbstferien,2022-11-02,2022-11-04 +BW,Weihnachtsferien,2022-12-21,2023-01-07 +BW,Gründonnerstag,2023-04-06,2023-04-06 +BW,Osterferien,2023-04-11,2023-04-15 +BW,Pfingstferien,2023-05-30,2023-06-09 +BW,Sommerferien,2023-07-27,2023-09-09 +BW,Herbstferien,2023-10-30,2023-11-03 +BW,Weihnachtsferien,2023-12-23,2024-01-05 +BW,Osterferien,2024-03-23,2024-04-05 +BW,Pfingstferien,2024-05-21,2024-05-31 +BW,Sommerferien,2024-07-25,2024-09-07 +BW,Herbstferien,2024-10-28,2024-10-30 +BW,Reformationsfest,2024-10-31,2024-10-31 +BW,Weihnachtsferien,2024-12-23,2025-01-04 +BW,Osterferien,2025-04-14,2025-04-26 +BW,Pfingstferien,2025-06-10,2025-06-20 +BW,Sommerferien,2025-07-31,2025-09-13 +BW,Herbstferien,2025-10-27,2025-10-30 +BW,Reformationsfest,2025-10-31,2025-10-31 +BW,Weihnachtsferien,2025-12-22,2026-01-05 +BW,Osterferien,2026-03-30,2026-04-11 +BW,Pfingstferien,2026-05-26,2026-06-05 
+BW,Sommerferien,2026-07-30,2026-09-12 +BW,Herbstferien,2026-10-26,2026-10-30 +BW,Reformationsfest,2026-10-31,2026-10-31 +BW,Weihnachtsferien,2026-12-23,2027-01-09 +BW,Gründonnerstag,2027-03-25,2027-03-25 +BW,Osterferien,2027-03-30,2027-04-03 +BW,Pfingstferien,2027-05-18,2027-05-29 +BW,Sommerferien,2027-07-29,2027-09-11 +BW,Herbstferien,2027-11-02,2027-11-06 +BW,Weihnachtsferien,2027-12-23,2028-01-08 +BY,Frühjahrsferien,2022-02-28,2022-03-04 +BY,Osterferien,2022-04-11,2022-04-23 +BY,Pfingstferien,2022-06-07,2022-06-18 +BY,Sommerferien,2022-08-01,2022-09-12 +BY,Herbstferien,2022-10-31,2022-11-04 +BY,Buß- und Bettag,2022-11-16,2022-11-16 +BY,Weihnachtsferien,2022-12-24,2023-01-07 +BY,Frühjahrsferien,2023-02-20,2023-02-24 +BY,Osterferien,2023-04-03,2023-04-15 +BY,Pfingstferien,2023-05-30,2023-06-09 +BY,Sommerferien,2023-07-31,2023-09-11 +BY,Herbstferien,2023-10-30,2023-11-04 +BY,Buß- und Bettag,2023-11-22,2023-11-22 +BY,Weihnachtsferien,2023-12-23,2024-01-05 +BY,Frühjahrsferien,2024-02-12,2024-02-16 +BY,Osterferien,2024-03-25,2024-04-06 +BY,Pfingstferien,2024-05-21,2024-06-01 +BY,Sommerferien,2024-07-29,2024-09-09 +BY,Herbstferien,2024-10-28,2024-10-31 +BY,Buß- und Bettag,2024-11-20,2024-11-20 +BY,Weihnachtsferien,2024-12-23,2025-01-03 +BY,Frühjahrsferien,2025-03-03,2025-03-07 +BY,Osterferien,2025-04-14,2025-04-25 +BY,Pfingstferien,2025-06-10,2025-06-20 +BY,Sommerferien,2025-08-01,2025-09-15 +BY,Herbstferien,2025-11-03,2025-11-07 +BY,Buß- und Bettag,2025-11-19,2025-11-19 +BY,Weihnachtsferien,2025-12-22,2026-01-05 +BY,Frühjahrsferien,2026-02-16,2026-02-20 +BY,Osterferien,2026-03-30,2026-04-10 +BY,Pfingstferien,2026-05-26,2026-06-05 +BY,Sommerferien,2026-08-03,2026-09-14 +BY,Herbstferien,2026-11-02,2026-11-06 +BY,Buß- und Bettag,2026-11-18,2026-11-18 +BY,Weihnachtsferien,2026-12-24,2027-01-08 +BY,Frühjahrsferien,2027-02-08,2027-02-12 +BY,Osterferien,2027-03-22,2027-04-02 +BY,Pfingstferien,2027-05-18,2027-05-28 +BY,Sommerferien,2027-08-02,2027-09-13 
+BY,Herbstferien,2027-11-02,2027-11-05 +BY,Buß- und Bettag,2027-11-17,2027-11-17 +BY,Weihnachtsferien,2027-12-24,2028-01-07 +HB,Halbjahresferien,2022-01-31,2022-02-01 +HB,Osterferien,2022-04-04,2022-04-19 +HB,Tag nach Himmelfahrt,2022-05-27,2022-05-27 +HB,Pfingstferien,2022-06-07,2022-06-07 +HB,Sommerferien,2022-07-14,2022-08-24 +HB,Herbstferien,2022-10-17,2022-10-29 +HB,Weihnachtsferien,2022-12-23,2023-01-06 +HB,Halbjahresferien,2023-01-30,2023-01-31 +HB,Osterferien,2023-03-27,2023-04-11 +HB,Tag nach Himmelfahrt,2023-05-19,2023-05-19 +HB,Pfingstferien,2023-05-30,2023-05-30 +HB,Sommerferien,2023-07-06,2023-08-16 +HB,Tag vor dem 3. Oktober,2023-10-02,2023-10-02 +HB,Herbstferien,2023-10-16,2023-10-30 +HB,Weihnachtsferien,2023-12-23,2024-01-05 +HB,Halbjahresferien,2024-02-01,2024-02-02 +HB,Osterferien,2024-03-18,2024-03-28 +HB,Tag nach Himmelfahrt,2024-05-10,2024-05-10 +HB,Pfingstferien,2024-05-21,2024-05-21 +HB,Sommerferien,2024-06-24,2024-08-02 +HB,Herbstferien,2024-10-04,2024-10-19 +HB,Tag nach dem Reformationstag,2024-11-01,2024-11-01 +HB,Weihnachtsferien,2024-12-23,2025-01-04 +HB,Halbjahresferien,2025-02-03,2025-02-04 +HB,Osterferien,2025-04-07,2025-04-19 +HB,Kirchentag und Tag nach dem 1. 
Mai,2025-04-30,2025-05-02 +HB,Tag nach Himmelfahrt,2025-05-30,2025-05-30 +HB,Pfingstferien,2025-06-10,2025-06-10 +HB,Sommerferien,2025-07-03,2025-08-13 +HB,Herbstferien,2025-10-13,2025-10-25 +HB,Weihnachtsferien,2025-12-22,2026-01-05 +HB,Halbjahresferien,2026-02-02,2026-02-03 +HB,Osterferien,2026-03-23,2026-04-07 +HB,Tag nach Himmelfahrt,2026-05-15,2026-05-15 +HB,Pfingstferien,2026-05-26,2026-05-26 +HB,Sommerferien,2026-07-02,2026-08-12 +HB,Herbstferien,2026-10-12,2026-10-24 +HB,Weihnachtsferien,2026-12-23,2027-01-09 +HB,Halbjahresferien,2027-02-01,2027-02-02 +HB,Osterferien,2027-03-22,2027-04-03 +HB,Tag nach Himmelfahrt,2027-05-07,2027-05-07 +HB,Pfingstferien,2027-05-18,2027-05-18 +HB,Sommerferien,2027-07-08,2027-08-18 +HB,Herbstferien,2027-10-18,2027-10-30 +HB,Weihnachtsferien,2027-12-23,2028-01-08 +HE,Osterferien,2022-04-11,2022-04-23 +HE,Sommerferien,2022-07-25,2022-09-02 +HE,Herbstferien,2022-10-24,2022-10-29 +HE,Weihnachtsferien,2022-12-22,2023-01-07 +HE,Osterferien,2023-04-03,2023-04-22 +HE,Sommerferien,2023-07-24,2023-09-01 +HE,Herbstferien,2023-10-23,2023-10-28 +HE,Weihnachtsferien,2023-12-27,2024-01-13 +HE,Osterferien,2024-03-25,2024-04-13 +HE,Sommerferien,2024-07-15,2024-08-23 +HE,Herbstferien,2024-10-14,2024-10-25 +HE,Weihnachtsferien,2024-12-23,2025-01-10 +HE,Osterferien,2025-04-07,2025-04-21 +HE,Sommerferien,2025-07-07,2025-08-15 +HE,Herbstferien,2025-10-06,2025-10-18 +HE,Weihnachtsferien,2025-12-22,2026-01-10 +HE,Osterferien,2026-03-30,2026-04-10 +HE,Sommerferien,2026-06-29,2026-08-07 +HE,Herbstferien,2026-10-05,2026-10-17 +HE,Weihnachtsferien,2026-12-23,2027-01-12 +HE,Osterferien,2027-03-22,2027-04-02 +HE,Sommerferien,2027-06-28,2027-08-06 +HE,Herbstferien,2027-10-04,2027-10-16 +HE,Weihnachtsferien,2027-12-23,2028-01-11 +HH,Halbjahrespause,2022-01-28,2022-01-28 +HH,Frühjahrsferien,2022-03-07,2022-03-18 +HH,Pfingstferien,2022-05-23,2022-05-27 +HH,Sommerferien,2022-07-07,2022-08-17 +HH,Herbstferien,2022-10-10,2022-10-21 
+HH,Weihnachtsferien,2022-12-23,2023-01-06 +HH,Halbjahrespause,2023-01-27,2023-01-27 +HH,Frühjahrsferien,2023-03-06,2023-03-17 +HH,Pfingstferien,2023-05-15,2023-05-19 +HH,Sommerferien,2023-07-13,2023-08-23 +HH,Brückentag,2023-10-02,2023-10-02 +HH,Herbstferien,2023-10-16,2023-10-27 +HH,Weihnachtsferien,2023-12-22,2024-01-05 +HH,Halbjahrespause,2024-02-02,2024-02-02 +HH,Frühjahrsferien,2024-03-18,2024-03-28 +HH,Brückentag,2024-05-10,2024-05-10 +HH,Pfingstferien,2024-05-21,2024-05-24 +HH,Sommerferien,2024-07-18,2024-08-28 +HH,Brückentag,2024-10-04,2024-10-04 +HH,Herbstferien,2024-10-21,2024-11-01 +HH,Weihnachtsferien,2024-12-20,2025-01-03 +HH,Halbjahrespause,2025-01-31,2025-01-31 +HH,Frühjahrsferien,2025-03-10,2025-03-21 +HH,Brückentag,2025-05-02,2025-05-02 +HH,Pfingstferien,2025-05-26,2025-05-30 +HH,Sommerferien,2025-07-24,2025-09-03 +HH,Herbstferien,2025-10-20,2025-10-31 +HH,Weihnachtsferien,2025-12-17,2026-01-02 +HH,Halbjahrespause,2026-01-30,2026-01-30 +HH,Frühjahrsferien,2026-03-02,2026-03-13 +HH,Pfingstferien,2026-05-11,2026-05-15 +HH,Sommerferien,2026-07-09,2026-08-19 +HH,Herbstferien,2026-10-19,2026-10-30 +HH,Weihnachtsferien,2026-12-21,2027-01-01 +HH,Halbjahrespause,2027-01-29,2027-01-29 +HH,Frühjahrsferien,2027-03-01,2027-03-12 +HH,Pfingstferien,2027-05-07,2027-05-14 +HH,Sommerferien,2027-07-01,2027-08-11 +HH,Herbstferien,2027-10-11,2027-10-22 +HH,Weihnachtsferien,2027-12-20,2027-12-31 +MV,Winterferien,2022-02-05,2022-02-17 +MV,Schulfrei,2022-02-18,2022-02-18 +MV,Osterferien,2022-04-11,2022-04-22 +MV,Zusätzlicher Ferientag,2022-05-27,2022-05-27 +MV,Pfingstferien,2022-06-03,2022-06-07 +MV,Sommerferien,2022-07-04,2022-08-13 +MV,Sommerferien,2022-07-11,2022-08-27 +MV,Herbstferien,2022-10-10,2022-10-15 +MV,Zusätzlicher Ferientag,2022-10-28,2022-10-28 +MV,Zusätzlicher Ferientag,2022-11-01,2022-11-01 +MV,Zusätzlicher Ferientag,2022-11-02,2022-11-02 +MV,Weihnachtsferien,2022-12-22,2023-01-02 +MV,Winterferien,2023-02-06,2023-02-18 
+MV,Osterferien,2023-04-03,2023-04-15 +MV,Zusätzlicher Ferientag,2023-05-19,2023-05-19 +MV,Pfingstferien,2023-05-26,2023-05-30 +MV,Zusätzlicher Ferientag,2023-05-26,2023-05-26 +MV,Sommerferien,2023-07-17,2023-09-01 +MV,Zusätzlicher Ferientag,2023-10-02,2023-10-02 +MV,Herbstferien,2023-10-09,2023-10-14 +MV,Zusätzlicher Ferientag,2023-10-30,2023-10-30 +MV,Zusätzlicher Ferientag,2023-11-01,2023-11-01 +MV,Weihnachtsferien,2023-12-21,2024-01-03 +MV,Winterferien,2024-02-05,2024-02-16 +MV,Osterferien,2024-03-25,2024-04-05 +MV,Zusätzlicher Ferientag,2024-05-10,2024-05-10 +MV,Pfingstferien,2024-05-17,2024-05-21 +MV,Sommerferien,2024-07-15,2024-08-31 +MV,Sommerferien,2024-07-22,2024-08-31 +MV,Zusätzlicher Ferientag,2024-10-04,2024-10-04 +MV,Herbstferien,2024-10-21,2024-10-26 +MV,Zusätzlicher Ferientag,2024-11-01,2024-11-01 +MV,Weihnachtsferien,2024-12-23,2025-01-06 +MV,Winterferien,2025-02-03,2025-02-14 +MV,Osterferien,2025-04-14,2025-04-25 +MV,Zusätzlicher Ferientag,2025-05-02,2025-05-02 +MV,Zusätzlicher Ferientag,2025-05-30,2025-05-30 +MV,Pfingstferien,2025-06-06,2025-06-10 +MV,Sommerferien,2025-07-14,2025-08-30 +MV,Sommerferien,2025-07-28,2025-09-06 +MV,Zusätzlicher Ferientag,2025-10-01,2025-10-01 +MV,Zusätzlicher Ferientag,2025-10-02,2025-10-02 +MV,Herbstferien,2025-10-20,2025-10-25 +MV,Zusätzlicher Ferientag,2025-11-03,2025-11-03 +MV,Weihnachtsferien,2025-12-20,2026-01-03 +MV,Weihnachtsferien,2025-12-22,2026-01-03 +MV,Winterferien,2026-02-09,2026-02-20 +MV,Osterferien,2026-03-30,2026-04-10 +MV,Zusätzlicher Ferientag,2026-05-15,2026-05-15 +MV,Pfingstferien,2026-05-22,2026-05-26 +MV,Sommerferien,2026-07-13,2026-08-29 +MV,Herbstferien,2026-10-15,2026-10-24 +MV,Herbstferien,2026-10-19,2026-10-24 +MV,Zusätzlicher Ferientag,2026-11-26,2026-11-26 +MV,Zusätzlicher Ferientag,2026-11-27,2026-11-27 +MV,Weihnachtsferien,2026-12-19,2027-01-02 +MV,Weihnachtsferien,2026-12-21,2027-01-02 +MV,Winterferien,2027-02-08,2027-02-19 +MV,Osterferien,2027-03-22,2027-04-02 
+MV,Osterferien,2027-03-24,2027-04-02 +MV,Zusätzlicher Ferientag,2027-05-07,2027-05-07 +MV,Pfingstferien,2027-05-14,2027-05-18 +MV,Sommerferien,2027-07-05,2027-08-14 +MV,Sommerferien,2027-07-12,2027-08-28 +MV,Herbstferien,2027-10-14,2027-10-23 +MV,Herbstferien,2027-10-16,2027-10-23 +MV,Zusätzlicher Ferientag,2027-11-25,2027-11-25 +MV,Zusätzlicher Ferientag,2027-11-26,2027-11-26 +MV,Weihnachtsferien,2027-12-22,2028-01-04 +NI,Halbjahresferien,2022-01-31,2022-02-01 +NI,Osterferien,2022-04-04,2022-04-19 +NI,Tag nach Himmelfahrt,2022-05-27,2022-05-27 +NI,Pfingstferien,2022-06-07,2022-06-07 +NI,Sommerferien,2022-07-14,2022-08-24 +NI,Herbstferien,2022-10-17,2022-10-28 +NI,Weihnachtsferien,2022-12-23,2023-01-06 +NI,Halbjahresferien,2023-01-30,2023-01-31 +NI,Osterferien,2023-03-27,2023-04-11 +NI,Tag nach Himmelfahrt,2023-05-19,2023-05-19 +NI,Pfingstferien,2023-05-30,2023-05-30 +NI,Sommerferien,2023-07-06,2023-08-16 +NI,Tag vor dem 3. Oktober,2023-10-02,2023-10-02 +NI,Herbstferien,2023-10-16,2023-10-30 +NI,Weihnachtsferien,2023-12-27,2024-01-05 +NI,Halbjahresferien,2024-02-01,2024-02-02 +NI,Osterferien,2024-03-18,2024-03-28 +NI,Tag nach Himmelfahrt,2024-05-10,2024-05-10 +NI,Pfingstferien,2024-05-21,2024-05-21 +NI,Sommerferien,2024-06-24,2024-08-03 +NI,Herbstferien,2024-10-04,2024-10-19 +NI,Tag nach dem Reformationstag,2024-11-01,2024-11-01 +NI,Weihnachtsferien,2024-12-23,2025-01-04 +NI,Halbjahresferien,2025-02-03,2025-02-04 +NI,Osterferien,2025-04-07,2025-04-19 +NI,Kirchentag,2025-04-30,2025-04-30 +NI,Tag nach dem 1. 
Mai,2025-05-02,2025-05-02 +NI,Tag nach Himmelfahrt,2025-05-30,2025-05-30 +NI,Pfingstferien,2025-06-10,2025-06-10 +NI,Sommerferien,2025-07-03,2025-08-13 +NI,Herbstferien,2025-10-13,2025-10-25 +NI,Weihnachtsferien,2025-12-22,2026-01-05 +NI,Halbjahresferien,2026-02-02,2026-02-03 +NI,Osterferien,2026-03-23,2026-04-07 +NI,Tag nach Himmelfahrt,2026-05-15,2026-05-15 +NI,Pfingstferien,2026-05-26,2026-05-26 +NI,Sommerferien,2026-07-02,2026-08-12 +NI,Herbstferien,2026-10-12,2026-10-24 +NI,Weihnachtsferien,2026-12-23,2027-01-09 +NI,Halbjahresferien,2027-02-01,2027-02-02 +NI,Osterferien,2027-03-22,2027-04-03 +NI,Tag nach Himmelfahrt,2027-05-07,2027-05-07 +NI,Pfingstferien,2027-05-18,2027-05-18 +NI,Sommerferien,2027-07-08,2027-08-18 +NI,Herbstferien,2027-10-16,2027-10-30 +NI,Weihnachtsferien,2027-12-23,2028-01-08 +NW,Osterferien,2022-04-11,2022-04-23 +NW,Sommerferien,2022-06-27,2022-08-09 +NW,Herbstferien,2022-10-04,2022-10-15 +NW,Weihnachtsferien,2022-12-23,2023-01-06 +NW,Osterferien,2023-04-03,2023-04-15 +NW,Pfingstferien,2023-05-30,2023-05-30 +NW,Sommerferien,2023-06-22,2023-08-04 +NW,Herbstferien,2023-10-02,2023-10-14 +NW,Weihnachtsferien,2023-12-21,2024-01-05 +NW,Osterferien,2024-03-25,2024-04-06 +NW,Pfingstferien,2024-05-21,2024-05-21 +NW,Sommerferien,2024-07-08,2024-08-20 +NW,Herbstferien,2024-10-14,2024-10-26 +NW,Weihnachtsferien,2024-12-23,2025-01-06 +NW,Osterferien,2025-04-14,2025-04-26 +NW,Pfingstferien,2025-06-10,2025-06-10 +NW,Sommerferien,2025-07-14,2025-08-26 +NW,Herbstferien,2025-10-13,2025-10-25 +NW,Weihnachtsferien,2025-12-22,2026-01-06 +NW,Osterferien,2026-03-30,2026-04-11 +NW,Pfingstferien,2026-05-26,2026-05-26 +NW,Sommerferien,2026-07-20,2026-09-01 +NW,Herbstferien,2026-10-17,2026-10-31 +NW,Weihnachtsferien,2026-12-23,2027-01-06 +NW,Osterferien,2027-03-22,2027-04-03 +NW,Pfingstferien,2027-05-18,2027-05-18 +NW,Sommerferien,2027-07-19,2027-08-31 +NW,Herbstferien,2027-10-23,2027-11-06 +NW,Weihnachtsferien,2027-12-24,2028-01-08 
+RP,Winterferien,2022-02-21,2022-02-25 +RP,Osterferien,2022-04-13,2022-04-22 +RP,Sommerferien,2022-07-25,2022-09-02 +RP,Herbstferien,2022-10-17,2022-10-31 +RP,Weihnachtsferien,2022-12-23,2023-01-02 +RP,Osterferien,2023-04-03,2023-04-06 +RP,Pfingstferien,2023-05-30,2023-06-07 +RP,Sommerferien,2023-07-24,2023-09-01 +RP,Herbstferien,2023-10-16,2023-10-27 +RP,Weihnachtsferien,2023-12-27,2024-01-05 +RP,Osterferien,2024-03-25,2024-04-02 +RP,Pfingstferien,2024-05-21,2024-05-29 +RP,Sommerferien,2024-07-15,2024-08-23 +RP,Herbstferien,2024-10-14,2024-10-25 +RP,Weihnachtsferien,2024-12-23,2025-01-08 +RP,Osterferien,2025-04-14,2025-04-25 +RP,Sommerferien,2025-07-07,2025-08-15 +RP,Herbstferien,2025-10-13,2025-10-24 +RP,Weihnachtsferien,2025-12-22,2026-01-07 +RP,Osterferien,2026-03-30,2026-04-10 +RP,Sommerferien,2026-06-29,2026-08-07 +RP,Herbstferien,2026-10-05,2026-10-16 +RP,Weihnachtsferien,2026-12-23,2027-01-08 +RP,Osterferien,2027-03-22,2027-04-02 +RP,Sommerferien,2027-06-28,2027-08-06 +RP,Herbstferien,2027-10-04,2027-10-15 +RP,Weihnachtsferien,2027-12-23,2028-01-07 +SH,Osterferien,2022-04-04,2022-04-16 +SH,Himmelfahrt,2022-05-27,2022-05-28 +SH,Sommerferien,2022-07-04,2022-08-13 +SH,Herbstferien,2022-10-03,2022-10-21 +SH,Herbstferien,2022-10-10,2022-10-21 +SH,Weihnachtsferien,2022-12-23,2023-01-07 +SH,Osterferien,2023-04-06,2023-04-22 +SH,Himmelfahrt,2023-05-19,2023-05-20 +SH,Sommerferien,2023-07-17,2023-08-26 +SH,Herbstferien,2023-10-09,2023-10-27 +SH,Herbstferien,2023-10-16,2023-10-27 +SH,Weihnachtsferien,2023-12-27,2024-01-06 +SH,Osterferien,2024-04-02,2024-04-19 +SH,Himmelfahrt,2024-05-10,2024-05-11 +SH,Sommerferien,2024-07-22,2024-08-31 +SH,Herbstferien,2024-10-14,2024-11-01 +SH,Herbstferien,2024-10-21,2024-11-01 +SH,Weihnachtsferien,2024-12-19,2025-01-07 +SH,Osterferien,2025-04-11,2025-04-25 +SH,Himmelfahrt,2025-05-30,2025-05-30 +SH,Sommerferien,2025-07-28,2025-09-06 +SH,Herbstferien,2025-10-13,2025-10-30 +SH,Herbstferien,2025-10-20,2025-10-30 
+SH,Weihnachtsferien,2025-12-19,2026-01-06 +SH,Osterferien,2026-03-26,2026-04-10 +SH,Himmelfahrt,2026-05-15,2026-05-15 +SH,Sommerferien,2026-07-04,2026-08-15 +SH,Herbstferien,2026-10-05,2026-10-24 +SH,Herbstferien,2026-10-12,2026-10-24 +SH,Weihnachtsferien,2026-12-21,2027-01-06 +SH,Osterferien,2027-03-30,2027-04-10 +SH,Himmelfahrt,2027-05-07,2027-05-07 +SH,Sommerferien,2027-07-03,2027-08-14 +SH,Herbstferien,2027-10-04,2027-10-23 +SH,Herbstferien,2027-10-11,2027-10-23 +SH,Weihnachtsferien,2027-12-23,2028-01-08 +SL,Fastnachtsferien,2022-02-21,2022-03-01 +SL,Osterferien,2022-04-14,2022-04-22 +SL,Pfingstferien,2022-06-07,2022-06-10 +SL,Sommerferien,2022-07-25,2022-09-02 +SL,Herbstferien,2022-10-24,2022-11-04 +SL,Weihnachtsferien,2022-12-22,2023-01-04 +SL,Fastnachtsferien,2023-02-20,2023-02-24 +SL,Osterferien,2023-04-03,2023-04-12 +SL,Pfingstferien,2023-05-30,2023-06-02 +SL,Sommerferien,2023-07-24,2023-09-01 +SL,Herbstferien,2023-10-23,2023-11-03 +SL,Weihnachtsferien,2023-12-21,2024-01-02 +SL,Fastnachtsferien,2024-02-12,2024-02-16 +SL,Osterferien,2024-03-25,2024-04-05 +SL,Pfingstferien,2024-05-21,2024-05-24 +SL,Sommerferien,2024-07-15,2024-08-23 +SL,Herbstferien,2024-10-14,2024-10-25 +SL,Weihnachtsferien,2024-12-23,2025-01-03 +SL,Fastnachtsferien,2025-02-24,2025-03-04 +SL,Osterferien,2025-04-14,2025-04-25 +SL,Sommerferien,2025-07-07,2025-08-14 +SL,Herbstferien,2025-10-13,2025-10-24 +SL,Weihnachtsferien,2025-12-22,2026-01-02 +SL,Fastnachtsferien,2026-02-16,2026-02-20 +SL,Osterferien,2026-04-07,2026-04-17 +SL,Sommerferien,2026-06-29,2026-08-07 +SL,Herbstferien,2026-10-05,2026-10-16 +SL,Weihnachtsferien,2026-12-21,2026-12-31 +SL,Fastnachtsferien,2027-02-08,2027-02-12 +SL,Osterferien,2027-03-30,2027-04-09 +SL,Sommerferien,2027-06-28,2027-08-06 +SL,Herbstferien,2027-10-04,2027-10-15 +SL,Weihnachtsferien,2027-12-20,2027-12-31 +SN,Winterferien,2022-02-12,2022-02-26 +SN,Osterferien,2022-04-15,2022-04-23 +SN,Unterrichtsfreier Tag,2022-05-27,2022-05-27 
+SN,Sommerferien,2022-07-18,2022-08-26 +SN,Herbstferien,2022-10-17,2022-10-29 +SN,Weihnachtsferien,2022-12-22,2023-01-02 +SN,Winterferien,2023-02-13,2023-02-24 +SN,Osterferien,2023-04-07,2023-04-15 +SN,Unterrichtsfreier Tag,2023-05-19,2023-05-19 +SN,Sommerferien,2023-07-10,2023-08-18 +SN,Herbstferien,2023-10-02,2023-10-14 +SN,Unterrichtsfreier Tag,2023-10-30,2023-10-30 +SN,Weihnachtsferien,2023-12-23,2024-01-02 +SN,Winterferien,2024-02-12,2024-02-23 +SN,Osterferien,2024-03-28,2024-04-05 +SN,Unterrichtsfreier Tag,2024-05-10,2024-05-10 +SN,Pfingstferien,2024-05-18,2024-05-21 +SN,Sommerferien,2024-06-20,2024-08-02 +SN,Herbstferien,2024-10-07,2024-10-19 +SN,Weihnachtsferien,2024-12-23,2025-01-03 +SN,Winterferien,2025-02-17,2025-03-01 +SN,Osterferien,2025-04-18,2025-04-25 +SN,Unterrichtsfreier Tag,2025-05-30,2025-05-30 +SN,Sommerferien,2025-06-28,2025-08-08 +SN,Herbstferien,2025-10-06,2025-10-18 +SN,Weihnachtsferien,2025-12-22,2026-01-02 +SN,Winterferien,2026-02-09,2026-02-21 +SN,Osterferien,2026-04-03,2026-04-10 +SN,Unterrichtsfreier Tag,2026-05-15,2026-05-15 +SN,Sommerferien,2026-07-04,2026-08-14 +SN,Herbstferien,2026-10-12,2026-10-24 +SN,Weihnachtsferien,2026-12-23,2027-01-02 +SN,Winterferien,2027-02-08,2027-02-19 +SN,Osterferien,2027-03-26,2027-04-02 +SN,Unterrichtsfreier Tag,2027-05-07,2027-05-07 +SN,Pfingstferien,2027-05-15,2027-05-18 +SN,Sommerferien,2027-07-10,2027-08-20 +SN,Herbstferien,2027-10-11,2027-10-23 +SN,Weihnachtsferien,2027-12-23,2028-01-01 +ST,Winterferien,2022-02-12,2022-02-19 +ST,Osterferien,2022-04-11,2022-04-16 +ST,Pfingstferien,2022-05-23,2022-05-28 +ST,Sommerferien,2022-07-14,2022-08-24 +ST,Herbstferien,2022-10-24,2022-11-04 +ST,Weihnachtsferien,2022-12-21,2023-01-05 +ST,Winterferien,2023-02-06,2023-02-11 +ST,Osterferien,2023-04-03,2023-04-08 +ST,Pfingstferien,2023-05-15,2023-05-19 +ST,Sommerferien,2023-07-06,2023-08-16 +ST,Ferientag,2023-10-02,2023-10-02 +ST,Herbstferien,2023-10-16,2023-10-30 +ST,Weihnachtsferien,2023-12-21,2024-01-03 
+ST,Winterferien,2024-02-05,2024-02-10 +ST,Osterferien,2024-03-25,2024-03-30 +ST,Pfingstferien,2024-05-21,2024-05-24 +ST,Sommerferien,2024-06-24,2024-08-03 +ST,Herbstferien,2024-09-30,2024-10-12 +ST,Ferientag,2024-11-01,2024-11-01 +ST,Weihnachtsferien,2024-12-23,2025-01-04 +ST,Winterferien,2025-01-27,2025-01-31 +ST,Osterferien,2025-04-07,2025-04-19 +ST,Ferientag,2025-05-30,2025-05-30 +ST,Sommerferien,2025-06-28,2025-08-08 +ST,Herbstferien,2025-10-13,2025-10-25 +ST,Weihnachtsferien,2025-12-22,2026-01-05 +ST,Winterferien,2026-01-31,2026-02-06 +ST,Osterferien,2026-03-30,2026-04-04 +ST,Pfingstferien,2026-05-26,2026-05-29 +ST,Sommerferien,2026-07-04,2026-08-14 +ST,Herbstferien,2026-10-19,2026-10-30 +ST,Weihnachtsferien,2026-12-21,2027-01-02 +ST,Winterferien,2027-02-01,2027-02-06 +ST,Osterferien,2027-03-22,2027-03-27 +ST,Pfingstferien,2027-05-15,2027-05-22 +ST,Sommerferien,2027-07-10,2027-08-20 +ST,Herbstferien,2027-10-18,2027-10-23 +ST,Weihnachtsferien,2027-12-20,2027-12-31 +TH,Winterferien,2022-02-12,2022-02-19 +TH,Osterferien,2022-04-11,2022-04-23 +TH,Schulfreier Tag,2022-05-27,2022-05-27 +TH,Sommerferien,2022-07-18,2022-08-27 +TH,Herbstferien,2022-10-17,2022-10-29 +TH,Weihnachtsferien,2022-12-22,2023-01-03 +TH,Winterferien,2023-02-13,2023-02-17 +TH,Osterferien,2023-04-03,2023-04-15 +TH,Schulfreier Tag,2023-05-19,2023-05-19 +TH,Sommerferien,2023-07-10,2023-08-19 +TH,Herbstferien,2023-10-02,2023-10-14 +TH,Weihnachtsferien,2023-12-22,2024-01-05 +TH,Winterferien,2024-02-12,2024-02-16 +TH,Osterferien,2024-03-25,2024-04-06 +TH,Schulfreier Tag,2024-05-10,2024-05-10 +TH,Sommerferien,2024-06-20,2024-07-31 +TH,Herbstferien,2024-09-30,2024-10-12 +TH,Weihnachtsferien,2024-12-23,2025-01-03 +TH,Winterferien,2025-02-03,2025-02-08 +TH,Osterferien,2025-04-07,2025-04-19 +TH,Schulfreier Tag,2025-05-30,2025-05-30 +TH,Sommerferien,2025-06-28,2025-08-08 +TH,Herbstferien,2025-10-06,2025-10-18 +TH,Weihnachtsferien,2025-12-22,2026-01-03 +TH,Winterferien,2026-02-16,2026-02-21 
+TH,Osterferien,2026-04-07,2026-04-17 +TH,Schulfreier Tag,2026-05-15,2026-05-15 +TH,Sommerferien,2026-07-04,2026-08-14 +TH,Herbstferien,2026-10-12,2026-10-24 +TH,Weihnachtsferien,2026-12-23,2027-01-02 +TH,Winterferien,2027-02-01,2027-02-06 +TH,Osterferien,2027-03-22,2027-04-03 +TH,Schulfreier Tag,2027-05-07,2027-05-07 +TH,Sommerferien,2027-07-10,2027-08-20 +TH,Herbstferien,2027-10-09,2027-10-23 +TH,Weihnachtsferien,2027-12-23,2027-12-31 diff --git a/src/spotforecast2_safe/datasets/csv/school_holidays_de.csv.license b/src/spotforecast2_safe/datasets/csv/school_holidays_de.csv.license new file mode 100644 index 000000000..6d12fa12c --- /dev/null +++ b/src/spotforecast2_safe/datasets/csv/school_holidays_de.csv.license @@ -0,0 +1,8 @@ +SPDX-FileCopyrightText: The OpenHolidays API Project (https://www.openholidaysapi.org) + +SPDX-License-Identifier: ODbL-1.0 + +Dataset: German school holidays for all 16 federal states (Bundesländer), 2022-01-01 to 2027-12-31. +Source: https://openholidaysapi.org +Database: https://github.com/openpotato/openholidaysapi.data +License: ODC Open Database License (ODbL-1.0) diff --git a/src/spotforecast2_safe/datasets/csv/school_holidays_de_meta.csv b/src/spotforecast2_safe/datasets/csv/school_holidays_de_meta.csv new file mode 100644 index 000000000..c2a1614d5 --- /dev/null +++ b/src/spotforecast2_safe/datasets/csv/school_holidays_de_meta.csv @@ -0,0 +1,2 @@ +valid_from,valid_to +2022-01-01,2027-12-31 diff --git a/src/spotforecast2_safe/datasets/csv/school_holidays_de_meta.csv.license b/src/spotforecast2_safe/datasets/csv/school_holidays_de_meta.csv.license new file mode 100644 index 000000000..63c695db5 --- /dev/null +++ b/src/spotforecast2_safe/datasets/csv/school_holidays_de_meta.csv.license @@ -0,0 +1,8 @@ +SPDX-FileCopyrightText: The OpenHolidays API Project (https://www.openholidaysapi.org) + +SPDX-License-Identifier: ODbL-1.0 + +Dataset: Validity-range metadata for German school holidays (school_holidays_de.csv). 
+Source: https://openholidaysapi.org +Database: https://github.com/openpotato/openholidaysapi.data +License: ODC Open Database License (ODbL-1.0) diff --git a/src/spotforecast2_safe/manager/features.py b/src/spotforecast2_safe/manager/features.py index e4f0a26b5..0a45aecb5 100644 --- a/src/spotforecast2_safe/manager/features.py +++ b/src/spotforecast2_safe/manager/features.py @@ -281,6 +281,7 @@ def select_exogenous_features( include_weather_windows: bool = False, include_holiday_features: bool = False, include_holiday_adjacency_features: bool = False, + include_school_holiday_features: bool = False, poly_features_degree: int = 1, ) -> List[str]: """Select and deduplicate exogenous feature columns for model training. @@ -296,7 +297,9 @@ def select_exogenous_features( with ``"holiday"`` (optional, ``include_holiday_features``). 5. Holiday-adjacency columns: ``is_brueckentag``, ``is_before_holiday``, ``is_after_holiday`` (optional, ``include_holiday_adjacency_features``). - 6. Polynomial interaction columns starting with ``"poly_"`` (included + 6. School-holiday column: ``is_school_holiday`` (optional, + ``include_school_holiday_features``). + 7. Polynomial interaction columns starting with ``"poly_"`` (included when ``poly_features_degree >= 2``). Duplicates are removed while preserving insertion order. @@ -319,6 +322,9 @@ def select_exogenous_features( adjacency columns ``is_brueckentag``, ``is_before_holiday``, and ``is_after_holiday`` when present in *exogenous_features*. Defaults to ``False``. + include_school_holiday_features: If ``True``, include the + ``is_school_holiday`` column when present in *exogenous_features*. + Defaults to ``False``. poly_features_degree: Polynomial-interaction degree. Interaction columns (names starting with ``"poly_"``) are included only when this is ``>= 2``; at ``1`` no interactions exist. 
Defaults to @@ -394,6 +400,10 @@ def select_exogenous_features( ] exog_list.extend(adjacency_cols) + if include_school_holiday_features: + if "is_school_holiday" in exogenous_features.columns: + exog_list.append("is_school_holiday") + if poly_features_degree >= 2: poly_features_list = [ col for col in exogenous_features.columns if col.startswith("poly_") diff --git a/src/spotforecast2_safe/multitask/base.py b/src/spotforecast2_safe/multitask/base.py index ac41b0046..e81f50cec 100644 --- a/src/spotforecast2_safe/multitask/base.py +++ b/src/spotforecast2_safe/multitask/base.py @@ -35,6 +35,7 @@ get_ephemeris_features, get_holiday_adjacency_features, get_holiday_features, + get_school_holiday_features, ) from spotforecast2_safe.configurator.config_multi import ( # noqa: F401 (re-exported for subclasses) ConfigMulti, @@ -1175,6 +1176,21 @@ def build_exogenous_features(self) -> "BaseTask": " Holiday adjacency features: %s", holiday_adjacency_features.shape ) concat_frames.append(holiday_adjacency_features) + if self.config.include_school_holiday_features: + school_holiday_features = get_school_holiday_features( + data=self.df_pipeline, + start=self.run_state.data_start, + cov_end=self.run_state.cov_end, + forecast_horizon=self.config.predict_size, + tz=self.config.timezone, + freq="h", + country_code=self.config.country_code, + state=self.config.state, + ) + self.logger.info( + " School holiday features: %s", school_holiday_features.shape + ) + concat_frames.append(school_holiday_features) # Step 5 — Combine self.exogenous_features = pd.concat( @@ -1309,6 +1325,7 @@ def build_exogenous_features(self) -> "BaseTask": include_weather_windows=self.config.include_weather_windows, include_holiday_features=self.config.include_holiday_features, include_holiday_adjacency_features=self.config.include_holiday_adjacency_features, + include_school_holiday_features=self.config.include_school_holiday_features, poly_features_degree=self.config.poly_features_degree, ) # 
``select_exogenous_features`` matches calendar/weather/holiday/poly diff --git a/tests/test_calendar_school_holiday.py b/tests/test_calendar_school_holiday.py new file mode 100644 index 000000000..c4fb44684 --- /dev/null +++ b/tests/test_calendar_school_holiday.py @@ -0,0 +1,367 @@ +# SPDX-FileCopyrightText: 2026 bartzbeielstein +# SPDX-License-Identifier: AGPL-3.0-or-later + +"""Tests for per-Bundesland school-holiday calendar features.""" + +import pandas as pd +import pytest + +from spotforecast2_safe.calendar import ( + create_school_holiday_df, + get_school_holiday_features, +) +from spotforecast2_safe.data.fetch_data import load_school_holidays_de +from spotforecast2_safe.manager.features import select_exogenous_features + + +class TestCreateSchoolHolidayDf: + """Unit tests for create_school_holiday_df.""" + + def test_determinism(self): + a = create_school_holiday_df("2024-07-01", "2024-07-31", freq="h", state="NW") + b = create_school_holiday_df("2024-07-01", "2024-07-31", freq="h", state="NW") + pd.testing.assert_frame_equal(a, b) + + def test_dtype_no_nan_values_binary(self): + df = create_school_holiday_df("2024-01-01", "2024-12-31", freq="D", state="NW") + assert df["is_school_holiday"].dtype == int + assert not df["is_school_holiday"].isna().any() + assert set(df["is_school_holiday"].unique()).issubset({0, 1}) + + def test_nw_osterferien_2024(self): + # NW Osterferien 2024: 2024-03-25 → 2024-04-06 (inclusive). + df = create_school_holiday_df("2024-03-24", "2024-04-07", freq="D", state="NW") + assert df.loc["2024-03-24", "is_school_holiday"] == 0 + assert df.loc["2024-03-25", "is_school_holiday"] == 1 + assert df.loc["2024-04-06", "is_school_holiday"] == 1 + assert df.loc["2024-04-07", "is_school_holiday"] == 0 + + def test_nw_sommerferien_2024(self): + # NW Sommerferien 2024: 2024-07-08 → 2024-08-20 (inclusive). 
+ df = create_school_holiday_df("2024-07-06", "2024-08-22", freq="D", state="NW") + assert df.loc["2024-07-07", "is_school_holiday"] == 0 + assert df.loc["2024-07-08", "is_school_holiday"] == 1 + assert df.loc["2024-08-20", "is_school_holiday"] == 1 + assert df.loc["2024-08-21", "is_school_holiday"] == 0 + + def test_nw_herbstferien_2024(self): + # NW Herbstferien 2024: 2024-10-14 → 2024-10-26 (inclusive). + df = create_school_holiday_df("2024-10-13", "2024-10-27", freq="D", state="NW") + assert df.loc["2024-10-13", "is_school_holiday"] == 0 + assert df.loc["2024-10-14", "is_school_holiday"] == 1 + assert df.loc["2024-10-26", "is_school_holiday"] == 1 + assert df.loc["2024-10-27", "is_school_holiday"] == 0 + + def test_state_isolation_by_vs_nw(self): + # BY Sommerferien 2024: 2024-07-29 → 2024-09-09. + # NW Sommerferien 2024: 2024-07-08 → 2024-08-20. + # 2024-08-21 is 0 for NW (after NW Sommerferien) but 1 for BY (still in BY Sommerferien). + df_nw = create_school_holiday_df( + "2024-08-21", "2024-08-21", freq="D", state="NW" + ) + df_by = create_school_holiday_df( + "2024-08-21", "2024-08-21", freq="D", state="BY" + ) + assert df_nw.loc["2024-08-21", "is_school_holiday"] == 0 + assert df_by.loc["2024-08-21", "is_school_holiday"] == 1 + + def test_inclusive_edges(self): + # First and last day of NW Sommerferien must be 1. + df = create_school_holiday_df("2024-07-08", "2024-08-20", freq="D", state="NW") + assert df.loc["2024-07-08", "is_school_holiday"] == 1 + assert df.loc["2024-08-20", "is_school_holiday"] == 1 + + def test_hourly_broadcast_vacation_day(self): + # 2024-07-08 is in NW Sommerferien → all 24 hours must be 1. 
+ df = create_school_holiday_df( + "2024-07-08", "2024-07-08 23:00", freq="h", state="NW" + ) + assert df.shape == (24, 1) + assert (df["is_school_holiday"] == 1).all() + + def test_fail_safe_start_before_valid_from(self): + with pytest.raises(ValueError, match="2022-01-01"): + create_school_holiday_df("2021-12-01", "2022-06-01", freq="D", state="NW") + + def test_fail_safe_end_after_valid_to(self): + with pytest.raises(ValueError, match="2027-12-31"): + create_school_holiday_df("2027-06-01", "2028-06-01", freq="D", state="NW") + + def test_country_code_not_de_raises(self): + with pytest.raises(ValueError, match="country_code"): + create_school_holiday_df( + "2024-01-01", "2024-01-31", freq="D", country_code="FR" + ) + + def test_single_column(self): + df = create_school_holiday_df("2024-07-01", "2024-07-31", freq="D", state="NW") + assert df.columns.tolist() == ["is_school_holiday"] + assert df.shape[1] == 1 + + def test_he_weihnachtsferien_2027(self): + # HE Weihnachtsferien 2027: 2027-12-23 → 2028-01-11. + # 2027-12-22 (day before) must be 0; 2027-12-23 → 2027-12-31 must all be 1. + df = create_school_holiday_df("2027-12-22", "2027-12-31", freq="D", state="HE") + assert df.loc["2027-12-22", "is_school_holiday"] == 0 + assert (df.loc["2027-12-23":, "is_school_holiday"] == 1).all() + + def test_tz_aware_boundary_does_not_raise(self): + # A tz-aware intra-day timestamp on the boundary date (2027-12-31 23:00 UTC) + # normalises to 2027-12-31, which is within valid_to → must NOT raise. + end_ts = pd.Timestamp("2027-12-31 23:00", tz="UTC") + df = create_school_holiday_df("2027-12-01", end_ts, freq="D", state="NW") + assert df is not None + + def test_tz_aware_beyond_valid_to_raises(self): + # 2028-01-01 00:00 UTC normalises to 2028-01-01 → beyond valid_to → must raise. 
+ end_ts = pd.Timestamp("2028-01-01 00:00", tz="UTC") + with pytest.raises(ValueError, match="2027-12-31"): + create_school_holiday_df("2027-12-01", end_ts, freq="D", state="NW") + + +class TestGetSchoolHolidayFeatures: + """Unit tests for get_school_holiday_features.""" + + def _make_data(self, start_str: str, n_data: int = 48) -> pd.DataFrame: + return pd.DataFrame( + {"load": range(n_data)}, + index=pd.date_range(start_str, periods=n_data, freq="h", tz="UTC"), + ) + + def test_shape_and_columns(self): + forecast_horizon = 24 + n_data = 48 + data = self._make_data("2024-07-06", n_data) + start = data.index[0] + cov_end = start + pd.Timedelta(hours=(n_data + forecast_horizon - 1)) + feats = get_school_holiday_features( + data=data, + start=start, + cov_end=cov_end, + forecast_horizon=forecast_horizon, + state="NW", + ) + assert feats.shape == (n_data + forecast_horizon, 1) + assert feats.columns.tolist() == ["is_school_holiday"] + + def test_dtype_no_nan_binary(self): + forecast_horizon = 24 + n_data = 48 + data = self._make_data("2024-07-06", n_data) + start = data.index[0] + cov_end = start + pd.Timedelta(hours=(n_data + forecast_horizon - 1)) + feats = get_school_holiday_features( + data=data, + start=start, + cov_end=cov_end, + forecast_horizon=forecast_horizon, + state="NW", + ) + assert feats["is_school_holiday"].dtype == int + assert not feats["is_school_holiday"].isna().any() + assert set(feats["is_school_holiday"].unique()).issubset({0, 1}) + + def test_determinism(self): + forecast_horizon = 24 + n_data = 48 + data = self._make_data("2024-07-06", n_data) + start = data.index[0] + cov_end = start + pd.Timedelta(hours=(n_data + forecast_horizon - 1)) + a = get_school_holiday_features( + data=data, + start=start, + cov_end=cov_end, + forecast_horizon=forecast_horizon, + state="NW", + ) + b = get_school_holiday_features( + data=data, + start=start, + cov_end=cov_end, + forecast_horizon=forecast_horizon, + state="NW", + ) + pd.testing.assert_frame_equal(a, b) 
+ + def test_known_dates_nw_sommerferien_2024(self): + # Grid starts 2024-07-06; NW Sommerferien starts 2024-07-08. + forecast_horizon = 24 + n_data = 48 + data = self._make_data("2024-07-06", n_data) + start = data.index[0] + cov_end = start + pd.Timedelta(hours=(n_data + forecast_horizon - 1)) + feats = get_school_holiday_features( + data=data, + start=start, + cov_end=cov_end, + forecast_horizon=forecast_horizon, + state="NW", + ) + assert feats.loc["2024-07-07 00:00:00+00:00", "is_school_holiday"] == 0 + assert feats.loc["2024-07-08 00:00:00+00:00", "is_school_holiday"] == 1 + + def test_fail_safe_start_before_valid_from(self): + forecast_horizon = 24 + n_data = 48 + data = self._make_data("2021-11-01", n_data) + start = data.index[0] + cov_end = start + pd.Timedelta(hours=(n_data + forecast_horizon - 1)) + with pytest.raises(ValueError, match="2022-01-01"): + get_school_holiday_features( + data=data, + start=start, + cov_end=cov_end, + forecast_horizon=forecast_horizon, + state="NW", + ) + + def test_fail_safe_end_after_valid_to(self): + # Start late enough that cov_end (start + 71h) extends past 2027-12-31. + # 2027-12-31 00:00 + 71h = 2028-01-02 23:00 → beyond valid_to. 
+ forecast_horizon = 24 + n_data = 48 + data = self._make_data("2027-12-31", n_data) + start = data.index[0] + cov_end = start + pd.Timedelta(hours=(n_data + forecast_horizon - 1)) + with pytest.raises(ValueError, match="2027-12-31"): + get_school_holiday_features( + data=data, + start=start, + cov_end=cov_end, + forecast_horizon=forecast_horizon, + state="NW", + ) + + def test_country_code_not_de_raises(self): + forecast_horizon = 24 + n_data = 48 + data = self._make_data("2024-07-06", n_data) + start = data.index[0] + cov_end = start + pd.Timedelta(hours=(n_data + forecast_horizon - 1)) + with pytest.raises(ValueError, match="country_code"): + get_school_holiday_features( + data=data, + start=start, + cov_end=cov_end, + forecast_horizon=forecast_horizon, + country_code="FR", + ) + + +class TestSelectExogenousSchoolHoliday: + """Unit tests for select_exogenous_features with school-holiday toggle.""" + + def _make_exog_with_school_holiday(self) -> tuple[pd.DataFrame, pd.DataFrame]: + import numpy as np + + idx = pd.date_range("2024-07-01", periods=24, freq="h", tz="UTC") + weather = pd.DataFrame({"wind_speed": np.ones(24)}, index=idx) + exog = pd.DataFrame( + { + "hour_sin": np.sin(2 * 3.14159 * idx.hour / 24), + "hour_cos": np.cos(2 * 3.14159 * idx.hour / 24), + "wind_speed": weather["wind_speed"], + "is_school_holiday": 0, + }, + index=idx, + ) + return exog, weather + + def test_include_school_holiday_true_includes_column(self): + exog, weather = self._make_exog_with_school_holiday() + selected = select_exogenous_features( + exogenous_features=exog, + weather_aligned=weather, + include_school_holiday_features=True, + ) + assert "is_school_holiday" in selected + + def test_include_school_holiday_false_excludes_column(self): + exog, weather = self._make_exog_with_school_holiday() + selected = select_exogenous_features( + exogenous_features=exog, + weather_aligned=weather, + include_school_holiday_features=False, + ) + assert "is_school_holiday" not in selected + + 
def test_include_school_holiday_default_excludes_column(self): + exog, weather = self._make_exog_with_school_holiday() + selected = select_exogenous_features( + exogenous_features=exog, + weather_aligned=weather, + ) + assert "is_school_holiday" not in selected + + def test_missing_column_silently_absent(self): + """When is_school_holiday is not in exog, enabling the flag is a no-op.""" + import numpy as np + + idx = pd.date_range("2024-07-01", periods=24, freq="h", tz="UTC") + weather = pd.DataFrame({"wind_speed": np.ones(24)}, index=idx) + exog = pd.DataFrame( + {"hour_sin": np.zeros(24), "hour_cos": np.zeros(24)}, + index=idx, + ) + selected = select_exogenous_features( + exogenous_features=exog, + weather_aligned=weather, + include_school_holiday_features=True, + ) + assert "is_school_holiday" not in selected + + +class TestBundledDataIntegrity: + """Tests that verify the bundled CSV files are well-formed.""" + + def test_sixteen_states(self): + df, _, _ = load_school_holidays_de() + assert len(df["state"].unique()) == 16 + + def test_all_expected_state_codes_present(self): + expected = { + "BW", + "BY", + "BE", + "BB", + "HB", + "HH", + "HE", + "MV", + "NI", + "NW", + "RP", + "SL", + "SN", + "ST", + "SH", + "TH", + } + df, _, _ = load_school_holidays_de() + assert set(df["state"].unique()) == expected + + def test_schema(self): + df, _, _ = load_school_holidays_de() + assert list(df.columns) == ["state", "name", "start_date", "end_date"] + + def test_row_count_in_range(self): + df, _, _ = load_school_holidays_de() + assert 500 <= len(df) <= 650, f"Row count {len(df)} not in [500, 650]" + + def test_start_le_end(self): + df, _, _ = load_school_holidays_de() + assert (df["start_date"] <= df["end_date"]).all() + + def test_meta_parses(self): + _, valid_from, valid_to = load_school_holidays_de() + assert valid_from == pd.Timestamp("2022-01-01") + assert valid_to == pd.Timestamp("2027-12-31") + + def test_date_columns_are_timestamps(self): + df, _, _ = 
load_school_holidays_de() + assert pd.api.types.is_datetime64_any_dtype(df["start_date"]) + assert pd.api.types.is_datetime64_any_dtype(df["end_date"]) + + +if __name__ == "__main__": + pytest.main([__file__, "-v"])