diff --git a/docs/contributing/01-community.md b/community/01-community.md similarity index 98% rename from docs/contributing/01-community.md rename to community/01-community.md index 7d83b347..8898486c 100644 --- a/docs/contributing/01-community.md +++ b/community/01-community.md @@ -1,5 +1,6 @@ --- -slug: /community +id: index +slug: / title: Community sidebar_position: 1 --- diff --git a/docs/contributing/02-contribution-guideline.md b/community/02-contribution-guideline.md similarity index 100% rename from docs/contributing/02-contribution-guideline.md rename to community/02-contribution-guideline.md diff --git a/docs/contributing/03-roadmap.mdx b/community/03-roadmap.mdx similarity index 100% rename from docs/contributing/03-roadmap.mdx rename to community/03-roadmap.mdx diff --git a/docs/contributing/04-Proposals/MEP1/Distributed-API-Working.png b/community/04-Proposals/MEP1/Distributed-API-Working.png similarity index 100% rename from docs/contributing/04-Proposals/MEP1/Distributed-API-Working.png rename to community/04-Proposals/MEP1/Distributed-API-Working.png diff --git a/docs/contributing/04-Proposals/MEP1/Distributed-API.png b/community/04-Proposals/MEP1/Distributed-API.png similarity index 100% rename from docs/contributing/04-Proposals/MEP1/Distributed-API.png rename to community/04-Proposals/MEP1/Distributed-API.png diff --git a/docs/contributing/04-Proposals/MEP1/Distributed-Deployment.png b/community/04-Proposals/MEP1/Distributed-Deployment.png similarity index 100% rename from docs/contributing/04-Proposals/MEP1/Distributed-Deployment.png rename to community/04-Proposals/MEP1/Distributed-Deployment.png diff --git a/docs/contributing/04-Proposals/MEP1/Distributed.drawio b/community/04-Proposals/MEP1/Distributed.drawio similarity index 100% rename from docs/contributing/04-Proposals/MEP1/Distributed.drawio rename to community/04-Proposals/MEP1/Distributed.drawio diff --git a/docs/contributing/04-Proposals/MEP1/Distributed.png 
b/community/04-Proposals/MEP1/Distributed.png similarity index 100% rename from docs/contributing/04-Proposals/MEP1/Distributed.png rename to community/04-Proposals/MEP1/Distributed.png diff --git a/docs/contributing/04-Proposals/MEP1/README.md b/community/04-Proposals/MEP1/README.md similarity index 100% rename from docs/contributing/04-Proposals/MEP1/README.md rename to community/04-Proposals/MEP1/README.md diff --git a/docs/contributing/04-Proposals/MEP10/README.md b/community/04-Proposals/MEP10/README.md similarity index 100% rename from docs/contributing/04-Proposals/MEP10/README.md rename to community/04-Proposals/MEP10/README.md diff --git a/docs/contributing/04-Proposals/MEP11/README.md b/community/04-Proposals/MEP11/README.md similarity index 100% rename from docs/contributing/04-Proposals/MEP11/README.md rename to community/04-Proposals/MEP11/README.md diff --git a/docs/contributing/04-Proposals/MEP12/README.md b/community/04-Proposals/MEP12/README.md similarity index 100% rename from docs/contributing/04-Proposals/MEP12/README.md rename to community/04-Proposals/MEP12/README.md diff --git a/docs/contributing/04-Proposals/MEP13/README.md b/community/04-Proposals/MEP13/README.md similarity index 100% rename from docs/contributing/04-Proposals/MEP13/README.md rename to community/04-Proposals/MEP13/README.md diff --git a/docs/contributing/04-Proposals/MEP14/README.md b/community/04-Proposals/MEP14/README.md similarity index 100% rename from docs/contributing/04-Proposals/MEP14/README.md rename to community/04-Proposals/MEP14/README.md diff --git a/docs/contributing/04-Proposals/MEP16/README.md b/community/04-Proposals/MEP16/README.md similarity index 100% rename from docs/contributing/04-Proposals/MEP16/README.md rename to community/04-Proposals/MEP16/README.md diff --git a/docs/contributing/04-Proposals/MEP16/firewall-for-capms-overview.drawio b/community/04-Proposals/MEP16/firewall-for-capms-overview.drawio similarity index 100% rename from 
docs/contributing/04-Proposals/MEP16/firewall-for-capms-overview.drawio rename to community/04-Proposals/MEP16/firewall-for-capms-overview.drawio diff --git a/docs/contributing/04-Proposals/MEP16/firewall-for-capms-overview.svg b/community/04-Proposals/MEP16/firewall-for-capms-overview.svg similarity index 100% rename from docs/contributing/04-Proposals/MEP16/firewall-for-capms-overview.svg rename to community/04-Proposals/MEP16/firewall-for-capms-overview.svg diff --git a/docs/contributing/04-Proposals/MEP17/README.md b/community/04-Proposals/MEP17/README.md similarity index 100% rename from docs/contributing/04-Proposals/MEP17/README.md rename to community/04-Proposals/MEP17/README.md diff --git a/versioned_docs/version-v0.21.9/contributing/01-Proposals/MEP18/README.md b/community/04-Proposals/MEP18/README.md similarity index 98% rename from versioned_docs/version-v0.21.9/contributing/01-Proposals/MEP18/README.md rename to community/04-Proposals/MEP18/README.md index eb574491..e1049fcd 100644 --- a/versioned_docs/version-v0.21.9/contributing/01-Proposals/MEP18/README.md +++ b/community/04-Proposals/MEP18/README.md @@ -6,7 +6,7 @@ sidebar_position: 18 # Autonomous Control Plane -As described in the [deployment chapter](../../../docs/04-For%20Operators/03-deployment-guide.md), we strongly recommend Kubernetes as the target platform for running the metal-stack control plane. +As described in the [deployment chapter](/docs/deployment-guide), we strongly recommend Kubernetes as the target platform for running the metal-stack control plane. Kubernetes clusters for this purpose are readily available from hyperscalers, metalstack.cloud, or other cloud providers. Simply using a managed Kubernetes cluster greatly simplifies a metal-stack installation. However, sometimes it might be desirable to host the metal-stack control plane autonomously, without the help of another cloud provider. 
Reasons for this might include corporate policies that prohibit the use of external data center products, or network constraints. diff --git a/docs/contributing/04-Proposals/MEP18/autonomous-control-plane-images/ha-initial-cluster.drawio b/community/04-Proposals/MEP18/autonomous-control-plane-images/ha-initial-cluster.drawio similarity index 100% rename from docs/contributing/04-Proposals/MEP18/autonomous-control-plane-images/ha-initial-cluster.drawio rename to community/04-Proposals/MEP18/autonomous-control-plane-images/ha-initial-cluster.drawio diff --git a/docs/contributing/04-Proposals/MEP18/autonomous-control-plane-images/ha-initial-cluster.svg b/community/04-Proposals/MEP18/autonomous-control-plane-images/ha-initial-cluster.svg similarity index 100% rename from docs/contributing/04-Proposals/MEP18/autonomous-control-plane-images/ha-initial-cluster.svg rename to community/04-Proposals/MEP18/autonomous-control-plane-images/ha-initial-cluster.svg diff --git a/docs/contributing/04-Proposals/MEP18/autonomous-control-plane-images/metal-stack-autonomous-control-plane-full.drawio b/community/04-Proposals/MEP18/autonomous-control-plane-images/metal-stack-autonomous-control-plane-full.drawio similarity index 100% rename from docs/contributing/04-Proposals/MEP18/autonomous-control-plane-images/metal-stack-autonomous-control-plane-full.drawio rename to community/04-Proposals/MEP18/autonomous-control-plane-images/metal-stack-autonomous-control-plane-full.drawio diff --git a/docs/contributing/04-Proposals/MEP18/autonomous-control-plane-images/metal-stack-autonomous-control-plane-full.svg b/community/04-Proposals/MEP18/autonomous-control-plane-images/metal-stack-autonomous-control-plane-full.svg similarity index 100% rename from docs/contributing/04-Proposals/MEP18/autonomous-control-plane-images/metal-stack-autonomous-control-plane-full.svg rename to community/04-Proposals/MEP18/autonomous-control-plane-images/metal-stack-autonomous-control-plane-full.svg diff --git 
a/docs/contributing/04-Proposals/MEP18/autonomous-control-plane-images/metal-stack-chain.drawio b/community/04-Proposals/MEP18/autonomous-control-plane-images/metal-stack-chain.drawio similarity index 100% rename from docs/contributing/04-Proposals/MEP18/autonomous-control-plane-images/metal-stack-chain.drawio rename to community/04-Proposals/MEP18/autonomous-control-plane-images/metal-stack-chain.drawio diff --git a/docs/contributing/04-Proposals/MEP18/autonomous-control-plane-images/metal-stack-chain.svg b/community/04-Proposals/MEP18/autonomous-control-plane-images/metal-stack-chain.svg similarity index 100% rename from docs/contributing/04-Proposals/MEP18/autonomous-control-plane-images/metal-stack-chain.svg rename to community/04-Proposals/MEP18/autonomous-control-plane-images/metal-stack-chain.svg diff --git a/docs/contributing/04-Proposals/MEP18/autonomous-control-plane-images/small-initial-cluster.drawio b/community/04-Proposals/MEP18/autonomous-control-plane-images/small-initial-cluster.drawio similarity index 100% rename from docs/contributing/04-Proposals/MEP18/autonomous-control-plane-images/small-initial-cluster.drawio rename to community/04-Proposals/MEP18/autonomous-control-plane-images/small-initial-cluster.drawio diff --git a/docs/contributing/04-Proposals/MEP18/autonomous-control-plane-images/small-initial-cluster.svg b/community/04-Proposals/MEP18/autonomous-control-plane-images/small-initial-cluster.svg similarity index 100% rename from docs/contributing/04-Proposals/MEP18/autonomous-control-plane-images/small-initial-cluster.svg rename to community/04-Proposals/MEP18/autonomous-control-plane-images/small-initial-cluster.svg diff --git a/docs/contributing/04-Proposals/MEP2/README.md b/community/04-Proposals/MEP2/README.md similarity index 100% rename from docs/contributing/04-Proposals/MEP2/README.md rename to community/04-Proposals/MEP2/README.md diff --git a/docs/contributing/04-Proposals/MEP3/README.md b/community/04-Proposals/MEP3/README.md 
similarity index 100% rename from docs/contributing/04-Proposals/MEP3/README.md rename to community/04-Proposals/MEP3/README.md diff --git a/docs/contributing/04-Proposals/MEP4/README.md b/community/04-Proposals/MEP4/README.md similarity index 100% rename from docs/contributing/04-Proposals/MEP4/README.md rename to community/04-Proposals/MEP4/README.md diff --git a/docs/contributing/04-Proposals/MEP5/README.md b/community/04-Proposals/MEP5/README.md similarity index 100% rename from docs/contributing/04-Proposals/MEP5/README.md rename to community/04-Proposals/MEP5/README.md diff --git a/docs/contributing/04-Proposals/MEP5/shared.drawio b/community/04-Proposals/MEP5/shared.drawio similarity index 100% rename from docs/contributing/04-Proposals/MEP5/shared.drawio rename to community/04-Proposals/MEP5/shared.drawio diff --git a/docs/contributing/04-Proposals/MEP5/shared.png b/community/04-Proposals/MEP5/shared.png similarity index 100% rename from docs/contributing/04-Proposals/MEP5/shared.png rename to community/04-Proposals/MEP5/shared.png diff --git a/docs/contributing/04-Proposals/MEP5/shared_advanced.drawio b/community/04-Proposals/MEP5/shared_advanced.drawio similarity index 100% rename from docs/contributing/04-Proposals/MEP5/shared_advanced.drawio rename to community/04-Proposals/MEP5/shared_advanced.drawio diff --git a/docs/contributing/04-Proposals/MEP5/shared_advanced.png b/community/04-Proposals/MEP5/shared_advanced.png similarity index 100% rename from docs/contributing/04-Proposals/MEP5/shared_advanced.png rename to community/04-Proposals/MEP5/shared_advanced.png diff --git a/docs/contributing/04-Proposals/MEP6/README.md b/community/04-Proposals/MEP6/README.md similarity index 100% rename from docs/contributing/04-Proposals/MEP6/README.md rename to community/04-Proposals/MEP6/README.md diff --git a/docs/contributing/04-Proposals/MEP6/dmz-internet_private.drawio b/community/04-Proposals/MEP6/dmz-internet_private.drawio similarity index 100% rename from 
docs/contributing/04-Proposals/MEP6/dmz-internet_private.drawio rename to community/04-Proposals/MEP6/dmz-internet_private.drawio diff --git a/docs/contributing/04-Proposals/MEP6/dmz-internet_private.svg b/community/04-Proposals/MEP6/dmz-internet_private.svg similarity index 100% rename from docs/contributing/04-Proposals/MEP6/dmz-internet_private.svg rename to community/04-Proposals/MEP6/dmz-internet_private.svg diff --git a/docs/contributing/04-Proposals/MEP6/dmz-internet_public.drawio b/community/04-Proposals/MEP6/dmz-internet_public.drawio similarity index 100% rename from docs/contributing/04-Proposals/MEP6/dmz-internet_public.drawio rename to community/04-Proposals/MEP6/dmz-internet_public.drawio diff --git a/docs/contributing/04-Proposals/MEP6/dmz-internet_public.svg b/community/04-Proposals/MEP6/dmz-internet_public.svg similarity index 100% rename from docs/contributing/04-Proposals/MEP6/dmz-internet_public.svg rename to community/04-Proposals/MEP6/dmz-internet_public.svg diff --git a/docs/contributing/04-Proposals/MEP8/README.md b/community/04-Proposals/MEP8/README.md similarity index 100% rename from docs/contributing/04-Proposals/MEP8/README.md rename to community/04-Proposals/MEP8/README.md diff --git a/docs/contributing/04-Proposals/MEP8/filesystems.drawio b/community/04-Proposals/MEP8/filesystems.drawio similarity index 100% rename from docs/contributing/04-Proposals/MEP8/filesystems.drawio rename to community/04-Proposals/MEP8/filesystems.drawio diff --git a/docs/contributing/04-Proposals/MEP8/filesystems.png b/community/04-Proposals/MEP8/filesystems.png similarity index 100% rename from docs/contributing/04-Proposals/MEP8/filesystems.png rename to community/04-Proposals/MEP8/filesystems.png diff --git a/docs/contributing/04-Proposals/MEP9/README.md b/community/04-Proposals/MEP9/README.md similarity index 100% rename from docs/contributing/04-Proposals/MEP9/README.md rename to community/04-Proposals/MEP9/README.md diff --git 
a/docs/contributing/04-Proposals/MEP9/architecture.drawio b/community/04-Proposals/MEP9/architecture.drawio similarity index 100% rename from docs/contributing/04-Proposals/MEP9/architecture.drawio rename to community/04-Proposals/MEP9/architecture.drawio diff --git a/docs/contributing/04-Proposals/MEP9/architecture.svg b/community/04-Proposals/MEP9/architecture.svg similarity index 100% rename from docs/contributing/04-Proposals/MEP9/architecture.svg rename to community/04-Proposals/MEP9/architecture.svg diff --git a/docs/contributing/04-Proposals/_category_.json b/community/04-Proposals/_category_.json similarity index 100% rename from docs/contributing/04-Proposals/_category_.json rename to community/04-Proposals/_category_.json diff --git a/docs/contributing/04-Proposals/index.md b/community/04-Proposals/index.md similarity index 100% rename from docs/contributing/04-Proposals/index.md rename to community/04-Proposals/index.md diff --git a/docs/contributing/05-release-flow.md b/community/05-release-flow.md similarity index 100% rename from docs/contributing/05-release-flow.md rename to community/05-release-flow.md diff --git a/docs/contributing/06-oci-artifacts.md b/community/06-oci-artifacts.md similarity index 100% rename from docs/contributing/06-oci-artifacts.md rename to community/06-oci-artifacts.md diff --git a/docs/contributing/release.png b/community/release.png similarity index 100% rename from docs/contributing/release.png rename to community/release.png diff --git a/docs/contributing/release_flow.drawio b/community/release_flow.drawio similarity index 100% rename from docs/contributing/release_flow.drawio rename to community/release_flow.drawio diff --git a/docs/contributing/release_flow.svg b/community/release_flow.svg similarity index 100% rename from docs/contributing/release_flow.svg rename to community/release_flow.svg diff --git a/docs/docs/01-home.md b/docs/01-home.md similarity index 100% rename from docs/docs/01-home.md rename to 
docs/01-home.md diff --git a/docs/docs/02-General/01-quickstart.md b/docs/02-General/01-quickstart.md similarity index 100% rename from docs/docs/02-General/01-quickstart.md rename to docs/02-General/01-quickstart.md diff --git a/docs/docs/02-General/02-why metal stack.md b/docs/02-General/02-why metal stack.md similarity index 100% rename from docs/docs/02-General/02-why metal stack.md rename to docs/02-General/02-why metal stack.md diff --git a/docs/docs/02-General/03-why bare metal.md b/docs/02-General/03-why bare metal.md similarity index 100% rename from docs/docs/02-General/03-why bare metal.md rename to docs/02-General/03-why bare metal.md diff --git a/docs/docs/02-General/04-flavors-of-metalstack.md b/docs/02-General/04-flavors-of-metalstack.md similarity index 89% rename from docs/docs/02-General/04-flavors-of-metalstack.md rename to docs/02-General/04-flavors-of-metalstack.md index 207bebd5..97767a7a 100644 --- a/docs/docs/02-General/04-flavors-of-metalstack.md +++ b/docs/02-General/04-flavors-of-metalstack.md @@ -14,7 +14,7 @@ As modern infrastructure and cloud native applications are designed with Kuberne Regardless which flavor of metal-stack you use, it is always possible to manually provision machines, networks and ip addresses. This is the most basic way of using metal-stack and is very similar to how traditional bare metal infrastructures are managed. -Using plain metal-stack without additional layer was not a focus in the past. Therefore firewall and role management might be premature. These will be addressed by [MEP-4](../../contributing/04-Proposals/MEP4/README.md) and [MEP-16](../../contributing/04-Proposals/MEP16/README.md) in the future. +Using plain metal-stack without additional layer was not a focus in the past. Therefore firewall and role management might be premature. 
These will be addressed by [MEP-4](/community/MEP-4-multi-tenancy-for-the-metal-api) and [MEP-16](/community/MEP-16-metal-api-as-an-alternative-configuration-source-for-the-firewall-controller) in the future. ## Gardener diff --git a/docs/docs/03-For Users/01-client_libraries.md b/docs/03-For Users/01-client_libraries.md similarity index 100% rename from docs/docs/03-For Users/01-client_libraries.md rename to docs/03-For Users/01-client_libraries.md diff --git a/docs/docs/04-For Operators/01-hardware.md b/docs/04-For Operators/01-hardware.md similarity index 100% rename from docs/docs/04-For Operators/01-hardware.md rename to docs/04-For Operators/01-hardware.md diff --git a/docs/docs/04-For Operators/02-operating-systems.md b/docs/04-For Operators/02-operating-systems.md similarity index 100% rename from docs/docs/04-For Operators/02-operating-systems.md rename to docs/04-For Operators/02-operating-systems.md diff --git a/docs/docs/04-For Operators/03-deployment-guide.mdx b/docs/04-For Operators/03-deployment-guide.mdx similarity index 98% rename from docs/docs/04-For Operators/03-deployment-guide.mdx rename to docs/04-For Operators/03-deployment-guide.mdx index ee0f0d7c..ce58e0e0 100644 --- a/docs/docs/04-For Operators/03-deployment-guide.mdx +++ b/docs/04-For Operators/03-deployment-guide.mdx @@ -31,7 +31,7 @@ You can use the [mini-lab](https://github.com/metal-stack/mini-lab) as a templat The metal control plane is typically deployed in a Kubernetes cluster. Therefore, this document will assume that you have a Kubernetes cluster ready for getting deployed. Even though it is theoretically possible to deploy metal-stack without Kubernetes, we strongly advise you to use the described method because we believe that Kubernetes gives you a lot of benefits regarding the stability and maintainability of the application deployment. :::tip -For metal-stack it does not matter where your control plane Kubernetes cluster is located. 
You can of course use a cluster managed by a hyperscaler. This has the advantage of not having to setup Kubernetes by yourself and could even become beneficial in terms of fail-safe operation. However, we also describe a solution of how to setup metal-stack with a self-hosted, [Autonomous Control Plane](../../contributing/04-Proposals/MEP18/README.md) cluster. The only requirement from metal-stack is that your partitions can establish network connections to the metal control plane. If you are interested, you can find a reasoning behind this deployment decision [here](../05-Concepts/01-architecture.mdx#target-deployment-platforms). +For metal-stack it does not matter where your control plane Kubernetes cluster is located. You can of course use a cluster managed by a hyperscaler. This has the advantage of not having to setup Kubernetes by yourself and could even become beneficial in terms of fail-safe operation. However, we also describe a solution of how to setup metal-stack with a self-hosted, [Autonomous Control Plane](/community/MEP-18-autonomous-control-plane) cluster. The only requirement from metal-stack is that your partitions can establish network connections to the metal control plane. If you are interested, you can find a reasoning behind this deployment decision [here](../05-Concepts/01-architecture.mdx#target-deployment-platforms). ::: Let's start off with a fresh folder for your deployment: @@ -75,7 +75,7 @@ At the end of this section we are gonna end up with the following files and fold ### Releases and Ansible Role Dependencies -As metal-stack consists of many microservices all having individual versions, we have come up with a [releases](https://github.com/metal-stack/releases) repository. It contains a YAML file (we often call it release vector) describing the fitting versions of all components for every release of metal-stack. Ansible role dependencies are also part of a metal-stack release. 
Both the metal-stack release vector and the metal-stack ansible-roles are shipped as OCI artifacts following a specific format that's described [here](../../contributing/06-oci-artifacts.md). These artifacts are signed with the CI token of the metal-stack Github organization and can be verified using [cosign](https://github.com/sigstore/cosign). +As metal-stack consists of many microservices all having individual versions, we have come up with a [releases](https://github.com/metal-stack/releases) repository. It contains a YAML file (we often call it release vector) describing the fitting versions of all components for every release of metal-stack. Ansible role dependencies are also part of a metal-stack release. Both the metal-stack release vector and the metal-stack ansible-roles are shipped as OCI artifacts following a specific format that's described [here](/community/oci-artifacts). These artifacts are signed with the CI token of the metal-stack Github organization and can be verified using [cosign](https://github.com/sigstore/cosign). In order to download the release vector and the referenced ansible-roles prior to a deployment, we provide a small helper module called `metal_stack_release_vector` as part of the [metal-deployment-base](https://github.com/metal-stack/metal-deployment-base) deployment image. 
Its main tasks are: diff --git a/docs/docs/04-For Operators/04-maintenance.md b/docs/04-For Operators/04-maintenance.md similarity index 100% rename from docs/docs/04-For Operators/04-maintenance.md rename to docs/04-For Operators/04-maintenance.md diff --git a/docs/docs/04-For Operators/05-monitoring.md b/docs/04-For Operators/05-monitoring.md similarity index 100% rename from docs/docs/04-For Operators/05-monitoring.md rename to docs/04-For Operators/05-monitoring.md diff --git a/docs/docs/04-For Operators/06-troubleshoot.md b/docs/04-For Operators/06-troubleshoot.md similarity index 100% rename from docs/docs/04-For Operators/06-troubleshoot.md rename to docs/04-For Operators/06-troubleshoot.md diff --git a/docs/docs/04-For Operators/mgmt_net_layer3.drawio b/docs/04-For Operators/mgmt_net_layer3.drawio similarity index 100% rename from docs/docs/04-For Operators/mgmt_net_layer3.drawio rename to docs/04-For Operators/mgmt_net_layer3.drawio diff --git a/docs/docs/04-For Operators/mgmt_net_layer3.png b/docs/04-For Operators/mgmt_net_layer3.png similarity index 100% rename from docs/docs/04-For Operators/mgmt_net_layer3.png rename to docs/04-For Operators/mgmt_net_layer3.png diff --git a/docs/docs/04-For Operators/monitoring-stack.svg b/docs/04-For Operators/monitoring-stack.svg similarity index 100% rename from docs/docs/04-For Operators/monitoring-stack.svg rename to docs/04-For Operators/monitoring-stack.svg diff --git a/docs/docs/04-For Operators/starter.jpg b/docs/04-For Operators/starter.jpg similarity index 100% rename from docs/docs/04-For Operators/starter.jpg rename to docs/04-For Operators/starter.jpg diff --git a/docs/docs/05-Concepts/01-architecture.mdx b/docs/05-Concepts/01-architecture.mdx similarity index 99% rename from docs/docs/05-Concepts/01-architecture.mdx rename to docs/05-Concepts/01-architecture.mdx index 316eeb37..75298df9 100644 --- a/docs/docs/05-Concepts/01-architecture.mdx +++ b/docs/05-Concepts/01-architecture.mdx @@ -152,4 +152,4 @@ 
Thus, for creating a partition as well as a machine or a firewall, the flags `dn In order to be fully offline resilient, make sure to check out `metal-image-cache-sync`. This component provides copies of `metal-images`, `metal-kernel` and `metal-hammer`. -This feature is related to [MEP14](../../contributing/04-Proposals/MEP14/README.md). +This feature is related to [MEP14](/community/MEP-14-independence-from-external-sources). diff --git a/docs/docs/05-Concepts/02-user-management.md b/docs/05-Concepts/02-user-management.md similarity index 98% rename from docs/docs/05-Concepts/02-user-management.md rename to docs/05-Concepts/02-user-management.md index 21d9922f..e6c84fea 100644 --- a/docs/docs/05-Concepts/02-user-management.md +++ b/docs/05-Concepts/02-user-management.md @@ -7,7 +7,7 @@ sidebar_position: 2 # User Management At the moment, metal-stack can more or less be seen as a low-level API that does not scope access based on projects and tenants. -Fine-grained access control with full multi-tenancy support is actively worked on in [MEP4](../../contributing/04-Proposals/MEP4/README.md). +Fine-grained access control with full multi-tenancy support is actively worked on in [MEP4](/community/MEP-4-multi-tenancy-for-the-metal-api). Until then projects and tenants can be created, but have no effect on access control. 
diff --git a/docs/docs/05-Concepts/03-Network/01-theory.md b/docs/05-Concepts/03-Network/01-theory.md similarity index 100% rename from docs/docs/05-Concepts/03-Network/01-theory.md rename to docs/05-Concepts/03-Network/01-theory.md diff --git a/docs/docs/05-Concepts/03-Network/02-firewalls.md b/docs/05-Concepts/03-Network/02-firewalls.md similarity index 100% rename from docs/docs/05-Concepts/03-Network/02-firewalls.md rename to docs/05-Concepts/03-Network/02-firewalls.md diff --git a/docs/docs/05-Concepts/03-Network/03-tailscale.md b/docs/05-Concepts/03-Network/03-tailscale.md similarity index 100% rename from docs/docs/05-Concepts/03-Network/03-tailscale.md rename to docs/05-Concepts/03-Network/03-tailscale.md diff --git a/docs/docs/05-Concepts/03-Network/2-layer-leaf-spine.drawio b/docs/05-Concepts/03-Network/2-layer-leaf-spine.drawio similarity index 100% rename from docs/docs/05-Concepts/03-Network/2-layer-leaf-spine.drawio rename to docs/05-Concepts/03-Network/2-layer-leaf-spine.drawio diff --git a/docs/docs/05-Concepts/03-Network/2-layer-leaf-spine.svg b/docs/05-Concepts/03-Network/2-layer-leaf-spine.svg similarity index 100% rename from docs/docs/05-Concepts/03-Network/2-layer-leaf-spine.svg rename to docs/05-Concepts/03-Network/2-layer-leaf-spine.svg diff --git a/docs/docs/05-Concepts/03-Network/3-layer-leaf-spine.drawio b/docs/05-Concepts/03-Network/3-layer-leaf-spine.drawio similarity index 100% rename from docs/docs/05-Concepts/03-Network/3-layer-leaf-spine.drawio rename to docs/05-Concepts/03-Network/3-layer-leaf-spine.drawio diff --git a/docs/docs/05-Concepts/03-Network/3-layer-leaf-spine.svg b/docs/05-Concepts/03-Network/3-layer-leaf-spine.svg similarity index 100% rename from docs/docs/05-Concepts/03-Network/3-layer-leaf-spine.svg rename to docs/05-Concepts/03-Network/3-layer-leaf-spine.svg diff --git a/docs/docs/05-Concepts/03-Network/evpn-vtep.drawio b/docs/05-Concepts/03-Network/evpn-vtep.drawio similarity index 100% rename from 
docs/docs/05-Concepts/03-Network/evpn-vtep.drawio rename to docs/05-Concepts/03-Network/evpn-vtep.drawio diff --git a/docs/docs/05-Concepts/03-Network/evpn-vtep.svg b/docs/05-Concepts/03-Network/evpn-vtep.svg similarity index 100% rename from docs/docs/05-Concepts/03-Network/evpn-vtep.svg rename to docs/05-Concepts/03-Network/evpn-vtep.svg diff --git a/docs/docs/05-Concepts/03-Network/network-physical-wiring.drawio b/docs/05-Concepts/03-Network/network-physical-wiring.drawio similarity index 100% rename from docs/docs/05-Concepts/03-Network/network-physical-wiring.drawio rename to docs/05-Concepts/03-Network/network-physical-wiring.drawio diff --git a/docs/docs/05-Concepts/03-Network/network-physical-wiring.svg b/docs/05-Concepts/03-Network/network-physical-wiring.svg similarity index 100% rename from docs/docs/05-Concepts/03-Network/network-physical-wiring.svg rename to docs/05-Concepts/03-Network/network-physical-wiring.svg diff --git a/docs/docs/05-Concepts/03-Network/network-vrfs.drawio b/docs/05-Concepts/03-Network/network-vrfs.drawio similarity index 100% rename from docs/docs/05-Concepts/03-Network/network-vrfs.drawio rename to docs/05-Concepts/03-Network/network-vrfs.drawio diff --git a/docs/docs/05-Concepts/03-Network/network-vrfs.svg b/docs/05-Concepts/03-Network/network-vrfs.svg similarity index 100% rename from docs/docs/05-Concepts/03-Network/network-vrfs.svg rename to docs/05-Concepts/03-Network/network-vrfs.svg diff --git a/docs/docs/05-Concepts/03-Network/tailscale-authkeys.png b/docs/05-Concepts/03-Network/tailscale-authkeys.png similarity index 100% rename from docs/docs/05-Concepts/03-Network/tailscale-authkeys.png rename to docs/05-Concepts/03-Network/tailscale-authkeys.png diff --git a/docs/docs/05-Concepts/03-Network/tailscale-devices.png b/docs/05-Concepts/03-Network/tailscale-devices.png similarity index 100% rename from docs/docs/05-Concepts/03-Network/tailscale-devices.png rename to docs/05-Concepts/03-Network/tailscale-devices.png diff 
--git a/docs/docs/05-Concepts/03-Network/vrf-simple.drawio b/docs/05-Concepts/03-Network/vrf-simple.drawio similarity index 100% rename from docs/docs/05-Concepts/03-Network/vrf-simple.drawio rename to docs/05-Concepts/03-Network/vrf-simple.drawio diff --git a/docs/docs/05-Concepts/03-Network/vrf-simple.svg b/docs/05-Concepts/03-Network/vrf-simple.svg similarity index 100% rename from docs/docs/05-Concepts/03-Network/vrf-simple.svg rename to docs/05-Concepts/03-Network/vrf-simple.svg diff --git a/docs/docs/05-Concepts/04-Kubernetes/01-gardener.md b/docs/05-Concepts/04-Kubernetes/01-gardener.md similarity index 100% rename from docs/docs/05-Concepts/04-Kubernetes/01-gardener.md rename to docs/05-Concepts/04-Kubernetes/01-gardener.md diff --git a/docs/docs/05-Concepts/04-Kubernetes/02-cluster-api.md b/docs/05-Concepts/04-Kubernetes/02-cluster-api.md similarity index 100% rename from docs/docs/05-Concepts/04-Kubernetes/02-cluster-api.md rename to docs/05-Concepts/04-Kubernetes/02-cluster-api.md diff --git a/docs/docs/05-Concepts/04-Kubernetes/03-cloud-controller-manager.md b/docs/05-Concepts/04-Kubernetes/03-cloud-controller-manager.md similarity index 100% rename from docs/docs/05-Concepts/04-Kubernetes/03-cloud-controller-manager.md rename to docs/05-Concepts/04-Kubernetes/03-cloud-controller-manager.md diff --git a/docs/docs/05-Concepts/04-Kubernetes/04-firewall-controller-manager.md b/docs/05-Concepts/04-Kubernetes/04-firewall-controller-manager.md similarity index 100% rename from docs/docs/05-Concepts/04-Kubernetes/04-firewall-controller-manager.md rename to docs/05-Concepts/04-Kubernetes/04-firewall-controller-manager.md diff --git a/docs/docs/05-Concepts/04-Kubernetes/05-isolated-clusters.md b/docs/05-Concepts/04-Kubernetes/05-isolated-clusters.md similarity index 100% rename from docs/docs/05-Concepts/04-Kubernetes/05-isolated-clusters.md rename to docs/05-Concepts/04-Kubernetes/05-isolated-clusters.md diff --git 
a/docs/docs/05-Concepts/04-Kubernetes/06-gpu-workers.md b/docs/05-Concepts/04-Kubernetes/06-gpu-workers.md similarity index 100% rename from docs/docs/05-Concepts/04-Kubernetes/06-gpu-workers.md rename to docs/05-Concepts/04-Kubernetes/06-gpu-workers.md diff --git a/docs/docs/05-Concepts/04-Kubernetes/07-storage.md b/docs/05-Concepts/04-Kubernetes/07-storage.md similarity index 100% rename from docs/docs/05-Concepts/04-Kubernetes/07-storage.md rename to docs/05-Concepts/04-Kubernetes/07-storage.md diff --git a/docs/docs/05-Concepts/04-Kubernetes/isolated-kubernetes.drawio b/docs/05-Concepts/04-Kubernetes/isolated-kubernetes.drawio similarity index 100% rename from docs/docs/05-Concepts/04-Kubernetes/isolated-kubernetes.drawio rename to docs/05-Concepts/04-Kubernetes/isolated-kubernetes.drawio diff --git a/docs/docs/05-Concepts/04-Kubernetes/isolated-kubernetes.svg b/docs/05-Concepts/04-Kubernetes/isolated-kubernetes.svg similarity index 100% rename from docs/docs/05-Concepts/04-Kubernetes/isolated-kubernetes.svg rename to docs/05-Concepts/04-Kubernetes/isolated-kubernetes.svg diff --git a/docs/docs/05-Concepts/assets/2-layer-leaf-spine.svg b/docs/05-Concepts/assets/2-layer-leaf-spine.svg similarity index 100% rename from docs/docs/05-Concepts/assets/2-layer-leaf-spine.svg rename to docs/05-Concepts/assets/2-layer-leaf-spine.svg diff --git a/docs/docs/05-Concepts/assets/3-layer-leaf-spine.svg b/docs/05-Concepts/assets/3-layer-leaf-spine.svg similarity index 100% rename from docs/docs/05-Concepts/assets/3-layer-leaf-spine.svg rename to docs/05-Concepts/assets/3-layer-leaf-spine.svg diff --git a/docs/docs/05-Concepts/assets/evpn-vtep.svg b/docs/05-Concepts/assets/evpn-vtep.svg similarity index 100% rename from docs/docs/05-Concepts/assets/evpn-vtep.svg rename to docs/05-Concepts/assets/evpn-vtep.svg diff --git a/docs/docs/05-Concepts/assets/isolated-kubernetes.drawio b/docs/05-Concepts/assets/isolated-kubernetes.drawio similarity index 100% rename from 
docs/docs/05-Concepts/assets/isolated-kubernetes.drawio rename to docs/05-Concepts/assets/isolated-kubernetes.drawio diff --git a/docs/docs/05-Concepts/assets/isolated-kubernetes.svg b/docs/05-Concepts/assets/isolated-kubernetes.svg similarity index 100% rename from docs/docs/05-Concepts/assets/isolated-kubernetes.svg rename to docs/05-Concepts/assets/isolated-kubernetes.svg diff --git a/docs/docs/05-Concepts/assets/metal-stack-architecture.drawio b/docs/05-Concepts/assets/metal-stack-architecture.drawio similarity index 100% rename from docs/docs/05-Concepts/assets/metal-stack-architecture.drawio rename to docs/05-Concepts/assets/metal-stack-architecture.drawio diff --git a/docs/docs/05-Concepts/assets/metal-stack-architecture.svg b/docs/05-Concepts/assets/metal-stack-architecture.svg similarity index 100% rename from docs/docs/05-Concepts/assets/metal-stack-architecture.svg rename to docs/05-Concepts/assets/metal-stack-architecture.svg diff --git a/docs/docs/05-Concepts/assets/metal-stack-control-plane.svg b/docs/05-Concepts/assets/metal-stack-control-plane.svg similarity index 100% rename from docs/docs/05-Concepts/assets/metal-stack-control-plane.svg rename to docs/05-Concepts/assets/metal-stack-control-plane.svg diff --git a/docs/docs/05-Concepts/assets/metal-stack-partition.svg b/docs/05-Concepts/assets/metal-stack-partition.svg similarity index 100% rename from docs/docs/05-Concepts/assets/metal-stack-partition.svg rename to docs/05-Concepts/assets/metal-stack-partition.svg diff --git a/docs/docs/05-Concepts/assets/network-physical-wiring.drawio b/docs/05-Concepts/assets/network-physical-wiring.drawio similarity index 100% rename from docs/docs/05-Concepts/assets/network-physical-wiring.drawio rename to docs/05-Concepts/assets/network-physical-wiring.drawio diff --git a/docs/docs/05-Concepts/assets/network-physical-wiring.svg b/docs/05-Concepts/assets/network-physical-wiring.svg similarity index 100% rename from 
docs/docs/05-Concepts/assets/network-physical-wiring.svg rename to docs/05-Concepts/assets/network-physical-wiring.svg diff --git a/docs/docs/05-Concepts/assets/network-vrfs.drawio b/docs/05-Concepts/assets/network-vrfs.drawio similarity index 100% rename from docs/docs/05-Concepts/assets/network-vrfs.drawio rename to docs/05-Concepts/assets/network-vrfs.drawio diff --git a/docs/docs/05-Concepts/assets/network-vrfs.svg b/docs/05-Concepts/assets/network-vrfs.svg similarity index 100% rename from docs/docs/05-Concepts/assets/network-vrfs.svg rename to docs/05-Concepts/assets/network-vrfs.svg diff --git a/docs/docs/05-Concepts/assets/provisioning_sequence.drawio b/docs/05-Concepts/assets/provisioning_sequence.drawio similarity index 100% rename from docs/docs/05-Concepts/assets/provisioning_sequence.drawio rename to docs/05-Concepts/assets/provisioning_sequence.drawio diff --git a/docs/docs/05-Concepts/assets/provisioning_sequence.svg b/docs/05-Concepts/assets/provisioning_sequence.svg similarity index 100% rename from docs/docs/05-Concepts/assets/provisioning_sequence.svg rename to docs/05-Concepts/assets/provisioning_sequence.svg diff --git a/docs/docs/05-Concepts/assets/vrf-simple.svg b/docs/05-Concepts/assets/vrf-simple.svg similarity index 100% rename from docs/docs/05-Concepts/assets/vrf-simple.svg rename to docs/05-Concepts/assets/vrf-simple.svg diff --git a/docs/docs/06-For CISOs/Security/01-principles.md b/docs/06-For CISOs/Security/01-principles.md similarity index 98% rename from docs/docs/06-For CISOs/Security/01-principles.md rename to docs/06-For CISOs/Security/01-principles.md index 155adfa0..652053e0 100644 --- a/docs/docs/06-For CISOs/Security/01-principles.md +++ b/docs/06-For CISOs/Security/01-principles.md @@ -15,7 +15,7 @@ The minimal need to know principle is a security concept that restricts access t ### RBAC :::info -As of now metal-stack does not implement fine-grained Role-Based Access Control (RBAC) within the `metal-api` but this is worked 
on in [MEP-4](../../../contributing/04-Proposals/MEP4/README.md). +As of now metal-stack does not implement fine-grained Role-Based Access Control (RBAC) within the `metal-api` but this is worked on in [MEP-4](/community/MEP-14-independence-from-external-sources). ::: As described in our [User Management](../../05-Concepts/02-user-management.md) concept the [metal-api](https://github.com/metal-stack/metal-api) currently offers three different user roles for authorization: diff --git a/docs/docs/06-For CISOs/Security/02-sbom.md b/docs/06-For CISOs/Security/02-sbom.md similarity index 100% rename from docs/docs/06-For CISOs/Security/02-sbom.md rename to docs/06-For CISOs/Security/02-sbom.md diff --git a/docs/docs/06-For CISOs/Security/03-cryptography.md b/docs/06-For CISOs/Security/03-cryptography.md similarity index 100% rename from docs/docs/06-For CISOs/Security/03-cryptography.md rename to docs/06-For CISOs/Security/03-cryptography.md diff --git a/docs/docs/06-For CISOs/Security/04-communication-matrix.md b/docs/06-For CISOs/Security/04-communication-matrix.md similarity index 99% rename from docs/docs/06-For CISOs/Security/04-communication-matrix.md rename to docs/06-For CISOs/Security/04-communication-matrix.md index c326b401..341a45be 100644 --- a/docs/docs/06-For CISOs/Security/04-communication-matrix.md +++ b/docs/06-For CISOs/Security/04-communication-matrix.md @@ -116,7 +116,7 @@ Please note that every [networking setup](../../05-Concepts/03-Network/01-theory | VLAN | Switches, Firewalls | Layer 2 traffic segmentation. | | VXLAN | Switches, Firewalls | Encapsulate Layer 2 frames in Layer 3 packets for network virtualization. | | EVPN | Switches, Firewalls | Overlay network technology for scalable and flexible network architectures. | -| VPN | Firewalls | Management access [without open SSH ports](../../../contributing/04-Proposals/MEP9/README.md). 
| +| VPN | Firewalls | Management access [without open SSH ports](..//community/MEP-9-no-open-ports-to-the-data-center). | | BGP | Multiple | Routing protocol for dynamic routing and network management. | | SSH | Management Server, Switches | Secure shell access for management and configuration. | | LLDP | Switches, Machines | Link Layer Discovery Protocol for network device discovery. | diff --git a/docs/docs/06-For CISOs/artifacts-signing.md b/docs/06-For CISOs/artifacts-signing.md similarity index 100% rename from docs/docs/06-For CISOs/artifacts-signing.md rename to docs/06-For CISOs/artifacts-signing.md diff --git a/docs/docs/06-For CISOs/integration-checks.md b/docs/06-For CISOs/integration-checks.md similarity index 100% rename from docs/docs/06-For CISOs/integration-checks.md rename to docs/06-For CISOs/integration-checks.md diff --git a/docs/docs/06-For CISOs/network.md b/docs/06-For CISOs/network.md similarity index 100% rename from docs/docs/06-For CISOs/network.md rename to docs/06-For CISOs/network.md diff --git a/docs/docs/06-For CISOs/rbac.md b/docs/06-For CISOs/rbac.md similarity index 90% rename from docs/docs/06-For CISOs/rbac.md rename to docs/06-For CISOs/rbac.md index 736c2b1f..617434aa 100644 --- a/docs/docs/06-For CISOs/rbac.md +++ b/docs/06-For CISOs/rbac.md @@ -31,4 +31,4 @@ To ensure that internal components interact securely with the metal-api, metal-s Users can interact with the metal-api using [metalctl](https://github.com/metal-stack/metalctl), the command-line interface provided by metal-stack. Depending on the required operations, users should authenticate with the appropriate role to match their level of access. -As part of [MEP-4](../../contributing/04-Proposals/MEP4/README.md), significant work is underway to introduce more fine-grained access control mechanisms within metal-stack, enhancing the precision and flexibility of permission management. 
+As part of [MEP-4](/community/MEP-14-independence-from-external-sources), significant work is underway to introduce more fine-grained access control mechanisms within metal-stack, enhancing the precision and flexibility of permission management. diff --git a/docs/docs/06-For CISOs/remote-access.md b/docs/06-For CISOs/remote-access.md similarity index 88% rename from docs/docs/06-For CISOs/remote-access.md rename to docs/06-For CISOs/remote-access.md index a7281722..9e8a7cf4 100644 --- a/docs/docs/06-For CISOs/remote-access.md +++ b/docs/06-For CISOs/remote-access.md @@ -6,7 +6,7 @@ title: Remote Access ## Machines and Firewalls -Remote access to machines and firewalls is essential for performing administrative tasks such as incident management, troubleshooting and sometimes for development. Standard SSH access is often insufficient for these purposes. In many cases, direct serial console access is required to fully manage the system. metal-stack follows a security-first approach by not offering direct SSH access to machines. This practice reduces the attack surface and prevents unauthorized access that could lead to system damage. Detailed information can be found in [MEP-9](../../contributing/04-Proposals/MEP9/README.md). Administrators can access machines in two primary ways. +Remote access to machines and firewalls is essential for performing administrative tasks such as incident management, troubleshooting and sometimes for development. Standard SSH access is often insufficient for these purposes. In many cases, direct serial console access is required to fully manage the system. metal-stack follows a security-first approach by not offering direct SSH access to machines. This practice reduces the attack surface and prevents unauthorized access that could lead to system damage. Detailed information can be found in [MEP-9](/community/MEP-9-no-open-ports-to-the-data-center). Administrators can access machines in two primary ways. 
**Out-of-band management via SOL** @@ -26,4 +26,4 @@ This approach uses the [`metal-console`](../08-References/Control%20Plane/metal- Both methods ensure secure and controlled access to machines without exposing them unnecessarily to the network, maintaining the integrity and safety of the infrastructure. -Connecting directly to a machine without a clear plan of action can have unintended consequences and negatively impact stability. For this reason, administrative privileges are required. This restriction ensures that only authorized personnel with the necessary expertise can perform actions that affect the underlying infrastructure. These principles will evolve with the introduction of [MEP-4](../../contributing/04-Proposals/MEP4/README.md). +Connecting directly to a machine without a clear plan of action can have unintended consequences and negatively impact stability. For this reason, administrative privileges are required. This restriction ensures that only authorized personnel with the necessary expertise can perform actions that affect the underlying infrastructure. These principles will evolve with the introduction of [MEP-4](/community/MEP-14-independence-from-external-sources). 
diff --git a/docs/docs/06-For CISOs/security-vulnerability.md b/docs/06-For CISOs/security-vulnerability.md similarity index 100% rename from docs/docs/06-For CISOs/security-vulnerability.md rename to docs/06-For CISOs/security-vulnerability.md diff --git a/docs/docs/07-Release Notes/v0.18/v0.18.10.md b/docs/07-Release Notes/v0.18/v0.18.10.md similarity index 100% rename from docs/docs/07-Release Notes/v0.18/v0.18.10.md rename to docs/07-Release Notes/v0.18/v0.18.10.md diff --git a/docs/docs/07-Release Notes/v0.18/v0.18.11.md b/docs/07-Release Notes/v0.18/v0.18.11.md similarity index 100% rename from docs/docs/07-Release Notes/v0.18/v0.18.11.md rename to docs/07-Release Notes/v0.18/v0.18.11.md diff --git a/docs/docs/07-Release Notes/v0.18/v0.18.12.md b/docs/07-Release Notes/v0.18/v0.18.12.md similarity index 100% rename from docs/docs/07-Release Notes/v0.18/v0.18.12.md rename to docs/07-Release Notes/v0.18/v0.18.12.md diff --git a/docs/docs/07-Release Notes/v0.18/v0.18.13.md b/docs/07-Release Notes/v0.18/v0.18.13.md similarity index 100% rename from docs/docs/07-Release Notes/v0.18/v0.18.13.md rename to docs/07-Release Notes/v0.18/v0.18.13.md diff --git a/docs/docs/07-Release Notes/v0.18/v0.18.14.md b/docs/07-Release Notes/v0.18/v0.18.14.md similarity index 100% rename from docs/docs/07-Release Notes/v0.18/v0.18.14.md rename to docs/07-Release Notes/v0.18/v0.18.14.md diff --git a/docs/docs/07-Release Notes/v0.18/v0.18.15.md b/docs/07-Release Notes/v0.18/v0.18.15.md similarity index 100% rename from docs/docs/07-Release Notes/v0.18/v0.18.15.md rename to docs/07-Release Notes/v0.18/v0.18.15.md diff --git a/docs/docs/07-Release Notes/v0.18/v0.18.16.md b/docs/07-Release Notes/v0.18/v0.18.16.md similarity index 100% rename from docs/docs/07-Release Notes/v0.18/v0.18.16.md rename to docs/07-Release Notes/v0.18/v0.18.16.md diff --git a/docs/docs/07-Release Notes/v0.18/v0.18.17.md b/docs/07-Release Notes/v0.18/v0.18.17.md similarity index 100% rename from 
docs/docs/07-Release Notes/v0.18/v0.18.17.md rename to docs/07-Release Notes/v0.18/v0.18.17.md diff --git a/docs/docs/07-Release Notes/v0.18/v0.18.18.md b/docs/07-Release Notes/v0.18/v0.18.18.md similarity index 100% rename from docs/docs/07-Release Notes/v0.18/v0.18.18.md rename to docs/07-Release Notes/v0.18/v0.18.18.md diff --git a/docs/docs/07-Release Notes/v0.19/v0.19.0.md b/docs/07-Release Notes/v0.19/v0.19.0.md similarity index 100% rename from docs/docs/07-Release Notes/v0.19/v0.19.0.md rename to docs/07-Release Notes/v0.19/v0.19.0.md diff --git a/docs/docs/07-Release Notes/v0.19/v0.19.1.md b/docs/07-Release Notes/v0.19/v0.19.1.md similarity index 100% rename from docs/docs/07-Release Notes/v0.19/v0.19.1.md rename to docs/07-Release Notes/v0.19/v0.19.1.md diff --git a/docs/docs/07-Release Notes/v0.19/v0.19.2.md b/docs/07-Release Notes/v0.19/v0.19.2.md similarity index 100% rename from docs/docs/07-Release Notes/v0.19/v0.19.2.md rename to docs/07-Release Notes/v0.19/v0.19.2.md diff --git a/docs/docs/07-Release Notes/v0.19/v0.19.3.md b/docs/07-Release Notes/v0.19/v0.19.3.md similarity index 100% rename from docs/docs/07-Release Notes/v0.19/v0.19.3.md rename to docs/07-Release Notes/v0.19/v0.19.3.md diff --git a/docs/docs/07-Release Notes/v0.19/v0.19.4.md b/docs/07-Release Notes/v0.19/v0.19.4.md similarity index 100% rename from docs/docs/07-Release Notes/v0.19/v0.19.4.md rename to docs/07-Release Notes/v0.19/v0.19.4.md diff --git a/docs/docs/07-Release Notes/v0.19/v0.19.5.md b/docs/07-Release Notes/v0.19/v0.19.5.md similarity index 100% rename from docs/docs/07-Release Notes/v0.19/v0.19.5.md rename to docs/07-Release Notes/v0.19/v0.19.5.md diff --git a/docs/docs/07-Release Notes/v0.19/v0.19.6.md b/docs/07-Release Notes/v0.19/v0.19.6.md similarity index 100% rename from docs/docs/07-Release Notes/v0.19/v0.19.6.md rename to docs/07-Release Notes/v0.19/v0.19.6.md diff --git a/docs/docs/07-Release Notes/v0.19/v0.19.7.md b/docs/07-Release Notes/v0.19/v0.19.7.md 
similarity index 100% rename from docs/docs/07-Release Notes/v0.19/v0.19.7.md rename to docs/07-Release Notes/v0.19/v0.19.7.md diff --git a/docs/docs/07-Release Notes/v0.19/v0.19.8.md b/docs/07-Release Notes/v0.19/v0.19.8.md similarity index 100% rename from docs/docs/07-Release Notes/v0.19/v0.19.8.md rename to docs/07-Release Notes/v0.19/v0.19.8.md diff --git a/docs/docs/07-Release Notes/v0.20/v0.20.0.md b/docs/07-Release Notes/v0.20/v0.20.0.md similarity index 100% rename from docs/docs/07-Release Notes/v0.20/v0.20.0.md rename to docs/07-Release Notes/v0.20/v0.20.0.md diff --git a/docs/docs/07-Release Notes/v0.20/v0.20.1.md b/docs/07-Release Notes/v0.20/v0.20.1.md similarity index 100% rename from docs/docs/07-Release Notes/v0.20/v0.20.1.md rename to docs/07-Release Notes/v0.20/v0.20.1.md diff --git a/docs/docs/07-Release Notes/v0.20/v0.20.2.md b/docs/07-Release Notes/v0.20/v0.20.2.md similarity index 100% rename from docs/docs/07-Release Notes/v0.20/v0.20.2.md rename to docs/07-Release Notes/v0.20/v0.20.2.md diff --git a/docs/docs/07-Release Notes/v0.21/v0.21.0.md b/docs/07-Release Notes/v0.21/v0.21.0.md similarity index 100% rename from docs/docs/07-Release Notes/v0.21/v0.21.0.md rename to docs/07-Release Notes/v0.21/v0.21.0.md diff --git a/docs/docs/07-Release Notes/v0.21/v0.21.1.md b/docs/07-Release Notes/v0.21/v0.21.1.md similarity index 90% rename from docs/docs/07-Release Notes/v0.21/v0.21.1.md rename to docs/07-Release Notes/v0.21/v0.21.1.md index a2f36d47..eef201b8 100644 --- a/docs/docs/07-Release Notes/v0.21/v0.21.1.md +++ b/docs/07-Release Notes/v0.21/v0.21.1.md @@ -17,7 +17,7 @@ See original release note at [https://github.com/metal-stack/releases/releases/t ``` * [Gardener v1.110](https://github.com/gardener/gardener/releases/tag/v1.110.0) ## Noteworthy -* As part of the [MEP-4](https://docs.metal-stack.io/stable/development/proposals/MEP4/README/) implementation, it is now possible to deploy a preview version of the 
[metal-apiserver](https://github.com/metal-stack/metal-apiserver). Note that this is only a development preview and will undergo a lot of breaking changes in the next time, so do not deploy this for any production use cases yet. (metal-stack/metal-roles#391) +* As part of the [MEP-4](/community/MEP-14-independence-from-external-sources) implementation, it is now possible to deploy a preview version of the [metal-apiserver](https://github.com/metal-stack/metal-apiserver). Note that this is only a development preview and will undergo a lot of breaking changes in the next time, so do not deploy this for any production use cases yet. (metal-stack/metal-roles#391) ## Breaking Changes * The support for meilisearch as an audit backend was dropped. Please migrate to the TimescaleDB backend if you depend on this implementation of meilisearch support. (metal-stack/metal-lib#174) ## Component Releases @@ -73,4 +73,4 @@ The fact that these pull requests were merged does not necessarily imply that th * Add CODEOWNERS and code contribution guidelines. 
(metal-stack/sonic-configdb-utils#3) @Gerrit91 * Try pushing to ghcr.io (metal-stack/metal-images#274) @majst01 * fix(ci): forgot registry login (metal-stack/metal-images#292) @vknabel -* Next release (metal-stack/releases#222) @metal-robot[bot] \ No newline at end of file +* Next release (metal-stack/releases#222) @metal-robot[bot] diff --git a/docs/docs/07-Release Notes/v0.21/v0.21.10.md b/docs/07-Release Notes/v0.21/v0.21.10.md similarity index 100% rename from docs/docs/07-Release Notes/v0.21/v0.21.10.md rename to docs/07-Release Notes/v0.21/v0.21.10.md diff --git a/docs/docs/07-Release Notes/v0.21/v0.21.11.md b/docs/07-Release Notes/v0.21/v0.21.11.md similarity index 100% rename from docs/docs/07-Release Notes/v0.21/v0.21.11.md rename to docs/07-Release Notes/v0.21/v0.21.11.md diff --git a/docs/docs/07-Release Notes/v0.21/v0.21.2.md b/docs/07-Release Notes/v0.21/v0.21.2.md similarity index 100% rename from docs/docs/07-Release Notes/v0.21/v0.21.2.md rename to docs/07-Release Notes/v0.21/v0.21.2.md diff --git a/docs/docs/07-Release Notes/v0.21/v0.21.3.md b/docs/07-Release Notes/v0.21/v0.21.3.md similarity index 100% rename from docs/docs/07-Release Notes/v0.21/v0.21.3.md rename to docs/07-Release Notes/v0.21/v0.21.3.md diff --git a/docs/docs/07-Release Notes/v0.21/v0.21.4.md b/docs/07-Release Notes/v0.21/v0.21.4.md similarity index 100% rename from docs/docs/07-Release Notes/v0.21/v0.21.4.md rename to docs/07-Release Notes/v0.21/v0.21.4.md diff --git a/docs/docs/07-Release Notes/v0.21/v0.21.5.md b/docs/07-Release Notes/v0.21/v0.21.5.md similarity index 100% rename from docs/docs/07-Release Notes/v0.21/v0.21.5.md rename to docs/07-Release Notes/v0.21/v0.21.5.md diff --git a/docs/docs/07-Release Notes/v0.21/v0.21.6.md b/docs/07-Release Notes/v0.21/v0.21.6.md similarity index 100% rename from docs/docs/07-Release Notes/v0.21/v0.21.6.md rename to docs/07-Release Notes/v0.21/v0.21.6.md diff --git a/docs/docs/07-Release Notes/v0.21/v0.21.7.md b/docs/07-Release 
Notes/v0.21/v0.21.7.md similarity index 100% rename from docs/docs/07-Release Notes/v0.21/v0.21.7.md rename to docs/07-Release Notes/v0.21/v0.21.7.md diff --git a/docs/docs/07-Release Notes/v0.21/v0.21.8.md b/docs/07-Release Notes/v0.21/v0.21.8.md similarity index 100% rename from docs/docs/07-Release Notes/v0.21/v0.21.8.md rename to docs/07-Release Notes/v0.21/v0.21.8.md diff --git a/docs/docs/07-Release Notes/v0.21/v0.21.9.md b/docs/07-Release Notes/v0.21/v0.21.9.md similarity index 100% rename from docs/docs/07-Release Notes/v0.21/v0.21.9.md rename to docs/07-Release Notes/v0.21/v0.21.9.md diff --git a/docs/docs/07-Release Notes/v0.22/v0.22.0.md b/docs/07-Release Notes/v0.22/v0.22.0.md similarity index 100% rename from docs/docs/07-Release Notes/v0.22/v0.22.0.md rename to docs/07-Release Notes/v0.22/v0.22.0.md diff --git a/docs/docs/07-Release Notes/v0.22/v0.22.1.md b/docs/07-Release Notes/v0.22/v0.22.1.md similarity index 100% rename from docs/docs/07-Release Notes/v0.22/v0.22.1.md rename to docs/07-Release Notes/v0.22/v0.22.1.md diff --git a/docs/docs/07-Release Notes/v0.22/v0.22.2.md b/docs/07-Release Notes/v0.22/v0.22.2.md similarity index 100% rename from docs/docs/07-Release Notes/v0.22/v0.22.2.md rename to docs/07-Release Notes/v0.22/v0.22.2.md diff --git a/docs/docs/07-Release Notes/v0.22/v0.22.3.md b/docs/07-Release Notes/v0.22/v0.22.3.md similarity index 100% rename from docs/docs/07-Release Notes/v0.22/v0.22.3.md rename to docs/07-Release Notes/v0.22/v0.22.3.md diff --git a/docs/docs/07-Release Notes/v0.22/v0.22.4.md b/docs/07-Release Notes/v0.22/v0.22.4.md similarity index 100% rename from docs/docs/07-Release Notes/v0.22/v0.22.4.md rename to docs/07-Release Notes/v0.22/v0.22.4.md diff --git a/docs/docs/08-References/API/index.mdx b/docs/08-References/API/index.mdx similarity index 100% rename from docs/docs/08-References/API/index.mdx rename to docs/08-References/API/index.mdx diff --git a/docs/docs/08-References/Clients/metalctl/metalctl.md 
b/docs/08-References/Clients/metalctl/metalctl.md similarity index 100% rename from docs/docs/08-References/Clients/metalctl/metalctl.md rename to docs/08-References/Clients/metalctl/metalctl.md diff --git a/docs/docs/08-References/Clients/metalctl/metalctl_audit.md b/docs/08-References/Clients/metalctl/metalctl_audit.md similarity index 100% rename from docs/docs/08-References/Clients/metalctl/metalctl_audit.md rename to docs/08-References/Clients/metalctl/metalctl_audit.md diff --git a/docs/docs/08-References/Clients/metalctl/metalctl_audit_describe.md b/docs/08-References/Clients/metalctl/metalctl_audit_describe.md similarity index 100% rename from docs/docs/08-References/Clients/metalctl/metalctl_audit_describe.md rename to docs/08-References/Clients/metalctl/metalctl_audit_describe.md diff --git a/docs/docs/08-References/Clients/metalctl/metalctl_audit_list.md b/docs/08-References/Clients/metalctl/metalctl_audit_list.md similarity index 100% rename from docs/docs/08-References/Clients/metalctl/metalctl_audit_list.md rename to docs/08-References/Clients/metalctl/metalctl_audit_list.md diff --git a/docs/docs/08-References/Clients/metalctl/metalctl_completion.md b/docs/08-References/Clients/metalctl/metalctl_completion.md similarity index 100% rename from docs/docs/08-References/Clients/metalctl/metalctl_completion.md rename to docs/08-References/Clients/metalctl/metalctl_completion.md diff --git a/docs/docs/08-References/Clients/metalctl/metalctl_completion_bash.md b/docs/08-References/Clients/metalctl/metalctl_completion_bash.md similarity index 100% rename from docs/docs/08-References/Clients/metalctl/metalctl_completion_bash.md rename to docs/08-References/Clients/metalctl/metalctl_completion_bash.md diff --git a/docs/docs/08-References/Clients/metalctl/metalctl_completion_fish.md b/docs/08-References/Clients/metalctl/metalctl_completion_fish.md similarity index 100% rename from docs/docs/08-References/Clients/metalctl/metalctl_completion_fish.md rename to 
docs/08-References/Clients/metalctl/metalctl_completion_fish.md diff --git a/docs/docs/08-References/Clients/metalctl/metalctl_completion_powershell.md b/docs/08-References/Clients/metalctl/metalctl_completion_powershell.md similarity index 100% rename from docs/docs/08-References/Clients/metalctl/metalctl_completion_powershell.md rename to docs/08-References/Clients/metalctl/metalctl_completion_powershell.md diff --git a/docs/docs/08-References/Clients/metalctl/metalctl_completion_zsh.md b/docs/08-References/Clients/metalctl/metalctl_completion_zsh.md similarity index 100% rename from docs/docs/08-References/Clients/metalctl/metalctl_completion_zsh.md rename to docs/08-References/Clients/metalctl/metalctl_completion_zsh.md diff --git a/docs/docs/08-References/Clients/metalctl/metalctl_context.md b/docs/08-References/Clients/metalctl/metalctl_context.md similarity index 100% rename from docs/docs/08-References/Clients/metalctl/metalctl_context.md rename to docs/08-References/Clients/metalctl/metalctl_context.md diff --git a/docs/docs/08-References/Clients/metalctl/metalctl_context_short.md b/docs/08-References/Clients/metalctl/metalctl_context_short.md similarity index 100% rename from docs/docs/08-References/Clients/metalctl/metalctl_context_short.md rename to docs/08-References/Clients/metalctl/metalctl_context_short.md diff --git a/docs/docs/08-References/Clients/metalctl/metalctl_filesystemlayout.md b/docs/08-References/Clients/metalctl/metalctl_filesystemlayout.md similarity index 100% rename from docs/docs/08-References/Clients/metalctl/metalctl_filesystemlayout.md rename to docs/08-References/Clients/metalctl/metalctl_filesystemlayout.md diff --git a/docs/docs/08-References/Clients/metalctl/metalctl_filesystemlayout_apply.md b/docs/08-References/Clients/metalctl/metalctl_filesystemlayout_apply.md similarity index 100% rename from docs/docs/08-References/Clients/metalctl/metalctl_filesystemlayout_apply.md rename to 
docs/08-References/Clients/metalctl/metalctl_filesystemlayout_apply.md diff --git a/docs/docs/08-References/Clients/metalctl/metalctl_filesystemlayout_create.md b/docs/08-References/Clients/metalctl/metalctl_filesystemlayout_create.md similarity index 100% rename from docs/docs/08-References/Clients/metalctl/metalctl_filesystemlayout_create.md rename to docs/08-References/Clients/metalctl/metalctl_filesystemlayout_create.md diff --git a/docs/docs/08-References/Clients/metalctl/metalctl_filesystemlayout_delete.md b/docs/08-References/Clients/metalctl/metalctl_filesystemlayout_delete.md similarity index 100% rename from docs/docs/08-References/Clients/metalctl/metalctl_filesystemlayout_delete.md rename to docs/08-References/Clients/metalctl/metalctl_filesystemlayout_delete.md diff --git a/docs/docs/08-References/Clients/metalctl/metalctl_filesystemlayout_describe.md b/docs/08-References/Clients/metalctl/metalctl_filesystemlayout_describe.md similarity index 100% rename from docs/docs/08-References/Clients/metalctl/metalctl_filesystemlayout_describe.md rename to docs/08-References/Clients/metalctl/metalctl_filesystemlayout_describe.md diff --git a/docs/docs/08-References/Clients/metalctl/metalctl_filesystemlayout_edit.md b/docs/08-References/Clients/metalctl/metalctl_filesystemlayout_edit.md similarity index 100% rename from docs/docs/08-References/Clients/metalctl/metalctl_filesystemlayout_edit.md rename to docs/08-References/Clients/metalctl/metalctl_filesystemlayout_edit.md diff --git a/docs/docs/08-References/Clients/metalctl/metalctl_filesystemlayout_list.md b/docs/08-References/Clients/metalctl/metalctl_filesystemlayout_list.md similarity index 100% rename from docs/docs/08-References/Clients/metalctl/metalctl_filesystemlayout_list.md rename to docs/08-References/Clients/metalctl/metalctl_filesystemlayout_list.md diff --git a/docs/docs/08-References/Clients/metalctl/metalctl_filesystemlayout_match.md 
b/docs/08-References/Clients/metalctl/metalctl_filesystemlayout_match.md similarity index 100% rename from docs/docs/08-References/Clients/metalctl/metalctl_filesystemlayout_match.md rename to docs/08-References/Clients/metalctl/metalctl_filesystemlayout_match.md diff --git a/docs/docs/08-References/Clients/metalctl/metalctl_filesystemlayout_try.md b/docs/08-References/Clients/metalctl/metalctl_filesystemlayout_try.md similarity index 100% rename from docs/docs/08-References/Clients/metalctl/metalctl_filesystemlayout_try.md rename to docs/08-References/Clients/metalctl/metalctl_filesystemlayout_try.md diff --git a/docs/docs/08-References/Clients/metalctl/metalctl_filesystemlayout_update.md b/docs/08-References/Clients/metalctl/metalctl_filesystemlayout_update.md similarity index 100% rename from docs/docs/08-References/Clients/metalctl/metalctl_filesystemlayout_update.md rename to docs/08-References/Clients/metalctl/metalctl_filesystemlayout_update.md diff --git a/docs/docs/08-References/Clients/metalctl/metalctl_firewall.md b/docs/08-References/Clients/metalctl/metalctl_firewall.md similarity index 100% rename from docs/docs/08-References/Clients/metalctl/metalctl_firewall.md rename to docs/08-References/Clients/metalctl/metalctl_firewall.md diff --git a/docs/docs/08-References/Clients/metalctl/metalctl_firewall_create.md b/docs/08-References/Clients/metalctl/metalctl_firewall_create.md similarity index 100% rename from docs/docs/08-References/Clients/metalctl/metalctl_firewall_create.md rename to docs/08-References/Clients/metalctl/metalctl_firewall_create.md diff --git a/docs/docs/08-References/Clients/metalctl/metalctl_firewall_describe.md b/docs/08-References/Clients/metalctl/metalctl_firewall_describe.md similarity index 100% rename from docs/docs/08-References/Clients/metalctl/metalctl_firewall_describe.md rename to docs/08-References/Clients/metalctl/metalctl_firewall_describe.md diff --git 
a/docs/docs/08-References/Clients/metalctl/metalctl_firewall_list.md b/docs/08-References/Clients/metalctl/metalctl_firewall_list.md similarity index 100% rename from docs/docs/08-References/Clients/metalctl/metalctl_firewall_list.md rename to docs/08-References/Clients/metalctl/metalctl_firewall_list.md diff --git a/docs/docs/08-References/Clients/metalctl/metalctl_firewall_ssh.md b/docs/08-References/Clients/metalctl/metalctl_firewall_ssh.md similarity index 100% rename from docs/docs/08-References/Clients/metalctl/metalctl_firewall_ssh.md rename to docs/08-References/Clients/metalctl/metalctl_firewall_ssh.md diff --git a/docs/docs/08-References/Clients/metalctl/metalctl_firmware.md b/docs/08-References/Clients/metalctl/metalctl_firmware.md similarity index 100% rename from docs/docs/08-References/Clients/metalctl/metalctl_firmware.md rename to docs/08-References/Clients/metalctl/metalctl_firmware.md diff --git a/docs/docs/08-References/Clients/metalctl/metalctl_firmware_delete.md b/docs/08-References/Clients/metalctl/metalctl_firmware_delete.md similarity index 100% rename from docs/docs/08-References/Clients/metalctl/metalctl_firmware_delete.md rename to docs/08-References/Clients/metalctl/metalctl_firmware_delete.md diff --git a/docs/docs/08-References/Clients/metalctl/metalctl_firmware_list.md b/docs/08-References/Clients/metalctl/metalctl_firmware_list.md similarity index 100% rename from docs/docs/08-References/Clients/metalctl/metalctl_firmware_list.md rename to docs/08-References/Clients/metalctl/metalctl_firmware_list.md diff --git a/docs/docs/08-References/Clients/metalctl/metalctl_firmware_upload.md b/docs/08-References/Clients/metalctl/metalctl_firmware_upload.md similarity index 100% rename from docs/docs/08-References/Clients/metalctl/metalctl_firmware_upload.md rename to docs/08-References/Clients/metalctl/metalctl_firmware_upload.md diff --git a/docs/docs/08-References/Clients/metalctl/metalctl_firmware_upload_bios.md 
b/docs/08-References/Clients/metalctl/metalctl_firmware_upload_bios.md similarity index 100% rename from docs/docs/08-References/Clients/metalctl/metalctl_firmware_upload_bios.md rename to docs/08-References/Clients/metalctl/metalctl_firmware_upload_bios.md diff --git a/docs/docs/08-References/Clients/metalctl/metalctl_firmware_upload_bmc.md b/docs/08-References/Clients/metalctl/metalctl_firmware_upload_bmc.md similarity index 100% rename from docs/docs/08-References/Clients/metalctl/metalctl_firmware_upload_bmc.md rename to docs/08-References/Clients/metalctl/metalctl_firmware_upload_bmc.md diff --git a/docs/docs/08-References/Clients/metalctl/metalctl_health.md b/docs/08-References/Clients/metalctl/metalctl_health.md similarity index 100% rename from docs/docs/08-References/Clients/metalctl/metalctl_health.md rename to docs/08-References/Clients/metalctl/metalctl_health.md diff --git a/docs/docs/08-References/Clients/metalctl/metalctl_image.md b/docs/08-References/Clients/metalctl/metalctl_image.md similarity index 100% rename from docs/docs/08-References/Clients/metalctl/metalctl_image.md rename to docs/08-References/Clients/metalctl/metalctl_image.md diff --git a/docs/docs/08-References/Clients/metalctl/metalctl_image_apply.md b/docs/08-References/Clients/metalctl/metalctl_image_apply.md similarity index 100% rename from docs/docs/08-References/Clients/metalctl/metalctl_image_apply.md rename to docs/08-References/Clients/metalctl/metalctl_image_apply.md diff --git a/docs/docs/08-References/Clients/metalctl/metalctl_image_create.md b/docs/08-References/Clients/metalctl/metalctl_image_create.md similarity index 100% rename from docs/docs/08-References/Clients/metalctl/metalctl_image_create.md rename to docs/08-References/Clients/metalctl/metalctl_image_create.md diff --git a/docs/docs/08-References/Clients/metalctl/metalctl_image_delete.md b/docs/08-References/Clients/metalctl/metalctl_image_delete.md similarity index 100% rename from 
docs/docs/08-References/Clients/metalctl/metalctl_image_delete.md rename to docs/08-References/Clients/metalctl/metalctl_image_delete.md diff --git a/docs/docs/08-References/Clients/metalctl/metalctl_image_describe.md b/docs/08-References/Clients/metalctl/metalctl_image_describe.md similarity index 100% rename from docs/docs/08-References/Clients/metalctl/metalctl_image_describe.md rename to docs/08-References/Clients/metalctl/metalctl_image_describe.md diff --git a/docs/docs/08-References/Clients/metalctl/metalctl_image_edit.md b/docs/08-References/Clients/metalctl/metalctl_image_edit.md similarity index 100% rename from docs/docs/08-References/Clients/metalctl/metalctl_image_edit.md rename to docs/08-References/Clients/metalctl/metalctl_image_edit.md diff --git a/docs/docs/08-References/Clients/metalctl/metalctl_image_list.md b/docs/08-References/Clients/metalctl/metalctl_image_list.md similarity index 100% rename from docs/docs/08-References/Clients/metalctl/metalctl_image_list.md rename to docs/08-References/Clients/metalctl/metalctl_image_list.md diff --git a/docs/docs/08-References/Clients/metalctl/metalctl_image_update.md b/docs/08-References/Clients/metalctl/metalctl_image_update.md similarity index 100% rename from docs/docs/08-References/Clients/metalctl/metalctl_image_update.md rename to docs/08-References/Clients/metalctl/metalctl_image_update.md diff --git a/docs/docs/08-References/Clients/metalctl/metalctl_login.md b/docs/08-References/Clients/metalctl/metalctl_login.md similarity index 100% rename from docs/docs/08-References/Clients/metalctl/metalctl_login.md rename to docs/08-References/Clients/metalctl/metalctl_login.md diff --git a/docs/docs/08-References/Clients/metalctl/metalctl_logout.md b/docs/08-References/Clients/metalctl/metalctl_logout.md similarity index 100% rename from docs/docs/08-References/Clients/metalctl/metalctl_logout.md rename to docs/08-References/Clients/metalctl/metalctl_logout.md diff --git 
a/docs/docs/08-References/Clients/metalctl/metalctl_machine.md b/docs/08-References/Clients/metalctl/metalctl_machine.md similarity index 100% rename from docs/docs/08-References/Clients/metalctl/metalctl_machine.md rename to docs/08-References/Clients/metalctl/metalctl_machine.md diff --git a/docs/docs/08-References/Clients/metalctl/metalctl_machine_apply.md b/docs/08-References/Clients/metalctl/metalctl_machine_apply.md similarity index 100% rename from docs/docs/08-References/Clients/metalctl/metalctl_machine_apply.md rename to docs/08-References/Clients/metalctl/metalctl_machine_apply.md diff --git a/docs/docs/08-References/Clients/metalctl/metalctl_machine_console.md b/docs/08-References/Clients/metalctl/metalctl_machine_console.md similarity index 100% rename from docs/docs/08-References/Clients/metalctl/metalctl_machine_console.md rename to docs/08-References/Clients/metalctl/metalctl_machine_console.md diff --git a/docs/docs/08-References/Clients/metalctl/metalctl_machine_consolepassword.md b/docs/08-References/Clients/metalctl/metalctl_machine_consolepassword.md similarity index 100% rename from docs/docs/08-References/Clients/metalctl/metalctl_machine_consolepassword.md rename to docs/08-References/Clients/metalctl/metalctl_machine_consolepassword.md diff --git a/docs/docs/08-References/Clients/metalctl/metalctl_machine_create.md b/docs/08-References/Clients/metalctl/metalctl_machine_create.md similarity index 100% rename from docs/docs/08-References/Clients/metalctl/metalctl_machine_create.md rename to docs/08-References/Clients/metalctl/metalctl_machine_create.md diff --git a/docs/docs/08-References/Clients/metalctl/metalctl_machine_delete.md b/docs/08-References/Clients/metalctl/metalctl_machine_delete.md similarity index 100% rename from docs/docs/08-References/Clients/metalctl/metalctl_machine_delete.md rename to docs/08-References/Clients/metalctl/metalctl_machine_delete.md diff --git 
a/docs/docs/08-References/Clients/metalctl/metalctl_machine_describe.md b/docs/08-References/Clients/metalctl/metalctl_machine_describe.md similarity index 100% rename from docs/docs/08-References/Clients/metalctl/metalctl_machine_describe.md rename to docs/08-References/Clients/metalctl/metalctl_machine_describe.md diff --git a/docs/docs/08-References/Clients/metalctl/metalctl_machine_edit.md b/docs/08-References/Clients/metalctl/metalctl_machine_edit.md similarity index 100% rename from docs/docs/08-References/Clients/metalctl/metalctl_machine_edit.md rename to docs/08-References/Clients/metalctl/metalctl_machine_edit.md diff --git a/docs/docs/08-References/Clients/metalctl/metalctl_machine_identify.md b/docs/08-References/Clients/metalctl/metalctl_machine_identify.md similarity index 100% rename from docs/docs/08-References/Clients/metalctl/metalctl_machine_identify.md rename to docs/08-References/Clients/metalctl/metalctl_machine_identify.md diff --git a/docs/docs/08-References/Clients/metalctl/metalctl_machine_identify_off.md b/docs/08-References/Clients/metalctl/metalctl_machine_identify_off.md similarity index 100% rename from docs/docs/08-References/Clients/metalctl/metalctl_machine_identify_off.md rename to docs/08-References/Clients/metalctl/metalctl_machine_identify_off.md diff --git a/docs/docs/08-References/Clients/metalctl/metalctl_machine_identify_on.md b/docs/08-References/Clients/metalctl/metalctl_machine_identify_on.md similarity index 100% rename from docs/docs/08-References/Clients/metalctl/metalctl_machine_identify_on.md rename to docs/08-References/Clients/metalctl/metalctl_machine_identify_on.md diff --git a/docs/docs/08-References/Clients/metalctl/metalctl_machine_ipmi.md b/docs/08-References/Clients/metalctl/metalctl_machine_ipmi.md similarity index 100% rename from docs/docs/08-References/Clients/metalctl/metalctl_machine_ipmi.md rename to docs/08-References/Clients/metalctl/metalctl_machine_ipmi.md diff --git 
a/docs/docs/08-References/Clients/metalctl/metalctl_machine_ipmi_events.md b/docs/08-References/Clients/metalctl/metalctl_machine_ipmi_events.md similarity index 100% rename from docs/docs/08-References/Clients/metalctl/metalctl_machine_ipmi_events.md rename to docs/08-References/Clients/metalctl/metalctl_machine_ipmi_events.md diff --git a/docs/docs/08-References/Clients/metalctl/metalctl_machine_issues.md b/docs/08-References/Clients/metalctl/metalctl_machine_issues.md similarity index 100% rename from docs/docs/08-References/Clients/metalctl/metalctl_machine_issues.md rename to docs/08-References/Clients/metalctl/metalctl_machine_issues.md diff --git a/docs/docs/08-References/Clients/metalctl/metalctl_machine_issues_list.md b/docs/08-References/Clients/metalctl/metalctl_machine_issues_list.md similarity index 100% rename from docs/docs/08-References/Clients/metalctl/metalctl_machine_issues_list.md rename to docs/08-References/Clients/metalctl/metalctl_machine_issues_list.md diff --git a/docs/docs/08-References/Clients/metalctl/metalctl_machine_list.md b/docs/08-References/Clients/metalctl/metalctl_machine_list.md similarity index 100% rename from docs/docs/08-References/Clients/metalctl/metalctl_machine_list.md rename to docs/08-References/Clients/metalctl/metalctl_machine_list.md diff --git a/docs/docs/08-References/Clients/metalctl/metalctl_machine_lock.md b/docs/08-References/Clients/metalctl/metalctl_machine_lock.md similarity index 100% rename from docs/docs/08-References/Clients/metalctl/metalctl_machine_lock.md rename to docs/08-References/Clients/metalctl/metalctl_machine_lock.md diff --git a/docs/docs/08-References/Clients/metalctl/metalctl_machine_logs.md b/docs/08-References/Clients/metalctl/metalctl_machine_logs.md similarity index 100% rename from docs/docs/08-References/Clients/metalctl/metalctl_machine_logs.md rename to docs/08-References/Clients/metalctl/metalctl_machine_logs.md diff --git 
a/docs/docs/08-References/Clients/metalctl/metalctl_machine_power.md b/docs/08-References/Clients/metalctl/metalctl_machine_power.md similarity index 100% rename from docs/docs/08-References/Clients/metalctl/metalctl_machine_power.md rename to docs/08-References/Clients/metalctl/metalctl_machine_power.md diff --git a/docs/docs/08-References/Clients/metalctl/metalctl_machine_power_bios.md b/docs/08-References/Clients/metalctl/metalctl_machine_power_bios.md similarity index 100% rename from docs/docs/08-References/Clients/metalctl/metalctl_machine_power_bios.md rename to docs/08-References/Clients/metalctl/metalctl_machine_power_bios.md diff --git a/docs/docs/08-References/Clients/metalctl/metalctl_machine_power_cycle.md b/docs/08-References/Clients/metalctl/metalctl_machine_power_cycle.md similarity index 100% rename from docs/docs/08-References/Clients/metalctl/metalctl_machine_power_cycle.md rename to docs/08-References/Clients/metalctl/metalctl_machine_power_cycle.md diff --git a/docs/docs/08-References/Clients/metalctl/metalctl_machine_power_disk.md b/docs/08-References/Clients/metalctl/metalctl_machine_power_disk.md similarity index 100% rename from docs/docs/08-References/Clients/metalctl/metalctl_machine_power_disk.md rename to docs/08-References/Clients/metalctl/metalctl_machine_power_disk.md diff --git a/docs/docs/08-References/Clients/metalctl/metalctl_machine_power_off.md b/docs/08-References/Clients/metalctl/metalctl_machine_power_off.md similarity index 100% rename from docs/docs/08-References/Clients/metalctl/metalctl_machine_power_off.md rename to docs/08-References/Clients/metalctl/metalctl_machine_power_off.md diff --git a/docs/docs/08-References/Clients/metalctl/metalctl_machine_power_on.md b/docs/08-References/Clients/metalctl/metalctl_machine_power_on.md similarity index 100% rename from docs/docs/08-References/Clients/metalctl/metalctl_machine_power_on.md rename to docs/08-References/Clients/metalctl/metalctl_machine_power_on.md diff --git 
a/docs/docs/08-References/Clients/metalctl/metalctl_machine_power_pxe.md b/docs/08-References/Clients/metalctl/metalctl_machine_power_pxe.md similarity index 100% rename from docs/docs/08-References/Clients/metalctl/metalctl_machine_power_pxe.md rename to docs/08-References/Clients/metalctl/metalctl_machine_power_pxe.md diff --git a/docs/docs/08-References/Clients/metalctl/metalctl_machine_power_reset.md b/docs/08-References/Clients/metalctl/metalctl_machine_power_reset.md similarity index 100% rename from docs/docs/08-References/Clients/metalctl/metalctl_machine_power_reset.md rename to docs/08-References/Clients/metalctl/metalctl_machine_power_reset.md diff --git a/docs/docs/08-References/Clients/metalctl/metalctl_machine_reinstall.md b/docs/08-References/Clients/metalctl/metalctl_machine_reinstall.md similarity index 100% rename from docs/docs/08-References/Clients/metalctl/metalctl_machine_reinstall.md rename to docs/08-References/Clients/metalctl/metalctl_machine_reinstall.md diff --git a/docs/docs/08-References/Clients/metalctl/metalctl_machine_reserve.md b/docs/08-References/Clients/metalctl/metalctl_machine_reserve.md similarity index 100% rename from docs/docs/08-References/Clients/metalctl/metalctl_machine_reserve.md rename to docs/08-References/Clients/metalctl/metalctl_machine_reserve.md diff --git a/docs/docs/08-References/Clients/metalctl/metalctl_machine_update-firmware.md b/docs/08-References/Clients/metalctl/metalctl_machine_update-firmware.md similarity index 100% rename from docs/docs/08-References/Clients/metalctl/metalctl_machine_update-firmware.md rename to docs/08-References/Clients/metalctl/metalctl_machine_update-firmware.md diff --git a/docs/docs/08-References/Clients/metalctl/metalctl_machine_update-firmware_bios.md b/docs/08-References/Clients/metalctl/metalctl_machine_update-firmware_bios.md similarity index 100% rename from docs/docs/08-References/Clients/metalctl/metalctl_machine_update-firmware_bios.md rename to 
docs/08-References/Clients/metalctl/metalctl_machine_update-firmware_bios.md diff --git a/docs/docs/08-References/Clients/metalctl/metalctl_machine_update-firmware_bmc.md b/docs/08-References/Clients/metalctl/metalctl_machine_update-firmware_bmc.md similarity index 100% rename from docs/docs/08-References/Clients/metalctl/metalctl_machine_update-firmware_bmc.md rename to docs/08-References/Clients/metalctl/metalctl_machine_update-firmware_bmc.md diff --git a/docs/docs/08-References/Clients/metalctl/metalctl_machine_update.md b/docs/08-References/Clients/metalctl/metalctl_machine_update.md similarity index 100% rename from docs/docs/08-References/Clients/metalctl/metalctl_machine_update.md rename to docs/08-References/Clients/metalctl/metalctl_machine_update.md diff --git a/docs/docs/08-References/Clients/metalctl/metalctl_markdown.md b/docs/08-References/Clients/metalctl/metalctl_markdown.md similarity index 100% rename from docs/docs/08-References/Clients/metalctl/metalctl_markdown.md rename to docs/08-References/Clients/metalctl/metalctl_markdown.md diff --git a/docs/docs/08-References/Clients/metalctl/metalctl_network.md b/docs/08-References/Clients/metalctl/metalctl_network.md similarity index 100% rename from docs/docs/08-References/Clients/metalctl/metalctl_network.md rename to docs/08-References/Clients/metalctl/metalctl_network.md diff --git a/docs/docs/08-References/Clients/metalctl/metalctl_network_allocate.md b/docs/08-References/Clients/metalctl/metalctl_network_allocate.md similarity index 100% rename from docs/docs/08-References/Clients/metalctl/metalctl_network_allocate.md rename to docs/08-References/Clients/metalctl/metalctl_network_allocate.md diff --git a/docs/docs/08-References/Clients/metalctl/metalctl_network_apply.md b/docs/08-References/Clients/metalctl/metalctl_network_apply.md similarity index 100% rename from docs/docs/08-References/Clients/metalctl/metalctl_network_apply.md rename to 
docs/08-References/Clients/metalctl/metalctl_network_apply.md diff --git a/docs/docs/08-References/Clients/metalctl/metalctl_network_create.md b/docs/08-References/Clients/metalctl/metalctl_network_create.md similarity index 100% rename from docs/docs/08-References/Clients/metalctl/metalctl_network_create.md rename to docs/08-References/Clients/metalctl/metalctl_network_create.md diff --git a/docs/docs/08-References/Clients/metalctl/metalctl_network_delete.md b/docs/08-References/Clients/metalctl/metalctl_network_delete.md similarity index 100% rename from docs/docs/08-References/Clients/metalctl/metalctl_network_delete.md rename to docs/08-References/Clients/metalctl/metalctl_network_delete.md diff --git a/docs/docs/08-References/Clients/metalctl/metalctl_network_describe.md b/docs/08-References/Clients/metalctl/metalctl_network_describe.md similarity index 100% rename from docs/docs/08-References/Clients/metalctl/metalctl_network_describe.md rename to docs/08-References/Clients/metalctl/metalctl_network_describe.md diff --git a/docs/docs/08-References/Clients/metalctl/metalctl_network_edit.md b/docs/08-References/Clients/metalctl/metalctl_network_edit.md similarity index 100% rename from docs/docs/08-References/Clients/metalctl/metalctl_network_edit.md rename to docs/08-References/Clients/metalctl/metalctl_network_edit.md diff --git a/docs/docs/08-References/Clients/metalctl/metalctl_network_free.md b/docs/08-References/Clients/metalctl/metalctl_network_free.md similarity index 100% rename from docs/docs/08-References/Clients/metalctl/metalctl_network_free.md rename to docs/08-References/Clients/metalctl/metalctl_network_free.md diff --git a/docs/docs/08-References/Clients/metalctl/metalctl_network_ip.md b/docs/08-References/Clients/metalctl/metalctl_network_ip.md similarity index 100% rename from docs/docs/08-References/Clients/metalctl/metalctl_network_ip.md rename to docs/08-References/Clients/metalctl/metalctl_network_ip.md diff --git 
a/docs/docs/08-References/Clients/metalctl/metalctl_network_ip_apply.md b/docs/08-References/Clients/metalctl/metalctl_network_ip_apply.md similarity index 100% rename from docs/docs/08-References/Clients/metalctl/metalctl_network_ip_apply.md rename to docs/08-References/Clients/metalctl/metalctl_network_ip_apply.md diff --git a/docs/docs/08-References/Clients/metalctl/metalctl_network_ip_create.md b/docs/08-References/Clients/metalctl/metalctl_network_ip_create.md similarity index 100% rename from docs/docs/08-References/Clients/metalctl/metalctl_network_ip_create.md rename to docs/08-References/Clients/metalctl/metalctl_network_ip_create.md diff --git a/docs/docs/08-References/Clients/metalctl/metalctl_network_ip_delete.md b/docs/08-References/Clients/metalctl/metalctl_network_ip_delete.md similarity index 100% rename from docs/docs/08-References/Clients/metalctl/metalctl_network_ip_delete.md rename to docs/08-References/Clients/metalctl/metalctl_network_ip_delete.md diff --git a/docs/docs/08-References/Clients/metalctl/metalctl_network_ip_describe.md b/docs/08-References/Clients/metalctl/metalctl_network_ip_describe.md similarity index 100% rename from docs/docs/08-References/Clients/metalctl/metalctl_network_ip_describe.md rename to docs/08-References/Clients/metalctl/metalctl_network_ip_describe.md diff --git a/docs/docs/08-References/Clients/metalctl/metalctl_network_ip_edit.md b/docs/08-References/Clients/metalctl/metalctl_network_ip_edit.md similarity index 100% rename from docs/docs/08-References/Clients/metalctl/metalctl_network_ip_edit.md rename to docs/08-References/Clients/metalctl/metalctl_network_ip_edit.md diff --git a/docs/docs/08-References/Clients/metalctl/metalctl_network_ip_issues.md b/docs/08-References/Clients/metalctl/metalctl_network_ip_issues.md similarity index 100% rename from docs/docs/08-References/Clients/metalctl/metalctl_network_ip_issues.md rename to docs/08-References/Clients/metalctl/metalctl_network_ip_issues.md diff --git 
a/docs/docs/08-References/Clients/metalctl/metalctl_network_ip_list.md b/docs/08-References/Clients/metalctl/metalctl_network_ip_list.md similarity index 100% rename from docs/docs/08-References/Clients/metalctl/metalctl_network_ip_list.md rename to docs/08-References/Clients/metalctl/metalctl_network_ip_list.md diff --git a/docs/docs/08-References/Clients/metalctl/metalctl_network_ip_update.md b/docs/08-References/Clients/metalctl/metalctl_network_ip_update.md similarity index 100% rename from docs/docs/08-References/Clients/metalctl/metalctl_network_ip_update.md rename to docs/08-References/Clients/metalctl/metalctl_network_ip_update.md diff --git a/docs/docs/08-References/Clients/metalctl/metalctl_network_list.md b/docs/08-References/Clients/metalctl/metalctl_network_list.md similarity index 100% rename from docs/docs/08-References/Clients/metalctl/metalctl_network_list.md rename to docs/08-References/Clients/metalctl/metalctl_network_list.md diff --git a/docs/docs/08-References/Clients/metalctl/metalctl_network_update.md b/docs/08-References/Clients/metalctl/metalctl_network_update.md similarity index 100% rename from docs/docs/08-References/Clients/metalctl/metalctl_network_update.md rename to docs/08-References/Clients/metalctl/metalctl_network_update.md diff --git a/docs/docs/08-References/Clients/metalctl/metalctl_partition.md b/docs/08-References/Clients/metalctl/metalctl_partition.md similarity index 100% rename from docs/docs/08-References/Clients/metalctl/metalctl_partition.md rename to docs/08-References/Clients/metalctl/metalctl_partition.md diff --git a/docs/docs/08-References/Clients/metalctl/metalctl_partition_apply.md b/docs/08-References/Clients/metalctl/metalctl_partition_apply.md similarity index 100% rename from docs/docs/08-References/Clients/metalctl/metalctl_partition_apply.md rename to docs/08-References/Clients/metalctl/metalctl_partition_apply.md diff --git a/docs/docs/08-References/Clients/metalctl/metalctl_partition_capacity.md 
b/docs/08-References/Clients/metalctl/metalctl_partition_capacity.md similarity index 100% rename from docs/docs/08-References/Clients/metalctl/metalctl_partition_capacity.md rename to docs/08-References/Clients/metalctl/metalctl_partition_capacity.md diff --git a/docs/docs/08-References/Clients/metalctl/metalctl_partition_create.md b/docs/08-References/Clients/metalctl/metalctl_partition_create.md similarity index 100% rename from docs/docs/08-References/Clients/metalctl/metalctl_partition_create.md rename to docs/08-References/Clients/metalctl/metalctl_partition_create.md diff --git a/docs/docs/08-References/Clients/metalctl/metalctl_partition_delete.md b/docs/08-References/Clients/metalctl/metalctl_partition_delete.md similarity index 100% rename from docs/docs/08-References/Clients/metalctl/metalctl_partition_delete.md rename to docs/08-References/Clients/metalctl/metalctl_partition_delete.md diff --git a/docs/docs/08-References/Clients/metalctl/metalctl_partition_describe.md b/docs/08-References/Clients/metalctl/metalctl_partition_describe.md similarity index 100% rename from docs/docs/08-References/Clients/metalctl/metalctl_partition_describe.md rename to docs/08-References/Clients/metalctl/metalctl_partition_describe.md diff --git a/docs/docs/08-References/Clients/metalctl/metalctl_partition_edit.md b/docs/08-References/Clients/metalctl/metalctl_partition_edit.md similarity index 100% rename from docs/docs/08-References/Clients/metalctl/metalctl_partition_edit.md rename to docs/08-References/Clients/metalctl/metalctl_partition_edit.md diff --git a/docs/docs/08-References/Clients/metalctl/metalctl_partition_list.md b/docs/08-References/Clients/metalctl/metalctl_partition_list.md similarity index 100% rename from docs/docs/08-References/Clients/metalctl/metalctl_partition_list.md rename to docs/08-References/Clients/metalctl/metalctl_partition_list.md diff --git a/docs/docs/08-References/Clients/metalctl/metalctl_partition_update.md 
b/docs/08-References/Clients/metalctl/metalctl_partition_update.md similarity index 100% rename from docs/docs/08-References/Clients/metalctl/metalctl_partition_update.md rename to docs/08-References/Clients/metalctl/metalctl_partition_update.md diff --git a/docs/docs/08-References/Clients/metalctl/metalctl_project.md b/docs/08-References/Clients/metalctl/metalctl_project.md similarity index 100% rename from docs/docs/08-References/Clients/metalctl/metalctl_project.md rename to docs/08-References/Clients/metalctl/metalctl_project.md diff --git a/docs/docs/08-References/Clients/metalctl/metalctl_project_apply.md b/docs/08-References/Clients/metalctl/metalctl_project_apply.md similarity index 100% rename from docs/docs/08-References/Clients/metalctl/metalctl_project_apply.md rename to docs/08-References/Clients/metalctl/metalctl_project_apply.md diff --git a/docs/docs/08-References/Clients/metalctl/metalctl_project_create.md b/docs/08-References/Clients/metalctl/metalctl_project_create.md similarity index 100% rename from docs/docs/08-References/Clients/metalctl/metalctl_project_create.md rename to docs/08-References/Clients/metalctl/metalctl_project_create.md diff --git a/docs/docs/08-References/Clients/metalctl/metalctl_project_delete.md b/docs/08-References/Clients/metalctl/metalctl_project_delete.md similarity index 100% rename from docs/docs/08-References/Clients/metalctl/metalctl_project_delete.md rename to docs/08-References/Clients/metalctl/metalctl_project_delete.md diff --git a/docs/docs/08-References/Clients/metalctl/metalctl_project_describe.md b/docs/08-References/Clients/metalctl/metalctl_project_describe.md similarity index 100% rename from docs/docs/08-References/Clients/metalctl/metalctl_project_describe.md rename to docs/08-References/Clients/metalctl/metalctl_project_describe.md diff --git a/docs/docs/08-References/Clients/metalctl/metalctl_project_edit.md b/docs/08-References/Clients/metalctl/metalctl_project_edit.md similarity index 100% rename 
from docs/docs/08-References/Clients/metalctl/metalctl_project_edit.md rename to docs/08-References/Clients/metalctl/metalctl_project_edit.md diff --git a/docs/docs/08-References/Clients/metalctl/metalctl_project_list.md b/docs/08-References/Clients/metalctl/metalctl_project_list.md similarity index 100% rename from docs/docs/08-References/Clients/metalctl/metalctl_project_list.md rename to docs/08-References/Clients/metalctl/metalctl_project_list.md diff --git a/docs/docs/08-References/Clients/metalctl/metalctl_project_update.md b/docs/08-References/Clients/metalctl/metalctl_project_update.md similarity index 100% rename from docs/docs/08-References/Clients/metalctl/metalctl_project_update.md rename to docs/08-References/Clients/metalctl/metalctl_project_update.md diff --git a/docs/docs/08-References/Clients/metalctl/metalctl_size.md b/docs/08-References/Clients/metalctl/metalctl_size.md similarity index 100% rename from docs/docs/08-References/Clients/metalctl/metalctl_size.md rename to docs/08-References/Clients/metalctl/metalctl_size.md diff --git a/docs/docs/08-References/Clients/metalctl/metalctl_size_apply.md b/docs/08-References/Clients/metalctl/metalctl_size_apply.md similarity index 100% rename from docs/docs/08-References/Clients/metalctl/metalctl_size_apply.md rename to docs/08-References/Clients/metalctl/metalctl_size_apply.md diff --git a/docs/docs/08-References/Clients/metalctl/metalctl_size_create.md b/docs/08-References/Clients/metalctl/metalctl_size_create.md similarity index 100% rename from docs/docs/08-References/Clients/metalctl/metalctl_size_create.md rename to docs/08-References/Clients/metalctl/metalctl_size_create.md diff --git a/docs/docs/08-References/Clients/metalctl/metalctl_size_delete.md b/docs/08-References/Clients/metalctl/metalctl_size_delete.md similarity index 100% rename from docs/docs/08-References/Clients/metalctl/metalctl_size_delete.md rename to docs/08-References/Clients/metalctl/metalctl_size_delete.md diff --git 
a/docs/docs/08-References/Clients/metalctl/metalctl_size_describe.md b/docs/08-References/Clients/metalctl/metalctl_size_describe.md similarity index 100% rename from docs/docs/08-References/Clients/metalctl/metalctl_size_describe.md rename to docs/08-References/Clients/metalctl/metalctl_size_describe.md diff --git a/docs/docs/08-References/Clients/metalctl/metalctl_size_edit.md b/docs/08-References/Clients/metalctl/metalctl_size_edit.md similarity index 100% rename from docs/docs/08-References/Clients/metalctl/metalctl_size_edit.md rename to docs/08-References/Clients/metalctl/metalctl_size_edit.md diff --git a/docs/docs/08-References/Clients/metalctl/metalctl_size_imageconstraint.md b/docs/08-References/Clients/metalctl/metalctl_size_imageconstraint.md similarity index 100% rename from docs/docs/08-References/Clients/metalctl/metalctl_size_imageconstraint.md rename to docs/08-References/Clients/metalctl/metalctl_size_imageconstraint.md diff --git a/docs/docs/08-References/Clients/metalctl/metalctl_size_imageconstraint_apply.md b/docs/08-References/Clients/metalctl/metalctl_size_imageconstraint_apply.md similarity index 100% rename from docs/docs/08-References/Clients/metalctl/metalctl_size_imageconstraint_apply.md rename to docs/08-References/Clients/metalctl/metalctl_size_imageconstraint_apply.md diff --git a/docs/docs/08-References/Clients/metalctl/metalctl_size_imageconstraint_create.md b/docs/08-References/Clients/metalctl/metalctl_size_imageconstraint_create.md similarity index 100% rename from docs/docs/08-References/Clients/metalctl/metalctl_size_imageconstraint_create.md rename to docs/08-References/Clients/metalctl/metalctl_size_imageconstraint_create.md diff --git a/docs/docs/08-References/Clients/metalctl/metalctl_size_imageconstraint_delete.md b/docs/08-References/Clients/metalctl/metalctl_size_imageconstraint_delete.md similarity index 100% rename from docs/docs/08-References/Clients/metalctl/metalctl_size_imageconstraint_delete.md rename to 
docs/08-References/Clients/metalctl/metalctl_size_imageconstraint_delete.md diff --git a/docs/docs/08-References/Clients/metalctl/metalctl_size_imageconstraint_describe.md b/docs/08-References/Clients/metalctl/metalctl_size_imageconstraint_describe.md similarity index 100% rename from docs/docs/08-References/Clients/metalctl/metalctl_size_imageconstraint_describe.md rename to docs/08-References/Clients/metalctl/metalctl_size_imageconstraint_describe.md diff --git a/docs/docs/08-References/Clients/metalctl/metalctl_size_imageconstraint_edit.md b/docs/08-References/Clients/metalctl/metalctl_size_imageconstraint_edit.md similarity index 100% rename from docs/docs/08-References/Clients/metalctl/metalctl_size_imageconstraint_edit.md rename to docs/08-References/Clients/metalctl/metalctl_size_imageconstraint_edit.md diff --git a/docs/docs/08-References/Clients/metalctl/metalctl_size_imageconstraint_list.md b/docs/08-References/Clients/metalctl/metalctl_size_imageconstraint_list.md similarity index 100% rename from docs/docs/08-References/Clients/metalctl/metalctl_size_imageconstraint_list.md rename to docs/08-References/Clients/metalctl/metalctl_size_imageconstraint_list.md diff --git a/docs/docs/08-References/Clients/metalctl/metalctl_size_imageconstraint_try.md b/docs/08-References/Clients/metalctl/metalctl_size_imageconstraint_try.md similarity index 100% rename from docs/docs/08-References/Clients/metalctl/metalctl_size_imageconstraint_try.md rename to docs/08-References/Clients/metalctl/metalctl_size_imageconstraint_try.md diff --git a/docs/docs/08-References/Clients/metalctl/metalctl_size_imageconstraint_update.md b/docs/08-References/Clients/metalctl/metalctl_size_imageconstraint_update.md similarity index 100% rename from docs/docs/08-References/Clients/metalctl/metalctl_size_imageconstraint_update.md rename to docs/08-References/Clients/metalctl/metalctl_size_imageconstraint_update.md diff --git a/docs/docs/08-References/Clients/metalctl/metalctl_size_list.md 
b/docs/08-References/Clients/metalctl/metalctl_size_list.md similarity index 100% rename from docs/docs/08-References/Clients/metalctl/metalctl_size_list.md rename to docs/08-References/Clients/metalctl/metalctl_size_list.md diff --git a/docs/docs/08-References/Clients/metalctl/metalctl_size_reservation.md b/docs/08-References/Clients/metalctl/metalctl_size_reservation.md similarity index 100% rename from docs/docs/08-References/Clients/metalctl/metalctl_size_reservation.md rename to docs/08-References/Clients/metalctl/metalctl_size_reservation.md diff --git a/docs/docs/08-References/Clients/metalctl/metalctl_size_reservation_apply.md b/docs/08-References/Clients/metalctl/metalctl_size_reservation_apply.md similarity index 100% rename from docs/docs/08-References/Clients/metalctl/metalctl_size_reservation_apply.md rename to docs/08-References/Clients/metalctl/metalctl_size_reservation_apply.md diff --git a/docs/docs/08-References/Clients/metalctl/metalctl_size_reservation_create.md b/docs/08-References/Clients/metalctl/metalctl_size_reservation_create.md similarity index 100% rename from docs/docs/08-References/Clients/metalctl/metalctl_size_reservation_create.md rename to docs/08-References/Clients/metalctl/metalctl_size_reservation_create.md diff --git a/docs/docs/08-References/Clients/metalctl/metalctl_size_reservation_delete.md b/docs/08-References/Clients/metalctl/metalctl_size_reservation_delete.md similarity index 100% rename from docs/docs/08-References/Clients/metalctl/metalctl_size_reservation_delete.md rename to docs/08-References/Clients/metalctl/metalctl_size_reservation_delete.md diff --git a/docs/docs/08-References/Clients/metalctl/metalctl_size_reservation_describe.md b/docs/08-References/Clients/metalctl/metalctl_size_reservation_describe.md similarity index 100% rename from docs/docs/08-References/Clients/metalctl/metalctl_size_reservation_describe.md rename to docs/08-References/Clients/metalctl/metalctl_size_reservation_describe.md diff --git 
a/docs/docs/08-References/Clients/metalctl/metalctl_size_reservation_edit.md b/docs/08-References/Clients/metalctl/metalctl_size_reservation_edit.md similarity index 100% rename from docs/docs/08-References/Clients/metalctl/metalctl_size_reservation_edit.md rename to docs/08-References/Clients/metalctl/metalctl_size_reservation_edit.md diff --git a/docs/docs/08-References/Clients/metalctl/metalctl_size_reservation_list.md b/docs/08-References/Clients/metalctl/metalctl_size_reservation_list.md similarity index 100% rename from docs/docs/08-References/Clients/metalctl/metalctl_size_reservation_list.md rename to docs/08-References/Clients/metalctl/metalctl_size_reservation_list.md diff --git a/docs/docs/08-References/Clients/metalctl/metalctl_size_reservation_update.md b/docs/08-References/Clients/metalctl/metalctl_size_reservation_update.md similarity index 100% rename from docs/docs/08-References/Clients/metalctl/metalctl_size_reservation_update.md rename to docs/08-References/Clients/metalctl/metalctl_size_reservation_update.md diff --git a/docs/docs/08-References/Clients/metalctl/metalctl_size_reservation_usage.md b/docs/08-References/Clients/metalctl/metalctl_size_reservation_usage.md similarity index 100% rename from docs/docs/08-References/Clients/metalctl/metalctl_size_reservation_usage.md rename to docs/08-References/Clients/metalctl/metalctl_size_reservation_usage.md diff --git a/docs/docs/08-References/Clients/metalctl/metalctl_size_suggest.md b/docs/08-References/Clients/metalctl/metalctl_size_suggest.md similarity index 100% rename from docs/docs/08-References/Clients/metalctl/metalctl_size_suggest.md rename to docs/08-References/Clients/metalctl/metalctl_size_suggest.md diff --git a/docs/docs/08-References/Clients/metalctl/metalctl_size_update.md b/docs/08-References/Clients/metalctl/metalctl_size_update.md similarity index 100% rename from docs/docs/08-References/Clients/metalctl/metalctl_size_update.md rename to 
docs/08-References/Clients/metalctl/metalctl_size_update.md diff --git a/docs/docs/08-References/Clients/metalctl/metalctl_switch.md b/docs/08-References/Clients/metalctl/metalctl_switch.md similarity index 100% rename from docs/docs/08-References/Clients/metalctl/metalctl_switch.md rename to docs/08-References/Clients/metalctl/metalctl_switch.md diff --git a/docs/docs/08-References/Clients/metalctl/metalctl_switch_connected-machines.md b/docs/08-References/Clients/metalctl/metalctl_switch_connected-machines.md similarity index 100% rename from docs/docs/08-References/Clients/metalctl/metalctl_switch_connected-machines.md rename to docs/08-References/Clients/metalctl/metalctl_switch_connected-machines.md diff --git a/docs/docs/08-References/Clients/metalctl/metalctl_switch_console.md b/docs/08-References/Clients/metalctl/metalctl_switch_console.md similarity index 100% rename from docs/docs/08-References/Clients/metalctl/metalctl_switch_console.md rename to docs/08-References/Clients/metalctl/metalctl_switch_console.md diff --git a/docs/docs/08-References/Clients/metalctl/metalctl_switch_delete.md b/docs/08-References/Clients/metalctl/metalctl_switch_delete.md similarity index 100% rename from docs/docs/08-References/Clients/metalctl/metalctl_switch_delete.md rename to docs/08-References/Clients/metalctl/metalctl_switch_delete.md diff --git a/docs/docs/08-References/Clients/metalctl/metalctl_switch_describe.md b/docs/08-References/Clients/metalctl/metalctl_switch_describe.md similarity index 100% rename from docs/docs/08-References/Clients/metalctl/metalctl_switch_describe.md rename to docs/08-References/Clients/metalctl/metalctl_switch_describe.md diff --git a/docs/docs/08-References/Clients/metalctl/metalctl_switch_detail.md b/docs/08-References/Clients/metalctl/metalctl_switch_detail.md similarity index 100% rename from docs/docs/08-References/Clients/metalctl/metalctl_switch_detail.md rename to docs/08-References/Clients/metalctl/metalctl_switch_detail.md diff 
--git a/docs/docs/08-References/Clients/metalctl/metalctl_switch_edit.md b/docs/08-References/Clients/metalctl/metalctl_switch_edit.md similarity index 100% rename from docs/docs/08-References/Clients/metalctl/metalctl_switch_edit.md rename to docs/08-References/Clients/metalctl/metalctl_switch_edit.md diff --git a/docs/docs/08-References/Clients/metalctl/metalctl_switch_list.md b/docs/08-References/Clients/metalctl/metalctl_switch_list.md similarity index 100% rename from docs/docs/08-References/Clients/metalctl/metalctl_switch_list.md rename to docs/08-References/Clients/metalctl/metalctl_switch_list.md diff --git a/docs/docs/08-References/Clients/metalctl/metalctl_switch_migrate.md b/docs/08-References/Clients/metalctl/metalctl_switch_migrate.md similarity index 100% rename from docs/docs/08-References/Clients/metalctl/metalctl_switch_migrate.md rename to docs/08-References/Clients/metalctl/metalctl_switch_migrate.md diff --git a/docs/docs/08-References/Clients/metalctl/metalctl_switch_port.md b/docs/08-References/Clients/metalctl/metalctl_switch_port.md similarity index 100% rename from docs/docs/08-References/Clients/metalctl/metalctl_switch_port.md rename to docs/08-References/Clients/metalctl/metalctl_switch_port.md diff --git a/docs/docs/08-References/Clients/metalctl/metalctl_switch_port_describe.md b/docs/08-References/Clients/metalctl/metalctl_switch_port_describe.md similarity index 100% rename from docs/docs/08-References/Clients/metalctl/metalctl_switch_port_describe.md rename to docs/08-References/Clients/metalctl/metalctl_switch_port_describe.md diff --git a/docs/docs/08-References/Clients/metalctl/metalctl_switch_port_down.md b/docs/08-References/Clients/metalctl/metalctl_switch_port_down.md similarity index 100% rename from docs/docs/08-References/Clients/metalctl/metalctl_switch_port_down.md rename to docs/08-References/Clients/metalctl/metalctl_switch_port_down.md diff --git a/docs/docs/08-References/Clients/metalctl/metalctl_switch_port_up.md 
b/docs/08-References/Clients/metalctl/metalctl_switch_port_up.md similarity index 100% rename from docs/docs/08-References/Clients/metalctl/metalctl_switch_port_up.md rename to docs/08-References/Clients/metalctl/metalctl_switch_port_up.md diff --git a/docs/docs/08-References/Clients/metalctl/metalctl_switch_replace.md b/docs/08-References/Clients/metalctl/metalctl_switch_replace.md similarity index 100% rename from docs/docs/08-References/Clients/metalctl/metalctl_switch_replace.md rename to docs/08-References/Clients/metalctl/metalctl_switch_replace.md diff --git a/docs/docs/08-References/Clients/metalctl/metalctl_switch_ssh.md b/docs/08-References/Clients/metalctl/metalctl_switch_ssh.md similarity index 100% rename from docs/docs/08-References/Clients/metalctl/metalctl_switch_ssh.md rename to docs/08-References/Clients/metalctl/metalctl_switch_ssh.md diff --git a/docs/docs/08-References/Clients/metalctl/metalctl_switch_update.md b/docs/08-References/Clients/metalctl/metalctl_switch_update.md similarity index 100% rename from docs/docs/08-References/Clients/metalctl/metalctl_switch_update.md rename to docs/08-References/Clients/metalctl/metalctl_switch_update.md diff --git a/docs/docs/08-References/Clients/metalctl/metalctl_tenant.md b/docs/08-References/Clients/metalctl/metalctl_tenant.md similarity index 100% rename from docs/docs/08-References/Clients/metalctl/metalctl_tenant.md rename to docs/08-References/Clients/metalctl/metalctl_tenant.md diff --git a/docs/docs/08-References/Clients/metalctl/metalctl_tenant_apply.md b/docs/08-References/Clients/metalctl/metalctl_tenant_apply.md similarity index 100% rename from docs/docs/08-References/Clients/metalctl/metalctl_tenant_apply.md rename to docs/08-References/Clients/metalctl/metalctl_tenant_apply.md diff --git a/docs/docs/08-References/Clients/metalctl/metalctl_tenant_create.md b/docs/08-References/Clients/metalctl/metalctl_tenant_create.md similarity index 100% rename from 
docs/docs/08-References/Clients/metalctl/metalctl_tenant_create.md rename to docs/08-References/Clients/metalctl/metalctl_tenant_create.md diff --git a/docs/docs/08-References/Clients/metalctl/metalctl_tenant_delete.md b/docs/08-References/Clients/metalctl/metalctl_tenant_delete.md similarity index 100% rename from docs/docs/08-References/Clients/metalctl/metalctl_tenant_delete.md rename to docs/08-References/Clients/metalctl/metalctl_tenant_delete.md diff --git a/docs/docs/08-References/Clients/metalctl/metalctl_tenant_describe.md b/docs/08-References/Clients/metalctl/metalctl_tenant_describe.md similarity index 100% rename from docs/docs/08-References/Clients/metalctl/metalctl_tenant_describe.md rename to docs/08-References/Clients/metalctl/metalctl_tenant_describe.md diff --git a/docs/docs/08-References/Clients/metalctl/metalctl_tenant_edit.md b/docs/08-References/Clients/metalctl/metalctl_tenant_edit.md similarity index 100% rename from docs/docs/08-References/Clients/metalctl/metalctl_tenant_edit.md rename to docs/08-References/Clients/metalctl/metalctl_tenant_edit.md diff --git a/docs/docs/08-References/Clients/metalctl/metalctl_tenant_list.md b/docs/08-References/Clients/metalctl/metalctl_tenant_list.md similarity index 100% rename from docs/docs/08-References/Clients/metalctl/metalctl_tenant_list.md rename to docs/08-References/Clients/metalctl/metalctl_tenant_list.md diff --git a/docs/docs/08-References/Clients/metalctl/metalctl_tenant_update.md b/docs/08-References/Clients/metalctl/metalctl_tenant_update.md similarity index 100% rename from docs/docs/08-References/Clients/metalctl/metalctl_tenant_update.md rename to docs/08-References/Clients/metalctl/metalctl_tenant_update.md diff --git a/docs/docs/08-References/Clients/metalctl/metalctl_update.md b/docs/08-References/Clients/metalctl/metalctl_update.md similarity index 100% rename from docs/docs/08-References/Clients/metalctl/metalctl_update.md rename to 
docs/08-References/Clients/metalctl/metalctl_update.md diff --git a/docs/docs/08-References/Clients/metalctl/metalctl_update_check.md b/docs/08-References/Clients/metalctl/metalctl_update_check.md similarity index 100% rename from docs/docs/08-References/Clients/metalctl/metalctl_update_check.md rename to docs/08-References/Clients/metalctl/metalctl_update_check.md diff --git a/docs/docs/08-References/Clients/metalctl/metalctl_update_do.md b/docs/08-References/Clients/metalctl/metalctl_update_do.md similarity index 100% rename from docs/docs/08-References/Clients/metalctl/metalctl_update_do.md rename to docs/08-References/Clients/metalctl/metalctl_update_do.md diff --git a/docs/docs/08-References/Clients/metalctl/metalctl_version.md b/docs/08-References/Clients/metalctl/metalctl_version.md similarity index 100% rename from docs/docs/08-References/Clients/metalctl/metalctl_version.md rename to docs/08-References/Clients/metalctl/metalctl_version.md diff --git a/docs/docs/08-References/Clients/metalctl/metalctl_vpn.md b/docs/08-References/Clients/metalctl/metalctl_vpn.md similarity index 100% rename from docs/docs/08-References/Clients/metalctl/metalctl_vpn.md rename to docs/08-References/Clients/metalctl/metalctl_vpn.md diff --git a/docs/docs/08-References/Clients/metalctl/metalctl_vpn_key.md b/docs/08-References/Clients/metalctl/metalctl_vpn_key.md similarity index 100% rename from docs/docs/08-References/Clients/metalctl/metalctl_vpn_key.md rename to docs/08-References/Clients/metalctl/metalctl_vpn_key.md diff --git a/docs/docs/08-References/Clients/metalctl/metalctl_whoami.md b/docs/08-References/Clients/metalctl/metalctl_whoami.md similarity index 100% rename from docs/docs/08-References/Clients/metalctl/metalctl_whoami.md rename to docs/08-References/Clients/metalctl/metalctl_whoami.md diff --git a/docs/docs/08-References/Control Plane/backup-restore-sidecar/assets/sequence.drawio.svg b/docs/08-References/Control 
Plane/backup-restore-sidecar/assets/sequence.drawio.svg similarity index 100% rename from docs/docs/08-References/Control Plane/backup-restore-sidecar/assets/sequence.drawio.svg rename to docs/08-References/Control Plane/backup-restore-sidecar/assets/sequence.drawio.svg diff --git a/docs/docs/08-References/Control Plane/backup-restore-sidecar/backup-restore-sidecar.md b/docs/08-References/Control Plane/backup-restore-sidecar/backup-restore-sidecar.md similarity index 100% rename from docs/docs/08-References/Control Plane/backup-restore-sidecar/backup-restore-sidecar.md rename to docs/08-References/Control Plane/backup-restore-sidecar/backup-restore-sidecar.md diff --git a/docs/docs/08-References/Control Plane/backup-restore-sidecar/manual_restore.md b/docs/08-References/Control Plane/backup-restore-sidecar/manual_restore.md similarity index 100% rename from docs/docs/08-References/Control Plane/backup-restore-sidecar/manual_restore.md rename to docs/08-References/Control Plane/backup-restore-sidecar/manual_restore.md diff --git a/docs/docs/08-References/Control Plane/go-ipam/go-ipam.md b/docs/08-References/Control Plane/go-ipam/go-ipam.md similarity index 100% rename from docs/docs/08-References/Control Plane/go-ipam/go-ipam.md rename to docs/08-References/Control Plane/go-ipam/go-ipam.md diff --git a/docs/docs/08-References/Control Plane/masterdata-api/masterdata-api.md b/docs/08-References/Control Plane/masterdata-api/masterdata-api.md similarity index 100% rename from docs/docs/08-References/Control Plane/masterdata-api/masterdata-api.md rename to docs/08-References/Control Plane/masterdata-api/masterdata-api.md diff --git a/docs/docs/08-References/Control Plane/metal-api/metal-api.md b/docs/08-References/Control Plane/metal-api/metal-api.md similarity index 100% rename from docs/docs/08-References/Control Plane/metal-api/metal-api.md rename to docs/08-References/Control Plane/metal-api/metal-api.md diff --git a/docs/docs/08-References/Control 
Plane/metal-console/metal-console.md b/docs/08-References/Control Plane/metal-console/metal-console.md similarity index 100% rename from docs/docs/08-References/Control Plane/metal-console/metal-console.md rename to docs/08-References/Control Plane/metal-console/metal-console.md diff --git a/docs/docs/08-References/Deployment/helm-charts/helm-charts.md b/docs/08-References/Deployment/helm-charts/helm-charts.md similarity index 100% rename from docs/docs/08-References/Deployment/helm-charts/helm-charts.md rename to docs/08-References/Deployment/helm-charts/helm-charts.md diff --git a/docs/docs/08-References/Deployment/metal-images/ARCHITECTURE.md b/docs/08-References/Deployment/metal-images/ARCHITECTURE.md similarity index 100% rename from docs/docs/08-References/Deployment/metal-images/ARCHITECTURE.md rename to docs/08-References/Deployment/metal-images/ARCHITECTURE.md diff --git a/docs/docs/08-References/Deployment/metal-images/IMAGE_STORE.md b/docs/08-References/Deployment/metal-images/IMAGE_STORE.md similarity index 100% rename from docs/docs/08-References/Deployment/metal-images/IMAGE_STORE.md rename to docs/08-References/Deployment/metal-images/IMAGE_STORE.md diff --git a/docs/docs/08-References/Deployment/metal-images/metal-images.md b/docs/08-References/Deployment/metal-images/metal-images.md similarity index 100% rename from docs/docs/08-References/Deployment/metal-images/metal-images.md rename to docs/08-References/Deployment/metal-images/metal-images.md diff --git a/docs/docs/08-References/Deployment/mini-lab/assets/network.svg b/docs/08-References/Deployment/mini-lab/assets/network.svg similarity index 100% rename from docs/docs/08-References/Deployment/mini-lab/assets/network.svg rename to docs/08-References/Deployment/mini-lab/assets/network.svg diff --git a/docs/docs/08-References/Deployment/mini-lab/assets/overview.drawio.svg b/docs/08-References/Deployment/mini-lab/assets/overview.drawio.svg similarity index 100% rename from 
docs/docs/08-References/Deployment/mini-lab/assets/overview.drawio.svg rename to docs/08-References/Deployment/mini-lab/assets/overview.drawio.svg diff --git a/docs/docs/08-References/Deployment/mini-lab/assets/overview.png b/docs/08-References/Deployment/mini-lab/assets/overview.png similarity index 100% rename from docs/docs/08-References/Deployment/mini-lab/assets/overview.png rename to docs/08-References/Deployment/mini-lab/assets/overview.png diff --git a/docs/docs/08-References/Deployment/mini-lab/mini-lab.md b/docs/08-References/Deployment/mini-lab/mini-lab.md similarity index 100% rename from docs/docs/08-References/Deployment/mini-lab/mini-lab.md rename to docs/08-References/Deployment/mini-lab/mini-lab.md diff --git a/docs/docs/08-References/Gardener/gardener-extension-audit/gardener-extension-audit.md b/docs/08-References/Gardener/gardener-extension-audit/gardener-extension-audit.md similarity index 100% rename from docs/docs/08-References/Gardener/gardener-extension-audit/gardener-extension-audit.md rename to docs/08-References/Gardener/gardener-extension-audit/gardener-extension-audit.md diff --git a/docs/docs/08-References/Gardener/gardener-extension-csi-driver-lvm/gardener-extension-csi-driver-lvm.md b/docs/08-References/Gardener/gardener-extension-csi-driver-lvm/gardener-extension-csi-driver-lvm.md similarity index 100% rename from docs/docs/08-References/Gardener/gardener-extension-csi-driver-lvm/gardener-extension-csi-driver-lvm.md rename to docs/08-References/Gardener/gardener-extension-csi-driver-lvm/gardener-extension-csi-driver-lvm.md diff --git a/docs/docs/08-References/Gardener/gardener-extension-csi-driver-lvm/migration.md b/docs/08-References/Gardener/gardener-extension-csi-driver-lvm/migration.md similarity index 100% rename from docs/docs/08-References/Gardener/gardener-extension-csi-driver-lvm/migration.md rename to docs/08-References/Gardener/gardener-extension-csi-driver-lvm/migration.md diff --git 
a/docs/docs/08-References/Gardener/gardener-extension-ontap/gardener-extension-ontap.md b/docs/08-References/Gardener/gardener-extension-ontap/gardener-extension-ontap.md similarity index 100% rename from docs/docs/08-References/Gardener/gardener-extension-ontap/gardener-extension-ontap.md rename to docs/08-References/Gardener/gardener-extension-ontap/gardener-extension-ontap.md diff --git a/docs/docs/08-References/Gardener/gardener-vpn-gateway/gardener-vpn-gateway.md b/docs/08-References/Gardener/gardener-vpn-gateway/gardener-vpn-gateway.md similarity index 100% rename from docs/docs/08-References/Gardener/gardener-vpn-gateway/gardener-vpn-gateway.md rename to docs/08-References/Gardener/gardener-vpn-gateway/gardener-vpn-gateway.md diff --git a/docs/docs/08-References/Gardener/os-metal-extension/os-metal-extension.md b/docs/08-References/Gardener/os-metal-extension/os-metal-extension.md similarity index 100% rename from docs/docs/08-References/Gardener/os-metal-extension/os-metal-extension.md rename to docs/08-References/Gardener/os-metal-extension/os-metal-extension.md diff --git a/docs/docs/08-References/Kubernetes/cluster-api-provider-metal-stack/DEVELOPMENT.md b/docs/08-References/Kubernetes/cluster-api-provider-metal-stack/DEVELOPMENT.md similarity index 100% rename from docs/docs/08-References/Kubernetes/cluster-api-provider-metal-stack/DEVELOPMENT.md rename to docs/08-References/Kubernetes/cluster-api-provider-metal-stack/DEVELOPMENT.md diff --git a/docs/docs/08-References/Kubernetes/cluster-api-provider-metal-stack/cluster-api-provider-metal-stack.md b/docs/08-References/Kubernetes/cluster-api-provider-metal-stack/cluster-api-provider-metal-stack.md similarity index 100% rename from docs/docs/08-References/Kubernetes/cluster-api-provider-metal-stack/cluster-api-provider-metal-stack.md rename to docs/08-References/Kubernetes/cluster-api-provider-metal-stack/cluster-api-provider-metal-stack.md diff --git 
a/docs/docs/08-References/Kubernetes/droptailer/droptailer.md b/docs/08-References/Kubernetes/droptailer/droptailer.md similarity index 100% rename from docs/docs/08-References/Kubernetes/droptailer/droptailer.md rename to docs/08-References/Kubernetes/droptailer/droptailer.md diff --git a/docs/docs/08-References/Kubernetes/firewall-controller-manager/firewall-controller-manager.md b/docs/08-References/Kubernetes/firewall-controller-manager/firewall-controller-manager.md similarity index 100% rename from docs/docs/08-References/Kubernetes/firewall-controller-manager/firewall-controller-manager.md rename to docs/08-References/Kubernetes/firewall-controller-manager/firewall-controller-manager.md diff --git a/docs/docs/08-References/Kubernetes/firewall-controller/assets/architecture.drawio.svg b/docs/08-References/Kubernetes/firewall-controller/assets/architecture.drawio.svg similarity index 100% rename from docs/docs/08-References/Kubernetes/firewall-controller/assets/architecture.drawio.svg rename to docs/08-References/Kubernetes/firewall-controller/assets/architecture.drawio.svg diff --git a/docs/docs/08-References/Kubernetes/firewall-controller/firewall-controller.md b/docs/08-References/Kubernetes/firewall-controller/firewall-controller.md similarity index 100% rename from docs/docs/08-References/Kubernetes/firewall-controller/firewall-controller.md rename to docs/08-References/Kubernetes/firewall-controller/firewall-controller.md diff --git a/docs/docs/08-References/Kubernetes/metal-ccm/metal-ccm.md b/docs/08-References/Kubernetes/metal-ccm/metal-ccm.md similarity index 100% rename from docs/docs/08-References/Kubernetes/metal-ccm/metal-ccm.md rename to docs/08-References/Kubernetes/metal-ccm/metal-ccm.md diff --git a/docs/docs/08-References/Monitoring/metal-metrics-exporter/metal-metrics-exporter.md b/docs/08-References/Monitoring/metal-metrics-exporter/metal-metrics-exporter.md similarity index 100% rename from 
docs/docs/08-References/Monitoring/metal-metrics-exporter/metal-metrics-exporter.md rename to docs/08-References/Monitoring/metal-metrics-exporter/metal-metrics-exporter.md diff --git a/docs/docs/08-References/Monitoring/nftables-exporter/nftables-exporter.md b/docs/08-References/Monitoring/nftables-exporter/nftables-exporter.md similarity index 100% rename from docs/docs/08-References/Monitoring/nftables-exporter/nftables-exporter.md rename to docs/08-References/Monitoring/nftables-exporter/nftables-exporter.md diff --git a/docs/docs/08-References/Monitoring/rethinkdb-exporter/assets/grafana.png b/docs/08-References/Monitoring/rethinkdb-exporter/assets/grafana.png similarity index 100% rename from docs/docs/08-References/Monitoring/rethinkdb-exporter/assets/grafana.png rename to docs/08-References/Monitoring/rethinkdb-exporter/assets/grafana.png diff --git a/docs/docs/08-References/Monitoring/rethinkdb-exporter/rethinkdb-exporter.md b/docs/08-References/Monitoring/rethinkdb-exporter/rethinkdb-exporter.md similarity index 100% rename from docs/docs/08-References/Monitoring/rethinkdb-exporter/rethinkdb-exporter.md rename to docs/08-References/Monitoring/rethinkdb-exporter/rethinkdb-exporter.md diff --git a/docs/docs/08-References/Partition/go-hal/go-hal.md b/docs/08-References/Partition/go-hal/go-hal.md similarity index 100% rename from docs/docs/08-References/Partition/go-hal/go-hal.md rename to docs/08-References/Partition/go-hal/go-hal.md diff --git a/docs/docs/08-References/Partition/metal-bmc/metal-bmc.md b/docs/08-References/Partition/metal-bmc/metal-bmc.md similarity index 100% rename from docs/docs/08-References/Partition/metal-bmc/metal-bmc.md rename to docs/08-References/Partition/metal-bmc/metal-bmc.md diff --git a/docs/docs/08-References/Partition/metal-core/metal-core.md b/docs/08-References/Partition/metal-core/metal-core.md similarity index 100% rename from docs/docs/08-References/Partition/metal-core/metal-core.md rename to 
docs/08-References/Partition/metal-core/metal-core.md diff --git a/docs/docs/08-References/Partition/metal-hammer/metal-hammer.md b/docs/08-References/Partition/metal-hammer/metal-hammer.md similarity index 100% rename from docs/docs/08-References/Partition/metal-hammer/metal-hammer.md rename to docs/08-References/Partition/metal-hammer/metal-hammer.md diff --git a/docs/docs/08-References/Partition/pixie/pixie.md b/docs/08-References/Partition/pixie/pixie.md similarity index 100% rename from docs/docs/08-References/Partition/pixie/pixie.md rename to docs/08-References/Partition/pixie/pixie.md diff --git a/docs/docs/08-References/Storage/csi-driver-lvm/csi-driver-lvm.md b/docs/08-References/Storage/csi-driver-lvm/csi-driver-lvm.md similarity index 100% rename from docs/docs/08-References/Storage/csi-driver-lvm/csi-driver-lvm.md rename to docs/08-References/Storage/csi-driver-lvm/csi-driver-lvm.md diff --git a/docs/docs/08-References/Storage/duros-controller/MULTITENANCY.md b/docs/08-References/Storage/duros-controller/MULTITENANCY.md similarity index 100% rename from docs/docs/08-References/Storage/duros-controller/MULTITENANCY.md rename to docs/08-References/Storage/duros-controller/MULTITENANCY.md diff --git a/docs/docs/08-References/Storage/duros-controller/assets/architecture.drawio.svg b/docs/08-References/Storage/duros-controller/assets/architecture.drawio.svg similarity index 100% rename from docs/docs/08-References/Storage/duros-controller/assets/architecture.drawio.svg rename to docs/08-References/Storage/duros-controller/assets/architecture.drawio.svg diff --git a/docs/docs/08-References/Storage/duros-controller/assets/dataplane.drawio.svg b/docs/08-References/Storage/duros-controller/assets/dataplane.drawio.svg similarity index 100% rename from docs/docs/08-References/Storage/duros-controller/assets/dataplane.drawio.svg rename to docs/08-References/Storage/duros-controller/assets/dataplane.drawio.svg diff --git 
a/docs/docs/08-References/Storage/duros-controller/assets/nvme-over-tcp.jpg b/docs/08-References/Storage/duros-controller/assets/nvme-over-tcp.jpg similarity index 100% rename from docs/docs/08-References/Storage/duros-controller/assets/nvme-over-tcp.jpg rename to docs/08-References/Storage/duros-controller/assets/nvme-over-tcp.jpg diff --git a/docs/docs/08-References/Storage/duros-controller/duros-controller.md b/docs/08-References/Storage/duros-controller/duros-controller.md similarity index 100% rename from docs/docs/08-References/Storage/duros-controller/duros-controller.md rename to docs/08-References/Storage/duros-controller/duros-controller.md diff --git a/docs/contributing/04-Proposals/MEP18/README.md b/docs/contributing/04-Proposals/MEP18/README.md deleted file mode 100644 index 9c02c0b7..00000000 --- a/docs/contributing/04-Proposals/MEP18/README.md +++ /dev/null @@ -1,147 +0,0 @@ ---- -slug: /MEP-18-autonomous-control-plane -title: MEP-18 -sidebar_position: 18 ---- - -# Autonomous Control Plane - -As described in the [deployment chapter](../../../docs/04-For%20Operators/03-deployment-guide.mdx), we strongly recommend Kubernetes as the target platform for running the metal-stack control plane. - -Kubernetes clusters for this purpose are readily available from hyperscalers, metalstack.cloud, or other cloud providers. Simply using a managed Kubernetes cluster greatly simplifies a metal-stack installation. However, sometimes it might be desirable to host the metal-stack control plane autonomously, without the help of another cloud provider. Reasons for this might include corporate policies that prohibit the use of external data center products, or network constraints. 
- -The Kubernetes cluster hosting the metal-stack control plane must provide at least the following features: - -- Load balancing (for exposing the APIs) -- Persistent storage (for the databases and key-value stores) -- Access to object storage for automated backups of the stateful sets -- Access to a DNS provider supported by one of the used DNS extensions -- Externally accessible DNS records for obtaining officially signed certificates through DNS challenges - -This metal-stack control plane cluster must also be highly available to prevent a complete loss of control over the managed resources in the data center. -Regular Kubernetes updates to apply security fixes and feature updates must be possible in an automated manner. The Day-2 operational overhead of running this cluster in your own datacenter must be reasonable. - -In this chapter, we propose a solution for setting up a metal-stack environment with an autonomous control plane that is independent of another cloud provider. - -## Use Your Own Dogfood - -The most obvious solution is to just deploy a Kubernetes cluster manually in your own data center by utilizing existing tooling for the deployment: - -- k3s -- kubeadm -- vmware and rancher -- talos -- kubespray -- ... (not a complete list) - -However, all these solutions add another layer of complexity that needs to be maintained and operated by people who also need to learn and understand metal-stack. In general, metal-stack in combination with [Gardener](https://gardener.cloud) contains all the necessary tools to provide KaaS, so it makes sense to reuse what is already in place without introducing new dependencies on other products and vendors. - -The only problem here is that Gardener is not yet able to create an initial cluster, which may change with the implementation of [GEP-28](https://github.com/gardener/gardener/blob/master/docs/proposals/28-autonomous-shoot-clusters.md). 
In the meantime, we suggest using [k3s](https://k3s.io/), which manages the initial metal-stack partition to host the control plane, since the maintenance overhead is acceptable and it is easy to deploy. - -## The Matryoshka Principle - -Instead of directly using the K3s cluster for the production control plane, we propose using it as a minimal control plane cluster which only purpose is to host the production control plane cluster. This layer of indirection brings some reasonable advantages: - -- In the event of an interruption or loss of this minimal control plane cluster, the production control plane remains unaffected, and end users can continue to manage their clusters as normal. -- A dedicated operations team can take care of the Day-2 maintenance of this installation, which can be handy because the tools like k3s are a little different from the rest of the setup (it is likely that more manual maintenance is required than for any other cluster). This would also be true if the initial cluster problem would be solved by the Gardener itself and not using k3s. -- Since the number of shoot clusters to host is static, the resource requirements are minimal and will not change significantly over time. There are no huge resource requirements in terms of cpu, memory and storage. As such, the lack of scalability is not such a big issue. - -So, our proposal is to chain two metal-stack control planes. The initial control plane cluster would use k3s and on this cluster we can spin up a cluster for the production control plane with the use of Gardener. - -The following figure shows how the high-level architecture of this setup looks like. A even more simplified illustration of this setup can be looked up in the appendix[^1]. - -![Autonomous Control Plane Architecture](./autonomous-control-plane-images/metal-stack-autonomous-control-plane-full.svg) - -The k3s nodes can either be bare metal machines or virtual machines. 
When using VMs a single k3s node might be a viable solution, too. These nodes are supposed to be setup manually / partly automated with an operating system like Debian. - -To name the cluster that hosts the initial metal-stack control plane and Gardener we use the term _initial cluster_. The initial cluster creates worker nodes to host the _target cluster_. - -## Initial Cluster - -The initial cluster is kept very small. The physical bare metal machines can be any machines and switches which are supported by metal-stack, but can be smaller in terms of cpu, memory and network speed because these machines must only be capable of running the target cluster for the metal-stack control plane. A typical single socket server with 8-16 cores and 64GB of RAM and two NVMe drives of 1TB would be a good starting point. - -In a typical k3s setup, a stateful set would lose the data once the k3s cluster was terminated and started again. But there is a possibility to define parts of the local storage of the server to be provided to the k3s cluster for the PVCs. With that, k3s could be terminated and started again, for example to update and reboot the host os, or update k3s itself and the data will persist. - -Example k3s configuration for persistent storage on the hosts os: - -```yaml -k3s: Cluster -apiVersion: k3s.x-k8s.io/v1alpha4 -name: needle-control-plane -nodes: - - role: control-plane - # add a mount from /path/to/my/files on the host to /files on the node - extraMounts: - - hostPath: /path/to/my/files - containerPath: /files -``` - -Into this cluster metal-stack and Gardener will be deployed. This deployment can be done by a Gitlab runner which is running on this machine. -The mini-lab will be used as a base for this deployment. 
The current development of [gardener-in-minilab](https://github.com/metal-stack/mini-lab/pull/202) must be extended to host all required extensions to make this a working metal-stack control plane which can manage the machines in the attached bare metal setup. - -In addition to the metal-stack and Gardener deployment, some additional required services are deployed (non-complete list): - -- PowerDNS to serve as a DNS Server for all DNS entries used in the initial and the target cluster, like `api.initial.metal-stack.local`, `gardener-api.initial.metal-stack.local` and the DNS entries for the api servers of the created kubernetes clusters. -- NTP -- Monitoring for the initial cluster and partition -- Optional: OIDC Server for authenticating against the metal-api -- Optional: Container Registry to host all metal-stack and gardener containers -- Optional: Let's Encrypt [boulder](https://github.com/letsencrypt/boulder) as a certificate authority -- ... - -Physical view, minimal setup for a initial cluster with a single physical node: - -![Small Initial Cluster](autonomous-control-plane-images/small-initial-cluster.svg) - -Physical View, bigger ha setup which is spread across two data centers: - -![HA Initial Cluster](autonomous-control-plane-images/ha-initial-cluster.svg) - -### Control Plane High Availability - -Running the initial control plane on a single physical server is not as available as it should be in such a use case. It should be possible to survive a loss of this server, because the server could be lost by many events, such as hardware failure, disk corruption or even failure of the datacenter location where this server is deployed. - -Setting up a second server with the same software components is an option, but the problem of data redundancy must be solved, because neither the gardener control plane, nor the metal-stack control plane can be instantiated twice. 
- -Given that we provide part of the local storage of the server as backing storage for the stateful sets in the k3s cluster, the data stored on the server itself must be replicated to another server and backed up on a regular basis. - -The replication of ETCD can be achieved through [clustered configuration](https://docs.k3s.io/datastore/ha-embedded) of k3s. Components of metal-stack and Gardener can run standalone and already utilize backup-restore mechanism that must be configured accordingly. For two or more bare metal machine used for the initial cluster, a loadbalancing mechanism for the ingress is required. kube-vip could be a possible solution. - -For monitoring a backend like a Victoria Metrics Cluster would allow spearding the monitoring data across the initial cluster nodes. These metrics should also be backed up in object storage. - -### Partition - -The partition which is managed by the initial cluster can be a simple and small hardware setup but yet capable enough to host the target cluster. It would even be a good practice to create separate target clusters on the initial cluster, e.g. one for the metal-stack control plane and one for the Gardener (maybe one more for monitoring). - -It can follow the metal-stack minimal setup which provides about 8-16 small servers connected to a 1G/s or 10G/s network dataplane. Central storage is optional as the persistence of the services running in these clusters is always backed up to a central object storage. Operations would be much easier if a central storage is provided. - -## Target Cluster - -The target cluster is the metal-stack environment which serves for end-user production use, the control plane is running in a shoot hosted in the initial cluster. The seed(s) and shoot(s) for end-users are created on the machines provided by the target cluster. -These machines can be of a different type in terms of size, but more importantly, these machines are connected to another network dataplane. 
Also the management infrastructure is separated from the initial cluster management network. - -## Failure Scenarios - -Everything could fail, everything will fail at some point. But this must kept in mind and nothing bad should happen if only one component at a time fails. -If more than one fails, the restoration to a working state must be easily possible and well documented. - -To ensure all possible breakages are documented, we suggest writing a list which summarizes all failure scenarios that might occur including the remediation. - -Here is an example of how a scenario documentation could look like: - -**Scenario**: Initial cluster is gone, all machines have died -**Impact**: Management of the initial cluster infrastructure not possible anymore, the target cluster continues to run but cannot be managed because the API servers are gone. end-users are not affected by this incident. -**Remediation**: The initial cluster nodes must be provisioned from scratch and re-deployed through the CI mechanism. The backups of the stateful sets are automatically restored during this process. 
- -## Implementation - -As part of this proposal, we provide the following tools and integrations in order to setup an autonomous control plane: - -- Deployment roles for the services like PowerDNS and NTP for the initial cluster -- Stretch goal: Deployment role to setup k3s in clustered configuration for the initial cluster and update it -- Extend the Gardener on mini-lab integration to allow shoot creation in the mini-lab -- Steady integration of the setup (maybe something like [k3d](https://github.com/k3d-io/k3d) in the mini-lab) - -## Appendix - -[^1]: ![metal-stack-chain](autonomous-control-plane-images/metal-stack-chain.svg) diff --git a/docusaurus.config.ts b/docusaurus.config.ts index 6e982903..8cf29b22 100644 --- a/docusaurus.config.ts +++ b/docusaurus.config.ts @@ -74,6 +74,18 @@ const config: Config = { languages: ["en"], }, ], + [ + "@docusaurus/plugin-content-docs", + { + id: "community", + path: "community", + routeBasePath: "community", + sidebarPath: "./sidebars-community.ts", + editUrl: "https://github.com/metal-stack/website/tree/main/", + includeCurrentVersion: true, + lastVersion: undefined, // intentionally no version + }, + ], ], presets: [ @@ -81,7 +93,7 @@ const config: Config = { "classic", { docs: { - sidebarPath: "./sidebars.ts", + sidebarPath: "./sidebars-docs.ts", // Please change this to your repo. // Remove this to remove the "edit this page" links. 
editUrl: "https://github.com/metal-stack/website/tree/main/", @@ -115,10 +127,7 @@ const config: Config = { }, { label: "Community", - type: "doc", - // TODO: after next release change to: - // docId: "contributing/community", - docId: "contributing/contribution-guideline", + to: "/community", }, { to: "/blog", diff --git a/sidebars-community.ts b/sidebars-community.ts new file mode 100644 index 00000000..c95790cf --- /dev/null +++ b/sidebars-community.ts @@ -0,0 +1,24 @@ +import type { SidebarsConfig } from "@docusaurus/plugin-content-docs"; + +// This runs in Node.js - Don't use client-side code here (browser APIs, JSX...) + +/** + * Creating a sidebar enables you to: + - create an ordered group of docs + - render a sidebar for each doc of that group + - provide next/previous navigation + + The sidebars can be generated from the filesystem, or explicitly defined here. + + Create as many sidebars as you want. + */ +const sidebars: SidebarsConfig = { + community: [ + { + type: "autogenerated", + dirName: ".", + }, + ], +}; + +export default sidebars; diff --git a/sidebars.ts b/sidebars-docs.ts similarity index 82% rename from sidebars.ts rename to sidebars-docs.ts index 53e3d643..7148188a 100644 --- a/sidebars.ts +++ b/sidebars-docs.ts @@ -16,13 +16,7 @@ const sidebars: SidebarsConfig = { docs: [ { type: "autogenerated", - dirName: "docs", - }, - ], - contributing: [ - { - type: "autogenerated", - dirName: "contributing", + dirName: ".", }, ], }; diff --git a/src/css/custom.css b/src/css/custom.css index 5ee4be12..7d723f72 100644 --- a/src/css/custom.css +++ b/src/css/custom.css @@ -7,8 +7,8 @@ /* You can override the default Infima variables here. 
*/ @import "tailwindcss/theme.css"; @import "tailwindcss/utilities.css"; -@import url('./fonts/inter/inter-v12-latin.css'); -@import url('./fonts/space-grotesk/space-grotesk-v13-latin.css'); +@import url("./fonts/inter/inter-v12-latin.css"); +@import url("./fonts/space-grotesk/space-grotesk-v13-latin.css"); @custom-variant dark (&:where([data-theme=dark], [data-theme=dark] *)); @@ -56,11 +56,10 @@ html[data-theme="dark"] { --ifm-background-color: var(--color-neutral-950); --ifm-background-surface-color: var(--color-neutral-950); - } -html[data-theme="dark"] p img { - background-color:#f5f5f57f; +html[data-theme="dark"] p img { + background-color: #f5f5f57f; } body { @@ -120,8 +119,9 @@ p { @apply text-base text-neutral-500 dark:text-neutral-400 leading-relaxed; } -ul, ol { - @apply text-neutral-500 dark:text-neutral-400 ; +ul, +ol { + @apply text-neutral-500 dark:text-neutral-400; } a { @@ -224,8 +224,10 @@ footer { /* hide the navbar on non-doc pages including it's hoverable dropdown */ .plugin-pages #docs-version-dropdown, .plugin-blog #docs-version-dropdown, +.plugin-id-community #docs-version-dropdown, .plugin-pages .dropdown--hoverable:has(> #docs-version-dropdown), -.plugin-blog .dropdown--hoverable:has(> #docs-version-dropdown) { +.plugin-blog .dropdown--hoverable:has(> #docs-version-dropdown), +.plugin-id-community .dropdown--hoverable:has(> #docs-version-dropdown) { display: none; } @@ -240,4 +242,4 @@ footer { left: 0; width: 100%; height: 100%; -} \ No newline at end of file +} diff --git a/static/_redirects b/static/_redirects index 109db180..607a1b46 100644 --- a/static/_redirects +++ b/static/_redirects @@ -9,8 +9,20 @@ https://docs.metal-stack.io https://metal-stack.io/docs/home 301! https://docs.metal-stack.io/* https://metal-stack.io/:splat 301! 
/docs /docs/home -/docs/planning-meetings /docs/roadmap -/docs/next/planning-meetings /docs/next/roadmap + +# migrate community out from docs +/docs/planning-meetings /community/roadmap +/docs/:v/planning-meetings /community/roadmap +/docs/contribution-guideline /community/contribution-guideline +/docs/:v/contribution-guideline /community/contribution-guideline +/docs/release-flow /community/release-flow +/docs/:v/release-flow /community/release-flow +/docs/oci-artifacts /community/oci-artifacts +/docs/:v/oci-artifacts /community/oci-artifacts +/docs/enhancement-proposals /community/enhancement-proposals +/docs/:v/enhancement-proposals /community/enhancement-proposals +/docs/MEP-* /community/MEP-:splat +/docs/:v/MEP-* /community/MEP-:splat # migrate archived paths to stable versions quickly /stable/overview/* /docs/:splat 301 diff --git a/versioned_docs/version-v0.21.10/contributing/01-Proposals/MEP1/Distributed-API-Working.png b/versioned_docs/version-v0.21.10/contributing/01-Proposals/MEP1/Distributed-API-Working.png deleted file mode 100644 index 899e223d..00000000 Binary files a/versioned_docs/version-v0.21.10/contributing/01-Proposals/MEP1/Distributed-API-Working.png and /dev/null differ diff --git a/versioned_docs/version-v0.21.10/contributing/01-Proposals/MEP1/Distributed-API.png b/versioned_docs/version-v0.21.10/contributing/01-Proposals/MEP1/Distributed-API.png deleted file mode 100644 index 688c7c2e..00000000 Binary files a/versioned_docs/version-v0.21.10/contributing/01-Proposals/MEP1/Distributed-API.png and /dev/null differ diff --git a/versioned_docs/version-v0.21.10/contributing/01-Proposals/MEP1/Distributed-Deployment.png b/versioned_docs/version-v0.21.10/contributing/01-Proposals/MEP1/Distributed-Deployment.png deleted file mode 100644 index 8bba51b8..00000000 Binary files a/versioned_docs/version-v0.21.10/contributing/01-Proposals/MEP1/Distributed-Deployment.png and /dev/null differ diff --git 
a/versioned_docs/version-v0.21.10/contributing/01-Proposals/MEP1/Distributed.drawio b/versioned_docs/version-v0.21.10/contributing/01-Proposals/MEP1/Distributed.drawio deleted file mode 100644 index f7c6fe79..00000000 --- a/versioned_docs/version-v0.21.10/contributing/01-Proposals/MEP1/Distributed.drawio +++ /dev/null @@ -1 +0,0 @@ -7V3bcts2EP0aP8pDgPfH2ImbziRTt+m0zVMHpmCJMUUoFG1L/fqC4kUkAEoULwDkRC8WIBIkF2d3z+6C8JV5u9r+kqD18jOZ4+gKGvPtlfn+CtKPY9M/Wc8u7/FtM+9YJOE87wKHji/hf7joNIre53CON40DU0KiNFw3OwMSxzhIG30oSchr87BHEjWvukYLzHV8CVDE9/4dztNl3uvZxqH/Iw4Xy/LKwCh+eUDB0yIhz3FxvStoPu4/+c8rVI5VHL9Zojl5rXWZH67M24SQNP+22t7iKJNtKbb8vLuWX6v7TnCcdjkhWVoPJP5zF3/w/trOvG+vv70+z4rJe0HRMy4fw4noeDePhA5L5YmC/Afn+3N2pzcfcfSC0zBAh67s4dJd1DwuO3+22U/3O3oAgOtt/Qxnkf29R0kapiGJy6vS288vnP9eSK4aHFIhrrOvz6voLkEr+vXmdRmm+Ms6v89XClLat0xXEW0B+nU/QziTgUFb1SRkjYCswqD4HqEHHN1UU3pLIpLQn2ISZ9fYpAl5qvABise7Q6swymD/F07mKEZFd4FxkA2LonAR00ZAJwkn1RO94CTF29aJBBU8qNphssJpsqOHFCfMbLeAVKFywC0x+VpDcNm3rKHX9opOVGjNohr9gBz6pQDPGUACkEPSBif0OWkfHGEayyfzMqlWj2QaI4kUcCK1eJGajkikYASRrrboX/K79Qh+vYe77+Ef8RMBM4+TaECCp4SgYClDoJz0BDJuF6jFCNQ0O8oTGiPI055D4NsPc8+Yo0cAwMy0OJHhOfUDRZMk6ZIsSIyiD4dexnTUZDdHm+W+H3SwHNTE3YXZne5HwfH8Xea1souucZz3NH+v24+OZqZ1xrKHPDpfCY5QGr40naFI9PtT6a2jXe2ANQnjdFMb+T7rOMDAAoxaGdBvOqkzT6Bf8nsQn256LadXd7whz0mAi9MYQFVi6a+zrsCfDsTd4ZhPhKwL0H3DaborEICeU9LE5x50JcyCCG02mZ9rYBEcQ00upCOPWdAGOt4Cp0eOc8ZG4SCDypOdmkE1ADdTpVGlPGFNtTkTUuXQI/yYNTfUvobx4tO+9d50xrKeVhPHlsDBAyiwns5Uzsg5Krt2Dy9fdpUMVMgOuCh4QLZredg2fedhBji5MQT7bObckZJzBNt498OQ7HKm3Wm4jW0zXsbmEWaL6LdlTqWegLdesv1MC+Hp72T8SSgMxxkoN2yhquUYuZubjDP4nImgI6JohtahRmbVYsxqlcBR5pLKkPMtYb4U6uSgh23xmSTQlw9aRz3apJmNT5FGsIeedrA3j1ExzfMC0G6BnZS0gFieloCivcGYDXQN2oBeURu4mLCNtRXqozZwMWEbSy80kJ0ol6ModLv5GbqNgjIuza9B5OYp9zbjs1hJoRub7qVs4tqofWBzwKkp7UUEcmx6TD2hrQrkb0gDJqq/8PUSnk8r1IAKjXoHdWx2XQMVgAKuwerEoXLIhAd8dw3neBum/2R4vva8ovl137SL1vttgfZ9Y1dr3OMkpM+X+eWiNkmfNR8LOGW7NljWPIy2b+3qLXa8TurVllE/Hca4HVWw4fv5SS/7hmZc2KyxYzNoailNCkYymJHY2HhqNb/kDAS3MgFUpFBdDgL+IDkI2DUHAfXKQcCLyUFwp
WMABSuZJHu3i8lCcMVjHaR3Mg8hbY3mzxLyVCVkv3Miwp0MZ9omIg4UtuSsX2vkVsxfB/goVXXnAxGRxeMuImHBEzZDoCxybXKZDdRPV/rj3pSUsuBKz9Jxb15GmoKiTD/g84mKywn9gK9f5GfysbRy0zJF5FcuwD8Z+Zn22GZo2PzwkbmmkV/1opk+oYt5PGzWKPCzWFOrgfD4qPln/fnC4z4AtAv7LNEKddYB/Zilh6PpmNOOrGsKU1H9AVg+g6neBQhQJR0lMXhLVIGIN4QCVhuXwr9TKqTvIm2fzKdYGr6f1vqiZKXxdkPhT6h7f822Or/VZnXU7IEqa9sN/Hjsy9tTKxmfHvoJlrPB4koCi8nSf8Pyr53Dx5WLHZ75o3V4nacX6ewFT9ch0chWM6badQSW2pVpqUvd11DXpGbj7dELwS3qw754LjspafPhnobJeMC9YK88Jeko8Uo1j+Oe5XL0UzGna0RTsu7JKwSA8WU+u8fKxLs4OKauxrd1lk/zEElvFtpmv7k7OdCe0Hjm4WNLtc8Onwiu7POMzn2WW9DGTHM1U19Q6QCmVDMtWgS0D9mv12WmcfZwiiHS50+bWjOCtNglU1WgVRMWFMXpk70U4vBxOi8spERYM2hoJy1tV64McMqSVqFwhQ/ZvNfhsww6FuNN/RahlHekcxKUyxSrz4G6auOFqtGtejEtKZS31o0tfPVlhTPTsBlEWdkrTwda6HQyX+fuZMc/QQHa9hvltrLzGmc0t7IbbQM6vjKSJd6Uswb2VU31rMG9JELP5l3U83lXSYJSiRmdPPdosablaKDb1VTyywfdPgH0uZaSf5rjhpJbBi3HTvLhaNNOqglF2bWxUs2keGNnTk7Vvs7ti9+02dfZZsEld19noUQhJ1EVlvSsFZ+MBTwZ0gqfu2AmdVIqPM4aaGQHTc7RV1tHXu45ENoMvxRuZjJZRNo+c5KWew4SHrcBgHLRqdkEY0TtFhSRhMf5KrUb8gost1bYY3WK1NnJNxdUNT181nuaEvi4hhf4ULn1UJvTOrcG3r++PYoy+BehDNLy4sO0wWTLtOq1QZMdPUdELOjKnZUittxtUpkVgr2sEKjZoNKSyTBDnSd1aEDUK43DRheGb9cBcvL4MhvZdrx7/PjBWZ+jIq9ZBmo4E7KlkqHgNUH+2tGpF1rVcw4otfwoliX//6mUqH/FJdzuZKI3cxlT/btycqX5EMGmlhdKNaESrtl5lod67l5GnjPC7P+QZIuaFjx+xkRmmw8ML8Jss2km9Ua73GjuMhIgvWz7iMor2q7uCEDHXzbB/vMJg90zcrzFWWIB6OHjVUwpHDqlw/SUf39Kx1QYYMtr6oN/qDoI7ctPFJm4zpnhiUycrdrECZLOGubZ9FO0Mu/3hnxD18SwWtdwZNe+KdatjVws8UTAtaQCF7414JYb2p0mNbZK5Ir23dMXuRy38UT/JmAk5NJmQrLtlw6uLUHr5Wcyx9kR/wM= \ No newline at end of file diff --git a/versioned_docs/version-v0.21.10/contributing/01-Proposals/MEP1/Distributed.png b/versioned_docs/version-v0.21.10/contributing/01-Proposals/MEP1/Distributed.png deleted file mode 100644 index d96ca216..00000000 Binary files a/versioned_docs/version-v0.21.10/contributing/01-Proposals/MEP1/Distributed.png and /dev/null differ diff --git a/versioned_docs/version-v0.21.10/contributing/01-Proposals/MEP1/README.md 
b/versioned_docs/version-v0.21.10/contributing/01-Proposals/MEP1/README.md deleted file mode 100644 index 0fd4bb63..00000000 --- a/versioned_docs/version-v0.21.10/contributing/01-Proposals/MEP1/README.md +++ /dev/null @@ -1,141 +0,0 @@ ---- -slug: /MEP-1-distributed-metal-control-plane -title: MEP-1 -sidebar_position: 1 ---- - -# Distributed Metal Control Plane - -This enhancement proposal was replaced by [MEP18](../MEP18/README.md). - -## Problem Statement - -We face the situation that we argue for running bare metal on-premises because this way the customers can control where and how their software and data are processed and stored. -On the other hand, we have currently decided that our metal-api control plane components run on a kubernetes cluster (in our case on a cluster provided by one of the available hyperscalers). - -Running the control plane on Kubernetes has the following benefits: - -- Ease of deployment -- Get most, if not all, of the required infrastructure services like (probably incomplete): - - IPs - - DNS - - L7-Loadbalancing - - Storage - - S3 Backup - - High Availability - -Using a kubernetes as a service offering from one of the hyperscalers, enables us to focus on using kubernetes instead of maintaining it as well. - -## Goal - -It would be much saner if metal-stack has no, or only minimal dependencies to external services. Imagine a metal-stack deployment in a plant, it would be optimal if we only have to deliver a single rack with servers and networking gear installed and wired, plug that rack to the power supply and a internet uplink and its ready to go. - -Have a second plant which you want to be part of all your plants? Just tell both that they are part of something bigger and metal-api knows of two partitions. - -## Possible Solutions - -We can think of two different solutions to this vision: - -1. Keep the central control plane approach and require some sort of kubernetes deployment accessible from the internet. 
This has the downside that the user must, provide a managed kubernetes deployment in his own datacenter or uses a hyperscaler. Still not optimal. -1. Install the metal-api and all its dependencies in every partition, replicate or shard the databases to every connected partition, make them know each other. Connect the partitions over the internet with some sort of vpn to make the services visible to each other. - -As we can see, the first approach does not really address the problem, therefore i will describe solution #2 in more details. - -## Central/Current setup - -### Stateful services - -Every distributed system suffer from handling state in a scalable, fast and correct way. To start how to cope with the state, we first must identify which state can be seen as partition local only and which state must be synchronous for read, and synchronous for writes across partitions. - -Affected states: - -- masterdata: e.g. tenant and project must be present in every partition, but these are entities which are read often but updates are rare. A write can therefore be visible with a decent delay in a distinct partition with no consequences. -- ipam: the prefixes and ip´s allocated from machines. These entities are also read often and rare updates. But we must differentiate between dirty reads for different types. A machine network is partition local, ips acquired from such a network must by synchronous in the same partition. Ips acquired from global networks such as internet must by synchronous for all partitions, as otherwise a internet ip could be acquired twice. -- vrf ids: they must only be unique in one partition -- image and size configurations: read often, written seldom, so no high requirements on the storage of these entities. -- images: os images are already replicated from a central s3 storage to a per partition s3 service. metal-hammer kernel and initrd are small and pull always from the central s3, can be done similar to os images. 
-- machine and machine allocation: must be only synchronous in the partition -- switch: must be only synchronous in the partition -- nsq messages: do not need to cross partition boundaries. No need to keep the messages persistent, even the opposite is true, we don't want to have the messages persist for a longer period. - -Now we can see that the most critical state to held and synchronize are the IPAM data, because these entities must be guaranteed to be synchronously updated, while being updated frequently. - -Datastores: - -We use three different types of datastores to persist the states of the metal application. - -- rethinkdb is the main datastore for almost all entities managed by metal-api -- postgresql is used for masterdata and ipam data. -- nsq uses disk and memory tho store the messages. - -### Stateless services - -These are the easy part, all of our services which are stateless can be scaled up and down without any impact on functionality. Even the stateful services like masterdata and metal-api rely fully on the underlying datastore and can therefore also be scaled up and down to meet scalability requirements. - -Albeit, most of these services need to be placed behind a loadbalancer which does the L4/L7 balancing across the started/available replicas of the service for the clients talking to it. This is actually provided by kubernetes with either service type loadbalancer or type clusterip. - -One exception is the `metal-console` service which must have the partition in it´s dns name now, because there is no direct network connectivity between the management networks of the partitions. See "Network Setup) - -## Distributed setup - -### State - -In order to replicate certain data which must be available across all partitions we can use on of the existing open source databases which enable such kind of setup. There are a few available out there, the following incomplete list will highlight the pro´s and cons of each. 
- -- RethinkDB - - We already store most of our data in RethinkDB and it gives already the ability to synchronize the data in a distributed manner with different guarantees for consistency and latency. This is described here: [Scaling, Sharding and replication](https://rethinkdb.com/docs/sharding-and-replication/). But because rethinkdb has a rough history and unsure future with the last release took more than a year, we in the team already thought that we eventually must move away from rethinkdb in the future. - -- Postgresql - - Postgres does not have a multi datacenter with replication in both directions, it just can make the remote instance store the same data. - -- CockroachDB - - Is a Postgresql compatible database engine on the wire. CockroachDB gives you both, ACID and geo replication with writes allowed from all connected members. It is even possible to configure [Follow the Workload](https://www.cockroachlabs.com/docs/stable/topology-follow-the-workload) and [Geo Partitioning and Replication](https://www.cockroachlabs.com/docs/v19.2/topology-geo-partitioned-replicas). - -If we migrate all metal-api entities to be stored the same way we store masterdata, we could use cockroachdb to store all metal entities in one ore more databases spread across all partitions and still ensure consistency and high availability. - -A simple setup how this would look like is shown here. - -![Simple CockroachDB setup](Distributed.png) - -go-ipam was modified in a example PR here: [PR 17](https://github.com/metal-stack/go-ipam/pull/17) - -### API Access - -In order to make the metal-api accessible for api users like `cloud-api` or `metalctl` as easy at it is today, some effort has to be taken. One possible approach would be to use a external loadbalancer which spread the requests evenly to all metal-api endpoints in all partitions. 
Because all data are accessible from all partitions, a api request going to partition A with a request to create a machine in partition B, will still work. If on the other hand partition B is not in a connected state because the interconnection between both partitions is broken, then of course the request will fail. - -**IMPORTANT** -The NSQ Message to inform `metal-core` must end in the correct partition - -To provide such a external loadbalancer we have several opportunities: - -- Cloudflare or comparable CDN service. -- BGP Anycast from every partition - -Another setup would place a small gateway behind the metal-api address, which forwards to the metal-api in the partition where the request must be executed. This gateway, `metal-api-router` must inspect the payload, extract the desired partition, and forward the request without any modifications to the metal-api endpoint in this partition. This can be done for all requests, or if we want to optimize, only for write accesses. - -## Network setup - -In order to have the impact to the overall security concept as minimal as possible i would not modify the current network setup. The only modifications which has to be made are: - -- Allow https ingress traffic to all metal-api instances. -- Allow ssh ingress traffic to all metal-console instances. -- Allow CockroachDB Replication between all partitions. -- No NSQ traffic from outside required anymore, except we cant solve the topic above. - -A simple setup how this would look like is shown here, this does not work though because of the forementioned NSQ issue. - -![API and Console Access](Distributed-API.png) - -Therefore we need the `metal-api-router`: - -![Working API and Console Access](Distributed-API-Working.png) - -## Deployment - -The deployment of our components will substantially differ in a partition compared to a the deployment we have actually. 
Deploying it in kubernetes in the partition would be very difficult to achieve because we have no sane way to deploy kubernetes on physical machines without a underlying API. -I would therefore suggest to deploy our components in the same way we do that for the services running on the management server. Use systemd to start docker containers. - -![Deployment](Distributed-Deployment.png) diff --git a/versioned_docs/version-v0.21.10/contributing/01-Proposals/MEP10/README.md b/versioned_docs/version-v0.21.10/contributing/01-Proposals/MEP10/README.md deleted file mode 100644 index 6811cdc0..00000000 --- a/versioned_docs/version-v0.21.10/contributing/01-Proposals/MEP10/README.md +++ /dev/null @@ -1,197 +0,0 @@ ---- -slug: /MEP-10-sonic-support -title: MEP-10 -sidebar_position: 10 ---- - -# SONiC Support - -As writing this proposal, metal-stack only supports Cumulus on Broadcom ASICs. Unfortunately, after the acquisition of -Cumulus Networks by Nvidia, Broadcom decided to cut its relationship with Cumulus, and therefore Cumulus 4.2 is the last -version that supports Broadcom ASICs. Since trashing the existing hardware is not a solution, adding support for a -different network operating system is necessary. - -One of the remaining big players is [SONiC](https://sonic-net.github.io/SONiC/), which Microsoft created to scale the -network of Azure. It's an open-source project and is now part of the [Linux Foundation](https://www.linuxfoundation.org/press/press-release/software-for-open-networking-in-the-cloud-sonic-moves-to-the-linux-foundation). - -For a general introduction to SONiC, please follow the [Architecture](https://github.com/sonic-net/SONiC/wiki/Architecture) official -documentation. 
- -## ConfigDB - -On a cold start, the content of `/etc/sonic/config_db.json` will be loaded into the Redis database `CONFIG_DB`, and both -contain the switch's configuration except the BGP unnumbered configuration, which still has to be configured directly by -the frr configuration files. The SONiC community is working to remove this exception, but no release date is known. - -## BGP Configuration - -Frr runs inside a container, and a shell script configured it on the container startup. For BGP unnumbered, we must set -the configuration variable `docker_routing_config_mode` to `split` to prevent SONiC from overwriting our configuration -files created by `metal-core`. But by using the split mode, the integrated configuration mode of frr is deactivated, and -we have to write our BGP configuration to the daemon-specific files `bgp.conf`, `staticd.conf`, and `zebra.conf` instead -to `frr.conf`. - -```bash -elif [ "$CONFIG_TYPE" == "split" ]; then - echo "no service integrated-vtysh-config" > /etc/frr/vtysh.conf - rm -f /etc/frr/frr.conf -``` - -Reference: [docker-init](https://github.com/sonic-net/sonic-buildimage/blob/202205/dockers/docker-fpm-frr/docker_init.sh#L69) - -Adding support for the integrated configuration mode, we must at least adjust the startup shell script and the supervisor configuration: - -```bash -{% if DEVICE_METADATA.localhost.docker_routing_config_mode is defined and DEVICE_METADATA.localhost.docker_routing_config_mode == "unified" %} -[program:vtysh_b] -command=/usr/bin/vtysh -b -``` - -Reference: [supervisord.conf](https://github.com/sonic-net/sonic-buildimage/blob/202205/dockers/docker-fpm-frr/frr/supervisord/supervisord.conf.j2#L157) - -## Non-BGP Configuration - -For the Non-BGP configuration we have to write it into the Redis database directly or via one of the following interfaces: - -- `config replace ` -- the Mgmt Framework -- the SONiC restapi - -Directly writing into the Redis database isn't a stable interface, and we must determine 
the create, delete, and update -operations on our own. The last point is also valid for the Mgmt Framework and the SONiC restapi. Furthermore, the -Mgmt Framework doesn't start anymore for several months, and a [potential fix](https://github.com/sonic-net/sonic-buildimage/pull/10893) -is still not merged. And the SONiC restapi isn't enabled by default, and we must build and maintain our own SONiC images. - -Using `config replace` would reduce the complexity in the `metal-core` codebase because we don't have to determine the -actual changes between the running and the desired configuration. The approach's drawbacks are using a version of SONiC -that contains the PR [Yang support for VXLAN](https://github.com/sonic-net/sonic-buildimage/pull/7294), and we must provide -the whole new startup configuration to prevent unwanted deconfiguration. - -### Configure Loopback interface and activate VXLAN - -```json -{ - "LOOPBACK_INTERFACE": { - "Loopback0": {}, - "Loopback0|": {} - }, - "VXLAN_TUNNEL": { - "vtep": { - "src_ip": "" - } - } -} -``` - -#### Configure MTU - -```json -{ - "PORT": { - "Ethernet0": { - "mtu": "9000" - } - } -} -``` - -#### Configure PXE Vlan - -```json -{ - "VLAN": { - "Vlan4000": { - "vlanid": "4000" - } - }, - "VLAN_INTERFACE": { - "Vlan4000": {}, - "Vlan4000|": {} - }, - "VLAN_MEMBER": { - "Vlan4000|": { - "tagging_mode": "untagged" - } - }, - "VXLAN_TUNNEL_MAP": { - "vtep|map_104000_Vlan4000": { - "vlan": "Vlan4000", - "vni": "104000" - } - } -} -``` - -#### Configure VRF - -```json -{ - "INTERFACE": { - "Ethernet0": { - "vrf_name": "vrf104001" - } - }, - "VLAN": { - "Vlan4001": { - "vlanid": "4001" - } - }, - "VLAN_INTERFACE": { - "Vlan4001": { - "vrf_name": "vrf104001" - } - }, - "VRF": { - "vrf104001": { - "vni": "104001" - } - }, - "VXLAN_TUNNEL_MAP": { - "vtep|map_104001_Vlan4001": { - "vlan": "Vlan4001", - "vni": "104001" - } - } -} -``` - -## DHCP Relay - -The DHCP relay container only starts if `DEVICE_METADATA.localhost.type` is equal to 
`ToRRouter`. - -## LLDP - -SONiC always uses the local port subtype for LLDP and sets it to some freely configurable alias field of the interface. - -```python -# Get the port alias. If None or empty string, use port name instead -port_alias = port_table_dict.get("alias") -if not port_alias: - self.log_info("Unable to retrieve port alias for port '{}'. Using port name instead.".format(port_name)) - port_alias = port_name - -lldpcli_cmd = "lldpcli configure ports {0} lldp portidsubtype local {1}".format(port_name, port_alias) -``` - -Reference: [lldpmgr](https://github.com/sonic-net/sonic-buildimage/blob/202205/dockers/docker-lldp/lldpmgrd#L153) - -## Mgmt Interface - -The mgmt interface is `eth0`. To configure a static IP address and activate the Mgmt VRF, use: - -```json -{ - "MGMT_INTERFACE": { - "eth0|": { - "gwaddr": "" - } - }, - "MGMT_VRF_CONFIG": { - "vrf_global": { - "mgmtVrfEnabled": "true" - } - } -} -``` - -[IP forwarding is deactivated on `eth0`](https://github.com/sonic-net/sonic-buildimage/blob/202205/files/image_config/sysctl/sysctl-net.conf#L7), and no IP Masquerade is configured. diff --git a/versioned_docs/version-v0.21.10/contributing/01-Proposals/MEP11/README.md b/versioned_docs/version-v0.21.10/contributing/01-Proposals/MEP11/README.md deleted file mode 100644 index 87f48a10..00000000 --- a/versioned_docs/version-v0.21.10/contributing/01-Proposals/MEP11/README.md +++ /dev/null @@ -1,78 +0,0 @@ ---- -slug: /MEP-11-auditing-of-metal-stack-resources -title: MEP-11 -sidebar_position: 11 ---- - -# Auditing of metal-stack resources - -Currently no logs of the ownership of resources like machines, networks, ips and volumes are generated or kept. Though due to legal requirements data centers are required to keep track of this ownership over time to prevent liability issues when opening the platform for external users. 
- -In this proposal we want to introduce a flexible and low-maintenance approach for auditing on top of [Meilisearch](https://www.meilisearch.com/). - -## Overview - -In general our auditing logs will be collected by a request interceptor or middleware. Every request and response will be processed and eventually logged to Meilisearch. -Meilisearch will be configured to regularly create chunks of the auditing logs. These finished chunks will be backed up to a S3 compatible storage with a read-only option enabled. - -Of course sensitive data like session keys or passwords will be redacted before logging. We want to track relevant requests and responses. If auditing the request fails, the request itself will be aborted and will not be processed further. The requests and responses that will be audited will be annotated with a correlation id. - -Transferring the meilisearch auditing data chunks to the S3 compatible storage will be done by a sidecar cronjob that is executed periodically. -To avoid data manipulation the S3 compatible storage will be configured to be read-only. - -## Whitelisting - -To reduce the amount of unnecessary logs we want to introduce a whitelist of resources and operations on those that should be logged. -Other requests will be passed directly to the next middleware or web service without any further processing. - -As we are only interested in mutating endpoints, we ignore all `GET` requests. -The whitelist includes all `POST`, `PUT`, `PATCH` and `DELETE` endpoints of the HTTP middleware except for the following (non-manipulating) route suffixes: - -- `/find` -- `/notify` -- `/try` and `/match` -- `/capacity` -- `/from-hardware` - -Regarding GRPC audit trails, they are not so interesting because only internal clients are using this API. However, we can log the trails of the `Boot` service, which can be interesting to revise the machine lifecycle. - -## Chunking in Meilisearch - -We want our data to be chunked in Meilisearch. 
To accomplish this, we rotate the index identifier on a scheduled basis. The index identifiers will be derived from the current date and time. - -To keep things simple, we only support hourly, daily and monthly rotation. The eventually prefixed index names will only include relevant parts of date and time like `2021-01`, `2021-01-01` or `2021-01-01_13`. - -The metal-api will only write to the current index and switches to the new index on rotation. The metal-api will never read or update data in any indices. - -## Moving chunks to S3 compatible storage - -As Meilisearch will be filled with data over time, we want to move completed chunks to a S3 compatible storage. This will be done by a sidecar cronjob that is executed periodically. Note that the periods of the index rotation and the cronjob execution don't have to match. - -When the backup process gets started, it initiates a [Meilisearch dump](https://www.meilisearch.com/docs/learn/advanced/dumps) of the whole database across all indices. Once the returned task is finished, the dump must be copied from a Meilisearch volume to the S3 compatible storage. After a successful copy, the dump can be deleted. - -Now we want to remove all indices from Meilisearch, except the most recent one. For this, we [get all indices](https://www.meilisearch.com/docs/reference/api/indexes#list-all-indexes), sort them and [delete each index](https://www.meilisearch.com/docs/reference/api/indexes#delete-an-index) except the most recent one to avoid data loss. - -For the actual implementation, we can build upon [backup-restore-sidecar](https://github.com/metal-stack/backup-restore-sidecar). But due to the index rotation and the fact, that older indices need to be deleted, this probably does not fit into the mentioned sidecar. - -## S3 compatible storage - -The dumps of chunks should automatically deleted after a certain amount of time, once we are either no longer allowed or required to keep them. 
-The default retention time will be 6 months. Ideally already uploaded chunks should be read-only to prevent data manipulation. - -A candidate for the S3 compatible storage is Google Cloud Storage, which allows to configure automatic expiration of objects through a [lifecycle rule](https://cloud.google.com/storage/docs/managing-lifecycles?hl=en#storage-set-lifecycle-config-go). - -## Affected components - -- metal-api grpc server needs an auditing interceptor -- metal-api web server needs an auditing filter chain / middleware -- metal-api needs new command line arguments to configure the auditing -- mini-lab needs a Meilisearch instance -- mini-lab may need a local S3 compatible storage -- we need a sidecar to implement the backup to S3 compatible storage -- Consider auditing of volume allocations and freeings outside of metal-stack - -## Alternatives considered - -Instead of using Meilisearch we investigated using an immutable database like [immudb](https://immudb.io/). But immudb does not support chunking of data and due to its immutable nature, we will never be able to free up space of expired data. Even if we are legally allowed or required to delete data, we will not be able to do so with immudb. - -In another variant of the Meilisearch approach the metal-api would also be responsible for copying chunks to the S3 compatible storage and deleting old indices. But separating the concerns allows completely different implementations for every deployment stage. 
diff --git a/versioned_docs/version-v0.21.10/contributing/01-Proposals/MEP12/README.md b/versioned_docs/version-v0.21.10/contributing/01-Proposals/MEP12/README.md deleted file mode 100644 index 65532c57..00000000 --- a/versioned_docs/version-v0.21.10/contributing/01-Proposals/MEP12/README.md +++ /dev/null @@ -1,34 +0,0 @@ ---- -slug: /MEP-12-rack-spreading -title: MEP-12 -sidebar_position: 12 ---- - -# Rack Spreading - -Currently, when creating a machine through the metal-api, the machine is placed randomly inside a partition. This algorithm does not consider spreading machines across different racks and different chassis. This may lead to the situation that a group of machines (that for example form a cluster) can end up being placed in the same rack and the same chassis. - -Spreading a group of machines across racks can enhance availability for scenarios like a rack losing power or a chassis meltdown. - -So, instead of just randomly deciding the placement of a machine candidate, we want to propose a placement strategy that attempts to spread machine candidates across the racks inside a partition. - -Furthermore a followup improvement to guarantee that machines are really spread across multiple racks, even if multiple machines are ordered in parallel, was implemented with [PR490](https://github.com/metal-stack/metal-api/pull/490). - -## Placement Strategy - -Machines in the project are spread across all available racks evenly within a partition (best effort). For this, an additional request to the datastore has to be made in order to find allocated machines within the project in the partition. - -The algorithm will then figure out the least occupied racks and elect a machine candidate randomly from those racks. - -The user can optionally pass placement tags which will be considered for spreading the machines as well (this will for example allow spreading by a cluster id tag inside the same project). 
- -## API - -```golang -// service/v1/machine.go - -type MachineAllocation struct { - // existing fields are omitted for readability - PlacementTags []string `json:"placement_tags" description:"by default machines are spread across the racks inside a partition for every project. if placement tags are provided, the machine candidate has an additional anti-affinity to other machines having the same tags"` -} -``` diff --git a/versioned_docs/version-v0.21.10/contributing/01-Proposals/MEP13/README.md b/versioned_docs/version-v0.21.10/contributing/01-Proposals/MEP13/README.md deleted file mode 100644 index 2dde20f5..00000000 --- a/versioned_docs/version-v0.21.10/contributing/01-Proposals/MEP13/README.md +++ /dev/null @@ -1,111 +0,0 @@ ---- -slug: /MEP-13-dual-stack-support -title: MEP-13 -sidebar_position: 13 ---- - -# Dual-stack Support - -dual-stack support is required to be able to create Kubernetes clusters with either IPv6 single-stack or dual-stack enabled. -With the inherent scarcity of IPv4 addresses, the need to be able to use IPv6 has increased. - -Full IPv6 dual-stack support was added to Kubernetes with v1.23 as stable. - -Gardeners have had full IPv6 dual-stack support since `v1.109`. - -metal-stack manages CIDRs and IP addresses with the [go-ipam](https://github.com/metal-stack/go-ipam) library, which already got full IPv6 support in 2021 (see [https://metal-stack.io/blog/2021/02/ipv6-part1](https://metal-stack.io/blog/2021/02/ipv6-part1)). -But this was only the foundation, more work needs to be done to get full IPv6 support for all aspects managed by metal-stack.io. - -## General Decisions - -For the general decision we do not look at the isolated clusters feature for now as this would make the solution even more complex and we want to introduce IPv6 in smaller steps to the users. - -### Networks - -Currently, metal-stack organizes CIDRs / prefixes into a `network' resource in the metal-api. 
A network can consist of multiple CIDRs from the same address family. For example, if an operator wants to provide Internet connectivity to provisioned machines, they can start with small network CIDRs. The number of managed network prefixes can then be expanded as needed over time. - -With dual-stack we have to choose between two options: Network per address family or networks with both address families. These options are described in the next section. - -#### Network per Address Family - -This means that we allow networks with CIDRs from one address family only, one for IPv4 and one for IPv6. - -The machine creation process will not change if the machine only needs to be either IPv4 or IPv6 addressable. -But if on the other side, the machine need to be able to connect to both address families, the machine creation needs to specify two networks, one for IPv4 and one for IPv6. -Also there will be 2 distinct VRF IDs for every network with a different address family. - -#### Network with both Address Families - -Make a network dual address family capable, meaning that you can add multiple cidrs from both address families to a network. -Then the machine creation will remain the same for single-stack and dual-stack cases, but the ip address allocation will need to specify the address family from which to allocate an ip address when the network is dual-stack. -This does not break the existing API, but allows existing extensions to easily add dual-stack support. -To avoid additional checking of which address families are available on this network during an ip allocation call, we could store the address families in the network. - -#### Decision - -The decision was made to go with the having both address families in a single network entity because we think this is the most flexible way to support dual-stack machines and Kubernetes clusters as well as single-stack with the least amount of modifications on the networking side. 
- -### Examples - -To illustrate the the usage we start by creating a tenant super network which has both address families: - -```yaml ---- -id: tenant-super-network-mini-lab -name: Project Super Network -description: Super network of all project networks -partitionid: mini-lab -prefixes: - - 10.0.0.0/16 - - 2001:db8:0:10::/64 -defaultchildprefixlength: - IPv4: 22 - IPv6: 96 -privatesuper: true -``` - -In order to create this network, we simple call: - -```bash -metalctl network create -f tenant-super.yaml -``` - -This is usually done during the initial setup of the environment. - -Next step is to allocate a tenant network where the machines of a project can be placed: - -```bash -metalctl network allocate --partition mini-lab --project 4b9b17c4-2d7c-4190-ae95-dda44e430fa6 --name my-node-network -``` - -This leads to the following network allocation: - -```yaml -id: 2d2c0350-3f66-4597-ae97-ef6797232212 -name: my-node-network -parentnetworkid: tenant-super-network-mini-lab -partitionid: mini-lab -prefixes: - - 10.0.0.0/22 - - 2001:db8:0:10::/96 -projectid: 4b9b17c4-2d7c-4190-ae95-dda44e430fa6 -vrf: 20 -consumption: - ipv4: - available_ips: 1024 - available_prefixes: 256 - used_ips: 2 - used_prefixes: 0 - ipv6: - available_ips: 2147483647 - available_prefixes: 1073741824 - used_ips: 1 - used_prefixes: 0 -privatesuper: false -``` - -Users can the create IP addresses from these child networks. By default, they retrieve an IPv4 address except a super network only consists of IPv6 prefixes. In the latter case the users acquire an IPv6 address. 
- -```bash -metalctl network ip create --network 2d2c0350-3f66-4597-ae97-ef6797232212 --project 4b9b17c4-2d7c-4190-ae95-dda44e430fa6 -``` diff --git a/versioned_docs/version-v0.21.10/contributing/01-Proposals/MEP14/README.md b/versioned_docs/version-v0.21.10/contributing/01-Proposals/MEP14/README.md deleted file mode 100644 index 47c06434..00000000 --- a/versioned_docs/version-v0.21.10/contributing/01-Proposals/MEP14/README.md +++ /dev/null @@ -1,36 +0,0 @@ ---- -slug: /MEP-14-independence-from-external-sources -title: MEP-14 -sidebar_position: 14 ---- - -# Independence from external sources - -In certain situations some customers may need to operate and create machines without making use of external services like DNS or NTP through the internet. To make this possible, all metal-stack components reaching external services need to be configurable with custom endpoints. - -So far, the following components have been identified as requiring changes: - -- pixiecore -- metal-hammer -- metal-images - -More components are likely to be added to the list during processing. -For DNS and NTP servers it should be possible to provide default values within a partition. They can either be inherited from machines and firewalls or overwritten with own ones. - -## pixiecore - -A NTP server endpoint need to be configured on the pixiecore. This can be achieved by providing it through environment variables on start up. - -## metal-hammer - -If using a self-deployed NTP server, also the metal-hammer need to be configured with it. For backward compatibility, default values from `pool.ntp.org` and `time.google.com` are used. - -## metal-images - -Configurations for the `metal-images` are different for machines and firewalls. - -## metalctl - -In order to pass DNS and NTP servers to partitions and machines while creating them, the flags `dnsservers` and `ntpservers` need to be added. 
The implementation of this MEP will make it possible for metal-stack to create and maintain machines without requiring an internet connection.
How this userdata has to look like is not documented and is just part of another project called the [firewall-controller-manager](https://github.com/metal-stack/firewall-controller-manager) (FCM), which task is to orchestrate rolling updates of firewall machines in a way that network traffic interruption is minimal when updating a firewall or applying a change to an immutable firewall configuration. - -In general, a firewall entity in metal-stack has similarities to the machine entity but it has a fundamental difference: A user gains ownership over a machine after provisioning. They can access it through SSH, modify it at will and this is completely wanted. For firewalls, however, we do not want a user to access the provisioned firewall as the firewall is a privileged part of the infrastructure with access to the underlay network. The underlay can not be tampered with at any given point in time by a user as it can destroy the entire network traffic flow inside a metal-stack partition. - -For this reason, we have a gap in the metal-stack project in terms of a missing solution for people who do not rely on the Gardener integration. We are basically leaving a user with the option to implement an orchestrated recreation of every possible change on the firewall to minimize traffic interruption for the machines sitting behind the firewall or re-implement the firewall-controller to how they want to use it for their use-case. - -Also we do not have a clear distinction in the API between user and metal-stack operator for firewalls. If a user would allocate a firewall it is also possible for the user to inject his own SSH keys and access the firewall and tamper with the underlay network. - -Parts of these problems are probably going to decrease with the work on [MEP-4](../MEP4/README.md) where there will be dedicated APIs for users and administrators of metal-stack including fine-grained access tokens. 
- -With this MEP we want to describe a way to improve this current situation and allow other users that do not rely on the Gardener integration – for whatever motivation they have – to adequately manage firewalls. For this, we propose an alternative configuration for the firewall-controller that is native to metal-stack and more independent of Gardener. - -## Proposal - -The central idea of this proposal is allowing the firewall-controller to use the metal-api as a configuration source. This should serve as an alternative strategy to the currently used FCM `Firewall` resource based approach in the Gardener use-case. -Updates of the firewall rules should be possible through the metal-api. - -The firewall-controller itself should now be able to decide which of the two main strategies should be used for the base configuration: a kubeconfig or the metal-api. This should be possible through a dedicated _firewall-controller-config_. - -Using this config will now allow operators to fine-tune the data sources for all of its dynamic configuration tasks independently. -For example the data source of the core firewall rules could be set either from the `Firewall` resource located in the Gardener `Seed` or the metal-apiserver node network entity, while the CWNPs should be fetched and applied from a given kubeconfig (the `Shoot` Kubeconfig in the Gardener case). -This configuration file is intended to be injected during firewall creation through the userdata along with potential source connection credentials. 
- -```yaml -# the name of the firewall, defaulted to the hostname -name: best-firewall-ever - -sources: - seed: - kubeconfig: /path/to/seed.yaml # current gardener behavior - namespace: shoot--proj--name - shoot: - kubeconfig: /path/to/shoot.yaml # current gardener behavior - namespace: firewall - metal: - url: https://metal-api - hmac: some-hmac - type: Metal-View - projectID: abc - static: - # static should mirror all information provided by the metal or seed/shoot sources - firewall: # optional - controllerURL: https://... - cwnp: - egress: [] - ingress: [] - -# all sub-controllers running on the firewall -# each can be configured independently -controllers: - # this is the base controller - firewall: - source: seed # or: metal, static - - # these are optional: when not provided, they are disabled - selfUpdate: - enabled: true - droptailer: - enabled: true - - # these are optional: when not provided, they are disabled - service: - source: shoot # or: metal, static - cwnp: - source: shoot # or: metal, static - monitor: - source: shoot # currently only shoot is supported -``` - -The existing behavior of the firewall-controller writing into `/etc/nftables/firewall-controller.v4` is not changed. The different controller configuration sources are internally treated in the same way as before. The `static` source can be used to prevent the firewall-controller from crashing and consistently providing a static ruleset. This might be interesting for metal-stack native use cases or environments where the metal-api cannot be accessed. - -There must be one central nftables-rule-file-controller that is notified and triggered by all other controllers that contribute to the nftables configuration. 
- -For example, in order to maintain the existing Gardener integration, the configuration file for the firewall-controller will look like this: - -```yaml -name: shoot--abc--cluster-firewall-def -sources: - seed: - kubeconfig: /etc/firewall-controller/seed.yaml - namespace: shoot--abc--cluster - shoot: - kubeconfig: /etc/firewall-controller/shoot.yaml - namespace: firewall - -controllers: - firewall: - source: seed - - selfUpdate: - enabled: true - droptailer: - enabled: true - - service: - source: shoot - cwnp: - source: shoot - monitor: - source: shoot -``` - -Plain metal-stack users might use a configuration like this: - -```yaml -name: best-firewall-ever - -sources: - metal: - url: https://metal-api - hmac: some-hmac - type: Metal-View - projectID: abc - -controllers: - firewall: - source: metal - selfUpdate: - enabled: true - droptailer: - enabled: true - - cwnp: - # firewall rules stored in firewall entity - # potential improvement would be to attach the rules to the node network entity - # be aware that the firewall and private networks are immutable - # eventually we introduce a firewall ruleset entity - source: metal -``` - -In highly restricted environments that cannot access metal-api the static source could be used: - -```yaml -name: most-restricted-firewall-ever - -sources: - static: - firewall: - controllerURL: https://... - cwnp: - egress: [] - ingress: [] - -controllers: - firewall: - source: static - - cwnp: - source: static -``` - -### Non-Goals - -- Resolving the missing differentiation between users and administrators by letting users pass userdata and SSH keys to the firewall creation. - - This is even more related to [MEP-4](../MEP4/README.md) than this MEP. 
- -### Advantages - -- Offers a native metal-stack solution that improves managing firewalls for users by adding dynamic reconfiguration through the metal-api - - e.g., in the mini-lab, users can now allocate a machine, then an IP address and announce this IP from the machine without having to re-create the firewall but by adding a firewall rule to the metal-api. -- Improve consistency throughout the API (firewall rules would reflect what is persisted in metal-api). -- Other providers like Cluster API can leverage this approach, too. -- It can contribute to solving the shoot migration issue (in Cluster API case the `clusterctl move` for firewall objects) - - For Gardener takes the seed out of the equation (of which the kubeconfig changes during shoot migration) - - However: Things like egress rules, rate limiting, etc. are currently not part of the firewall or network entity in the metal-api. These would need to be added to one of them. -- Potentially resolve the issue that end-users can manipulate accounting data of the firewall through the `FirewallMonitor` - - for this we would need to be able to report traffic data to metal-api - -### Caveats - -- Metal-View access is too broad for firewalls. Mitigated by [MEP-4](../MEP4/README.md). -- Polling of the firewall-controller is bad for performance. Mitigated by [MEP-4](../MEP4/README.md). - -### Firewall Controller Manager - -Currently the firewall-controller-manager expects the creators of a `FirewallDeployment` to use the defaulting webhook that is tailored to the Gardener integration in order to generate `Firewall.spec.userdata` or to override it manually. Currently `Firewall.spec.userdata` will never be set explicitly. - -Instead we'd like to propose `Firewall.spec.userdataContents` which will replace the old `userdata`-string by a typed data structure. The FCM will do the heavy lifting while the `FirewallDeployment` creator decides what should be configured. 
- -```yaml -kind: FirewallDeployment -spec: - template: - spec: - userdataContents: - - path: /etc/firewall-controller/config.yaml - content: | - --- - sources: - static: {} - controllers: - firewall: - source: static - - path: /etc/firewall-controller/seed.yaml - secretRef: - name: seed-kubeconfig - generateFirewallControllerKubeconfig: true - - path: /etc/firewall-controller/shoot.yaml - secretRef: - name: shoot-kubeconfig -``` - -### Gardener Extension Provider Metal Stack - -The GEPM should be migrated to the new `Firewall.spec.userdataContents` field. - -### Cluster API Provider Metal Stack - -![architectural overview](firewall-for-capms-overview.svg) - -In Cluster API there are essentially two main clusters: the management cluster and the workload cluster while the CAPMS takes in the role of the GEPM. -Typically a local bootstrap cluster is created in KinD which acts as the management cluster. It creates the workload cluster. Thereafter the ownership of the workload cluster is typically moved (using `clusterctl move`) to a different cluster which will then become the management cluster. -The new management cluster might actually be the workload cluster itself. - -In contrast to Gardener, Cluster API aims to be less opinionated and minimal. It is common practice to not install any non-required components or CRDs into the workload cluster by default. Therefore we cannot expect custom resources like `ClusterwideNetworkPolicy` or `FirewallMonitor` to be installed in the workload cluster but strongly recommend our users to do it. Therefore it's the responsibility of the operator to tell [cluster-api-provider-metal-stack](https://github.com/metal-stack/cluster-api-provider-metal-stack) the kubeconfig for the cluster where these CRDs are installed and defined in. 
- -A viable configuration for a `MetalStackCluster` that generates firewall rules based of `Service` type `LoadBalancer` and `ClusterwideNetworkPolicy` and expects them to be deployed in the workload cluster is shown below. The `FirewallMonitor` will be reported into the same cluster. - -```yaml -kind: MetalStackCluster -metadata: - name: ${CLUSTER_NAME} -spec: - firewallTemplate: - userdataContents: - - path: /etc/firewall-controller/config.yaml - secretName: ${CLUSTER_NAME}-firewall-controller-config - - - path: /etc/firewall-controller/workload.yaml - # this is the kubeconfig generated by kubeadm - secretName: ${CLUSTER_NAME}-kubeconfig ---- -kind: Secret -metadata: - name: ${CLUSTER_NAME}-firewall-controller-config -stringData: - controllerConfig: | - --- - name: ${CLUSTER_NAME}-firewall - - sources: - metal: - url: ${METAL_API_URL} - hmac: ${METAL_API_HMAC} - type: ${METAL_API_HMAC_TYPE} - projectID: ${METAL_API_PROJECT_ID} - shoot: - kubeconfig: /etc/firewall-controller/workload.yaml - namespace: firewall - - controllers: - firewall: - source: metal - selfUpdate: - enabled: true - droptailer: - enabled: true - - service: - source: shoot - cwnp: - source: shoot - monitor: - source: shoot -``` - -Here the firewall-controller-config will be referenced by the `MetalStackCluster` as a `Secret`. Please note that the `Secret`s in `userdataContents` will not be fetched and will directly be passed to the `FirewallDeployment`. At first the reconciliation of it in the FCM will fail due to the missing Kubeconfig secret. After the `MetalStackCluster` has been marked as ready, CAPI will create this missing secret. Effectively the firewall and initial control plane node should be created at the same time. - -This approach allows maximum flexibility as intended by Cluster API and is still able to provide robust rolling updates of firewalls. - -An advanced use case of this flexibility would be a management cluster, that is in charge of multiple workload clusters. 
Where one workload cluster acts as a monitoring or tooling cluster, receives logs and the firewall monitor for the other workload clusters. The CWNPs could be defined here, all in a separate namespace. - -#### Cluster API Caveats - -When the cluster is pivoted and reconciles its own firewall, a malfunctioning firewall prevents the cluster from self-healing and requires manual intervention by creating a new firewall. This is an inherent problem of the cluster-api approach. It can be circumvented by using an extra cluster to manage workload clusters. - -In the current form of this approach firewalls and therefore the firewall egress and ingress rules are managed by the cluster operators that manage the cluster-api resources. -Hence it will not be possible to gain a fine-grained control over every cluster operator's choices from a central ruleset at the level of metal-stack firewalls. -In case this control surfaces as a requirement, it would need to be implemented in a firewall external to metal-stack. - -## Roadmap - -In general this proposal is not thought to be implemented in one batch. Instead an incremental approach is required. - -1. Enhance firewall-controller - - - Reduce coupling between controllers - - Introduce controller config - - Abstract module to write into distinct nftable rules for every controller - - Implement `sources.static`, but not `sources.metal` - - GEPM should set `FirewallDeployment.spec.template.spec.userdataContents` - -2. Allow Cluster API to use the FCM with static ruleset - - - Add `firewall.metal-stack.io/paused` annotation (managed by CAPMS during `clusterctl move`, theoretically useful for Gardener shoot migration as well to avoid shallow deletion). - - Reconcile multiple `FirewallDeployment` resources across multiple namespaces. For Gardener the old behavior of reconciling only one namespace should persist. 
- - Allow setting the `firewall.metal-stack.io/no-controller-connection` annotation through the `FirewallDeployment` (either through the template or inheritance). - - Add `MetalStackCluster.spec.firewallTemplate`. - - Make `MetalStackCluster.spec.nodeNetworkID` optional if `spec.firewallTemplate` given. - -3. Add `sources.metal` as configuration option. - - - Allow updates of firewall rules in the metal-apiserver. - - Depends on [MEP-4](../MEP4/README.md) metal-apiserver progress - -4. Potentially migrate the GEPM to use `sources.metal` diff --git a/versioned_docs/version-v0.21.10/contributing/01-Proposals/MEP16/firewall-for-capms-overview.drawio b/versioned_docs/version-v0.21.10/contributing/01-Proposals/MEP16/firewall-for-capms-overview.drawio deleted file mode 100644 index faea3e3d..00000000 --- a/versioned_docs/version-v0.21.10/contributing/01-Proposals/MEP16/firewall-for-capms-overview.drawio +++ /dev/null @@ -1,4 +0,0 @@ - - - -
handles traffic
Firewall
Firewall Controller
node-exporter
nftables-exporter
droptailer-client
Workload Cluster
droptailer
Configures
Bootstrap or Management Cluster
reconcile
configures
reconcile
Cluster API Provider metal-stack
Metal Stack Cluster CRD
Firewall Deployment CRD
Firewall CRD
Firewall Set CRD
rec
reconcile
reconcile
Firewall Controller Manager
Metal Stack Machine CRD
manages
Admin
Kubeconfig FirewallMonitor
FirewallMonitor CRD
main metal-api
Firewall entity
kubeconfig CWNP
Clusterwide Network Policy CRD
base config
controllerConfig
user-defined
network rules
reports firewall
state
send firewall log lines
controllerConfig
controllerConfig
\ No newline at end of file diff --git a/versioned_docs/version-v0.21.10/contributing/01-Proposals/MEP16/firewall-for-capms-overview.svg b/versioned_docs/version-v0.21.10/contributing/01-Proposals/MEP16/firewall-for-capms-overview.svg deleted file mode 100644 index 853f8175..00000000 --- a/versioned_docs/version-v0.21.10/contributing/01-Proposals/MEP16/firewall-for-capms-overview.svg +++ /dev/null @@ -1 +0,0 @@ -
handles traffic
handles traffic
Firewall
Firewall
Firewall Controller
Firewall Controller
node-exporter
node-exporter
nftables-exporter
nftables-exporter
droptailer-client
droptailer-client
Workload Cluster
Workload Cluster
droptailer
droptailer
Configures
Configures
Bootstrap or Management Cluster
Bootstrap or Management Cluster
reconcile
reconcile
configures
configures
reconcile
reconcile
Cluster API Provider metal-stack
Cluster API Provider...
Metal Stack Cluster CRD
Metal Stack Cluster...
Firewall Deployment CRD
Firewall Deployment...
Firewall CRD
Firewall CRD
Firewall Set CRD
Firewall Set CRD
rec
rec
reconcile
reconcile
reconcile
reconcile
Firewall Controller Manager
Firewall Controller...
Metal Stack Machine CRD
Metal Stack Machine...
manages
manages
Admin
Admin
Kubeconfig FirewallMonitor
Kubeconfig FirewallMonitor
FirewallMonitor CRD
FirewallMonitor CRD
main metal-api
main metal-api
Firewall entity
Firewall entity
kubeconfig CWNP
kubeconfig CWNP
Clusterwide Network PolicyCRD
Clusterwide Network...
base config
base config
controllerConfig
controllerConfig
user-defined
network rules
user-defined...
reports firewall
state
reports firewall...
send firewall log lines
send firewall log lines
controllerConfig
controllerConfig
controllerConfig
controllerConfig
Text is not SVG - cannot display
\ No newline at end of file diff --git a/versioned_docs/version-v0.21.10/contributing/01-Proposals/MEP17/README.md b/versioned_docs/version-v0.21.10/contributing/01-Proposals/MEP17/README.md deleted file mode 100644 index 35f48970..00000000 --- a/versioned_docs/version-v0.21.10/contributing/01-Proposals/MEP17/README.md +++ /dev/null @@ -1,61 +0,0 @@ ---- -slug: /MEP-17-global-network-view -title: MEP-17 -sidebar_position: 17 ---- - -# Global Network View - -> [!IMPORTANT] -> This MEP assumes the implementation of the metal-apiserver as described by [MEP-4](../MEP4/README.md) which is currently work in progress. - -Having a complete view of the network topology is useful when working with deployments or troubleshooting connectivity issues. -Currently, the API doesn't know of any other switches than the leaf switches. -Information about all other switches and their connections must be gathered from Ansible inventories or by accessing the switches via SSH. -Documentation of each partition's network must be kept in-sync with all changes made to the deployment or cabling. -We would like to expand the API's knowledge of the network to the entire underlay including inter-switch connections as well as BGP statistics and health status. - -## Switch Types - -Registering a switch at the API is done by the metal-core. -Apart from that, it also reconciles port and FRR configuration to adapt to the machine provisioning cycle. -This reconfiguration is only necessary on the leaf switches. -To allow deploying the metal-core on other switches than leaves we need a way of telling it what type of switch it is running on so it can act accordingly. -On any non-leaf switches it will only register the switch and report statistic but not change any configuration. -Supported switch types are - -- `leaf` -- `spine` -- `exit` -- `mgmtleaf` -- `mgmtspine` - -## Network Topology - -All switches should periodically report their LLDP neighbors and port configuration. 
-This information can be used to quickly identify common network issues, like MTU mismatch or the like. -Ideally, there would be some graphical representation of the network topology containing only the most important information for a quick overview. -It should contain all switches and machines as nodes and all connections as edges of a graph. -Ports, VRFs, and maybe also IPs should be associated with a connection. - -Apart from the topology graph, there should be a way to display more detailed information about both ports of a connection, like - -- MTU -- speed -- IP -- UP/DOWN status -- VRF -- VLAN -- whether it participates in a BGP session - -## BGP Announcements - -The metal-core should collect all routes it knows about and send them to the API along with a timestamp. -Reported routes should be stored to a redis database along with the switch that reported them and the timestamp of the last time they were reported. -An expiration threshold should be defined and all expired routes should be cleaned up periodically. -Whenever new routes are reported they get merged into the existing ones by the strategy: - -- when new, just add -- when existing, update `last_announced` timestamp - -By querying the BGP announcements we can find out whether an allocated IP is still in use. diff --git a/versioned_docs/version-v0.21.10/contributing/01-Proposals/MEP18/README.md b/versioned_docs/version-v0.21.10/contributing/01-Proposals/MEP18/README.md deleted file mode 100644 index 9c02c0b7..00000000 --- a/versioned_docs/version-v0.21.10/contributing/01-Proposals/MEP18/README.md +++ /dev/null @@ -1,147 +0,0 @@ ---- -slug: /MEP-18-autonomous-control-plane -title: MEP-18 -sidebar_position: 18 ---- - -# Autonomous Control Plane - -As described in the [deployment chapter](../../../docs/04-For%20Operators/03-deployment-guide.mdx), we strongly recommend Kubernetes as the target platform for running the metal-stack control plane. 
- -Kubernetes clusters for this purpose are readily available from hyperscalers, metalstack.cloud, or other cloud providers. Simply using a managed Kubernetes cluster greatly simplifies a metal-stack installation. However, sometimes it might be desirable to host the metal-stack control plane autonomously, without the help of another cloud provider. Reasons for this might include corporate policies that prohibit the use of external data center products, or network constraints. - -The Kubernetes cluster hosting the metal-stack control plane must provide at least the following features: - -- Load balancing (for exposing the APIs) -- Persistent storage (for the databases and key-value stores) -- Access to object storage for automated backups of the stateful sets -- Access to a DNS provider supported by one of the used DNS extensions -- Externally accessible DNS records for obtaining officially signed certificates through DNS challenges - -This metal-stack control plane cluster must also be highly available to prevent a complete loss of control over the managed resources in the data center. -Regular Kubernetes updates to apply security fixes and feature updates must be possible in an automated manner. The Day-2 operational overhead of running this cluster in your own datacenter must be reasonable. - -In this chapter, we propose a solution for setting up a metal-stack environment with an autonomous control plane that is independent of another cloud provider. - -## Use Your Own Dogfood - -The most obvious solution is to just deploy a Kubernetes cluster manually in your own data center by utilizing existing tooling for the deployment: - -- k3s -- kubeadm -- vmware and rancher -- talos -- kubespray -- ... (not a complete list) - -However, all these solutions add another layer of complexity that needs to be maintained and operated by people who also need to learn and understand metal-stack. 
In general, metal-stack in combination with [Gardener](https://gardener.cloud) contains all the necessary tools to provide KaaS, so it makes sense to reuse what is already in place without introducing new dependencies on other products and vendors. - -The only problem here is that Gardener is not yet able to create an initial cluster, which may change with the implementation of [GEP-28](https://github.com/gardener/gardener/blob/master/docs/proposals/28-autonomous-shoot-clusters.md). In the meantime, we suggest using [k3s](https://k3s.io/), which manages the initial metal-stack partition to host the control plane, since the maintenance overhead is acceptable and it is easy to deploy. - -## The Matryoshka Principle - -Instead of directly using the K3s cluster for the production control plane, we propose using it as a minimal control plane cluster which only purpose is to host the production control plane cluster. This layer of indirection brings some reasonable advantages: - -- In the event of an interruption or loss of this minimal control plane cluster, the production control plane remains unaffected, and end users can continue to manage their clusters as normal. -- A dedicated operations team can take care of the Day-2 maintenance of this installation, which can be handy because the tools like k3s are a little different from the rest of the setup (it is likely that more manual maintenance is required than for any other cluster). This would also be true if the initial cluster problem would be solved by the Gardener itself and not using k3s. -- Since the number of shoot clusters to host is static, the resource requirements are minimal and will not change significantly over time. There are no huge resource requirements in terms of cpu, memory and storage. As such, the lack of scalability is not such a big issue. - -So, our proposal is to chain two metal-stack control planes. 
The initial control plane cluster would use k3s and on this cluster we can spin up a cluster for the production control plane with the use of Gardener. - -The following figure shows how the high-level architecture of this setup looks like. A even more simplified illustration of this setup can be looked up in the appendix[^1]. - -![Autonomous Control Plane Architecture](./autonomous-control-plane-images/metal-stack-autonomous-control-plane-full.svg) - -The k3s nodes can either be bare metal machines or virtual machines. When using VMs a single k3s node might be a viable solution, too. These nodes are supposed to be setup manually / partly automated with an operating system like Debian. - -To name the cluster that hosts the initial metal-stack control plane and Gardener we use the term _initial cluster_. The initial cluster creates worker nodes to host the _target cluster_. - -## Initial Cluster - -The initial cluster is kept very small. The physical bare metal machines can be any machines and switches which are supported by metal-stack, but can be smaller in terms of cpu, memory and network speed because these machines must only be capable of running the target cluster for the metal-stack control plane. A typical single socket server with 8-16 cores and 64GB of RAM and two NVMe drives of 1TB would be a good starting point. - -In a typical k3s setup, a stateful set would lose the data once the k3s cluster was terminated and started again. But there is a possibility to define parts of the local storage of the server to be provided to the k3s cluster for the PVCs. With that, k3s could be terminated and started again, for example to update and reboot the host os, or update k3s itself and the data will persist. 
- -Example k3s configuration for persistent storage on the hosts os: - -```yaml -k3s: Cluster -apiVersion: k3s.x-k8s.io/v1alpha4 -name: needle-control-plane -nodes: - - role: control-plane - # add a mount from /path/to/my/files on the host to /files on the node - extraMounts: - - hostPath: /path/to/my/files - containerPath: /files -``` - -Into this cluster metal-stack and Gardener will be deployed. This deployment can be done by a Gitlab runner which is running on this machine. -The mini-lab will be used as a base for this deployment. The current development of [gardener-in-minilab](https://github.com/metal-stack/mini-lab/pull/202) must be extended to host all required extensions to make this a working metal-stack control plane which can manage the machines in the attached bare metal setup. - -In addition to the metal-stack and Gardener deployment, some additional required services are deployed (non-complete list): - -- PowerDNS to serve as a DNS Server for all DNS entries used in the initial and the target cluster, like `api.initial.metal-stack.local`, `gardener-api.initial.metal-stack.local` and the DNS entries for the api servers of the created kubernetes clusters. -- NTP -- Monitoring for the initial cluster and partition -- Optional: OIDC Server for authenticating against the metal-api -- Optional: Container Registry to host all metal-stack and gardener containers -- Optional: Let's Encrypt [boulder](https://github.com/letsencrypt/boulder) as a certificate authority -- ... - -Physical view, minimal setup for a initial cluster with a single physical node: - -![Small Initial Cluster](autonomous-control-plane-images/small-initial-cluster.svg) - -Physical View, bigger ha setup which is spread across two data centers: - -![HA Initial Cluster](autonomous-control-plane-images/ha-initial-cluster.svg) - -### Control Plane High Availability - -Running the initial control plane on a single physical server is not as available as it should be in such a use case. 
It should be possible to survive a loss of this server, because the server could be lost by many events, such as hardware failure, disk corruption or even failure of the datacenter location where this server is deployed. - -Setting up a second server with the same software components is an option, but the problem of data redundancy must be solved, because neither the gardener control plane, nor the metal-stack control plane can be instantiated twice. - -Given that we provide part of the local storage of the server as backing storage for the stateful sets in the k3s cluster, the data stored on the server itself must be replicated to another server and backed up on a regular basis. - -The replication of ETCD can be achieved through [clustered configuration](https://docs.k3s.io/datastore/ha-embedded) of k3s. Components of metal-stack and Gardener can run standalone and already utilize backup-restore mechanism that must be configured accordingly. For two or more bare metal machine used for the initial cluster, a loadbalancing mechanism for the ingress is required. kube-vip could be a possible solution. - -For monitoring a backend like a Victoria Metrics Cluster would allow spearding the monitoring data across the initial cluster nodes. These metrics should also be backed up in object storage. - -### Partition - -The partition which is managed by the initial cluster can be a simple and small hardware setup but yet capable enough to host the target cluster. It would even be a good practice to create separate target clusters on the initial cluster, e.g. one for the metal-stack control plane and one for the Gardener (maybe one more for monitoring). - -It can follow the metal-stack minimal setup which provides about 8-16 small servers connected to a 1G/s or 10G/s network dataplane. Central storage is optional as the persistence of the services running in these clusters is always backed up to a central object storage. 
Operations would be much easier if a central storage is provided. - -## Target Cluster - -The target cluster is the metal-stack environment which serves for end-user production use, the control plane is running in a shoot hosted in the initial cluster. The seed(s) and shoot(s) for end-users are created on the machines provided by the target cluster. -These machines can be of a different type in terms of size, but more importantly, these machines are connected to another network dataplane. Also the management infrastructure is separated from the initial cluster management network. - -## Failure Scenarios - -Everything could fail, everything will fail at some point. But this must kept in mind and nothing bad should happen if only one component at a time fails. -If more than one fails, the restoration to a working state must be easily possible and well documented. - -To ensure all possible breakages are documented, we suggest writing a list which summarizes all failure scenarios that might occur including the remediation. - -Here is an example of how a scenario documentation could look like: - -**Scenario**: Initial cluster is gone, all machines have died -**Impact**: Management of the initial cluster infrastructure not possible anymore, the target cluster continues to run but cannot be managed because the API servers are gone. end-users are not affected by this incident. -**Remediation**: The initial cluster nodes must be provisioned from scratch and re-deployed through the CI mechanism. The backups of the stateful sets are automatically restored during this process. 
- -## Implementation - -As part of this proposal, we provide the following tools and integrations in order to setup an autonomous control plane: - -- Deployment roles for the services like PowerDNS and NTP for the initial cluster -- Stretch goal: Deployment role to setup k3s in clustered configuration for the initial cluster and update it -- Extend the Gardener on mini-lab integration to allow shoot creation in the mini-lab -- Steady integration of the setup (maybe something like [k3d](https://github.com/k3d-io/k3d) in the mini-lab) - -## Appendix - -[^1]: ![metal-stack-chain](autonomous-control-plane-images/metal-stack-chain.svg) diff --git a/versioned_docs/version-v0.21.10/contributing/01-Proposals/MEP18/autonomous-control-plane-images/ha-initial-cluster.drawio b/versioned_docs/version-v0.21.10/contributing/01-Proposals/MEP18/autonomous-control-plane-images/ha-initial-cluster.drawio deleted file mode 100644 index eafcb514..00000000 --- a/versioned_docs/version-v0.21.10/contributing/01-Proposals/MEP18/autonomous-control-plane-images/ha-initial-cluster.drawio +++ /dev/null @@ -1,535 +0,0 @@ - - - - - - - - - - - - - - - - - - - 1 - - - - - 2 - - - - - 3 - - - - - 4 - - - - - 5 - - - - - 6 - - - - - 7 - - - - - 8 - - - - - 9 - - - - - 10 - - - - - 11 - - - - - 12 - - - - - - - - - - - - - -
-
-
- internet-router-management -
-
-
-
- - internet-router-management - -
-
-
- - - - - - - - - -
-
-
- management-switch-and-server -
-
-
-
- - management-switch-and-server - -
-
-
- - - - - - - - - -
-
-
- spine01 -
-
-
-
- - spine01 - -
-
-
- - - - - - - - - -
-
-
- leaf01 -
-
-
-
- - leaf01 - -
-
-
- - - - - - - - - -
-
-
- leaf02 -
-
-
-
- - leaf02 - -
-
-
- - - - - - - - - - - - - -
-
-
- - mirocloud (initial cluster partition nodes) - -
-
-
-
- - mirocloud (initial cluster... - -
-
-
- - - - - - - - - - - - - -
-
-
- - Initial cluster node 01 - -
-
-
-
- - Initial cluster node 01 - -
-
-
- - - - - - - - - - - - - 1 - - - - - 2 - - - - - 3 - - - - - 4 - - - - - 5 - - - - - 6 - - - - - 7 - - - - - 8 - - - - - 9 - - - - - 10 - - - - - 11 - - - - - 12 - - - - - - - - - - - - - -
-
-
- internet-router-management -
-
-
-
- - internet-router-management - -
-
-
- - - - - - - - - -
-
-
- management-switch-and-server -
-
-
-
- - management-switch-and-server - -
-
-
- - - - - - - - - -
-
-
- spine02 -
-
-
-
- - spine02 - -
-
-
- - - - - - - - - -
-
-
- leaf03 -
-
-
-
- - leaf03 - -
-
-
- - - - - - - - - -
-
-
- leaf04 -
-
-
-
- - leaf04 - -
-
-
- - - - - - - - - - - - - -
-
-
- - Initial cluster node 02 - -
-
-
-
- - Initial cluster node 02 - -
-
-
- - - - - - - - - - - - - -
-
-
- - Initial cluster node 03 - -
-
-
-
- - Initial cluster node 03 - -
-
-
- - - - - - - - - - - - - -
-
-
- - mirocloud (initial cluster partition nodes) - -
-
-
-
- - mirocloud (initial cluster... - -
-
-
-
- - - - - Text is not SVG - cannot display - - - -
diff --git a/versioned_docs/version-v0.21.10/contributing/01-Proposals/MEP18/autonomous-control-plane-images/ha-initial-cluster.svg b/versioned_docs/version-v0.21.10/contributing/01-Proposals/MEP18/autonomous-control-plane-images/ha-initial-cluster.svg deleted file mode 100644 index 99261ada..00000000 --- a/versioned_docs/version-v0.21.10/contributing/01-Proposals/MEP18/autonomous-control-plane-images/ha-initial-cluster.svg +++ /dev/null @@ -1 +0,0 @@ -123456789101112
internet-router-management
internet-router-management
management-switch-and-server
management-switch-and-server
spine01
spine01
leaf01
leaf01
leaf02
leaf02
mirocloud (initial cluster partition nodes)
mirocloud (initial cluster...
Initial cluster node 01
Initial cluster node 01
123456789101112
internet-router-management
internet-router-management
management-switch-and-server
management-switch-and-server
spine02
spine02
leaf03
leaf03
leaf04
leaf04
Initial cluster node 02
Initial cluster node 02
Initial cluster node 03
Initial cluster node 03
mirocloud (initial cluster partition nodes)
mirocloud (initial cluster...
Text is not SVG - cannot display
\ No newline at end of file diff --git a/versioned_docs/version-v0.21.10/contributing/01-Proposals/MEP18/autonomous-control-plane-images/metal-stack-autonomous-control-plane-full.drawio b/versioned_docs/version-v0.21.10/contributing/01-Proposals/MEP18/autonomous-control-plane-images/metal-stack-autonomous-control-plane-full.drawio deleted file mode 100644 index aae8a12d..00000000 --- a/versioned_docs/version-v0.21.10/contributing/01-Proposals/MEP18/autonomous-control-plane-images/metal-stack-autonomous-control-plane-full.drawio +++ /dev/null @@ -1,1133 +0,0 @@ - - - - - - - - - - - - - - - - - - - -
-
-
- Initial Cluster -
-
-
-
- - Initial Cluster - -
-
-
- - - - - - - - - - - - - - -
-
-
- metal-roles -
-
-
-
- - metal-roles - -
-
-
- - - - - - - -
-
-
- CI -
-
-
-
- - CI - -
-
-
- - - - - - - -
-
-
- K3s Standalone - - - (on Debian) - - -
-
-
-
- - K3s Standalone (on Debian) - -
-
-
- - - - - - - - - - - - - - - - - -
-
-
- Initial Partition -
-
-
-
- - Initial Partition - -
-
-
- - - - - - - - - - - - - -
-
-
- Target Cluster for metal-stack -
-
-
-
- - Target Cluster for metal-stack - -
-
-
- - - - - - - - - - - - - -
-
-
- Metal Control Plane -
-
-
-
- - Metal Control Plane - -
-
-
- - - - - - - - - - - -
-
-
- provisions -
-
-
-
- - provisions - -
-
-
- - - - - - - - - - - - - -
-
-
- Target Cluster for Gardener -
-
-
-
- - Target Cluster for Gardener - -
-
-
- - - - - - - - - - -
-
-
- Gardener Control Plane -
-
-
-
- - Gardener Control Plane - -
-
-
- - - - - - - - - - - - - - - - - -
-
-
- Monitoring -
-
-
-
- - Monitoring - -
-
-
- - - - - - - - - - - - - - - - -
-
-
- Target Partition -
-
-
-
- - Target Partition - -
-
-
- - - - - - - - - - -
-
-
- Gardener Seeds and End-User Shoots -
-
-
-
- - Gardener Seeds and End-User Shoots - -
-
-
- - - - - - - - - - - -
-
-
- provisions -
-
-
-
- - provisions - -
-
-
- - - - - - - - -
-
-
- metal-roles -
-
-
-
- - metal-roles - -
-
-
- - - - - - - -
-
-
- CI -
-
-
-
- - CI - -
-
-
- - - - - - - - -
-
-
- metal-roles -
-
-
-
- - metal-roles - -
-
-
- - - - - - - - - - -
-
-
- ETCD can be clustered or standalone, backed up by sidecar -
-
-
-
- - ETCD can be clustere... - -
-
-
- - - - - - - - - - -
-
-
- This data will get lost in case local PV gets deleted -
-
-
-
- - This data will get l... - -
-
-
- - - - - - - - - - -
-
-
- We can work with local PVs here, too. -
- backup-restore-sidecar for metal-stack databases, for big ones Postgres clustered. -
-
-
-
- - We can work with local PVs he... - -
-
-
- - - - - - - -
-
-
- ETCD will be deployed in HA configuration on local PVs. -
-
- csi-driver-lvm needs to implement auto deletion of orphaned PVs. -
-
- Seed metrics get lost, but they report to the monitoring in the Metal Control Plane Shoot. -
-
-
-
- - ETCD will be deployed in HA c... - -
-
-
- - - - - - - - - - -
-
-
- More sophisticated storage solutions can be in place. -
-
- (Lightbits, NetApp, ...) -
-
-
-
- - More sophisticated storage so... - -
-
-
- - - - - - - - - - -
-
-
- TODO: Evaluate how to persist these metrics. -
-
-
-
- - TODO: Evaluate how to persist... - -
-
-
- - - - - - - - - - -
-
-
- - 1 VM or -
-
-
- - - 3 Bare Metal Machines - - -
-
-
-
-
- - 1 VM or... - -
-
-
- - - - - - - - - - - - - - -
-
-
- metal-stack -
-
-
-
- - metal-stack - -
-
-
- - - - - - - -
-
-
- metal-api -
-
-
-
- - metal-api - -
-
-
- - - - - - - -
-
-
- metal-db -
-
-
-
- - metal-db - -
-
-
- - - - - - - -
-
-
- ipam-db -
-
-
-
- - ipam-db - -
-
-
- - - - - - - -
-
-
- masterdata-db -
-
-
-
- - masterdata-db - -
-
-
- - - - - - - -
-
-
- headscale-db -
-
-
-
- - headscale-db - -
-
-
- - - - - - - -
-
-
- auditing-db -
-
-
-
- - auditing-db - -
-
-
- - - - - - - -
-
-
- nsqd -
-
-
-
- - nsqd - -
-
-
- - - - - - - - - - - -
-
-
- Gardener -
-
-
-
- - Gardener - -
-
-
- - - - - - - - - - -
-
-
- Virtual Garden -
-
-
-
- - Virtual Garden - -
-
-
- - - - - - - -
-
-
- Gardener Control Plane -
-
-
-
- - Gardener Control Plane - -
-
-
- - - - - - - -
-
-
- gardenlet -
-
-
-
- - gardenlet - -
-
-
- - - - - - - -
-
-
- Garden etcd -
-
-
-
- - Garden etcd - -
-
-
- - - - - - - -
-
-
- Prometheus -
-
-
-
- - Prometheus - -
-
-
- - - - - - - - - - - -
-
-
- Monitoring -
-
-
-
- - Monitoring - -
-
-
- - - - - - - - - - -
-
-
- - Gitlab - -
- - Runner - -
-
-
-
-
- - Gitlab... - -
-
-
- - - - - - - - - - -
-
-
- Services -
-
-
-
- - Services - -
-
-
- - - - - - - -
-
-
- PowerDNS -
-
-
-
- - PowerDNS - -
-
-
- - - - - - - -
-
-
- boulder -
-
-
-
- - boulder - -
-
-
- - - - - - - -
-
-
- NTP -
-
-
-
- - NTP - -
-
-
- - - - - - - -
-
-
- OIDC -
-
-
-
- - OIDC - -
-
-
- - - - - - - -
-
-
- ... -
-
-
-
- - ... - -
-
-
-
- - - - - Text is not SVG - cannot display - - - -
diff --git a/versioned_docs/version-v0.21.10/contributing/01-Proposals/MEP18/autonomous-control-plane-images/metal-stack-autonomous-control-plane-full.svg b/versioned_docs/version-v0.21.10/contributing/01-Proposals/MEP18/autonomous-control-plane-images/metal-stack-autonomous-control-plane-full.svg deleted file mode 100644 index e58e783b..00000000 --- a/versioned_docs/version-v0.21.10/contributing/01-Proposals/MEP18/autonomous-control-plane-images/metal-stack-autonomous-control-plane-full.svg +++ /dev/null @@ -1 +0,0 @@ -
Initial Cluster
Initial Cluster
metal-roles
metal-roles
CI
CI
K3s Standalone(on Debian)
K3s Standalone (on Debian)
Initial Partition
Initial Partition
Target Cluster for metal-stack
Target Cluster for metal-stack
Metal Control Plane
Metal Control Plane
provisions
provisions
Target Cluster for Gardener
Target Cluster for Gardener
Gardener Control Plane
Gardener Control Plane
Monitoring
Monitoring
Target Partition
Target Partition
Gardener Seeds and End-User Shoots
Gardener Seeds and End-User Shoots
provisions
provisions
metal-roles
metal-roles
CI
CI
metal-roles
metal-roles
ETCD can be clustered or standalone, backed up by sidecar
ETCD can be clustere...
This data will get lost in case local PV gets deleted
This data will get l...
We can work with local PVs here, too.
backup-restore-sidecar for metal-stack databases, for big ones Postgres clustered.
We can work with local PVs he...
ETCD will be deployed in HA configuration on local PVs.

csi-driver-lvm needs to implement auto deletion of orphaned PVs.

Seed metrics get lost, but they report to the monitoring in the Metal Control Plane Shoot.
ETCD will be deployed in HA c...
More sophisticated storage solutions can be in place.

(Lightbits, NetApp, ...)
More sophisticated storage so...
TODO: Evaluate how to persist these metrics.
TODO: Evaluate how to persist...
1 VM or
3 Bare Metal Machines
1 VM or...
metal-stack
metal-stack
metal-api
metal-api
metal-db
metal-db
ipam-db
ipam-db
masterdata-db
masterdata-db
headscale-db
headscale-db
auditing-db
auditing-db
nsqd
nsqd
Gardener
Gardener
Virtual Garden
Virtual Garden
Gardener Control Plane
Gardener Control Plane
gardenlet
gardenlet
Garden etcd
Garden etcd
Prometheus
Prometheus
Monitoring
Monitoring
Gitlab
Runner
Gitlab...
Services
Services
PowerDNS
PowerDNS
boulder
boulder
NTP
NTP
OIDC
OIDC
...
...
Text is not SVG - cannot display
\ No newline at end of file diff --git a/versioned_docs/version-v0.21.10/contributing/01-Proposals/MEP18/autonomous-control-plane-images/metal-stack-chain.drawio b/versioned_docs/version-v0.21.10/contributing/01-Proposals/MEP18/autonomous-control-plane-images/metal-stack-chain.drawio deleted file mode 100644 index cd5cf007..00000000 --- a/versioned_docs/version-v0.21.10/contributing/01-Proposals/MEP18/autonomous-control-plane-images/metal-stack-chain.drawio +++ /dev/null @@ -1,404 +0,0 @@ - - - - - - - - - - -
-
-
- Partition 1 -
-
-
-
- - Partition 1 - -
-
- - - - -
-
-
- seeds -
-
-
-
- - seeds - -
-
- - - - -
-
-
- shoots -
-
-
-
- - shoots - -
-
- - - - - - -
-
-
- Partition 2 -
-
-
-
- - Partition 2 - -
-
- - - - -
-
-
- seeds -
-
-
-
- - seeds - -
-
- - - - -
-
-
- shoots -
-
-
-
- - shoots - -
-
- - - - - - -
-
-
- Partition 3 -
-
-
-
- - Partition 3 - -
-
- - - - -
-
-
- seeds -
-
-
-
- - seeds - -
-
- - - - -
-
-
- shoots -
-
-
-
- - shoots - -
-
- - - - - - -
-
-
- Production Control Plane -
-
-
-
- - Production Control Plane - -
-
- - - - -
-
-
- metal-stack -
- kubernetes cluster -
-
-
-
- - metal-stack... - -
-
- - - - -
-
-
- gardener -
- kubernetes cluster -
-
-
-
- - gardener... - -
-
- - - - -
-
-
- - Manages - -
-
-
-
- - Manages - -
-
- - - - - - - - -
-
-
- Control Plane Partition -
-
-
-
- - Control Plane Partition - -
-
- - - - - -
-
-
- backup of stateful sets -
-
-
-
- - backup of stateful sets - -
-
- - - - - - -
-
-
- bare metal machine -
-
-
-
- - bare metal machine - -
-
- - - - -
-
-
- metal-stack -
- and -
- gardener -
- kubernetes cluster -
- running in kind -
-
-
-
- - metal-stack... - -
-
- - - - -
-
-
- - Manages - -
-
-
-
- - Manages - -
-
- - - - - -
-
-
- S3 -
-
-
-
- - S3 - -
-
- - - - -
-
-
- Needle -
-
-
-
- - Needle - -
-
- - - -
-
-
- - Nail - -
-
-
-
- - Nail - -
-
-
- - - - - Text is not SVG - cannot display - - - -
\ No newline at end of file diff --git a/versioned_docs/version-v0.21.10/contributing/01-Proposals/MEP18/autonomous-control-plane-images/metal-stack-chain.svg b/versioned_docs/version-v0.21.10/contributing/01-Proposals/MEP18/autonomous-control-plane-images/metal-stack-chain.svg deleted file mode 100644 index 8f88ba14..00000000 --- a/versioned_docs/version-v0.21.10/contributing/01-Proposals/MEP18/autonomous-control-plane-images/metal-stack-chain.svg +++ /dev/null @@ -1 +0,0 @@ -
Partition 1
Partition 1
seeds
seeds
shoots
shoots
Partition 2
Partition 2
seeds
seeds
shoots
shoots
Partition 3
Partition 3
seeds
seeds
shoots
shoots
Production Control Plane
Production Control Plane
metal-stack
kubernetes cluster
metal-stack...
gardener
kubernetes cluster
gardener...
Manages
Manages
Control Plane Partition
Control Plane Partition
backup of stateful sets
backup of stateful sets
bare metal machine
bare metal machine
metal-stack
and
gardener
kubernetes cluster
running in kind
metal-stack...
Manages
Manages
S3
S3
Needle
Needle 
Nail
Nail
Text is not SVG - cannot display
\ No newline at end of file diff --git a/versioned_docs/version-v0.21.10/contributing/01-Proposals/MEP18/autonomous-control-plane-images/small-initial-cluster.drawio b/versioned_docs/version-v0.21.10/contributing/01-Proposals/MEP18/autonomous-control-plane-images/small-initial-cluster.drawio deleted file mode 100644 index a75ee340..00000000 --- a/versioned_docs/version-v0.21.10/contributing/01-Proposals/MEP18/autonomous-control-plane-images/small-initial-cluster.drawio +++ /dev/null @@ -1,234 +0,0 @@ - - - - - - - - - - - - - - - - - - - 1 - - - - - 2 - - - - - 3 - - - - - 4 - - - - - 5 - - - - - 6 - - - - - 7 - - - - - 8 - - - - - 9 - - - - - 10 - - - - - 11 - - - - - 12 - - - - - - - - - - - - - -
-
-
- internet-router-management -
-
-
-
- - internet-router-management - -
-
-
- - - - - - - - - -
-
-
- management-switch-and-server -
-
-
-
- - management-switch-and-server - -
-
-
- - - - - - - - - -
-
-
- leaf01 -
-
-
-
- - leaf01 - -
-
-
- - - - - - - - - -
-
-
- leaf02 -
-
-
-
- - leaf02 - -
-
-
- - - - - - - - - - - - - -
-
-
- Initial cluster node -
-
-
-
- - Initial cluster node - -
-
-
- - - - - - - - - - - - - -
-
-
- mirocloud (initial cluster partition nodes) -
-
-
-
- - mirocloud (initial cluster... - -
-
-
-
- - - - - Text is not SVG - cannot display - - - -
diff --git a/versioned_docs/version-v0.21.10/contributing/01-Proposals/MEP18/autonomous-control-plane-images/small-initial-cluster.svg b/versioned_docs/version-v0.21.10/contributing/01-Proposals/MEP18/autonomous-control-plane-images/small-initial-cluster.svg deleted file mode 100644 index a9d29f05..00000000 --- a/versioned_docs/version-v0.21.10/contributing/01-Proposals/MEP18/autonomous-control-plane-images/small-initial-cluster.svg +++ /dev/null @@ -1 +0,0 @@ -123456789101112
internet-router-management
internet-router-management
management-switch-and-server
management-switch-and-server
leaf01
leaf01
leaf02
leaf02
Initial cluster node
Initial cluster node
mirocloud (initial cluster partition nodes)
mirocloud (initial cluster...
Text is not SVG - cannot display
\ No newline at end of file diff --git a/versioned_docs/version-v0.21.10/contributing/01-Proposals/MEP2/README.md b/versioned_docs/version-v0.21.10/contributing/01-Proposals/MEP2/README.md deleted file mode 100644 index c7f2360a..00000000 --- a/versioned_docs/version-v0.21.10/contributing/01-Proposals/MEP2/README.md +++ /dev/null @@ -1,7 +0,0 @@ ---- -slug: /MEP-2-two-factor-authentication -title: MEP-2 -sidebar_position: 2 ---- - -# Two Factor Authentication diff --git a/versioned_docs/version-v0.21.10/contributing/01-Proposals/MEP3/README.md b/versioned_docs/version-v0.21.10/contributing/01-Proposals/MEP3/README.md deleted file mode 100644 index 5ce36721..00000000 --- a/versioned_docs/version-v0.21.10/contributing/01-Proposals/MEP3/README.md +++ /dev/null @@ -1,67 +0,0 @@ ---- -slug: /MEP-3-machine-re-installation -title: MEP-3 -sidebar_position: 3 ---- - -# Machine Re-Installation - -In the current metal-api only machine installations are possible, performing a machine upgrade is only possible by creating a new machine and delete the old one. -This has the drawback that in case a lot of data is stored on the local disks, a full restore of the original data must be performed. - -To prevent this, we will introduce a new metal-api endpoint to reinstall the machine with a new image, _without_ actually deleting the data stored on the additional hard disks. - -Storage is a difficult task to get right and reliable. A short analysis of our different storage requirements lead to 3 different scenarios. - -- Storage for the etcd pvs in the seed cluster of every partition. - This is the most important storage in our setup because these etcd pods serve as configuration backend for all customer kubernetes clusters. If they fail, the cluster is down. 
However gardener deploys a backup and restore sidecar into the etcd pod of every customer kubernetes control plane, and if this sidecar detects a corrupt or missing etcd database file(s) it starts automatic restore from the configured backup location. This will take some minutes. If for example a node dies, and gardener creates a new node instead, the csi-lvm created pv is not present on that node. Kubernetes will not schedule the missing etcd pod on this node because it has a local PV configured and is therefore tainted to run only on that node. To let kubernetes create that pod anyhow, someone has to either remove the taint, or delete the pod. If this is done, the pod starts and the restore of the etcd data can start as well. You can see this is a bit too complicated and will take the customer cluster down for a while (not measured yet but in the range of 5-10 minutes). -- Storage in customer clusters. - This was not promised in 2020. We have a intermediate solution with the provisioning of csi-lvm by default into all customer clusters. Albeit this is only local storage and will get deleted if a node dies. -- S3 Storage. - We have two possibilities to cope with storage: - - In place update of the OS with a daemonset - This will be fast and simple, but might fail because the packages being installed are broken right now, or a filesystem gets full, or any other failure you can think of during a os update. Another drawback is that metal-api does not reflect the updated os image. - - metal-api get a machine reinstall endpoint - With this approach we leverage from existing and already proven mechanisms. Reinstall must keep all data except the sata-dom. Gardener currently is not able to do an update with this approach because it can only do `rolling` updates. Therefore a additional `osupdatestrategy` has to be implemented for metal and other providers in gardener to be able to leverage the metal reinstall on the same machineID approach. 
- -If reinstall is implemented, we should focus on the same technology for all scenarios and put ceph via rook.io into the kubernetes clusters as additional StorageClass. It has to be checked whether to use the raw disk or a PV as the underlay block device where ceph stores its data. - -## API and behavior - -The API will get an new endpoint "reinstall" this endpoint takes two arguments: - -- machineID -- image - -No other aspects of the machine can be modified during the re-installation. All data stored in the existing allocation will be preserved, only the image will be modified. -Once this endpoint was called, the machine will get a `reboot` signal with the boot order set to PXE instead of HDD and the network interfaces on the leaf are set to PXE as well. Then the normal installation process starts: - -- unchanged: PXE boot with metal-hammer -- changed: metal-hammer first checks with the machineID in the metal-api (through metal-core) if there is already a allocation present -- changed: if a allocation is present and the allocation has set `reinstall: true`, wipe disk is only executed for the root disk, all other disks are untouched. -- unchanged: the specified image is downloaded and burned, `/install.sh` is executed -- unchanged: successful installation is reported back, network is set the the vrf, boot order is set to HDD. -- unchanged: distribution kernel is booted via kexec - -We can see that the `allocation` requires one additional parameter: `reinstall` and metal-hammer must check for already existing allocation at an earlier stage. - -Components which requires modifications (first guess): - -- metal-hammer: - - check for allocation present earlier - - evaluation of `reinstall` flag set - - wipe of disks depends on that flag - - Bonus: move configuration of disk layout and primary disk detection algorithm (PDDA) from metal-hammer into metal-api. - metal-api **MUST** reject reinstallation if the disk found by PDDA does not have the `/etc/metal` directory! 
-- metal-core: - - probably nothing -- metal-api: - - new endpoint `/machine/reinstall` - - add `Reinstall bool` to data model of `allocation` - - make sure to reset `Reinstall` after reinstallation to prevent endless reinstallation loop -- metalctl: - - implement `reinstall` -- metal-go: - - implement `reinstall` -- gardener (longterm): - - add the `OSUpgradeStrategy` `reinstall` diff --git a/versioned_docs/version-v0.21.10/contributing/01-Proposals/MEP4/README.md b/versioned_docs/version-v0.21.10/contributing/01-Proposals/MEP4/README.md deleted file mode 100644 index 389a02d4..00000000 --- a/versioned_docs/version-v0.21.10/contributing/01-Proposals/MEP4/README.md +++ /dev/null @@ -1,211 +0,0 @@ ---- -slug: /MEP-4-multi-tenancy-for-the-metal-api -title: MEP-4 -sidebar_position: 4 ---- - -# Multi-Tenancy for the metal-api -:::info -This document is work in progress. -::: - -In the past we decided to treat the metal-api as a "low-level API", i.e. the API does not specifically deal with projects and tenants. A user with editor access can for example assign machines to every project he desires, he can see all the machines available and can control them. We tried to keep the metal-api code base as small as possible and we added resource scoping to a "higher-level APIs". From there, a user would be able to only see his own clusters and IP addresses. - -As time passed metal-stack has become an open-source project and people are willing to adopt. Adopters who want to put their own technologies on top of the metal-stack infrastructure don't have those "higher-level APIs" that we implemented closed-source for our user base. So, external adopters most likely need to implement resource scoping on their own. 
- -Introducing multi-tenancy to the metal-api is a serious chance of making our product better and more successful as it opens the door for: - -- Becoming a "fully-featured" API -- Narrowing down attack surfaces and possibility of unintended resource modification produced by bugs or human errors -- Discouraging people to implement their own scoping layers in front of the metal-stack -- Gaining performance through resource scopes -- Letting untrusted / third-parties work with the API - -## Requirements - -These are some general requirements / higher objectives that MEP-4 has to fulfill. - -- Should be able to run with mini-lab without requiring to setup complex auth backends (dex, LDAP, keycloak, ...) - - Simple to start with, more complex options for production setups -- Fine-grained access permissions (every endpoint maps to a permission) -- Tenant scoping (disallow resource access to resources of other tenants) -- Project scoping (disallow resource access to resources of other projects) -- Access tokens in self-service for technical user access - -## Implementation - -We gathered a lot of knowledge while implementing a multi-tenancy-capable backend for metalstack.cloud. The goal is now to use the same technology and adopt that to the metal-api, this includes: - -- gRPC in combination with connectrpc -- OPA for making auth decisions -- REST HTTP only for OIDC login flows - -### API Definitions - -The API definitions should be located on a separate Github repository separate from the server implementation. The proposed repository location is: https://github.com/metal-stack/api. - -This repository contains the `proto3` specification of the exposed metal-stack api. This includes the messages, simple validations, services and the access permission to these services. The input parameters for the authorization in the backend are generated from the `proto3` annotations. - -Client implementations for the most relevant languages (go, python) are generated automatically. 
- -This api is divided into end-user and admin access at the top level. The proposed APIs are: - -- `metalstack.api.v2`: For end-user facing services -- `metalstack.admin.v2`: For operators and controllers which need access to unscoped entities - -The methods of the API can have different role scopes (and can be narrowed down further with fine-grained method permissions): - -- `tenant`: Tenant-scoped methods, e.g. project creation (tenant needs to be provided in the request payload) - - Available roles: VIEWER, EDITOR, OWNER -- `project`: Project-scoped methods, e.g. machine creation (tenant needs to be provided in the request payload) - - Available roles: VIEWER, EDITOR, OWNER -- `admin` Admin-scoped methods, e.g. unscoped tenant list or switch register - - Available roles: VIEWER, EDITOR - -And has methods with different visibility scopes: - -- `self`: Methods that only the logged in user can access, e.g. show permissions with the presented token -- `public`: Methods that do not require any specific authorization - -### API - -The API server implements the services defined in the API and validates access to a method using OPA with the JWT tokens passed in the requests. The server is implemented using the connectrpc.com framework. - -The API server implements the login flow through OIDC. After successful authentication, the API server derives user permissions from the OIDC provider and issues a new JWT token which is passed on to the user. The tokens including the permissions are stored in a redis compatible backend. - -With these tokens, users can create Access Tokens for CI/CD or other use cases. - -JWT Tokens can be revoked by admins and the user itself. - -### API Server - -Is put into a new github repo which implements the services defined in the `api` repository. It opens a `https` endpoints where the grpc (via connectrpc.com) and oidc services are exposed. 
- -### Migration of the Consumers - -To allow consumers to migrate to the `v2` API gradually, both apis, the new and the old, are deployed in parallel. In the control-plane both apis are deployed side-by-side behind the ingress. `api.example.com` is forwarded to `metal-api` and `metal.example.com` is forwarded to the new `metal-apiserver`. - -The api-server will talk to the existing metal-api during the process of migration services away to the new grpc api. - -The migration process can be done in the following manner: - -for each resource in the metal-api: - -- create a new proto3 based definition in the `api` repo. -- implement the business logic per service in the new `metal-apiserver` without calling the metal-api. -- clients must be able to talk to `v1` and `v2` backend in parallel -- Deprecate the already migrated service in the swagger route to notify the client that this route should not be used anymore. -- identify all consumers of this resource and replace them to use the grpc instead of the rest api -- move the business logic incl. the backend calls to ipam, metal-db, masterdata-api, nsq for this resource from the metal-api to the `metal-apiserver` - -We will migrate the rethinkdb backend implementation to a generic approach during this effort. - -- Try to enhance the generic rethinkdb interface with `project` scoped methods. - -There are a lot of consumers of metal-api, which need to be migrated: - -- ansible -- firewall-controller -- firewall-controller-manager -- gardener-extension-auth -- gardener-extension-provider-metal - - Do not point the secret bindings to a the shared provider secret in the seed anymore. Instead, use individual provider-secret containing project-scoped API access tokens in the Gardener project namespaces. 
-- machine-controller-manager-provider-metal -- metal-ccm -- metal-console -- metal-bmc -- metal-core -- metal-hammer -- metal-image-cache-sync -- metal-images -- metal-metrics-exporter -- metal-networker -- metalctl -- pixie - -## User Scenarios - -This section gathers a collection of workflows from the perspective of a user that we want to provide with the implementation of this proposal. - -### Machine Creation - -A regular user wants to create a machine resource. - -Requirements: Project was created, permissions are present - -- The user can see networks that were provided by the admin. - - ``` - $ metalctl network ls - ID NAME PROJECT PARTITION NAT SHARED PREFIXES IPS - internet Internet Network true false 212.34.83.0/27  ● - tenant-super-network-fra-equ01 Project Super Network fra-equ01 false false 10.128.0.0/14  ● - underlay-fra-equ01 Underlay Network fra-equ01 false false 10.0.0.0/16  ● - ``` - -- The user has to set the project scope first or provide `--project` flags for all commands. - ``` - $ metalctl project set 793bb6cd-8b46-479d-9209-0fedca428fe1 - You are now acting on project 793bb6cd-8b46-479d-9209-0fedca428fe1. - ``` -- The user can create the child network required for machine allocation. - ``` - $ metalctl network allocate --partition fra-equ01 --name test - ``` -- Now, the user sees his own child network. - ``` - $ metalctl network ls - ID NAME PROJECT PARTITION NAT SHARED PREFIXES IPS - internet Internet Network true false 212.34.83.0/27  ● - tenant-super-network-fra-equ01 Project Super Network fra-equ01 false false 10.128.0.0/14  ● - └─╴08b9114b-ec47-4697-b402-a11421788dc6 test 793bb6cd-8b46-479d-9209-0fedca428fe1 fra-equ01 false false 10.128.64.0/22  ● - underlay-fra-equ01 Underlay Network fra-equ01 false false 10.0.0.0/16  ● - ``` -- The user does not see any machines yet. - ``` - $ metalctl machine ls - ``` -- The user can create a machine. 
- ``` - $ metalctl machine create --networks internet,08b9114b-ec47-4697-b402-a11421788dc6 --name test --hostname test --image ubuntu-20.04 --partition fra-equ01 --size c1-xlarge-x86` - ``` -- The machine will now be provisioned. - ``` - $ metalctl machine ls - ID LAST EVENT WHEN AGE HOSTNAME PROJECT SIZE IMAGE PARTITION - 00000000-0000-0000-0000-ac1f6b7befb2 Phoned Home 20s 50d 4h test 793bb6cd-8b46-479d-9209-0fedca428fe1 c1-xlarge-x86 Ubuntu 20.04 20210415 fra-equ01 - ``` - -:::warning -A user **cannot** list all allocated machines for all projects. The user **must** always switch project context first and can only view the machines inside this project. Only admins can see all machines at once. -::: -### Scopes for Resources - -The admins / operators of the metal-stack should be able to provide _global_ resources that users are able to use along with their own resources. In particular, users can view and use _global_ resources, but they are not allowed to create, modify or delete them. - -:::info -When a project ID field is empty on a resource, the resource is considered _global_. -::: - -Where possible, users should be capable of creating their own resource entities. - -| Resource | User | Global | -| :----------------- | :--- | :----- | -| File System Layout | yes | yes | -| Firewall | yes | | -| Firmware | | yes | -| OS Image | | yes | -| Machine | yes | | -| Network (Base) | | yes | -| Network (Children) | yes | | -| IP | yes | | -| Partition | | yes | -| Project | yes | | -| Project Token | yes | | -| Size | | yes | -| Switch | | | -| Tenant | | yes | - -:::info -Example: A user can make use of the file system layouts provided by the admins, but can also create own layouts. Same applies for images. As soon as a user creates own resources, the user takes over the responsibility for the machine provisioning to succeed. 
-::: diff --git a/versioned_docs/version-v0.21.10/contributing/01-Proposals/MEP5/README.md b/versioned_docs/version-v0.21.10/contributing/01-Proposals/MEP5/README.md deleted file mode 100644 index 3b7fc45c..00000000 --- a/versioned_docs/version-v0.21.10/contributing/01-Proposals/MEP5/README.md +++ /dev/null @@ -1,54 +0,0 @@ ---- -slug: /MEP-5-shared-networks -title: MEP-5 -sidebar_position: 5 ---- - -# Shared Networks - -## Why are shared networks needed - -For special purpose machines that serve shared services with performance critical workloads to all machines of a partition (like persistent storage) it would be good to have kind of a "shared network" that is easily accessible. -They do not necessarily need another firewall. This would avoid having two firewalls in the datapath between a machine in a private network and the machines of a shared service. - -## Constraints that need to hold - -- a shared network is usable from all machines that have a firewall in front, that uses it -- a shared network is only usable within a single partition (currently we are constrained in bandwidth and have no routing of 10.0.0.0/8 addresses btw. 
partitions and failure domain should be the partition but this constraint might get lifted in the future) -- networks may be marked as shared after network allocation (but there should be no way back from shared to unshared) -- neither machines nor firewalls may have multiple private, unshared networks configured -- machines must have a single primary network configured - - this might be a shared network - - OR a plain, unshared private network -- firewalls may participate in multiple shared networks -- machines can be allocated with a primary network using auto IP allocation or with `noauto` and a specific IP - -## Should shared networks be private - -**Alternative 1:** If we implemented shared networks by extending functions around plain, private networks we would not have to manage another CIDR (mini point) and it would be possible to create a k8s cluster with a private network, mark the network as `shared` and produce shared services from this k8s cluster. - -**Alternative 2:** If shared networks are implemented as first class networks we could customize the VRF and also accomplish an other goal of our roadmap: being able to create machines directly in an external network. - -Together with @majst01 and @Gerrit91 we decided to continue to implement **Alternative 1**. - -## Firewalls accessing a shared network - -Firewalls that access shared networks need to: - -- hide the private network behind an ip address of the shared network if the shared network was configured with `nat=true`. -- import the prefixes of the shared VRF to the private VRF and import the prefixes of the private VRF to the shared VRF so that the communication between the two is working in both directions. As long as no `nat=true` was set on the shared VRF, the original machine ips are visible in both communication directions. 
- -## Setup with shared networks and single consumer - -![Simple Setup](./shared.png) - -## Setup with single shared network and multiple consumers - -![Advanced Setup](./shared_advanced.png) - -## Getting internet access - -Machines contained in a shared network can access the internet with different scenarios: - -- if they have an own firewall: this is internet accessibility, as common (check whether all traffic gets routed through it!) -- if they don't have an own firewall, an external HTTP proxy is needed that has an endpoint exposed as Service Type NodePort diff --git a/versioned_docs/version-v0.21.10/contributing/01-Proposals/MEP5/shared.drawio b/versioned_docs/version-v0.21.10/contributing/01-Proposals/MEP5/shared.drawio deleted file mode 100644 index aa7af045..00000000 --- a/versioned_docs/version-v0.21.10/contributing/01-Proposals/MEP5/shared.drawio +++ /dev/null @@ -1,121 +0,0 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - \ No newline at end of file diff --git a/versioned_docs/version-v0.21.10/contributing/01-Proposals/MEP5/shared.png b/versioned_docs/version-v0.21.10/contributing/01-Proposals/MEP5/shared.png deleted file mode 100644 index b0b47f03..00000000 Binary files a/versioned_docs/version-v0.21.10/contributing/01-Proposals/MEP5/shared.png and /dev/null differ diff --git a/versioned_docs/version-v0.21.10/contributing/01-Proposals/MEP5/shared_advanced.drawio b/versioned_docs/version-v0.21.10/contributing/01-Proposals/MEP5/shared_advanced.drawio deleted file mode 100644 index 6f96eca0..00000000 --- a/versioned_docs/version-v0.21.10/contributing/01-Proposals/MEP5/shared_advanced.drawio +++ /dev/null @@ -1,187 +0,0 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - \ No newline at end of file diff --git a/versioned_docs/version-v0.21.10/contributing/01-Proposals/MEP5/shared_advanced.png b/versioned_docs/version-v0.21.10/contributing/01-Proposals/MEP5/shared_advanced.png deleted file mode 100644 index da989915..00000000 Binary files a/versioned_docs/version-v0.21.10/contributing/01-Proposals/MEP5/shared_advanced.png and /dev/null differ diff --git a/versioned_docs/version-v0.21.10/contributing/01-Proposals/MEP6/README.md b/versioned_docs/version-v0.21.10/contributing/01-Proposals/MEP6/README.md deleted file mode 100644 index edf52a6c..00000000 --- a/versioned_docs/version-v0.21.10/contributing/01-Proposals/MEP6/README.md +++ /dev/null @@ -1,123 +0,0 @@ ---- -slug: /MEP-6-dmz-networks -title: MEP-6 -sidebar_position: 6 ---- - -# DMZ Networks - -## Reasoning - -To fulfill higher levels of security measures the standard metal-stack approach with a single firewall in front of a set of machines might be insufficient. -There are cases where two physically distinct firewalls in front of application workload are mandatory. In traditional network terms this is known as DMZ approach. - -For Kubernetes workloads it makes sense to use the front cluster for ingress, WAF purposes and as outgoing proxy. The clusters may be used for application workload. - -## DMZ network - -- Use a separate DMZ network prefix for every tenant -- This is used as intermediate network btw. 
private networks of a tenant and the internet -- For every partition a distinct DMZ firewall/cluster is needed for a tenant -- For Gardener orchestrated Kubernetes clusters this network must be a publicly reachable internet prefix because shoot clusters need a vpn service that is used for instrumentation from the seed cluster - this will be a requirement as long as the inverse vpn tunnel feature Konnectivity is not available to us. - -## Approach 1: DMZ with publicly reachable internet prefix - -![DMZ Internet](dmz-internet_public.svg) - -A DMZ network with publicly reachable internet prefix will look like this in the metal-api: - -```yaml ---- -description: DMZ-Network -destinationprefixes: - - 0.0.0.0/0 -id: dmz -labels: - network.metal-stack.io/default-external: "" -name: DMZ-Network -parentnetworkid: null -partitionid: "" -prefixes: - - 212.90.30.128/25 -privatesuper: false -projectid: "" -vrf: 104007 -vrfshared: false -nat: true -shared: false -underlay: false -``` - -### DMZ firewall - -The firewall of the DMZ will intersect its private network for attached machines, the DMZ network and the public internet. - -- The private network of the project needs to import - - the default route from the internet network - - the DMZ network -- The internet network must import the DMZ network -- The DMZ network provides the default route for tenant's clusters in a partition. It imports the default route from the internet network - -### Application Firewall - -The firewall of application workloads intersects its private network for attached machines and the DMZ network. - -This is currently supported by the metal-networker and needs no further changes! 
- -## Approach 2: DMZ with private IPs - -![DMZ Internet](dmz-internet_private.svg) - -A DMZ network with private IPs will look like this in the metal-api: - -```yaml ---- -description: DMZ-Network -destinationprefixes: - - 0.0.0.0/0 -id: dmz -labels: - network.metal-stack.io/default-external: "" -name: DMZ-Network -parentnetworkid: tenant-super-network-fra-equ01 -partitionid: fra-equ01 -prefixes: - - 10.90.30.128/25 -privatesuper: false -projectid: "" -vrf: 4711 -vrfshared: false -nat: true -shared: true # it's usable from multiple projects -underlay: false -``` - -### DMZ firewall - -The firewall of the DMZ will intersect its private network for attached machines, the DMZ network and the public internet. - -- The private network of the project needs to import - - the default route from the internet network - - the DMZ network -- The internet network must import the DMZ network (only locally, no-export) -- The DMZ network provides the default route for tenant's clusters in a partition. It imports the default route from the internet network - -### Application Firewall - -The firewall of application workloads intersects its private network for attached machines and the DMZ network. 
- -## Code Changes / Implications - -- `metal-networker` and `metal-ccm` assume that there is only one network providing the default-route -- `metal-networker` needs to - - import the default route from the internet network to the dmz network (DMZ Firewall) - - import the DMZ network to the internet network and adjusting NAT rules (DMZ Firewall) - - import destination prefixes of the DMZ network to the private primary network (DMZ Firewall, Application Firewall) - - import DMZ-IPs of the private primary network to the DMZ network (DMZ Firewall, Application Firewall) -- `metal-api`: destination prefixes of private networks need to be configurable (`allocateNetwork`) -- `gardener-extension-provider-metal`: needs to be able to delete DMZ clusters (but skip the network deletion part) -- the application firewall is not publicly reachable - for debugging purposes a hop over the DMZ firewall is needed - -## Decision - -We decided to follow the second approach with private DMZ networks. diff --git a/versioned_docs/version-v0.21.10/contributing/01-Proposals/MEP6/dmz-internet_private.drawio b/versioned_docs/version-v0.21.10/contributing/01-Proposals/MEP6/dmz-internet_private.drawio deleted file mode 100644 index 7b83bbfc..00000000 --- a/versioned_docs/version-v0.21.10/contributing/01-Proposals/MEP6/dmz-internet_private.drawio +++ /dev/null @@ -1,178 +0,0 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - \ No newline at end of file diff --git a/versioned_docs/version-v0.21.10/contributing/01-Proposals/MEP6/dmz-internet_private.svg b/versioned_docs/version-v0.21.10/contributing/01-Proposals/MEP6/dmz-internet_private.svg deleted file mode 100644 index f5e58204..00000000 --- 
a/versioned_docs/version-v0.21.10/contributing/01-Proposals/MEP6/dmz-internet_private.svg +++ /dev/null @@ -1,3 +0,0 @@ -
Machine
Machine
Firewall DMZ
Firewall DMZ
DMZ VRF
DMZ VRF
Machine
Machine
Firewall A
Firewall A
Private VRF A
Private VRF A
10.0.0.2
10.90.30.129
/0 via Firewall A
10.0.0.2...
VRF A 10.0.0.1
VRF A 10.0.0.1
DMZ Network
10.90.30.128/25
DMZ Network...
Private Network
10.0.0.0/24
Private Network...
import /0
import /0
import 10.0.0.0/24
import 10.0.0.0/24 -
Machine
Machine
Firewall B
Firewall B
Private VRF B
Private VRF B
10.0.1.2
/0 via Firewall B
10.0.1.2...
VRF B 10.0.1.1
VRF B 10.0.1.1
Private Network
10.0.1.0/24
Private Network...
import /0
import /0
import 10.0.1.0/24
import 10.0.1.0/24 -
10.90.30.129 is reachable
/0 via Firewall DMZ
10.0.0.0/24 is reachable
10.0.1.0/24 is reachable
10.90.30.129 is reachable...
Internet
212.1.1.0/27
Internet...
SNAT to 212.1.1.1
SNAT to 212.1.1.1
Internet VRF
Internet VRF
import /0
import /0

import 10.0.0.0/24 no export
import 10.0.1.0/24 no export
import 10.90.30.128/25 no export
import 10.0.0.0/24 no exp...
Viewer does not support full SVG 1.1
\ No newline at end of file diff --git a/versioned_docs/version-v0.21.10/contributing/01-Proposals/MEP6/dmz-internet_public.drawio b/versioned_docs/version-v0.21.10/contributing/01-Proposals/MEP6/dmz-internet_public.drawio deleted file mode 100644 index 544939e5..00000000 --- a/versioned_docs/version-v0.21.10/contributing/01-Proposals/MEP6/dmz-internet_public.drawio +++ /dev/null @@ -1,184 +0,0 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - \ No newline at end of file diff --git a/versioned_docs/version-v0.21.10/contributing/01-Proposals/MEP6/dmz-internet_public.svg b/versioned_docs/version-v0.21.10/contributing/01-Proposals/MEP6/dmz-internet_public.svg deleted file mode 100644 index 5e825081..00000000 --- a/versioned_docs/version-v0.21.10/contributing/01-Proposals/MEP6/dmz-internet_public.svg +++ /dev/null @@ -1,3 +0,0 @@ -
Machine
Machine
Firewall DMZ
Firewall DMZ
DMZ VRF
DMZ VRF
Machine
Machine
Firewall A
Firewall A
Private VRF A
Private VRF A
10.0.0.2
212.1.2.3
/0 via Firewall A
10.0.0.2...
VRF A 10.0.0.1
VRF A 10.0.0.1
DMZ Network
212.1.2.0/27
DMZ Network...
Private Network
10.0.0.0/24
Private Network...
import /0
import /0
import 10.0.0.0/24
import 10.0.0.0/24 -
Machine
Machine
Firewall B
Firewall B
Private VRF B
Private VRF B
10.0.1.2
/0 via Firewall B
10.0.1.2...
VRF B 10.0.1.1
VRF B 10.0.1.1
Private Network
10.0.1.0/24
Private Network...
import /0
import /0
import 10.0.1.0/24
import 10.0.1.0/24 -
212.1.2.3 is reachable
/0 via Firewall DMZ
212.1.2.3 is reachable...
Internet
212.1.1.0/27 212.1.2.0/27
Internet...
SNAT to 212.1.1.1
SNAT to 212.1.1.1
Internet VRF
Internet VRF
import /0
import /0
import 212.1.2.0/27
import 10.0.0.0/24 no redistribute
import 10.0.1.0/24 no redistribute

import 212.1.2.0/27...
SNAT to
212.1.2.1
SNAT to...
SNAT to
212.1.2.2
SNAT to...
Viewer does not support full SVG 1.1
\ No newline at end of file diff --git a/versioned_docs/version-v0.21.10/contributing/01-Proposals/MEP8/README.md b/versioned_docs/version-v0.21.10/contributing/01-Proposals/MEP8/README.md deleted file mode 100644 index 14748fae..00000000 --- a/versioned_docs/version-v0.21.10/contributing/01-Proposals/MEP8/README.md +++ /dev/null @@ -1,503 +0,0 @@ ---- -slug: /MEP-7-configurable-filesystem-layout-for-machine-allocation -title: MEP-7 -sidebar_position: 7 ---- - -# Configurable Filesystem layout for Machine Allocation - -The current implementation uses a hard coded filesystem layout depending on the specified size and image. This is done in the metal-hammer. This worked well in the past because we had a small amount of sizes and images. But we reached a point where this is to restricted for all use cases we have to fulfill. It also forces us to modify the metal-hammer source code to support a new filesystem layout. - -This proposal tries to address this issue by introducing a filesystem layout struct in the metal-api which is then configurable per machine allocation. -The original behavior of automatic filesystem layout decision must still be present, because there must be no API change for existing API consumers. It should be a additional feature during machine allocation. - -## API and behavior - -The API will get a new endpoint `filesystemlayouts`to create/update/delete a set of available `filesystemlayouts`. - -### Constraints - -In order to keep the actual machine allocation api compatible, there must be no difference while allocating a machine. To achieve this every -`filesystemlayout` defines constraints which specifies for which combination of `sizes` and `images` this layout should be used by default. -The specified constraints over all `filesystemlayouts` therefore must be collision free, to be more specific, there must be exactly one layout outcome -for every possible combination of `sizes` and `images`. 
- -The `size` constraint must be a list of the exact size ids, the `image` constraint must be a map of os to semver compatible version constraint. For example: - -- `debian: ">= 10.20210101"` or `debian: "< 10.20210101"` - -The general form of a `image` constraint is a map from `os` to `versionconstraint` where: - -`os` must match the first part of the image without the version. -`versionconstraint` must be the comparator, a space and the version, or simply `*` to match all versions of this `os`. -The comparator must be one of: "=", "!=", ">", "<", ">=", "=>", "<=", "=<", "~", "~>", "^" - -It must also be possible to have a `filesystemlayout` in development or for other special purposes, which can be specified during the machine allocation. -To have such a layout, both constraints `sizes` and `images`must be empty list. - -### Reinstall - -The current reinstall implementation the metal-hammer detects during the installation on which disk the OS was installed and reports back to the metal-api the Report struct which has two properties `primarydisk` and `ospartition`. -Both fields are not required anymore because the logic is now shifted to the `filesystemlayout` definition. If `Disk.WipeOnReinstall` is set to true, this disk will be wiped, default is false and is preserved. - -### Handling of s2-xlarge machines - -These machines are a bit special compared to our `c1-*` machines because they have rotating hard disks for the mass storage purpose. -The downside is that the on board SATA-DOM has the same naming as the HDDs and can not be specified as the first /dev/sda disk because all HDDs are also /dev/sd\* disks. -Therefore we had a special SATA-DOM detection algorithm inside metal-hammer which simply checks for the smallest /dev/sd disk and took this to install the OS. - -This is not possible with the current approach, but we figured out that the SATA-DOM is always `/dev/sde`. 
So we can create a special `filesystemlayout` where the installations is made on this disk. - -### Possible Filesystemlayout hierarchies - -It is only possible to create a filesystem on top of a block device. The creation of a block device can be done on multiple ways, depending on the requirements regarding performance, space and redundancy of the filesystem. -It also depends on the disks available on the server. - -The current approach implements the following hierarchies: - -![filesystems](filesystems.png) - -### Implementation - -```go -// FilesystemLayout to be created on the given machine -type FilesystemLayout struct { - // ID unique layout identifier - ID string - // Description is human readable - Description string - // Filesystems to create on the server - Filesystems []Filesystem - // Disks to configure in the server with their partitions - Disks []Disk - // Raid if not empty, create raid arrays out of the individual disks, to place filesystems onto - Raid []Raid - // VolumeGroups to create - VolumeGroups []VolumeGroup - // LogicalVolumes to create on top of VolumeGroups - LogicalVolumes []LogicalVolume - // Constraints which must match to select this Layout - Constraints FilesystemLayoutConstraints -} - -type FilesystemLayoutConstraints struct { - // Sizes defines the list of sizes this layout applies to - Sizes []string - // Images defines a map from os to versionconstraint - // the combination of os and versionconstraint per size must be conflict free over all filesystemlayouts - Images map[string]string -} - -type RaidLevel string -type Format string -type GPTType string - -// Filesystem defines a single filesystem to be mounted -type Filesystem struct { - // Path defines the mountpoint, if nil, it will not be mounted - Path *string - // Device where the filesystem is created on, must be the full device path seen by the OS - Device string - // Format is the type of filesystem should be created - Format Format - // Label is optional enhances 
readability - Label *string - // MountOptions which might be required - MountOptions []string - // CreateOptions during filesystem creation - CreateOptions []string -} - -// Disk represents a single block device visible from the OS, required -type Disk struct { - // Device is the full device path - Device string - // Partitions to create on this device - Partitions []Partition - // WipeOnReinstall, if set to true the whole disk will be erased if reinstall happens - // during fresh install all disks are wiped - WipeOnReinstall bool -} - -// Raid is optional, if given the devices must match. -// TODO inherit GPTType from underlay device ? -type Raid struct { - // ArrayName of the raid device, most often this will be /dev/md0 and so forth - ArrayName string - // Devices the devices to form a raid device - Devices []Device - // Level the raidlevel to use, can be one of 0,1,5,10 - // TODO what should be support - Level RaidLevel - // CreateOptions required during raid creation, example: --metadata=1.0 for uefi boot partition - CreateOptions []string - // Spares defaults to 0 - Spares int -} - - -// VolumeGroup is optional, if given the devices must match. 
-type VolumeGroup struct { - // Name of the volumegroup without the /dev prefix - Name string - // Devices the devices to form a volumegroup device - Devices []string - // Tags to attach to the volumegroup - Tags []string -} - -// LogicalVolume is a block devices created with lvm on top of a volumegroup -type LogicalVolume struct { - // Name the name of the logical volume, without /dev prefix, will be accessible at /dev/vgname/lvname - Name string - // VolumeGroup the name of the volumegroup - VolumeGroup string - // Size of this LV in mebibytes (MiB) - Size uint64 - // LVMType can be either striped or raid1 - LVMType LVMType -} - -// Partition is a single partition on a device, only GPT partition types are supported -type Partition struct { - // Number of this partition, will be added to the device once partitioned - Number int - // Label to enhance readability - Label *string - // Size given in MebiBytes (MiB) - // if "0" is given the rest of the device will be used, this requires Number to be the highest in this partition - Size string - // GPTType defines the GPT partition type - GPTType *GPTType -} - -const ( - // VFAT is used for the UEFI boot partition - VFAT = Format("vfat") - // EXT3 is usually only used for /boot - EXT3 = Format("ext3") - // EXT4 is the default fs - EXT4 = Format("ext4") - // SWAP is for the swap partition - SWAP = Format("swap") - // None - NONE = Format("none") - - // GPTBoot EFI Boot Partition - GPTBoot = GPTType("ef00") - // GPTLinux Linux Partition - GPTLinux = GPTType("8300") - // GPTLinuxRaid Linux Raid Partition - GPTLinuxRaid = GPTType("fd00") - // GPTLinux Linux Partition - GPTLinuxLVM = GPTType("8e00") - - // LVMTypeLinear append across all physical volumes - LVMTypeLinear = LVMType("linear") - // LVMTypeStriped stripe across all physical volumes - LVMTypeStriped = LVMType("striped") - // LVMTypeStripe mirror with raid across all physical volumes - LVMTypeRaid1 = LVMType("raid1") -) -``` - -Example `metalctl` outputs: - 
-```bash -$ metalctl filesystemlayouts ls -ID DESCRIPTION SIZES IMAGES -default default fs layout c1-large-x86, c1-xlarge-x86 debian >=10, ubuntu >=20.04, centos >=7 -ceph fs layout for ceph s2-large-x86, s2-xlarge-x86 debian >=10, ubuntu >=20.04 -firewall firewall fs layout c1-large-x86, c1-xlarge-x86 firewall >=2 -storage storage fs layout s3-large-x86 centos >=7 -s3 storage fs layout s2-xlarge-x86 debian >=10, ubuntu >=20.04, >=firewall-2 -default-devel devel fs layout -``` - -The `default` layout reflects what is actually implemented in metal-hammer to guarantee backward compatibility. - -```yaml ---- -id: default -constraints: - sizes: - - c1-large-x86 - - c1-xlarge-x86 - images: - debian: ">=10" - ubuntu: ">=20.04" - centos: ">=7" -filesystems: - - path: "/boot/efi" - device: "/dev/sda1" - format: "vfat" - options: "-F 32" - label: "efi" # required to be compatible with old images - - path: "/" - device: "/dev/sda2" - format: "ext4" - label: "root" # required to be compatible with old images - - path: "/var/lib" - device: "/dev/sda3" - format: "ext4" - label: "varlib" # required to be compatible with old images - - path: "/tmp" - device: "tmpfs" - format: "tmpfs" - mountoptions: - [ - "defaults", - "noatime", - "nosuid", - "nodev", - "noexec", - "mode=1777", - "size=512M", - ] -disks: - - device: "/dev/sda" - wipe: true - partitions: - - number: 1 - label: "efi" - size: 500 - type: GPTBoot - - number: 2 - label: "root" - size: 5000 - type: GPTLinux - - number: 3 - label: "varlib" - size: 0 # to end of partition - type: GPTLinux -``` - -The `firewall` layout reuses the built in nvme disk to store the logs, which is way faster and larger than what the sata-dom ssd provides. 
- -```yaml ---- -id: firewall -constraints: - sizes: - - c1-large-x86 - - c1-xlarge-x86 - images: - firewall: ">=2" -filesystems: - - path: "/boot/efi" - device: "/dev/sda1" - format: "vfat" - options: "-F 32" - - path: "/" - device: "/dev/sda2" - format: "ext4" - - path: "/var" - device: "/dev/nvme0n1p1" - format: "ext4" -disks: - - device: "/dev/sda" - wipe: true - partitions: - - number: 1 - label: "efi" - size: 500 - type: GPTBoot - - number: 2 - label: "root" - size: 5000 - type: GPTLinux - - device: "/dev/nvme0n1" - wipe: true - partitions: - - number: 1 - label: "var" - size: 0 - type: GPTLinux -``` - -The `storage` layout will be used for the storage servers, which must have mirrored boot disks. - -```yaml ---- -id: storage -constraints: - sizes: - - s3-large-x86 - images: - centos: ">=7" -filesystems: - - path: "/boot/efi" - device: "/dev/md1" - format: "vfat" - options: "-F32" - - path: "/" - device: "/dev/md2" - format: "ext4" -disks: - - device: "/dev/sda" - wipe: true - partitions: - - number: 1 - label: "efi" - size: 500 - type: GPTLinuxRaid - - number: 2 - label: "root" - size: 5000 - type: GPTLinuxRaid - - device: "/dev/sdb" - wipe: true - partitions: - - number: 1 - label: "efi" - size: 500 - type: GPTLinuxRaid - - number: 2 - label: "root" - size: 5000 - type: GPTLinuxRaid -raid: - - name: "/dev/md1" - level: 1 - devices: - - "/dev/sda1" - - "/dev/sdb1" - options: "--metadata=1.0" - - name: "/dev/md2" - level: 1 - devices: - - "/dev/sda2" - - "/dev/sdb2" - options: "--metadata=1.0" -``` - -The `s3-storage` layout matches the special situation on the s2-xlarge machines. 
- -```yaml ---- -id: s3-storage -constraints: - sizes: - - c1-large-x86 - - s2-xlarge-x86 - images: - debian: ">=10" - ubuntu: ">=20.04" - centos: ">=7" -filesystems: - - path: "/boot/efi" - device: "/dev/sde1" - format: "vfat" - options: "-F 32" - - path: "/" - device: "/dev/sde2" - format: "ext4" - - path: "/var/lib" - device: "/dev/sde3" - format: "ext4" -disks: - - device: "/dev/sde" - wipe: true - partitions: - - number: 1 - label: "efi" - size: 500 - type: GPTBoot - - number: 2 - label: "root" - size: 5000 - type: GPTLinux - - number: 3 - label: "varlib" - size: 0 # to end of partition - type: GPTLinux -``` - -A sample `lvm` layout which puts `/var/lib` as stripe on the nvme device - -```yaml ---- -id: lvm -description: "lvm layout" -constraints: - size: - - s2-xlarge-x86 - images: - debian: ">=10" - ubuntu: ">=20.04" - centos: ">=7" -filesystems: - - path: "/boot/efi" - device: "/dev/sda1" - format: "vfat" - createoptions: - - "-F 32" - label: "efi" - - path: "/" - device: "/dev/sda2" - format: "ext4" - label: "root" - - path: "/var/lib" - device: "/dev/vg00/varlib" - format: "ext4" - label: "varlib" - - path: "/tmp" - device: "tmpfs" - format: "tmpfs" - mountoptions: - [ - "defaults", - "noatime", - "nosuid", - "nodev", - "noexec", - "mode=1777", - "size=512M", - ] -volumegroups: - - name: "vg00" - devices: - - "/dev/nvmne0n1" - - "/dev/nvmne0n2" -logicalvolumes: - - name: "varlib" - volumegroup: "vg00" - size: 200 - lvmtype: "striped" -disks: - - device: "/dev/sda" - wipeonreinstall: true - partitions: - - number: 1 - label: "efi" - size: 500 - gpttype: "ef00" - - number: 2 - label: "root" - size: 5000 - gpttype: "8300" - - device: "/dev/nvmne0n1" - wipeonreinstall: false - - device: "/dev/nvmne0n2" - wipeonreinstall: false -``` - -## Components which requires modifications - -- metal-hammer: - - change implementation from build in hard coded logic - - move logic to create fstab from install.sh to metal-hammer -- metal-api: - - new endpoint 
`filesystemlayouts` - - add optional spec of `filesystemlayout` during `allocation` with validation if given `filesystemlayout` is possible on given size. - - add `allocation.filesystemlayout` in the response, based on either the specified `filesystemlayout` or the calculated one. - - implement `filesystemlayouts` validation for: - - matching to disks in the size - - no overlapping with the sizes/imagefilter specified in `filesystemlayouts` - - all devices specified exists from top to bottom (fs -> disks -> device || fs -> raid -> devices) -- metalctl: - - implement `filesystemlayouts` -- metal-go: - - adopt api changes -- metal-images: - - install mdadm for raid support diff --git a/versioned_docs/version-v0.21.10/contributing/01-Proposals/MEP8/filesystems.drawio b/versioned_docs/version-v0.21.10/contributing/01-Proposals/MEP8/filesystems.drawio deleted file mode 100644 index 0f0c6ab5..00000000 --- a/versioned_docs/version-v0.21.10/contributing/01-Proposals/MEP8/filesystems.drawio +++ /dev/null @@ -1,43 +0,0 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - \ No newline at end of file diff --git a/versioned_docs/version-v0.21.10/contributing/01-Proposals/MEP8/filesystems.png b/versioned_docs/version-v0.21.10/contributing/01-Proposals/MEP8/filesystems.png deleted file mode 100644 index 6d903b7e..00000000 Binary files a/versioned_docs/version-v0.21.10/contributing/01-Proposals/MEP8/filesystems.png and /dev/null differ diff --git a/versioned_docs/version-v0.21.10/contributing/01-Proposals/MEP9/README.md b/versioned_docs/version-v0.21.10/contributing/01-Proposals/MEP9/README.md deleted file mode 100644 index a8cae83d..00000000 --- a/versioned_docs/version-v0.21.10/contributing/01-Proposals/MEP9/README.md +++ /dev/null @@ -1,132 +0,0 @@ ---- -slug: /MEP-9-no-open-ports-to-the-data-center -title: MEP-9 -sidebar_position: 9 ---- - -# No Open Ports To the Data Center - -Our metal-stack partitions typically have open ports for 
metal-stack native services, these are: - -- SSH port on the firewalls -- bmc-reverse-proxy for serial console access through the metal-console - -These open ports are potential security risks. For example, while SSH access is possible only with private key it's still vulnerable to DoS attack. - -Therefore, we want to get rid off these open ports to reduce the attack surface to the data center. - -## Requirements - -- Access to firewall SSH only via VPN -- Easy to update VPN components - -As a next step, we can also consider joining the management servers to the VPN mesh, which would replace typical WireGuard setups for operators to enter resources inside the partition. - -## High Level Design - -[](./architecture.svg) - -> Simplified drawing showing old vs. new architecture. - -### Concerns - -There's few concerns when using WireGuard for implementing VPN: - -1. WireGuard doesn't implement dynamic cipher substitution. Which is important in case one of the crypto methods, used by WireGuard will be broken. The only possible solution for that will be to update WireGuard to a fixed version. -2. Coordination server(Headscale) is a single point of failure. In case it fails, it potentially can disconnect existing members of the network, as WireGuard can't manage dynamic IPs by itself. -3. Headscale is already falls behind Tailscale coordination server implementation. Which can complicate the upgrade to newer version of Tailscale client in case of emergency. - -### Solutions to concerns - -1. Tailscale node software is using userspace implementation of WireGuard -- `wireguard-go`. One of the options is to inject Tailscale client into `metalctl`. And make it available as `metalctl vpn` or similar command. It should be possible to do as `tailscale` node is already available as open sourced Go pkg. That would allow us to control, what version of Tailscale users are using and in case of any critical changes to enforce them to update `metalctl` to use VPN functionality. -2. 
Would it be a considerable risk? We could look into `wg-dynamic` project to cover this problem. -3. At the moment, repository looks well maintained and the metal-stack team already contributes to it. - -## Implementation Details - -### metal-roles - -`metal-roles` will be responsible for deployment of `headscale` server(via new `headscale` role). It also should provide sufficient config to `metal-api` so it establishes connection with `headscale` gRPC server. - -### New `metalctl` commands - -`metalctl` will be responsible for client-side implementation of this MEP. Specifically, it's by using `metalctl` user expected to connect to firewalls. - -- `metalctl vpn` -- section for VPN related commands: - - `metalctl vpn get key [vpn name] --namespace [namespace name]` -- returns auth key to be used with `tailscale` client for establishing connection. - -Extend `metalctl firewall`: - -- `metalctl firewall ssh [ID]` -- connect to firewall via SSH. - -Extend `metalctl machine`: - -- `metalctl machine ssh [ID]` -- connect to machine via SSH. - -`metalctl` will be able to connect to firewall and machines by running `tailscale` in container. - -### metal-api - -Updates to `metal-api` should be made, so that it's able to add firewalls to VPNs. There should be one Tailscale namespace per project. So if multiple firewalls are created in single project, they will join the same namespace. - -Two new flags should be introduced to connect `metal-api` to `headscale` gRPC server: - -- `headscale-addr` -- specifies address of Headscale grpc API. -- `headscale-api-key` -- specifies temporary API key to connect to Headscale. It should be replaced and then rotated by `metal-api`. - -If `metal-api` initialized with `headscale` connection it should automatically join all created firewalls to VPN. - -Add new endpoint, that will be used by `metalctl` to connect to VPN: - -- `/v1/vpn GET` -- requests auth key from `headscale` server. 
- -### metal-hammer - -`metal-hammer` acts as an intermediary for machine configuration between `metal-api` and machine's image. Specifically it writes to `/etc/metal/install.yaml` file, data from which later will be used by image's `install.sh` file. - -To implement VPN support we have to add authentication key and VPN server address to `install.yaml` file. This key will be used to join machine to a VPN. - -### metal-images - -Images `install.sh` script have to be updated to work with authentication key and VPN server address, provided in `install.yaml` file. If this key is present, machine should connect to VPN. - -### metal-networker - -`metal-networker` also have to know if VPN was configured. In that case we need to disable public access to SSH and allow all(?) traffic from WireGuard interface. - -### firewall-controller - -`firewall-controller` have to monitor changes in `Firewall` resource and keep `tailscaled` version up-to-date. - -### Resources - -Update `Firewall` resource to include desired/actual `tailscale` version: - -``` -Firewall: - Spec: - tailscale: - Version: Minimal version - ... - Status: - ... - VPN: - Status: Boolean field - tailscale: - Version: Actual version - ... -``` - -### bmc-reverse-proxy - -TODO - -## References - -1. [WireGuard: Next Generation Secure Network Tunnel](https://www.youtube.com/watch?v=88GyLoZbDNw) -2. [How Tailscale works](https://tailscale.com/blog/how-tailscale-works) -3. [Tailscale is officially SOC 2 compliant](https://tailscale.com/blog/soc2) -4. [Why not Wireguard](https://www.ipfire.org/blog/why-not-wireguard) -5. [Wireguard: Known Limitations](https://www.wireguard.com/known-limitations/) -6. [Wireguard: Things That Might Be Accomplished](https://www.wireguard.com/todo/) -7. 
[Headscale: Tailscale control protocol v2](https://github.com/juanfont/headscale/issues/526) diff --git a/versioned_docs/version-v0.21.10/contributing/01-Proposals/MEP9/architecture.drawio b/versioned_docs/version-v0.21.10/contributing/01-Proposals/MEP9/architecture.drawio deleted file mode 100644 index adb09214..00000000 --- a/versioned_docs/version-v0.21.10/contributing/01-Proposals/MEP9/architecture.drawio +++ /dev/null @@ -1,324 +0,0 @@ - - - - - - - -
-
-
- Metal Control Plane -
-
-
-
- - Metal Control Plane - -
-
- - - - -
-
-
- metal-stack -
- Partition -
-
-
-
- - metal-stack... - -
-
- - - - -
-
-
- firewall -
-
-
-
- - firewall - -
-
- - - - - -
-
-
- machine -
-
-
-
- - machine - -
-
- - - - -
-
-
- ssh -
-
-
-
- - ssh - -
-
- - - - -
-
-
- bmc-proxy -
-
-
-
- - bmc-proxy - -
-
- - - - - - - - - - -
-
-
- Metal Control Plane -
-
-
-
- - Metal Control Plane - -
-
- - - - -
-
-
- metal-stack -
- Partition -
-
-
-
- - metal-stack... - -
-
- - - - -
-
-
- firewall -
-
-
-
- - firewall - -
-
- - - - - -
-
-
- machine -
-
-
-
- - machine - -
-
- - - - -
-
-
- ssh -
-
-
-
- - ssh - -
-
- - - - - - -
-
-
- bmc-proxy -
-
-
-
- - bmc-proxy - -
-
- - - - -
-
-
- headscale -
-
-
-
- - headscale - -
-
- - - - - - - - - - -
-
-
- tailscaled -
-
-
-
- - tailscaled - -
-
- - - - - - -
-
-
- tailscaled -
-
-
-
- - tailscaled - -
-
- - - - -
-
-
- Internet -
-
-
-
- - Internet - -
-
- - - - -
-
-
- Internet -
-
-
-
- - Internet - -
-
-
- - - - - Viewer does not support full SVG 1.1 - - - -
diff --git a/versioned_docs/version-v0.21.10/contributing/01-Proposals/MEP9/architecture.svg b/versioned_docs/version-v0.21.10/contributing/01-Proposals/MEP9/architecture.svg deleted file mode 100644 index fd268d2f..00000000 --- a/versioned_docs/version-v0.21.10/contributing/01-Proposals/MEP9/architecture.svg +++ /dev/null @@ -1 +0,0 @@ -
Metal Control Plane
Metal Control Plane
metal-stack
Partition
metal-stack...
firewall
firewall
machine
machine
ssh
ssh
bmc-proxy
bmc-proxy
Metal Control Plane
Metal Control Plane
metal-stack
Partition
metal-stack...
firewall
firewall
machine
machine
ssh
ssh
bmc-proxy
bmc-proxy
headscale
headscale
tailscaled
tailscaled
tailscaled
tailscaled
Internet
Internet
Internet
Internet
Text is not SVG - cannot display
\ No newline at end of file diff --git a/versioned_docs/version-v0.21.10/contributing/01-Proposals/_category_.json b/versioned_docs/version-v0.21.10/contributing/01-Proposals/_category_.json deleted file mode 100644 index ec1a4ebc..00000000 --- a/versioned_docs/version-v0.21.10/contributing/01-Proposals/_category_.json +++ /dev/null @@ -1,4 +0,0 @@ -{ - "position": 3, - "label": "Enhancement Proposals" -} diff --git a/versioned_docs/version-v0.21.10/contributing/01-Proposals/index.md b/versioned_docs/version-v0.21.10/contributing/01-Proposals/index.md deleted file mode 100644 index 9046bdf5..00000000 --- a/versioned_docs/version-v0.21.10/contributing/01-Proposals/index.md +++ /dev/null @@ -1,69 +0,0 @@ ---- -slug: /enhancement-proposals -title: Enhancement Proposals -sidebar_position: 1 ---- - -# Metal Stack Enhancement Proposals (MEPs) - -This section contains proposals which address substantial modifications to metal-stack. - -Every proposal has a short name which starts with _MEP_ followed by an incremental, unique number. Proposals should be raised as pull requests in the [website](https://github.com/metal-stack/website) repository and can be discussed in Github issues. - -The list of proposals and their current state is listed in the table below. - -Possible states are: - -- `In Discussion` -- `Accepted` -- `Declined` -- `In Progress` -- `Completed` -- `Aborted` - -Once a proposal was accepted, an issue should be raised and the implementation should be done in a separate PR. 
- -| Name | Description | State | -| :------------------------ | :--------------------------------------------- | :-------------: | -| [MEP-1](MEP1/README.md) | Distributed Control Plane Deployment | `Declined` | -| [MEP-2](MEP2/README.md) | Two Factor Authentication | `Aborted` | -| [MEP-3](MEP3/README.md) | Machine Re-Installation to preserve local data | `Completed` | -| [MEP-4](MEP4/README.md) | Multi-tenancy for the metal-api | `In Progress` | -| [MEP-5](MEP5/README.md) | Shared Networks | `Completed` | -| [MEP-6](MEP6/README.md) | DMZ Networks | `Completed` | -| MEP-7 | Passing environment variables to machines | `Declined` | -| [MEP-8](MEP8/README.md) | Configurable Filesystemlayout | `Completed` | -| [MEP-9](MEP9/README.md) | No Open Ports To the Data Center | `Completed` | -| [MEP-10](MEP10/README.md) | SONiC Support | `Completed` | -| [MEP-11](MEP11/README.md) | Auditing ^of metal-stack resources | `Completed` | -| [MEP-12](MEP12/README.md) | Rack Spreading | `Completed` | -| [MEP-13](MEP13/README.md) | IPv6 | `Completed` | -| [MEP-14](MEP14/README.md) | Independence from external sources | `Completed` | -| MEP-15 | HAL Improvements | `In Discussion` | -| [MEP-16](MEP16/README.md) | Firewall Support for Cluster API Provider | `In Discussion` | -| [MEP-17](MEP17/README.md) | Global Network View | `In Discussion` | -| [MEP-18](MEP18/README.md) | Autonomous Control Plane | `In Discussion` | - -## Proposal Process - -1. Before starting a new proposal, it is advised to have a quick chat with one of the maintainers. -2. Create a draft pull request in the [website](https://github.com/metal-stack/website) repository with your proposal. Your proposal doesn't have to be finished at this point. -3. Share the PR in the [metal-stack Slack](https://metal-stack.slack.com/) and invite maintainers to review it. -4. The review itself will probably take place in multiple iterations. Don't be discouraged if your proposal is not accepted right away. 
The goal is to reach consensus. -5. Once your proposal is accepted, create an umbrella issue in the relevant repository or when multiple repositories are involved in the [releases](https://github.com/metal-stack/releases). -6. Other issues should be created in different repositories and linked to the umbrella issue. -7. Unless stated otherwise, the proposer is responsible for the implementation of the proposal. - -## How to Write a Good MEP - -In the first section of your MEP, start with the current situation and the motivation for the change. Summarize your proposal briefly. - -Next follows the main part: describe your proposal in detail. Which parts of of metal-stack are affected? Are there API changes? If yes, describe them and provide examples here. -Try to think of side effects your proposal might have. Try to provide a view on how your proposal affects users of metal-stack. -Highlight breaking changes and think of a migration path for existing users. If your proposal affects multiple components, try to describe the interaction between them. - -After the main part of your proposal, feel free to add additional sections, e.g. about alternatives that were considered, non-goals or future possibilities. - -Depending on the complexity of your proposal, you might want to add a section about the implementation plan or roadmap. - -You can have a look at the existing MEPs for inspiration. As you will notice: not every MEP has the same structure. Feel free to structure your MEP in a way that makes sense for your proposal. 
diff --git a/versioned_docs/version-v0.21.10/contributing/02-planning-meetings.md b/versioned_docs/version-v0.21.10/contributing/02-planning-meetings.md deleted file mode 100644 index ef602204..00000000 --- a/versioned_docs/version-v0.21.10/contributing/02-planning-meetings.md +++ /dev/null @@ -1,51 +0,0 @@ ---- -slug: /planning-meetings -title: Planning Meetings -sidebar_position: 2 ---- - -# Planning Meetings - -Public planning meetings are held **biweekly** on **odd calendar weeks** from **14:00 to 14:30** on Microsoft Teams. The purpose is to provide an overview of our current projects and priorities, as well as to discuss new topics and issues within the group. - -Our [development planning board](https://github.com/orgs/metal-stack/projects/34) can be found on GitHub. - -You can use [this link](https://teams.microsoft.com/l/meetup-join/19%3ameeting_ZTVmNWFkYjYtMzVmYi00ZTMxLTk5ZTUtMGFjYjU2OTk0MjQz%40thread.v2/0?context=%7b%22Tid%22%3a%22f9d9b921-8f78-466d-95fd-4495e73d8d65%22%2c%22Oid%22%3a%228ac2a791-e637-4a90-8505-0a1ee175ebfc%22%7d) to join. If you want to get an invitation to the event, please drop us a line on our Slack channel. - -Planning meetings are currently not recorded. The meetings are held either in English or German depending on the attendees. - -:::info -Note that anyone can contribute to metal-stack without participating in planning meetings. However, if you want to speed up the review process for your requirements, it might be helpful to attend the meetings. 
-::: - -## Agenda - -Here is the agenda that we generally want to follow in a planning meeting: - -- Possibility to bring up news that are interesting for every developer of the metal-stack org -- Check `Done` column and archive cards - - Attendees have the chance to briefly present achievements if they want -- Check the `In Progress` column and discuss whether these tasks are still worked on, there were significant blockers or they can be lower-prioritized -- Check new issues labelled with `triage` and prioritize them -- Allow attendees to bring up issues and prioritize them - - Attendees have the chance to briefly present these new issues - -## Idea Backlog - -The backlog contains ideas of what could become part of the roadmap in the future. The list is ordered alphabetically. Therefore, the order does not express the importance or weight of a backlog item. - -We incorporate community feedback into the roadmap. If you think that important points are missing in the backlog, please share your ideas with us. We have a Slack channel. Please check out [metal-stack.io](https://metal-stack.io) for contact information. - -:::danger -By no means this list is a promise of what is being worked on in the near future. It is just a summary of ideas that was agreed on to be "nice to have". It is up to the investors, maintainers and the community to choose topics from this list and to implement them or to remove them from the list. 
-::: - -- Add metal-stack to [Gardener conformance test grid](https://testgrid.k8s.io/gardener-all) -- Autoscaler for metal control plane components -- CI dashboard and public integration testing -- Improved release and deploy processes (GitOps, [Spinnaker](https://spinnaker.io/), [Flux](https://fluxcd.io/)) -- Machine internet without firewalls -- metal-stack dashboard (UI) -- Offer our metal-stack extensions as enterprise products (accounting, cluster-api, S3) (neither of them will ever be required for running metal-stack, they just add extra value for certain enterprises) -- Partition managed by Kubernetes (with Kubelets joining the control plane cluster) -- Public offering / demo playground diff --git a/versioned_docs/version-v0.21.10/contributing/03-contribution-guideline.md b/versioned_docs/version-v0.21.10/contributing/03-contribution-guideline.md deleted file mode 100644 index 15a73d0d..00000000 --- a/versioned_docs/version-v0.21.10/contributing/03-contribution-guideline.md +++ /dev/null @@ -1,147 +0,0 @@ ---- -slug: /contribution-guideline -title: Contribution Guideline -sidebar_position: 3 ---- - -# Contribution Guideline - -This document describes the way we want to contribute code to the projects of metal-stack, which are hosted on [github.com/metal-stack](https://github.com/metal-stack). - -The document is meant to be understood as a general guideline for contributions, but not as burden to be placed on a developer. Use your best judgment when contributing code. Try to be as clean and precise as possible when writing code and try to make your code as maintainable and understandable as possible for other people. - -Even if it should go without saying, we live an open culture of discussion, in which everybody is welcome to participate. We treat every contribution with respect and objectiveness with the general aim to write software of quality. - -If you want, feel free to propose changes to this document in a pull request. - -## How Can I Contribute? 
- -Open a Github issue in the project you would like to contribute. Within the issue, your idea can be discussed. It is also possible to directly create a pull request when the set of changes is relatively small. - -When opening an issue please consider the following aspects: - -1. Create a meaningful issue describing the WHY? of your contribution. -1. Try to set appropriate labels to the issue. For example, attach the `triage` label to your issue if you want it to be discussed in the next [planning meeting](./02-planning-meetings.md). It might be useful to attend the meeting if you want to emphasize it being worked on. - -### Pull Requests - -The process described here has several goals: - -- Maintain quality -- Enable a sustainable system to review contributions -- Enable documented and reproducible addition of contributions - -1. Create a repository fork within the context of that issue. Members of the organization may work on the repository directly without a fork, which allows building development artifacts more easily. -1. Develop, document and test your contribution (try not to solve more than one issue in a single pull request). -1. Create a Draft Pull Request to the repository's main branch. -1. Create a meaningful description of the pull request or reference the related issue. The pull request template explains what the content should include, please read it. -1. Ask for merging your contribution by removing the draft marker. Repository maintainers (see [Code Ownership](#code-ownership)) are notified automatically, but you can also reach out to people directly on Slack if you want a review from a specific person. - -## General Objectives - -This section contains language-agnostic topics that all metal-stack projects are trying to follow. - -### Code Ownership - -The code base is owned by the entire team and every member is allowed to contribute changes to any of the projects. This is considered as collective code ownership[^1]. 
- -As a matter of fact, there are persons in a project, which already have experience with the sources. These are defined directly in the repository's [CODEOWNERS](https://docs.github.com/en/repositories/managing-your-repositorys-settings-and-features/customizing-your-repository/about-code-owners) file. If you want to merge changes into the master branch, it is advisable to include code owners into the process of discussion and merging. - -### Microservices - -One major ambition of metal-stack is to follow the idea of [microservices](https://en.wikipedia.org/wiki/Microservices). This way, we want to achieve that we can - -- adapt to changes faster than with monolithic architectures, -- be free of restrictions due to certain choices of technology, -- leverage powerful traits of cloud infrastructures (e.g. high-scalability, high-availability, ...). - -### Programming Languages - -We are generally open to write code in any language that fits best to the function of the software. However, we encourage [golang](https://en.wikipedia.org/wiki/Go_(programming_language)) to be the main language of metal-stack as we think that it makes development faster when not establishing too many different languages in our architecture. Reason for this is that we are striving for consistent behavior of the microservices, similar to what has been described for the Twelve-Factor App (see [12 Factor](https://12factor.net/)). We help enforcing unified behavior by allowing a small layer of shared code for every programming language. We will refer to this shared code as "libraries" for the rest of this document. - -### Artifacts - -Artifacts are always produced by a CI process (Github Actions). - -Docker images are published on the Github Container Registry of the metal-stack organization. - -Binary artifacts or OS images can be uploaded to `images.metal-stack.io` if necessary. 
- -When building Docker images, please consider our build tool [docker-make](https://github.com/fi-ts/docker-make) or the specific [docker-make action](https://github.com/fi-ts/action-docker-make) respectively. - -### APIs - -We are currently making use of [Swagger](https://swagger.io/) when we exposing traditional REST APIs for end-users. This helps us with being technology-agnostic as we can generate clients in almost any language using [go-swagger](https://goswagger.io/). Swagger additionally simplifies the documentation of our APIs. - -Most APIs though are not required to be user-facing but are of technical nature. These are preferred to be implemented using [grpc](https://grpc.io/). - -#### Versioning - -Artifacts are versioned by tagging the respective repository with a tag starting with the letter `v`. After the letter, there stands a valid [semantic version](https://semver.org/). - -### Documentation - -In order to make it easier for others to understand a project, we document general information and usage instructions in a `README.md` in any project. - -In addition to that, we document a microservice in the [docs](https://github.com/metal-stack/docs) repository. The documentation should contain the reasoning why this service exists and why it was being implemented the way it was being implemented. The aim of this procedure is to reduce the time for contributors to comprehend architectural decisions that were made during the process of writing the software and to clarify the general purpose of this service in the entire context of the software. - -## Guidelines - -This chapter describes general guidelines on how to develop and contribute code for a certain programming language. 
- -### Golang - -Development follows the official guide to: - -- Write clear, idiomatic Go code[^2] -- Learn from mistakes that must not be repeated[^3] -- Apply appropriate names to your artifacts: - - [https://go.dev/talks/2014/names.slide](https://go.dev/talks/2014/names.slide) - - [https://go.dev/blog/package-names](https://go.dev/blog/package-names) - - [https://go.dev/doc/effective_go#names](https://go.dev/doc/effective_go#names) -- Enable others to understand the reasoning of non-trivial code sequences by applying a meaningful documentation. - -#### Development Decisions - -- **Dependency Management** by using Go modules -- **Build and Test Automation** by using [GNU Make](https://man7.org/linux/man-pages/man1/make.1p.html). -- **End-user APIs** should consider using go-swagger and [Go-Restful](https://github.com/emicklei/go-restful) - **Technical APIs** should consider using [grpc](https://grpc.io/) - -#### Libraries - -metal-stack maintains several libraries that you should utilize in your project in order to unify common behavior. Some of these projects are: - -- [metal-go](https://github.com/metal-stack/metal-go) -- [metal-lib](https://github.com/metal-stack/metal-lib) - -#### Error Handling with Generated Swagger Clients - -From the server-side you should ensure that you are returning the common error json struct in case of an error as defined in the `metal-lib/httperrors`. Ensure you are using `go-restful >= v2.9.1` and `go-restful-openapi >= v0.13.1` (allows default responses with error codes other than 200). - -### Documentation - -We want to share knowledge and keep things simple. If things cannot kept simple we want to enable everybody to understand them by: - -- Document in short sentences[^4]. -- Do not explain the HOW (this is already documented by your code and documenting the obvious is considered a defect). -- Explain the WHY. Add a "to" in your documentation line to force yourself to explain the reasonning (e.g. "` to `"). 
- -### Python - -Development follows the official guide to: - -- Style Guide for Python Code (PEP 8)[^5] - - The use of an IDE like [PyCharm](https://www.jetbrains.com/pycharm/) helps to write compliant code easily -- Consider [setuptools](https://pythonhosted.org/an_example_pypi_project/setuptools.html) for packaging -- If you want to add a Python microservice to the mix, consider [pyinstaller](https://github.com/pyinstaller/pyinstaller) on Alpine to achieve small image sizes - -[^1]: [https://martinfowler.com/bliki/CodeOwnership.html](https://martinfowler.com/bliki/CodeOwnership.html) - -[^2]: [https://go.dev/doc/effective_go](https://go.dev/doc/effective_go) - -[^3]: [https://github.com/golang/go/wiki/CodeReviewComments](https://github.com/golang/go/wiki/CodeReviewComments) - -[^4]: [https://github.com/golang/go/wiki/CodeReviewComments#comment-sentences](https://github.com/golang/go/wiki/CodeReviewComments#comment-sentences) - -[^5]: [https://www.python.org/dev/peps/pep-0008/](https://www.python.org/dev/peps/pep-0008/) diff --git a/versioned_docs/version-v0.21.10/contributing/04-release-flow.md b/versioned_docs/version-v0.21.10/contributing/04-release-flow.md deleted file mode 100644 index 2a6403b7..00000000 --- a/versioned_docs/version-v0.21.10/contributing/04-release-flow.md +++ /dev/null @@ -1,107 +0,0 @@ ---- -slug: /release-flow -title: Release Flow -sidebar_position: 4 ---- - -# Releases - -The metal-stack contains of many microservices that depend on each other. The automated release flow is there to ensure that all components work together flawlessly for every metal-stack release. - -Releases and integration tests are published through our [release repository](https://github.com/metal-stack/releases). You can also find the [release notes](https://github.com/metal-stack/releases/releases) for this metal-stack version in there. The release notes contain information about new features, upgrade paths and bug fixes. 
- -If you want, you can sign up at our Slack channel where we are announcing every new release. Often, we provide additional information for metal-stack administrators and adopters at this place, too. - -This document is intended for developers, especially maintainers of metal-stack projects. - -## Release Flow - -The following diagram attempts to describe our current release flow: - -![](release_flow.svg) - -A release is created in the following way: - -- Individual repository maintainers within the metal-stack GitHub Organization can publish a release of their component. -- This release is automatically pushed to the `develop` branch of the release repository by the metal-robot. -- A push triggers a virtual release integration test using the mini-lab environment. This setup launches metal-stack with the `sonic` and `gardener` flavors to validate the different Ansible roles and execute basic operations across the metal-stack layer. -- To contribute components that are not directly part of the release vector, a pull request must be made against the `develop` branch of the release repository. Release maintainers may push directly to the `develop` branch. -- The release maintainers can `/freeze` the `develop` branch, effectively stopping the metal-robot from pushing component releases to this branch. -- The `develop` branch is tagged by a release maintainer with a `-rc.x` suffix to create a __release candidate__. -- The release candidate must pass a large integration test suite on a real environment, which is currently run by FI-TS. It tests the entire machine provisioning engine including the integration with Gardener, the deployment, metal-images and Kubernetes conformance tests. -- If the integration tests pass, the PR of the `develop` branch must be approved by at least two release maintainers. -- A release is created via GitHub releases, including all release notes, with a tag on the `main` branch. 
- -## FAQ - -**Question: I need PR #xyz to go into the release, why did you not include it?** - -Answer: It's not on purpose if we miss a PR to be included into a metal-stack release. Please use the pending pull request from `develop` into `master` as soon as it is open and comment which pull request you want to have included into the release. Also consider attending our planning meetings or contact us in our Slack channel if you have urgent requirements that need to be dealt with. - -**Question: Who is responsible for the releases? Who can freeze a release?** - -Answer: Every repository in metal-stack has a `CODEOWNERS` file pointing to a maintainer team. This is also true for the releases repository. Only release repository maintainers are allowed to `/freeze` a release (meaning the metal-robot does not automatically append new component releases to the release vector anymore). - -**Question: I can't push to the `develop` branch of this repository? How can I request changes to the release vector?** - -Answer: Most changes are automatically integrated by the metal-robot. For manually managed components, please raise a pull request against the `develop` branch. Only release maintainers are allowed to push to `develop` as otherwise it would be possible to mess up the release pipeline. - -**Question: What requirements need to be fulfilled to add a repository to the release vector?** - -Please see the section below named [Requirements for Release Vector Repositories](#requirements-for-release-vector-repositories). 
- -### Requirements for Release Vector Repositories - -Before adding a repository in the metal-stack org to the releases repository, it is advised for the maintainer to fulfill the following points: - -- The following files should be present at the repository root: - - [CODEOWNERS](https://docs.github.com/en/repositories/managing-your-repositorys-settings-and-features/customizing-your-repository/about-code-owners) - - When a repository is created, the metal-robot automatically creates a -maintainers team in our GitHub org. - - The CODEOWNERS file should reference this team. - - The team should contain at least two maintainers. - - `LICENSE` - - This usually should be MIT with "metal-stack" as authors. - - `CONTRIBUTING.md` - - This should contain the following content: - ``` - # Contributing - - Please check out the [contributing section](https://docs.metal-stack.io/stable/development/contributing/) in our [docs](https://docs.metal-stack.io/). - ``` - - `README.md` -- The `developers-core` team should be given repository access with `write` role, the codeowners team should have the `maintain` role -- Release artifacts should have an SPDX-formatted SBOM attached. - - For container images these are embedded using Buildx. -- The following branch protection rules should be set: - - The mainline should be protected. - - A pull request should be required before merging (required by at least one code owner). - - Status checks should be required to pass. - - Force push should not be allowed on this branch. -- One person from the releases maintainers has to add the repository to the metal-robot in order to pick up the releases, add them to the release vector and generate release notes. - -### How-To Release a Project - -[release-drafter](https://github.com/release-drafter/release-drafter) is preferred in order to generate release notes from merged PRs for your projects. It should be triggered for pushes on your main branch. 
- -The draft is then used to create a project release. The release has to be published through the GitHub UI as demonstrated in the screenshot below. - -**Tagging the repository is not enough as repository tagging does not associate your release notes to your release!** - -![](release.png) - -Some further remarks: - -- Use semver versions with `v` prefix for your tags -- Name your release after your release tag -- The metal-robot only picks up lines from your release notes that start with `-` or `*` (unordered list items) and appends them to the according section in the aggregated release draft -- A tag created through a GitHub UI release does not trigger a `push` event . This means, your pipeline will not start to run with the `push` trigger when publishing through the UI. - - Instead, use the `published` [release event trigger](https://docs.github.com/en/actions/reference/events-that-trigger-workflows#release) for your actions: - - ```yaml - on: - release: - types: - - published - ``` -- In case they are necessary, please do not forget to include `NOTEWORTHY`, `ACTIONS_REQUIRED` or `BREAKING_CHANGE` sections into releases. More information on those release draft sections can be read in a pull request template. 
diff --git a/versioned_docs/version-v0.21.10/contributing/05-community.md b/versioned_docs/version-v0.21.10/contributing/05-community.md deleted file mode 100644 index 61eaf099..00000000 --- a/versioned_docs/version-v0.21.10/contributing/05-community.md +++ /dev/null @@ -1,11 +0,0 @@ ---- -slug: /community -title: Community -sidebar_position: 5 -draft: true ---- - -# Community - -(Slack channel, community events like FOSDEM, Kubernetes Community Days..., blog -articles) diff --git a/versioned_docs/version-v0.21.10/contributing/release.png b/versioned_docs/version-v0.21.10/contributing/release.png deleted file mode 100644 index 598b1182..00000000 Binary files a/versioned_docs/version-v0.21.10/contributing/release.png and /dev/null differ diff --git a/versioned_docs/version-v0.21.10/contributing/release_flow.drawio b/versioned_docs/version-v0.21.10/contributing/release_flow.drawio deleted file mode 100644 index 6ca6b34f..00000000 --- a/versioned_docs/version-v0.21.10/contributing/release_flow.drawio +++ /dev/null @@ -1,721 +0,0 @@ - - - - - - - - - - - -
-
-
- Review release notes -
-
-
-
- - Review release notes - -
-
-
- - - - - - - -
-
-
- projects -
-
-
-
- - projects - -
-
-
- - - - - - - -
-
-
- projects -
-
-
-
- - projects - -
-
-
- - - - - - - - -
-
-
- Organization Webhook -
-
-
-
- - Organization Webhook - -
-
-
- - - - - - - -
-
-
- projects -
-
-
-
- - projects - -
-
-
- - - - - - - - -
-
-
- - Publish release - -
-
-
-
- - Publish release - -
-
-
- - - - - - - - -
-
-
- Maintainer -
-
-
-
- - Maint... - -
-
-
- - - - - - - - - - - - - - - -
-
-
- metal-robot release handler -
-
-
-
- - metal-robot release han... - -
-
-
- - - - - - - - -
-
-
- - no - -
-
-
-
- - no - -
-
-
- - - - - - - - -
-
-
- - yes - -
-
-
-
- - yes - -
-
-
- - - - - - - -
-
-
- version in event newer than release vector version -
-
-
-
- - version in event newer than... - -
-
-
- - - - - - - -
-
-
- - do nothing - -
-
-
-
- - do nothing - -
-
-
- - - - - - - - - - - - -
-
-
- Github Action -
-
-
-
- - Github Action - -
-
-
- - - - - - - -
-
-
- Bump version in release vector and push to - - develop - -
-
-
-
- - Bump version in release vector... - -
-
-
- - - - - - - - - - - -
-
-
- Open pull request from - - develop - - to - - master - -
-
-
-
- - Open pull request from develop... - -
-
-
- - - - - - - -
-
-
- Update aggregated release draft in - - metal-stack/releases - -
-
-
-
- - Update aggregated release draf... - -
-
-
- - - - - - - - - - - - - - - -
-
-
- Integration Testing -
-
-
-
- - Integration Testing - -
-
-
- - - - - - - - - - - -
-
-
- Merge to - - master - -
-
-
-
- - Merge to master - -
-
-
- - - - - - - - - - - - -
-
-
- Review -
-
-
-
- - Review - -
-
-
- - - - - - - - - - - - - - - -
-
-
- Tests suceeded and PR changes reviewed -
-
-
-
- - Tests suceeded and PR chang... - -
-
-
- - - - - - - -
-
-
- - publish results to #integration - -
-
-
-
- - publish results to #integr... - -
-
-
- - - - - - - - - - - - - - - -
-
-
- Release metal-stack -
-
-
-
- - Release metal-stack - -
-
-
- - - - - - - - - - - -
-
-
- - publish to #announcements - -
-
-
-
- - publish to #announcements - -
-
-
- - - - - - - -
-
-
- - - metal-stack/docs - - pull request - -
-
-
-
- - metal-stack/docs pull requ... - -
-
-
- - - - - - - - - - - - -
-
-
- Freeze -
-
-
-
- - Freeze - -
-
-
- - - - - - - - - - - - - - - -
-
-
- Freeze - - develop - - and create a release candidate -
-
-
-
- - Freeze develop and create a rel... - -
-
-
- - - - - - - -
-
-
- Large integration suites -
- - (currently owned by FI-TS, not public) - -
-
-
-
-
- - Large integration suites... - -
-
-
- - - - - - - - -
-
-
- Run -
-
-
-
- - Run - -
-
-
- - - - -
- - - - - Text is not SVG - cannot display - - - -
diff --git a/versioned_docs/version-v0.21.10/contributing/release_flow.svg b/versioned_docs/version-v0.21.10/contributing/release_flow.svg deleted file mode 100644 index 55cdd493..00000000 --- a/versioned_docs/version-v0.21.10/contributing/release_flow.svg +++ /dev/null @@ -1 +0,0 @@ -
Review release notes
Review release notes
projects
projects
projects
projects
Organization Webhook
Organization Webhook
projects
projects
Publish release
Publish release
Maintainer
Maint...
metal-robot release handler
metal-robot release han...
no
no
yes
yes
version in event newer than release vector version
version in event newer than...
do nothing
do nothing
Github Action
Github Action
Bump version in release vector and push todevelop
Bump version in release vector...
Open pull request fromdeveloptomaster
Open pull request from develop...
Update aggregated release draft inmetal-stack/releases
Update aggregated release draf...
Integration Testing
Integration Testing
Merge tomaster
Merge to master
Review
Review
Tests suceeded and PR changes reviewed
Tests suceeded and PR chang...
publish results to #integration
publish results to #integr...
Release metal-stack
Release metal-stack
publish to #announcements
publish to #announcements
metal-stack/docspull request
metal-stack/docs pull requ...
Freeze
Freeze
Freezedevelopand create a release candidate
Freeze develop and create a rel...
Large integration suites
(currently owned by FI-TS, not public)
Large integration suites...
Run
Run
Text is not SVG - cannot display
\ No newline at end of file diff --git a/versioned_docs/version-v0.21.10/docs/02-General/04-flavors-of-metalstack.md b/versioned_docs/version-v0.21.10/docs/02-General/04-flavors-of-metalstack.md index 7da427fc..2277ca6b 100644 --- a/versioned_docs/version-v0.21.10/docs/02-General/04-flavors-of-metalstack.md +++ b/versioned_docs/version-v0.21.10/docs/02-General/04-flavors-of-metalstack.md @@ -14,7 +14,7 @@ As modern infrastructure and cloud native applications are designed with Kuberne Regardless which flavor of metal-stack you use, it is always possible to manually provision machines, networks and ip addresses. This is the most basic way of using metal-stack and is very similar to how traditional bare metal infrastructures are managed. -Using plain metal-stack without additional layer was not a focus in the past. Therefore firewall and role management might be premature. These will be addressed by [MEP-4](../../contributing/01-Proposals/MEP4/README.md) and [MEP-16](../../contributing/01-Proposals/MEP16/README.md) in the future. +Using plain metal-stack without additional layer was not a focus in the past. Therefore firewall and role management might be premature. These will be addressed by [MEP-4](/community/MEP-4-multi-tenancy-for-the-metal-api) and [MEP-16](/community/MEP-16-metal-api-as-an-alternative-configuration-source-for-the-firewall-controller) in the future. ## Gardener diff --git a/versioned_docs/version-v0.21.10/docs/04-For Operators/03-deployment-guide.mdx b/versioned_docs/version-v0.21.10/docs/04-For Operators/03-deployment-guide.mdx index 58ddafd3..6be800cd 100644 --- a/versioned_docs/version-v0.21.10/docs/04-For Operators/03-deployment-guide.mdx +++ b/versioned_docs/version-v0.21.10/docs/04-For Operators/03-deployment-guide.mdx @@ -31,7 +31,7 @@ You can use the [mini-lab](https://github.com/metal-stack/mini-lab) as a templat The metal control plane is typically deployed in a Kubernetes cluster. 
Therefore, this document will assume that you have a Kubernetes cluster ready for getting deployed. Even though it is theoretically possible to deploy metal-stack without Kubernetes, we strongly advise you to use the described method because we believe that Kubernetes gives you a lot of benefits regarding the stability and maintainability of the application deployment. :::tip -For metal-stack it does not matter where your control plane Kubernetes cluster is located. You can of course use a cluster managed by a hyperscaler. This has the advantage of not having to setup Kubernetes by yourself and could even become beneficial in terms of fail-safe operation. However, we also describe a solution of how to setup metal-stack with a self-hosted, [Autonomous Control Plane](../../contributing/01-Proposals/MEP18/README.md) cluster. The only requirement from metal-stack is that your partitions can establish network connections to the metal control plane. If you are interested, you can find a reasoning behind this deployment decision [here](../05-Concepts/01-architecture.mdx#target-deployment-platforms). +For metal-stack it does not matter where your control plane Kubernetes cluster is located. You can of course use a cluster managed by a hyperscaler. This has the advantage of not having to setup Kubernetes by yourself and could even become beneficial in terms of fail-safe operation. However, we also describe a solution of how to setup metal-stack with a self-hosted, [Autonomous Control Plane](/community/MEP-18-autonomous-control-plane) cluster. The only requirement from metal-stack is that your partitions can establish network connections to the metal control plane. If you are interested, you can find a reasoning behind this deployment decision [here](../05-Concepts/01-architecture.mdx#target-deployment-platforms). 
::: Let's start off with a fresh folder for your deployment: diff --git a/versioned_docs/version-v0.21.10/docs/05-Concepts/01-architecture.mdx b/versioned_docs/version-v0.21.10/docs/05-Concepts/01-architecture.mdx index 709960e3..75298df9 100644 --- a/versioned_docs/version-v0.21.10/docs/05-Concepts/01-architecture.mdx +++ b/versioned_docs/version-v0.21.10/docs/05-Concepts/01-architecture.mdx @@ -152,4 +152,4 @@ Thus, for creating a partition as well as a machine or a firewall, the flags `dn In order to be fully offline resilient, make sure to check out `metal-image-cache-sync`. This component provides copies of `metal-images`, `metal-kernel` and `metal-hammer`. -This feature is related to [MEP14](../../contributing/01-Proposals/MEP14/README.md). +This feature is related to [MEP14](/community/MEP-14-independence-from-external-sources). diff --git a/versioned_docs/version-v0.21.10/docs/05-Concepts/02-user-management.md b/versioned_docs/version-v0.21.10/docs/05-Concepts/02-user-management.md index f1ee2778..ba742ee9 100644 --- a/versioned_docs/version-v0.21.10/docs/05-Concepts/02-user-management.md +++ b/versioned_docs/version-v0.21.10/docs/05-Concepts/02-user-management.md @@ -7,7 +7,7 @@ sidebar_position: 2 # User Management At the moment, metal-stack can more or less be seen as a low-level API that does not scope access based on projects and tenants. -Fine-grained access control with full multi-tenancy support is actively worked on in [MEP4](../../contributing/01-Proposals/MEP4/README.md). +Fine-grained access control with full multi-tenancy support is actively worked on in [MEP4](/community/MEP-4-multi-tenancy-for-the-metal-api). Until then projects and tenants can be created, but have no effect on access control. 
diff --git a/versioned_docs/version-v0.21.10/docs/06-For CISOs/Security/01-principles.md b/versioned_docs/version-v0.21.10/docs/06-For CISOs/Security/01-principles.md index 8e7030f5..e327ec4a 100644 --- a/versioned_docs/version-v0.21.10/docs/06-For CISOs/Security/01-principles.md +++ b/versioned_docs/version-v0.21.10/docs/06-For CISOs/Security/01-principles.md @@ -15,7 +15,7 @@ The minimal need to know principle is a security concept that restricts access t ### RBAC :::info -As of now metal-stack does not implement fine-grained Role-Based Access Control (RBAC) within the `metal-api` but this is worked on in [MEP-4](../../../contributing/01-Proposals/MEP4/README.md). +As of now metal-stack does not implement fine-grained Role-Based Access Control (RBAC) within the `metal-api` but this is worked on in [MEP-4](/community/MEP-4-multi-tenancy-for-the-metal-api). ::: As described in our [User Management](../../05-Concepts/02-user-management.md) concept the [metal-api](https://github.com/metal-stack/metal-api) currently offers three different user roles for authorization: diff --git a/versioned_docs/version-v0.21.10/docs/06-For CISOs/Security/04-communication-matrix.md b/versioned_docs/version-v0.21.10/docs/06-For CISOs/Security/04-communication-matrix.md index 07df2607..24c1bc1d 100644 --- a/versioned_docs/version-v0.21.10/docs/06-For CISOs/Security/04-communication-matrix.md +++ b/versioned_docs/version-v0.21.10/docs/06-For CISOs/Security/04-communication-matrix.md @@ -116,7 +116,7 @@ Please note that every [networking setup](../../05-Concepts/03-Network/01-theory | VLAN | Switches, Firewalls | Layer 2 traffic segmentation. | | VXLAN | Switches, Firewalls | Encapsulate Layer 2 frames in Layer 3 packets for network virtualization. | | EVPN | Switches, Firewalls | Overlay network technology for scalable and flexible network architectures. | -| VPN | Firewalls | Management access [without open SSH ports](../../../contributing/01-Proposals/MEP9/README.md). 
| +| VPN | Firewalls | Management access [without open SSH ports](/community/MEP-9-no-open-ports-to-the-data-center). | | BGP | Multiple | Routing protocol for dynamic routing and network management. | | SSH | Management Server, Switches | Secure shell access for management and configuration. | | LLDP | Switches, Machines | Link Layer Discovery Protocol for network device discovery. | diff --git a/versioned_docs/version-v0.21.10/docs/06-For CISOs/rbac.md b/versioned_docs/version-v0.21.10/docs/06-For CISOs/rbac.md index 9a87b896..06c902bb 100644 --- a/versioned_docs/version-v0.21.10/docs/06-For CISOs/rbac.md +++ b/versioned_docs/version-v0.21.10/docs/06-For CISOs/rbac.md @@ -31,4 +31,4 @@ To ensure that internal components interact securely with the metal-api, metal-s Users can interact with the metal-api using [metalctl](https://github.com/metal-stack/metalctl), the command-line interface provided by metal-stack. Depending on the required operations, users should authenticate with the appropriate role to match their level of access. -As part of [MEP-4](../../contributing/01-Proposals/MEP4/README.md), significant work is underway to introduce more fine-grained access control mechanisms within metal-stack, enhancing the precision and flexibility of permission management. +As part of [MEP-4](/community/MEP-4-multi-tenancy-for-the-metal-api), significant work is underway to introduce more fine-grained access control mechanisms within metal-stack, enhancing the precision and flexibility of permission management. 
diff --git a/versioned_docs/version-v0.21.10/docs/06-For CISOs/remote-access.md b/versioned_docs/version-v0.21.10/docs/06-For CISOs/remote-access.md index 0b8dbb19..dc24e82f 100644 --- a/versioned_docs/version-v0.21.10/docs/06-For CISOs/remote-access.md +++ b/versioned_docs/version-v0.21.10/docs/06-For CISOs/remote-access.md @@ -6,7 +6,7 @@ title: Remote Access ## Machines and Firewalls -Remote access to machines and firewalls is essential for performing administrative tasks such as incident management, troubleshooting and sometimes for development. Standard SSH access is often insufficient for these purposes. In many cases, direct serial console access is required to fully manage the system. metal-stack follows a security-first approach by not offering direct SSH access to machines. This practice reduces the attack surface and prevents unauthorized access that could lead to system damage. Detailed information can be found in [MEP-9](../../contributing/01-Proposals/MEP9/README.md). Administrators can access machines in two primary ways. +Remote access to machines and firewalls is essential for performing administrative tasks such as incident management, troubleshooting and sometimes for development. Standard SSH access is often insufficient for these purposes. In many cases, direct serial console access is required to fully manage the system. metal-stack follows a security-first approach by not offering direct SSH access to machines. This practice reduces the attack surface and prevents unauthorized access that could lead to system damage. Detailed information can be found in [MEP-9](/community/MEP-9-no-open-ports-to-the-data-center). Administrators can access machines in two primary ways. 
**Out-of-band management via SOL** @@ -26,4 +26,4 @@ This approach uses the [`metal-console`](../08-References/Control%20Plane/metal- Both methods ensure secure and controlled access to machines without exposing them unnecessarily to the network, maintaining the integrity and safety of the infrastructure. -Connecting directly to a machine without a clear plan of action can have unintended consequences and negatively impact stability. For this reason, administrative privileges are required. This restriction ensures that only authorized personnel with the necessary expertise can perform actions that affect the underlying infrastructure. These principles will evolve with the introduction of [MEP-4](../../contributing/01-Proposals/MEP4/README.md). \ No newline at end of file +Connecting directly to a machine without a clear plan of action can have unintended consequences and negatively impact stability. For this reason, administrative privileges are required. This restriction ensures that only authorized personnel with the necessary expertise can perform actions that affect the underlying infrastructure. These principles will evolve with the introduction of [MEP-4](/community/MEP-4-multi-tenancy-for-the-metal-api). 
\ No newline at end of file diff --git a/versioned_docs/version-v0.21.11/contributing/01-Proposals/MEP1/Distributed-API-Working.png b/versioned_docs/version-v0.21.11/contributing/01-Proposals/MEP1/Distributed-API-Working.png deleted file mode 100644 index 899e223d..00000000 Binary files a/versioned_docs/version-v0.21.11/contributing/01-Proposals/MEP1/Distributed-API-Working.png and /dev/null differ diff --git a/versioned_docs/version-v0.21.11/contributing/01-Proposals/MEP1/Distributed-API.png b/versioned_docs/version-v0.21.11/contributing/01-Proposals/MEP1/Distributed-API.png deleted file mode 100644 index 688c7c2e..00000000 Binary files a/versioned_docs/version-v0.21.11/contributing/01-Proposals/MEP1/Distributed-API.png and /dev/null differ diff --git a/versioned_docs/version-v0.21.11/contributing/01-Proposals/MEP1/Distributed-Deployment.png b/versioned_docs/version-v0.21.11/contributing/01-Proposals/MEP1/Distributed-Deployment.png deleted file mode 100644 index 8bba51b8..00000000 Binary files a/versioned_docs/version-v0.21.11/contributing/01-Proposals/MEP1/Distributed-Deployment.png and /dev/null differ diff --git a/versioned_docs/version-v0.21.11/contributing/01-Proposals/MEP1/Distributed.drawio b/versioned_docs/version-v0.21.11/contributing/01-Proposals/MEP1/Distributed.drawio deleted file mode 100644 index f7c6fe79..00000000 --- a/versioned_docs/version-v0.21.11/contributing/01-Proposals/MEP1/Distributed.drawio +++ /dev/null @@ -1 +0,0 @@ 
-7V3bcts2EP0aP8pDgPfH2ImbziRTt+m0zVMHpmCJMUUoFG1L/fqC4kUkAEoULwDkRC8WIBIkF2d3z+6C8JV5u9r+kqD18jOZ4+gKGvPtlfn+CtKPY9M/Wc8u7/FtM+9YJOE87wKHji/hf7joNIre53CON40DU0KiNFw3OwMSxzhIG30oSchr87BHEjWvukYLzHV8CVDE9/4dztNl3uvZxqH/Iw4Xy/LKwCh+eUDB0yIhz3FxvStoPu4/+c8rVI5VHL9Zojl5rXWZH67M24SQNP+22t7iKJNtKbb8vLuWX6v7TnCcdjkhWVoPJP5zF3/w/trOvG+vv70+z4rJe0HRMy4fw4noeDePhA5L5YmC/Afn+3N2pzcfcfSC0zBAh67s4dJd1DwuO3+22U/3O3oAgOtt/Qxnkf29R0kapiGJy6vS288vnP9eSK4aHFIhrrOvz6voLkEr+vXmdRmm+Ms6v89XClLat0xXEW0B+nU/QziTgUFb1SRkjYCswqD4HqEHHN1UU3pLIpLQn2ISZ9fYpAl5qvABise7Q6swymD/F07mKEZFd4FxkA2LonAR00ZAJwkn1RO94CTF29aJBBU8qNphssJpsqOHFCfMbLeAVKFywC0x+VpDcNm3rKHX9opOVGjNohr9gBz6pQDPGUACkEPSBif0OWkfHGEayyfzMqlWj2QaI4kUcCK1eJGajkikYASRrrboX/K79Qh+vYe77+Ef8RMBM4+TaECCp4SgYClDoJz0BDJuF6jFCNQ0O8oTGiPI055D4NsPc8+Yo0cAwMy0OJHhOfUDRZMk6ZIsSIyiD4dexnTUZDdHm+W+H3SwHNTE3YXZne5HwfH8Xea1souucZz3NH+v24+OZqZ1xrKHPDpfCY5QGr40naFI9PtT6a2jXe2ANQnjdFMb+T7rOMDAAoxaGdBvOqkzT6Bf8nsQn256LadXd7whz0mAi9MYQFVi6a+zrsCfDsTd4ZhPhKwL0H3DaborEICeU9LE5x50JcyCCG02mZ9rYBEcQ00upCOPWdAGOt4Cp0eOc8ZG4SCDypOdmkE1ADdTpVGlPGFNtTkTUuXQI/yYNTfUvobx4tO+9d50xrKeVhPHlsDBAyiwns5Uzsg5Krt2Dy9fdpUMVMgOuCh4QLZredg2fedhBji5MQT7bObckZJzBNt498OQ7HKm3Wm4jW0zXsbmEWaL6LdlTqWegLdesv1MC+Hp72T8SSgMxxkoN2yhquUYuZubjDP4nImgI6JohtahRmbVYsxqlcBR5pLKkPMtYb4U6uSgh23xmSTQlw9aRz3apJmNT5FGsIeedrA3j1ExzfMC0G6BnZS0gFieloCivcGYDXQN2oBeURu4mLCNtRXqozZwMWEbSy80kJ0ol6ModLv5GbqNgjIuza9B5OYp9zbjs1hJoRub7qVs4tqofWBzwKkp7UUEcmx6TD2hrQrkb0gDJqq/8PUSnk8r1IAKjXoHdWx2XQMVgAKuwerEoXLIhAd8dw3neBum/2R4vva8ovl137SL1vttgfZ9Y1dr3OMkpM+X+eWiNkmfNR8LOGW7NljWPIy2b+3qLXa8TurVllE/Hca4HVWw4fv5SS/7hmZc2KyxYzNoailNCkYymJHY2HhqNb/kDAS3MgFUpFBdDgL+IDkI2DUHAfXKQcCLyUFwpWMABSuZJHu3i8lCcMVjHaR3Mg8hbY3mzxLyVCVkv3Miwp0MZ9omIg4UtuSsX2vkVsxfB/goVXXnAxGRxeMuImHBEzZDoCxybXKZDdRPV/rj3pSUsuBKz9Jxb15GmoKiTD/g84mKywn9gK9f5GfysbRy0zJF5FcuwD8Z+Zn22GZo2PzwkbmmkV/1opk+oYt5PGzWKPCzWFOrgfD4qPln/fnC4z4AtAv7LNEKddYB/Zilh6PpmNOOrGsKU1H9AVg+g6neBQhQJR0lMXhLVIGIN4QCVhuXwr9TKqTvIm2fzKdYGr6f1vqiZKXxdkPhT6h7f822Or/VZnXU7IE
qa9sN/Hjsy9tTKxmfHvoJlrPB4koCi8nSf8Pyr53Dx5WLHZ75o3V4nacX6ewFT9ch0chWM6badQSW2pVpqUvd11DXpGbj7dELwS3qw754LjspafPhnobJeMC9YK88Jeko8Uo1j+Oe5XL0UzGna0RTsu7JKwSA8WU+u8fKxLs4OKauxrd1lk/zEElvFtpmv7k7OdCe0Hjm4WNLtc8Onwiu7POMzn2WW9DGTHM1U19Q6QCmVDMtWgS0D9mv12WmcfZwiiHS50+bWjOCtNglU1WgVRMWFMXpk70U4vBxOi8spERYM2hoJy1tV64McMqSVqFwhQ/ZvNfhsww6FuNN/RahlHekcxKUyxSrz4G6auOFqtGtejEtKZS31o0tfPVlhTPTsBlEWdkrTwda6HQyX+fuZMc/QQHa9hvltrLzGmc0t7IbbQM6vjKSJd6Uswb2VU31rMG9JELP5l3U83lXSYJSiRmdPPdosablaKDb1VTyywfdPgH0uZaSf5rjhpJbBi3HTvLhaNNOqglF2bWxUs2keGNnTk7Vvs7ti9+02dfZZsEld19noUQhJ1EVlvSsFZ+MBTwZ0gqfu2AmdVIqPM4aaGQHTc7RV1tHXu45ENoMvxRuZjJZRNo+c5KWew4SHrcBgHLRqdkEY0TtFhSRhMf5KrUb8gost1bYY3WK1NnJNxdUNT181nuaEvi4hhf4ULn1UJvTOrcG3r++PYoy+BehDNLy4sO0wWTLtOq1QZMdPUdELOjKnZUittxtUpkVgr2sEKjZoNKSyTBDnSd1aEDUK43DRheGb9cBcvL4MhvZdrx7/PjBWZ+jIq9ZBmo4E7KlkqHgNUH+2tGpF1rVcw4otfwoliX//6mUqH/FJdzuZKI3cxlT/btycqX5EMGmlhdKNaESrtl5lod67l5GnjPC7P+QZIuaFjx+xkRmmw8ML8Jss2km9Ua73GjuMhIgvWz7iMor2q7uCEDHXzbB/vMJg90zcrzFWWIB6OHjVUwpHDqlw/SUf39Kx1QYYMtr6oN/qDoI7ctPFJm4zpnhiUycrdrECZLOGubZ9FO0Mu/3hnxD18SwWtdwZNe+KdatjVws8UTAtaQCF7414JYb2p0mNbZK5Ir23dMXuRy38UT/JmAk5NJmQrLtlw6uLUHr5Wcyx9kR/wM= \ No newline at end of file diff --git a/versioned_docs/version-v0.21.11/contributing/01-Proposals/MEP1/Distributed.png b/versioned_docs/version-v0.21.11/contributing/01-Proposals/MEP1/Distributed.png deleted file mode 100644 index d96ca216..00000000 Binary files a/versioned_docs/version-v0.21.11/contributing/01-Proposals/MEP1/Distributed.png and /dev/null differ diff --git a/versioned_docs/version-v0.21.11/contributing/01-Proposals/MEP1/README.md b/versioned_docs/version-v0.21.11/contributing/01-Proposals/MEP1/README.md deleted file mode 100644 index 0fd4bb63..00000000 --- a/versioned_docs/version-v0.21.11/contributing/01-Proposals/MEP1/README.md +++ /dev/null @@ -1,141 +0,0 @@ ---- -slug: /MEP-1-distributed-metal-control-plane -title: MEP-1 -sidebar_position: 1 ---- - -# Distributed Metal Control Plane - -This enhancement 
proposal was replaced by [MEP18](../MEP18/README.md). - -## Problem Statement - -We face the situation that we argue for running bare metal on-premises because this way the customers can control where and how their software and data are processed and stored. -On the other hand, we have currently decided that our metal-api control plane components run on a kubernetes cluster (in our case on a cluster provided by one of the available hyperscalers). - -Running the control plane on Kubernetes has the following benefits: - -- Ease of deployment -- Get most, if not all, of the required infrastructure services like (probably incomplete): - - IPs - - DNS - - L7-Loadbalancing - - Storage - - S3 Backup - - High Availability - -Using a kubernetes as a service offering from one of the hyperscalers, enables us to focus on using kubernetes instead of maintaining it as well. - -## Goal - -It would be much saner if metal-stack has no, or only minimal dependencies to external services. Imagine a metal-stack deployment in a plant, it would be optimal if we only have to deliver a single rack with servers and networking gear installed and wired, plug that rack to the power supply and a internet uplink and its ready to go. - -Have a second plant which you want to be part of all your plants? Just tell both that they are part of something bigger and metal-api knows of two partitions. - -## Possible Solutions - -We can think of two different solutions to this vision: - -1. Keep the central control plane approach and require some sort of kubernetes deployment accessible from the internet. This has the downside that the user must, provide a managed kubernetes deployment in his own datacenter or uses a hyperscaler. Still not optimal. -1. Install the metal-api and all its dependencies in every partition, replicate or shard the databases to every connected partition, make them know each other. 
Connect the partitions over the internet with some sort of vpn to make the services visible to each other. - -As we can see, the first approach does not really address the problem, therefore i will describe solution #2 in more details. - -## Central/Current setup - -### Stateful services - -Every distributed system suffer from handling state in a scalable, fast and correct way. To start how to cope with the state, we first must identify which state can be seen as partition local only and which state must be synchronous for read, and synchronous for writes across partitions. - -Affected states: - -- masterdata: e.g. tenant and project must be present in every partition, but these are entities which are read often but updates are rare. A write can therefore be visible with a decent delay in a distinct partition with no consequences. -- ipam: the prefixes and ip´s allocated from machines. These entities are also read often and rare updates. But we must differentiate between dirty reads for different types. A machine network is partition local, ips acquired from such a network must by synchronous in the same partition. Ips acquired from global networks such as internet must by synchronous for all partitions, as otherwise a internet ip could be acquired twice. -- vrf ids: they must only be unique in one partition -- image and size configurations: read often, written seldom, so no high requirements on the storage of these entities. -- images: os images are already replicated from a central s3 storage to a per partition s3 service. metal-hammer kernel and initrd are small and pull always from the central s3, can be done similar to os images. -- machine and machine allocation: must be only synchronous in the partition -- switch: must be only synchronous in the partition -- nsq messages: do not need to cross partition boundaries. No need to keep the messages persistent, even the opposite is true, we don't want to have the messages persist for a longer period. 
- -Now we can see that the most critical state to held and synchronize are the IPAM data, because these entities must be guaranteed to be synchronously updated, while being updated frequently. - -Datastores: - -We use three different types of datastores to persist the states of the metal application. - -- rethinkdb is the main datastore for almost all entities managed by metal-api -- postgresql is used for masterdata and ipam data. -- nsq uses disk and memory tho store the messages. - -### Stateless services - -These are the easy part, all of our services which are stateless can be scaled up and down without any impact on functionality. Even the stateful services like masterdata and metal-api rely fully on the underlying datastore and can therefore also be scaled up and down to meet scalability requirements. - -Albeit, most of these services need to be placed behind a loadbalancer which does the L4/L7 balancing across the started/available replicas of the service for the clients talking to it. This is actually provided by kubernetes with either service type loadbalancer or type clusterip. - -One exception is the `metal-console` service which must have the partition in it´s dns name now, because there is no direct network connectivity between the management networks of the partitions. See "Network Setup) - -## Distributed setup - -### State - -In order to replicate certain data which must be available across all partitions we can use on of the existing open source databases which enable such kind of setup. There are a few available out there, the following incomplete list will highlight the pro´s and cons of each. - -- RethinkDB - - We already store most of our data in RethinkDB and it gives already the ability to synchronize the data in a distributed manner with different guarantees for consistency and latency. This is described here: [Scaling, Sharding and replication](https://rethinkdb.com/docs/sharding-and-replication/). 
But because rethinkdb has a rough history and unsure future with the last release took more than a year, we in the team already thought that we eventually must move away from rethinkdb in the future. - -- Postgresql - - Postgres does not have a multi datacenter with replication in both directions, it just can make the remote instance store the same data. - -- CockroachDB - - Is a Postgresql compatible database engine on the wire. CockroachDB gives you both, ACID and geo replication with writes allowed from all connected members. It is even possible to configure [Follow the Workload](https://www.cockroachlabs.com/docs/stable/topology-follow-the-workload) and [Geo Partitioning and Replication](https://www.cockroachlabs.com/docs/v19.2/topology-geo-partitioned-replicas). - -If we migrate all metal-api entities to be stored the same way we store masterdata, we could use cockroachdb to store all metal entities in one ore more databases spread across all partitions and still ensure consistency and high availability. - -A simple setup how this would look like is shown here. - -![Simple CockroachDB setup](Distributed.png) - -go-ipam was modified in a example PR here: [PR 17](https://github.com/metal-stack/go-ipam/pull/17) - -### API Access - -In order to make the metal-api accessible for api users like `cloud-api` or `metalctl` as easy at it is today, some effort has to be taken. One possible approach would be to use a external loadbalancer which spread the requests evenly to all metal-api endpoints in all partitions. Because all data are accessible from all partitions, a api request going to partition A with a request to create a machine in partition B, will still work. If on the other hand partition B is not in a connected state because the interconnection between both partitions is broken, then of course the request will fail. 
- -**IMPORTANT** -The NSQ Message to inform `metal-core` must end in the correct partition - -To provide such a external loadbalancer we have several opportunities: - -- Cloudflare or comparable CDN service. -- BGP Anycast from every partition - -Another setup would place a small gateway behind the metal-api address, which forwards to the metal-api in the partition where the request must be executed. This gateway, `metal-api-router` must inspect the payload, extract the desired partition, and forward the request without any modifications to the metal-api endpoint in this partition. This can be done for all requests, or if we want to optimize, only for write accesses. - -## Network setup - -In order to have the impact to the overall security concept as minimal as possible i would not modify the current network setup. The only modifications which has to be made are: - -- Allow https ingress traffic to all metal-api instances. -- Allow ssh ingress traffic to all metal-console instances. -- Allow CockroachDB Replication between all partitions. -- No NSQ traffic from outside required anymore, except we cant solve the topic above. - -A simple setup how this would look like is shown here, this does not work though because of the forementioned NSQ issue. - -![API and Console Access](Distributed-API.png) - -Therefore we need the `metal-api-router`: - -![Working API and Console Access](Distributed-API-Working.png) - -## Deployment - -The deployment of our components will substantially differ in a partition compared to a the deployment we have actually. Deploying it in kubernetes in the partition would be very difficult to achieve because we have no sane way to deploy kubernetes on physical machines without a underlying API. -I would therefore suggest to deploy our components in the same way we do that for the services running on the management server. Use systemd to start docker containers. 
- -![Deployment](Distributed-Deployment.png) diff --git a/versioned_docs/version-v0.21.11/contributing/01-Proposals/MEP10/README.md b/versioned_docs/version-v0.21.11/contributing/01-Proposals/MEP10/README.md deleted file mode 100644 index 6811cdc0..00000000 --- a/versioned_docs/version-v0.21.11/contributing/01-Proposals/MEP10/README.md +++ /dev/null @@ -1,197 +0,0 @@ ---- -slug: /MEP-10-sonic-support -title: MEP-10 -sidebar_position: 10 ---- - -# SONiC Support - -As writing this proposal, metal-stack only supports Cumulus on Broadcom ASICs. Unfortunately, after the acquisition of -Cumulus Networks by Nvidia, Broadcom decided to cut its relationship with Cumulus, and therefore Cumulus 4.2 is the last -version that supports Broadcom ASICs. Since trashing the existing hardware is not a solution, adding support for a -different network operating system is necessary. - -One of the remaining big players is [SONiC](https://sonic-net.github.io/SONiC/), which Microsoft created to scale the -network of Azure. It's an open-source project and is now part of the [Linux Foundation](https://www.linuxfoundation.org/press/press-release/software-for-open-networking-in-the-cloud-sonic-moves-to-the-linux-foundation). - -For a general introduction to SONiC, please follow the [Architecture](https://github.com/sonic-net/SONiC/wiki/Architecture) official -documentation. - -## ConfigDB - -On a cold start, the content of `/etc/sonic/config_db.json` will be loaded into the Redis database `CONFIG_DB`, and both -contain the switch's configuration except the BGP unnumbered configuration, which still has to be configured directly by -the frr configuration files. The SONiC community is working to remove this exception, but no release date is known. - -## BGP Configuration - -Frr runs inside a container, and a shell script configured it on the container startup. 
For BGP unnumbered, we must set -the configuration variable `docker_routing_config_mode` to `split` to prevent SONiC from overwriting our configuration -files created by `metal-core`. But by using the split mode, the integrated configuration mode of frr is deactivated, and -we have to write our BGP configuration to the daemon-specific files `bgp.conf`, `staticd.conf`, and `zebra.conf` instead -to `frr.conf`. - -```bash -elif [ "$CONFIG_TYPE" == "split" ]; then - echo "no service integrated-vtysh-config" > /etc/frr/vtysh.conf - rm -f /etc/frr/frr.conf -``` - -Reference: [docker-init](https://github.com/sonic-net/sonic-buildimage/blob/202205/dockers/docker-fpm-frr/docker_init.sh#L69) - -Adding support for the integrated configuration mode, we must at least adjust the startup shell script and the supervisor configuration: - -```bash -{% if DEVICE_METADATA.localhost.docker_routing_config_mode is defined and DEVICE_METADATA.localhost.docker_routing_config_mode == "unified" %} -[program:vtysh_b] -command=/usr/bin/vtysh -b -``` - -Reference: [supervisord.conf](https://github.com/sonic-net/sonic-buildimage/blob/202205/dockers/docker-fpm-frr/frr/supervisord/supervisord.conf.j2#L157) - -## Non-BGP Configuration - -For the Non-BGP configuration we have to write it into the Redis database directly or via one of the following interfaces: - -- `config replace ` -- the Mgmt Framework -- the SONiC restapi - -Directly writing into the Redis database isn't a stable interface, and we must determine the create, delete, and update -operations on our own. The last point is also valid for the Mgmt Framework and the SONiC restapi. Furthermore, the -Mgmt Framework doesn't start anymore for several months, and a [potential fix](https://github.com/sonic-net/sonic-buildimage/pull/10893) -is still not merged. And the SONiC restapi isn't enabled by default, and we must build and maintain our own SONiC images. 
- -Using `config replace` would reduce the complexity in the `metal-core` codebase because we don't have to determine the -actual changes between the running and the desired configuration. The approach's drawbacks are using a version of SONiC -that contains the PR [Yang support for VXLAN](https://github.com/sonic-net/sonic-buildimage/pull/7294), and we must provide -the whole new startup configuration to prevent unwanted deconfiguration. - -### Configure Loopback interface and activate VXLAN - -```json -{ - "LOOPBACK_INTERFACE": { - "Loopback0": {}, - "Loopback0|": {} - }, - "VXLAN_TUNNEL": { - "vtep": { - "src_ip": "" - } - } -} -``` - -#### Configure MTU - -```json -{ - "PORT": { - "Ethernet0": { - "mtu": "9000" - } - } -} -``` - -#### Configure PXE Vlan - -```json -{ - "VLAN": { - "Vlan4000": { - "vlanid": "4000" - } - }, - "VLAN_INTERFACE": { - "Vlan4000": {}, - "Vlan4000|": {} - }, - "VLAN_MEMBER": { - "Vlan4000|": { - "tagging_mode": "untagged" - } - }, - "VXLAN_TUNNEL_MAP": { - "vtep|map_104000_Vlan4000": { - "vlan": "Vlan4000", - "vni": "104000" - } - } -} -``` - -#### Configure VRF - -```json -{ - "INTERFACE": { - "Ethernet0": { - "vrf_name": "vrf104001" - } - }, - "VLAN": { - "Vlan4001": { - "vlanid": "4001" - } - }, - "VLAN_INTERFACE": { - "Vlan4001": { - "vrf_name": "vrf104001" - } - }, - "VRF": { - "vrf104001": { - "vni": "104001" - } - }, - "VXLAN_TUNNEL_MAP": { - "vtep|map_104001_Vlan4001": { - "vlan": "Vlan4001", - "vni": "104001" - } - } -} -``` - -## DHCP Relay - -The DHCP relay container only starts if `DEVICE_METADATA.localhost.type` is equal to `ToRRouter`. - -## LLDP - -SONiC always uses the local port subtype for LLDP and sets it to some freely configurable alias field of the interface. - -```python -# Get the port alias. If None or empty string, use port name instead -port_alias = port_table_dict.get("alias") -if not port_alias: - self.log_info("Unable to retrieve port alias for port '{}'. 
Using port name instead.".format(port_name)) - port_alias = port_name - -lldpcli_cmd = "lldpcli configure ports {0} lldp portidsubtype local {1}".format(port_name, port_alias) -``` - -Reference: [lldpmgr](https://github.com/sonic-net/sonic-buildimage/blob/202205/dockers/docker-lldp/lldpmgrd#L153) - -## Mgmt Interface - -The mgmt interface is `eth0`. To configure a static IP address and activate the Mgmt VRF, use: - -```json -{ - "MGMT_INTERFACE": { - "eth0|": { - "gwaddr": "" - } - }, - "MGMT_VRF_CONFIG": { - "vrf_global": { - "mgmtVrfEnabled": "true" - } - } -} -``` - -[IP forwarding is deactivated on `eth0`](https://github.com/sonic-net/sonic-buildimage/blob/202205/files/image_config/sysctl/sysctl-net.conf#L7), and no IP Masquerade is configured. diff --git a/versioned_docs/version-v0.21.11/contributing/01-Proposals/MEP11/README.md b/versioned_docs/version-v0.21.11/contributing/01-Proposals/MEP11/README.md deleted file mode 100644 index 87f48a10..00000000 --- a/versioned_docs/version-v0.21.11/contributing/01-Proposals/MEP11/README.md +++ /dev/null @@ -1,78 +0,0 @@ ---- -slug: /MEP-11-auditing-of-metal-stack-resources -title: MEP-11 -sidebar_position: 11 ---- - -# Auditing of metal-stack resources - -Currently no logs of the ownership of resources like machines, networks, ips and volumes are generated or kept. Though due to legal requirements data centers are required to keep track of this ownership over time to prevent liability issues when opening the platform for external users. - -In this proposal we want to introduce a flexible and low-maintenance approach for auditing on top of [Meilisearch](https://www.meilisearch.com/). - -## Overview - -In general our auditing logs will be collected by a request interceptor or middleware. Every request and response will be processed and eventually logged to Meilisearch. -Meilisearch will be configured to regularly create chunks of the auditing logs. 
These finished chunks will be backed up to a S3 compatible storage with a read-only option enabled. - -Of course sensitive data like session keys or passwords will be redacted before logging. We want to track relevant requests and responses. If auditing the request fails, the request itself will be aborted and will not be processed further. The requests and responses that will be audited will be annotated with a correlation id. - -Transferring the meilisearch auditing data chunks to the S3 compatible storage will be done by a sidecar cronjob that is executed periodically. -To avoid data manipulation the S3 compatible storage will be configured to be read-only. - -## Whitelisting - -To reduce the amount of unnecessary logs we want to introduce a whitelist of resources and operations on those that should be logged. -Other requests will be passed directly to the next middleware or web service without any further processing. - -As we are only interested in mutating endpoints, we ignore all `GET` requests. -The whitelist includes all `POST`, `PUT`, `PATCH` and `DELETE` endpoints of the HTTP middleware except for the following (non-manipulating) route suffixes: - -- `/find` -- `/notify` -- `/try` and `/match` -- `/capacity` -- `/from-hardware` - -Regarding GRPC audit trails, they are not so interesting because only internal clients are using this API. However, we can log the trails of the `Boot` service, which can be interesting to revise the machine lifecycle. - -## Chunking in Meilisearch - -We want our data to be chunked in Meilisearch. To accomplish this, we rotate the index identifier on a scheduled basis. The index identifiers will be derived from the current date and time. - -To keep things simple, we only support hourly, daily and monthly rotation. The eventually prefixed index names will only include relevant parts of date and time like `2021-01`, `2021-01-01` or `2021-01-01_13`. 
- -The metal-api will only write to the current index and switches to the new index on rotation. The metal-api will never read or update data in any indices. - -## Moving chunks to S3 compatible storage - -As Meilisearch will be filled with data over time, we want to move completed chunks to an S3 compatible storage. This will be done by a sidecar cronjob that is executed periodically. Note that the periods of the index rotation and the cronjob execution don't have to match. - -When the backup process gets started, it initiates a [Meilisearch dump](https://www.meilisearch.com/docs/learn/advanced/dumps) of the whole database across all indices. Once the returned task is finished, the dump must be copied from a Meilisearch volume to the S3 compatible storage. After a successful copy, the dump can be deleted. - -Now we want to remove all indices from Meilisearch, except the most recent one. For this, we [get all indices](https://www.meilisearch.com/docs/reference/api/indexes#list-all-indexes), sort them and [delete each index](https://www.meilisearch.com/docs/reference/api/indexes#delete-an-index) except the most recent one to avoid data loss. - -For the actual implementation, we can build upon [backup-restore-sidecar](https://github.com/metal-stack/backup-restore-sidecar). But due to the index rotation and the fact that older indices need to be deleted, this probably does not fit into the mentioned sidecar. - -## S3 compatible storage - -The dumps of chunks should automatically be deleted after a certain amount of time, once we are either no longer allowed or required to keep them. -The default retention time will be 6 months. Ideally already uploaded chunks should be read-only to prevent data manipulation. - -A candidate for the S3 compatible storage is Google Cloud Storage, which allows configuring automatic expiration of objects through a [lifecycle rule](https://cloud.google.com/storage/docs/managing-lifecycles?hl=en#storage-set-lifecycle-config-go). 
- -## Affected components - -- metal-api grpc server needs an auditing interceptor -- metal-api web server needs an auditing filter chain / middleware -- metal-api needs new command line arguments to configure the auditing -- mini-lab needs a Meilisearch instance -- mini-lab may need a local S3 compatible storage -- we need a sidecar to implement the backup to S3 compatible storage -- Consider auditing of volume allocations and freeings outside of metal-stack - -## Alternatives considered - -Instead of using Meilisearch we investigated using an immutable database like [immudb](https://immudb.io/). But immudb does not support chunking of data and due to its immutable nature, we will never be able to free up space of expired data. Even if we are legally allowed or required to delete data, we will not be able to do so with immudb. - -In another variant of the Meilisearch approach the metal-api would also be responsible for copying chunks to the S3 compatible storage and deleting old indices. But separating the concerns allows completely different implementations for every deployment stage. diff --git a/versioned_docs/version-v0.21.11/contributing/01-Proposals/MEP12/README.md b/versioned_docs/version-v0.21.11/contributing/01-Proposals/MEP12/README.md deleted file mode 100644 index 65532c57..00000000 --- a/versioned_docs/version-v0.21.11/contributing/01-Proposals/MEP12/README.md +++ /dev/null @@ -1,34 +0,0 @@ ---- -slug: /MEP-12-rack-spreading -title: MEP-12 -sidebar_position: 12 ---- - -# Rack Spreading - -Currently, when creating a machine through the metal-api, the machine is placed randomly inside a partition. This algorithm does not consider spreading machines across different racks and different chassis. This may lead to the situation that a group of machines (that for example form a cluster) can end up being placed in the same rack and the same chassis. 
- -Spreading a group of machines across racks can enhance availability for scenarios like a rack losing power or a chassis meltdown. - -So, instead of just randomly deciding the placement of a machine candidate, we want to propose a placement strategy that attempts to spread machine candidates across the racks inside a partition. - -Furthermore a followup improvement to guarantee that machines are really spread across multiple racks, even if multiple machines are ordered in parallel, was implemented with [PR490](https://github.com/metal-stack/metal-api/pull/490). - -## Placement Strategy - -Machines in the project are spread across all available racks evenly within a partition (best effort). For this, an additional request to the datastore has to be made in order to find allocated machines within the project in the partition. - -The algorithm will then figure out the least occupied racks and elect a machine candidate randomly from those racks. - -The user can optionally pass placement tags which will be considered for spreading the machines as well (this will for example allow spreading by a cluster id tag inside the same project). - -## API - -```golang -// service/v1/machine.go - -type MachineAllocation struct { - // existing fields are omitted for readability - PlacementTags []string `json:"placement_tags" description:"by default machines are spread across the racks inside a partition for every project. 
if placement tags are provided, the machine candidate has an additional anti-affinity to other machines having the same tags"` -} -``` diff --git a/versioned_docs/version-v0.21.11/contributing/01-Proposals/MEP13/README.md b/versioned_docs/version-v0.21.11/contributing/01-Proposals/MEP13/README.md deleted file mode 100644 index 2dde20f5..00000000 --- a/versioned_docs/version-v0.21.11/contributing/01-Proposals/MEP13/README.md +++ /dev/null @@ -1,111 +0,0 @@ ---- -slug: /MEP-13-dual-stack-support -title: MEP-13 -sidebar_position: 13 ---- - -# Dual-stack Support - -dual-stack support is required to be able to create Kubernetes clusters with either IPv6 single-stack or dual-stack enabled. -With the inherent scarcity of IPv4 addresses, the need to be able to use IPv6 has increased. - -Full IPv6 dual-stack support was added to Kubernetes with v1.23 as stable. - -Gardeners have had full IPv6 dual-stack support since `v1.109`. - -metal-stack manages CIDRs and IP addresses with the [go-ipam](https://github.com/metal-stack/go-ipam) library, which already got full IPv6 support in 2021 (see [https://metal-stack.io/blog/2021/02/ipv6-part1](https://metal-stack.io/blog/2021/02/ipv6-part1)). -But this was only the foundation, more work needs to be done to get full IPv6 support for all aspects managed by metal-stack.io. - -## General Decisions - -For the general decision we do not look at the isolated clusters feature for now as this would make the solution even more complex and we want to introduce IPv6 in smaller steps to the users. - -### Networks - -Currently, metal-stack organizes CIDRs / prefixes into a `network' resource in the metal-api. A network can consist of multiple CIDRs from the same address family. For example, if an operator wants to provide Internet connectivity to provisioned machines, they can start with small network CIDRs. The number of managed network prefixes can then be expanded as needed over time. 
- -With dual-stack we have to choose between two options: Network per address family or networks with both address families. These options are described in the next section. - -#### Network per Address Family - -This means that we allow networks with CIDRs from one address family only, one for IPv4 and one for IPv6. - -The machine creation process will not change if the machine only needs to be either IPv4 or IPv6 addressable. -But if, on the other hand, the machine needs to be able to connect to both address families, the machine creation needs to specify two networks, one for IPv4 and one for IPv6. -Also there will be 2 distinct VRF IDs for every network with a different address family. - -#### Network with both Address Families - -Make a network dual address family capable, meaning that you can add multiple CIDRs from both address families to a network. -Then the machine creation will remain the same for single-stack and dual-stack cases, but the IP address allocation will need to specify the address family from which to allocate an IP address when the network is dual-stack. -This does not break the existing API, but allows existing extensions to easily add dual-stack support. -To avoid additional checking of which address families are available on this network during an IP allocation call, we could store the address families in the network. - -#### Decision - -The decision was made to go with having both address families in a single network entity because we think this is the most flexible way to support dual-stack machines and Kubernetes clusters as well as single-stack with the least amount of modifications on the networking side. 
- -### Examples - -To illustrate the usage we start by creating a tenant super network which has both address families: - -```yaml ---- -id: tenant-super-network-mini-lab -name: Project Super Network -description: Super network of all project networks -partitionid: mini-lab -prefixes: - - 10.0.0.0/16 - - 2001:db8:0:10::/64 -defaultchildprefixlength: - IPv4: 22 - IPv6: 96 -privatesuper: true -``` - -In order to create this network, we simply call: - -```bash -metalctl network create -f tenant-super.yaml -``` - -This is usually done during the initial setup of the environment. - -Next step is to allocate a tenant network where the machines of a project can be placed: - -```bash -metalctl network allocate --partition mini-lab --project 4b9b17c4-2d7c-4190-ae95-dda44e430fa6 --name my-node-network -``` - -This leads to the following network allocation: - -```yaml -id: 2d2c0350-3f66-4597-ae97-ef6797232212 -name: my-node-network -parentnetworkid: tenant-super-network-mini-lab -partitionid: mini-lab -prefixes: - - 10.0.0.0/22 - - 2001:db8:0:10::/96 -projectid: 4b9b17c4-2d7c-4190-ae95-dda44e430fa6 -vrf: 20 -consumption: - ipv4: - available_ips: 1024 - available_prefixes: 256 - used_ips: 2 - used_prefixes: 0 - ipv6: - available_ips: 2147483647 - available_prefixes: 1073741824 - used_ips: 1 - used_prefixes: 0 -privatesuper: false -``` - -Users can then create IP addresses from these child networks. By default, they retrieve an IPv4 address unless a super network only consists of IPv6 prefixes. In the latter case the users acquire an IPv6 address. 
- -```bash -metalctl network ip create --network 2d2c0350-3f66-4597-ae97-ef6797232212 --project 4b9b17c4-2d7c-4190-ae95-dda44e430fa6 -``` diff --git a/versioned_docs/version-v0.21.11/contributing/01-Proposals/MEP14/README.md b/versioned_docs/version-v0.21.11/contributing/01-Proposals/MEP14/README.md deleted file mode 100644 index 47c06434..00000000 --- a/versioned_docs/version-v0.21.11/contributing/01-Proposals/MEP14/README.md +++ /dev/null @@ -1,36 +0,0 @@ ---- -slug: /MEP-14-independence-from-external-sources -title: MEP-14 -sidebar_position: 14 ---- - -# Independence from external sources - -In certain situations some customers may need to operate and create machines without making use of external services like DNS or NTP through the internet. To make this possible, all metal-stack components reaching external services need to be configurable with custom endpoints. - -So far, the following components have been identified as requiring changes: - -- pixiecore -- metal-hammer -- metal-images - -More components are likely to be added to the list during processing. -For DNS and NTP servers it should be possible to provide default values within a partition. They can either be inherited from machines and firewalls or overwritten with own ones. - -## pixiecore - -A NTP server endpoint need to be configured on the pixiecore. This can be achieved by providing it through environment variables on start up. - -## metal-hammer - -If using a self-deployed NTP server, also the metal-hammer need to be configured with it. For backward compatibility, default values from `pool.ntp.org` and `time.google.com` are used. - -## metal-images - -Configurations for the `metal-images` are different for machines and firewalls. - -## metalctl - -In order to pass DNS and NTP servers to partitions and machines while creating them, the flags `dnsservers` and `ntpservers` need to be added. 
- -The implementation of this MEP will make it possible for metal-stack to create and maintain machines without requiring an internet connection. diff --git a/versioned_docs/version-v0.21.11/contributing/01-Proposals/MEP16/README.md b/versioned_docs/version-v0.21.11/contributing/01-Proposals/MEP16/README.md deleted file mode 100644 index 205670ab..00000000 --- a/versioned_docs/version-v0.21.11/contributing/01-Proposals/MEP16/README.md +++ /dev/null @@ -1,318 +0,0 @@ ---- -slug: /MEP-16-metal-api-as-an-alternative-configuration-source-for-the-firewall-controller -title: MEP-16 -sidebar_position: 16 ---- - -# metal-api as an Alternative Configuration Source for the firewall-controller - -In the current situation, a firewall as provisioned by metal-stack is a fully immutable entity. Any modifications on the firewall like changing the firewall ruleset must be done _somehow_ by the user – the metal-api and hence metal-stack is not aware of its current state. - -As part of our [integration with the Gardener project](https://docs.metal-stack.io/stable/overview/kubernetes/#Gardener) we offer a solution called the [firewall-controller](https://github.com/metal-stack/firewall-controller), which is part of our [firewall OS images](https://github.com/metal-stack/metal-images/blob/6318a624861b18a559a9d37299bca5f760eef524/firewall/Dockerfile#L57-L58) and addresses shortcomings of the firewall resource's immutability, which would otherwise be completely impracticable to work with. The firewall-controller crashes infinitely if it is not properly configured through the userdata when using the firewall image of metal-stack. - -The firewall-controller approach is tightly coupled to Gardener and it requires the administrator of the Gardener installation to pass a shoot and a seed kubeconfig through machine userdata when creating the firewall. 
How this userdata has to look like is not documented and is just part of another project called the [firewall-controller-manager](https://github.com/metal-stack/firewall-controller-manager) (FCM), which task is to orchestrate rolling updates of firewall machines in a way that network traffic interruption is minimal when updating a firewall or applying a change to an immutable firewall configuration. - -In general, a firewall entity in metal-stack has similarities to the machine entity but it has a fundamental difference: A user gains ownership over a machine after provisioning. They can access it through SSH, modify it at will and this is completely wanted. For firewalls, however, we do not want a user to access the provisioned firewall as the firewall is a privileged part of the infrastructure with access to the underlay network. The underlay can not be tampered with at any given point in time by a user as it can destroy the entire network traffic flow inside a metal-stack partition. - -For this reason, we have a gap in the metal-stack project in terms of a missing solution for people who do not rely on the Gardener integration. We are basically leaving a user with the option to implement an orchestrated recreation of every possible change on the firewall to minimize traffic interruption for the machines sitting behind the firewall or re-implement the firewall-controller to how they want to use it for their use-case. - -Also we do not have a clear distinction in the API between user and metal-stack operator for firewalls. If a user would allocate a firewall it is also possible for the user to inject his own SSH keys and access the firewall and tamper with the underlay network. - -Parts of these problems are probably going to decrease with the work on [MEP-4](../MEP4/README.md) where there will be dedicated APIs for users and administrators of metal-stack including fine-grained access tokens. 
- -With this MEP we want to describe a way to improve this current situation and allow other users that do not rely on the Gardener integration – for whatever motivation they have – to adequately manage firewalls. For this, we propose an alternative configuration for the firewall-controller that is native to metal-stack and more independent of Gardener. - -## Proposal - -The central idea of this proposal is allowing the firewall-controller to use the metal-api as a configuration source. This should serve as an alternative strategy to the currently used FCM `Firewall` resource based approach in the Gardener use-case. -Updates of the firewall rules should be possible through the metal-api. - -The firewall-controller itself should now be able to decide which of the two main strategies should be used for the base configuration: a kubeconfig or the metal-api. This should be possible through a dedicated _firewall-controller-config_. - -Using this config will now allow operators to fine-tune the data sources for all of its dynamic configuration tasks independently. -For example the data source of the core firewall rules could be set either from the `Firewall` resource located in the Gardener `Seed` or the metal-apiserver node network entity, while the CWNPs should be fetched and applied from a given kubeconfig (the `Shoot` Kubeconfig in the Gardener case). -This configuration file is intended to be injected during firewall creation through the userdata along with potential source connection credentials. 
- -```yaml -# the name of the firewall, defaulted to the hostname -name: best-firewall-ever - -sources: - seed: - kubeconfig: /path/to/seed.yaml # current gardener behavior - namespace: shoot--proj--name - shoot: - kubeconfig: /path/to/shoot.yaml # current gardener behavior - namespace: firewall - metal: - url: https://metal-api - hmac: some-hmac - type: Metal-View - projectID: abc - static: - # static should mirror all information provided by the metal or seed/shoot sources - firewall: # optional - controllerURL: https://... - cwnp: - egress: [] - ingress: [] - -# all sub-controllers running on the firewall -# each can be configured independently -controllers: - # this is the base controller - firewall: - source: seed # or: metal, static - - # these are optional: when not provided, they are disabled - selfUpdate: - enabled: true - droptailer: - enabled: true - - # these are optional: when not provided, they are disabled - service: - source: shoot # or: metal, static - cwnp: - source: shoot # or: metal, static - monitor: - source: shoot # currently only shoot is supported -``` - -The existing behavior of the firewall-controller writing into `/etc/nftables/firewall-controller.v4` is not changed. The different controller configuration sources are internally treated in the same way as before. The `static` source can be used to prevent the firewall-controller from crashing and consistently providing a static ruleset. This might be interesting for metal-stack native use cases or environments where the metal-api cannot be accessed. - -There must be one central nftables-rule-file-controller that is notified and triggered by all other controllers that contribute to the nftables configuration. 
- -For example, in order to maintain the existing Gardener integration, the configuration file for the firewall-controller will look like this: - -```yaml -name: shoot--abc--cluster-firewall-def -sources: - seed: - kubeconfig: /etc/firewall-controller/seed.yaml - namespace: shoot--abc--cluster - shoot: - kubeconfig: /etc/firewall-controller/shoot.yaml - namespace: firewall - -controllers: - firewall: - source: seed - - selfUpdate: - enabled: true - droptailer: - enabled: true - - service: - source: shoot - cwnp: - source: shoot - monitor: - source: shoot -``` - -Plain metal-stack users might use a configuration like this: - -```yaml -name: best-firewall-ever - -sources: - metal: - url: https://metal-api - hmac: some-hmac - type: Metal-View - projectID: abc - -controllers: - firewall: - source: metal - selfUpdate: - enabled: true - droptailer: - enabled: true - - cwnp: - # firewall rules stored in firewall entity - # potential improvement would be to attach the rules to the node network entity - # be aware that the firewall and private networks are immutable - # eventually we introduce a firewall ruleset entity - source: metal -``` - -In highly restricted environments that cannot access metal-api the static source could be used: - -```yaml -name: most-restricted-firewall-ever - -sources: - static: - firewall: - controllerURL: https://... - cwnp: - egress: [] - ingress: [] - -controllers: - firewall: - source: static - - cwnp: - source: static -``` - -### Non-Goals - -- Resolving the missing differentiation between users and administrators by letting users pass userdata and SSH keys to the firewall creation. - - This is even more related to [MEP-4](../MEP4/README.md) than this MEP. 
- -### Advantages - -- Offers a native metal-stack solution that improves managing firewalls for users by adding dynamic reconfiguration through the metal-api - - e.g., in the mini-lab, users can now allocate a machine, then an IP address and announce this IP from the machine without having to re-create the firewall but by adding a firewall rule to the metal-api. -- Improve consistency throughout the API (firewall rules would reflect what is persisted in metal-api). -- Other providers like Cluster API can leverage this approach, too. -- It can contribute to solving the shoot migration issue (in Cluster API case the `clusterctl move` for firewall objects) - - For Gardener takes the seed out of the equation (of which the kubeconfig changes during shoot migration) - - However: Things like egress rules, rate limiting, etc. are currently not part of the firewall or network entity in the metal-api. These would need to be added to one of them. -- Potentially resolve the issue that end-users can manipulate accounting data of the firewall through the `FirewallMonitor` - - for this we would need to be able to report traffic data to metal-api - -### Caveats - -- Metal-View access is too broad for firewalls. Mitigated by [MEP-4](../MEP4/README.md). -- Polling of the firewall-controller is bad for performance. Mitigated by [MEP-4](../MEP4/README.md). - -### Firewall Controller Manager - -Currently the firewall-controller-manager expects the creators of a `FirewallDeployment` to use the defaulting webhook that is tailored to the Gardener integration in order to generate `Firewall.spec.userdata` or to override it manually. Currently `Firewall.spec.userdata` will never be set explicitly. - -Instead we'd like to propose `Firewall.spec.userdataContents` which will replace the old `userdata`-string by a typed data structure. The FCM will do the heavy lifting while the `FirewallDeployment` creator decides what should be configured. 
- -```yaml -kind: FirewallDeployment -spec: - template: - spec: - userdataContents: - - path: /etc/firewall-controller/config.yaml - content: | - --- - sources: - static: {} - controllers: - firewall: - source: static - - path: /etc/firewall-controller/seed.yaml - secretRef: - name: seed-kubeconfig - generateFirewallControllerKubeconfig: true - - path: /etc/firewall-controller/shoot.yaml - secretRef: - name: shoot-kubeconfig -``` - -### Gardener Extension Provider Metal Stack - -The GEPM should be migrated to the new `Firewall.spec.userdataContents` field. - -### Cluster API Provider Metal Stack - -![architectural overview](firewall-for-capms-overview.svg) - -In Cluster API there are essentially two main clusters: the management cluster and the workload cluster while the CAPMS takes in the role of the GEPM. -Typically a local bootstrap cluster is created in KinD which acts as the management cluster. It creates the workload cluster. Thereafter the ownership of the workload cluster is typically moved (using `clusterctl move`) to a different cluster which will then become the management cluster. -The new management cluster might actually be the workload cluster itself. - -In contrast to Gardener, Cluster API aims to be less opinionated and minimal. It is common practice to not install any non-required components or CRDs into the workload cluster by default. Therefore we cannot expect custom resources like `ClusterwideNetworkPolicy` or `FirewallMonitor` to be installed in the workload cluster but strongly recommend our users to do it. Therefore it's the responsibility of the operator to tell [cluster-api-provider-metal-stack](https://github.com/metal-stack/cluster-api-provider-metal-stack) the kubeconfig for the cluster where these CRDs are installed and defined in. 
- -A viable configuration for a `MetalStackCluster` that generates firewall rules based of `Service` type `LoadBalancer` and `ClusterwideNetworkPolicy` and expects them to be deployed in the workload cluster is shown below. The `FirewallMonitor` will be reported into the same cluster. - -```yaml -kind: MetalStackCluster -metadata: - name: ${CLUSTER_NAME} -spec: - firewallTemplate: - userdataContents: - - path: /etc/firewall-controller/config.yaml - secretName: ${CLUSTER_NAME}-firewall-controller-config - - - path: /etc/firewall-controller/workload.yaml - # this is the kubeconfig generated by kubeadm - secretName: ${CLUSTER_NAME}-kubeconfig ---- -kind: Secret -metadata: - name: ${CLUSTER_NAME}-firewall-controller-config -stringData: - controllerConfig: | - --- - name: ${CLUSTER_NAME}-firewall - - sources: - metal: - url: ${METAL_API_URL} - hmac: ${METAL_API_HMAC} - type: ${METAL_API_HMAC_TYPE} - projectID: ${METAL_API_PROJECT_ID} - shoot: - kubeconfig: /etc/firewall-controller/workload.yaml - namespace: firewall - - controllers: - firewall: - source: metal - selfUpdate: - enabled: true - droptailer: - enabled: true - - service: - source: shoot - cwnp: - source: shoot - monitor: - source: shoot -``` - -Here the firewall-controller-config will be referenced by the `MetalStackCluster` as a `Secret`. Please note that the `Secret`s in `userdataContents` will not be fetched and will directly be passed to the `FirewallDeployment`. At first the reconciliation of it in the FCM will fail due to the missing Kubeconfig secret. After the `MetalStackCluster` has been marked as ready, CAPI will create this missing secret. Effectively the firewall and initial control plane node should be created at the same time. - -This approach allows maximum flexibility as intended by Cluster API and is still able to provide robust rolling updates of firewalls. - -An advanced use case of this flexibility would be a management cluster, that is in charge of multiple workload clusters. 
Where one workload cluster acts as a monitoring or tooling cluster, receives logs and the firewall monitor for the other workload clusters. The CWNPs could be defined here, all in a separate namespace. - -#### Cluster API Caveats - -When the cluster is pivoted and reconciles its own firewall, a malfunctioning firewall prevents the cluster from self-healing and requires manual intervention by creating a new firewall. This is an inherent problem of the cluster-api approach. It can be circumvented by using an extra cluster to manage workload clusters. - -In the current form of this approach firewalls and therefore the firewall egress and ingress rules are managed by the cluster operators that manage the cluster-api resources. -Hence it will not be possible to gain a fine-grained control over every cluster operator's choices from a central ruleset at the level of metal-stack firewalls. -In case this control surfaces as a requirement, it would need to be implemented in a firewall external to metal-stack. - -## Roadmap - -In general this proposal is not thought to be implemented in one batch. Instead an incremental approach is required. - -1. Enhance firewall-controller - - - Reduce coupling between controllers - - Introduce controller config - - Abstract module to write into distinct nftable rules for every controller - - Implement `sources.static`, but not `sources.metal` - - GEPM should set `FirewallDeployment.spec.template.spec.userdataContents` - -2. Allow Cluster API to use the FCM with static ruleset - - - Add `firewall.metal-stack.io/paused` annotation (managed by CAPMS during `clusterctl move`, theoretically useful for Gardener shoot migration as well to avoid shallow deletion). - - Reconcile multiple `FirewallDeployment` resources across multiple namespaces. For Gardener the old behavior of reconciling only one namespace should persist. 
- - Allow setting the `firewall.metal-stack.io/no-controller-connection` annotation through the `FirewallDeployment` (either through the template or inheritance). - - Add `MetalStackCluster.spec.firewallTemplate`. - - Make `MetalStackCluster.spec.nodeNetworkID` optional if `spec.firewallTemplate` given. - -3. Add `sources.metal` as configuration option. - - - Allow updates of firewall rules in the metal-apiserver. - - Depends on [MEP-4](../MEP4/README.md) metal-apiserver progress - -4. Potentially migrate the GEPM to use `sources.metal` diff --git a/versioned_docs/version-v0.21.11/contributing/01-Proposals/MEP16/firewall-for-capms-overview.drawio b/versioned_docs/version-v0.21.11/contributing/01-Proposals/MEP16/firewall-for-capms-overview.drawio deleted file mode 100644 index faea3e3d..00000000 --- a/versioned_docs/version-v0.21.11/contributing/01-Proposals/MEP16/firewall-for-capms-overview.drawio +++ /dev/null @@ -1,4 +0,0 @@ - - - -
handles traffic
Firewall
Firewall Controller
node-exporter
nftables-exporter
droptailer-client
Workload Cluster
droptailer
Configures
Bootstrap or Management Cluster
reconcile
configures
reconcile
Cluster API Provider metal-stack
Metal Stack Cluster CRD
Firewall Deployment CRD
Firewall CRD
Firewall Set CRD
rec
reconcile
reconcile
Firewall Controller Manager
Metal Stack Machine CRD
manages
Admin
Kubeconfig FirewallMonitor
FirewallMonitor CRD
main metal-api
Firewall entity
kubeconfig CWNP
Clusterwide Network Policy CRD
base config
controllerConfig
user-defined
network rules
reports firewall
state
send firewall log lines
controllerConfig
controllerConfig
\ No newline at end of file diff --git a/versioned_docs/version-v0.21.11/contributing/01-Proposals/MEP16/firewall-for-capms-overview.svg b/versioned_docs/version-v0.21.11/contributing/01-Proposals/MEP16/firewall-for-capms-overview.svg deleted file mode 100644 index 853f8175..00000000 --- a/versioned_docs/version-v0.21.11/contributing/01-Proposals/MEP16/firewall-for-capms-overview.svg +++ /dev/null @@ -1 +0,0 @@ -
handles traffic
handles traffic
Firewall
Firewall
Firewall Controller
Firewall Controller
node-exporter
node-exporter
nftables-exporter
nftables-exporter
droptailer-client
droptailer-client
Workload Cluster
Workload Cluster
droptailer
droptailer
Configures
Configures
Bootstrap or Management Cluster
Bootstrap or Management Cluster
reconcile
reconcile
configures
configures
reconcile
reconcile
Cluster API Provider metal-stack
Cluster API Provider...
Metal Stack Cluster CRD
Metal Stack Cluster...
Firewall Deployment CRD
Firewall Deployment...
Firewall CRD
Firewall CRD
Firewall Set CRD
Firewall Set CRD
rec
rec
reconcile
reconcile
reconcile
reconcile
Firewall Controller Manager
Firewall Controller...
Metal Stack Machine CRD
Metal Stack Machine...
manages
manages
Admin
Admin
Kubeconfig FirewallMonitor
Kubeconfig FirewallMonitor
FirewallMonitor CRD
FirewallMonitor CRD
main metal-api
main metal-api
Firewall entity
Firewall entity
kubeconfig CWNP
kubeconfig CWNP
Clusterwide Network PolicyCRD
Clusterwide Network...
base config
base config
controllerConfig
controllerConfig
user-defined
network rules
user-defined...
reports firewall
state
reports firewall...
send firewall log lines
send firewall log lines
controllerConfig
controllerConfig
controllerConfig
controllerConfig
Text is not SVG - cannot display
\ No newline at end of file diff --git a/versioned_docs/version-v0.21.11/contributing/01-Proposals/MEP17/README.md b/versioned_docs/version-v0.21.11/contributing/01-Proposals/MEP17/README.md deleted file mode 100644 index 35f48970..00000000 --- a/versioned_docs/version-v0.21.11/contributing/01-Proposals/MEP17/README.md +++ /dev/null @@ -1,61 +0,0 @@ ---- -slug: /MEP-17-global-network-view -title: MEP-17 -sidebar_position: 17 ---- - -# Global Network View - -> [!IMPORTANT] -> This MEP assumes the implementation of the metal-apiserver as described by [MEP-4](../MEP4/README.md) which is currently work in progress. - -Having a complete view of the network topology is useful when working with deployments or troubleshooting connectivity issues. -Currently, the API doesn't know of any other switches than the leaf switches. -Information about all other switches and their connections must be gathered from Ansible inventories or by accessing the switches via SSH. -Documentation of each partition's network must be kept in-sync with all changes made to the deployment or cabling. -We would like to expand the API's knowledge of the network to the entire underlay including inter-switch connections as well as BGP statistics and health status. - -## Switch Types - -Registering a switch at the API is done by the metal-core. -Apart from that, it also reconciles port and FRR configuration to adapt to the machine provisioning cycle. -This reconfiguration is only necessary on the leaf switches. -To allow deploying the metal-core on other switches than leaves we need a way of telling it what type of switch it is running on so it can act accordingly. -On any non-leaf switches it will only register the switch and report statistic but not change any configuration. -Supported switch types are - -- `leaf` -- `spine` -- `exit` -- `mgmtleaf` -- `mgmtspine` - -## Network Topology - -All switches should periodically report their LLDP neighbors and port configuration. 
-This information can be used to quickly identify common network issues, like MTU mismatch or the like. -Ideally, there would be some graphical representation of the network topology containing only the most important information for a quick overview. -It should contain all switches and machines as nodes and all connections as edges of a graph. -Ports, VRFs, and maybe also IPs should be associated with a connection. - -Apart from the topology graph, there should be a way to display more detailed information about both ports of a connection, like - -- MTU -- speed -- IP -- UP/DOWN status -- VRF -- VLAN -- whether it participates in a BGP session - -## BGP Announcements - -The metal-core should collect all routes it knows about and send them to the API along with a timestamp. -Reported routes should be stored to a redis database along with the switch that reported them and the timestamp of the last time they were reported. -An expiration threshold should be defined and all expired routes should be cleaned up periodically. -Whenever new routes are reported they get merged into the existing ones by the strategy: - -- when new, just add -- when existing, update `last_announced` timestamp - -By querying the BGP announcements we can find out whether an allocated IP is still in use. diff --git a/versioned_docs/version-v0.21.11/contributing/01-Proposals/MEP18/README.md b/versioned_docs/version-v0.21.11/contributing/01-Proposals/MEP18/README.md deleted file mode 100644 index 9c02c0b7..00000000 --- a/versioned_docs/version-v0.21.11/contributing/01-Proposals/MEP18/README.md +++ /dev/null @@ -1,147 +0,0 @@ ---- -slug: /MEP-18-autonomous-control-plane -title: MEP-18 -sidebar_position: 18 ---- - -# Autonomous Control Plane - -As described in the [deployment chapter](../../../docs/04-For%20Operators/03-deployment-guide.mdx), we strongly recommend Kubernetes as the target platform for running the metal-stack control plane. 
- -Kubernetes clusters for this purpose are readily available from hyperscalers, metalstack.cloud, or other cloud providers. Simply using a managed Kubernetes cluster greatly simplifies a metal-stack installation. However, sometimes it might be desirable to host the metal-stack control plane autonomously, without the help of another cloud provider. Reasons for this might include corporate policies that prohibit the use of external data center products, or network constraints. - -The Kubernetes cluster hosting the metal-stack control plane must provide at least the following features: - -- Load balancing (for exposing the APIs) -- Persistent storage (for the databases and key-value stores) -- Access to object storage for automated backups of the stateful sets -- Access to a DNS provider supported by one of the used DNS extensions -- Externally accessible DNS records for obtaining officially signed certificates through DNS challenges - -This metal-stack control plane cluster must also be highly available to prevent a complete loss of control over the managed resources in the data center. -Regular Kubernetes updates to apply security fixes and feature updates must be possible in an automated manner. The Day-2 operational overhead of running this cluster in your own datacenter must be reasonable. - -In this chapter, we propose a solution for setting up a metal-stack environment with an autonomous control plane that is independent of another cloud provider. - -## Use Your Own Dogfood - -The most obvious solution is to just deploy a Kubernetes cluster manually in your own data center by utilizing existing tooling for the deployment: - -- k3s -- kubeadm -- vmware and rancher -- talos -- kubespray -- ... (not a complete list) - -However, all these solutions add another layer of complexity that needs to be maintained and operated by people who also need to learn and understand metal-stack. 
In general, metal-stack in combination with [Gardener](https://gardener.cloud) contains all the necessary tools to provide KaaS, so it makes sense to reuse what is already in place without introducing new dependencies on other products and vendors. - -The only problem here is that Gardener is not yet able to create an initial cluster, which may change with the implementation of [GEP-28](https://github.com/gardener/gardener/blob/master/docs/proposals/28-autonomous-shoot-clusters.md). In the meantime, we suggest using [k3s](https://k3s.io/), which manages the initial metal-stack partition to host the control plane, since the maintenance overhead is acceptable and it is easy to deploy. - -## The Matryoshka Principle - -Instead of directly using the K3s cluster for the production control plane, we propose using it as a minimal control plane cluster which only purpose is to host the production control plane cluster. This layer of indirection brings some reasonable advantages: - -- In the event of an interruption or loss of this minimal control plane cluster, the production control plane remains unaffected, and end users can continue to manage their clusters as normal. -- A dedicated operations team can take care of the Day-2 maintenance of this installation, which can be handy because the tools like k3s are a little different from the rest of the setup (it is likely that more manual maintenance is required than for any other cluster). This would also be true if the initial cluster problem would be solved by the Gardener itself and not using k3s. -- Since the number of shoot clusters to host is static, the resource requirements are minimal and will not change significantly over time. There are no huge resource requirements in terms of cpu, memory and storage. As such, the lack of scalability is not such a big issue. - -So, our proposal is to chain two metal-stack control planes. 
The initial control plane cluster would use k3s and on this cluster we can spin up a cluster for the production control plane with the use of Gardener. - -The following figure shows how the high-level architecture of this setup looks like. A even more simplified illustration of this setup can be looked up in the appendix[^1]. - -![Autonomous Control Plane Architecture](./autonomous-control-plane-images/metal-stack-autonomous-control-plane-full.svg) - -The k3s nodes can either be bare metal machines or virtual machines. When using VMs a single k3s node might be a viable solution, too. These nodes are supposed to be setup manually / partly automated with an operating system like Debian. - -To name the cluster that hosts the initial metal-stack control plane and Gardener we use the term _initial cluster_. The initial cluster creates worker nodes to host the _target cluster_. - -## Initial Cluster - -The initial cluster is kept very small. The physical bare metal machines can be any machines and switches which are supported by metal-stack, but can be smaller in terms of cpu, memory and network speed because these machines must only be capable of running the target cluster for the metal-stack control plane. A typical single socket server with 8-16 cores and 64GB of RAM and two NVMe drives of 1TB would be a good starting point. - -In a typical k3s setup, a stateful set would lose the data once the k3s cluster was terminated and started again. But there is a possibility to define parts of the local storage of the server to be provided to the k3s cluster for the PVCs. With that, k3s could be terminated and started again, for example to update and reboot the host os, or update k3s itself and the data will persist. 
- -Example k3s configuration for persistent storage on the hosts os: - -```yaml -k3s: Cluster -apiVersion: k3s.x-k8s.io/v1alpha4 -name: needle-control-plane -nodes: - - role: control-plane - # add a mount from /path/to/my/files on the host to /files on the node - extraMounts: - - hostPath: /path/to/my/files - containerPath: /files -``` - -Into this cluster metal-stack and Gardener will be deployed. This deployment can be done by a Gitlab runner which is running on this machine. -The mini-lab will be used as a base for this deployment. The current development of [gardener-in-minilab](https://github.com/metal-stack/mini-lab/pull/202) must be extended to host all required extensions to make this a working metal-stack control plane which can manage the machines in the attached bare metal setup. - -In addition to the metal-stack and Gardener deployment, some additional required services are deployed (non-complete list): - -- PowerDNS to serve as a DNS Server for all DNS entries used in the initial and the target cluster, like `api.initial.metal-stack.local`, `gardener-api.initial.metal-stack.local` and the DNS entries for the api servers of the created kubernetes clusters. -- NTP -- Monitoring for the initial cluster and partition -- Optional: OIDC Server for authenticating against the metal-api -- Optional: Container Registry to host all metal-stack and gardener containers -- Optional: Let's Encrypt [boulder](https://github.com/letsencrypt/boulder) as a certificate authority -- ... - -Physical view, minimal setup for a initial cluster with a single physical node: - -![Small Initial Cluster](autonomous-control-plane-images/small-initial-cluster.svg) - -Physical View, bigger ha setup which is spread across two data centers: - -![HA Initial Cluster](autonomous-control-plane-images/ha-initial-cluster.svg) - -### Control Plane High Availability - -Running the initial control plane on a single physical server is not as available as it should be in such a use case. 
It should be possible to survive a loss of this server, because the server could be lost by many events, such as hardware failure, disk corruption or even failure of the datacenter location where this server is deployed. - -Setting up a second server with the same software components is an option, but the problem of data redundancy must be solved, because neither the gardener control plane, nor the metal-stack control plane can be instantiated twice. - -Given that we provide part of the local storage of the server as backing storage for the stateful sets in the k3s cluster, the data stored on the server itself must be replicated to another server and backed up on a regular basis. - -The replication of ETCD can be achieved through [clustered configuration](https://docs.k3s.io/datastore/ha-embedded) of k3s. Components of metal-stack and Gardener can run standalone and already utilize backup-restore mechanism that must be configured accordingly. For two or more bare metal machine used for the initial cluster, a loadbalancing mechanism for the ingress is required. kube-vip could be a possible solution. - -For monitoring a backend like a Victoria Metrics Cluster would allow spearding the monitoring data across the initial cluster nodes. These metrics should also be backed up in object storage. - -### Partition - -The partition which is managed by the initial cluster can be a simple and small hardware setup but yet capable enough to host the target cluster. It would even be a good practice to create separate target clusters on the initial cluster, e.g. one for the metal-stack control plane and one for the Gardener (maybe one more for monitoring). - -It can follow the metal-stack minimal setup which provides about 8-16 small servers connected to a 1G/s or 10G/s network dataplane. Central storage is optional as the persistence of the services running in these clusters is always backed up to a central object storage. 
Operations would be much easier if a central storage is provided. - -## Target Cluster - -The target cluster is the metal-stack environment which serves for end-user production use, the control plane is running in a shoot hosted in the initial cluster. The seed(s) and shoot(s) for end-users are created on the machines provided by the target cluster. -These machines can be of a different type in terms of size, but more importantly, these machines are connected to another network dataplane. Also the management infrastructure is separated from the initial cluster management network. - -## Failure Scenarios - -Everything could fail, everything will fail at some point. But this must kept in mind and nothing bad should happen if only one component at a time fails. -If more than one fails, the restoration to a working state must be easily possible and well documented. - -To ensure all possible breakages are documented, we suggest writing a list which summarizes all failure scenarios that might occur including the remediation. - -Here is an example of how a scenario documentation could look like: - -**Scenario**: Initial cluster is gone, all machines have died -**Impact**: Management of the initial cluster infrastructure not possible anymore, the target cluster continues to run but cannot be managed because the API servers are gone. end-users are not affected by this incident. -**Remediation**: The initial cluster nodes must be provisioned from scratch and re-deployed through the CI mechanism. The backups of the stateful sets are automatically restored during this process. 
- -## Implementation - -As part of this proposal, we provide the following tools and integrations in order to setup an autonomous control plane: - -- Deployment roles for the services like PowerDNS and NTP for the initial cluster -- Stretch goal: Deployment role to setup k3s in clustered configuration for the initial cluster and update it -- Extend the Gardener on mini-lab integration to allow shoot creation in the mini-lab -- Steady integration of the setup (maybe something like [k3d](https://github.com/k3d-io/k3d) in the mini-lab) - -## Appendix - -[^1]: ![metal-stack-chain](autonomous-control-plane-images/metal-stack-chain.svg) diff --git a/versioned_docs/version-v0.21.11/contributing/01-Proposals/MEP18/autonomous-control-plane-images/ha-initial-cluster.drawio b/versioned_docs/version-v0.21.11/contributing/01-Proposals/MEP18/autonomous-control-plane-images/ha-initial-cluster.drawio deleted file mode 100644 index eafcb514..00000000 --- a/versioned_docs/version-v0.21.11/contributing/01-Proposals/MEP18/autonomous-control-plane-images/ha-initial-cluster.drawio +++ /dev/null @@ -1,535 +0,0 @@ - - - - - - - - - - - - - - - - - - - 1 - - - - - 2 - - - - - 3 - - - - - 4 - - - - - 5 - - - - - 6 - - - - - 7 - - - - - 8 - - - - - 9 - - - - - 10 - - - - - 11 - - - - - 12 - - - - - - - - - - - - - -
-
-
- internet-router-management -
-
-
-
- - internet-router-management - -
-
-
- - - - - - - - - -
-
-
- management-switch-and-server -
-
-
-
- - management-switch-and-server - -
-
-
- - - - - - - - - -
-
-
- spine01 -
-
-
-
- - spine01 - -
-
-
- - - - - - - - - -
-
-
- leaf01 -
-
-
-
- - leaf01 - -
-
-
- - - - - - - - - -
-
-
- leaf02 -
-
-
-
- - leaf02 - -
-
-
- - - - - - - - - - - - - -
-
-
- - mirocloud (initial cluster partition nodes) - -
-
-
-
- - mirocloud (initial cluster... - -
-
-
- - - - - - - - - - - - - -
-
-
- - Initial cluster node 01 - -
-
-
-
- - Initial cluster node 01 - -
-
-
- - - - - - - - - - - - - 1 - - - - - 2 - - - - - 3 - - - - - 4 - - - - - 5 - - - - - 6 - - - - - 7 - - - - - 8 - - - - - 9 - - - - - 10 - - - - - 11 - - - - - 12 - - - - - - - - - - - - - -
-
-
- internet-router-management -
-
-
-
- - internet-router-management - -
-
-
- - - - - - - - - -
-
-
- management-switch-and-server -
-
-
-
- - management-switch-and-server - -
-
-
- - - - - - - - - -
-
-
- spine02 -
-
-
-
- - spine02 - -
-
-
- - - - - - - - - -
-
-
- leaf03 -
-
-
-
- - leaf03 - -
-
-
- - - - - - - - - -
-
-
- leaf04 -
-
-
-
- - leaf04 - -
-
-
- - - - - - - - - - - - - -
-
-
- - Initial cluster node 02 - -
-
-
-
- - Initial cluster node 02 - -
-
-
- - - - - - - - - - - - - -
-
-
- - Initial cluster node 03 - -
-
-
-
- - Initial cluster node 03 - -
-
-
- - - - - - - - - - - - - -
-
-
- - mirocloud (initial cluster partition nodes) - -
-
-
-
- - mirocloud (initial cluster... - -
-
-
-
- - - - - Text is not SVG - cannot display - - - -
diff --git a/versioned_docs/version-v0.21.11/contributing/01-Proposals/MEP18/autonomous-control-plane-images/ha-initial-cluster.svg b/versioned_docs/version-v0.21.11/contributing/01-Proposals/MEP18/autonomous-control-plane-images/ha-initial-cluster.svg deleted file mode 100644 index 99261ada..00000000 --- a/versioned_docs/version-v0.21.11/contributing/01-Proposals/MEP18/autonomous-control-plane-images/ha-initial-cluster.svg +++ /dev/null @@ -1 +0,0 @@ -123456789101112
internet-router-management
internet-router-management
management-switch-and-server
management-switch-and-server
spine01
spine01
leaf01
leaf01
leaf02
leaf02
mirocloud (initial cluster partition nodes)
mirocloud (initial cluster...
Initial cluster node 01
Initial cluster node 01
123456789101112
internet-router-management
internet-router-management
management-switch-and-server
management-switch-and-server
spine02
spine02
leaf03
leaf03
leaf04
leaf04
Initial cluster node 02
Initial cluster node 02
Initial cluster node 03
Initial cluster node 03
mirocloud (initial cluster partition nodes)
mirocloud (initial cluster...
Text is not SVG - cannot display
\ No newline at end of file diff --git a/versioned_docs/version-v0.21.11/contributing/01-Proposals/MEP18/autonomous-control-plane-images/metal-stack-autonomous-control-plane-full.drawio b/versioned_docs/version-v0.21.11/contributing/01-Proposals/MEP18/autonomous-control-plane-images/metal-stack-autonomous-control-plane-full.drawio deleted file mode 100644 index aae8a12d..00000000 --- a/versioned_docs/version-v0.21.11/contributing/01-Proposals/MEP18/autonomous-control-plane-images/metal-stack-autonomous-control-plane-full.drawio +++ /dev/null @@ -1,1133 +0,0 @@ - - - - - - - - - - - - - - - - - - - -
-
-
- Initial Cluster -
-
-
-
- - Initial Cluster - -
-
-
- - - - - - - - - - - - - - -
-
-
- metal-roles -
-
-
-
- - metal-roles - -
-
-
- - - - - - - -
-
-
- CI -
-
-
-
- - CI - -
-
-
- - - - - - - -
-
-
- K3s Standalone - - - (on Debian) - - -
-
-
-
- - K3s Standalone (on Debian) - -
-
-
- - - - - - - - - - - - - - - - - -
-
-
- Initial Partition -
-
-
-
- - Initial Partition - -
-
-
- - - - - - - - - - - - - -
-
-
- Target Cluster for metal-stack -
-
-
-
- - Target Cluster for metal-stack - -
-
-
- - - - - - - - - - - - - -
-
-
- Metal Control Plane -
-
-
-
- - Metal Control Plane - -
-
-
- - - - - - - - - - - -
-
-
- provisions -
-
-
-
- - provisions - -
-
-
- - - - - - - - - - - - - -
-
-
- Target Cluster for Gardener -
-
-
-
- - Target Cluster for Gardener - -
-
-
- - - - - - - - - - -
-
-
- Gardener Control Plane -
-
-
-
- - Gardener Control Plane - -
-
-
- - - - - - - - - - - - - - - - - -
-
-
- Monitoring -
-
-
-
- - Monitoring - -
-
-
- - - - - - - - - - - - - - - - -
-
-
- Target Partition -
-
-
-
- - Target Partition - -
-
-
- - - - - - - - - - -
-
-
- Gardener Seeds and End-User Shoots -
-
-
-
- - Gardener Seeds and End-User Shoots - -
-
-
- - - - - - - - - - - -
-
-
- provisions -
-
-
-
- - provisions - -
-
-
- - - - - - - - -
-
-
- metal-roles -
-
-
-
- - metal-roles - -
-
-
- - - - - - - -
-
-
- CI -
-
-
-
- - CI - -
-
-
- - - - - - - - -
-
-
- metal-roles -
-
-
-
- - metal-roles - -
-
-
- - - - - - - - - - -
-
-
- ETCD can be clustered or standalone, backed up by sidecar -
-
-
-
- - ETCD can be clustere... - -
-
-
- - - - - - - - - - -
-
-
- This data will get lost in case local PV gets deleted -
-
-
-
- - This data will get l... - -
-
-
- - - - - - - - - - -
-
-
- We can work with local PVs here, too. -
- backup-restore-sidecar for metal-stack databases, for big ones Postgres clustered. -
-
-
-
- - We can work with local PVs he... - -
-
-
- - - - - - - -
-
-
- ETCD will be deployed in HA configuration on local PVs. -
-
- csi-driver-lvm needs to implement auto deletion of orphaned PVs. -
-
- Seed metrics get lost, but they report to the monitoring in the Metal Control Plane Shoot. -
-
-
-
- - ETCD will be deployed in HA c... - -
-
-
- - - - - - - - - - -
-
-
- More sophisticated storage solutions can be in place. -
-
- (Lightbits, NetApp, ...) -
-
-
-
- - More sophisticated storage so... - -
-
-
- - - - - - - - - - -
-
-
- TODO: Evaluate how to persist these metrics. -
-
-
-
- - TODO: Evaluate how to persist... - -
-
-
- - - - - - - - - - -
-
-
- - 1 VM or -
-
-
- - - 3 Bare Metal Machines - - -
-
-
-
-
- - 1 VM or... - -
-
-
- - - - - - - - - - - - - - -
-
-
- metal-stack -
-
-
-
- - metal-stack - -
-
-
- - - - - - - -
-
-
- metal-api -
-
-
-
- - metal-api - -
-
-
- - - - - - - -
-
-
- metal-db -
-
-
-
- - metal-db - -
-
-
- - - - - - - -
-
-
- ipam-db -
-
-
-
- - ipam-db - -
-
-
- - - - - - - -
-
-
- masterdata-db -
-
-
-
- - masterdata-db - -
-
-
- - - - - - - -
-
-
- headscale-db -
-
-
-
- - headscale-db - -
-
-
- - - - - - - -
-
-
- auditing-db -
-
-
-
- - auditing-db - -
-
-
- - - - - - - -
-
-
- nsqd -
-
-
-
- - nsqd - -
-
-
- - - - - - - - - - - -
-
-
- Gardener -
-
-
-
- - Gardener - -
-
-
- - - - - - - - - - -
-
-
- Virtual Garden -
-
-
-
- - Virtual Garden - -
-
-
- - - - - - - -
-
-
- Gardener Control Plane -
-
-
-
- - Gardener Control Plane - -
-
-
- - - - - - - -
-
-
- gardenlet -
-
-
-
- - gardenlet - -
-
-
- - - - - - - -
-
-
- Garden etcd -
-
-
-
- - Garden etcd - -
-
-
- - - - - - - -
-
-
- Prometheus -
-
-
-
- - Prometheus - -
-
-
- - - - - - - - - - - -
-
-
- Monitoring -
-
-
-
- - Monitoring - -
-
-
- - - - - - - - - - -
-
-
- - Gitlab - -
- - Runner - -
-
-
-
-
- - Gitlab... - -
-
-
- - - - - - - - - - -
-
-
- Services -
-
-
-
- - Services - -
-
-
- - - - - - - -
-
-
- PowerDNS -
-
-
-
- - PowerDNS - -
-
-
- - - - - - - -
-
-
- boulder -
-
-
-
- - boulder - -
-
-
- - - - - - - -
-
-
- NTP -
-
-
-
- - NTP - -
-
-
- - - - - - - -
-
-
- OIDC -
-
-
-
- - OIDC - -
-
-
- - - - - - - -
-
-
- ... -
-
-
-
- - ... - -
-
-
-
- - - - - Text is not SVG - cannot display - - - -
diff --git a/versioned_docs/version-v0.21.11/contributing/01-Proposals/MEP18/autonomous-control-plane-images/metal-stack-autonomous-control-plane-full.svg b/versioned_docs/version-v0.21.11/contributing/01-Proposals/MEP18/autonomous-control-plane-images/metal-stack-autonomous-control-plane-full.svg deleted file mode 100644 index e58e783b..00000000 --- a/versioned_docs/version-v0.21.11/contributing/01-Proposals/MEP18/autonomous-control-plane-images/metal-stack-autonomous-control-plane-full.svg +++ /dev/null @@ -1 +0,0 @@ -
Initial Cluster
Initial Cluster
metal-roles
metal-roles
CI
CI
K3s Standalone(on Debian)
K3s Standalone (on Debian)
Initial Partition
Initial Partition
Target Cluster for metal-stack
Target Cluster for metal-stack
Metal Control Plane
Metal Control Plane
provisions
provisions
Target Cluster for Gardener
Target Cluster for Gardener
Gardener Control Plane
Gardener Control Plane
Monitoring
Monitoring
Target Partition
Target Partition
Gardener Seeds and End-User Shoots
Gardener Seeds and End-User Shoots
provisions
provisions
metal-roles
metal-roles
CI
CI
metal-roles
metal-roles
ETCD can be clustered or standalone, backed up by sidecar
ETCD can be clustere...
This data will get lost in case local PV gets deleted
This data will get l...
We can work with local PVs here, too.
backup-restore-sidecar for metal-stack databases, for big ones Postgres clustered.
We can work with local PVs he...
ETCD will be deployed in HA configuration on local PVs.

csi-driver-lvm needs to implement auto deletion of orphaned PVs.

Seed metrics get lost, but they report to the monitoring in the Metal Control Plane Shoot.
ETCD will be deployed in HA c...
More sophisticated storage solutions can be in place.

(Lightbits, NetApp, ...)
More sophisticated storage so...
TODO: Evaluate how to persist these metrics.
TODO: Evaluate how to persist...
1 VM or
3 Bare Metal Machines
1 VM or...
metal-stack
metal-stack
metal-api
metal-api
metal-db
metal-db
ipam-db
ipam-db
masterdata-db
masterdata-db
headscale-db
headscale-db
auditing-db
auditing-db
nsqd
nsqd
Gardener
Gardener
Virtual Garden
Virtual Garden
Gardener Control Plane
Gardener Control Plane
gardenlet
gardenlet
Garden etcd
Garden etcd
Prometheus
Prometheus
Monitoring
Monitoring
Gitlab
Runner
Gitlab...
Services
Services
PowerDNS
PowerDNS
boulder
boulder
NTP
NTP
OIDC
OIDC
...
...
Text is not SVG - cannot display
\ No newline at end of file diff --git a/versioned_docs/version-v0.21.11/contributing/01-Proposals/MEP18/autonomous-control-plane-images/metal-stack-chain.drawio b/versioned_docs/version-v0.21.11/contributing/01-Proposals/MEP18/autonomous-control-plane-images/metal-stack-chain.drawio deleted file mode 100644 index cd5cf007..00000000 --- a/versioned_docs/version-v0.21.11/contributing/01-Proposals/MEP18/autonomous-control-plane-images/metal-stack-chain.drawio +++ /dev/null @@ -1,404 +0,0 @@ - - - - - - - - - - -
-
-
- Partition 1 -
-
-
-
- - Partition 1 - -
-
- - - - -
-
-
- seeds -
-
-
-
- - seeds - -
-
- - - - -
-
-
- shoots -
-
-
-
- - shoots - -
-
- - - - - - -
-
-
- Partition 2 -
-
-
-
- - Partition 2 - -
-
- - - - -
-
-
- seeds -
-
-
-
- - seeds - -
-
- - - - -
-
-
- shoots -
-
-
-
- - shoots - -
-
- - - - - - -
-
-
- Partition 3 -
-
-
-
- - Partition 3 - -
-
- - - - -
-
-
- seeds -
-
-
-
- - seeds - -
-
- - - - -
-
-
- shoots -
-
-
-
- - shoots - -
-
- - - - - - -
-
-
- Production Control Plane -
-
-
-
- - Production Control Plane - -
-
- - - - -
-
-
- metal-stack -
- kubernetes cluster -
-
-
-
- - metal-stack... - -
-
- - - - -
-
-
- gardener -
- kubernetes cluster -
-
-
-
- - gardener... - -
-
- - - - -
-
-
- - Manages - -
-
-
-
- - Manages - -
-
- - - - - - - - -
-
-
- Control Plane Partition -
-
-
-
- - Control Plane Partition - -
-
- - - - - -
-
-
- backup of stateful sets -
-
-
-
- - backup of stateful sets - -
-
- - - - - - -
-
-
- bare metal machine -
-
-
-
- - bare metal machine - -
-
- - - - -
-
-
- metal-stack -
- and -
- gardener -
- kubernetes cluster -
- running in kind -
-
-
-
- - metal-stack... - -
-
- - - - -
-
-
- - Manages - -
-
-
-
- - Manages - -
-
- - - - - -
-
-
- S3 -
-
-
-
- - S3 - -
-
- - - - -
-
-
- Needle -
-
-
-
- - Needle - -
-
- - - -
-
-
- - Nail - -
-
-
-
- - Nail - -
-
-
- - - - - Text is not SVG - cannot display - - - -
\ No newline at end of file diff --git a/versioned_docs/version-v0.21.11/contributing/01-Proposals/MEP18/autonomous-control-plane-images/metal-stack-chain.svg b/versioned_docs/version-v0.21.11/contributing/01-Proposals/MEP18/autonomous-control-plane-images/metal-stack-chain.svg deleted file mode 100644 index 8f88ba14..00000000 --- a/versioned_docs/version-v0.21.11/contributing/01-Proposals/MEP18/autonomous-control-plane-images/metal-stack-chain.svg +++ /dev/null @@ -1 +0,0 @@ -
Partition 1
Partition 1
seeds
seeds
shoots
shoots
Partition 2
Partition 2
seeds
seeds
shoots
shoots
Partition 3
Partition 3
seeds
seeds
shoots
shoots
Production Control Plane
Production Control Plane
metal-stack
kubernetes cluster
metal-stack...
gardener
kubernetes cluster
gardener...
Manages
Manages
Control Plane Partition
Control Plane Partition
backup of stateful sets
backup of stateful sets
bare metal machine
bare metal machine
metal-stack
and
gardener
kubernetes cluster
running in kind
metal-stack...
Manages
Manages
S3
S3
Needle
Needle 
Nail
Nail
Text is not SVG - cannot display
\ No newline at end of file diff --git a/versioned_docs/version-v0.21.11/contributing/01-Proposals/MEP18/autonomous-control-plane-images/small-initial-cluster.drawio b/versioned_docs/version-v0.21.11/contributing/01-Proposals/MEP18/autonomous-control-plane-images/small-initial-cluster.drawio deleted file mode 100644 index a75ee340..00000000 --- a/versioned_docs/version-v0.21.11/contributing/01-Proposals/MEP18/autonomous-control-plane-images/small-initial-cluster.drawio +++ /dev/null @@ -1,234 +0,0 @@ - - - - - - - - - - - - - - - - - - - 1 - - - - - 2 - - - - - 3 - - - - - 4 - - - - - 5 - - - - - 6 - - - - - 7 - - - - - 8 - - - - - 9 - - - - - 10 - - - - - 11 - - - - - 12 - - - - - - - - - - - - - -
-
-
- internet-router-management -
-
-
-
- - internet-router-management - -
-
-
- - - - - - - - - -
-
-
- management-switch-and-server -
-
-
-
- - management-switch-and-server - -
-
-
- - - - - - - - - -
-
-
- leaf01 -
-
-
-
- - leaf01 - -
-
-
- - - - - - - - - -
-
-
- leaf02 -
-
-
-
- - leaf02 - -
-
-
- - - - - - - - - - - - - -
-
-
- Initial cluster node -
-
-
-
- - Initial cluster node - -
-
-
- - - - - - - - - - - - - -
-
-
- mirocloud (initial cluster partition nodes) -
-
-
-
- - mirocloud (initial cluster... - -
-
-
-
- - - - - Text is not SVG - cannot display - - - -
diff --git a/versioned_docs/version-v0.21.11/contributing/01-Proposals/MEP18/autonomous-control-plane-images/small-initial-cluster.svg b/versioned_docs/version-v0.21.11/contributing/01-Proposals/MEP18/autonomous-control-plane-images/small-initial-cluster.svg deleted file mode 100644 index a9d29f05..00000000 --- a/versioned_docs/version-v0.21.11/contributing/01-Proposals/MEP18/autonomous-control-plane-images/small-initial-cluster.svg +++ /dev/null @@ -1 +0,0 @@ -123456789101112
internet-router-management
internet-router-management
management-switch-and-server
management-switch-and-server
leaf01
leaf01
leaf02
leaf02
Initial cluster node
Initial cluster node
mirocloud (initial cluster partition nodes)
mirocloud (initial cluster...
Text is not SVG - cannot display
\ No newline at end of file diff --git a/versioned_docs/version-v0.21.11/contributing/01-Proposals/MEP2/README.md b/versioned_docs/version-v0.21.11/contributing/01-Proposals/MEP2/README.md deleted file mode 100644 index c7f2360a..00000000 --- a/versioned_docs/version-v0.21.11/contributing/01-Proposals/MEP2/README.md +++ /dev/null @@ -1,7 +0,0 @@ ---- -slug: /MEP-2-two-factor-authentication -title: MEP-2 -sidebar_position: 2 ---- - -# Two Factor Authentication diff --git a/versioned_docs/version-v0.21.11/contributing/01-Proposals/MEP3/README.md b/versioned_docs/version-v0.21.11/contributing/01-Proposals/MEP3/README.md deleted file mode 100644 index 5ce36721..00000000 --- a/versioned_docs/version-v0.21.11/contributing/01-Proposals/MEP3/README.md +++ /dev/null @@ -1,67 +0,0 @@ ---- -slug: /MEP-3-machine-re-installation -title: MEP-3 -sidebar_position: 3 ---- - -# Machine Re-Installation - -In the current metal-api only machine installations are possible, performing a machine upgrade is only possible by creating a new machine and delete the old one. -This has the drawback that in case a lot of data is stored on the local disks, a full restore of the original data must be performed. - -To prevent this, we will introduce a new metal-api endpoint to reinstall the machine with a new image, _without_ actually deleting the data stored on the additional hard disks. - -Storage is a difficult task to get right and reliable. A short analysis of our different storage requirements lead to 3 different scenarios. - -- Storage for the etcd pvs in the seed cluster of every partition. - This is the most important storage in our setup because these etcd pods serve as configuration backend for all customer kubernetes clusters. If they fail, the cluster is down. 
However gardener deploys a backup and restore sidecar into the etcd pod of every customer kubernetes control plane, and if this sidecar detects a corrupt or missing etcd database file(s) it starts automatic restore from the configured backup location. This will take some minutes. If for example a node dies, and gardener creates a new node instead, the csi-lvm created pv is not present on that node. Kubernetes will not schedule the missing etcd pod on this node because it has a local PV configured and is therefore tainted to run only on that node. To let kubernetes create that pod anyhow, someone has to either remove the taint, or delete the pod. If this is done, the pod starts and the restore of the etcd data can start as well. You can see this is a bit too complicated and will take the customer cluster down for a while (not measured yet but in the range of 5-10 minutes). -- Storage in customer clusters. - This was not promised in 2020. We have a intermediate solution with the provisioning of csi-lvm by default into all customer clusters. Albeit this is only local storage and will get deleted if a node dies. -- S3 Storage. - We have two possibilities to cope with storage: - - In place update of the OS with a daemonset - This will be fast and simple, but might fail because the packages being installed are broken right now, or a filesystem gets full, or any other failure you can think of during a os update. Another drawback is that metal-api does not reflect the updated os image. - - metal-api get a machine reinstall endpoint - With this approach we leverage from existing and already proven mechanisms. Reinstall must keep all data except the sata-dom. Gardener currently is not able to do an update with this approach because it can only do `rolling` updates. Therefore a additional `osupdatestrategy` has to be implemented for metal and other providers in gardener to be able to leverage the metal reinstall on the same machineID approach. 
- -If reinstall is implemented, we should focus on the same technology for all scenarios and put ceph via rook.io into the kubernetes clusters as additional StorageClass. It has to be checked whether to use the raw disk or a PV as the underlay block device where ceph stores its data. - -## API and behavior - -The API will get an new endpoint "reinstall" this endpoint takes two arguments: - -- machineID -- image - -No other aspects of the machine can be modified during the re-installation. All data stored in the existing allocation will be preserved, only the image will be modified. -Once this endpoint was called, the machine will get a `reboot` signal with the boot order set to PXE instead of HDD and the network interfaces on the leaf are set to PXE as well. Then the normal installation process starts: - -- unchanged: PXE boot with metal-hammer -- changed: metal-hammer first checks with the machineID in the metal-api (through metal-core) if there is already a allocation present -- changed: if a allocation is present and the allocation has set `reinstall: true`, wipe disk is only executed for the root disk, all other disks are untouched. -- unchanged: the specified image is downloaded and burned, `/install.sh` is executed -- unchanged: successful installation is reported back, network is set the the vrf, boot order is set to HDD. -- unchanged: distribution kernel is booted via kexec - -We can see that the `allocation` requires one additional parameter: `reinstall` and metal-hammer must check for already existing allocation at an earlier stage. - -Components which requires modifications (first guess): - -- metal-hammer: - - check for allocation present earlier - - evaluation of `reinstall` flag set - - wipe of disks depends on that flag - - Bonus: move configuration of disk layout and primary disk detection algorithm (PDDA) from metal-hammer into metal-api. - metal-api **MUST** reject reinstallation if the disk found by PDDA does not have the `/etc/metal` directory! 
-- metal-core: - - probably nothing -- metal-api: - - new endpoint `/machine/reinstall` - - add `Reinstall bool` to data model of `allocation` - - make sure to reset `Reinstall` after reinstallation to prevent endless reinstallation loop -- metalctl: - - implement `reinstall` -- metal-go: - - implement `reinstall` -- gardener (longterm): - - add the `OSUpgradeStrategy` `reinstall` diff --git a/versioned_docs/version-v0.21.11/contributing/01-Proposals/MEP4/README.md b/versioned_docs/version-v0.21.11/contributing/01-Proposals/MEP4/README.md deleted file mode 100644 index 389a02d4..00000000 --- a/versioned_docs/version-v0.21.11/contributing/01-Proposals/MEP4/README.md +++ /dev/null @@ -1,211 +0,0 @@ ---- -slug: /MEP-4-multi-tenancy-for-the-metal-api -title: MEP-4 -sidebar_position: 4 ---- - -# Multi-Tenancy for the metal-api -:::info -This document is work in progress. -::: - -In the past we decided to treat the metal-api as a "low-level API", i.e. the API does not specifically deal with projects and tenants. A user with editor access can for example assign machines to every project he desires, he can see all the machines available and can control them. We tried to keep the metal-api code base as small as possible and we added resource scoping to a "higher-level APIs". From there, a user would be able to only see his own clusters and IP addresses. - -As time passed metal-stack has become an open-source project and people are willing to adopt. Adopters who want to put their own technologies on top of the metal-stack infrastructure don't have those "higher-level APIs" that we implemented closed-source for our user base. So, external adopters most likely need to implement resource scoping on their own. 
- -Introducing multi-tenancy to the metal-api is a serious chance of making our product better and more successful as it opens the door for: - -- Becoming a "fully-featured" API -- Narrowing down attack surfaces and possibility of unintended resource modification produced by bugs or human errors -- Discouraging people to implement their own scoping layers in front of the metal-stack -- Gaining performance through resource scopes -- Letting untrusted / third-parties work with the API - -## Requirements - -These are some general requirements / higher objectives that MEP-4 has to fulfill. - -- Should be able to run with mini-lab without requiring to setup complex auth backends (dex, LDAP, keycloak, ...) - - Simple to start with, more complex options for production setups -- Fine-grained access permissions (every endpoint maps to a permission) -- Tenant scoping (disallow resource access to resources of other tenants) -- Project scoping (disallow resource access to resources of other projects) -- Access tokens in self-service for technical user access - -## Implementation - -We gathered a lot of knowledge while implementing a multi-tenancy-capable backend for metalstack.cloud. The goal is now to use the same technology and adopt that to the metal-api, this includes: - -- gRPC in combination with connectrpc -- OPA for making auth decisions -- REST HTTP only for OIDC login flows - -### API Definitions - -The API definitions should be located on a separate Github repository separate from the server implementation. The proposed repository location is: https://github.com/metal-stack/api. - -This repository contains the `proto3` specification of the exposed metal-stack api. This includes the messages, simple validations, services and the access permission to these services. The input parameters for the authorization in the backend are generated from the `proto3` annotations. - -Client implementations for the most relevant languages (go, python) are generated automatically. 
- -This api is divided into end-user and admin access at the top level. The proposed APIs are: - -- `metalstack.api.v2`: For end-user facing services -- `metalstack.admin.v2`: For operators and controllers which need access to unscoped entities - -The methods of the API can have different role scopes (and can be narrowed down further with fine-grained method permissions): - -- `tenant`: Tenant-scoped methods, e.g. project creation (tenant needs to be provided in the request payload) - - Available roles: VIEWER, EDITOR, OWNER -- `project`: Project-scoped methods, e.g. machine creation (tenant needs to be provided in the request payload) - - Available roles: VIEWER, EDITOR, OWNER -- `admin` Admin-scoped methods, e.g. unscoped tenant list or switch register - - Available roles: VIEWER, EDITOR - -And has methods with different visibility scopes: - -- `self`: Methods that only the logged in user can access, e.g. show permissions with the presented token -- `public`: Methods that do not require any specific authorization - -### API - -The API server implements the services defined in the API and validates access to a method using OPA with the JWT tokens passed in the requests. The server is implemented using the connectrpc.com framework. - -The API server implements the login flow through OIDC. After successful authentication, the API server derives user permissions from the OIDC provider and issues a new JWT token which is passed on to the user. The tokens including the permissions are stored in a redis compatible backend. - -With these tokens, users can create Access Tokens for CI/CD or other use cases. - -JWT Tokens can be revoked by admins and the user itself. - -### API Server - -Is put into a new github repo which implements the services defined in the `api` repository. It opens a `https` endpoints where the grpc (via connectrpc.com) and oidc services are exposed. 
- -### Migration of the Consumers - -To allow consumers to migrate to the `v2` API gradually, both apis, the new and the old, are deployed in parallel. In the control-plane both apis are deployed side-by-side behind the ingress. `api.example.com` is forwarded to `metal-api` and `metal.example.com` is forwarded to the new `metal-apiserver`. - -The api-server will talk to the existing metal-api during the process of migration services away to the new grpc api. - -The migration process can be done in the following manner: - -for each resource in the metal-api: - -- create a new proto3 based definition in the `api` repo. -- implement the business logic per service in the new `metal-apiserver` without calling the metal-api. -- clients must be able to talk to `v1` and `v2` backend in parallel -- Deprecate the already migrated service in the swagger route to notify the client that this route should not be used anymore. -- identify all consumers of this resource and replace them to use the grpc instead of the rest api -- move the business logic incl. the backend calls to ipam, metal-db, masterdata-api, nsq for this resource from the metal-api to the `metal-apiserver` - -We will migrate the rethinkdb backend implementation to a generic approach during this effort. - -- Try to enhance the generic rethinkdb interface with `project` scoped methods. - -There are a lot of consumers of metal-api, which need to be migrated: - -- ansible -- firewall-controller -- firewall-controller-manager -- gardener-extension-auth -- gardener-extension-provider-metal - - Do not point the secret bindings to a the shared provider secret in the seed anymore. Instead, use individual provider-secret containing project-scoped API access tokens in the Gardener project namespaces. 
-- machine-controller-manager-provider-metal -- metal-ccm -- metal-console -- metal-bmc -- metal-core -- metal-hammer -- metal-image-cache-sync -- metal-images -- metal-metrics-exporter -- metal-networker -- metalctl -- pixie - -## User Scenarios - -This section gathers a collection of workflows from the perspective of a user that we want to provide with the implementation of this proposal. - -### Machine Creation - -A regular user wants to create a machine resource. - -Requirements: Project was created, permissions are present - -- The user can see networks that were provided by the admin. - - ``` - $ metalctl network ls - ID NAME PROJECT PARTITION NAT SHARED PREFIXES IPS - internet Internet Network true false 212.34.83.0/27  ● - tenant-super-network-fra-equ01 Project Super Network fra-equ01 false false 10.128.0.0/14  ● - underlay-fra-equ01 Underlay Network fra-equ01 false false 10.0.0.0/16  ● - ``` - -- The user has to set the project scope first or provide `--project` flags for all commands. - ``` - $ metalctl project set 793bb6cd-8b46-479d-9209-0fedca428fe1 - You are now acting on project 793bb6cd-8b46-479d-9209-0fedca428fe1. - ``` -- The user can create the child network required for machine allocation. - ``` - $ metalctl network allocate --partition fra-equ01 --name test - ``` -- Now, the user sees his own child network. - ``` - $ metalctl network ls - ID NAME PROJECT PARTITION NAT SHARED PREFIXES IPS - internet Internet Network true false 212.34.83.0/27  ● - tenant-super-network-fra-equ01 Project Super Network fra-equ01 false false 10.128.0.0/14  ● - └─╴08b9114b-ec47-4697-b402-a11421788dc6 test 793bb6cd-8b46-479d-9209-0fedca428fe1 fra-equ01 false false 10.128.64.0/22  ● - underlay-fra-equ01 Underlay Network fra-equ01 false false 10.0.0.0/16  ● - ``` -- The user does not see any machines yet. - ``` - $ metalctl machine ls - ``` -- The user can create a machine. 
- ``` - $ metalctl machine create --networks internet,08b9114b-ec47-4697-b402-a11421788dc6 --name test --hostname test --image ubuntu-20.04 --partition fra-equ01 --size c1-xlarge-x86` - ``` -- The machine will now be provisioned. - ``` - $ metalctl machine ls - ID LAST EVENT WHEN AGE HOSTNAME PROJECT SIZE IMAGE PARTITION - 00000000-0000-0000-0000-ac1f6b7befb2 Phoned Home 20s 50d 4h test 793bb6cd-8b46-479d-9209-0fedca428fe1 c1-xlarge-x86 Ubuntu 20.04 20210415 fra-equ01 - ``` - -:::warning -A user **cannot** list all allocated machines for all projects. The user **must** always switch project context first and can only view the machines inside this project. Only admins can see all machines at once. -::: -### Scopes for Resources - -The admins / operators of the metal-stack should be able to provide _global_ resources that users are able to use along with their own resources. In particular, users can view and use _global_ resources, but they are not allowed to create, modify or delete them. - -:::info -When a project ID field is empty on a resource, the resource is considered _global_. -::: - -Where possible, users should be capable of creating their own resource entities. - -| Resource | User | Global | -| :----------------- | :--- | :----- | -| File System Layout | yes | yes | -| Firewall | yes | | -| Firmware | | yes | -| OS Image | | yes | -| Machine | yes | | -| Network (Base) | | yes | -| Network (Children) | yes | | -| IP | yes | | -| Partition | | yes | -| Project | yes | | -| Project Token | yes | | -| Size | | yes | -| Switch | | | -| Tenant | | yes | - -:::info -Example: A user can make use of the file system layouts provided by the admins, but can also create own layouts. Same applies for images. As soon as a user creates own resources, the user takes over the responsibility for the machine provisioning to succeed. 
-::: diff --git a/versioned_docs/version-v0.21.11/contributing/01-Proposals/MEP5/README.md b/versioned_docs/version-v0.21.11/contributing/01-Proposals/MEP5/README.md deleted file mode 100644 index 3b7fc45c..00000000 --- a/versioned_docs/version-v0.21.11/contributing/01-Proposals/MEP5/README.md +++ /dev/null @@ -1,54 +0,0 @@ ---- -slug: /MEP-5-shared-networks -title: MEP-5 -sidebar_position: 5 ---- - -# Shared Networks - -## Why are shared networks needed - -For special purpose machines that serve shared services with performance critical workloads to all machines of a partition (like persistent storage) it would be good to have kind of a "shared network" that is easily accessible. -They do not necessarily need another firewall. This would avoid having two firewalls in the datapath between a machine in a private network and the machines of a shared service. - -## Constraints that need to hold - -- a shared network is usable from all machines that have a firewall in front, that uses it -- a shared network is only usable within a single partition (currently we are constrained in bandwidth and have no routing of 10.0.0.0/8 addresses btw. 
partitions and failure domain should be the partition but this constraint might get lifted in the future) -- networks may be marked as shared after network allocation (but there should be no way back from shared to unshared) -- neither machines nor firewalls may have multiple private, unshared networks configured -- machines must have a single primary network configured - - this might be a shared network - - OR a plain, unshared private network -- firewalls may participate in multiple shared networks -- machines can be allocated with a primary network using auto IP allocation or with `noauto` and a specific IP - -## Should shared networks be private - -**Alternative 1:** If we implemented shared networks by extending functions around plain, private networks we would not have to manage another CIDR (mini point) and it would be possible to create a k8s cluster with a private network, mark the network as `shared` and produce shared services from this k8s cluster. - -**Alternative 2:** If shared networks are implemented as first class networks we could customize the VRF and also accomplish an other goal of our roadmap: being able to create machines directly in an external network. - -Together with @majst01 and @Gerrit91 we decided to continue to implement **Alternative 1**. - -## Firewalls accessing a shared network - -Firewalls that access shared networks need to: - -- hide the private network behind an ip address of the shared network if the shared network was configured with `nat=true`. -- import the prefixes of the shared VRF to the private VRF and import the prefixes of the private VRF to the shared VRF so that the communication between the two is working in both directions. As long as no `nat=true` was set on the shared VRF, the original machine ips are visible in both communication directions. 
- -## Setup with shared networks and single consumer - -![Simple Setup](./shared.png) - -## Setup with single shared network and multiple consumers - -![Advanced Setup](./shared_advanced.png) - -## Getting internet access - -Machines contained in a shared network can access the internet with different scenarios: - -- if they have an own firewall: this is internet accessibility, as common (check whether all traffic gets routed through it!) -- if they don't have an own firewall, an external HTTP proxy is needed that has an endpoint exposed as Service Type NodePort diff --git a/versioned_docs/version-v0.21.11/contributing/01-Proposals/MEP5/shared.drawio b/versioned_docs/version-v0.21.11/contributing/01-Proposals/MEP5/shared.drawio deleted file mode 100644 index aa7af045..00000000 --- a/versioned_docs/version-v0.21.11/contributing/01-Proposals/MEP5/shared.drawio +++ /dev/null @@ -1,121 +0,0 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - \ No newline at end of file diff --git a/versioned_docs/version-v0.21.11/contributing/01-Proposals/MEP5/shared.png b/versioned_docs/version-v0.21.11/contributing/01-Proposals/MEP5/shared.png deleted file mode 100644 index b0b47f03..00000000 Binary files a/versioned_docs/version-v0.21.11/contributing/01-Proposals/MEP5/shared.png and /dev/null differ diff --git a/versioned_docs/version-v0.21.11/contributing/01-Proposals/MEP5/shared_advanced.drawio b/versioned_docs/version-v0.21.11/contributing/01-Proposals/MEP5/shared_advanced.drawio deleted file mode 100644 index 6f96eca0..00000000 --- a/versioned_docs/version-v0.21.11/contributing/01-Proposals/MEP5/shared_advanced.drawio +++ /dev/null @@ -1,187 +0,0 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - \ No newline at end of file diff --git a/versioned_docs/version-v0.21.11/contributing/01-Proposals/MEP5/shared_advanced.png b/versioned_docs/version-v0.21.11/contributing/01-Proposals/MEP5/shared_advanced.png deleted file mode 100644 index da989915..00000000 Binary files a/versioned_docs/version-v0.21.11/contributing/01-Proposals/MEP5/shared_advanced.png and /dev/null differ diff --git a/versioned_docs/version-v0.21.11/contributing/01-Proposals/MEP6/README.md b/versioned_docs/version-v0.21.11/contributing/01-Proposals/MEP6/README.md deleted file mode 100644 index edf52a6c..00000000 --- a/versioned_docs/version-v0.21.11/contributing/01-Proposals/MEP6/README.md +++ /dev/null @@ -1,123 +0,0 @@ ---- -slug: /MEP-6-dmz-networks -title: MEP-6 -sidebar_position: 6 ---- - -# DMZ Networks - -## Reasoning - -To fulfill higher levels of security measures the standard metal-stack approach with a single firewall in front of a set of machines might be insufficient. -There are cases where two physically distinct firewalls in front of application workload are mandatory. In traditional network terms this is known as DMZ approach. - -For Kubernetes workloads it makes sense to use the front cluster for ingress, WAF purposes and as outgoing proxy. The clusters may be used for application workload. - -## DMZ network - -- Use a separate DMZ network prefix for every tenant -- This is used as intermediate network btw. 
private networks of a tenant and the internet -- For every partition a distinct DMZ firewall/cluster is needed for a tenant -- For Gardener orchestrated Kubernetes clusters this network must be a publicly reachable internet prefix because shoot clusters need a vpn service that is used for instrumentation from the seed cluster - this will be a requirement as long as the inverse vpn tunnel feature Konnectivity is not available to us. - -## Approach 1: DMZ with publicly reachable internet prefix - -![DMZ Internet](dmz-internet_public.svg) - -A DMZ network with publicly reachable internet prefix will look like this in the metal-api: - -```yaml ---- -description: DMZ-Network -destinationprefixes: - - 0.0.0.0/0 -id: dmz -labels: - network.metal-stack.io/default-external: "" -name: DMZ-Network -parentnetworkid: null -partitionid: "" -prefixes: - - 212.90.30.128/25 -privatesuper: false -projectid: "" -vrf: 104007 -vrfshared: false -nat: true -shared: false -underlay: false -``` - -### DMZ firewall - -The firewall of the DMZ will intersect its private network for attached machines, the DMZ network and the public internet. - -- The private network of the project needs to import - - the default route from the internet network - - the DMZ network -- The internet network must import the DMZ network -- The DMZ network provides the default route for tenant's clusters in a partition. It imports the default route from the internet network - -### Application Firewall - -The firewall of application workloads intersects its private network for attached machines and the DMZ network. - -This is currently supported by the metal-networker and needs no further changes! 
- -## Approach 2: DMZ with private IPs - -![DMZ Internet](dmz-internet_private.svg) - -A DMZ network with private IPs will look like this in the metal-api: - -```yaml ---- -description: DMZ-Network -destinationprefixes: - - 0.0.0.0/0 -id: dmz -labels: - network.metal-stack.io/default-external: "" -name: DMZ-Network -parentnetworkid: tenant-super-network-fra-equ01 -partitionid: fra-equ01 -prefixes: - - 10.90.30.128/25 -privatesuper: false -projectid: "" -vrf: 4711 -vrfshared: false -nat: true -shared: true # it's usable from multiple projects -underlay: false -``` - -### DMZ firewall - -The firewall of the DMZ will intersect its private network for attached machines, the DMZ network and the public internet. - -- The private network of the project needs to import - - the default route from the internet network - - the DMZ network -- The internet network must import the DMZ network (only locally, no-export) -- The DMZ network provides the default route for tenant's clusters in a partition. It imports the default route from the internet network - -### Application Firewall - -The firewall of application workloads intersects its private network for attached machines and the DMZ network. 
- -## Code Changes / Implications - -- `metal-networker` and `metal-ccm` assume that there is only one network providing the default-route -- `metal-networker` needs to - - import the default route from the internet network to the dmz network (DMZ Firewall) - - import the DMZ network to the internet network and adjusting NAT rules (DMZ Firewall) - - import destination prefixes of the DMZ network to the private primary network (DMZ Firewall, Application Firewall) - - import DMZ-IPs of the private primary network to the DMZ network (DMZ Firewall, Application Firewall) -- `metal-api`: destination prefixes of private networks need to be configurable (`allocateNetwork`) -- `gardener-extension-provider-metal`: needs to be able to delete DMZ clusters (but skip the network deletion part) -- the application firewall is not publicly reachable - for debugging purposes a hop over the DMZ firewall is needed - -## Decision - -We decided to follow the second approach with private DMZ networks. diff --git a/versioned_docs/version-v0.21.11/contributing/01-Proposals/MEP6/dmz-internet_private.drawio b/versioned_docs/version-v0.21.11/contributing/01-Proposals/MEP6/dmz-internet_private.drawio deleted file mode 100644 index 7b83bbfc..00000000 --- a/versioned_docs/version-v0.21.11/contributing/01-Proposals/MEP6/dmz-internet_private.drawio +++ /dev/null @@ -1,178 +0,0 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - \ No newline at end of file diff --git a/versioned_docs/version-v0.21.11/contributing/01-Proposals/MEP6/dmz-internet_private.svg b/versioned_docs/version-v0.21.11/contributing/01-Proposals/MEP6/dmz-internet_private.svg deleted file mode 100644 index f5e58204..00000000 --- 
a/versioned_docs/version-v0.21.11/contributing/01-Proposals/MEP6/dmz-internet_private.svg +++ /dev/null @@ -1,3 +0,0 @@ -
Machine
Machine
Firewall DMZ
Firewall DMZ
DMZ VRF
DMZ VRF
Machine
Machine
Firewall A
Firewall A
Private VRF A
Private VRF A
10.0.0.2
10.90.30.129
/0 via Firewall A
10.0.0.2...
VRF A 10.0.0.1
VRF A 10.0.0.1
DMZ Network
10.90.30.128/25
DMZ Network...
Private Network
10.0.0.0/24
Private Network...
import /0
import /0
import 10.0.0.0/24
import 10.0.0.0/24 -
Machine
Machine
Firewall B
Firewall B
Private VRF B
Private VRF B
10.0.1.2
/0 via Firewall B
10.0.1.2...
VRF B 10.0.1.1
VRF B 10.0.1.1
Private Network
10.0.1.0/24
Private Network...
import /0
import /0
import 10.0.1.0/24
import 10.0.1.0/24 -
10.90.30.129 is reachable
/0 via Firewall DMZ
10.0.0.0/24 is reachable
10.0.1.0/24 is reachable
10.90.30.129 is reachable...
Internet
212.1.1.0/27
Internet...
SNAT to 212.1.1.1
SNAT to 212.1.1.1
Internet VRF
Internet VRF
import /0
import /0

import 10.0.0.0/24 no export
import 10.0.1.0/24 no export
import 10.90.30.128/25 no export
import 10.0.0.0/24 no exp...
Viewer does not support full SVG 1.1
\ No newline at end of file diff --git a/versioned_docs/version-v0.21.11/contributing/01-Proposals/MEP6/dmz-internet_public.drawio b/versioned_docs/version-v0.21.11/contributing/01-Proposals/MEP6/dmz-internet_public.drawio deleted file mode 100644 index 544939e5..00000000 --- a/versioned_docs/version-v0.21.11/contributing/01-Proposals/MEP6/dmz-internet_public.drawio +++ /dev/null @@ -1,184 +0,0 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - \ No newline at end of file diff --git a/versioned_docs/version-v0.21.11/contributing/01-Proposals/MEP6/dmz-internet_public.svg b/versioned_docs/version-v0.21.11/contributing/01-Proposals/MEP6/dmz-internet_public.svg deleted file mode 100644 index 5e825081..00000000 --- a/versioned_docs/version-v0.21.11/contributing/01-Proposals/MEP6/dmz-internet_public.svg +++ /dev/null @@ -1,3 +0,0 @@ -
Machine
Machine
Firewall DMZ
Firewall DMZ
DMZ VRF
DMZ VRF
Machine
Machine
Firewall A
Firewall A
Private VRF A
Private VRF A
10.0.0.2
212.1.2.3
/0 via Firewall A
10.0.0.2...
VRF A 10.0.0.1
VRF A 10.0.0.1
DMZ Network
212.1.2.0/27
DMZ Network...
Private Network
10.0.0.0/24
Private Network...
import /0
import /0
import 10.0.0.0/24
import 10.0.0.0/24 -
Machine
Machine
Firewall B
Firewall B
Private VRF B
Private VRF B
10.0.1.2
/0 via Firewall B
10.0.1.2...
VRF B 10.0.1.1
VRF B 10.0.1.1
Private Network
10.0.1.0/24
Private Network...
import /0
import /0
import 10.0.1.0/24
import 10.0.1.0/24 -
212.1.2.3 is reachable
/0 via Firewall DMZ
212.1.2.3 is reachable...
Internet
212.1.1.0/27 212.1.2.0/27
Internet...
SNAT to 212.1.1.1
SNAT to 212.1.1.1
Internet VRF
Internet VRF
import /0
import /0
import 212.1.2.0/27
import 10.0.0.0/24 no redistribute
import 10.0.1.0/24 no redistribute

import 212.1.2.0/27...
SNAT to
212.1.2.1
SNAT to...
SNAT to
212.1.2.2
SNAT to...
Viewer does not support full SVG 1.1
\ No newline at end of file diff --git a/versioned_docs/version-v0.21.11/contributing/01-Proposals/MEP8/README.md b/versioned_docs/version-v0.21.11/contributing/01-Proposals/MEP8/README.md deleted file mode 100644 index 14748fae..00000000 --- a/versioned_docs/version-v0.21.11/contributing/01-Proposals/MEP8/README.md +++ /dev/null @@ -1,503 +0,0 @@ ---- -slug: /MEP-7-configurable-filesystem-layout-for-machine-allocation -title: MEP-7 -sidebar_position: 7 ---- - -# Configurable Filesystem layout for Machine Allocation - -The current implementation uses a hard coded filesystem layout depending on the specified size and image. This is done in the metal-hammer. This worked well in the past because we had a small amount of sizes and images. But we reached a point where this is to restricted for all use cases we have to fulfill. It also forces us to modify the metal-hammer source code to support a new filesystem layout. - -This proposal tries to address this issue by introducing a filesystem layout struct in the metal-api which is then configurable per machine allocation. -The original behavior of automatic filesystem layout decision must still be present, because there must be no API change for existing API consumers. It should be a additional feature during machine allocation. - -## API and behavior - -The API will get a new endpoint `filesystemlayouts`to create/update/delete a set of available `filesystemlayouts`. - -### Constraints - -In order to keep the actual machine allocation api compatible, there must be no difference while allocating a machine. To achieve this every -`filesystemlayout` defines constraints which specifies for which combination of `sizes` and `images` this layout should be used by default. -The specified constraints over all `filesystemlayouts` therefore must be collision free, to be more specific, there must be exactly one layout outcome -for every possible combination of `sizes` and `images`. 
- -The `size` constraint must be a list of the exact size ids, the `image` constraint must be a map of os to semver compatible version constraint. For example: - -- `debian: ">= 10.20210101"` or `debian: "< 10.20210101"` - -The general form of a `image` constraint is a map from `os` to `versionconstraint` where: - -`os` must match the first part of the image without the version. -`versionconstraint` must be the comparator, a space and the version, or simply `*` to match all versions of this `os`. -The comparator must be one of: "=", "!=", ">", "<", ">=", "=>", "<=", "=<", "~", "~>", "^" - -It must also be possible to have a `filesystemlayout` in development or for other special purposes, which can be specified during the machine allocation. -To have such a layout, both constraints `sizes` and `images`must be empty list. - -### Reinstall - -The current reinstall implementation the metal-hammer detects during the installation on which disk the OS was installed and reports back to the metal-api the Report struct which has two properties `primarydisk` and `ospartition`. -Both fields are not required anymore because the logic is now shifted to the `filesystemlayout` definition. If `Disk.WipeOnReinstall` is set to true, this disk will be wiped, default is false and is preserved. - -### Handling of s2-xlarge machines - -These machines are a bit special compared to our `c1-*` machines because they have rotating hard disks for the mass storage purpose. -The downside is that the on board SATA-DOM has the same naming as the HDDs and can not be specified as the first /dev/sda disk because all HDDs are also /dev/sd\* disks. -Therefore we had a special SATA-DOM detection algorithm inside metal-hammer which simply checks for the smallest /dev/sd disk and took this to install the OS. - -This is not possible with the current approach, but we figured out that the SATA-DOM is always `/dev/sde`. 
So we can create a special `filesystemlayout` where the installations is made on this disk. - -### Possible Filesystemlayout hierarchies - -It is only possible to create a filesystem on top of a block device. The creation of a block device can be done on multiple ways, depending on the requirements regarding performance, space and redundancy of the filesystem. -It also depends on the disks available on the server. - -The current approach implements the following hierarchies: - -![filesystems](filesystems.png) - -### Implementation - -```go -// FilesystemLayout to be created on the given machine -type FilesystemLayout struct { - // ID unique layout identifier - ID string - // Description is human readable - Description string - // Filesystems to create on the server - Filesystems []Filesystem - // Disks to configure in the server with their partitions - Disks []Disk - // Raid if not empty, create raid arrays out of the individual disks, to place filesystems onto - Raid []Raid - // VolumeGroups to create - VolumeGroups []VolumeGroup - // LogicalVolumes to create on top of VolumeGroups - LogicalVolumes []LogicalVolume - // Constraints which must match to select this Layout - Constraints FilesystemLayoutConstraints -} - -type FilesystemLayoutConstraints struct { - // Sizes defines the list of sizes this layout applies to - Sizes []string - // Images defines a map from os to versionconstraint - // the combination of os and versionconstraint per size must be conflict free over all filesystemlayouts - Images map[string]string -} - -type RaidLevel string -type Format string -type GPTType string - -// Filesystem defines a single filesystem to be mounted -type Filesystem struct { - // Path defines the mountpoint, if nil, it will not be mounted - Path *string - // Device where the filesystem is created on, must be the full device path seen by the OS - Device string - // Format is the type of filesystem should be created - Format Format - // Label is optional enhances 
readability - Label *string - // MountOptions which might be required - MountOptions []string - // CreateOptions during filesystem creation - CreateOptions []string -} - -// Disk represents a single block device visible from the OS, required -type Disk struct { - // Device is the full device path - Device string - // Partitions to create on this device - Partitions []Partition - // WipeOnReinstall, if set to true the whole disk will be erased if reinstall happens - // during fresh install all disks are wiped - WipeOnReinstall bool -} - -// Raid is optional, if given the devices must match. -// TODO inherit GPTType from underlay device ? -type Raid struct { - // ArrayName of the raid device, most often this will be /dev/md0 and so forth - ArrayName string - // Devices the devices to form a raid device - Devices []Device - // Level the raidlevel to use, can be one of 0,1,5,10 - // TODO what should be support - Level RaidLevel - // CreateOptions required during raid creation, example: --metadata=1.0 for uefi boot partition - CreateOptions []string - // Spares defaults to 0 - Spares int -} - - -// VolumeGroup is optional, if given the devices must match. 
-type VolumeGroup struct { - // Name of the volumegroup without the /dev prefix - Name string - // Devices the devices to form a volumegroup device - Devices []string - // Tags to attach to the volumegroup - Tags []string -} - -// LogicalVolume is a block devices created with lvm on top of a volumegroup -type LogicalVolume struct { - // Name the name of the logical volume, without /dev prefix, will be accessible at /dev/vgname/lvname - Name string - // VolumeGroup the name of the volumegroup - VolumeGroup string - // Size of this LV in mebibytes (MiB) - Size uint64 - // LVMType can be either striped or raid1 - LVMType LVMType -} - -// Partition is a single partition on a device, only GPT partition types are supported -type Partition struct { - // Number of this partition, will be added to the device once partitioned - Number int - // Label to enhance readability - Label *string - // Size given in MebiBytes (MiB) - // if "0" is given the rest of the device will be used, this requires Number to be the highest in this partition - Size string - // GPTType defines the GPT partition type - GPTType *GPTType -} - -const ( - // VFAT is used for the UEFI boot partition - VFAT = Format("vfat") - // EXT3 is usually only used for /boot - EXT3 = Format("ext3") - // EXT4 is the default fs - EXT4 = Format("ext4") - // SWAP is for the swap partition - SWAP = Format("swap") - // None - NONE = Format("none") - - // GPTBoot EFI Boot Partition - GPTBoot = GPTType("ef00") - // GPTLinux Linux Partition - GPTLinux = GPTType("8300") - // GPTLinuxRaid Linux Raid Partition - GPTLinuxRaid = GPTType("fd00") - // GPTLinux Linux Partition - GPTLinuxLVM = GPTType("8e00") - - // LVMTypeLinear append across all physical volumes - LVMTypeLinear = LVMType("linear") - // LVMTypeStriped stripe across all physical volumes - LVMTypeStriped = LVMType("striped") - // LVMTypeStripe mirror with raid across all physical volumes - LVMTypeRaid1 = LVMType("raid1") -) -``` - -Example `metalctl` outputs: - 
-```bash -$ metalctl filesystemlayouts ls -ID DESCRIPTION SIZES IMAGES -default default fs layout c1-large-x86, c1-xlarge-x86 debian >=10, ubuntu >=20.04, centos >=7 -ceph fs layout for ceph s2-large-x86, s2-xlarge-x86 debian >=10, ubuntu >=20.04 -firewall firewall fs layout c1-large-x86, c1-xlarge-x86 firewall >=2 -storage storage fs layout s3-large-x86 centos >=7 -s3 storage fs layout s2-xlarge-x86 debian >=10, ubuntu >=20.04, >=firewall-2 -default-devel devel fs layout -``` - -The `default` layout reflects what is actually implemented in metal-hammer to guarantee backward compatibility. - -```yaml ---- -id: default -constraints: - sizes: - - c1-large-x86 - - c1-xlarge-x86 - images: - debian: ">=10" - ubuntu: ">=20.04" - centos: ">=7" -filesystems: - - path: "/boot/efi" - device: "/dev/sda1" - format: "vfat" - options: "-F 32" - label: "efi" # required to be compatible with old images - - path: "/" - device: "/dev/sda2" - format: "ext4" - label: "root" # required to be compatible with old images - - path: "/var/lib" - device: "/dev/sda3" - format: "ext4" - label: "varlib" # required to be compatible with old images - - path: "/tmp" - device: "tmpfs" - format: "tmpfs" - mountoptions: - [ - "defaults", - "noatime", - "nosuid", - "nodev", - "noexec", - "mode=1777", - "size=512M", - ] -disks: - - device: "/dev/sda" - wipe: true - partitions: - - number: 1 - label: "efi" - size: 500 - type: GPTBoot - - number: 2 - label: "root" - size: 5000 - type: GPTLinux - - number: 3 - label: "varlib" - size: 0 # to end of partition - type: GPTLinux -``` - -The `firewall` layout reuses the built in nvme disk to store the logs, which is way faster and larger than what the sata-dom ssd provides. 
- -```yaml ---- -id: firewall -constraints: - sizes: - - c1-large-x86 - - c1-xlarge-x86 - images: - firewall: ">=2" -filesystems: - - path: "/boot/efi" - device: "/dev/sda1" - format: "vfat" - options: "-F 32" - - path: "/" - device: "/dev/sda2" - format: "ext4" - - path: "/var" - device: "/dev/nvme0n1p1" - format: "ext4" -disks: - - device: "/dev/sda" - wipe: true - partitions: - - number: 1 - label: "efi" - size: 500 - type: GPTBoot - - number: 2 - label: "root" - size: 5000 - type: GPTLinux - - device: "/dev/nvme0n1" - wipe: true - partitions: - - number: 1 - label: "var" - size: 0 - type: GPTLinux -``` - -The `storage` layout will be used for the storage servers, which must have mirrored boot disks. - -```yaml ---- -id: storage -constraints: - sizes: - - s3-large-x86 - images: - centos: ">=7" -filesystems: - - path: "/boot/efi" - device: "/dev/md1" - format: "vfat" - options: "-F32" - - path: "/" - device: "/dev/md2" - format: "ext4" -disks: - - device: "/dev/sda" - wipe: true - partitions: - - number: 1 - label: "efi" - size: 500 - type: GPTLinuxRaid - - number: 2 - label: "root" - size: 5000 - type: GPTLinuxRaid - - device: "/dev/sdb" - wipe: true - partitions: - - number: 1 - label: "efi" - size: 500 - type: GPTLinuxRaid - - number: 2 - label: "root" - size: 5000 - type: GPTLinuxRaid -raid: - - name: "/dev/md1" - level: 1 - devices: - - "/dev/sda1" - - "/dev/sdb1" - options: "--metadata=1.0" - - name: "/dev/md2" - level: 1 - devices: - - "/dev/sda2" - - "/dev/sdb2" - options: "--metadata=1.0" -``` - -The `s3-storage` layout matches the special situation on the s2-xlarge machines. 
- -```yaml ---- -id: s3-storage -constraints: - sizes: - - c1-large-x86 - - s2-xlarge-x86 - images: - debian: ">=10" - ubuntu: ">=20.04" - centos: ">=7" -filesystems: - - path: "/boot/efi" - device: "/dev/sde1" - format: "vfat" - options: "-F 32" - - path: "/" - device: "/dev/sde2" - format: "ext4" - - path: "/var/lib" - device: "/dev/sde3" - format: "ext4" -disks: - - device: "/dev/sde" - wipe: true - partitions: - - number: 1 - label: "efi" - size: 500 - type: GPTBoot - - number: 2 - label: "root" - size: 5000 - type: GPTLinux - - number: 3 - label: "varlib" - size: 0 # to end of partition - type: GPTLinux -``` - -A sample `lvm` layout which puts `/var/lib` as stripe on the nvme device - -```yaml ---- -id: lvm -description: "lvm layout" -constraints: - size: - - s2-xlarge-x86 - images: - debian: ">=10" - ubuntu: ">=20.04" - centos: ">=7" -filesystems: - - path: "/boot/efi" - device: "/dev/sda1" - format: "vfat" - createoptions: - - "-F 32" - label: "efi" - - path: "/" - device: "/dev/sda2" - format: "ext4" - label: "root" - - path: "/var/lib" - device: "/dev/vg00/varlib" - format: "ext4" - label: "varlib" - - path: "/tmp" - device: "tmpfs" - format: "tmpfs" - mountoptions: - [ - "defaults", - "noatime", - "nosuid", - "nodev", - "noexec", - "mode=1777", - "size=512M", - ] -volumegroups: - - name: "vg00" - devices: - - "/dev/nvmne0n1" - - "/dev/nvmne0n2" -logicalvolumes: - - name: "varlib" - volumegroup: "vg00" - size: 200 - lvmtype: "striped" -disks: - - device: "/dev/sda" - wipeonreinstall: true - partitions: - - number: 1 - label: "efi" - size: 500 - gpttype: "ef00" - - number: 2 - label: "root" - size: 5000 - gpttype: "8300" - - device: "/dev/nvmne0n1" - wipeonreinstall: false - - device: "/dev/nvmne0n2" - wipeonreinstall: false -``` - -## Components which requires modifications - -- metal-hammer: - - change implementation from build in hard coded logic - - move logic to create fstab from install.sh to metal-hammer -- metal-api: - - new endpoint 
`filesystemlayouts` - - add optional spec of `filesystemlayout` during `allocation` with validation if given `filesystemlayout` is possible on given size. - - add `allocation.filesystemlayout` in the response, based on either the specified `filesystemlayout` or the calculated one. - - implement `filesystemlayouts` validation for: - - matching to disks in the size - - no overlapping with the sizes/imagefilter specified in `filesystemlayouts` - - all devices specified exists from top to bottom (fs -> disks -> device || fs -> raid -> devices) -- metalctl: - - implement `filesystemlayouts` -- metal-go: - - adopt api changes -- metal-images: - - install mdadm for raid support diff --git a/versioned_docs/version-v0.21.11/contributing/01-Proposals/MEP8/filesystems.drawio b/versioned_docs/version-v0.21.11/contributing/01-Proposals/MEP8/filesystems.drawio deleted file mode 100644 index 0f0c6ab5..00000000 --- a/versioned_docs/version-v0.21.11/contributing/01-Proposals/MEP8/filesystems.drawio +++ /dev/null @@ -1,43 +0,0 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - \ No newline at end of file diff --git a/versioned_docs/version-v0.21.11/contributing/01-Proposals/MEP8/filesystems.png b/versioned_docs/version-v0.21.11/contributing/01-Proposals/MEP8/filesystems.png deleted file mode 100644 index 6d903b7e..00000000 Binary files a/versioned_docs/version-v0.21.11/contributing/01-Proposals/MEP8/filesystems.png and /dev/null differ diff --git a/versioned_docs/version-v0.21.11/contributing/01-Proposals/MEP9/README.md b/versioned_docs/version-v0.21.11/contributing/01-Proposals/MEP9/README.md deleted file mode 100644 index a8cae83d..00000000 --- a/versioned_docs/version-v0.21.11/contributing/01-Proposals/MEP9/README.md +++ /dev/null @@ -1,132 +0,0 @@ ---- -slug: /MEP-9-no-open-ports-to-the-data-center -title: MEP-9 -sidebar_position: 9 ---- - -# No Open Ports To the Data Center - -Our metal-stack partitions typically have open ports for 
metal-stack native services, these are: - -- SSH port on the firewalls -- bmc-reverse-proxy for serial console access through the metal-console - -These open ports are potential security risks. For example, while SSH access is possible only with private key it's still vulnerable to DoS attack. - -Therefore, we want to get rid off these open ports to reduce the attack surface to the data center. - -## Requirements - -- Access to firewall SSH only via VPN -- Easy to update VPN components - -As a next step, we can also consider joining the management servers to the VPN mesh, which would replace typical WireGuard setups for operators to enter resources inside the partition. - -## High Level Design - -[](./architecture.svg) - -> Simplified drawing showing old vs. new architecture. - -### Concerns - -There's few concerns when using WireGuard for implementing VPN: - -1. WireGuard doesn't implement dynamic cipher substitution. Which is important in case one of the crypto methods, used by WireGuard will be broken. The only possible solution for that will be to update WireGuard to a fixed version. -2. Coordination server(Headscale) is a single point of failure. In case it fails, it potentially can disconnect existing members of the network, as WireGuard can't manage dynamic IPs by itself. -3. Headscale is already falls behind Tailscale coordination server implementation. Which can complicate the upgrade to newer version of Tailscale client in case of emergency. - -### Solutions to concerns - -1. Tailscale node software is using userspace implementation of WireGuard -- `wireguard-go`. One of the options is to inject Tailscale client into `metalctl`. And make it available as `metalctl vpn` or similar command. It should be possible to do as `tailscale` node is already available as open sourced Go pkg. That would allow us to control, what version of Tailscale users are using and in case of any critical changes to enforce them to update `metalctl` to use VPN functionality. -2. 
Would it be a considerable risk? We could look into `wg-dynamic` project to cover this problem. -3. At the moment, repository looks well maintained and the metal-stack team already contributes to it. - -## Implementation Details - -### metal-roles - -`metal-roles` will be responsible for deployment of `headscale` server(via new `headscale` role). It also should provide sufficient config to `metal-api` so it establishes connection with `headscale` gRPC server. - -### New `metalctl` commands - -`metalctl` will be responsible for client-side implementation of this MEP. Specifically, it's by using `metalctl` user expected to connect to firewalls. - -- `metalctl vpn` -- section for VPN related commands: - - `metalctl vpn get key [vpn name] --namespace [namespace name]` -- returns auth key to be used with `tailscale` client for establishing connection. - -Extend `metalctl firewall`: - -- `metalctl firewall ssh [ID]` -- connect to firewall via SSH. - -Extend `metalctl machine`: - -- `metalctl machine ssh [ID]` -- connect to machine via SSH. - -`metalctl` will be able to connect to firewall and machines by running `tailscale` in container. - -### metal-api - -Updates to `metal-api` should be made, so that it's able to add firewalls to VPNs. There should be one Tailscale namespace per project. So if multiple firewalls are created in single project, they will join the same namespace. - -Two new flags should be introduced to connect `metal-api` to `headscale` gRPC server: - -- `headscale-addr` -- specifies address of Headscale grpc API. -- `headscale-api-key` -- specifies temporary API key to connect to Headscale. It should be replaced and then rotated by `metal-api`. - -If `metal-api` initialized with `headscale` connection it should automatically join all created firewalls to VPN. - -Add new endpoint, that will be used by `metalctl` to connect to VPN: - -- `/v1/vpn GET` -- requests auth key from `headscale` server. 
- -### metal-hammer - -`metal-hammer` acts as an intermediary for machine configuration between `metal-api` and machine's image. Specifically it writes to `/etc/metal/install.yaml` file, data from which later will be used by image's `install.sh` file. - -To implement VPN support we have to add authentication key and VPN server address to `install.yaml` file. This key will be used to join machine to a VPN. - -### metal-images - -Images `install.sh` script have to be updated to work with authentication key and VPN server address, provided in `install.yaml` file. If this key is present, machine should connect to VPN. - -### metal-networker - -`metal-networker` also have to know if VPN was configured. In that case we need to disable public access to SSH and allow all(?) traffic from WireGuard interface. - -### firewall-controller - -`firewall-controller` have to monitor changes in `Firewall` resource and keep `tailscaled` version up-to-date. - -### Resources - -Update `Firewall` resource to include desired/actual `tailscale` version: - -``` -Firewall: - Spec: - tailscale: - Version: Minimal version - ... - Status: - ... - VPN: - Status: Boolean field - tailscale: - Version: Actual version - ... -``` - -### bmc-reverse-proxy - -TODO - -## References - -1. [WireGuard: Next Generation Secure Network Tunnel](https://www.youtube.com/watch?v=88GyLoZbDNw) -2. [How Tailscale works](https://tailscale.com/blog/how-tailscale-works) -3. [Tailscale is officially SOC 2 compliant](https://tailscale.com/blog/soc2) -4. [Why not Wireguard](https://www.ipfire.org/blog/why-not-wireguard) -5. [Wireguard: Known Limitations](https://www.wireguard.com/known-limitations/) -6. [Wireguard: Things That Might Be Accomplished](https://www.wireguard.com/todo/) -7. 
[Headscale: Tailscale control protocol v2](https://github.com/juanfont/headscale/issues/526) diff --git a/versioned_docs/version-v0.21.11/contributing/01-Proposals/MEP9/architecture.drawio b/versioned_docs/version-v0.21.11/contributing/01-Proposals/MEP9/architecture.drawio deleted file mode 100644 index adb09214..00000000 --- a/versioned_docs/version-v0.21.11/contributing/01-Proposals/MEP9/architecture.drawio +++ /dev/null @@ -1,324 +0,0 @@ - - - - - - - -
-
-
- Metal Control Plane -
-
-
-
- - Metal Control Plane - -
-
- - - - -
-
-
- metal-stack -
- Partition -
-
-
-
- - metal-stack... - -
-
- - - - -
-
-
- firewall -
-
-
-
- - firewall - -
-
- - - - - -
-
-
- machine -
-
-
-
- - machine - -
-
- - - - -
-
-
- ssh -
-
-
-
- - ssh - -
-
- - - - -
-
-
- bmc-proxy -
-
-
-
- - bmc-proxy - -
-
- - - - - - - - - - -
-
-
- Metal Control Plane -
-
-
-
- - Metal Control Plane - -
-
- - - - -
-
-
- metal-stack -
- Partition -
-
-
-
- - metal-stack... - -
-
- - - - -
-
-
- firewall -
-
-
-
- - firewall - -
-
- - - - - -
-
-
- machine -
-
-
-
- - machine - -
-
- - - - -
-
-
- ssh -
-
-
-
- - ssh - -
-
- - - - - - -
-
-
- bmc-proxy -
-
-
-
- - bmc-proxy - -
-
- - - - -
-
-
- headscale -
-
-
-
- - headscale - -
-
- - - - - - - - - - -
-
-
- tailscaled -
-
-
-
- - tailscaled - -
-
- - - - - - -
-
-
- tailscaled -
-
-
-
- - tailscaled - -
-
- - - - -
-
-
- Internet -
-
-
-
- - Internet - -
-
- - - - -
-
-
- Internet -
-
-
-
- - Internet - -
-
-
- - - - - Viewer does not support full SVG 1.1 - - - -
diff --git a/versioned_docs/version-v0.21.11/contributing/01-Proposals/MEP9/architecture.svg b/versioned_docs/version-v0.21.11/contributing/01-Proposals/MEP9/architecture.svg deleted file mode 100644 index fd268d2f..00000000 --- a/versioned_docs/version-v0.21.11/contributing/01-Proposals/MEP9/architecture.svg +++ /dev/null @@ -1 +0,0 @@ -
Metal Control Plane
Metal Control Plane
metal-stack
Partition
metal-stack...
firewall
firewall
machine
machine
ssh
ssh
bmc-proxy
bmc-proxy
Metal Control Plane
Metal Control Plane
metal-stack
Partition
metal-stack...
firewall
firewall
machine
machine
ssh
ssh
bmc-proxy
bmc-proxy
headscale
headscale
tailscaled
tailscaled
tailscaled
tailscaled
Internet
Internet
Internet
Internet
Text is not SVG - cannot display
\ No newline at end of file diff --git a/versioned_docs/version-v0.21.11/contributing/01-Proposals/_category_.json b/versioned_docs/version-v0.21.11/contributing/01-Proposals/_category_.json deleted file mode 100644 index 2e7fa4bf..00000000 --- a/versioned_docs/version-v0.21.11/contributing/01-Proposals/_category_.json +++ /dev/null @@ -1,4 +0,0 @@ -{ - "position": 1, - "label": "Enhancement Proposals" -} \ No newline at end of file diff --git a/versioned_docs/version-v0.21.11/contributing/01-Proposals/index.md b/versioned_docs/version-v0.21.11/contributing/01-Proposals/index.md deleted file mode 100644 index 0f6eddc3..00000000 --- a/versioned_docs/version-v0.21.11/contributing/01-Proposals/index.md +++ /dev/null @@ -1,69 +0,0 @@ ---- -slug: /enhancement-proposals -title: Enhancement Proposals -sidebar_position: 1 ---- - -# Metal Stack Enhancement Proposals (MEPs) - -This section contains proposals which address substantial modifications to metal-stack. - -Every proposal has a short name which starts with _MEP_ followed by an incremental, unique number. Proposals should be raised as pull requests in the [website](https://github.com/metal-stack/website) repository and can be discussed in Github issues. - -The list of proposals and their current state is listed in the table below. - -Possible states are: - -- `In Discussion` -- `Accepted` -- `Declined` -- `In Progress` -- `Completed` -- `Aborted` - -Once a proposal was accepted, an issue should be raised and the implementation should be done in a separate PR. 
- -| Name | Description | State | Progress | -| :------------------------------------------------------------- | :--------------------------------------------- | :-------------: | :----------------------------------------------------------------: | -| [MEP-1](MEP1/README.md) | Distributed Control Plane Deployment | `Declined` | | -| [MEP-2](MEP2/README.md) | Two Factor Authentication | `Aborted` | | -| [MEP-3](MEP3/README.md) | Machine Re-Installation to preserve local data | `Completed` | | -| [MEP-4](MEP4/README.md) | Multi-tenancy for the metal-api | `In Progress` | [releases#236](https://github.com/metal-stack/releases/issues/236) | -| [MEP-5](MEP5/README.md) | Shared Networks | `Completed` | | -| [MEP-6](MEP6/README.md) | DMZ Networks | `Completed` | | -| [MEP-7](https://github.com/metal-stack/docs-archive/pull/51) | Passing environment variables to machines | `Declined` | | -| [MEP-8](MEP8/README.md) | Configurable Filesystemlayout | `Completed` | | -| [MEP-9](MEP9/README.md) | No Open Ports To the Data Center | `Completed` | | -| [MEP-10](MEP10/README.md) | SONiC Support | `Completed` | | -| [MEP-11](MEP11/README.md) | Auditing of metal-stack resources | `Completed` | | -| [MEP-12](MEP12/README.md) | Rack Spreading | `Completed` | | -| [MEP-13](MEP13/README.md) | IPv6 | `Completed` | | -| [MEP-14](MEP14/README.md) | Independence from external sources | `Completed` | | -| [MEP-15](https://github.com/metal-stack/docs-archive/pull/232) | HAL Improvements | `In Discussion` | [releases#238](https://github.com/metal-stack/releases/issues/238) | -| [MEP-16](MEP16/README.md) | Firewall Support for Cluster API Provider | `Accepted` | [releases#237](https://github.com/metal-stack/releases/issues/237) | -| [MEP-17](MEP17/README.md) | Global Network View | `In Discussion` | | -| [MEP-18](MEP18/README.md) | Autonomous Control Plane | `In Discussion` | | - -## Proposal Process - -1. 
Before starting a new proposal, it is advised to have a quick chat with one of the maintainers. -2. Create a draft pull request in the [website](https://github.com/metal-stack/website) repository with your proposal. Your proposal doesn't have to be finished at this point. -3. Share the PR in the [metal-stack Slack](https://metal-stack.slack.com/) and invite maintainers to review it. -4. The review itself will probably take place in multiple iterations. Don't be discouraged if your proposal is not accepted right away. The goal is to reach consensus. -5. Once your proposal is accepted, create an umbrella issue in the relevant repository or when multiple repositories are involved in the [releases](https://github.com/metal-stack/releases). -6. Other issues should be created in different repositories and linked to the umbrella issue. -7. Unless stated otherwise, the proposer is responsible for the implementation of the proposal. - -## How to Write a Good MEP - -In the first section of your MEP, start with the current situation and the motivation for the change. Summarize your proposal briefly. - -Next follows the main part: describe your proposal in detail. Which parts of of metal-stack are affected? Are there API changes? If yes, describe them and provide examples here. -Try to think of side effects your proposal might have. Try to provide a view on how your proposal affects users of metal-stack. -Highlight breaking changes and think of a migration path for existing users. If your proposal affects multiple components, try to describe the interaction between them. - -After the main part of your proposal, feel free to add additional sections, e.g. about alternatives that were considered, non-goals or future possibilities. - -Depending on the complexity of your proposal, you might want to add a section about the implementation plan or roadmap. - -You can have a look at the existing MEPs for inspiration. As you will notice: not every MEP has the same structure. 
Feel free to structure your MEP in a way that makes sense for your proposal. diff --git a/versioned_docs/version-v0.21.11/contributing/02-planning-meetings.mdx b/versioned_docs/version-v0.21.11/contributing/02-planning-meetings.mdx deleted file mode 100644 index df10177b..00000000 --- a/versioned_docs/version-v0.21.11/contributing/02-planning-meetings.mdx +++ /dev/null @@ -1,120 +0,0 @@ ---- -slug: /planning-meetings -title: Planning Meetings -sidebar_position: 2 ---- - -# Planning Meetings - -Public planning meetings are held **biweekly** on **odd calendar weeks** from **14:00 to 14:30** (Berlin/Europe timezone) on Microsoft Teams. The purpose is to provide an overview of our current projects and priorities, as well as to discuss new topics and issues within the group. - -export function PlanningMeetingDatesTable() { - const today = new Date(); - const dayOfWeek = today.getDay(); - - let daysUntilMonday = 0; - switch (dayOfWeek) { - case 0: - daysUntilMonday = 1; - break; - case 1: - daysUntilMonday = 0; - break; - default: - daysUntilMonday = 8 - dayOfWeek; - } - - const nextMonday = new Date(); - nextMonday.setDate(nextMonday.getDate() + daysUntilMonday) - - let onejan = new Date(today.getFullYear(), 0, 1); - let week = Math.ceil((((nextMonday.getTime() - onejan.getTime()) / 86400000) + onejan.getDay() + 1) / 7); - - if (week % 2 === 0) { - nextMonday.setDate(nextMonday.getDate() + 7) - } - - const blacklist = [ - new Date('2025-12-29'), - ] - - const amount = 8 - const dates = []; - - for (let i = 0; i < amount; i++) { - const nextDate = new Date(nextMonday); - nextDate.setDate(nextDate.getDate() + (i * 14)) - - if (blacklist.find(item => {return item.toDateString() == nextDate.toDateString()}) !== undefined ) { - continue - } - - dates.push(nextDate.toDateString()) - } - - return ( - - - - - - - - - - {dates.map((date, index) => ( - - - - - - ))} - -
DateTimeLink
{date}14:00 – 14:30Join Link
- ) -} - - - -Our [development planning board](https://github.com/orgs/metal-stack/projects/34) can be found on GitHub. - -[//]: <> (The C025PB1EUKC in the slack url references the #devs channel.) -If you want to get an invitation to the event, please drop us a line on our [Slack channel](https://metal-stack.slack.com/archives/C025PB1EUKC). - -Planning meetings are currently not recorded. The meetings are held either in English or German depending on the attendees. - -:::info -Note that anyone can contribute to metal-stack without participating in planning meetings. However, if you want to speed up the review process for your requirements, it might be helpful to attend the meetings. -::: - -## Agenda - -Here is the agenda that we generally want to follow in a planning meeting: - -- Possibility to bring up news that are interesting for every developer of the metal-stack org -- Check `Done` column and archive cards - - Attendees have the chance to briefly present achievements if they want -- Check the `In Progress` column and discuss whether these tasks are still worked on, there were significant blockers or they can be lower-prioritized -- Check new issues labelled with `triage` and prioritize them -- Allow attendees to bring up issues and prioritize them - - Attendees have the chance to briefly present these new issues - -## Idea Backlog - -The backlog contains ideas of what could become part of the roadmap in the future. The list is ordered alphabetically. Therefore, the order does not express the importance or weight of a backlog item. - -We incorporate community feedback into the roadmap. If you think that important points are missing in the backlog, please share your ideas with us. We have a Slack channel. Please check out [metal-stack.io](https://metal-stack.io) for contact information. - -:::danger -By no means this list is a promise of what is being worked on in the near future. It is just a summary of ideas that was agreed on to be "nice to have". 
It is up to the investors, maintainers and the community to choose topics from this list and to implement them or to remove them from the list. -::: - -- Add metal-stack to [Gardener conformance test grid](https://testgrid.k8s.io/gardener-all) -- Autoscaler for metal control plane components -- CI dashboard and public integration testing -- Improved release and deploy processes (GitOps, [Spinnaker](https://spinnaker.io/), [Flux](https://fluxcd.io/)) -- Machine internet without firewalls -- metal-stack dashboard (UI) -- Offer our metal-stack extensions as enterprise products (accounting, cluster-api, S3) (neither of them will ever be required for running metal-stack, they just add extra value for certain enterprises) -- Partition managed by Kubernetes (with Kubelets joining the control plane cluster) -- Public offering / demo playground diff --git a/versioned_docs/version-v0.21.11/contributing/03-contribution-guideline.md b/versioned_docs/version-v0.21.11/contributing/03-contribution-guideline.md deleted file mode 100644 index 010c2a05..00000000 --- a/versioned_docs/version-v0.21.11/contributing/03-contribution-guideline.md +++ /dev/null @@ -1,147 +0,0 @@ ---- -slug: /contribution-guideline -title: Contribution Guideline -sidebar_position: 3 ---- - -# Contribution Guideline - -This document describes the way we want to contribute code to the projects of metal-stack, which are hosted on [github.com/metal-stack](https://github.com/metal-stack). - -The document is meant to be understood as a general guideline for contributions, but not as burden to be placed on a developer. Use your best judgment when contributing code. Try to be as clean and precise as possible when writing code and try to make your code as maintainable and understandable as possible for other people. - -Even if it should go without saying, we live an open culture of discussion, in which everybody is welcome to participate. 
We treat every contribution with respect and objectiveness with the general aim to write software of quality. - -If you want, feel free to propose changes to this document in a pull request. - -## How Can I Contribute? - -Open a Github issue in the project you would like to contribute. Within the issue, your idea can be discussed. It is also possible to directly create a pull request when the set of changes is relatively small. - -When opening an issue please consider the following aspects: - -1. Create a meaningful issue describing the WHY? of your contribution. -1. Try to set appropriate labels to the issue. For example, attach the `triage` label to your issue if you want it to be discussed in the next [planning meeting](./02-planning-meetings.mdx). It might be useful to attend the meeting if you want to emphasize it being worked on. - -### Pull Requests - -The process described here has several goals: - -- Maintain quality -- Enable a sustainable system to review contributions -- Enable documented and reproducible addition of contributions - -1. Create a repository fork within the context of that issue. Members of the organization may work on the repository directly without a fork, which allows building development artifacts more easily. -1. Develop, document and test your contribution (try not to solve more than one issue in a single pull request). -1. Create a Draft Pull Request to the repository's main branch. -1. Create a meaningful description of the pull request or reference the related issue. The pull request template explains what the content should include, please read it. -1. Ask for merging your contribution by removing the draft marker. Repository maintainers (see [Code Ownership](#code-ownership)) are notified automatically, but you can also reach out to people directly on Slack if you want a review from a specific person. - -## General Objectives - -This section contains language-agnostic topics that all metal-stack projects are trying to follow. 
- -### Code Ownership - -The code base is owned by the entire team and every member is allowed to contribute changes to any of the projects. This is considered as collective code ownership[^1]. - -As a matter of fact, there are persons in a project, which already have experience with the sources. These are defined directly in the repository's [CODEOWNERS](https://docs.github.com/en/repositories/managing-your-repositorys-settings-and-features/customizing-your-repository/about-code-owners) file. If you want to merge changes into the master branch, it is advisable to include code owners into the process of discussion and merging. - -### Microservices - -One major ambition of metal-stack is to follow the idea of [microservices](https://en.wikipedia.org/wiki/Microservices). This way, we want to achieve that we can - -- adapt to changes faster than with monolithic architectures, -- be free of restrictions due to certain choices of technology, -- leverage powerful traits of cloud infrastructures (e.g. high-scalability, high-availability, ...). - -### Programming Languages - -We are generally open to write code in any language that fits best to the function of the software. However, we encourage [golang](https://en.wikipedia.org/wiki/Go_(programming_language)) to be the main language of metal-stack as we think that it makes development faster when not establishing too many different languages in our architecture. Reason for this is that we are striving for consistent behavior of the microservices, similar to what has been described for the Twelve-Factor App (see [12 Factor](https://12factor.net/)). We help enforcing unified behavior by allowing a small layer of shared code for every programming language. We will refer to this shared code as "libraries" for the rest of this document. - -### Artifacts - -Artifacts are always produced by a CI process (Github Actions). - -Docker images are published on the Github Container Registry of the metal-stack organization. 
- -Binary artifacts or OS images can be uploaded to `images.metal-stack.io` if necessary. - -When building Docker images, please consider our build tool [docker-make](https://github.com/fi-ts/docker-make) or the specific [docker-make action](https://github.com/fi-ts/action-docker-make) respectively. - -### APIs - -We are currently making use of [Swagger](https://swagger.io/) when we exposing traditional REST APIs for end-users. This helps us with being technology-agnostic as we can generate clients in almost any language using [go-swagger](https://goswagger.io/). Swagger additionally simplifies the documentation of our APIs. - -Most APIs though are not required to be user-facing but are of technical nature. These are preferred to be implemented using [grpc](https://grpc.io/). - -#### Versioning - -Artifacts are versioned by tagging the respective repository with a tag starting with the letter `v`. After the letter, there stands a valid [semantic version](https://semver.org/). - -### Documentation - -In order to make it easier for others to understand a project, we document general information and usage instructions in a `README.md` in any project. - -In addition to that, we document a microservice in the [docs](https://github.com/metal-stack/docs) repository. The documentation should contain the reasoning why this service exists and why it was being implemented the way it was being implemented. The aim of this procedure is to reduce the time for contributors to comprehend architectural decisions that were made during the process of writing the software and to clarify the general purpose of this service in the entire context of the software. - -## Guidelines - -This chapter describes general guidelines on how to develop and contribute code for a certain programming language. 
- -### Golang - -Development follows the official guide to: - -- Write clear, idiomatic Go code[^2] -- Learn from mistakes that must not be repeated[^3] -- Apply appropriate names to your artifacts: - - [https://go.dev/talks/2014/names.slide](https://go.dev/talks/2014/names.slide) - - [https://go.dev/blog/package-names](https://go.dev/blog/package-names) - - [https://go.dev/doc/effective_go#names](https://go.dev/doc/effective_go#names) -- Enable others to understand the reasoning of non-trivial code sequences by applying a meaningful documentation. - -#### Development Decisions - -- **Dependency Management** by using Go modules -- **Build and Test Automation** by using [GNU Make](https://man7.org/linux/man-pages/man1/make.1p.html). -- **End-user APIs** should consider using go-swagger and [Go-Restful](https://github.com/emicklei/go-restful) - **Technical APIs** should consider using [grpc](https://grpc.io/) - -#### Libraries - -metal-stack maintains several libraries that you should utilize in your project in order to unify common behavior. Some of these projects are: - -- [metal-go](https://github.com/metal-stack/metal-go) -- [metal-lib](https://github.com/metal-stack/metal-lib) - -#### Error Handling with Generated Swagger Clients - -From the server-side you should ensure that you are returning the common error json struct in case of an error as defined in the `metal-lib/httperrors`. Ensure you are using `go-restful >= v2.9.1` and `go-restful-openapi >= v0.13.1` (allows default responses with error codes other than 200). - -### Documentation - -We want to share knowledge and keep things simple. If things cannot kept simple we want to enable everybody to understand them by: - -- Document in short sentences[^4]. -- Do not explain the HOW (this is already documented by your code and documenting the obvious is considered a defect). -- Explain the WHY. Add a "to" in your documentation line to force yourself to explain the reasonning (e.g. "` to `"). 
- -### Python - -Development follows the official guide to: - -- Style Guide for Python Code (PEP 8)[^5] - - The use of an IDE like [PyCharm](https://www.jetbrains.com/pycharm/) helps to write compliant code easily -- Consider [setuptools](https://pythonhosted.org/an_example_pypi_project/setuptools.html) for packaging -- If you want to add a Python microservice to the mix, consider [pyinstaller](https://github.com/pyinstaller/pyinstaller) on Alpine to achieve small image sizes - -[^1]: [https://martinfowler.com/bliki/CodeOwnership.html](https://martinfowler.com/bliki/CodeOwnership.html) - -[^2]: [https://go.dev/doc/effective_go](https://go.dev/doc/effective_go) - -[^3]: [https://github.com/golang/go/wiki/CodeReviewComments](https://github.com/golang/go/wiki/CodeReviewComments) - -[^4]: [https://github.com/golang/go/wiki/CodeReviewComments#comment-sentences](https://github.com/golang/go/wiki/CodeReviewComments#comment-sentences) - -[^5]: [https://www.python.org/dev/peps/pep-0008/](https://www.python.org/dev/peps/pep-0008/) diff --git a/versioned_docs/version-v0.21.11/contributing/04-release-flow.md b/versioned_docs/version-v0.21.11/contributing/04-release-flow.md deleted file mode 100644 index 2a6403b7..00000000 --- a/versioned_docs/version-v0.21.11/contributing/04-release-flow.md +++ /dev/null @@ -1,107 +0,0 @@ ---- -slug: /release-flow -title: Release Flow -sidebar_position: 4 ---- - -# Releases - -The metal-stack contains of many microservices that depend on each other. The automated release flow is there to ensure that all components work together flawlessly for every metal-stack release. - -Releases and integration tests are published through our [release repository](https://github.com/metal-stack/releases). You can also find the [release notes](https://github.com/metal-stack/releases/releases) for this metal-stack version in there. The release notes contain information about new features, upgrade paths and bug fixes. 
- -If you want, you can sign up at our Slack channel where we are announcing every new release. Often, we provide additional information for metal-stack administrators and adopters at this place, too. - -This document is intended for developers, especially maintainers of metal-stack projects. - -## Release Flow - -The following diagram attempts to describe our current release flow: - -![](release_flow.svg) - -A release is created in the following way: - -- Individual repository maintainers within the metal-stack GitHub Organization can publish a release of their component. -- This release is automatically pushed to the `develop` branch of the release repository by the metal-robot. -- A push triggers a virtual release integration test using the mini-lab environment. This setup launches metal-stack with the `sonic` and `gardener` flavors to validate the different Ansible roles and execute basic operations across the metal-stack layer. -- To contribute components that are not directly part of the release vector, a pull request must be made against the `develop` branch of the release repository. Release maintainers may push directly to the `develop` branch. -- The release maintainers can `/freeze` the `develop` branch, effectively stopping the metal-robot from pushing component releases to this branch. -- The `develop` branch is tagged by a release maintainer with a `-rc.x` suffix to create a __release candidate__. -- The release candidate must pass a large integration test suite on a real environment, which is currently run by FI-TS. It tests the entire machine provisioning engine including the integration with Gardener, the deployment, metal-images and Kubernetes conformance tests. -- If the integration tests pass, the PR of the `develop` branch must be approved by at least two release maintainers. -- A release is created via GitHub releases, including all release notes, with a tag on the `main` branch. 
- -## FAQ - -**Question: I need PR #xyz to go into the release, why did you not include it?** - -Answer: It's not on purpose if we miss a PR to be included into a metal-stack release. Please use the pending pull request from `develop` into `master` as soon as it is open and comment which pull request you want to have included into the release. Also consider attending our planning meetings or contact us in our Slack channel if you have urgent requirements that need to be dealt with. - -**Question: Who is responsible for the releases? Who can freeze a release?** - -Answer: Every repository in metal-stack has a `CODEOWNERS` file pointing to a maintainer team. This is also true for the releases repository. Only release repository maintainers are allowed to `/freeze` a release (meaning the metal-robot does not automatically append new component releases to the release vector anymore). - -**Question: I can't push to the `develop` branch of this repository? How can I request changes to the release vector?** - -Answer: Most changes are automatically integrated by the metal-robot. For manually managed components, please raise a pull request against the `develop` branch. Only release maintainers are allowed to push to `develop` as otherwise it would be possible to mess up the release pipeline. - -**Question: What requirements need to be fulfilled to add a repository to the release vector?** - -Please see the section below named [Requirements for Release Vector Repositories](#requirements-for-release-vector-repositories). 
- -### Requirements for Release Vector Repositories - -Before adding a repository in the metal-stack org to the releases repository, it is advised for the maintainer to fulfill the following points: - -- The following files should be present at the repository root: - - [CODEOWNERS](https://docs.github.com/en/repositories/managing-your-repositorys-settings-and-features/customizing-your-repository/about-code-owners) - - When a repository is created, the metal-robot automatically creates a -maintainers team in our GitHub org. - - The CODEOWNERS file should reference this team. - - The team should contain at least two maintainers. - - `LICENSE` - - This usually should be MIT with "metal-stack" as authors. - - `CONTRIBUTING.md` - - This should contain the following content: - ``` - # Contributing - - Please check out the [contributing section](https://docs.metal-stack.io/stable/development/contributing/) in our [docs](https://docs.metal-stack.io/). - ``` - - `README.md` -- The `developers-core` team should be given repository access with `write` role, the codeowners team should have the `maintain` role -- Release artifacts should have an SPDX-formatted SBOM attached. - - For container images these are embedded using Buildx. -- The following branch protection rules should be set: - - The mainline should be protected. - - A pull request should be required before merging (required by at least one code owner). - - Status checks should be required to pass. - - Force push should not be allowed on this branch. -- One person from the releases maintainers has to add the repository to the metal-robot in order to pick up the releases, add them to the release vector and generate release notes. - -### How-To Release a Project - -[release-drafter](https://github.com/release-drafter/release-drafter) is preferred in order to generate release notes from merged PRs for your projects. It should be triggered for pushes on your main branch. 
- -The draft is then used to create a project release. The release has to be published through the GitHub UI as demonstrated in the screenshot below. - -**Tagging the repository is not enough as repository tagging does not associate your release notes to your release!** - -![](release.png) - -Some further remarks: - -- Use semver versions with `v` prefix for your tags -- Name your release after your release tag -- The metal-robot only picks up lines from your release notes that start with `-` or `*` (unordered list items) and appends them to the according section in the aggregated release draft -- A tag created through a GitHub UI release does not trigger a `push` event . This means, your pipeline will not start to run with the `push` trigger when publishing through the UI. - - Instead, use the `published` [release event trigger](https://docs.github.com/en/actions/reference/events-that-trigger-workflows#release) for your actions: - - ```yaml - on: - release: - types: - - published - ``` -- In case they are necessary, please do not forget to include `NOTEWORTHY`, `ACTIONS_REQUIRED` or `BREAKING_CHANGE` sections into releases. More information on those release draft sections can be read in a pull request template. 
diff --git a/versioned_docs/version-v0.21.11/contributing/05-community.md b/versioned_docs/version-v0.21.11/contributing/05-community.md deleted file mode 100644 index 61eaf099..00000000 --- a/versioned_docs/version-v0.21.11/contributing/05-community.md +++ /dev/null @@ -1,11 +0,0 @@ ---- -slug: /community -title: Community -sidebar_position: 5 -draft: true ---- - -# Community - -(Slack channel, community events like FOSDEM, Kubernetes Community Days..., blog -articles) diff --git a/versioned_docs/version-v0.21.11/contributing/release.png b/versioned_docs/version-v0.21.11/contributing/release.png deleted file mode 100644 index 598b1182..00000000 Binary files a/versioned_docs/version-v0.21.11/contributing/release.png and /dev/null differ diff --git a/versioned_docs/version-v0.21.11/contributing/release_flow.drawio b/versioned_docs/version-v0.21.11/contributing/release_flow.drawio deleted file mode 100644 index 6ca6b34f..00000000 --- a/versioned_docs/version-v0.21.11/contributing/release_flow.drawio +++ /dev/null @@ -1,721 +0,0 @@ - - - - - - - - - - - -
-
-
- Review release notes -
-
-
-
- - Review release notes - -
-
-
- - - - - - - -
-
-
- projects -
-
-
-
- - projects - -
-
-
- - - - - - - -
-
-
- projects -
-
-
-
- - projects - -
-
-
- - - - - - - - -
-
-
- Organization Webhook -
-
-
-
- - Organization Webhook - -
-
-
- - - - - - - -
-
-
- projects -
-
-
-
- - projects - -
-
-
- - - - - - - - -
-
-
- - Publish release - -
-
-
-
- - Publish release - -
-
-
- - - - - - - - -
-
-
- Maintainer -
-
-
-
- - Maint... - -
-
-
- - - - - - - - - - - - - - - -
-
-
- metal-robot release handler -
-
-
-
- - metal-robot release han... - -
-
-
- - - - - - - - -
-
-
- - no - -
-
-
-
- - no - -
-
-
- - - - - - - - -
-
-
- - yes - -
-
-
-
- - yes - -
-
-
- - - - - - - -
-
-
- version in event newer than release vector version -
-
-
-
- - version in event newer than... - -
-
-
- - - - - - - -
-
-
- - do nothing - -
-
-
-
- - do nothing - -
-
-
- - - - - - - - - - - - -
-
-
- Github Action -
-
-
-
- - Github Action - -
-
-
- - - - - - - -
-
-
- Bump version in release vector and push to - - develop - -
-
-
-
- - Bump version in release vector... - -
-
-
- - - - - - - - - - - -
-
-
- Open pull request from - - develop - - to - - master - -
-
-
-
- - Open pull request from develop... - -
-
-
- - - - - - - -
-
-
- Update aggregated release draft in - - metal-stack/releases - -
-
-
-
- - Update aggregated release draf... - -
-
-
- - - - - - - - - - - - - - - -
-
-
- Integration Testing -
-
-
-
- - Integration Testing - -
-
-
- - - - - - - - - - - -
-
-
- Merge to - - master - -
-
-
-
- - Merge to master - -
-
-
- - - - - - - - - - - - -
-
-
- Review -
-
-
-
- - Review - -
-
-
- - - - - - - - - - - - - - - -
-
-
- Tests suceeded and PR changes reviewed -
-
-
-
- - Tests suceeded and PR chang... - -
-
-
- - - - - - - -
-
-
- - publish results to #integration - -
-
-
-
- - publish results to #integr... - -
-
-
- - - - - - - - - - - - - - - -
-
-
- Release metal-stack -
-
-
-
- - Release metal-stack - -
-
-
- - - - - - - - - - - -
-
-
- - publish to #announcements - -
-
-
-
- - publish to #announcements - -
-
-
- - - - - - - -
-
-
- - - metal-stack/docs - - pull request - -
-
-
-
- - metal-stack/docs pull requ... - -
-
-
- - - - - - - - - - - - -
-
-
- Freeze -
-
-
-
- - Freeze - -
-
-
- - - - - - - - - - - - - - - -
-
-
- Freeze - - develop - - and create a release candidate -
-
-
-
- - Freeze develop and create a rel... - -
-
-
- - - - - - - -
-
-
- Large integration suites -
- - (currently owned by FI-TS, not public) - -
-
-
-
-
- - Large integration suites... - -
-
-
- - - - - - - - -
-
-
- Run -
-
-
-
- - Run - -
-
-
- - - - -
- - - - - Text is not SVG - cannot display - - - -
diff --git a/versioned_docs/version-v0.21.11/contributing/release_flow.svg b/versioned_docs/version-v0.21.11/contributing/release_flow.svg deleted file mode 100644 index 55cdd493..00000000 --- a/versioned_docs/version-v0.21.11/contributing/release_flow.svg +++ /dev/null @@ -1 +0,0 @@ -
Review release notes
Review release notes
projects
projects
projects
projects
Organization Webhook
Organization Webhook
projects
projects
Publish release
Publish release
Maintainer
Maint...
metal-robot release handler
metal-robot release han...
no
no
yes
yes
version in event newer than release vector version
version in event newer than...
do nothing
do nothing
Github Action
Github Action
Bump version in release vector and push todevelop
Bump version in release vector...
Open pull request fromdeveloptomaster
Open pull request from develop...
Update aggregated release draft inmetal-stack/releases
Update aggregated release draf...
Integration Testing
Integration Testing
Merge tomaster
Merge to master
Review
Review
Tests suceeded and PR changes reviewed
Tests suceeded and PR chang...
publish results to #integration
publish results to #integr...
Release metal-stack
Release metal-stack
publish to #announcements
publish to #announcements
metal-stack/docspull request
metal-stack/docs pull requ...
Freeze
Freeze
Freezedevelopand create a release candidate
Freeze develop and create a rel...
Large integration suites
(currently owned by FI-TS, not public)
Large integration suites...
Run
Run
Text is not SVG - cannot display
\ No newline at end of file diff --git a/versioned_docs/version-v0.21.11/docs/02-General/04-flavors-of-metalstack.md b/versioned_docs/version-v0.21.11/docs/02-General/04-flavors-of-metalstack.md index 7da427fc..2277ca6b 100644 --- a/versioned_docs/version-v0.21.11/docs/02-General/04-flavors-of-metalstack.md +++ b/versioned_docs/version-v0.21.11/docs/02-General/04-flavors-of-metalstack.md @@ -14,7 +14,7 @@ As modern infrastructure and cloud native applications are designed with Kuberne Regardless which flavor of metal-stack you use, it is always possible to manually provision machines, networks and ip addresses. This is the most basic way of using metal-stack and is very similar to how traditional bare metal infrastructures are managed. -Using plain metal-stack without additional layer was not a focus in the past. Therefore firewall and role management might be premature. These will be addressed by [MEP-4](../../contributing/01-Proposals/MEP4/README.md) and [MEP-16](../../contributing/01-Proposals/MEP16/README.md) in the future. +Using plain metal-stack without additional layer was not a focus in the past. Therefore firewall and role management might be premature. These will be addressed by [MEP-4](/community/MEP-4-multi-tenancy-for-the-metal-api) and [MEP-16](/community/MEP-16-metal-api-as-an-alternative-configuration-source-for-the-firewall-controller) in the future. ## Gardener diff --git a/versioned_docs/version-v0.21.11/docs/04-For Operators/03-deployment-guide.mdx b/versioned_docs/version-v0.21.11/docs/04-For Operators/03-deployment-guide.mdx index 58ddafd3..6be800cd 100644 --- a/versioned_docs/version-v0.21.11/docs/04-For Operators/03-deployment-guide.mdx +++ b/versioned_docs/version-v0.21.11/docs/04-For Operators/03-deployment-guide.mdx @@ -31,7 +31,7 @@ You can use the [mini-lab](https://github.com/metal-stack/mini-lab) as a templat The metal control plane is typically deployed in a Kubernetes cluster. 
Therefore, this document will assume that you have a Kubernetes cluster ready for getting deployed. Even though it is theoretically possible to deploy metal-stack without Kubernetes, we strongly advise you to use the described method because we believe that Kubernetes gives you a lot of benefits regarding the stability and maintainability of the application deployment. :::tip -For metal-stack it does not matter where your control plane Kubernetes cluster is located. You can of course use a cluster managed by a hyperscaler. This has the advantage of not having to setup Kubernetes by yourself and could even become beneficial in terms of fail-safe operation. However, we also describe a solution of how to setup metal-stack with a self-hosted, [Autonomous Control Plane](../../contributing/01-Proposals/MEP18/README.md) cluster. The only requirement from metal-stack is that your partitions can establish network connections to the metal control plane. If you are interested, you can find a reasoning behind this deployment decision [here](../05-Concepts/01-architecture.mdx#target-deployment-platforms). +For metal-stack it does not matter where your control plane Kubernetes cluster is located. You can of course use a cluster managed by a hyperscaler. This has the advantage of not having to setup Kubernetes by yourself and could even become beneficial in terms of fail-safe operation. However, we also describe a solution of how to setup metal-stack with a self-hosted, [Autonomous Control Plane](/community/MEP-18-autonomous-control-plane) cluster. The only requirement from metal-stack is that your partitions can establish network connections to the metal control plane. If you are interested, you can find a reasoning behind this deployment decision [here](../05-Concepts/01-architecture.mdx#target-deployment-platforms). 
::: Let's start off with a fresh folder for your deployment: diff --git a/versioned_docs/version-v0.21.11/docs/05-Concepts/01-architecture.mdx b/versioned_docs/version-v0.21.11/docs/05-Concepts/01-architecture.mdx index 709960e3..75298df9 100644 --- a/versioned_docs/version-v0.21.11/docs/05-Concepts/01-architecture.mdx +++ b/versioned_docs/version-v0.21.11/docs/05-Concepts/01-architecture.mdx @@ -152,4 +152,4 @@ Thus, for creating a partition as well as a machine or a firewall, the flags `dn In order to be fully offline resilient, make sure to check out `metal-image-cache-sync`. This component provides copies of `metal-images`, `metal-kernel` and `metal-hammer`. -This feature is related to [MEP14](../../contributing/01-Proposals/MEP14/README.md). +This feature is related to [MEP14](/community/MEP-14-independence-from-external-sources). diff --git a/versioned_docs/version-v0.21.11/docs/05-Concepts/02-user-management.md b/versioned_docs/version-v0.21.11/docs/05-Concepts/02-user-management.md index f1ee2778..ba742ee9 100644 --- a/versioned_docs/version-v0.21.11/docs/05-Concepts/02-user-management.md +++ b/versioned_docs/version-v0.21.11/docs/05-Concepts/02-user-management.md @@ -7,7 +7,7 @@ sidebar_position: 2 # User Management At the moment, metal-stack can more or less be seen as a low-level API that does not scope access based on projects and tenants. -Fine-grained access control with full multi-tenancy support is actively worked on in [MEP4](../../contributing/01-Proposals/MEP4/README.md). +Fine-grained access control with full multi-tenancy support is actively worked on in [MEP4](/community/MEP-4-multi-tenancy-for-the-metal-api). Until then projects and tenants can be created, but have no effect on access control. 
diff --git a/versioned_docs/version-v0.21.11/docs/06-For CISOs/Security/01-principles.md b/versioned_docs/version-v0.21.11/docs/06-For CISOs/Security/01-principles.md index 8e7030f5..e327ec4a 100644 --- a/versioned_docs/version-v0.21.11/docs/06-For CISOs/Security/01-principles.md +++ b/versioned_docs/version-v0.21.11/docs/06-For CISOs/Security/01-principles.md @@ -15,7 +15,7 @@ The minimal need to know principle is a security concept that restricts access t ### RBAC :::info -As of now metal-stack does not implement fine-grained Role-Based Access Control (RBAC) within the `metal-api` but this is worked on in [MEP-4](../../../contributing/01-Proposals/MEP4/README.md). +As of now metal-stack does not implement fine-grained Role-Based Access Control (RBAC) within the `metal-api` but this is worked on in [MEP-4](..//community/MEP-4-multi-tenancy-for-the-metal-api). ::: As described in our [User Management](../../05-Concepts/02-user-management.md) concept the [metal-api](https://github.com/metal-stack/metal-api) currently offers three different user roles for authorization: diff --git a/versioned_docs/version-v0.21.11/docs/06-For CISOs/Security/04-communication-matrix.md b/versioned_docs/version-v0.21.11/docs/06-For CISOs/Security/04-communication-matrix.md index 07df2607..24c1bc1d 100644 --- a/versioned_docs/version-v0.21.11/docs/06-For CISOs/Security/04-communication-matrix.md +++ b/versioned_docs/version-v0.21.11/docs/06-For CISOs/Security/04-communication-matrix.md @@ -116,7 +116,7 @@ Please note that every [networking setup](../../05-Concepts/03-Network/01-theory | VLAN | Switches, Firewalls | Layer 2 traffic segmentation. | | VXLAN | Switches, Firewalls | Encapsulate Layer 2 frames in Layer 3 packets for network virtualization. | | EVPN | Switches, Firewalls | Overlay network technology for scalable and flexible network architectures. | -| VPN | Firewalls | Management access [without open SSH ports](../../../contributing/01-Proposals/MEP9/README.md). 
| +| VPN | Firewalls | Management access [without open SSH ports](/community/MEP-9-no-open-ports-to-the-data-center). | | BGP | Multiple | Routing protocol for dynamic routing and network management. | | SSH | Management Server, Switches | Secure shell access for management and configuration. | | LLDP | Switches, Machines | Link Layer Discovery Protocol for network device discovery. | diff --git a/versioned_docs/version-v0.21.11/docs/06-For CISOs/rbac.md b/versioned_docs/version-v0.21.11/docs/06-For CISOs/rbac.md index 9a87b896..06c902bb 100644 --- a/versioned_docs/version-v0.21.11/docs/06-For CISOs/rbac.md +++ b/versioned_docs/version-v0.21.11/docs/06-For CISOs/rbac.md @@ -31,4 +31,4 @@ To ensure that internal components interact securely with the metal-api, metal-s Users can interact with the metal-api using [metalctl](https://github.com/metal-stack/metalctl), the command-line interface provided by metal-stack. Depending on the required operations, users should authenticate with the appropriate role to match their level of access. -As part of [MEP-4](../../contributing/01-Proposals/MEP4/README.md), significant work is underway to introduce more fine-grained access control mechanisms within metal-stack, enhancing the precision and flexibility of permission management. +As part of [MEP-4](/community/MEP-4-multi-tenancy-for-the-metal-api), significant work is underway to introduce more fine-grained access control mechanisms within metal-stack, enhancing the precision and flexibility of permission management. 
diff --git a/versioned_docs/version-v0.21.11/docs/06-For CISOs/remote-access.md b/versioned_docs/version-v0.21.11/docs/06-For CISOs/remote-access.md index 0b8dbb19..dc24e82f 100644 --- a/versioned_docs/version-v0.21.11/docs/06-For CISOs/remote-access.md +++ b/versioned_docs/version-v0.21.11/docs/06-For CISOs/remote-access.md @@ -6,7 +6,7 @@ title: Remote Access ## Machines and Firewalls -Remote access to machines and firewalls is essential for performing administrative tasks such as incident management, troubleshooting and sometimes for development. Standard SSH access is often insufficient for these purposes. In many cases, direct serial console access is required to fully manage the system. metal-stack follows a security-first approach by not offering direct SSH access to machines. This practice reduces the attack surface and prevents unauthorized access that could lead to system damage. Detailed information can be found in [MEP-9](../../contributing/01-Proposals/MEP9/README.md). Administrators can access machines in two primary ways. +Remote access to machines and firewalls is essential for performing administrative tasks such as incident management, troubleshooting and sometimes for development. Standard SSH access is often insufficient for these purposes. In many cases, direct serial console access is required to fully manage the system. metal-stack follows a security-first approach by not offering direct SSH access to machines. This practice reduces the attack surface and prevents unauthorized access that could lead to system damage. Detailed information can be found in [MEP-9](/community/MEP-9-no-open-ports-to-the-data-center). Administrators can access machines in two primary ways. 
**Out-of-band management via SOL** @@ -26,4 +26,4 @@ This approach uses the [`metal-console`](../08-References/Control%20Plane/metal- Both methods ensure secure and controlled access to machines without exposing them unnecessarily to the network, maintaining the integrity and safety of the infrastructure. -Connecting directly to a machine without a clear plan of action can have unintended consequences and negatively impact stability. For this reason, administrative privileges are required. This restriction ensures that only authorized personnel with the necessary expertise can perform actions that affect the underlying infrastructure. These principles will evolve with the introduction of [MEP-4](../../contributing/01-Proposals/MEP4/README.md). \ No newline at end of file +Connecting directly to a machine without a clear plan of action can have unintended consequences and negatively impact stability. For this reason, administrative privileges are required. This restriction ensures that only authorized personnel with the necessary expertise can perform actions that affect the underlying infrastructure. These principles will evolve with the introduction of [MEP-4](/community/MEP-4-multi-tenancy-for-the-metal-api). 
\ No newline at end of file diff --git a/versioned_docs/version-v0.21.8/contributing/01-Proposals/MEP1/Distributed-API-Working.png b/versioned_docs/version-v0.21.8/contributing/01-Proposals/MEP1/Distributed-API-Working.png deleted file mode 100644 index 899e223d..00000000 Binary files a/versioned_docs/version-v0.21.8/contributing/01-Proposals/MEP1/Distributed-API-Working.png and /dev/null differ diff --git a/versioned_docs/version-v0.21.8/contributing/01-Proposals/MEP1/Distributed-API.png b/versioned_docs/version-v0.21.8/contributing/01-Proposals/MEP1/Distributed-API.png deleted file mode 100644 index 688c7c2e..00000000 Binary files a/versioned_docs/version-v0.21.8/contributing/01-Proposals/MEP1/Distributed-API.png and /dev/null differ diff --git a/versioned_docs/version-v0.21.8/contributing/01-Proposals/MEP1/Distributed-Deployment.png b/versioned_docs/version-v0.21.8/contributing/01-Proposals/MEP1/Distributed-Deployment.png deleted file mode 100644 index 8bba51b8..00000000 Binary files a/versioned_docs/version-v0.21.8/contributing/01-Proposals/MEP1/Distributed-Deployment.png and /dev/null differ diff --git a/versioned_docs/version-v0.21.8/contributing/01-Proposals/MEP1/Distributed.drawio b/versioned_docs/version-v0.21.8/contributing/01-Proposals/MEP1/Distributed.drawio deleted file mode 100644 index f7c6fe79..00000000 --- a/versioned_docs/version-v0.21.8/contributing/01-Proposals/MEP1/Distributed.drawio +++ /dev/null @@ -1 +0,0 @@ 
-7V3bcts2EP0aP8pDgPfH2ImbziRTt+m0zVMHpmCJMUUoFG1L/fqC4kUkAEoULwDkRC8WIBIkF2d3z+6C8JV5u9r+kqD18jOZ4+gKGvPtlfn+CtKPY9M/Wc8u7/FtM+9YJOE87wKHji/hf7joNIre53CON40DU0KiNFw3OwMSxzhIG30oSchr87BHEjWvukYLzHV8CVDE9/4dztNl3uvZxqH/Iw4Xy/LKwCh+eUDB0yIhz3FxvStoPu4/+c8rVI5VHL9Zojl5rXWZH67M24SQNP+22t7iKJNtKbb8vLuWX6v7TnCcdjkhWVoPJP5zF3/w/trOvG+vv70+z4rJe0HRMy4fw4noeDePhA5L5YmC/Afn+3N2pzcfcfSC0zBAh67s4dJd1DwuO3+22U/3O3oAgOtt/Qxnkf29R0kapiGJy6vS288vnP9eSK4aHFIhrrOvz6voLkEr+vXmdRmm+Ms6v89XClLat0xXEW0B+nU/QziTgUFb1SRkjYCswqD4HqEHHN1UU3pLIpLQn2ISZ9fYpAl5qvABise7Q6swymD/F07mKEZFd4FxkA2LonAR00ZAJwkn1RO94CTF29aJBBU8qNphssJpsqOHFCfMbLeAVKFywC0x+VpDcNm3rKHX9opOVGjNohr9gBz6pQDPGUACkEPSBif0OWkfHGEayyfzMqlWj2QaI4kUcCK1eJGajkikYASRrrboX/K79Qh+vYe77+Ef8RMBM4+TaECCp4SgYClDoJz0BDJuF6jFCNQ0O8oTGiPI055D4NsPc8+Yo0cAwMy0OJHhOfUDRZMk6ZIsSIyiD4dexnTUZDdHm+W+H3SwHNTE3YXZne5HwfH8Xea1souucZz3NH+v24+OZqZ1xrKHPDpfCY5QGr40naFI9PtT6a2jXe2ANQnjdFMb+T7rOMDAAoxaGdBvOqkzT6Bf8nsQn256LadXd7whz0mAi9MYQFVi6a+zrsCfDsTd4ZhPhKwL0H3DaborEICeU9LE5x50JcyCCG02mZ9rYBEcQ00upCOPWdAGOt4Cp0eOc8ZG4SCDypOdmkE1ADdTpVGlPGFNtTkTUuXQI/yYNTfUvobx4tO+9d50xrKeVhPHlsDBAyiwns5Uzsg5Krt2Dy9fdpUMVMgOuCh4QLZredg2fedhBji5MQT7bObckZJzBNt498OQ7HKm3Wm4jW0zXsbmEWaL6LdlTqWegLdesv1MC+Hp72T8SSgMxxkoN2yhquUYuZubjDP4nImgI6JohtahRmbVYsxqlcBR5pLKkPMtYb4U6uSgh23xmSTQlw9aRz3apJmNT5FGsIeedrA3j1ExzfMC0G6BnZS0gFieloCivcGYDXQN2oBeURu4mLCNtRXqozZwMWEbSy80kJ0ol6ModLv5GbqNgjIuza9B5OYp9zbjs1hJoRub7qVs4tqofWBzwKkp7UUEcmx6TD2hrQrkb0gDJqq/8PUSnk8r1IAKjXoHdWx2XQMVgAKuwerEoXLIhAd8dw3neBum/2R4vva8ovl137SL1vttgfZ9Y1dr3OMkpM+X+eWiNkmfNR8LOGW7NljWPIy2b+3qLXa8TurVllE/Hca4HVWw4fv5SS/7hmZc2KyxYzNoailNCkYymJHY2HhqNb/kDAS3MgFUpFBdDgL+IDkI2DUHAfXKQcCLyUFwpWMABSuZJHu3i8lCcMVjHaR3Mg8hbY3mzxLyVCVkv3Miwp0MZ9omIg4UtuSsX2vkVsxfB/goVXXnAxGRxeMuImHBEzZDoCxybXKZDdRPV/rj3pSUsuBKz9Jxb15GmoKiTD/g84mKywn9gK9f5GfysbRy0zJF5FcuwD8Z+Zn22GZo2PzwkbmmkV/1opk+oYt5PGzWKPCzWFOrgfD4qPln/fnC4z4AtAv7LNEKddYB/Zilh6PpmNOOrGsKU1H9AVg+g6neBQhQJR0lMXhLVIGIN4QCVhuXwr9TKqTvIm2fzKdYGr6f1vqiZKXxdkPhT6h7f822Or/VZnXU7IE
qa9sN/Hjsy9tTKxmfHvoJlrPB4koCi8nSf8Pyr53Dx5WLHZ75o3V4nacX6ewFT9ch0chWM6badQSW2pVpqUvd11DXpGbj7dELwS3qw754LjspafPhnobJeMC9YK88Jeko8Uo1j+Oe5XL0UzGna0RTsu7JKwSA8WU+u8fKxLs4OKauxrd1lk/zEElvFtpmv7k7OdCe0Hjm4WNLtc8Onwiu7POMzn2WW9DGTHM1U19Q6QCmVDMtWgS0D9mv12WmcfZwiiHS50+bWjOCtNglU1WgVRMWFMXpk70U4vBxOi8spERYM2hoJy1tV64McMqSVqFwhQ/ZvNfhsww6FuNN/RahlHekcxKUyxSrz4G6auOFqtGtejEtKZS31o0tfPVlhTPTsBlEWdkrTwda6HQyX+fuZMc/QQHa9hvltrLzGmc0t7IbbQM6vjKSJd6Uswb2VU31rMG9JELP5l3U83lXSYJSiRmdPPdosablaKDb1VTyywfdPgH0uZaSf5rjhpJbBi3HTvLhaNNOqglF2bWxUs2keGNnTk7Vvs7ti9+02dfZZsEld19noUQhJ1EVlvSsFZ+MBTwZ0gqfu2AmdVIqPM4aaGQHTc7RV1tHXu45ENoMvxRuZjJZRNo+c5KWew4SHrcBgHLRqdkEY0TtFhSRhMf5KrUb8gost1bYY3WK1NnJNxdUNT181nuaEvi4hhf4ULn1UJvTOrcG3r++PYoy+BehDNLy4sO0wWTLtOq1QZMdPUdELOjKnZUittxtUpkVgr2sEKjZoNKSyTBDnSd1aEDUK43DRheGb9cBcvL4MhvZdrx7/PjBWZ+jIq9ZBmo4E7KlkqHgNUH+2tGpF1rVcw4otfwoliX//6mUqH/FJdzuZKI3cxlT/btycqX5EMGmlhdKNaESrtl5lod67l5GnjPC7P+QZIuaFjx+xkRmmw8ML8Jss2km9Ua73GjuMhIgvWz7iMor2q7uCEDHXzbB/vMJg90zcrzFWWIB6OHjVUwpHDqlw/SUf39Kx1QYYMtr6oN/qDoI7ctPFJm4zpnhiUycrdrECZLOGubZ9FO0Mu/3hnxD18SwWtdwZNe+KdatjVws8UTAtaQCF7414JYb2p0mNbZK5Ir23dMXuRy38UT/JmAk5NJmQrLtlw6uLUHr5Wcyx9kR/wM= \ No newline at end of file diff --git a/versioned_docs/version-v0.21.8/contributing/01-Proposals/MEP1/Distributed.png b/versioned_docs/version-v0.21.8/contributing/01-Proposals/MEP1/Distributed.png deleted file mode 100644 index d96ca216..00000000 Binary files a/versioned_docs/version-v0.21.8/contributing/01-Proposals/MEP1/Distributed.png and /dev/null differ diff --git a/versioned_docs/version-v0.21.8/contributing/01-Proposals/MEP1/README.md b/versioned_docs/version-v0.21.8/contributing/01-Proposals/MEP1/README.md deleted file mode 100644 index 0fd4bb63..00000000 --- a/versioned_docs/version-v0.21.8/contributing/01-Proposals/MEP1/README.md +++ /dev/null @@ -1,141 +0,0 @@ ---- -slug: /MEP-1-distributed-metal-control-plane -title: MEP-1 -sidebar_position: 1 ---- - -# Distributed Metal Control Plane - -This enhancement 
proposal was replaced by [MEP18](../MEP18/README.md). - -## Problem Statement - -We face the situation that we argue for running bare metal on-premises because this way the customers can control where and how their software and data are processed and stored. -On the other hand, we have currently decided that our metal-api control plane components run on a kubernetes cluster (in our case on a cluster provided by one of the available hyperscalers). - -Running the control plane on Kubernetes has the following benefits: - -- Ease of deployment -- Get most, if not all, of the required infrastructure services like (probably incomplete): - - IPs - - DNS - - L7-Loadbalancing - - Storage - - S3 Backup - - High Availability - -Using a kubernetes as a service offering from one of the hyperscalers, enables us to focus on using kubernetes instead of maintaining it as well. - -## Goal - -It would be much saner if metal-stack has no, or only minimal dependencies to external services. Imagine a metal-stack deployment in a plant, it would be optimal if we only have to deliver a single rack with servers and networking gear installed and wired, plug that rack to the power supply and a internet uplink and its ready to go. - -Have a second plant which you want to be part of all your plants? Just tell both that they are part of something bigger and metal-api knows of two partitions. - -## Possible Solutions - -We can think of two different solutions to this vision: - -1. Keep the central control plane approach and require some sort of kubernetes deployment accessible from the internet. This has the downside that the user must, provide a managed kubernetes deployment in his own datacenter or uses a hyperscaler. Still not optimal. -1. Install the metal-api and all its dependencies in every partition, replicate or shard the databases to every connected partition, make them know each other. 
Connect the partitions over the internet with some sort of vpn to make the services visible to each other. - -As we can see, the first approach does not really address the problem, therefore i will describe solution #2 in more details. - -## Central/Current setup - -### Stateful services - -Every distributed system suffer from handling state in a scalable, fast and correct way. To start how to cope with the state, we first must identify which state can be seen as partition local only and which state must be synchronous for read, and synchronous for writes across partitions. - -Affected states: - -- masterdata: e.g. tenant and project must be present in every partition, but these are entities which are read often but updates are rare. A write can therefore be visible with a decent delay in a distinct partition with no consequences. -- ipam: the prefixes and ip´s allocated from machines. These entities are also read often and rare updates. But we must differentiate between dirty reads for different types. A machine network is partition local, ips acquired from such a network must by synchronous in the same partition. Ips acquired from global networks such as internet must by synchronous for all partitions, as otherwise a internet ip could be acquired twice. -- vrf ids: they must only be unique in one partition -- image and size configurations: read often, written seldom, so no high requirements on the storage of these entities. -- images: os images are already replicated from a central s3 storage to a per partition s3 service. metal-hammer kernel and initrd are small and pull always from the central s3, can be done similar to os images. -- machine and machine allocation: must be only synchronous in the partition -- switch: must be only synchronous in the partition -- nsq messages: do not need to cross partition boundaries. No need to keep the messages persistent, even the opposite is true, we don't want to have the messages persist for a longer period. 
- -Now we can see that the most critical state to held and synchronize are the IPAM data, because these entities must be guaranteed to be synchronously updated, while being updated frequently. - -Datastores: - -We use three different types of datastores to persist the states of the metal application. - -- rethinkdb is the main datastore for almost all entities managed by metal-api -- postgresql is used for masterdata and ipam data. -- nsq uses disk and memory tho store the messages. - -### Stateless services - -These are the easy part, all of our services which are stateless can be scaled up and down without any impact on functionality. Even the stateful services like masterdata and metal-api rely fully on the underlying datastore and can therefore also be scaled up and down to meet scalability requirements. - -Albeit, most of these services need to be placed behind a loadbalancer which does the L4/L7 balancing across the started/available replicas of the service for the clients talking to it. This is actually provided by kubernetes with either service type loadbalancer or type clusterip. - -One exception is the `metal-console` service which must have the partition in it´s dns name now, because there is no direct network connectivity between the management networks of the partitions. See "Network Setup) - -## Distributed setup - -### State - -In order to replicate certain data which must be available across all partitions we can use on of the existing open source databases which enable such kind of setup. There are a few available out there, the following incomplete list will highlight the pro´s and cons of each. - -- RethinkDB - - We already store most of our data in RethinkDB and it gives already the ability to synchronize the data in a distributed manner with different guarantees for consistency and latency. This is described here: [Scaling, Sharding and replication](https://rethinkdb.com/docs/sharding-and-replication/). 
But because rethinkdb has a rough history and unsure future with the last release took more than a year, we in the team already thought that we eventually must move away from rethinkdb in the future. - -- Postgresql - - Postgres does not have a multi datacenter with replication in both directions, it just can make the remote instance store the same data. - -- CockroachDB - - Is a Postgresql compatible database engine on the wire. CockroachDB gives you both, ACID and geo replication with writes allowed from all connected members. It is even possible to configure [Follow the Workload](https://www.cockroachlabs.com/docs/stable/topology-follow-the-workload) and [Geo Partitioning and Replication](https://www.cockroachlabs.com/docs/v19.2/topology-geo-partitioned-replicas). - -If we migrate all metal-api entities to be stored the same way we store masterdata, we could use cockroachdb to store all metal entities in one ore more databases spread across all partitions and still ensure consistency and high availability. - -A simple setup how this would look like is shown here. - -![Simple CockroachDB setup](Distributed.png) - -go-ipam was modified in a example PR here: [PR 17](https://github.com/metal-stack/go-ipam/pull/17) - -### API Access - -In order to make the metal-api accessible for api users like `cloud-api` or `metalctl` as easy at it is today, some effort has to be taken. One possible approach would be to use a external loadbalancer which spread the requests evenly to all metal-api endpoints in all partitions. Because all data are accessible from all partitions, a api request going to partition A with a request to create a machine in partition B, will still work. If on the other hand partition B is not in a connected state because the interconnection between both partitions is broken, then of course the request will fail. 
- -**IMPORTANT** -The NSQ Message to inform `metal-core` must end in the correct partition - -To provide such a external loadbalancer we have several opportunities: - -- Cloudflare or comparable CDN service. -- BGP Anycast from every partition - -Another setup would place a small gateway behind the metal-api address, which forwards to the metal-api in the partition where the request must be executed. This gateway, `metal-api-router` must inspect the payload, extract the desired partition, and forward the request without any modifications to the metal-api endpoint in this partition. This can be done for all requests, or if we want to optimize, only for write accesses. - -## Network setup - -In order to have the impact to the overall security concept as minimal as possible i would not modify the current network setup. The only modifications which has to be made are: - -- Allow https ingress traffic to all metal-api instances. -- Allow ssh ingress traffic to all metal-console instances. -- Allow CockroachDB Replication between all partitions. -- No NSQ traffic from outside required anymore, except we cant solve the topic above. - -A simple setup how this would look like is shown here, this does not work though because of the forementioned NSQ issue. - -![API and Console Access](Distributed-API.png) - -Therefore we need the `metal-api-router`: - -![Working API and Console Access](Distributed-API-Working.png) - -## Deployment - -The deployment of our components will substantially differ in a partition compared to a the deployment we have actually. Deploying it in kubernetes in the partition would be very difficult to achieve because we have no sane way to deploy kubernetes on physical machines without a underlying API. -I would therefore suggest to deploy our components in the same way we do that for the services running on the management server. Use systemd to start docker containers. 
- -![Deployment](Distributed-Deployment.png) diff --git a/versioned_docs/version-v0.21.8/contributing/01-Proposals/MEP10/README.md b/versioned_docs/version-v0.21.8/contributing/01-Proposals/MEP10/README.md deleted file mode 100644 index 6811cdc0..00000000 --- a/versioned_docs/version-v0.21.8/contributing/01-Proposals/MEP10/README.md +++ /dev/null @@ -1,197 +0,0 @@ ---- -slug: /MEP-10-sonic-support -title: MEP-10 -sidebar_position: 10 ---- - -# SONiC Support - -As writing this proposal, metal-stack only supports Cumulus on Broadcom ASICs. Unfortunately, after the acquisition of -Cumulus Networks by Nvidia, Broadcom decided to cut its relationship with Cumulus, and therefore Cumulus 4.2 is the last -version that supports Broadcom ASICs. Since trashing the existing hardware is not a solution, adding support for a -different network operating system is necessary. - -One of the remaining big players is [SONiC](https://sonic-net.github.io/SONiC/), which Microsoft created to scale the -network of Azure. It's an open-source project and is now part of the [Linux Foundation](https://www.linuxfoundation.org/press/press-release/software-for-open-networking-in-the-cloud-sonic-moves-to-the-linux-foundation). - -For a general introduction to SONiC, please follow the [Architecture](https://github.com/sonic-net/SONiC/wiki/Architecture) official -documentation. - -## ConfigDB - -On a cold start, the content of `/etc/sonic/config_db.json` will be loaded into the Redis database `CONFIG_DB`, and both -contain the switch's configuration except the BGP unnumbered configuration, which still has to be configured directly by -the frr configuration files. The SONiC community is working to remove this exception, but no release date is known. - -## BGP Configuration - -Frr runs inside a container, and a shell script configured it on the container startup. 
For BGP unnumbered, we must set -the configuration variable `docker_routing_config_mode` to `split` to prevent SONiC from overwriting our configuration -files created by `metal-core`. But by using the split mode, the integrated configuration mode of frr is deactivated, and -we have to write our BGP configuration to the daemon-specific files `bgp.conf`, `staticd.conf`, and `zebra.conf` instead -to `frr.conf`. - -```bash -elif [ "$CONFIG_TYPE" == "split" ]; then - echo "no service integrated-vtysh-config" > /etc/frr/vtysh.conf - rm -f /etc/frr/frr.conf -``` - -Reference: [docker-init](https://github.com/sonic-net/sonic-buildimage/blob/202205/dockers/docker-fpm-frr/docker_init.sh#L69) - -Adding support for the integrated configuration mode, we must at least adjust the startup shell script and the supervisor configuration: - -```bash -{% if DEVICE_METADATA.localhost.docker_routing_config_mode is defined and DEVICE_METADATA.localhost.docker_routing_config_mode == "unified" %} -[program:vtysh_b] -command=/usr/bin/vtysh -b -``` - -Reference: [supervisord.conf](https://github.com/sonic-net/sonic-buildimage/blob/202205/dockers/docker-fpm-frr/frr/supervisord/supervisord.conf.j2#L157) - -## Non-BGP Configuration - -For the Non-BGP configuration we have to write it into the Redis database directly or via one of the following interfaces: - -- `config replace ` -- the Mgmt Framework -- the SONiC restapi - -Directly writing into the Redis database isn't a stable interface, and we must determine the create, delete, and update -operations on our own. The last point is also valid for the Mgmt Framework and the SONiC restapi. Furthermore, the -Mgmt Framework doesn't start anymore for several months, and a [potential fix](https://github.com/sonic-net/sonic-buildimage/pull/10893) -is still not merged. And the SONiC restapi isn't enabled by default, and we must build and maintain our own SONiC images. 
- -Using `config replace` would reduce the complexity in the `metal-core` codebase because we don't have to determine the -actual changes between the running and the desired configuration. The approach's drawbacks are using a version of SONiC -that contains the PR [Yang support for VXLAN](https://github.com/sonic-net/sonic-buildimage/pull/7294), and we must provide -the whole new startup configuration to prevent unwanted deconfiguration. - -### Configure Loopback interface and activate VXLAN - -```json -{ - "LOOPBACK_INTERFACE": { - "Loopback0": {}, - "Loopback0|": {} - }, - "VXLAN_TUNNEL": { - "vtep": { - "src_ip": "" - } - } -} -``` - -#### Configure MTU - -```json -{ - "PORT": { - "Ethernet0": { - "mtu": "9000" - } - } -} -``` - -#### Configure PXE Vlan - -```json -{ - "VLAN": { - "Vlan4000": { - "vlanid": "4000" - } - }, - "VLAN_INTERFACE": { - "Vlan4000": {}, - "Vlan4000|": {} - }, - "VLAN_MEMBER": { - "Vlan4000|": { - "tagging_mode": "untagged" - } - }, - "VXLAN_TUNNEL_MAP": { - "vtep|map_104000_Vlan4000": { - "vlan": "Vlan4000", - "vni": "104000" - } - } -} -``` - -#### Configure VRF - -```json -{ - "INTERFACE": { - "Ethernet0": { - "vrf_name": "vrf104001" - } - }, - "VLAN": { - "Vlan4001": { - "vlanid": "4001" - } - }, - "VLAN_INTERFACE": { - "Vlan4001": { - "vrf_name": "vrf104001" - } - }, - "VRF": { - "vrf104001": { - "vni": "104001" - } - }, - "VXLAN_TUNNEL_MAP": { - "vtep|map_104001_Vlan4001": { - "vlan": "Vlan4001", - "vni": "104001" - } - } -} -``` - -## DHCP Relay - -The DHCP relay container only starts if `DEVICE_METADATA.localhost.type` is equal to `ToRRouter`. - -## LLDP - -SONiC always uses the local port subtype for LLDP and sets it to some freely configurable alias field of the interface. - -```python -# Get the port alias. If None or empty string, use port name instead -port_alias = port_table_dict.get("alias") -if not port_alias: - self.log_info("Unable to retrieve port alias for port '{}'. 
Using port name instead.".format(port_name)) - port_alias = port_name - -lldpcli_cmd = "lldpcli configure ports {0} lldp portidsubtype local {1}".format(port_name, port_alias) -``` - -Reference: [lldpmgr](https://github.com/sonic-net/sonic-buildimage/blob/202205/dockers/docker-lldp/lldpmgrd#L153) - -## Mgmt Interface - -The mgmt interface is `eth0`. To configure a static IP address and activate the Mgmt VRF, use: - -```json -{ - "MGMT_INTERFACE": { - "eth0|": { - "gwaddr": "" - } - }, - "MGMT_VRF_CONFIG": { - "vrf_global": { - "mgmtVrfEnabled": "true" - } - } -} -``` - -[IP forwarding is deactivated on `eth0`](https://github.com/sonic-net/sonic-buildimage/blob/202205/files/image_config/sysctl/sysctl-net.conf#L7), and no IP Masquerade is configured. diff --git a/versioned_docs/version-v0.21.8/contributing/01-Proposals/MEP11/README.md b/versioned_docs/version-v0.21.8/contributing/01-Proposals/MEP11/README.md deleted file mode 100644 index 87f48a10..00000000 --- a/versioned_docs/version-v0.21.8/contributing/01-Proposals/MEP11/README.md +++ /dev/null @@ -1,78 +0,0 @@ ---- -slug: /MEP-11-auditing-of-metal-stack-resources -title: MEP-11 -sidebar_position: 11 ---- - -# Auditing of metal-stack resources - -Currently no logs of the ownership of resources like machines, networks, ips and volumes are generated or kept. Though due to legal requirements data centers are required to keep track of this ownership over time to prevent liability issues when opening the platform for external users. - -In this proposal we want to introduce a flexible and low-maintenance approach for auditing on top of [Meilisearch](https://www.meilisearch.com/). - -## Overview - -In general our auditing logs will be collected by a request interceptor or middleware. Every request and response will be processed and eventually logged to Meilisearch. -Meilisearch will be configured to regularly create chunks of the auditing logs. 
These finished chunks will be backed up to a S3 compatible storage with a read-only option enabled. - -Of course sensitive data like session keys or passwords will be redacted before logging. We want to track relevant requests and responses. If auditing the request fails, the request itself will be aborted and will not be processed further. The requests and responses that will be audited will be annotated with a correlation id. - -Transferring the meilisearch auditing data chunks to the S3 compatible storage will be done by a sidecar cronjob that is executed periodically. -To avoid data manipulation the S3 compatible storage will be configured to be read-only. - -## Whitelisting - -To reduce the amount of unnecessary logs we want to introduce a whitelist of resources and operations on those that should be logged. -Other requests will be passed directly to the next middleware or web service without any further processing. - -As we are only interested in mutating endpoints, we ignore all `GET` requests. -The whitelist includes all `POST`, `PUT`, `PATCH` and `DELETE` endpoints of the HTTP middleware except for the following (non-manipulating) route suffixes: - -- `/find` -- `/notify` -- `/try` and `/match` -- `/capacity` -- `/from-hardware` - -Regarding GRPC audit trails, they are not so interesting because only internal clients are using this API. However, we can log the trails of the `Boot` service, which can be interesting to revise the machine lifecycle. - -## Chunking in Meilisearch - -We want our data to be chunked in Meilisearch. To accomplish this, we rotate the index identifier on a scheduled basis. The index identifiers will be derived from the current date and time. - -To keep things simple, we only support hourly, daily and monthly rotation. The eventually prefixed index names will only include relevant parts of date and time like `2021-01`, `2021-01-01` or `2021-01-01_13`. 
- -The metal-api will only write to the current index and switches to the new index on rotation. The metal-api will never read or update data in any indices. - -## Moving chunks to S3 compatible storage - -As Meilisearch will be filled with data over time, we want to move completed chunks to a S3 compatible storage. This will be done by a sidecar cronjob that is executed periodically. Note that the periods of the index rotation and the cronjob execution don't have to match. - -When the backup process gets started, it initiates a [Meilisearch dump](https://www.meilisearch.com/docs/learn/advanced/dumps) of the whole database across all indices. Once the returned task is finished, the dump must be copied from a Meilisearch volume to the S3 compatible storage. After a successful copy, the dump can be deleted. - -Now we want to remove all indices from Meilisearch, except the most recent one. For this, we [get all indices](https://www.meilisearch.com/docs/reference/api/indexes#list-all-indexes), sort them and [delete each index](https://www.meilisearch.com/docs/reference/api/indexes#delete-an-index) except the most recent one to avoid data loss. - -For the actual implementation, we can build upon [backup-restore-sidecar](https://github.com/metal-stack/backup-restore-sidecar). But due to the index rotation and the fact, that older indices need to be deleted, this probably does not fit into the mentioned sidecar. - -## S3 compatible storage - -The dumps of chunks should automatically deleted after a certain amount of time, once we are either no longer allowed or required to keep them. -The default retention time will be 6 months. Ideally already uploaded chunks should be read-only to prevent data manipulation. - -A candidate for the S3 compatible storage is Google Cloud Storage, which allows to configure automatic expiration of objects through a [lifecycle rule](https://cloud.google.com/storage/docs/managing-lifecycles?hl=en#storage-set-lifecycle-config-go). 
- -## Affected components - -- metal-api grpc server needs an auditing interceptor -- metal-api web server needs an auditing filter chain / middleware -- metal-api needs new command line arguments to configure the auditing -- mini-lab needs a Meilisearch instance -- mini-lab may need a local S3 compatible storage -- we need a sidecar to implement the backup to S3 compatible storage -- Consider auditing of volume allocations and freeings outside of metal-stack - -## Alternatives considered - -Instead of using Meilisearch we investigated using an immutable database like [immudb](https://immudb.io/). But immudb does not support chunking of data and due to its immutable nature, we will never be able to free up space of expired data. Even if we are legally allowed or required to delete data, we will not be able to do so with immudb. - -In another variant of the Meilisearch approach the metal-api would also be responsible for copying chunks to the S3 compatible storage and deleting old indices. But separating the concerns allows completely different implementations for every deployment stage. diff --git a/versioned_docs/version-v0.21.8/contributing/01-Proposals/MEP12/README.md b/versioned_docs/version-v0.21.8/contributing/01-Proposals/MEP12/README.md deleted file mode 100644 index 65532c57..00000000 --- a/versioned_docs/version-v0.21.8/contributing/01-Proposals/MEP12/README.md +++ /dev/null @@ -1,34 +0,0 @@ ---- -slug: /MEP-12-rack-spreading -title: MEP-12 -sidebar_position: 12 ---- - -# Rack Spreading - -Currently, when creating a machine through the metal-api, the machine is placed randomly inside a partition. This algorithm does not consider spreading machines across different racks and different chassis. This may lead to the situation that a group of machines (that for example form a cluster) can end up being placed in the same rack and the same chassis. 
- -Spreading a group of machines across racks can enhance availability for scenarios like a rack losing power or a chassis meltdown. - -So, instead of just randomly deciding the placement of a machine candidate, we want to propose a placement strategy that attempts to spread machine candidates across the racks inside a partition. - -Furthermore a followup improvement to guarantee that machines are really spread across multiple racks, even if multiple machines are ordered in parallel, was implemented with [PR490](https://github.com/metal-stack/metal-api/pull/490). - -## Placement Strategy - -Machines in the project are spread across all available racks evenly within a partition (best effort). For this, an additional request to the datastore has to be made in order to find allocated machines within the project in the partition. - -The algorithm will then figure out the least occupied racks and elect a machine candidate randomly from those racks. - -The user can optionally pass placement tags which will be considered for spreading the machines as well (this will for example allow spreading by a cluster id tag inside the same project). - -## API - -```golang -// service/v1/machine.go - -type MachineAllocation struct { - // existing fields are omitted for readability - PlacementTags []string `json:"placement_tags" description:"by default machines are spread across the racks inside a partition for every project. 
if placement tags are provided, the machine candidate has an additional anti-affinity to other machines having the same tags"` -} -``` diff --git a/versioned_docs/version-v0.21.8/contributing/01-Proposals/MEP13/README.md b/versioned_docs/version-v0.21.8/contributing/01-Proposals/MEP13/README.md deleted file mode 100644 index 2dde20f5..00000000 --- a/versioned_docs/version-v0.21.8/contributing/01-Proposals/MEP13/README.md +++ /dev/null @@ -1,111 +0,0 @@ ---- -slug: /MEP-13-dual-stack-support -title: MEP-13 -sidebar_position: 13 ---- - -# Dual-stack Support - -dual-stack support is required to be able to create Kubernetes clusters with either IPv6 single-stack or dual-stack enabled. -With the inherent scarcity of IPv4 addresses, the need to be able to use IPv6 has increased. - -Full IPv6 dual-stack support was added to Kubernetes with v1.23 as stable. - -Gardeners have had full IPv6 dual-stack support since `v1.109`. - -metal-stack manages CIDRs and IP addresses with the [go-ipam](https://github.com/metal-stack/go-ipam) library, which already got full IPv6 support in 2021 (see [https://metal-stack.io/blog/2021/02/ipv6-part1](https://metal-stack.io/blog/2021/02/ipv6-part1)). -But this was only the foundation, more work needs to be done to get full IPv6 support for all aspects managed by metal-stack.io. - -## General Decisions - -For the general decision we do not look at the isolated clusters feature for now as this would make the solution even more complex and we want to introduce IPv6 in smaller steps to the users. - -### Networks - -Currently, metal-stack organizes CIDRs / prefixes into a `network' resource in the metal-api. A network can consist of multiple CIDRs from the same address family. For example, if an operator wants to provide Internet connectivity to provisioned machines, they can start with small network CIDRs. The number of managed network prefixes can then be expanded as needed over time. 
- -With dual-stack we have to choose between two options: Network per address family or networks with both address families. These options are described in the next section. - -#### Network per Address Family - -This means that we allow networks with CIDRs from one address family only, one for IPv4 and one for IPv6. - -The machine creation process will not change if the machine only needs to be either IPv4 or IPv6 addressable. -But if on the other side, the machine needs to be able to connect to both address families, the machine creation needs to specify two networks, one for IPv4 and one for IPv6. -Also there will be 2 distinct VRF IDs for every network with a different address family. - -#### Network with both Address Families - -Make a network dual address family capable, meaning that you can add multiple cidrs from both address families to a network. -Then the machine creation will remain the same for single-stack and dual-stack cases, but the ip address allocation will need to specify the address family from which to allocate an ip address when the network is dual-stack. -This does not break the existing API, but allows existing extensions to easily add dual-stack support. -To avoid additional checking of which address families are available on this network during an ip allocation call, we could store the address families in the network. - -#### Decision - -The decision was made to go with having both address families in a single network entity because we think this is the most flexible way to support dual-stack machines and Kubernetes clusters as well as single-stack with the least amount of modifications on the networking side. 
- -### Examples - -To illustrate the usage we start by creating a tenant super network which has both address families: - -```yaml ---- -id: tenant-super-network-mini-lab -name: Project Super Network -description: Super network of all project networks -partitionid: mini-lab -prefixes: - - 10.0.0.0/16 - - 2001:db8:0:10::/64 -defaultchildprefixlength: - IPv4: 22 - IPv6: 96 -privatesuper: true -``` - -In order to create this network, we simply call: - -```bash -metalctl network create -f tenant-super.yaml -``` - -This is usually done during the initial setup of the environment. - -Next step is to allocate a tenant network where the machines of a project can be placed: - -```bash -metalctl network allocate --partition mini-lab --project 4b9b17c4-2d7c-4190-ae95-dda44e430fa6 --name my-node-network -``` - -This leads to the following network allocation: - -```yaml -id: 2d2c0350-3f66-4597-ae97-ef6797232212 -name: my-node-network -parentnetworkid: tenant-super-network-mini-lab -partitionid: mini-lab -prefixes: - - 10.0.0.0/22 - - 2001:db8:0:10::/96 -projectid: 4b9b17c4-2d7c-4190-ae95-dda44e430fa6 -vrf: 20 -consumption: - ipv4: - available_ips: 1024 - available_prefixes: 256 - used_ips: 2 - used_prefixes: 0 - ipv6: - available_ips: 2147483647 - available_prefixes: 1073741824 - used_ips: 1 - used_prefixes: 0 -privatesuper: false -``` - -Users can then create IP addresses from these child networks. By default, they retrieve an IPv4 address unless the super network only consists of IPv6 prefixes. In the latter case the users acquire an IPv6 address. 
- -```bash -metalctl network ip create --network 2d2c0350-3f66-4597-ae97-ef6797232212 --project 4b9b17c4-2d7c-4190-ae95-dda44e430fa6 -``` diff --git a/versioned_docs/version-v0.21.8/contributing/01-Proposals/MEP14/README.md b/versioned_docs/version-v0.21.8/contributing/01-Proposals/MEP14/README.md deleted file mode 100644 index 47c06434..00000000 --- a/versioned_docs/version-v0.21.8/contributing/01-Proposals/MEP14/README.md +++ /dev/null @@ -1,36 +0,0 @@ ---- -slug: /MEP-14-independence-from-external-sources -title: MEP-14 -sidebar_position: 14 ---- - -# Independence from external sources - -In certain situations some customers may need to operate and create machines without making use of external services like DNS or NTP through the internet. To make this possible, all metal-stack components reaching external services need to be configurable with custom endpoints. - -So far, the following components have been identified as requiring changes: - -- pixiecore -- metal-hammer -- metal-images - -More components are likely to be added to the list during processing. -For DNS and NTP servers it should be possible to provide default values within a partition. They can either be inherited from machines and firewalls or overwritten with own ones. - -## pixiecore - -A NTP server endpoint need to be configured on the pixiecore. This can be achieved by providing it through environment variables on start up. - -## metal-hammer - -If using a self-deployed NTP server, also the metal-hammer need to be configured with it. For backward compatibility, default values from `pool.ntp.org` and `time.google.com` are used. - -## metal-images - -Configurations for the `metal-images` are different for machines and firewalls. - -## metalctl - -In order to pass DNS and NTP servers to partitions and machines while creating them, the flags `dnsservers` and `ntpservers` need to be added. 
- -The implementation of this MEP will make metal-stack possible to create and maintain machines without requiring an internet connection. diff --git a/versioned_docs/version-v0.21.8/contributing/01-Proposals/MEP16/README.md b/versioned_docs/version-v0.21.8/contributing/01-Proposals/MEP16/README.md deleted file mode 100644 index 205670ab..00000000 --- a/versioned_docs/version-v0.21.8/contributing/01-Proposals/MEP16/README.md +++ /dev/null @@ -1,318 +0,0 @@ ---- -slug: /MEP-16-metal-api-as-an-alternative-configuration-source-for-the-firewall-controller -title: MEP-16 -sidebar_position: 16 ---- - -# metal-api as an Alternative Configuration Source for the firewall-controller - -In the current situation, a firewall as provisioned by metal-stack is a fully immutable entity. Any modifications on the firewall like changing the firewall ruleset must be done _somehow_ by the user – the metal-api and hence metal-stack is not aware of its current state. - -As part of our [integration with the Gardener project](https://docs.metal-stack.io/stable/overview/kubernetes/#Gardener) we offer a solution called the [firewall-controller](https://github.com/metal-stack/firewall-controller), which is part of our [firewall OS images](https://github.com/metal-stack/metal-images/blob/6318a624861b18a559a9d37299bca5f760eef524/firewall/Dockerfile#L57-L58) and addresses shortcomings of the firewall resource's immutability, which would otherwise be completely impractible to work with. The firewall-controller crashes infinitely if it is not properly configured through the userdata when using the firewall image of metal-stack. - -The firewall-controller approach is tightly coupled to Gardener and it requires the administrator of the Gardener installation to pass a shoot and a seed kubeconfig through machine userdata when creating the firewall. 
How this userdata has to look like is not documented and is just part of another project called the [firewall-controller-manager](https://github.com/metal-stack/firewall-controller-manager) (FCM), which task is to orchestrate rolling updates of firewall machines in a way that network traffic interruption is minimal when updating a firewall or applying a change to an immutable firewall configuration. - -In general, a firewall entity in metal-stack has similarities to the machine entity but it has a fundamental difference: A user gains ownership over a machine after provisioning. They can access it through SSH, modify it at will and this is completely wanted. For firewalls, however, we do not want a user to access the provisioned firewall as the firewall is a privileged part of the infrastructure with access to the underlay network. The underlay can not be tampered with at any given point in time by a user as it can destroy the entire network traffic flow inside a metal-stack partition. - -For this reason, we have a gap in the metal-stack project in terms of a missing solution for people who do not rely on the Gardener integration. We are basically leaving a user with the option to implement an orchestrated recreation of every possible change on the firewall to minimize traffic interruption for the machines sitting behind the firewall or re-implement the firewall-controller to how they want to use it for their use-case. - -Also we do not have a clear distinction in the API between user and metal-stack operator for firewalls. If a user would allocate a firewall it is also possible for the user to inject his own SSH keys and access the firewall and tamper with the underlay network. - -Parts of these problems are probably going to decrease with the work on [MEP-4](../MEP4/README.md) where there will be dedicated APIs for users and administrators of metal-stack including fine-grained access tokens. 
- -With this MEP we want to describe a way to improve this current situation and allow other users that do not rely on the Gardener integration – for whatever motivation they have – to adequately manage firewalls. For this, we propose an alternative configuration for the firewall-controller that is native to metal-stack and more independent of Gardener. - -## Proposal - -The central idea of this proposal is allowing the firewall-controller to use the metal-api as a configuration source. This should serve as an alternative strategy to the currently used FCM `Firewall` resource based approach in the Gardener use-case. -Updates of the firewall rules should be possible through the metal-api. - -The firewall-controller itself should now be able to decide which of the two main strategies should be used for the base configuration: a kubeconfig or the metal-api. This should be possible through a dedicated _firewall-controller-config_. - -Using this config will now allow operators to fine-tune the data sources for all of its dynamic configuration tasks independently. -For example the data source of the core firewall rules could be set either from the `Firewall` resource located in the Gardener `Seed` or the metal-apiserver node network entity, while the CWNPs should be fetched and applied from a given kubeconfig (the `Shoot` Kubeconfig in the Gardener case). -This configuration file is intended to be injected during firewall creation through the userdata along with potential source connection credentials. 
- -```yaml -# the name of the firewall, defaulted to the hostname -name: best-firewall-ever - -sources: - seed: - kubeconfig: /path/to/seed.yaml # current gardener behavior - namespace: shoot--proj--name - shoot: - kubeconfig: /path/to/shoot.yaml # current gardener behavior - namespace: firewall - metal: - url: https://metal-api - hmac: some-hmac - type: Metal-View - projectID: abc - static: - # static should mirror all information provided by the metal or seed/shoot sources - firewall: # optional - controllerURL: https://... - cwnp: - egress: [] - ingress: [] - -# all sub-controllers running on the firewall -# each can be configured independently -controllers: - # this is the base controller - firewall: - source: seed # or: metal, static - - # these are optional: when not provided, they are disabled - selfUpdate: - enabled: true - droptailer: - enabled: true - - # these are optional: when not provided, they are disabled - service: - source: shoot # or: metal, static - cwnp: - source: shoot # or: metal, static - monitor: - source: shoot # currently only shoot is supported -``` - -The existing behavior of the firewall-controller writing into `/etc/nftables/firewall-controller.v4` is not changed. The different controller configuration sources are internally treated in the same way as before. The `static` source can be used to prevent the firewall-controller from crashing and consistently providing a static ruleset. This might be interesting for metal-stack native use cases or environments where the metal-api cannot be accessed. - -There must be one central nftables-rule-file-controller that is notified and triggered by all other controllers that contribute to the nftables configuration. 
- -For example, in order to maintain the existing Gardener integration, the configuration file for the firewall-controller will look like this: - -```yaml -name: shoot--abc--cluster-firewall-def -sources: - seed: - kubeconfig: /etc/firewall-controller/seed.yaml - namespace: shoot--abc--cluster - shoot: - kubeconfig: /etc/firewall-controller/shoot.yaml - namespace: firewall - -controllers: - firewall: - source: seed - - selfUpdate: - enabled: true - droptailer: - enabled: true - - service: - source: shoot - cwnp: - source: shoot - monitor: - source: shoot -``` - -Plain metal-stack users might use a configuration like this: - -```yaml -name: best-firewall-ever - -sources: - metal: - url: https://metal-api - hmac: some-hmac - type: Metal-View - projectID: abc - -controllers: - firewall: - source: metal - selfUpdate: - enabled: true - droptailer: - enabled: true - - cwnp: - # firewall rules stored in firewall entity - # potential improvement would be to attach the rules to the node network entity - # be aware that the firewall and private networks are immutable - # eventually we introduce a firewall ruleset entity - source: metal -``` - -In highly restricted environments that cannot access metal-api the static source could be used: - -```yaml -name: most-restricted-firewall-ever - -sources: - static: - firewall: - controllerURL: https://... - cwnp: - egress: [] - ingress: [] - -controllers: - firewall: - source: static - - cwnp: - source: static -``` - -### Non-Goals - -- Resolving the missing differentiation between users and administrators by letting users pass userdata and SSH keys to the firewall creation. - - This is even more related to [MEP-4](../MEP4/README.md) than this MEP. 
- -### Advantages - -- Offers a native metal-stack solution that improves managing firewalls for users by adding dynamic reconfiguration through the metal-api - - e.g., in the mini-lab, users can now allocate a machine, then an IP address and announce this IP from the machine without having to re-create the firewall but by adding a firewall rule to the metal-api. -- Improve consistency throughout the API (firewall rules would reflect what is persisted in metal-api). -- Other providers like Cluster API can leverage this approach, too. -- It can contribute to solving the shoot migration issue (in Cluster API case the `clusterctl move` for firewall objects) - - For Gardener takes the seed out of the equation (of which the kubeconfig changes during shoot migration) - - However: Things like egress rules, rate limiting, etc. are currently not part of the firewall or network entity in the metal-api. These would need to be added to one of them. -- Potentially resolve the issue that end-users can manipulate accounting data of the firewall through the `FirewallMonitor` - - for this we would need to be able to report traffic data to metal-api - -### Caveats - -- Metal-View access is too broad for firewalls. Mitigated by [MEP-4](../MEP4/README.md). -- Polling of the firewall-controller is bad for performance. Mitigated by [MEP-4](../MEP4/README.md). - -### Firewall Controller Manager - -Currently the firewall-controller-manager expects the creators of a `FirewallDeployment` to use the defaulting webhook that is tailored to the Gardener integration in order to generate `Firewall.spec.userdata` or to override it manually. Currently `Firewall.spec.userdata` will never be set explicitly. - -Instead we'd like to propose `Firewall.spec.userdataContents` which will replace the old `userdata`-string by a typed data structure. The FCM will do the heavy lifting while the `FirewallDeployment` creator decides what should be configured. 
- -```yaml -kind: FirewallDeployment -spec: - template: - spec: - userdataContents: - - path: /etc/firewall-controller/config.yaml - content: | - --- - sources: - static: {} - controllers: - firewall: - source: static - - path: /etc/firewall-controller/seed.yaml - secretRef: - name: seed-kubeconfig - generateFirewallControllerKubeconfig: true - - path: /etc/firewall-controller/shoot.yaml - secretRef: - name: shoot-kubeconfig -``` - -### Gardener Extension Provider Metal Stack - -The GEPM should be migrated to the new `Firewall.spec.userdataContents` field. - -### Cluster API Provider Metal Stack - -![architectural overview](firewall-for-capms-overview.svg) - -In Cluster API there are essentially two main clusters: the management cluster and the workload cluster while the CAPMS takes in the role of the GEPM. -Typically a local bootstrap cluster is created in KinD which acts as the management cluster. It creates the workload cluster. Thereafter the ownership of the workload cluster is typically moved (using `clusterctl move`) to a different cluster which will then become the management cluster. -The new management cluster might actually be the workload cluster itself. - -In contrast to Gardener, Cluster API aims to be less opinionated and minimal. It is common practice to not install any non-required components or CRDs into the workload cluster by default. Therefore we cannot expect custom resources like `ClusterwideNetworkPolicy` or `FirewallMonitor` to be installed in the workload cluster but strongly recommend our users to do it. Therefore it's the responsibility of the operator to tell [cluster-api-provider-metal-stack](https://github.com/metal-stack/cluster-api-provider-metal-stack) the kubeconfig for the cluster where these CRDs are installed and defined in. 
- -A viable configuration for a `MetalStackCluster` that generates firewall rules based of `Service` type `LoadBalancer` and `ClusterwideNetworkPolicy` and expects them to be deployed in the workload cluster is shown below. The `FirewallMonitor` will be reported into the same cluster. - -```yaml -kind: MetalStackCluster -metadata: - name: ${CLUSTER_NAME} -spec: - firewallTemplate: - userdataContents: - - path: /etc/firewall-controller/config.yaml - secretName: ${CLUSTER_NAME}-firewall-controller-config - - - path: /etc/firewall-controller/workload.yaml - # this is the kubeconfig generated by kubeadm - secretName: ${CLUSTER_NAME}-kubeconfig ---- -kind: Secret -metadata: - name: ${CLUSTER_NAME}-firewall-controller-config -stringData: - controllerConfig: | - --- - name: ${CLUSTER_NAME}-firewall - - sources: - metal: - url: ${METAL_API_URL} - hmac: ${METAL_API_HMAC} - type: ${METAL_API_HMAC_TYPE} - projectID: ${METAL_API_PROJECT_ID} - shoot: - kubeconfig: /etc/firewall-controller/workload.yaml - namespace: firewall - - controllers: - firewall: - source: metal - selfUpdate: - enabled: true - droptailer: - enabled: true - - service: - source: shoot - cwnp: - source: shoot - monitor: - source: shoot -``` - -Here the firewall-controller-config will be referenced by the `MetalStackCluster` as a `Secret`. Please note that the `Secret`s in `userdataContents` will not be fetched and will directly be passed to the `FirewallDeployment`. At first the reconciliation of it in the FCM will fail due to the missing Kubeconfig secret. After the `MetalStackCluster` has been marked as ready, CAPI will create this missing secret. Effectively the firewall and initial control plane node should be created at the same time. - -This approach allows maximum flexibility as intended by Cluster API and is still able to provide robust rolling updates of firewalls. - -An advanced use case of this flexibility would be a management cluster, that is in charge of multiple workload clusters. 
Where one workload cluster acts as a monitoring or tooling cluster, receives logs and the firewall monitor for the other workload clusters. The CWNPs could be defined here, all in a separate namespace. - -#### Cluster API Caveats - -When the cluster is pivoted and reconciles its own firewall, a malfunctioning firewall prevents the cluster from self-healing and requires manual intervention by creating a new firewall. This is an inherent problem of the cluster-api approach. It can be circumvented by using an extra cluster to manage workload clusters. - -In the current form of this approach firewalls and therefore the firewall egress and ingress rules are managed by the cluster operators that manage the cluster-api resources. -Hence it will not be possible to gain a fine-grained control over every cluster operator's choices from a central ruleset at the level of metal-stack firewalls. -In case this control surfaces as a requirement, it would need to be implemented in a firewall external to metal-stack. - -## Roadmap - -In general this proposal is not thought to be implemented in one batch. Instead an incremental approach is required. - -1. Enhance firewall-controller - - - Reduce coupling between controllers - - Introduce controller config - - Abstract module to write into distinct nftable rules for every controller - - Implement `sources.static`, but not `sources.metal` - - GEPM should set `FirewallDeployment.spec.template.spec.userdataContents` - -2. Allow Cluster API to use the FCM with static ruleset - - - Add `firewall.metal-stack.io/paused` annotation (managed by CAPMS during `clusterctl move`, theoretically useful for Gardener shoot migration as well to avoid shallow deletion). - - Reconcile multiple `FirewallDeployment` resources across multiple namespaces. For Gardener the old behavior of reconciling only one namespace should persist. 
- - Allow setting the `firewall.metal-stack.io/no-controller-connection` annotation through the `FirewallDeployment` (either through the template or inheritance). - - Add `MetalStackCluster.spec.firewallTemplate`. - - Make `MetalStackCluster.spec.nodeNetworkID` optional if `spec.firewallTemplate` given. - -3. Add `sources.metal` as configuration option. - - - Allow updates of firewall rules in the metal-apiserver. - - Depends on [MEP-4](../MEP4/README.md) metal-apiserver progress - -4. Potentially migrate the GEPM to use `sources.metal` diff --git a/versioned_docs/version-v0.21.8/contributing/01-Proposals/MEP16/firewall-for-capms-overview.drawio b/versioned_docs/version-v0.21.8/contributing/01-Proposals/MEP16/firewall-for-capms-overview.drawio deleted file mode 100644 index faea3e3d..00000000 --- a/versioned_docs/version-v0.21.8/contributing/01-Proposals/MEP16/firewall-for-capms-overview.drawio +++ /dev/null @@ -1,4 +0,0 @@ - - - -
handles traffic
Firewall
Firewall Controller
node-exporter
nftables-exporter
droptailer-client
Workload Cluster
droptailer
Configures
Bootstrap or Management Cluster
reconcile
configures
reconcile
Cluster API Provider metal-stack
Metal Stack Cluster CRD
Firewall Deployment CRD
Firewall CRD
Firewall Set CRD
rec
reconcile
reconcile
Firewall Controller Manager
Metal Stack Machine CRD
manages
Admin
Kubeconfig FirewallMonitor
FirewallMonitor CRD
main metal-api
Firewall entity
kubeconfig CWNP
Clusterwide Network Policy CRD
base config
controllerConfig
user-defined
network rules
reports firewall
state
send firewall log lines
controllerConfig
controllerConfig
\ No newline at end of file diff --git a/versioned_docs/version-v0.21.8/contributing/01-Proposals/MEP16/firewall-for-capms-overview.svg b/versioned_docs/version-v0.21.8/contributing/01-Proposals/MEP16/firewall-for-capms-overview.svg deleted file mode 100644 index 853f8175..00000000 --- a/versioned_docs/version-v0.21.8/contributing/01-Proposals/MEP16/firewall-for-capms-overview.svg +++ /dev/null @@ -1 +0,0 @@ -
handles traffic
handles traffic
Firewall
Firewall
Firewall Controller
Firewall Controller
node-exporter
node-exporter
nftables-exporter
nftables-exporter
droptailer-client
droptailer-client
Workload Cluster
Workload Cluster
droptailer
droptailer
Configures
Configures
Bootstrap or Management Cluster
Bootstrap or Management Cluster
reconcile
reconcile
configures
configures
reconcile
reconcile
Cluster API Provider metal-stack
Cluster API Provider...
Metal Stack Cluster CRD
Metal Stack Cluster...
Firewall Deployment CRD
Firewall Deployment...
Firewall CRD
Firewall CRD
Firewall Set CRD
Firewall Set CRD
rec
rec
reconcile
reconcile
reconcile
reconcile
Firewall Controller Manager
Firewall Controller...
Metal Stack Machine CRD
Metal Stack Machine...
manages
manages
Admin
Admin
Kubeconfig FirewallMonitor
Kubeconfig FirewallMonitor
FirewallMonitor CRD
FirewallMonitor CRD
main metal-api
main metal-api
Firewall entity
Firewall entity
kubeconfig CWNP
kubeconfig CWNP
Clusterwide Network PolicyCRD
Clusterwide Network...
base config
base config
controllerConfig
controllerConfig
user-defined
network rules
user-defined...
reports firewall
state
reports firewall...
send firewall log lines
send firewall log lines
controllerConfig
controllerConfig
controllerConfig
controllerConfig
Text is not SVG - cannot display
\ No newline at end of file diff --git a/versioned_docs/version-v0.21.8/contributing/01-Proposals/MEP17/README.md b/versioned_docs/version-v0.21.8/contributing/01-Proposals/MEP17/README.md deleted file mode 100644 index 35f48970..00000000 --- a/versioned_docs/version-v0.21.8/contributing/01-Proposals/MEP17/README.md +++ /dev/null @@ -1,61 +0,0 @@ ---- -slug: /MEP-17-global-network-view -title: MEP-17 -sidebar_position: 17 ---- - -# Global Network View - -> [!IMPORTANT] -> This MEP assumes the implementation of the metal-apiserver as described by [MEP-4](../MEP4/README.md) which is currently work in progress. - -Having a complete view of the network topology is useful when working with deployments or troubleshooting connectivity issues. -Currently, the API doesn't know of any other switches than the leaf switches. -Information about all other switches and their connections must be gathered from Ansible inventories or by accessing the switches via SSH. -Documentation of each partition's network must be kept in-sync with all changes made to the deployment or cabling. -We would like to expand the API's knowledge of the network to the entire underlay including inter-switch connections as well as BGP statistics and health status. - -## Switch Types - -Registering a switch at the API is done by the metal-core. -Apart from that, it also reconciles port and FRR configuration to adapt to the machine provisioning cycle. -This reconfiguration is only necessary on the leaf switches. -To allow deploying the metal-core on other switches than leaves we need a way of telling it what type of switch it is running on so it can act accordingly. -On any non-leaf switches it will only register the switch and report statistic but not change any configuration. -Supported switch types are - -- `leaf` -- `spine` -- `exit` -- `mgmtleaf` -- `mgmtspine` - -## Network Topology - -All switches should periodically report their LLDP neighbors and port configuration. 
-This information can be used to quickly identify common network issues, like MTU mismatch or the like. -Ideally, there would be some graphical representation of the network topology containing only the most important information for a quick overview. -It should contain all switches and machines as nodes and all connections as edges of a graph. -Ports, VRFs, and maybe also IPs should be associated with a connection. - -Apart from the topology graph, there should be a way to display more detailed information about both ports of a connection, like - -- MTU -- speed -- IP -- UP/DOWN status -- VRF -- VLAN -- whether it participates in a BGP session - -## BGP Announcements - -The metal-core should collect all routes it knows about and send them to the API along with a timestamp. -Reported routes should be stored to a redis database along with the switch that reported them and the timestamp of the last time they were reported. -An expiration threshold should be defined and all expired routes should be cleaned up periodically. -Whenever new routes are reported they get merged into the existing ones by the strategy: - -- when new, just add -- when existing, update `last_announced` timestamp - -By querying the BGP announcements we can find out whether an allocated IP is still in use. diff --git a/versioned_docs/version-v0.21.8/contributing/01-Proposals/MEP18/README.md b/versioned_docs/version-v0.21.8/contributing/01-Proposals/MEP18/README.md deleted file mode 100644 index 9c02c0b7..00000000 --- a/versioned_docs/version-v0.21.8/contributing/01-Proposals/MEP18/README.md +++ /dev/null @@ -1,147 +0,0 @@ ---- -slug: /MEP-18-autonomous-control-plane -title: MEP-18 -sidebar_position: 18 ---- - -# Autonomous Control Plane - -As described in the [deployment chapter](../../../docs/04-For%20Operators/03-deployment-guide.mdx), we strongly recommend Kubernetes as the target platform for running the metal-stack control plane. 
- -Kubernetes clusters for this purpose are readily available from hyperscalers, metalstack.cloud, or other cloud providers. Simply using a managed Kubernetes cluster greatly simplifies a metal-stack installation. However, sometimes it might be desirable to host the metal-stack control plane autonomously, without the help of another cloud provider. Reasons for this might include corporate policies that prohibit the use of external data center products, or network constraints. - -The Kubernetes cluster hosting the metal-stack control plane must provide at least the following features: - -- Load balancing (for exposing the APIs) -- Persistent storage (for the databases and key-value stores) -- Access to object storage for automated backups of the stateful sets -- Access to a DNS provider supported by one of the used DNS extensions -- Externally accessible DNS records for obtaining officially signed certificates through DNS challenges - -This metal-stack control plane cluster must also be highly available to prevent a complete loss of control over the managed resources in the data center. -Regular Kubernetes updates to apply security fixes and feature updates must be possible in an automated manner. The Day-2 operational overhead of running this cluster in your own datacenter must be reasonable. - -In this chapter, we propose a solution for setting up a metal-stack environment with an autonomous control plane that is independent of another cloud provider. - -## Use Your Own Dogfood - -The most obvious solution is to just deploy a Kubernetes cluster manually in your own data center by utilizing existing tooling for the deployment: - -- k3s -- kubeadm -- vmware and rancher -- talos -- kubespray -- ... (not a complete list) - -However, all these solutions add another layer of complexity that needs to be maintained and operated by people who also need to learn and understand metal-stack. 
In general, metal-stack in combination with [Gardener](https://gardener.cloud) contains all the necessary tools to provide KaaS, so it makes sense to reuse what is already in place without introducing new dependencies on other products and vendors. - -The only problem here is that Gardener is not yet able to create an initial cluster, which may change with the implementation of [GEP-28](https://github.com/gardener/gardener/blob/master/docs/proposals/28-autonomous-shoot-clusters.md). In the meantime, we suggest using [k3s](https://k3s.io/), which manages the initial metal-stack partition to host the control plane, since the maintenance overhead is acceptable and it is easy to deploy. - -## The Matryoshka Principle - -Instead of directly using the K3s cluster for the production control plane, we propose using it as a minimal control plane cluster which only purpose is to host the production control plane cluster. This layer of indirection brings some reasonable advantages: - -- In the event of an interruption or loss of this minimal control plane cluster, the production control plane remains unaffected, and end users can continue to manage their clusters as normal. -- A dedicated operations team can take care of the Day-2 maintenance of this installation, which can be handy because the tools like k3s are a little different from the rest of the setup (it is likely that more manual maintenance is required than for any other cluster). This would also be true if the initial cluster problem would be solved by the Gardener itself and not using k3s. -- Since the number of shoot clusters to host is static, the resource requirements are minimal and will not change significantly over time. There are no huge resource requirements in terms of cpu, memory and storage. As such, the lack of scalability is not such a big issue. - -So, our proposal is to chain two metal-stack control planes. 
The initial control plane cluster would use k3s and on this cluster we can spin up a cluster for the production control plane with the use of Gardener. - -The following figure shows how the high-level architecture of this setup looks like. A even more simplified illustration of this setup can be looked up in the appendix[^1]. - -![Autonomous Control Plane Architecture](./autonomous-control-plane-images/metal-stack-autonomous-control-plane-full.svg) - -The k3s nodes can either be bare metal machines or virtual machines. When using VMs a single k3s node might be a viable solution, too. These nodes are supposed to be setup manually / partly automated with an operating system like Debian. - -To name the cluster that hosts the initial metal-stack control plane and Gardener we use the term _initial cluster_. The initial cluster creates worker nodes to host the _target cluster_. - -## Initial Cluster - -The initial cluster is kept very small. The physical bare metal machines can be any machines and switches which are supported by metal-stack, but can be smaller in terms of cpu, memory and network speed because these machines must only be capable of running the target cluster for the metal-stack control plane. A typical single socket server with 8-16 cores and 64GB of RAM and two NVMe drives of 1TB would be a good starting point. - -In a typical k3s setup, a stateful set would lose the data once the k3s cluster was terminated and started again. But there is a possibility to define parts of the local storage of the server to be provided to the k3s cluster for the PVCs. With that, k3s could be terminated and started again, for example to update and reboot the host os, or update k3s itself and the data will persist. 
- -Example k3s configuration for persistent storage on the hosts os: - -```yaml -k3s: Cluster -apiVersion: k3s.x-k8s.io/v1alpha4 -name: needle-control-plane -nodes: - - role: control-plane - # add a mount from /path/to/my/files on the host to /files on the node - extraMounts: - - hostPath: /path/to/my/files - containerPath: /files -``` - -Into this cluster metal-stack and Gardener will be deployed. This deployment can be done by a Gitlab runner which is running on this machine. -The mini-lab will be used as a base for this deployment. The current development of [gardener-in-minilab](https://github.com/metal-stack/mini-lab/pull/202) must be extended to host all required extensions to make this a working metal-stack control plane which can manage the machines in the attached bare metal setup. - -In addition to the metal-stack and Gardener deployment, some additional required services are deployed (non-complete list): - -- PowerDNS to serve as a DNS Server for all DNS entries used in the initial and the target cluster, like `api.initial.metal-stack.local`, `gardener-api.initial.metal-stack.local` and the DNS entries for the api servers of the created kubernetes clusters. -- NTP -- Monitoring for the initial cluster and partition -- Optional: OIDC Server for authenticating against the metal-api -- Optional: Container Registry to host all metal-stack and gardener containers -- Optional: Let's Encrypt [boulder](https://github.com/letsencrypt/boulder) as a certificate authority -- ... - -Physical view, minimal setup for a initial cluster with a single physical node: - -![Small Initial Cluster](autonomous-control-plane-images/small-initial-cluster.svg) - -Physical View, bigger ha setup which is spread across two data centers: - -![HA Initial Cluster](autonomous-control-plane-images/ha-initial-cluster.svg) - -### Control Plane High Availability - -Running the initial control plane on a single physical server is not as available as it should be in such a use case. 
It should be possible to survive a loss of this server, because the server could be lost by many events, such as hardware failure, disk corruption or even failure of the datacenter location where this server is deployed. - -Setting up a second server with the same software components is an option, but the problem of data redundancy must be solved, because neither the gardener control plane, nor the metal-stack control plane can be instantiated twice. - -Given that we provide part of the local storage of the server as backing storage for the stateful sets in the k3s cluster, the data stored on the server itself must be replicated to another server and backed up on a regular basis. - -The replication of ETCD can be achieved through [clustered configuration](https://docs.k3s.io/datastore/ha-embedded) of k3s. Components of metal-stack and Gardener can run standalone and already utilize backup-restore mechanism that must be configured accordingly. For two or more bare metal machine used for the initial cluster, a loadbalancing mechanism for the ingress is required. kube-vip could be a possible solution. - -For monitoring a backend like a Victoria Metrics Cluster would allow spearding the monitoring data across the initial cluster nodes. These metrics should also be backed up in object storage. - -### Partition - -The partition which is managed by the initial cluster can be a simple and small hardware setup but yet capable enough to host the target cluster. It would even be a good practice to create separate target clusters on the initial cluster, e.g. one for the metal-stack control plane and one for the Gardener (maybe one more for monitoring). - -It can follow the metal-stack minimal setup which provides about 8-16 small servers connected to a 1G/s or 10G/s network dataplane. Central storage is optional as the persistence of the services running in these clusters is always backed up to a central object storage. 
Operations would be much easier if a central storage is provided. - -## Target Cluster - -The target cluster is the metal-stack environment which serves for end-user production use, the control plane is running in a shoot hosted in the initial cluster. The seed(s) and shoot(s) for end-users are created on the machines provided by the target cluster. -These machines can be of a different type in terms of size, but more importantly, these machines are connected to another network dataplane. Also the management infrastructure is separated from the initial cluster management network. - -## Failure Scenarios - -Everything could fail, everything will fail at some point. But this must kept in mind and nothing bad should happen if only one component at a time fails. -If more than one fails, the restoration to a working state must be easily possible and well documented. - -To ensure all possible breakages are documented, we suggest writing a list which summarizes all failure scenarios that might occur including the remediation. - -Here is an example of how a scenario documentation could look like: - -**Scenario**: Initial cluster is gone, all machines have died -**Impact**: Management of the initial cluster infrastructure not possible anymore, the target cluster continues to run but cannot be managed because the API servers are gone. end-users are not affected by this incident. -**Remediation**: The initial cluster nodes must be provisioned from scratch and re-deployed through the CI mechanism. The backups of the stateful sets are automatically restored during this process. 
- -## Implementation - -As part of this proposal, we provide the following tools and integrations in order to setup an autonomous control plane: - -- Deployment roles for the services like PowerDNS and NTP for the initial cluster -- Stretch goal: Deployment role to setup k3s in clustered configuration for the initial cluster and update it -- Extend the Gardener on mini-lab integration to allow shoot creation in the mini-lab -- Steady integration of the setup (maybe something like [k3d](https://github.com/k3d-io/k3d) in the mini-lab) - -## Appendix - -[^1]: ![metal-stack-chain](autonomous-control-plane-images/metal-stack-chain.svg) diff --git a/versioned_docs/version-v0.21.8/contributing/01-Proposals/MEP18/autonomous-control-plane-images/ha-initial-cluster.drawio b/versioned_docs/version-v0.21.8/contributing/01-Proposals/MEP18/autonomous-control-plane-images/ha-initial-cluster.drawio deleted file mode 100644 index eafcb514..00000000 --- a/versioned_docs/version-v0.21.8/contributing/01-Proposals/MEP18/autonomous-control-plane-images/ha-initial-cluster.drawio +++ /dev/null @@ -1,535 +0,0 @@ - - - - - - - - - - - - - - - - - - - 1 - - - - - 2 - - - - - 3 - - - - - 4 - - - - - 5 - - - - - 6 - - - - - 7 - - - - - 8 - - - - - 9 - - - - - 10 - - - - - 11 - - - - - 12 - - - - - - - - - - - - - -
-
-
- internet-router-management -
-
-
-
- - internet-router-management - -
-
-
- - - - - - - - - -
-
-
- management-switch-and-server -
-
-
-
- - management-switch-and-server - -
-
-
- - - - - - - - - -
-
-
- spine01 -
-
-
-
- - spine01 - -
-
-
- - - - - - - - - -
-
-
- leaf01 -
-
-
-
- - leaf01 - -
-
-
- - - - - - - - - -
-
-
- leaf02 -
-
-
-
- - leaf02 - -
-
-
- - - - - - - - - - - - - -
-
-
- - mirocloud (initial cluster partition nodes) - -
-
-
-
- - mirocloud (initial cluster... - -
-
-
- - - - - - - - - - - - - -
-
-
- - Initial cluster node 01 - -
-
-
-
- - Initial cluster node 01 - -
-
-
- - - - - - - - - - - - - 1 - - - - - 2 - - - - - 3 - - - - - 4 - - - - - 5 - - - - - 6 - - - - - 7 - - - - - 8 - - - - - 9 - - - - - 10 - - - - - 11 - - - - - 12 - - - - - - - - - - - - - -
-
-
- internet-router-management -
-
-
-
- - internet-router-management - -
-
-
- - - - - - - - - -
-
-
- management-switch-and-server -
-
-
-
- - management-switch-and-server - -
-
-
- - - - - - - - - -
-
-
- spine02 -
-
-
-
- - spine02 - -
-
-
- - - - - - - - - -
-
-
- leaf03 -
-
-
-
- - leaf03 - -
-
-
- - - - - - - - - -
-
-
- leaf04 -
-
-
-
- - leaf04 - -
-
-
- - - - - - - - - - - - - -
-
-
- - Initial cluster node 02 - -
-
-
-
- - Initial cluster node 02 - -
-
-
- - - - - - - - - - - - - -
-
-
- - Initial cluster node 03 - -
-
-
-
- - Initial cluster node 03 - -
-
-
- - - - - - - - - - - - - -
-
-
- - mirocloud (initial cluster partition nodes) - -
-
-
-
- - mirocloud (initial cluster... - -
-
-
-
- - - - - Text is not SVG - cannot display - - - -
diff --git a/versioned_docs/version-v0.21.8/contributing/01-Proposals/MEP18/autonomous-control-plane-images/ha-initial-cluster.svg b/versioned_docs/version-v0.21.8/contributing/01-Proposals/MEP18/autonomous-control-plane-images/ha-initial-cluster.svg deleted file mode 100644 index 99261ada..00000000 --- a/versioned_docs/version-v0.21.8/contributing/01-Proposals/MEP18/autonomous-control-plane-images/ha-initial-cluster.svg +++ /dev/null @@ -1 +0,0 @@ -123456789101112
internet-router-management
internet-router-management
management-switch-and-server
management-switch-and-server
spine01
spine01
leaf01
leaf01
leaf02
leaf02
mirocloud (initial cluster partition nodes)
mirocloud (initial cluster...
Initial cluster node 01
Initial cluster node 01
123456789101112
internet-router-management
internet-router-management
management-switch-and-server
management-switch-and-server
spine02
spine02
leaf03
leaf03
leaf04
leaf04
Initial cluster node 02
Initial cluster node 02
Initial cluster node 03
Initial cluster node 03
mirocloud (initial cluster partition nodes)
mirocloud (initial cluster...
Text is not SVG - cannot display
\ No newline at end of file diff --git a/versioned_docs/version-v0.21.8/contributing/01-Proposals/MEP18/autonomous-control-plane-images/metal-stack-autonomous-control-plane-full.drawio b/versioned_docs/version-v0.21.8/contributing/01-Proposals/MEP18/autonomous-control-plane-images/metal-stack-autonomous-control-plane-full.drawio deleted file mode 100644 index aae8a12d..00000000 --- a/versioned_docs/version-v0.21.8/contributing/01-Proposals/MEP18/autonomous-control-plane-images/metal-stack-autonomous-control-plane-full.drawio +++ /dev/null @@ -1,1133 +0,0 @@ - - - - - - - - - - - - - - - - - - - -
-
-
- Initial Cluster -
-
-
-
- - Initial Cluster - -
-
-
- - - - - - - - - - - - - - -
-
-
- metal-roles -
-
-
-
- - metal-roles - -
-
-
- - - - - - - -
-
-
- CI -
-
-
-
- - CI - -
-
-
- - - - - - - -
-
-
- K3s Standalone - - - (on Debian) - - -
-
-
-
- - K3s Standalone (on Debian) - -
-
-
- - - - - - - - - - - - - - - - - -
-
-
- Initial Partition -
-
-
-
- - Initial Partition - -
-
-
- - - - - - - - - - - - - -
-
-
- Target Cluster for metal-stack -
-
-
-
- - Target Cluster for metal-stack - -
-
-
- - - - - - - - - - - - - -
-
-
- Metal Control Plane -
-
-
-
- - Metal Control Plane - -
-
-
- - - - - - - - - - - -
-
-
- provisions -
-
-
-
- - provisions - -
-
-
- - - - - - - - - - - - - -
-
-
- Target Cluster for Gardener -
-
-
-
- - Target Cluster for Gardener - -
-
-
- - - - - - - - - - -
-
-
- Gardener Control Plane -
-
-
-
- - Gardener Control Plane - -
-
-
- - - - - - - - - - - - - - - - - -
-
-
- Monitoring -
-
-
-
- - Monitoring - -
-
-
- - - - - - - - - - - - - - - - -
-
-
- Target Partition -
-
-
-
- - Target Partition - -
-
-
- - - - - - - - - - -
-
-
- Gardener Seeds and End-User Shoots -
-
-
-
- - Gardener Seeds and End-User Shoots - -
-
-
- - - - - - - - - - - -
-
-
- provisions -
-
-
-
- - provisions - -
-
-
- - - - - - - - -
-
-
- metal-roles -
-
-
-
- - metal-roles - -
-
-
- - - - - - - -
-
-
- CI -
-
-
-
- - CI - -
-
-
- - - - - - - - -
-
-
- metal-roles -
-
-
-
- - metal-roles - -
-
-
- - - - - - - - - - -
-
-
- ETCD can be clustered or standalone, backed up by sidecar -
-
-
-
- - ETCD can be clustere... - -
-
-
- - - - - - - - - - -
-
-
- This data will get lost in case local PV gets deleted -
-
-
-
- - This data will get l... - -
-
-
- - - - - - - - - - -
-
-
- We can work with local PVs here, too. -
- backup-restore-sidecar for metal-stack databases, for big ones Postgres clustered. -
-
-
-
- - We can work with local PVs he... - -
-
-
- - - - - - - -
-
-
- ETCD will be deployed in HA configuration on local PVs. -
-
- csi-driver-lvm needs to implement auto deletion of orphaned PVs. -
-
- Seed metrics get lost, but they report to the monitoring in the Metal Control Plane Shoot. -
-
-
-
- - ETCD will be deployed in HA c... - -
-
-
- - - - - - - - - - -
-
-
- More sophisticated storage solutions can be in place. -
-
- (Lightbits, NetApp, ...) -
-
-
-
- - More sophisticated storage so... - -
-
-
- - - - - - - - - - -
-
-
- TODO: Evaluate how to persist these metrics. -
-
-
-
- - TODO: Evaluate how to persist... - -
-
-
- - - - - - - - - - -
-
-
- - 1 VM or -
-
-
- - - 3 Bare Metal Machines - - -
-
-
-
-
- - 1 VM or... - -
-
-
- - - - - - - - - - - - - - -
-
-
- metal-stack -
-
-
-
- - metal-stack - -
-
-
- - - - - - - -
-
-
- metal-api -
-
-
-
- - metal-api - -
-
-
- - - - - - - -
-
-
- metal-db -
-
-
-
- - metal-db - -
-
-
- - - - - - - -
-
-
- ipam-db -
-
-
-
- - ipam-db - -
-
-
- - - - - - - -
-
-
- masterdata-db -
-
-
-
- - masterdata-db - -
-
-
- - - - - - - -
-
-
- headscale-db -
-
-
-
- - headscale-db - -
-
-
- - - - - - - -
-
-
- auditing-db -
-
-
-
- - auditing-db - -
-
-
- - - - - - - -
-
-
- nsqd -
-
-
-
- - nsqd - -
-
-
- - - - - - - - - - - -
-
-
- Gardener -
-
-
-
- - Gardener - -
-
-
- - - - - - - - - - -
-
-
- Virtual Garden -
-
-
-
- - Virtual Garden - -
-
-
- - - - - - - -
-
-
- Gardener Control Plane -
-
-
-
- - Gardener Control Plane - -
-
-
- - - - - - - -
-
-
- gardenlet -
-
-
-
- - gardenlet - -
-
-
- - - - - - - -
-
-
- Garden etcd -
-
-
-
- - Garden etcd - -
-
-
- - - - - - - -
-
-
- Prometheus -
-
-
-
- - Prometheus - -
-
-
- - - - - - - - - - - -
-
-
- Monitoring -
-
-
-
- - Monitoring - -
-
-
- - - - - - - - - - -
-
-
- - Gitlab - -
- - Runner - -
-
-
-
-
- - Gitlab... - -
-
-
- - - - - - - - - - -
-
-
- Services -
-
-
-
- - Services - -
-
-
- - - - - - - -
-
-
- PowerDNS -
-
-
-
- - PowerDNS - -
-
-
- - - - - - - -
-
-
- boulder -
-
-
-
- - boulder - -
-
-
- - - - - - - -
-
-
- NTP -
-
-
-
- - NTP - -
-
-
- - - - - - - -
-
-
- OIDC -
-
-
-
- - OIDC - -
-
-
- - - - - - - -
-
-
- ... -
-
-
-
- - ... - -
-
-
-
- - - - - Text is not SVG - cannot display - - - -
diff --git a/versioned_docs/version-v0.21.8/contributing/01-Proposals/MEP18/autonomous-control-plane-images/metal-stack-autonomous-control-plane-full.svg b/versioned_docs/version-v0.21.8/contributing/01-Proposals/MEP18/autonomous-control-plane-images/metal-stack-autonomous-control-plane-full.svg deleted file mode 100644 index e58e783b..00000000 --- a/versioned_docs/version-v0.21.8/contributing/01-Proposals/MEP18/autonomous-control-plane-images/metal-stack-autonomous-control-plane-full.svg +++ /dev/null @@ -1 +0,0 @@ -
Initial Cluster
Initial Cluster
metal-roles
metal-roles
CI
CI
K3s Standalone(on Debian)
K3s Standalone (on Debian)
Initial Partition
Initial Partition
Target Cluster for metal-stack
Target Cluster for metal-stack
Metal Control Plane
Metal Control Plane
provisions
provisions
Target Cluster for Gardener
Target Cluster for Gardener
Gardener Control Plane
Gardener Control Plane
Monitoring
Monitoring
Target Partition
Target Partition
Gardener Seeds and End-User Shoots
Gardener Seeds and End-User Shoots
provisions
provisions
metal-roles
metal-roles
CI
CI
metal-roles
metal-roles
ETCD can be clustered or standalone, backed up by sidecar
ETCD can be clustere...
This data will get lost in case local PV gets deleted
This data will get l...
We can work with local PVs here, too.
backup-restore-sidecar for metal-stack databases, for big ones Postgres clustered.
We can work with local PVs he...
ETCD will be deployed in HA configuration on local PVs.

csi-driver-lvm needs to implement auto deletion of orphaned PVs.

Seed metrics get lost, but they report to the monitoring in the Metal Control Plane Shoot.
ETCD will be deployed in HA c...
More sophisticated storage solutions can be in place.

(Lightbits, NetApp, ...)
More sophisticated storage so...
TODO: Evaluate how to persist these metrics.
TODO: Evaluate how to persist...
1 VM or
3 Bare Metal Machines
1 VM or...
metal-stack
metal-stack
metal-api
metal-api
metal-db
metal-db
ipam-db
ipam-db
masterdata-db
masterdata-db
headscale-db
headscale-db
auditing-db
auditing-db
nsqd
nsqd
Gardener
Gardener
Virtual Garden
Virtual Garden
Gardener Control Plane
Gardener Control Plane
gardenlet
gardenlet
Garden etcd
Garden etcd
Prometheus
Prometheus
Monitoring
Monitoring
Gitlab
Runner
Gitlab...
Services
Services
PowerDNS
PowerDNS
boulder
boulder
NTP
NTP
OIDC
OIDC
...
...
Text is not SVG - cannot display
\ No newline at end of file diff --git a/versioned_docs/version-v0.21.8/contributing/01-Proposals/MEP18/autonomous-control-plane-images/metal-stack-chain.drawio b/versioned_docs/version-v0.21.8/contributing/01-Proposals/MEP18/autonomous-control-plane-images/metal-stack-chain.drawio deleted file mode 100644 index cd5cf007..00000000 --- a/versioned_docs/version-v0.21.8/contributing/01-Proposals/MEP18/autonomous-control-plane-images/metal-stack-chain.drawio +++ /dev/null @@ -1,404 +0,0 @@ - - - - - - - - - - -
-
-
- Partition 1 -
-
-
-
- - Partition 1 - -
-
- - - - -
-
-
- seeds -
-
-
-
- - seeds - -
-
- - - - -
-
-
- shoots -
-
-
-
- - shoots - -
-
- - - - - - -
-
-
- Partition 2 -
-
-
-
- - Partition 2 - -
-
- - - - -
-
-
- seeds -
-
-
-
- - seeds - -
-
- - - - -
-
-
- shoots -
-
-
-
- - shoots - -
-
- - - - - - -
-
-
- Partition 3 -
-
-
-
- - Partition 3 - -
-
- - - - -
-
-
- seeds -
-
-
-
- - seeds - -
-
- - - - -
-
-
- shoots -
-
-
-
- - shoots - -
-
- - - - - - -
-
-
- Production Control Plane -
-
-
-
- - Production Control Plane - -
-
- - - - -
-
-
- metal-stack -
- kubernetes cluster -
-
-
-
- - metal-stack... - -
-
- - - - -
-
-
- gardener -
- kubernetes cluster -
-
-
-
- - gardener... - -
-
- - - - -
-
-
- - Manages - -
-
-
-
- - Manages - -
-
- - - - - - - - -
-
-
- Control Plane Partition -
-
-
-
- - Control Plane Partition - -
-
- - - - - -
-
-
- backup of stateful sets -
-
-
-
- - backup of stateful sets - -
-
- - - - - - -
-
-
- bare metal machine -
-
-
-
- - bare metal machine - -
-
- - - - -
-
-
- metal-stack -
- and -
- gardener -
- kubernetes cluster -
- running in kind -
-
-
-
- - metal-stack... - -
-
- - - - -
-
-
- - Manages - -
-
-
-
- - Manages - -
-
- - - - - -
-
-
- S3 -
-
-
-
- - S3 - -
-
- - - - -
-
-
- Needle -
-
-
-
- - Needle - -
-
- - - -
-
-
- - Nail - -
-
-
-
- - Nail - -
-
-
- - - - - Text is not SVG - cannot display - - - -
\ No newline at end of file diff --git a/versioned_docs/version-v0.21.8/contributing/01-Proposals/MEP18/autonomous-control-plane-images/metal-stack-chain.svg b/versioned_docs/version-v0.21.8/contributing/01-Proposals/MEP18/autonomous-control-plane-images/metal-stack-chain.svg deleted file mode 100644 index 8f88ba14..00000000 --- a/versioned_docs/version-v0.21.8/contributing/01-Proposals/MEP18/autonomous-control-plane-images/metal-stack-chain.svg +++ /dev/null @@ -1 +0,0 @@ -
Partition 1
Partition 1
seeds
seeds
shoots
shoots
Partition 2
Partition 2
seeds
seeds
shoots
shoots
Partition 3
Partition 3
seeds
seeds
shoots
shoots
Production Control Plane
Production Control Plane
metal-stack
kubernetes cluster
metal-stack...
gardener
kubernetes cluster
gardener...
Manages
Manages
Control Plane Partition
Control Plane Partition
backup of stateful sets
backup of stateful sets
bare metal machine
bare metal machine
metal-stack
and
gardener
kubernetes cluster
running in kind
metal-stack...
Manages
Manages
S3
S3
Needle
Needle 
Nail
Nail
Text is not SVG - cannot display
\ No newline at end of file diff --git a/versioned_docs/version-v0.21.8/contributing/01-Proposals/MEP18/autonomous-control-plane-images/small-initial-cluster.drawio b/versioned_docs/version-v0.21.8/contributing/01-Proposals/MEP18/autonomous-control-plane-images/small-initial-cluster.drawio deleted file mode 100644 index a75ee340..00000000 --- a/versioned_docs/version-v0.21.8/contributing/01-Proposals/MEP18/autonomous-control-plane-images/small-initial-cluster.drawio +++ /dev/null @@ -1,234 +0,0 @@ - - - - - - - - - - - - - - - - - - - 1 - - - - - 2 - - - - - 3 - - - - - 4 - - - - - 5 - - - - - 6 - - - - - 7 - - - - - 8 - - - - - 9 - - - - - 10 - - - - - 11 - - - - - 12 - - - - - - - - - - - - - -
-
-
- internet-router-management -
-
-
-
- - internet-router-management - -
-
-
- - - - - - - - - -
-
-
- management-switch-and-server -
-
-
-
- - management-switch-and-server - -
-
-
- - - - - - - - - -
-
-
- leaf01 -
-
-
-
- - leaf01 - -
-
-
- - - - - - - - - -
-
-
- leaf02 -
-
-
-
- - leaf02 - -
-
-
- - - - - - - - - - - - - -
-
-
- Initial cluster node -
-
-
-
- - Initial cluster node - -
-
-
- - - - - - - - - - - - - -
-
-
- mirocloud (initial cluster partition nodes) -
-
-
-
- - mirocloud (initial cluster... - -
-
-
-
- - - - - Text is not SVG - cannot display - - - -
diff --git a/versioned_docs/version-v0.21.8/contributing/01-Proposals/MEP18/autonomous-control-plane-images/small-initial-cluster.svg b/versioned_docs/version-v0.21.8/contributing/01-Proposals/MEP18/autonomous-control-plane-images/small-initial-cluster.svg deleted file mode 100644 index a9d29f05..00000000 --- a/versioned_docs/version-v0.21.8/contributing/01-Proposals/MEP18/autonomous-control-plane-images/small-initial-cluster.svg +++ /dev/null @@ -1 +0,0 @@ -123456789101112
internet-router-management
internet-router-management
management-switch-and-server
management-switch-and-server
leaf01
leaf01
leaf02
leaf02
Initial cluster node
Initial cluster node
mirocloud (initial cluster partition nodes)
mirocloud (initial cluster...
Text is not SVG - cannot display
\ No newline at end of file diff --git a/versioned_docs/version-v0.21.8/contributing/01-Proposals/MEP2/README.md b/versioned_docs/version-v0.21.8/contributing/01-Proposals/MEP2/README.md deleted file mode 100644 index c7f2360a..00000000 --- a/versioned_docs/version-v0.21.8/contributing/01-Proposals/MEP2/README.md +++ /dev/null @@ -1,7 +0,0 @@ ---- -slug: /MEP-2-two-factor-authentication -title: MEP-2 -sidebar_position: 2 ---- - -# Two Factor Authentication diff --git a/versioned_docs/version-v0.21.8/contributing/01-Proposals/MEP3/README.md b/versioned_docs/version-v0.21.8/contributing/01-Proposals/MEP3/README.md deleted file mode 100644 index 5ce36721..00000000 --- a/versioned_docs/version-v0.21.8/contributing/01-Proposals/MEP3/README.md +++ /dev/null @@ -1,67 +0,0 @@ ---- -slug: /MEP-3-machine-re-installation -title: MEP-3 -sidebar_position: 3 ---- - -# Machine Re-Installation - -In the current metal-api only machine installations are possible, performing a machine upgrade is only possible by creating a new machine and delete the old one. -This has the drawback that in case a lot of data is stored on the local disks, a full restore of the original data must be performed. - -To prevent this, we will introduce a new metal-api endpoint to reinstall the machine with a new image, _without_ actually deleting the data stored on the additional hard disks. - -Storage is a difficult task to get right and reliable. A short analysis of our different storage requirements lead to 3 different scenarios. - -- Storage for the etcd pvs in the seed cluster of every partition. - This is the most important storage in our setup because these etcd pods serve as configuration backend for all customer kubernetes clusters. If they fail, the cluster is down. 
However gardener deploys a backup and restore sidecar into the etcd pod of every customer kubernetes control plane, and if this sidecar detects a corrupt or missing etcd database file(s) it starts automatic restore from the configured backup location. This will take some minutes. If for example a node dies, and gardener creates a new node instead, the csi-lvm created pv is not present on that node. Kubernetes will not schedule the missing etcd pod on this node because it has a local PV configured and is therefore tainted to run only on that node. To let kubernetes create that pod anyhow, someone has to either remove the taint, or delete the pod. If this is done, the pod starts and the restore of the etcd data can start as well. You can see this is a bit too complicated and will take the customer cluster down for a while (not measured yet but in the range of 5-10 minutes). -- Storage in customer clusters. - This was not promised in 2020. We have a intermediate solution with the provisioning of csi-lvm by default into all customer clusters. Albeit this is only local storage and will get deleted if a node dies. -- S3 Storage. - We have two possibilities to cope with storage: - - In place update of the OS with a daemonset - This will be fast and simple, but might fail because the packages being installed are broken right now, or a filesystem gets full, or any other failure you can think of during a os update. Another drawback is that metal-api does not reflect the updated os image. - - metal-api get a machine reinstall endpoint - With this approach we leverage from existing and already proven mechanisms. Reinstall must keep all data except the sata-dom. Gardener currently is not able to do an update with this approach because it can only do `rolling` updates. Therefore a additional `osupdatestrategy` has to be implemented for metal and other providers in gardener to be able to leverage the metal reinstall on the same machineID approach. 
- -If reinstall is implemented, we should focus on the same technology for all scenarios and put ceph via rook.io into the kubernetes clusters as additional StorageClass. It has to be checked whether to use the raw disk or a PV as the underlay block device where ceph stores its data. - -## API and behavior - -The API will get an new endpoint "reinstall" this endpoint takes two arguments: - -- machineID -- image - -No other aspects of the machine can be modified during the re-installation. All data stored in the existing allocation will be preserved, only the image will be modified. -Once this endpoint was called, the machine will get a `reboot` signal with the boot order set to PXE instead of HDD and the network interfaces on the leaf are set to PXE as well. Then the normal installation process starts: - -- unchanged: PXE boot with metal-hammer -- changed: metal-hammer first checks with the machineID in the metal-api (through metal-core) if there is already a allocation present -- changed: if a allocation is present and the allocation has set `reinstall: true`, wipe disk is only executed for the root disk, all other disks are untouched. -- unchanged: the specified image is downloaded and burned, `/install.sh` is executed -- unchanged: successful installation is reported back, network is set the the vrf, boot order is set to HDD. -- unchanged: distribution kernel is booted via kexec - -We can see that the `allocation` requires one additional parameter: `reinstall` and metal-hammer must check for already existing allocation at an earlier stage. - -Components which requires modifications (first guess): - -- metal-hammer: - - check for allocation present earlier - - evaluation of `reinstall` flag set - - wipe of disks depends on that flag - - Bonus: move configuration of disk layout and primary disk detection algorithm (PDDA) from metal-hammer into metal-api. - metal-api **MUST** reject reinstallation if the disk found by PDDA does not have the `/etc/metal` directory! 
-- metal-core: - - probably nothing -- metal-api: - - new endpoint `/machine/reinstall` - - add `Reinstall bool` to data model of `allocation` - - make sure to reset `Reinstall` after reinstallation to prevent endless reinstallation loop -- metalctl: - - implement `reinstall` -- metal-go: - - implement `reinstall` -- gardener (longterm): - - add the `OSUpgradeStrategy` `reinstall` diff --git a/versioned_docs/version-v0.21.8/contributing/01-Proposals/MEP4/README.md b/versioned_docs/version-v0.21.8/contributing/01-Proposals/MEP4/README.md deleted file mode 100644 index 389a02d4..00000000 --- a/versioned_docs/version-v0.21.8/contributing/01-Proposals/MEP4/README.md +++ /dev/null @@ -1,211 +0,0 @@ ---- -slug: /MEP-4-multi-tenancy-for-the-metal-api -title: MEP-4 -sidebar_position: 4 ---- - -# Multi-Tenancy for the metal-api -:::info -This document is work in progress. -::: - -In the past we decided to treat the metal-api as a "low-level API", i.e. the API does not specifically deal with projects and tenants. A user with editor access can for example assign machines to every project he desires, he can see all the machines available and can control them. We tried to keep the metal-api code base as small as possible and we added resource scoping to a "higher-level APIs". From there, a user would be able to only see his own clusters and IP addresses. - -As time passed metal-stack has become an open-source project and people are willing to adopt. Adopters who want to put their own technologies on top of the metal-stack infrastructure don't have those "higher-level APIs" that we implemented closed-source for our user base. So, external adopters most likely need to implement resource scoping on their own. 
- -Introducing multi-tenancy to the metal-api is a serious chance of making our product better and more successful as it opens the door for: - -- Becoming a "fully-featured" API -- Narrowing down attack surfaces and possibility of unintended resource modification produced by bugs or human errors -- Discouraging people to implement their own scoping layers in front of the metal-stack -- Gaining performance through resource scopes -- Letting untrusted / third-parties work with the API - -## Requirements - -These are some general requirements / higher objectives that MEP-4 has to fulfill. - -- Should be able to run with mini-lab without requiring to setup complex auth backends (dex, LDAP, keycloak, ...) - - Simple to start with, more complex options for production setups -- Fine-grained access permissions (every endpoint maps to a permission) -- Tenant scoping (disallow resource access to resources of other tenants) -- Project scoping (disallow resource access to resources of other projects) -- Access tokens in self-service for technical user access - -## Implementation - -We gathered a lot of knowledge while implementing a multi-tenancy-capable backend for metalstack.cloud. The goal is now to use the same technology and adopt that to the metal-api, this includes: - -- gRPC in combination with connectrpc -- OPA for making auth decisions -- REST HTTP only for OIDC login flows - -### API Definitions - -The API definitions should be located on a separate Github repository separate from the server implementation. The proposed repository location is: https://github.com/metal-stack/api. - -This repository contains the `proto3` specification of the exposed metal-stack api. This includes the messages, simple validations, services and the access permission to these services. The input parameters for the authorization in the backend are generated from the `proto3` annotations. - -Client implementations for the most relevant languages (go, python) are generated automatically. 
- -This api is divided into end-user and admin access at the top level. The proposed APIs are: - -- `metalstack.api.v2`: For end-user facing services -- `metalstack.admin.v2`: For operators and controllers which need access to unscoped entities - -The methods of the API can have different role scopes (and can be narrowed down further with fine-grained method permissions): - -- `tenant`: Tenant-scoped methods, e.g. project creation (tenant needs to be provided in the request payload) - - Available roles: VIEWER, EDITOR, OWNER -- `project`: Project-scoped methods, e.g. machine creation (project needs to be provided in the request payload) - - Available roles: VIEWER, EDITOR, OWNER -- `admin`: Admin-scoped methods, e.g. unscoped tenant list or switch register - - Available roles: VIEWER, EDITOR - -And has methods with different visibility scopes: - -- `self`: Methods that only the logged-in user can access, e.g. show permissions with the presented token -- `public`: Methods that do not require any specific authorization - -### API - -The API server implements the services defined in the API and validates access to a method using OPA with the JWT tokens passed in the requests. The server is implemented using the connectrpc.com framework. - -The API server implements the login flow through OIDC. After successful authentication, the API server derives user permissions from the OIDC provider and issues a new JWT token which is passed on to the user. The tokens including the permissions are stored in a redis compatible backend. - -With these tokens, users can create Access Tokens for CI/CD or other use cases. - -JWT Tokens can be revoked by admins and the user itself. - -### API Server - -Is put into a new github repo which implements the services defined in the `api` repository. It opens an `https` endpoint where the grpc (via connectrpc.com) and oidc services are exposed. 
- -### Migration of the Consumers - -To allow consumers to migrate to the `v2` API gradually, both apis, the new and the old, are deployed in parallel. In the control-plane both apis are deployed side-by-side behind the ingress. `api.example.com` is forwarded to `metal-api` and `metal.example.com` is forwarded to the new `metal-apiserver`. - -The api-server will talk to the existing metal-api during the process of migrating services away to the new grpc api. - -The migration process can be done in the following manner: - -for each resource in the metal-api: - -- create a new proto3 based definition in the `api` repo. -- implement the business logic per service in the new `metal-apiserver` without calling the metal-api. -- clients must be able to talk to `v1` and `v2` backend in parallel -- Deprecate the already migrated service in the swagger route to notify the client that this route should not be used anymore. -- identify all consumers of this resource and replace them to use the grpc instead of the rest api -- move the business logic incl. the backend calls to ipam, metal-db, masterdata-api, nsq for this resource from the metal-api to the `metal-apiserver` - -We will migrate the rethinkdb backend implementation to a generic approach during this effort. - -- Try to enhance the generic rethinkdb interface with `project` scoped methods. - -There are a lot of consumers of metal-api, which need to be migrated: - -- ansible -- firewall-controller -- firewall-controller-manager -- gardener-extension-auth -- gardener-extension-provider-metal - - Do not point the secret bindings to the shared provider secret in the seed anymore. Instead, use individual provider-secrets containing project-scoped API access tokens in the Gardener project namespaces. 
-- machine-controller-manager-provider-metal -- metal-ccm -- metal-console -- metal-bmc -- metal-core -- metal-hammer -- metal-image-cache-sync -- metal-images -- metal-metrics-exporter -- metal-networker -- metalctl -- pixie - -## User Scenarios - -This section gathers a collection of workflows from the perspective of a user that we want to provide with the implementation of this proposal. - -### Machine Creation - -A regular user wants to create a machine resource. - -Requirements: Project was created, permissions are present - -- The user can see networks that were provided by the admin. - - ``` - $ metalctl network ls - ID NAME PROJECT PARTITION NAT SHARED PREFIXES IPS - internet Internet Network true false 212.34.83.0/27  ● - tenant-super-network-fra-equ01 Project Super Network fra-equ01 false false 10.128.0.0/14  ● - underlay-fra-equ01 Underlay Network fra-equ01 false false 10.0.0.0/16  ● - ``` - -- The user has to set the project scope first or provide `--project` flags for all commands. - ``` - $ metalctl project set 793bb6cd-8b46-479d-9209-0fedca428fe1 - You are now acting on project 793bb6cd-8b46-479d-9209-0fedca428fe1. - ``` -- The user can create the child network required for machine allocation. - ``` - $ metalctl network allocate --partition fra-equ01 --name test - ``` -- Now, the user sees his own child network. - ``` - $ metalctl network ls - ID NAME PROJECT PARTITION NAT SHARED PREFIXES IPS - internet Internet Network true false 212.34.83.0/27  ● - tenant-super-network-fra-equ01 Project Super Network fra-equ01 false false 10.128.0.0/14  ● - └─╴08b9114b-ec47-4697-b402-a11421788dc6 test 793bb6cd-8b46-479d-9209-0fedca428fe1 fra-equ01 false false 10.128.64.0/22  ● - underlay-fra-equ01 Underlay Network fra-equ01 false false 10.0.0.0/16  ● - ``` -- The user does not see any machines yet. - ``` - $ metalctl machine ls - ``` -- The user can create a machine. 
- ``` - $ metalctl machine create --networks internet,08b9114b-ec47-4697-b402-a11421788dc6 --name test --hostname test --image ubuntu-20.04 --partition fra-equ01 --size c1-xlarge-x86 - ``` -- The machine will now be provisioned. - ``` - $ metalctl machine ls - ID LAST EVENT WHEN AGE HOSTNAME PROJECT SIZE IMAGE PARTITION - 00000000-0000-0000-0000-ac1f6b7befb2 Phoned Home 20s 50d 4h test 793bb6cd-8b46-479d-9209-0fedca428fe1 c1-xlarge-x86 Ubuntu 20.04 20210415 fra-equ01 - ``` - -:::warning -A user **cannot** list all allocated machines for all projects. The user **must** always switch project context first and can only view the machines inside this project. Only admins can see all machines at once. -::: -### Scopes for Resources - -The admins / operators of the metal-stack should be able to provide _global_ resources that users are able to use along with their own resources. In particular, users can view and use _global_ resources, but they are not allowed to create, modify or delete them. - -:::info -When a project ID field is empty on a resource, the resource is considered _global_. -::: - -Where possible, users should be capable of creating their own resource entities. - -| Resource | User | Global | -| :----------------- | :--- | :----- | -| File System Layout | yes | yes | -| Firewall | yes | | -| Firmware | | yes | -| OS Image | | yes | -| Machine | yes | | -| Network (Base) | | yes | -| Network (Children) | yes | | -| IP | yes | | -| Partition | | yes | -| Project | yes | | -| Project Token | yes | | -| Size | | yes | -| Switch | | | -| Tenant | | yes | - -:::info -Example: A user can make use of the file system layouts provided by the admins, but can also create their own layouts. Same applies for images. As soon as a user creates their own resources, the user takes over the responsibility for the machine provisioning to succeed. 
-::: diff --git a/versioned_docs/version-v0.21.8/contributing/01-Proposals/MEP5/README.md b/versioned_docs/version-v0.21.8/contributing/01-Proposals/MEP5/README.md deleted file mode 100644 index 3b7fc45c..00000000 --- a/versioned_docs/version-v0.21.8/contributing/01-Proposals/MEP5/README.md +++ /dev/null @@ -1,54 +0,0 @@ ---- -slug: /MEP-5-shared-networks -title: MEP-5 -sidebar_position: 5 ---- - -# Shared Networks - -## Why are shared networks needed - -For special purpose machines that serve shared services with performance critical workloads to all machines of a partition (like persistent storage) it would be good to have kind of a "shared network" that is easily accessible. -They do not necessarily need another firewall. This would avoid having two firewalls in the datapath between a machine in a private network and the machines of a shared service. - -## Constraints that need to hold - -- a shared network is usable from all machines that have a firewall in front, that uses it -- a shared network is only usable within a single partition (currently we are constrained in bandwidth and have no routing of 10.0.0.0/8 addresses btw. 
partitions and failure domain should be the partition but this constraint might get lifted in the future) -- networks may be marked as shared after network allocation (but there should be no way back from shared to unshared) -- neither machines nor firewalls may have multiple private, unshared networks configured -- machines must have a single primary network configured - - this might be a shared network - - OR a plain, unshared private network -- firewalls may participate in multiple shared networks -- machines can be allocated with a primary network using auto IP allocation or with `noauto` and a specific IP - -## Should shared networks be private - -**Alternative 1:** If we implemented shared networks by extending functions around plain, private networks we would not have to manage another CIDR (mini point) and it would be possible to create a k8s cluster with a private network, mark the network as `shared` and produce shared services from this k8s cluster. - -**Alternative 2:** If shared networks are implemented as first-class networks we could customize the VRF and also accomplish another goal of our roadmap: being able to create machines directly in an external network. - -Together with @majst01 and @Gerrit91 we decided to continue to implement **Alternative 1**. - -## Firewalls accessing a shared network - -Firewalls that access shared networks need to: - -- hide the private network behind an ip address of the shared network if the shared network was configured with `nat=true`. -- import the prefixes of the shared VRF to the private VRF and import the prefixes of the private VRF to the shared VRF so that the communication between the two is working in both directions. As long as no `nat=true` was set on the shared VRF, the original machine ips are visible in both communication directions. 
- -## Setup with shared networks and single consumer - -![Simple Setup](./shared.png) - -## Setup with single shared network and multiple consumers - -![Advanced Setup](./shared_advanced.png) - -## Getting internet access - -Machines contained in a shared network can access the internet with different scenarios: - -- if they have an own firewall: this is internet accessibility, as common (check whether all traffic gets routed through it!) -- if they don't have an own firewall, an external HTTP proxy is needed that has an endpoint exposed as Service Type NodePort diff --git a/versioned_docs/version-v0.21.8/contributing/01-Proposals/MEP5/shared.drawio b/versioned_docs/version-v0.21.8/contributing/01-Proposals/MEP5/shared.drawio deleted file mode 100644 index aa7af045..00000000 --- a/versioned_docs/version-v0.21.8/contributing/01-Proposals/MEP5/shared.drawio +++ /dev/null @@ -1,121 +0,0 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - \ No newline at end of file diff --git a/versioned_docs/version-v0.21.8/contributing/01-Proposals/MEP5/shared.png b/versioned_docs/version-v0.21.8/contributing/01-Proposals/MEP5/shared.png deleted file mode 100644 index b0b47f03..00000000 Binary files a/versioned_docs/version-v0.21.8/contributing/01-Proposals/MEP5/shared.png and /dev/null differ diff --git a/versioned_docs/version-v0.21.8/contributing/01-Proposals/MEP5/shared_advanced.drawio b/versioned_docs/version-v0.21.8/contributing/01-Proposals/MEP5/shared_advanced.drawio deleted file mode 100644 index 6f96eca0..00000000 --- a/versioned_docs/version-v0.21.8/contributing/01-Proposals/MEP5/shared_advanced.drawio +++ /dev/null @@ -1,187 +0,0 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - \ No newline at end of file diff --git a/versioned_docs/version-v0.21.8/contributing/01-Proposals/MEP5/shared_advanced.png b/versioned_docs/version-v0.21.8/contributing/01-Proposals/MEP5/shared_advanced.png deleted file mode 100644 index da989915..00000000 Binary files a/versioned_docs/version-v0.21.8/contributing/01-Proposals/MEP5/shared_advanced.png and /dev/null differ diff --git a/versioned_docs/version-v0.21.8/contributing/01-Proposals/MEP6/README.md b/versioned_docs/version-v0.21.8/contributing/01-Proposals/MEP6/README.md deleted file mode 100644 index edf52a6c..00000000 --- a/versioned_docs/version-v0.21.8/contributing/01-Proposals/MEP6/README.md +++ /dev/null @@ -1,123 +0,0 @@ ---- -slug: /MEP-6-dmz-networks -title: MEP-6 -sidebar_position: 6 ---- - -# DMZ Networks - -## Reasoning - -To fulfill higher levels of security measures the standard metal-stack approach with a single firewall in front of a set of machines might be insufficient. -There are cases where two physically distinct firewalls in front of application workload are mandatory. In traditional network terms this is known as DMZ approach. - -For Kubernetes workloads it makes sense to use the front cluster for ingress, WAF purposes and as outgoing proxy. The clusters may be used for application workload. - -## DMZ network - -- Use a separate DMZ network prefix for every tenant -- This is used as intermediate network btw. 
private networks of a tenant and the internet -- For every partition a distinct DMZ firewall/cluster is needed for a tenant -- For Gardener orchestrated Kubernetes clusters this network must be a publicly reachable internet prefix because shoot clusters need a vpn service that is used for instrumentation from the seed cluster - this will be a requirement as long as the inverse vpn tunnel feature Konnectivity is not available to us. - -## Approach 1: DMZ with publicly reachable internet prefix - -![DMZ Internet](dmz-internet_public.svg) - -A DMZ network with publicly reachable internet prefix will look like this in the metal-api: - -```yaml ---- -description: DMZ-Network -destinationprefixes: - - 0.0.0.0/0 -id: dmz -labels: - network.metal-stack.io/default-external: "" -name: DMZ-Network -parentnetworkid: null -partitionid: "" -prefixes: - - 212.90.30.128/25 -privatesuper: false -projectid: "" -vrf: 104007 -vrfshared: false -nat: true -shared: false -underlay: false -``` - -### DMZ firewall - -The firewall of the DMZ will intersect its private network for attached machines, the DMZ network and the public internet. - -- The private network of the project needs to import - - the default route from the internet network - - the DMZ network -- The internet network must import the DMZ network -- The DMZ network provides the default route for tenant's clusters in a partition. It imports the default route from the internet network - -### Application Firewall - -The firewall of application workloads intersects its private network for attached machines and the DMZ network. - -This is currently supported by the metal-networker and needs no further changes! 
- -## Approach 2: DMZ with private IPs - -![DMZ Internet](dmz-internet_private.svg) - -A DMZ network with private IPs will look like this in the metal-api: - -```yaml ---- -description: DMZ-Network -destinationprefixes: - - 0.0.0.0/0 -id: dmz -labels: - network.metal-stack.io/default-external: "" -name: DMZ-Network -parentnetworkid: tenant-super-network-fra-equ01 -partitionid: fra-equ01 -prefixes: - - 10.90.30.128/25 -privatesuper: false -projectid: "" -vrf: 4711 -vrfshared: false -nat: true -shared: true # it's usable from multiple projects -underlay: false -``` - -### DMZ firewall - -The firewall of the DMZ will intersect its private network for attached machines, the DMZ network and the public internet. - -- The private network of the project needs to import - - the default route from the internet network - - the DMZ network -- The internet network must import the DMZ network (only locally, no-export) -- The DMZ network provides the default route for tenant's clusters in a partition. It imports the default route from the internet network - -### Application Firewall - -The firewall of application workloads intersects its private network for attached machines and the DMZ network. 
- -## Code Changes / Implications - -- `metal-networker` and `metal-ccm` assume that there is only one network providing the default-route -- `metal-networker` needs to - - import the default route from the internet network to the dmz network (DMZ Firewall) - - import the DMZ network to the internet network and adjusting NAT rules (DMZ Firewall) - - import destination prefixes of the DMZ network to the private primary network (DMZ Firewall, Application Firewall) - - import DMZ-IPs of the private primary network to the DMZ network (DMZ Firewall, Application Firewall) -- `metal-api`: destination prefixes of private networks need to be configurable (`allocateNetwork`) -- `gardener-extension-provider-metal`: needs to be able to delete DMZ clusters (but skip the network deletion part) -- the application firewall is not publicly reachable - for debugging purposes a hop over the DMZ firewall is needed - -## Decision - -We decided to follow the second approach with private DMZ networks. diff --git a/versioned_docs/version-v0.21.8/contributing/01-Proposals/MEP6/dmz-internet_private.drawio b/versioned_docs/version-v0.21.8/contributing/01-Proposals/MEP6/dmz-internet_private.drawio deleted file mode 100644 index 7b83bbfc..00000000 --- a/versioned_docs/version-v0.21.8/contributing/01-Proposals/MEP6/dmz-internet_private.drawio +++ /dev/null @@ -1,178 +0,0 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - \ No newline at end of file diff --git a/versioned_docs/version-v0.21.8/contributing/01-Proposals/MEP6/dmz-internet_private.svg b/versioned_docs/version-v0.21.8/contributing/01-Proposals/MEP6/dmz-internet_private.svg deleted file mode 100644 index f5e58204..00000000 --- 
a/versioned_docs/version-v0.21.8/contributing/01-Proposals/MEP6/dmz-internet_private.svg +++ /dev/null @@ -1,3 +0,0 @@ -
Machine
Machine
Firewall DMZ
Firewall DMZ
DMZ VRF
DMZ VRF
Machine
Machine
Firewall A
Firewall A
Private VRF A
Private VRF A
10.0.0.2
10.90.30.129
/0 via Firewall A
10.0.0.2...
VRF A 10.0.0.1
VRF A 10.0.0.1
DMZ Network
10.90.30.128/25
DMZ Network...
Private Network
10.0.0.0/24
Private Network...
import /0
import /0
import 10.0.0.0/24
import 10.0.0.0/24 -
Machine
Machine
Firewall B
Firewall B
Private VRF B
Private VRF B
10.0.1.2
/0 via Firewall B
10.0.1.2...
VRF B 10.0.1.1
VRF B 10.0.1.1
Private Network
10.0.1.0/24
Private Network...
import /0
import /0
import 10.0.1.0/24
import 10.0.1.0/24 -
10.90.30.129 is reachable
/0 via Firewall DMZ
10.0.0.0/24 is reachable
10.0.1.0/24 is reachable
10.90.30.129 is reachable...
Internet
212.1.1.0/27
Internet...
SNAT to 212.1.1.1
SNAT to 212.1.1.1
Internet VRF
Internet VRF
import /0
import /0

import 10.0.0.0/24 no export
import 10.0.1.0/24 no export
import 10.90.30.128/25 no export
import 10.0.0.0/24 no exp...
Viewer does not support full SVG 1.1
\ No newline at end of file diff --git a/versioned_docs/version-v0.21.8/contributing/01-Proposals/MEP6/dmz-internet_public.drawio b/versioned_docs/version-v0.21.8/contributing/01-Proposals/MEP6/dmz-internet_public.drawio deleted file mode 100644 index 544939e5..00000000 --- a/versioned_docs/version-v0.21.8/contributing/01-Proposals/MEP6/dmz-internet_public.drawio +++ /dev/null @@ -1,184 +0,0 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - \ No newline at end of file diff --git a/versioned_docs/version-v0.21.8/contributing/01-Proposals/MEP6/dmz-internet_public.svg b/versioned_docs/version-v0.21.8/contributing/01-Proposals/MEP6/dmz-internet_public.svg deleted file mode 100644 index 5e825081..00000000 --- a/versioned_docs/version-v0.21.8/contributing/01-Proposals/MEP6/dmz-internet_public.svg +++ /dev/null @@ -1,3 +0,0 @@ -
Machine
Machine
Firewall DMZ
Firewall DMZ
DMZ VRF
DMZ VRF
Machine
Machine
Firewall A
Firewall A
Private VRF A
Private VRF A
10.0.0.2
212.1.2.3
/0 via Firewall A
10.0.0.2...
VRF A 10.0.0.1
VRF A 10.0.0.1
DMZ Network
212.1.2.0/27
DMZ Network...
Private Network
10.0.0.0/24
Private Network...
import /0
import /0
import 10.0.0.0/24
import 10.0.0.0/24 -
Machine
Machine
Firewall B
Firewall B
Private VRF B
Private VRF B
10.0.1.2
/0 via Firewall B
10.0.1.2...
VRF B 10.0.1.1
VRF B 10.0.1.1
Private Network
10.0.1.0/24
Private Network...
import /0
import /0
import 10.0.1.0/24
import 10.0.1.0/24 -
212.1.2.3 is reachable
/0 via Firewall DMZ
212.1.2.3 is reachable...
Internet
212.1.1.0/27 212.1.2.0/27
Internet...
SNAT to 212.1.1.1
SNAT to 212.1.1.1
Internet VRF
Internet VRF
import /0
import /0
import 212.1.2.0/27
import 10.0.0.0/24 no redistribute
import 10.0.1.0/24 no redistribute

import 212.1.2.0/27...
SNAT to
212.1.2.1
SNAT to...
SNAT to
212.1.2.2
SNAT to...
Viewer does not support full SVG 1.1
\ No newline at end of file diff --git a/versioned_docs/version-v0.21.8/contributing/01-Proposals/MEP8/README.md b/versioned_docs/version-v0.21.8/contributing/01-Proposals/MEP8/README.md deleted file mode 100644 index 14748fae..00000000 --- a/versioned_docs/version-v0.21.8/contributing/01-Proposals/MEP8/README.md +++ /dev/null @@ -1,503 +0,0 @@ ---- -slug: /MEP-7-configurable-filesystem-layout-for-machine-allocation -title: MEP-7 -sidebar_position: 7 ---- - -# Configurable Filesystem layout for Machine Allocation - -The current implementation uses a hard coded filesystem layout depending on the specified size and image. This is done in the metal-hammer. This worked well in the past because we had a small amount of sizes and images. But we reached a point where this is to restricted for all use cases we have to fulfill. It also forces us to modify the metal-hammer source code to support a new filesystem layout. - -This proposal tries to address this issue by introducing a filesystem layout struct in the metal-api which is then configurable per machine allocation. -The original behavior of automatic filesystem layout decision must still be present, because there must be no API change for existing API consumers. It should be a additional feature during machine allocation. - -## API and behavior - -The API will get a new endpoint `filesystemlayouts`to create/update/delete a set of available `filesystemlayouts`. - -### Constraints - -In order to keep the actual machine allocation api compatible, there must be no difference while allocating a machine. To achieve this every -`filesystemlayout` defines constraints which specifies for which combination of `sizes` and `images` this layout should be used by default. -The specified constraints over all `filesystemlayouts` therefore must be collision free, to be more specific, there must be exactly one layout outcome -for every possible combination of `sizes` and `images`. 
- -The `size` constraint must be a list of the exact size ids, the `image` constraint must be a map of os to semver compatible version constraint. For example: - -- `debian: ">= 10.20210101"` or `debian: "< 10.20210101"` - -The general form of a `image` constraint is a map from `os` to `versionconstraint` where: - -`os` must match the first part of the image without the version. -`versionconstraint` must be the comparator, a space and the version, or simply `*` to match all versions of this `os`. -The comparator must be one of: "=", "!=", ">", "<", ">=", "=>", "<=", "=<", "~", "~>", "^" - -It must also be possible to have a `filesystemlayout` in development or for other special purposes, which can be specified during the machine allocation. -To have such a layout, both constraints `sizes` and `images`must be empty list. - -### Reinstall - -The current reinstall implementation the metal-hammer detects during the installation on which disk the OS was installed and reports back to the metal-api the Report struct which has two properties `primarydisk` and `ospartition`. -Both fields are not required anymore because the logic is now shifted to the `filesystemlayout` definition. If `Disk.WipeOnReinstall` is set to true, this disk will be wiped, default is false and is preserved. - -### Handling of s2-xlarge machines - -These machines are a bit special compared to our `c1-*` machines because they have rotating hard disks for the mass storage purpose. -The downside is that the on board SATA-DOM has the same naming as the HDDs and can not be specified as the first /dev/sda disk because all HDDs are also /dev/sd\* disks. -Therefore we had a special SATA-DOM detection algorithm inside metal-hammer which simply checks for the smallest /dev/sd disk and took this to install the OS. - -This is not possible with the current approach, but we figured out that the SATA-DOM is always `/dev/sde`. 
So we can create a special `filesystemlayout` where the installations is made on this disk. - -### Possible Filesystemlayout hierarchies - -It is only possible to create a filesystem on top of a block device. The creation of a block device can be done on multiple ways, depending on the requirements regarding performance, space and redundancy of the filesystem. -It also depends on the disks available on the server. - -The current approach implements the following hierarchies: - -![filesystems](filesystems.png) - -### Implementation - -```go -// FilesystemLayout to be created on the given machine -type FilesystemLayout struct { - // ID unique layout identifier - ID string - // Description is human readable - Description string - // Filesystems to create on the server - Filesystems []Filesystem - // Disks to configure in the server with their partitions - Disks []Disk - // Raid if not empty, create raid arrays out of the individual disks, to place filesystems onto - Raid []Raid - // VolumeGroups to create - VolumeGroups []VolumeGroup - // LogicalVolumes to create on top of VolumeGroups - LogicalVolumes []LogicalVolume - // Constraints which must match to select this Layout - Constraints FilesystemLayoutConstraints -} - -type FilesystemLayoutConstraints struct { - // Sizes defines the list of sizes this layout applies to - Sizes []string - // Images defines a map from os to versionconstraint - // the combination of os and versionconstraint per size must be conflict free over all filesystemlayouts - Images map[string]string -} - -type RaidLevel string -type Format string -type GPTType string - -// Filesystem defines a single filesystem to be mounted -type Filesystem struct { - // Path defines the mountpoint, if nil, it will not be mounted - Path *string - // Device where the filesystem is created on, must be the full device path seen by the OS - Device string - // Format is the type of filesystem should be created - Format Format - // Label is optional enhances 
readability - Label *string - // MountOptions which might be required - MountOptions []string - // CreateOptions during filesystem creation - CreateOptions []string -} - -// Disk represents a single block device visible from the OS, required -type Disk struct { - // Device is the full device path - Device string - // Partitions to create on this device - Partitions []Partition - // WipeOnReinstall, if set to true the whole disk will be erased if reinstall happens - // during fresh install all disks are wiped - WipeOnReinstall bool -} - -// Raid is optional, if given the devices must match. -// TODO inherit GPTType from underlay device ? -type Raid struct { - // ArrayName of the raid device, most often this will be /dev/md0 and so forth - ArrayName string - // Devices the devices to form a raid device - Devices []Device - // Level the raidlevel to use, can be one of 0,1,5,10 - // TODO what should be support - Level RaidLevel - // CreateOptions required during raid creation, example: --metadata=1.0 for uefi boot partition - CreateOptions []string - // Spares defaults to 0 - Spares int -} - - -// VolumeGroup is optional, if given the devices must match. 
-type VolumeGroup struct { - // Name of the volumegroup without the /dev prefix - Name string - // Devices the devices to form a volumegroup device - Devices []string - // Tags to attach to the volumegroup - Tags []string -} - -// LogicalVolume is a block devices created with lvm on top of a volumegroup -type LogicalVolume struct { - // Name the name of the logical volume, without /dev prefix, will be accessible at /dev/vgname/lvname - Name string - // VolumeGroup the name of the volumegroup - VolumeGroup string - // Size of this LV in mebibytes (MiB) - Size uint64 - // LVMType can be either striped or raid1 - LVMType LVMType -} - -// Partition is a single partition on a device, only GPT partition types are supported -type Partition struct { - // Number of this partition, will be added to the device once partitioned - Number int - // Label to enhance readability - Label *string - // Size given in MebiBytes (MiB) - // if "0" is given the rest of the device will be used, this requires Number to be the highest in this partition - Size string - // GPTType defines the GPT partition type - GPTType *GPTType -} - -const ( - // VFAT is used for the UEFI boot partition - VFAT = Format("vfat") - // EXT3 is usually only used for /boot - EXT3 = Format("ext3") - // EXT4 is the default fs - EXT4 = Format("ext4") - // SWAP is for the swap partition - SWAP = Format("swap") - // None - NONE = Format("none") - - // GPTBoot EFI Boot Partition - GPTBoot = GPTType("ef00") - // GPTLinux Linux Partition - GPTLinux = GPTType("8300") - // GPTLinuxRaid Linux Raid Partition - GPTLinuxRaid = GPTType("fd00") - // GPTLinux Linux Partition - GPTLinuxLVM = GPTType("8e00") - - // LVMTypeLinear append across all physical volumes - LVMTypeLinear = LVMType("linear") - // LVMTypeStriped stripe across all physical volumes - LVMTypeStriped = LVMType("striped") - // LVMTypeStripe mirror with raid across all physical volumes - LVMTypeRaid1 = LVMType("raid1") -) -``` - -Example `metalctl` outputs: - 
-```bash -$ metalctl filesystemlayouts ls -ID DESCRIPTION SIZES IMAGES -default default fs layout c1-large-x86, c1-xlarge-x86 debian >=10, ubuntu >=20.04, centos >=7 -ceph fs layout for ceph s2-large-x86, s2-xlarge-x86 debian >=10, ubuntu >=20.04 -firewall firewall fs layout c1-large-x86, c1-xlarge-x86 firewall >=2 -storage storage fs layout s3-large-x86 centos >=7 -s3 storage fs layout s2-xlarge-x86 debian >=10, ubuntu >=20.04, >=firewall-2 -default-devel devel fs layout -``` - -The `default` layout reflects what is actually implemented in metal-hammer to guarantee backward compatibility. - -```yaml ---- -id: default -constraints: - sizes: - - c1-large-x86 - - c1-xlarge-x86 - images: - debian: ">=10" - ubuntu: ">=20.04" - centos: ">=7" -filesystems: - - path: "/boot/efi" - device: "/dev/sda1" - format: "vfat" - options: "-F 32" - label: "efi" # required to be compatible with old images - - path: "/" - device: "/dev/sda2" - format: "ext4" - label: "root" # required to be compatible with old images - - path: "/var/lib" - device: "/dev/sda3" - format: "ext4" - label: "varlib" # required to be compatible with old images - - path: "/tmp" - device: "tmpfs" - format: "tmpfs" - mountoptions: - [ - "defaults", - "noatime", - "nosuid", - "nodev", - "noexec", - "mode=1777", - "size=512M", - ] -disks: - - device: "/dev/sda" - wipe: true - partitions: - - number: 1 - label: "efi" - size: 500 - type: GPTBoot - - number: 2 - label: "root" - size: 5000 - type: GPTLinux - - number: 3 - label: "varlib" - size: 0 # to end of partition - type: GPTLinux -``` - -The `firewall` layout reuses the built in nvme disk to store the logs, which is way faster and larger than what the sata-dom ssd provides. 
- -```yaml ---- -id: firewall -constraints: - sizes: - - c1-large-x86 - - c1-xlarge-x86 - images: - firewall: ">=2" -filesystems: - - path: "/boot/efi" - device: "/dev/sda1" - format: "vfat" - options: "-F 32" - - path: "/" - device: "/dev/sda2" - format: "ext4" - - path: "/var" - device: "/dev/nvme0n1p1" - format: "ext4" -disks: - - device: "/dev/sda" - wipe: true - partitions: - - number: 1 - label: "efi" - size: 500 - type: GPTBoot - - number: 2 - label: "root" - size: 5000 - type: GPTLinux - - device: "/dev/nvme0n1" - wipe: true - partitions: - - number: 1 - label: "var" - size: 0 - type: GPTLinux -``` - -The `storage` layout will be used for the storage servers, which must have mirrored boot disks. - -```yaml ---- -id: storage -constraints: - sizes: - - s3-large-x86 - images: - centos: ">=7" -filesystems: - - path: "/boot/efi" - device: "/dev/md1" - format: "vfat" - options: "-F32" - - path: "/" - device: "/dev/md2" - format: "ext4" -disks: - - device: "/dev/sda" - wipe: true - partitions: - - number: 1 - label: "efi" - size: 500 - type: GPTLinuxRaid - - number: 2 - label: "root" - size: 5000 - type: GPTLinuxRaid - - device: "/dev/sdb" - wipe: true - partitions: - - number: 1 - label: "efi" - size: 500 - type: GPTLinuxRaid - - number: 2 - label: "root" - size: 5000 - type: GPTLinuxRaid -raid: - - name: "/dev/md1" - level: 1 - devices: - - "/dev/sda1" - - "/dev/sdb1" - options: "--metadata=1.0" - - name: "/dev/md2" - level: 1 - devices: - - "/dev/sda2" - - "/dev/sdb2" - options: "--metadata=1.0" -``` - -The `s3-storage` layout matches the special situation on the s2-xlarge machines. 
- -```yaml ---- -id: s3-storage -constraints: - sizes: - - c1-large-x86 - - s2-xlarge-x86 - images: - debian: ">=10" - ubuntu: ">=20.04" - centos: ">=7" -filesystems: - - path: "/boot/efi" - device: "/dev/sde1" - format: "vfat" - options: "-F 32" - - path: "/" - device: "/dev/sde2" - format: "ext4" - - path: "/var/lib" - device: "/dev/sde3" - format: "ext4" -disks: - - device: "/dev/sde" - wipe: true - partitions: - - number: 1 - label: "efi" - size: 500 - type: GPTBoot - - number: 2 - label: "root" - size: 5000 - type: GPTLinux - - number: 3 - label: "varlib" - size: 0 # to end of partition - type: GPTLinux -``` - -A sample `lvm` layout which puts `/var/lib` as stripe on the nvme device - -```yaml ---- -id: lvm -description: "lvm layout" -constraints: - size: - - s2-xlarge-x86 - images: - debian: ">=10" - ubuntu: ">=20.04" - centos: ">=7" -filesystems: - - path: "/boot/efi" - device: "/dev/sda1" - format: "vfat" - createoptions: - - "-F 32" - label: "efi" - - path: "/" - device: "/dev/sda2" - format: "ext4" - label: "root" - - path: "/var/lib" - device: "/dev/vg00/varlib" - format: "ext4" - label: "varlib" - - path: "/tmp" - device: "tmpfs" - format: "tmpfs" - mountoptions: - [ - "defaults", - "noatime", - "nosuid", - "nodev", - "noexec", - "mode=1777", - "size=512M", - ] -volumegroups: - - name: "vg00" - devices: - - "/dev/nvmne0n1" - - "/dev/nvmne0n2" -logicalvolumes: - - name: "varlib" - volumegroup: "vg00" - size: 200 - lvmtype: "striped" -disks: - - device: "/dev/sda" - wipeonreinstall: true - partitions: - - number: 1 - label: "efi" - size: 500 - gpttype: "ef00" - - number: 2 - label: "root" - size: 5000 - gpttype: "8300" - - device: "/dev/nvmne0n1" - wipeonreinstall: false - - device: "/dev/nvmne0n2" - wipeonreinstall: false -``` - -## Components which requires modifications - -- metal-hammer: - - change implementation from build in hard coded logic - - move logic to create fstab from install.sh to metal-hammer -- metal-api: - - new endpoint 
`filesystemlayouts` - - add optional spec of `filesystemlayout` during `allocation` with validation if given `filesystemlayout` is possible on given size. - - add `allocation.filesystemlayout` in the response, based on either the specified `filesystemlayout` or the calculated one. - - implement `filesystemlayouts` validation for: - - matching to disks in the size - - no overlapping with the sizes/imagefilter specified in `filesystemlayouts` - - all devices specified exists from top to bottom (fs -> disks -> device || fs -> raid -> devices) -- metalctl: - - implement `filesystemlayouts` -- metal-go: - - adopt api changes -- metal-images: - - install mdadm for raid support diff --git a/versioned_docs/version-v0.21.8/contributing/01-Proposals/MEP8/filesystems.drawio b/versioned_docs/version-v0.21.8/contributing/01-Proposals/MEP8/filesystems.drawio deleted file mode 100644 index 0f0c6ab5..00000000 --- a/versioned_docs/version-v0.21.8/contributing/01-Proposals/MEP8/filesystems.drawio +++ /dev/null @@ -1,43 +0,0 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - \ No newline at end of file diff --git a/versioned_docs/version-v0.21.8/contributing/01-Proposals/MEP8/filesystems.png b/versioned_docs/version-v0.21.8/contributing/01-Proposals/MEP8/filesystems.png deleted file mode 100644 index 6d903b7e..00000000 Binary files a/versioned_docs/version-v0.21.8/contributing/01-Proposals/MEP8/filesystems.png and /dev/null differ diff --git a/versioned_docs/version-v0.21.8/contributing/01-Proposals/MEP9/README.md b/versioned_docs/version-v0.21.8/contributing/01-Proposals/MEP9/README.md deleted file mode 100644 index a8cae83d..00000000 --- a/versioned_docs/version-v0.21.8/contributing/01-Proposals/MEP9/README.md +++ /dev/null @@ -1,132 +0,0 @@ ---- -slug: /MEP-9-no-open-ports-to-the-data-center -title: MEP-9 -sidebar_position: 9 ---- - -# No Open Ports To the Data Center - -Our metal-stack partitions typically have open ports for metal-stack 
native services, these are: - -- SSH port on the firewalls -- bmc-reverse-proxy for serial console access through the metal-console - -These open ports are potential security risks. For example, while SSH access is possible only with private key it's still vulnerable to DoS attack. - -Therefore, we want to get rid off these open ports to reduce the attack surface to the data center. - -## Requirements - -- Access to firewall SSH only via VPN -- Easy to update VPN components - -As a next step, we can also consider joining the management servers to the VPN mesh, which would replace typical WireGuard setups for operators to enter resources inside the partition. - -## High Level Design - -[](./architecture.svg) - -> Simplified drawing showing old vs. new architecture. - -### Concerns - -There's few concerns when using WireGuard for implementing VPN: - -1. WireGuard doesn't implement dynamic cipher substitution. Which is important in case one of the crypto methods, used by WireGuard will be broken. The only possible solution for that will be to update WireGuard to a fixed version. -2. Coordination server(Headscale) is a single point of failure. In case it fails, it potentially can disconnect existing members of the network, as WireGuard can't manage dynamic IPs by itself. -3. Headscale is already falls behind Tailscale coordination server implementation. Which can complicate the upgrade to newer version of Tailscale client in case of emergency. - -### Solutions to concerns - -1. Tailscale node software is using userspace implementation of WireGuard -- `wireguard-go`. One of the options is to inject Tailscale client into `metalctl`. And make it available as `metalctl vpn` or similar command. It should be possible to do as `tailscale` node is already available as open sourced Go pkg. That would allow us to control, what version of Tailscale users are using and in case of any critical changes to enforce them to update `metalctl` to use VPN functionality. -2. 
Would it be a considerable risk? We could look into `wg-dynamic` project to cover this problem. -3. At the moment, repository looks well maintained and the metal-stack team already contributes to it. - -## Implementation Details - -### metal-roles - -`metal-roles` will be responsible for deployment of `headscale` server(via new `headscale` role). It also should provide sufficient config to `metal-api` so it establishes connection with `headscale` gRPC server. - -### New `metalctl` commands - -`metalctl` will be responsible for client-side implementation of this MEP. Specifically, it's by using `metalctl` user expected to connect to firewalls. - -- `metalctl vpn` -- section for VPN related commands: - - `metalctl vpn get key [vpn name] --namespace [namespace name]` -- returns auth key to be used with `tailscale` client for establishing connection. - -Extend `metalctl firewall`: - -- `metalctl firewall ssh [ID]` -- connect to firewall via SSH. - -Extend `metalctl machine`: - -- `metalctl machine ssh [ID]` -- connect to machine via SSH. - -`metalctl` will be able to connect to firewall and machines by running `tailscale` in container. - -### metal-api - -Updates to `metal-api` should be made, so that it's able to add firewalls to VPNs. There should be one Tailscale namespace per project. So if multiple firewalls are created in single project, they will join the same namespace. - -Two new flags should be introduced to connect `metal-api` to `headscale` gRPC server: - -- `headscale-addr` -- specifies address of Headscale grpc API. -- `headscale-api-key` -- specifies temporary API key to connect to Headscale. It should be replaced and then rotated by `metal-api`. - -If `metal-api` initialized with `headscale` connection it should automatically join all created firewalls to VPN. - -Add new endpoint, that will be used by `metalctl` to connect to VPN: - -- `/v1/vpn GET` -- requests auth key from `headscale` server. 
- -### metal-hammer - -`metal-hammer` acts as an intermediary for machine configuration between `metal-api` and machine's image. Specifically it writes to `/etc/metal/install.yaml` file, data from which later will be used by image's `install.sh` file. - -To implement VPN support we have to add authentication key and VPN server address to `install.yaml` file. This key will be used to join machine to a VPN. - -### metal-images - -Images `install.sh` script have to be updated to work with authentication key and VPN server address, provided in `install.yaml` file. If this key is present, machine should connect to VPN. - -### metal-networker - -`metal-networker` also have to know if VPN was configured. In that case we need to disable public access to SSH and allow all(?) traffic from WireGuard interface. - -### firewall-controller - -`firewall-controller` have to monitor changes in `Firewall` resource and keep `tailscaled` version up-to-date. - -### Resources - -Update `Firewall` resource to include desired/actual `tailscale` version: - -``` -Firewall: - Spec: - tailscale: - Version: Minimal version - ... - Status: - ... - VPN: - Status: Boolean field - tailscale: - Version: Actual version - ... -``` - -### bmc-reverse-proxy - -TODO - -## References - -1. [WireGuard: Next Generation Secure Network Tunnel](https://www.youtube.com/watch?v=88GyLoZbDNw) -2. [How Tailscale works](https://tailscale.com/blog/how-tailscale-works) -3. [Tailscale is officially SOC 2 compliant](https://tailscale.com/blog/soc2) -4. [Why not Wireguard](https://www.ipfire.org/blog/why-not-wireguard) -5. [Wireguard: Known Limitations](https://www.wireguard.com/known-limitations/) -6. [Wireguard: Things That Might Be Accomplished](https://www.wireguard.com/todo/) -7. 
[Headscale: Tailscale control protocol v2](https://github.com/juanfont/headscale/issues/526) diff --git a/versioned_docs/version-v0.21.8/contributing/01-Proposals/MEP9/architecture.drawio b/versioned_docs/version-v0.21.8/contributing/01-Proposals/MEP9/architecture.drawio deleted file mode 100644 index adb09214..00000000 --- a/versioned_docs/version-v0.21.8/contributing/01-Proposals/MEP9/architecture.drawio +++ /dev/null @@ -1,324 +0,0 @@ - - - - - - - -
-
-
- Metal Control Plane -
-
-
-
- - Metal Control Plane - -
-
- - - - -
-
-
- metal-stack -
- Partition -
-
-
-
- - metal-stack... - -
-
- - - - -
-
-
- firewall -
-
-
-
- - firewall - -
-
- - - - - -
-
-
- machine -
-
-
-
- - machine - -
-
- - - - -
-
-
- ssh -
-
-
-
- - ssh - -
-
- - - - -
-
-
- bmc-proxy -
-
-
-
- - bmc-proxy - -
-
- - - - - - - - - - -
-
-
- Metal Control Plane -
-
-
-
- - Metal Control Plane - -
-
- - - - -
-
-
- metal-stack -
- Partition -
-
-
-
- - metal-stack... - -
-
- - - - -
-
-
- firewall -
-
-
-
- - firewall - -
-
- - - - - -
-
-
- machine -
-
-
-
- - machine - -
-
- - - - -
-
-
- ssh -
-
-
-
- - ssh - -
-
- - - - - - -
-
-
- bmc-proxy -
-
-
-
- - bmc-proxy - -
-
- - - - -
-
-
- headscale -
-
-
-
- - headscale - -
-
- - - - - - - - - - -
-
-
- tailscaled -
-
-
-
- - tailscaled - -
-
- - - - - - -
-
-
- tailscaled -
-
-
-
- - tailscaled - -
-
- - - - -
-
-
- Internet -
-
-
-
- - Internet - -
-
- - - - -
-
-
- Internet -
-
-
-
- - Internet - -
-
-
- - - - - Viewer does not support full SVG 1.1 - - - -
diff --git a/versioned_docs/version-v0.21.8/contributing/01-Proposals/MEP9/architecture.svg b/versioned_docs/version-v0.21.8/contributing/01-Proposals/MEP9/architecture.svg deleted file mode 100644 index fd268d2f..00000000 --- a/versioned_docs/version-v0.21.8/contributing/01-Proposals/MEP9/architecture.svg +++ /dev/null @@ -1 +0,0 @@ -
Metal Control Plane
Metal Control Plane
metal-stack
Partition
metal-stack...
firewall
firewall
machine
machine
ssh
ssh
bmc-proxy
bmc-proxy
Metal Control Plane
Metal Control Plane
metal-stack
Partition
metal-stack...
firewall
firewall
machine
machine
ssh
ssh
bmc-proxy
bmc-proxy
headscale
headscale
tailscaled
tailscaled
tailscaled
tailscaled
Internet
Internet
Internet
Internet
Text is not SVG - cannot display
\ No newline at end of file diff --git a/versioned_docs/version-v0.21.8/contributing/01-Proposals/_category_.json b/versioned_docs/version-v0.21.8/contributing/01-Proposals/_category_.json deleted file mode 100644 index 2e7fa4bf..00000000 --- a/versioned_docs/version-v0.21.8/contributing/01-Proposals/_category_.json +++ /dev/null @@ -1,4 +0,0 @@ -{ - "position": 1, - "label": "Enhancement Proposals" -} \ No newline at end of file diff --git a/versioned_docs/version-v0.21.8/contributing/01-Proposals/index.md b/versioned_docs/version-v0.21.8/contributing/01-Proposals/index.md deleted file mode 100644 index 9f3ef30d..00000000 --- a/versioned_docs/version-v0.21.8/contributing/01-Proposals/index.md +++ /dev/null @@ -1,45 +0,0 @@ ---- -slug: /enhancement-proposals -title: Enhancement Proposals -sidebar_position: 1 ---- - -# Metal Stack Enhancement Proposals (MEPs) - -This section contains proposals which address substantial modifications to metal-stack. - -Every proposal has a short name which starts with _MEP_ followed by an incremental, unique number. Proposals should be raised as pull requests in the [docs](https://github.com/metal-stack/docs) repository and can be discussed in Github issues. - -The list of proposals and their current state is listed in the table below. - -Possible states are: - -- `In Discussion` -- `Accepted` -- `Declined` -- `In Progress` -- `Completed` -- `Aborted` - -Once a proposal was accepted, an issue should be raised and the implementation should be done in a separate PR. 
- -| Name | Description | State | -| :------------------------ | :--------------------------------------------- | :-------------: | -| [MEP-1](MEP1/README.md) | Distributed Control Plane Deployment | `Declined` | -| [MEP-2](MEP2/README.md) | Two Factor Authentication | `Aborted` | -| [MEP-3](MEP3/README.md) | Machine Re-Installation to preserve local data | `Completed` | -| [MEP-4](MEP4/README.md) | Multi-tenancy for the metal-api | `In Progress` | -| [MEP-5](MEP5/README.md) | Shared Networks | `Completed` | -| [MEP-6](MEP6/README.md) | DMZ Networks | `Completed` | -| MEP-7 | Passing environment variables to machines | `Declined` | -| [MEP-8](MEP8/README.md) | Configurable Filesystemlayout | `Completed` | -| [MEP-9](MEP9/README.md) | No Open Ports To the Data Center | `Completed` | -| [MEP-10](MEP10/README.md) | SONiC Support | `Completed` | -| [MEP-11](MEP11/README.md) | Auditing ^of metal-stack resources | `Completed` | -| [MEP-12](MEP12/README.md) | Rack Spreading | `Completed` | -| [MEP-13](MEP13/README.md) | IPv6 | `Completed` | -| [MEP-14](MEP14/README.md) | Independence from external sources | `Completed` | -| MEP-15 | HAL Improvements | `In Discussion` | -| [MEP-16](MEP16/README.md) | Firewall Support for Cluster API Provider | `In Discussion` | -| [MEP-17](MEP17/README.md) | Global Network View | `In Discussion` | -| [MEP-18](MEP18/README.md) | Autonomous Control Plane | `In Discussion` | diff --git a/versioned_docs/version-v0.21.8/contributing/02-planning-meetings.md b/versioned_docs/version-v0.21.8/contributing/02-planning-meetings.md deleted file mode 100644 index ef602204..00000000 --- a/versioned_docs/version-v0.21.8/contributing/02-planning-meetings.md +++ /dev/null @@ -1,51 +0,0 @@ ---- -slug: /planning-meetings -title: Planning Meetings -sidebar_position: 2 ---- - -# Planning Meetings - -Public planning meetings are held **biweekly** on **odd calendar weeks** from **14:00 to 14:30** on Microsoft Teams. 
The purpose is to provide an overview of our current projects and priorities, as well as to discuss new topics and issues within the group. - -Our [development planning board](https://github.com/orgs/metal-stack/projects/34) can be found on GitHub. - -You can use [this link](https://teams.microsoft.com/l/meetup-join/19%3ameeting_ZTVmNWFkYjYtMzVmYi00ZTMxLTk5ZTUtMGFjYjU2OTk0MjQz%40thread.v2/0?context=%7b%22Tid%22%3a%22f9d9b921-8f78-466d-95fd-4495e73d8d65%22%2c%22Oid%22%3a%228ac2a791-e637-4a90-8505-0a1ee175ebfc%22%7d) to join. If you want to get an invitation to the event, please drop us a line on our Slack channel. - -Planning meetings are currently not recorded. The meetings are held either in English or German depending on the attendees. - -:::info -Note that anyone can contribute to metal-stack without participating in planning meetings. However, if you want to speed up the review process for your requirements, it might be helpful to attend the meetings. -::: - -## Agenda - -Here is the agenda that we generally want to follow in a planning meeting: - -- Possibility to bring up news that are interesting for every developer of the metal-stack org -- Check `Done` column and archive cards - - Attendees have the chance to briefly present achievements if they want -- Check the `In Progress` column and discuss whether these tasks are still worked on, there were significant blockers or they can be lower-prioritized -- Check new issues labelled with `triage` and prioritize them -- Allow attendees to bring up issues and prioritize them - - Attendees have the chance to briefly present these new issues - -## Idea Backlog - -The backlog contains ideas of what could become part of the roadmap in the future. The list is ordered alphabetically. Therefore, the order does not express the importance or weight of a backlog item. - -We incorporate community feedback into the roadmap. If you think that important points are missing in the backlog, please share your ideas with us. 
We have a Slack channel. Please check out [metal-stack.io](https://metal-stack.io) for contact information. - -:::danger -By no means this list is a promise of what is being worked on in the near future. It is just a summary of ideas that was agreed on to be "nice to have". It is up to the investors, maintainers and the community to choose topics from this list and to implement them or to remove them from the list. -::: - -- Add metal-stack to [Gardener conformance test grid](https://testgrid.k8s.io/gardener-all) -- Autoscaler for metal control plane components -- CI dashboard and public integration testing -- Improved release and deploy processes (GitOps, [Spinnaker](https://spinnaker.io/), [Flux](https://fluxcd.io/)) -- Machine internet without firewalls -- metal-stack dashboard (UI) -- Offer our metal-stack extensions as enterprise products (accounting, cluster-api, S3) (neither of them will ever be required for running metal-stack, they just add extra value for certain enterprises) -- Partition managed by Kubernetes (with Kubelets joining the control plane cluster) -- Public offering / demo playground diff --git a/versioned_docs/version-v0.21.8/contributing/03-contribution-guideline.md b/versioned_docs/version-v0.21.8/contributing/03-contribution-guideline.md deleted file mode 100644 index 15a73d0d..00000000 --- a/versioned_docs/version-v0.21.8/contributing/03-contribution-guideline.md +++ /dev/null @@ -1,147 +0,0 @@ ---- -slug: /contribution-guideline -title: Contribution Guideline -sidebar_position: 3 ---- - -# Contribution Guideline - -This document describes the way we want to contribute code to the projects of metal-stack, which are hosted on [github.com/metal-stack](https://github.com/metal-stack). - -The document is meant to be understood as a general guideline for contributions, but not as burden to be placed on a developer. Use your best judgment when contributing code. 
Try to be as clean and precise as possible when writing code and try to make your code as maintainable and understandable as possible for other people. - -Even if it should go without saying, we live an open culture of discussion, in which everybody is welcome to participate. We treat every contribution with respect and objectiveness with the general aim to write software of quality. - -If you want, feel free to propose changes to this document in a pull request. - -## How Can I Contribute? - -Open a Github issue in the project you would like to contribute. Within the issue, your idea can be discussed. It is also possible to directly create a pull request when the set of changes is relatively small. - -When opening an issue please consider the following aspects: - -1. Create a meaningful issue describing the WHY? of your contribution. -1. Try to set appropriate labels to the issue. For example, attach the `triage` label to your issue if you want it to be discussed in the next [planning meeting](./02-planning-meetings.md). It might be useful to attend the meeting if you want to emphasize it being worked on. - -### Pull Requests - -The process described here has several goals: - -- Maintain quality -- Enable a sustainable system to review contributions -- Enable documented and reproducible addition of contributions - -1. Create a repository fork within the context of that issue. Members of the organization may work on the repository directly without a fork, which allows building development artifacts more easily. -1. Develop, document and test your contribution (try not to solve more than one issue in a single pull request). -1. Create a Draft Pull Request to the repository's main branch. -1. Create a meaningful description of the pull request or reference the related issue. The pull request template explains what the content should include, please read it. -1. Ask for merging your contribution by removing the draft marker. 
Repository maintainers (see [Code Ownership](#code-ownership)) are notified automatically, but you can also reach out to people directly on Slack if you want a review from a specific person. - -## General Objectives - -This section contains language-agnostic topics that all metal-stack projects are trying to follow. - -### Code Ownership - -The code base is owned by the entire team and every member is allowed to contribute changes to any of the projects. This is considered as collective code ownership[^1]. - -As a matter of fact, there are persons in a project, which already have experience with the sources. These are defined directly in the repository's [CODEOWNERS](https://docs.github.com/en/repositories/managing-your-repositorys-settings-and-features/customizing-your-repository/about-code-owners) file. If you want to merge changes into the master branch, it is advisable to include code owners into the process of discussion and merging. - -### Microservices - -One major ambition of metal-stack is to follow the idea of [microservices](https://en.wikipedia.org/wiki/Microservices). This way, we want to achieve that we can - -- adapt to changes faster than with monolithic architectures, -- be free of restrictions due to certain choices of technology, -- leverage powerful traits of cloud infrastructures (e.g. high-scalability, high-availability, ...). - -### Programming Languages - -We are generally open to write code in any language that fits best to the function of the software. However, we encourage [golang](https://en.wikipedia.org/wiki/Go_(programming_language)) to be the main language of metal-stack as we think that it makes development faster when not establishing too many different languages in our architecture. Reason for this is that we are striving for consistent behavior of the microservices, similar to what has been described for the Twelve-Factor App (see [12 Factor](https://12factor.net/)). 
We help enforcing unified behavior by allowing a small layer of shared code for every programming language. We will refer to this shared code as "libraries" for the rest of this document. - -### Artifacts - -Artifacts are always produced by a CI process (Github Actions). - -Docker images are published on the Github Container Registry of the metal-stack organization. - -Binary artifacts or OS images can be uploaded to `images.metal-stack.io` if necessary. - -When building Docker images, please consider our build tool [docker-make](https://github.com/fi-ts/docker-make) or the specific [docker-make action](https://github.com/fi-ts/action-docker-make) respectively. - -### APIs - -We are currently making use of [Swagger](https://swagger.io/) when we exposing traditional REST APIs for end-users. This helps us with being technology-agnostic as we can generate clients in almost any language using [go-swagger](https://goswagger.io/). Swagger additionally simplifies the documentation of our APIs. - -Most APIs though are not required to be user-facing but are of technical nature. These are preferred to be implemented using [grpc](https://grpc.io/). - -#### Versioning - -Artifacts are versioned by tagging the respective repository with a tag starting with the letter `v`. After the letter, there stands a valid [semantic version](https://semver.org/). - -### Documentation - -In order to make it easier for others to understand a project, we document general information and usage instructions in a `README.md` in any project. - -In addition to that, we document a microservice in the [docs](https://github.com/metal-stack/docs) repository. The documentation should contain the reasoning why this service exists and why it was being implemented the way it was being implemented. 
The aim of this procedure is to reduce the time for contributors to comprehend architectural decisions that were made during the process of writing the software and to clarify the general purpose of this service in the entire context of the software. - -## Guidelines - -This chapter describes general guidelines on how to develop and contribute code for a certain programming language. - -### Golang - -Development follows the official guide to: - -- Write clear, idiomatic Go code[^2] -- Learn from mistakes that must not be repeated[^3] -- Apply appropriate names to your artifacts: - - [https://go.dev/talks/2014/names.slide](https://go.dev/talks/2014/names.slide) - - [https://go.dev/blog/package-names](https://go.dev/blog/package-names) - - [https://go.dev/doc/effective_go#names](https://go.dev/doc/effective_go#names) -- Enable others to understand the reasoning of non-trivial code sequences by applying a meaningful documentation. - -#### Development Decisions - -- **Dependency Management** by using Go modules -- **Build and Test Automation** by using [GNU Make](https://man7.org/linux/man-pages/man1/make.1p.html). -- **End-user APIs** should consider using go-swagger and [Go-Restful](https://github.com/emicklei/go-restful) - **Technical APIs** should consider using [grpc](https://grpc.io/) - -#### Libraries - -metal-stack maintains several libraries that you should utilize in your project in order to unify common behavior. Some of these projects are: - -- [metal-go](https://github.com/metal-stack/metal-go) -- [metal-lib](https://github.com/metal-stack/metal-lib) - -#### Error Handling with Generated Swagger Clients - -From the server-side you should ensure that you are returning the common error json struct in case of an error as defined in the `metal-lib/httperrors`. Ensure you are using `go-restful >= v2.9.1` and `go-restful-openapi >= v0.13.1` (allows default responses with error codes other than 200). 
- -### Documentation - -We want to share knowledge and keep things simple. If things cannot kept simple we want to enable everybody to understand them by: - -- Document in short sentences[^4]. -- Do not explain the HOW (this is already documented by your code and documenting the obvious is considered a defect). -- Explain the WHY. Add a "to" in your documentation line to force yourself to explain the reasonning (e.g. "` to `"). - -### Python - -Development follows the official guide to: - -- Style Guide for Python Code (PEP 8)[^5] - - The use of an IDE like [PyCharm](https://www.jetbrains.com/pycharm/) helps to write compliant code easily -- Consider [setuptools](https://pythonhosted.org/an_example_pypi_project/setuptools.html) for packaging -- If you want to add a Python microservice to the mix, consider [pyinstaller](https://github.com/pyinstaller/pyinstaller) on Alpine to achieve small image sizes - -[^1]: [https://martinfowler.com/bliki/CodeOwnership.html](https://martinfowler.com/bliki/CodeOwnership.html) - -[^2]: [https://go.dev/doc/effective_go](https://go.dev/doc/effective_go) - -[^3]: [https://github.com/golang/go/wiki/CodeReviewComments](https://github.com/golang/go/wiki/CodeReviewComments) - -[^4]: [https://github.com/golang/go/wiki/CodeReviewComments#comment-sentences](https://github.com/golang/go/wiki/CodeReviewComments#comment-sentences) - -[^5]: [https://www.python.org/dev/peps/pep-0008/](https://www.python.org/dev/peps/pep-0008/) diff --git a/versioned_docs/version-v0.21.8/contributing/04-release-flow.md b/versioned_docs/version-v0.21.8/contributing/04-release-flow.md deleted file mode 100644 index 2a6403b7..00000000 --- a/versioned_docs/version-v0.21.8/contributing/04-release-flow.md +++ /dev/null @@ -1,107 +0,0 @@ ---- -slug: /release-flow -title: Release Flow -sidebar_position: 4 ---- - -# Releases - -The metal-stack contains of many microservices that depend on each other. 
The automated release flow is there to ensure that all components work together flawlessly for every metal-stack release. - -Releases and integration tests are published through our [release repository](https://github.com/metal-stack/releases). You can also find the [release notes](https://github.com/metal-stack/releases/releases) for this metal-stack version in there. The release notes contain information about new features, upgrade paths and bug fixes. - -If you want, you can sign up at our Slack channel where we are announcing every new release. Often, we provide additional information for metal-stack administrators and adopters at this place, too. - -This document is intended for developers, especially maintainers of metal-stack projects. - -## Release Flow - -The following diagram attempts to describe our current release flow: - -![](release_flow.svg) - -A release is created in the following way: - -- Individual repository maintainers within the metal-stack GitHub Organization can publish a release of their component. -- This release is automatically pushed to the `develop` branch of the release repository by the metal-robot. -- A push triggers a virtual release integration test using the mini-lab environment. This setup launches metal-stack with the `sonic` and `gardener` flavors to validate the different Ansible roles and execute basic operations across the metal-stack layer. -- To contribute components that are not directly part of the release vector, a pull request must be made against the `develop` branch of the release repository. Release maintainers may push directly to the `develop` branch. -- The release maintainers can `/freeze` the `develop` branch, effectively stopping the metal-robot from pushing component releases to this branch. -- The `develop` branch is tagged by a release maintainer with a `-rc.x` suffix to create a __release candidate__. 
-- The release candidate must pass a large integration test suite on a real environment, which is currently run by FI-TS. It tests the entire machine provisioning engine including the integration with Gardener, the deployment, metal-images and Kubernetes conformance tests. -- If the integration tests pass, the PR of the `develop` branch must be approved by at least two release maintainers. -- A release is created via GitHub releases, including all release notes, with a tag on the `main` branch. - -## FAQ - -**Question: I need PR #xyz to go into the release, why did you not include it?** - -Answer: It's not on purpose if we miss a PR to be included into a metal-stack release. Please use the pending pull request from `develop` into `master` as soon as it is open and comment which pull request you want to have included into the release. Also consider attending our planning meetings or contact us in our Slack channel if you have urgent requirements that need to be dealt with. - -**Question: Who is responsible for the releases? Who can freeze a release?** - -Answer: Every repository in metal-stack has a `CODEOWNERS` file pointing to a maintainer team. This is also true for the releases repository. Only release repository maintainers are allowed to `/freeze` a release (meaning the metal-robot does not automatically append new component releases to the release vector anymore). - -**Question: I can't push to the `develop` branch of this repository? How can I request changes to the release vector?** - -Answer: Most changes are automatically integrated by the metal-robot. For manually managed components, please raise a pull request against the `develop` branch. Only release maintainers are allowed to push to `develop` as otherwise it would be possible to mess up the release pipeline. 
- -**Question: What requirements need to be fulfilled to add a repository to the release vector?** - -Please see the section below named [Requirements for Release Vector Repositories](#requirements-for-release-vector-repositories). - -### Requirements for Release Vector Repositories - -Before adding a repository in the metal-stack org to the releases repository, it is advised for the maintainer to fulfill the following points: - -- The following files should be present at the repository root: - - [CODEOWNERS](https://docs.github.com/en/repositories/managing-your-repositorys-settings-and-features/customizing-your-repository/about-code-owners) - - When a repository is created, the metal-robot automatically creates a -maintainers team in our GitHub org. - - The CODEOWNERS file should reference this team. - - The team should contain at least two maintainers. - - `LICENSE` - - This usually should be MIT with "metal-stack" as authors. - - `CONTRIBUTING.md` - - This should contain the following content: - ``` - # Contributing - - Please check out the [contributing section](https://docs.metal-stack.io/stable/development/contributing/) in our [docs](https://docs.metal-stack.io/). - ``` - - `README.md` -- The `developers-core` team should be given repository access with `write` role, the codeowners team should have the `maintain` role -- Release artifacts should have an SPDX-formatted SBOM attached. - - For container images these are embedded using Buildx. -- The following branch protection rules should be set: - - The mainline should be protected. - - A pull request should be required before merging (required by at least one code owner). - - Status checks should be required to pass. - - Force push should not be allowed on this branch. -- One person from the releases maintainers has to add the repository to the metal-robot in order to pick up the releases, add them to the release vector and generate release notes. 
- -### How-To Release a Project - -[release-drafter](https://github.com/release-drafter/release-drafter) is preferred in order to generate release notes from merged PRs for your projects. It should be triggered for pushes on your main branch. - -The draft is then used to create a project release. The release has to be published through the GitHub UI as demonstrated in the screenshot below. - -**Tagging the repository is not enough as repository tagging does not associate your release notes to your release!** - -![](release.png) - -Some further remarks: - -- Use semver versions with `v` prefix for your tags -- Name your release after your release tag -- The metal-robot only picks up lines from your release notes that start with `-` or `*` (unordered list items) and appends them to the according section in the aggregated release draft -- A tag created through a GitHub UI release does not trigger a `push` event . This means, your pipeline will not start to run with the `push` trigger when publishing through the UI. - - Instead, use the `published` [release event trigger](https://docs.github.com/en/actions/reference/events-that-trigger-workflows#release) for your actions: - - ```yaml - on: - release: - types: - - published - ``` -- In case they are necessary, please do not forget to include `NOTEWORTHY`, `ACTIONS_REQUIRED` or `BREAKING_CHANGE` sections into releases. More information on those release draft sections can be read in a pull request template. 
diff --git a/versioned_docs/version-v0.21.8/contributing/05-community.md b/versioned_docs/version-v0.21.8/contributing/05-community.md deleted file mode 100644 index 61eaf099..00000000 --- a/versioned_docs/version-v0.21.8/contributing/05-community.md +++ /dev/null @@ -1,11 +0,0 @@ ---- -slug: /community -title: Community -sidebar_position: 5 -draft: true ---- - -# Community - -(Slack channel, community events like FOSDEM, Kubernetes Community Days..., blog -articles) diff --git a/versioned_docs/version-v0.21.8/contributing/release.png b/versioned_docs/version-v0.21.8/contributing/release.png deleted file mode 100644 index 598b1182..00000000 Binary files a/versioned_docs/version-v0.21.8/contributing/release.png and /dev/null differ diff --git a/versioned_docs/version-v0.21.8/contributing/release_flow.drawio b/versioned_docs/version-v0.21.8/contributing/release_flow.drawio deleted file mode 100644 index 6ca6b34f..00000000 --- a/versioned_docs/version-v0.21.8/contributing/release_flow.drawio +++ /dev/null @@ -1,721 +0,0 @@ - - - - - - - - - - - -
-
-
- Review release notes -
-
-
-
- - Review release notes - -
-
-
- - - - - - - -
-
-
- projects -
-
-
-
- - projects - -
-
-
- - - - - - - -
-
-
- projects -
-
-
-
- - projects - -
-
-
- - - - - - - - -
-
-
- Organization Webhook -
-
-
-
- - Organization Webhook - -
-
-
- - - - - - - -
-
-
- projects -
-
-
-
- - projects - -
-
-
- - - - - - - - -
-
-
- - Publish release - -
-
-
-
- - Publish release - -
-
-
- - - - - - - - -
-
-
- Maintainer -
-
-
-
- - Maint... - -
-
-
- - - - - - - - - - - - - - - -
-
-
- metal-robot release handler -
-
-
-
- - metal-robot release han... - -
-
-
- - - - - - - - -
-
-
- - no - -
-
-
-
- - no - -
-
-
- - - - - - - - -
-
-
- - yes - -
-
-
-
- - yes - -
-
-
- - - - - - - -
-
-
- version in event newer than release vector version -
-
-
-
- - version in event newer than... - -
-
-
- - - - - - - -
-
-
- - do nothing - -
-
-
-
- - do nothing - -
-
-
- - - - - - - - - - - - -
-
-
- Github Action -
-
-
-
- - Github Action - -
-
-
- - - - - - - -
-
-
- Bump version in release vector and push to - - develop - -
-
-
-
- - Bump version in release vector... - -
-
-
- - - - - - - - - - - -
-
-
- Open pull request from - - develop - - to - - master - -
-
-
-
- - Open pull request from develop... - -
-
-
- - - - - - - -
-
-
- Update aggregated release draft in - - metal-stack/releases - -
-
-
-
- - Update aggregated release draf... - -
-
-
- - - - - - - - - - - - - - - -
-
-
- Integration Testing -
-
-
-
- - Integration Testing - -
-
-
- - - - - - - - - - - -
-
-
- Merge to - - master - -
-
-
-
- - Merge to master - -
-
-
- - - - - - - - - - - - -
-
-
- Review -
-
-
-
- - Review - -
-
-
- - - - - - - - - - - - - - - -
-
-
- Tests suceeded and PR changes reviewed -
-
-
-
- - Tests suceeded and PR chang... - -
-
-
- - - - - - - -
-
-
- - publish results to #integration - -
-
-
-
- - publish results to #integr... - -
-
-
- - - - - - - - - - - - - - - -
-
-
- Release metal-stack -
-
-
-
- - Release metal-stack - -
-
-
- - - - - - - - - - - -
-
-
- - publish to #announcements - -
-
-
-
- - publish to #announcements - -
-
-
- - - - - - - -
-
-
- - - metal-stack/docs - - pull request - -
-
-
-
- - metal-stack/docs pull requ... - -
-
-
- - - - - - - - - - - - -
-
-
- Freeze -
-
-
-
- - Freeze - -
-
-
- - - - - - - - - - - - - - - -
-
-
- Freeze - - develop - - and create a release candidate -
-
-
-
- - Freeze develop and create a rel... - -
-
-
- - - - - - - -
-
-
- Large integration suites -
- - (currently owned by FI-TS, not public) - -
-
-
-
-
- - Large integration suites... - -
-
-
- - - - - - - - -
-
-
- Run -
-
-
-
- - Run - -
-
-
- - - - -
- - - - - Text is not SVG - cannot display - - - -
diff --git a/versioned_docs/version-v0.21.8/contributing/release_flow.svg b/versioned_docs/version-v0.21.8/contributing/release_flow.svg deleted file mode 100644 index 55cdd493..00000000 --- a/versioned_docs/version-v0.21.8/contributing/release_flow.svg +++ /dev/null @@ -1 +0,0 @@ -
Review release notes
Review release notes
projects
projects
projects
projects
Organization Webhook
Organization Webhook
projects
projects
Publish release
Publish release
Maintainer
Maint...
metal-robot release handler
metal-robot release han...
no
no
yes
yes
version in event newer than release vector version
version in event newer than...
do nothing
do nothing
Github Action
Github Action
Bump version in release vector and push todevelop
Bump version in release vector...
Open pull request fromdeveloptomaster
Open pull request from develop...
Update aggregated release draft inmetal-stack/releases
Update aggregated release draf...
Integration Testing
Integration Testing
Merge tomaster
Merge to master
Review
Review
Tests suceeded and PR changes reviewed
Tests suceeded and PR chang...
publish results to #integration
publish results to #integr...
Release metal-stack
Release metal-stack
publish to #announcements
publish to #announcements
metal-stack/docspull request
metal-stack/docs pull requ...
Freeze
Freeze
Freezedevelopand create a release candidate
Freeze develop and create a rel...
Large integration suites
(currently owned by FI-TS, not public)
Large integration suites...
Run
Run
Text is not SVG - cannot display
\ No newline at end of file diff --git a/versioned_docs/version-v0.21.8/docs/02-General/04-flavors-of-metalstack.md b/versioned_docs/version-v0.21.8/docs/02-General/04-flavors-of-metalstack.md index 7da427fc..2277ca6b 100644 --- a/versioned_docs/version-v0.21.8/docs/02-General/04-flavors-of-metalstack.md +++ b/versioned_docs/version-v0.21.8/docs/02-General/04-flavors-of-metalstack.md @@ -14,7 +14,7 @@ As modern infrastructure and cloud native applications are designed with Kuberne Regardless which flavor of metal-stack you use, it is always possible to manually provision machines, networks and ip addresses. This is the most basic way of using metal-stack and is very similar to how traditional bare metal infrastructures are managed. -Using plain metal-stack without additional layer was not a focus in the past. Therefore firewall and role management might be premature. These will be addressed by [MEP-4](../../contributing/01-Proposals/MEP4/README.md) and [MEP-16](../../contributing/01-Proposals/MEP16/README.md) in the future. +Using plain metal-stack without additional layer was not a focus in the past. Therefore firewall and role management might be premature. These will be addressed by [MEP-4](/community/MEP-4-multi-tenancy-for-the-metal-api) and [MEP-16](/community/MEP-16-metal-api-as-an-alternative-configuration-source-for-the-firewall-controller) in the future. ## Gardener diff --git a/versioned_docs/version-v0.21.8/docs/04-For Operators/03-deployment-guide.mdx b/versioned_docs/version-v0.21.8/docs/04-For Operators/03-deployment-guide.mdx index d45ff631..208e0aa2 100644 --- a/versioned_docs/version-v0.21.8/docs/04-For Operators/03-deployment-guide.mdx +++ b/versioned_docs/version-v0.21.8/docs/04-For Operators/03-deployment-guide.mdx @@ -31,7 +31,7 @@ You can use the [mini-lab](https://github.com/metal-stack/mini-lab) as a templat The metal control plane is typically deployed in a Kubernetes cluster. 
Therefore, this document will assume that you have a Kubernetes cluster ready for getting deployed. Even though it is theoretically possible to deploy metal-stack without Kubernetes, we strongly advise you to use the described method because we believe that Kubernetes gives you a lot of benefits regarding the stability and maintainability of the application deployment. :::tip -For metal-stack it does not matter where your control plane Kubernetes cluster is located. You can of course use a cluster managed by a hyperscaler. This has the advantage of not having to setup Kubernetes by yourself and could even become beneficial in terms of fail-safe operation. However, we also describe a solution of how to setup metal-stack with a self-hosted, [Autonomous Control Plane](../../contributing/01-Proposals/MEP18/README.md) cluster. The only requirement from metal-stack is that your partitions can establish network connections to the metal control plane. If you are interested, you can find a reasoning behind this deployment decision [here](../05-Concepts/01-architecture.mdx#target-deployment-platforms). +For metal-stack it does not matter where your control plane Kubernetes cluster is located. You can of course use a cluster managed by a hyperscaler. This has the advantage of not having to setup Kubernetes by yourself and could even become beneficial in terms of fail-safe operation. However, we also describe a solution of how to setup metal-stack with a self-hosted, [Autonomous Control Plane](/community/MEP-18-autonomous-control-plane) cluster. The only requirement from metal-stack is that your partitions can establish network connections to the metal control plane. If you are interested, you can find a reasoning behind this deployment decision [here](../05-Concepts/01-architecture.mdx#target-deployment-platforms). 
::: Let's start off with a fresh folder for your deployment: diff --git a/versioned_docs/version-v0.21.8/docs/05-Concepts/01-architecture.mdx b/versioned_docs/version-v0.21.8/docs/05-Concepts/01-architecture.mdx index 709960e3..75298df9 100644 --- a/versioned_docs/version-v0.21.8/docs/05-Concepts/01-architecture.mdx +++ b/versioned_docs/version-v0.21.8/docs/05-Concepts/01-architecture.mdx @@ -152,4 +152,4 @@ Thus, for creating a partition as well as a machine or a firewall, the flags `dn In order to be fully offline resilient, make sure to check out `metal-image-cache-sync`. This component provides copies of `metal-images`, `metal-kernel` and `metal-hammer`. -This feature is related to [MEP14](../../contributing/01-Proposals/MEP14/README.md). +This feature is related to [MEP14](/community/MEP-14-independence-from-external-sources). diff --git a/versioned_docs/version-v0.21.8/docs/05-Concepts/02-user-management.md b/versioned_docs/version-v0.21.8/docs/05-Concepts/02-user-management.md index f1ee2778..ba742ee9 100644 --- a/versioned_docs/version-v0.21.8/docs/05-Concepts/02-user-management.md +++ b/versioned_docs/version-v0.21.8/docs/05-Concepts/02-user-management.md @@ -7,7 +7,7 @@ sidebar_position: 2 # User Management At the moment, metal-stack can more or less be seen as a low-level API that does not scope access based on projects and tenants. -Fine-grained access control with full multi-tenancy support is actively worked on in [MEP4](../../contributing/01-Proposals/MEP4/README.md). +Fine-grained access control with full multi-tenancy support is actively worked on in [MEP4](/community/MEP-4-multi-tenancy-for-the-metal-api). Until then projects and tenants can be created, but have no effect on access control. 
diff --git a/versioned_docs/version-v0.21.8/docs/06-For CISOs/Security/01-principles.md b/versioned_docs/version-v0.21.8/docs/06-For CISOs/Security/01-principles.md index 680b95d8..3f3c8794 100644 --- a/versioned_docs/version-v0.21.8/docs/06-For CISOs/Security/01-principles.md +++ b/versioned_docs/version-v0.21.8/docs/06-For CISOs/Security/01-principles.md @@ -15,7 +15,7 @@ The minimal need to know principle is a security concept that restricts access t ### RBAC :::info -As of now metal-stack does not implement fine-grained Role-Based Access Control (RBAC) within the `metal-api` but this is worked on in [MEP-4](../../../contributing/01-Proposals/MEP4/README.md). +As of now metal-stack does not implement fine-grained Role-Based Access Control (RBAC) within the `metal-api` but this is worked on in [MEP-4](/community/MEP-4-multi-tenancy-for-the-metal-api). ::: As described in our [User Management](../../05-Concepts/02-user-management.md) concept the [metal-api](https://github.com/metal-stack/metal-api) currently offers three different user roles for authorization: diff --git a/versioned_docs/version-v0.21.8/docs/06-For CISOs/Security/04-communication-matrix.md b/versioned_docs/version-v0.21.8/docs/06-For CISOs/Security/04-communication-matrix.md index 07df2607..24c1bc1d 100644 --- a/versioned_docs/version-v0.21.8/docs/06-For CISOs/Security/04-communication-matrix.md +++ b/versioned_docs/version-v0.21.8/docs/06-For CISOs/Security/04-communication-matrix.md @@ -116,7 +116,7 @@ Please note that every [networking setup](../../05-Concepts/03-Network/01-theory | VLAN | Switches, Firewalls | Layer 2 traffic segmentation. | | VXLAN | Switches, Firewalls | Encapsulate Layer 2 frames in Layer 3 packets for network virtualization. | | EVPN | Switches, Firewalls | Overlay network technology for scalable and flexible network architectures. | -| VPN | Firewalls | Management access [without open SSH ports](../../../contributing/01-Proposals/MEP9/README.md). 
| +| VPN | Firewalls | Management access [without open SSH ports](/community/MEP-9-no-open-ports-to-the-data-center). | +| BGP | Multiple | Routing protocol for dynamic routing and network management. | | SSH | Management Server, Switches | Secure shell access for management and configuration. | | LLDP | Switches, Machines | Link Layer Discovery Protocol for network device discovery. | diff --git a/versioned_docs/version-v0.21.8/docs/06-For CISOs/rbac.md b/versioned_docs/version-v0.21.8/docs/06-For CISOs/rbac.md index 9a87b896..06c902bb 100644 --- a/versioned_docs/version-v0.21.8/docs/06-For CISOs/rbac.md +++ b/versioned_docs/version-v0.21.8/docs/06-For CISOs/rbac.md @@ -31,4 +31,4 @@ To ensure that internal components interact securely with the metal-api, metal-s Users can interact with the metal-api using [metalctl](https://github.com/metal-stack/metalctl), the command-line interface provided by metal-stack. Depending on the required operations, users should authenticate with the appropriate role to match their level of access. -As part of [MEP-4](../../contributing/01-Proposals/MEP4/README.md), significant work is underway to introduce more fine-grained access control mechanisms within metal-stack, enhancing the precision and flexibility of permission management. +As part of [MEP-4](/community/MEP-4-multi-tenancy-for-the-metal-api), significant work is underway to introduce more fine-grained access control mechanisms within metal-stack, enhancing the precision and flexibility of permission management. 
diff --git a/versioned_docs/version-v0.21.9/contributing/01-Proposals/MEP1/Distributed-API-Working.png b/versioned_docs/version-v0.21.9/contributing/01-Proposals/MEP1/Distributed-API-Working.png deleted file mode 100644 index 899e223d..00000000 Binary files a/versioned_docs/version-v0.21.9/contributing/01-Proposals/MEP1/Distributed-API-Working.png and /dev/null differ diff --git a/versioned_docs/version-v0.21.9/contributing/01-Proposals/MEP1/Distributed-API.png b/versioned_docs/version-v0.21.9/contributing/01-Proposals/MEP1/Distributed-API.png deleted file mode 100644 index 688c7c2e..00000000 Binary files a/versioned_docs/version-v0.21.9/contributing/01-Proposals/MEP1/Distributed-API.png and /dev/null differ diff --git a/versioned_docs/version-v0.21.9/contributing/01-Proposals/MEP1/Distributed-Deployment.png b/versioned_docs/version-v0.21.9/contributing/01-Proposals/MEP1/Distributed-Deployment.png deleted file mode 100644 index 8bba51b8..00000000 Binary files a/versioned_docs/version-v0.21.9/contributing/01-Proposals/MEP1/Distributed-Deployment.png and /dev/null differ diff --git a/versioned_docs/version-v0.21.9/contributing/01-Proposals/MEP1/Distributed.drawio b/versioned_docs/version-v0.21.9/contributing/01-Proposals/MEP1/Distributed.drawio deleted file mode 100644 index f7c6fe79..00000000 --- a/versioned_docs/version-v0.21.9/contributing/01-Proposals/MEP1/Distributed.drawio +++ /dev/null @@ -1 +0,0 @@ 
-7V3bcts2EP0aP8pDgPfH2ImbziRTt+m0zVMHpmCJMUUoFG1L/fqC4kUkAEoULwDkRC8WIBIkF2d3z+6C8JV5u9r+kqD18jOZ4+gKGvPtlfn+CtKPY9M/Wc8u7/FtM+9YJOE87wKHji/hf7joNIre53CON40DU0KiNFw3OwMSxzhIG30oSchr87BHEjWvukYLzHV8CVDE9/4dztNl3uvZxqH/Iw4Xy/LKwCh+eUDB0yIhz3FxvStoPu4/+c8rVI5VHL9Zojl5rXWZH67M24SQNP+22t7iKJNtKbb8vLuWX6v7TnCcdjkhWVoPJP5zF3/w/trOvG+vv70+z4rJe0HRMy4fw4noeDePhA5L5YmC/Afn+3N2pzcfcfSC0zBAh67s4dJd1DwuO3+22U/3O3oAgOtt/Qxnkf29R0kapiGJy6vS288vnP9eSK4aHFIhrrOvz6voLkEr+vXmdRmm+Ms6v89XClLat0xXEW0B+nU/QziTgUFb1SRkjYCswqD4HqEHHN1UU3pLIpLQn2ISZ9fYpAl5qvABise7Q6swymD/F07mKEZFd4FxkA2LonAR00ZAJwkn1RO94CTF29aJBBU8qNphssJpsqOHFCfMbLeAVKFywC0x+VpDcNm3rKHX9opOVGjNohr9gBz6pQDPGUACkEPSBif0OWkfHGEayyfzMqlWj2QaI4kUcCK1eJGajkikYASRrrboX/K79Qh+vYe77+Ef8RMBM4+TaECCp4SgYClDoJz0BDJuF6jFCNQ0O8oTGiPI055D4NsPc8+Yo0cAwMy0OJHhOfUDRZMk6ZIsSIyiD4dexnTUZDdHm+W+H3SwHNTE3YXZne5HwfH8Xea1souucZz3NH+v24+OZqZ1xrKHPDpfCY5QGr40naFI9PtT6a2jXe2ANQnjdFMb+T7rOMDAAoxaGdBvOqkzT6Bf8nsQn256LadXd7whz0mAi9MYQFVi6a+zrsCfDsTd4ZhPhKwL0H3DaborEICeU9LE5x50JcyCCG02mZ9rYBEcQ00upCOPWdAGOt4Cp0eOc8ZG4SCDypOdmkE1ADdTpVGlPGFNtTkTUuXQI/yYNTfUvobx4tO+9d50xrKeVhPHlsDBAyiwns5Uzsg5Krt2Dy9fdpUMVMgOuCh4QLZredg2fedhBji5MQT7bObckZJzBNt498OQ7HKm3Wm4jW0zXsbmEWaL6LdlTqWegLdesv1MC+Hp72T8SSgMxxkoN2yhquUYuZubjDP4nImgI6JohtahRmbVYsxqlcBR5pLKkPMtYb4U6uSgh23xmSTQlw9aRz3apJmNT5FGsIeedrA3j1ExzfMC0G6BnZS0gFieloCivcGYDXQN2oBeURu4mLCNtRXqozZwMWEbSy80kJ0ol6ModLv5GbqNgjIuza9B5OYp9zbjs1hJoRub7qVs4tqofWBzwKkp7UUEcmx6TD2hrQrkb0gDJqq/8PUSnk8r1IAKjXoHdWx2XQMVgAKuwerEoXLIhAd8dw3neBum/2R4vva8ovl137SL1vttgfZ9Y1dr3OMkpM+X+eWiNkmfNR8LOGW7NljWPIy2b+3qLXa8TurVllE/Hca4HVWw4fv5SS/7hmZc2KyxYzNoailNCkYymJHY2HhqNb/kDAS3MgFUpFBdDgL+IDkI2DUHAfXKQcCLyUFwpWMABSuZJHu3i8lCcMVjHaR3Mg8hbY3mzxLyVCVkv3Miwp0MZ9omIg4UtuSsX2vkVsxfB/goVXXnAxGRxeMuImHBEzZDoCxybXKZDdRPV/rj3pSUsuBKz9Jxb15GmoKiTD/g84mKywn9gK9f5GfysbRy0zJF5FcuwD8Z+Zn22GZo2PzwkbmmkV/1opk+oYt5PGzWKPCzWFOrgfD4qPln/fnC4z4AtAv7LNEKddYB/Zilh6PpmNOOrGsKU1H9AVg+g6neBQhQJR0lMXhLVIGIN4QCVhuXwr9TKqTvIm2fzKdYGr6f1vqiZKXxdkPhT6h7f822Or/VZnXU7IE
qa9sN/Hjsy9tTKxmfHvoJlrPB4koCi8nSf8Pyr53Dx5WLHZ75o3V4nacX6ewFT9ch0chWM6badQSW2pVpqUvd11DXpGbj7dELwS3qw754LjspafPhnobJeMC9YK88Jeko8Uo1j+Oe5XL0UzGna0RTsu7JKwSA8WU+u8fKxLs4OKauxrd1lk/zEElvFtpmv7k7OdCe0Hjm4WNLtc8Onwiu7POMzn2WW9DGTHM1U19Q6QCmVDMtWgS0D9mv12WmcfZwiiHS50+bWjOCtNglU1WgVRMWFMXpk70U4vBxOi8spERYM2hoJy1tV64McMqSVqFwhQ/ZvNfhsww6FuNN/RahlHekcxKUyxSrz4G6auOFqtGtejEtKZS31o0tfPVlhTPTsBlEWdkrTwda6HQyX+fuZMc/QQHa9hvltrLzGmc0t7IbbQM6vjKSJd6Uswb2VU31rMG9JELP5l3U83lXSYJSiRmdPPdosablaKDb1VTyywfdPgH0uZaSf5rjhpJbBi3HTvLhaNNOqglF2bWxUs2keGNnTk7Vvs7ti9+02dfZZsEld19noUQhJ1EVlvSsFZ+MBTwZ0gqfu2AmdVIqPM4aaGQHTc7RV1tHXu45ENoMvxRuZjJZRNo+c5KWew4SHrcBgHLRqdkEY0TtFhSRhMf5KrUb8gost1bYY3WK1NnJNxdUNT181nuaEvi4hhf4ULn1UJvTOrcG3r++PYoy+BehDNLy4sO0wWTLtOq1QZMdPUdELOjKnZUittxtUpkVgr2sEKjZoNKSyTBDnSd1aEDUK43DRheGb9cBcvL4MhvZdrx7/PjBWZ+jIq9ZBmo4E7KlkqHgNUH+2tGpF1rVcw4otfwoliX//6mUqH/FJdzuZKI3cxlT/btycqX5EMGmlhdKNaESrtl5lod67l5GnjPC7P+QZIuaFjx+xkRmmw8ML8Jss2km9Ua73GjuMhIgvWz7iMor2q7uCEDHXzbB/vMJg90zcrzFWWIB6OHjVUwpHDqlw/SUf39Kx1QYYMtr6oN/qDoI7ctPFJm4zpnhiUycrdrECZLOGubZ9FO0Mu/3hnxD18SwWtdwZNe+KdatjVws8UTAtaQCF7414JYb2p0mNbZK5Ir23dMXuRy38UT/JmAk5NJmQrLtlw6uLUHr5Wcyx9kR/wM= \ No newline at end of file diff --git a/versioned_docs/version-v0.21.9/contributing/01-Proposals/MEP1/Distributed.png b/versioned_docs/version-v0.21.9/contributing/01-Proposals/MEP1/Distributed.png deleted file mode 100644 index d96ca216..00000000 Binary files a/versioned_docs/version-v0.21.9/contributing/01-Proposals/MEP1/Distributed.png and /dev/null differ diff --git a/versioned_docs/version-v0.21.9/contributing/01-Proposals/MEP1/README.md b/versioned_docs/version-v0.21.9/contributing/01-Proposals/MEP1/README.md deleted file mode 100644 index 0fd4bb63..00000000 --- a/versioned_docs/version-v0.21.9/contributing/01-Proposals/MEP1/README.md +++ /dev/null @@ -1,141 +0,0 @@ ---- -slug: /MEP-1-distributed-metal-control-plane -title: MEP-1 -sidebar_position: 1 ---- - -# Distributed Metal Control Plane - -This enhancement 
proposal was replaced by [MEP18](../MEP18/README.md). - -## Problem Statement - -We face the situation that we argue for running bare metal on-premises because this way the customers can control where and how their software and data are processed and stored. -On the other hand, we have currently decided that our metal-api control plane components run on a kubernetes cluster (in our case on a cluster provided by one of the available hyperscalers). - -Running the control plane on Kubernetes has the following benefits: - -- Ease of deployment -- Get most, if not all, of the required infrastructure services like (probably incomplete): - - IPs - - DNS - - L7-Loadbalancing - - Storage - - S3 Backup - - High Availability - -Using a kubernetes as a service offering from one of the hyperscalers, enables us to focus on using kubernetes instead of maintaining it as well. - -## Goal - -It would be much saner if metal-stack has no, or only minimal dependencies to external services. Imagine a metal-stack deployment in a plant, it would be optimal if we only have to deliver a single rack with servers and networking gear installed and wired, plug that rack to the power supply and a internet uplink and its ready to go. - -Have a second plant which you want to be part of all your plants? Just tell both that they are part of something bigger and metal-api knows of two partitions. - -## Possible Solutions - -We can think of two different solutions to this vision: - -1. Keep the central control plane approach and require some sort of kubernetes deployment accessible from the internet. This has the downside that the user must, provide a managed kubernetes deployment in his own datacenter or uses a hyperscaler. Still not optimal. -1. Install the metal-api and all its dependencies in every partition, replicate or shard the databases to every connected partition, make them know each other. 
Connect the partitions over the internet with some sort of vpn to make the services visible to each other. - -As we can see, the first approach does not really address the problem, therefore i will describe solution #2 in more details. - -## Central/Current setup - -### Stateful services - -Every distributed system suffer from handling state in a scalable, fast and correct way. To start how to cope with the state, we first must identify which state can be seen as partition local only and which state must be synchronous for read, and synchronous for writes across partitions. - -Affected states: - -- masterdata: e.g. tenant and project must be present in every partition, but these are entities which are read often but updates are rare. A write can therefore be visible with a decent delay in a distinct partition with no consequences. -- ipam: the prefixes and ip´s allocated from machines. These entities are also read often and rare updates. But we must differentiate between dirty reads for different types. A machine network is partition local, ips acquired from such a network must by synchronous in the same partition. Ips acquired from global networks such as internet must by synchronous for all partitions, as otherwise a internet ip could be acquired twice. -- vrf ids: they must only be unique in one partition -- image and size configurations: read often, written seldom, so no high requirements on the storage of these entities. -- images: os images are already replicated from a central s3 storage to a per partition s3 service. metal-hammer kernel and initrd are small and pull always from the central s3, can be done similar to os images. -- machine and machine allocation: must be only synchronous in the partition -- switch: must be only synchronous in the partition -- nsq messages: do not need to cross partition boundaries. No need to keep the messages persistent, even the opposite is true, we don't want to have the messages persist for a longer period. 
- -Now we can see that the most critical state to held and synchronize are the IPAM data, because these entities must be guaranteed to be synchronously updated, while being updated frequently. - -Datastores: - -We use three different types of datastores to persist the states of the metal application. - -- rethinkdb is the main datastore for almost all entities managed by metal-api -- postgresql is used for masterdata and ipam data. -- nsq uses disk and memory tho store the messages. - -### Stateless services - -These are the easy part, all of our services which are stateless can be scaled up and down without any impact on functionality. Even the stateful services like masterdata and metal-api rely fully on the underlying datastore and can therefore also be scaled up and down to meet scalability requirements. - -Albeit, most of these services need to be placed behind a loadbalancer which does the L4/L7 balancing across the started/available replicas of the service for the clients talking to it. This is actually provided by kubernetes with either service type loadbalancer or type clusterip. - -One exception is the `metal-console` service which must have the partition in it´s dns name now, because there is no direct network connectivity between the management networks of the partitions. See "Network Setup) - -## Distributed setup - -### State - -In order to replicate certain data which must be available across all partitions we can use on of the existing open source databases which enable such kind of setup. There are a few available out there, the following incomplete list will highlight the pro´s and cons of each. - -- RethinkDB - - We already store most of our data in RethinkDB and it gives already the ability to synchronize the data in a distributed manner with different guarantees for consistency and latency. This is described here: [Scaling, Sharding and replication](https://rethinkdb.com/docs/sharding-and-replication/). 
But because rethinkdb has a rough history and unsure future with the last release took more than a year, we in the team already thought that we eventually must move away from rethinkdb in the future. - -- Postgresql - - Postgres does not have a multi datacenter with replication in both directions, it just can make the remote instance store the same data. - -- CockroachDB - - Is a Postgresql compatible database engine on the wire. CockroachDB gives you both, ACID and geo replication with writes allowed from all connected members. It is even possible to configure [Follow the Workload](https://www.cockroachlabs.com/docs/stable/topology-follow-the-workload) and [Geo Partitioning and Replication](https://www.cockroachlabs.com/docs/v19.2/topology-geo-partitioned-replicas). - -If we migrate all metal-api entities to be stored the same way we store masterdata, we could use cockroachdb to store all metal entities in one ore more databases spread across all partitions and still ensure consistency and high availability. - -A simple setup how this would look like is shown here. - -![Simple CockroachDB setup](Distributed.png) - -go-ipam was modified in a example PR here: [PR 17](https://github.com/metal-stack/go-ipam/pull/17) - -### API Access - -In order to make the metal-api accessible for api users like `cloud-api` or `metalctl` as easy at it is today, some effort has to be taken. One possible approach would be to use a external loadbalancer which spread the requests evenly to all metal-api endpoints in all partitions. Because all data are accessible from all partitions, a api request going to partition A with a request to create a machine in partition B, will still work. If on the other hand partition B is not in a connected state because the interconnection between both partitions is broken, then of course the request will fail. 
- -**IMPORTANT** -The NSQ Message to inform `metal-core` must end in the correct partition - -To provide such a external loadbalancer we have several opportunities: - -- Cloudflare or comparable CDN service. -- BGP Anycast from every partition - -Another setup would place a small gateway behind the metal-api address, which forwards to the metal-api in the partition where the request must be executed. This gateway, `metal-api-router` must inspect the payload, extract the desired partition, and forward the request without any modifications to the metal-api endpoint in this partition. This can be done for all requests, or if we want to optimize, only for write accesses. - -## Network setup - -In order to have the impact to the overall security concept as minimal as possible i would not modify the current network setup. The only modifications which has to be made are: - -- Allow https ingress traffic to all metal-api instances. -- Allow ssh ingress traffic to all metal-console instances. -- Allow CockroachDB Replication between all partitions. -- No NSQ traffic from outside required anymore, except we cant solve the topic above. - -A simple setup how this would look like is shown here, this does not work though because of the forementioned NSQ issue. - -![API and Console Access](Distributed-API.png) - -Therefore we need the `metal-api-router`: - -![Working API and Console Access](Distributed-API-Working.png) - -## Deployment - -The deployment of our components will substantially differ in a partition compared to a the deployment we have actually. Deploying it in kubernetes in the partition would be very difficult to achieve because we have no sane way to deploy kubernetes on physical machines without a underlying API. -I would therefore suggest to deploy our components in the same way we do that for the services running on the management server. Use systemd to start docker containers. 
- -![Deployment](Distributed-Deployment.png) diff --git a/versioned_docs/version-v0.21.9/contributing/01-Proposals/MEP10/README.md b/versioned_docs/version-v0.21.9/contributing/01-Proposals/MEP10/README.md deleted file mode 100644 index b9cf4f2f..00000000 --- a/versioned_docs/version-v0.21.9/contributing/01-Proposals/MEP10/README.md +++ /dev/null @@ -1,197 +0,0 @@ ---- -slug: /MEP-10-sonic-support -title: MEP-10 -sidebar_position: 10 ---- - -# SONiC Support - -As writing this proposal, metal-stack only supports Cumulus on Broadcom ASICs. Unfortunately, after the acquisition of -Cumulus Networks by Nvidia, Broadcom decided to cut its relationship with Cumulus, and therefore Cumulus 4.2 is the last -version that supports Broadcom ASICs. Since trashing the existing hardware is not a solution, adding support for a -different network operating system is necessary. - -One of the remaining big players is [SONiC](https://sonic-net.github.io/SONiC/), which Microsoft created to scale the -network of Azure. It's an open-source project and is now part of the [Linux Foundation](https://www.linuxfoundation.org/press/press-release/software-for-open-networking-in-the-cloud-sonic-moves-to-the-linux-foundation). - -For a general introduction to SONiC, please follow the [Architecture](https://github.com/sonic-net/SONiC/wiki/Architecture) official -documentation. - -## ConfigDB - -On a cold start, the content of `/etc/sonic/config_db.json` will be loaded into the Redis database `CONFIG_DB`, and both -contain the switch's configuration except the BGP unnumbered configuration, which still has to be configured directly by -the frr configuration files. The SONiC community is working to remove this exception, but no release date is known. - -## BGP Configuration - -Frr runs inside a container, and a shell script configured it on the container startup. 
For BGP unnumbered, we must set -the configuration variable `docker_routing_config_mode` to `split` to prevent SONiC from overwriting our configuration -files created by `metal-core`. But by using the split mode, the integrated configuration mode of frr is deactivated, and -we have to write our BGP configuration to the daemon-specific files `bgp.conf`, `staticd.conf`, and `zebra.conf` instead -to `frr.conf`. - -```shell -elif [ "$CONFIG_TYPE" == "split" ]; then - echo "no service integrated-vtysh-config" > /etc/frr/vtysh.conf - rm -f /etc/frr/frr.conf -``` - -Reference: [docker-init](https://github.com/sonic-net/sonic-buildimage/blob/202205/dockers/docker-fpm-frr/docker_init.sh#L69) - -Adding support for the integrated configuration mode, we must at least adjust the startup shell script and the supervisor configuration: - -```bash -{% if DEVICE_METADATA.localhost.docker_routing_config_mode is defined and DEVICE_METADATA.localhost.docker_routing_config_mode == "unified" %} -[program:vtysh_b] -command=/usr/bin/vtysh -b -``` - -Reference: [supervisord.conf](https://github.com/sonic-net/sonic-buildimage/blob/202205/dockers/docker-fpm-frr/frr/supervisord/supervisord.conf.j2#L157) - -## Non-BGP Configuration - -For the Non-BGP configuration we have to write it into the Redis database directly or via one of the following interfaces: - -- `config replace ` -- the Mgmt Framework -- the SONiC restapi - -Directly writing into the Redis database isn't a stable interface, and we must determine the create, delete, and update -operations on our own. The last point is also valid for the Mgmt Framework and the SONiC restapi. Furthermore, the -Mgmt Framework doesn't start anymore for several months, and a [potential fix](https://github.com/sonic-net/sonic-buildimage/pull/10893) -is still not merged. And the SONiC restapi isn't enabled by default, and we must build and maintain our own SONiC images. 
- -Using `config replace` would reduce the complexity in the `metal-core` codebase because we don't have to determine the -actual changes between the running and the desired configuration. The approach's drawbacks are using a version of SONiC -that contains the PR [Yang support for VXLAN](https://github.com/sonic-net/sonic-buildimage/pull/7294), and we must provide -the whole new startup configuration to prevent unwanted deconfiguration. - -### Configure Loopback interface and activate VXLAN - -```json -{ - "LOOPBACK_INTERFACE": { - "Loopback0": {}, - "Loopback0|": {} - }, - "VXLAN_TUNNEL": { - "vtep": { - "src_ip": "" - } - } -} -``` - -#### Configure MTU - -```json -{ - "PORT": { - "Ethernet0": { - "mtu": "9000" - } - } -} -``` - -#### Configure PXE Vlan - -```json -{ - "VLAN": { - "Vlan4000": { - "vlanid": "4000" - } - }, - "VLAN_INTERFACE": { - "Vlan4000": {}, - "Vlan4000|": {} - }, - "VLAN_MEMBER": { - "Vlan4000|": { - "tagging_mode": "untagged" - } - }, - "VXLAN_TUNNEL_MAP": { - "vtep|map_104000_Vlan4000": { - "vlan": "Vlan4000", - "vni": "104000" - } - } -} -``` - -#### Configure VRF - -```json -{ - "INTERFACE": { - "Ethernet0": { - "vrf_name": "vrf104001" - } - }, - "VLAN": { - "Vlan4001": { - "vlanid": "4001" - } - }, - "VLAN_INTERFACE": { - "Vlan4001": { - "vrf_name": "vrf104001" - } - }, - "VRF": { - "vrf104001": { - "vni": "104001" - } - }, - "VXLAN_TUNNEL_MAP": { - "vtep|map_104001_Vlan4001": { - "vlan": "Vlan4001", - "vni": "104001" - } - } -} -``` - -## DHCP Relay - -The DHCP relay container only starts if `DEVICE_METADATA.localhost.type` is equal to `ToRRouter`. - -## LLDP - -SONiC always uses the local port subtype for LLDP and sets it to some freely configurable alias field of the interface. - -```python -# Get the port alias. If None or empty string, use port name instead -port_alias = port_table_dict.get("alias") -if not port_alias: - self.log_info("Unable to retrieve port alias for port '{}'. 
Using port name instead.".format(port_name)) - port_alias = port_name - -lldpcli_cmd = "lldpcli configure ports {0} lldp portidsubtype local {1}".format(port_name, port_alias) -``` - -Reference: [lldpmgr](https://github.com/sonic-net/sonic-buildimage/blob/202205/dockers/docker-lldp/lldpmgrd#L153) - -## Mgmt Interface - -The mgmt interface is `eth0`. To configure a static IP address and activate the Mgmt VRF, use: - -```json -{ - "MGMT_INTERFACE": { - "eth0|": { - "gwaddr": "" - } - }, - "MGMT_VRF_CONFIG": { - "vrf_global": { - "mgmtVrfEnabled": "true" - } - } -} -``` - -[IP forwarding is deactivated on `eth0`](https://github.com/sonic-net/sonic-buildimage/blob/202205/files/image_config/sysctl/sysctl-net.conf#L7), and no IP Masquerade is configured. diff --git a/versioned_docs/version-v0.21.9/contributing/01-Proposals/MEP11/README.md b/versioned_docs/version-v0.21.9/contributing/01-Proposals/MEP11/README.md deleted file mode 100644 index 87f48a10..00000000 --- a/versioned_docs/version-v0.21.9/contributing/01-Proposals/MEP11/README.md +++ /dev/null @@ -1,78 +0,0 @@ ---- -slug: /MEP-11-auditing-of-metal-stack-resources -title: MEP-11 -sidebar_position: 11 ---- - -# Auditing of metal-stack resources - -Currently no logs of the ownership of resources like machines, networks, ips and volumes are generated or kept. Though due to legal requirements data centers are required to keep track of this ownership over time to prevent liability issues when opening the platform for external users. - -In this proposal we want to introduce a flexible and low-maintenance approach for auditing on top of [Meilisearch](https://www.meilisearch.com/). - -## Overview - -In general our auditing logs will be collected by a request interceptor or middleware. Every request and response will be processed and eventually logged to Meilisearch. -Meilisearch will be configured to regularly create chunks of the auditing logs. 
These finished chunks will be backed up to a S3 compatible storage with a read-only option enabled. - -Of course sensitive data like session keys or passwords will be redacted before logging. We want to track relevant requests and responses. If auditing the request fails, the request itself will be aborted and will not be processed further. The requests and responses that will be audited will be annotated with a correlation id. - -Transferring the meilisearch auditing data chunks to the S3 compatible storage will be done by a sidecar cronjob that is executed periodically. -To avoid data manipulation the S3 compatible storage will be configured to be read-only. - -## Whitelisting - -To reduce the amount of unnecessary logs we want to introduce a whitelist of resources and operations on those that should be logged. -Other requests will be passed directly to the next middleware or web service without any further processing. - -As we are only interested in mutating endpoints, we ignore all `GET` requests. -The whitelist includes all `POST`, `PUT`, `PATCH` and `DELETE` endpoints of the HTTP middleware except for the following (non-manipulating) route suffixes: - -- `/find` -- `/notify` -- `/try` and `/match` -- `/capacity` -- `/from-hardware` - -Regarding GRPC audit trails, they are not so interesting because only internal clients are using this API. However, we can log the trails of the `Boot` service, which can be interesting to revise the machine lifecycle. - -## Chunking in Meilisearch - -We want our data to be chunked in Meilisearch. To accomplish this, we rotate the index identifier on a scheduled basis. The index identifiers will be derived from the current date and time. - -To keep things simple, we only support hourly, daily and monthly rotation. The eventually prefixed index names will only include relevant parts of date and time like `2021-01`, `2021-01-01` or `2021-01-01_13`. 
- -The metal-api will only write to the current index and switches to the new index on rotation. The metal-api will never read or update data in any indices. - -## Moving chunks to S3 compatible storage - -As Meilisearch will be filled with data over time, we want to move completed chunks to a S3 compatible storage. This will be done by a sidecar cronjob that is executed periodically. Note that the periods of the index rotation and the cronjob execution don't have to match. - -When the backup process gets started, it initiates a [Meilisearch dump](https://www.meilisearch.com/docs/learn/advanced/dumps) of the whole database across all indices. Once the returned task is finished, the dump must be copied from a Meilisearch volume to the S3 compatible storage. After a successful copy, the dump can be deleted. - -Now we want to remove all indices from Meilisearch, except the most recent one. For this, we [get all indices](https://www.meilisearch.com/docs/reference/api/indexes#list-all-indexes), sort them and [delete each index](https://www.meilisearch.com/docs/reference/api/indexes#delete-an-index) except the most recent one to avoid data loss. - -For the actual implementation, we can build upon [backup-restore-sidecar](https://github.com/metal-stack/backup-restore-sidecar). But due to the index rotation and the fact, that older indices need to be deleted, this probably does not fit into the mentioned sidecar. - -## S3 compatible storage - -The dumps of chunks should automatically deleted after a certain amount of time, once we are either no longer allowed or required to keep them. -The default retention time will be 6 months. Ideally already uploaded chunks should be read-only to prevent data manipulation. - -A candidate for the S3 compatible storage is Google Cloud Storage, which allows to configure automatic expiration of objects through a [lifecycle rule](https://cloud.google.com/storage/docs/managing-lifecycles?hl=en#storage-set-lifecycle-config-go). 
- -## Affected components - -- metal-api grpc server needs an auditing interceptor -- metal-api web server needs an auditing filter chain / middleware -- metal-api needs new command line arguments to configure the auditing -- mini-lab needs a Meilisearch instance -- mini-lab may need a local S3 compatible storage -- we need a sidecar to implement the backup to S3 compatible storage -- Consider auditing of volume allocations and freeings outside of metal-stack - -## Alternatives considered - -Instead of using Meilisearch we investigated using an immutable database like [immudb](https://immudb.io/). But immudb does not support chunking of data and due to its immutable nature, we will never be able to free up space of expired data. Even if we are legally allowed or required to delete data, we will not be able to do so with immudb. - -In another variant of the Meilisearch approach the metal-api would also be responsible for copying chunks to the S3 compatible storage and deleting old indices. But separating the concerns allows completely different implementations for every deployment stage. diff --git a/versioned_docs/version-v0.21.9/contributing/01-Proposals/MEP12/README.md b/versioned_docs/version-v0.21.9/contributing/01-Proposals/MEP12/README.md deleted file mode 100644 index 65532c57..00000000 --- a/versioned_docs/version-v0.21.9/contributing/01-Proposals/MEP12/README.md +++ /dev/null @@ -1,34 +0,0 @@ ---- -slug: /MEP-12-rack-spreading -title: MEP-12 -sidebar_position: 12 ---- - -# Rack Spreading - -Currently, when creating a machine through the metal-api, the machine is placed randomly inside a partition. This algorithm does not consider spreading machines across different racks and different chassis. This may lead to the situation that a group of machines (that for example form a cluster) can end up being placed in the same rack and the same chassis. 
- -Spreading a group of machines across racks can enhance availability for scenarios like a rack losing power or a chassis meltdown. - -So, instead of just randomly deciding the placement of a machine candidate, we want to propose a placement strategy that attempts to spread machine candidates across the racks inside a partition. - -Furthermore a followup improvement to guarantee that machines are really spread across multiple racks, even if multiple machines are ordered in parallel, was implemented with [PR490](https://github.com/metal-stack/metal-api/pull/490). - -## Placement Strategy - -Machines in the project are spread across all available racks evenly within a partition (best effort). For this, an additional request to the datastore has to be made in order to find allocated machines within the project in the partition. - -The algorithm will then figure out the least occupied racks and elect a machine candidate randomly from those racks. - -The user can optionally pass placement tags which will be considered for spreading the machines as well (this will for example allow spreading by a cluster id tag inside the same project). - -## API - -```golang -// service/v1/machine.go - -type MachineAllocation struct { - // existing fields are omitted for readability - PlacementTags []string `json:"placement_tags" description:"by default machines are spread across the racks inside a partition for every project. 
if placement tags are provided, the machine candidate has an additional anti-affinity to other machines having the same tags"` -} -``` diff --git a/versioned_docs/version-v0.21.9/contributing/01-Proposals/MEP13/README.md b/versioned_docs/version-v0.21.9/contributing/01-Proposals/MEP13/README.md deleted file mode 100644 index 2dde20f5..00000000 --- a/versioned_docs/version-v0.21.9/contributing/01-Proposals/MEP13/README.md +++ /dev/null @@ -1,111 +0,0 @@ ---- -slug: /MEP-13-dual-stack-support -title: MEP-13 -sidebar_position: 13 ---- - -# Dual-stack Support - -dual-stack support is required to be able to create Kubernetes clusters with either IPv6 single-stack or dual-stack enabled. -With the inherent scarcity of IPv4 addresses, the need to be able to use IPv6 has increased. - -Full IPv6 dual-stack support was added to Kubernetes with v1.23 as stable. - -Gardeners have had full IPv6 dual-stack support since `v1.109`. - -metal-stack manages CIDRs and IP addresses with the [go-ipam](https://github.com/metal-stack/go-ipam) library, which already got full IPv6 support in 2021 (see [https://metal-stack.io/blog/2021/02/ipv6-part1](https://metal-stack.io/blog/2021/02/ipv6-part1)). -But this was only the foundation, more work needs to be done to get full IPv6 support for all aspects managed by metal-stack.io. - -## General Decisions - -For the general decision we do not look at the isolated clusters feature for now as this would make the solution even more complex and we want to introduce IPv6 in smaller steps to the users. - -### Networks - -Currently, metal-stack organizes CIDRs / prefixes into a `network' resource in the metal-api. A network can consist of multiple CIDRs from the same address family. For example, if an operator wants to provide Internet connectivity to provisioned machines, they can start with small network CIDRs. The number of managed network prefixes can then be expanded as needed over time. 
- -With dual-stack we have to choose between two options: Network per address family or networks with both address families. These options are described in the next section. - -#### Network per Address Family - -This means that we allow networks with CIDRs from one address family only, one for IPv4 and one for IPv6. - -The machine creation process will not change if the machine only needs to be either IPv4 or IPv6 addressable. -But if on the other side, the machine need to be able to connect to both address families, the machine creation needs to specify two networks, one for IPv4 and one for IPv6. -Also there will be 2 distinct VRF IDs for every network with a different address family. - -#### Network with both Address Families - -Make a network dual address family capable, meaning that you can add multiple cidrs from both address families to a network. -Then the machine creation will remain the same for single-stack and dual-stack cases, but the ip address allocation will need to specify the address family from which to allocate an ip address when the network is dual-stack. -This does not break the existing API, but allows existing extensions to easily add dual-stack support. -To avoid additional checking of which address families are available on this network during an ip allocation call, we could store the address families in the network. - -#### Decision - -The decision was made to go with the having both address families in a single network entity because we think this is the most flexible way to support dual-stack machines and Kubernetes clusters as well as single-stack with the least amount of modifications on the networking side. 
- -### Examples - -To illustrate the the usage we start by creating a tenant super network which has both address families: - -```yaml ---- -id: tenant-super-network-mini-lab -name: Project Super Network -description: Super network of all project networks -partitionid: mini-lab -prefixes: - - 10.0.0.0/16 - - 2001:db8:0:10::/64 -defaultchildprefixlength: - IPv4: 22 - IPv6: 96 -privatesuper: true -``` - -In order to create this network, we simple call: - -```bash -metalctl network create -f tenant-super.yaml -``` - -This is usually done during the initial setup of the environment. - -Next step is to allocate a tenant network where the machines of a project can be placed: - -```bash -metalctl network allocate --partition mini-lab --project 4b9b17c4-2d7c-4190-ae95-dda44e430fa6 --name my-node-network -``` - -This leads to the following network allocation: - -```yaml -id: 2d2c0350-3f66-4597-ae97-ef6797232212 -name: my-node-network -parentnetworkid: tenant-super-network-mini-lab -partitionid: mini-lab -prefixes: - - 10.0.0.0/22 - - 2001:db8:0:10::/96 -projectid: 4b9b17c4-2d7c-4190-ae95-dda44e430fa6 -vrf: 20 -consumption: - ipv4: - available_ips: 1024 - available_prefixes: 256 - used_ips: 2 - used_prefixes: 0 - ipv6: - available_ips: 2147483647 - available_prefixes: 1073741824 - used_ips: 1 - used_prefixes: 0 -privatesuper: false -``` - -Users can the create IP addresses from these child networks. By default, they retrieve an IPv4 address except a super network only consists of IPv6 prefixes. In the latter case the users acquire an IPv6 address. 
- -```bash -metalctl network ip create --network 2d2c0350-3f66-4597-ae97-ef6797232212 --project 4b9b17c4-2d7c-4190-ae95-dda44e430fa6 -``` diff --git a/versioned_docs/version-v0.21.9/contributing/01-Proposals/MEP14/README.md b/versioned_docs/version-v0.21.9/contributing/01-Proposals/MEP14/README.md deleted file mode 100644 index 47c06434..00000000 --- a/versioned_docs/version-v0.21.9/contributing/01-Proposals/MEP14/README.md +++ /dev/null @@ -1,36 +0,0 @@ ---- -slug: /MEP-14-independence-from-external-sources -title: MEP-14 -sidebar_position: 14 ---- - -# Independence from external sources - -In certain situations some customers may need to operate and create machines without making use of external services like DNS or NTP through the internet. To make this possible, all metal-stack components reaching external services need to be configurable with custom endpoints. - -So far, the following components have been identified as requiring changes: - -- pixiecore -- metal-hammer -- metal-images - -More components are likely to be added to the list during processing. -For DNS and NTP servers it should be possible to provide default values within a partition. They can either be inherited from machines and firewalls or overwritten with own ones. - -## pixiecore - -A NTP server endpoint need to be configured on the pixiecore. This can be achieved by providing it through environment variables on start up. - -## metal-hammer - -If using a self-deployed NTP server, also the metal-hammer need to be configured with it. For backward compatibility, default values from `pool.ntp.org` and `time.google.com` are used. - -## metal-images - -Configurations for the `metal-images` are different for machines and firewalls. - -## metalctl - -In order to pass DNS and NTP servers to partitions and machines while creating them, the flags `dnsservers` and `ntpservers` need to be added. 
- -The implementation of this MEP will make metal-stack possible to create and maintain machines without requiring an internet connection. diff --git a/versioned_docs/version-v0.21.9/contributing/01-Proposals/MEP16/README.md b/versioned_docs/version-v0.21.9/contributing/01-Proposals/MEP16/README.md deleted file mode 100644 index 205670ab..00000000 --- a/versioned_docs/version-v0.21.9/contributing/01-Proposals/MEP16/README.md +++ /dev/null @@ -1,318 +0,0 @@ ---- -slug: /MEP-16-metal-api-as-an-alternative-configuration-source-for-the-firewall-controller -title: MEP-16 -sidebar_position: 16 ---- - -# metal-api as an Alternative Configuration Source for the firewall-controller - -In the current situation, a firewall as provisioned by metal-stack is a fully immutable entity. Any modifications on the firewall like changing the firewall ruleset must be done _somehow_ by the user – the metal-api and hence metal-stack is not aware of its current state. - -As part of our [integration with the Gardener project](https://docs.metal-stack.io/stable/overview/kubernetes/#Gardener) we offer a solution called the [firewall-controller](https://github.com/metal-stack/firewall-controller), which is part of our [firewall OS images](https://github.com/metal-stack/metal-images/blob/6318a624861b18a559a9d37299bca5f760eef524/firewall/Dockerfile#L57-L58) and addresses shortcomings of the firewall resource's immutability, which would otherwise be completely impractible to work with. The firewall-controller crashes infinitely if it is not properly configured through the userdata when using the firewall image of metal-stack. - -The firewall-controller approach is tightly coupled to Gardener and it requires the administrator of the Gardener installation to pass a shoot and a seed kubeconfig through machine userdata when creating the firewall. 
How this userdata has to look like is not documented and is just part of another project called the [firewall-controller-manager](https://github.com/metal-stack/firewall-controller-manager) (FCM), which task is to orchestrate rolling updates of firewall machines in a way that network traffic interruption is minimal when updating a firewall or applying a change to an immutable firewall configuration. - -In general, a firewall entity in metal-stack has similarities to the machine entity but it has a fundamental difference: A user gains ownership over a machine after provisioning. They can access it through SSH, modify it at will and this is completely wanted. For firewalls, however, we do not want a user to access the provisioned firewall as the firewall is a privileged part of the infrastructure with access to the underlay network. The underlay can not be tampered with at any given point in time by a user as it can destroy the entire network traffic flow inside a metal-stack partition. - -For this reason, we have a gap in the metal-stack project in terms of a missing solution for people who do not rely on the Gardener integration. We are basically leaving a user with the option to implement an orchestrated recreation of every possible change on the firewall to minimize traffic interruption for the machines sitting behind the firewall or re-implement the firewall-controller to how they want to use it for their use-case. - -Also we do not have a clear distinction in the API between user and metal-stack operator for firewalls. If a user would allocate a firewall it is also possible for the user to inject his own SSH keys and access the firewall and tamper with the underlay network. - -Parts of these problems are probably going to decrease with the work on [MEP-4](../MEP4/README.md) where there will be dedicated APIs for users and administrators of metal-stack including fine-grained access tokens. 
- -With this MEP we want to describe a way to improve this current situation and allow other users that do not rely on the Gardener integration – for whatever motivation they have – to adequately manage firewalls. For this, we propose an alternative configuration for the firewall-controller that is native to metal-stack and more independent of Gardener. - -## Proposal - -The central idea of this proposal is allowing the firewall-controller to use the metal-api as a configuration source. This should serve as an alternative strategy to the currently used FCM `Firewall` resource based approach in the Gardener use-case. -Updates of the firewall rules should be possible through the metal-api. - -The firewall-controller itself should now be able to decide which of the two main strategies should be used for the base configuration: a kubeconfig or the metal-api. This should be possible through a dedicated _firewall-controller-config_. - -Using this config will now allow operators to fine-tune the data sources for all of its dynamic configuration tasks independently. -For example the data source of the core firewall rules could be set either from the `Firewall` resource located in the Gardener `Seed` or the metal-apiserver node network entity, while the CWNPs should be fetched and applied from a given kubeconfig (the `Shoot` Kubeconfig in the Gardener case). -This configuration file is intended to be injected during firewall creation through the userdata along with potential source connection credentials. 
- -```yaml -# the name of the firewall, defaulted to the hostname -name: best-firewall-ever - -sources: - seed: - kubeconfig: /path/to/seed.yaml # current gardener behavior - namespace: shoot--proj--name - shoot: - kubeconfig: /path/to/shoot.yaml # current gardener behavior - namespace: firewall - metal: - url: https://metal-api - hmac: some-hmac - type: Metal-View - projectID: abc - static: - # static should mirror all information provided by the metal or seed/shoot sources - firewall: # optional - controllerURL: https://... - cwnp: - egress: [] - ingress: [] - -# all sub-controllers running on the firewall -# each can be configured independently -controllers: - # this is the base controller - firewall: - source: seed # or: metal, static - - # these are optional: when not provided, they are disabled - selfUpdate: - enabled: true - droptailer: - enabled: true - - # these are optional: when not provided, they are disabled - service: - source: shoot # or: metal, static - cwnp: - source: shoot # or: metal, static - monitor: - source: shoot # currently only shoot is supported -``` - -The existing behavior of the firewall-controller writing into `/etc/nftables/firewall-controller.v4` is not changed. The different controller configuration sources are internally treated in the same way as before. The `static` source can be used to prevent the firewall-controller from crashing and consistently providing a static ruleset. This might be interesting for metal-stack native use cases or environments where the metal-api cannot be accessed. - -There must be one central nftables-rule-file-controller that is notified and triggered by all other controllers that contribute to the nftables configuration. 
- -For example, in order to maintain the existing Gardener integration, the configuration file for the firewall-controller will look like this: - -```yaml -name: shoot--abc--cluster-firewall-def -sources: - seed: - kubeconfig: /etc/firewall-controller/seed.yaml - namespace: shoot--abc--cluster - shoot: - kubeconfig: /etc/firewall-controller/shoot.yaml - namespace: firewall - -controllers: - firewall: - source: seed - - selfUpdate: - enabled: true - droptailer: - enabled: true - - service: - source: shoot - cwnp: - source: shoot - monitor: - source: shoot -``` - -Plain metal-stack users might use a configuration like this: - -```yaml -name: best-firewall-ever - -sources: - metal: - url: https://metal-api - hmac: some-hmac - type: Metal-View - projectID: abc - -controllers: - firewall: - source: metal - selfUpdate: - enabled: true - droptailer: - enabled: true - - cwnp: - # firewall rules stored in firewall entity - # potential improvement would be to attach the rules to the node network entity - # be aware that the firewall and private networks are immutable - # eventually we introduce a firewall ruleset entity - source: metal -``` - -In highly restricted environments that cannot access metal-api the static source could be used: - -```yaml -name: most-restricted-firewall-ever - -sources: - static: - firewall: - controllerURL: https://... - cwnp: - egress: [] - ingress: [] - -controllers: - firewall: - source: static - - cwnp: - source: static -``` - -### Non-Goals - -- Resolving the missing differentiation between users and administrators by letting users pass userdata and SSH keys to the firewall creation. - - This is even more related to [MEP-4](../MEP4/README.md) than this MEP. 
- -### Advantages - -- Offers a native metal-stack solution that improves managing firewalls for users by adding dynamic reconfiguration through the metal-api - - e.g., in the mini-lab, users can now allocate a machine, then an IP address and announce this IP from the machine without having to re-create the firewall but by adding a firewall rule to the metal-api. -- Improve consistency throughout the API (firewall rules would reflect what is persisted in metal-api). -- Other providers like Cluster API can leverage this approach, too. -- It can contribute to solving the shoot migration issue (in Cluster API case the `clusterctl move` for firewall objects) - - For Gardener takes the seed out of the equation (of which the kubeconfig changes during shoot migration) - - However: Things like egress rules, rate limiting, etc. are currently not part of the firewall or network entity in the metal-api. These would need to be added to one of them. -- Potentially resolve the issue that end-users can manipulate accounting data of the firewall through the `FirewallMonitor` - - for this we would need to be able to report traffic data to metal-api - -### Caveats - -- Metal-View access is too broad for firewalls. Mitigated by [MEP-4](../MEP4/README.md). -- Polling of the firewall-controller is bad for performance. Mitigated by [MEP-4](../MEP4/README.md). - -### Firewall Controller Manager - -Currently the firewall-controller-manager expects the creators of a `FirewallDeployment` to use the defaulting webhook that is tailored to the Gardener integration in order to generate `Firewall.spec.userdata` or to override it manually. Currently `Firewall.spec.userdata` will never be set explicitly. - -Instead we'd like to propose `Firewall.spec.userdataContents` which will replace the old `userdata`-string by a typed data structure. The FCM will do the heavy lifting while the `FirewallDeployment` creator decides what should be configured. 
- -```yaml -kind: FirewallDeployment -spec: - template: - spec: - userdataContents: - - path: /etc/firewall-controller/config.yaml - content: | - --- - sources: - static: {} - controllers: - firewall: - source: static - - path: /etc/firewall-controller/seed.yaml - secretRef: - name: seed-kubeconfig - generateFirewallControllerKubeconfig: true - - path: /etc/firewall-controller/shoot.yaml - secretRef: - name: shoot-kubeconfig -``` - -### Gardener Extension Provider Metal Stack - -The GEPM should be migrated to the new `Firewall.spec.userdataContents` field. - -### Cluster API Provider Metal Stack - -![architectural overview](firewall-for-capms-overview.svg) - -In Cluster API there are essentially two main clusters: the management cluster and the workload cluster while the CAPMS takes in the role of the GEPM. -Typically a local bootstrap cluster is created in KinD which acts as the management cluster. It creates the workload cluster. Thereafter the ownership of the workload cluster is typically moved (using `clusterctl move`) to a different cluster which will then become the management cluster. -The new management cluster might actually be the workload cluster itself. - -In contrast to Gardener, Cluster API aims to be less opinionated and minimal. It is common practice to not install any non-required components or CRDs into the workload cluster by default. Therefore we cannot expect custom resources like `ClusterwideNetworkPolicy` or `FirewallMonitor` to be installed in the workload cluster but strongly recommend our users to do it. Therefore it's the responsibility of the operator to tell [cluster-api-provider-metal-stack](https://github.com/metal-stack/cluster-api-provider-metal-stack) the kubeconfig for the cluster where these CRDs are installed and defined in. 
- -A viable configuration for a `MetalStackCluster` that generates firewall rules based of `Service` type `LoadBalancer` and `ClusterwideNetworkPolicy` and expects them to be deployed in the workload cluster is shown below. The `FirewallMonitor` will be reported into the same cluster. - -```yaml -kind: MetalStackCluster -metadata: - name: ${CLUSTER_NAME} -spec: - firewallTemplate: - userdataContents: - - path: /etc/firewall-controller/config.yaml - secretName: ${CLUSTER_NAME}-firewall-controller-config - - - path: /etc/firewall-controller/workload.yaml - # this is the kubeconfig generated by kubeadm - secretName: ${CLUSTER_NAME}-kubeconfig ---- -kind: Secret -metadata: - name: ${CLUSTER_NAME}-firewall-controller-config -stringData: - controllerConfig: | - --- - name: ${CLUSTER_NAME}-firewall - - sources: - metal: - url: ${METAL_API_URL} - hmac: ${METAL_API_HMAC} - type: ${METAL_API_HMAC_TYPE} - projectID: ${METAL_API_PROJECT_ID} - shoot: - kubeconfig: /etc/firewall-controller/workload.yaml - namespace: firewall - - controllers: - firewall: - source: metal - selfUpdate: - enabled: true - droptailer: - enabled: true - - service: - source: shoot - cwnp: - source: shoot - monitor: - source: shoot -``` - -Here the firewall-controller-config will be referenced by the `MetalStackCluster` as a `Secret`. Please note that the `Secret`s in `userdataContents` will not be fetched and will directly be passed to the `FirewallDeployment`. At first the reconciliation of it in the FCM will fail due to the missing Kubeconfig secret. After the `MetalStackCluster` has been marked as ready, CAPI will create this missing secret. Effectively the firewall and initial control plane node should be created at the same time. - -This approach allows maximum flexibility as intended by Cluster API and is still able to provide robust rolling updates of firewalls. - -An advanced use case of this flexibility would be a management cluster, that is in charge of multiple workload clusters. 
Where one workload cluster acts as a monitoring or tooling cluster, receives logs and the firewall monitor for the other workload clusters. The CWNPs could be defined here, all in a separate namespace. - -#### Cluster API Caveats - -When the cluster is pivoted and reconciles its own firewall, a malfunctioning firewall prevents the cluster from self-healing and requires manual intervention by creating a new firewall. This is an inherent problem of the cluster-api approach. It can be circumvented by using an extra cluster to manage workload clusters. - -In the current form of this approach firewalls and therefore the firewall egress and ingress rules are managed by the cluster operators that manage the cluster-api resources. -Hence it will not be possible to gain a fine-grained control over every cluster operator's choices from a central ruleset at the level of metal-stack firewalls. -In case this control surfaces as a requirement, it would need to be implemented in a firewall external to metal-stack. - -## Roadmap - -In general this proposal is not thought to be implemented in one batch. Instead an incremental approach is required. - -1. Enhance firewall-controller - - - Reduce coupling between controllers - - Introduce controller config - - Abstract module to write into distinct nftable rules for every controller - - Implement `sources.static`, but not `sources.metal` - - GEPM should set `FirewallDeployment.spec.template.spec.userdataContents` - -2. Allow Cluster API to use the FCM with static ruleset - - - Add `firewall.metal-stack.io/paused` annotation (managed by CAPMS during `clusterctl move`, theoretically useful for Gardener shoot migration as well to avoid shallow deletion). - - Reconcile multiple `FirewallDeployment` resources across multiple namespaces. For Gardener the old behavior of reconciling only one namespace should persist. 
- - Allow setting the `firewall.metal-stack.io/no-controller-connection` annotation through the `FirewallDeployment` (either through the template or inheritance). - - Add `MetalStackCluster.spec.firewallTemplate`. - - Make `MetalStackCluster.spec.nodeNetworkID` optional if `spec.firewallTemplate` given. - -3. Add `sources.metal` as configuration option. - - - Allow updates of firewall rules in the metal-apiserver. - - Depends on [MEP-4](../MEP4/README.md) metal-apiserver progress - -4. Potentially migrate the GEPM to use `sources.metal` diff --git a/versioned_docs/version-v0.21.9/contributing/01-Proposals/MEP16/firewall-for-capms-overview.drawio b/versioned_docs/version-v0.21.9/contributing/01-Proposals/MEP16/firewall-for-capms-overview.drawio deleted file mode 100644 index faea3e3d..00000000 --- a/versioned_docs/version-v0.21.9/contributing/01-Proposals/MEP16/firewall-for-capms-overview.drawio +++ /dev/null @@ -1,4 +0,0 @@ - - - -
handles traffic
Firewall
Firewall Controller
node-exporter
nftables-exporter
droptailer-client
Workload Cluster
droptailer
Configures
Bootstrap or Management Cluster
reconcile
configures
reconcile
Cluster API Provider metal-stack
Metal Stack Cluster CRD
Firewall Deployment CRD
Firewall CRD
Firewall Set CRD
rec
reconcile
reconcile
Firewall Controller Manager
Metal Stack Machine CRD
manages
Admin
Kubeconfig FirewallMonitor
FirewallMonitor CRD
main metal-api
Firewall entity
kubeconfig CWNP
Clusterwide Network Policy CRD
base config
controllerConfig
user-defined
network rules
reports firewall
state
send firewall log lines
controllerConfig
controllerConfig
\ No newline at end of file diff --git a/versioned_docs/version-v0.21.9/contributing/01-Proposals/MEP16/firewall-for-capms-overview.svg b/versioned_docs/version-v0.21.9/contributing/01-Proposals/MEP16/firewall-for-capms-overview.svg deleted file mode 100644 index 853f8175..00000000 --- a/versioned_docs/version-v0.21.9/contributing/01-Proposals/MEP16/firewall-for-capms-overview.svg +++ /dev/null @@ -1 +0,0 @@ -
handles traffic
handles traffic
Firewall
Firewall
Firewall Controller
Firewall Controller
node-exporter
node-exporter
nftables-exporter
nftables-exporter
droptailer-client
droptailer-client
Workload Cluster
Workload Cluster
droptailer
droptailer
Configures
Configures
Bootstrap or Management Cluster
Bootstrap or Management Cluster
reconcile
reconcile
configures
configures
reconcile
reconcile
Cluster API Provider metal-stack
Cluster API Provider...
Metal Stack Cluster CRD
Metal Stack Cluster...
Firewall Deployment CRD
Firewall Deployment...
Firewall CRD
Firewall CRD
Firewall Set CRD
Firewall Set CRD
rec
rec
reconcile
reconcile
reconcile
reconcile
Firewall Controller Manager
Firewall Controller...
Metal Stack Machine CRD
Metal Stack Machine...
manages
manages
Admin
Admin
Kubeconfig FirewallMonitor
Kubeconfig FirewallMonitor
FirewallMonitor CRD
FirewallMonitor CRD
main metal-api
main metal-api
Firewall entity
Firewall entity
kubeconfig CWNP
kubeconfig CWNP
Clusterwide Network PolicyCRD
Clusterwide Network...
base config
base config
controllerConfig
controllerConfig
user-defined
network rules
user-defined...
reports firewall
state
reports firewall...
send firewall log lines
send firewall log lines
controllerConfig
controllerConfig
controllerConfig
controllerConfig
Text is not SVG - cannot display
\ No newline at end of file diff --git a/versioned_docs/version-v0.21.9/contributing/01-Proposals/MEP17/README.md b/versioned_docs/version-v0.21.9/contributing/01-Proposals/MEP17/README.md deleted file mode 100644 index 35f48970..00000000 --- a/versioned_docs/version-v0.21.9/contributing/01-Proposals/MEP17/README.md +++ /dev/null @@ -1,61 +0,0 @@ ---- -slug: /MEP-17-global-network-view -title: MEP-17 -sidebar_position: 17 ---- - -# Global Network View - -> [!IMPORTANT] -> This MEP assumes the implementation of the metal-apiserver as described by [MEP-4](../MEP4/README.md) which is currently work in progress. - -Having a complete view of the network topology is useful when working with deployments or troubleshooting connectivity issues. -Currently, the API doesn't know of any other switches than the leaf switches. -Information about all other switches and their connections must be gathered from Ansible inventories or by accessing the switches via SSH. -Documentation of each partition's network must be kept in-sync with all changes made to the deployment or cabling. -We would like to expand the API's knowledge of the network to the entire underlay including inter-switch connections as well as BGP statistics and health status. - -## Switch Types - -Registering a switch at the API is done by the metal-core. -Apart from that, it also reconciles port and FRR configuration to adapt to the machine provisioning cycle. -This reconfiguration is only necessary on the leaf switches. -To allow deploying the metal-core on other switches than leaves we need a way of telling it what type of switch it is running on so it can act accordingly. -On any non-leaf switches it will only register the switch and report statistic but not change any configuration. -Supported switch types are - -- `leaf` -- `spine` -- `exit` -- `mgmtleaf` -- `mgmtspine` - -## Network Topology - -All switches should periodically report their LLDP neighbors and port configuration. 
-This information can be used to quickly identify common network issues, like MTU mismatch or the like. -Ideally, there would be some graphical representation of the network topology containing only the most important information for a quick overview. -It should contain all switches and machines as nodes and all connections as edges of a graph. -Ports, VRFs, and maybe also IPs should be associated with a connection. - -Apart from the topology graph, there should be a way to display more detailed information about both ports of a connection, like - -- MTU -- speed -- IP -- UP/DOWN status -- VRF -- VLAN -- whether it participates in a BGP session - -## BGP Announcements - -The metal-core should collect all routes it knows about and send them to the API along with a timestamp. -Reported routes should be stored to a redis database along with the switch that reported them and the timestamp of the last time they were reported. -An expiration threshold should be defined and all expired routes should be cleaned up periodically. -Whenever new routes are reported they get merged into the existing ones by the strategy: - -- when new, just add -- when existing, update `last_announced` timestamp - -By querying the BGP announcements we can find out whether an allocated IP is still in use. diff --git a/versioned_docs/version-v0.21.9/contributing/01-Proposals/MEP18/autonomous-control-plane-images/ha-initial-cluster.drawio b/versioned_docs/version-v0.21.9/contributing/01-Proposals/MEP18/autonomous-control-plane-images/ha-initial-cluster.drawio deleted file mode 100644 index eafcb514..00000000 --- a/versioned_docs/version-v0.21.9/contributing/01-Proposals/MEP18/autonomous-control-plane-images/ha-initial-cluster.drawio +++ /dev/null @@ -1,535 +0,0 @@ - - - - - - - - - - - - - - - - - - - 1 - - - - - 2 - - - - - 3 - - - - - 4 - - - - - 5 - - - - - 6 - - - - - 7 - - - - - 8 - - - - - 9 - - - - - 10 - - - - - 11 - - - - - 12 - - - - - - - - - - - - - -
-
-
- internet-router-management -
-
-
-
- - internet-router-management - -
-
-
- - - - - - - - - -
-
-
- management-switch-and-server -
-
-
-
- - management-switch-and-server - -
-
-
- - - - - - - - - -
-
-
- spine01 -
-
-
-
- - spine01 - -
-
-
- - - - - - - - - -
-
-
- leaf01 -
-
-
-
- - leaf01 - -
-
-
- - - - - - - - - -
-
-
- leaf02 -
-
-
-
- - leaf02 - -
-
-
- - - - - - - - - - - - - -
-
-
- - mirocloud (initial cluster partition nodes) - -
-
-
-
- - mirocloud (initial cluster... - -
-
-
- - - - - - - - - - - - - -
-
-
- - Initial cluster node 01 - -
-
-
-
- - Initial cluster node 01 - -
-
-
- - - - - - - - - - - - - 1 - - - - - 2 - - - - - 3 - - - - - 4 - - - - - 5 - - - - - 6 - - - - - 7 - - - - - 8 - - - - - 9 - - - - - 10 - - - - - 11 - - - - - 12 - - - - - - - - - - - - - -
-
-
- internet-router-management -
-
-
-
- - internet-router-management - -
-
-
- - - - - - - - - -
-
-
- management-switch-and-server -
-
-
-
- - management-switch-and-server - -
-
-
- - - - - - - - - -
-
-
- spine02 -
-
-
-
- - spine02 - -
-
-
- - - - - - - - - -
-
-
- leaf03 -
-
-
-
- - leaf03 - -
-
-
- - - - - - - - - -
-
-
- leaf04 -
-
-
-
- - leaf04 - -
-
-
- - - - - - - - - - - - - -
-
-
- - Initial cluster node 02 - -
-
-
-
- - Initial cluster node 02 - -
-
-
- - - - - - - - - - - - - -
-
-
- - Initial cluster node 03 - -
-
-
-
- - Initial cluster node 03 - -
-
-
- - - - - - - - - - - - - -
-
-
- - mirocloud (initial cluster partition nodes) - -
-
-
-
- - mirocloud (initial cluster... - -
-
-
-
- - - - - Text is not SVG - cannot display - - - -
diff --git a/versioned_docs/version-v0.21.9/contributing/01-Proposals/MEP18/autonomous-control-plane-images/ha-initial-cluster.svg b/versioned_docs/version-v0.21.9/contributing/01-Proposals/MEP18/autonomous-control-plane-images/ha-initial-cluster.svg deleted file mode 100644 index 99261ada..00000000 --- a/versioned_docs/version-v0.21.9/contributing/01-Proposals/MEP18/autonomous-control-plane-images/ha-initial-cluster.svg +++ /dev/null @@ -1 +0,0 @@ -123456789101112
internet-router-management
internet-router-management
management-switch-and-server
management-switch-and-server
spine01
spine01
leaf01
leaf01
leaf02
leaf02
mirocloud (initial cluster partition nodes)
mirocloud (initial cluster...
Initial cluster node 01
Initial cluster node 01
123456789101112
internet-router-management
internet-router-management
management-switch-and-server
management-switch-and-server
spine02
spine02
leaf03
leaf03
leaf04
leaf04
Initial cluster node 02
Initial cluster node 02
Initial cluster node 03
Initial cluster node 03
mirocloud (initial cluster partition nodes)
mirocloud (initial cluster...
Text is not SVG - cannot display
\ No newline at end of file diff --git a/versioned_docs/version-v0.21.9/contributing/01-Proposals/MEP18/autonomous-control-plane-images/metal-stack-autonomous-control-plane-full.drawio b/versioned_docs/version-v0.21.9/contributing/01-Proposals/MEP18/autonomous-control-plane-images/metal-stack-autonomous-control-plane-full.drawio deleted file mode 100644 index aae8a12d..00000000 --- a/versioned_docs/version-v0.21.9/contributing/01-Proposals/MEP18/autonomous-control-plane-images/metal-stack-autonomous-control-plane-full.drawio +++ /dev/null @@ -1,1133 +0,0 @@ - - - - - - - - - - - - - - - - - - - -
-
-
- Initial Cluster -
-
-
-
- - Initial Cluster - -
-
-
- - - - - - - - - - - - - - -
-
-
- metal-roles -
-
-
-
- - metal-roles - -
-
-
- - - - - - - -
-
-
- CI -
-
-
-
- - CI - -
-
-
- - - - - - - -
-
-
- K3s Standalone - - - (on Debian) - - -
-
-
-
- - K3s Standalone (on Debian) - -
-
-
- - - - - - - - - - - - - - - - - -
-
-
- Initial Partition -
-
-
-
- - Initial Partition - -
-
-
- - - - - - - - - - - - - -
-
-
- Target Cluster for metal-stack -
-
-
-
- - Target Cluster for metal-stack - -
-
-
- - - - - - - - - - - - - -
-
-
- Metal Control Plane -
-
-
-
- - Metal Control Plane - -
-
-
- - - - - - - - - - - -
-
-
- provisions -
-
-
-
- - provisions - -
-
-
- - - - - - - - - - - - - -
-
-
- Target Cluster for Gardener -
-
-
-
- - Target Cluster for Gardener - -
-
-
- - - - - - - - - - -
-
-
- Gardener Control Plane -
-
-
-
- - Gardener Control Plane - -
-
-
- - - - - - - - - - - - - - - - - -
-
-
- Monitoring -
-
-
-
- - Monitoring - -
-
-
- - - - - - - - - - - - - - - - -
-
-
- Target Partition -
-
-
-
- - Target Partition - -
-
-
- - - - - - - - - - -
-
-
- Gardener Seeds and End-User Shoots -
-
-
-
- - Gardener Seeds and End-User Shoots - -
-
-
- - - - - - - - - - - -
-
-
- provisions -
-
-
-
- - provisions - -
-
-
- - - - - - - - -
-
-
- metal-roles -
-
-
-
- - metal-roles - -
-
-
- - - - - - - -
-
-
- CI -
-
-
-
- - CI - -
-
-
- - - - - - - - -
-
-
- metal-roles -
-
-
-
- - metal-roles - -
-
-
- - - - - - - - - - -
-
-
- ETCD can be clustered or standalone, backed up by sidecar -
-
-
-
- - ETCD can be clustere... - -
-
-
- - - - - - - - - - -
-
-
- This data will get lost in case local PV gets deleted -
-
-
-
- - This data will get l... - -
-
-
- - - - - - - - - - -
-
-
- We can work with local PVs here, too. -
- backup-restore-sidecar for metal-stack databases, for big ones Postgres clustered. -
-
-
-
- - We can work with local PVs he... - -
-
-
- - - - - - - -
-
-
- ETCD will be deployed in HA configuration on local PVs. -
-
- csi-driver-lvm needs to implement auto deletion of orphaned PVs. -
-
- Seed metrics get lost, but they report to the monitoring in the Metal Control Plane Shoot. -
-
-
-
- - ETCD will be deployed in HA c... - -
-
-
- - - - - - - - - - -
-
-
- More sophisticated storage solutions can be in place. -
-
- (Lightbits, NetApp, ...) -
-
-
-
- - More sophisticated storage so... - -
-
-
- - - - - - - - - - -
-
-
- TODO: Evaluate how to persist these metrics. -
-
-
-
- - TODO: Evaluate how to persist... - -
-
-
- - - - - - - - - - -
-
-
- - 1 VM or -
-
-
- - - 3 Bare Metal Machines - - -
-
-
-
-
- - 1 VM or... - -
-
-
- - - - - - - - - - - - - - -
-
-
- metal-stack -
-
-
-
- - metal-stack - -
-
-
- - - - - - - -
-
-
- metal-api -
-
-
-
- - metal-api - -
-
-
- - - - - - - -
-
-
- metal-db -
-
-
-
- - metal-db - -
-
-
- - - - - - - -
-
-
- ipam-db -
-
-
-
- - ipam-db - -
-
-
- - - - - - - -
-
-
- masterdata-db -
-
-
-
- - masterdata-db - -
-
-
- - - - - - - -
-
-
- headscale-db -
-
-
-
- - headscale-db - -
-
-
- - - - - - - -
-
-
- auditing-db -
-
-
-
- - auditing-db - -
-
-
- - - - - - - -
-
-
- nsqd -
-
-
-
- - nsqd - -
-
-
- - - - - - - - - - - -
-
-
- Gardener -
-
-
-
- - Gardener - -
-
-
- - - - - - - - - - -
-
-
- Virtual Garden -
-
-
-
- - Virtual Garden - -
-
-
- - - - - - - -
-
-
- Gardener Control Plane -
-
-
-
- - Gardener Control Plane - -
-
-
- - - - - - - -
-
-
- gardenlet -
-
-
-
- - gardenlet - -
-
-
- - - - - - - -
-
-
- Garden etcd -
-
-
-
- - Garden etcd - -
-
-
- - - - - - - -
-
-
- Prometheus -
-
-
-
- - Prometheus - -
-
-
- - - - - - - - - - - -
-
-
- Monitoring -
-
-
-
- - Monitoring - -
-
-
- - - - - - - - - - -
-
-
- - Gitlab - -
- - Runner - -
-
-
-
-
- - Gitlab... - -
-
-
- - - - - - - - - - -
-
-
- Services -
-
-
-
- - Services - -
-
-
- - - - - - - -
-
-
- PowerDNS -
-
-
-
- - PowerDNS - -
-
-
- - - - - - - -
-
-
- boulder -
-
-
-
- - boulder - -
-
-
- - - - - - - -
-
-
- NTP -
-
-
-
- - NTP - -
-
-
- - - - - - - -
-
-
- OIDC -
-
-
-
- - OIDC - -
-
-
- - - - - - - -
-
-
- ... -
-
-
-
- - ... - -
-
-
-
- - - - - Text is not SVG - cannot display - - - -
diff --git a/versioned_docs/version-v0.21.9/contributing/01-Proposals/MEP18/autonomous-control-plane-images/metal-stack-autonomous-control-plane-full.svg b/versioned_docs/version-v0.21.9/contributing/01-Proposals/MEP18/autonomous-control-plane-images/metal-stack-autonomous-control-plane-full.svg deleted file mode 100644 index e58e783b..00000000 --- a/versioned_docs/version-v0.21.9/contributing/01-Proposals/MEP18/autonomous-control-plane-images/metal-stack-autonomous-control-plane-full.svg +++ /dev/null @@ -1 +0,0 @@ -
Initial Cluster
Initial Cluster
metal-roles
metal-roles
CI
CI
K3s Standalone(on Debian)
K3s Standalone (on Debian)
Initial Partition
Initial Partition
Target Cluster for metal-stack
Target Cluster for metal-stack
Metal Control Plane
Metal Control Plane
provisions
provisions
Target Cluster for Gardener
Target Cluster for Gardener
Gardener Control Plane
Gardener Control Plane
Monitoring
Monitoring
Target Partition
Target Partition
Gardener Seeds and End-User Shoots
Gardener Seeds and End-User Shoots
provisions
provisions
metal-roles
metal-roles
CI
CI
metal-roles
metal-roles
ETCD can be clustered or standalone, backed up by sidecar
ETCD can be clustere...
This data will get lost in case local PV gets deleted
This data will get l...
We can work with local PVs here, too.
backup-restore-sidecar for metal-stack databases, for big ones Postgres clustered.
We can work with local PVs he...
ETCD will be deployed in HA configuration on local PVs.

csi-driver-lvm needs to implement auto deletion of orphaned PVs.

Seed metrics get lost, but they report to the monitoring in the Metal Control Plane Shoot.
ETCD will be deployed in HA c...
More sophisticated storage solutions can be in place.

(Lightbits, NetApp, ...)
More sophisticated storage so...
TODO: Evaluate how to persist these metrics.
TODO: Evaluate how to persist...
1 VM or
3 Bare Metal Machines
1 VM or...
metal-stack
metal-stack
metal-api
metal-api
metal-db
metal-db
ipam-db
ipam-db
masterdata-db
masterdata-db
headscale-db
headscale-db
auditing-db
auditing-db
nsqd
nsqd
Gardener
Gardener
Virtual Garden
Virtual Garden
Gardener Control Plane
Gardener Control Plane
gardenlet
gardenlet
Garden etcd
Garden etcd
Prometheus
Prometheus
Monitoring
Monitoring
Gitlab
Runner
Gitlab...
Services
Services
PowerDNS
PowerDNS
boulder
boulder
NTP
NTP
OIDC
OIDC
...
...
Text is not SVG - cannot display
\ No newline at end of file diff --git a/versioned_docs/version-v0.21.9/contributing/01-Proposals/MEP18/autonomous-control-plane-images/metal-stack-chain.drawio b/versioned_docs/version-v0.21.9/contributing/01-Proposals/MEP18/autonomous-control-plane-images/metal-stack-chain.drawio deleted file mode 100644 index cd5cf007..00000000 --- a/versioned_docs/version-v0.21.9/contributing/01-Proposals/MEP18/autonomous-control-plane-images/metal-stack-chain.drawio +++ /dev/null @@ -1,404 +0,0 @@ - - - - - - - - - - -
-
-
- Partition 1 -
-
-
-
- - Partition 1 - -
-
- - - - -
-
-
- seeds -
-
-
-
- - seeds - -
-
- - - - -
-
-
- shoots -
-
-
-
- - shoots - -
-
- - - - - - -
-
-
- Partition 2 -
-
-
-
- - Partition 2 - -
-
- - - - -
-
-
- seeds -
-
-
-
- - seeds - -
-
- - - - -
-
-
- shoots -
-
-
-
- - shoots - -
-
- - - - - - -
-
-
- Partition 3 -
-
-
-
- - Partition 3 - -
-
- - - - -
-
-
- seeds -
-
-
-
- - seeds - -
-
- - - - -
-
-
- shoots -
-
-
-
- - shoots - -
-
- - - - - - -
-
-
- Production Control Plane -
-
-
-
- - Production Control Plane - -
-
- - - - -
-
-
- metal-stack -
- kubernetes cluster -
-
-
-
- - metal-stack... - -
-
- - - - -
-
-
- gardener -
- kubernetes cluster -
-
-
-
- - gardener... - -
-
- - - - -
-
-
- - Manages - -
-
-
-
- - Manages - -
-
- - - - - - - - -
-
-
- Control Plane Partition -
-
-
-
- - Control Plane Partition - -
-
- - - - - -
-
-
- backup of stateful sets -
-
-
-
- - backup of stateful sets - -
-
- - - - - - -
-
-
- bare metal machine -
-
-
-
- - bare metal machine - -
-
- - - - -
-
-
- metal-stack -
- and -
- gardener -
- kubernetes cluster -
- running in kind -
-
-
-
- - metal-stack... - -
-
- - - - -
-
-
- - Manages - -
-
-
-
- - Manages - -
-
- - - - - -
-
-
- S3 -
-
-
-
- - S3 - -
-
- - - - -
-
-
- Needle -
-
-
-
- - Needle - -
-
- - - -
-
-
- - Nail - -
-
-
-
- - Nail - -
-
-
- - - - - Text is not SVG - cannot display - - - -
\ No newline at end of file diff --git a/versioned_docs/version-v0.21.9/contributing/01-Proposals/MEP18/autonomous-control-plane-images/metal-stack-chain.svg b/versioned_docs/version-v0.21.9/contributing/01-Proposals/MEP18/autonomous-control-plane-images/metal-stack-chain.svg deleted file mode 100644 index 8f88ba14..00000000 --- a/versioned_docs/version-v0.21.9/contributing/01-Proposals/MEP18/autonomous-control-plane-images/metal-stack-chain.svg +++ /dev/null @@ -1 +0,0 @@ -
Partition 1
Partition 1
seeds
seeds
shoots
shoots
Partition 2
Partition 2
seeds
seeds
shoots
shoots
Partition 3
Partition 3
seeds
seeds
shoots
shoots
Production Control Plane
Production Control Plane
metal-stack
kubernetes cluster
metal-stack...
gardener
kubernetes cluster
gardener...
Manages
Manages
Control Plane Partition
Control Plane Partition
backup of stateful sets
backup of stateful sets
bare metal machine
bare metal machine
metal-stack
and
gardener
kubernetes cluster
running in kind
metal-stack...
Manages
Manages
S3
S3
Needle
Needle 
Nail
Nail
Text is not SVG - cannot display
\ No newline at end of file diff --git a/versioned_docs/version-v0.21.9/contributing/01-Proposals/MEP18/autonomous-control-plane-images/small-initial-cluster.drawio b/versioned_docs/version-v0.21.9/contributing/01-Proposals/MEP18/autonomous-control-plane-images/small-initial-cluster.drawio deleted file mode 100644 index a75ee340..00000000 --- a/versioned_docs/version-v0.21.9/contributing/01-Proposals/MEP18/autonomous-control-plane-images/small-initial-cluster.drawio +++ /dev/null @@ -1,234 +0,0 @@ - - - - - - - - - - - - - - - - - - - 1 - - - - - 2 - - - - - 3 - - - - - 4 - - - - - 5 - - - - - 6 - - - - - 7 - - - - - 8 - - - - - 9 - - - - - 10 - - - - - 11 - - - - - 12 - - - - - - - - - - - - - -
-
-
- internet-router-management -
-
-
-
- - internet-router-management - -
-
-
- - - - - - - - - -
-
-
- management-switch-and-server -
-
-
-
- - management-switch-and-server - -
-
-
- - - - - - - - - -
-
-
- leaf01 -
-
-
-
- - leaf01 - -
-
-
- - - - - - - - - -
-
-
- leaf02 -
-
-
-
- - leaf02 - -
-
-
- - - - - - - - - - - - - -
-
-
- Initial cluster node -
-
-
-
- - Initial cluster node - -
-
-
- - - - - - - - - - - - - -
-
-
- mirocloud (initial cluster partition nodes) -
-
-
-
- - mirocloud (initial cluster... - -
-
-
-
- - - - - Text is not SVG - cannot display - - - -
diff --git a/versioned_docs/version-v0.21.9/contributing/01-Proposals/MEP18/autonomous-control-plane-images/small-initial-cluster.svg b/versioned_docs/version-v0.21.9/contributing/01-Proposals/MEP18/autonomous-control-plane-images/small-initial-cluster.svg deleted file mode 100644 index a9d29f05..00000000 --- a/versioned_docs/version-v0.21.9/contributing/01-Proposals/MEP18/autonomous-control-plane-images/small-initial-cluster.svg +++ /dev/null @@ -1 +0,0 @@ -123456789101112
internet-router-management
internet-router-management
management-switch-and-server
management-switch-and-server
leaf01
leaf01
leaf02
leaf02
Initial cluster node
Initial cluster node
mirocloud (initial cluster partition nodes)
mirocloud (initial cluster...
Text is not SVG - cannot display
\ No newline at end of file diff --git a/versioned_docs/version-v0.21.9/contributing/01-Proposals/MEP2/README.md b/versioned_docs/version-v0.21.9/contributing/01-Proposals/MEP2/README.md deleted file mode 100644 index c7f2360a..00000000 --- a/versioned_docs/version-v0.21.9/contributing/01-Proposals/MEP2/README.md +++ /dev/null @@ -1,7 +0,0 @@ ---- -slug: /MEP-2-two-factor-authentication -title: MEP-2 -sidebar_position: 2 ---- - -# Two Factor Authentication diff --git a/versioned_docs/version-v0.21.9/contributing/01-Proposals/MEP3/README.md b/versioned_docs/version-v0.21.9/contributing/01-Proposals/MEP3/README.md deleted file mode 100644 index 5ce36721..00000000 --- a/versioned_docs/version-v0.21.9/contributing/01-Proposals/MEP3/README.md +++ /dev/null @@ -1,67 +0,0 @@ ---- -slug: /MEP-3-machine-re-installation -title: MEP-3 -sidebar_position: 3 ---- - -# Machine Re-Installation - -In the current metal-api only machine installations are possible, performing a machine upgrade is only possible by creating a new machine and delete the old one. -This has the drawback that in case a lot of data is stored on the local disks, a full restore of the original data must be performed. - -To prevent this, we will introduce a new metal-api endpoint to reinstall the machine with a new image, _without_ actually deleting the data stored on the additional hard disks. - -Storage is a difficult task to get right and reliable. A short analysis of our different storage requirements lead to 3 different scenarios. - -- Storage for the etcd pvs in the seed cluster of every partition. - This is the most important storage in our setup because these etcd pods serve as configuration backend for all customer kubernetes clusters. If they fail, the cluster is down. 
However gardener deploys a backup and restore sidecar into the etcd pod of every customer kubernetes control plane, and if this sidecar detects a corrupt or missing etcd database file(s) it starts automatic restore from the configured backup location. This will take some minutes. If for example a node dies, and gardener creates a new node instead, the csi-lvm created pv is not present on that node. Kubernetes will not schedule the missing etcd pod on this node because it has a local PV configured and is therefore tainted to run only on that node. To let kubernetes create that pod anyhow, someone has to either remove the taint, or delete the pod. If this is done, the pod starts and the restore of the etcd data can start as well. You can see this is a bit too complicated and will take the customer cluster down for a while (not measured yet but in the range of 5-10 minutes). -- Storage in customer clusters. - This was not promised in 2020. We have a intermediate solution with the provisioning of csi-lvm by default into all customer clusters. Albeit this is only local storage and will get deleted if a node dies. -- S3 Storage. - We have two possibilities to cope with storage: - - In place update of the OS with a daemonset - This will be fast and simple, but might fail because the packages being installed are broken right now, or a filesystem gets full, or any other failure you can think of during a os update. Another drawback is that metal-api does not reflect the updated os image. - - metal-api get a machine reinstall endpoint - With this approach we leverage from existing and already proven mechanisms. Reinstall must keep all data except the sata-dom. Gardener currently is not able to do an update with this approach because it can only do `rolling` updates. Therefore a additional `osupdatestrategy` has to be implemented for metal and other providers in gardener to be able to leverage the metal reinstall on the same machineID approach. 
- -If reinstall is implemented, we should focus on the same technology for all scenarios and put ceph via rook.io into the kubernetes clusters as additional StorageClass. It has to be checked whether to use the raw disk or a PV as the underlay block device where ceph stores its data. - -## API and behavior - -The API will get an new endpoint "reinstall" this endpoint takes two arguments: - -- machineID -- image - -No other aspects of the machine can be modified during the re-installation. All data stored in the existing allocation will be preserved, only the image will be modified. -Once this endpoint was called, the machine will get a `reboot` signal with the boot order set to PXE instead of HDD and the network interfaces on the leaf are set to PXE as well. Then the normal installation process starts: - -- unchanged: PXE boot with metal-hammer -- changed: metal-hammer first checks with the machineID in the metal-api (through metal-core) if there is already a allocation present -- changed: if a allocation is present and the allocation has set `reinstall: true`, wipe disk is only executed for the root disk, all other disks are untouched. -- unchanged: the specified image is downloaded and burned, `/install.sh` is executed -- unchanged: successful installation is reported back, network is set the the vrf, boot order is set to HDD. -- unchanged: distribution kernel is booted via kexec - -We can see that the `allocation` requires one additional parameter: `reinstall` and metal-hammer must check for already existing allocation at an earlier stage. - -Components which requires modifications (first guess): - -- metal-hammer: - - check for allocation present earlier - - evaluation of `reinstall` flag set - - wipe of disks depends on that flag - - Bonus: move configuration of disk layout and primary disk detection algorithm (PDDA) from metal-hammer into metal-api. - metal-api **MUST** reject reinstallation if the disk found by PDDA does not have the `/etc/metal` directory! 
-- metal-core: - - probably nothing -- metal-api: - - new endpoint `/machine/reinstall` - - add `Reinstall bool` to data model of `allocation` - - make sure to reset `Reinstall` after reinstallation to prevent endless reinstallation loop -- metalctl: - - implement `reinstall` -- metal-go: - - implement `reinstall` -- gardener (longterm): - - add the `OSUpgradeStrategy` `reinstall` diff --git a/versioned_docs/version-v0.21.9/contributing/01-Proposals/MEP4/README.md b/versioned_docs/version-v0.21.9/contributing/01-Proposals/MEP4/README.md deleted file mode 100644 index 389a02d4..00000000 --- a/versioned_docs/version-v0.21.9/contributing/01-Proposals/MEP4/README.md +++ /dev/null @@ -1,211 +0,0 @@ ---- -slug: /MEP-4-multi-tenancy-for-the-metal-api -title: MEP-4 -sidebar_position: 4 ---- - -# Multi-Tenancy for the metal-api -:::info -This document is work in progress. -::: - -In the past we decided to treat the metal-api as a "low-level API", i.e. the API does not specifically deal with projects and tenants. A user with editor access can for example assign machines to every project he desires, he can see all the machines available and can control them. We tried to keep the metal-api code base as small as possible and we added resource scoping to a "higher-level APIs". From there, a user would be able to only see his own clusters and IP addresses. - -As time passed metal-stack has become an open-source project and people are willing to adopt. Adopters who want to put their own technologies on top of the metal-stack infrastructure don't have those "higher-level APIs" that we implemented closed-source for our user base. So, external adopters most likely need to implement resource scoping on their own. 
- -Introducing multi-tenancy to the metal-api is a serious chance of making our product better and more successful as it opens the door for: - -- Becoming a "fully-featured" API -- Narrowing down attack surfaces and possibility of unintended resource modification produced by bugs or human errors -- Discouraging people to implement their own scoping layers in front of the metal-stack -- Gaining performance through resource scopes -- Letting untrusted / third-parties work with the API - -## Requirements - -These are some general requirements / higher objectives that MEP-4 has to fulfill. - -- Should be able to run with mini-lab without requiring to setup complex auth backends (dex, LDAP, keycloak, ...) - - Simple to start with, more complex options for production setups -- Fine-grained access permissions (every endpoint maps to a permission) -- Tenant scoping (disallow resource access to resources of other tenants) -- Project scoping (disallow resource access to resources of other projects) -- Access tokens in self-service for technical user access - -## Implementation - -We gathered a lot of knowledge while implementing a multi-tenancy-capable backend for metalstack.cloud. The goal is now to use the same technology and adopt that to the metal-api, this includes: - -- gRPC in combination with connectrpc -- OPA for making auth decisions -- REST HTTP only for OIDC login flows - -### API Definitions - -The API definitions should be located on a separate Github repository separate from the server implementation. The proposed repository location is: https://github.com/metal-stack/api. - -This repository contains the `proto3` specification of the exposed metal-stack api. This includes the messages, simple validations, services and the access permission to these services. The input parameters for the authorization in the backend are generated from the `proto3` annotations. - -Client implementations for the most relevant languages (go, python) are generated automatically. 
- -This api is divided into end-user and admin access at the top level. The proposed APIs are: - -- `metalstack.api.v2`: For end-user facing services -- `metalstack.admin.v2`: For operators and controllers which need access to unscoped entities - -The methods of the API can have different role scopes (and can be narrowed down further with fine-grained method permissions): - -- `tenant`: Tenant-scoped methods, e.g. project creation (tenant needs to be provided in the request payload) - - Available roles: VIEWER, EDITOR, OWNER -- `project`: Project-scoped methods, e.g. machine creation (tenant needs to be provided in the request payload) - - Available roles: VIEWER, EDITOR, OWNER -- `admin` Admin-scoped methods, e.g. unscoped tenant list or switch register - - Available roles: VIEWER, EDITOR - -And has methods with different visibility scopes: - -- `self`: Methods that only the logged in user can access, e.g. show permissions with the presented token -- `public`: Methods that do not require any specific authorization - -### API - -The API server implements the services defined in the API and validates access to a method using OPA with the JWT tokens passed in the requests. The server is implemented using the connectrpc.com framework. - -The API server implements the login flow through OIDC. After successful authentication, the API server derives user permissions from the OIDC provider and issues a new JWT token which is passed on to the user. The tokens including the permissions are stored in a redis compatible backend. - -With these tokens, users can create Access Tokens for CI/CD or other use cases. - -JWT Tokens can be revoked by admins and the user itself. - -### API Server - -Is put into a new github repo which implements the services defined in the `api` repository. It opens a `https` endpoints where the grpc (via connectrpc.com) and oidc services are exposed. 
- -### Migration of the Consumers - -To allow consumers to migrate to the `v2` API gradually, both apis, the new and the old, are deployed in parallel. In the control-plane both apis are deployed side-by-side behind the ingress. `api.example.com` is forwarded to `metal-api` and `metal.example.com` is forwarded to the new `metal-apiserver`. - -The api-server will talk to the existing metal-api during the process of migration services away to the new grpc api. - -The migration process can be done in the following manner: - -for each resource in the metal-api: - -- create a new proto3 based definition in the `api` repo. -- implement the business logic per service in the new `metal-apiserver` without calling the metal-api. -- clients must be able to talk to `v1` and `v2` backend in parallel -- Deprecate the already migrated service in the swagger route to notify the client that this route should not be used anymore. -- identify all consumers of this resource and replace them to use the grpc instead of the rest api -- move the business logic incl. the backend calls to ipam, metal-db, masterdata-api, nsq for this resource from the metal-api to the `metal-apiserver` - -We will migrate the rethinkdb backend implementation to a generic approach during this effort. - -- Try to enhance the generic rethinkdb interface with `project` scoped methods. - -There are a lot of consumers of metal-api, which need to be migrated: - -- ansible -- firewall-controller -- firewall-controller-manager -- gardener-extension-auth -- gardener-extension-provider-metal - - Do not point the secret bindings to a the shared provider secret in the seed anymore. Instead, use individual provider-secret containing project-scoped API access tokens in the Gardener project namespaces. 
-- machine-controller-manager-provider-metal -- metal-ccm -- metal-console -- metal-bmc -- metal-core -- metal-hammer -- metal-image-cache-sync -- metal-images -- metal-metrics-exporter -- metal-networker -- metalctl -- pixie - -## User Scenarios - -This section gathers a collection of workflows from the perspective of a user that we want to provide with the implementation of this proposal. - -### Machine Creation - -A regular user wants to create a machine resource. - -Requirements: Project was created, permissions are present - -- The user can see networks that were provided by the admin. - - ``` - $ metalctl network ls - ID NAME PROJECT PARTITION NAT SHARED PREFIXES IPS - internet Internet Network true false 212.34.83.0/27  ● - tenant-super-network-fra-equ01 Project Super Network fra-equ01 false false 10.128.0.0/14  ● - underlay-fra-equ01 Underlay Network fra-equ01 false false 10.0.0.0/16  ● - ``` - -- The user has to set the project scope first or provide `--project` flags for all commands. - ``` - $ metalctl project set 793bb6cd-8b46-479d-9209-0fedca428fe1 - You are now acting on project 793bb6cd-8b46-479d-9209-0fedca428fe1. - ``` -- The user can create the child network required for machine allocation. - ``` - $ metalctl network allocate --partition fra-equ01 --name test - ``` -- Now, the user sees his own child network. - ``` - $ metalctl network ls - ID NAME PROJECT PARTITION NAT SHARED PREFIXES IPS - internet Internet Network true false 212.34.83.0/27  ● - tenant-super-network-fra-equ01 Project Super Network fra-equ01 false false 10.128.0.0/14  ● - └─╴08b9114b-ec47-4697-b402-a11421788dc6 test 793bb6cd-8b46-479d-9209-0fedca428fe1 fra-equ01 false false 10.128.64.0/22  ● - underlay-fra-equ01 Underlay Network fra-equ01 false false 10.0.0.0/16  ● - ``` -- The user does not see any machines yet. - ``` - $ metalctl machine ls - ``` -- The user can create a machine. 
- ``` - $ metalctl machine create --networks internet,08b9114b-ec47-4697-b402-a11421788dc6 --name test --hostname test --image ubuntu-20.04 --partition fra-equ01 --size c1-xlarge-x86` - ``` -- The machine will now be provisioned. - ``` - $ metalctl machine ls - ID LAST EVENT WHEN AGE HOSTNAME PROJECT SIZE IMAGE PARTITION - 00000000-0000-0000-0000-ac1f6b7befb2 Phoned Home 20s 50d 4h test 793bb6cd-8b46-479d-9209-0fedca428fe1 c1-xlarge-x86 Ubuntu 20.04 20210415 fra-equ01 - ``` - -:::warning -A user **cannot** list all allocated machines for all projects. The user **must** always switch project context first and can only view the machines inside this project. Only admins can see all machines at once. -::: -### Scopes for Resources - -The admins / operators of the metal-stack should be able to provide _global_ resources that users are able to use along with their own resources. In particular, users can view and use _global_ resources, but they are not allowed to create, modify or delete them. - -:::info -When a project ID field is empty on a resource, the resource is considered _global_. -::: - -Where possible, users should be capable of creating their own resource entities. - -| Resource | User | Global | -| :----------------- | :--- | :----- | -| File System Layout | yes | yes | -| Firewall | yes | | -| Firmware | | yes | -| OS Image | | yes | -| Machine | yes | | -| Network (Base) | | yes | -| Network (Children) | yes | | -| IP | yes | | -| Partition | | yes | -| Project | yes | | -| Project Token | yes | | -| Size | | yes | -| Switch | | | -| Tenant | | yes | - -:::info -Example: A user can make use of the file system layouts provided by the admins, but can also create own layouts. Same applies for images. As soon as a user creates own resources, the user takes over the responsibility for the machine provisioning to succeed. 
-::: diff --git a/versioned_docs/version-v0.21.9/contributing/01-Proposals/MEP5/README.md b/versioned_docs/version-v0.21.9/contributing/01-Proposals/MEP5/README.md deleted file mode 100644 index 3b7fc45c..00000000 --- a/versioned_docs/version-v0.21.9/contributing/01-Proposals/MEP5/README.md +++ /dev/null @@ -1,54 +0,0 @@ ---- -slug: /MEP-5-shared-networks -title: MEP-5 -sidebar_position: 5 ---- - -# Shared Networks - -## Why are shared networks needed - -For special purpose machines that serve shared services with performance critical workloads to all machines of a partition (like persistent storage) it would be good to have kind of a "shared network" that is easily accessible. -They do not necessarily need another firewall. This would avoid having two firewalls in the datapath between a machine in a private network and the machines of a shared service. - -## Constraints that need to hold - -- a shared network is usable from all machines that have a firewall in front, that uses it -- a shared network is only usable within a single partition (currently we are constrained in bandwidth and have no routing of 10.0.0.0/8 addresses btw. 
partitions and failure domain should be the partition but this constraint might get lifted in the future) -- networks may be marked as shared after network allocation (but there should be no way back from shared to unshared) -- neither machines nor firewalls may have multiple private, unshared networks configured -- machines must have a single primary network configured - - this might be a shared network - - OR a plain, unshared private network -- firewalls may participate in multiple shared networks -- machines can be allocated with a primary network using auto IP allocation or with `noauto` and a specific IP - -## Should shared networks be private - -**Alternative 1:** If we implemented shared networks by extending functions around plain, private networks we would not have to manage another CIDR (mini point) and it would be possible to create a k8s cluster with a private network, mark the network as `shared` and produce shared services from this k8s cluster. - -**Alternative 2:** If shared networks are implemented as first class networks we could customize the VRF and also accomplish an other goal of our roadmap: being able to create machines directly in an external network. - -Together with @majst01 and @Gerrit91 we decided to continue to implement **Alternative 1**. - -## Firewalls accessing a shared network - -Firewalls that access shared networks need to: - -- hide the private network behind an ip address of the shared network if the shared network was configured with `nat=true`. -- import the prefixes of the shared VRF to the private VRF and import the prefixes of the private VRF to the shared VRF so that the communication between the two is working in both directions. As long as no `nat=true` was set on the shared VRF, the original machine ips are visible in both communication directions. 
- -## Setup with shared networks and single consumer - -![Simple Setup](./shared.png) - -## Setup with single shared network and multiple consumers - -![Advanced Setup](./shared_advanced.png) - -## Getting internet access - -Machines contained in a shared network can access the internet with different scenarios: - -- if they have an own firewall: this is internet accessibility, as common (check whether all traffic gets routed through it!) -- if they don't have an own firewall, an external HTTP proxy is needed that has an endpoint exposed as Service Type NodePort diff --git a/versioned_docs/version-v0.21.9/contributing/01-Proposals/MEP5/shared.drawio b/versioned_docs/version-v0.21.9/contributing/01-Proposals/MEP5/shared.drawio deleted file mode 100644 index aa7af045..00000000 --- a/versioned_docs/version-v0.21.9/contributing/01-Proposals/MEP5/shared.drawio +++ /dev/null @@ -1,121 +0,0 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - \ No newline at end of file diff --git a/versioned_docs/version-v0.21.9/contributing/01-Proposals/MEP5/shared.png b/versioned_docs/version-v0.21.9/contributing/01-Proposals/MEP5/shared.png deleted file mode 100644 index b0b47f03..00000000 Binary files a/versioned_docs/version-v0.21.9/contributing/01-Proposals/MEP5/shared.png and /dev/null differ diff --git a/versioned_docs/version-v0.21.9/contributing/01-Proposals/MEP5/shared_advanced.drawio b/versioned_docs/version-v0.21.9/contributing/01-Proposals/MEP5/shared_advanced.drawio deleted file mode 100644 index 6f96eca0..00000000 --- a/versioned_docs/version-v0.21.9/contributing/01-Proposals/MEP5/shared_advanced.drawio +++ /dev/null @@ -1,187 +0,0 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - \ No newline at end of file diff --git a/versioned_docs/version-v0.21.9/contributing/01-Proposals/MEP5/shared_advanced.png b/versioned_docs/version-v0.21.9/contributing/01-Proposals/MEP5/shared_advanced.png deleted file mode 100644 index da989915..00000000 Binary files a/versioned_docs/version-v0.21.9/contributing/01-Proposals/MEP5/shared_advanced.png and /dev/null differ diff --git a/versioned_docs/version-v0.21.9/contributing/01-Proposals/MEP6/README.md b/versioned_docs/version-v0.21.9/contributing/01-Proposals/MEP6/README.md deleted file mode 100644 index edf52a6c..00000000 --- a/versioned_docs/version-v0.21.9/contributing/01-Proposals/MEP6/README.md +++ /dev/null @@ -1,123 +0,0 @@ ---- -slug: /MEP-6-dmz-networks -title: MEP-6 -sidebar_position: 6 ---- - -# DMZ Networks - -## Reasoning - -To fulfill higher levels of security measures the standard metal-stack approach with a single firewall in front of a set of machines might be insufficient. -There are cases where two physically distinct firewalls in front of application workload are mandatory. In traditional network terms this is known as DMZ approach. - -For Kubernetes workloads it makes sense to use the front cluster for ingress, WAF purposes and as outgoing proxy. The clusters may be used for application workload. - -## DMZ network - -- Use a separate DMZ network prefix for every tenant -- This is used as intermediate network btw. 
private networks of a tenant and the internet -- For every partition a distinct DMZ firewall/cluster is needed for a tenant -- For Gardener orchestrated Kubernetes clusters this network must be a publicly reachable internet prefix because shoot clusters need a vpn service that is used for instrumentation from the seed cluster - this will be a requirement as long as the inverse vpn tunnel feature Konnectivity is not available to us. - -## Approach 1: DMZ with publicly reachable internet prefix - -![DMZ Internet](dmz-internet_public.svg) - -A DMZ network with publicly reachable internet prefix will look like this in the metal-api: - -```yaml ---- -description: DMZ-Network -destinationprefixes: - - 0.0.0.0/0 -id: dmz -labels: - network.metal-stack.io/default-external: "" -name: DMZ-Network -parentnetworkid: null -partitionid: "" -prefixes: - - 212.90.30.128/25 -privatesuper: false -projectid: "" -vrf: 104007 -vrfshared: false -nat: true -shared: false -underlay: false -``` - -### DMZ firewall - -The firewall of the DMZ will intersect its private network for attached machines, the DMZ network and the public internet. - -- The private network of the project needs to import - - the default route from the internet network - - the DMZ network -- The internet network must import the DMZ network -- The DMZ network provides the default route for tenant's clusters in a partition. It imports the default route from the internet network - -### Application Firewall - -The firewall of application workloads intersects its private network for attached machines and the DMZ network. - -This is currently supported by the metal-networker and needs no further changes! 
- -## Approach 2: DMZ with private IPs - -![DMZ Internet](dmz-internet_private.svg) - -A DMZ network with private IPs will look like this in the metal-api: - -```yaml ---- -description: DMZ-Network -destinationprefixes: - - 0.0.0.0/0 -id: dmz -labels: - network.metal-stack.io/default-external: "" -name: DMZ-Network -parentnetworkid: tenant-super-network-fra-equ01 -partitionid: fra-equ01 -prefixes: - - 10.90.30.128/25 -privatesuper: false -projectid: "" -vrf: 4711 -vrfshared: false -nat: true -shared: true # it's usable from multiple projects -underlay: false -``` - -### DMZ firewall - -The firewall of the DMZ will intersect its private network for attached machines, the DMZ network and the public internet. - -- The private network of the project needs to import - - the default route from the internet network - - the DMZ network -- The internet network must import the DMZ network (only locally, no-export) -- The DMZ network provides the default route for tenant's clusters in a partition. It imports the default route from the internet network - -### Application Firewall - -The firewall of application workloads intersects its private network for attached machines and the DMZ network. 
- -## Code Changes / Implications - -- `metal-networker` and `metal-ccm` assume that there is only one network providing the default-route -- `metal-networker` needs to - - import the default route from the internet network to the dmz network (DMZ Firewall) - - import the DMZ network to the internet network and adjusting NAT rules (DMZ Firewall) - - import destination prefixes of the DMZ network to the private primary network (DMZ Firewall, Application Firewall) - - import DMZ-IPs of the private primary network to the DMZ network (DMZ Firewall, Application Firewall) -- `metal-api`: destination prefixes of private networks need to be configurable (`allocateNetwork`) -- `gardener-extension-provider-metal`: needs to be able to delete DMZ clusters (but skip the network deletion part) -- the application firewall is not publicly reachable - for debugging purposes a hop over the DMZ firewall is needed - -## Decision - -We decided to follow the second approach with private DMZ networks. diff --git a/versioned_docs/version-v0.21.9/contributing/01-Proposals/MEP6/dmz-internet_private.drawio b/versioned_docs/version-v0.21.9/contributing/01-Proposals/MEP6/dmz-internet_private.drawio deleted file mode 100644 index 7b83bbfc..00000000 --- a/versioned_docs/version-v0.21.9/contributing/01-Proposals/MEP6/dmz-internet_private.drawio +++ /dev/null @@ -1,178 +0,0 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - \ No newline at end of file diff --git a/versioned_docs/version-v0.21.9/contributing/01-Proposals/MEP6/dmz-internet_private.svg b/versioned_docs/version-v0.21.9/contributing/01-Proposals/MEP6/dmz-internet_private.svg deleted file mode 100644 index f5e58204..00000000 --- 
a/versioned_docs/version-v0.21.9/contributing/01-Proposals/MEP6/dmz-internet_private.svg +++ /dev/null @@ -1,3 +0,0 @@ -
Machine
Machine
Firewall DMZ
Firewall DMZ
DMZ VRF
DMZ VRF
Machine
Machine
Firewall A
Firewall A
Private VRF A
Private VRF A
10.0.0.2
10.90.30.129
/0 via Firewall A
10.0.0.2...
VRF A 10.0.0.1
VRF A 10.0.0.1
DMZ Network
10.90.30.128/25
DMZ Network...
Private Network
10.0.0.0/24
Private Network...
import /0
import /0
import 10.0.0.0/24
import 10.0.0.0/24 -
Machine
Machine
Firewall B
Firewall B
Private VRF B
Private VRF B
10.0.1.2
/0 via Firewall B
10.0.1.2...
VRF B 10.0.1.1
VRF B 10.0.1.1
Private Network
10.0.1.0/24
Private Network...
import /0
import /0
import 10.0.1.0/24
import 10.0.1.0/24 -
10.90.30.129 is reachable
/0 via Firewall DMZ
10.0.0.0/24 is reachable
10.0.1.0/24 is reachable
10.90.30.129 is reachable...
Internet
212.1.1.0/27
Internet...
SNAT to 212.1.1.1
SNAT to 212.1.1.1
Internet VRF
Internet VRF
import /0
import /0

import 10.0.0.0/24 no export
import 10.0.1.0/24 no export
import 10.90.30.128/25 no export
import 10.0.0.0/24 no exp...
Viewer does not support full SVG 1.1
\ No newline at end of file diff --git a/versioned_docs/version-v0.21.9/contributing/01-Proposals/MEP6/dmz-internet_public.drawio b/versioned_docs/version-v0.21.9/contributing/01-Proposals/MEP6/dmz-internet_public.drawio deleted file mode 100644 index 544939e5..00000000 --- a/versioned_docs/version-v0.21.9/contributing/01-Proposals/MEP6/dmz-internet_public.drawio +++ /dev/null @@ -1,184 +0,0 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - \ No newline at end of file diff --git a/versioned_docs/version-v0.21.9/contributing/01-Proposals/MEP6/dmz-internet_public.svg b/versioned_docs/version-v0.21.9/contributing/01-Proposals/MEP6/dmz-internet_public.svg deleted file mode 100644 index 5e825081..00000000 --- a/versioned_docs/version-v0.21.9/contributing/01-Proposals/MEP6/dmz-internet_public.svg +++ /dev/null @@ -1,3 +0,0 @@ -
Machine
Machine
Firewall DMZ
Firewall DMZ
DMZ VRF
DMZ VRF
Machine
Machine
Firewall A
Firewall A
Private VRF A
Private VRF A
10.0.0.2
212.1.2.3
/0 via Firewall A
10.0.0.2...
VRF A 10.0.0.1
VRF A 10.0.0.1
DMZ Network
212.1.2.0/27
DMZ Network...
Private Network
10.0.0.0/24
Private Network...
import /0
import /0
import 10.0.0.0/24
import 10.0.0.0/24 -
Machine
Machine
Firewall B
Firewall B
Private VRF B
Private VRF B
10.0.1.2
/0 via Firewall B
10.0.1.2...
VRF B 10.0.1.1
VRF B 10.0.1.1
Private Network
10.0.1.0/24
Private Network...
import /0
import /0
import 10.0.1.0/24
import 10.0.1.0/24 -
212.1.2.3 is reachable
/0 via Firewall DMZ
212.1.2.3 is reachable...
Internet
212.1.1.0/27 212.1.2.0/27
Internet...
SNAT to 212.1.1.1
SNAT to 212.1.1.1
Internet VRF
Internet VRF
import /0
import /0
import 212.1.2.0/27
import 10.0.0.0/24 no redistribute
import 10.0.1.0/24 no redistribute

import 212.1.2.0/27...
SNAT to
212.1.2.1
SNAT to...
SNAT to
212.1.2.2
SNAT to...
Viewer does not support full SVG 1.1
\ No newline at end of file diff --git a/versioned_docs/version-v0.21.9/contributing/01-Proposals/MEP8/README.md b/versioned_docs/version-v0.21.9/contributing/01-Proposals/MEP8/README.md deleted file mode 100644 index 14748fae..00000000 --- a/versioned_docs/version-v0.21.9/contributing/01-Proposals/MEP8/README.md +++ /dev/null @@ -1,503 +0,0 @@ ---- -slug: /MEP-7-configurable-filesystem-layout-for-machine-allocation -title: MEP-7 -sidebar_position: 7 ---- - -# Configurable Filesystem layout for Machine Allocation - -The current implementation uses a hard coded filesystem layout depending on the specified size and image. This is done in the metal-hammer. This worked well in the past because we had a small amount of sizes and images. But we reached a point where this is to restricted for all use cases we have to fulfill. It also forces us to modify the metal-hammer source code to support a new filesystem layout. - -This proposal tries to address this issue by introducing a filesystem layout struct in the metal-api which is then configurable per machine allocation. -The original behavior of automatic filesystem layout decision must still be present, because there must be no API change for existing API consumers. It should be a additional feature during machine allocation. - -## API and behavior - -The API will get a new endpoint `filesystemlayouts`to create/update/delete a set of available `filesystemlayouts`. - -### Constraints - -In order to keep the actual machine allocation api compatible, there must be no difference while allocating a machine. To achieve this every -`filesystemlayout` defines constraints which specifies for which combination of `sizes` and `images` this layout should be used by default. -The specified constraints over all `filesystemlayouts` therefore must be collision free, to be more specific, there must be exactly one layout outcome -for every possible combination of `sizes` and `images`. 
- -The `size` constraint must be a list of the exact size ids, the `image` constraint must be a map of os to semver compatible version constraint. For example: - -- `debian: ">= 10.20210101"` or `debian: "< 10.20210101"` - -The general form of a `image` constraint is a map from `os` to `versionconstraint` where: - -`os` must match the first part of the image without the version. -`versionconstraint` must be the comparator, a space and the version, or simply `*` to match all versions of this `os`. -The comparator must be one of: "=", "!=", ">", "<", ">=", "=>", "<=", "=<", "~", "~>", "^" - -It must also be possible to have a `filesystemlayout` in development or for other special purposes, which can be specified during the machine allocation. -To have such a layout, both constraints `sizes` and `images`must be empty list. - -### Reinstall - -The current reinstall implementation the metal-hammer detects during the installation on which disk the OS was installed and reports back to the metal-api the Report struct which has two properties `primarydisk` and `ospartition`. -Both fields are not required anymore because the logic is now shifted to the `filesystemlayout` definition. If `Disk.WipeOnReinstall` is set to true, this disk will be wiped, default is false and is preserved. - -### Handling of s2-xlarge machines - -These machines are a bit special compared to our `c1-*` machines because they have rotating hard disks for the mass storage purpose. -The downside is that the on board SATA-DOM has the same naming as the HDDs and can not be specified as the first /dev/sda disk because all HDDs are also /dev/sd\* disks. -Therefore we had a special SATA-DOM detection algorithm inside metal-hammer which simply checks for the smallest /dev/sd disk and took this to install the OS. - -This is not possible with the current approach, but we figured out that the SATA-DOM is always `/dev/sde`. 
So we can create a special `filesystemlayout` where the installations is made on this disk. - -### Possible Filesystemlayout hierarchies - -It is only possible to create a filesystem on top of a block device. The creation of a block device can be done on multiple ways, depending on the requirements regarding performance, space and redundancy of the filesystem. -It also depends on the disks available on the server. - -The current approach implements the following hierarchies: - -![filesystems](filesystems.png) - -### Implementation - -```go -// FilesystemLayout to be created on the given machine -type FilesystemLayout struct { - // ID unique layout identifier - ID string - // Description is human readable - Description string - // Filesystems to create on the server - Filesystems []Filesystem - // Disks to configure in the server with their partitions - Disks []Disk - // Raid if not empty, create raid arrays out of the individual disks, to place filesystems onto - Raid []Raid - // VolumeGroups to create - VolumeGroups []VolumeGroup - // LogicalVolumes to create on top of VolumeGroups - LogicalVolumes []LogicalVolume - // Constraints which must match to select this Layout - Constraints FilesystemLayoutConstraints -} - -type FilesystemLayoutConstraints struct { - // Sizes defines the list of sizes this layout applies to - Sizes []string - // Images defines a map from os to versionconstraint - // the combination of os and versionconstraint per size must be conflict free over all filesystemlayouts - Images map[string]string -} - -type RaidLevel string -type Format string -type GPTType string - -// Filesystem defines a single filesystem to be mounted -type Filesystem struct { - // Path defines the mountpoint, if nil, it will not be mounted - Path *string - // Device where the filesystem is created on, must be the full device path seen by the OS - Device string - // Format is the type of filesystem should be created - Format Format - // Label is optional enhances 
readability - Label *string - // MountOptions which might be required - MountOptions []string - // CreateOptions during filesystem creation - CreateOptions []string -} - -// Disk represents a single block device visible from the OS, required -type Disk struct { - // Device is the full device path - Device string - // Partitions to create on this device - Partitions []Partition - // WipeOnReinstall, if set to true the whole disk will be erased if reinstall happens - // during fresh install all disks are wiped - WipeOnReinstall bool -} - -// Raid is optional, if given the devices must match. -// TODO inherit GPTType from underlay device ? -type Raid struct { - // ArrayName of the raid device, most often this will be /dev/md0 and so forth - ArrayName string - // Devices the devices to form a raid device - Devices []Device - // Level the raidlevel to use, can be one of 0,1,5,10 - // TODO what should be support - Level RaidLevel - // CreateOptions required during raid creation, example: --metadata=1.0 for uefi boot partition - CreateOptions []string - // Spares defaults to 0 - Spares int -} - - -// VolumeGroup is optional, if given the devices must match. 
-type VolumeGroup struct { - // Name of the volumegroup without the /dev prefix - Name string - // Devices the devices to form a volumegroup device - Devices []string - // Tags to attach to the volumegroup - Tags []string -} - -// LogicalVolume is a block devices created with lvm on top of a volumegroup -type LogicalVolume struct { - // Name the name of the logical volume, without /dev prefix, will be accessible at /dev/vgname/lvname - Name string - // VolumeGroup the name of the volumegroup - VolumeGroup string - // Size of this LV in mebibytes (MiB) - Size uint64 - // LVMType can be either striped or raid1 - LVMType LVMType -} - -// Partition is a single partition on a device, only GPT partition types are supported -type Partition struct { - // Number of this partition, will be added to the device once partitioned - Number int - // Label to enhance readability - Label *string - // Size given in MebiBytes (MiB) - // if "0" is given the rest of the device will be used, this requires Number to be the highest in this partition - Size string - // GPTType defines the GPT partition type - GPTType *GPTType -} - -const ( - // VFAT is used for the UEFI boot partition - VFAT = Format("vfat") - // EXT3 is usually only used for /boot - EXT3 = Format("ext3") - // EXT4 is the default fs - EXT4 = Format("ext4") - // SWAP is for the swap partition - SWAP = Format("swap") - // None - NONE = Format("none") - - // GPTBoot EFI Boot Partition - GPTBoot = GPTType("ef00") - // GPTLinux Linux Partition - GPTLinux = GPTType("8300") - // GPTLinuxRaid Linux Raid Partition - GPTLinuxRaid = GPTType("fd00") - // GPTLinux Linux Partition - GPTLinuxLVM = GPTType("8e00") - - // LVMTypeLinear append across all physical volumes - LVMTypeLinear = LVMType("linear") - // LVMTypeStriped stripe across all physical volumes - LVMTypeStriped = LVMType("striped") - // LVMTypeStripe mirror with raid across all physical volumes - LVMTypeRaid1 = LVMType("raid1") -) -``` - -Example `metalctl` outputs: - 
-```bash -$ metalctl filesystemlayouts ls -ID DESCRIPTION SIZES IMAGES -default default fs layout c1-large-x86, c1-xlarge-x86 debian >=10, ubuntu >=20.04, centos >=7 -ceph fs layout for ceph s2-large-x86, s2-xlarge-x86 debian >=10, ubuntu >=20.04 -firewall firewall fs layout c1-large-x86, c1-xlarge-x86 firewall >=2 -storage storage fs layout s3-large-x86 centos >=7 -s3 storage fs layout s2-xlarge-x86 debian >=10, ubuntu >=20.04, >=firewall-2 -default-devel devel fs layout -``` - -The `default` layout reflects what is actually implemented in metal-hammer to guarantee backward compatibility. - -```yaml ---- -id: default -constraints: - sizes: - - c1-large-x86 - - c1-xlarge-x86 - images: - debian: ">=10" - ubuntu: ">=20.04" - centos: ">=7" -filesystems: - - path: "/boot/efi" - device: "/dev/sda1" - format: "vfat" - options: "-F 32" - label: "efi" # required to be compatible with old images - - path: "/" - device: "/dev/sda2" - format: "ext4" - label: "root" # required to be compatible with old images - - path: "/var/lib" - device: "/dev/sda3" - format: "ext4" - label: "varlib" # required to be compatible with old images - - path: "/tmp" - device: "tmpfs" - format: "tmpfs" - mountoptions: - [ - "defaults", - "noatime", - "nosuid", - "nodev", - "noexec", - "mode=1777", - "size=512M", - ] -disks: - - device: "/dev/sda" - wipe: true - partitions: - - number: 1 - label: "efi" - size: 500 - type: GPTBoot - - number: 2 - label: "root" - size: 5000 - type: GPTLinux - - number: 3 - label: "varlib" - size: 0 # to end of partition - type: GPTLinux -``` - -The `firewall` layout reuses the built in nvme disk to store the logs, which is way faster and larger than what the sata-dom ssd provides. 
- -```yaml ---- -id: firewall -constraints: - sizes: - - c1-large-x86 - - c1-xlarge-x86 - images: - firewall: ">=2" -filesystems: - - path: "/boot/efi" - device: "/dev/sda1" - format: "vfat" - options: "-F 32" - - path: "/" - device: "/dev/sda2" - format: "ext4" - - path: "/var" - device: "/dev/nvme0n1p1" - format: "ext4" -disks: - - device: "/dev/sda" - wipe: true - partitions: - - number: 1 - label: "efi" - size: 500 - type: GPTBoot - - number: 2 - label: "root" - size: 5000 - type: GPTLinux - - device: "/dev/nvme0n1" - wipe: true - partitions: - - number: 1 - label: "var" - size: 0 - type: GPTLinux -``` - -The `storage` layout will be used for the storage servers, which must have mirrored boot disks. - -```yaml ---- -id: storage -constraints: - sizes: - - s3-large-x86 - images: - centos: ">=7" -filesystems: - - path: "/boot/efi" - device: "/dev/md1" - format: "vfat" - options: "-F32" - - path: "/" - device: "/dev/md2" - format: "ext4" -disks: - - device: "/dev/sda" - wipe: true - partitions: - - number: 1 - label: "efi" - size: 500 - type: GPTLinuxRaid - - number: 2 - label: "root" - size: 5000 - type: GPTLinuxRaid - - device: "/dev/sdb" - wipe: true - partitions: - - number: 1 - label: "efi" - size: 500 - type: GPTLinuxRaid - - number: 2 - label: "root" - size: 5000 - type: GPTLinuxRaid -raid: - - name: "/dev/md1" - level: 1 - devices: - - "/dev/sda1" - - "/dev/sdb1" - options: "--metadata=1.0" - - name: "/dev/md2" - level: 1 - devices: - - "/dev/sda2" - - "/dev/sdb2" - options: "--metadata=1.0" -``` - -The `s3-storage` layout matches the special situation on the s2-xlarge machines. 
- -```yaml ---- -id: s3-storage -constraints: - sizes: - - c1-large-x86 - - s2-xlarge-x86 - images: - debian: ">=10" - ubuntu: ">=20.04" - centos: ">=7" -filesystems: - - path: "/boot/efi" - device: "/dev/sde1" - format: "vfat" - options: "-F 32" - - path: "/" - device: "/dev/sde2" - format: "ext4" - - path: "/var/lib" - device: "/dev/sde3" - format: "ext4" -disks: - - device: "/dev/sde" - wipe: true - partitions: - - number: 1 - label: "efi" - size: 500 - type: GPTBoot - - number: 2 - label: "root" - size: 5000 - type: GPTLinux - - number: 3 - label: "varlib" - size: 0 # to end of partition - type: GPTLinux -``` - -A sample `lvm` layout which puts `/var/lib` as stripe on the nvme device - -```yaml ---- -id: lvm -description: "lvm layout" -constraints: - size: - - s2-xlarge-x86 - images: - debian: ">=10" - ubuntu: ">=20.04" - centos: ">=7" -filesystems: - - path: "/boot/efi" - device: "/dev/sda1" - format: "vfat" - createoptions: - - "-F 32" - label: "efi" - - path: "/" - device: "/dev/sda2" - format: "ext4" - label: "root" - - path: "/var/lib" - device: "/dev/vg00/varlib" - format: "ext4" - label: "varlib" - - path: "/tmp" - device: "tmpfs" - format: "tmpfs" - mountoptions: - [ - "defaults", - "noatime", - "nosuid", - "nodev", - "noexec", - "mode=1777", - "size=512M", - ] -volumegroups: - - name: "vg00" - devices: - - "/dev/nvmne0n1" - - "/dev/nvmne0n2" -logicalvolumes: - - name: "varlib" - volumegroup: "vg00" - size: 200 - lvmtype: "striped" -disks: - - device: "/dev/sda" - wipeonreinstall: true - partitions: - - number: 1 - label: "efi" - size: 500 - gpttype: "ef00" - - number: 2 - label: "root" - size: 5000 - gpttype: "8300" - - device: "/dev/nvmne0n1" - wipeonreinstall: false - - device: "/dev/nvmne0n2" - wipeonreinstall: false -``` - -## Components which requires modifications - -- metal-hammer: - - change implementation from build in hard coded logic - - move logic to create fstab from install.sh to metal-hammer -- metal-api: - - new endpoint 
`filesystemlayouts` - - add optional spec of `filesystemlayout` during `allocation` with validation if given `filesystemlayout` is possible on given size. - - add `allocation.filesystemlayout` in the response, based on either the specified `filesystemlayout` or the calculated one. - - implement `filesystemlayouts` validation for: - - matching to disks in the size - - no overlapping with the sizes/imagefilter specified in `filesystemlayouts` - - all devices specified exists from top to bottom (fs -> disks -> device || fs -> raid -> devices) -- metalctl: - - implement `filesystemlayouts` -- metal-go: - - adopt api changes -- metal-images: - - install mdadm for raid support diff --git a/versioned_docs/version-v0.21.9/contributing/01-Proposals/MEP8/filesystems.drawio b/versioned_docs/version-v0.21.9/contributing/01-Proposals/MEP8/filesystems.drawio deleted file mode 100644 index 0f0c6ab5..00000000 --- a/versioned_docs/version-v0.21.9/contributing/01-Proposals/MEP8/filesystems.drawio +++ /dev/null @@ -1,43 +0,0 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - \ No newline at end of file diff --git a/versioned_docs/version-v0.21.9/contributing/01-Proposals/MEP8/filesystems.png b/versioned_docs/version-v0.21.9/contributing/01-Proposals/MEP8/filesystems.png deleted file mode 100644 index 6d903b7e..00000000 Binary files a/versioned_docs/version-v0.21.9/contributing/01-Proposals/MEP8/filesystems.png and /dev/null differ diff --git a/versioned_docs/version-v0.21.9/contributing/01-Proposals/MEP9/README.md b/versioned_docs/version-v0.21.9/contributing/01-Proposals/MEP9/README.md deleted file mode 100644 index a8cae83d..00000000 --- a/versioned_docs/version-v0.21.9/contributing/01-Proposals/MEP9/README.md +++ /dev/null @@ -1,132 +0,0 @@ ---- -slug: /MEP-9-no-open-ports-to-the-data-center -title: MEP-9 -sidebar_position: 9 ---- - -# No Open Ports To the Data Center - -Our metal-stack partitions typically have open ports for metal-stack 
native services, these are: - -- SSH port on the firewalls -- bmc-reverse-proxy for serial console access through the metal-console - -These open ports are potential security risks. For example, while SSH access is possible only with private key it's still vulnerable to DoS attack. - -Therefore, we want to get rid off these open ports to reduce the attack surface to the data center. - -## Requirements - -- Access to firewall SSH only via VPN -- Easy to update VPN components - -As a next step, we can also consider joining the management servers to the VPN mesh, which would replace typical WireGuard setups for operators to enter resources inside the partition. - -## High Level Design - -[](./architecture.svg) - -> Simplified drawing showing old vs. new architecture. - -### Concerns - -There's few concerns when using WireGuard for implementing VPN: - -1. WireGuard doesn't implement dynamic cipher substitution. Which is important in case one of the crypto methods, used by WireGuard will be broken. The only possible solution for that will be to update WireGuard to a fixed version. -2. Coordination server(Headscale) is a single point of failure. In case it fails, it potentially can disconnect existing members of the network, as WireGuard can't manage dynamic IPs by itself. -3. Headscale is already falls behind Tailscale coordination server implementation. Which can complicate the upgrade to newer version of Tailscale client in case of emergency. - -### Solutions to concerns - -1. Tailscale node software is using userspace implementation of WireGuard -- `wireguard-go`. One of the options is to inject Tailscale client into `metalctl`. And make it available as `metalctl vpn` or similar command. It should be possible to do as `tailscale` node is already available as open sourced Go pkg. That would allow us to control, what version of Tailscale users are using and in case of any critical changes to enforce them to update `metalctl` to use VPN functionality. -2. 
Would it be a considerable risk? We could look into `wg-dynamic` project to cover this problem. -3. At the moment, repository looks well maintained and the metal-stack team already contributes to it. - -## Implementation Details - -### metal-roles - -`metal-roles` will be responsible for deployment of `headscale` server(via new `headscale` role). It also should provide sufficient config to `metal-api` so it establishes connection with `headscale` gRPC server. - -### New `metalctl` commands - -`metalctl` will be responsible for client-side implementation of this MEP. Specifically, it's by using `metalctl` user expected to connect to firewalls. - -- `metalctl vpn` -- section for VPN related commands: - - `metalctl vpn get key [vpn name] --namespace [namespace name]` -- returns auth key to be used with `tailscale` client for establishing connection. - -Extend `metalctl firewall`: - -- `metalctl firewall ssh [ID]` -- connect to firewall via SSH. - -Extend `metalctl machine`: - -- `metalctl machine ssh [ID]` -- connect to machine via SSH. - -`metalctl` will be able to connect to firewall and machines by running `tailscale` in container. - -### metal-api - -Updates to `metal-api` should be made, so that it's able to add firewalls to VPNs. There should be one Tailscale namespace per project. So if multiple firewalls are created in single project, they will join the same namespace. - -Two new flags should be introduced to connect `metal-api` to `headscale` gRPC server: - -- `headscale-addr` -- specifies address of Headscale grpc API. -- `headscale-api-key` -- specifies temporary API key to connect to Headscale. It should be replaced and then rotated by `metal-api`. - -If `metal-api` initialized with `headscale` connection it should automatically join all created firewalls to VPN. - -Add new endpoint, that will be used by `metalctl` to connect to VPN: - -- `/v1/vpn GET` -- requests auth key from `headscale` server. 
- -### metal-hammer - -`metal-hammer` acts as an intermediary for machine configuration between `metal-api` and machine's image. Specifically it writes to `/etc/metal/install.yaml` file, data from which later will be used by image's `install.sh` file. - -To implement VPN support we have to add authentication key and VPN server address to `install.yaml` file. This key will be used to join machine to a VPN. - -### metal-images - -Images `install.sh` script have to be updated to work with authentication key and VPN server address, provided in `install.yaml` file. If this key is present, machine should connect to VPN. - -### metal-networker - -`metal-networker` also have to know if VPN was configured. In that case we need to disable public access to SSH and allow all(?) traffic from WireGuard interface. - -### firewall-controller - -`firewall-controller` have to monitor changes in `Firewall` resource and keep `tailscaled` version up-to-date. - -### Resources - -Update `Firewall` resource to include desired/actual `tailscale` version: - -``` -Firewall: - Spec: - tailscale: - Version: Minimal version - ... - Status: - ... - VPN: - Status: Boolean field - tailscale: - Version: Actual version - ... -``` - -### bmc-reverse-proxy - -TODO - -## References - -1. [WireGuard: Next Generation Secure Network Tunnel](https://www.youtube.com/watch?v=88GyLoZbDNw) -2. [How Tailscale works](https://tailscale.com/blog/how-tailscale-works) -3. [Tailscale is officially SOC 2 compliant](https://tailscale.com/blog/soc2) -4. [Why not Wireguard](https://www.ipfire.org/blog/why-not-wireguard) -5. [Wireguard: Known Limitations](https://www.wireguard.com/known-limitations/) -6. [Wireguard: Things That Might Be Accomplished](https://www.wireguard.com/todo/) -7. 
[Headscale: Tailscale control protocol v2](https://github.com/juanfont/headscale/issues/526) diff --git a/versioned_docs/version-v0.21.9/contributing/01-Proposals/MEP9/architecture.drawio b/versioned_docs/version-v0.21.9/contributing/01-Proposals/MEP9/architecture.drawio deleted file mode 100644 index adb09214..00000000 --- a/versioned_docs/version-v0.21.9/contributing/01-Proposals/MEP9/architecture.drawio +++ /dev/null @@ -1,324 +0,0 @@ - - - - - - - -
-
-
- Metal Control Plane -
-
-
-
- - Metal Control Plane - -
-
- - - - -
-
-
- metal-stack -
- Partition -
-
-
-
- - metal-stack... - -
-
- - - - -
-
-
- firewall -
-
-
-
- - firewall - -
-
- - - - - -
-
-
- machine -
-
-
-
- - machine - -
-
- - - - -
-
-
- ssh -
-
-
-
- - ssh - -
-
- - - - -
-
-
- bmc-proxy -
-
-
-
- - bmc-proxy - -
-
- - - - - - - - - - -
-
-
- Metal Control Plane -
-
-
-
- - Metal Control Plane - -
-
- - - - -
-
-
- metal-stack -
- Partition -
-
-
-
- - metal-stack... - -
-
- - - - -
-
-
- firewall -
-
-
-
- - firewall - -
-
- - - - - -
-
-
- machine -
-
-
-
- - machine - -
-
- - - - -
-
-
- ssh -
-
-
-
- - ssh - -
-
- - - - - - -
-
-
- bmc-proxy -
-
-
-
- - bmc-proxy - -
-
- - - - -
-
-
- headscale -
-
-
-
- - headscale - -
-
- - - - - - - - - - -
-
-
- tailscaled -
-
-
-
- - tailscaled - -
-
- - - - - - -
-
-
- tailscaled -
-
-
-
- - tailscaled - -
-
- - - - -
-
-
- Internet -
-
-
-
- - Internet - -
-
- - - - -
-
-
- Internet -
-
-
-
- - Internet - -
-
-
- - - - - Viewer does not support full SVG 1.1 - - - -
diff --git a/versioned_docs/version-v0.21.9/contributing/01-Proposals/MEP9/architecture.svg b/versioned_docs/version-v0.21.9/contributing/01-Proposals/MEP9/architecture.svg deleted file mode 100644 index fd268d2f..00000000 --- a/versioned_docs/version-v0.21.9/contributing/01-Proposals/MEP9/architecture.svg +++ /dev/null @@ -1 +0,0 @@ -
Metal Control Plane
Metal Control Plane
metal-stack
Partition
metal-stack...
firewall
firewall
machine
machine
ssh
ssh
bmc-proxy
bmc-proxy
Metal Control Plane
Metal Control Plane
metal-stack
Partition
metal-stack...
firewall
firewall
machine
machine
ssh
ssh
bmc-proxy
bmc-proxy
headscale
headscale
tailscaled
tailscaled
tailscaled
tailscaled
Internet
Internet
Internet
Internet
Text is not SVG - cannot display
\ No newline at end of file diff --git a/versioned_docs/version-v0.21.9/contributing/01-Proposals/_category_.json b/versioned_docs/version-v0.21.9/contributing/01-Proposals/_category_.json deleted file mode 100644 index ec1a4ebc..00000000 --- a/versioned_docs/version-v0.21.9/contributing/01-Proposals/_category_.json +++ /dev/null @@ -1,4 +0,0 @@ -{ - "position": 3, - "label": "Enhancement Proposals" -} diff --git a/versioned_docs/version-v0.21.9/contributing/01-Proposals/index.md b/versioned_docs/version-v0.21.9/contributing/01-Proposals/index.md deleted file mode 100644 index 9046bdf5..00000000 --- a/versioned_docs/version-v0.21.9/contributing/01-Proposals/index.md +++ /dev/null @@ -1,69 +0,0 @@ ---- -slug: /enhancement-proposals -title: Enhancement Proposals -sidebar_position: 1 ---- - -# Metal Stack Enhancement Proposals (MEPs) - -This section contains proposals which address substantial modifications to metal-stack. - -Every proposal has a short name which starts with _MEP_ followed by an incremental, unique number. Proposals should be raised as pull requests in the [website](https://github.com/metal-stack/website) repository and can be discussed in Github issues. - -The list of proposals and their current state is listed in the table below. - -Possible states are: - -- `In Discussion` -- `Accepted` -- `Declined` -- `In Progress` -- `Completed` -- `Aborted` - -Once a proposal was accepted, an issue should be raised and the implementation should be done in a separate PR. 
- -| Name | Description | State | -| :------------------------ | :--------------------------------------------- | :-------------: | -| [MEP-1](MEP1/README.md) | Distributed Control Plane Deployment | `Declined` | -| [MEP-2](MEP2/README.md) | Two Factor Authentication | `Aborted` | -| [MEP-3](MEP3/README.md) | Machine Re-Installation to preserve local data | `Completed` | -| [MEP-4](MEP4/README.md) | Multi-tenancy for the metal-api | `In Progress` | -| [MEP-5](MEP5/README.md) | Shared Networks | `Completed` | -| [MEP-6](MEP6/README.md) | DMZ Networks | `Completed` | -| MEP-7 | Passing environment variables to machines | `Declined` | -| [MEP-8](MEP8/README.md) | Configurable Filesystemlayout | `Completed` | -| [MEP-9](MEP9/README.md) | No Open Ports To the Data Center | `Completed` | -| [MEP-10](MEP10/README.md) | SONiC Support | `Completed` | -| [MEP-11](MEP11/README.md) | Auditing ^of metal-stack resources | `Completed` | -| [MEP-12](MEP12/README.md) | Rack Spreading | `Completed` | -| [MEP-13](MEP13/README.md) | IPv6 | `Completed` | -| [MEP-14](MEP14/README.md) | Independence from external sources | `Completed` | -| MEP-15 | HAL Improvements | `In Discussion` | -| [MEP-16](MEP16/README.md) | Firewall Support for Cluster API Provider | `In Discussion` | -| [MEP-17](MEP17/README.md) | Global Network View | `In Discussion` | -| [MEP-18](MEP18/README.md) | Autonomous Control Plane | `In Discussion` | - -## Proposal Process - -1. Before starting a new proposal, it is advised to have a quick chat with one of the maintainers. -2. Create a draft pull request in the [website](https://github.com/metal-stack/website) repository with your proposal. Your proposal doesn't have to be finished at this point. -3. Share the PR in the [metal-stack Slack](https://metal-stack.slack.com/) and invite maintainers to review it. -4. The review itself will probably take place in multiple iterations. Don't be discouraged if your proposal is not accepted right away. 
The goal is to reach consensus. -5. Once your proposal is accepted, create an umbrella issue in the relevant repository or when multiple repositories are involved in the [releases](https://github.com/metal-stack/releases). -6. Other issues should be created in different repositories and linked to the umbrella issue. -7. Unless stated otherwise, the proposer is responsible for the implementation of the proposal. - -## How to Write a Good MEP - -In the first section of your MEP, start with the current situation and the motivation for the change. Summarize your proposal briefly. - -Next follows the main part: describe your proposal in detail. Which parts of of metal-stack are affected? Are there API changes? If yes, describe them and provide examples here. -Try to think of side effects your proposal might have. Try to provide a view on how your proposal affects users of metal-stack. -Highlight breaking changes and think of a migration path for existing users. If your proposal affects multiple components, try to describe the interaction between them. - -After the main part of your proposal, feel free to add additional sections, e.g. about alternatives that were considered, non-goals or future possibilities. - -Depending on the complexity of your proposal, you might want to add a section about the implementation plan or roadmap. - -You can have a look at the existing MEPs for inspiration. As you will notice: not every MEP has the same structure. Feel free to structure your MEP in a way that makes sense for your proposal. 
diff --git a/versioned_docs/version-v0.21.9/contributing/02-planning-meetings.md b/versioned_docs/version-v0.21.9/contributing/02-planning-meetings.md deleted file mode 100644 index ef602204..00000000 --- a/versioned_docs/version-v0.21.9/contributing/02-planning-meetings.md +++ /dev/null @@ -1,51 +0,0 @@ ---- -slug: /planning-meetings -title: Planning Meetings -sidebar_position: 2 ---- - -# Planning Meetings - -Public planning meetings are held **biweekly** on **odd calendar weeks** from **14:00 to 14:30** on Microsoft Teams. The purpose is to provide an overview of our current projects and priorities, as well as to discuss new topics and issues within the group. - -Our [development planning board](https://github.com/orgs/metal-stack/projects/34) can be found on GitHub. - -You can use [this link](https://teams.microsoft.com/l/meetup-join/19%3ameeting_ZTVmNWFkYjYtMzVmYi00ZTMxLTk5ZTUtMGFjYjU2OTk0MjQz%40thread.v2/0?context=%7b%22Tid%22%3a%22f9d9b921-8f78-466d-95fd-4495e73d8d65%22%2c%22Oid%22%3a%228ac2a791-e637-4a90-8505-0a1ee175ebfc%22%7d) to join. If you want to get an invitation to the event, please drop us a line on our Slack channel. - -Planning meetings are currently not recorded. The meetings are held either in English or German depending on the attendees. - -:::info -Note that anyone can contribute to metal-stack without participating in planning meetings. However, if you want to speed up the review process for your requirements, it might be helpful to attend the meetings. 
-::: - -## Agenda - -Here is the agenda that we generally want to follow in a planning meeting: - -- Possibility to bring up news that are interesting for every developer of the metal-stack org -- Check `Done` column and archive cards - - Attendees have the chance to briefly present achievements if they want -- Check the `In Progress` column and discuss whether these tasks are still worked on, there were significant blockers or they can be lower-prioritized -- Check new issues labelled with `triage` and prioritize them -- Allow attendees to bring up issues and prioritize them - - Attendees have the chance to briefly present these new issues - -## Idea Backlog - -The backlog contains ideas of what could become part of the roadmap in the future. The list is ordered alphabetically. Therefore, the order does not express the importance or weight of a backlog item. - -We incorporate community feedback into the roadmap. If you think that important points are missing in the backlog, please share your ideas with us. We have a Slack channel. Please check out [metal-stack.io](https://metal-stack.io) for contact information. - -:::danger -By no means this list is a promise of what is being worked on in the near future. It is just a summary of ideas that was agreed on to be "nice to have". It is up to the investors, maintainers and the community to choose topics from this list and to implement them or to remove them from the list. 
-::: - -- Add metal-stack to [Gardener conformance test grid](https://testgrid.k8s.io/gardener-all) -- Autoscaler for metal control plane components -- CI dashboard and public integration testing -- Improved release and deploy processes (GitOps, [Spinnaker](https://spinnaker.io/), [Flux](https://fluxcd.io/)) -- Machine internet without firewalls -- metal-stack dashboard (UI) -- Offer our metal-stack extensions as enterprise products (accounting, cluster-api, S3) (neither of them will ever be required for running metal-stack, they just add extra value for certain enterprises) -- Partition managed by Kubernetes (with Kubelets joining the control plane cluster) -- Public offering / demo playground diff --git a/versioned_docs/version-v0.21.9/contributing/03-contribution-guideline.md b/versioned_docs/version-v0.21.9/contributing/03-contribution-guideline.md deleted file mode 100644 index 15a73d0d..00000000 --- a/versioned_docs/version-v0.21.9/contributing/03-contribution-guideline.md +++ /dev/null @@ -1,147 +0,0 @@ ---- -slug: /contribution-guideline -title: Contribution Guideline -sidebar_position: 3 ---- - -# Contribution Guideline - -This document describes the way we want to contribute code to the projects of metal-stack, which are hosted on [github.com/metal-stack](https://github.com/metal-stack). - -The document is meant to be understood as a general guideline for contributions, but not as burden to be placed on a developer. Use your best judgment when contributing code. Try to be as clean and precise as possible when writing code and try to make your code as maintainable and understandable as possible for other people. - -Even if it should go without saying, we live an open culture of discussion, in which everybody is welcome to participate. We treat every contribution with respect and objectiveness with the general aim to write software of quality. - -If you want, feel free to propose changes to this document in a pull request. - -## How Can I Contribute? 
- -Open a Github issue in the project you would like to contribute. Within the issue, your idea can be discussed. It is also possible to directly create a pull request when the set of changes is relatively small. - -When opening an issue please consider the following aspects: - -1. Create a meaningful issue describing the WHY? of your contribution. -1. Try to set appropriate labels to the issue. For example, attach the `triage` label to your issue if you want it to be discussed in the next [planning meeting](./02-planning-meetings.md). It might be useful to attend the meeting if you want to emphasize it being worked on. - -### Pull Requests - -The process described here has several goals: - -- Maintain quality -- Enable a sustainable system to review contributions -- Enable documented and reproducible addition of contributions - -1. Create a repository fork within the context of that issue. Members of the organization may work on the repository directly without a fork, which allows building development artifacts more easily. -1. Develop, document and test your contribution (try not to solve more than one issue in a single pull request). -1. Create a Draft Pull Request to the repository's main branch. -1. Create a meaningful description of the pull request or reference the related issue. The pull request template explains what the content should include, please read it. -1. Ask for merging your contribution by removing the draft marker. Repository maintainers (see [Code Ownership](#code-ownership)) are notified automatically, but you can also reach out to people directly on Slack if you want a review from a specific person. - -## General Objectives - -This section contains language-agnostic topics that all metal-stack projects are trying to follow. - -### Code Ownership - -The code base is owned by the entire team and every member is allowed to contribute changes to any of the projects. This is considered as collective code ownership[^1]. 
- -As a matter of fact, there are persons in a project, which already have experience with the sources. These are defined directly in the repository's [CODEOWNERS](https://docs.github.com/en/repositories/managing-your-repositorys-settings-and-features/customizing-your-repository/about-code-owners) file. If you want to merge changes into the master branch, it is advisable to include code owners into the process of discussion and merging. - -### Microservices - -One major ambition of metal-stack is to follow the idea of [microservices](https://en.wikipedia.org/wiki/Microservices). This way, we want to achieve that we can - -- adapt to changes faster than with monolithic architectures, -- be free of restrictions due to certain choices of technology, -- leverage powerful traits of cloud infrastructures (e.g. high-scalability, high-availability, ...). - -### Programming Languages - -We are generally open to write code in any language that fits best to the function of the software. However, we encourage [golang](https://en.wikipedia.org/wiki/Go_(programming_language)) to be the main language of metal-stack as we think that it makes development faster when not establishing too many different languages in our architecture. Reason for this is that we are striving for consistent behavior of the microservices, similar to what has been described for the Twelve-Factor App (see [12 Factor](https://12factor.net/)). We help enforcing unified behavior by allowing a small layer of shared code for every programming language. We will refer to this shared code as "libraries" for the rest of this document. - -### Artifacts - -Artifacts are always produced by a CI process (Github Actions). - -Docker images are published on the Github Container Registry of the metal-stack organization. - -Binary artifacts or OS images can be uploaded to `images.metal-stack.io` if necessary. 
- -When building Docker images, please consider our build tool [docker-make](https://github.com/fi-ts/docker-make) or the specific [docker-make action](https://github.com/fi-ts/action-docker-make) respectively. - -### APIs - -We are currently making use of [Swagger](https://swagger.io/) when we exposing traditional REST APIs for end-users. This helps us with being technology-agnostic as we can generate clients in almost any language using [go-swagger](https://goswagger.io/). Swagger additionally simplifies the documentation of our APIs. - -Most APIs though are not required to be user-facing but are of technical nature. These are preferred to be implemented using [grpc](https://grpc.io/). - -#### Versioning - -Artifacts are versioned by tagging the respective repository with a tag starting with the letter `v`. After the letter, there stands a valid [semantic version](https://semver.org/). - -### Documentation - -In order to make it easier for others to understand a project, we document general information and usage instructions in a `README.md` in any project. - -In addition to that, we document a microservice in the [docs](https://github.com/metal-stack/docs) repository. The documentation should contain the reasoning why this service exists and why it was being implemented the way it was being implemented. The aim of this procedure is to reduce the time for contributors to comprehend architectural decisions that were made during the process of writing the software and to clarify the general purpose of this service in the entire context of the software. - -## Guidelines - -This chapter describes general guidelines on how to develop and contribute code for a certain programming language. 
- -### Golang - -Development follows the official guide to: - -- Write clear, idiomatic Go code[^2] -- Learn from mistakes that must not be repeated[^3] -- Apply appropriate names to your artifacts: - - [https://go.dev/talks/2014/names.slide](https://go.dev/talks/2014/names.slide) - - [https://go.dev/blog/package-names](https://go.dev/blog/package-names) - - [https://go.dev/doc/effective_go#names](https://go.dev/doc/effective_go#names) -- Enable others to understand the reasoning of non-trivial code sequences by applying a meaningful documentation. - -#### Development Decisions - -- **Dependency Management** by using Go modules -- **Build and Test Automation** by using [GNU Make](https://man7.org/linux/man-pages/man1/make.1p.html). -- **End-user APIs** should consider using go-swagger and [Go-Restful](https://github.com/emicklei/go-restful) - **Technical APIs** should consider using [grpc](https://grpc.io/) - -#### Libraries - -metal-stack maintains several libraries that you should utilize in your project in order to unify common behavior. Some of these projects are: - -- [metal-go](https://github.com/metal-stack/metal-go) -- [metal-lib](https://github.com/metal-stack/metal-lib) - -#### Error Handling with Generated Swagger Clients - -From the server-side you should ensure that you are returning the common error json struct in case of an error as defined in the `metal-lib/httperrors`. Ensure you are using `go-restful >= v2.9.1` and `go-restful-openapi >= v0.13.1` (allows default responses with error codes other than 200). - -### Documentation - -We want to share knowledge and keep things simple. If things cannot kept simple we want to enable everybody to understand them by: - -- Document in short sentences[^4]. -- Do not explain the HOW (this is already documented by your code and documenting the obvious is considered a defect). -- Explain the WHY. Add a "to" in your documentation line to force yourself to explain the reasonning (e.g. "` to `"). 
- -### Python - -Development follows the official guide to: - -- Style Guide for Python Code (PEP 8)[^5] - - The use of an IDE like [PyCharm](https://www.jetbrains.com/pycharm/) helps to write compliant code easily -- Consider [setuptools](https://pythonhosted.org/an_example_pypi_project/setuptools.html) for packaging -- If you want to add a Python microservice to the mix, consider [pyinstaller](https://github.com/pyinstaller/pyinstaller) on Alpine to achieve small image sizes - -[^1]: [https://martinfowler.com/bliki/CodeOwnership.html](https://martinfowler.com/bliki/CodeOwnership.html) - -[^2]: [https://go.dev/doc/effective_go](https://go.dev/doc/effective_go) - -[^3]: [https://github.com/golang/go/wiki/CodeReviewComments](https://github.com/golang/go/wiki/CodeReviewComments) - -[^4]: [https://github.com/golang/go/wiki/CodeReviewComments#comment-sentences](https://github.com/golang/go/wiki/CodeReviewComments#comment-sentences) - -[^5]: [https://www.python.org/dev/peps/pep-0008/](https://www.python.org/dev/peps/pep-0008/) diff --git a/versioned_docs/version-v0.21.9/contributing/04-release-flow.md b/versioned_docs/version-v0.21.9/contributing/04-release-flow.md deleted file mode 100644 index 2a6403b7..00000000 --- a/versioned_docs/version-v0.21.9/contributing/04-release-flow.md +++ /dev/null @@ -1,107 +0,0 @@ ---- -slug: /release-flow -title: Release Flow -sidebar_position: 4 ---- - -# Releases - -The metal-stack contains of many microservices that depend on each other. The automated release flow is there to ensure that all components work together flawlessly for every metal-stack release. - -Releases and integration tests are published through our [release repository](https://github.com/metal-stack/releases). You can also find the [release notes](https://github.com/metal-stack/releases/releases) for this metal-stack version in there. The release notes contain information about new features, upgrade paths and bug fixes. 
- -If you want, you can sign up at our Slack channel where we are announcing every new release. Often, we provide additional information for metal-stack administrators and adopters at this place, too. - -This document is intended for developers, especially maintainers of metal-stack projects. - -## Release Flow - -The following diagram attempts to describe our current release flow: - -![](release_flow.svg) - -A release is created in the following way: - -- Individual repository maintainers within the metal-stack GitHub Organization can publish a release of their component. -- This release is automatically pushed to the `develop` branch of the release repository by the metal-robot. -- A push triggers a virtual release integration test using the mini-lab environment. This setup launches metal-stack with the `sonic` and `gardener` flavors to validate the different Ansible roles and execute basic operations across the metal-stack layer. -- To contribute components that are not directly part of the release vector, a pull request must be made against the `develop` branch of the release repository. Release maintainers may push directly to the `develop` branch. -- The release maintainers can `/freeze` the `develop` branch, effectively stopping the metal-robot from pushing component releases to this branch. -- The `develop` branch is tagged by a release maintainer with a `-rc.x` suffix to create a __release candidate__. -- The release candidate must pass a large integration test suite on a real environment, which is currently run by FI-TS. It tests the entire machine provisioning engine including the integration with Gardener, the deployment, metal-images and Kubernetes conformance tests. -- If the integration tests pass, the PR of the `develop` branch must be approved by at least two release maintainers. -- A release is created via GitHub releases, including all release notes, with a tag on the `main` branch. 
- -## FAQ - -**Question: I need PR #xyz to go into the release, why did you not include it?** - -Answer: It's not on purpose if we miss a PR to be included into a metal-stack release. Please use the pending pull request from `develop` into `master` as soon as it is open and comment which pull request you want to have included into the release. Also consider attending our planning meetings or contact us in our Slack channel if you have urgent requirements that need to be dealt with. - -**Question: Who is responsible for the releases? Who can freeze a release?** - -Answer: Every repository in metal-stack has a `CODEOWNERS` file pointing to a maintainer team. This is also true for the releases repository. Only release repository maintainers are allowed to `/freeze` a release (meaning the metal-robot does not automatically append new component releases to the release vector anymore). - -**Question: I can't push to the `develop` branch of this repository? How can I request changes to the release vector?** - -Answer: Most changes are automatically integrated by the metal-robot. For manually managed components, please raise a pull request against the `develop` branch. Only release maintainers are allowed to push to `develop` as otherwise it would be possible to mess up the release pipeline. - -**Question: What requirements need to be fulfilled to add a repository to the release vector?** - -Please see the section below named [Requirements for Release Vector Repositories](#requirements-for-release-vector-repositories). 
- -### Requirements for Release Vector Repositories - -Before adding a repository in the metal-stack org to the releases repository, it is advised for the maintainer to fulfill the following points: - -- The following files should be present at the repository root: - - [CODEOWNERS](https://docs.github.com/en/repositories/managing-your-repositorys-settings-and-features/customizing-your-repository/about-code-owners) - - When a repository is created, the metal-robot automatically creates a -maintainers team in our GitHub org. - - The CODEOWNERS file should reference this team. - - The team should contain at least two maintainers. - - `LICENSE` - - This usually should be MIT with "metal-stack" as authors. - - `CONTRIBUTING.md` - - This should contain the following content: - ``` - # Contributing - - Please check out the [contributing section](https://docs.metal-stack.io/stable/development/contributing/) in our [docs](https://docs.metal-stack.io/). - ``` - - `README.md` -- The `developers-core` team should be given repository access with `write` role, the codeowners team should have the `maintain` role -- Release artifacts should have an SPDX-formatted SBOM attached. - - For container images these are embedded using Buildx. -- The following branch protection rules should be set: - - The mainline should be protected. - - A pull request should be required before merging (required by at least one code owner). - - Status checks should be required to pass. - - Force push should not be allowed on this branch. -- One person from the releases maintainers has to add the repository to the metal-robot in order to pick up the releases, add them to the release vector and generate release notes. - -### How-To Release a Project - -[release-drafter](https://github.com/release-drafter/release-drafter) is preferred in order to generate release notes from merged PRs for your projects. It should be triggered for pushes on your main branch. 
- -The draft is then used to create a project release. The release has to be published through the GitHub UI as demonstrated in the screenshot below. - -**Tagging the repository is not enough as repository tagging does not associate your release notes to your release!** - -![](release.png) - -Some further remarks: - -- Use semver versions with `v` prefix for your tags -- Name your release after your release tag -- The metal-robot only picks up lines from your release notes that start with `-` or `*` (unordered list items) and appends them to the according section in the aggregated release draft -- A tag created through a GitHub UI release does not trigger a `push` event . This means, your pipeline will not start to run with the `push` trigger when publishing through the UI. - - Instead, use the `published` [release event trigger](https://docs.github.com/en/actions/reference/events-that-trigger-workflows#release) for your actions: - - ```yaml - on: - release: - types: - - published - ``` -- In case they are necessary, please do not forget to include `NOTEWORTHY`, `ACTIONS_REQUIRED` or `BREAKING_CHANGE` sections into releases. More information on those release draft sections can be read in a pull request template. 
diff --git a/versioned_docs/version-v0.21.9/contributing/05-community.md b/versioned_docs/version-v0.21.9/contributing/05-community.md deleted file mode 100644 index 61eaf099..00000000 --- a/versioned_docs/version-v0.21.9/contributing/05-community.md +++ /dev/null @@ -1,11 +0,0 @@ ---- -slug: /community -title: Community -sidebar_position: 5 -draft: true ---- - -# Community - -(Slack channel, community events like FOSDEM, Kubernetes Community Days..., blog -articles) diff --git a/versioned_docs/version-v0.21.9/contributing/release.png b/versioned_docs/version-v0.21.9/contributing/release.png deleted file mode 100644 index 598b1182..00000000 Binary files a/versioned_docs/version-v0.21.9/contributing/release.png and /dev/null differ diff --git a/versioned_docs/version-v0.21.9/contributing/release_flow.drawio b/versioned_docs/version-v0.21.9/contributing/release_flow.drawio deleted file mode 100644 index 6ca6b34f..00000000 --- a/versioned_docs/version-v0.21.9/contributing/release_flow.drawio +++ /dev/null @@ -1,721 +0,0 @@ - - - - - - - - - - - -
-
-
- Review release notes -
-
-
-
- - Review release notes - -
-
-
- - - - - - - -
-
-
- projects -
-
-
-
- - projects - -
-
-
- - - - - - - -
-
-
- projects -
-
-
-
- - projects - -
-
-
- - - - - - - - -
-
-
- Organization Webhook -
-
-
-
- - Organization Webhook - -
-
-
- - - - - - - -
-
-
- projects -
-
-
-
- - projects - -
-
-
- - - - - - - - -
-
-
- - Publish release - -
-
-
-
- - Publish release - -
-
-
- - - - - - - - -
-
-
- Maintainer -
-
-
-
- - Maint... - -
-
-
- - - - - - - - - - - - - - - -
-
-
- metal-robot release handler -
-
-
-
- - metal-robot release han... - -
-
-
- - - - - - - - -
-
-
- - no - -
-
-
-
- - no - -
-
-
- - - - - - - - -
-
-
- - yes - -
-
-
-
- - yes - -
-
-
- - - - - - - -
-
-
- version in event newer than release vector version -
-
-
-
- - version in event newer than... - -
-
-
- - - - - - - -
-
-
- - do nothing - -
-
-
-
- - do nothing - -
-
-
- - - - - - - - - - - - -
-
-
- Github Action -
-
-
-
- - Github Action - -
-
-
- - - - - - - -
-
-
- Bump version in release vector and push to - - develop - -
-
-
-
- - Bump version in release vector... - -
-
-
- - - - - - - - - - - -
-
-
- Open pull request from - - develop - - to - - master - -
-
-
-
- - Open pull request from develop... - -
-
-
- - - - - - - -
-
-
- Update aggregated release draft in - - metal-stack/releases - -
-
-
-
- - Update aggregated release draf... - -
-
-
- - - - - - - - - - - - - - - -
-
-
- Integration Testing -
-
-
-
- - Integration Testing - -
-
-
- - - - - - - - - - - -
-
-
- Merge to - - master - -
-
-
-
- - Merge to master - -
-
-
- - - - - - - - - - - - -
-
-
- Review -
-
-
-
- - Review - -
-
-
- - - - - - - - - - - - - - - -
-
-
- Tests suceeded and PR changes reviewed -
-
-
-
- - Tests suceeded and PR chang... - -
-
-
- - - - - - - -
-
-
- - publish results to #integration - -
-
-
-
- - publish results to #integr... - -
-
-
- - - - - - - - - - - - - - - -
-
-
- Release metal-stack -
-
-
-
- - Release metal-stack - -
-
-
- - - - - - - - - - - -
-
-
- - publish to #announcements - -
-
-
-
- - publish to #announcements - -
-
-
- - - - - - - -
-
-
- - - metal-stack/docs - - pull request - -
-
-
-
- - metal-stack/docs pull requ... - -
-
-
- - - - - - - - - - - - -
-
-
- Freeze -
-
-
-
- - Freeze - -
-
-
- - - - - - - - - - - - - - - -
-
-
- Freeze - - develop - - and create a release candidate -
-
-
-
- - Freeze develop and create a rel... - -
-
-
- - - - - - - -
-
-
- Large integration suites -
- - (currently owned by FI-TS, not public) - -
-
-
-
-
- - Large integration suites... - -
-
-
- - - - - - - - -
-
-
- Run -
-
-
-
- - Run - -
-
-
- - - - -
- - - - - Text is not SVG - cannot display - - - -
diff --git a/versioned_docs/version-v0.21.9/contributing/release_flow.svg b/versioned_docs/version-v0.21.9/contributing/release_flow.svg deleted file mode 100644 index 55cdd493..00000000 --- a/versioned_docs/version-v0.21.9/contributing/release_flow.svg +++ /dev/null @@ -1 +0,0 @@ -
Review release notes
Review release notes
projects
projects
projects
projects
Organization Webhook
Organization Webhook
projects
projects
Publish release
Publish release
Maintainer
Maint...
metal-robot release handler
metal-robot release han...
no
no
yes
yes
version in event newer than release vector version
version in event newer than...
do nothing
do nothing
Github Action
Github Action
Bump version in release vector and push todevelop
Bump version in release vector...
Open pull request fromdeveloptomaster
Open pull request from develop...
Update aggregated release draft inmetal-stack/releases
Update aggregated release draf...
Integration Testing
Integration Testing
Merge tomaster
Merge to master
Review
Review
Tests suceeded and PR changes reviewed
Tests suceeded and PR chang...
publish results to #integration
publish results to #integr...
Release metal-stack
Release metal-stack
publish to #announcements
publish to #announcements
metal-stack/docspull request
metal-stack/docs pull requ...
Freeze
Freeze
Freezedevelopand create a release candidate
Freeze develop and create a rel...
Large integration suites
(currently owned by FI-TS, not public)
Large integration suites...
Run
Run
Text is not SVG - cannot display
\ No newline at end of file diff --git a/versioned_docs/version-v0.21.9/docs/02-General/04-flavors-of-metalstack.md b/versioned_docs/version-v0.21.9/docs/02-General/04-flavors-of-metalstack.md index 7da427fc..2277ca6b 100644 --- a/versioned_docs/version-v0.21.9/docs/02-General/04-flavors-of-metalstack.md +++ b/versioned_docs/version-v0.21.9/docs/02-General/04-flavors-of-metalstack.md @@ -14,7 +14,7 @@ As modern infrastructure and cloud native applications are designed with Kuberne Regardless which flavor of metal-stack you use, it is always possible to manually provision machines, networks and ip addresses. This is the most basic way of using metal-stack and is very similar to how traditional bare metal infrastructures are managed. -Using plain metal-stack without additional layer was not a focus in the past. Therefore firewall and role management might be premature. These will be addressed by [MEP-4](../../contributing/01-Proposals/MEP4/README.md) and [MEP-16](../../contributing/01-Proposals/MEP16/README.md) in the future. +Using plain metal-stack without additional layer was not a focus in the past. Therefore firewall and role management might be premature. These will be addressed by [MEP-4](/community/MEP-4-multi-tenancy-for-the-metal-api) and [MEP-16](/community/MEP-16-metal-api-as-an-alternative-configuration-source-for-the-firewall-controller) in the future. ## Gardener diff --git a/versioned_docs/version-v0.21.9/docs/04-For Operators/03-deployment-guide.md b/versioned_docs/version-v0.21.9/docs/04-For Operators/03-deployment-guide.md index 3a73e919..441b240b 100644 --- a/versioned_docs/version-v0.21.9/docs/04-For Operators/03-deployment-guide.md +++ b/versioned_docs/version-v0.21.9/docs/04-For Operators/03-deployment-guide.md @@ -31,7 +31,7 @@ You can use the [mini-lab](https://github.com/metal-stack/mini-lab) as a templat The metal control plane is typically deployed in a Kubernetes cluster. 
Therefore, this document will assume that you have a Kubernetes cluster ready for getting deployed. Even though it is theoretically possible to deploy metal-stack without Kubernetes, we strongly advise you to use the described method because we believe that Kubernetes gives you a lot of benefits regarding the stability and maintainability of the application deployment. :::tip -For metal-stack it does not matter where your control plane Kubernetes cluster is located. You can of course use a cluster managed by a hyperscaler. This has the advantage of not having to setup Kubernetes by yourself and could even become beneficial in terms of fail-safe operation. However, we also describe a solution of how to setup metal-stack with a self-hosted, [Autonomous Control Plane](../../contributing/01-Proposals/MEP18/README.md) cluster. The only requirement from metal-stack is that your partitions can establish network connections to the metal control plane. If you are interested, you can find a reasoning behind this deployment decision [here](../05-Concepts/01-architecture.md#target-deployment-platforms). +For metal-stack it does not matter where your control plane Kubernetes cluster is located. You can of course use a cluster managed by a hyperscaler. This has the advantage of not having to setup Kubernetes by yourself and could even become beneficial in terms of fail-safe operation. However, we also describe a solution of how to setup metal-stack with a self-hosted, [Autonomous Control Plane](/community/MEP-18-autonomous-control-plane) cluster. The only requirement from metal-stack is that your partitions can establish network connections to the metal control plane. If you are interested, you can find a reasoning behind this deployment decision [here](../05-Concepts/01-architecture.md#target-deployment-platforms). 
::: Let's start off with a fresh folder for your deployment: diff --git a/versioned_docs/version-v0.21.9/docs/05-Concepts/01-architecture.md b/versioned_docs/version-v0.21.9/docs/05-Concepts/01-architecture.md index 3c81cc0a..61602bf0 100644 --- a/versioned_docs/version-v0.21.9/docs/05-Concepts/01-architecture.md +++ b/versioned_docs/version-v0.21.9/docs/05-Concepts/01-architecture.md @@ -150,4 +150,4 @@ Thus, for creating a partition as well as a machine or a firewall, the flags `dn In order to be fully offline resilient, make sure to check out `metal-image-cache-sync`. This component provides copies of `metal-images`, `metal-kernel` and `metal-hammer`. -This feature is related to [MEP14](../../contributing/01-Proposals/MEP14/README.md). +This feature is related to [MEP14](/community/MEP-14-independence-from-external-sources). diff --git a/versioned_docs/version-v0.21.9/docs/05-Concepts/02-user-management.md b/versioned_docs/version-v0.21.9/docs/05-Concepts/02-user-management.md index f1ee2778..ba742ee9 100644 --- a/versioned_docs/version-v0.21.9/docs/05-Concepts/02-user-management.md +++ b/versioned_docs/version-v0.21.9/docs/05-Concepts/02-user-management.md @@ -7,7 +7,7 @@ sidebar_position: 2 # User Management At the moment, metal-stack can more or less be seen as a low-level API that does not scope access based on projects and tenants. -Fine-grained access control with full multi-tenancy support is actively worked on in [MEP4](../../contributing/01-Proposals/MEP4/README.md). +Fine-grained access control with full multi-tenancy support is actively worked on in [MEP4](/community/MEP-4-multi-tenancy-for-the-metal-api). Until then projects and tenants can be created, but have no effect on access control. 
diff --git a/versioned_docs/version-v0.21.9/docs/06-For CISOs/Security/01-principles.md b/versioned_docs/version-v0.21.9/docs/06-For CISOs/Security/01-principles.md index 02318fbe..a288346c 100644 --- a/versioned_docs/version-v0.21.9/docs/06-For CISOs/Security/01-principles.md +++ b/versioned_docs/version-v0.21.9/docs/06-For CISOs/Security/01-principles.md @@ -15,7 +15,7 @@ The minimal need to know principle is a security concept that restricts access t ### RBAC :::info -As of now metal-stack does not implement fine-grained Role-Based Access Control (RBAC) within the `metal-api` but this is worked on in [MEP-4](../../../contributing/01-Proposals/MEP4/README.md). +As of now metal-stack does not implement fine-grained Role-Based Access Control (RBAC) within the `metal-api` but this is worked on in [MEP-4](/community/MEP-4-multi-tenancy-for-the-metal-api). ::: As described in our [User Management](../../05-Concepts/02-user-management.md) concept the [metal-api](https://github.com/metal-stack/metal-api) currently offers three different user roles for authorization: diff --git a/versioned_docs/version-v0.21.9/docs/06-For CISOs/Security/04-communication-matrix.md b/versioned_docs/version-v0.21.9/docs/06-For CISOs/Security/04-communication-matrix.md index 07df2607..24c1bc1d 100644 --- a/versioned_docs/version-v0.21.9/docs/06-For CISOs/Security/04-communication-matrix.md +++ b/versioned_docs/version-v0.21.9/docs/06-For CISOs/Security/04-communication-matrix.md @@ -116,7 +116,7 @@ Please note that every [networking setup](../../05-Concepts/03-Network/01-theory | VLAN | Switches, Firewalls | Layer 2 traffic segmentation. | | VXLAN | Switches, Firewalls | Encapsulate Layer 2 frames in Layer 3 packets for network virtualization. | | EVPN | Switches, Firewalls | Overlay network technology for scalable and flexible network architectures. | -| VPN | Firewalls | Management access [without open SSH ports](../../../contributing/01-Proposals/MEP9/README.md). 
| +| VPN | Firewalls | Management access [without open SSH ports](/community/MEP-9-no-open-ports-to-the-data-center). | | BGP | Multiple | Routing protocol for dynamic routing and network management. | | SSH | Management Server, Switches | Secure shell access for management and configuration. | | LLDP | Switches, Machines | Link Layer Discovery Protocol for network device discovery. | diff --git a/versioned_docs/version-v0.21.9/docs/06-For CISOs/rbac.md b/versioned_docs/version-v0.21.9/docs/06-For CISOs/rbac.md index 9a87b896..06c902bb 100644 --- a/versioned_docs/version-v0.21.9/docs/06-For CISOs/rbac.md +++ b/versioned_docs/version-v0.21.9/docs/06-For CISOs/rbac.md @@ -31,4 +31,4 @@ To ensure that internal components interact securely with the metal-api, metal-s Users can interact with the metal-api using [metalctl](https://github.com/metal-stack/metalctl), the command-line interface provided by metal-stack. Depending on the required operations, users should authenticate with the appropriate role to match their level of access. -As part of [MEP-4](../../contributing/01-Proposals/MEP4/README.md), significant work is underway to introduce more fine-grained access control mechanisms within metal-stack, enhancing the precision and flexibility of permission management. +As part of [MEP-4](/community/MEP-4-multi-tenancy-for-the-metal-api), significant work is underway to introduce more fine-grained access control mechanisms within metal-stack, enhancing the precision and flexibility of permission management. 
diff --git a/versioned_docs/version-v0.21.9/docs/06-For CISOs/remote-access.md b/versioned_docs/version-v0.21.9/docs/06-For CISOs/remote-access.md index 0b8dbb19..dc24e82f 100644 --- a/versioned_docs/version-v0.21.9/docs/06-For CISOs/remote-access.md +++ b/versioned_docs/version-v0.21.9/docs/06-For CISOs/remote-access.md @@ -6,7 +6,7 @@ title: Remote Access ## Machines and Firewalls -Remote access to machines and firewalls is essential for performing administrative tasks such as incident management, troubleshooting and sometimes for development. Standard SSH access is often insufficient for these purposes. In many cases, direct serial console access is required to fully manage the system. metal-stack follows a security-first approach by not offering direct SSH access to machines. This practice reduces the attack surface and prevents unauthorized access that could lead to system damage. Detailed information can be found in [MEP-9](../../contributing/01-Proposals/MEP9/README.md). Administrators can access machines in two primary ways. +Remote access to machines and firewalls is essential for performing administrative tasks such as incident management, troubleshooting and sometimes for development. Standard SSH access is often insufficient for these purposes. In many cases, direct serial console access is required to fully manage the system. metal-stack follows a security-first approach by not offering direct SSH access to machines. This practice reduces the attack surface and prevents unauthorized access that could lead to system damage. Detailed information can be found in [MEP-9](/community/MEP-9-no-open-ports-to-the-data-center). Administrators can access machines in two primary ways. 
**Out-of-band management via SOL** @@ -26,4 +26,4 @@ This approach uses the [`metal-console`](../08-References/Control%20Plane/metal- Both methods ensure secure and controlled access to machines without exposing them unnecessarily to the network, maintaining the integrity and safety of the infrastructure. -Connecting directly to a machine without a clear plan of action can have unintended consequences and negatively impact stability. For this reason, administrative privileges are required. This restriction ensures that only authorized personnel with the necessary expertise can perform actions that affect the underlying infrastructure. These principles will evolve with the introduction of [MEP-4](../../contributing/01-Proposals/MEP4/README.md). \ No newline at end of file +Connecting directly to a machine without a clear plan of action can have unintended consequences and negatively impact stability. For this reason, administrative privileges are required. This restriction ensures that only authorized personnel with the necessary expertise can perform actions that affect the underlying infrastructure. These principles will evolve with the introduction of [MEP-4](/community/MEP-4-multi-tenancy-for-the-metal-api). 
\ No newline at end of file diff --git a/versioned_docs/version-v0.22.0/contributing/01-Proposals/MEP1/Distributed-API-Working.png b/versioned_docs/version-v0.22.0/contributing/01-Proposals/MEP1/Distributed-API-Working.png deleted file mode 100644 index 899e223d..00000000 Binary files a/versioned_docs/version-v0.22.0/contributing/01-Proposals/MEP1/Distributed-API-Working.png and /dev/null differ diff --git a/versioned_docs/version-v0.22.0/contributing/01-Proposals/MEP1/Distributed-API.png b/versioned_docs/version-v0.22.0/contributing/01-Proposals/MEP1/Distributed-API.png deleted file mode 100644 index 688c7c2e..00000000 Binary files a/versioned_docs/version-v0.22.0/contributing/01-Proposals/MEP1/Distributed-API.png and /dev/null differ diff --git a/versioned_docs/version-v0.22.0/contributing/01-Proposals/MEP1/Distributed-Deployment.png b/versioned_docs/version-v0.22.0/contributing/01-Proposals/MEP1/Distributed-Deployment.png deleted file mode 100644 index 8bba51b8..00000000 Binary files a/versioned_docs/version-v0.22.0/contributing/01-Proposals/MEP1/Distributed-Deployment.png and /dev/null differ diff --git a/versioned_docs/version-v0.22.0/contributing/01-Proposals/MEP1/Distributed.drawio b/versioned_docs/version-v0.22.0/contributing/01-Proposals/MEP1/Distributed.drawio deleted file mode 100644 index f7c6fe79..00000000 --- a/versioned_docs/version-v0.22.0/contributing/01-Proposals/MEP1/Distributed.drawio +++ /dev/null @@ -1 +0,0 @@ 
-7V3bcts2EP0aP8pDgPfH2ImbziRTt+m0zVMHpmCJMUUoFG1L/fqC4kUkAEoULwDkRC8WIBIkF2d3z+6C8JV5u9r+kqD18jOZ4+gKGvPtlfn+CtKPY9M/Wc8u7/FtM+9YJOE87wKHji/hf7joNIre53CON40DU0KiNFw3OwMSxzhIG30oSchr87BHEjWvukYLzHV8CVDE9/4dztNl3uvZxqH/Iw4Xy/LKwCh+eUDB0yIhz3FxvStoPu4/+c8rVI5VHL9Zojl5rXWZH67M24SQNP+22t7iKJNtKbb8vLuWX6v7TnCcdjkhWVoPJP5zF3/w/trOvG+vv70+z4rJe0HRMy4fw4noeDePhA5L5YmC/Afn+3N2pzcfcfSC0zBAh67s4dJd1DwuO3+22U/3O3oAgOtt/Qxnkf29R0kapiGJy6vS288vnP9eSK4aHFIhrrOvz6voLkEr+vXmdRmm+Ms6v89XClLat0xXEW0B+nU/QziTgUFb1SRkjYCswqD4HqEHHN1UU3pLIpLQn2ISZ9fYpAl5qvABise7Q6swymD/F07mKEZFd4FxkA2LonAR00ZAJwkn1RO94CTF29aJBBU8qNphssJpsqOHFCfMbLeAVKFywC0x+VpDcNm3rKHX9opOVGjNohr9gBz6pQDPGUACkEPSBif0OWkfHGEayyfzMqlWj2QaI4kUcCK1eJGajkikYASRrrboX/K79Qh+vYe77+Ef8RMBM4+TaECCp4SgYClDoJz0BDJuF6jFCNQ0O8oTGiPI055D4NsPc8+Yo0cAwMy0OJHhOfUDRZMk6ZIsSIyiD4dexnTUZDdHm+W+H3SwHNTE3YXZne5HwfH8Xea1souucZz3NH+v24+OZqZ1xrKHPDpfCY5QGr40naFI9PtT6a2jXe2ANQnjdFMb+T7rOMDAAoxaGdBvOqkzT6Bf8nsQn256LadXd7whz0mAi9MYQFVi6a+zrsCfDsTd4ZhPhKwL0H3DaborEICeU9LE5x50JcyCCG02mZ9rYBEcQ00upCOPWdAGOt4Cp0eOc8ZG4SCDypOdmkE1ADdTpVGlPGFNtTkTUuXQI/yYNTfUvobx4tO+9d50xrKeVhPHlsDBAyiwns5Uzsg5Krt2Dy9fdpUMVMgOuCh4QLZredg2fedhBji5MQT7bObckZJzBNt498OQ7HKm3Wm4jW0zXsbmEWaL6LdlTqWegLdesv1MC+Hp72T8SSgMxxkoN2yhquUYuZubjDP4nImgI6JohtahRmbVYsxqlcBR5pLKkPMtYb4U6uSgh23xmSTQlw9aRz3apJmNT5FGsIeedrA3j1ExzfMC0G6BnZS0gFieloCivcGYDXQN2oBeURu4mLCNtRXqozZwMWEbSy80kJ0ol6ModLv5GbqNgjIuza9B5OYp9zbjs1hJoRub7qVs4tqofWBzwKkp7UUEcmx6TD2hrQrkb0gDJqq/8PUSnk8r1IAKjXoHdWx2XQMVgAKuwerEoXLIhAd8dw3neBum/2R4vva8ovl137SL1vttgfZ9Y1dr3OMkpM+X+eWiNkmfNR8LOGW7NljWPIy2b+3qLXa8TurVllE/Hca4HVWw4fv5SS/7hmZc2KyxYzNoailNCkYymJHY2HhqNb/kDAS3MgFUpFBdDgL+IDkI2DUHAfXKQcCLyUFwpWMABSuZJHu3i8lCcMVjHaR3Mg8hbY3mzxLyVCVkv3Miwp0MZ9omIg4UtuSsX2vkVsxfB/goVXXnAxGRxeMuImHBEzZDoCxybXKZDdRPV/rj3pSUsuBKz9Jxb15GmoKiTD/g84mKywn9gK9f5GfysbRy0zJF5FcuwD8Z+Zn22GZo2PzwkbmmkV/1opk+oYt5PGzWKPCzWFOrgfD4qPln/fnC4z4AtAv7LNEKddYB/Zilh6PpmNOOrGsKU1H9AVg+g6neBQhQJR0lMXhLVIGIN4QCVhuXwr9TKqTvIm2fzKdYGr6f1vqiZKXxdkPhT6h7f822Or/VZnXU7IE
qa9sN/Hjsy9tTKxmfHvoJlrPB4koCi8nSf8Pyr53Dx5WLHZ75o3V4nacX6ewFT9ch0chWM6badQSW2pVpqUvd11DXpGbj7dELwS3qw754LjspafPhnobJeMC9YK88Jeko8Uo1j+Oe5XL0UzGna0RTsu7JKwSA8WU+u8fKxLs4OKauxrd1lk/zEElvFtpmv7k7OdCe0Hjm4WNLtc8Onwiu7POMzn2WW9DGTHM1U19Q6QCmVDMtWgS0D9mv12WmcfZwiiHS50+bWjOCtNglU1WgVRMWFMXpk70U4vBxOi8spERYM2hoJy1tV64McMqSVqFwhQ/ZvNfhsww6FuNN/RahlHekcxKUyxSrz4G6auOFqtGtejEtKZS31o0tfPVlhTPTsBlEWdkrTwda6HQyX+fuZMc/QQHa9hvltrLzGmc0t7IbbQM6vjKSJd6Uswb2VU31rMG9JELP5l3U83lXSYJSiRmdPPdosablaKDb1VTyywfdPgH0uZaSf5rjhpJbBi3HTvLhaNNOqglF2bWxUs2keGNnTk7Vvs7ti9+02dfZZsEld19noUQhJ1EVlvSsFZ+MBTwZ0gqfu2AmdVIqPM4aaGQHTc7RV1tHXu45ENoMvxRuZjJZRNo+c5KWew4SHrcBgHLRqdkEY0TtFhSRhMf5KrUb8gost1bYY3WK1NnJNxdUNT181nuaEvi4hhf4ULn1UJvTOrcG3r++PYoy+BehDNLy4sO0wWTLtOq1QZMdPUdELOjKnZUittxtUpkVgr2sEKjZoNKSyTBDnSd1aEDUK43DRheGb9cBcvL4MhvZdrx7/PjBWZ+jIq9ZBmo4E7KlkqHgNUH+2tGpF1rVcw4otfwoliX//6mUqH/FJdzuZKI3cxlT/btycqX5EMGmlhdKNaESrtl5lod67l5GnjPC7P+QZIuaFjx+xkRmmw8ML8Jss2km9Ua73GjuMhIgvWz7iMor2q7uCEDHXzbB/vMJg90zcrzFWWIB6OHjVUwpHDqlw/SUf39Kx1QYYMtr6oN/qDoI7ctPFJm4zpnhiUycrdrECZLOGubZ9FO0Mu/3hnxD18SwWtdwZNe+KdatjVws8UTAtaQCF7414JYb2p0mNbZK5Ir23dMXuRy38UT/JmAk5NJmQrLtlw6uLUHr5Wcyx9kR/wM= \ No newline at end of file diff --git a/versioned_docs/version-v0.22.0/contributing/01-Proposals/MEP1/Distributed.png b/versioned_docs/version-v0.22.0/contributing/01-Proposals/MEP1/Distributed.png deleted file mode 100644 index d96ca216..00000000 Binary files a/versioned_docs/version-v0.22.0/contributing/01-Proposals/MEP1/Distributed.png and /dev/null differ diff --git a/versioned_docs/version-v0.22.0/contributing/01-Proposals/MEP1/README.md b/versioned_docs/version-v0.22.0/contributing/01-Proposals/MEP1/README.md deleted file mode 100644 index 0fd4bb63..00000000 --- a/versioned_docs/version-v0.22.0/contributing/01-Proposals/MEP1/README.md +++ /dev/null @@ -1,141 +0,0 @@ ---- -slug: /MEP-1-distributed-metal-control-plane -title: MEP-1 -sidebar_position: 1 ---- - -# Distributed Metal Control Plane - -This enhancement 
proposal was replaced by [MEP18](../MEP18/README.md). - -## Problem Statement - -We face the situation that we argue for running bare metal on-premises because this way the customers can control where and how their software and data are processed and stored. -On the other hand, we have currently decided that our metal-api control plane components run on a kubernetes cluster (in our case on a cluster provided by one of the available hyperscalers). - -Running the control plane on Kubernetes has the following benefits: - -- Ease of deployment -- Get most, if not all, of the required infrastructure services like (probably incomplete): - - IPs - - DNS - - L7-Loadbalancing - - Storage - - S3 Backup - - High Availability - -Using a kubernetes as a service offering from one of the hyperscalers, enables us to focus on using kubernetes instead of maintaining it as well. - -## Goal - -It would be much saner if metal-stack has no, or only minimal dependencies to external services. Imagine a metal-stack deployment in a plant, it would be optimal if we only have to deliver a single rack with servers and networking gear installed and wired, plug that rack to the power supply and a internet uplink and its ready to go. - -Have a second plant which you want to be part of all your plants? Just tell both that they are part of something bigger and metal-api knows of two partitions. - -## Possible Solutions - -We can think of two different solutions to this vision: - -1. Keep the central control plane approach and require some sort of kubernetes deployment accessible from the internet. This has the downside that the user must, provide a managed kubernetes deployment in his own datacenter or uses a hyperscaler. Still not optimal. -1. Install the metal-api and all its dependencies in every partition, replicate or shard the databases to every connected partition, make them know each other. 
Connect the partitions over the internet with some sort of vpn to make the services visible to each other. - -As we can see, the first approach does not really address the problem, therefore i will describe solution #2 in more details. - -## Central/Current setup - -### Stateful services - -Every distributed system suffer from handling state in a scalable, fast and correct way. To start how to cope with the state, we first must identify which state can be seen as partition local only and which state must be synchronous for read, and synchronous for writes across partitions. - -Affected states: - -- masterdata: e.g. tenant and project must be present in every partition, but these are entities which are read often but updates are rare. A write can therefore be visible with a decent delay in a distinct partition with no consequences. -- ipam: the prefixes and ip´s allocated from machines. These entities are also read often and rare updates. But we must differentiate between dirty reads for different types. A machine network is partition local, ips acquired from such a network must by synchronous in the same partition. Ips acquired from global networks such as internet must by synchronous for all partitions, as otherwise a internet ip could be acquired twice. -- vrf ids: they must only be unique in one partition -- image and size configurations: read often, written seldom, so no high requirements on the storage of these entities. -- images: os images are already replicated from a central s3 storage to a per partition s3 service. metal-hammer kernel and initrd are small and pull always from the central s3, can be done similar to os images. -- machine and machine allocation: must be only synchronous in the partition -- switch: must be only synchronous in the partition -- nsq messages: do not need to cross partition boundaries. No need to keep the messages persistent, even the opposite is true, we don't want to have the messages persist for a longer period. 
- -Now we can see that the most critical state to held and synchronize are the IPAM data, because these entities must be guaranteed to be synchronously updated, while being updated frequently. - -Datastores: - -We use three different types of datastores to persist the states of the metal application. - -- rethinkdb is the main datastore for almost all entities managed by metal-api -- postgresql is used for masterdata and ipam data. -- nsq uses disk and memory tho store the messages. - -### Stateless services - -These are the easy part, all of our services which are stateless can be scaled up and down without any impact on functionality. Even the stateful services like masterdata and metal-api rely fully on the underlying datastore and can therefore also be scaled up and down to meet scalability requirements. - -Albeit, most of these services need to be placed behind a loadbalancer which does the L4/L7 balancing across the started/available replicas of the service for the clients talking to it. This is actually provided by kubernetes with either service type loadbalancer or type clusterip. - -One exception is the `metal-console` service which must have the partition in it´s dns name now, because there is no direct network connectivity between the management networks of the partitions. See "Network Setup) - -## Distributed setup - -### State - -In order to replicate certain data which must be available across all partitions we can use on of the existing open source databases which enable such kind of setup. There are a few available out there, the following incomplete list will highlight the pro´s and cons of each. - -- RethinkDB - - We already store most of our data in RethinkDB and it gives already the ability to synchronize the data in a distributed manner with different guarantees for consistency and latency. This is described here: [Scaling, Sharding and replication](https://rethinkdb.com/docs/sharding-and-replication/). 
But because rethinkdb has a rough history and unsure future with the last release took more than a year, we in the team already thought that we eventually must move away from rethinkdb in the future. - -- Postgresql - - Postgres does not have a multi datacenter with replication in both directions, it just can make the remote instance store the same data. - -- CockroachDB - - Is a Postgresql compatible database engine on the wire. CockroachDB gives you both, ACID and geo replication with writes allowed from all connected members. It is even possible to configure [Follow the Workload](https://www.cockroachlabs.com/docs/stable/topology-follow-the-workload) and [Geo Partitioning and Replication](https://www.cockroachlabs.com/docs/v19.2/topology-geo-partitioned-replicas). - -If we migrate all metal-api entities to be stored the same way we store masterdata, we could use cockroachdb to store all metal entities in one ore more databases spread across all partitions and still ensure consistency and high availability. - -A simple setup how this would look like is shown here. - -![Simple CockroachDB setup](Distributed.png) - -go-ipam was modified in a example PR here: [PR 17](https://github.com/metal-stack/go-ipam/pull/17) - -### API Access - -In order to make the metal-api accessible for api users like `cloud-api` or `metalctl` as easy at it is today, some effort has to be taken. One possible approach would be to use a external loadbalancer which spread the requests evenly to all metal-api endpoints in all partitions. Because all data are accessible from all partitions, a api request going to partition A with a request to create a machine in partition B, will still work. If on the other hand partition B is not in a connected state because the interconnection between both partitions is broken, then of course the request will fail. 
- -**IMPORTANT** -The NSQ Message to inform `metal-core` must end in the correct partition - -To provide such a external loadbalancer we have several opportunities: - -- Cloudflare or comparable CDN service. -- BGP Anycast from every partition - -Another setup would place a small gateway behind the metal-api address, which forwards to the metal-api in the partition where the request must be executed. This gateway, `metal-api-router` must inspect the payload, extract the desired partition, and forward the request without any modifications to the metal-api endpoint in this partition. This can be done for all requests, or if we want to optimize, only for write accesses. - -## Network setup - -In order to have the impact to the overall security concept as minimal as possible i would not modify the current network setup. The only modifications which has to be made are: - -- Allow https ingress traffic to all metal-api instances. -- Allow ssh ingress traffic to all metal-console instances. -- Allow CockroachDB Replication between all partitions. -- No NSQ traffic from outside required anymore, except we cant solve the topic above. - -A simple setup how this would look like is shown here, this does not work though because of the forementioned NSQ issue. - -![API and Console Access](Distributed-API.png) - -Therefore we need the `metal-api-router`: - -![Working API and Console Access](Distributed-API-Working.png) - -## Deployment - -The deployment of our components will substantially differ in a partition compared to a the deployment we have actually. Deploying it in kubernetes in the partition would be very difficult to achieve because we have no sane way to deploy kubernetes on physical machines without a underlying API. -I would therefore suggest to deploy our components in the same way we do that for the services running on the management server. Use systemd to start docker containers. 
- -![Deployment](Distributed-Deployment.png) diff --git a/versioned_docs/version-v0.22.0/contributing/01-Proposals/MEP10/README.md b/versioned_docs/version-v0.22.0/contributing/01-Proposals/MEP10/README.md deleted file mode 100644 index 6811cdc0..00000000 --- a/versioned_docs/version-v0.22.0/contributing/01-Proposals/MEP10/README.md +++ /dev/null @@ -1,197 +0,0 @@ ---- -slug: /MEP-10-sonic-support -title: MEP-10 -sidebar_position: 10 ---- - -# SONiC Support - -As writing this proposal, metal-stack only supports Cumulus on Broadcom ASICs. Unfortunately, after the acquisition of -Cumulus Networks by Nvidia, Broadcom decided to cut its relationship with Cumulus, and therefore Cumulus 4.2 is the last -version that supports Broadcom ASICs. Since trashing the existing hardware is not a solution, adding support for a -different network operating system is necessary. - -One of the remaining big players is [SONiC](https://sonic-net.github.io/SONiC/), which Microsoft created to scale the -network of Azure. It's an open-source project and is now part of the [Linux Foundation](https://www.linuxfoundation.org/press/press-release/software-for-open-networking-in-the-cloud-sonic-moves-to-the-linux-foundation). - -For a general introduction to SONiC, please follow the [Architecture](https://github.com/sonic-net/SONiC/wiki/Architecture) official -documentation. - -## ConfigDB - -On a cold start, the content of `/etc/sonic/config_db.json` will be loaded into the Redis database `CONFIG_DB`, and both -contain the switch's configuration except the BGP unnumbered configuration, which still has to be configured directly by -the frr configuration files. The SONiC community is working to remove this exception, but no release date is known. - -## BGP Configuration - -Frr runs inside a container, and a shell script configured it on the container startup. 
For BGP unnumbered, we must set -the configuration variable `docker_routing_config_mode` to `split` to prevent SONiC from overwriting our configuration -files created by `metal-core`. But by using the split mode, the integrated configuration mode of frr is deactivated, and -we have to write our BGP configuration to the daemon-specific files `bgp.conf`, `staticd.conf`, and `zebra.conf` instead -to `frr.conf`. - -```bash -elif [ "$CONFIG_TYPE" == "split" ]; then - echo "no service integrated-vtysh-config" > /etc/frr/vtysh.conf - rm -f /etc/frr/frr.conf -``` - -Reference: [docker-init](https://github.com/sonic-net/sonic-buildimage/blob/202205/dockers/docker-fpm-frr/docker_init.sh#L69) - -Adding support for the integrated configuration mode, we must at least adjust the startup shell script and the supervisor configuration: - -```bash -{% if DEVICE_METADATA.localhost.docker_routing_config_mode is defined and DEVICE_METADATA.localhost.docker_routing_config_mode == "unified" %} -[program:vtysh_b] -command=/usr/bin/vtysh -b -``` - -Reference: [supervisord.conf](https://github.com/sonic-net/sonic-buildimage/blob/202205/dockers/docker-fpm-frr/frr/supervisord/supervisord.conf.j2#L157) - -## Non-BGP Configuration - -For the Non-BGP configuration we have to write it into the Redis database directly or via one of the following interfaces: - -- `config replace ` -- the Mgmt Framework -- the SONiC restapi - -Directly writing into the Redis database isn't a stable interface, and we must determine the create, delete, and update -operations on our own. The last point is also valid for the Mgmt Framework and the SONiC restapi. Furthermore, the -Mgmt Framework doesn't start anymore for several months, and a [potential fix](https://github.com/sonic-net/sonic-buildimage/pull/10893) -is still not merged. And the SONiC restapi isn't enabled by default, and we must build and maintain our own SONiC images. 
- -Using `config replace` would reduce the complexity in the `metal-core` codebase because we don't have to determine the -actual changes between the running and the desired configuration. The approach's drawbacks are using a version of SONiC -that contains the PR [Yang support for VXLAN](https://github.com/sonic-net/sonic-buildimage/pull/7294), and we must provide -the whole new startup configuration to prevent unwanted deconfiguration. - -### Configure Loopback interface and activate VXLAN - -```json -{ - "LOOPBACK_INTERFACE": { - "Loopback0": {}, - "Loopback0|": {} - }, - "VXLAN_TUNNEL": { - "vtep": { - "src_ip": "" - } - } -} -``` - -#### Configure MTU - -```json -{ - "PORT": { - "Ethernet0": { - "mtu": "9000" - } - } -} -``` - -#### Configure PXE Vlan - -```json -{ - "VLAN": { - "Vlan4000": { - "vlanid": "4000" - } - }, - "VLAN_INTERFACE": { - "Vlan4000": {}, - "Vlan4000|": {} - }, - "VLAN_MEMBER": { - "Vlan4000|": { - "tagging_mode": "untagged" - } - }, - "VXLAN_TUNNEL_MAP": { - "vtep|map_104000_Vlan4000": { - "vlan": "Vlan4000", - "vni": "104000" - } - } -} -``` - -#### Configure VRF - -```json -{ - "INTERFACE": { - "Ethernet0": { - "vrf_name": "vrf104001" - } - }, - "VLAN": { - "Vlan4001": { - "vlanid": "4001" - } - }, - "VLAN_INTERFACE": { - "Vlan4001": { - "vrf_name": "vrf104001" - } - }, - "VRF": { - "vrf104001": { - "vni": "104001" - } - }, - "VXLAN_TUNNEL_MAP": { - "vtep|map_104001_Vlan4001": { - "vlan": "Vlan4001", - "vni": "104001" - } - } -} -``` - -## DHCP Relay - -The DHCP relay container only starts if `DEVICE_METADATA.localhost.type` is equal to `ToRRouter`. - -## LLDP - -SONiC always uses the local port subtype for LLDP and sets it to some freely configurable alias field of the interface. - -```python -# Get the port alias. If None or empty string, use port name instead -port_alias = port_table_dict.get("alias") -if not port_alias: - self.log_info("Unable to retrieve port alias for port '{}'. 
Using port name instead.".format(port_name)) - port_alias = port_name - -lldpcli_cmd = "lldpcli configure ports {0} lldp portidsubtype local {1}".format(port_name, port_alias) -``` - -Reference: [lldpmgr](https://github.com/sonic-net/sonic-buildimage/blob/202205/dockers/docker-lldp/lldpmgrd#L153) - -## Mgmt Interface - -The mgmt interface is `eth0`. To configure a static IP address and activate the Mgmt VRF, use: - -```json -{ - "MGMT_INTERFACE": { - "eth0|": { - "gwaddr": "" - } - }, - "MGMT_VRF_CONFIG": { - "vrf_global": { - "mgmtVrfEnabled": "true" - } - } -} -``` - -[IP forwarding is deactivated on `eth0`](https://github.com/sonic-net/sonic-buildimage/blob/202205/files/image_config/sysctl/sysctl-net.conf#L7), and no IP Masquerade is configured. diff --git a/versioned_docs/version-v0.22.0/contributing/01-Proposals/MEP11/README.md b/versioned_docs/version-v0.22.0/contributing/01-Proposals/MEP11/README.md deleted file mode 100644 index 87f48a10..00000000 --- a/versioned_docs/version-v0.22.0/contributing/01-Proposals/MEP11/README.md +++ /dev/null @@ -1,78 +0,0 @@ ---- -slug: /MEP-11-auditing-of-metal-stack-resources -title: MEP-11 -sidebar_position: 11 ---- - -# Auditing of metal-stack resources - -Currently no logs of the ownership of resources like machines, networks, ips and volumes are generated or kept. Though due to legal requirements data centers are required to keep track of this ownership over time to prevent liability issues when opening the platform for external users. - -In this proposal we want to introduce a flexible and low-maintenance approach for auditing on top of [Meilisearch](https://www.meilisearch.com/). - -## Overview - -In general our auditing logs will be collected by a request interceptor or middleware. Every request and response will be processed and eventually logged to Meilisearch. -Meilisearch will be configured to regularly create chunks of the auditing logs. 
These finished chunks will be backed up to a S3 compatible storage with a read-only option enabled. - -Of course sensitive data like session keys or passwords will be redacted before logging. We want to track relevant requests and responses. If auditing the request fails, the request itself will be aborted and will not be processed further. The requests and responses that will be audited will be annotated with a correlation id. - -Transferring the meilisearch auditing data chunks to the S3 compatible storage will be done by a sidecar cronjob that is executed periodically. -To avoid data manipulation the S3 compatible storage will be configured to be read-only. - -## Whitelisting - -To reduce the amount of unnecessary logs we want to introduce a whitelist of resources and operations on those that should be logged. -Other requests will be passed directly to the next middleware or web service without any further processing. - -As we are only interested in mutating endpoints, we ignore all `GET` requests. -The whitelist includes all `POST`, `PUT`, `PATCH` and `DELETE` endpoints of the HTTP middleware except for the following (non-manipulating) route suffixes: - -- `/find` -- `/notify` -- `/try` and `/match` -- `/capacity` -- `/from-hardware` - -Regarding GRPC audit trails, they are not so interesting because only internal clients are using this API. However, we can log the trails of the `Boot` service, which can be interesting to revise the machine lifecycle. - -## Chunking in Meilisearch - -We want our data to be chunked in Meilisearch. To accomplish this, we rotate the index identifier on a scheduled basis. The index identifiers will be derived from the current date and time. - -To keep things simple, we only support hourly, daily and monthly rotation. The eventually prefixed index names will only include relevant parts of date and time like `2021-01`, `2021-01-01` or `2021-01-01_13`. 
- -The metal-api will only write to the current index and switches to the new index on rotation. The metal-api will never read or update data in any indices. - -## Moving chunks to S3 compatible storage - -As Meilisearch will be filled with data over time, we want to move completed chunks to an S3 compatible storage. This will be done by a sidecar cronjob that is executed periodically. Note that the periods of the index rotation and the cronjob execution don't have to match. - -When the backup process gets started, it initiates a [Meilisearch dump](https://www.meilisearch.com/docs/learn/advanced/dumps) of the whole database across all indices. Once the returned task is finished, the dump must be copied from a Meilisearch volume to the S3 compatible storage. After a successful copy, the dump can be deleted. - -Now we want to remove all indices from Meilisearch, except the most recent one. For this, we [get all indices](https://www.meilisearch.com/docs/reference/api/indexes#list-all-indexes), sort them and [delete each index](https://www.meilisearch.com/docs/reference/api/indexes#delete-an-index) except the most recent one to avoid data loss. - -For the actual implementation, we can build upon [backup-restore-sidecar](https://github.com/metal-stack/backup-restore-sidecar). But due to the index rotation and the fact that older indices need to be deleted, this probably does not fit into the mentioned sidecar. - -## S3 compatible storage - -The dumps of chunks should automatically be deleted after a certain amount of time, once we are either no longer allowed or required to keep them. -The default retention time will be 6 months. Ideally, already uploaded chunks should be read-only to prevent data manipulation. - -A candidate for the S3 compatible storage is Google Cloud Storage, which allows configuring automatic expiration of objects through a [lifecycle rule](https://cloud.google.com/storage/docs/managing-lifecycles?hl=en#storage-set-lifecycle-config-go). 
- -## Affected components - -- metal-api grpc server needs an auditing interceptor -- metal-api web server needs an auditing filter chain / middleware -- metal-api needs new command line arguments to configure the auditing -- mini-lab needs a Meilisearch instance -- mini-lab may need a local S3 compatible storage -- we need a sidecar to implement the backup to S3 compatible storage -- Consider auditing of volume allocations and freeings outside of metal-stack - -## Alternatives considered - -Instead of using Meilisearch we investigated using an immutable database like [immudb](https://immudb.io/). But immudb does not support chunking of data and due to its immutable nature, we will never be able to free up space of expired data. Even if we are legally allowed or required to delete data, we will not be able to do so with immudb. - -In another variant of the Meilisearch approach the metal-api would also be responsible for copying chunks to the S3 compatible storage and deleting old indices. But separating the concerns allows completely different implementations for every deployment stage. diff --git a/versioned_docs/version-v0.22.0/contributing/01-Proposals/MEP12/README.md b/versioned_docs/version-v0.22.0/contributing/01-Proposals/MEP12/README.md deleted file mode 100644 index 65532c57..00000000 --- a/versioned_docs/version-v0.22.0/contributing/01-Proposals/MEP12/README.md +++ /dev/null @@ -1,34 +0,0 @@ ---- -slug: /MEP-12-rack-spreading -title: MEP-12 -sidebar_position: 12 ---- - -# Rack Spreading - -Currently, when creating a machine through the metal-api, the machine is placed randomly inside a partition. This algorithm does not consider spreading machines across different racks and different chassis. This may lead to the situation that a group of machines (that for example form a cluster) can end up being placed in the same rack and the same chassis. 
- -Spreading a group of machines across racks can enhance availability for scenarios like a rack losing power or a chassis meltdown. - -So, instead of just randomly deciding the placement of a machine candidate, we want to propose a placement strategy that attempts to spread machine candidates across the racks inside a partition. - -Furthermore a followup improvement to guarantee that machines are really spread across multiple racks, even if multiple machines are ordered in parallel, was implemented with [PR490](https://github.com/metal-stack/metal-api/pull/490). - -## Placement Strategy - -Machines in the project are spread across all available racks evenly within a partition (best effort). For this, an additional request to the datastore has to be made in order to find allocated machines within the project in the partition. - -The algorithm will then figure out the least occupied racks and elect a machine candidate randomly from those racks. - -The user can optionally pass placement tags which will be considered for spreading the machines as well (this will for example allow spreading by a cluster id tag inside the same project). - -## API - -```golang -// service/v1/machine.go - -type MachineAllocation struct { - // existing fields are omitted for readability - PlacementTags []string `json:"placement_tags" description:"by default machines are spread across the racks inside a partition for every project. 
if placement tags are provided, the machine candidate has an additional anti-affinity to other machines having the same tags"` -} -``` diff --git a/versioned_docs/version-v0.22.0/contributing/01-Proposals/MEP13/README.md b/versioned_docs/version-v0.22.0/contributing/01-Proposals/MEP13/README.md deleted file mode 100644 index 2dde20f5..00000000 --- a/versioned_docs/version-v0.22.0/contributing/01-Proposals/MEP13/README.md +++ /dev/null @@ -1,111 +0,0 @@ ---- -slug: /MEP-13-dual-stack-support -title: MEP-13 -sidebar_position: 13 ---- - -# Dual-stack Support - -Dual-stack support is required to be able to create Kubernetes clusters with either IPv6 single-stack or dual-stack enabled. -With the inherent scarcity of IPv4 addresses, the need to be able to use IPv6 has increased. - -Full IPv6 dual-stack support was added to Kubernetes with v1.23 as stable. - -Gardener has had full IPv6 dual-stack support since `v1.109`. - -metal-stack manages CIDRs and IP addresses with the [go-ipam](https://github.com/metal-stack/go-ipam) library, which already got full IPv6 support in 2021 (see [https://metal-stack.io/blog/2021/02/ipv6-part1](https://metal-stack.io/blog/2021/02/ipv6-part1)). -But this was only the foundation; more work needs to be done to get full IPv6 support for all aspects managed by metal-stack.io. - -## General Decisions - -For the general decision we do not look at the isolated clusters feature for now as this would make the solution even more complex and we want to introduce IPv6 in smaller steps to the users. - -### Networks - -Currently, metal-stack organizes CIDRs / prefixes into a `network` resource in the metal-api. A network can consist of multiple CIDRs from the same address family. For example, if an operator wants to provide Internet connectivity to provisioned machines, they can start with small network CIDRs. The number of managed network prefixes can then be expanded as needed over time. 
- -With dual-stack we have to choose between two options: Network per address family or networks with both address families. These options are described in the next section. - -#### Network per Address Family - -This means that we allow networks with CIDRs from one address family only, one for IPv4 and one for IPv6. - -The machine creation process will not change if the machine only needs to be either IPv4 or IPv6 addressable. -But if, on the other hand, the machine needs to be able to connect to both address families, the machine creation needs to specify two networks, one for IPv4 and one for IPv6. -Also, there will be two distinct VRF IDs for every network with a different address family. - -#### Network with both Address Families - -Make a network dual address family capable, meaning that you can add multiple CIDRs from both address families to a network. -Then the machine creation will remain the same for single-stack and dual-stack cases, but the ip address allocation will need to specify the address family from which to allocate an ip address when the network is dual-stack. -This does not break the existing API, but allows existing extensions to easily add dual-stack support. -To avoid additional checking of which address families are available on this network during an ip allocation call, we could store the address families in the network. - -#### Decision - -The decision was made to go with having both address families in a single network entity because we think this is the most flexible way to support dual-stack machines and Kubernetes clusters as well as single-stack with the least amount of modifications on the networking side. 
- -### Examples - -To illustrate the usage we start by creating a tenant super network which has both address families: - -```yaml ---- -id: tenant-super-network-mini-lab -name: Project Super Network -description: Super network of all project networks -partitionid: mini-lab -prefixes: - - 10.0.0.0/16 - - 2001:db8:0:10::/64 -defaultchildprefixlength: - IPv4: 22 - IPv6: 96 -privatesuper: true -``` - -In order to create this network, we simply call: - -```bash -metalctl network create -f tenant-super.yaml -``` - -This is usually done during the initial setup of the environment. - -The next step is to allocate a tenant network where the machines of a project can be placed: - -```bash -metalctl network allocate --partition mini-lab --project 4b9b17c4-2d7c-4190-ae95-dda44e430fa6 --name my-node-network -``` - -This leads to the following network allocation: - -```yaml -id: 2d2c0350-3f66-4597-ae97-ef6797232212 -name: my-node-network -parentnetworkid: tenant-super-network-mini-lab -partitionid: mini-lab -prefixes: - - 10.0.0.0/22 - - 2001:db8:0:10::/96 -projectid: 4b9b17c4-2d7c-4190-ae95-dda44e430fa6 -vrf: 20 -consumption: - ipv4: - available_ips: 1024 - available_prefixes: 256 - used_ips: 2 - used_prefixes: 0 - ipv6: - available_ips: 2147483647 - available_prefixes: 1073741824 - used_ips: 1 - used_prefixes: 0 -privatesuper: false -``` - -Users can then create IP addresses from these child networks. By default, they retrieve an IPv4 address unless the super network only consists of IPv6 prefixes. In the latter case the users acquire an IPv6 address. 
- -```bash -metalctl network ip create --network 2d2c0350-3f66-4597-ae97-ef6797232212 --project 4b9b17c4-2d7c-4190-ae95-dda44e430fa6 -``` diff --git a/versioned_docs/version-v0.22.0/contributing/01-Proposals/MEP14/README.md b/versioned_docs/version-v0.22.0/contributing/01-Proposals/MEP14/README.md deleted file mode 100644 index 47c06434..00000000 --- a/versioned_docs/version-v0.22.0/contributing/01-Proposals/MEP14/README.md +++ /dev/null @@ -1,36 +0,0 @@ ---- -slug: /MEP-14-independence-from-external-sources -title: MEP-14 -sidebar_position: 14 ---- - -# Independence from external sources - -In certain situations some customers may need to operate and create machines without making use of external services like DNS or NTP through the internet. To make this possible, all metal-stack components reaching external services need to be configurable with custom endpoints. - -So far, the following components have been identified as requiring changes: - -- pixiecore -- metal-hammer -- metal-images - -More components are likely to be added to the list during processing. -For DNS and NTP servers it should be possible to provide default values within a partition. They can either be inherited by machines and firewalls or overwritten with their own ones. - -## pixiecore - -An NTP server endpoint needs to be configured on the pixiecore. This can be achieved by providing it through environment variables on startup. - -## metal-hammer - -If using a self-deployed NTP server, the metal-hammer also needs to be configured with it. For backward compatibility, default values from `pool.ntp.org` and `time.google.com` are used. - -## metal-images - -Configurations for the `metal-images` are different for machines and firewalls. - -## metalctl - -In order to pass DNS and NTP servers to partitions and machines while creating them, the flags `dnsservers` and `ntpservers` need to be added. 
- -The implementation of this MEP will make it possible for metal-stack to create and maintain machines without requiring an internet connection. diff --git a/versioned_docs/version-v0.22.0/contributing/01-Proposals/MEP16/README.md b/versioned_docs/version-v0.22.0/contributing/01-Proposals/MEP16/README.md deleted file mode 100644 index 205670ab..00000000 --- a/versioned_docs/version-v0.22.0/contributing/01-Proposals/MEP16/README.md +++ /dev/null @@ -1,318 +0,0 @@ ---- -slug: /MEP-16-metal-api-as-an-alternative-configuration-source-for-the-firewall-controller -title: MEP-16 -sidebar_position: 16 ---- - -# metal-api as an Alternative Configuration Source for the firewall-controller - -In the current situation, a firewall as provisioned by metal-stack is a fully immutable entity. Any modifications on the firewall like changing the firewall ruleset must be done _somehow_ by the user – the metal-api and hence metal-stack is not aware of its current state. - -As part of our [integration with the Gardener project](https://docs.metal-stack.io/stable/overview/kubernetes/#Gardener) we offer a solution called the [firewall-controller](https://github.com/metal-stack/firewall-controller), which is part of our [firewall OS images](https://github.com/metal-stack/metal-images/blob/6318a624861b18a559a9d37299bca5f760eef524/firewall/Dockerfile#L57-L58) and addresses shortcomings of the firewall resource's immutability, which would otherwise be completely impracticable to work with. The firewall-controller crashes infinitely if it is not properly configured through the userdata when using the firewall image of metal-stack. - -The firewall-controller approach is tightly coupled to Gardener and it requires the administrator of the Gardener installation to pass a shoot and a seed kubeconfig through machine userdata when creating the firewall. 
How this userdata has to look like is not documented and is just part of another project called the [firewall-controller-manager](https://github.com/metal-stack/firewall-controller-manager) (FCM), which task is to orchestrate rolling updates of firewall machines in a way that network traffic interruption is minimal when updating a firewall or applying a change to an immutable firewall configuration. - -In general, a firewall entity in metal-stack has similarities to the machine entity but it has a fundamental difference: A user gains ownership over a machine after provisioning. They can access it through SSH, modify it at will and this is completely wanted. For firewalls, however, we do not want a user to access the provisioned firewall as the firewall is a privileged part of the infrastructure with access to the underlay network. The underlay can not be tampered with at any given point in time by a user as it can destroy the entire network traffic flow inside a metal-stack partition. - -For this reason, we have a gap in the metal-stack project in terms of a missing solution for people who do not rely on the Gardener integration. We are basically leaving a user with the option to implement an orchestrated recreation of every possible change on the firewall to minimize traffic interruption for the machines sitting behind the firewall or re-implement the firewall-controller to how they want to use it for their use-case. - -Also we do not have a clear distinction in the API between user and metal-stack operator for firewalls. If a user would allocate a firewall it is also possible for the user to inject his own SSH keys and access the firewall and tamper with the underlay network. - -Parts of these problems are probably going to decrease with the work on [MEP-4](../MEP4/README.md) where there will be dedicated APIs for users and administrators of metal-stack including fine-grained access tokens. 
- -With this MEP we want to describe a way to improve this current situation and allow other users that do not rely on the Gardener integration – for whatever motivation they have – to adequately manage firewalls. For this, we propose an alternative configuration for the firewall-controller that is native to metal-stack and more independent of Gardener. - -## Proposal - -The central idea of this proposal is allowing the firewall-controller to use the metal-api as a configuration source. This should serve as an alternative strategy to the currently used FCM `Firewall` resource based approach in the Gardener use-case. -Updates of the firewall rules should be possible through the metal-api. - -The firewall-controller itself should now be able to decide which of the two main strategies should be used for the base configuration: a kubeconfig or the metal-api. This should be possible through a dedicated _firewall-controller-config_. - -Using this config will now allow operators to fine-tune the data sources for all of its dynamic configuration tasks independently. -For example the data source of the core firewall rules could be set either from the `Firewall` resource located in the Gardener `Seed` or the metal-apiserver node network entity, while the CWNPs should be fetched and applied from a given kubeconfig (the `Shoot` Kubeconfig in the Gardener case). -This configuration file is intended to be injected during firewall creation through the userdata along with potential source connection credentials. 
- -```yaml -# the name of the firewall, defaulted to the hostname -name: best-firewall-ever - -sources: - seed: - kubeconfig: /path/to/seed.yaml # current gardener behavior - namespace: shoot--proj--name - shoot: - kubeconfig: /path/to/shoot.yaml # current gardener behavior - namespace: firewall - metal: - url: https://metal-api - hmac: some-hmac - type: Metal-View - projectID: abc - static: - # static should mirror all information provided by the metal or seed/shoot sources - firewall: # optional - controllerURL: https://... - cwnp: - egress: [] - ingress: [] - -# all sub-controllers running on the firewall -# each can be configured independently -controllers: - # this is the base controller - firewall: - source: seed # or: metal, static - - # these are optional: when not provided, they are disabled - selfUpdate: - enabled: true - droptailer: - enabled: true - - # these are optional: when not provided, they are disabled - service: - source: shoot # or: metal, static - cwnp: - source: shoot # or: metal, static - monitor: - source: shoot # currently only shoot is supported -``` - -The existing behavior of the firewall-controller writing into `/etc/nftables/firewall-controller.v4` is not changed. The different controller configuration sources are internally treated in the same way as before. The `static` source can be used to prevent the firewall-controller from crashing and consistently providing a static ruleset. This might be interesting for metal-stack native use cases or environments where the metal-api cannot be accessed. - -There must be one central nftables-rule-file-controller that is notified and triggered by all other controllers that contribute to the nftables configuration. 
- -For example, in order to maintain the existing Gardener integration, the configuration file for the firewall-controller will look like this: - -```yaml -name: shoot--abc--cluster-firewall-def -sources: - seed: - kubeconfig: /etc/firewall-controller/seed.yaml - namespace: shoot--abc--cluster - shoot: - kubeconfig: /etc/firewall-controller/shoot.yaml - namespace: firewall - -controllers: - firewall: - source: seed - - selfUpdate: - enabled: true - droptailer: - enabled: true - - service: - source: shoot - cwnp: - source: shoot - monitor: - source: shoot -``` - -Plain metal-stack users might use a configuration like this: - -```yaml -name: best-firewall-ever - -sources: - metal: - url: https://metal-api - hmac: some-hmac - type: Metal-View - projectID: abc - -controllers: - firewall: - source: metal - selfUpdate: - enabled: true - droptailer: - enabled: true - - cwnp: - # firewall rules stored in firewall entity - # potential improvement would be to attach the rules to the node network entity - # be aware that the firewall and private networks are immutable - # eventually we introduce a firewall ruleset entity - source: metal -``` - -In highly restricted environments that cannot access metal-api the static source could be used: - -```yaml -name: most-restricted-firewall-ever - -sources: - static: - firewall: - controllerURL: https://... - cwnp: - egress: [] - ingress: [] - -controllers: - firewall: - source: static - - cwnp: - source: static -``` - -### Non-Goals - -- Resolving the missing differentiation between users and administrators by letting users pass userdata and SSH keys to the firewall creation. - - This is even more related to [MEP-4](../MEP4/README.md) than this MEP. 
- -### Advantages - -- Offers a native metal-stack solution that improves managing firewalls for users by adding dynamic reconfiguration through the metal-api - - e.g., in the mini-lab, users can now allocate a machine, then an IP address and announce this IP from the machine without having to re-create the firewall but by adding a firewall rule to the metal-api. -- Improve consistency throughout the API (firewall rules would reflect what is persisted in metal-api). -- Other providers like Cluster API can leverage this approach, too. -- It can contribute to solving the shoot migration issue (in Cluster API case the `clusterctl move` for firewall objects) - - For Gardener takes the seed out of the equation (of which the kubeconfig changes during shoot migration) - - However: Things like egress rules, rate limiting, etc. are currently not part of the firewall or network entity in the metal-api. These would need to be added to one of them. -- Potentially resolve the issue that end-users can manipulate accounting data of the firewall through the `FirewallMonitor` - - for this we would need to be able to report traffic data to metal-api - -### Caveats - -- Metal-View access is too broad for firewalls. Mitigated by [MEP-4](../MEP4/README.md). -- Polling of the firewall-controller is bad for performance. Mitigated by [MEP-4](../MEP4/README.md). - -### Firewall Controller Manager - -Currently the firewall-controller-manager expects the creators of a `FirewallDeployment` to use the defaulting webhook that is tailored to the Gardener integration in order to generate `Firewall.spec.userdata` or to override it manually. Currently `Firewall.spec.userdata` will never be set explicitly. - -Instead we'd like to propose `Firewall.spec.userdataContents` which will replace the old `userdata`-string by a typed data structure. The FCM will do the heavy lifting while the `FirewallDeployment` creator decides what should be configured. 
- -```yaml -kind: FirewallDeployment -spec: - template: - spec: - userdataContents: - - path: /etc/firewall-controller/config.yaml - content: | - --- - sources: - static: {} - controllers: - firewall: - source: static - - path: /etc/firewall-controller/seed.yaml - secretRef: - name: seed-kubeconfig - generateFirewallControllerKubeconfig: true - - path: /etc/firewall-controller/shoot.yaml - secretRef: - name: shoot-kubeconfig -``` - -### Gardener Extension Provider Metal Stack - -The GEPM should be migrated to the new `Firewall.spec.userdataContents` field. - -### Cluster API Provider Metal Stack - -![architectural overview](firewall-for-capms-overview.svg) - -In Cluster API there are essentially two main clusters: the management cluster and the workload cluster while the CAPMS takes in the role of the GEPM. -Typically a local bootstrap cluster is created in KinD which acts as the management cluster. It creates the workload cluster. Thereafter the ownership of the workload cluster is typically moved (using `clusterctl move`) to a different cluster which will then become the management cluster. -The new management cluster might actually be the workload cluster itself. - -In contrast to Gardener, Cluster API aims to be less opinionated and minimal. It is common practice to not install any non-required components or CRDs into the workload cluster by default. Therefore we cannot expect custom resources like `ClusterwideNetworkPolicy` or `FirewallMonitor` to be installed in the workload cluster but strongly recommend our users to do it. Therefore it's the responsibility of the operator to tell [cluster-api-provider-metal-stack](https://github.com/metal-stack/cluster-api-provider-metal-stack) the kubeconfig for the cluster where these CRDs are installed and defined in. 
- -A viable configuration for a `MetalStackCluster` that generates firewall rules based of `Service` type `LoadBalancer` and `ClusterwideNetworkPolicy` and expects them to be deployed in the workload cluster is shown below. The `FirewallMonitor` will be reported into the same cluster. - -```yaml -kind: MetalStackCluster -metadata: - name: ${CLUSTER_NAME} -spec: - firewallTemplate: - userdataContents: - - path: /etc/firewall-controller/config.yaml - secretName: ${CLUSTER_NAME}-firewall-controller-config - - - path: /etc/firewall-controller/workload.yaml - # this is the kubeconfig generated by kubeadm - secretName: ${CLUSTER_NAME}-kubeconfig ---- -kind: Secret -metadata: - name: ${CLUSTER_NAME}-firewall-controller-config -stringData: - controllerConfig: | - --- - name: ${CLUSTER_NAME}-firewall - - sources: - metal: - url: ${METAL_API_URL} - hmac: ${METAL_API_HMAC} - type: ${METAL_API_HMAC_TYPE} - projectID: ${METAL_API_PROJECT_ID} - shoot: - kubeconfig: /etc/firewall-controller/workload.yaml - namespace: firewall - - controllers: - firewall: - source: metal - selfUpdate: - enabled: true - droptailer: - enabled: true - - service: - source: shoot - cwnp: - source: shoot - monitor: - source: shoot -``` - -Here the firewall-controller-config will be referenced by the `MetalStackCluster` as a `Secret`. Please note that the `Secret`s in `userdataContents` will not be fetched and will directly be passed to the `FirewallDeployment`. At first the reconciliation of it in the FCM will fail due to the missing Kubeconfig secret. After the `MetalStackCluster` has been marked as ready, CAPI will create this missing secret. Effectively the firewall and initial control plane node should be created at the same time. - -This approach allows maximum flexibility as intended by Cluster API and is still able to provide robust rolling updates of firewalls. - -An advanced use case of this flexibility would be a management cluster, that is in charge of multiple workload clusters. 
Where one workload cluster acts as a monitoring or tooling cluster, receives logs and the firewall monitor for the other workload clusters. The CWNPs could be defined here, all in a separate namespace. - -#### Cluster API Caveats - -When the cluster is pivoted and reconciles its own firewall, a malfunctioning firewall prevents the cluster from self-healing and requires manual intervention by creating a new firewall. This is an inherent problem of the cluster-api approach. It can be circumvented by using an extra cluster to manage workload clusters. - -In the current form of this approach firewalls and therefore the firewall egress and ingress rules are managed by the cluster operators that manage the cluster-api resources. -Hence it will not be possible to gain a fine-grained control over every cluster operator's choices from a central ruleset at the level of metal-stack firewalls. -In case this control surfaces as a requirement, it would need to be implemented in a firewall external to metal-stack. - -## Roadmap - -In general this proposal is not thought to be implemented in one batch. Instead an incremental approach is required. - -1. Enhance firewall-controller - - - Reduce coupling between controllers - - Introduce controller config - - Abstract module to write into distinct nftable rules for every controller - - Implement `sources.static`, but not `sources.metal` - - GEPM should set `FirewallDeployment.spec.template.spec.userdataContents` - -2. Allow Cluster API to use the FCM with static ruleset - - - Add `firewall.metal-stack.io/paused` annotation (managed by CAPMS during `clusterctl move`, theoretically useful for Gardener shoot migration as well to avoid shallow deletion). - - Reconcile multiple `FirewallDeployment` resources across multiple namespaces. For Gardener the old behavior of reconciling only one namespace should persist. 
- - Allow setting the `firewall.metal-stack.io/no-controller-connection` annotation through the `FirewallDeployment` (either through the template or inheritance). - - Add `MetalStackCluster.spec.firewallTemplate`. - - Make `MetalStackCluster.spec.nodeNetworkID` optional if `spec.firewallTemplate` given. - -3. Add `sources.metal` as configuration option. - - - Allow updates of firewall rules in the metal-apiserver. - - Depends on [MEP-4](../MEP4/README.md) metal-apiserver progress - -4. Potentially migrate the GEPM to use `sources.metal` diff --git a/versioned_docs/version-v0.22.0/contributing/01-Proposals/MEP16/firewall-for-capms-overview.drawio b/versioned_docs/version-v0.22.0/contributing/01-Proposals/MEP16/firewall-for-capms-overview.drawio deleted file mode 100644 index faea3e3d..00000000 --- a/versioned_docs/version-v0.22.0/contributing/01-Proposals/MEP16/firewall-for-capms-overview.drawio +++ /dev/null @@ -1,4 +0,0 @@ - - - -
handles traffic
Firewall
Firewall Controller
node-exporter
nftables-exporter
droptailer-client
Workload Cluster
droptailer
Configures
Bootstrap or Management Cluster
reconcile
configures
reconcile
Cluster API Provider metal-stack
Metal Stack Cluster CRD
Firewall Deployment CRD
Firewall CRD
Firewall Set CRD
rec
reconcile
reconcile
Firewall Controller Manager
Metal Stack Machine CRD
manages
Admin
Kubeconfig FirewallMonitor
FirewallMonitor CRD
main metal-api
Firewall entity
kubeconfig CWNP
Clusterwide Network Policy CRD
base config
controllerConfig
user-defined
network rules
reports firewall
state
send firewall log lines
controllerConfig
controllerConfig
\ No newline at end of file diff --git a/versioned_docs/version-v0.22.0/contributing/01-Proposals/MEP16/firewall-for-capms-overview.svg b/versioned_docs/version-v0.22.0/contributing/01-Proposals/MEP16/firewall-for-capms-overview.svg deleted file mode 100644 index 853f8175..00000000 --- a/versioned_docs/version-v0.22.0/contributing/01-Proposals/MEP16/firewall-for-capms-overview.svg +++ /dev/null @@ -1 +0,0 @@ -
handles traffic
handles traffic
Firewall
Firewall
Firewall Controller
Firewall Controller
node-exporter
node-exporter
nftables-exporter
nftables-exporter
droptailer-client
droptailer-client
Workload Cluster
Workload Cluster
droptailer
droptailer
Configures
Configures
Bootstrap or Management Cluster
Bootstrap or Management Cluster
reconcile
reconcile
configures
configures
reconcile
reconcile
Cluster API Provider metal-stack
Cluster API Provider...
Metal Stack Cluster CRD
Metal Stack Cluster...
Firewall Deployment CRD
Firewall Deployment...
Firewall CRD
Firewall CRD
Firewall Set CRD
Firewall Set CRD
rec
rec
reconcile
reconcile
reconcile
reconcile
Firewall Controller Manager
Firewall Controller...
Metal Stack Machine CRD
Metal Stack Machine...
manages
manages
Admin
Admin
Kubeconfig FirewallMonitor
Kubeconfig FirewallMonitor
FirewallMonitor CRD
FirewallMonitor CRD
main metal-api
main metal-api
Firewall entity
Firewall entity
kubeconfig CWNP
kubeconfig CWNP
Clusterwide Network PolicyCRD
Clusterwide Network...
base config
base config
controllerConfig
controllerConfig
user-defined
network rules
user-defined...
reports firewall
state
reports firewall...
send firewall log lines
send firewall log lines
controllerConfig
controllerConfig
controllerConfig
controllerConfig
Text is not SVG - cannot display
\ No newline at end of file diff --git a/versioned_docs/version-v0.22.0/contributing/01-Proposals/MEP17/README.md b/versioned_docs/version-v0.22.0/contributing/01-Proposals/MEP17/README.md deleted file mode 100644 index 35f48970..00000000 --- a/versioned_docs/version-v0.22.0/contributing/01-Proposals/MEP17/README.md +++ /dev/null @@ -1,61 +0,0 @@ ---- -slug: /MEP-17-global-network-view -title: MEP-17 -sidebar_position: 17 ---- - -# Global Network View - -> [!IMPORTANT] -> This MEP assumes the implementation of the metal-apiserver as described by [MEP-4](../MEP4/README.md) which is currently work in progress. - -Having a complete view of the network topology is useful when working with deployments or troubleshooting connectivity issues. -Currently, the API doesn't know of any other switches than the leaf switches. -Information about all other switches and their connections must be gathered from Ansible inventories or by accessing the switches via SSH. -Documentation of each partition's network must be kept in-sync with all changes made to the deployment or cabling. -We would like to expand the API's knowledge of the network to the entire underlay including inter-switch connections as well as BGP statistics and health status. - -## Switch Types - -Registering a switch at the API is done by the metal-core. -Apart from that, it also reconciles port and FRR configuration to adapt to the machine provisioning cycle. -This reconfiguration is only necessary on the leaf switches. -To allow deploying the metal-core on other switches than leaves we need a way of telling it what type of switch it is running on so it can act accordingly. -On any non-leaf switches it will only register the switch and report statistic but not change any configuration. -Supported switch types are - -- `leaf` -- `spine` -- `exit` -- `mgmtleaf` -- `mgmtspine` - -## Network Topology - -All switches should periodically report their LLDP neighbors and port configuration. 
-This information can be used to quickly identify common network issues, like MTU mismatch or the like. -Ideally, there would be some graphical representation of the network topology containing only the most important information for a quick overview. -It should contain all switches and machines as nodes and all connections as edges of a graph. -Ports, VRFs, and maybe also IPs should be associated with a connection. - -Apart from the topology graph, there should be a way to display more detailed information about both ports of a connection, like - -- MTU -- speed -- IP -- UP/DOWN status -- VRF -- VLAN -- whether it participates in a BGP session - -## BGP Announcements - -The metal-core should collect all routes it knows about and send them to the API along with a timestamp. -Reported routes should be stored to a redis database along with the switch that reported them and the timestamp of the last time they were reported. -An expiration threshold should be defined and all expired routes should be cleaned up periodically. -Whenever new routes are reported they get merged into the existing ones by the strategy: - -- when new, just add -- when existing, update `last_announced` timestamp - -By querying the BGP announcements we can find out whether an allocated IP is still in use. diff --git a/versioned_docs/version-v0.22.0/contributing/01-Proposals/MEP18/README.md b/versioned_docs/version-v0.22.0/contributing/01-Proposals/MEP18/README.md deleted file mode 100644 index 9c02c0b7..00000000 --- a/versioned_docs/version-v0.22.0/contributing/01-Proposals/MEP18/README.md +++ /dev/null @@ -1,147 +0,0 @@ ---- -slug: /MEP-18-autonomous-control-plane -title: MEP-18 -sidebar_position: 18 ---- - -# Autonomous Control Plane - -As described in the [deployment chapter](../../../docs/04-For%20Operators/03-deployment-guide.mdx), we strongly recommend Kubernetes as the target platform for running the metal-stack control plane. 
- -Kubernetes clusters for this purpose are readily available from hyperscalers, metalstack.cloud, or other cloud providers. Simply using a managed Kubernetes cluster greatly simplifies a metal-stack installation. However, sometimes it might be desirable to host the metal-stack control plane autonomously, without the help of another cloud provider. Reasons for this might include corporate policies that prohibit the use of external data center products, or network constraints. - -The Kubernetes cluster hosting the metal-stack control plane must provide at least the following features: - -- Load balancing (for exposing the APIs) -- Persistent storage (for the databases and key-value stores) -- Access to object storage for automated backups of the stateful sets -- Access to a DNS provider supported by one of the used DNS extensions -- Externally accessible DNS records for obtaining officially signed certificates through DNS challenges - -This metal-stack control plane cluster must also be highly available to prevent a complete loss of control over the managed resources in the data center. -Regular Kubernetes updates to apply security fixes and feature updates must be possible in an automated manner. The Day-2 operational overhead of running this cluster in your own datacenter must be reasonable. - -In this chapter, we propose a solution for setting up a metal-stack environment with an autonomous control plane that is independent of another cloud provider. - -## Use Your Own Dogfood - -The most obvious solution is to just deploy a Kubernetes cluster manually in your own data center by utilizing existing tooling for the deployment: - -- k3s -- kubeadm -- vmware and rancher -- talos -- kubespray -- ... (not a complete list) - -However, all these solutions add another layer of complexity that needs to be maintained and operated by people who also need to learn and understand metal-stack. 
In general, metal-stack in combination with [Gardener](https://gardener.cloud) contains all the necessary tools to provide KaaS, so it makes sense to reuse what is already in place without introducing new dependencies on other products and vendors. - -The only problem here is that Gardener is not yet able to create an initial cluster, which may change with the implementation of [GEP-28](https://github.com/gardener/gardener/blob/master/docs/proposals/28-autonomous-shoot-clusters.md). In the meantime, we suggest using [k3s](https://k3s.io/), which manages the initial metal-stack partition to host the control plane, since the maintenance overhead is acceptable and it is easy to deploy. - -## The Matryoshka Principle - -Instead of directly using the K3s cluster for the production control plane, we propose using it as a minimal control plane cluster which only purpose is to host the production control plane cluster. This layer of indirection brings some reasonable advantages: - -- In the event of an interruption or loss of this minimal control plane cluster, the production control plane remains unaffected, and end users can continue to manage their clusters as normal. -- A dedicated operations team can take care of the Day-2 maintenance of this installation, which can be handy because the tools like k3s are a little different from the rest of the setup (it is likely that more manual maintenance is required than for any other cluster). This would also be true if the initial cluster problem would be solved by the Gardener itself and not using k3s. -- Since the number of shoot clusters to host is static, the resource requirements are minimal and will not change significantly over time. There are no huge resource requirements in terms of cpu, memory and storage. As such, the lack of scalability is not such a big issue. - -So, our proposal is to chain two metal-stack control planes. 
The initial control plane cluster would use k3s and on this cluster we can spin up a cluster for the production control plane with the use of Gardener. - -The following figure shows how the high-level architecture of this setup looks like. A even more simplified illustration of this setup can be looked up in the appendix[^1]. - -![Autonomous Control Plane Architecture](./autonomous-control-plane-images/metal-stack-autonomous-control-plane-full.svg) - -The k3s nodes can either be bare metal machines or virtual machines. When using VMs a single k3s node might be a viable solution, too. These nodes are supposed to be setup manually / partly automated with an operating system like Debian. - -To name the cluster that hosts the initial metal-stack control plane and Gardener we use the term _initial cluster_. The initial cluster creates worker nodes to host the _target cluster_. - -## Initial Cluster - -The initial cluster is kept very small. The physical bare metal machines can be any machines and switches which are supported by metal-stack, but can be smaller in terms of cpu, memory and network speed because these machines must only be capable of running the target cluster for the metal-stack control plane. A typical single socket server with 8-16 cores and 64GB of RAM and two NVMe drives of 1TB would be a good starting point. - -In a typical k3s setup, a stateful set would lose the data once the k3s cluster was terminated and started again. But there is a possibility to define parts of the local storage of the server to be provided to the k3s cluster for the PVCs. With that, k3s could be terminated and started again, for example to update and reboot the host os, or update k3s itself and the data will persist. 
- -Example k3s configuration for persistent storage on the hosts os: - -```yaml -k3s: Cluster -apiVersion: k3s.x-k8s.io/v1alpha4 -name: needle-control-plane -nodes: - - role: control-plane - # add a mount from /path/to/my/files on the host to /files on the node - extraMounts: - - hostPath: /path/to/my/files - containerPath: /files -``` - -Into this cluster metal-stack and Gardener will be deployed. This deployment can be done by a Gitlab runner which is running on this machine. -The mini-lab will be used as a base for this deployment. The current development of [gardener-in-minilab](https://github.com/metal-stack/mini-lab/pull/202) must be extended to host all required extensions to make this a working metal-stack control plane which can manage the machines in the attached bare metal setup. - -In addition to the metal-stack and Gardener deployment, some additional required services are deployed (non-complete list): - -- PowerDNS to serve as a DNS Server for all DNS entries used in the initial and the target cluster, like `api.initial.metal-stack.local`, `gardener-api.initial.metal-stack.local` and the DNS entries for the api servers of the created kubernetes clusters. -- NTP -- Monitoring for the initial cluster and partition -- Optional: OIDC Server for authenticating against the metal-api -- Optional: Container Registry to host all metal-stack and gardener containers -- Optional: Let's Encrypt [boulder](https://github.com/letsencrypt/boulder) as a certificate authority -- ... - -Physical view, minimal setup for a initial cluster with a single physical node: - -![Small Initial Cluster](autonomous-control-plane-images/small-initial-cluster.svg) - -Physical View, bigger ha setup which is spread across two data centers: - -![HA Initial Cluster](autonomous-control-plane-images/ha-initial-cluster.svg) - -### Control Plane High Availability - -Running the initial control plane on a single physical server is not as available as it should be in such a use case. 
It should be possible to survive a loss of this server, because the server could be lost by many events, such as hardware failure, disk corruption or even failure of the datacenter location where this server is deployed. - -Setting up a second server with the same software components is an option, but the problem of data redundancy must be solved, because neither the gardener control plane, nor the metal-stack control plane can be instantiated twice. - -Given that we provide part of the local storage of the server as backing storage for the stateful sets in the k3s cluster, the data stored on the server itself must be replicated to another server and backed up on a regular basis. - -The replication of ETCD can be achieved through [clustered configuration](https://docs.k3s.io/datastore/ha-embedded) of k3s. Components of metal-stack and Gardener can run standalone and already utilize backup-restore mechanism that must be configured accordingly. For two or more bare metal machine used for the initial cluster, a loadbalancing mechanism for the ingress is required. kube-vip could be a possible solution. - -For monitoring a backend like a Victoria Metrics Cluster would allow spearding the monitoring data across the initial cluster nodes. These metrics should also be backed up in object storage. - -### Partition - -The partition which is managed by the initial cluster can be a simple and small hardware setup but yet capable enough to host the target cluster. It would even be a good practice to create separate target clusters on the initial cluster, e.g. one for the metal-stack control plane and one for the Gardener (maybe one more for monitoring). - -It can follow the metal-stack minimal setup which provides about 8-16 small servers connected to a 1G/s or 10G/s network dataplane. Central storage is optional as the persistence of the services running in these clusters is always backed up to a central object storage. 
Operations would be much easier if a central storage is provided. - -## Target Cluster - -The target cluster is the metal-stack environment which serves for end-user production use, the control plane is running in a shoot hosted in the initial cluster. The seed(s) and shoot(s) for end-users are created on the machines provided by the target cluster. -These machines can be of a different type in terms of size, but more importantly, these machines are connected to another network dataplane. Also the management infrastructure is separated from the initial cluster management network. - -## Failure Scenarios - -Everything could fail, everything will fail at some point. But this must kept in mind and nothing bad should happen if only one component at a time fails. -If more than one fails, the restoration to a working state must be easily possible and well documented. - -To ensure all possible breakages are documented, we suggest writing a list which summarizes all failure scenarios that might occur including the remediation. - -Here is an example of how a scenario documentation could look like: - -**Scenario**: Initial cluster is gone, all machines have died -**Impact**: Management of the initial cluster infrastructure not possible anymore, the target cluster continues to run but cannot be managed because the API servers are gone. end-users are not affected by this incident. -**Remediation**: The initial cluster nodes must be provisioned from scratch and re-deployed through the CI mechanism. The backups of the stateful sets are automatically restored during this process. 
- -## Implementation - -As part of this proposal, we provide the following tools and integrations in order to setup an autonomous control plane: - -- Deployment roles for the services like PowerDNS and NTP for the initial cluster -- Stretch goal: Deployment role to setup k3s in clustered configuration for the initial cluster and update it -- Extend the Gardener on mini-lab integration to allow shoot creation in the mini-lab -- Steady integration of the setup (maybe something like [k3d](https://github.com/k3d-io/k3d) in the mini-lab) - -## Appendix - -[^1]: ![metal-stack-chain](autonomous-control-plane-images/metal-stack-chain.svg) diff --git a/versioned_docs/version-v0.22.0/contributing/01-Proposals/MEP18/autonomous-control-plane-images/ha-initial-cluster.drawio b/versioned_docs/version-v0.22.0/contributing/01-Proposals/MEP18/autonomous-control-plane-images/ha-initial-cluster.drawio deleted file mode 100644 index eafcb514..00000000 --- a/versioned_docs/version-v0.22.0/contributing/01-Proposals/MEP18/autonomous-control-plane-images/ha-initial-cluster.drawio +++ /dev/null @@ -1,535 +0,0 @@ - - - - - - - - - - - - - - - - - - - 1 - - - - - 2 - - - - - 3 - - - - - 4 - - - - - 5 - - - - - 6 - - - - - 7 - - - - - 8 - - - - - 9 - - - - - 10 - - - - - 11 - - - - - 12 - - - - - - - - - - - - - -
-
-
- internet-router-management -
-
-
-
- - internet-router-management - -
-
-
- - - - - - - - - -
-
-
- management-switch-and-server -
-
-
-
- - management-switch-and-server - -
-
-
- - - - - - - - - -
-
-
- spine01 -
-
-
-
- - spine01 - -
-
-
- - - - - - - - - -
-
-
- leaf01 -
-
-
-
- - leaf01 - -
-
-
- - - - - - - - - -
-
-
- leaf02 -
-
-
-
- - leaf02 - -
-
-
- - - - - - - - - - - - - -
-
-
- - mirocloud (initial cluster partition nodes) - -
-
-
-
- - mirocloud (initial cluster... - -
-
-
- - - - - - - - - - - - - -
-
-
- - Initial cluster node 01 - -
-
-
-
- - Initial cluster node 01 - -
-
-
- - - - - - - - - - - - - 1 - - - - - 2 - - - - - 3 - - - - - 4 - - - - - 5 - - - - - 6 - - - - - 7 - - - - - 8 - - - - - 9 - - - - - 10 - - - - - 11 - - - - - 12 - - - - - - - - - - - - - -
-
-
- internet-router-management -
-
-
-
- - internet-router-management - -
-
-
- - - - - - - - - -
-
-
- management-switch-and-server -
-
-
-
- - management-switch-and-server - -
-
-
- - - - - - - - - -
-
-
- spine02 -
-
-
-
- - spine02 - -
-
-
- - - - - - - - - -
-
-
- leaf03 -
-
-
-
- - leaf03 - -
-
-
- - - - - - - - - -
-
-
- leaf04 -
-
-
-
- - leaf04 - -
-
-
- - - - - - - - - - - - - -
-
-
- - Initial cluster node 02 - -
-
-
-
- - Initial cluster node 02 - -
-
-
- - - - - - - - - - - - - -
-
-
- - Initial cluster node 03 - -
-
-
-
- - Initial cluster node 03 - -
-
-
- - - - - - - - - - - - - -
-
-
- - mirocloud (initial cluster partition nodes) - -
-
-
-
- - mirocloud (initial cluster... - -
-
-
-
- - - - - Text is not SVG - cannot display - - - -
diff --git a/versioned_docs/version-v0.22.0/contributing/01-Proposals/MEP18/autonomous-control-plane-images/ha-initial-cluster.svg b/versioned_docs/version-v0.22.0/contributing/01-Proposals/MEP18/autonomous-control-plane-images/ha-initial-cluster.svg deleted file mode 100644 index 99261ada..00000000 --- a/versioned_docs/version-v0.22.0/contributing/01-Proposals/MEP18/autonomous-control-plane-images/ha-initial-cluster.svg +++ /dev/null @@ -1 +0,0 @@ -123456789101112
internet-router-management
internet-router-management
management-switch-and-server
management-switch-and-server
spine01
spine01
leaf01
leaf01
leaf02
leaf02
mirocloud (initial cluster partition nodes)
mirocloud (initial cluster...
Initial cluster node 01
Initial cluster node 01
123456789101112
internet-router-management
internet-router-management
management-switch-and-server
management-switch-and-server
spine02
spine02
leaf03
leaf03
leaf04
leaf04
Initial cluster node 02
Initial cluster node 02
Initial cluster node 03
Initial cluster node 03
mirocloud (initial cluster partition nodes)
mirocloud (initial cluster...
Text is not SVG - cannot display
\ No newline at end of file diff --git a/versioned_docs/version-v0.22.0/contributing/01-Proposals/MEP18/autonomous-control-plane-images/metal-stack-autonomous-control-plane-full.drawio b/versioned_docs/version-v0.22.0/contributing/01-Proposals/MEP18/autonomous-control-plane-images/metal-stack-autonomous-control-plane-full.drawio deleted file mode 100644 index aae8a12d..00000000 --- a/versioned_docs/version-v0.22.0/contributing/01-Proposals/MEP18/autonomous-control-plane-images/metal-stack-autonomous-control-plane-full.drawio +++ /dev/null @@ -1,1133 +0,0 @@ - - - - - - - - - - - - - - - - - - - -
-
-
- Initial Cluster -
-
-
-
- - Initial Cluster - -
-
-
- - - - - - - - - - - - - - -
-
-
- metal-roles -
-
-
-
- - metal-roles - -
-
-
- - - - - - - -
-
-
- CI -
-
-
-
- - CI - -
-
-
- - - - - - - -
-
-
- K3s Standalone - - - (on Debian) - - -
-
-
-
- - K3s Standalone (on Debian) - -
-
-
- - - - - - - - - - - - - - - - - -
-
-
- Initial Partition -
-
-
-
- - Initial Partition - -
-
-
- - - - - - - - - - - - - -
-
-
- Target Cluster for metal-stack -
-
-
-
- - Target Cluster for metal-stack - -
-
-
- - - - - - - - - - - - - -
-
-
- Metal Control Plane -
-
-
-
- - Metal Control Plane - -
-
-
- - - - - - - - - - - -
-
-
- provisions -
-
-
-
- - provisions - -
-
-
- - - - - - - - - - - - - -
-
-
- Target Cluster for Gardener -
-
-
-
- - Target Cluster for Gardener - -
-
-
- - - - - - - - - - -
-
-
- Gardener Control Plane -
-
-
-
- - Gardener Control Plane - -
-
-
- - - - - - - - - - - - - - - - - -
-
-
- Monitoring -
-
-
-
- - Monitoring - -
-
-
- - - - - - - - - - - - - - - - -
-
-
- Target Partition -
-
-
-
- - Target Partition - -
-
-
- - - - - - - - - - -
-
-
- Gardener Seeds and End-User Shoots -
-
-
-
- - Gardener Seeds and End-User Shoots - -
-
-
- - - - - - - - - - - -
-
-
- provisions -
-
-
-
- - provisions - -
-
-
- - - - - - - - -
-
-
- metal-roles -
-
-
-
- - metal-roles - -
-
-
- - - - - - - -
-
-
- CI -
-
-
-
- - CI - -
-
-
- - - - - - - - -
-
-
- metal-roles -
-
-
-
- - metal-roles - -
-
-
- - - - - - - - - - -
-
-
- ETCD can be clustered or standalone, backed up by sidecar -
-
-
-
- - ETCD can be clustere... - -
-
-
- - - - - - - - - - -
-
-
- This data will get lost in case local PV gets deleted -
-
-
-
- - This data will get l... - -
-
-
- - - - - - - - - - -
-
-
- We can work with local PVs here, too. -
- backup-restore-sidecar for metal-stack databases, for big ones Postgres clustered. -
-
-
-
- - We can work with local PVs he... - -
-
-
- - - - - - - -
-
-
- ETCD will be deployed in HA configuration on local PVs. -
-
- csi-driver-lvm needs to implement auto deletion of orphaned PVs. -
-
- Seed metrics get lost, but they report to the monitoring in the Metal Control Plane Shoot. -
-
-
-
- - ETCD will be deployed in HA c... - -
-
-
- - - - - - - - - - -
-
-
- More sophisticated storage solutions can be in place. -
-
- (Lightbits, NetApp, ...) -
-
-
-
- - More sophisticated storage so... - -
-
-
- - - - - - - - - - -
-
-
- TODO: Evaluate how to persist these metrics. -
-
-
-
- - TODO: Evaluate how to persist... - -
-
-
- - - - - - - - - - -
-
-
- - 1 VM or -
-
-
- - - 3 Bare Metal Machines - - -
-
-
-
-
- - 1 VM or... - -
-
-
- - - - - - - - - - - - - - -
-
-
- metal-stack -
-
-
-
- - metal-stack - -
-
-
- - - - - - - -
-
-
- metal-api -
-
-
-
- - metal-api - -
-
-
- - - - - - - -
-
-
- metal-db -
-
-
-
- - metal-db - -
-
-
- - - - - - - -
-
-
- ipam-db -
-
-
-
- - ipam-db - -
-
-
- - - - - - - -
-
-
- masterdata-db -
-
-
-
- - masterdata-db - -
-
-
- - - - - - - -
-
-
- headscale-db -
-
-
-
- - headscale-db - -
-
-
- - - - - - - -
-
-
- auditing-db -
-
-
-
- - auditing-db - -
-
-
- - - - - - - -
-
-
- nsqd -
-
-
-
- - nsqd - -
-
-
- - - - - - - - - - - -
-
-
- Gardener -
-
-
-
- - Gardener - -
-
-
- - - - - - - - - - -
-
-
- Virtual Garden -
-
-
-
- - Virtual Garden - -
-
-
- - - - - - - -
-
-
- Gardener Control Plane -
-
-
-
- - Gardener Control Plane - -
-
-
- - - - - - - -
-
-
- gardenlet -
-
-
-
- - gardenlet - -
-
-
- - - - - - - -
-
-
- Garden etcd -
-
-
-
- - Garden etcd - -
-
-
- - - - - - - -
-
-
- Prometheus -
-
-
-
- - Prometheus - -
-
-
- - - - - - - - - - - -
-
-
- Monitoring -
-
-
-
- - Monitoring - -
-
-
- - - - - - - - - - -
-
-
- - Gitlab - -
- - Runner - -
-
-
-
-
- - Gitlab... - -
-
-
- - - - - - - - - - -
-
-
- Services -
-
-
-
- - Services - -
-
-
- - - - - - - -
-
-
- PowerDNS -
-
-
-
- - PowerDNS - -
-
-
- - - - - - - -
-
-
- boulder -
-
-
-
- - boulder - -
-
-
- - - - - - - -
-
-
- NTP -
-
-
-
- - NTP - -
-
-
- - - - - - - -
-
-
- OIDC -
-
-
-
- - OIDC - -
-
-
- - - - - - - -
-
-
- ... -
-
-
-
- - ... - -
-
-
-
- - - - - Text is not SVG - cannot display - - - -
diff --git a/versioned_docs/version-v0.22.0/contributing/01-Proposals/MEP18/autonomous-control-plane-images/metal-stack-autonomous-control-plane-full.svg b/versioned_docs/version-v0.22.0/contributing/01-Proposals/MEP18/autonomous-control-plane-images/metal-stack-autonomous-control-plane-full.svg deleted file mode 100644 index e58e783b..00000000 --- a/versioned_docs/version-v0.22.0/contributing/01-Proposals/MEP18/autonomous-control-plane-images/metal-stack-autonomous-control-plane-full.svg +++ /dev/null @@ -1 +0,0 @@ -
Initial Cluster
Initial Cluster
metal-roles
metal-roles
CI
CI
K3s Standalone(on Debian)
K3s Standalone (on Debian)
Initial Partition
Initial Partition
Target Cluster for metal-stack
Target Cluster for metal-stack
Metal Control Plane
Metal Control Plane
provisions
provisions
Target Cluster for Gardener
Target Cluster for Gardener
Gardener Control Plane
Gardener Control Plane
Monitoring
Monitoring
Target Partition
Target Partition
Gardener Seeds and End-User Shoots
Gardener Seeds and End-User Shoots
provisions
provisions
metal-roles
metal-roles
CI
CI
metal-roles
metal-roles
ETCD can be clustered or standalone, backed up by sidecar
ETCD can be clustere...
This data will get lost in case local PV gets deleted
This data will get l...
We can work with local PVs here, too.
backup-restore-sidecar for metal-stack databases, for big ones Postgres clustered.
We can work with local PVs he...
ETCD will be deployed in HA configuration on local PVs.

csi-driver-lvm needs to implement auto deletion of orphaned PVs.

Seed metrics get lost, but they report to the monitoring in the Metal Control Plane Shoot.
ETCD will be deployed in HA c...
More sophisticated storage solutions can be in place.

(Lightbits, NetApp, ...)
More sophisticated storage so...
TODO: Evaluate how to persist these metrics.
TODO: Evaluate how to persist...
1 VM or
3 Bare Metal Machines
1 VM or...
metal-stack
metal-stack
metal-api
metal-api
metal-db
metal-db
ipam-db
ipam-db
masterdata-db
masterdata-db
headscale-db
headscale-db
auditing-db
auditing-db
nsqd
nsqd
Gardener
Gardener
Virtual Garden
Virtual Garden
Gardener Control Plane
Gardener Control Plane
gardenlet
gardenlet
Garden etcd
Garden etcd
Prometheus
Prometheus
Monitoring
Monitoring
Gitlab
Runner
Gitlab...
Services
Services
PowerDNS
PowerDNS
boulder
boulder
NTP
NTP
OIDC
OIDC
...
...
Text is not SVG - cannot display
\ No newline at end of file diff --git a/versioned_docs/version-v0.22.0/contributing/01-Proposals/MEP18/autonomous-control-plane-images/metal-stack-chain.drawio b/versioned_docs/version-v0.22.0/contributing/01-Proposals/MEP18/autonomous-control-plane-images/metal-stack-chain.drawio deleted file mode 100644 index cd5cf007..00000000 --- a/versioned_docs/version-v0.22.0/contributing/01-Proposals/MEP18/autonomous-control-plane-images/metal-stack-chain.drawio +++ /dev/null @@ -1,404 +0,0 @@ - - - - - - - - - - -
-
-
- Partition 1 -
-
-
-
- - Partition 1 - -
-
- - - - -
-
-
- seeds -
-
-
-
- - seeds - -
-
- - - - -
-
-
- shoots -
-
-
-
- - shoots - -
-
- - - - - - -
-
-
- Partition 2 -
-
-
-
- - Partition 2 - -
-
- - - - -
-
-
- seeds -
-
-
-
- - seeds - -
-
- - - - -
-
-
- shoots -
-
-
-
- - shoots - -
-
- - - - - - -
-
-
- Partition 3 -
-
-
-
- - Partition 3 - -
-
- - - - -
-
-
- seeds -
-
-
-
- - seeds - -
-
- - - - -
-
-
- shoots -
-
-
-
- - shoots - -
-
- - - - - - -
-
-
- Production Control Plane -
-
-
-
- - Production Control Plane - -
-
- - - - -
-
-
- metal-stack -
- kubernetes cluster -
-
-
-
- - metal-stack... - -
-
- - - - -
-
-
- gardener -
- kubernetes cluster -
-
-
-
- - gardener... - -
-
- - - - -
-
-
- - Manages - -
-
-
-
- - Manages - -
-
- - - - - - - - -
-
-
- Control Plane Partition -
-
-
-
- - Control Plane Partition - -
-
- - - - - -
-
-
- backup of stateful sets -
-
-
-
- - backup of stateful sets - -
-
- - - - - - -
-
-
- bare metal machine -
-
-
-
- - bare metal machine - -
-
- - - - -
-
-
- metal-stack -
- and -
- gardener -
- kubernetes cluster -
- running in kind -
-
-
-
- - metal-stack... - -
-
- - - - -
-
-
- - Manages - -
-
-
-
- - Manages - -
-
- - - - - -
-
-
- S3 -
-
-
-
- - S3 - -
-
- - - - -
-
-
- Needle -
-
-
-
- - Needle - -
-
- - - -
-
-
- - Nail - -
-
-
-
- - Nail - -
-
-
- - - - - Text is not SVG - cannot display - - - -
\ No newline at end of file diff --git a/versioned_docs/version-v0.22.0/contributing/01-Proposals/MEP18/autonomous-control-plane-images/metal-stack-chain.svg b/versioned_docs/version-v0.22.0/contributing/01-Proposals/MEP18/autonomous-control-plane-images/metal-stack-chain.svg deleted file mode 100644 index 8f88ba14..00000000 --- a/versioned_docs/version-v0.22.0/contributing/01-Proposals/MEP18/autonomous-control-plane-images/metal-stack-chain.svg +++ /dev/null @@ -1 +0,0 @@ -
Partition 1
Partition 1
seeds
seeds
shoots
shoots
Partition 2
Partition 2
seeds
seeds
shoots
shoots
Partition 3
Partition 3
seeds
seeds
shoots
shoots
Production Control Plane
Production Control Plane
metal-stack
kubernetes cluster
metal-stack...
gardener
kubernetes cluster
gardener...
Manages
Manages
Control Plane Partition
Control Plane Partition
backup of stateful sets
backup of stateful sets
bare metal machine
bare metal machine
metal-stack
and
gardener
kubernetes cluster
running in kind
metal-stack...
Manages
Manages
S3
S3
Needle
Needle 
Nail
Nail
Text is not SVG - cannot display
\ No newline at end of file diff --git a/versioned_docs/version-v0.22.0/contributing/01-Proposals/MEP18/autonomous-control-plane-images/small-initial-cluster.drawio b/versioned_docs/version-v0.22.0/contributing/01-Proposals/MEP18/autonomous-control-plane-images/small-initial-cluster.drawio deleted file mode 100644 index a75ee340..00000000 --- a/versioned_docs/version-v0.22.0/contributing/01-Proposals/MEP18/autonomous-control-plane-images/small-initial-cluster.drawio +++ /dev/null @@ -1,234 +0,0 @@ - - - - - - - - - - - - - - - - - - - 1 - - - - - 2 - - - - - 3 - - - - - 4 - - - - - 5 - - - - - 6 - - - - - 7 - - - - - 8 - - - - - 9 - - - - - 10 - - - - - 11 - - - - - 12 - - - - - - - - - - - - - -
-
-
- internet-router-management -
-
-
-
- - internet-router-management - -
-
-
- - - - - - - - - -
-
-
- management-switch-and-server -
-
-
-
- - management-switch-and-server - -
-
-
- - - - - - - - - -
-
-
- leaf01 -
-
-
-
- - leaf01 - -
-
-
- - - - - - - - - -
-
-
- leaf02 -
-
-
-
- - leaf02 - -
-
-
- - - - - - - - - - - - - -
-
-
- Initial cluster node -
-
-
-
- - Initial cluster node - -
-
-
- - - - - - - - - - - - - -
-
-
- mirocloud (initial cluster partition nodes) -
-
-
-
- - mirocloud (initial cluster... - -
-
-
-
- - - - - Text is not SVG - cannot display - - - -
diff --git a/versioned_docs/version-v0.22.0/contributing/01-Proposals/MEP18/autonomous-control-plane-images/small-initial-cluster.svg b/versioned_docs/version-v0.22.0/contributing/01-Proposals/MEP18/autonomous-control-plane-images/small-initial-cluster.svg deleted file mode 100644 index a9d29f05..00000000 --- a/versioned_docs/version-v0.22.0/contributing/01-Proposals/MEP18/autonomous-control-plane-images/small-initial-cluster.svg +++ /dev/null @@ -1 +0,0 @@ -123456789101112
internet-router-management
internet-router-management
management-switch-and-server
management-switch-and-server
leaf01
leaf01
leaf02
leaf02
Initial cluster node
Initial cluster node
mirocloud (initial cluster partition nodes)
mirocloud (initial cluster...
Text is not SVG - cannot display
\ No newline at end of file diff --git a/versioned_docs/version-v0.22.0/contributing/01-Proposals/MEP2/README.md b/versioned_docs/version-v0.22.0/contributing/01-Proposals/MEP2/README.md deleted file mode 100644 index c7f2360a..00000000 --- a/versioned_docs/version-v0.22.0/contributing/01-Proposals/MEP2/README.md +++ /dev/null @@ -1,7 +0,0 @@ ---- -slug: /MEP-2-two-factor-authentication -title: MEP-2 -sidebar_position: 2 ---- - -# Two Factor Authentication diff --git a/versioned_docs/version-v0.22.0/contributing/01-Proposals/MEP3/README.md b/versioned_docs/version-v0.22.0/contributing/01-Proposals/MEP3/README.md deleted file mode 100644 index 5ce36721..00000000 --- a/versioned_docs/version-v0.22.0/contributing/01-Proposals/MEP3/README.md +++ /dev/null @@ -1,67 +0,0 @@ ---- -slug: /MEP-3-machine-re-installation -title: MEP-3 -sidebar_position: 3 ---- - -# Machine Re-Installation - -In the current metal-api only machine installations are possible, performing a machine upgrade is only possible by creating a new machine and delete the old one. -This has the drawback that in case a lot of data is stored on the local disks, a full restore of the original data must be performed. - -To prevent this, we will introduce a new metal-api endpoint to reinstall the machine with a new image, _without_ actually deleting the data stored on the additional hard disks. - -Storage is a difficult task to get right and reliable. A short analysis of our different storage requirements lead to 3 different scenarios. - -- Storage for the etcd pvs in the seed cluster of every partition. - This is the most important storage in our setup because these etcd pods serve as configuration backend for all customer kubernetes clusters. If they fail, the cluster is down. 
However gardener deploys a backup and restore sidecar into the etcd pod of every customer kubernetes control plane, and if this sidecar detects a corrupt or missing etcd database file(s) it starts automatic restore from the configured backup location. This will take some minutes. If for example a node dies, and gardener creates a new node instead, the csi-lvm created pv is not present on that node. Kubernetes will not schedule the missing etcd pod on this node because it has a local PV configured and is therefore tainted to run only on that node. To let kubernetes create that pod anyhow, someone has to either remove the taint, or delete the pod. If this is done, the pod starts and the restore of the etcd data can start as well. You can see this is a bit too complicated and will take the customer cluster down for a while (not measured yet but in the range of 5-10 minutes). -- Storage in customer clusters. - This was not promised in 2020. We have a intermediate solution with the provisioning of csi-lvm by default into all customer clusters. Albeit this is only local storage and will get deleted if a node dies. -- S3 Storage. - We have two possibilities to cope with storage: - - In place update of the OS with a daemonset - This will be fast and simple, but might fail because the packages being installed are broken right now, or a filesystem gets full, or any other failure you can think of during a os update. Another drawback is that metal-api does not reflect the updated os image. - - metal-api get a machine reinstall endpoint - With this approach we leverage from existing and already proven mechanisms. Reinstall must keep all data except the sata-dom. Gardener currently is not able to do an update with this approach because it can only do `rolling` updates. Therefore a additional `osupdatestrategy` has to be implemented for metal and other providers in gardener to be able to leverage the metal reinstall on the same machineID approach. 
- -If reinstall is implemented, we should focus on the same technology for all scenarios and put ceph via rook.io into the kubernetes clusters as additional StorageClass. It has to be checked whether to use the raw disk or a PV as the underlay block device where ceph stores its data. - -## API and behavior - -The API will get an new endpoint "reinstall" this endpoint takes two arguments: - -- machineID -- image - -No other aspects of the machine can be modified during the re-installation. All data stored in the existing allocation will be preserved, only the image will be modified. -Once this endpoint was called, the machine will get a `reboot` signal with the boot order set to PXE instead of HDD and the network interfaces on the leaf are set to PXE as well. Then the normal installation process starts: - -- unchanged: PXE boot with metal-hammer -- changed: metal-hammer first checks with the machineID in the metal-api (through metal-core) if there is already a allocation present -- changed: if a allocation is present and the allocation has set `reinstall: true`, wipe disk is only executed for the root disk, all other disks are untouched. -- unchanged: the specified image is downloaded and burned, `/install.sh` is executed -- unchanged: successful installation is reported back, network is set the the vrf, boot order is set to HDD. -- unchanged: distribution kernel is booted via kexec - -We can see that the `allocation` requires one additional parameter: `reinstall` and metal-hammer must check for already existing allocation at an earlier stage. - -Components which requires modifications (first guess): - -- metal-hammer: - - check for allocation present earlier - - evaluation of `reinstall` flag set - - wipe of disks depends on that flag - - Bonus: move configuration of disk layout and primary disk detection algorithm (PDDA) from metal-hammer into metal-api. - metal-api **MUST** reject reinstallation if the disk found by PDDA does not have the `/etc/metal` directory! 
-- metal-core: - - probably nothing -- metal-api: - - new endpoint `/machine/reinstall` - - add `Reinstall bool` to data model of `allocation` - - make sure to reset `Reinstall` after reinstallation to prevent endless reinstallation loop -- metalctl: - - implement `reinstall` -- metal-go: - - implement `reinstall` -- gardener (longterm): - - add the `OSUpgradeStrategy` `reinstall` diff --git a/versioned_docs/version-v0.22.0/contributing/01-Proposals/MEP4/README.md b/versioned_docs/version-v0.22.0/contributing/01-Proposals/MEP4/README.md deleted file mode 100644 index 389a02d4..00000000 --- a/versioned_docs/version-v0.22.0/contributing/01-Proposals/MEP4/README.md +++ /dev/null @@ -1,211 +0,0 @@ ---- -slug: /MEP-4-multi-tenancy-for-the-metal-api -title: MEP-4 -sidebar_position: 4 ---- - -# Multi-Tenancy for the metal-api -:::info -This document is work in progress. -::: - -In the past we decided to treat the metal-api as a "low-level API", i.e. the API does not specifically deal with projects and tenants. A user with editor access can for example assign machines to every project he desires, he can see all the machines available and can control them. We tried to keep the metal-api code base as small as possible and we added resource scoping to a "higher-level APIs". From there, a user would be able to only see his own clusters and IP addresses. - -As time passed metal-stack has become an open-source project and people are willing to adopt. Adopters who want to put their own technologies on top of the metal-stack infrastructure don't have those "higher-level APIs" that we implemented closed-source for our user base. So, external adopters most likely need to implement resource scoping on their own. 
- -Introducing multi-tenancy to the metal-api is a serious chance of making our product better and more successful as it opens the door for: - -- Becoming a "fully-featured" API -- Narrowing down attack surfaces and possibility of unintended resource modification produced by bugs or human errors -- Discouraging people to implement their own scoping layers in front of the metal-stack -- Gaining performance through resource scopes -- Letting untrusted / third-parties work with the API - -## Requirements - -These are some general requirements / higher objectives that MEP-4 has to fulfill. - -- Should be able to run with mini-lab without requiring to setup complex auth backends (dex, LDAP, keycloak, ...) - - Simple to start with, more complex options for production setups -- Fine-grained access permissions (every endpoint maps to a permission) -- Tenant scoping (disallow resource access to resources of other tenants) -- Project scoping (disallow resource access to resources of other projects) -- Access tokens in self-service for technical user access - -## Implementation - -We gathered a lot of knowledge while implementing a multi-tenancy-capable backend for metalstack.cloud. The goal is now to use the same technology and adopt that to the metal-api, this includes: - -- gRPC in combination with connectrpc -- OPA for making auth decisions -- REST HTTP only for OIDC login flows - -### API Definitions - -The API definitions should be located on a separate Github repository separate from the server implementation. The proposed repository location is: https://github.com/metal-stack/api. - -This repository contains the `proto3` specification of the exposed metal-stack api. This includes the messages, simple validations, services and the access permission to these services. The input parameters for the authorization in the backend are generated from the `proto3` annotations. - -Client implementations for the most relevant languages (go, python) are generated automatically. 
- -This api is divided into end-user and admin access at the top level. The proposed APIs are: - -- `metalstack.api.v2`: For end-user facing services -- `metalstack.admin.v2`: For operators and controllers which need access to unscoped entities - -The methods of the API can have different role scopes (and can be narrowed down further with fine-grained method permissions): - -- `tenant`: Tenant-scoped methods, e.g. project creation (tenant needs to be provided in the request payload) - - Available roles: VIEWER, EDITOR, OWNER -- `project`: Project-scoped methods, e.g. machine creation (tenant needs to be provided in the request payload) - - Available roles: VIEWER, EDITOR, OWNER -- `admin` Admin-scoped methods, e.g. unscoped tenant list or switch register - - Available roles: VIEWER, EDITOR - -And has methods with different visibility scopes: - -- `self`: Methods that only the logged in user can access, e.g. show permissions with the presented token -- `public`: Methods that do not require any specific authorization - -### API - -The API server implements the services defined in the API and validates access to a method using OPA with the JWT tokens passed in the requests. The server is implemented using the connectrpc.com framework. - -The API server implements the login flow through OIDC. After successful authentication, the API server derives user permissions from the OIDC provider and issues a new JWT token which is passed on to the user. The tokens including the permissions are stored in a redis compatible backend. - -With these tokens, users can create Access Tokens for CI/CD or other use cases. - -JWT Tokens can be revoked by admins and the user itself. - -### API Server - -Is put into a new github repo which implements the services defined in the `api` repository. It opens a `https` endpoints where the grpc (via connectrpc.com) and oidc services are exposed. 
- -### Migration of the Consumers - -To allow consumers to migrate to the `v2` API gradually, both apis, the new and the old, are deployed in parallel. In the control-plane both apis are deployed side-by-side behind the ingress. `api.example.com` is forwarded to `metal-api` and `metal.example.com` is forwarded to the new `metal-apiserver`. - -The api-server will talk to the existing metal-api during the process of migration services away to the new grpc api. - -The migration process can be done in the following manner: - -for each resource in the metal-api: - -- create a new proto3 based definition in the `api` repo. -- implement the business logic per service in the new `metal-apiserver` without calling the metal-api. -- clients must be able to talk to `v1` and `v2` backend in parallel -- Deprecate the already migrated service in the swagger route to notify the client that this route should not be used anymore. -- identify all consumers of this resource and replace them to use the grpc instead of the rest api -- move the business logic incl. the backend calls to ipam, metal-db, masterdata-api, nsq for this resource from the metal-api to the `metal-apiserver` - -We will migrate the rethinkdb backend implementation to a generic approach during this effort. - -- Try to enhance the generic rethinkdb interface with `project` scoped methods. - -There are a lot of consumers of metal-api, which need to be migrated: - -- ansible -- firewall-controller -- firewall-controller-manager -- gardener-extension-auth -- gardener-extension-provider-metal - - Do not point the secret bindings to a the shared provider secret in the seed anymore. Instead, use individual provider-secret containing project-scoped API access tokens in the Gardener project namespaces. 
-- machine-controller-manager-provider-metal -- metal-ccm -- metal-console -- metal-bmc -- metal-core -- metal-hammer -- metal-image-cache-sync -- metal-images -- metal-metrics-exporter -- metal-networker -- metalctl -- pixie - -## User Scenarios - -This section gathers a collection of workflows from the perspective of a user that we want to provide with the implementation of this proposal. - -### Machine Creation - -A regular user wants to create a machine resource. - -Requirements: Project was created, permissions are present - -- The user can see networks that were provided by the admin. - - ``` - $ metalctl network ls - ID NAME PROJECT PARTITION NAT SHARED PREFIXES IPS - internet Internet Network true false 212.34.83.0/27  ● - tenant-super-network-fra-equ01 Project Super Network fra-equ01 false false 10.128.0.0/14  ● - underlay-fra-equ01 Underlay Network fra-equ01 false false 10.0.0.0/16  ● - ``` - -- The user has to set the project scope first or provide `--project` flags for all commands. - ``` - $ metalctl project set 793bb6cd-8b46-479d-9209-0fedca428fe1 - You are now acting on project 793bb6cd-8b46-479d-9209-0fedca428fe1. - ``` -- The user can create the child network required for machine allocation. - ``` - $ metalctl network allocate --partition fra-equ01 --name test - ``` -- Now, the user sees his own child network. - ``` - $ metalctl network ls - ID NAME PROJECT PARTITION NAT SHARED PREFIXES IPS - internet Internet Network true false 212.34.83.0/27  ● - tenant-super-network-fra-equ01 Project Super Network fra-equ01 false false 10.128.0.0/14  ● - └─╴08b9114b-ec47-4697-b402-a11421788dc6 test 793bb6cd-8b46-479d-9209-0fedca428fe1 fra-equ01 false false 10.128.64.0/22  ● - underlay-fra-equ01 Underlay Network fra-equ01 false false 10.0.0.0/16  ● - ``` -- The user does not see any machines yet. - ``` - $ metalctl machine ls - ``` -- The user can create a machine. 
- ``` - $ metalctl machine create --networks internet,08b9114b-ec47-4697-b402-a11421788dc6 --name test --hostname test --image ubuntu-20.04 --partition fra-equ01 --size c1-xlarge-x86` - ``` -- The machine will now be provisioned. - ``` - $ metalctl machine ls - ID LAST EVENT WHEN AGE HOSTNAME PROJECT SIZE IMAGE PARTITION - 00000000-0000-0000-0000-ac1f6b7befb2 Phoned Home 20s 50d 4h test 793bb6cd-8b46-479d-9209-0fedca428fe1 c1-xlarge-x86 Ubuntu 20.04 20210415 fra-equ01 - ``` - -:::warning -A user **cannot** list all allocated machines for all projects. The user **must** always switch project context first and can only view the machines inside this project. Only admins can see all machines at once. -::: -### Scopes for Resources - -The admins / operators of the metal-stack should be able to provide _global_ resources that users are able to use along with their own resources. In particular, users can view and use _global_ resources, but they are not allowed to create, modify or delete them. - -:::info -When a project ID field is empty on a resource, the resource is considered _global_. -::: - -Where possible, users should be capable of creating their own resource entities. - -| Resource | User | Global | -| :----------------- | :--- | :----- | -| File System Layout | yes | yes | -| Firewall | yes | | -| Firmware | | yes | -| OS Image | | yes | -| Machine | yes | | -| Network (Base) | | yes | -| Network (Children) | yes | | -| IP | yes | | -| Partition | | yes | -| Project | yes | | -| Project Token | yes | | -| Size | | yes | -| Switch | | | -| Tenant | | yes | - -:::info -Example: A user can make use of the file system layouts provided by the admins, but can also create own layouts. Same applies for images. As soon as a user creates own resources, the user takes over the responsibility for the machine provisioning to succeed. 
-::: diff --git a/versioned_docs/version-v0.22.0/contributing/01-Proposals/MEP5/README.md b/versioned_docs/version-v0.22.0/contributing/01-Proposals/MEP5/README.md deleted file mode 100644 index 3b7fc45c..00000000 --- a/versioned_docs/version-v0.22.0/contributing/01-Proposals/MEP5/README.md +++ /dev/null @@ -1,54 +0,0 @@ ---- -slug: /MEP-5-shared-networks -title: MEP-5 -sidebar_position: 5 ---- - -# Shared Networks - -## Why are shared networks needed - -For special purpose machines that serve shared services with performance critical workloads to all machines of a partition (like persistent storage) it would be good to have kind of a "shared network" that is easily accessible. -They do not necessarily need another firewall. This would avoid having two firewalls in the datapath between a machine in a private network and the machines of a shared service. - -## Constraints that need to hold - -- a shared network is usable from all machines that have a firewall in front, that uses it -- a shared network is only usable within a single partition (currently we are constrained in bandwidth and have no routing of 10.0.0.0/8 addresses btw. 
partitions and failure domain should be the partition but this constraint might get lifted in the future) -- networks may be marked as shared after network allocation (but there should be no way back from shared to unshared) -- neither machines nor firewalls may have multiple private, unshared networks configured -- machines must have a single primary network configured - - this might be a shared network - - OR a plain, unshared private network -- firewalls may participate in multiple shared networks -- machines can be allocated with a primary network using auto IP allocation or with `noauto` and a specific IP - -## Should shared networks be private - -**Alternative 1:** If we implemented shared networks by extending functions around plain, private networks we would not have to manage another CIDR (mini point) and it would be possible to create a k8s cluster with a private network, mark the network as `shared` and produce shared services from this k8s cluster. - -**Alternative 2:** If shared networks are implemented as first class networks we could customize the VRF and also accomplish an other goal of our roadmap: being able to create machines directly in an external network. - -Together with @majst01 and @Gerrit91 we decided to continue to implement **Alternative 1**. - -## Firewalls accessing a shared network - -Firewalls that access shared networks need to: - -- hide the private network behind an ip address of the shared network if the shared network was configured with `nat=true`. -- import the prefixes of the shared VRF to the private VRF and import the prefixes of the private VRF to the shared VRF so that the communication between the two is working in both directions. As long as no `nat=true` was set on the shared VRF, the original machine ips are visible in both communication directions. 
- -## Setup with shared networks and single consumer - -![Simple Setup](./shared.png) - -## Setup with single shared network and multiple consumers - -![Advanced Setup](./shared_advanced.png) - -## Getting internet access - -Machines contained in a shared network can access the internet with different scenarios: - -- if they have an own firewall: this is internet accessibility, as common (check whether all traffic gets routed through it!) -- if they don't have an own firewall, an external HTTP proxy is needed that has an endpoint exposed as Service Type NodePort diff --git a/versioned_docs/version-v0.22.0/contributing/01-Proposals/MEP5/shared.drawio b/versioned_docs/version-v0.22.0/contributing/01-Proposals/MEP5/shared.drawio deleted file mode 100644 index aa7af045..00000000 --- a/versioned_docs/version-v0.22.0/contributing/01-Proposals/MEP5/shared.drawio +++ /dev/null @@ -1,121 +0,0 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - \ No newline at end of file diff --git a/versioned_docs/version-v0.22.0/contributing/01-Proposals/MEP5/shared.png b/versioned_docs/version-v0.22.0/contributing/01-Proposals/MEP5/shared.png deleted file mode 100644 index b0b47f03..00000000 Binary files a/versioned_docs/version-v0.22.0/contributing/01-Proposals/MEP5/shared.png and /dev/null differ diff --git a/versioned_docs/version-v0.22.0/contributing/01-Proposals/MEP5/shared_advanced.drawio b/versioned_docs/version-v0.22.0/contributing/01-Proposals/MEP5/shared_advanced.drawio deleted file mode 100644 index 6f96eca0..00000000 --- a/versioned_docs/version-v0.22.0/contributing/01-Proposals/MEP5/shared_advanced.drawio +++ /dev/null @@ -1,187 +0,0 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - \ No newline at end of file diff --git a/versioned_docs/version-v0.22.0/contributing/01-Proposals/MEP5/shared_advanced.png b/versioned_docs/version-v0.22.0/contributing/01-Proposals/MEP5/shared_advanced.png deleted file mode 100644 index da989915..00000000 Binary files a/versioned_docs/version-v0.22.0/contributing/01-Proposals/MEP5/shared_advanced.png and /dev/null differ diff --git a/versioned_docs/version-v0.22.0/contributing/01-Proposals/MEP6/README.md b/versioned_docs/version-v0.22.0/contributing/01-Proposals/MEP6/README.md deleted file mode 100644 index edf52a6c..00000000 --- a/versioned_docs/version-v0.22.0/contributing/01-Proposals/MEP6/README.md +++ /dev/null @@ -1,123 +0,0 @@ ---- -slug: /MEP-6-dmz-networks -title: MEP-6 -sidebar_position: 6 ---- - -# DMZ Networks - -## Reasoning - -To fulfill higher levels of security measures the standard metal-stack approach with a single firewall in front of a set of machines might be insufficient. -There are cases where two physically distinct firewalls in front of application workload are mandatory. In traditional network terms this is known as DMZ approach. - -For Kubernetes workloads it makes sense to use the front cluster for ingress, WAF purposes and as outgoing proxy. The clusters may be used for application workload. - -## DMZ network - -- Use a separate DMZ network prefix for every tenant -- This is used as intermediate network btw. 
private networks of a tenant and the internet -- For every partition a distinct DMZ firewall/cluster is needed for a tenant -- For Gardener orchestrated Kubernetes clusters this network must be a publicly reachable internet prefix because shoot clusters need a vpn service that is used for instrumentation from the seed cluster - this will be a requirement as long as the inverse vpn tunnel feature Konnectivity is not available to us. - -## Approach 1: DMZ with publicly reachable internet prefix - -![DMZ Internet](dmz-internet_public.svg) - -A DMZ network with publicly reachable internet prefix will look like this in the metal-api: - -```yaml ---- -description: DMZ-Network -destinationprefixes: - - 0.0.0.0/0 -id: dmz -labels: - network.metal-stack.io/default-external: "" -name: DMZ-Network -parentnetworkid: null -partitionid: "" -prefixes: - - 212.90.30.128/25 -privatesuper: false -projectid: "" -vrf: 104007 -vrfshared: false -nat: true -shared: false -underlay: false -``` - -### DMZ firewall - -The firewall of the DMZ will intersect its private network for attached machines, the DMZ network and the public internet. - -- The private network of the project needs to import - - the default route from the internet network - - the DMZ network -- The internet network must import the DMZ network -- The DMZ network provides the default route for tenant's clusters in a partition. It imports the default route from the internet network - -### Application Firewall - -The firewall of application workloads intersects its private network for attached machines and the DMZ network. - -This is currently supported by the metal-networker and needs no further changes! 
- -## Approach 2: DMZ with private IPs - -![DMZ Internet](dmz-internet_private.svg) - -A DMZ network with private IPs will look like this in the metal-api: - -```yaml ---- -description: DMZ-Network -destinationprefixes: - - 0.0.0.0/0 -id: dmz -labels: - network.metal-stack.io/default-external: "" -name: DMZ-Network -parentnetworkid: tenant-super-network-fra-equ01 -partitionid: fra-equ01 -prefixes: - - 10.90.30.128/25 -privatesuper: false -projectid: "" -vrf: 4711 -vrfshared: false -nat: true -shared: true # it's usable from multiple projects -underlay: false -``` - -### DMZ firewall - -The firewall of the DMZ will intersect its private network for attached machines, the DMZ network and the public internet. - -- The private network of the project needs to import - - the default route from the internet network - - the DMZ network -- The internet network must import the DMZ network (only locally, no-export) -- The DMZ network provides the default route for tenant's clusters in a partition. It imports the default route from the internet network - -### Application Firewall - -The firewall of application workloads intersects its private network for attached machines and the DMZ network. 
- -## Code Changes / Implications - -- `metal-networker` and `metal-ccm` assume that there is only one network providing the default-route -- `metal-networker` needs to - - import the default route from the internet network to the dmz network (DMZ Firewall) - - import the DMZ network to the internet network and adjusting NAT rules (DMZ Firewall) - - import destination prefixes of the DMZ network to the private primary network (DMZ Firewall, Application Firewall) - - import DMZ-IPs of the private primary network to the DMZ network (DMZ Firewall, Application Firewall) -- `metal-api`: destination prefixes of private networks need to be configurable (`allocateNetwork`) -- `gardener-extension-provider-metal`: needs to be able to delete DMZ clusters (but skip the network deletion part) -- the application firewall is not publicly reachable - for debugging purposes a hop over the DMZ firewall is needed - -## Decision - -We decided to follow the second approach with private DMZ networks. diff --git a/versioned_docs/version-v0.22.0/contributing/01-Proposals/MEP6/dmz-internet_private.drawio b/versioned_docs/version-v0.22.0/contributing/01-Proposals/MEP6/dmz-internet_private.drawio deleted file mode 100644 index 7b83bbfc..00000000 --- a/versioned_docs/version-v0.22.0/contributing/01-Proposals/MEP6/dmz-internet_private.drawio +++ /dev/null @@ -1,178 +0,0 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - \ No newline at end of file diff --git a/versioned_docs/version-v0.22.0/contributing/01-Proposals/MEP6/dmz-internet_private.svg b/versioned_docs/version-v0.22.0/contributing/01-Proposals/MEP6/dmz-internet_private.svg deleted file mode 100644 index f5e58204..00000000 --- 
a/versioned_docs/version-v0.22.0/contributing/01-Proposals/MEP6/dmz-internet_private.svg +++ /dev/null @@ -1,3 +0,0 @@ -
Machine
Machine
Firewall DMZ
Firewall DMZ
DMZ VRF
DMZ VRF
Machine
Machine
Firewall A
Firewall A
Private VRF A
Private VRF A
10.0.0.2
10.90.30.129
/0 via Firewall A
10.0.0.2...
VRF A 10.0.0.1
VRF A 10.0.0.1
DMZ Network
10.90.30.128/25
DMZ Network...
Private Network
10.0.0.0/24
Private Network...
import /0
import /0
import 10.0.0.0/24
import 10.0.0.0/24 -
Machine
Machine
Firewall B
Firewall B
Private VRF B
Private VRF B
10.0.1.2
/0 via Firewall B
10.0.1.2...
VRF B 10.0.1.1
VRF B 10.0.1.1
Private Network
10.0.1.0/24
Private Network...
import /0
import /0
import 10.0.1.0/24
import 10.0.1.0/24 -
10.90.30.129 is reachable
/0 via Firewall DMZ
10.0.0.0/24 is reachable
10.0.1.0/24 is reachable
10.90.30.129 is reachable...
Internet
212.1.1.0/27
Internet...
SNAT to 212.1.1.1
SNAT to 212.1.1.1
Internet VRF
Internet VRF
import /0
import /0

import 10.0.0.0/24 no export
import 10.0.1.0/24 no export
import 10.90.30.128/25 no export
import 10.0.0.0/24 no exp...
Viewer does not support full SVG 1.1
\ No newline at end of file diff --git a/versioned_docs/version-v0.22.0/contributing/01-Proposals/MEP6/dmz-internet_public.drawio b/versioned_docs/version-v0.22.0/contributing/01-Proposals/MEP6/dmz-internet_public.drawio deleted file mode 100644 index 544939e5..00000000 --- a/versioned_docs/version-v0.22.0/contributing/01-Proposals/MEP6/dmz-internet_public.drawio +++ /dev/null @@ -1,184 +0,0 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - \ No newline at end of file diff --git a/versioned_docs/version-v0.22.0/contributing/01-Proposals/MEP6/dmz-internet_public.svg b/versioned_docs/version-v0.22.0/contributing/01-Proposals/MEP6/dmz-internet_public.svg deleted file mode 100644 index 5e825081..00000000 --- a/versioned_docs/version-v0.22.0/contributing/01-Proposals/MEP6/dmz-internet_public.svg +++ /dev/null @@ -1,3 +0,0 @@ -
Machine
Machine
Firewall DMZ
Firewall DMZ
DMZ VRF
DMZ VRF
Machine
Machine
Firewall A
Firewall A
Private VRF A
Private VRF A
10.0.0.2
212.1.2.3
/0 via Firewall A
10.0.0.2...
VRF A 10.0.0.1
VRF A 10.0.0.1
DMZ Network
212.1.2.0/27
DMZ Network...
Private Network
10.0.0.0/24
Private Network...
import /0
import /0
import 10.0.0.0/24
import 10.0.0.0/24 -
Machine
Machine
Firewall B
Firewall B
Private VRF B
Private VRF B
10.0.1.2
/0 via Firewall B
10.0.1.2...
VRF B 10.0.1.1
VRF B 10.0.1.1
Private Network
10.0.1.0/24
Private Network...
import /0
import /0
import 10.0.1.0/24
import 10.0.1.0/24 -
212.1.2.3 is reachable
/0 via Firewall DMZ
212.1.2.3 is reachable...
Internet
212.1.1.0/27 212.1.2.0/27
Internet...
SNAT to 212.1.1.1
SNAT to 212.1.1.1
Internet VRF
Internet VRF
import /0
import /0
import 212.1.2.0/27
import 10.0.0.0/24 no redistribute
import 10.0.1.0/24 no redistribute

import 212.1.2.0/27...
SNAT to
212.1.2.1
SNAT to...
SNAT to
212.1.2.2
SNAT to...
Viewer does not support full SVG 1.1
\ No newline at end of file diff --git a/versioned_docs/version-v0.22.0/contributing/01-Proposals/MEP8/README.md b/versioned_docs/version-v0.22.0/contributing/01-Proposals/MEP8/README.md deleted file mode 100644 index 14748fae..00000000 --- a/versioned_docs/version-v0.22.0/contributing/01-Proposals/MEP8/README.md +++ /dev/null @@ -1,503 +0,0 @@ ---- -slug: /MEP-7-configurable-filesystem-layout-for-machine-allocation -title: MEP-7 -sidebar_position: 7 ---- - -# Configurable Filesystem layout for Machine Allocation - -The current implementation uses a hard coded filesystem layout depending on the specified size and image. This is done in the metal-hammer. This worked well in the past because we had a small amount of sizes and images. But we reached a point where this is to restricted for all use cases we have to fulfill. It also forces us to modify the metal-hammer source code to support a new filesystem layout. - -This proposal tries to address this issue by introducing a filesystem layout struct in the metal-api which is then configurable per machine allocation. -The original behavior of automatic filesystem layout decision must still be present, because there must be no API change for existing API consumers. It should be a additional feature during machine allocation. - -## API and behavior - -The API will get a new endpoint `filesystemlayouts`to create/update/delete a set of available `filesystemlayouts`. - -### Constraints - -In order to keep the actual machine allocation api compatible, there must be no difference while allocating a machine. To achieve this every -`filesystemlayout` defines constraints which specifies for which combination of `sizes` and `images` this layout should be used by default. -The specified constraints over all `filesystemlayouts` therefore must be collision free, to be more specific, there must be exactly one layout outcome -for every possible combination of `sizes` and `images`. 
- -The `size` constraint must be a list of the exact size ids, the `image` constraint must be a map of os to semver compatible version constraint. For example: - -- `debian: ">= 10.20210101"` or `debian: "< 10.20210101"` - -The general form of a `image` constraint is a map from `os` to `versionconstraint` where: - -`os` must match the first part of the image without the version. -`versionconstraint` must be the comparator, a space and the version, or simply `*` to match all versions of this `os`. -The comparator must be one of: "=", "!=", ">", "<", ">=", "=>", "<=", "=<", "~", "~>", "^" - -It must also be possible to have a `filesystemlayout` in development or for other special purposes, which can be specified during the machine allocation. -To have such a layout, both constraints `sizes` and `images`must be empty list. - -### Reinstall - -The current reinstall implementation the metal-hammer detects during the installation on which disk the OS was installed and reports back to the metal-api the Report struct which has two properties `primarydisk` and `ospartition`. -Both fields are not required anymore because the logic is now shifted to the `filesystemlayout` definition. If `Disk.WipeOnReinstall` is set to true, this disk will be wiped, default is false and is preserved. - -### Handling of s2-xlarge machines - -These machines are a bit special compared to our `c1-*` machines because they have rotating hard disks for the mass storage purpose. -The downside is that the on board SATA-DOM has the same naming as the HDDs and can not be specified as the first /dev/sda disk because all HDDs are also /dev/sd\* disks. -Therefore we had a special SATA-DOM detection algorithm inside metal-hammer which simply checks for the smallest /dev/sd disk and took this to install the OS. - -This is not possible with the current approach, but we figured out that the SATA-DOM is always `/dev/sde`. 
So we can create a special `filesystemlayout` where the installations is made on this disk. - -### Possible Filesystemlayout hierarchies - -It is only possible to create a filesystem on top of a block device. The creation of a block device can be done on multiple ways, depending on the requirements regarding performance, space and redundancy of the filesystem. -It also depends on the disks available on the server. - -The current approach implements the following hierarchies: - -![filesystems](filesystems.png) - -### Implementation - -```go -// FilesystemLayout to be created on the given machine -type FilesystemLayout struct { - // ID unique layout identifier - ID string - // Description is human readable - Description string - // Filesystems to create on the server - Filesystems []Filesystem - // Disks to configure in the server with their partitions - Disks []Disk - // Raid if not empty, create raid arrays out of the individual disks, to place filesystems onto - Raid []Raid - // VolumeGroups to create - VolumeGroups []VolumeGroup - // LogicalVolumes to create on top of VolumeGroups - LogicalVolumes []LogicalVolume - // Constraints which must match to select this Layout - Constraints FilesystemLayoutConstraints -} - -type FilesystemLayoutConstraints struct { - // Sizes defines the list of sizes this layout applies to - Sizes []string - // Images defines a map from os to versionconstraint - // the combination of os and versionconstraint per size must be conflict free over all filesystemlayouts - Images map[string]string -} - -type RaidLevel string -type Format string -type GPTType string - -// Filesystem defines a single filesystem to be mounted -type Filesystem struct { - // Path defines the mountpoint, if nil, it will not be mounted - Path *string - // Device where the filesystem is created on, must be the full device path seen by the OS - Device string - // Format is the type of filesystem should be created - Format Format - // Label is optional enhances 
readability - Label *string - // MountOptions which might be required - MountOptions []string - // CreateOptions during filesystem creation - CreateOptions []string -} - -// Disk represents a single block device visible from the OS, required -type Disk struct { - // Device is the full device path - Device string - // Partitions to create on this device - Partitions []Partition - // WipeOnReinstall, if set to true the whole disk will be erased if reinstall happens - // during fresh install all disks are wiped - WipeOnReinstall bool -} - -// Raid is optional, if given the devices must match. -// TODO inherit GPTType from underlay device ? -type Raid struct { - // ArrayName of the raid device, most often this will be /dev/md0 and so forth - ArrayName string - // Devices the devices to form a raid device - Devices []Device - // Level the raidlevel to use, can be one of 0,1,5,10 - // TODO what should be support - Level RaidLevel - // CreateOptions required during raid creation, example: --metadata=1.0 for uefi boot partition - CreateOptions []string - // Spares defaults to 0 - Spares int -} - - -// VolumeGroup is optional, if given the devices must match. 
-type VolumeGroup struct { - // Name of the volumegroup without the /dev prefix - Name string - // Devices the devices to form a volumegroup device - Devices []string - // Tags to attach to the volumegroup - Tags []string -} - -// LogicalVolume is a block devices created with lvm on top of a volumegroup -type LogicalVolume struct { - // Name the name of the logical volume, without /dev prefix, will be accessible at /dev/vgname/lvname - Name string - // VolumeGroup the name of the volumegroup - VolumeGroup string - // Size of this LV in mebibytes (MiB) - Size uint64 - // LVMType can be either striped or raid1 - LVMType LVMType -} - -// Partition is a single partition on a device, only GPT partition types are supported -type Partition struct { - // Number of this partition, will be added to the device once partitioned - Number int - // Label to enhance readability - Label *string - // Size given in MebiBytes (MiB) - // if "0" is given the rest of the device will be used, this requires Number to be the highest in this partition - Size string - // GPTType defines the GPT partition type - GPTType *GPTType -} - -const ( - // VFAT is used for the UEFI boot partition - VFAT = Format("vfat") - // EXT3 is usually only used for /boot - EXT3 = Format("ext3") - // EXT4 is the default fs - EXT4 = Format("ext4") - // SWAP is for the swap partition - SWAP = Format("swap") - // None - NONE = Format("none") - - // GPTBoot EFI Boot Partition - GPTBoot = GPTType("ef00") - // GPTLinux Linux Partition - GPTLinux = GPTType("8300") - // GPTLinuxRaid Linux Raid Partition - GPTLinuxRaid = GPTType("fd00") - // GPTLinux Linux Partition - GPTLinuxLVM = GPTType("8e00") - - // LVMTypeLinear append across all physical volumes - LVMTypeLinear = LVMType("linear") - // LVMTypeStriped stripe across all physical volumes - LVMTypeStriped = LVMType("striped") - // LVMTypeStripe mirror with raid across all physical volumes - LVMTypeRaid1 = LVMType("raid1") -) -``` - -Example `metalctl` outputs: - 
-```bash -$ metalctl filesystemlayouts ls -ID DESCRIPTION SIZES IMAGES -default default fs layout c1-large-x86, c1-xlarge-x86 debian >=10, ubuntu >=20.04, centos >=7 -ceph fs layout for ceph s2-large-x86, s2-xlarge-x86 debian >=10, ubuntu >=20.04 -firewall firewall fs layout c1-large-x86, c1-xlarge-x86 firewall >=2 -storage storage fs layout s3-large-x86 centos >=7 -s3 storage fs layout s2-xlarge-x86 debian >=10, ubuntu >=20.04, >=firewall-2 -default-devel devel fs layout -``` - -The `default` layout reflects what is actually implemented in metal-hammer to guarantee backward compatibility. - -```yaml ---- -id: default -constraints: - sizes: - - c1-large-x86 - - c1-xlarge-x86 - images: - debian: ">=10" - ubuntu: ">=20.04" - centos: ">=7" -filesystems: - - path: "/boot/efi" - device: "/dev/sda1" - format: "vfat" - options: "-F 32" - label: "efi" # required to be compatible with old images - - path: "/" - device: "/dev/sda2" - format: "ext4" - label: "root" # required to be compatible with old images - - path: "/var/lib" - device: "/dev/sda3" - format: "ext4" - label: "varlib" # required to be compatible with old images - - path: "/tmp" - device: "tmpfs" - format: "tmpfs" - mountoptions: - [ - "defaults", - "noatime", - "nosuid", - "nodev", - "noexec", - "mode=1777", - "size=512M", - ] -disks: - - device: "/dev/sda" - wipe: true - partitions: - - number: 1 - label: "efi" - size: 500 - type: GPTBoot - - number: 2 - label: "root" - size: 5000 - type: GPTLinux - - number: 3 - label: "varlib" - size: 0 # to end of partition - type: GPTLinux -``` - -The `firewall` layout reuses the built in nvme disk to store the logs, which is way faster and larger than what the sata-dom ssd provides. 
- -```yaml ---- -id: firewall -constraints: - sizes: - - c1-large-x86 - - c1-xlarge-x86 - images: - firewall: ">=2" -filesystems: - - path: "/boot/efi" - device: "/dev/sda1" - format: "vfat" - options: "-F 32" - - path: "/" - device: "/dev/sda2" - format: "ext4" - - path: "/var" - device: "/dev/nvme0n1p1" - format: "ext4" -disks: - - device: "/dev/sda" - wipe: true - partitions: - - number: 1 - label: "efi" - size: 500 - type: GPTBoot - - number: 2 - label: "root" - size: 5000 - type: GPTLinux - - device: "/dev/nvme0n1" - wipe: true - partitions: - - number: 1 - label: "var" - size: 0 - type: GPTLinux -``` - -The `storage` layout will be used for the storage servers, which must have mirrored boot disks. - -```yaml ---- -id: storage -constraints: - sizes: - - s3-large-x86 - images: - centos: ">=7" -filesystems: - - path: "/boot/efi" - device: "/dev/md1" - format: "vfat" - options: "-F32" - - path: "/" - device: "/dev/md2" - format: "ext4" -disks: - - device: "/dev/sda" - wipe: true - partitions: - - number: 1 - label: "efi" - size: 500 - type: GPTLinuxRaid - - number: 2 - label: "root" - size: 5000 - type: GPTLinuxRaid - - device: "/dev/sdb" - wipe: true - partitions: - - number: 1 - label: "efi" - size: 500 - type: GPTLinuxRaid - - number: 2 - label: "root" - size: 5000 - type: GPTLinuxRaid -raid: - - name: "/dev/md1" - level: 1 - devices: - - "/dev/sda1" - - "/dev/sdb1" - options: "--metadata=1.0" - - name: "/dev/md2" - level: 1 - devices: - - "/dev/sda2" - - "/dev/sdb2" - options: "--metadata=1.0" -``` - -The `s3-storage` layout matches the special situation on the s2-xlarge machines. 
- -```yaml ---- -id: s3-storage -constraints: - sizes: - - c1-large-x86 - - s2-xlarge-x86 - images: - debian: ">=10" - ubuntu: ">=20.04" - centos: ">=7" -filesystems: - - path: "/boot/efi" - device: "/dev/sde1" - format: "vfat" - options: "-F 32" - - path: "/" - device: "/dev/sde2" - format: "ext4" - - path: "/var/lib" - device: "/dev/sde3" - format: "ext4" -disks: - - device: "/dev/sde" - wipe: true - partitions: - - number: 1 - label: "efi" - size: 500 - type: GPTBoot - - number: 2 - label: "root" - size: 5000 - type: GPTLinux - - number: 3 - label: "varlib" - size: 0 # to end of partition - type: GPTLinux -``` - -A sample `lvm` layout which puts `/var/lib` as stripe on the nvme device - -```yaml ---- -id: lvm -description: "lvm layout" -constraints: - size: - - s2-xlarge-x86 - images: - debian: ">=10" - ubuntu: ">=20.04" - centos: ">=7" -filesystems: - - path: "/boot/efi" - device: "/dev/sda1" - format: "vfat" - createoptions: - - "-F 32" - label: "efi" - - path: "/" - device: "/dev/sda2" - format: "ext4" - label: "root" - - path: "/var/lib" - device: "/dev/vg00/varlib" - format: "ext4" - label: "varlib" - - path: "/tmp" - device: "tmpfs" - format: "tmpfs" - mountoptions: - [ - "defaults", - "noatime", - "nosuid", - "nodev", - "noexec", - "mode=1777", - "size=512M", - ] -volumegroups: - - name: "vg00" - devices: - - "/dev/nvmne0n1" - - "/dev/nvmne0n2" -logicalvolumes: - - name: "varlib" - volumegroup: "vg00" - size: 200 - lvmtype: "striped" -disks: - - device: "/dev/sda" - wipeonreinstall: true - partitions: - - number: 1 - label: "efi" - size: 500 - gpttype: "ef00" - - number: 2 - label: "root" - size: 5000 - gpttype: "8300" - - device: "/dev/nvmne0n1" - wipeonreinstall: false - - device: "/dev/nvmne0n2" - wipeonreinstall: false -``` - -## Components which requires modifications - -- metal-hammer: - - change implementation from build in hard coded logic - - move logic to create fstab from install.sh to metal-hammer -- metal-api: - - new endpoint 
`filesystemlayouts` - - add optional spec of `filesystemlayout` during `allocation` with validation if given `filesystemlayout` is possible on given size. - - add `allocation.filesystemlayout` in the response, based on either the specified `filesystemlayout` or the calculated one. - - implement `filesystemlayouts` validation for: - - matching to disks in the size - - no overlapping with the sizes/imagefilter specified in `filesystemlayouts` - - all devices specified exists from top to bottom (fs -> disks -> device || fs -> raid -> devices) -- metalctl: - - implement `filesystemlayouts` -- metal-go: - - adopt api changes -- metal-images: - - install mdadm for raid support diff --git a/versioned_docs/version-v0.22.0/contributing/01-Proposals/MEP8/filesystems.drawio b/versioned_docs/version-v0.22.0/contributing/01-Proposals/MEP8/filesystems.drawio deleted file mode 100644 index 0f0c6ab5..00000000 --- a/versioned_docs/version-v0.22.0/contributing/01-Proposals/MEP8/filesystems.drawio +++ /dev/null @@ -1,43 +0,0 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - \ No newline at end of file diff --git a/versioned_docs/version-v0.22.0/contributing/01-Proposals/MEP8/filesystems.png b/versioned_docs/version-v0.22.0/contributing/01-Proposals/MEP8/filesystems.png deleted file mode 100644 index 6d903b7e..00000000 Binary files a/versioned_docs/version-v0.22.0/contributing/01-Proposals/MEP8/filesystems.png and /dev/null differ diff --git a/versioned_docs/version-v0.22.0/contributing/01-Proposals/MEP9/README.md b/versioned_docs/version-v0.22.0/contributing/01-Proposals/MEP9/README.md deleted file mode 100644 index a8cae83d..00000000 --- a/versioned_docs/version-v0.22.0/contributing/01-Proposals/MEP9/README.md +++ /dev/null @@ -1,132 +0,0 @@ ---- -slug: /MEP-9-no-open-ports-to-the-data-center -title: MEP-9 -sidebar_position: 9 ---- - -# No Open Ports To the Data Center - -Our metal-stack partitions typically have open ports for metal-stack 
native services, these are: - -- SSH port on the firewalls -- bmc-reverse-proxy for serial console access through the metal-console - -These open ports are potential security risks. For example, while SSH access is possible only with private key it's still vulnerable to DoS attack. - -Therefore, we want to get rid off these open ports to reduce the attack surface to the data center. - -## Requirements - -- Access to firewall SSH only via VPN -- Easy to update VPN components - -As a next step, we can also consider joining the management servers to the VPN mesh, which would replace typical WireGuard setups for operators to enter resources inside the partition. - -## High Level Design - -[](./architecture.svg) - -> Simplified drawing showing old vs. new architecture. - -### Concerns - -There's few concerns when using WireGuard for implementing VPN: - -1. WireGuard doesn't implement dynamic cipher substitution. Which is important in case one of the crypto methods, used by WireGuard will be broken. The only possible solution for that will be to update WireGuard to a fixed version. -2. Coordination server(Headscale) is a single point of failure. In case it fails, it potentially can disconnect existing members of the network, as WireGuard can't manage dynamic IPs by itself. -3. Headscale is already falls behind Tailscale coordination server implementation. Which can complicate the upgrade to newer version of Tailscale client in case of emergency. - -### Solutions to concerns - -1. Tailscale node software is using userspace implementation of WireGuard -- `wireguard-go`. One of the options is to inject Tailscale client into `metalctl`. And make it available as `metalctl vpn` or similar command. It should be possible to do as `tailscale` node is already available as open sourced Go pkg. That would allow us to control, what version of Tailscale users are using and in case of any critical changes to enforce them to update `metalctl` to use VPN functionality. -2. 
Would it be a considerable risk? We could look into `wg-dynamic` project to cover this problem. -3. At the moment, repository looks well maintained and the metal-stack team already contributes to it. - -## Implementation Details - -### metal-roles - -`metal-roles` will be responsible for deployment of `headscale` server(via new `headscale` role). It also should provide sufficient config to `metal-api` so it establishes connection with `headscale` gRPC server. - -### New `metalctl` commands - -`metalctl` will be responsible for client-side implementation of this MEP. Specifically, it's by using `metalctl` user expected to connect to firewalls. - -- `metalctl vpn` -- section for VPN related commands: - - `metalctl vpn get key [vpn name] --namespace [namespace name]` -- returns auth key to be used with `tailscale` client for establishing connection. - -Extend `metalctl firewall`: - -- `metalctl firewall ssh [ID]` -- connect to firewall via SSH. - -Extend `metalctl machine`: - -- `metalctl machine ssh [ID]` -- connect to machine via SSH. - -`metalctl` will be able to connect to firewall and machines by running `tailscale` in container. - -### metal-api - -Updates to `metal-api` should be made, so that it's able to add firewalls to VPNs. There should be one Tailscale namespace per project. So if multiple firewalls are created in single project, they will join the same namespace. - -Two new flags should be introduced to connect `metal-api` to `headscale` gRPC server: - -- `headscale-addr` -- specifies address of Headscale grpc API. -- `headscale-api-key` -- specifies temporary API key to connect to Headscale. It should be replaced and then rotated by `metal-api`. - -If `metal-api` initialized with `headscale` connection it should automatically join all created firewalls to VPN. - -Add new endpoint, that will be used by `metalctl` to connect to VPN: - -- `/v1/vpn GET` -- requests auth key from `headscale` server. 
- -### metal-hammer - -`metal-hammer` acts as an intermediary for machine configuration between `metal-api` and machine's image. Specifically it writes to `/etc/metal/install.yaml` file, data from which later will be used by image's `install.sh` file. - -To implement VPN support we have to add authentication key and VPN server address to `install.yaml` file. This key will be used to join machine to a VPN. - -### metal-images - -Images `install.sh` script have to be updated to work with authentication key and VPN server address, provided in `install.yaml` file. If this key is present, machine should connect to VPN. - -### metal-networker - -`metal-networker` also have to know if VPN was configured. In that case we need to disable public access to SSH and allow all(?) traffic from WireGuard interface. - -### firewall-controller - -`firewall-controller` have to monitor changes in `Firewall` resource and keep `tailscaled` version up-to-date. - -### Resources - -Update `Firewall` resource to include desired/actual `tailscale` version: - -``` -Firewall: - Spec: - tailscale: - Version: Minimal version - ... - Status: - ... - VPN: - Status: Boolean field - tailscale: - Version: Actual version - ... -``` - -### bmc-reverse-proxy - -TODO - -## References - -1. [WireGuard: Next Generation Secure Network Tunnel](https://www.youtube.com/watch?v=88GyLoZbDNw) -2. [How Tailscale works](https://tailscale.com/blog/how-tailscale-works) -3. [Tailscale is officially SOC 2 compliant](https://tailscale.com/blog/soc2) -4. [Why not Wireguard](https://www.ipfire.org/blog/why-not-wireguard) -5. [Wireguard: Known Limitations](https://www.wireguard.com/known-limitations/) -6. [Wireguard: Things That Might Be Accomplished](https://www.wireguard.com/todo/) -7. 
[Headscale: Tailscale control protocol v2](https://github.com/juanfont/headscale/issues/526) diff --git a/versioned_docs/version-v0.22.0/contributing/01-Proposals/MEP9/architecture.drawio b/versioned_docs/version-v0.22.0/contributing/01-Proposals/MEP9/architecture.drawio deleted file mode 100644 index adb09214..00000000 --- a/versioned_docs/version-v0.22.0/contributing/01-Proposals/MEP9/architecture.drawio +++ /dev/null @@ -1,324 +0,0 @@ - - - - - - - -
-
-
- Metal Control Plane -
-
-
-
- - Metal Control Plane - -
-
- - - - -
-
-
- metal-stack -
- Partition -
-
-
-
- - metal-stack... - -
-
- - - - -
-
-
- firewall -
-
-
-
- - firewall - -
-
- - - - - -
-
-
- machine -
-
-
-
- - machine - -
-
- - - - -
-
-
- ssh -
-
-
-
- - ssh - -
-
- - - - -
-
-
- bmc-proxy -
-
-
-
- - bmc-proxy - -
-
- - - - - - - - - - -
-
-
- Metal Control Plane -
-
-
-
- - Metal Control Plane - -
-
- - - - -
-
-
- metal-stack -
- Partition -
-
-
-
- - metal-stack... - -
-
- - - - -
-
-
- firewall -
-
-
-
- - firewall - -
-
- - - - - -
-
-
- machine -
-
-
-
- - machine - -
-
- - - - -
-
-
- ssh -
-
-
-
- - ssh - -
-
- - - - - - -
-
-
- bmc-proxy -
-
-
-
- - bmc-proxy - -
-
- - - - -
-
-
- headscale -
-
-
-
- - headscale - -
-
- - - - - - - - - - -
-
-
- tailscaled -
-
-
-
- - tailscaled - -
-
- - - - - - -
-
-
- tailscaled -
-
-
-
- - tailscaled - -
-
- - - - -
-
-
- Internet -
-
-
-
- - Internet - -
-
- - - - -
-
-
- Internet -
-
-
-
- - Internet - -
-
-
- - - - - Viewer does not support full SVG 1.1 - - - -
diff --git a/versioned_docs/version-v0.22.0/contributing/01-Proposals/MEP9/architecture.svg b/versioned_docs/version-v0.22.0/contributing/01-Proposals/MEP9/architecture.svg deleted file mode 100644 index fd268d2f..00000000 --- a/versioned_docs/version-v0.22.0/contributing/01-Proposals/MEP9/architecture.svg +++ /dev/null @@ -1 +0,0 @@ -
Metal Control Plane
Metal Control Plane
metal-stack
Partition
metal-stack...
firewall
firewall
machine
machine
ssh
ssh
bmc-proxy
bmc-proxy
Metal Control Plane
Metal Control Plane
metal-stack
Partition
metal-stack...
firewall
firewall
machine
machine
ssh
ssh
bmc-proxy
bmc-proxy
headscale
headscale
tailscaled
tailscaled
tailscaled
tailscaled
Internet
Internet
Internet
Internet
Text is not SVG - cannot display
\ No newline at end of file diff --git a/versioned_docs/version-v0.22.0/contributing/01-Proposals/_category_.json b/versioned_docs/version-v0.22.0/contributing/01-Proposals/_category_.json deleted file mode 100644 index 2e7fa4bf..00000000 --- a/versioned_docs/version-v0.22.0/contributing/01-Proposals/_category_.json +++ /dev/null @@ -1,4 +0,0 @@ -{ - "position": 1, - "label": "Enhancement Proposals" -} \ No newline at end of file diff --git a/versioned_docs/version-v0.22.0/contributing/01-Proposals/index.md b/versioned_docs/version-v0.22.0/contributing/01-Proposals/index.md deleted file mode 100644 index 0f6eddc3..00000000 --- a/versioned_docs/version-v0.22.0/contributing/01-Proposals/index.md +++ /dev/null @@ -1,69 +0,0 @@ ---- -slug: /enhancement-proposals -title: Enhancement Proposals -sidebar_position: 1 ---- - -# Metal Stack Enhancement Proposals (MEPs) - -This section contains proposals which address substantial modifications to metal-stack. - -Every proposal has a short name which starts with _MEP_ followed by an incremental, unique number. Proposals should be raised as pull requests in the [website](https://github.com/metal-stack/website) repository and can be discussed in Github issues. - -The list of proposals and their current state is listed in the table below. - -Possible states are: - -- `In Discussion` -- `Accepted` -- `Declined` -- `In Progress` -- `Completed` -- `Aborted` - -Once a proposal was accepted, an issue should be raised and the implementation should be done in a separate PR. 
- -| Name | Description | State | Progress | -| :------------------------------------------------------------- | :--------------------------------------------- | :-------------: | :----------------------------------------------------------------: | -| [MEP-1](MEP1/README.md) | Distributed Control Plane Deployment | `Declined` | | -| [MEP-2](MEP2/README.md) | Two Factor Authentication | `Aborted` | | -| [MEP-3](MEP3/README.md) | Machine Re-Installation to preserve local data | `Completed` | | -| [MEP-4](MEP4/README.md) | Multi-tenancy for the metal-api | `In Progress` | [releases#236](https://github.com/metal-stack/releases/issues/236) | -| [MEP-5](MEP5/README.md) | Shared Networks | `Completed` | | -| [MEP-6](MEP6/README.md) | DMZ Networks | `Completed` | | -| [MEP-7](https://github.com/metal-stack/docs-archive/pull/51) | Passing environment variables to machines | `Declined` | | -| [MEP-8](MEP8/README.md) | Configurable Filesystemlayout | `Completed` | | -| [MEP-9](MEP9/README.md) | No Open Ports To the Data Center | `Completed` | | -| [MEP-10](MEP10/README.md) | SONiC Support | `Completed` | | -| [MEP-11](MEP11/README.md) | Auditing of metal-stack resources | `Completed` | | -| [MEP-12](MEP12/README.md) | Rack Spreading | `Completed` | | -| [MEP-13](MEP13/README.md) | IPv6 | `Completed` | | -| [MEP-14](MEP14/README.md) | Independence from external sources | `Completed` | | -| [MEP-15](https://github.com/metal-stack/docs-archive/pull/232) | HAL Improvements | `In Discussion` | [releases#238](https://github.com/metal-stack/releases/issues/238) | -| [MEP-16](MEP16/README.md) | Firewall Support for Cluster API Provider | `Accepted` | [releases#237](https://github.com/metal-stack/releases/issues/237) | -| [MEP-17](MEP17/README.md) | Global Network View | `In Discussion` | | -| [MEP-18](MEP18/README.md) | Autonomous Control Plane | `In Discussion` | | - -## Proposal Process - -1. 
Before starting a new proposal, it is advised to have a quick chat with one of the maintainers. -2. Create a draft pull request in the [website](https://github.com/metal-stack/website) repository with your proposal. Your proposal doesn't have to be finished at this point. -3. Share the PR in the [metal-stack Slack](https://metal-stack.slack.com/) and invite maintainers to review it. -4. The review itself will probably take place in multiple iterations. Don't be discouraged if your proposal is not accepted right away. The goal is to reach consensus. -5. Once your proposal is accepted, create an umbrella issue in the relevant repository or when multiple repositories are involved in the [releases](https://github.com/metal-stack/releases). -6. Other issues should be created in different repositories and linked to the umbrella issue. -7. Unless stated otherwise, the proposer is responsible for the implementation of the proposal. - -## How to Write a Good MEP - -In the first section of your MEP, start with the current situation and the motivation for the change. Summarize your proposal briefly. - -Next follows the main part: describe your proposal in detail. Which parts of of metal-stack are affected? Are there API changes? If yes, describe them and provide examples here. -Try to think of side effects your proposal might have. Try to provide a view on how your proposal affects users of metal-stack. -Highlight breaking changes and think of a migration path for existing users. If your proposal affects multiple components, try to describe the interaction between them. - -After the main part of your proposal, feel free to add additional sections, e.g. about alternatives that were considered, non-goals or future possibilities. - -Depending on the complexity of your proposal, you might want to add a section about the implementation plan or roadmap. - -You can have a look at the existing MEPs for inspiration. As you will notice: not every MEP has the same structure. 
Feel free to structure your MEP in a way that makes sense for your proposal. diff --git a/versioned_docs/version-v0.22.0/contributing/02-planning-meetings.mdx b/versioned_docs/version-v0.22.0/contributing/02-planning-meetings.mdx deleted file mode 100644 index df10177b..00000000 --- a/versioned_docs/version-v0.22.0/contributing/02-planning-meetings.mdx +++ /dev/null @@ -1,120 +0,0 @@ ---- -slug: /planning-meetings -title: Planning Meetings -sidebar_position: 2 ---- - -# Planning Meetings - -Public planning meetings are held **biweekly** on **odd calendar weeks** from **14:00 to 14:30** (Berlin/Europe timezone) on Microsoft Teams. The purpose is to provide an overview of our current projects and priorities, as well as to discuss new topics and issues within the group. - -export function PlanningMeetingDatesTable() { - const today = new Date(); - const dayOfWeek = today.getDay(); - - let daysUntilMonday = 0; - switch (dayOfWeek) { - case 0: - daysUntilMonday = 1; - break; - case 1: - daysUntilMonday = 0; - break; - default: - daysUntilMonday = 8 - dayOfWeek; - } - - const nextMonday = new Date(); - nextMonday.setDate(nextMonday.getDate() + daysUntilMonday) - - let onejan = new Date(today.getFullYear(), 0, 1); - let week = Math.ceil((((nextMonday.getTime() - onejan.getTime()) / 86400000) + onejan.getDay() + 1) / 7); - - if (week % 2 === 0) { - nextMonday.setDate(nextMonday.getDate() + 7) - } - - const blacklist = [ - new Date('2025-12-29'), - ] - - const amount = 8 - const dates = []; - - for (let i = 0; i < amount; i++) { - const nextDate = new Date(nextMonday); - nextDate.setDate(nextDate.getDate() + (i * 14)) - - if (blacklist.find(item => {return item.toDateString() == nextDate.toDateString()}) !== undefined ) { - continue - } - - dates.push(nextDate.toDateString()) - } - - return ( - - - - - - - - - - {dates.map((date, index) => ( - - - - - - ))} - -
DateTimeLink
{date}14:00 – 14:30Join Link
- ) -} - - - -Our [development planning board](https://github.com/orgs/metal-stack/projects/34) can be found on GitHub. - -[//]: <> (The C025PB1EUKC in the slack url references the #devs channel.) -If you want to get an invitation to the event, please drop us a line on our [Slack channel](https://metal-stack.slack.com/archives/C025PB1EUKC). - -Planning meetings are currently not recorded. The meetings are held either in English or German depending on the attendees. - -:::info -Note that anyone can contribute to metal-stack without participating in planning meetings. However, if you want to speed up the review process for your requirements, it might be helpful to attend the meetings. -::: - -## Agenda - -Here is the agenda that we generally want to follow in a planning meeting: - -- Possibility to bring up news that are interesting for every developer of the metal-stack org -- Check `Done` column and archive cards - - Attendees have the chance to briefly present achievements if they want -- Check the `In Progress` column and discuss whether these tasks are still worked on, there were significant blockers or they can be lower-prioritized -- Check new issues labelled with `triage` and prioritize them -- Allow attendees to bring up issues and prioritize them - - Attendees have the chance to briefly present these new issues - -## Idea Backlog - -The backlog contains ideas of what could become part of the roadmap in the future. The list is ordered alphabetically. Therefore, the order does not express the importance or weight of a backlog item. - -We incorporate community feedback into the roadmap. If you think that important points are missing in the backlog, please share your ideas with us. We have a Slack channel. Please check out [metal-stack.io](https://metal-stack.io) for contact information. - -:::danger -By no means this list is a promise of what is being worked on in the near future. It is just a summary of ideas that was agreed on to be "nice to have". 
It is up to the investors, maintainers and the community to choose topics from this list and to implement them or to remove them from the list. -::: - -- Add metal-stack to [Gardener conformance test grid](https://testgrid.k8s.io/gardener-all) -- Autoscaler for metal control plane components -- CI dashboard and public integration testing -- Improved release and deploy processes (GitOps, [Spinnaker](https://spinnaker.io/), [Flux](https://fluxcd.io/)) -- Machine internet without firewalls -- metal-stack dashboard (UI) -- Offer our metal-stack extensions as enterprise products (accounting, cluster-api, S3) (neither of them will ever be required for running metal-stack, they just add extra value for certain enterprises) -- Partition managed by Kubernetes (with Kubelets joining the control plane cluster) -- Public offering / demo playground diff --git a/versioned_docs/version-v0.22.0/contributing/03-contribution-guideline.md b/versioned_docs/version-v0.22.0/contributing/03-contribution-guideline.md deleted file mode 100644 index 010c2a05..00000000 --- a/versioned_docs/version-v0.22.0/contributing/03-contribution-guideline.md +++ /dev/null @@ -1,147 +0,0 @@ ---- -slug: /contribution-guideline -title: Contribution Guideline -sidebar_position: 3 ---- - -# Contribution Guideline - -This document describes the way we want to contribute code to the projects of metal-stack, which are hosted on [github.com/metal-stack](https://github.com/metal-stack). - -The document is meant to be understood as a general guideline for contributions, but not as burden to be placed on a developer. Use your best judgment when contributing code. Try to be as clean and precise as possible when writing code and try to make your code as maintainable and understandable as possible for other people. - -Even if it should go without saying, we live an open culture of discussion, in which everybody is welcome to participate. 
We treat every contribution with respect and objectiveness with the general aim to write software of quality. - -If you want, feel free to propose changes to this document in a pull request. - -## How Can I Contribute? - -Open a Github issue in the project you would like to contribute. Within the issue, your idea can be discussed. It is also possible to directly create a pull request when the set of changes is relatively small. - -When opening an issue please consider the following aspects: - -1. Create a meaningful issue describing the WHY? of your contribution. -1. Try to set appropriate labels to the issue. For example, attach the `triage` label to your issue if you want it to be discussed in the next [planning meeting](./02-planning-meetings.mdx). It might be useful to attend the meeting if you want to emphasize it being worked on. - -### Pull Requests - -The process described here has several goals: - -- Maintain quality -- Enable a sustainable system to review contributions -- Enable documented and reproducible addition of contributions - -1. Create a repository fork within the context of that issue. Members of the organization may work on the repository directly without a fork, which allows building development artifacts more easily. -1. Develop, document and test your contribution (try not to solve more than one issue in a single pull request). -1. Create a Draft Pull Request to the repository's main branch. -1. Create a meaningful description of the pull request or reference the related issue. The pull request template explains what the content should include, please read it. -1. Ask for merging your contribution by removing the draft marker. Repository maintainers (see [Code Ownership](#code-ownership)) are notified automatically, but you can also reach out to people directly on Slack if you want a review from a specific person. - -## General Objectives - -This section contains language-agnostic topics that all metal-stack projects are trying to follow. 
- -### Code Ownership - -The code base is owned by the entire team and every member is allowed to contribute changes to any of the projects. This is considered as collective code ownership[^1]. - -As a matter of fact, there are persons in a project, which already have experience with the sources. These are defined directly in the repository's [CODEOWNERS](https://docs.github.com/en/repositories/managing-your-repositorys-settings-and-features/customizing-your-repository/about-code-owners) file. If you want to merge changes into the master branch, it is advisable to include code owners into the process of discussion and merging. - -### Microservices - -One major ambition of metal-stack is to follow the idea of [microservices](https://en.wikipedia.org/wiki/Microservices). This way, we want to achieve that we can - -- adapt to changes faster than with monolithic architectures, -- be free of restrictions due to certain choices of technology, -- leverage powerful traits of cloud infrastructures (e.g. high-scalability, high-availability, ...). - -### Programming Languages - -We are generally open to write code in any language that fits best to the function of the software. However, we encourage [golang](https://en.wikipedia.org/wiki/Go_(programming_language)) to be the main language of metal-stack as we think that it makes development faster when not establishing too many different languages in our architecture. Reason for this is that we are striving for consistent behavior of the microservices, similar to what has been described for the Twelve-Factor App (see [12 Factor](https://12factor.net/)). We help enforcing unified behavior by allowing a small layer of shared code for every programming language. We will refer to this shared code as "libraries" for the rest of this document. - -### Artifacts - -Artifacts are always produced by a CI process (Github Actions). - -Docker images are published on the Github Container Registry of the metal-stack organization. 
- -Binary artifacts or OS images can be uploaded to `images.metal-stack.io` if necessary. - -When building Docker images, please consider our build tool [docker-make](https://github.com/fi-ts/docker-make) or the specific [docker-make action](https://github.com/fi-ts/action-docker-make) respectively. - -### APIs - -We are currently making use of [Swagger](https://swagger.io/) when we exposing traditional REST APIs for end-users. This helps us with being technology-agnostic as we can generate clients in almost any language using [go-swagger](https://goswagger.io/). Swagger additionally simplifies the documentation of our APIs. - -Most APIs though are not required to be user-facing but are of technical nature. These are preferred to be implemented using [grpc](https://grpc.io/). - -#### Versioning - -Artifacts are versioned by tagging the respective repository with a tag starting with the letter `v`. After the letter, there stands a valid [semantic version](https://semver.org/). - -### Documentation - -In order to make it easier for others to understand a project, we document general information and usage instructions in a `README.md` in any project. - -In addition to that, we document a microservice in the [docs](https://github.com/metal-stack/docs) repository. The documentation should contain the reasoning why this service exists and why it was being implemented the way it was being implemented. The aim of this procedure is to reduce the time for contributors to comprehend architectural decisions that were made during the process of writing the software and to clarify the general purpose of this service in the entire context of the software. - -## Guidelines - -This chapter describes general guidelines on how to develop and contribute code for a certain programming language. 
- -### Golang - -Development follows the official guide to: - -- Write clear, idiomatic Go code[^2] -- Learn from mistakes that must not be repeated[^3] -- Apply appropriate names to your artifacts: - - [https://go.dev/talks/2014/names.slide](https://go.dev/talks/2014/names.slide) - - [https://go.dev/blog/package-names](https://go.dev/blog/package-names) - - [https://go.dev/doc/effective_go#names](https://go.dev/doc/effective_go#names) -- Enable others to understand the reasoning of non-trivial code sequences by applying a meaningful documentation. - -#### Development Decisions - -- **Dependency Management** by using Go modules -- **Build and Test Automation** by using [GNU Make](https://man7.org/linux/man-pages/man1/make.1p.html). -- **End-user APIs** should consider using go-swagger and [Go-Restful](https://github.com/emicklei/go-restful) - **Technical APIs** should consider using [grpc](https://grpc.io/) - -#### Libraries - -metal-stack maintains several libraries that you should utilize in your project in order to unify common behavior. Some of these projects are: - -- [metal-go](https://github.com/metal-stack/metal-go) -- [metal-lib](https://github.com/metal-stack/metal-lib) - -#### Error Handling with Generated Swagger Clients - -From the server-side you should ensure that you are returning the common error json struct in case of an error as defined in the `metal-lib/httperrors`. Ensure you are using `go-restful >= v2.9.1` and `go-restful-openapi >= v0.13.1` (allows default responses with error codes other than 200). - -### Documentation - -We want to share knowledge and keep things simple. If things cannot kept simple we want to enable everybody to understand them by: - -- Document in short sentences[^4]. -- Do not explain the HOW (this is already documented by your code and documenting the obvious is considered a defect). -- Explain the WHY. Add a "to" in your documentation line to force yourself to explain the reasonning (e.g. "` to `"). 
- -### Python - -Development follows the official guide to: - -- Style Guide for Python Code (PEP 8)[^5] - - The use of an IDE like [PyCharm](https://www.jetbrains.com/pycharm/) helps to write compliant code easily -- Consider [setuptools](https://pythonhosted.org/an_example_pypi_project/setuptools.html) for packaging -- If you want to add a Python microservice to the mix, consider [pyinstaller](https://github.com/pyinstaller/pyinstaller) on Alpine to achieve small image sizes - -[^1]: [https://martinfowler.com/bliki/CodeOwnership.html](https://martinfowler.com/bliki/CodeOwnership.html) - -[^2]: [https://go.dev/doc/effective_go](https://go.dev/doc/effective_go) - -[^3]: [https://github.com/golang/go/wiki/CodeReviewComments](https://github.com/golang/go/wiki/CodeReviewComments) - -[^4]: [https://github.com/golang/go/wiki/CodeReviewComments#comment-sentences](https://github.com/golang/go/wiki/CodeReviewComments#comment-sentences) - -[^5]: [https://www.python.org/dev/peps/pep-0008/](https://www.python.org/dev/peps/pep-0008/) diff --git a/versioned_docs/version-v0.22.0/contributing/04-release-flow.md b/versioned_docs/version-v0.22.0/contributing/04-release-flow.md deleted file mode 100644 index 2a6403b7..00000000 --- a/versioned_docs/version-v0.22.0/contributing/04-release-flow.md +++ /dev/null @@ -1,107 +0,0 @@ ---- -slug: /release-flow -title: Release Flow -sidebar_position: 4 ---- - -# Releases - -The metal-stack contains of many microservices that depend on each other. The automated release flow is there to ensure that all components work together flawlessly for every metal-stack release. - -Releases and integration tests are published through our [release repository](https://github.com/metal-stack/releases). You can also find the [release notes](https://github.com/metal-stack/releases/releases) for this metal-stack version in there. The release notes contain information about new features, upgrade paths and bug fixes. 
- -If you want, you can sign up at our Slack channel where we are announcing every new release. Often, we provide additional information for metal-stack administrators and adopters at this place, too. - -This document is intended for developers, especially maintainers of metal-stack projects. - -## Release Flow - -The following diagram attempts to describe our current release flow: - -![](release_flow.svg) - -A release is created in the following way: - -- Individual repository maintainers within the metal-stack GitHub Organization can publish a release of their component. -- This release is automatically pushed to the `develop` branch of the release repository by the metal-robot. -- A push triggers a virtual release integration test using the mini-lab environment. This setup launches metal-stack with the `sonic` and `gardener` flavors to validate the different Ansible roles and execute basic operations across the metal-stack layer. -- To contribute components that are not directly part of the release vector, a pull request must be made against the `develop` branch of the release repository. Release maintainers may push directly to the `develop` branch. -- The release maintainers can `/freeze` the `develop` branch, effectively stopping the metal-robot from pushing component releases to this branch. -- The `develop` branch is tagged by a release maintainer with a `-rc.x` suffix to create a __release candidate__. -- The release candidate must pass a large integration test suite on a real environment, which is currently run by FI-TS. It tests the entire machine provisioning engine including the integration with Gardener, the deployment, metal-images and Kubernetes conformance tests. -- If the integration tests pass, the PR of the `develop` branch must be approved by at least two release maintainers. -- A release is created via GitHub releases, including all release notes, with a tag on the `main` branch. 
- -## FAQ - -**Question: I need PR #xyz to go into the release, why did you not include it?** - -Answer: It's not on purpose if we miss a PR to be included into a metal-stack release. Please use the pending pull request from `develop` into `master` as soon as it is open and comment which pull request you want to have included into the release. Also consider attending our planning meetings or contact us in our Slack channel if you have urgent requirements that need to be dealt with. - -**Question: Who is responsible for the releases? Who can freeze a release?** - -Answer: Every repository in metal-stack has a `CODEOWNERS` file pointing to a maintainer team. This is also true for the releases repository. Only release repository maintainers are allowed to `/freeze` a release (meaning the metal-robot does not automatically append new component releases to the release vector anymore). - -**Question: I can't push to the `develop` branch of this repository? How can I request changes to the release vector?** - -Answer: Most changes are automatically integrated by the metal-robot. For manually managed components, please raise a pull request against the `develop` branch. Only release maintainers are allowed to push to `develop` as otherwise it would be possible to mess up the release pipeline. - -**Question: What requirements need to be fulfilled to add a repository to the release vector?** - -Please see the section below named [Requirements for Release Vector Repositories](#requirements-for-release-vector-repositories). 
- -### Requirements for Release Vector Repositories - -Before adding a repository in the metal-stack org to the releases repository, it is advised for the maintainer to fulfill the following points: - -- The following files should be present at the repository root: - - [CODEOWNERS](https://docs.github.com/en/repositories/managing-your-repositorys-settings-and-features/customizing-your-repository/about-code-owners) - - When a repository is created, the metal-robot automatically creates a -maintainers team in our GitHub org. - - The CODEOWNERS file should reference this team. - - The team should contain at least two maintainers. - - `LICENSE` - - This usually should be MIT with "metal-stack" as authors. - - `CONTRIBUTING.md` - - This should contain the following content: - ``` - # Contributing - - Please check out the [contributing section](https://docs.metal-stack.io/stable/development/contributing/) in our [docs](https://docs.metal-stack.io/). - ``` - - `README.md` -- The `developers-core` team should be given repository access with `write` role, the codeowners team should have the `maintain` role -- Release artifacts should have an SPDX-formatted SBOM attached. - - For container images these are embedded using Buildx. -- The following branch protection rules should be set: - - The mainline should be protected. - - A pull request should be required before merging (required by at least one code owner). - - Status checks should be required to pass. - - Force push should not be allowed on this branch. -- One person from the releases maintainers has to add the repository to the metal-robot in order to pick up the releases, add them to the release vector and generate release notes. - -### How-To Release a Project - -[release-drafter](https://github.com/release-drafter/release-drafter) is preferred in order to generate release notes from merged PRs for your projects. It should be triggered for pushes on your main branch. 
- -The draft is then used to create a project release. The release has to be published through the GitHub UI as demonstrated in the screenshot below. - -**Tagging the repository is not enough as repository tagging does not associate your release notes to your release!** - -![](release.png) - -Some further remarks: - -- Use semver versions with `v` prefix for your tags -- Name your release after your release tag -- The metal-robot only picks up lines from your release notes that start with `-` or `*` (unordered list items) and appends them to the according section in the aggregated release draft -- A tag created through a GitHub UI release does not trigger a `push` event . This means, your pipeline will not start to run with the `push` trigger when publishing through the UI. - - Instead, use the `published` [release event trigger](https://docs.github.com/en/actions/reference/events-that-trigger-workflows#release) for your actions: - - ```yaml - on: - release: - types: - - published - ``` -- In case they are necessary, please do not forget to include `NOTEWORTHY`, `ACTIONS_REQUIRED` or `BREAKING_CHANGE` sections into releases. More information on those release draft sections can be read in a pull request template. 
diff --git a/versioned_docs/version-v0.22.0/contributing/05-community.md b/versioned_docs/version-v0.22.0/contributing/05-community.md deleted file mode 100644 index 61eaf099..00000000 --- a/versioned_docs/version-v0.22.0/contributing/05-community.md +++ /dev/null @@ -1,11 +0,0 @@ ---- -slug: /community -title: Community -sidebar_position: 5 -draft: true ---- - -# Community - -(Slack channel, community events like FOSDEM, Kubernetes Community Days..., blog -articles) diff --git a/versioned_docs/version-v0.22.0/contributing/release.png b/versioned_docs/version-v0.22.0/contributing/release.png deleted file mode 100644 index 598b1182..00000000 Binary files a/versioned_docs/version-v0.22.0/contributing/release.png and /dev/null differ diff --git a/versioned_docs/version-v0.22.0/contributing/release_flow.drawio b/versioned_docs/version-v0.22.0/contributing/release_flow.drawio deleted file mode 100644 index 6ca6b34f..00000000 --- a/versioned_docs/version-v0.22.0/contributing/release_flow.drawio +++ /dev/null @@ -1,721 +0,0 @@ - - - - - - - - - - - -
-
-
- Review release notes -
-
-
-
- - Review release notes - -
-
-
- - - - - - - -
-
-
- projects -
-
-
-
- - projects - -
-
-
- - - - - - - -
-
-
- projects -
-
-
-
- - projects - -
-
-
- - - - - - - - -
-
-
- Organization Webhook -
-
-
-
- - Organization Webhook - -
-
-
- - - - - - - -
-
-
- projects -
-
-
-
- - projects - -
-
-
- - - - - - - - -
-
-
- - Publish release - -
-
-
-
- - Publish release - -
-
-
- - - - - - - - -
-
-
- Maintainer -
-
-
-
- - Maint... - -
-
-
- - - - - - - - - - - - - - - -
-
-
- metal-robot release handler -
-
-
-
- - metal-robot release han... - -
-
-
- - - - - - - - -
-
-
- - no - -
-
-
-
- - no - -
-
-
- - - - - - - - -
-
-
- - yes - -
-
-
-
- - yes - -
-
-
- - - - - - - -
-
-
- version in event newer than release vector version -
-
-
-
- - version in event newer than... - -
-
-
- - - - - - - -
-
-
- - do nothing - -
-
-
-
- - do nothing - -
-
-
- - - - - - - - - - - - -
-
-
- Github Action -
-
-
-
- - Github Action - -
-
-
- - - - - - - -
-
-
- Bump version in release vector and push to - - develop - -
-
-
-
- - Bump version in release vector... - -
-
-
- - - - - - - - - - - -
-
-
- Open pull request from - - develop - - to - - master - -
-
-
-
- - Open pull request from develop... - -
-
-
- - - - - - - -
-
-
- Update aggregated release draft in - - metal-stack/releases - -
-
-
-
- - Update aggregated release draf... - -
-
-
- - - - - - - - - - - - - - - -
-
-
- Integration Testing -
-
-
-
- - Integration Testing - -
-
-
- - - - - - - - - - - -
-
-
- Merge to - - master - -
-
-
-
- - Merge to master - -
-
-
- - - - - - - - - - - - -
-
-
- Review -
-
-
-
- - Review - -
-
-
- - - - - - - - - - - - - - - -
-
-
- Tests suceeded and PR changes reviewed -
-
-
-
- - Tests suceeded and PR chang... - -
-
-
- - - - - - - -
-
-
- - publish results to #integration - -
-
-
-
- - publish results to #integr... - -
-
-
- - - - - - - - - - - - - - - -
-
-
- Release metal-stack -
-
-
-
- - Release metal-stack - -
-
-
- - - - - - - - - - - -
-
-
- - publish to #announcements - -
-
-
-
- - publish to #announcements - -
-
-
- - - - - - - -
-
-
- - - metal-stack/docs - - pull request - -
-
-
-
- - metal-stack/docs pull requ... - -
-
-
- - - - - - - - - - - - -
-
-
- Freeze -
-
-
-
- - Freeze - -
-
-
- - - - - - - - - - - - - - - -
-
-
- Freeze - - develop - - and create a release candidate -
-
-
-
- - Freeze develop and create a rel... - -
-
-
- - - - - - - -
-
-
- Large integration suites -
- - (currently owned by FI-TS, not public) - -
-
-
-
-
- - Large integration suites... - -
-
-
- - - - - - - - -
-
-
- Run -
-
-
-
- - Run - -
-
-
- - - - -
- - - - - Text is not SVG - cannot display - - - -
diff --git a/versioned_docs/version-v0.22.0/contributing/release_flow.svg b/versioned_docs/version-v0.22.0/contributing/release_flow.svg deleted file mode 100644 index 55cdd493..00000000 --- a/versioned_docs/version-v0.22.0/contributing/release_flow.svg +++ /dev/null @@ -1 +0,0 @@ -
Review release notes
Review release notes
projects
projects
projects
projects
Organization Webhook
Organization Webhook
projects
projects
Publish release
Publish release
Maintainer
Maint...
metal-robot release handler
metal-robot release han...
no
no
yes
yes
version in event newer than release vector version
version in event newer than...
do nothing
do nothing
Github Action
Github Action
Bump version in release vector and push todevelop
Bump version in release vector...
Open pull request fromdeveloptomaster
Open pull request from develop...
Update aggregated release draft inmetal-stack/releases
Update aggregated release draf...
Integration Testing
Integration Testing
Merge tomaster
Merge to master
Review
Review
Tests suceeded and PR changes reviewed
Tests suceeded and PR chang...
publish results to #integration
publish results to #integr...
Release metal-stack
Release metal-stack
publish to #announcements
publish to #announcements
metal-stack/docspull request
metal-stack/docs pull requ...
Freeze
Freeze
Freezedevelopand create a release candidate
Freeze develop and create a rel...
Large integration suites
(currently owned by FI-TS, not public)
Large integration suites...
Run
Run
Text is not SVG - cannot display
\ No newline at end of file diff --git a/versioned_docs/version-v0.22.0/docs/02-General/04-flavors-of-metalstack.md b/versioned_docs/version-v0.22.0/docs/02-General/04-flavors-of-metalstack.md index 7da427fc..2277ca6b 100644 --- a/versioned_docs/version-v0.22.0/docs/02-General/04-flavors-of-metalstack.md +++ b/versioned_docs/version-v0.22.0/docs/02-General/04-flavors-of-metalstack.md @@ -14,7 +14,7 @@ As modern infrastructure and cloud native applications are designed with Kuberne Regardless which flavor of metal-stack you use, it is always possible to manually provision machines, networks and ip addresses. This is the most basic way of using metal-stack and is very similar to how traditional bare metal infrastructures are managed. -Using plain metal-stack without additional layer was not a focus in the past. Therefore firewall and role management might be premature. These will be addressed by [MEP-4](../../contributing/01-Proposals/MEP4/README.md) and [MEP-16](../../contributing/01-Proposals/MEP16/README.md) in the future. +Using plain metal-stack without additional layer was not a focus in the past. Therefore firewall and role management might be premature. These will be addressed by [MEP-4](/community/MEP-4-multi-tenancy-for-the-metal-api) and [MEP-16](/community/MEP-16-metal-api-as-an-alternative-configuration-source-for-the-firewall-controller) in the future. ## Gardener diff --git a/versioned_docs/version-v0.22.0/docs/04-For Operators/03-deployment-guide.mdx b/versioned_docs/version-v0.22.0/docs/04-For Operators/03-deployment-guide.mdx index 58ddafd3..6be800cd 100644 --- a/versioned_docs/version-v0.22.0/docs/04-For Operators/03-deployment-guide.mdx +++ b/versioned_docs/version-v0.22.0/docs/04-For Operators/03-deployment-guide.mdx @@ -31,7 +31,7 @@ You can use the [mini-lab](https://github.com/metal-stack/mini-lab) as a templat The metal control plane is typically deployed in a Kubernetes cluster. 
Therefore, this document will assume that you have a Kubernetes cluster ready for getting deployed. Even though it is theoretically possible to deploy metal-stack without Kubernetes, we strongly advise you to use the described method because we believe that Kubernetes gives you a lot of benefits regarding the stability and maintainability of the application deployment. :::tip -For metal-stack it does not matter where your control plane Kubernetes cluster is located. You can of course use a cluster managed by a hyperscaler. This has the advantage of not having to setup Kubernetes by yourself and could even become beneficial in terms of fail-safe operation. However, we also describe a solution of how to setup metal-stack with a self-hosted, [Autonomous Control Plane](../../contributing/01-Proposals/MEP18/README.md) cluster. The only requirement from metal-stack is that your partitions can establish network connections to the metal control plane. If you are interested, you can find a reasoning behind this deployment decision [here](../05-Concepts/01-architecture.mdx#target-deployment-platforms). +For metal-stack it does not matter where your control plane Kubernetes cluster is located. You can of course use a cluster managed by a hyperscaler. This has the advantage of not having to setup Kubernetes by yourself and could even become beneficial in terms of fail-safe operation. However, we also describe a solution of how to setup metal-stack with a self-hosted, [Autonomous Control Plane](/community/MEP-18-autonomous-control-plane) cluster. The only requirement from metal-stack is that your partitions can establish network connections to the metal control plane. If you are interested, you can find a reasoning behind this deployment decision [here](../05-Concepts/01-architecture.mdx#target-deployment-platforms). 
::: Let's start off with a fresh folder for your deployment: diff --git a/versioned_docs/version-v0.22.0/docs/05-Concepts/01-architecture.mdx b/versioned_docs/version-v0.22.0/docs/05-Concepts/01-architecture.mdx index 709960e3..75298df9 100644 --- a/versioned_docs/version-v0.22.0/docs/05-Concepts/01-architecture.mdx +++ b/versioned_docs/version-v0.22.0/docs/05-Concepts/01-architecture.mdx @@ -152,4 +152,4 @@ Thus, for creating a partition as well as a machine or a firewall, the flags `dn In order to be fully offline resilient, make sure to check out `metal-image-cache-sync`. This component provides copies of `metal-images`, `metal-kernel` and `metal-hammer`. -This feature is related to [MEP14](../../contributing/01-Proposals/MEP14/README.md). +This feature is related to [MEP14](/community/MEP-14-independence-from-external-sources). diff --git a/versioned_docs/version-v0.22.0/docs/05-Concepts/02-user-management.md b/versioned_docs/version-v0.22.0/docs/05-Concepts/02-user-management.md index f1ee2778..ba742ee9 100644 --- a/versioned_docs/version-v0.22.0/docs/05-Concepts/02-user-management.md +++ b/versioned_docs/version-v0.22.0/docs/05-Concepts/02-user-management.md @@ -7,7 +7,7 @@ sidebar_position: 2 # User Management At the moment, metal-stack can more or less be seen as a low-level API that does not scope access based on projects and tenants. -Fine-grained access control with full multi-tenancy support is actively worked on in [MEP4](../../contributing/01-Proposals/MEP4/README.md). +Fine-grained access control with full multi-tenancy support is actively worked on in [MEP4](/community/MEP-4-multi-tenancy-for-the-metal-api). Until then projects and tenants can be created, but have no effect on access control. 
diff --git a/versioned_docs/version-v0.22.0/docs/06-For CISOs/Security/01-principles.md b/versioned_docs/version-v0.22.0/docs/06-For CISOs/Security/01-principles.md index 8e7030f5..e327ec4a 100644 --- a/versioned_docs/version-v0.22.0/docs/06-For CISOs/Security/01-principles.md +++ b/versioned_docs/version-v0.22.0/docs/06-For CISOs/Security/01-principles.md @@ -15,7 +15,7 @@ The minimal need to know principle is a security concept that restricts access t ### RBAC :::info -As of now metal-stack does not implement fine-grained Role-Based Access Control (RBAC) within the `metal-api` but this is worked on in [MEP-4](../../../contributing/01-Proposals/MEP4/README.md). +As of now metal-stack does not implement fine-grained Role-Based Access Control (RBAC) within the `metal-api` but this is worked on in [MEP-4](/community/MEP-4-multi-tenancy-for-the-metal-api). ::: As described in our [User Management](../../05-Concepts/02-user-management.md) concept the [metal-api](https://github.com/metal-stack/metal-api) currently offers three different user roles for authorization: diff --git a/versioned_docs/version-v0.22.0/docs/06-For CISOs/Security/04-communication-matrix.md b/versioned_docs/version-v0.22.0/docs/06-For CISOs/Security/04-communication-matrix.md index 07df2607..24c1bc1d 100644 --- a/versioned_docs/version-v0.22.0/docs/06-For CISOs/Security/04-communication-matrix.md +++ b/versioned_docs/version-v0.22.0/docs/06-For CISOs/Security/04-communication-matrix.md @@ -116,7 +116,7 @@ Please note that every [networking setup](../../05-Concepts/03-Network/01-theory | VLAN | Switches, Firewalls | Layer 2 traffic segmentation. | | VXLAN | Switches, Firewalls | Encapsulate Layer 2 frames in Layer 3 packets for network virtualization. | | EVPN | Switches, Firewalls | Overlay network technology for scalable and flexible network architectures. | -| VPN | Firewalls | Management access [without open SSH ports](../../../contributing/01-Proposals/MEP9/README.md). 
| +| VPN | Firewalls | Management access [without open SSH ports](/community/MEP-9-no-open-ports-to-the-data-center). | | BGP | Multiple | Routing protocol for dynamic routing and network management. | | SSH | Management Server, Switches | Secure shell access for management and configuration. | | LLDP | Switches, Machines | Link Layer Discovery Protocol for network device discovery. | diff --git a/versioned_docs/version-v0.22.0/docs/06-For CISOs/rbac.md b/versioned_docs/version-v0.22.0/docs/06-For CISOs/rbac.md index 9a87b896..06c902bb 100644 --- a/versioned_docs/version-v0.22.0/docs/06-For CISOs/rbac.md +++ b/versioned_docs/version-v0.22.0/docs/06-For CISOs/rbac.md @@ -31,4 +31,4 @@ To ensure that internal components interact securely with the metal-api, metal-s Users can interact with the metal-api using [metalctl](https://github.com/metal-stack/metalctl), the command-line interface provided by metal-stack. Depending on the required operations, users should authenticate with the appropriate role to match their level of access. -As part of [MEP-4](../../contributing/01-Proposals/MEP4/README.md), significant work is underway to introduce more fine-grained access control mechanisms within metal-stack, enhancing the precision and flexibility of permission management. +As part of [MEP-4](/community/MEP-4-multi-tenancy-for-the-metal-api), significant work is underway to introduce more fine-grained access control mechanisms within metal-stack, enhancing the precision and flexibility of permission management. 
diff --git a/versioned_docs/version-v0.22.0/docs/06-For CISOs/remote-access.md b/versioned_docs/version-v0.22.0/docs/06-For CISOs/remote-access.md index 0b8dbb19..dc24e82f 100644 --- a/versioned_docs/version-v0.22.0/docs/06-For CISOs/remote-access.md +++ b/versioned_docs/version-v0.22.0/docs/06-For CISOs/remote-access.md @@ -6,7 +6,7 @@ title: Remote Access ## Machines and Firewalls -Remote access to machines and firewalls is essential for performing administrative tasks such as incident management, troubleshooting and sometimes for development. Standard SSH access is often insufficient for these purposes. In many cases, direct serial console access is required to fully manage the system. metal-stack follows a security-first approach by not offering direct SSH access to machines. This practice reduces the attack surface and prevents unauthorized access that could lead to system damage. Detailed information can be found in [MEP-9](../../contributing/01-Proposals/MEP9/README.md). Administrators can access machines in two primary ways. +Remote access to machines and firewalls is essential for performing administrative tasks such as incident management, troubleshooting and sometimes for development. Standard SSH access is often insufficient for these purposes. In many cases, direct serial console access is required to fully manage the system. metal-stack follows a security-first approach by not offering direct SSH access to machines. This practice reduces the attack surface and prevents unauthorized access that could lead to system damage. Detailed information can be found in [MEP-9](/community/MEP-9-no-open-ports-to-the-data-center). Administrators can access machines in two primary ways. 
**Out-of-band management via SOL** @@ -26,4 +26,4 @@ This approach uses the [`metal-console`](../08-References/Control%20Plane/metal- Both methods ensure secure and controlled access to machines without exposing them unnecessarily to the network, maintaining the integrity and safety of the infrastructure. -Connecting directly to a machine without a clear plan of action can have unintended consequences and negatively impact stability. For this reason, administrative privileges are required. This restriction ensures that only authorized personnel with the necessary expertise can perform actions that affect the underlying infrastructure. These principles will evolve with the introduction of [MEP-4](../../contributing/01-Proposals/MEP4/README.md). \ No newline at end of file +Connecting directly to a machine without a clear plan of action can have unintended consequences and negatively impact stability. For this reason, administrative privileges are required. This restriction ensures that only authorized personnel with the necessary expertise can perform actions that affect the underlying infrastructure. These principles will evolve with the introduction of [MEP-4](/community/MEP-4-multi-tenancy-for-the-metal-api). 
\ No newline at end of file diff --git a/versioned_docs/version-v0.22.1/contributing/01-Proposals/MEP1/Distributed-API-Working.png b/versioned_docs/version-v0.22.1/contributing/01-Proposals/MEP1/Distributed-API-Working.png deleted file mode 100644 index 899e223d..00000000 Binary files a/versioned_docs/version-v0.22.1/contributing/01-Proposals/MEP1/Distributed-API-Working.png and /dev/null differ diff --git a/versioned_docs/version-v0.22.1/contributing/01-Proposals/MEP1/Distributed-API.png b/versioned_docs/version-v0.22.1/contributing/01-Proposals/MEP1/Distributed-API.png deleted file mode 100644 index 688c7c2e..00000000 Binary files a/versioned_docs/version-v0.22.1/contributing/01-Proposals/MEP1/Distributed-API.png and /dev/null differ diff --git a/versioned_docs/version-v0.22.1/contributing/01-Proposals/MEP1/Distributed-Deployment.png b/versioned_docs/version-v0.22.1/contributing/01-Proposals/MEP1/Distributed-Deployment.png deleted file mode 100644 index 8bba51b8..00000000 Binary files a/versioned_docs/version-v0.22.1/contributing/01-Proposals/MEP1/Distributed-Deployment.png and /dev/null differ diff --git a/versioned_docs/version-v0.22.1/contributing/01-Proposals/MEP1/Distributed.drawio b/versioned_docs/version-v0.22.1/contributing/01-Proposals/MEP1/Distributed.drawio deleted file mode 100644 index f7c6fe79..00000000 --- a/versioned_docs/version-v0.22.1/contributing/01-Proposals/MEP1/Distributed.drawio +++ /dev/null @@ -1 +0,0 @@ 
-7V3bcts2EP0aP8pDgPfH2ImbziRTt+m0zVMHpmCJMUUoFG1L/fqC4kUkAEoULwDkRC8WIBIkF2d3z+6C8JV5u9r+kqD18jOZ4+gKGvPtlfn+CtKPY9M/Wc8u7/FtM+9YJOE87wKHji/hf7joNIre53CON40DU0KiNFw3OwMSxzhIG30oSchr87BHEjWvukYLzHV8CVDE9/4dztNl3uvZxqH/Iw4Xy/LKwCh+eUDB0yIhz3FxvStoPu4/+c8rVI5VHL9Zojl5rXWZH67M24SQNP+22t7iKJNtKbb8vLuWX6v7TnCcdjkhWVoPJP5zF3/w/trOvG+vv70+z4rJe0HRMy4fw4noeDePhA5L5YmC/Afn+3N2pzcfcfSC0zBAh67s4dJd1DwuO3+22U/3O3oAgOtt/Qxnkf29R0kapiGJy6vS288vnP9eSK4aHFIhrrOvz6voLkEr+vXmdRmm+Ms6v89XClLat0xXEW0B+nU/QziTgUFb1SRkjYCswqD4HqEHHN1UU3pLIpLQn2ISZ9fYpAl5qvABise7Q6swymD/F07mKEZFd4FxkA2LonAR00ZAJwkn1RO94CTF29aJBBU8qNphssJpsqOHFCfMbLeAVKFywC0x+VpDcNm3rKHX9opOVGjNohr9gBz6pQDPGUACkEPSBif0OWkfHGEayyfzMqlWj2QaI4kUcCK1eJGajkikYASRrrboX/K79Qh+vYe77+Ef8RMBM4+TaECCp4SgYClDoJz0BDJuF6jFCNQ0O8oTGiPI055D4NsPc8+Yo0cAwMy0OJHhOfUDRZMk6ZIsSIyiD4dexnTUZDdHm+W+H3SwHNTE3YXZne5HwfH8Xea1souucZz3NH+v24+OZqZ1xrKHPDpfCY5QGr40naFI9PtT6a2jXe2ANQnjdFMb+T7rOMDAAoxaGdBvOqkzT6Bf8nsQn256LadXd7whz0mAi9MYQFVi6a+zrsCfDsTd4ZhPhKwL0H3DaborEICeU9LE5x50JcyCCG02mZ9rYBEcQ00upCOPWdAGOt4Cp0eOc8ZG4SCDypOdmkE1ADdTpVGlPGFNtTkTUuXQI/yYNTfUvobx4tO+9d50xrKeVhPHlsDBAyiwns5Uzsg5Krt2Dy9fdpUMVMgOuCh4QLZredg2fedhBji5MQT7bObckZJzBNt498OQ7HKm3Wm4jW0zXsbmEWaL6LdlTqWegLdesv1MC+Hp72T8SSgMxxkoN2yhquUYuZubjDP4nImgI6JohtahRmbVYsxqlcBR5pLKkPMtYb4U6uSgh23xmSTQlw9aRz3apJmNT5FGsIeedrA3j1ExzfMC0G6BnZS0gFieloCivcGYDXQN2oBeURu4mLCNtRXqozZwMWEbSy80kJ0ol6ModLv5GbqNgjIuza9B5OYp9zbjs1hJoRub7qVs4tqofWBzwKkp7UUEcmx6TD2hrQrkb0gDJqq/8PUSnk8r1IAKjXoHdWx2XQMVgAKuwerEoXLIhAd8dw3neBum/2R4vva8ovl137SL1vttgfZ9Y1dr3OMkpM+X+eWiNkmfNR8LOGW7NljWPIy2b+3qLXa8TurVllE/Hca4HVWw4fv5SS/7hmZc2KyxYzNoailNCkYymJHY2HhqNb/kDAS3MgFUpFBdDgL+IDkI2DUHAfXKQcCLyUFwpWMABSuZJHu3i8lCcMVjHaR3Mg8hbY3mzxLyVCVkv3Miwp0MZ9omIg4UtuSsX2vkVsxfB/goVXXnAxGRxeMuImHBEzZDoCxybXKZDdRPV/rj3pSUsuBKz9Jxb15GmoKiTD/g84mKywn9gK9f5GfysbRy0zJF5FcuwD8Z+Zn22GZo2PzwkbmmkV/1opk+oYt5PGzWKPCzWFOrgfD4qPln/fnC4z4AtAv7LNEKddYB/Zilh6PpmNOOrGsKU1H9AVg+g6neBQhQJR0lMXhLVIGIN4QCVhuXwr9TKqTvIm2fzKdYGr6f1vqiZKXxdkPhT6h7f822Or/VZnXU7IE
qa9sN/Hjsy9tTKxmfHvoJlrPB4koCi8nSf8Pyr53Dx5WLHZ75o3V4nacX6ewFT9ch0chWM6badQSW2pVpqUvd11DXpGbj7dELwS3qw754LjspafPhnobJeMC9YK88Jeko8Uo1j+Oe5XL0UzGna0RTsu7JKwSA8WU+u8fKxLs4OKauxrd1lk/zEElvFtpmv7k7OdCe0Hjm4WNLtc8Onwiu7POMzn2WW9DGTHM1U19Q6QCmVDMtWgS0D9mv12WmcfZwiiHS50+bWjOCtNglU1WgVRMWFMXpk70U4vBxOi8spERYM2hoJy1tV64McMqSVqFwhQ/ZvNfhsww6FuNN/RahlHekcxKUyxSrz4G6auOFqtGtejEtKZS31o0tfPVlhTPTsBlEWdkrTwda6HQyX+fuZMc/QQHa9hvltrLzGmc0t7IbbQM6vjKSJd6Uswb2VU31rMG9JELP5l3U83lXSYJSiRmdPPdosablaKDb1VTyywfdPgH0uZaSf5rjhpJbBi3HTvLhaNNOqglF2bWxUs2keGNnTk7Vvs7ti9+02dfZZsEld19noUQhJ1EVlvSsFZ+MBTwZ0gqfu2AmdVIqPM4aaGQHTc7RV1tHXu45ENoMvxRuZjJZRNo+c5KWew4SHrcBgHLRqdkEY0TtFhSRhMf5KrUb8gost1bYY3WK1NnJNxdUNT181nuaEvi4hhf4ULn1UJvTOrcG3r++PYoy+BehDNLy4sO0wWTLtOq1QZMdPUdELOjKnZUittxtUpkVgr2sEKjZoNKSyTBDnSd1aEDUK43DRheGb9cBcvL4MhvZdrx7/PjBWZ+jIq9ZBmo4E7KlkqHgNUH+2tGpF1rVcw4otfwoliX//6mUqH/FJdzuZKI3cxlT/btycqX5EMGmlhdKNaESrtl5lod67l5GnjPC7P+QZIuaFjx+xkRmmw8ML8Jss2km9Ua73GjuMhIgvWz7iMor2q7uCEDHXzbB/vMJg90zcrzFWWIB6OHjVUwpHDqlw/SUf39Kx1QYYMtr6oN/qDoI7ctPFJm4zpnhiUycrdrECZLOGubZ9FO0Mu/3hnxD18SwWtdwZNe+KdatjVws8UTAtaQCF7414JYb2p0mNbZK5Ir23dMXuRy38UT/JmAk5NJmQrLtlw6uLUHr5Wcyx9kR/wM= \ No newline at end of file diff --git a/versioned_docs/version-v0.22.1/contributing/01-Proposals/MEP1/Distributed.png b/versioned_docs/version-v0.22.1/contributing/01-Proposals/MEP1/Distributed.png deleted file mode 100644 index d96ca216..00000000 Binary files a/versioned_docs/version-v0.22.1/contributing/01-Proposals/MEP1/Distributed.png and /dev/null differ diff --git a/versioned_docs/version-v0.22.1/contributing/01-Proposals/MEP1/README.md b/versioned_docs/version-v0.22.1/contributing/01-Proposals/MEP1/README.md deleted file mode 100644 index 0fd4bb63..00000000 --- a/versioned_docs/version-v0.22.1/contributing/01-Proposals/MEP1/README.md +++ /dev/null @@ -1,141 +0,0 @@ ---- -slug: /MEP-1-distributed-metal-control-plane -title: MEP-1 -sidebar_position: 1 ---- - -# Distributed Metal Control Plane - -This enhancement 
proposal was replaced by [MEP18](../MEP18/README.md). - -## Problem Statement - -We face the situation that we argue for running bare metal on-premises because this way the customers can control where and how their software and data are processed and stored. -On the other hand, we have currently decided that our metal-api control plane components run on a kubernetes cluster (in our case on a cluster provided by one of the available hyperscalers). - -Running the control plane on Kubernetes has the following benefits: - -- Ease of deployment -- Get most, if not all, of the required infrastructure services like (probably incomplete): - - IPs - - DNS - - L7-Loadbalancing - - Storage - - S3 Backup - - High Availability - -Using a kubernetes as a service offering from one of the hyperscalers, enables us to focus on using kubernetes instead of maintaining it as well. - -## Goal - -It would be much saner if metal-stack has no, or only minimal dependencies to external services. Imagine a metal-stack deployment in a plant, it would be optimal if we only have to deliver a single rack with servers and networking gear installed and wired, plug that rack to the power supply and a internet uplink and its ready to go. - -Have a second plant which you want to be part of all your plants? Just tell both that they are part of something bigger and metal-api knows of two partitions. - -## Possible Solutions - -We can think of two different solutions to this vision: - -1. Keep the central control plane approach and require some sort of kubernetes deployment accessible from the internet. This has the downside that the user must, provide a managed kubernetes deployment in his own datacenter or uses a hyperscaler. Still not optimal. -1. Install the metal-api and all its dependencies in every partition, replicate or shard the databases to every connected partition, make them know each other. 
Connect the partitions over the internet with some sort of vpn to make the services visible to each other. - -As we can see, the first approach does not really address the problem, therefore i will describe solution #2 in more details. - -## Central/Current setup - -### Stateful services - -Every distributed system suffer from handling state in a scalable, fast and correct way. To start how to cope with the state, we first must identify which state can be seen as partition local only and which state must be synchronous for read, and synchronous for writes across partitions. - -Affected states: - -- masterdata: e.g. tenant and project must be present in every partition, but these are entities which are read often but updates are rare. A write can therefore be visible with a decent delay in a distinct partition with no consequences. -- ipam: the prefixes and ip´s allocated from machines. These entities are also read often and rare updates. But we must differentiate between dirty reads for different types. A machine network is partition local, ips acquired from such a network must by synchronous in the same partition. Ips acquired from global networks such as internet must by synchronous for all partitions, as otherwise a internet ip could be acquired twice. -- vrf ids: they must only be unique in one partition -- image and size configurations: read often, written seldom, so no high requirements on the storage of these entities. -- images: os images are already replicated from a central s3 storage to a per partition s3 service. metal-hammer kernel and initrd are small and pull always from the central s3, can be done similar to os images. -- machine and machine allocation: must be only synchronous in the partition -- switch: must be only synchronous in the partition -- nsq messages: do not need to cross partition boundaries. No need to keep the messages persistent, even the opposite is true, we don't want to have the messages persist for a longer period. 
- -Now we can see that the most critical state to held and synchronize are the IPAM data, because these entities must be guaranteed to be synchronously updated, while being updated frequently. - -Datastores: - -We use three different types of datastores to persist the states of the metal application. - -- rethinkdb is the main datastore for almost all entities managed by metal-api -- postgresql is used for masterdata and ipam data. -- nsq uses disk and memory tho store the messages. - -### Stateless services - -These are the easy part, all of our services which are stateless can be scaled up and down without any impact on functionality. Even the stateful services like masterdata and metal-api rely fully on the underlying datastore and can therefore also be scaled up and down to meet scalability requirements. - -Albeit, most of these services need to be placed behind a loadbalancer which does the L4/L7 balancing across the started/available replicas of the service for the clients talking to it. This is actually provided by kubernetes with either service type loadbalancer or type clusterip. - -One exception is the `metal-console` service which must have the partition in it´s dns name now, because there is no direct network connectivity between the management networks of the partitions. See "Network Setup) - -## Distributed setup - -### State - -In order to replicate certain data which must be available across all partitions we can use on of the existing open source databases which enable such kind of setup. There are a few available out there, the following incomplete list will highlight the pro´s and cons of each. - -- RethinkDB - - We already store most of our data in RethinkDB and it gives already the ability to synchronize the data in a distributed manner with different guarantees for consistency and latency. This is described here: [Scaling, Sharding and replication](https://rethinkdb.com/docs/sharding-and-replication/). 
But because rethinkdb has a rough history and unsure future with the last release took more than a year, we in the team already thought that we eventually must move away from rethinkdb in the future. - -- Postgresql - - Postgres does not have a multi datacenter with replication in both directions, it just can make the remote instance store the same data. - -- CockroachDB - - Is a Postgresql compatible database engine on the wire. CockroachDB gives you both, ACID and geo replication with writes allowed from all connected members. It is even possible to configure [Follow the Workload](https://www.cockroachlabs.com/docs/stable/topology-follow-the-workload) and [Geo Partitioning and Replication](https://www.cockroachlabs.com/docs/v19.2/topology-geo-partitioned-replicas). - -If we migrate all metal-api entities to be stored the same way we store masterdata, we could use cockroachdb to store all metal entities in one ore more databases spread across all partitions and still ensure consistency and high availability. - -A simple setup how this would look like is shown here. - -![Simple CockroachDB setup](Distributed.png) - -go-ipam was modified in a example PR here: [PR 17](https://github.com/metal-stack/go-ipam/pull/17) - -### API Access - -In order to make the metal-api accessible for api users like `cloud-api` or `metalctl` as easy at it is today, some effort has to be taken. One possible approach would be to use a external loadbalancer which spread the requests evenly to all metal-api endpoints in all partitions. Because all data are accessible from all partitions, a api request going to partition A with a request to create a machine in partition B, will still work. If on the other hand partition B is not in a connected state because the interconnection between both partitions is broken, then of course the request will fail. 
- -**IMPORTANT** -The NSQ Message to inform `metal-core` must end in the correct partition - -To provide such a external loadbalancer we have several opportunities: - -- Cloudflare or comparable CDN service. -- BGP Anycast from every partition - -Another setup would place a small gateway behind the metal-api address, which forwards to the metal-api in the partition where the request must be executed. This gateway, `metal-api-router` must inspect the payload, extract the desired partition, and forward the request without any modifications to the metal-api endpoint in this partition. This can be done for all requests, or if we want to optimize, only for write accesses. - -## Network setup - -In order to have the impact to the overall security concept as minimal as possible i would not modify the current network setup. The only modifications which has to be made are: - -- Allow https ingress traffic to all metal-api instances. -- Allow ssh ingress traffic to all metal-console instances. -- Allow CockroachDB Replication between all partitions. -- No NSQ traffic from outside required anymore, except we cant solve the topic above. - -A simple setup how this would look like is shown here, this does not work though because of the forementioned NSQ issue. - -![API and Console Access](Distributed-API.png) - -Therefore we need the `metal-api-router`: - -![Working API and Console Access](Distributed-API-Working.png) - -## Deployment - -The deployment of our components will substantially differ in a partition compared to a the deployment we have actually. Deploying it in kubernetes in the partition would be very difficult to achieve because we have no sane way to deploy kubernetes on physical machines without a underlying API. -I would therefore suggest to deploy our components in the same way we do that for the services running on the management server. Use systemd to start docker containers. 
- -![Deployment](Distributed-Deployment.png) diff --git a/versioned_docs/version-v0.22.1/contributing/01-Proposals/MEP10/README.md b/versioned_docs/version-v0.22.1/contributing/01-Proposals/MEP10/README.md deleted file mode 100644 index 6811cdc0..00000000 --- a/versioned_docs/version-v0.22.1/contributing/01-Proposals/MEP10/README.md +++ /dev/null @@ -1,197 +0,0 @@ ---- -slug: /MEP-10-sonic-support -title: MEP-10 -sidebar_position: 10 ---- - -# SONiC Support - -As writing this proposal, metal-stack only supports Cumulus on Broadcom ASICs. Unfortunately, after the acquisition of -Cumulus Networks by Nvidia, Broadcom decided to cut its relationship with Cumulus, and therefore Cumulus 4.2 is the last -version that supports Broadcom ASICs. Since trashing the existing hardware is not a solution, adding support for a -different network operating system is necessary. - -One of the remaining big players is [SONiC](https://sonic-net.github.io/SONiC/), which Microsoft created to scale the -network of Azure. It's an open-source project and is now part of the [Linux Foundation](https://www.linuxfoundation.org/press/press-release/software-for-open-networking-in-the-cloud-sonic-moves-to-the-linux-foundation). - -For a general introduction to SONiC, please follow the [Architecture](https://github.com/sonic-net/SONiC/wiki/Architecture) official -documentation. - -## ConfigDB - -On a cold start, the content of `/etc/sonic/config_db.json` will be loaded into the Redis database `CONFIG_DB`, and both -contain the switch's configuration except the BGP unnumbered configuration, which still has to be configured directly by -the frr configuration files. The SONiC community is working to remove this exception, but no release date is known. - -## BGP Configuration - -Frr runs inside a container, and a shell script configured it on the container startup. 
For BGP unnumbered, we must set -the configuration variable `docker_routing_config_mode` to `split` to prevent SONiC from overwriting our configuration -files created by `metal-core`. But by using the split mode, the integrated configuration mode of frr is deactivated, and -we have to write our BGP configuration to the daemon-specific files `bgp.conf`, `staticd.conf`, and `zebra.conf` instead -to `frr.conf`. - -```bash -elif [ "$CONFIG_TYPE" == "split" ]; then - echo "no service integrated-vtysh-config" > /etc/frr/vtysh.conf - rm -f /etc/frr/frr.conf -``` - -Reference: [docker-init](https://github.com/sonic-net/sonic-buildimage/blob/202205/dockers/docker-fpm-frr/docker_init.sh#L69) - -Adding support for the integrated configuration mode, we must at least adjust the startup shell script and the supervisor configuration: - -```bash -{% if DEVICE_METADATA.localhost.docker_routing_config_mode is defined and DEVICE_METADATA.localhost.docker_routing_config_mode == "unified" %} -[program:vtysh_b] -command=/usr/bin/vtysh -b -``` - -Reference: [supervisord.conf](https://github.com/sonic-net/sonic-buildimage/blob/202205/dockers/docker-fpm-frr/frr/supervisord/supervisord.conf.j2#L157) - -## Non-BGP Configuration - -For the Non-BGP configuration we have to write it into the Redis database directly or via one of the following interfaces: - -- `config replace ` -- the Mgmt Framework -- the SONiC restapi - -Directly writing into the Redis database isn't a stable interface, and we must determine the create, delete, and update -operations on our own. The last point is also valid for the Mgmt Framework and the SONiC restapi. Furthermore, the -Mgmt Framework doesn't start anymore for several months, and a [potential fix](https://github.com/sonic-net/sonic-buildimage/pull/10893) -is still not merged. And the SONiC restapi isn't enabled by default, and we must build and maintain our own SONiC images. 
- -Using `config replace` would reduce the complexity in the `metal-core` codebase because we don't have to determine the -actual changes between the running and the desired configuration. The approach's drawbacks are using a version of SONiC -that contains the PR [Yang support for VXLAN](https://github.com/sonic-net/sonic-buildimage/pull/7294), and we must provide -the whole new startup configuration to prevent unwanted deconfiguration. - -### Configure Loopback interface and activate VXLAN - -```json -{ - "LOOPBACK_INTERFACE": { - "Loopback0": {}, - "Loopback0|": {} - }, - "VXLAN_TUNNEL": { - "vtep": { - "src_ip": "" - } - } -} -``` - -#### Configure MTU - -```json -{ - "PORT": { - "Ethernet0": { - "mtu": "9000" - } - } -} -``` - -#### Configure PXE Vlan - -```json -{ - "VLAN": { - "Vlan4000": { - "vlanid": "4000" - } - }, - "VLAN_INTERFACE": { - "Vlan4000": {}, - "Vlan4000|": {} - }, - "VLAN_MEMBER": { - "Vlan4000|": { - "tagging_mode": "untagged" - } - }, - "VXLAN_TUNNEL_MAP": { - "vtep|map_104000_Vlan4000": { - "vlan": "Vlan4000", - "vni": "104000" - } - } -} -``` - -#### Configure VRF - -```json -{ - "INTERFACE": { - "Ethernet0": { - "vrf_name": "vrf104001" - } - }, - "VLAN": { - "Vlan4001": { - "vlanid": "4001" - } - }, - "VLAN_INTERFACE": { - "Vlan4001": { - "vrf_name": "vrf104001" - } - }, - "VRF": { - "vrf104001": { - "vni": "104001" - } - }, - "VXLAN_TUNNEL_MAP": { - "vtep|map_104001_Vlan4001": { - "vlan": "Vlan4001", - "vni": "104001" - } - } -} -``` - -## DHCP Relay - -The DHCP relay container only starts if `DEVICE_METADATA.localhost.type` is equal to `ToRRouter`. - -## LLDP - -SONiC always uses the local port subtype for LLDP and sets it to some freely configurable alias field of the interface. - -```python -# Get the port alias. If None or empty string, use port name instead -port_alias = port_table_dict.get("alias") -if not port_alias: - self.log_info("Unable to retrieve port alias for port '{}'. 
Using port name instead.".format(port_name)) - port_alias = port_name - -lldpcli_cmd = "lldpcli configure ports {0} lldp portidsubtype local {1}".format(port_name, port_alias) -``` - -Reference: [lldpmgr](https://github.com/sonic-net/sonic-buildimage/blob/202205/dockers/docker-lldp/lldpmgrd#L153) - -## Mgmt Interface - -The mgmt interface is `eth0`. To configure a static IP address and activate the Mgmt VRF, use: - -```json -{ - "MGMT_INTERFACE": { - "eth0|": { - "gwaddr": "" - } - }, - "MGMT_VRF_CONFIG": { - "vrf_global": { - "mgmtVrfEnabled": "true" - } - } -} -``` - -[IP forwarding is deactivated on `eth0`](https://github.com/sonic-net/sonic-buildimage/blob/202205/files/image_config/sysctl/sysctl-net.conf#L7), and no IP Masquerade is configured. diff --git a/versioned_docs/version-v0.22.1/contributing/01-Proposals/MEP11/README.md b/versioned_docs/version-v0.22.1/contributing/01-Proposals/MEP11/README.md deleted file mode 100644 index 87f48a10..00000000 --- a/versioned_docs/version-v0.22.1/contributing/01-Proposals/MEP11/README.md +++ /dev/null @@ -1,78 +0,0 @@ ---- -slug: /MEP-11-auditing-of-metal-stack-resources -title: MEP-11 -sidebar_position: 11 ---- - -# Auditing of metal-stack resources - -Currently no logs of the ownership of resources like machines, networks, ips and volumes are generated or kept. Though due to legal requirements data centers are required to keep track of this ownership over time to prevent liability issues when opening the platform for external users. - -In this proposal we want to introduce a flexible and low-maintenance approach for auditing on top of [Meilisearch](https://www.meilisearch.com/). - -## Overview - -In general our auditing logs will be collected by a request interceptor or middleware. Every request and response will be processed and eventually logged to Meilisearch. -Meilisearch will be configured to regularly create chunks of the auditing logs. 
These finished chunks will be backed up to a S3 compatible storage with a read-only option enabled. - -Of course sensitive data like session keys or passwords will be redacted before logging. We want to track relevant requests and responses. If auditing the request fails, the request itself will be aborted and will not be processed further. The requests and responses that will be audited will be annotated with a correlation id. - -Transferring the meilisearch auditing data chunks to the S3 compatible storage will be done by a sidecar cronjob that is executed periodically. -To avoid data manipulation the S3 compatible storage will be configured to be read-only. - -## Whitelisting - -To reduce the amount of unnecessary logs we want to introduce a whitelist of resources and operations on those that should be logged. -Other requests will be passed directly to the next middleware or web service without any further processing. - -As we are only interested in mutating endpoints, we ignore all `GET` requests. -The whitelist includes all `POST`, `PUT`, `PATCH` and `DELETE` endpoints of the HTTP middleware except for the following (non-manipulating) route suffixes: - -- `/find` -- `/notify` -- `/try` and `/match` -- `/capacity` -- `/from-hardware` - -Regarding GRPC audit trails, they are not so interesting because only internal clients are using this API. However, we can log the trails of the `Boot` service, which can be interesting to revise the machine lifecycle. - -## Chunking in Meilisearch - -We want our data to be chunked in Meilisearch. To accomplish this, we rotate the index identifier on a scheduled basis. The index identifiers will be derived from the current date and time. - -To keep things simple, we only support hourly, daily and monthly rotation. The eventually prefixed index names will only include relevant parts of date and time like `2021-01`, `2021-01-01` or `2021-01-01_13`. 
- -The metal-api will only write to the current index and switches to the new index on rotation. The metal-api will never read or update data in any indices. - -## Moving chunks to S3 compatible storage - -As Meilisearch will be filled with data over time, we want to move completed chunks to a S3 compatible storage. This will be done by a sidecar cronjob that is executed periodically. Note that the periods of the index rotation and the cronjob execution don't have to match. - -When the backup process gets started, it initiates a [Meilisearch dump](https://www.meilisearch.com/docs/learn/advanced/dumps) of the whole database across all indices. Once the returned task is finished, the dump must be copied from a Meilisearch volume to the S3 compatible storage. After a successful copy, the dump can be deleted. - -Now we want to remove all indices from Meilisearch, except the most recent one. For this, we [get all indices](https://www.meilisearch.com/docs/reference/api/indexes#list-all-indexes), sort them and [delete each index](https://www.meilisearch.com/docs/reference/api/indexes#delete-an-index) except the most recent one to avoid data loss. - -For the actual implementation, we can build upon [backup-restore-sidecar](https://github.com/metal-stack/backup-restore-sidecar). But due to the index rotation and the fact, that older indices need to be deleted, this probably does not fit into the mentioned sidecar. - -## S3 compatible storage - -The dumps of chunks should automatically deleted after a certain amount of time, once we are either no longer allowed or required to keep them. -The default retention time will be 6 months. Ideally already uploaded chunks should be read-only to prevent data manipulation. - -A candidate for the S3 compatible storage is Google Cloud Storage, which allows to configure automatic expiration of objects through a [lifecycle rule](https://cloud.google.com/storage/docs/managing-lifecycles?hl=en#storage-set-lifecycle-config-go). 
- -## Affected components - -- metal-api grpc server needs an auditing interceptor -- metal-api web server needs an auditing filter chain / middleware -- metal-api needs new command line arguments to configure the auditing -- mini-lab needs a Meilisearch instance -- mini-lab may need a local S3 compatible storage -- we need a sidecar to implement the backup to S3 compatible storage -- Consider auditing of volume allocations and freeings outside of metal-stack - -## Alternatives considered - -Instead of using Meilisearch we investigated using an immutable database like [immudb](https://immudb.io/). But immudb does not support chunking of data and due to its immutable nature, we will never be able to free up space of expired data. Even if we are legally allowed or required to delete data, we will not be able to do so with immudb. - -In another variant of the Meilisearch approach the metal-api would also be responsible for copying chunks to the S3 compatible storage and deleting old indices. But separating the concerns allows completely different implementations for every deployment stage. diff --git a/versioned_docs/version-v0.22.1/contributing/01-Proposals/MEP12/README.md b/versioned_docs/version-v0.22.1/contributing/01-Proposals/MEP12/README.md deleted file mode 100644 index 65532c57..00000000 --- a/versioned_docs/version-v0.22.1/contributing/01-Proposals/MEP12/README.md +++ /dev/null @@ -1,34 +0,0 @@ ---- -slug: /MEP-12-rack-spreading -title: MEP-12 -sidebar_position: 12 ---- - -# Rack Spreading - -Currently, when creating a machine through the metal-api, the machine is placed randomly inside a partition. This algorithm does not consider spreading machines across different racks and different chassis. This may lead to the situation that a group of machines (that for example form a cluster) can end up being placed in the same rack and the same chassis. 
- -Spreading a group of machines across racks can enhance availability for scenarios like a rack losing power or a chassis meltdown. - -So, instead of just randomly deciding the placement of a machine candidate, we want to propose a placement strategy that attempts to spread machine candidates across the racks inside a partition. - -Furthermore a followup improvement to guarantee that machines are really spread across multiple racks, even if multiple machines are ordered in parallel, was implemented with [PR490](https://github.com/metal-stack/metal-api/pull/490). - -## Placement Strategy - -Machines in the project are spread across all available racks evenly within a partition (best effort). For this, an additional request to the datastore has to be made in order to find allocated machines within the project in the partition. - -The algorithm will then figure out the least occupied racks and elect a machine candidate randomly from those racks. - -The user can optionally pass placement tags which will be considered for spreading the machines as well (this will for example allow spreading by a cluster id tag inside the same project). - -## API - -```golang -// service/v1/machine.go - -type MachineAllocation struct { - // existing fields are omitted for readability - PlacementTags []string `json:"placement_tags" description:"by default machines are spread across the racks inside a partition for every project. 
- if placement tags are provided, the machine candidate has an additional anti-affinity to other machines having the same tags"` -} -``` diff --git a/versioned_docs/version-v0.22.1/contributing/01-Proposals/MEP13/README.md b/versioned_docs/version-v0.22.1/contributing/01-Proposals/MEP13/README.md deleted file mode 100644 index 2dde20f5..00000000 --- a/versioned_docs/version-v0.22.1/contributing/01-Proposals/MEP13/README.md +++ /dev/null @@ -1,111 +0,0 @@ ---- -slug: /MEP-13-dual-stack-support -title: MEP-13 -sidebar_position: 13 ---- - -# Dual-stack Support - -Dual-stack support is required to be able to create Kubernetes clusters with either IPv6 single-stack or dual-stack enabled. -With the inherent scarcity of IPv4 addresses, the need to be able to use IPv6 has increased. - -Full IPv6 dual-stack support was added to Kubernetes with v1.23 as stable. - -Gardener has had full IPv6 dual-stack support since `v1.109`. - -metal-stack manages CIDRs and IP addresses with the [go-ipam](https://github.com/metal-stack/go-ipam) library, which already got full IPv6 support in 2021 (see [https://metal-stack.io/blog/2021/02/ipv6-part1](https://metal-stack.io/blog/2021/02/ipv6-part1)). -But this was only the foundation; more work needs to be done to get full IPv6 support for all aspects managed by metal-stack.io. - -## General Decisions - -For the general decision we do not look at the isolated clusters feature for now as this would make the solution even more complex and we want to introduce IPv6 in smaller steps to the users. - -### Networks - -Currently, metal-stack organizes CIDRs / prefixes into a `network` resource in the metal-api. A network can consist of multiple CIDRs from the same address family. For example, if an operator wants to provide Internet connectivity to provisioned machines, they can start with small network CIDRs. The number of managed network prefixes can then be expanded as needed over time. 
- -With dual-stack we have to choose between two options: Network per address family or networks with both address families. These options are described in the next section. - -#### Network per Address Family - -This means that we allow networks with CIDRs from one address family only, one for IPv4 and one for IPv6. - -The machine creation process will not change if the machine only needs to be either IPv4 or IPv6 addressable. -But if, on the other hand, the machine needs to be able to connect to both address families, the machine creation needs to specify two networks, one for IPv4 and one for IPv6. -Also there will be 2 distinct VRF IDs for every network with a different address family. - -#### Network with both Address Families - -Make a network dual address family capable, meaning that you can add multiple CIDRs from both address families to a network. -Then the machine creation will remain the same for single-stack and dual-stack cases, but the IP address allocation will need to specify the address family from which to allocate an IP address when the network is dual-stack. -This does not break the existing API, but allows existing extensions to easily add dual-stack support. -To avoid additional checking of which address families are available on this network during an IP allocation call, we could store the address families in the network. - -#### Decision - -The decision was made to go with having both address families in a single network entity because we think this is the most flexible way to support dual-stack machines and Kubernetes clusters as well as single-stack with the least amount of modifications on the networking side. 
- -### Examples - -To illustrate the usage we start by creating a tenant super network which has both address families: - -```yaml ---- -id: tenant-super-network-mini-lab -name: Project Super Network -description: Super network of all project networks -partitionid: mini-lab -prefixes: - - 10.0.0.0/16 - - 2001:db8:0:10::/64 -defaultchildprefixlength: - IPv4: 22 - IPv6: 96 -privatesuper: true -``` - -In order to create this network, we simply call: - -```bash -metalctl network create -f tenant-super.yaml -``` - -This is usually done during the initial setup of the environment. - -The next step is to allocate a tenant network where the machines of a project can be placed: - -```bash -metalctl network allocate --partition mini-lab --project 4b9b17c4-2d7c-4190-ae95-dda44e430fa6 --name my-node-network -``` - -This leads to the following network allocation: - -```yaml -id: 2d2c0350-3f66-4597-ae97-ef6797232212 -name: my-node-network -parentnetworkid: tenant-super-network-mini-lab -partitionid: mini-lab -prefixes: - - 10.0.0.0/22 - - 2001:db8:0:10::/96 -projectid: 4b9b17c4-2d7c-4190-ae95-dda44e430fa6 -vrf: 20 -consumption: - ipv4: - available_ips: 1024 - available_prefixes: 256 - used_ips: 2 - used_prefixes: 0 - ipv6: - available_ips: 2147483647 - available_prefixes: 1073741824 - used_ips: 1 - used_prefixes: 0 -privatesuper: false -``` - -Users can then create IP addresses from these child networks. By default, they retrieve an IPv4 address unless a super network only consists of IPv6 prefixes. In the latter case the users acquire an IPv6 address. 
- -```bash -metalctl network ip create --network 2d2c0350-3f66-4597-ae97-ef6797232212 --project 4b9b17c4-2d7c-4190-ae95-dda44e430fa6 -``` diff --git a/versioned_docs/version-v0.22.1/contributing/01-Proposals/MEP14/README.md b/versioned_docs/version-v0.22.1/contributing/01-Proposals/MEP14/README.md deleted file mode 100644 index 47c06434..00000000 --- a/versioned_docs/version-v0.22.1/contributing/01-Proposals/MEP14/README.md +++ /dev/null @@ -1,36 +0,0 @@ ---- -slug: /MEP-14-independence-from-external-sources -title: MEP-14 -sidebar_position: 14 ---- - -# Independence from external sources - -In certain situations some customers may need to operate and create machines without making use of external services like DNS or NTP through the internet. To make this possible, all metal-stack components reaching external services need to be configurable with custom endpoints. - -So far, the following components have been identified as requiring changes: - -- pixiecore -- metal-hammer -- metal-images - -More components are likely to be added to the list during processing. -For DNS and NTP servers it should be possible to provide default values within a partition. They can either be inherited from machines and firewalls or overwritten with their own. - -## pixiecore - -An NTP server endpoint needs to be configured on the pixiecore. This can be achieved by providing it through environment variables at startup. - -## metal-hammer - -If using a self-deployed NTP server, the metal-hammer also needs to be configured with it. For backward compatibility, default values from `pool.ntp.org` and `time.google.com` are used. - -## metal-images - -Configurations for the `metal-images` are different for machines and firewalls. - -## metalctl - -In order to pass DNS and NTP servers to partitions and machines while creating them, the flags `dnsservers` and `ntpservers` need to be added. 
- -The implementation of this MEP will make metal-stack possible to create and maintain machines without requiring an internet connection. diff --git a/versioned_docs/version-v0.22.1/contributing/01-Proposals/MEP16/README.md b/versioned_docs/version-v0.22.1/contributing/01-Proposals/MEP16/README.md deleted file mode 100644 index dbfa59d6..00000000 --- a/versioned_docs/version-v0.22.1/contributing/01-Proposals/MEP16/README.md +++ /dev/null @@ -1,332 +0,0 @@ ---- -slug: /MEP-16-metal-api-as-an-alternative-configuration-source-for-the-firewall-controller -title: MEP-16 -sidebar_position: 16 ---- - -# metal-api as an Alternative Configuration Source for the firewall-controller - -In the current situation, a firewall as provisioned by metal-stack is a fully immutable entity. Any modifications on the firewall like changing the firewall ruleset must be done _somehow_ by the user – the metal-api and hence metal-stack is not aware of its current state. - -As part of our [integration with the Gardener project](https://docs.metal-stack.io/stable/overview/kubernetes/#Gardener) we offer a solution called the [firewall-controller](https://github.com/metal-stack/firewall-controller), which is part of our [firewall OS images](https://github.com/metal-stack/metal-images/blob/6318a624861b18a559a9d37299bca5f760eef524/firewall/Dockerfile#L57-L58) and addresses shortcomings of the firewall resource's immutability, which would otherwise be completely impractible to work with. The firewall-controller crashes infinitely if it is not properly configured through the userdata when using the firewall image of metal-stack. - -The firewall-controller approach is tightly coupled to Gardener and it requires the administrator of the Gardener installation to pass a shoot and a seed kubeconfig through machine userdata when creating the firewall. 
What this userdata has to look like is not documented and is just part of another project called the [firewall-controller-manager](https://github.com/metal-stack/firewall-controller-manager) (FCM), whose task is to orchestrate rolling updates of firewall machines in a way that network traffic interruption is minimal when updating a firewall or applying a change to an immutable firewall configuration. - -In general, a firewall entity in metal-stack has similarities to the machine entity but it has a fundamental difference: A user gains ownership over a machine after provisioning. They can access it through SSH, modify it at will and this is entirely intentional. For firewalls, however, we do not want a user to access the provisioned firewall as the firewall is a privileged part of the infrastructure with access to the underlay network. The underlay must not be tampered with at any given point in time by a user as it can destroy the entire network traffic flow inside a metal-stack partition. - -For this reason, we have a gap in the metal-stack project in terms of a missing solution for people who do not rely on the Gardener integration. We are basically leaving a user with the option to implement an orchestrated recreation of every possible change on the firewall to minimize traffic interruption for the machines sitting behind the firewall or re-implement the firewall-controller according to how they want to use it for their use-case. - -Also, we do not have a clear distinction in the API between user and metal-stack operator for firewalls. If a user would allocate a firewall, it is also possible for the user to inject their own SSH keys and access the firewall and tamper with the underlay network. - -Parts of these problems are probably going to decrease with the work on [MEP-4](../MEP4/README.md) where there will be dedicated APIs for users and administrators of metal-stack including fine-grained access tokens. 
- -With this MEP we want to describe a way to improve this current situation and allow other users that do not rely on the Gardener integration – for whatever motivation they have – to adequately manage firewalls. For this, we propose an alternative configuration for the firewall-controller that is native to metal-stack and more independent of Gardener. - -## Proposal - -The central idea of this proposal is allowing the firewall-controller to use the metal-api as a configuration source. This should serve as an alternative strategy to the currently used FCM `Firewall` resource based approach in the Gardener use-case. -Updates of the firewall rules should be possible through the metal-api. - -The firewall-controller itself should now be able to decide which of the two main strategies should be used for the base configuration: a kubeconfig or the metal-api. This should be possible through a dedicated _firewall-controller-config_. - -Using this config will now allow operators to fine-tune the data sources for all of its dynamic configuration tasks independently. -For example the data source of the core firewall rules could be set either from the `Firewall` resource located in the Gardener `Seed` or the metal-apiserver node network entity, while the CWNPs should be fetched and applied from a given kubeconfig (the `Shoot` Kubeconfig in the Gardener case). -This configuration file is intended to be injected during firewall creation through the userdata along with potential source connection credentials. 
- -```yaml -# the name of the firewall, defaulted to the hostname -name: best-firewall-ever - -sources: - seed: - kubeconfig: /path/to/seed.yaml # current gardener behavior - namespace: shoot--proj--name - shoot: - kubeconfig: /path/to/shoot.yaml # current gardener behavior - namespace: firewall - metal: - url: https://metal-api - hmac: some-hmac - type: Metal-View - projectID: abc - static: - # static should mirror all information provided by the metal or seed/shoot sources - firewall: # optional - controllerURL: https://... - cwnp: - egress: [] - ingress: [] - -# all sub-controllers running on the firewall -# each can be configured independently -controllers: - # this is the base controller - firewall: - source: seed # or: metal, static - - # these are optional: when not provided, they are disabled - selfUpdate: - enabled: true - droptailer: - enabled: true - - # these are optional: when not provided, they are disabled - service: - source: shoot # or: metal, static - cwnp: - source: shoot # or: metal, static - monitor: - source: shoot # currently only shoot is supported -``` - -The existing behavior of the firewall-controller writing into `/etc/nftables/firewall-controller.v4` is not changed. The different controller configuration sources are internally treated in the same way as before. The `static` source can be used to prevent the firewall-controller from crashing and consistently providing a static ruleset. This might be interesting for metal-stack native use cases or environments where the metal-api cannot be accessed. - -There must be one central nftables-rule-file-controller that is notified and triggered by all other controllers that contribute to the nftables configuration. 
- -For example, in order to maintain the existing Gardener integration, the configuration file for the firewall-controller will look like this: - -```yaml -name: shoot--abc--cluster-firewall-def -sources: - seed: - kubeconfig: /etc/firewall-controller/seed.yaml - namespace: shoot--abc--cluster - shoot: - kubeconfig: /etc/firewall-controller/shoot.yaml - namespace: firewall - -controllers: - firewall: - source: seed - - selfUpdate: - enabled: true - droptailer: - enabled: true - - service: - source: shoot - cwnp: - source: shoot - monitor: - source: shoot -``` - -Plain metal-stack users might use a configuration like this: - -```yaml -name: best-firewall-ever - -sources: - metal: - url: https://metal-api - hmac: some-hmac - type: Metal-View - projectID: abc - -controllers: - firewall: - source: metal - selfUpdate: - enabled: true - droptailer: - enabled: true - - cwnp: - # firewall rules stored in firewall entity - # potential improvement would be to attach the rules to the node network entity - # be aware that the firewall and private networks are immutable - # eventually we introduce a firewall ruleset entity - source: metal -``` - -In highly restricted environments that cannot access metal-api the static source could be used: - -```yaml -name: most-restricted-firewall-ever - -sources: - static: - firewall: - controllerURL: https://... - cwnp: - egress: [] - ingress: [] - -controllers: - firewall: - source: static - - cwnp: - source: static -``` - -### Non-Goals - -- Resolving the missing differentiation between users and administrators by letting users pass userdata and SSH keys to the firewall creation. - - This is even more related to [MEP-4](../MEP4/README.md) than this MEP. 
- -### Advantages - -- Offers a native metal-stack solution that improves managing firewalls for users by adding dynamic reconfiguration through the metal-api - - e.g., in the mini-lab, users can now allocate a machine, then an IP address and announce this IP from the machine without having to re-create the firewall but by adding a firewall rule to the metal-api. -- Improve consistency throughout the API (firewall rules would reflect what is persisted in metal-api). -- Other providers like Cluster API can leverage this approach, too. -- It can contribute to solving the shoot migration issue (in Cluster API case the `clusterctl move` for firewall objects) - - For Gardener takes the seed out of the equation (of which the kubeconfig changes during shoot migration) - - However: Things like egress rules, rate limiting, etc. are currently not part of the firewall or network entity in the metal-api. These would need to be added to one of them. -- Potentially resolve the issue that end-users can manipulate accounting data of the firewall through the `FirewallMonitor` - - for this we would need to be able to report traffic data to metal-api - -### Caveats - -- Metal-View access is too broad for firewalls. Mitigated by [MEP-4](../MEP4/README.md). -- Polling of the firewall-controller is bad for performance. Mitigated by [MEP-4](../MEP4/README.md). - -### Firewall Controller Manager - -Currently the firewall-controller-manager expects the creators of a `FirewallDeployment` to use the defaulting webhook that is tailored to the Gardener integration in order to generate `Firewall.spec.userdata` or to override it manually. Currently `Firewall.spec.userdata` will never be set explicitly. - -Instead we'd like to propose `Firewall.spec.userdataContents` which will replace the old `userdata`-string by a typed data structure. The FCM will do the heavy lifting while the `FirewallDeployment` creator decides what should be configured. 
- -```yaml -kind: FirewallDeployment -spec: - template: - spec: - userdataContents: - - path: /etc/firewall-controller/config.yaml - content: | - --- - sources: - static: {} - controllers: - firewall: - source: static - - path: /etc/firewall-controller/seed.yaml - contentFrom: - firewallControllerKubeconfigSecret: - name: seed-kubeconfig - key: kubeconfig - - - path: /etc/firewall-controller/shoot.yaml - contentFrom: - secretRef: - name: shoot-kubeconfig - key: kubeconfig -``` - -### Gardener Extension Provider Metal Stack - -The GEPM should be migrated to the new `Firewall.spec.userdataContents` field. - -### Cluster API Provider Metal Stack - -![architectural overview](firewall-for-capms-overview.svg) - -In Cluster API there are essentially two main clusters: the management cluster and the workload cluster while the CAPMS takes in the role of the GEPM. -Typically a local bootstrap cluster is created in KinD which acts as the management cluster. It creates the workload cluster. Thereafter the ownership of the workload cluster is typically moved (using `clusterctl move`) to a different cluster which will then become the management cluster. -The new management cluster might actually be the workload cluster itself. - -In contrast to Gardener, Cluster API aims to be less opinionated and minimal. It is common practice to not install any non-required components or CRDs into the workload cluster by default. Therefore we cannot expect custom resources like `ClusterwideNetworkPolicy` or `FirewallMonitor` to be installed in the workload cluster but strongly recommend our users to do it. Therefore it's the responsibility of the operator to tell [cluster-api-provider-metal-stack](https://github.com/metal-stack/cluster-api-provider-metal-stack) the kubeconfig for the cluster where these CRDs are installed and defined in. 
- -A viable configuration for a `MetalStackCluster` that generates firewall rules based of `Service` type `LoadBalancer` and `ClusterwideNetworkPolicy` and expects them to be deployed in the workload cluster is shown below. The `FirewallMonitor` will be reported into the same cluster. - -```yaml -kind: MetalStackCluster -metadata: - name: ${CLUSTER_NAME} -spec: - firewallTemplate: - userdataContents: - - path: /etc/firewall-controller/config.yaml - contentFrom: - secretRef: - name: ${CLUSTER_NAME}-firewall-controller-config - key: controllerConfig - - - path: /etc/firewall-controller/workload.yaml - contentFrom: - # this is the kubeconfig generated by kubeadm - secretRef: - name: ${CLUSTER_NAME}-kubeconfig - key: value ---- -kind: Secret -metadata: - name: ${CLUSTER_NAME}-firewall-controller-config -stringData: - controllerConfig: | - --- - name: ${CLUSTER_NAME}-firewall - - sources: - metal: - url: ${METAL_API_URL} - hmac: ${METAL_API_HMAC} - type: ${METAL_API_HMAC_TYPE} - projectID: ${METAL_API_PROJECT_ID} - shoot: - kubeconfig: /etc/firewall-controller/workload.yaml - namespace: firewall - - controllers: - firewall: - source: metal - selfUpdate: - enabled: true - droptailer: - enabled: true - - service: - source: shoot - cwnp: - source: shoot - monitor: - source: shoot -``` - -Here the firewall-controller-config will be referenced by the `MetalStackCluster` as a `Secret`. Please note that the `Secret`s in `userdataContents` will not be fetched and will directly be passed to the `FirewallDeployment`. At first the reconciliation of it in the FCM will fail due to the missing Kubeconfig secret. After the `MetalStackCluster` has been marked as ready, CAPI will create this missing secret. Effectively the firewall and initial control plane node should be created at the same time. - -This approach allows maximum flexibility as intended by Cluster API and is still able to provide robust rolling updates of firewalls. 
- -An advanced use case of this flexibility would be a management cluster, that is in charge of multiple workload clusters. Where one workload cluster acts as a monitoring or tooling cluster, receives logs and the firewall monitor for the other workload clusters. The CWNPs could be defined here, all in a separate namespace. - -#### Cluster API Caveats - -When the cluster is pivoted and reconciles its own firewall, a malfunctioning firewall prevents the cluster from self-healing and requires manual intervention by creating a new firewall. This is an inherent problem of the cluster-api approach. It can be circumvented by using an extra cluster to manage workload clusters. - -In the current form of this approach firewalls and therefore the firewall egress and ingress rules are managed by the cluster operators that manage the cluster-api resources. -Hence it will not be possible to gain a fine-grained control over every cluster operator's choices from a central ruleset at the level of metal-stack firewalls. -In case this control surfaces as a requirement, it would need to be implemented in a firewall external to metal-stack. - -## Roadmap - -In general this proposal is not thought to be implemented in one batch. Instead an incremental approach is required. - -1. Enhance firewall-controller-manager - - - Add `FirewallDeployment.spec.template.spec.userdataContents` - -2. Enhance firewall-controller - - - Reduce coupling between controllers - - Introduce controller config - - Abstract module to write into distinct nftable rules for every controller - - Implement `sources.static`, but not `sources.metal` - - GEPM should set `FirewallDeployment.spec.template.spec.userdataContents` - -3. Allow Cluster API to use the FCM with static ruleset - - - Add `firewall.metal-stack.io/paused` annotation (managed by CAPMS during `clusterctl move`, theoretically useful for Gardener shoot migration as well to avoid shallow deletion). 
- - Reconcile multiple `FirewallDeployment` resources across multiple namespaces. For Gardener the old behavior of reconciling only one namespace should persist. - - Allow setting the `firewall.metal-stack.io/no-controller-connection` annotation through the `FirewallDeployment` (either through the template or inheritance). - - Add `MetalStackCluster.spec.firewallTemplate`. - - Make `MetalStackCluster.spec.nodeNetworkID` optional if `spec.firewallTemplate` given. - -4. Add `sources.metal` as configuration option. - - - Allow updates of firewall rules in the metal-apiserver. - - Depends on [MEP-4](../MEP4/README.md) metal-apiserver progress - -5. Potentially migrate the GEPM to use `sources.metal` diff --git a/versioned_docs/version-v0.22.1/contributing/01-Proposals/MEP16/firewall-for-capms-overview.drawio b/versioned_docs/version-v0.22.1/contributing/01-Proposals/MEP16/firewall-for-capms-overview.drawio deleted file mode 100644 index faea3e3d..00000000 --- a/versioned_docs/version-v0.22.1/contributing/01-Proposals/MEP16/firewall-for-capms-overview.drawio +++ /dev/null @@ -1,4 +0,0 @@ - - - -
handles traffic
Firewall
Firewall Controller
node-exporter
nftables-exporter
droptailer-client
Workload Cluster
droptailer
Configures
Bootstrap or Management Cluster
reconcile
configures
reconcile
Cluster API Provider metal-stack
Metal Stack Cluster CRD
Firewall Deployment CRD
Firewall CRD
Firewall Set CRD
rec
reconcile
reconcile
Firewall Controller Manager
Metal Stack Machine CRD
manages
Admin
Kubeconfig FirewallMonitor
FirewallMonitor CRD
main metal-api
Firewall entity
kubeconfig CWNP
Clusterwide Network Policy CRD
base config
controllerConfig
user-defined
network rules
reports firewall
state
send firewall log lines
controllerConfig
controllerConfig
\ No newline at end of file diff --git a/versioned_docs/version-v0.22.1/contributing/01-Proposals/MEP16/firewall-for-capms-overview.svg b/versioned_docs/version-v0.22.1/contributing/01-Proposals/MEP16/firewall-for-capms-overview.svg deleted file mode 100644 index 853f8175..00000000 --- a/versioned_docs/version-v0.22.1/contributing/01-Proposals/MEP16/firewall-for-capms-overview.svg +++ /dev/null @@ -1 +0,0 @@ -
handles traffic
handles traffic
Firewall
Firewall
Firewall Controller
Firewall Controller
node-exporter
node-exporter
nftables-exporter
nftables-exporter
droptailer-client
droptailer-client
Workload Cluster
Workload Cluster
droptailer
droptailer
Configures
Configures
Bootstrap or Management Cluster
Bootstrap or Management Cluster
reconcile
reconcile
configures
configures
reconcile
reconcile
Cluster API Provider metal-stack
Cluster API Provider...
Metal Stack Cluster CRD
Metal Stack Cluster...
Firewall Deployment CRD
Firewall Deployment...
Firewall CRD
Firewall CRD
Firewall Set CRD
Firewall Set CRD
rec
rec
reconcile
reconcile
reconcile
reconcile
Firewall Controller Manager
Firewall Controller...
Metal Stack Machine CRD
Metal Stack Machine...
manages
manages
Admin
Admin
Kubeconfig FirewallMonitor
Kubeconfig FirewallMonitor
FirewallMonitor CRD
FirewallMonitor CRD
main metal-api
main metal-api
Firewall entity
Firewall entity
kubeconfig CWNP
kubeconfig CWNP
Clusterwide Network PolicyCRD
Clusterwide Network...
base config
base config
controllerConfig
controllerConfig
user-defined
network rules
user-defined...
reports firewall
state
reports firewall...
send firewall log lines
send firewall log lines
controllerConfig
controllerConfig
controllerConfig
controllerConfig
Text is not SVG - cannot display
\ No newline at end of file diff --git a/versioned_docs/version-v0.22.1/contributing/01-Proposals/MEP17/README.md b/versioned_docs/version-v0.22.1/contributing/01-Proposals/MEP17/README.md deleted file mode 100644 index 35f48970..00000000 --- a/versioned_docs/version-v0.22.1/contributing/01-Proposals/MEP17/README.md +++ /dev/null @@ -1,61 +0,0 @@ ---- -slug: /MEP-17-global-network-view -title: MEP-17 -sidebar_position: 17 ---- - -# Global Network View - -> [!IMPORTANT] -> This MEP assumes the implementation of the metal-apiserver as described by [MEP-4](../MEP4/README.md) which is currently work in progress. - -Having a complete view of the network topology is useful when working with deployments or troubleshooting connectivity issues. -Currently, the API doesn't know of any other switches than the leaf switches. -Information about all other switches and their connections must be gathered from Ansible inventories or by accessing the switches via SSH. -Documentation of each partition's network must be kept in-sync with all changes made to the deployment or cabling. -We would like to expand the API's knowledge of the network to the entire underlay including inter-switch connections as well as BGP statistics and health status. - -## Switch Types - -Registering a switch at the API is done by the metal-core. -Apart from that, it also reconciles port and FRR configuration to adapt to the machine provisioning cycle. -This reconfiguration is only necessary on the leaf switches. -To allow deploying the metal-core on other switches than leaves we need a way of telling it what type of switch it is running on so it can act accordingly. -On any non-leaf switches it will only register the switch and report statistic but not change any configuration. -Supported switch types are - -- `leaf` -- `spine` -- `exit` -- `mgmtleaf` -- `mgmtspine` - -## Network Topology - -All switches should periodically report their LLDP neighbors and port configuration. 
-This information can be used to quickly identify common network issues, like MTU mismatch or the like. -Ideally, there would be some graphical representation of the network topology containing only the most important information for a quick overview. -It should contain all switches and machines as nodes and all connections as edges of a graph. -Ports, VRFs, and maybe also IPs should be associated with a connection. - -Apart from the topology graph, there should be a way to display more detailed information about both ports of a connection, like - -- MTU -- speed -- IP -- UP/DOWN status -- VRF -- VLAN -- whether it participates in a BGP session - -## BGP Announcements - -The metal-core should collect all routes it knows about and send them to the API along with a timestamp. -Reported routes should be stored to a redis database along with the switch that reported them and the timestamp of the last time they were reported. -An expiration threshold should be defined and all expired routes should be cleaned up periodically. -Whenever new routes are reported they get merged into the existing ones by the strategy: - -- when new, just add -- when existing, update `last_announced` timestamp - -By querying the BGP announcements we can find out whether an allocated IP is still in use. diff --git a/versioned_docs/version-v0.22.1/contributing/01-Proposals/MEP18/README.md b/versioned_docs/version-v0.22.1/contributing/01-Proposals/MEP18/README.md deleted file mode 100644 index 9c02c0b7..00000000 --- a/versioned_docs/version-v0.22.1/contributing/01-Proposals/MEP18/README.md +++ /dev/null @@ -1,147 +0,0 @@ ---- -slug: /MEP-18-autonomous-control-plane -title: MEP-18 -sidebar_position: 18 ---- - -# Autonomous Control Plane - -As described in the [deployment chapter](../../../docs/04-For%20Operators/03-deployment-guide.mdx), we strongly recommend Kubernetes as the target platform for running the metal-stack control plane. 
- -Kubernetes clusters for this purpose are readily available from hyperscalers, metalstack.cloud, or other cloud providers. Simply using a managed Kubernetes cluster greatly simplifies a metal-stack installation. However, sometimes it might be desirable to host the metal-stack control plane autonomously, without the help of another cloud provider. Reasons for this might include corporate policies that prohibit the use of external data center products, or network constraints. - -The Kubernetes cluster hosting the metal-stack control plane must provide at least the following features: - -- Load balancing (for exposing the APIs) -- Persistent storage (for the databases and key-value stores) -- Access to object storage for automated backups of the stateful sets -- Access to a DNS provider supported by one of the used DNS extensions -- Externally accessible DNS records for obtaining officially signed certificates through DNS challenges - -This metal-stack control plane cluster must also be highly available to prevent a complete loss of control over the managed resources in the data center. -Regular Kubernetes updates to apply security fixes and feature updates must be possible in an automated manner. The Day-2 operational overhead of running this cluster in your own datacenter must be reasonable. - -In this chapter, we propose a solution for setting up a metal-stack environment with an autonomous control plane that is independent of another cloud provider. - -## Use Your Own Dogfood - -The most obvious solution is to just deploy a Kubernetes cluster manually in your own data center by utilizing existing tooling for the deployment: - -- k3s -- kubeadm -- vmware and rancher -- talos -- kubespray -- ... (not a complete list) - -However, all these solutions add another layer of complexity that needs to be maintained and operated by people who also need to learn and understand metal-stack. 
In general, metal-stack in combination with [Gardener](https://gardener.cloud) contains all the necessary tools to provide KaaS, so it makes sense to reuse what is already in place without introducing new dependencies on other products and vendors. - -The only problem here is that Gardener is not yet able to create an initial cluster, which may change with the implementation of [GEP-28](https://github.com/gardener/gardener/blob/master/docs/proposals/28-autonomous-shoot-clusters.md). In the meantime, we suggest using [k3s](https://k3s.io/), which manages the initial metal-stack partition to host the control plane, since the maintenance overhead is acceptable and it is easy to deploy. - -## The Matryoshka Principle - -Instead of directly using the K3s cluster for the production control plane, we propose using it as a minimal control plane cluster which only purpose is to host the production control plane cluster. This layer of indirection brings some reasonable advantages: - -- In the event of an interruption or loss of this minimal control plane cluster, the production control plane remains unaffected, and end users can continue to manage their clusters as normal. -- A dedicated operations team can take care of the Day-2 maintenance of this installation, which can be handy because the tools like k3s are a little different from the rest of the setup (it is likely that more manual maintenance is required than for any other cluster). This would also be true if the initial cluster problem would be solved by the Gardener itself and not using k3s. -- Since the number of shoot clusters to host is static, the resource requirements are minimal and will not change significantly over time. There are no huge resource requirements in terms of cpu, memory and storage. As such, the lack of scalability is not such a big issue. - -So, our proposal is to chain two metal-stack control planes. 
The initial control plane cluster would use k3s and on this cluster we can spin up a cluster for the production control plane with the use of Gardener. - -The following figure shows how the high-level architecture of this setup looks like. A even more simplified illustration of this setup can be looked up in the appendix[^1]. - -![Autonomous Control Plane Architecture](./autonomous-control-plane-images/metal-stack-autonomous-control-plane-full.svg) - -The k3s nodes can either be bare metal machines or virtual machines. When using VMs a single k3s node might be a viable solution, too. These nodes are supposed to be setup manually / partly automated with an operating system like Debian. - -To name the cluster that hosts the initial metal-stack control plane and Gardener we use the term _initial cluster_. The initial cluster creates worker nodes to host the _target cluster_. - -## Initial Cluster - -The initial cluster is kept very small. The physical bare metal machines can be any machines and switches which are supported by metal-stack, but can be smaller in terms of cpu, memory and network speed because these machines must only be capable of running the target cluster for the metal-stack control plane. A typical single socket server with 8-16 cores and 64GB of RAM and two NVMe drives of 1TB would be a good starting point. - -In a typical k3s setup, a stateful set would lose the data once the k3s cluster was terminated and started again. But there is a possibility to define parts of the local storage of the server to be provided to the k3s cluster for the PVCs. With that, k3s could be terminated and started again, for example to update and reboot the host os, or update k3s itself and the data will persist. 
- -Example k3s configuration for persistent storage on the hosts os: - -```yaml -k3s: Cluster -apiVersion: k3s.x-k8s.io/v1alpha4 -name: needle-control-plane -nodes: - - role: control-plane - # add a mount from /path/to/my/files on the host to /files on the node - extraMounts: - - hostPath: /path/to/my/files - containerPath: /files -``` - -Into this cluster metal-stack and Gardener will be deployed. This deployment can be done by a Gitlab runner which is running on this machine. -The mini-lab will be used as a base for this deployment. The current development of [gardener-in-minilab](https://github.com/metal-stack/mini-lab/pull/202) must be extended to host all required extensions to make this a working metal-stack control plane which can manage the machines in the attached bare metal setup. - -In addition to the metal-stack and Gardener deployment, some additional required services are deployed (non-complete list): - -- PowerDNS to serve as a DNS Server for all DNS entries used in the initial and the target cluster, like `api.initial.metal-stack.local`, `gardener-api.initial.metal-stack.local` and the DNS entries for the api servers of the created kubernetes clusters. -- NTP -- Monitoring for the initial cluster and partition -- Optional: OIDC Server for authenticating against the metal-api -- Optional: Container Registry to host all metal-stack and gardener containers -- Optional: Let's Encrypt [boulder](https://github.com/letsencrypt/boulder) as a certificate authority -- ... - -Physical view, minimal setup for a initial cluster with a single physical node: - -![Small Initial Cluster](autonomous-control-plane-images/small-initial-cluster.svg) - -Physical View, bigger ha setup which is spread across two data centers: - -![HA Initial Cluster](autonomous-control-plane-images/ha-initial-cluster.svg) - -### Control Plane High Availability - -Running the initial control plane on a single physical server is not as available as it should be in such a use case. 
It should be possible to survive a loss of this server, because the server could be lost by many events, such as hardware failure, disk corruption or even failure of the datacenter location where this server is deployed. - -Setting up a second server with the same software components is an option, but the problem of data redundancy must be solved, because neither the gardener control plane, nor the metal-stack control plane can be instantiated twice. - -Given that we provide part of the local storage of the server as backing storage for the stateful sets in the k3s cluster, the data stored on the server itself must be replicated to another server and backed up on a regular basis. - -The replication of ETCD can be achieved through [clustered configuration](https://docs.k3s.io/datastore/ha-embedded) of k3s. Components of metal-stack and Gardener can run standalone and already utilize backup-restore mechanism that must be configured accordingly. For two or more bare metal machine used for the initial cluster, a loadbalancing mechanism for the ingress is required. kube-vip could be a possible solution. - -For monitoring a backend like a Victoria Metrics Cluster would allow spearding the monitoring data across the initial cluster nodes. These metrics should also be backed up in object storage. - -### Partition - -The partition which is managed by the initial cluster can be a simple and small hardware setup but yet capable enough to host the target cluster. It would even be a good practice to create separate target clusters on the initial cluster, e.g. one for the metal-stack control plane and one for the Gardener (maybe one more for monitoring). - -It can follow the metal-stack minimal setup which provides about 8-16 small servers connected to a 1G/s or 10G/s network dataplane. Central storage is optional as the persistence of the services running in these clusters is always backed up to a central object storage. 
Operations would be much easier if a central storage is provided. - -## Target Cluster - -The target cluster is the metal-stack environment which serves for end-user production use, the control plane is running in a shoot hosted in the initial cluster. The seed(s) and shoot(s) for end-users are created on the machines provided by the target cluster. -These machines can be of a different type in terms of size, but more importantly, these machines are connected to another network dataplane. Also the management infrastructure is separated from the initial cluster management network. - -## Failure Scenarios - -Everything could fail, everything will fail at some point. But this must kept in mind and nothing bad should happen if only one component at a time fails. -If more than one fails, the restoration to a working state must be easily possible and well documented. - -To ensure all possible breakages are documented, we suggest writing a list which summarizes all failure scenarios that might occur including the remediation. - -Here is an example of how a scenario documentation could look like: - -**Scenario**: Initial cluster is gone, all machines have died -**Impact**: Management of the initial cluster infrastructure not possible anymore, the target cluster continues to run but cannot be managed because the API servers are gone. end-users are not affected by this incident. -**Remediation**: The initial cluster nodes must be provisioned from scratch and re-deployed through the CI mechanism. The backups of the stateful sets are automatically restored during this process. 
- -## Implementation - -As part of this proposal, we provide the following tools and integrations in order to setup an autonomous control plane: - -- Deployment roles for the services like PowerDNS and NTP for the initial cluster -- Stretch goal: Deployment role to setup k3s in clustered configuration for the initial cluster and update it -- Extend the Gardener on mini-lab integration to allow shoot creation in the mini-lab -- Steady integration of the setup (maybe something like [k3d](https://github.com/k3d-io/k3d) in the mini-lab) - -## Appendix - -[^1]: ![metal-stack-chain](autonomous-control-plane-images/metal-stack-chain.svg) diff --git a/versioned_docs/version-v0.22.1/contributing/01-Proposals/MEP18/autonomous-control-plane-images/ha-initial-cluster.drawio b/versioned_docs/version-v0.22.1/contributing/01-Proposals/MEP18/autonomous-control-plane-images/ha-initial-cluster.drawio deleted file mode 100644 index eafcb514..00000000 --- a/versioned_docs/version-v0.22.1/contributing/01-Proposals/MEP18/autonomous-control-plane-images/ha-initial-cluster.drawio +++ /dev/null @@ -1,535 +0,0 @@ - - - - - - - - - - - - - - - - - - - 1 - - - - - 2 - - - - - 3 - - - - - 4 - - - - - 5 - - - - - 6 - - - - - 7 - - - - - 8 - - - - - 9 - - - - - 10 - - - - - 11 - - - - - 12 - - - - - - - - - - - - - -
-
-
- internet-router-management -
-
-
-
- - internet-router-management - -
-
-
- - - - - - - - - -
-
-
- management-switch-and-server -
-
-
-
- - management-switch-and-server - -
-
-
- - - - - - - - - -
-
-
- spine01 -
-
-
-
- - spine01 - -
-
-
- - - - - - - - - -
-
-
- leaf01 -
-
-
-
- - leaf01 - -
-
-
- - - - - - - - - -
-
-
- leaf02 -
-
-
-
- - leaf02 - -
-
-
- - - - - - - - - - - - - -
-
-
- - mirocloud (initial cluster partition nodes) - -
-
-
-
- - mirocloud (initial cluster... - -
-
-
- - - - - - - - - - - - - -
-
-
- - Initial cluster node 01 - -
-
-
-
- - Initial cluster node 01 - -
-
-
- - - - - - - - - - - - - 1 - - - - - 2 - - - - - 3 - - - - - 4 - - - - - 5 - - - - - 6 - - - - - 7 - - - - - 8 - - - - - 9 - - - - - 10 - - - - - 11 - - - - - 12 - - - - - - - - - - - - - -
-
-
- internet-router-management -
-
-
-
- - internet-router-management - -
-
-
- - - - - - - - - -
-
-
- management-switch-and-server -
-
-
-
- - management-switch-and-server - -
-
-
- - - - - - - - - -
-
-
- spine02 -
-
-
-
- - spine02 - -
-
-
- - - - - - - - - -
-
-
- leaf03 -
-
-
-
- - leaf03 - -
-
-
- - - - - - - - - -
-
-
- leaf04 -
-
-
-
- - leaf04 - -
-
-
- - - - - - - - - - - - - -
-
-
- - Initial cluster node 02 - -
-
-
-
- - Initial cluster node 02 - -
-
-
- - - - - - - - - - - - - -
-
-
- - Initial cluster node 03 - -
-
-
-
- - Initial cluster node 03 - -
-
-
- - - - - - - - - - - - - -
-
-
- - mirocloud (initial cluster partition nodes) - -
-
-
-
- - mirocloud (initial cluster... - -
-
-
-
- - - - - Text is not SVG - cannot display - - - -
diff --git a/versioned_docs/version-v0.22.1/contributing/01-Proposals/MEP18/autonomous-control-plane-images/ha-initial-cluster.svg b/versioned_docs/version-v0.22.1/contributing/01-Proposals/MEP18/autonomous-control-plane-images/ha-initial-cluster.svg deleted file mode 100644 index 99261ada..00000000 --- a/versioned_docs/version-v0.22.1/contributing/01-Proposals/MEP18/autonomous-control-plane-images/ha-initial-cluster.svg +++ /dev/null @@ -1 +0,0 @@ -123456789101112
internet-router-management
internet-router-management
management-switch-and-server
management-switch-and-server
spine01
spine01
leaf01
leaf01
leaf02
leaf02
mirocloud (initial cluster partition nodes)
mirocloud (initial cluster...
Initial cluster node 01
Initial cluster node 01
123456789101112
internet-router-management
internet-router-management
management-switch-and-server
management-switch-and-server
spine02
spine02
leaf03
leaf03
leaf04
leaf04
Initial cluster node 02
Initial cluster node 02
Initial cluster node 03
Initial cluster node 03
mirocloud (initial cluster partition nodes)
mirocloud (initial cluster...
Text is not SVG - cannot display
\ No newline at end of file diff --git a/versioned_docs/version-v0.22.1/contributing/01-Proposals/MEP18/autonomous-control-plane-images/metal-stack-autonomous-control-plane-full.drawio b/versioned_docs/version-v0.22.1/contributing/01-Proposals/MEP18/autonomous-control-plane-images/metal-stack-autonomous-control-plane-full.drawio deleted file mode 100644 index aae8a12d..00000000 --- a/versioned_docs/version-v0.22.1/contributing/01-Proposals/MEP18/autonomous-control-plane-images/metal-stack-autonomous-control-plane-full.drawio +++ /dev/null @@ -1,1133 +0,0 @@ - - - - - - - - - - - - - - - - - - - -
-
-
- Initial Cluster -
-
-
-
- - Initial Cluster - -
-
-
- - - - - - - - - - - - - - -
-
-
- metal-roles -
-
-
-
- - metal-roles - -
-
-
- - - - - - - -
-
-
- CI -
-
-
-
- - CI - -
-
-
- - - - - - - -
-
-
- K3s Standalone - - - (on Debian) - - -
-
-
-
- - K3s Standalone (on Debian) - -
-
-
- - - - - - - - - - - - - - - - - -
-
-
- Initial Partition -
-
-
-
- - Initial Partition - -
-
-
- - - - - - - - - - - - - -
-
-
- Target Cluster for metal-stack -
-
-
-
- - Target Cluster for metal-stack - -
-
-
- - - - - - - - - - - - - -
-
-
- Metal Control Plane -
-
-
-
- - Metal Control Plane - -
-
-
- - - - - - - - - - - -
-
-
- provisions -
-
-
-
- - provisions - -
-
-
- - - - - - - - - - - - - -
-
-
- Target Cluster for Gardener -
-
-
-
- - Target Cluster for Gardener - -
-
-
- - - - - - - - - - -
-
-
- Gardener Control Plane -
-
-
-
- - Gardener Control Plane - -
-
-
- - - - - - - - - - - - - - - - - -
-
-
- Monitoring -
-
-
-
- - Monitoring - -
-
-
- - - - - - - - - - - - - - - - -
-
-
- Target Partition -
-
-
-
- - Target Partition - -
-
-
- - - - - - - - - - -
-
-
- Gardener Seeds and End-User Shoots -
-
-
-
- - Gardener Seeds and End-User Shoots - -
-
-
- - - - - - - - - - - -
-
-
- provisions -
-
-
-
- - provisions - -
-
-
- - - - - - - - -
-
-
- metal-roles -
-
-
-
- - metal-roles - -
-
-
- - - - - - - -
-
-
- CI -
-
-
-
- - CI - -
-
-
- - - - - - - - -
-
-
- metal-roles -
-
-
-
- - metal-roles - -
-
-
- - - - - - - - - - -
-
-
- ETCD can be clustered or standalone, backed up by sidecar -
-
-
-
- - ETCD can be clustere... - -
-
-
- - - - - - - - - - -
-
-
- This data will get lost in case local PV gets deleted -
-
-
-
- - This data will get l... - -
-
-
- - - - - - - - - - -
-
-
- We can work with local PVs here, too. -
- backup-restore-sidecar for metal-stack databases, for big ones Postgres clustered. -
-
-
-
- - We can work with local PVs he... - -
-
-
- - - - - - - -
-
-
- ETCD will be deployed in HA configuration on local PVs. -
-
- csi-driver-lvm needs to implement auto deletion of orphaned PVs. -
-
- Seed metrics get lost, but they report to the monitoring in the Metal Control Plane Shoot. -
-
-
-
- - ETCD will be deployed in HA c... - -
-
-
- - - - - - - - - - -
-
-
- More sophisticated storage solutions can be in place. -
-
- (Lightbits, NetApp, ...) -
-
-
-
- - More sophisticated storage so... - -
-
-
- - - - - - - - - - -
-
-
- TODO: Evaluate how to persist these metrics. -
-
-
-
- - TODO: Evaluate how to persist... - -
-
-
- - - - - - - - - - -
-
-
- - 1 VM or -
-
-
- - - 3 Bare Metal Machines - - -
-
-
-
-
- - 1 VM or... - -
-
-
- - - - - - - - - - - - - - -
-
-
- metal-stack -
-
-
-
- - metal-stack - -
-
-
- - - - - - - -
-
-
- metal-api -
-
-
-
- - metal-api - -
-
-
- - - - - - - -
-
-
- metal-db -
-
-
-
- - metal-db - -
-
-
- - - - - - - -
-
-
- ipam-db -
-
-
-
- - ipam-db - -
-
-
- - - - - - - -
-
-
- masterdata-db -
-
-
-
- - masterdata-db - -
-
-
- - - - - - - -
-
-
- headscale-db -
-
-
-
- - headscale-db - -
-
-
- - - - - - - -
-
-
- auditing-db -
-
-
-
- - auditing-db - -
-
-
- - - - - - - -
-
-
- nsqd -
-
-
-
- - nsqd - -
-
-
- - - - - - - - - - - -
-
-
- Gardener -
-
-
-
- - Gardener - -
-
-
- - - - - - - - - - -
-
-
- Virtual Garden -
-
-
-
- - Virtual Garden - -
-
-
- - - - - - - -
-
-
- Gardener Control Plane -
-
-
-
- - Gardener Control Plane - -
-
-
- - - - - - - -
-
-
- gardenlet -
-
-
-
- - gardenlet - -
-
-
- - - - - - - -
-
-
- Garden etcd -
-
-
-
- - Garden etcd - -
-
-
- - - - - - - -
-
-
- Prometheus -
-
-
-
- - Prometheus - -
-
-
- - - - - - - - - - - -
-
-
- Monitoring -
-
-
-
- - Monitoring - -
-
-
- - - - - - - - - - -
-
-
- - Gitlab - -
- - Runner - -
-
-
-
-
- - Gitlab... - -
-
-
- - - - - - - - - - -
-
-
- Services -
-
-
-
- - Services - -
-
-
- - - - - - - -
-
-
- PowerDNS -
-
-
-
- - PowerDNS - -
-
-
- - - - - - - -
-
-
- boulder -
-
-
-
- - boulder - -
-
-
- - - - - - - -
-
-
- NTP -
-
-
-
- - NTP - -
-
-
- - - - - - - -
-
-
- OIDC -
-
-
-
- - OIDC - -
-
-
- - - - - - - -
-
-
- ... -
-
-
-
- - ... - -
-
-
-
- - - - - Text is not SVG - cannot display - - - -
diff --git a/versioned_docs/version-v0.22.1/contributing/01-Proposals/MEP18/autonomous-control-plane-images/metal-stack-autonomous-control-plane-full.svg b/versioned_docs/version-v0.22.1/contributing/01-Proposals/MEP18/autonomous-control-plane-images/metal-stack-autonomous-control-plane-full.svg deleted file mode 100644 index e58e783b..00000000 --- a/versioned_docs/version-v0.22.1/contributing/01-Proposals/MEP18/autonomous-control-plane-images/metal-stack-autonomous-control-plane-full.svg +++ /dev/null @@ -1 +0,0 @@ -
Initial Cluster
Initial Cluster
metal-roles
metal-roles
CI
CI
K3s Standalone(on Debian)
K3s Standalone (on Debian)
Initial Partition
Initial Partition
Target Cluster for metal-stack
Target Cluster for metal-stack
Metal Control Plane
Metal Control Plane
provisions
provisions
Target Cluster for Gardener
Target Cluster for Gardener
Gardener Control Plane
Gardener Control Plane
Monitoring
Monitoring
Target Partition
Target Partition
Gardener Seeds and End-User Shoots
Gardener Seeds and End-User Shoots
provisions
provisions
metal-roles
metal-roles
CI
CI
metal-roles
metal-roles
ETCD can be clustered or standalone, backed up by sidecar
ETCD can be clustere...
This data will get lost in case local PV gets deleted
This data will get l...
We can work with local PVs here, too.
backup-restore-sidecar for metal-stack databases, for big ones Postgres clustered.
We can work with local PVs he...
ETCD will be deployed in HA configuration on local PVs.

csi-driver-lvm needs to implement auto deletion of orphaned PVs.

Seed metrics get lost, but they report to the monitoring in the Metal Control Plane Shoot.
ETCD will be deployed in HA c...
More sophisticated storage solutions can be in place.

(Lightbits, NetApp, ...)
More sophisticated storage so...
TODO: Evaluate how to persist these metrics.
TODO: Evaluate how to persist...
1 VM or
3 Bare Metal Machines
1 VM or...
metal-stack
metal-stack
metal-api
metal-api
metal-db
metal-db
ipam-db
ipam-db
masterdata-db
masterdata-db
headscale-db
headscale-db
auditing-db
auditing-db
nsqd
nsqd
Gardener
Gardener
Virtual Garden
Virtual Garden
Gardener Control Plane
Gardener Control Plane
gardenlet
gardenlet
Garden etcd
Garden etcd
Prometheus
Prometheus
Monitoring
Monitoring
Gitlab
Runner
Gitlab...
Services
Services
PowerDNS
PowerDNS
boulder
boulder
NTP
NTP
OIDC
OIDC
...
...
Text is not SVG - cannot display
\ No newline at end of file diff --git a/versioned_docs/version-v0.22.1/contributing/01-Proposals/MEP18/autonomous-control-plane-images/metal-stack-chain.drawio b/versioned_docs/version-v0.22.1/contributing/01-Proposals/MEP18/autonomous-control-plane-images/metal-stack-chain.drawio deleted file mode 100644 index cd5cf007..00000000 --- a/versioned_docs/version-v0.22.1/contributing/01-Proposals/MEP18/autonomous-control-plane-images/metal-stack-chain.drawio +++ /dev/null @@ -1,404 +0,0 @@ - - - - - - - - - - -
-
-
- Partition 1 -
-
-
-
- - Partition 1 - -
-
- - - - -
-
-
- seeds -
-
-
-
- - seeds - -
-
- - - - -
-
-
- shoots -
-
-
-
- - shoots - -
-
- - - - - - -
-
-
- Partition 2 -
-
-
-
- - Partition 2 - -
-
- - - - -
-
-
- seeds -
-
-
-
- - seeds - -
-
- - - - -
-
-
- shoots -
-
-
-
- - shoots - -
-
- - - - - - -
-
-
- Partition 3 -
-
-
-
- - Partition 3 - -
-
- - - - -
-
-
- seeds -
-
-
-
- - seeds - -
-
- - - - -
-
-
- shoots -
-
-
-
- - shoots - -
-
- - - - - - -
-
-
- Production Control Plane -
-
-
-
- - Production Control Plane - -
-
- - - - -
-
-
- metal-stack -
- kubernetes cluster -
-
-
-
- - metal-stack... - -
-
- - - - -
-
-
- gardener -
- kubernetes cluster -
-
-
-
- - gardener... - -
-
- - - - -
-
-
- - Manages - -
-
-
-
- - Manages - -
-
- - - - - - - - -
-
-
- Control Plane Partition -
-
-
-
- - Control Plane Partition - -
-
- - - - - -
-
-
- backup of stateful sets -
-
-
-
- - backup of stateful sets - -
-
- - - - - - -
-
-
- bare metal machine -
-
-
-
- - bare metal machine - -
-
- - - - -
-
-
- metal-stack -
- and -
- gardener -
- kubernetes cluster -
- running in kind -
-
-
-
- - metal-stack... - -
-
- - - - -
-
-
- - Manages - -
-
-
-
- - Manages - -
-
- - - - - -
-
-
- S3 -
-
-
-
- - S3 - -
-
- - - - -
-
-
- Needle -
-
-
-
- - Needle - -
-
- - - -
-
-
- - Nail - -
-
-
-
- - Nail - -
-
-
- - - - - Text is not SVG - cannot display - - - -
\ No newline at end of file diff --git a/versioned_docs/version-v0.22.1/contributing/01-Proposals/MEP18/autonomous-control-plane-images/metal-stack-chain.svg b/versioned_docs/version-v0.22.1/contributing/01-Proposals/MEP18/autonomous-control-plane-images/metal-stack-chain.svg deleted file mode 100644 index 8f88ba14..00000000 --- a/versioned_docs/version-v0.22.1/contributing/01-Proposals/MEP18/autonomous-control-plane-images/metal-stack-chain.svg +++ /dev/null @@ -1 +0,0 @@ -
Partition 1
Partition 1
seeds
seeds
shoots
shoots
Partition 2
Partition 2
seeds
seeds
shoots
shoots
Partition 3
Partition 3
seeds
seeds
shoots
shoots
Production Control Plane
Production Control Plane
metal-stack
kubernetes cluster
metal-stack...
gardener
kubernetes cluster
gardener...
Manages
Manages
Control Plane Partition
Control Plane Partition
backup of stateful sets
backup of stateful sets
bare metal machine
bare metal machine
metal-stack
and
gardener
kubernetes cluster
running in kind
metal-stack...
Manages
Manages
S3
S3
Needle
Needle 
Nail
Nail
Text is not SVG - cannot display
\ No newline at end of file diff --git a/versioned_docs/version-v0.22.1/contributing/01-Proposals/MEP18/autonomous-control-plane-images/small-initial-cluster.drawio b/versioned_docs/version-v0.22.1/contributing/01-Proposals/MEP18/autonomous-control-plane-images/small-initial-cluster.drawio deleted file mode 100644 index a75ee340..00000000 --- a/versioned_docs/version-v0.22.1/contributing/01-Proposals/MEP18/autonomous-control-plane-images/small-initial-cluster.drawio +++ /dev/null @@ -1,234 +0,0 @@ - - - - - - - - - - - - - - - - - - - 1 - - - - - 2 - - - - - 3 - - - - - 4 - - - - - 5 - - - - - 6 - - - - - 7 - - - - - 8 - - - - - 9 - - - - - 10 - - - - - 11 - - - - - 12 - - - - - - - - - - - - - -
-
-
- internet-router-management -
-
-
-
- - internet-router-management - -
-
-
- - - - - - - - - -
-
-
- management-switch-and-server -
-
-
-
- - management-switch-and-server - -
-
-
- - - - - - - - - -
-
-
- leaf01 -
-
-
-
- - leaf01 - -
-
-
- - - - - - - - - -
-
-
- leaf02 -
-
-
-
- - leaf02 - -
-
-
- - - - - - - - - - - - - -
-
-
- Initial cluster node -
-
-
-
- - Initial cluster node - -
-
-
- - - - - - - - - - - - - -
-
-
- mirocloud (initial cluster partition nodes) -
-
-
-
- - mirocloud (initial cluster... - -
-
-
-
- - - - - Text is not SVG - cannot display - - - -
diff --git a/versioned_docs/version-v0.22.1/contributing/01-Proposals/MEP18/autonomous-control-plane-images/small-initial-cluster.svg b/versioned_docs/version-v0.22.1/contributing/01-Proposals/MEP18/autonomous-control-plane-images/small-initial-cluster.svg deleted file mode 100644 index a9d29f05..00000000 --- a/versioned_docs/version-v0.22.1/contributing/01-Proposals/MEP18/autonomous-control-plane-images/small-initial-cluster.svg +++ /dev/null @@ -1 +0,0 @@ -123456789101112
internet-router-management
internet-router-management
management-switch-and-server
management-switch-and-server
leaf01
leaf01
leaf02
leaf02
Initial cluster node
Initial cluster node
mirocloud (initial cluster partition nodes)
mirocloud (initial cluster...
Text is not SVG - cannot display
\ No newline at end of file diff --git a/versioned_docs/version-v0.22.1/contributing/01-Proposals/MEP2/README.md b/versioned_docs/version-v0.22.1/contributing/01-Proposals/MEP2/README.md deleted file mode 100644 index c7f2360a..00000000 --- a/versioned_docs/version-v0.22.1/contributing/01-Proposals/MEP2/README.md +++ /dev/null @@ -1,7 +0,0 @@ ---- -slug: /MEP-2-two-factor-authentication -title: MEP-2 -sidebar_position: 2 ---- - -# Two Factor Authentication diff --git a/versioned_docs/version-v0.22.1/contributing/01-Proposals/MEP3/README.md b/versioned_docs/version-v0.22.1/contributing/01-Proposals/MEP3/README.md deleted file mode 100644 index 5ce36721..00000000 --- a/versioned_docs/version-v0.22.1/contributing/01-Proposals/MEP3/README.md +++ /dev/null @@ -1,67 +0,0 @@ ---- -slug: /MEP-3-machine-re-installation -title: MEP-3 -sidebar_position: 3 ---- - -# Machine Re-Installation - -In the current metal-api only machine installations are possible, performing a machine upgrade is only possible by creating a new machine and delete the old one. -This has the drawback that in case a lot of data is stored on the local disks, a full restore of the original data must be performed. - -To prevent this, we will introduce a new metal-api endpoint to reinstall the machine with a new image, _without_ actually deleting the data stored on the additional hard disks. - -Storage is a difficult task to get right and reliable. A short analysis of our different storage requirements lead to 3 different scenarios. - -- Storage for the etcd pvs in the seed cluster of every partition. - This is the most important storage in our setup because these etcd pods serve as configuration backend for all customer kubernetes clusters. If they fail, the cluster is down. 
However gardener deploys a backup and restore sidecar into the etcd pod of every customer kubernetes control plane, and if this sidecar detects a corrupt or missing etcd database file(s) it starts automatic restore from the configured backup location. This will take some minutes. If for example a node dies, and gardener creates a new node instead, the csi-lvm created pv is not present on that node. Kubernetes will not schedule the missing etcd pod on this node because it has a local PV configured and is therefore tainted to run only on that node. To let kubernetes create that pod anyhow, someone has to either remove the taint, or delete the pod. If this is done, the pod starts and the restore of the etcd data can start as well. You can see this is a bit too complicated and will take the customer cluster down for a while (not measured yet but in the range of 5-10 minutes). -- Storage in customer clusters. - This was not promised in 2020. We have a intermediate solution with the provisioning of csi-lvm by default into all customer clusters. Albeit this is only local storage and will get deleted if a node dies. -- S3 Storage. - We have two possibilities to cope with storage: - - In place update of the OS with a daemonset - This will be fast and simple, but might fail because the packages being installed are broken right now, or a filesystem gets full, or any other failure you can think of during a os update. Another drawback is that metal-api does not reflect the updated os image. - - metal-api get a machine reinstall endpoint - With this approach we leverage from existing and already proven mechanisms. Reinstall must keep all data except the sata-dom. Gardener currently is not able to do an update with this approach because it can only do `rolling` updates. Therefore a additional `osupdatestrategy` has to be implemented for metal and other providers in gardener to be able to leverage the metal reinstall on the same machineID approach. 
- -If reinstall is implemented, we should focus on the same technology for all scenarios and put ceph via rook.io into the kubernetes clusters as additional StorageClass. It has to be checked whether to use the raw disk or a PV as the underlay block device where ceph stores its data. - -## API and behavior - -The API will get an new endpoint "reinstall" this endpoint takes two arguments: - -- machineID -- image - -No other aspects of the machine can be modified during the re-installation. All data stored in the existing allocation will be preserved, only the image will be modified. -Once this endpoint was called, the machine will get a `reboot` signal with the boot order set to PXE instead of HDD and the network interfaces on the leaf are set to PXE as well. Then the normal installation process starts: - -- unchanged: PXE boot with metal-hammer -- changed: metal-hammer first checks with the machineID in the metal-api (through metal-core) if there is already a allocation present -- changed: if a allocation is present and the allocation has set `reinstall: true`, wipe disk is only executed for the root disk, all other disks are untouched. -- unchanged: the specified image is downloaded and burned, `/install.sh` is executed -- unchanged: successful installation is reported back, network is set the the vrf, boot order is set to HDD. -- unchanged: distribution kernel is booted via kexec - -We can see that the `allocation` requires one additional parameter: `reinstall` and metal-hammer must check for already existing allocation at an earlier stage. - -Components which requires modifications (first guess): - -- metal-hammer: - - check for allocation present earlier - - evaluation of `reinstall` flag set - - wipe of disks depends on that flag - - Bonus: move configuration of disk layout and primary disk detection algorithm (PDDA) from metal-hammer into metal-api. - metal-api **MUST** reject reinstallation if the disk found by PDDA does not have the `/etc/metal` directory! 
-- metal-core: - - probably nothing -- metal-api: - - new endpoint `/machine/reinstall` - - add `Reinstall bool` to data model of `allocation` - - make sure to reset `Reinstall` after reinstallation to prevent endless reinstallation loop -- metalctl: - - implement `reinstall` -- metal-go: - - implement `reinstall` -- gardener (longterm): - - add the `OSUpgradeStrategy` `reinstall` diff --git a/versioned_docs/version-v0.22.1/contributing/01-Proposals/MEP4/README.md b/versioned_docs/version-v0.22.1/contributing/01-Proposals/MEP4/README.md deleted file mode 100644 index 389a02d4..00000000 --- a/versioned_docs/version-v0.22.1/contributing/01-Proposals/MEP4/README.md +++ /dev/null @@ -1,211 +0,0 @@ ---- -slug: /MEP-4-multi-tenancy-for-the-metal-api -title: MEP-4 -sidebar_position: 4 ---- - -# Multi-Tenancy for the metal-api -:::info -This document is work in progress. -::: - -In the past we decided to treat the metal-api as a "low-level API", i.e. the API does not specifically deal with projects and tenants. A user with editor access can for example assign machines to every project he desires, he can see all the machines available and can control them. We tried to keep the metal-api code base as small as possible and we added resource scoping to a "higher-level APIs". From there, a user would be able to only see his own clusters and IP addresses. - -As time passed metal-stack has become an open-source project and people are willing to adopt. Adopters who want to put their own technologies on top of the metal-stack infrastructure don't have those "higher-level APIs" that we implemented closed-source for our user base. So, external adopters most likely need to implement resource scoping on their own. 
- -Introducing multi-tenancy to the metal-api is a serious opportunity to make our product better and more successful as it opens the door for: - -- Becoming a "fully-featured" API -- Narrowing down attack surfaces and possibility of unintended resource modification produced by bugs or human errors -- Discouraging people from implementing their own scoping layers in front of the metal-stack -- Gaining performance through resource scopes -- Letting untrusted / third-parties work with the API - -## Requirements - -These are some general requirements / higher objectives that MEP-4 has to fulfill. - -- Should be able to run with mini-lab without requiring the setup of complex auth backends (dex, LDAP, keycloak, ...) - - Simple to start with, more complex options for production setups -- Fine-grained access permissions (every endpoint maps to a permission) -- Tenant scoping (disallow resource access to resources of other tenants) -- Project scoping (disallow resource access to resources of other projects) -- Access tokens in self-service for technical user access - -## Implementation - -We gathered a lot of knowledge while implementing a multi-tenancy-capable backend for metalstack.cloud. The goal is now to use the same technology and adapt that to the metal-api, this includes: - -- gRPC in combination with connectrpc -- OPA for making auth decisions -- REST HTTP only for OIDC login flows - -### API Definitions - -The API definitions should be located on a separate Github repository separate from the server implementation. The proposed repository location is: https://github.com/metal-stack/api. - -This repository contains the `proto3` specification of the exposed metal-stack api. This includes the messages, simple validations, services and the access permission to these services. The input parameters for the authorization in the backend are generated from the `proto3` annotations. - -Client implementations for the most relevant languages (go, python) are generated automatically. 
- -This api is divided into end-user and admin access at the top level. The proposed APIs are: - -- `metalstack.api.v2`: For end-user facing services -- `metalstack.admin.v2`: For operators and controllers which need access to unscoped entities - -The methods of the API can have different role scopes (and can be narrowed down further with fine-grained method permissions): - -- `tenant`: Tenant-scoped methods, e.g. project creation (tenant needs to be provided in the request payload) - - Available roles: VIEWER, EDITOR, OWNER -- `project`: Project-scoped methods, e.g. machine creation (project needs to be provided in the request payload) - - Available roles: VIEWER, EDITOR, OWNER -- `admin`: Admin-scoped methods, e.g. unscoped tenant list or switch register - - Available roles: VIEWER, EDITOR - -And has methods with different visibility scopes: - -- `self`: Methods that only the logged-in user can access, e.g. show permissions with the presented token -- `public`: Methods that do not require any specific authorization - -### API - -The API server implements the services defined in the API and validates access to a method using OPA with the JWT tokens passed in the requests. The server is implemented using the connectrpc.com framework. - -The API server implements the login flow through OIDC. After successful authentication, the API server derives user permissions from the OIDC provider and issues a new JWT token which is passed on to the user. The tokens including the permissions are stored in a redis compatible backend. - -With these tokens, users can create Access Tokens for CI/CD or other use cases. - -JWT Tokens can be revoked by admins and the user itself. - -### API Server - -Is put into a new github repo which implements the services defined in the `api` repository. It opens an `https` endpoint where the grpc (via connectrpc.com) and oidc services are exposed. 
- -### Migration of the Consumers - -To allow consumers to migrate to the `v2` API gradually, both apis, the new and the old, are deployed in parallel. In the control-plane both apis are deployed side-by-side behind the ingress. `api.example.com` is forwarded to `metal-api` and `metal.example.com` is forwarded to the new `metal-apiserver`. - -The api-server will talk to the existing metal-api during the process of migrating services away to the new grpc api. - -The migration process can be done in the following manner: - -for each resource in the metal-api: - -- create a new proto3 based definition in the `api` repo. -- implement the business logic per service in the new `metal-apiserver` without calling the metal-api. -- clients must be able to talk to `v1` and `v2` backend in parallel -- Deprecate the already migrated service in the swagger route to notify the client that this route should not be used anymore. -- identify all consumers of this resource and replace them to use the grpc instead of the rest api -- move the business logic incl. the backend calls to ipam, metal-db, masterdata-api, nsq for this resource from the metal-api to the `metal-apiserver` - -We will migrate the rethinkdb backend implementation to a generic approach during this effort. - -- Try to enhance the generic rethinkdb interface with `project` scoped methods. - -There are a lot of consumers of metal-api, which need to be migrated: - -- ansible -- firewall-controller -- firewall-controller-manager -- gardener-extension-auth -- gardener-extension-provider-metal - - Do not point the secret bindings to the shared provider secret in the seed anymore. Instead, use individual provider-secrets containing project-scoped API access tokens in the Gardener project namespaces. 
-- machine-controller-manager-provider-metal -- metal-ccm -- metal-console -- metal-bmc -- metal-core -- metal-hammer -- metal-image-cache-sync -- metal-images -- metal-metrics-exporter -- metal-networker -- metalctl -- pixie - -## User Scenarios - -This section gathers a collection of workflows from the perspective of a user that we want to provide with the implementation of this proposal. - -### Machine Creation - -A regular user wants to create a machine resource. - -Requirements: Project was created, permissions are present - -- The user can see networks that were provided by the admin. - - ``` - $ metalctl network ls - ID NAME PROJECT PARTITION NAT SHARED PREFIXES IPS - internet Internet Network true false 212.34.83.0/27  ● - tenant-super-network-fra-equ01 Project Super Network fra-equ01 false false 10.128.0.0/14  ● - underlay-fra-equ01 Underlay Network fra-equ01 false false 10.0.0.0/16  ● - ``` - -- The user has to set the project scope first or provide `--project` flags for all commands. - ``` - $ metalctl project set 793bb6cd-8b46-479d-9209-0fedca428fe1 - You are now acting on project 793bb6cd-8b46-479d-9209-0fedca428fe1. - ``` -- The user can create the child network required for machine allocation. - ``` - $ metalctl network allocate --partition fra-equ01 --name test - ``` -- Now, the user sees his own child network. - ``` - $ metalctl network ls - ID NAME PROJECT PARTITION NAT SHARED PREFIXES IPS - internet Internet Network true false 212.34.83.0/27  ● - tenant-super-network-fra-equ01 Project Super Network fra-equ01 false false 10.128.0.0/14  ● - └─╴08b9114b-ec47-4697-b402-a11421788dc6 test 793bb6cd-8b46-479d-9209-0fedca428fe1 fra-equ01 false false 10.128.64.0/22  ● - underlay-fra-equ01 Underlay Network fra-equ01 false false 10.0.0.0/16  ● - ``` -- The user does not see any machines yet. - ``` - $ metalctl machine ls - ``` -- The user can create a machine. 
- ``` - $ metalctl machine create --networks internet,08b9114b-ec47-4697-b402-a11421788dc6 --name test --hostname test --image ubuntu-20.04 --partition fra-equ01 --size c1-xlarge-x86` - ``` -- The machine will now be provisioned. - ``` - $ metalctl machine ls - ID LAST EVENT WHEN AGE HOSTNAME PROJECT SIZE IMAGE PARTITION - 00000000-0000-0000-0000-ac1f6b7befb2 Phoned Home 20s 50d 4h test 793bb6cd-8b46-479d-9209-0fedca428fe1 c1-xlarge-x86 Ubuntu 20.04 20210415 fra-equ01 - ``` - -:::warning -A user **cannot** list all allocated machines for all projects. The user **must** always switch project context first and can only view the machines inside this project. Only admins can see all machines at once. -::: -### Scopes for Resources - -The admins / operators of the metal-stack should be able to provide _global_ resources that users are able to use along with their own resources. In particular, users can view and use _global_ resources, but they are not allowed to create, modify or delete them. - -:::info -When a project ID field is empty on a resource, the resource is considered _global_. -::: - -Where possible, users should be capable of creating their own resource entities. - -| Resource | User | Global | -| :----------------- | :--- | :----- | -| File System Layout | yes | yes | -| Firewall | yes | | -| Firmware | | yes | -| OS Image | | yes | -| Machine | yes | | -| Network (Base) | | yes | -| Network (Children) | yes | | -| IP | yes | | -| Partition | | yes | -| Project | yes | | -| Project Token | yes | | -| Size | | yes | -| Switch | | | -| Tenant | | yes | - -:::info -Example: A user can make use of the file system layouts provided by the admins, but can also create own layouts. Same applies for images. As soon as a user creates own resources, the user takes over the responsibility for the machine provisioning to succeed. 
-::: diff --git a/versioned_docs/version-v0.22.1/contributing/01-Proposals/MEP5/README.md b/versioned_docs/version-v0.22.1/contributing/01-Proposals/MEP5/README.md deleted file mode 100644 index 3b7fc45c..00000000 --- a/versioned_docs/version-v0.22.1/contributing/01-Proposals/MEP5/README.md +++ /dev/null @@ -1,54 +0,0 @@ ---- -slug: /MEP-5-shared-networks -title: MEP-5 -sidebar_position: 5 ---- - -# Shared Networks - -## Why are shared networks needed - -For special purpose machines that serve shared services with performance critical workloads to all machines of a partition (like persistent storage) it would be good to have kind of a "shared network" that is easily accessible. -They do not necessarily need another firewall. This would avoid having two firewalls in the datapath between a machine in a private network and the machines of a shared service. - -## Constraints that need to hold - -- a shared network is usable from all machines that have a firewall in front, that uses it -- a shared network is only usable within a single partition (currently we are constrained in bandwidth and have no routing of 10.0.0.0/8 addresses btw. 
partitions and failure domain should be the partition but this constraint might get lifted in the future) -- networks may be marked as shared after network allocation (but there should be no way back from shared to unshared) -- neither machines nor firewalls may have multiple private, unshared networks configured -- machines must have a single primary network configured - - this might be a shared network - - OR a plain, unshared private network -- firewalls may participate in multiple shared networks -- machines can be allocated with a primary network using auto IP allocation or with `noauto` and a specific IP - -## Should shared networks be private - -**Alternative 1:** If we implemented shared networks by extending functions around plain, private networks we would not have to manage another CIDR (mini point) and it would be possible to create a k8s cluster with a private network, mark the network as `shared` and produce shared services from this k8s cluster. - -**Alternative 2:** If shared networks are implemented as first class networks we could customize the VRF and also accomplish an other goal of our roadmap: being able to create machines directly in an external network. - -Together with @majst01 and @Gerrit91 we decided to continue to implement **Alternative 1**. - -## Firewalls accessing a shared network - -Firewalls that access shared networks need to: - -- hide the private network behind an ip address of the shared network if the shared network was configured with `nat=true`. -- import the prefixes of the shared VRF to the private VRF and import the prefixes of the private VRF to the shared VRF so that the communication between the two is working in both directions. As long as no `nat=true` was set on the shared VRF, the original machine ips are visible in both communication directions. 
- -## Setup with shared networks and single consumer - -![Simple Setup](./shared.png) - -## Setup with single shared network and multiple consumers - -![Advanced Setup](./shared_advanced.png) - -## Getting internet access - -Machines contained in a shared network can access the internet with different scenarios: - -- if they have an own firewall: this is internet accessibility, as common (check whether all traffic gets routed through it!) -- if they don't have an own firewall, an external HTTP proxy is needed that has an endpoint exposed as Service Type NodePort diff --git a/versioned_docs/version-v0.22.1/contributing/01-Proposals/MEP5/shared.drawio b/versioned_docs/version-v0.22.1/contributing/01-Proposals/MEP5/shared.drawio deleted file mode 100644 index aa7af045..00000000 --- a/versioned_docs/version-v0.22.1/contributing/01-Proposals/MEP5/shared.drawio +++ /dev/null @@ -1,121 +0,0 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - \ No newline at end of file diff --git a/versioned_docs/version-v0.22.1/contributing/01-Proposals/MEP5/shared.png b/versioned_docs/version-v0.22.1/contributing/01-Proposals/MEP5/shared.png deleted file mode 100644 index b0b47f03..00000000 Binary files a/versioned_docs/version-v0.22.1/contributing/01-Proposals/MEP5/shared.png and /dev/null differ diff --git a/versioned_docs/version-v0.22.1/contributing/01-Proposals/MEP5/shared_advanced.drawio b/versioned_docs/version-v0.22.1/contributing/01-Proposals/MEP5/shared_advanced.drawio deleted file mode 100644 index 6f96eca0..00000000 --- a/versioned_docs/version-v0.22.1/contributing/01-Proposals/MEP5/shared_advanced.drawio +++ /dev/null @@ -1,187 +0,0 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - \ No newline at end of file diff --git a/versioned_docs/version-v0.22.1/contributing/01-Proposals/MEP5/shared_advanced.png b/versioned_docs/version-v0.22.1/contributing/01-Proposals/MEP5/shared_advanced.png deleted file mode 100644 index da989915..00000000 Binary files a/versioned_docs/version-v0.22.1/contributing/01-Proposals/MEP5/shared_advanced.png and /dev/null differ diff --git a/versioned_docs/version-v0.22.1/contributing/01-Proposals/MEP6/README.md b/versioned_docs/version-v0.22.1/contributing/01-Proposals/MEP6/README.md deleted file mode 100644 index edf52a6c..00000000 --- a/versioned_docs/version-v0.22.1/contributing/01-Proposals/MEP6/README.md +++ /dev/null @@ -1,123 +0,0 @@ ---- -slug: /MEP-6-dmz-networks -title: MEP-6 -sidebar_position: 6 ---- - -# DMZ Networks - -## Reasoning - -To fulfill higher levels of security measures the standard metal-stack approach with a single firewall in front of a set of machines might be insufficient. -There are cases where two physically distinct firewalls in front of application workload are mandatory. In traditional network terms this is known as DMZ approach. - -For Kubernetes workloads it makes sense to use the front cluster for ingress, WAF purposes and as outgoing proxy. The clusters may be used for application workload. - -## DMZ network - -- Use a separate DMZ network prefix for every tenant -- This is used as intermediate network btw. 
private networks of a tenant and the internet -- For every partition a distinct DMZ firewall/cluster is needed for a tenant -- For Gardener orchestrated Kubernetes clusters this network must be a publicly reachable internet prefix because shoot clusters need a vpn service that is used for instrumentation from the seed cluster - this will be a requirement as long as the inverse vpn tunnel feature Konnectivity is not available to us. - -## Approach 1: DMZ with publicly reachable internet prefix - -![DMZ Internet](dmz-internet_public.svg) - -A DMZ network with publicly reachable internet prefix will look like this in the metal-api: - -```yaml ---- -description: DMZ-Network -destinationprefixes: - - 0.0.0.0/0 -id: dmz -labels: - network.metal-stack.io/default-external: "" -name: DMZ-Network -parentnetworkid: null -partitionid: "" -prefixes: - - 212.90.30.128/25 -privatesuper: false -projectid: "" -vrf: 104007 -vrfshared: false -nat: true -shared: false -underlay: false -``` - -### DMZ firewall - -The firewall of the DMZ will intersect its private network for attached machines, the DMZ network and the public internet. - -- The private network of the project needs to import - - the default route from the internet network - - the DMZ network -- The internet network must import the DMZ network -- The DMZ network provides the default route for tenant's clusters in a partition. It imports the default route from the internet network - -### Application Firewall - -The firewall of application workloads intersects its private network for attached machines and the DMZ network. - -This is currently supported by the metal-networker and needs no further changes! 
- -## Approach 2: DMZ with private IPs - -![DMZ Internet](dmz-internet_private.svg) - -A DMZ network with private IPs will look like this in the metal-api: - -```yaml ---- -description: DMZ-Network -destinationprefixes: - - 0.0.0.0/0 -id: dmz -labels: - network.metal-stack.io/default-external: "" -name: DMZ-Network -parentnetworkid: tenant-super-network-fra-equ01 -partitionid: fra-equ01 -prefixes: - - 10.90.30.128/25 -privatesuper: false -projectid: "" -vrf: 4711 -vrfshared: false -nat: true -shared: true # it's usable from multiple projects -underlay: false -``` - -### DMZ firewall - -The firewall of the DMZ will intersect its private network for attached machines, the DMZ network and the public internet. - -- The private network of the project needs to import - - the default route from the internet network - - the DMZ network -- The internet network must import the DMZ network (only locally, no-export) -- The DMZ network provides the default route for tenant's clusters in a partition. It imports the default route from the internet network - -### Application Firewall - -The firewall of application workloads intersects its private network for attached machines and the DMZ network. 
- -## Code Changes / Implications - -- `metal-networker` and `metal-ccm` assume that there is only one network providing the default-route -- `metal-networker` needs to - - import the default route from the internet network to the dmz network (DMZ Firewall) - - import the DMZ network to the internet network and adjusting NAT rules (DMZ Firewall) - - import destination prefixes of the DMZ network to the private primary network (DMZ Firewall, Application Firewall) - - import DMZ-IPs of the private primary network to the DMZ network (DMZ Firewall, Application Firewall) -- `metal-api`: destination prefixes of private networks need to be configurable (`allocateNetwork`) -- `gardener-extension-provider-metal`: needs to be able to delete DMZ clusters (but skip the network deletion part) -- the application firewall is not publicly reachable - for debugging purposes a hop over the DMZ firewall is needed - -## Decision - -We decided to follow the second approach with private DMZ networks. diff --git a/versioned_docs/version-v0.22.1/contributing/01-Proposals/MEP6/dmz-internet_private.drawio b/versioned_docs/version-v0.22.1/contributing/01-Proposals/MEP6/dmz-internet_private.drawio deleted file mode 100644 index 7b83bbfc..00000000 --- a/versioned_docs/version-v0.22.1/contributing/01-Proposals/MEP6/dmz-internet_private.drawio +++ /dev/null @@ -1,178 +0,0 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - \ No newline at end of file diff --git a/versioned_docs/version-v0.22.1/contributing/01-Proposals/MEP6/dmz-internet_private.svg b/versioned_docs/version-v0.22.1/contributing/01-Proposals/MEP6/dmz-internet_private.svg deleted file mode 100644 index f5e58204..00000000 --- 
a/versioned_docs/version-v0.22.1/contributing/01-Proposals/MEP6/dmz-internet_private.svg +++ /dev/null @@ -1,3 +0,0 @@ -
Machine
Machine
Firewall DMZ
Firewall DMZ
DMZ VRF
DMZ VRF
Machine
Machine
Firewall A
Firewall A
Private VRF A
Private VRF A
10.0.0.2
10.90.30.129
/0 via Firewall A
10.0.0.2...
VRF A 10.0.0.1
VRF A 10.0.0.1
DMZ Network
10.90.30.128/25
DMZ Network...
Private Network
10.0.0.0/24
Private Network...
import /0
import /0
import 10.0.0.0/24
import 10.0.0.0/24 -
Machine
Machine
Firewall B
Firewall B
Private VRF B
Private VRF B
10.0.1.2
/0 via Firewall B
10.0.1.2...
VRF B 10.0.1.1
VRF B 10.0.1.1
Private Network
10.0.1.0/24
Private Network...
import /0
import /0
import 10.0.1.0/24
import 10.0.1.0/24 -
10.90.30.129 is reachable
/0 via Firewall DMZ
10.0.0.0/24 is reachable
10.0.1.0/24 is reachable
10.90.30.129 is reachable...
Internet
212.1.1.0/27
Internet...
SNAT to 212.1.1.1
SNAT to 212.1.1.1
Internet VRF
Internet VRF
import /0
import /0

import 10.0.0.0/24 no export
import 10.0.1.0/24 no export
import 10.90.30.128/25 no export
import 10.0.0.0/24 no exp...
Viewer does not support full SVG 1.1
\ No newline at end of file diff --git a/versioned_docs/version-v0.22.1/contributing/01-Proposals/MEP6/dmz-internet_public.drawio b/versioned_docs/version-v0.22.1/contributing/01-Proposals/MEP6/dmz-internet_public.drawio deleted file mode 100644 index 544939e5..00000000 --- a/versioned_docs/version-v0.22.1/contributing/01-Proposals/MEP6/dmz-internet_public.drawio +++ /dev/null @@ -1,184 +0,0 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - \ No newline at end of file diff --git a/versioned_docs/version-v0.22.1/contributing/01-Proposals/MEP6/dmz-internet_public.svg b/versioned_docs/version-v0.22.1/contributing/01-Proposals/MEP6/dmz-internet_public.svg deleted file mode 100644 index 5e825081..00000000 --- a/versioned_docs/version-v0.22.1/contributing/01-Proposals/MEP6/dmz-internet_public.svg +++ /dev/null @@ -1,3 +0,0 @@ -
Machine
Machine
Firewall DMZ
Firewall DMZ
DMZ VRF
DMZ VRF
Machine
Machine
Firewall A
Firewall A
Private VRF A
Private VRF A
10.0.0.2
212.1.2.3
/0 via Firewall A
10.0.0.2...
VRF A 10.0.0.1
VRF A 10.0.0.1
DMZ Network
212.1.2.0/27
DMZ Network...
Private Network
10.0.0.0/24
Private Network...
import /0
import /0
import 10.0.0.0/24
import 10.0.0.0/24 -
Machine
Machine
Firewall B
Firewall B
Private VRF B
Private VRF B
10.0.1.2
/0 via Firewall B
10.0.1.2...
VRF B 10.0.1.1
VRF B 10.0.1.1
Private Network
10.0.1.0/24
Private Network...
import /0
import /0
import 10.0.1.0/24
import 10.0.1.0/24 -
212.1.2.3 is reachable
/0 via Firewall DMZ
212.1.2.3 is reachable...
Internet
212.1.1.0/27 212.1.2.0/27
Internet...
SNAT to 212.1.1.1
SNAT to 212.1.1.1
Internet VRF
Internet VRF
import /0
import /0
import 212.1.2.0/27
import 10.0.0.0/24 no redistribute
import 10.0.1.0/24 no redistribute

import 212.1.2.0/27...
SNAT to
212.1.2.1
SNAT to...
SNAT to
212.1.2.2
SNAT to...
Viewer does not support full SVG 1.1
\ No newline at end of file diff --git a/versioned_docs/version-v0.22.1/contributing/01-Proposals/MEP8/README.md b/versioned_docs/version-v0.22.1/contributing/01-Proposals/MEP8/README.md deleted file mode 100644 index 14748fae..00000000 --- a/versioned_docs/version-v0.22.1/contributing/01-Proposals/MEP8/README.md +++ /dev/null @@ -1,503 +0,0 @@ ---- -slug: /MEP-7-configurable-filesystem-layout-for-machine-allocation -title: MEP-7 -sidebar_position: 7 ---- - -# Configurable Filesystem layout for Machine Allocation - -The current implementation uses a hard coded filesystem layout depending on the specified size and image. This is done in the metal-hammer. This worked well in the past because we had a small amount of sizes and images. But we reached a point where this is to restricted for all use cases we have to fulfill. It also forces us to modify the metal-hammer source code to support a new filesystem layout. - -This proposal tries to address this issue by introducing a filesystem layout struct in the metal-api which is then configurable per machine allocation. -The original behavior of automatic filesystem layout decision must still be present, because there must be no API change for existing API consumers. It should be a additional feature during machine allocation. - -## API and behavior - -The API will get a new endpoint `filesystemlayouts`to create/update/delete a set of available `filesystemlayouts`. - -### Constraints - -In order to keep the actual machine allocation api compatible, there must be no difference while allocating a machine. To achieve this every -`filesystemlayout` defines constraints which specifies for which combination of `sizes` and `images` this layout should be used by default. -The specified constraints over all `filesystemlayouts` therefore must be collision free, to be more specific, there must be exactly one layout outcome -for every possible combination of `sizes` and `images`. 
- -The `size` constraint must be a list of the exact size ids, the `image` constraint must be a map of os to semver compatible version constraint. For example: - -- `debian: ">= 10.20210101"` or `debian: "< 10.20210101"` - -The general form of a `image` constraint is a map from `os` to `versionconstraint` where: - -`os` must match the first part of the image without the version. -`versionconstraint` must be the comparator, a space and the version, or simply `*` to match all versions of this `os`. -The comparator must be one of: "=", "!=", ">", "<", ">=", "=>", "<=", "=<", "~", "~>", "^" - -It must also be possible to have a `filesystemlayout` in development or for other special purposes, which can be specified during the machine allocation. -To have such a layout, both constraints `sizes` and `images`must be empty list. - -### Reinstall - -The current reinstall implementation the metal-hammer detects during the installation on which disk the OS was installed and reports back to the metal-api the Report struct which has two properties `primarydisk` and `ospartition`. -Both fields are not required anymore because the logic is now shifted to the `filesystemlayout` definition. If `Disk.WipeOnReinstall` is set to true, this disk will be wiped, default is false and is preserved. - -### Handling of s2-xlarge machines - -These machines are a bit special compared to our `c1-*` machines because they have rotating hard disks for the mass storage purpose. -The downside is that the on board SATA-DOM has the same naming as the HDDs and can not be specified as the first /dev/sda disk because all HDDs are also /dev/sd\* disks. -Therefore we had a special SATA-DOM detection algorithm inside metal-hammer which simply checks for the smallest /dev/sd disk and took this to install the OS. - -This is not possible with the current approach, but we figured out that the SATA-DOM is always `/dev/sde`. 
So we can create a special `filesystemlayout` where the installations is made on this disk. - -### Possible Filesystemlayout hierarchies - -It is only possible to create a filesystem on top of a block device. The creation of a block device can be done on multiple ways, depending on the requirements regarding performance, space and redundancy of the filesystem. -It also depends on the disks available on the server. - -The current approach implements the following hierarchies: - -![filesystems](filesystems.png) - -### Implementation - -```go -// FilesystemLayout to be created on the given machine -type FilesystemLayout struct { - // ID unique layout identifier - ID string - // Description is human readable - Description string - // Filesystems to create on the server - Filesystems []Filesystem - // Disks to configure in the server with their partitions - Disks []Disk - // Raid if not empty, create raid arrays out of the individual disks, to place filesystems onto - Raid []Raid - // VolumeGroups to create - VolumeGroups []VolumeGroup - // LogicalVolumes to create on top of VolumeGroups - LogicalVolumes []LogicalVolume - // Constraints which must match to select this Layout - Constraints FilesystemLayoutConstraints -} - -type FilesystemLayoutConstraints struct { - // Sizes defines the list of sizes this layout applies to - Sizes []string - // Images defines a map from os to versionconstraint - // the combination of os and versionconstraint per size must be conflict free over all filesystemlayouts - Images map[string]string -} - -type RaidLevel string -type Format string -type GPTType string - -// Filesystem defines a single filesystem to be mounted -type Filesystem struct { - // Path defines the mountpoint, if nil, it will not be mounted - Path *string - // Device where the filesystem is created on, must be the full device path seen by the OS - Device string - // Format is the type of filesystem should be created - Format Format - // Label is optional enhances 
readability - Label *string - // MountOptions which might be required - MountOptions []string - // CreateOptions during filesystem creation - CreateOptions []string -} - -// Disk represents a single block device visible from the OS, required -type Disk struct { - // Device is the full device path - Device string - // Partitions to create on this device - Partitions []Partition - // WipeOnReinstall, if set to true the whole disk will be erased if reinstall happens - // during fresh install all disks are wiped - WipeOnReinstall bool -} - -// Raid is optional, if given the devices must match. -// TODO inherit GPTType from underlay device ? -type Raid struct { - // ArrayName of the raid device, most often this will be /dev/md0 and so forth - ArrayName string - // Devices the devices to form a raid device - Devices []Device - // Level the raidlevel to use, can be one of 0,1,5,10 - // TODO what should be support - Level RaidLevel - // CreateOptions required during raid creation, example: --metadata=1.0 for uefi boot partition - CreateOptions []string - // Spares defaults to 0 - Spares int -} - - -// VolumeGroup is optional, if given the devices must match. 
-type VolumeGroup struct { - // Name of the volumegroup without the /dev prefix - Name string - // Devices the devices to form a volumegroup device - Devices []string - // Tags to attach to the volumegroup - Tags []string -} - -// LogicalVolume is a block devices created with lvm on top of a volumegroup -type LogicalVolume struct { - // Name the name of the logical volume, without /dev prefix, will be accessible at /dev/vgname/lvname - Name string - // VolumeGroup the name of the volumegroup - VolumeGroup string - // Size of this LV in mebibytes (MiB) - Size uint64 - // LVMType can be either striped or raid1 - LVMType LVMType -} - -// Partition is a single partition on a device, only GPT partition types are supported -type Partition struct { - // Number of this partition, will be added to the device once partitioned - Number int - // Label to enhance readability - Label *string - // Size given in MebiBytes (MiB) - // if "0" is given the rest of the device will be used, this requires Number to be the highest in this partition - Size string - // GPTType defines the GPT partition type - GPTType *GPTType -} - -const ( - // VFAT is used for the UEFI boot partition - VFAT = Format("vfat") - // EXT3 is usually only used for /boot - EXT3 = Format("ext3") - // EXT4 is the default fs - EXT4 = Format("ext4") - // SWAP is for the swap partition - SWAP = Format("swap") - // None - NONE = Format("none") - - // GPTBoot EFI Boot Partition - GPTBoot = GPTType("ef00") - // GPTLinux Linux Partition - GPTLinux = GPTType("8300") - // GPTLinuxRaid Linux Raid Partition - GPTLinuxRaid = GPTType("fd00") - // GPTLinux Linux Partition - GPTLinuxLVM = GPTType("8e00") - - // LVMTypeLinear append across all physical volumes - LVMTypeLinear = LVMType("linear") - // LVMTypeStriped stripe across all physical volumes - LVMTypeStriped = LVMType("striped") - // LVMTypeStripe mirror with raid across all physical volumes - LVMTypeRaid1 = LVMType("raid1") -) -``` - -Example `metalctl` outputs: - 
-```bash -$ metalctl filesystemlayouts ls -ID DESCRIPTION SIZES IMAGES -default default fs layout c1-large-x86, c1-xlarge-x86 debian >=10, ubuntu >=20.04, centos >=7 -ceph fs layout for ceph s2-large-x86, s2-xlarge-x86 debian >=10, ubuntu >=20.04 -firewall firewall fs layout c1-large-x86, c1-xlarge-x86 firewall >=2 -storage storage fs layout s3-large-x86 centos >=7 -s3 storage fs layout s2-xlarge-x86 debian >=10, ubuntu >=20.04, >=firewall-2 -default-devel devel fs layout -``` - -The `default` layout reflects what is actually implemented in metal-hammer to guarantee backward compatibility. - -```yaml ---- -id: default -constraints: - sizes: - - c1-large-x86 - - c1-xlarge-x86 - images: - debian: ">=10" - ubuntu: ">=20.04" - centos: ">=7" -filesystems: - - path: "/boot/efi" - device: "/dev/sda1" - format: "vfat" - options: "-F 32" - label: "efi" # required to be compatible with old images - - path: "/" - device: "/dev/sda2" - format: "ext4" - label: "root" # required to be compatible with old images - - path: "/var/lib" - device: "/dev/sda3" - format: "ext4" - label: "varlib" # required to be compatible with old images - - path: "/tmp" - device: "tmpfs" - format: "tmpfs" - mountoptions: - [ - "defaults", - "noatime", - "nosuid", - "nodev", - "noexec", - "mode=1777", - "size=512M", - ] -disks: - - device: "/dev/sda" - wipe: true - partitions: - - number: 1 - label: "efi" - size: 500 - type: GPTBoot - - number: 2 - label: "root" - size: 5000 - type: GPTLinux - - number: 3 - label: "varlib" - size: 0 # to end of partition - type: GPTLinux -``` - -The `firewall` layout reuses the built in nvme disk to store the logs, which is way faster and larger than what the sata-dom ssd provides. 
- -```yaml ---- -id: firewall -constraints: - sizes: - - c1-large-x86 - - c1-xlarge-x86 - images: - firewall: ">=2" -filesystems: - - path: "/boot/efi" - device: "/dev/sda1" - format: "vfat" - options: "-F 32" - - path: "/" - device: "/dev/sda2" - format: "ext4" - - path: "/var" - device: "/dev/nvme0n1p1" - format: "ext4" -disks: - - device: "/dev/sda" - wipe: true - partitions: - - number: 1 - label: "efi" - size: 500 - type: GPTBoot - - number: 2 - label: "root" - size: 5000 - type: GPTLinux - - device: "/dev/nvme0n1" - wipe: true - partitions: - - number: 1 - label: "var" - size: 0 - type: GPTLinux -``` - -The `storage` layout will be used for the storage servers, which must have mirrored boot disks. - -```yaml ---- -id: storage -constraints: - sizes: - - s3-large-x86 - images: - centos: ">=7" -filesystems: - - path: "/boot/efi" - device: "/dev/md1" - format: "vfat" - options: "-F32" - - path: "/" - device: "/dev/md2" - format: "ext4" -disks: - - device: "/dev/sda" - wipe: true - partitions: - - number: 1 - label: "efi" - size: 500 - type: GPTLinuxRaid - - number: 2 - label: "root" - size: 5000 - type: GPTLinuxRaid - - device: "/dev/sdb" - wipe: true - partitions: - - number: 1 - label: "efi" - size: 500 - type: GPTLinuxRaid - - number: 2 - label: "root" - size: 5000 - type: GPTLinuxRaid -raid: - - name: "/dev/md1" - level: 1 - devices: - - "/dev/sda1" - - "/dev/sdb1" - options: "--metadata=1.0" - - name: "/dev/md2" - level: 1 - devices: - - "/dev/sda2" - - "/dev/sdb2" - options: "--metadata=1.0" -``` - -The `s3-storage` layout matches the special situation on the s2-xlarge machines. 
- -```yaml ---- -id: s3-storage -constraints: - sizes: - - c1-large-x86 - - s2-xlarge-x86 - images: - debian: ">=10" - ubuntu: ">=20.04" - centos: ">=7" -filesystems: - - path: "/boot/efi" - device: "/dev/sde1" - format: "vfat" - options: "-F 32" - - path: "/" - device: "/dev/sde2" - format: "ext4" - - path: "/var/lib" - device: "/dev/sde3" - format: "ext4" -disks: - - device: "/dev/sde" - wipe: true - partitions: - - number: 1 - label: "efi" - size: 500 - type: GPTBoot - - number: 2 - label: "root" - size: 5000 - type: GPTLinux - - number: 3 - label: "varlib" - size: 0 # to end of partition - type: GPTLinux -``` - -A sample `lvm` layout which puts `/var/lib` as stripe on the nvme device - -```yaml ---- -id: lvm -description: "lvm layout" -constraints: - size: - - s2-xlarge-x86 - images: - debian: ">=10" - ubuntu: ">=20.04" - centos: ">=7" -filesystems: - - path: "/boot/efi" - device: "/dev/sda1" - format: "vfat" - createoptions: - - "-F 32" - label: "efi" - - path: "/" - device: "/dev/sda2" - format: "ext4" - label: "root" - - path: "/var/lib" - device: "/dev/vg00/varlib" - format: "ext4" - label: "varlib" - - path: "/tmp" - device: "tmpfs" - format: "tmpfs" - mountoptions: - [ - "defaults", - "noatime", - "nosuid", - "nodev", - "noexec", - "mode=1777", - "size=512M", - ] -volumegroups: - - name: "vg00" - devices: - - "/dev/nvmne0n1" - - "/dev/nvmne0n2" -logicalvolumes: - - name: "varlib" - volumegroup: "vg00" - size: 200 - lvmtype: "striped" -disks: - - device: "/dev/sda" - wipeonreinstall: true - partitions: - - number: 1 - label: "efi" - size: 500 - gpttype: "ef00" - - number: 2 - label: "root" - size: 5000 - gpttype: "8300" - - device: "/dev/nvmne0n1" - wipeonreinstall: false - - device: "/dev/nvmne0n2" - wipeonreinstall: false -``` - -## Components which requires modifications - -- metal-hammer: - - change implementation from build in hard coded logic - - move logic to create fstab from install.sh to metal-hammer -- metal-api: - - new endpoint 
`filesystemlayouts` - - add optional spec of `filesystemlayout` during `allocation` with validation if given `filesystemlayout` is possible on given size. - - add `allocation.filesystemlayout` in the response, based on either the specified `filesystemlayout` or the calculated one. - - implement `filesystemlayouts` validation for: - - matching to disks in the size - - no overlapping with the sizes/imagefilter specified in `filesystemlayouts` - - all devices specified exists from top to bottom (fs -> disks -> device || fs -> raid -> devices) -- metalctl: - - implement `filesystemlayouts` -- metal-go: - - adopt api changes -- metal-images: - - install mdadm for raid support diff --git a/versioned_docs/version-v0.22.1/contributing/01-Proposals/MEP8/filesystems.drawio b/versioned_docs/version-v0.22.1/contributing/01-Proposals/MEP8/filesystems.drawio deleted file mode 100644 index 0f0c6ab5..00000000 --- a/versioned_docs/version-v0.22.1/contributing/01-Proposals/MEP8/filesystems.drawio +++ /dev/null @@ -1,43 +0,0 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - \ No newline at end of file diff --git a/versioned_docs/version-v0.22.1/contributing/01-Proposals/MEP8/filesystems.png b/versioned_docs/version-v0.22.1/contributing/01-Proposals/MEP8/filesystems.png deleted file mode 100644 index 6d903b7e..00000000 Binary files a/versioned_docs/version-v0.22.1/contributing/01-Proposals/MEP8/filesystems.png and /dev/null differ diff --git a/versioned_docs/version-v0.22.1/contributing/01-Proposals/MEP9/README.md b/versioned_docs/version-v0.22.1/contributing/01-Proposals/MEP9/README.md deleted file mode 100644 index a8cae83d..00000000 --- a/versioned_docs/version-v0.22.1/contributing/01-Proposals/MEP9/README.md +++ /dev/null @@ -1,132 +0,0 @@ ---- -slug: /MEP-9-no-open-ports-to-the-data-center -title: MEP-9 -sidebar_position: 9 ---- - -# No Open Ports To the Data Center - -Our metal-stack partitions typically have open ports for metal-stack 
native services, these are: - -- SSH port on the firewalls -- bmc-reverse-proxy for serial console access through the metal-console - -These open ports are potential security risks. For example, while SSH access is possible only with private key it's still vulnerable to DoS attack. - -Therefore, we want to get rid off these open ports to reduce the attack surface to the data center. - -## Requirements - -- Access to firewall SSH only via VPN -- Easy to update VPN components - -As a next step, we can also consider joining the management servers to the VPN mesh, which would replace typical WireGuard setups for operators to enter resources inside the partition. - -## High Level Design - -[](./architecture.svg) - -> Simplified drawing showing old vs. new architecture. - -### Concerns - -There's few concerns when using WireGuard for implementing VPN: - -1. WireGuard doesn't implement dynamic cipher substitution. Which is important in case one of the crypto methods, used by WireGuard will be broken. The only possible solution for that will be to update WireGuard to a fixed version. -2. Coordination server(Headscale) is a single point of failure. In case it fails, it potentially can disconnect existing members of the network, as WireGuard can't manage dynamic IPs by itself. -3. Headscale is already falls behind Tailscale coordination server implementation. Which can complicate the upgrade to newer version of Tailscale client in case of emergency. - -### Solutions to concerns - -1. Tailscale node software is using userspace implementation of WireGuard -- `wireguard-go`. One of the options is to inject Tailscale client into `metalctl`. And make it available as `metalctl vpn` or similar command. It should be possible to do as `tailscale` node is already available as open sourced Go pkg. That would allow us to control, what version of Tailscale users are using and in case of any critical changes to enforce them to update `metalctl` to use VPN functionality. -2. 
Would it be a considerable risk? We could look into `wg-dynamic` project to cover this problem. -3. At the moment, repository looks well maintained and the metal-stack team already contributes to it. - -## Implementation Details - -### metal-roles - -`metal-roles` will be responsible for deployment of `headscale` server(via new `headscale` role). It also should provide sufficient config to `metal-api` so it establishes connection with `headscale` gRPC server. - -### New `metalctl` commands - -`metalctl` will be responsible for client-side implementation of this MEP. Specifically, it's by using `metalctl` user expected to connect to firewalls. - -- `metalctl vpn` -- section for VPN related commands: - - `metalctl vpn get key [vpn name] --namespace [namespace name]` -- returns auth key to be used with `tailscale` client for establishing connection. - -Extend `metalctl firewall`: - -- `metalctl firewall ssh [ID]` -- connect to firewall via SSH. - -Extend `metalctl machine`: - -- `metalctl machine ssh [ID]` -- connect to machine via SSH. - -`metalctl` will be able to connect to firewall and machines by running `tailscale` in container. - -### metal-api - -Updates to `metal-api` should be made, so that it's able to add firewalls to VPNs. There should be one Tailscale namespace per project. So if multiple firewalls are created in single project, they will join the same namespace. - -Two new flags should be introduced to connect `metal-api` to `headscale` gRPC server: - -- `headscale-addr` -- specifies address of Headscale grpc API. -- `headscale-api-key` -- specifies temporary API key to connect to Headscale. It should be replaced and then rotated by `metal-api`. - -If `metal-api` initialized with `headscale` connection it should automatically join all created firewalls to VPN. - -Add new endpoint, that will be used by `metalctl` to connect to VPN: - -- `/v1/vpn GET` -- requests auth key from `headscale` server. 
- -### metal-hammer - -`metal-hammer` acts as an intermediary for machine configuration between `metal-api` and machine's image. Specifically it writes to `/etc/metal/install.yaml` file, data from which later will be used by image's `install.sh` file. - -To implement VPN support we have to add authentication key and VPN server address to `install.yaml` file. This key will be used to join machine to a VPN. - -### metal-images - -Images `install.sh` script have to be updated to work with authentication key and VPN server address, provided in `install.yaml` file. If this key is present, machine should connect to VPN. - -### metal-networker - -`metal-networker` also have to know if VPN was configured. In that case we need to disable public access to SSH and allow all(?) traffic from WireGuard interface. - -### firewall-controller - -`firewall-controller` have to monitor changes in `Firewall` resource and keep `tailscaled` version up-to-date. - -### Resources - -Update `Firewall` resource to include desired/actual `tailscale` version: - -``` -Firewall: - Spec: - tailscale: - Version: Minimal version - ... - Status: - ... - VPN: - Status: Boolean field - tailscale: - Version: Actual version - ... -``` - -### bmc-reverse-proxy - -TODO - -## References - -1. [WireGuard: Next Generation Secure Network Tunnel](https://www.youtube.com/watch?v=88GyLoZbDNw) -2. [How Tailscale works](https://tailscale.com/blog/how-tailscale-works) -3. [Tailscale is officially SOC 2 compliant](https://tailscale.com/blog/soc2) -4. [Why not Wireguard](https://www.ipfire.org/blog/why-not-wireguard) -5. [Wireguard: Known Limitations](https://www.wireguard.com/known-limitations/) -6. [Wireguard: Things That Might Be Accomplished](https://www.wireguard.com/todo/) -7. 
[Headscale: Tailscale control protocol v2](https://github.com/juanfont/headscale/issues/526) diff --git a/versioned_docs/version-v0.22.1/contributing/01-Proposals/MEP9/architecture.drawio b/versioned_docs/version-v0.22.1/contributing/01-Proposals/MEP9/architecture.drawio deleted file mode 100644 index adb09214..00000000 --- a/versioned_docs/version-v0.22.1/contributing/01-Proposals/MEP9/architecture.drawio +++ /dev/null @@ -1,324 +0,0 @@ - - - - - - - -
-
-
- Metal Control Plane -
-
-
-
- - Metal Control Plane - -
-
- - - - -
-
-
- metal-stack -
- Partition -
-
-
-
- - metal-stack... - -
-
- - - - -
-
-
- firewall -
-
-
-
- - firewall - -
-
- - - - - -
-
-
- machine -
-
-
-
- - machine - -
-
- - - - -
-
-
- ssh -
-
-
-
- - ssh - -
-
- - - - -
-
-
- bmc-proxy -
-
-
-
- - bmc-proxy - -
-
- - - - - - - - - - -
-
-
- Metal Control Plane -
-
-
-
- - Metal Control Plane - -
-
- - - - -
-
-
- metal-stack -
- Partition -
-
-
-
- - metal-stack... - -
-
- - - - -
-
-
- firewall -
-
-
-
- - firewall - -
-
- - - - - -
-
-
- machine -
-
-
-
- - machine - -
-
- - - - -
-
-
- ssh -
-
-
-
- - ssh - -
-
- - - - - - -
-
-
- bmc-proxy -
-
-
-
- - bmc-proxy - -
-
- - - - -
-
-
- headscale -
-
-
-
- - headscale - -
-
- - - - - - - - - - -
-
-
- tailscaled -
-
-
-
- - tailscaled - -
-
- - - - - - -
-
-
- tailscaled -
-
-
-
- - tailscaled - -
-
- - - - -
-
-
- Internet -
-
-
-
- - Internet - -
-
- - - - -
-
-
- Internet -
-
-
-
- - Internet - -
-
-
- - - - - Viewer does not support full SVG 1.1 - - - -
diff --git a/versioned_docs/version-v0.22.1/contributing/01-Proposals/MEP9/architecture.svg b/versioned_docs/version-v0.22.1/contributing/01-Proposals/MEP9/architecture.svg deleted file mode 100644 index fd268d2f..00000000 --- a/versioned_docs/version-v0.22.1/contributing/01-Proposals/MEP9/architecture.svg +++ /dev/null @@ -1 +0,0 @@ -
Metal Control Plane
Metal Control Plane
metal-stack
Partition
metal-stack...
firewall
firewall
machine
machine
ssh
ssh
bmc-proxy
bmc-proxy
Metal Control Plane
Metal Control Plane
metal-stack
Partition
metal-stack...
firewall
firewall
machine
machine
ssh
ssh
bmc-proxy
bmc-proxy
headscale
headscale
tailscaled
tailscaled
tailscaled
tailscaled
Internet
Internet
Internet
Internet
Text is not SVG - cannot display
\ No newline at end of file diff --git a/versioned_docs/version-v0.22.1/contributing/01-Proposals/_category_.json b/versioned_docs/version-v0.22.1/contributing/01-Proposals/_category_.json deleted file mode 100644 index 2e7fa4bf..00000000 --- a/versioned_docs/version-v0.22.1/contributing/01-Proposals/_category_.json +++ /dev/null @@ -1,4 +0,0 @@ -{ - "position": 1, - "label": "Enhancement Proposals" -} \ No newline at end of file diff --git a/versioned_docs/version-v0.22.1/contributing/01-Proposals/index.md b/versioned_docs/version-v0.22.1/contributing/01-Proposals/index.md deleted file mode 100644 index 0f6eddc3..00000000 --- a/versioned_docs/version-v0.22.1/contributing/01-Proposals/index.md +++ /dev/null @@ -1,69 +0,0 @@ ---- -slug: /enhancement-proposals -title: Enhancement Proposals -sidebar_position: 1 ---- - -# Metal Stack Enhancement Proposals (MEPs) - -This section contains proposals which address substantial modifications to metal-stack. - -Every proposal has a short name which starts with _MEP_ followed by an incremental, unique number. Proposals should be raised as pull requests in the [website](https://github.com/metal-stack/website) repository and can be discussed in Github issues. - -The list of proposals and their current state is listed in the table below. - -Possible states are: - -- `In Discussion` -- `Accepted` -- `Declined` -- `In Progress` -- `Completed` -- `Aborted` - -Once a proposal was accepted, an issue should be raised and the implementation should be done in a separate PR. 
- -| Name | Description | State | Progress | -| :------------------------------------------------------------- | :--------------------------------------------- | :-------------: | :----------------------------------------------------------------: | -| [MEP-1](MEP1/README.md) | Distributed Control Plane Deployment | `Declined` | | -| [MEP-2](MEP2/README.md) | Two Factor Authentication | `Aborted` | | -| [MEP-3](MEP3/README.md) | Machine Re-Installation to preserve local data | `Completed` | | -| [MEP-4](MEP4/README.md) | Multi-tenancy for the metal-api | `In Progress` | [releases#236](https://github.com/metal-stack/releases/issues/236) | -| [MEP-5](MEP5/README.md) | Shared Networks | `Completed` | | -| [MEP-6](MEP6/README.md) | DMZ Networks | `Completed` | | -| [MEP-7](https://github.com/metal-stack/docs-archive/pull/51) | Passing environment variables to machines | `Declined` | | -| [MEP-8](MEP8/README.md) | Configurable Filesystemlayout | `Completed` | | -| [MEP-9](MEP9/README.md) | No Open Ports To the Data Center | `Completed` | | -| [MEP-10](MEP10/README.md) | SONiC Support | `Completed` | | -| [MEP-11](MEP11/README.md) | Auditing of metal-stack resources | `Completed` | | -| [MEP-12](MEP12/README.md) | Rack Spreading | `Completed` | | -| [MEP-13](MEP13/README.md) | IPv6 | `Completed` | | -| [MEP-14](MEP14/README.md) | Independence from external sources | `Completed` | | -| [MEP-15](https://github.com/metal-stack/docs-archive/pull/232) | HAL Improvements | `In Discussion` | [releases#238](https://github.com/metal-stack/releases/issues/238) | -| [MEP-16](MEP16/README.md) | Firewall Support for Cluster API Provider | `Accepted` | [releases#237](https://github.com/metal-stack/releases/issues/237) | -| [MEP-17](MEP17/README.md) | Global Network View | `In Discussion` | | -| [MEP-18](MEP18/README.md) | Autonomous Control Plane | `In Discussion` | | - -## Proposal Process - -1. 
Before starting a new proposal, it is advised to have a quick chat with one of the maintainers. -2. Create a draft pull request in the [website](https://github.com/metal-stack/website) repository with your proposal. Your proposal doesn't have to be finished at this point. -3. Share the PR in the [metal-stack Slack](https://metal-stack.slack.com/) and invite maintainers to review it. -4. The review itself will probably take place in multiple iterations. Don't be discouraged if your proposal is not accepted right away. The goal is to reach consensus. -5. Once your proposal is accepted, create an umbrella issue in the relevant repository or when multiple repositories are involved in the [releases](https://github.com/metal-stack/releases). -6. Other issues should be created in different repositories and linked to the umbrella issue. -7. Unless stated otherwise, the proposer is responsible for the implementation of the proposal. - -## How to Write a Good MEP - -In the first section of your MEP, start with the current situation and the motivation for the change. Summarize your proposal briefly. - -Next follows the main part: describe your proposal in detail. Which parts of of metal-stack are affected? Are there API changes? If yes, describe them and provide examples here. -Try to think of side effects your proposal might have. Try to provide a view on how your proposal affects users of metal-stack. -Highlight breaking changes and think of a migration path for existing users. If your proposal affects multiple components, try to describe the interaction between them. - -After the main part of your proposal, feel free to add additional sections, e.g. about alternatives that were considered, non-goals or future possibilities. - -Depending on the complexity of your proposal, you might want to add a section about the implementation plan or roadmap. - -You can have a look at the existing MEPs for inspiration. As you will notice: not every MEP has the same structure. 
Feel free to structure your MEP in a way that makes sense for your proposal. diff --git a/versioned_docs/version-v0.22.1/contributing/02-planning-meetings.mdx b/versioned_docs/version-v0.22.1/contributing/02-planning-meetings.mdx deleted file mode 100644 index df10177b..00000000 --- a/versioned_docs/version-v0.22.1/contributing/02-planning-meetings.mdx +++ /dev/null @@ -1,120 +0,0 @@ ---- -slug: /planning-meetings -title: Planning Meetings -sidebar_position: 2 ---- - -# Planning Meetings - -Public planning meetings are held **biweekly** on **odd calendar weeks** from **14:00 to 14:30** (Berlin/Europe timezone) on Microsoft Teams. The purpose is to provide an overview of our current projects and priorities, as well as to discuss new topics and issues within the group. - -export function PlanningMeetingDatesTable() { - const today = new Date(); - const dayOfWeek = today.getDay(); - - let daysUntilMonday = 0; - switch (dayOfWeek) { - case 0: - daysUntilMonday = 1; - break; - case 1: - daysUntilMonday = 0; - break; - default: - daysUntilMonday = 8 - dayOfWeek; - } - - const nextMonday = new Date(); - nextMonday.setDate(nextMonday.getDate() + daysUntilMonday) - - let onejan = new Date(today.getFullYear(), 0, 1); - let week = Math.ceil((((nextMonday.getTime() - onejan.getTime()) / 86400000) + onejan.getDay() + 1) / 7); - - if (week % 2 === 0) { - nextMonday.setDate(nextMonday.getDate() + 7) - } - - const blacklist = [ - new Date('2025-12-29'), - ] - - const amount = 8 - const dates = []; - - for (let i = 0; i < amount; i++) { - const nextDate = new Date(nextMonday); - nextDate.setDate(nextDate.getDate() + (i * 14)) - - if (blacklist.find(item => {return item.toDateString() == nextDate.toDateString()}) !== undefined ) { - continue - } - - dates.push(nextDate.toDateString()) - } - - return ( - - - - - - - - - - {dates.map((date, index) => ( - - - - - - ))} - -
DateTimeLink
{date}14:00 – 14:30Join Link
- ) -} - - - -Our [development planning board](https://github.com/orgs/metal-stack/projects/34) can be found on GitHub. - -[//]: <> (The C025PB1EUKC in the slack url references the #devs channel.) -If you want to get an invitation to the event, please drop us a line on our [Slack channel](https://metal-stack.slack.com/archives/C025PB1EUKC). - -Planning meetings are currently not recorded. The meetings are held either in English or German depending on the attendees. - -:::info -Note that anyone can contribute to metal-stack without participating in planning meetings. However, if you want to speed up the review process for your requirements, it might be helpful to attend the meetings. -::: - -## Agenda - -Here is the agenda that we generally want to follow in a planning meeting: - -- Possibility to bring up news that are interesting for every developer of the metal-stack org -- Check `Done` column and archive cards - - Attendees have the chance to briefly present achievements if they want -- Check the `In Progress` column and discuss whether these tasks are still worked on, there were significant blockers or they can be lower-prioritized -- Check new issues labelled with `triage` and prioritize them -- Allow attendees to bring up issues and prioritize them - - Attendees have the chance to briefly present these new issues - -## Idea Backlog - -The backlog contains ideas of what could become part of the roadmap in the future. The list is ordered alphabetically. Therefore, the order does not express the importance or weight of a backlog item. - -We incorporate community feedback into the roadmap. If you think that important points are missing in the backlog, please share your ideas with us. We have a Slack channel. Please check out [metal-stack.io](https://metal-stack.io) for contact information. - -:::danger -By no means this list is a promise of what is being worked on in the near future. It is just a summary of ideas that was agreed on to be "nice to have". 
It is up to the investors, maintainers and the community to choose topics from this list and to implement them or to remove them from the list. -::: - -- Add metal-stack to [Gardener conformance test grid](https://testgrid.k8s.io/gardener-all) -- Autoscaler for metal control plane components -- CI dashboard and public integration testing -- Improved release and deploy processes (GitOps, [Spinnaker](https://spinnaker.io/), [Flux](https://fluxcd.io/)) -- Machine internet without firewalls -- metal-stack dashboard (UI) -- Offer our metal-stack extensions as enterprise products (accounting, cluster-api, S3) (neither of them will ever be required for running metal-stack, they just add extra value for certain enterprises) -- Partition managed by Kubernetes (with Kubelets joining the control plane cluster) -- Public offering / demo playground diff --git a/versioned_docs/version-v0.22.1/contributing/03-contribution-guideline.md b/versioned_docs/version-v0.22.1/contributing/03-contribution-guideline.md deleted file mode 100644 index 010c2a05..00000000 --- a/versioned_docs/version-v0.22.1/contributing/03-contribution-guideline.md +++ /dev/null @@ -1,147 +0,0 @@ ---- -slug: /contribution-guideline -title: Contribution Guideline -sidebar_position: 3 ---- - -# Contribution Guideline - -This document describes the way we want to contribute code to the projects of metal-stack, which are hosted on [github.com/metal-stack](https://github.com/metal-stack). - -The document is meant to be understood as a general guideline for contributions, but not as burden to be placed on a developer. Use your best judgment when contributing code. Try to be as clean and precise as possible when writing code and try to make your code as maintainable and understandable as possible for other people. - -Even if it should go without saying, we live an open culture of discussion, in which everybody is welcome to participate. 
We treat every contribution with respect and objectiveness with the general aim to write software of quality. - -If you want, feel free to propose changes to this document in a pull request. - -## How Can I Contribute? - -Open a Github issue in the project you would like to contribute. Within the issue, your idea can be discussed. It is also possible to directly create a pull request when the set of changes is relatively small. - -When opening an issue please consider the following aspects: - -1. Create a meaningful issue describing the WHY? of your contribution. -1. Try to set appropriate labels to the issue. For example, attach the `triage` label to your issue if you want it to be discussed in the next [planning meeting](./02-planning-meetings.mdx). It might be useful to attend the meeting if you want to emphasize it being worked on. - -### Pull Requests - -The process described here has several goals: - -- Maintain quality -- Enable a sustainable system to review contributions -- Enable documented and reproducible addition of contributions - -1. Create a repository fork within the context of that issue. Members of the organization may work on the repository directly without a fork, which allows building development artifacts more easily. -1. Develop, document and test your contribution (try not to solve more than one issue in a single pull request). -1. Create a Draft Pull Request to the repository's main branch. -1. Create a meaningful description of the pull request or reference the related issue. The pull request template explains what the content should include, please read it. -1. Ask for merging your contribution by removing the draft marker. Repository maintainers (see [Code Ownership](#code-ownership)) are notified automatically, but you can also reach out to people directly on Slack if you want a review from a specific person. - -## General Objectives - -This section contains language-agnostic topics that all metal-stack projects are trying to follow. 
- -### Code Ownership - -The code base is owned by the entire team and every member is allowed to contribute changes to any of the projects. This is considered as collective code ownership[^1]. - -As a matter of fact, there are persons in a project, which already have experience with the sources. These are defined directly in the repository's [CODEOWNERS](https://docs.github.com/en/repositories/managing-your-repositorys-settings-and-features/customizing-your-repository/about-code-owners) file. If you want to merge changes into the master branch, it is advisable to include code owners into the process of discussion and merging. - -### Microservices - -One major ambition of metal-stack is to follow the idea of [microservices](https://en.wikipedia.org/wiki/Microservices). This way, we want to achieve that we can - -- adapt to changes faster than with monolithic architectures, -- be free of restrictions due to certain choices of technology, -- leverage powerful traits of cloud infrastructures (e.g. high-scalability, high-availability, ...). - -### Programming Languages - -We are generally open to write code in any language that fits best to the function of the software. However, we encourage [golang](https://en.wikipedia.org/wiki/Go_(programming_language)) to be the main language of metal-stack as we think that it makes development faster when not establishing too many different languages in our architecture. Reason for this is that we are striving for consistent behavior of the microservices, similar to what has been described for the Twelve-Factor App (see [12 Factor](https://12factor.net/)). We help enforcing unified behavior by allowing a small layer of shared code for every programming language. We will refer to this shared code as "libraries" for the rest of this document. - -### Artifacts - -Artifacts are always produced by a CI process (Github Actions). - -Docker images are published on the Github Container Registry of the metal-stack organization. 
- -Binary artifacts or OS images can be uploaded to `images.metal-stack.io` if necessary. - -When building Docker images, please consider our build tool [docker-make](https://github.com/fi-ts/docker-make) or the specific [docker-make action](https://github.com/fi-ts/action-docker-make) respectively. - -### APIs - -We are currently making use of [Swagger](https://swagger.io/) when we exposing traditional REST APIs for end-users. This helps us with being technology-agnostic as we can generate clients in almost any language using [go-swagger](https://goswagger.io/). Swagger additionally simplifies the documentation of our APIs. - -Most APIs though are not required to be user-facing but are of technical nature. These are preferred to be implemented using [grpc](https://grpc.io/). - -#### Versioning - -Artifacts are versioned by tagging the respective repository with a tag starting with the letter `v`. After the letter, there stands a valid [semantic version](https://semver.org/). - -### Documentation - -In order to make it easier for others to understand a project, we document general information and usage instructions in a `README.md` in any project. - -In addition to that, we document a microservice in the [docs](https://github.com/metal-stack/docs) repository. The documentation should contain the reasoning why this service exists and why it was being implemented the way it was being implemented. The aim of this procedure is to reduce the time for contributors to comprehend architectural decisions that were made during the process of writing the software and to clarify the general purpose of this service in the entire context of the software. - -## Guidelines - -This chapter describes general guidelines on how to develop and contribute code for a certain programming language. 
- -### Golang - -Development follows the official guide to: - -- Write clear, idiomatic Go code[^2] -- Learn from mistakes that must not be repeated[^3] -- Apply appropriate names to your artifacts: - - [https://go.dev/talks/2014/names.slide](https://go.dev/talks/2014/names.slide) - - [https://go.dev/blog/package-names](https://go.dev/blog/package-names) - - [https://go.dev/doc/effective_go#names](https://go.dev/doc/effective_go#names) -- Enable others to understand the reasoning of non-trivial code sequences by applying a meaningful documentation. - -#### Development Decisions - -- **Dependency Management** by using Go modules -- **Build and Test Automation** by using [GNU Make](https://man7.org/linux/man-pages/man1/make.1p.html). -- **End-user APIs** should consider using go-swagger and [Go-Restful](https://github.com/emicklei/go-restful) - **Technical APIs** should consider using [grpc](https://grpc.io/) - -#### Libraries - -metal-stack maintains several libraries that you should utilize in your project in order to unify common behavior. Some of these projects are: - -- [metal-go](https://github.com/metal-stack/metal-go) -- [metal-lib](https://github.com/metal-stack/metal-lib) - -#### Error Handling with Generated Swagger Clients - -From the server-side you should ensure that you are returning the common error json struct in case of an error as defined in the `metal-lib/httperrors`. Ensure you are using `go-restful >= v2.9.1` and `go-restful-openapi >= v0.13.1` (allows default responses with error codes other than 200). - -### Documentation - -We want to share knowledge and keep things simple. If things cannot kept simple we want to enable everybody to understand them by: - -- Document in short sentences[^4]. -- Do not explain the HOW (this is already documented by your code and documenting the obvious is considered a defect). -- Explain the WHY. Add a "to" in your documentation line to force yourself to explain the reasonning (e.g. "` to `"). 
- -### Python - -Development follows the official guide to: - -- Style Guide for Python Code (PEP 8)[^5] - - The use of an IDE like [PyCharm](https://www.jetbrains.com/pycharm/) helps to write compliant code easily -- Consider [setuptools](https://pythonhosted.org/an_example_pypi_project/setuptools.html) for packaging -- If you want to add a Python microservice to the mix, consider [pyinstaller](https://github.com/pyinstaller/pyinstaller) on Alpine to achieve small image sizes - -[^1]: [https://martinfowler.com/bliki/CodeOwnership.html](https://martinfowler.com/bliki/CodeOwnership.html) - -[^2]: [https://go.dev/doc/effective_go](https://go.dev/doc/effective_go) - -[^3]: [https://github.com/golang/go/wiki/CodeReviewComments](https://github.com/golang/go/wiki/CodeReviewComments) - -[^4]: [https://github.com/golang/go/wiki/CodeReviewComments#comment-sentences](https://github.com/golang/go/wiki/CodeReviewComments#comment-sentences) - -[^5]: [https://www.python.org/dev/peps/pep-0008/](https://www.python.org/dev/peps/pep-0008/) diff --git a/versioned_docs/version-v0.22.1/contributing/04-release-flow.md b/versioned_docs/version-v0.22.1/contributing/04-release-flow.md deleted file mode 100644 index 2a6403b7..00000000 --- a/versioned_docs/version-v0.22.1/contributing/04-release-flow.md +++ /dev/null @@ -1,107 +0,0 @@ ---- -slug: /release-flow -title: Release Flow -sidebar_position: 4 ---- - -# Releases - -The metal-stack contains of many microservices that depend on each other. The automated release flow is there to ensure that all components work together flawlessly for every metal-stack release. - -Releases and integration tests are published through our [release repository](https://github.com/metal-stack/releases). You can also find the [release notes](https://github.com/metal-stack/releases/releases) for this metal-stack version in there. The release notes contain information about new features, upgrade paths and bug fixes. 
- -If you want, you can sign up at our Slack channel where we are announcing every new release. Often, we provide additional information for metal-stack administrators and adopters at this place, too. - -This document is intended for developers, especially maintainers of metal-stack projects. - -## Release Flow - -The following diagram attempts to describe our current release flow: - -![](release_flow.svg) - -A release is created in the following way: - -- Individual repository maintainers within the metal-stack GitHub Organization can publish a release of their component. -- This release is automatically pushed to the `develop` branch of the release repository by the metal-robot. -- A push triggers a virtual release integration test using the mini-lab environment. This setup launches metal-stack with the `sonic` and `gardener` flavors to validate the different Ansible roles and execute basic operations across the metal-stack layer. -- To contribute components that are not directly part of the release vector, a pull request must be made against the `develop` branch of the release repository. Release maintainers may push directly to the `develop` branch. -- The release maintainers can `/freeze` the `develop` branch, effectively stopping the metal-robot from pushing component releases to this branch. -- The `develop` branch is tagged by a release maintainer with a `-rc.x` suffix to create a __release candidate__. -- The release candidate must pass a large integration test suite on a real environment, which is currently run by FI-TS. It tests the entire machine provisioning engine including the integration with Gardener, the deployment, metal-images and Kubernetes conformance tests. -- If the integration tests pass, the PR of the `develop` branch must be approved by at least two release maintainers. -- A release is created via GitHub releases, including all release notes, with a tag on the `main` branch. 
- -## FAQ - -**Question: I need PR #xyz to go into the release, why did you not include it?** - -Answer: It's not on purpose if we miss a PR to be included into a metal-stack release. Please use the pending pull request from `develop` into `master` as soon as it is open and comment which pull request you want to have included into the release. Also consider attending our planning meetings or contact us in our Slack channel if you have urgent requirements that need to be dealt with. - -**Question: Who is responsible for the releases? Who can freeze a release?** - -Answer: Every repository in metal-stack has a `CODEOWNERS` file pointing to a maintainer team. This is also true for the releases repository. Only release repository maintainers are allowed to `/freeze` a release (meaning the metal-robot does not automatically append new component releases to the release vector anymore). - -**Question: I can't push to the `develop` branch of this repository? How can I request changes to the release vector?** - -Answer: Most changes are automatically integrated by the metal-robot. For manually managed components, please raise a pull request against the `develop` branch. Only release maintainers are allowed to push to `develop` as otherwise it would be possible to mess up the release pipeline. - -**Question: What requirements need to be fulfilled to add a repository to the release vector?** - -Please see the section below named [Requirements for Release Vector Repositories](#requirements-for-release-vector-repositories). 
- -### Requirements for Release Vector Repositories - -Before adding a repository in the metal-stack org to the releases repository, it is advised for the maintainer to fulfill the following points: - -- The following files should be present at the repository root: - - [CODEOWNERS](https://docs.github.com/en/repositories/managing-your-repositorys-settings-and-features/customizing-your-repository/about-code-owners) - - When a repository is created, the metal-robot automatically creates a -maintainers team in our GitHub org. - - The CODEOWNERS file should reference this team. - - The team should contain at least two maintainers. - - `LICENSE` - - This usually should be MIT with "metal-stack" as authors. - - `CONTRIBUTING.md` - - This should contain the following content: - ``` - # Contributing - - Please check out the [contributing section](https://docs.metal-stack.io/stable/development/contributing/) in our [docs](https://docs.metal-stack.io/). - ``` - - `README.md` -- The `developers-core` team should be given repository access with `write` role, the codeowners team should have the `maintain` role -- Release artifacts should have an SPDX-formatted SBOM attached. - - For container images these are embedded using Buildx. -- The following branch protection rules should be set: - - The mainline should be protected. - - A pull request should be required before merging (required by at least one code owner). - - Status checks should be required to pass. - - Force push should not be allowed on this branch. -- One person from the releases maintainers has to add the repository to the metal-robot in order to pick up the releases, add them to the release vector and generate release notes. - -### How-To Release a Project - -[release-drafter](https://github.com/release-drafter/release-drafter) is preferred in order to generate release notes from merged PRs for your projects. It should be triggered for pushes on your main branch. 
- -The draft is then used to create a project release. The release has to be published through the GitHub UI as demonstrated in the screenshot below. - -**Tagging the repository is not enough as repository tagging does not associate your release notes to your release!** - -![](release.png) - -Some further remarks: - -- Use semver versions with `v` prefix for your tags -- Name your release after your release tag -- The metal-robot only picks up lines from your release notes that start with `-` or `*` (unordered list items) and appends them to the according section in the aggregated release draft -- A tag created through a GitHub UI release does not trigger a `push` event . This means, your pipeline will not start to run with the `push` trigger when publishing through the UI. - - Instead, use the `published` [release event trigger](https://docs.github.com/en/actions/reference/events-that-trigger-workflows#release) for your actions: - - ```yaml - on: - release: - types: - - published - ``` -- In case they are necessary, please do not forget to include `NOTEWORTHY`, `ACTIONS_REQUIRED` or `BREAKING_CHANGE` sections into releases. More information on those release draft sections can be read in a pull request template. 
diff --git a/versioned_docs/version-v0.22.1/contributing/05-community.md b/versioned_docs/version-v0.22.1/contributing/05-community.md deleted file mode 100644 index 61eaf099..00000000 --- a/versioned_docs/version-v0.22.1/contributing/05-community.md +++ /dev/null @@ -1,11 +0,0 @@ ---- -slug: /community -title: Community -sidebar_position: 5 -draft: true ---- - -# Community - -(Slack channel, community events like FOSDEM, Kubernetes Community Days..., blog -articles) diff --git a/versioned_docs/version-v0.22.1/contributing/release.png b/versioned_docs/version-v0.22.1/contributing/release.png deleted file mode 100644 index 598b1182..00000000 Binary files a/versioned_docs/version-v0.22.1/contributing/release.png and /dev/null differ diff --git a/versioned_docs/version-v0.22.1/contributing/release_flow.drawio b/versioned_docs/version-v0.22.1/contributing/release_flow.drawio deleted file mode 100644 index 6ca6b34f..00000000 --- a/versioned_docs/version-v0.22.1/contributing/release_flow.drawio +++ /dev/null @@ -1,721 +0,0 @@ - - - - - - - - - - - -
-
-
- Review release notes -
-
-
-
- - Review release notes - -
-
-
- - - - - - - -
-
-
- projects -
-
-
-
- - projects - -
-
-
- - - - - - - -
-
-
- projects -
-
-
-
- - projects - -
-
-
- - - - - - - - -
-
-
- Organization Webhook -
-
-
-
- - Organization Webhook - -
-
-
- - - - - - - -
-
-
- projects -
-
-
-
- - projects - -
-
-
- - - - - - - - -
-
-
- - Publish release - -
-
-
-
- - Publish release - -
-
-
- - - - - - - - -
-
-
- Maintainer -
-
-
-
- - Maint... - -
-
-
- - - - - - - - - - - - - - - -
-
-
- metal-robot release handler -
-
-
-
- - metal-robot release han... - -
-
-
- - - - - - - - -
-
-
- - no - -
-
-
-
- - no - -
-
-
- - - - - - - - -
-
-
- - yes - -
-
-
-
- - yes - -
-
-
- - - - - - - -
-
-
- version in event newer than release vector version -
-
-
-
- - version in event newer than... - -
-
-
- - - - - - - -
-
-
- - do nothing - -
-
-
-
- - do nothing - -
-
-
- - - - - - - - - - - - -
-
-
- Github Action -
-
-
-
- - Github Action - -
-
-
- - - - - - - -
-
-
- Bump version in release vector and push to - - develop - -
-
-
-
- - Bump version in release vector... - -
-
-
- - - - - - - - - - - -
-
-
- Open pull request from - - develop - - to - - master - -
-
-
-
- - Open pull request from develop... - -
-
-
- - - - - - - -
-
-
- Update aggregated release draft in - - metal-stack/releases - -
-
-
-
- - Update aggregated release draf... - -
-
-
- - - - - - - - - - - - - - - -
-
-
- Integration Testing -
-
-
-
- - Integration Testing - -
-
-
- - - - - - - - - - - -
-
-
- Merge to - - master - -
-
-
-
- - Merge to master - -
-
-
- - - - - - - - - - - - -
-
-
- Review -
-
-
-
- - Review - -
-
-
- - - - - - - - - - - - - - - -
-
-
- Tests suceeded and PR changes reviewed -
-
-
-
- - Tests suceeded and PR chang... - -
-
-
- - - - - - - -
-
-
- - publish results to #integration - -
-
-
-
- - publish results to #integr... - -
-
-
- - - - - - - - - - - - - - - -
-
-
- Release metal-stack -
-
-
-
- - Release metal-stack - -
-
-
- - - - - - - - - - - -
-
-
- - publish to #announcements - -
-
-
-
- - publish to #announcements - -
-
-
- - - - - - - -
-
-
- - - metal-stack/docs - - pull request - -
-
-
-
- - metal-stack/docs pull requ... - -
-
-
- - - - - - - - - - - - -
-
-
- Freeze -
-
-
-
- - Freeze - -
-
-
- - - - - - - - - - - - - - - -
-
-
- Freeze - - develop - - and create a release candidate -
-
-
-
- - Freeze develop and create a rel... - -
-
-
- - - - - - - -
-
-
- Large integration suites -
- - (currently owned by FI-TS, not public) - -
-
-
-
-
- - Large integration suites... - -
-
-
- - - - - - - - -
-
-
- Run -
-
-
-
- - Run - -
-
-
- - - - -
- - - - - Text is not SVG - cannot display - - - -
diff --git a/versioned_docs/version-v0.22.1/contributing/release_flow.svg b/versioned_docs/version-v0.22.1/contributing/release_flow.svg deleted file mode 100644 index 55cdd493..00000000 --- a/versioned_docs/version-v0.22.1/contributing/release_flow.svg +++ /dev/null @@ -1 +0,0 @@ -
Review release notes
Review release notes
projects
projects
projects
projects
Organization Webhook
Organization Webhook
projects
projects
Publish release
Publish release
Maintainer
Maint...
metal-robot release handler
metal-robot release han...
no
no
yes
yes
version in event newer than release vector version
version in event newer than...
do nothing
do nothing
Github Action
Github Action
Bump version in release vector and push todevelop
Bump version in release vector...
Open pull request fromdeveloptomaster
Open pull request from develop...
Update aggregated release draft inmetal-stack/releases
Update aggregated release draf...
Integration Testing
Integration Testing
Merge tomaster
Merge to master
Review
Review
Tests suceeded and PR changes reviewed
Tests suceeded and PR chang...
publish results to #integration
publish results to #integr...
Release metal-stack
Release metal-stack
publish to #announcements
publish to #announcements
metal-stack/docspull request
metal-stack/docs pull requ...
Freeze
Freeze
Freezedevelopand create a release candidate
Freeze develop and create a rel...
Large integration suites
(currently owned by FI-TS, not public)
Large integration suites...
Run
Run
Text is not SVG - cannot display
\ No newline at end of file diff --git a/versioned_docs/version-v0.22.1/docs/02-General/04-flavors-of-metalstack.md b/versioned_docs/version-v0.22.1/docs/02-General/04-flavors-of-metalstack.md index 7da427fc..2277ca6b 100644 --- a/versioned_docs/version-v0.22.1/docs/02-General/04-flavors-of-metalstack.md +++ b/versioned_docs/version-v0.22.1/docs/02-General/04-flavors-of-metalstack.md @@ -14,7 +14,7 @@ As modern infrastructure and cloud native applications are designed with Kuberne Regardless which flavor of metal-stack you use, it is always possible to manually provision machines, networks and ip addresses. This is the most basic way of using metal-stack and is very similar to how traditional bare metal infrastructures are managed. -Using plain metal-stack without additional layer was not a focus in the past. Therefore firewall and role management might be premature. These will be addressed by [MEP-4](../../contributing/01-Proposals/MEP4/README.md) and [MEP-16](../../contributing/01-Proposals/MEP16/README.md) in the future. +Using plain metal-stack without additional layer was not a focus in the past. Therefore firewall and role management might be premature. These will be addressed by [MEP-4](/community/MEP-4-multi-tenancy-for-the-metal-api) and [MEP-16](/community/MEP-16-metal-api-as-an-alternative-configuration-source-for-the-firewall-controller) in the future. ## Gardener diff --git a/versioned_docs/version-v0.22.1/docs/04-For Operators/03-deployment-guide.mdx b/versioned_docs/version-v0.22.1/docs/04-For Operators/03-deployment-guide.mdx index 58ddafd3..6be800cd 100644 --- a/versioned_docs/version-v0.22.1/docs/04-For Operators/03-deployment-guide.mdx +++ b/versioned_docs/version-v0.22.1/docs/04-For Operators/03-deployment-guide.mdx @@ -31,7 +31,7 @@ You can use the [mini-lab](https://github.com/metal-stack/mini-lab) as a templat The metal control plane is typically deployed in a Kubernetes cluster. 
Therefore, this document will assume that you have a Kubernetes cluster ready for getting deployed. Even though it is theoretically possible to deploy metal-stack without Kubernetes, we strongly advise you to use the described method because we believe that Kubernetes gives you a lot of benefits regarding the stability and maintainability of the application deployment. :::tip -For metal-stack it does not matter where your control plane Kubernetes cluster is located. You can of course use a cluster managed by a hyperscaler. This has the advantage of not having to setup Kubernetes by yourself and could even become beneficial in terms of fail-safe operation. However, we also describe a solution of how to setup metal-stack with a self-hosted, [Autonomous Control Plane](../../contributing/01-Proposals/MEP18/README.md) cluster. The only requirement from metal-stack is that your partitions can establish network connections to the metal control plane. If you are interested, you can find a reasoning behind this deployment decision [here](../05-Concepts/01-architecture.mdx#target-deployment-platforms). +For metal-stack it does not matter where your control plane Kubernetes cluster is located. You can of course use a cluster managed by a hyperscaler. This has the advantage of not having to setup Kubernetes by yourself and could even become beneficial in terms of fail-safe operation. However, we also describe a solution of how to setup metal-stack with a self-hosted, [Autonomous Control Plane](/community/MEP-18-autonomous-control-plane) cluster. The only requirement from metal-stack is that your partitions can establish network connections to the metal control plane. If you are interested, you can find a reasoning behind this deployment decision [here](../05-Concepts/01-architecture.mdx#target-deployment-platforms). 
::: Let's start off with a fresh folder for your deployment: diff --git a/versioned_docs/version-v0.22.1/docs/05-Concepts/01-architecture.mdx b/versioned_docs/version-v0.22.1/docs/05-Concepts/01-architecture.mdx index 709960e3..75298df9 100644 --- a/versioned_docs/version-v0.22.1/docs/05-Concepts/01-architecture.mdx +++ b/versioned_docs/version-v0.22.1/docs/05-Concepts/01-architecture.mdx @@ -152,4 +152,4 @@ Thus, for creating a partition as well as a machine or a firewall, the flags `dn In order to be fully offline resilient, make sure to check out `metal-image-cache-sync`. This component provides copies of `metal-images`, `metal-kernel` and `metal-hammer`. -This feature is related to [MEP14](../../contributing/01-Proposals/MEP14/README.md). +This feature is related to [MEP14](/community/MEP-14-independence-from-external-sources). diff --git a/versioned_docs/version-v0.22.1/docs/05-Concepts/02-user-management.md b/versioned_docs/version-v0.22.1/docs/05-Concepts/02-user-management.md index f1ee2778..ba742ee9 100644 --- a/versioned_docs/version-v0.22.1/docs/05-Concepts/02-user-management.md +++ b/versioned_docs/version-v0.22.1/docs/05-Concepts/02-user-management.md @@ -7,7 +7,7 @@ sidebar_position: 2 # User Management At the moment, metal-stack can more or less be seen as a low-level API that does not scope access based on projects and tenants. -Fine-grained access control with full multi-tenancy support is actively worked on in [MEP4](../../contributing/01-Proposals/MEP4/README.md). +Fine-grained access control with full multi-tenancy support is actively worked on in [MEP4](/community/MEP-4-multi-tenancy-for-the-metal-api). Until then projects and tenants can be created, but have no effect on access control. 
diff --git a/versioned_docs/version-v0.22.1/docs/06-For CISOs/Security/01-principles.md b/versioned_docs/version-v0.22.1/docs/06-For CISOs/Security/01-principles.md index 8e7030f5..e327ec4a 100644 --- a/versioned_docs/version-v0.22.1/docs/06-For CISOs/Security/01-principles.md +++ b/versioned_docs/version-v0.22.1/docs/06-For CISOs/Security/01-principles.md @@ -15,7 +15,7 @@ The minimal need to know principle is a security concept that restricts access t ### RBAC :::info -As of now metal-stack does not implement fine-grained Role-Based Access Control (RBAC) within the `metal-api` but this is worked on in [MEP-4](../../../contributing/01-Proposals/MEP4/README.md). +As of now metal-stack does not implement fine-grained Role-Based Access Control (RBAC) within the `metal-api` but this is worked on in [MEP-4](/community/MEP-4-multi-tenancy-for-the-metal-api). ::: As described in our [User Management](../../05-Concepts/02-user-management.md) concept the [metal-api](https://github.com/metal-stack/metal-api) currently offers three different user roles for authorization: diff --git a/versioned_docs/version-v0.22.1/docs/06-For CISOs/Security/04-communication-matrix.md b/versioned_docs/version-v0.22.1/docs/06-For CISOs/Security/04-communication-matrix.md index 07df2607..24c1bc1d 100644 --- a/versioned_docs/version-v0.22.1/docs/06-For CISOs/Security/04-communication-matrix.md +++ b/versioned_docs/version-v0.22.1/docs/06-For CISOs/Security/04-communication-matrix.md @@ -116,7 +116,7 @@ Please note that every [networking setup](../../05-Concepts/03-Network/01-theory | VLAN | Switches, Firewalls | Layer 2 traffic segmentation. | | VXLAN | Switches, Firewalls | Encapsulate Layer 2 frames in Layer 3 packets for network virtualization. | | EVPN | Switches, Firewalls | Overlay network technology for scalable and flexible network architectures. | -| VPN | Firewalls | Management access [without open SSH ports](../../../contributing/01-Proposals/MEP9/README.md). 
| +| VPN | Firewalls | Management access [without open SSH ports](/community/MEP-9-no-open-ports-to-the-data-center). | +| BGP | Multiple | Routing protocol for dynamic routing and network management. | | SSH | Management Server, Switches | Secure shell access for management and configuration. | | LLDP | Switches, Machines | Link Layer Discovery Protocol for network device discovery. | diff --git a/versioned_docs/version-v0.22.1/docs/06-For CISOs/rbac.md b/versioned_docs/version-v0.22.1/docs/06-For CISOs/rbac.md index 9a87b896..06c902bb 100644 --- a/versioned_docs/version-v0.22.1/docs/06-For CISOs/rbac.md +++ b/versioned_docs/version-v0.22.1/docs/06-For CISOs/rbac.md @@ -31,4 +31,4 @@ To ensure that internal components interact securely with the metal-api, metal-s Users can interact with the metal-api using [metalctl](https://github.com/metal-stack/metalctl), the command-line interface provided by metal-stack. Depending on the required operations, users should authenticate with the appropriate role to match their level of access. -As part of [MEP-4](../../contributing/01-Proposals/MEP4/README.md), significant work is underway to introduce more fine-grained access control mechanisms within metal-stack, enhancing the precision and flexibility of permission management. +As part of [MEP-4](/community/MEP-4-multi-tenancy-for-the-metal-api), significant work is underway to introduce more fine-grained access control mechanisms within metal-stack, enhancing the precision and flexibility of permission management. 
diff --git a/versioned_docs/version-v0.22.1/docs/06-For CISOs/remote-access.md b/versioned_docs/version-v0.22.1/docs/06-For CISOs/remote-access.md index 0b8dbb19..dc24e82f 100644 --- a/versioned_docs/version-v0.22.1/docs/06-For CISOs/remote-access.md +++ b/versioned_docs/version-v0.22.1/docs/06-For CISOs/remote-access.md @@ -6,7 +6,7 @@ title: Remote Access ## Machines and Firewalls -Remote access to machines and firewalls is essential for performing administrative tasks such as incident management, troubleshooting and sometimes for development. Standard SSH access is often insufficient for these purposes. In many cases, direct serial console access is required to fully manage the system. metal-stack follows a security-first approach by not offering direct SSH access to machines. This practice reduces the attack surface and prevents unauthorized access that could lead to system damage. Detailed information can be found in [MEP-9](../../contributing/01-Proposals/MEP9/README.md). Administrators can access machines in two primary ways. +Remote access to machines and firewalls is essential for performing administrative tasks such as incident management, troubleshooting and sometimes for development. Standard SSH access is often insufficient for these purposes. In many cases, direct serial console access is required to fully manage the system. metal-stack follows a security-first approach by not offering direct SSH access to machines. This practice reduces the attack surface and prevents unauthorized access that could lead to system damage. Detailed information can be found in [MEP-9](/community/MEP-9-no-open-ports-to-the-data-center). Administrators can access machines in two primary ways. 
**Out-of-band management via SOL** @@ -26,4 +26,4 @@ This approach uses the [`metal-console`](../08-References/Control%20Plane/metal- Both methods ensure secure and controlled access to machines without exposing them unnecessarily to the network, maintaining the integrity and safety of the infrastructure. -Connecting directly to a machine without a clear plan of action can have unintended consequences and negatively impact stability. For this reason, administrative privileges are required. This restriction ensures that only authorized personnel with the necessary expertise can perform actions that affect the underlying infrastructure. These principles will evolve with the introduction of [MEP-4](../../contributing/01-Proposals/MEP4/README.md). \ No newline at end of file +Connecting directly to a machine without a clear plan of action can have unintended consequences and negatively impact stability. For this reason, administrative privileges are required. This restriction ensures that only authorized personnel with the necessary expertise can perform actions that affect the underlying infrastructure. These principles will evolve with the introduction of [MEP-4](/community/MEP-4-multi-tenancy-for-the-metal-api). 
\ No newline at end of file diff --git a/versioned_docs/version-v0.22.2/contributing/01-Proposals/MEP1/Distributed-API-Working.png b/versioned_docs/version-v0.22.2/contributing/01-Proposals/MEP1/Distributed-API-Working.png deleted file mode 100644 index 899e223d..00000000 Binary files a/versioned_docs/version-v0.22.2/contributing/01-Proposals/MEP1/Distributed-API-Working.png and /dev/null differ diff --git a/versioned_docs/version-v0.22.2/contributing/01-Proposals/MEP1/Distributed-API.png b/versioned_docs/version-v0.22.2/contributing/01-Proposals/MEP1/Distributed-API.png deleted file mode 100644 index 688c7c2e..00000000 Binary files a/versioned_docs/version-v0.22.2/contributing/01-Proposals/MEP1/Distributed-API.png and /dev/null differ diff --git a/versioned_docs/version-v0.22.2/contributing/01-Proposals/MEP1/Distributed-Deployment.png b/versioned_docs/version-v0.22.2/contributing/01-Proposals/MEP1/Distributed-Deployment.png deleted file mode 100644 index 8bba51b8..00000000 Binary files a/versioned_docs/version-v0.22.2/contributing/01-Proposals/MEP1/Distributed-Deployment.png and /dev/null differ diff --git a/versioned_docs/version-v0.22.2/contributing/01-Proposals/MEP1/Distributed.drawio b/versioned_docs/version-v0.22.2/contributing/01-Proposals/MEP1/Distributed.drawio deleted file mode 100644 index f7c6fe79..00000000 --- a/versioned_docs/version-v0.22.2/contributing/01-Proposals/MEP1/Distributed.drawio +++ /dev/null @@ -1 +0,0 @@ 
-7V3bcts2EP0aP8pDgPfH2ImbziRTt+m0zVMHpmCJMUUoFG1L/fqC4kUkAEoULwDkRC8WIBIkF2d3z+6C8JV5u9r+kqD18jOZ4+gKGvPtlfn+CtKPY9M/Wc8u7/FtM+9YJOE87wKHji/hf7joNIre53CON40DU0KiNFw3OwMSxzhIG30oSchr87BHEjWvukYLzHV8CVDE9/4dztNl3uvZxqH/Iw4Xy/LKwCh+eUDB0yIhz3FxvStoPu4/+c8rVI5VHL9Zojl5rXWZH67M24SQNP+22t7iKJNtKbb8vLuWX6v7TnCcdjkhWVoPJP5zF3/w/trOvG+vv70+z4rJe0HRMy4fw4noeDePhA5L5YmC/Afn+3N2pzcfcfSC0zBAh67s4dJd1DwuO3+22U/3O3oAgOtt/Qxnkf29R0kapiGJy6vS288vnP9eSK4aHFIhrrOvz6voLkEr+vXmdRmm+Ms6v89XClLat0xXEW0B+nU/QziTgUFb1SRkjYCswqD4HqEHHN1UU3pLIpLQn2ISZ9fYpAl5qvABise7Q6swymD/F07mKEZFd4FxkA2LonAR00ZAJwkn1RO94CTF29aJBBU8qNphssJpsqOHFCfMbLeAVKFywC0x+VpDcNm3rKHX9opOVGjNohr9gBz6pQDPGUACkEPSBif0OWkfHGEayyfzMqlWj2QaI4kUcCK1eJGajkikYASRrrboX/K79Qh+vYe77+Ef8RMBM4+TaECCp4SgYClDoJz0BDJuF6jFCNQ0O8oTGiPI055D4NsPc8+Yo0cAwMy0OJHhOfUDRZMk6ZIsSIyiD4dexnTUZDdHm+W+H3SwHNTE3YXZne5HwfH8Xea1souucZz3NH+v24+OZqZ1xrKHPDpfCY5QGr40naFI9PtT6a2jXe2ANQnjdFMb+T7rOMDAAoxaGdBvOqkzT6Bf8nsQn256LadXd7whz0mAi9MYQFVi6a+zrsCfDsTd4ZhPhKwL0H3DaborEICeU9LE5x50JcyCCG02mZ9rYBEcQ00upCOPWdAGOt4Cp0eOc8ZG4SCDypOdmkE1ADdTpVGlPGFNtTkTUuXQI/yYNTfUvobx4tO+9d50xrKeVhPHlsDBAyiwns5Uzsg5Krt2Dy9fdpUMVMgOuCh4QLZredg2fedhBji5MQT7bObckZJzBNt498OQ7HKm3Wm4jW0zXsbmEWaL6LdlTqWegLdesv1MC+Hp72T8SSgMxxkoN2yhquUYuZubjDP4nImgI6JohtahRmbVYsxqlcBR5pLKkPMtYb4U6uSgh23xmSTQlw9aRz3apJmNT5FGsIeedrA3j1ExzfMC0G6BnZS0gFieloCivcGYDXQN2oBeURu4mLCNtRXqozZwMWEbSy80kJ0ol6ModLv5GbqNgjIuza9B5OYp9zbjs1hJoRub7qVs4tqofWBzwKkp7UUEcmx6TD2hrQrkb0gDJqq/8PUSnk8r1IAKjXoHdWx2XQMVgAKuwerEoXLIhAd8dw3neBum/2R4vva8ovl137SL1vttgfZ9Y1dr3OMkpM+X+eWiNkmfNR8LOGW7NljWPIy2b+3qLXa8TurVllE/Hca4HVWw4fv5SS/7hmZc2KyxYzNoailNCkYymJHY2HhqNb/kDAS3MgFUpFBdDgL+IDkI2DUHAfXKQcCLyUFwpWMABSuZJHu3i8lCcMVjHaR3Mg8hbY3mzxLyVCVkv3Miwp0MZ9omIg4UtuSsX2vkVsxfB/goVXXnAxGRxeMuImHBEzZDoCxybXKZDdRPV/rj3pSUsuBKz9Jxb15GmoKiTD/g84mKywn9gK9f5GfysbRy0zJF5FcuwD8Z+Zn22GZo2PzwkbmmkV/1opk+oYt5PGzWKPCzWFOrgfD4qPln/fnC4z4AtAv7LNEKddYB/Zilh6PpmNOOrGsKU1H9AVg+g6neBQhQJR0lMXhLVIGIN4QCVhuXwr9TKqTvIm2fzKdYGr6f1vqiZKXxdkPhT6h7f822Or/VZnXU7IE
qa9sN/Hjsy9tTKxmfHvoJlrPB4koCi8nSf8Pyr53Dx5WLHZ75o3V4nacX6ewFT9ch0chWM6badQSW2pVpqUvd11DXpGbj7dELwS3qw754LjspafPhnobJeMC9YK88Jeko8Uo1j+Oe5XL0UzGna0RTsu7JKwSA8WU+u8fKxLs4OKauxrd1lk/zEElvFtpmv7k7OdCe0Hjm4WNLtc8Onwiu7POMzn2WW9DGTHM1U19Q6QCmVDMtWgS0D9mv12WmcfZwiiHS50+bWjOCtNglU1WgVRMWFMXpk70U4vBxOi8spERYM2hoJy1tV64McMqSVqFwhQ/ZvNfhsww6FuNN/RahlHekcxKUyxSrz4G6auOFqtGtejEtKZS31o0tfPVlhTPTsBlEWdkrTwda6HQyX+fuZMc/QQHa9hvltrLzGmc0t7IbbQM6vjKSJd6Uswb2VU31rMG9JELP5l3U83lXSYJSiRmdPPdosablaKDb1VTyywfdPgH0uZaSf5rjhpJbBi3HTvLhaNNOqglF2bWxUs2keGNnTk7Vvs7ti9+02dfZZsEld19noUQhJ1EVlvSsFZ+MBTwZ0gqfu2AmdVIqPM4aaGQHTc7RV1tHXu45ENoMvxRuZjJZRNo+c5KWew4SHrcBgHLRqdkEY0TtFhSRhMf5KrUb8gost1bYY3WK1NnJNxdUNT181nuaEvi4hhf4ULn1UJvTOrcG3r++PYoy+BehDNLy4sO0wWTLtOq1QZMdPUdELOjKnZUittxtUpkVgr2sEKjZoNKSyTBDnSd1aEDUK43DRheGb9cBcvL4MhvZdrx7/PjBWZ+jIq9ZBmo4E7KlkqHgNUH+2tGpF1rVcw4otfwoliX//6mUqH/FJdzuZKI3cxlT/btycqX5EMGmlhdKNaESrtl5lod67l5GnjPC7P+QZIuaFjx+xkRmmw8ML8Jss2km9Ua73GjuMhIgvWz7iMor2q7uCEDHXzbB/vMJg90zcrzFWWIB6OHjVUwpHDqlw/SUf39Kx1QYYMtr6oN/qDoI7ctPFJm4zpnhiUycrdrECZLOGubZ9FO0Mu/3hnxD18SwWtdwZNe+KdatjVws8UTAtaQCF7414JYb2p0mNbZK5Ir23dMXuRy38UT/JmAk5NJmQrLtlw6uLUHr5Wcyx9kR/wM= \ No newline at end of file diff --git a/versioned_docs/version-v0.22.2/contributing/01-Proposals/MEP1/Distributed.png b/versioned_docs/version-v0.22.2/contributing/01-Proposals/MEP1/Distributed.png deleted file mode 100644 index d96ca216..00000000 Binary files a/versioned_docs/version-v0.22.2/contributing/01-Proposals/MEP1/Distributed.png and /dev/null differ diff --git a/versioned_docs/version-v0.22.2/contributing/01-Proposals/MEP1/README.md b/versioned_docs/version-v0.22.2/contributing/01-Proposals/MEP1/README.md deleted file mode 100644 index 0fd4bb63..00000000 --- a/versioned_docs/version-v0.22.2/contributing/01-Proposals/MEP1/README.md +++ /dev/null @@ -1,141 +0,0 @@ ---- -slug: /MEP-1-distributed-metal-control-plane -title: MEP-1 -sidebar_position: 1 ---- - -# Distributed Metal Control Plane - -This enhancement 
proposal was replaced by [MEP18](../MEP18/README.md). - -## Problem Statement - -We face the situation that we argue for running bare metal on-premises because this way the customers can control where and how their software and data are processed and stored. -On the other hand, we have currently decided that our metal-api control plane components run on a kubernetes cluster (in our case on a cluster provided by one of the available hyperscalers). - -Running the control plane on Kubernetes has the following benefits: - -- Ease of deployment -- Get most, if not all, of the required infrastructure services like (probably incomplete): - - IPs - - DNS - - L7-Loadbalancing - - Storage - - S3 Backup - - High Availability - -Using a kubernetes as a service offering from one of the hyperscalers, enables us to focus on using kubernetes instead of maintaining it as well. - -## Goal - -It would be much saner if metal-stack has no, or only minimal dependencies to external services. Imagine a metal-stack deployment in a plant, it would be optimal if we only have to deliver a single rack with servers and networking gear installed and wired, plug that rack to the power supply and a internet uplink and its ready to go. - -Have a second plant which you want to be part of all your plants? Just tell both that they are part of something bigger and metal-api knows of two partitions. - -## Possible Solutions - -We can think of two different solutions to this vision: - -1. Keep the central control plane approach and require some sort of kubernetes deployment accessible from the internet. This has the downside that the user must, provide a managed kubernetes deployment in his own datacenter or uses a hyperscaler. Still not optimal. -1. Install the metal-api and all its dependencies in every partition, replicate or shard the databases to every connected partition, make them know each other. 
Connect the partitions over the internet with some sort of vpn to make the services visible to each other. - -As we can see, the first approach does not really address the problem, therefore i will describe solution #2 in more details. - -## Central/Current setup - -### Stateful services - -Every distributed system suffer from handling state in a scalable, fast and correct way. To start how to cope with the state, we first must identify which state can be seen as partition local only and which state must be synchronous for read, and synchronous for writes across partitions. - -Affected states: - -- masterdata: e.g. tenant and project must be present in every partition, but these are entities which are read often but updates are rare. A write can therefore be visible with a decent delay in a distinct partition with no consequences. -- ipam: the prefixes and ip´s allocated from machines. These entities are also read often and rare updates. But we must differentiate between dirty reads for different types. A machine network is partition local, ips acquired from such a network must by synchronous in the same partition. Ips acquired from global networks such as internet must by synchronous for all partitions, as otherwise a internet ip could be acquired twice. -- vrf ids: they must only be unique in one partition -- image and size configurations: read often, written seldom, so no high requirements on the storage of these entities. -- images: os images are already replicated from a central s3 storage to a per partition s3 service. metal-hammer kernel and initrd are small and pull always from the central s3, can be done similar to os images. -- machine and machine allocation: must be only synchronous in the partition -- switch: must be only synchronous in the partition -- nsq messages: do not need to cross partition boundaries. No need to keep the messages persistent, even the opposite is true, we don't want to have the messages persist for a longer period. 
- -Now we can see that the most critical state to held and synchronize are the IPAM data, because these entities must be guaranteed to be synchronously updated, while being updated frequently. - -Datastores: - -We use three different types of datastores to persist the states of the metal application. - -- rethinkdb is the main datastore for almost all entities managed by metal-api -- postgresql is used for masterdata and ipam data. -- nsq uses disk and memory tho store the messages. - -### Stateless services - -These are the easy part, all of our services which are stateless can be scaled up and down without any impact on functionality. Even the stateful services like masterdata and metal-api rely fully on the underlying datastore and can therefore also be scaled up and down to meet scalability requirements. - -Albeit, most of these services need to be placed behind a loadbalancer which does the L4/L7 balancing across the started/available replicas of the service for the clients talking to it. This is actually provided by kubernetes with either service type loadbalancer or type clusterip. - -One exception is the `metal-console` service which must have the partition in it´s dns name now, because there is no direct network connectivity between the management networks of the partitions. See "Network Setup) - -## Distributed setup - -### State - -In order to replicate certain data which must be available across all partitions we can use on of the existing open source databases which enable such kind of setup. There are a few available out there, the following incomplete list will highlight the pro´s and cons of each. - -- RethinkDB - - We already store most of our data in RethinkDB and it gives already the ability to synchronize the data in a distributed manner with different guarantees for consistency and latency. This is described here: [Scaling, Sharding and replication](https://rethinkdb.com/docs/sharding-and-replication/). 
But because rethinkdb has a rough history and unsure future with the last release took more than a year, we in the team already thought that we eventually must move away from rethinkdb in the future. - -- Postgresql - - Postgres does not have a multi datacenter with replication in both directions, it just can make the remote instance store the same data. - -- CockroachDB - - Is a Postgresql compatible database engine on the wire. CockroachDB gives you both, ACID and geo replication with writes allowed from all connected members. It is even possible to configure [Follow the Workload](https://www.cockroachlabs.com/docs/stable/topology-follow-the-workload) and [Geo Partitioning and Replication](https://www.cockroachlabs.com/docs/v19.2/topology-geo-partitioned-replicas). - -If we migrate all metal-api entities to be stored the same way we store masterdata, we could use cockroachdb to store all metal entities in one ore more databases spread across all partitions and still ensure consistency and high availability. - -A simple setup how this would look like is shown here. - -![Simple CockroachDB setup](Distributed.png) - -go-ipam was modified in a example PR here: [PR 17](https://github.com/metal-stack/go-ipam/pull/17) - -### API Access - -In order to make the metal-api accessible for api users like `cloud-api` or `metalctl` as easy at it is today, some effort has to be taken. One possible approach would be to use a external loadbalancer which spread the requests evenly to all metal-api endpoints in all partitions. Because all data are accessible from all partitions, a api request going to partition A with a request to create a machine in partition B, will still work. If on the other hand partition B is not in a connected state because the interconnection between both partitions is broken, then of course the request will fail. 
- -**IMPORTANT** -The NSQ Message to inform `metal-core` must end in the correct partition - -To provide such a external loadbalancer we have several opportunities: - -- Cloudflare or comparable CDN service. -- BGP Anycast from every partition - -Another setup would place a small gateway behind the metal-api address, which forwards to the metal-api in the partition where the request must be executed. This gateway, `metal-api-router` must inspect the payload, extract the desired partition, and forward the request without any modifications to the metal-api endpoint in this partition. This can be done for all requests, or if we want to optimize, only for write accesses. - -## Network setup - -In order to have the impact to the overall security concept as minimal as possible i would not modify the current network setup. The only modifications which has to be made are: - -- Allow https ingress traffic to all metal-api instances. -- Allow ssh ingress traffic to all metal-console instances. -- Allow CockroachDB Replication between all partitions. -- No NSQ traffic from outside required anymore, except we cant solve the topic above. - -A simple setup how this would look like is shown here, this does not work though because of the forementioned NSQ issue. - -![API and Console Access](Distributed-API.png) - -Therefore we need the `metal-api-router`: - -![Working API and Console Access](Distributed-API-Working.png) - -## Deployment - -The deployment of our components will substantially differ in a partition compared to a the deployment we have actually. Deploying it in kubernetes in the partition would be very difficult to achieve because we have no sane way to deploy kubernetes on physical machines without a underlying API. -I would therefore suggest to deploy our components in the same way we do that for the services running on the management server. Use systemd to start docker containers. 
- -![Deployment](Distributed-Deployment.png) diff --git a/versioned_docs/version-v0.22.2/contributing/01-Proposals/MEP10/README.md b/versioned_docs/version-v0.22.2/contributing/01-Proposals/MEP10/README.md deleted file mode 100644 index 6811cdc0..00000000 --- a/versioned_docs/version-v0.22.2/contributing/01-Proposals/MEP10/README.md +++ /dev/null @@ -1,197 +0,0 @@ ---- -slug: /MEP-10-sonic-support -title: MEP-10 -sidebar_position: 10 ---- - -# SONiC Support - -As writing this proposal, metal-stack only supports Cumulus on Broadcom ASICs. Unfortunately, after the acquisition of -Cumulus Networks by Nvidia, Broadcom decided to cut its relationship with Cumulus, and therefore Cumulus 4.2 is the last -version that supports Broadcom ASICs. Since trashing the existing hardware is not a solution, adding support for a -different network operating system is necessary. - -One of the remaining big players is [SONiC](https://sonic-net.github.io/SONiC/), which Microsoft created to scale the -network of Azure. It's an open-source project and is now part of the [Linux Foundation](https://www.linuxfoundation.org/press/press-release/software-for-open-networking-in-the-cloud-sonic-moves-to-the-linux-foundation). - -For a general introduction to SONiC, please follow the [Architecture](https://github.com/sonic-net/SONiC/wiki/Architecture) official -documentation. - -## ConfigDB - -On a cold start, the content of `/etc/sonic/config_db.json` will be loaded into the Redis database `CONFIG_DB`, and both -contain the switch's configuration except the BGP unnumbered configuration, which still has to be configured directly by -the frr configuration files. The SONiC community is working to remove this exception, but no release date is known. - -## BGP Configuration - -Frr runs inside a container, and a shell script configured it on the container startup. 
For BGP unnumbered, we must set -the configuration variable `docker_routing_config_mode` to `split` to prevent SONiC from overwriting our configuration -files created by `metal-core`. But by using the split mode, the integrated configuration mode of frr is deactivated, and -we have to write our BGP configuration to the daemon-specific files `bgp.conf`, `staticd.conf`, and `zebra.conf` instead -to `frr.conf`. - -```bash -elif [ "$CONFIG_TYPE" == "split" ]; then - echo "no service integrated-vtysh-config" > /etc/frr/vtysh.conf - rm -f /etc/frr/frr.conf -``` - -Reference: [docker-init](https://github.com/sonic-net/sonic-buildimage/blob/202205/dockers/docker-fpm-frr/docker_init.sh#L69) - -Adding support for the integrated configuration mode, we must at least adjust the startup shell script and the supervisor configuration: - -```bash -{% if DEVICE_METADATA.localhost.docker_routing_config_mode is defined and DEVICE_METADATA.localhost.docker_routing_config_mode == "unified" %} -[program:vtysh_b] -command=/usr/bin/vtysh -b -``` - -Reference: [supervisord.conf](https://github.com/sonic-net/sonic-buildimage/blob/202205/dockers/docker-fpm-frr/frr/supervisord/supervisord.conf.j2#L157) - -## Non-BGP Configuration - -For the Non-BGP configuration we have to write it into the Redis database directly or via one of the following interfaces: - -- `config replace ` -- the Mgmt Framework -- the SONiC restapi - -Directly writing into the Redis database isn't a stable interface, and we must determine the create, delete, and update -operations on our own. The last point is also valid for the Mgmt Framework and the SONiC restapi. Furthermore, the -Mgmt Framework doesn't start anymore for several months, and a [potential fix](https://github.com/sonic-net/sonic-buildimage/pull/10893) -is still not merged. And the SONiC restapi isn't enabled by default, and we must build and maintain our own SONiC images. 
- -Using `config replace` would reduce the complexity in the `metal-core` codebase because we don't have to determine the -actual changes between the running and the desired configuration. The approach's drawbacks are using a version of SONiC -that contains the PR [Yang support for VXLAN](https://github.com/sonic-net/sonic-buildimage/pull/7294), and we must provide -the whole new startup configuration to prevent unwanted deconfiguration. - -### Configure Loopback interface and activate VXLAN - -```json -{ - "LOOPBACK_INTERFACE": { - "Loopback0": {}, - "Loopback0|": {} - }, - "VXLAN_TUNNEL": { - "vtep": { - "src_ip": "" - } - } -} -``` - -#### Configure MTU - -```json -{ - "PORT": { - "Ethernet0": { - "mtu": "9000" - } - } -} -``` - -#### Configure PXE Vlan - -```json -{ - "VLAN": { - "Vlan4000": { - "vlanid": "4000" - } - }, - "VLAN_INTERFACE": { - "Vlan4000": {}, - "Vlan4000|": {} - }, - "VLAN_MEMBER": { - "Vlan4000|": { - "tagging_mode": "untagged" - } - }, - "VXLAN_TUNNEL_MAP": { - "vtep|map_104000_Vlan4000": { - "vlan": "Vlan4000", - "vni": "104000" - } - } -} -``` - -#### Configure VRF - -```json -{ - "INTERFACE": { - "Ethernet0": { - "vrf_name": "vrf104001" - } - }, - "VLAN": { - "Vlan4001": { - "vlanid": "4001" - } - }, - "VLAN_INTERFACE": { - "Vlan4001": { - "vrf_name": "vrf104001" - } - }, - "VRF": { - "vrf104001": { - "vni": "104001" - } - }, - "VXLAN_TUNNEL_MAP": { - "vtep|map_104001_Vlan4001": { - "vlan": "Vlan4001", - "vni": "104001" - } - } -} -``` - -## DHCP Relay - -The DHCP relay container only starts if `DEVICE_METADATA.localhost.type` is equal to `ToRRouter`. - -## LLDP - -SONiC always uses the local port subtype for LLDP and sets it to some freely configurable alias field of the interface. - -```python -# Get the port alias. If None or empty string, use port name instead -port_alias = port_table_dict.get("alias") -if not port_alias: - self.log_info("Unable to retrieve port alias for port '{}'. 
Using port name instead.".format(port_name)) - port_alias = port_name - -lldpcli_cmd = "lldpcli configure ports {0} lldp portidsubtype local {1}".format(port_name, port_alias) -``` - -Reference: [lldpmgr](https://github.com/sonic-net/sonic-buildimage/blob/202205/dockers/docker-lldp/lldpmgrd#L153) - -## Mgmt Interface - -The mgmt interface is `eth0`. To configure a static IP address and activate the Mgmt VRF, use: - -```json -{ - "MGMT_INTERFACE": { - "eth0|": { - "gwaddr": "" - } - }, - "MGMT_VRF_CONFIG": { - "vrf_global": { - "mgmtVrfEnabled": "true" - } - } -} -``` - -[IP forwarding is deactivated on `eth0`](https://github.com/sonic-net/sonic-buildimage/blob/202205/files/image_config/sysctl/sysctl-net.conf#L7), and no IP Masquerade is configured. diff --git a/versioned_docs/version-v0.22.2/contributing/01-Proposals/MEP11/README.md b/versioned_docs/version-v0.22.2/contributing/01-Proposals/MEP11/README.md deleted file mode 100644 index 87f48a10..00000000 --- a/versioned_docs/version-v0.22.2/contributing/01-Proposals/MEP11/README.md +++ /dev/null @@ -1,78 +0,0 @@ ---- -slug: /MEP-11-auditing-of-metal-stack-resources -title: MEP-11 -sidebar_position: 11 ---- - -# Auditing of metal-stack resources - -Currently no logs of the ownership of resources like machines, networks, ips and volumes are generated or kept. Though due to legal requirements data centers are required to keep track of this ownership over time to prevent liability issues when opening the platform for external users. - -In this proposal we want to introduce a flexible and low-maintenance approach for auditing on top of [Meilisearch](https://www.meilisearch.com/). - -## Overview - -In general our auditing logs will be collected by a request interceptor or middleware. Every request and response will be processed and eventually logged to Meilisearch. -Meilisearch will be configured to regularly create chunks of the auditing logs. 
These finished chunks will be backed up to a S3 compatible storage with a read-only option enabled. - -Of course sensitive data like session keys or passwords will be redacted before logging. We want to track relevant requests and responses. If auditing the request fails, the request itself will be aborted and will not be processed further. The requests and responses that will be audited will be annotated with a correlation id. - -Transferring the meilisearch auditing data chunks to the S3 compatible storage will be done by a sidecar cronjob that is executed periodically. -To avoid data manipulation the S3 compatible storage will be configured to be read-only. - -## Whitelisting - -To reduce the amount of unnecessary logs we want to introduce a whitelist of resources and operations on those that should be logged. -Other requests will be passed directly to the next middleware or web service without any further processing. - -As we are only interested in mutating endpoints, we ignore all `GET` requests. -The whitelist includes all `POST`, `PUT`, `PATCH` and `DELETE` endpoints of the HTTP middleware except for the following (non-manipulating) route suffixes: - -- `/find` -- `/notify` -- `/try` and `/match` -- `/capacity` -- `/from-hardware` - -Regarding GRPC audit trails, they are not so interesting because only internal clients are using this API. However, we can log the trails of the `Boot` service, which can be interesting to revise the machine lifecycle. - -## Chunking in Meilisearch - -We want our data to be chunked in Meilisearch. To accomplish this, we rotate the index identifier on a scheduled basis. The index identifiers will be derived from the current date and time. - -To keep things simple, we only support hourly, daily and monthly rotation. The eventually prefixed index names will only include relevant parts of date and time like `2021-01`, `2021-01-01` or `2021-01-01_13`. 
- -The metal-api will only write to the current index and switches to the new index on rotation. The metal-api will never read or update data in any indices. - -## Moving chunks to S3 compatible storage - -As Meilisearch will be filled with data over time, we want to move completed chunks to a S3 compatible storage. This will be done by a sidecar cronjob that is executed periodically. Note that the periods of the index rotation and the cronjob execution don't have to match. - -When the backup process gets started, it initiates a [Meilisearch dump](https://www.meilisearch.com/docs/learn/advanced/dumps) of the whole database across all indices. Once the returned task is finished, the dump must be copied from a Meilisearch volume to the S3 compatible storage. After a successful copy, the dump can be deleted. - -Now we want to remove all indices from Meilisearch, except the most recent one. For this, we [get all indices](https://www.meilisearch.com/docs/reference/api/indexes#list-all-indexes), sort them and [delete each index](https://www.meilisearch.com/docs/reference/api/indexes#delete-an-index) except the most recent one to avoid data loss. - -For the actual implementation, we can build upon [backup-restore-sidecar](https://github.com/metal-stack/backup-restore-sidecar). But due to the index rotation and the fact, that older indices need to be deleted, this probably does not fit into the mentioned sidecar. - -## S3 compatible storage - -The dumps of chunks should automatically deleted after a certain amount of time, once we are either no longer allowed or required to keep them. -The default retention time will be 6 months. Ideally already uploaded chunks should be read-only to prevent data manipulation. - -A candidate for the S3 compatible storage is Google Cloud Storage, which allows to configure automatic expiration of objects through a [lifecycle rule](https://cloud.google.com/storage/docs/managing-lifecycles?hl=en#storage-set-lifecycle-config-go). 
- -## Affected components - -- metal-api grpc server needs an auditing interceptor -- metal-api web server needs an auditing filter chain / middleware -- metal-api needs new command line arguments to configure the auditing -- mini-lab needs a Meilisearch instance -- mini-lab may need a local S3 compatible storage -- we need a sidecar to implement the backup to S3 compatible storage -- Consider auditing of volume allocations and freeings outside of metal-stack - -## Alternatives considered - -Instead of using Meilisearch we investigated using an immutable database like [immudb](https://immudb.io/). But immudb does not support chunking of data and due to its immutable nature, we will never be able to free up space of expired data. Even if we are legally allowed or required to delete data, we will not be able to do so with immudb. - -In another variant of the Meilisearch approach the metal-api would also be responsible for copying chunks to the S3 compatible storage and deleting old indices. But separating the concerns allows completely different implementations for every deployment stage. diff --git a/versioned_docs/version-v0.22.2/contributing/01-Proposals/MEP12/README.md b/versioned_docs/version-v0.22.2/contributing/01-Proposals/MEP12/README.md deleted file mode 100644 index 65532c57..00000000 --- a/versioned_docs/version-v0.22.2/contributing/01-Proposals/MEP12/README.md +++ /dev/null @@ -1,34 +0,0 @@ ---- -slug: /MEP-12-rack-spreading -title: MEP-12 -sidebar_position: 12 ---- - -# Rack Spreading - -Currently, when creating a machine through the metal-api, the machine is placed randomly inside a partition. This algorithm does not consider spreading machines across different racks and different chassis. This may lead to the situation that a group of machines (that for example form a cluster) can end up being placed in the same rack and the same chassis. 
- -Spreading a group of machines across racks can enhance availability for scenarios like a rack losing power or a chassis meltdown. - -So, instead of just randomly deciding the placement of a machine candidate, we want to propose a placement strategy that attempts to spread machine candidates across the racks inside a partition. - -Furthermore a followup improvement to guarantee that machines are really spread across multiple racks, even if multiple machines are ordered in parallel, was implemented with [PR490](https://github.com/metal-stack/metal-api/pull/490). - -## Placement Strategy - -Machines in the project are spread across all available racks evenly within a partition (best effort). For this, an additional request to the datastore has to be made in order to find allocated machines within the project in the partition. - -The algorithm will then figure out the least occupied racks and elect a machine candidate randomly from those racks. - -The user can optionally pass placement tags which will be considered for spreading the machines as well (this will for example allow spreading by a cluster id tag inside the same project). - -## API - -```golang -// service/v1/machine.go - -type MachineAllocation struct { - // existing fields are omitted for readability - PlacementTags []string `json:"placement_tags" description:"by default machines are spread across the racks inside a partition for every project. 
if placement tags are provided, the machine candidate has an additional anti-affinity to other machines having the same tags"` -} -``` diff --git a/versioned_docs/version-v0.22.2/contributing/01-Proposals/MEP13/README.md b/versioned_docs/version-v0.22.2/contributing/01-Proposals/MEP13/README.md deleted file mode 100644 index 2dde20f5..00000000 --- a/versioned_docs/version-v0.22.2/contributing/01-Proposals/MEP13/README.md +++ /dev/null @@ -1,111 +0,0 @@ ---- -slug: /MEP-13-dual-stack-support -title: MEP-13 -sidebar_position: 13 ---- - -# Dual-stack Support - -Dual-stack support is required to be able to create Kubernetes clusters with either IPv6 single-stack or dual-stack enabled. -With the inherent scarcity of IPv4 addresses, the need to be able to use IPv6 has increased. - -Full IPv6 dual-stack support was added to Kubernetes with v1.23 as stable. - -Gardener has had full IPv6 dual-stack support since `v1.109`. - -metal-stack manages CIDRs and IP addresses with the [go-ipam](https://github.com/metal-stack/go-ipam) library, which already got full IPv6 support in 2021 (see [https://metal-stack.io/blog/2021/02/ipv6-part1](https://metal-stack.io/blog/2021/02/ipv6-part1)). -But this was only the foundation, more work needs to be done to get full IPv6 support for all aspects managed by metal-stack.io. - -## General Decisions - -For the general decision we do not look at the isolated clusters feature for now as this would make the solution even more complex and we want to introduce IPv6 in smaller steps to the users. - -### Networks - -Currently, metal-stack organizes CIDRs / prefixes into a `network` resource in the metal-api. A network can consist of multiple CIDRs from the same address family. For example, if an operator wants to provide Internet connectivity to provisioned machines, they can start with small network CIDRs. The number of managed network prefixes can then be expanded as needed over time. 
- -With dual-stack we have to choose between two options: Network per address family or networks with both address families. These options are described in the next section. - -#### Network per Address Family - -This means that we allow networks with CIDRs from one address family only, one for IPv4 and one for IPv6. - -The machine creation process will not change if the machine only needs to be either IPv4 or IPv6 addressable. -But if, on the other hand, the machine needs to be able to connect to both address families, the machine creation needs to specify two networks, one for IPv4 and one for IPv6. -Also there will be 2 distinct VRF IDs for every network with a different address family. - -#### Network with both Address Families - -Make a network dual address family capable, meaning that you can add multiple cidrs from both address families to a network. -Then the machine creation will remain the same for single-stack and dual-stack cases, but the ip address allocation will need to specify the address family from which to allocate an ip address when the network is dual-stack. -This does not break the existing API, but allows existing extensions to easily add dual-stack support. -To avoid additional checking of which address families are available on this network during an ip allocation call, we could store the address families in the network. - -#### Decision - -The decision was made to go with having both address families in a single network entity because we think this is the most flexible way to support dual-stack machines and Kubernetes clusters as well as single-stack with the least amount of modifications on the networking side. 
- -### Examples - -To illustrate the usage we start by creating a tenant super network which has both address families: - -```yaml ---- -id: tenant-super-network-mini-lab -name: Project Super Network -description: Super network of all project networks -partitionid: mini-lab -prefixes: - - 10.0.0.0/16 - - 2001:db8:0:10::/64 -defaultchildprefixlength: - IPv4: 22 - IPv6: 96 -privatesuper: true -``` - -In order to create this network, we simply call: - -```bash -metalctl network create -f tenant-super.yaml -``` - -This is usually done during the initial setup of the environment. - -The next step is to allocate a tenant network where the machines of a project can be placed: - -```bash -metalctl network allocate --partition mini-lab --project 4b9b17c4-2d7c-4190-ae95-dda44e430fa6 --name my-node-network -``` - -This leads to the following network allocation: - -```yaml -id: 2d2c0350-3f66-4597-ae97-ef6797232212 -name: my-node-network -parentnetworkid: tenant-super-network-mini-lab -partitionid: mini-lab -prefixes: - - 10.0.0.0/22 - - 2001:db8:0:10::/96 -projectid: 4b9b17c4-2d7c-4190-ae95-dda44e430fa6 -vrf: 20 -consumption: - ipv4: - available_ips: 1024 - available_prefixes: 256 - used_ips: 2 - used_prefixes: 0 - ipv6: - available_ips: 2147483647 - available_prefixes: 1073741824 - used_ips: 1 - used_prefixes: 0 -privatesuper: false -``` - -Users can then create IP addresses from these child networks. By default, they retrieve an IPv4 address unless the super network only consists of IPv6 prefixes. In the latter case the users acquire an IPv6 address. 
- -```bash -metalctl network ip create --network 2d2c0350-3f66-4597-ae97-ef6797232212 --project 4b9b17c4-2d7c-4190-ae95-dda44e430fa6 -``` diff --git a/versioned_docs/version-v0.22.2/contributing/01-Proposals/MEP14/README.md b/versioned_docs/version-v0.22.2/contributing/01-Proposals/MEP14/README.md deleted file mode 100644 index 47c06434..00000000 --- a/versioned_docs/version-v0.22.2/contributing/01-Proposals/MEP14/README.md +++ /dev/null @@ -1,36 +0,0 @@ ---- -slug: /MEP-14-independence-from-external-sources -title: MEP-14 -sidebar_position: 14 ---- - -# Independence from external sources - -In certain situations some customers may need to operate and create machines without making use of external services like DNS or NTP through the internet. To make this possible, all metal-stack components reaching external services need to be configurable with custom endpoints. - -So far, the following components have been identified as requiring changes: - -- pixiecore -- metal-hammer -- metal-images - -More components are likely to be added to the list during processing. -For DNS and NTP servers it should be possible to provide default values within a partition. They can either be inherited from machines and firewalls or overwritten with custom ones. - -## pixiecore - -An NTP server endpoint needs to be configured on the pixiecore. This can be achieved by providing it through environment variables on start up. - -## metal-hammer - -If using a self-deployed NTP server, the metal-hammer also needs to be configured with it. For backward compatibility, default values from `pool.ntp.org` and `time.google.com` are used. - -## metal-images - -Configurations for the `metal-images` are different for machines and firewalls. - -## metalctl - -In order to pass DNS and NTP servers to partitions and machines while creating them, the flags `dnsservers` and `ntpservers` need to be added. 
- -The implementation of this MEP will make it possible for metal-stack to create and maintain machines without requiring an internet connection. diff --git a/versioned_docs/version-v0.22.2/contributing/01-Proposals/MEP16/README.md b/versioned_docs/version-v0.22.2/contributing/01-Proposals/MEP16/README.md deleted file mode 100644 index dbfa59d6..00000000 --- a/versioned_docs/version-v0.22.2/contributing/01-Proposals/MEP16/README.md +++ /dev/null @@ -1,332 +0,0 @@ ---- -slug: /MEP-16-metal-api-as-an-alternative-configuration-source-for-the-firewall-controller -title: MEP-16 -sidebar_position: 16 ---- - -# metal-api as an Alternative Configuration Source for the firewall-controller - -In the current situation, a firewall as provisioned by metal-stack is a fully immutable entity. Any modifications on the firewall like changing the firewall ruleset must be done _somehow_ by the user – the metal-api and hence metal-stack is not aware of its current state. - -As part of our [integration with the Gardener project](https://docs.metal-stack.io/stable/overview/kubernetes/#Gardener) we offer a solution called the [firewall-controller](https://github.com/metal-stack/firewall-controller), which is part of our [firewall OS images](https://github.com/metal-stack/metal-images/blob/6318a624861b18a559a9d37299bca5f760eef524/firewall/Dockerfile#L57-L58) and addresses shortcomings of the firewall resource's immutability, which would otherwise be completely impractical to work with. The firewall-controller crashes in an endless loop if it is not properly configured through the userdata when using the firewall image of metal-stack. - -The firewall-controller approach is tightly coupled to Gardener and it requires the administrator of the Gardener installation to pass a shoot and a seed kubeconfig through machine userdata when creating the firewall. 
What this userdata has to look like is not documented and is just part of another project called the [firewall-controller-manager](https://github.com/metal-stack/firewall-controller-manager) (FCM), whose task is to orchestrate rolling updates of firewall machines in a way that network traffic interruption is minimal when updating a firewall or applying a change to an immutable firewall configuration. - -In general, a firewall entity in metal-stack has similarities to the machine entity but it has a fundamental difference: A user gains ownership over a machine after provisioning. They can access it through SSH, modify it at will and this is entirely intended. For firewalls, however, we do not want a user to access the provisioned firewall as the firewall is a privileged part of the infrastructure with access to the underlay network. The underlay must not be tampered with at any given point in time by a user as this can destroy the entire network traffic flow inside a metal-stack partition. - -For this reason, we have a gap in the metal-stack project in terms of a missing solution for people who do not rely on the Gardener integration. We are basically leaving a user with the option to implement an orchestrated recreation of every possible change on the firewall to minimize traffic interruption for the machines sitting behind the firewall or re-implement the firewall-controller to how they want to use it for their use-case. - -Also we do not have a clear distinction in the API between user and metal-stack operator for firewalls. If a user were to allocate a firewall it would also be possible for the user to inject their own SSH keys and access the firewall and tamper with the underlay network. - -Parts of these problems are probably going to decrease with the work on [MEP-4](../MEP4/README.md) where there will be dedicated APIs for users and administrators of metal-stack including fine-grained access tokens. 
- -With this MEP we want to describe a way to improve this current situation and allow other users that do not rely on the Gardener integration – for whatever motivation they have – to adequately manage firewalls. For this, we propose an alternative configuration for the firewall-controller that is native to metal-stack and more independent of Gardener. - -## Proposal - -The central idea of this proposal is allowing the firewall-controller to use the metal-api as a configuration source. This should serve as an alternative strategy to the currently used FCM `Firewall` resource based approach in the Gardener use-case. -Updates of the firewall rules should be possible through the metal-api. - -The firewall-controller itself should now be able to decide which of the two main strategies should be used for the base configuration: a kubeconfig or the metal-api. This should be possible through a dedicated _firewall-controller-config_. - -Using this config will now allow operators to fine-tune the data sources for all of its dynamic configuration tasks independently. -For example the data source of the core firewall rules could be set either from the `Firewall` resource located in the Gardener `Seed` or the metal-apiserver node network entity, while the CWNPs should be fetched and applied from a given kubeconfig (the `Shoot` Kubeconfig in the Gardener case). -This configuration file is intended to be injected during firewall creation through the userdata along with potential source connection credentials. 
- -```yaml -# the name of the firewall, defaulted to the hostname -name: best-firewall-ever - -sources: - seed: - kubeconfig: /path/to/seed.yaml # current gardener behavior - namespace: shoot--proj--name - shoot: - kubeconfig: /path/to/shoot.yaml # current gardener behavior - namespace: firewall - metal: - url: https://metal-api - hmac: some-hmac - type: Metal-View - projectID: abc - static: - # static should mirror all information provided by the metal or seed/shoot sources - firewall: # optional - controllerURL: https://... - cwnp: - egress: [] - ingress: [] - -# all sub-controllers running on the firewall -# each can be configured independently -controllers: - # this is the base controller - firewall: - source: seed # or: metal, static - - # these are optional: when not provided, they are disabled - selfUpdate: - enabled: true - droptailer: - enabled: true - - # these are optional: when not provided, they are disabled - service: - source: shoot # or: metal, static - cwnp: - source: shoot # or: metal, static - monitor: - source: shoot # currently only shoot is supported -``` - -The existing behavior of the firewall-controller writing into `/etc/nftables/firewall-controller.v4` is not changed. The different controller configuration sources are internally treated in the same way as before. The `static` source can be used to prevent the firewall-controller from crashing and consistently providing a static ruleset. This might be interesting for metal-stack native use cases or environments where the metal-api cannot be accessed. - -There must be one central nftables-rule-file-controller that is notified and triggered by all other controllers that contribute to the nftables configuration. 
- -For example, in order to maintain the existing Gardener integration, the configuration file for the firewall-controller will look like this: - -```yaml -name: shoot--abc--cluster-firewall-def -sources: - seed: - kubeconfig: /etc/firewall-controller/seed.yaml - namespace: shoot--abc--cluster - shoot: - kubeconfig: /etc/firewall-controller/shoot.yaml - namespace: firewall - -controllers: - firewall: - source: seed - - selfUpdate: - enabled: true - droptailer: - enabled: true - - service: - source: shoot - cwnp: - source: shoot - monitor: - source: shoot -``` - -Plain metal-stack users might use a configuration like this: - -```yaml -name: best-firewall-ever - -sources: - metal: - url: https://metal-api - hmac: some-hmac - type: Metal-View - projectID: abc - -controllers: - firewall: - source: metal - selfUpdate: - enabled: true - droptailer: - enabled: true - - cwnp: - # firewall rules stored in firewall entity - # potential improvement would be to attach the rules to the node network entity - # be aware that the firewall and private networks are immutable - # eventually we introduce a firewall ruleset entity - source: metal -``` - -In highly restricted environments that cannot access metal-api the static source could be used: - -```yaml -name: most-restricted-firewall-ever - -sources: - static: - firewall: - controllerURL: https://... - cwnp: - egress: [] - ingress: [] - -controllers: - firewall: - source: static - - cwnp: - source: static -``` - -### Non-Goals - -- Resolving the missing differentiation between users and administrators by letting users pass userdata and SSH keys to the firewall creation. - - This is even more related to [MEP-4](../MEP4/README.md) than this MEP. 
- -### Advantages - -- Offers a native metal-stack solution that improves managing firewalls for users by adding dynamic reconfiguration through the metal-api - - e.g., in the mini-lab, users can now allocate a machine, then an IP address and announce this IP from the machine without having to re-create the firewall but by adding a firewall rule to the metal-api. -- Improve consistency throughout the API (firewall rules would reflect what is persisted in metal-api). -- Other providers like Cluster API can leverage this approach, too. -- It can contribute to solving the shoot migration issue (in Cluster API case the `clusterctl move` for firewall objects) - - For Gardener takes the seed out of the equation (of which the kubeconfig changes during shoot migration) - - However: Things like egress rules, rate limiting, etc. are currently not part of the firewall or network entity in the metal-api. These would need to be added to one of them. -- Potentially resolve the issue that end-users can manipulate accounting data of the firewall through the `FirewallMonitor` - - for this we would need to be able to report traffic data to metal-api - -### Caveats - -- Metal-View access is too broad for firewalls. Mitigated by [MEP-4](../MEP4/README.md). -- Polling of the firewall-controller is bad for performance. Mitigated by [MEP-4](../MEP4/README.md). - -### Firewall Controller Manager - -Currently the firewall-controller-manager expects the creators of a `FirewallDeployment` to use the defaulting webhook that is tailored to the Gardener integration in order to generate `Firewall.spec.userdata` or to override it manually. Currently `Firewall.spec.userdata` will never be set explicitly. - -Instead we'd like to propose `Firewall.spec.userdataContents` which will replace the old `userdata`-string by a typed data structure. The FCM will do the heavy lifting while the `FirewallDeployment` creator decides what should be configured. 
- -```yaml -kind: FirewallDeployment -spec: - template: - spec: - userdataContents: - - path: /etc/firewall-controller/config.yaml - content: | - --- - sources: - static: {} - controllers: - firewall: - source: static - - path: /etc/firewall-controller/seed.yaml - contentFrom: - firewallControllerKubeconfigSecret: - name: seed-kubeconfig - key: kubeconfig - - - path: /etc/firewall-controller/shoot.yaml - contentFrom: - secretRef: - name: shoot-kubeconfig - key: kubeconfig -``` - -### Gardener Extension Provider Metal Stack - -The GEPM should be migrated to the new `Firewall.spec.userdataContents` field. - -### Cluster API Provider Metal Stack - -![architectural overview](firewall-for-capms-overview.svg) - -In Cluster API there are essentially two main clusters: the management cluster and the workload cluster while the CAPMS takes in the role of the GEPM. -Typically a local bootstrap cluster is created in KinD which acts as the management cluster. It creates the workload cluster. Thereafter the ownership of the workload cluster is typically moved (using `clusterctl move`) to a different cluster which will then become the management cluster. -The new management cluster might actually be the workload cluster itself. - -In contrast to Gardener, Cluster API aims to be less opinionated and minimal. It is common practice to not install any non-required components or CRDs into the workload cluster by default. Therefore we cannot expect custom resources like `ClusterwideNetworkPolicy` or `FirewallMonitor` to be installed in the workload cluster but strongly recommend our users to do it. Therefore it's the responsibility of the operator to tell [cluster-api-provider-metal-stack](https://github.com/metal-stack/cluster-api-provider-metal-stack) the kubeconfig for the cluster where these CRDs are installed and defined in. 
- -A viable configuration for a `MetalStackCluster` that generates firewall rules based of `Service` type `LoadBalancer` and `ClusterwideNetworkPolicy` and expects them to be deployed in the workload cluster is shown below. The `FirewallMonitor` will be reported into the same cluster. - -```yaml -kind: MetalStackCluster -metadata: - name: ${CLUSTER_NAME} -spec: - firewallTemplate: - userdataContents: - - path: /etc/firewall-controller/config.yaml - contentFrom: - secretRef: - name: ${CLUSTER_NAME}-firewall-controller-config - key: controllerConfig - - - path: /etc/firewall-controller/workload.yaml - contentFrom: - # this is the kubeconfig generated by kubeadm - secretRef: - name: ${CLUSTER_NAME}-kubeconfig - key: value ---- -kind: Secret -metadata: - name: ${CLUSTER_NAME}-firewall-controller-config -stringData: - controllerConfig: | - --- - name: ${CLUSTER_NAME}-firewall - - sources: - metal: - url: ${METAL_API_URL} - hmac: ${METAL_API_HMAC} - type: ${METAL_API_HMAC_TYPE} - projectID: ${METAL_API_PROJECT_ID} - shoot: - kubeconfig: /etc/firewall-controller/workload.yaml - namespace: firewall - - controllers: - firewall: - source: metal - selfUpdate: - enabled: true - droptailer: - enabled: true - - service: - source: shoot - cwnp: - source: shoot - monitor: - source: shoot -``` - -Here the firewall-controller-config will be referenced by the `MetalStackCluster` as a `Secret`. Please note that the `Secret`s in `userdataContents` will not be fetched and will directly be passed to the `FirewallDeployment`. At first the reconciliation of it in the FCM will fail due to the missing Kubeconfig secret. After the `MetalStackCluster` has been marked as ready, CAPI will create this missing secret. Effectively the firewall and initial control plane node should be created at the same time. - -This approach allows maximum flexibility as intended by Cluster API and is still able to provide robust rolling updates of firewalls. 
- -An advanced use case of this flexibility would be a management cluster, that is in charge of multiple workload clusters. Where one workload cluster acts as a monitoring or tooling cluster, receives logs and the firewall monitor for the other workload clusters. The CWNPs could be defined here, all in a separate namespace. - -#### Cluster API Caveats - -When the cluster is pivoted and reconciles its own firewall, a malfunctioning firewall prevents the cluster from self-healing and requires manual intervention by creating a new firewall. This is an inherent problem of the cluster-api approach. It can be circumvented by using an extra cluster to manage workload clusters. - -In the current form of this approach firewalls and therefore the firewall egress and ingress rules are managed by the cluster operators that manage the cluster-api resources. -Hence it will not be possible to gain a fine-grained control over every cluster operator's choices from a central ruleset at the level of metal-stack firewalls. -In case this control surfaces as a requirement, it would need to be implemented in a firewall external to metal-stack. - -## Roadmap - -In general this proposal is not thought to be implemented in one batch. Instead an incremental approach is required. - -1. Enhance firewall-controller-manager - - - Add `FirewallDeployment.spec.template.spec.userdataContents` - -2. Enhance firewall-controller - - - Reduce coupling between controllers - - Introduce controller config - - Abstract module to write into distinct nftable rules for every controller - - Implement `sources.static`, but not `sources.metal` - - GEPM should set `FirewallDeployment.spec.template.spec.userdataContents` - -3. Allow Cluster API to use the FCM with static ruleset - - - Add `firewall.metal-stack.io/paused` annotation (managed by CAPMS during `clusterctl move`, theoretically useful for Gardener shoot migration as well to avoid shallow deletion). 
- - Reconcile multiple `FirewallDeployment` resources across multiple namespaces. For Gardener the old behavior of reconciling only one namespace should persist. - - Allow setting the `firewall.metal-stack.io/no-controller-connection` annotation through the `FirewallDeployment` (either through the template or inheritance). - - Add `MetalStackCluster.spec.firewallTemplate`. - - Make `MetalStackCluster.spec.nodeNetworkID` optional if `spec.firewallTemplate` given. - -4. Add `sources.metal` as configuration option. - - - Allow updates of firewall rules in the metal-apiserver. - - Depends on [MEP-4](../MEP4/README.md) metal-apiserver progress - -5. Potentially migrate the GEPM to use `sources.metal` diff --git a/versioned_docs/version-v0.22.2/contributing/01-Proposals/MEP16/firewall-for-capms-overview.drawio b/versioned_docs/version-v0.22.2/contributing/01-Proposals/MEP16/firewall-for-capms-overview.drawio deleted file mode 100644 index faea3e3d..00000000 --- a/versioned_docs/version-v0.22.2/contributing/01-Proposals/MEP16/firewall-for-capms-overview.drawio +++ /dev/null @@ -1,4 +0,0 @@ - - - -
handles traffic
Firewall
Firewall Controller
node-exporter
nftables-exporter
droptailer-client
Workload Cluster
droptailer
Configures
Bootstrap or Management Cluster
reconcile
configures
reconcile
Cluster API Provider metal-stack
Metal Stack Cluster CRD
Firewall Deployment CRD
Firewall CRD
Firewall Set CRD
rec
reconcile
reconcile
Firewall Controller Manager
Metal Stack Machine CRD
manages
Admin
Kubeconfig FirewallMonitor
FirewallMonitor CRD
main metal-api
Firewall entity
kubeconfig CWNP
Clusterwide Network Policy CRD
base config
controllerConfig
user-defined
network rules
reports firewall
state
send firewall log lines
controllerConfig
controllerConfig
\ No newline at end of file diff --git a/versioned_docs/version-v0.22.2/contributing/01-Proposals/MEP16/firewall-for-capms-overview.svg b/versioned_docs/version-v0.22.2/contributing/01-Proposals/MEP16/firewall-for-capms-overview.svg deleted file mode 100644 index 853f8175..00000000 --- a/versioned_docs/version-v0.22.2/contributing/01-Proposals/MEP16/firewall-for-capms-overview.svg +++ /dev/null @@ -1 +0,0 @@ -
handles traffic
handles traffic
Firewall
Firewall
Firewall Controller
Firewall Controller
node-exporter
node-exporter
nftables-exporter
nftables-exporter
droptailer-client
droptailer-client
Workload Cluster
Workload Cluster
droptailer
droptailer
Configures
Configures
Bootstrap or Management Cluster
Bootstrap or Management Cluster
reconcile
reconcile
configures
configures
reconcile
reconcile
Cluster API Provider metal-stack
Cluster API Provider...
Metal Stack Cluster CRD
Metal Stack Cluster...
Firewall Deployment CRD
Firewall Deployment...
Firewall CRD
Firewall CRD
Firewall Set CRD
Firewall Set CRD
rec
rec
reconcile
reconcile
reconcile
reconcile
Firewall Controller Manager
Firewall Controller...
Metal Stack Machine CRD
Metal Stack Machine...
manages
manages
Admin
Admin
Kubeconfig FirewallMonitor
Kubeconfig FirewallMonitor
FirewallMonitor CRD
FirewallMonitor CRD
main metal-api
main metal-api
Firewall entity
Firewall entity
kubeconfig CWNP
kubeconfig CWNP
Clusterwide Network PolicyCRD
Clusterwide Network...
base config
base config
controllerConfig
controllerConfig
user-defined
network rules
user-defined...
reports firewall
state
reports firewall...
send firewall log lines
send firewall log lines
controllerConfig
controllerConfig
controllerConfig
controllerConfig
Text is not SVG - cannot display
\ No newline at end of file diff --git a/versioned_docs/version-v0.22.2/contributing/01-Proposals/MEP17/README.md b/versioned_docs/version-v0.22.2/contributing/01-Proposals/MEP17/README.md deleted file mode 100644 index 35f48970..00000000 --- a/versioned_docs/version-v0.22.2/contributing/01-Proposals/MEP17/README.md +++ /dev/null @@ -1,61 +0,0 @@ ---- -slug: /MEP-17-global-network-view -title: MEP-17 -sidebar_position: 17 ---- - -# Global Network View - -> [!IMPORTANT] -> This MEP assumes the implementation of the metal-apiserver as described by [MEP-4](../MEP4/README.md) which is currently work in progress. - -Having a complete view of the network topology is useful when working with deployments or troubleshooting connectivity issues. -Currently, the API doesn't know of any other switches than the leaf switches. -Information about all other switches and their connections must be gathered from Ansible inventories or by accessing the switches via SSH. -Documentation of each partition's network must be kept in-sync with all changes made to the deployment or cabling. -We would like to expand the API's knowledge of the network to the entire underlay including inter-switch connections as well as BGP statistics and health status. - -## Switch Types - -Registering a switch at the API is done by the metal-core. -Apart from that, it also reconciles port and FRR configuration to adapt to the machine provisioning cycle. -This reconfiguration is only necessary on the leaf switches. -To allow deploying the metal-core on other switches than leaves we need a way of telling it what type of switch it is running on so it can act accordingly. -On any non-leaf switches it will only register the switch and report statistic but not change any configuration. -Supported switch types are - -- `leaf` -- `spine` -- `exit` -- `mgmtleaf` -- `mgmtspine` - -## Network Topology - -All switches should periodically report their LLDP neighbors and port configuration. 
-This information can be used to quickly identify common network issues, like MTU mismatch or the like. -Ideally, there would be some graphical representation of the network topology containing only the most important information for a quick overview. -It should contain all switches and machines as nodes and all connections as edges of a graph. -Ports, VRFs, and maybe also IPs should be associated with a connection. - -Apart from the topology graph, there should be a way to display more detailed information about both ports of a connection, like - -- MTU -- speed -- IP -- UP/DOWN status -- VRF -- VLAN -- whether it participates in a BGP session - -## BGP Announcements - -The metal-core should collect all routes it knows about and send them to the API along with a timestamp. -Reported routes should be stored to a redis database along with the switch that reported them and the timestamp of the last time they were reported. -An expiration threshold should be defined and all expired routes should be cleaned up periodically. -Whenever new routes are reported they get merged into the existing ones by the strategy: - -- when new, just add -- when existing, update `last_announced` timestamp - -By querying the BGP announcements we can find out whether an allocated IP is still in use. diff --git a/versioned_docs/version-v0.22.2/contributing/01-Proposals/MEP18/README.md b/versioned_docs/version-v0.22.2/contributing/01-Proposals/MEP18/README.md deleted file mode 100644 index 9c02c0b7..00000000 --- a/versioned_docs/version-v0.22.2/contributing/01-Proposals/MEP18/README.md +++ /dev/null @@ -1,147 +0,0 @@ ---- -slug: /MEP-18-autonomous-control-plane -title: MEP-18 -sidebar_position: 18 ---- - -# Autonomous Control Plane - -As described in the [deployment chapter](../../../docs/04-For%20Operators/03-deployment-guide.mdx), we strongly recommend Kubernetes as the target platform for running the metal-stack control plane. 
- -Kubernetes clusters for this purpose are readily available from hyperscalers, metalstack.cloud, or other cloud providers. Simply using a managed Kubernetes cluster greatly simplifies a metal-stack installation. However, sometimes it might be desirable to host the metal-stack control plane autonomously, without the help of another cloud provider. Reasons for this might include corporate policies that prohibit the use of external data center products, or network constraints. - -The Kubernetes cluster hosting the metal-stack control plane must provide at least the following features: - -- Load balancing (for exposing the APIs) -- Persistent storage (for the databases and key-value stores) -- Access to object storage for automated backups of the stateful sets -- Access to a DNS provider supported by one of the used DNS extensions -- Externally accessible DNS records for obtaining officially signed certificates through DNS challenges - -This metal-stack control plane cluster must also be highly available to prevent a complete loss of control over the managed resources in the data center. -Regular Kubernetes updates to apply security fixes and feature updates must be possible in an automated manner. The Day-2 operational overhead of running this cluster in your own datacenter must be reasonable. - -In this chapter, we propose a solution for setting up a metal-stack environment with an autonomous control plane that is independent of another cloud provider. - -## Use Your Own Dogfood - -The most obvious solution is to just deploy a Kubernetes cluster manually in your own data center by utilizing existing tooling for the deployment: - -- k3s -- kubeadm -- vmware and rancher -- talos -- kubespray -- ... (not a complete list) - -However, all these solutions add another layer of complexity that needs to be maintained and operated by people who also need to learn and understand metal-stack. 
In general, metal-stack in combination with [Gardener](https://gardener.cloud) contains all the necessary tools to provide KaaS, so it makes sense to reuse what is already in place without introducing new dependencies on other products and vendors. - -The only problem here is that Gardener is not yet able to create an initial cluster, which may change with the implementation of [GEP-28](https://github.com/gardener/gardener/blob/master/docs/proposals/28-autonomous-shoot-clusters.md). In the meantime, we suggest using [k3s](https://k3s.io/), which manages the initial metal-stack partition to host the control plane, since the maintenance overhead is acceptable and it is easy to deploy. - -## The Matryoshka Principle - -Instead of directly using the K3s cluster for the production control plane, we propose using it as a minimal control plane cluster whose only purpose is to host the production control plane cluster. This layer of indirection brings some reasonable advantages: - -- In the event of an interruption or loss of this minimal control plane cluster, the production control plane remains unaffected, and end users can continue to manage their clusters as normal. -- A dedicated operations team can take care of the Day-2 maintenance of this installation, which can be handy because the tools like k3s are a little different from the rest of the setup (it is likely that more manual maintenance is required than for any other cluster). This would also be true if the initial cluster problem were solved by the Gardener itself and not using k3s. -- Since the number of shoot clusters to host is static, the resource requirements are minimal and will not change significantly over time. There are no huge resource requirements in terms of cpu, memory and storage. As such, the lack of scalability is not such a big issue. - -So, our proposal is to chain two metal-stack control planes. 
The initial control plane cluster would use k3s and on this cluster we can spin up a cluster for the production control plane with the use of Gardener. - -The following figure shows what the high-level architecture of this setup looks like. An even more simplified illustration of this setup can be looked up in the appendix[^1]. - -![Autonomous Control Plane Architecture](./autonomous-control-plane-images/metal-stack-autonomous-control-plane-full.svg) - -The k3s nodes can either be bare metal machines or virtual machines. When using VMs a single k3s node might be a viable solution, too. These nodes are supposed to be set up manually / partly automated with an operating system like Debian. - -To name the cluster that hosts the initial metal-stack control plane and Gardener we use the term _initial cluster_. The initial cluster creates worker nodes to host the _target cluster_. - -## Initial Cluster - -The initial cluster is kept very small. The physical bare metal machines can be any machines and switches which are supported by metal-stack, but can be smaller in terms of cpu, memory and network speed because these machines must only be capable of running the target cluster for the metal-stack control plane. A typical single socket server with 8-16 cores and 64GB of RAM and two NVMe drives of 1TB would be a good starting point. - -In a typical k3s setup, a stateful set would lose the data once the k3s cluster was terminated and started again. But there is a possibility to define parts of the local storage of the server to be provided to the k3s cluster for the PVCs. With that, k3s could be terminated and started again, for example to update and reboot the host os, or update k3s itself and the data will persist. 
- -Example k3s configuration for persistent storage on the host's OS: - -```yaml -kind: Cluster -apiVersion: k3s.x-k8s.io/v1alpha4 -name: needle-control-plane -nodes: - - role: control-plane - # add a mount from /path/to/my/files on the host to /files on the node - extraMounts: - - hostPath: /path/to/my/files - containerPath: /files -``` - -Into this cluster metal-stack and Gardener will be deployed. This deployment can be done by a Gitlab runner which is running on this machine. -The mini-lab will be used as a base for this deployment. The current development of [gardener-in-minilab](https://github.com/metal-stack/mini-lab/pull/202) must be extended to host all required extensions to make this a working metal-stack control plane which can manage the machines in the attached bare metal setup. - -In addition to the metal-stack and Gardener deployment, some additional required services are deployed (non-complete list): - -- PowerDNS to serve as a DNS Server for all DNS entries used in the initial and the target cluster, like `api.initial.metal-stack.local`, `gardener-api.initial.metal-stack.local` and the DNS entries for the api servers of the created kubernetes clusters. -- NTP -- Monitoring for the initial cluster and partition -- Optional: OIDC Server for authenticating against the metal-api -- Optional: Container Registry to host all metal-stack and gardener containers -- Optional: Let's Encrypt [boulder](https://github.com/letsencrypt/boulder) as a certificate authority -- ... - -Physical view, minimal setup for an initial cluster with a single physical node: - -![Small Initial Cluster](autonomous-control-plane-images/small-initial-cluster.svg) - -Physical View, bigger ha setup which is spread across two data centers: - -![HA Initial Cluster](autonomous-control-plane-images/ha-initial-cluster.svg) - -### Control Plane High Availability - -Running the initial control plane on a single physical server is not as available as it should be in such a use case. 
It should be possible to survive a loss of this server, because the server could be lost by many events, such as hardware failure, disk corruption or even failure of the datacenter location where this server is deployed. - -Setting up a second server with the same software components is an option, but the problem of data redundancy must be solved, because neither the gardener control plane, nor the metal-stack control plane can be instantiated twice. - -Given that we provide part of the local storage of the server as backing storage for the stateful sets in the k3s cluster, the data stored on the server itself must be replicated to another server and backed up on a regular basis. - -The replication of ETCD can be achieved through [clustered configuration](https://docs.k3s.io/datastore/ha-embedded) of k3s. Components of metal-stack and Gardener can run standalone and already utilize a backup-restore mechanism that must be configured accordingly. For two or more bare metal machines used for the initial cluster, a loadbalancing mechanism for the ingress is required. kube-vip could be a possible solution. - -For monitoring, a backend like a Victoria Metrics Cluster would allow spreading the monitoring data across the initial cluster nodes. These metrics should also be backed up in object storage. - -### Partition - -The partition which is managed by the initial cluster can be a simple and small hardware setup but yet capable enough to host the target cluster. It would even be a good practice to create separate target clusters on the initial cluster, e.g. one for the metal-stack control plane and one for the Gardener (maybe one more for monitoring). - -It can follow the metal-stack minimal setup which provides about 8-16 small servers connected to a 1G/s or 10G/s network dataplane. Central storage is optional as the persistence of the services running in these clusters is always backed up to a central object storage. 
Operations would be much easier if a central storage is provided. - -## Target Cluster - -The target cluster is the metal-stack environment which serves for end-user production use, the control plane is running in a shoot hosted in the initial cluster. The seed(s) and shoot(s) for end-users are created on the machines provided by the target cluster. -These machines can be of a different type in terms of size, but more importantly, these machines are connected to another network dataplane. Also the management infrastructure is separated from the initial cluster management network. - -## Failure Scenarios - -Everything could fail, everything will fail at some point. But this must be kept in mind and nothing bad should happen if only one component at a time fails. -If more than one fails, the restoration to a working state must be easily possible and well documented. - -To ensure all possible breakages are documented, we suggest writing a list which summarizes all failure scenarios that might occur including the remediation. - -Here is an example of how a scenario documentation could look: - -**Scenario**: Initial cluster is gone, all machines have died -**Impact**: Management of the initial cluster infrastructure not possible anymore, the target cluster continues to run but cannot be managed because the API servers are gone. End-users are not affected by this incident. -**Remediation**: The initial cluster nodes must be provisioned from scratch and re-deployed through the CI mechanism. The backups of the stateful sets are automatically restored during this process. 
- -## Implementation - -As part of this proposal, we provide the following tools and integrations in order to setup an autonomous control plane: - -- Deployment roles for the services like PowerDNS and NTP for the initial cluster -- Stretch goal: Deployment role to setup k3s in clustered configuration for the initial cluster and update it -- Extend the Gardener on mini-lab integration to allow shoot creation in the mini-lab -- Steady integration of the setup (maybe something like [k3d](https://github.com/k3d-io/k3d) in the mini-lab) - -## Appendix - -[^1]: ![metal-stack-chain](autonomous-control-plane-images/metal-stack-chain.svg) diff --git a/versioned_docs/version-v0.22.2/contributing/01-Proposals/MEP18/autonomous-control-plane-images/ha-initial-cluster.drawio b/versioned_docs/version-v0.22.2/contributing/01-Proposals/MEP18/autonomous-control-plane-images/ha-initial-cluster.drawio deleted file mode 100644 index eafcb514..00000000 --- a/versioned_docs/version-v0.22.2/contributing/01-Proposals/MEP18/autonomous-control-plane-images/ha-initial-cluster.drawio +++ /dev/null @@ -1,535 +0,0 @@ - - - - - - - - - - - - - - - - - - - 1 - - - - - 2 - - - - - 3 - - - - - 4 - - - - - 5 - - - - - 6 - - - - - 7 - - - - - 8 - - - - - 9 - - - - - 10 - - - - - 11 - - - - - 12 - - - - - - - - - - - - - -
-
-
- internet-router-management -
-
-
-
- - internet-router-management - -
-
-
- - - - - - - - - -
-
-
- management-switch-and-server -
-
-
-
- - management-switch-and-server - -
-
-
- - - - - - - - - -
-
-
- spine01 -
-
-
-
- - spine01 - -
-
-
- - - - - - - - - -
-
-
- leaf01 -
-
-
-
- - leaf01 - -
-
-
- - - - - - - - - -
-
-
- leaf02 -
-
-
-
- - leaf02 - -
-
-
- - - - - - - - - - - - - -
-
-
- - mirocloud (initial cluster partition nodes) - -
-
-
-
- - mirocloud (initial cluster... - -
-
-
- - - - - - - - - - - - - -
-
-
- - Initial cluster node 01 - -
-
-
-
- - Initial cluster node 01 - -
-
-
- - - - - - - - - - - - - 1 - - - - - 2 - - - - - 3 - - - - - 4 - - - - - 5 - - - - - 6 - - - - - 7 - - - - - 8 - - - - - 9 - - - - - 10 - - - - - 11 - - - - - 12 - - - - - - - - - - - - - -
-
-
- internet-router-management -
-
-
-
- - internet-router-management - -
-
-
- - - - - - - - - -
-
-
- management-switch-and-server -
-
-
-
- - management-switch-and-server - -
-
-
- - - - - - - - - -
-
-
- spine02 -
-
-
-
- - spine02 - -
-
-
- - - - - - - - - -
-
-
- leaf03 -
-
-
-
- - leaf03 - -
-
-
- - - - - - - - - -
-
-
- leaf04 -
-
-
-
- - leaf04 - -
-
-
- - - - - - - - - - - - - -
-
-
- - Initial cluster node 02 - -
-
-
-
- - Initial cluster node 02 - -
-
-
- - - - - - - - - - - - - -
-
-
- - Initial cluster node 03 - -
-
-
-
- - Initial cluster node 03 - -
-
-
- - - - - - - - - - - - - -
-
-
- - mirocloud (initial cluster partition nodes) - -
-
-
-
- - mirocloud (initial cluster... - -
-
-
-
- - - - - Text is not SVG - cannot display - - - -
diff --git a/versioned_docs/version-v0.22.2/contributing/01-Proposals/MEP18/autonomous-control-plane-images/ha-initial-cluster.svg b/versioned_docs/version-v0.22.2/contributing/01-Proposals/MEP18/autonomous-control-plane-images/ha-initial-cluster.svg deleted file mode 100644 index 99261ada..00000000 --- a/versioned_docs/version-v0.22.2/contributing/01-Proposals/MEP18/autonomous-control-plane-images/ha-initial-cluster.svg +++ /dev/null @@ -1 +0,0 @@ -123456789101112
internet-router-management
internet-router-management
management-switch-and-server
management-switch-and-server
spine01
spine01
leaf01
leaf01
leaf02
leaf02
mirocloud (initial cluster partition nodes)
mirocloud (initial cluster...
Initial cluster node 01
Initial cluster node 01
123456789101112
internet-router-management
internet-router-management
management-switch-and-server
management-switch-and-server
spine02
spine02
leaf03
leaf03
leaf04
leaf04
Initial cluster node 02
Initial cluster node 02
Initial cluster node 03
Initial cluster node 03
mirocloud (initial cluster partition nodes)
mirocloud (initial cluster...
Text is not SVG - cannot display
\ No newline at end of file diff --git a/versioned_docs/version-v0.22.2/contributing/01-Proposals/MEP18/autonomous-control-plane-images/metal-stack-autonomous-control-plane-full.drawio b/versioned_docs/version-v0.22.2/contributing/01-Proposals/MEP18/autonomous-control-plane-images/metal-stack-autonomous-control-plane-full.drawio deleted file mode 100644 index aae8a12d..00000000 --- a/versioned_docs/version-v0.22.2/contributing/01-Proposals/MEP18/autonomous-control-plane-images/metal-stack-autonomous-control-plane-full.drawio +++ /dev/null @@ -1,1133 +0,0 @@ - - - - - - - - - - - - - - - - - - - -
-
-
- Initial Cluster -
-
-
-
- - Initial Cluster - -
-
-
- - - - - - - - - - - - - - -
-
-
- metal-roles -
-
-
-
- - metal-roles - -
-
-
- - - - - - - -
-
-
- CI -
-
-
-
- - CI - -
-
-
- - - - - - - -
-
-
- K3s Standalone - - - (on Debian) - - -
-
-
-
- - K3s Standalone (on Debian) - -
-
-
- - - - - - - - - - - - - - - - - -
-
-
- Initial Partition -
-
-
-
- - Initial Partition - -
-
-
- - - - - - - - - - - - - -
-
-
- Target Cluster for metal-stack -
-
-
-
- - Target Cluster for metal-stack - -
-
-
- - - - - - - - - - - - - -
-
-
- Metal Control Plane -
-
-
-
- - Metal Control Plane - -
-
-
- - - - - - - - - - - -
-
-
- provisions -
-
-
-
- - provisions - -
-
-
- - - - - - - - - - - - - -
-
-
- Target Cluster for Gardener -
-
-
-
- - Target Cluster for Gardener - -
-
-
- - - - - - - - - - -
-
-
- Gardener Control Plane -
-
-
-
- - Gardener Control Plane - -
-
-
- - - - - - - - - - - - - - - - - -
-
-
- Monitoring -
-
-
-
- - Monitoring - -
-
-
- - - - - - - - - - - - - - - - -
-
-
- Target Partition -
-
-
-
- - Target Partition - -
-
-
- - - - - - - - - - -
-
-
- Gardener Seeds and End-User Shoots -
-
-
-
- - Gardener Seeds and End-User Shoots - -
-
-
- - - - - - - - - - - -
-
-
- provisions -
-
-
-
- - provisions - -
-
-
- - - - - - - - -
-
-
- metal-roles -
-
-
-
- - metal-roles - -
-
-
- - - - - - - -
-
-
- CI -
-
-
-
- - CI - -
-
-
- - - - - - - - -
-
-
- metal-roles -
-
-
-
- - metal-roles - -
-
-
- - - - - - - - - - -
-
-
- ETCD can be clustered or standalone, backed up by sidecar -
-
-
-
- - ETCD can be clustere... - -
-
-
- - - - - - - - - - -
-
-
- This data will get lost in case local PV gets deleted -
-
-
-
- - This data will get l... - -
-
-
- - - - - - - - - - -
-
-
- We can work with local PVs here, too. -
- backup-restore-sidecar for metal-stack databases, for big ones Postgres clustered. -
-
-
-
- - We can work with local PVs he... - -
-
-
- - - - - - - -
-
-
- ETCD will be deployed in HA configuration on local PVs. -
-
- csi-driver-lvm needs to implement auto deletion of orphaned PVs. -
-
- Seed metrics get lost, but they report to the monitoring in the Metal Control Plane Shoot. -
-
-
-
- - ETCD will be deployed in HA c... - -
-
-
- - - - - - - - - - -
-
-
- More sophisticated storage solutions can be in place. -
-
- (Lightbits, NetApp, ...) -
-
-
-
- - More sophisticated storage so... - -
-
-
- - - - - - - - - - -
-
-
- TODO: Evaluate how to persist these metrics. -
-
-
-
- - TODO: Evaluate how to persist... - -
-
-
- - - - - - - - - - -
-
-
- - 1 VM or -
-
-
- - - 3 Bare Metal Machines - - -
-
-
-
-
- - 1 VM or... - -
-
-
- - - - - - - - - - - - - - -
-
-
- metal-stack -
-
-
-
- - metal-stack - -
-
-
- - - - - - - -
-
-
- metal-api -
-
-
-
- - metal-api - -
-
-
- - - - - - - -
-
-
- metal-db -
-
-
-
- - metal-db - -
-
-
- - - - - - - -
-
-
- ipam-db -
-
-
-
- - ipam-db - -
-
-
- - - - - - - -
-
-
- masterdata-db -
-
-
-
- - masterdata-db - -
-
-
- - - - - - - -
-
-
- headscale-db -
-
-
-
- - headscale-db - -
-
-
- - - - - - - -
-
-
- auditing-db -
-
-
-
- - auditing-db - -
-
-
- - - - - - - -
-
-
- nsqd -
-
-
-
- - nsqd - -
-
-
- - - - - - - - - - - -
-
-
- Gardener -
-
-
-
- - Gardener - -
-
-
- - - - - - - - - - -
-
-
- Virtual Garden -
-
-
-
- - Virtual Garden - -
-
-
- - - - - - - -
-
-
- Gardener Control Plane -
-
-
-
- - Gardener Control Plane - -
-
-
- - - - - - - -
-
-
- gardenlet -
-
-
-
- - gardenlet - -
-
-
- - - - - - - -
-
-
- Garden etcd -
-
-
-
- - Garden etcd - -
-
-
- - - - - - - -
-
-
- Prometheus -
-
-
-
- - Prometheus - -
-
-
- - - - - - - - - - - -
-
-
- Monitoring -
-
-
-
- - Monitoring - -
-
-
- - - - - - - - - - -
-
-
- - Gitlab - -
- - Runner - -
-
-
-
-
- - Gitlab... - -
-
-
- - - - - - - - - - -
-
-
- Services -
-
-
-
- - Services - -
-
-
- - - - - - - -
-
-
- PowerDNS -
-
-
-
- - PowerDNS - -
-
-
- - - - - - - -
-
-
- boulder -
-
-
-
- - boulder - -
-
-
- - - - - - - -
-
-
- NTP -
-
-
-
- - NTP - -
-
-
- - - - - - - -
-
-
- OIDC -
-
-
-
- - OIDC - -
-
-
- - - - - - - -
-
-
- ... -
-
-
-
- - ... - -
-
-
-
- - - - - Text is not SVG - cannot display - - - -
diff --git a/versioned_docs/version-v0.22.2/contributing/01-Proposals/MEP18/autonomous-control-plane-images/metal-stack-autonomous-control-plane-full.svg b/versioned_docs/version-v0.22.2/contributing/01-Proposals/MEP18/autonomous-control-plane-images/metal-stack-autonomous-control-plane-full.svg deleted file mode 100644 index e58e783b..00000000 --- a/versioned_docs/version-v0.22.2/contributing/01-Proposals/MEP18/autonomous-control-plane-images/metal-stack-autonomous-control-plane-full.svg +++ /dev/null @@ -1 +0,0 @@ -
Initial Cluster
Initial Cluster
metal-roles
metal-roles
CI
CI
K3s Standalone(on Debian)
K3s Standalone (on Debian)
Initial Partition
Initial Partition
Target Cluster for metal-stack
Target Cluster for metal-stack
Metal Control Plane
Metal Control Plane
provisions
provisions
Target Cluster for Gardener
Target Cluster for Gardener
Gardener Control Plane
Gardener Control Plane
Monitoring
Monitoring
Target Partition
Target Partition
Gardener Seeds and End-User Shoots
Gardener Seeds and End-User Shoots
provisions
provisions
metal-roles
metal-roles
CI
CI
metal-roles
metal-roles
ETCD can be clustered or standalone, backed up by sidecar
ETCD can be clustere...
This data will get lost in case local PV gets deleted
This data will get l...
We can work with local PVs here, too.
backup-restore-sidecar for metal-stack databases, for big ones Postgres clustered.
We can work with local PVs he...
ETCD will be deployed in HA configuration on local PVs.

csi-driver-lvm needs to implement auto deletion of orphaned PVs.

Seed metrics get lost, but they report to the monitoring in the Metal Control Plane Shoot.
ETCD will be deployed in HA c...
More sophisticated storage solutions can be in place.

(Lightbits, NetApp, ...)
More sophisticated storage so...
TODO: Evaluate how to persist these metrics.
TODO: Evaluate how to persist...
1 VM or
3 Bare Metal Machines
1 VM or...
metal-stack
metal-stack
metal-api
metal-api
metal-db
metal-db
ipam-db
ipam-db
masterdata-db
masterdata-db
headscale-db
headscale-db
auditing-db
auditing-db
nsqd
nsqd
Gardener
Gardener
Virtual Garden
Virtual Garden
Gardener Control Plane
Gardener Control Plane
gardenlet
gardenlet
Garden etcd
Garden etcd
Prometheus
Prometheus
Monitoring
Monitoring
Gitlab
Runner
Gitlab...
Services
Services
PowerDNS
PowerDNS
boulder
boulder
NTP
NTP
OIDC
OIDC
...
...
Text is not SVG - cannot display
\ No newline at end of file diff --git a/versioned_docs/version-v0.22.2/contributing/01-Proposals/MEP18/autonomous-control-plane-images/metal-stack-chain.drawio b/versioned_docs/version-v0.22.2/contributing/01-Proposals/MEP18/autonomous-control-plane-images/metal-stack-chain.drawio deleted file mode 100644 index cd5cf007..00000000 --- a/versioned_docs/version-v0.22.2/contributing/01-Proposals/MEP18/autonomous-control-plane-images/metal-stack-chain.drawio +++ /dev/null @@ -1,404 +0,0 @@ - - - - - - - - - - -
-
-
- Partition 1 -
-
-
-
- - Partition 1 - -
-
- - - - -
-
-
- seeds -
-
-
-
- - seeds - -
-
- - - - -
-
-
- shoots -
-
-
-
- - shoots - -
-
- - - - - - -
-
-
- Partition 2 -
-
-
-
- - Partition 2 - -
-
- - - - -
-
-
- seeds -
-
-
-
- - seeds - -
-
- - - - -
-
-
- shoots -
-
-
-
- - shoots - -
-
- - - - - - -
-
-
- Partition 3 -
-
-
-
- - Partition 3 - -
-
- - - - -
-
-
- seeds -
-
-
-
- - seeds - -
-
- - - - -
-
-
- shoots -
-
-
-
- - shoots - -
-
- - - - - - -
-
-
- Production Control Plane -
-
-
-
- - Production Control Plane - -
-
- - - - -
-
-
- metal-stack -
- kubernetes cluster -
-
-
-
- - metal-stack... - -
-
- - - - -
-
-
- gardener -
- kubernetes cluster -
-
-
-
- - gardener... - -
-
- - - - -
-
-
- - Manages - -
-
-
-
- - Manages - -
-
- - - - - - - - -
-
-
- Control Plane Partition -
-
-
-
- - Control Plane Partition - -
-
- - - - - -
-
-
- backup of stateful sets -
-
-
-
- - backup of stateful sets - -
-
- - - - - - -
-
-
- bare metal machine -
-
-
-
- - bare metal machine - -
-
- - - - -
-
-
- metal-stack -
- and -
- gardener -
- kubernetes cluster -
- running in kind -
-
-
-
- - metal-stack... - -
-
- - - - -
-
-
- - Manages - -
-
-
-
- - Manages - -
-
- - - - - -
-
-
- S3 -
-
-
-
- - S3 - -
-
- - - - -
-
-
- Needle -
-
-
-
- - Needle - -
-
- - - -
-
-
- - Nail - -
-
-
-
- - Nail - -
-
-
- - - - - Text is not SVG - cannot display - - - -
\ No newline at end of file diff --git a/versioned_docs/version-v0.22.2/contributing/01-Proposals/MEP18/autonomous-control-plane-images/metal-stack-chain.svg b/versioned_docs/version-v0.22.2/contributing/01-Proposals/MEP18/autonomous-control-plane-images/metal-stack-chain.svg deleted file mode 100644 index 8f88ba14..00000000 --- a/versioned_docs/version-v0.22.2/contributing/01-Proposals/MEP18/autonomous-control-plane-images/metal-stack-chain.svg +++ /dev/null @@ -1 +0,0 @@ -
Partition 1
Partition 1
seeds
seeds
shoots
shoots
Partition 2
Partition 2
seeds
seeds
shoots
shoots
Partition 3
Partition 3
seeds
seeds
shoots
shoots
Production Control Plane
Production Control Plane
metal-stack
kubernetes cluster
metal-stack...
gardener
kubernetes cluster
gardener...
Manages
Manages
Control Plane Partition
Control Plane Partition
backup of stateful sets
backup of stateful sets
bare metal machine
bare metal machine
metal-stack
and
gardener
kubernetes cluster
running in kind
metal-stack...
Manages
Manages
S3
S3
Needle
Needle 
Nail
Nail
Text is not SVG - cannot display
\ No newline at end of file diff --git a/versioned_docs/version-v0.22.2/contributing/01-Proposals/MEP18/autonomous-control-plane-images/small-initial-cluster.drawio b/versioned_docs/version-v0.22.2/contributing/01-Proposals/MEP18/autonomous-control-plane-images/small-initial-cluster.drawio deleted file mode 100644 index a75ee340..00000000 --- a/versioned_docs/version-v0.22.2/contributing/01-Proposals/MEP18/autonomous-control-plane-images/small-initial-cluster.drawio +++ /dev/null @@ -1,234 +0,0 @@ - - - - - - - - - - - - - - - - - - - 1 - - - - - 2 - - - - - 3 - - - - - 4 - - - - - 5 - - - - - 6 - - - - - 7 - - - - - 8 - - - - - 9 - - - - - 10 - - - - - 11 - - - - - 12 - - - - - - - - - - - - - -
-
-
- internet-router-management -
-
-
-
- - internet-router-management - -
-
-
- - - - - - - - - -
-
-
- management-switch-and-server -
-
-
-
- - management-switch-and-server - -
-
-
- - - - - - - - - -
-
-
- leaf01 -
-
-
-
- - leaf01 - -
-
-
- - - - - - - - - -
-
-
- leaf02 -
-
-
-
- - leaf02 - -
-
-
- - - - - - - - - - - - - -
-
-
- Initial cluster node -
-
-
-
- - Initial cluster node - -
-
-
- - - - - - - - - - - - - -
-
-
- mirocloud (initial cluster partition nodes) -
-
-
-
- - mirocloud (initial cluster... - -
-
-
-
- - - - - Text is not SVG - cannot display - - - -
diff --git a/versioned_docs/version-v0.22.2/contributing/01-Proposals/MEP18/autonomous-control-plane-images/small-initial-cluster.svg b/versioned_docs/version-v0.22.2/contributing/01-Proposals/MEP18/autonomous-control-plane-images/small-initial-cluster.svg deleted file mode 100644 index a9d29f05..00000000 --- a/versioned_docs/version-v0.22.2/contributing/01-Proposals/MEP18/autonomous-control-plane-images/small-initial-cluster.svg +++ /dev/null @@ -1 +0,0 @@ -123456789101112
internet-router-management
internet-router-management
management-switch-and-server
management-switch-and-server
leaf01
leaf01
leaf02
leaf02
Initial cluster node
Initial cluster node
mirocloud (initial cluster partition nodes)
mirocloud (initial cluster...
Text is not SVG - cannot display
\ No newline at end of file diff --git a/versioned_docs/version-v0.22.2/contributing/01-Proposals/MEP2/README.md b/versioned_docs/version-v0.22.2/contributing/01-Proposals/MEP2/README.md deleted file mode 100644 index c7f2360a..00000000 --- a/versioned_docs/version-v0.22.2/contributing/01-Proposals/MEP2/README.md +++ /dev/null @@ -1,7 +0,0 @@ ---- -slug: /MEP-2-two-factor-authentication -title: MEP-2 -sidebar_position: 2 ---- - -# Two Factor Authentication diff --git a/versioned_docs/version-v0.22.2/contributing/01-Proposals/MEP3/README.md b/versioned_docs/version-v0.22.2/contributing/01-Proposals/MEP3/README.md deleted file mode 100644 index 5ce36721..00000000 --- a/versioned_docs/version-v0.22.2/contributing/01-Proposals/MEP3/README.md +++ /dev/null @@ -1,67 +0,0 @@ ---- -slug: /MEP-3-machine-re-installation -title: MEP-3 -sidebar_position: 3 ---- - -# Machine Re-Installation - -In the current metal-api only machine installations are possible, performing a machine upgrade is only possible by creating a new machine and delete the old one. -This has the drawback that in case a lot of data is stored on the local disks, a full restore of the original data must be performed. - -To prevent this, we will introduce a new metal-api endpoint to reinstall the machine with a new image, _without_ actually deleting the data stored on the additional hard disks. - -Storage is a difficult task to get right and reliable. A short analysis of our different storage requirements lead to 3 different scenarios. - -- Storage for the etcd pvs in the seed cluster of every partition. - This is the most important storage in our setup because these etcd pods serve as configuration backend for all customer kubernetes clusters. If they fail, the cluster is down. 
However gardener deploys a backup and restore sidecar into the etcd pod of every customer kubernetes control plane, and if this sidecar detects a corrupt or missing etcd database file(s) it starts automatic restore from the configured backup location. This will take some minutes. If for example a node dies, and gardener creates a new node instead, the csi-lvm created pv is not present on that node. Kubernetes will not schedule the missing etcd pod on this node because it has a local PV configured and is therefore tainted to run only on that node. To let kubernetes create that pod anyhow, someone has to either remove the taint, or delete the pod. If this is done, the pod starts and the restore of the etcd data can start as well. You can see this is a bit too complicated and will take the customer cluster down for a while (not measured yet but in the range of 5-10 minutes). -- Storage in customer clusters. - This was not promised in 2020. We have an intermediate solution with the provisioning of csi-lvm by default into all customer clusters. Albeit this is only local storage and will get deleted if a node dies. -- S3 Storage. - We have two possibilities to cope with storage: - - In place update of the OS with a daemonset - This will be fast and simple, but might fail because the packages being installed are broken right now, or a filesystem gets full, or any other failure you can think of during an OS update. Another drawback is that metal-api does not reflect the updated os image. - - metal-api gets a machine reinstall endpoint - With this approach we leverage existing and already proven mechanisms. Reinstall must keep all data except the sata-dom. Gardener currently is not able to do an update with this approach because it can only do `rolling` updates. Therefore an additional `osupdatestrategy` has to be implemented for metal and other providers in gardener to be able to leverage the metal reinstall on the same machineID approach. 
- -If reinstall is implemented, we should focus on the same technology for all scenarios and put ceph via rook.io into the kubernetes clusters as additional StorageClass. It has to be checked whether to use the raw disk or a PV as the underlay block device where ceph stores its data. - -## API and behavior - -The API will get a new endpoint "reinstall". This endpoint takes two arguments: - -- machineID -- image - -No other aspects of the machine can be modified during the re-installation. All data stored in the existing allocation will be preserved, only the image will be modified. -Once this endpoint was called, the machine will get a `reboot` signal with the boot order set to PXE instead of HDD and the network interfaces on the leaf are set to PXE as well. Then the normal installation process starts: - -- unchanged: PXE boot with metal-hammer -- changed: metal-hammer first checks with the machineID in the metal-api (through metal-core) if there is already an allocation present -- changed: if an allocation is present and the allocation has set `reinstall: true`, wipe disk is only executed for the root disk, all other disks are untouched. -- unchanged: the specified image is downloaded and burned, `/install.sh` is executed -- unchanged: successful installation is reported back, network is set to the vrf, boot order is set to HDD. -- unchanged: distribution kernel is booted via kexec - -We can see that the `allocation` requires one additional parameter: `reinstall` and metal-hammer must check for already existing allocation at an earlier stage. - -Components which require modifications (first guess): - -- metal-hammer: - - check for allocation present earlier - - evaluation of `reinstall` flag set - - wipe of disks depends on that flag - - Bonus: move configuration of disk layout and primary disk detection algorithm (PDDA) from metal-hammer into metal-api. - metal-api **MUST** reject reinstallation if the disk found by PDDA does not have the `/etc/metal` directory! 
-- metal-core: - - probably nothing -- metal-api: - - new endpoint `/machine/reinstall` - - add `Reinstall bool` to data model of `allocation` - - make sure to reset `Reinstall` after reinstallation to prevent endless reinstallation loop -- metalctl: - - implement `reinstall` -- metal-go: - - implement `reinstall` -- gardener (longterm): - - add the `OSUpgradeStrategy` `reinstall` diff --git a/versioned_docs/version-v0.22.2/contributing/01-Proposals/MEP4/README.md b/versioned_docs/version-v0.22.2/contributing/01-Proposals/MEP4/README.md deleted file mode 100644 index 389a02d4..00000000 --- a/versioned_docs/version-v0.22.2/contributing/01-Proposals/MEP4/README.md +++ /dev/null @@ -1,211 +0,0 @@ ---- -slug: /MEP-4-multi-tenancy-for-the-metal-api -title: MEP-4 -sidebar_position: 4 ---- - -# Multi-Tenancy for the metal-api -:::info -This document is work in progress. -::: - -In the past we decided to treat the metal-api as a "low-level API", i.e. the API does not specifically deal with projects and tenants. A user with editor access can for example assign machines to every project he desires, he can see all the machines available and can control them. We tried to keep the metal-api code base as small as possible and we added resource scoping to a "higher-level APIs". From there, a user would be able to only see his own clusters and IP addresses. - -As time passed metal-stack has become an open-source project and people are willing to adopt. Adopters who want to put their own technologies on top of the metal-stack infrastructure don't have those "higher-level APIs" that we implemented closed-source for our user base. So, external adopters most likely need to implement resource scoping on their own. 
- -Introducing multi-tenancy to the metal-api is a serious chance of making our product better and more successful as it opens the door for: - -- Becoming a "fully-featured" API -- Narrowing down attack surfaces and possibility of unintended resource modification produced by bugs or human errors -- Discouraging people to implement their own scoping layers in front of the metal-stack -- Gaining performance through resource scopes -- Letting untrusted / third-parties work with the API - -## Requirements - -These are some general requirements / higher objectives that MEP-4 has to fulfill. - -- Should be able to run with mini-lab without requiring to setup complex auth backends (dex, LDAP, keycloak, ...) - - Simple to start with, more complex options for production setups -- Fine-grained access permissions (every endpoint maps to a permission) -- Tenant scoping (disallow resource access to resources of other tenants) -- Project scoping (disallow resource access to resources of other projects) -- Access tokens in self-service for technical user access - -## Implementation - -We gathered a lot of knowledge while implementing a multi-tenancy-capable backend for metalstack.cloud. The goal is now to use the same technology and adopt that to the metal-api, this includes: - -- gRPC in combination with connectrpc -- OPA for making auth decisions -- REST HTTP only for OIDC login flows - -### API Definitions - -The API definitions should be located on a separate Github repository separate from the server implementation. The proposed repository location is: https://github.com/metal-stack/api. - -This repository contains the `proto3` specification of the exposed metal-stack api. This includes the messages, simple validations, services and the access permission to these services. The input parameters for the authorization in the backend are generated from the `proto3` annotations. - -Client implementations for the most relevant languages (go, python) are generated automatically. 
- -This api is divided into end-user and admin access at the top level. The proposed APIs are: - -- `metalstack.api.v2`: For end-user facing services -- `metalstack.admin.v2`: For operators and controllers which need access to unscoped entities - -The methods of the API can have different role scopes (and can be narrowed down further with fine-grained method permissions): - -- `tenant`: Tenant-scoped methods, e.g. project creation (tenant needs to be provided in the request payload) - - Available roles: VIEWER, EDITOR, OWNER -- `project`: Project-scoped methods, e.g. machine creation (project needs to be provided in the request payload) - - Available roles: VIEWER, EDITOR, OWNER -- `admin`: Admin-scoped methods, e.g. unscoped tenant list or switch register - - Available roles: VIEWER, EDITOR - -And has methods with different visibility scopes: - -- `self`: Methods that only the logged-in user can access, e.g. show permissions with the presented token -- `public`: Methods that do not require any specific authorization - -### API - -The API server implements the services defined in the API and validates access to a method using OPA with the JWT tokens passed in the requests. The server is implemented using the connectrpc.com framework. - -The API server implements the login flow through OIDC. After successful authentication, the API server derives user permissions from the OIDC provider and issues a new JWT token which is passed on to the user. The tokens including the permissions are stored in a redis-compatible backend. - -With these tokens, users can create Access Tokens for CI/CD or other use cases. - -JWT Tokens can be revoked by admins and the user itself. - -### API Server - -It is put into a new github repo which implements the services defined in the `api` repository. It opens an `https` endpoint where the grpc (via connectrpc.com) and oidc services are exposed. 
- -### Migration of the Consumers - -To allow consumers to migrate to the `v2` API gradually, both apis, the new and the old, are deployed in parallel. In the control-plane both apis are deployed side-by-side behind the ingress. `api.example.com` is forwarded to `metal-api` and `metal.example.com` is forwarded to the new `metal-apiserver`. - -The api-server will talk to the existing metal-api during the process of migration services away to the new grpc api. - -The migration process can be done in the following manner: - -for each resource in the metal-api: - -- create a new proto3 based definition in the `api` repo. -- implement the business logic per service in the new `metal-apiserver` without calling the metal-api. -- clients must be able to talk to `v1` and `v2` backend in parallel -- Deprecate the already migrated service in the swagger route to notify the client that this route should not be used anymore. -- identify all consumers of this resource and replace them to use the grpc instead of the rest api -- move the business logic incl. the backend calls to ipam, metal-db, masterdata-api, nsq for this resource from the metal-api to the `metal-apiserver` - -We will migrate the rethinkdb backend implementation to a generic approach during this effort. - -- Try to enhance the generic rethinkdb interface with `project` scoped methods. - -There are a lot of consumers of metal-api, which need to be migrated: - -- ansible -- firewall-controller -- firewall-controller-manager -- gardener-extension-auth -- gardener-extension-provider-metal - - Do not point the secret bindings to a the shared provider secret in the seed anymore. Instead, use individual provider-secret containing project-scoped API access tokens in the Gardener project namespaces. 
-- machine-controller-manager-provider-metal -- metal-ccm -- metal-console -- metal-bmc -- metal-core -- metal-hammer -- metal-image-cache-sync -- metal-images -- metal-metrics-exporter -- metal-networker -- metalctl -- pixie - -## User Scenarios - -This section gathers a collection of workflows from the perspective of a user that we want to provide with the implementation of this proposal. - -### Machine Creation - -A regular user wants to create a machine resource. - -Requirements: Project was created, permissions are present - -- The user can see networks that were provided by the admin. - - ``` - $ metalctl network ls - ID NAME PROJECT PARTITION NAT SHARED PREFIXES IPS - internet Internet Network true false 212.34.83.0/27  ● - tenant-super-network-fra-equ01 Project Super Network fra-equ01 false false 10.128.0.0/14  ● - underlay-fra-equ01 Underlay Network fra-equ01 false false 10.0.0.0/16  ● - ``` - -- The user has to set the project scope first or provide `--project` flags for all commands. - ``` - $ metalctl project set 793bb6cd-8b46-479d-9209-0fedca428fe1 - You are now acting on project 793bb6cd-8b46-479d-9209-0fedca428fe1. - ``` -- The user can create the child network required for machine allocation. - ``` - $ metalctl network allocate --partition fra-equ01 --name test - ``` -- Now, the user sees his own child network. - ``` - $ metalctl network ls - ID NAME PROJECT PARTITION NAT SHARED PREFIXES IPS - internet Internet Network true false 212.34.83.0/27  ● - tenant-super-network-fra-equ01 Project Super Network fra-equ01 false false 10.128.0.0/14  ● - └─╴08b9114b-ec47-4697-b402-a11421788dc6 test 793bb6cd-8b46-479d-9209-0fedca428fe1 fra-equ01 false false 10.128.64.0/22  ● - underlay-fra-equ01 Underlay Network fra-equ01 false false 10.0.0.0/16  ● - ``` -- The user does not see any machines yet. - ``` - $ metalctl machine ls - ``` -- The user can create a machine. 
- ``` - $ metalctl machine create --networks internet,08b9114b-ec47-4697-b402-a11421788dc6 --name test --hostname test --image ubuntu-20.04 --partition fra-equ01 --size c1-xlarge-x86` - ``` -- The machine will now be provisioned. - ``` - $ metalctl machine ls - ID LAST EVENT WHEN AGE HOSTNAME PROJECT SIZE IMAGE PARTITION - 00000000-0000-0000-0000-ac1f6b7befb2 Phoned Home 20s 50d 4h test 793bb6cd-8b46-479d-9209-0fedca428fe1 c1-xlarge-x86 Ubuntu 20.04 20210415 fra-equ01 - ``` - -:::warning -A user **cannot** list all allocated machines for all projects. The user **must** always switch project context first and can only view the machines inside this project. Only admins can see all machines at once. -::: -### Scopes for Resources - -The admins / operators of the metal-stack should be able to provide _global_ resources that users are able to use along with their own resources. In particular, users can view and use _global_ resources, but they are not allowed to create, modify or delete them. - -:::info -When a project ID field is empty on a resource, the resource is considered _global_. -::: - -Where possible, users should be capable of creating their own resource entities. - -| Resource | User | Global | -| :----------------- | :--- | :----- | -| File System Layout | yes | yes | -| Firewall | yes | | -| Firmware | | yes | -| OS Image | | yes | -| Machine | yes | | -| Network (Base) | | yes | -| Network (Children) | yes | | -| IP | yes | | -| Partition | | yes | -| Project | yes | | -| Project Token | yes | | -| Size | | yes | -| Switch | | | -| Tenant | | yes | - -:::info -Example: A user can make use of the file system layouts provided by the admins, but can also create own layouts. Same applies for images. As soon as a user creates own resources, the user takes over the responsibility for the machine provisioning to succeed. 
-::: diff --git a/versioned_docs/version-v0.22.2/contributing/01-Proposals/MEP5/README.md b/versioned_docs/version-v0.22.2/contributing/01-Proposals/MEP5/README.md deleted file mode 100644 index 3b7fc45c..00000000 --- a/versioned_docs/version-v0.22.2/contributing/01-Proposals/MEP5/README.md +++ /dev/null @@ -1,54 +0,0 @@ ---- -slug: /MEP-5-shared-networks -title: MEP-5 -sidebar_position: 5 ---- - -# Shared Networks - -## Why are shared networks needed - -For special purpose machines that serve shared services with performance critical workloads to all machines of a partition (like persistent storage) it would be good to have kind of a "shared network" that is easily accessible. -They do not necessarily need another firewall. This would avoid having two firewalls in the datapath between a machine in a private network and the machines of a shared service. - -## Constraints that need to hold - -- a shared network is usable from all machines that have a firewall in front, that uses it -- a shared network is only usable within a single partition (currently we are constrained in bandwidth and have no routing of 10.0.0.0/8 addresses btw. 
partitions and failure domain should be the partition but this constraint might get lifted in the future) - -networks may be marked as shared after network allocation (but there should be no way back from shared to unshared) -- neither machines nor firewalls may have multiple private, unshared networks configured -- machines must have a single primary network configured - - this might be a shared network - - OR a plain, unshared private network -- firewalls may participate in multiple shared networks -- machines can be allocated with a primary network using auto IP allocation or with `noauto` and a specific IP - -## Should shared networks be private - -**Alternative 1:** If we implemented shared networks by extending functions around plain, private networks we would not have to manage another CIDR (minor point) and it would be possible to create a k8s cluster with a private network, mark the network as `shared` and produce shared services from this k8s cluster. - -**Alternative 2:** If shared networks are implemented as first-class networks we could customize the VRF and also accomplish another goal of our roadmap: being able to create machines directly in an external network. - -Together with @majst01 and @Gerrit91 we decided to continue to implement **Alternative 1**. - -## Firewalls accessing a shared network - -Firewalls that access shared networks need to: - -- hide the private network behind an ip address of the shared network if the shared network was configured with `nat=true`. -- import the prefixes of the shared VRF to the private VRF and import the prefixes of the private VRF to the shared VRF so that the communication between the two works in both directions. As long as no `nat=true` was set on the shared VRF, the original machine ips are visible in both communication directions. 
- -## Setup with shared networks and single consumer - -![Simple Setup](./shared.png) - -## Setup with single shared network and multiple consumers - -![Advanced Setup](./shared_advanced.png) - -## Getting internet access - -Machines contained in a shared network can access the internet with different scenarios: - -- if they have an own firewall: this is internet accessibility, as common (check whether all traffic gets routed through it!) -- if they don't have an own firewall, an external HTTP proxy is needed that has an endpoint exposed as Service Type NodePort diff --git a/versioned_docs/version-v0.22.2/contributing/01-Proposals/MEP5/shared.drawio b/versioned_docs/version-v0.22.2/contributing/01-Proposals/MEP5/shared.drawio deleted file mode 100644 index aa7af045..00000000 --- a/versioned_docs/version-v0.22.2/contributing/01-Proposals/MEP5/shared.drawio +++ /dev/null @@ -1,121 +0,0 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - \ No newline at end of file diff --git a/versioned_docs/version-v0.22.2/contributing/01-Proposals/MEP5/shared.png b/versioned_docs/version-v0.22.2/contributing/01-Proposals/MEP5/shared.png deleted file mode 100644 index b0b47f03..00000000 Binary files a/versioned_docs/version-v0.22.2/contributing/01-Proposals/MEP5/shared.png and /dev/null differ diff --git a/versioned_docs/version-v0.22.2/contributing/01-Proposals/MEP5/shared_advanced.drawio b/versioned_docs/version-v0.22.2/contributing/01-Proposals/MEP5/shared_advanced.drawio deleted file mode 100644 index 6f96eca0..00000000 --- a/versioned_docs/version-v0.22.2/contributing/01-Proposals/MEP5/shared_advanced.drawio +++ /dev/null @@ -1,187 +0,0 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - \ No newline at end of file diff --git a/versioned_docs/version-v0.22.2/contributing/01-Proposals/MEP5/shared_advanced.png b/versioned_docs/version-v0.22.2/contributing/01-Proposals/MEP5/shared_advanced.png deleted file mode 100644 index da989915..00000000 Binary files a/versioned_docs/version-v0.22.2/contributing/01-Proposals/MEP5/shared_advanced.png and /dev/null differ diff --git a/versioned_docs/version-v0.22.2/contributing/01-Proposals/MEP6/README.md b/versioned_docs/version-v0.22.2/contributing/01-Proposals/MEP6/README.md deleted file mode 100644 index edf52a6c..00000000 --- a/versioned_docs/version-v0.22.2/contributing/01-Proposals/MEP6/README.md +++ /dev/null @@ -1,123 +0,0 @@ ---- -slug: /MEP-6-dmz-networks -title: MEP-6 -sidebar_position: 6 ---- - -# DMZ Networks - -## Reasoning - -To fulfill higher levels of security measures the standard metal-stack approach with a single firewall in front of a set of machines might be insufficient. -There are cases where two physically distinct firewalls in front of application workload are mandatory. In traditional network terms this is known as DMZ approach. - -For Kubernetes workloads it makes sense to use the front cluster for ingress, WAF purposes and as outgoing proxy. The clusters may be used for application workload. - -## DMZ network - -- Use a separate DMZ network prefix for every tenant -- This is used as intermediate network btw. 
private networks of a tenant and the internet -- For every partition a distinct DMZ firewall/cluster is needed for a tenant -- For Gardener orchestrated Kubernetes clusters this network must be a publicly reachable internet prefix because shoot clusters need a vpn service that is used for instrumentation from the seed cluster - this will be a requirement as long as the inverse vpn tunnel feature Konnectivity is not available to us. - -## Approach 1: DMZ with publicly reachable internet prefix - -![DMZ Internet](dmz-internet_public.svg) - -A DMZ network with publicly reachable internet prefix will look like this in the metal-api: - -```yaml ---- -description: DMZ-Network -destinationprefixes: - - 0.0.0.0/0 -id: dmz -labels: - network.metal-stack.io/default-external: "" -name: DMZ-Network -parentnetworkid: null -partitionid: "" -prefixes: - - 212.90.30.128/25 -privatesuper: false -projectid: "" -vrf: 104007 -vrfshared: false -nat: true -shared: false -underlay: false -``` - -### DMZ firewall - -The firewall of the DMZ will intersect its private network for attached machines, the DMZ network and the public internet. - -- The private network of the project needs to import - - the default route from the internet network - - the DMZ network -- The internet network must import the DMZ network -- The DMZ network provides the default route for tenant's clusters in a partition. It imports the default route from the internet network - -### Application Firewall - -The firewall of application workloads intersects its private network for attached machines and the DMZ network. - -This is currently supported by the metal-networker and needs no further changes! 
- -## Approach 2: DMZ with private IPs - -![DMZ Internet](dmz-internet_private.svg) - -A DMZ network with private IPs will look like this in the metal-api: - -```yaml ---- -description: DMZ-Network -destinationprefixes: - - 0.0.0.0/0 -id: dmz -labels: - network.metal-stack.io/default-external: "" -name: DMZ-Network -parentnetworkid: tenant-super-network-fra-equ01 -partitionid: fra-equ01 -prefixes: - - 10.90.30.128/25 -privatesuper: false -projectid: "" -vrf: 4711 -vrfshared: false -nat: true -shared: true # it's usable from multiple projects -underlay: false -``` - -### DMZ firewall - -The firewall of the DMZ will intersect its private network for attached machines, the DMZ network and the public internet. - -- The private network of the project needs to import - - the default route from the internet network - - the DMZ network -- The internet network must import the DMZ network (only locally, no-export) -- The DMZ network provides the default route for tenant's clusters in a partition. It imports the default route from the internet network - -### Application Firewall - -The firewall of application workloads intersects its private network for attached machines and the DMZ network. 
- -## Code Changes / Implications - -- `metal-networker` and `metal-ccm` assume that there is only one network providing the default-route -- `metal-networker` needs to - - import the default route from the internet network to the dmz network (DMZ Firewall) - - import the DMZ network to the internet network and adjusting NAT rules (DMZ Firewall) - - import destination prefixes of the DMZ network to the private primary network (DMZ Firewall, Application Firewall) - - import DMZ-IPs of the private primary network to the DMZ network (DMZ Firewall, Application Firewall) -- `metal-api`: destination prefixes of private networks need to be configurable (`allocateNetwork`) -- `gardener-extension-provider-metal`: needs to be able to delete DMZ clusters (but skip the network deletion part) -- the application firewall is not publicly reachable - for debugging purposes a hop over the DMZ firewall is needed - -## Decision - -We decided to follow the second approach with private DMZ networks. diff --git a/versioned_docs/version-v0.22.2/contributing/01-Proposals/MEP6/dmz-internet_private.drawio b/versioned_docs/version-v0.22.2/contributing/01-Proposals/MEP6/dmz-internet_private.drawio deleted file mode 100644 index 7b83bbfc..00000000 --- a/versioned_docs/version-v0.22.2/contributing/01-Proposals/MEP6/dmz-internet_private.drawio +++ /dev/null @@ -1,178 +0,0 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - \ No newline at end of file diff --git a/versioned_docs/version-v0.22.2/contributing/01-Proposals/MEP6/dmz-internet_private.svg b/versioned_docs/version-v0.22.2/contributing/01-Proposals/MEP6/dmz-internet_private.svg deleted file mode 100644 index f5e58204..00000000 --- 
a/versioned_docs/version-v0.22.2/contributing/01-Proposals/MEP6/dmz-internet_private.svg +++ /dev/null @@ -1,3 +0,0 @@ -
Machine
Machine
Firewall DMZ
Firewall DMZ
DMZ VRF
DMZ VRF
Machine
Machine
Firewall A
Firewall A
Private VRF A
Private VRF A
10.0.0.2
10.90.30.129
/0 via Firewall A
10.0.0.2...
VRF A 10.0.0.1
VRF A 10.0.0.1
DMZ Network
10.90.30.128/25
DMZ Network...
Private Network
10.0.0.0/24
Private Network...
import /0
import /0
import 10.0.0.0/24
import 10.0.0.0/24 -
Machine
Machine
Firewall B
Firewall B
Private VRF B
Private VRF B
10.0.1.2
/0 via Firewall B
10.0.1.2...
VRF B 10.0.1.1
VRF B 10.0.1.1
Private Network
10.0.1.0/24
Private Network...
import /0
import /0
import 10.0.1.0/24
import 10.0.1.0/24 -
10.90.30.129 is reachable
/0 via Firewall DMZ
10.0.0.0/24 is reachable
10.0.1.0/24 is reachable
10.90.30.129 is reachable...
Internet
212.1.1.0/27
Internet...
SNAT to 212.1.1.1
SNAT to 212.1.1.1
Internet VRF
Internet VRF
import /0
import /0

import 10.0.0.0/24 no export
import 10.0.1.0/24 no export
import 10.90.30.128/25 no export
import 10.0.0.0/24 no exp...
Viewer does not support full SVG 1.1
\ No newline at end of file diff --git a/versioned_docs/version-v0.22.2/contributing/01-Proposals/MEP6/dmz-internet_public.drawio b/versioned_docs/version-v0.22.2/contributing/01-Proposals/MEP6/dmz-internet_public.drawio deleted file mode 100644 index 544939e5..00000000 --- a/versioned_docs/version-v0.22.2/contributing/01-Proposals/MEP6/dmz-internet_public.drawio +++ /dev/null @@ -1,184 +0,0 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - \ No newline at end of file diff --git a/versioned_docs/version-v0.22.2/contributing/01-Proposals/MEP6/dmz-internet_public.svg b/versioned_docs/version-v0.22.2/contributing/01-Proposals/MEP6/dmz-internet_public.svg deleted file mode 100644 index 5e825081..00000000 --- a/versioned_docs/version-v0.22.2/contributing/01-Proposals/MEP6/dmz-internet_public.svg +++ /dev/null @@ -1,3 +0,0 @@ -
Machine
Machine
Firewall DMZ
Firewall DMZ
DMZ VRF
DMZ VRF
Machine
Machine
Firewall A
Firewall A
Private VRF A
Private VRF A
10.0.0.2
212.1.2.3
/0 via Firewall A
10.0.0.2...
VRF A 10.0.0.1
VRF A 10.0.0.1
DMZ Network
212.1.2.0/27
DMZ Network...
Private Network
10.0.0.0/24
Private Network...
import /0
import /0
import 10.0.0.0/24
import 10.0.0.0/24 -
Machine
Machine
Firewall B
Firewall B
Private VRF B
Private VRF B
10.0.1.2
/0 via Firewall B
10.0.1.2...
VRF B 10.0.1.1
VRF B 10.0.1.1
Private Network
10.0.1.0/24
Private Network...
import /0
import /0
import 10.0.1.0/24
import 10.0.1.0/24 -
212.1.2.3 is reachable
/0 via Firewall DMZ
212.1.2.3 is reachable...
Internet
212.1.1.0/27 212.1.2.0/27
Internet...
SNAT to 212.1.1.1
SNAT to 212.1.1.1
Internet VRF
Internet VRF
import /0
import /0
import 212.1.2.0/27
import 10.0.0.0/24 no redistribute
import 10.0.1.0/24 no redistribute

import 212.1.2.0/27...
SNAT to
212.1.2.1
SNAT to...
SNAT to
212.1.2.2
SNAT to...
Viewer does not support full SVG 1.1
\ No newline at end of file diff --git a/versioned_docs/version-v0.22.2/contributing/01-Proposals/MEP8/README.md b/versioned_docs/version-v0.22.2/contributing/01-Proposals/MEP8/README.md deleted file mode 100644 index 14748fae..00000000 --- a/versioned_docs/version-v0.22.2/contributing/01-Proposals/MEP8/README.md +++ /dev/null @@ -1,503 +0,0 @@ ---- -slug: /MEP-7-configurable-filesystem-layout-for-machine-allocation -title: MEP-7 -sidebar_position: 7 ---- - -# Configurable Filesystem layout for Machine Allocation - -The current implementation uses a hard coded filesystem layout depending on the specified size and image. This is done in the metal-hammer. This worked well in the past because we had a small amount of sizes and images. But we reached a point where this is to restricted for all use cases we have to fulfill. It also forces us to modify the metal-hammer source code to support a new filesystem layout. - -This proposal tries to address this issue by introducing a filesystem layout struct in the metal-api which is then configurable per machine allocation. -The original behavior of automatic filesystem layout decision must still be present, because there must be no API change for existing API consumers. It should be a additional feature during machine allocation. - -## API and behavior - -The API will get a new endpoint `filesystemlayouts`to create/update/delete a set of available `filesystemlayouts`. - -### Constraints - -In order to keep the actual machine allocation api compatible, there must be no difference while allocating a machine. To achieve this every -`filesystemlayout` defines constraints which specifies for which combination of `sizes` and `images` this layout should be used by default. -The specified constraints over all `filesystemlayouts` therefore must be collision free, to be more specific, there must be exactly one layout outcome -for every possible combination of `sizes` and `images`. 
- -The `size` constraint must be a list of the exact size ids, the `image` constraint must be a map of os to semver compatible version constraint. For example: - -- `debian: ">= 10.20210101"` or `debian: "< 10.20210101"` - -The general form of a `image` constraint is a map from `os` to `versionconstraint` where: - -`os` must match the first part of the image without the version. -`versionconstraint` must be the comparator, a space and the version, or simply `*` to match all versions of this `os`. -The comparator must be one of: "=", "!=", ">", "<", ">=", "=>", "<=", "=<", "~", "~>", "^" - -It must also be possible to have a `filesystemlayout` in development or for other special purposes, which can be specified during the machine allocation. -To have such a layout, both constraints `sizes` and `images`must be empty list. - -### Reinstall - -The current reinstall implementation the metal-hammer detects during the installation on which disk the OS was installed and reports back to the metal-api the Report struct which has two properties `primarydisk` and `ospartition`. -Both fields are not required anymore because the logic is now shifted to the `filesystemlayout` definition. If `Disk.WipeOnReinstall` is set to true, this disk will be wiped, default is false and is preserved. - -### Handling of s2-xlarge machines - -These machines are a bit special compared to our `c1-*` machines because they have rotating hard disks for the mass storage purpose. -The downside is that the on board SATA-DOM has the same naming as the HDDs and can not be specified as the first /dev/sda disk because all HDDs are also /dev/sd\* disks. -Therefore we had a special SATA-DOM detection algorithm inside metal-hammer which simply checks for the smallest /dev/sd disk and took this to install the OS. - -This is not possible with the current approach, but we figured out that the SATA-DOM is always `/dev/sde`. 
So we can create a special `filesystemlayout` where the installations is made on this disk. - -### Possible Filesystemlayout hierarchies - -It is only possible to create a filesystem on top of a block device. The creation of a block device can be done on multiple ways, depending on the requirements regarding performance, space and redundancy of the filesystem. -It also depends on the disks available on the server. - -The current approach implements the following hierarchies: - -![filesystems](filesystems.png) - -### Implementation - -```go -// FilesystemLayout to be created on the given machine -type FilesystemLayout struct { - // ID unique layout identifier - ID string - // Description is human readable - Description string - // Filesystems to create on the server - Filesystems []Filesystem - // Disks to configure in the server with their partitions - Disks []Disk - // Raid if not empty, create raid arrays out of the individual disks, to place filesystems onto - Raid []Raid - // VolumeGroups to create - VolumeGroups []VolumeGroup - // LogicalVolumes to create on top of VolumeGroups - LogicalVolumes []LogicalVolume - // Constraints which must match to select this Layout - Constraints FilesystemLayoutConstraints -} - -type FilesystemLayoutConstraints struct { - // Sizes defines the list of sizes this layout applies to - Sizes []string - // Images defines a map from os to versionconstraint - // the combination of os and versionconstraint per size must be conflict free over all filesystemlayouts - Images map[string]string -} - -type RaidLevel string -type Format string -type GPTType string - -// Filesystem defines a single filesystem to be mounted -type Filesystem struct { - // Path defines the mountpoint, if nil, it will not be mounted - Path *string - // Device where the filesystem is created on, must be the full device path seen by the OS - Device string - // Format is the type of filesystem should be created - Format Format - // Label is optional enhances 
readability - Label *string - // MountOptions which might be required - MountOptions []string - // CreateOptions during filesystem creation - CreateOptions []string -} - -// Disk represents a single block device visible from the OS, required -type Disk struct { - // Device is the full device path - Device string - // Partitions to create on this device - Partitions []Partition - // WipeOnReinstall, if set to true the whole disk will be erased if reinstall happens - // during fresh install all disks are wiped - WipeOnReinstall bool -} - -// Raid is optional, if given the devices must match. -// TODO inherit GPTType from underlay device ? -type Raid struct { - // ArrayName of the raid device, most often this will be /dev/md0 and so forth - ArrayName string - // Devices the devices to form a raid device - Devices []Device - // Level the raidlevel to use, can be one of 0,1,5,10 - // TODO what should be support - Level RaidLevel - // CreateOptions required during raid creation, example: --metadata=1.0 for uefi boot partition - CreateOptions []string - // Spares defaults to 0 - Spares int -} - - -// VolumeGroup is optional, if given the devices must match. 
-type VolumeGroup struct { - // Name of the volumegroup without the /dev prefix - Name string - // Devices the devices to form a volumegroup device - Devices []string - // Tags to attach to the volumegroup - Tags []string -} - -// LogicalVolume is a block devices created with lvm on top of a volumegroup -type LogicalVolume struct { - // Name the name of the logical volume, without /dev prefix, will be accessible at /dev/vgname/lvname - Name string - // VolumeGroup the name of the volumegroup - VolumeGroup string - // Size of this LV in mebibytes (MiB) - Size uint64 - // LVMType can be either striped or raid1 - LVMType LVMType -} - -// Partition is a single partition on a device, only GPT partition types are supported -type Partition struct { - // Number of this partition, will be added to the device once partitioned - Number int - // Label to enhance readability - Label *string - // Size given in MebiBytes (MiB) - // if "0" is given the rest of the device will be used, this requires Number to be the highest in this partition - Size string - // GPTType defines the GPT partition type - GPTType *GPTType -} - -const ( - // VFAT is used for the UEFI boot partition - VFAT = Format("vfat") - // EXT3 is usually only used for /boot - EXT3 = Format("ext3") - // EXT4 is the default fs - EXT4 = Format("ext4") - // SWAP is for the swap partition - SWAP = Format("swap") - // None - NONE = Format("none") - - // GPTBoot EFI Boot Partition - GPTBoot = GPTType("ef00") - // GPTLinux Linux Partition - GPTLinux = GPTType("8300") - // GPTLinuxRaid Linux Raid Partition - GPTLinuxRaid = GPTType("fd00") - // GPTLinux Linux Partition - GPTLinuxLVM = GPTType("8e00") - - // LVMTypeLinear append across all physical volumes - LVMTypeLinear = LVMType("linear") - // LVMTypeStriped stripe across all physical volumes - LVMTypeStriped = LVMType("striped") - // LVMTypeStripe mirror with raid across all physical volumes - LVMTypeRaid1 = LVMType("raid1") -) -``` - -Example `metalctl` outputs: - 
-```bash -$ metalctl filesystemlayouts ls -ID DESCRIPTION SIZES IMAGES -default default fs layout c1-large-x86, c1-xlarge-x86 debian >=10, ubuntu >=20.04, centos >=7 -ceph fs layout for ceph s2-large-x86, s2-xlarge-x86 debian >=10, ubuntu >=20.04 -firewall firewall fs layout c1-large-x86, c1-xlarge-x86 firewall >=2 -storage storage fs layout s3-large-x86 centos >=7 -s3 storage fs layout s2-xlarge-x86 debian >=10, ubuntu >=20.04, >=firewall-2 -default-devel devel fs layout -``` - -The `default` layout reflects what is actually implemented in metal-hammer to guarantee backward compatibility. - -```yaml ---- -id: default -constraints: - sizes: - - c1-large-x86 - - c1-xlarge-x86 - images: - debian: ">=10" - ubuntu: ">=20.04" - centos: ">=7" -filesystems: - - path: "/boot/efi" - device: "/dev/sda1" - format: "vfat" - options: "-F 32" - label: "efi" # required to be compatible with old images - - path: "/" - device: "/dev/sda2" - format: "ext4" - label: "root" # required to be compatible with old images - - path: "/var/lib" - device: "/dev/sda3" - format: "ext4" - label: "varlib" # required to be compatible with old images - - path: "/tmp" - device: "tmpfs" - format: "tmpfs" - mountoptions: - [ - "defaults", - "noatime", - "nosuid", - "nodev", - "noexec", - "mode=1777", - "size=512M", - ] -disks: - - device: "/dev/sda" - wipe: true - partitions: - - number: 1 - label: "efi" - size: 500 - type: GPTBoot - - number: 2 - label: "root" - size: 5000 - type: GPTLinux - - number: 3 - label: "varlib" - size: 0 # to end of partition - type: GPTLinux -``` - -The `firewall` layout reuses the built in nvme disk to store the logs, which is way faster and larger than what the sata-dom ssd provides. 
- -```yaml ---- -id: firewall -constraints: - sizes: - - c1-large-x86 - - c1-xlarge-x86 - images: - firewall: ">=2" -filesystems: - - path: "/boot/efi" - device: "/dev/sda1" - format: "vfat" - options: "-F 32" - - path: "/" - device: "/dev/sda2" - format: "ext4" - - path: "/var" - device: "/dev/nvme0n1p1" - format: "ext4" -disks: - - device: "/dev/sda" - wipe: true - partitions: - - number: 1 - label: "efi" - size: 500 - type: GPTBoot - - number: 2 - label: "root" - size: 5000 - type: GPTLinux - - device: "/dev/nvme0n1" - wipe: true - partitions: - - number: 1 - label: "var" - size: 0 - type: GPTLinux -``` - -The `storage` layout will be used for the storage servers, which must have mirrored boot disks. - -```yaml ---- -id: storage -constraints: - sizes: - - s3-large-x86 - images: - centos: ">=7" -filesystems: - - path: "/boot/efi" - device: "/dev/md1" - format: "vfat" - options: "-F32" - - path: "/" - device: "/dev/md2" - format: "ext4" -disks: - - device: "/dev/sda" - wipe: true - partitions: - - number: 1 - label: "efi" - size: 500 - type: GPTLinuxRaid - - number: 2 - label: "root" - size: 5000 - type: GPTLinuxRaid - - device: "/dev/sdb" - wipe: true - partitions: - - number: 1 - label: "efi" - size: 500 - type: GPTLinuxRaid - - number: 2 - label: "root" - size: 5000 - type: GPTLinuxRaid -raid: - - name: "/dev/md1" - level: 1 - devices: - - "/dev/sda1" - - "/dev/sdb1" - options: "--metadata=1.0" - - name: "/dev/md2" - level: 1 - devices: - - "/dev/sda2" - - "/dev/sdb2" - options: "--metadata=1.0" -``` - -The `s3-storage` layout matches the special situation on the s2-xlarge machines. 
- -```yaml ---- -id: s3-storage -constraints: - sizes: - - c1-large-x86 - - s2-xlarge-x86 - images: - debian: ">=10" - ubuntu: ">=20.04" - centos: ">=7" -filesystems: - - path: "/boot/efi" - device: "/dev/sde1" - format: "vfat" - options: "-F 32" - - path: "/" - device: "/dev/sde2" - format: "ext4" - - path: "/var/lib" - device: "/dev/sde3" - format: "ext4" -disks: - - device: "/dev/sde" - wipe: true - partitions: - - number: 1 - label: "efi" - size: 500 - type: GPTBoot - - number: 2 - label: "root" - size: 5000 - type: GPTLinux - - number: 3 - label: "varlib" - size: 0 # to end of partition - type: GPTLinux -``` - -A sample `lvm` layout which puts `/var/lib` as stripe on the nvme device - -```yaml ---- -id: lvm -description: "lvm layout" -constraints: - size: - - s2-xlarge-x86 - images: - debian: ">=10" - ubuntu: ">=20.04" - centos: ">=7" -filesystems: - - path: "/boot/efi" - device: "/dev/sda1" - format: "vfat" - createoptions: - - "-F 32" - label: "efi" - - path: "/" - device: "/dev/sda2" - format: "ext4" - label: "root" - - path: "/var/lib" - device: "/dev/vg00/varlib" - format: "ext4" - label: "varlib" - - path: "/tmp" - device: "tmpfs" - format: "tmpfs" - mountoptions: - [ - "defaults", - "noatime", - "nosuid", - "nodev", - "noexec", - "mode=1777", - "size=512M", - ] -volumegroups: - - name: "vg00" - devices: - - "/dev/nvmne0n1" - - "/dev/nvmne0n2" -logicalvolumes: - - name: "varlib" - volumegroup: "vg00" - size: 200 - lvmtype: "striped" -disks: - - device: "/dev/sda" - wipeonreinstall: true - partitions: - - number: 1 - label: "efi" - size: 500 - gpttype: "ef00" - - number: 2 - label: "root" - size: 5000 - gpttype: "8300" - - device: "/dev/nvmne0n1" - wipeonreinstall: false - - device: "/dev/nvmne0n2" - wipeonreinstall: false -``` - -## Components which requires modifications - -- metal-hammer: - - change implementation from build in hard coded logic - - move logic to create fstab from install.sh to metal-hammer -- metal-api: - - new endpoint 
`filesystemlayouts` - - add optional spec of `filesystemlayout` during `allocation` with validation if given `filesystemlayout` is possible on given size. - - add `allocation.filesystemlayout` in the response, based on either the specified `filesystemlayout` or the calculated one. - - implement `filesystemlayouts` validation for: - - matching to disks in the size - - no overlapping with the sizes/imagefilter specified in `filesystemlayouts` - - all devices specified exists from top to bottom (fs -> disks -> device || fs -> raid -> devices) -- metalctl: - - implement `filesystemlayouts` -- metal-go: - - adopt api changes -- metal-images: - - install mdadm for raid support diff --git a/versioned_docs/version-v0.22.2/contributing/01-Proposals/MEP8/filesystems.drawio b/versioned_docs/version-v0.22.2/contributing/01-Proposals/MEP8/filesystems.drawio deleted file mode 100644 index 0f0c6ab5..00000000 --- a/versioned_docs/version-v0.22.2/contributing/01-Proposals/MEP8/filesystems.drawio +++ /dev/null @@ -1,43 +0,0 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - \ No newline at end of file diff --git a/versioned_docs/version-v0.22.2/contributing/01-Proposals/MEP8/filesystems.png b/versioned_docs/version-v0.22.2/contributing/01-Proposals/MEP8/filesystems.png deleted file mode 100644 index 6d903b7e..00000000 Binary files a/versioned_docs/version-v0.22.2/contributing/01-Proposals/MEP8/filesystems.png and /dev/null differ diff --git a/versioned_docs/version-v0.22.2/contributing/01-Proposals/MEP9/README.md b/versioned_docs/version-v0.22.2/contributing/01-Proposals/MEP9/README.md deleted file mode 100644 index a8cae83d..00000000 --- a/versioned_docs/version-v0.22.2/contributing/01-Proposals/MEP9/README.md +++ /dev/null @@ -1,132 +0,0 @@ ---- -slug: /MEP-9-no-open-ports-to-the-data-center -title: MEP-9 -sidebar_position: 9 ---- - -# No Open Ports To the Data Center - -Our metal-stack partitions typically have open ports for metal-stack 
native services, these are: - -- SSH port on the firewalls -- bmc-reverse-proxy for serial console access through the metal-console - -These open ports are potential security risks. For example, while SSH access is possible only with a private key, it's still vulnerable to DoS attacks. - -Therefore, we want to get rid of these open ports to reduce the attack surface to the data center. - -## Requirements - -- Access to firewall SSH only via VPN -- Easy to update VPN components - -As a next step, we can also consider joining the management servers to the VPN mesh, which would replace typical WireGuard setups for operators to enter resources inside the partition. - -## High Level Design - -[](./architecture.svg) - -> Simplified drawing showing old vs. new architecture. - -### Concerns - -There are a few concerns when using WireGuard for implementing VPN: - -1. WireGuard doesn't implement dynamic cipher substitution. This is important in case one of the crypto methods used by WireGuard is broken. The only possible solution for that will be to update WireGuard to a fixed version. -2. The coordination server (Headscale) is a single point of failure. In case it fails, it potentially can disconnect existing members of the network, as WireGuard can't manage dynamic IPs by itself. -3. Headscale already falls behind the Tailscale coordination server implementation. This can complicate the upgrade to a newer version of the Tailscale client in case of emergency. - -### Solutions to concerns - -1. Tailscale node software is using a userspace implementation of WireGuard -- `wireguard-go`. One of the options is to inject the Tailscale client into `metalctl` and make it available as `metalctl vpn` or a similar command. It should be possible to do as the `tailscale` node is already available as an open sourced Go pkg. That would allow us to control what version of Tailscale users are using and, in case of any critical changes, to enforce them to update `metalctl` to use VPN functionality. -2. 
Would it be a considerable risk? We could look into the `wg-dynamic` project to cover this problem. -3. At the moment, the repository looks well maintained and the metal-stack team already contributes to it. - -## Implementation Details - -### metal-roles - -`metal-roles` will be responsible for deployment of the `headscale` server (via a new `headscale` role). It also should provide sufficient config to `metal-api` so it establishes a connection with the `headscale` gRPC server. - -### New `metalctl` commands - -`metalctl` will be responsible for the client-side implementation of this MEP. Specifically, users are expected to connect to firewalls by using `metalctl`. - -- `metalctl vpn` -- section for VPN related commands: - - `metalctl vpn get key [vpn name] --namespace [namespace name]` -- returns auth key to be used with `tailscale` client for establishing connection. - -Extend `metalctl firewall`: - -- `metalctl firewall ssh [ID]` -- connect to firewall via SSH. - -Extend `metalctl machine`: - -- `metalctl machine ssh [ID]` -- connect to machine via SSH. - -`metalctl` will be able to connect to firewall and machines by running `tailscale` in container. - -### metal-api - -Updates to `metal-api` should be made, so that it's able to add firewalls to VPNs. There should be one Tailscale namespace per project. So if multiple firewalls are created in a single project, they will join the same namespace. - -Two new flags should be introduced to connect `metal-api` to the `headscale` gRPC server: - -- `headscale-addr` -- specifies address of Headscale grpc API. -- `headscale-api-key` -- specifies temporary API key to connect to Headscale. It should be replaced and then rotated by `metal-api`. - -If `metal-api` is initialized with a `headscale` connection, it should automatically join all created firewalls to the VPN. - -Add a new endpoint, that will be used by `metalctl` to connect to the VPN: - -- `/v1/vpn GET` -- requests auth key from `headscale` server. 
- -### metal-hammer - -`metal-hammer` acts as an intermediary for machine configuration between `metal-api` and machine's image. Specifically it writes to `/etc/metal/install.yaml` file, data from which later will be used by image's `install.sh` file. - -To implement VPN support we have to add authentication key and VPN server address to `install.yaml` file. This key will be used to join machine to a VPN. - -### metal-images - -Images `install.sh` script have to be updated to work with authentication key and VPN server address, provided in `install.yaml` file. If this key is present, machine should connect to VPN. - -### metal-networker - -`metal-networker` also have to know if VPN was configured. In that case we need to disable public access to SSH and allow all(?) traffic from WireGuard interface. - -### firewall-controller - -`firewall-controller` have to monitor changes in `Firewall` resource and keep `tailscaled` version up-to-date. - -### Resources - -Update `Firewall` resource to include desired/actual `tailscale` version: - -``` -Firewall: - Spec: - tailscale: - Version: Minimal version - ... - Status: - ... - VPN: - Status: Boolean field - tailscale: - Version: Actual version - ... -``` - -### bmc-reverse-proxy - -TODO - -## References - -1. [WireGuard: Next Generation Secure Network Tunnel](https://www.youtube.com/watch?v=88GyLoZbDNw) -2. [How Tailscale works](https://tailscale.com/blog/how-tailscale-works) -3. [Tailscale is officially SOC 2 compliant](https://tailscale.com/blog/soc2) -4. [Why not Wireguard](https://www.ipfire.org/blog/why-not-wireguard) -5. [Wireguard: Known Limitations](https://www.wireguard.com/known-limitations/) -6. [Wireguard: Things That Might Be Accomplished](https://www.wireguard.com/todo/) -7. 
[Headscale: Tailscale control protocol v2](https://github.com/juanfont/headscale/issues/526) diff --git a/versioned_docs/version-v0.22.2/contributing/01-Proposals/MEP9/architecture.drawio b/versioned_docs/version-v0.22.2/contributing/01-Proposals/MEP9/architecture.drawio deleted file mode 100644 index adb09214..00000000 --- a/versioned_docs/version-v0.22.2/contributing/01-Proposals/MEP9/architecture.drawio +++ /dev/null @@ -1,324 +0,0 @@ - - - - - - - -
-
-
- Metal Control Plane -
-
-
-
- - Metal Control Plane - -
-
- - - - -
-
-
- metal-stack -
- Partition -
-
-
-
- - metal-stack... - -
-
- - - - -
-
-
- firewall -
-
-
-
- - firewall - -
-
- - - - - -
-
-
- machine -
-
-
-
- - machine - -
-
- - - - -
-
-
- ssh -
-
-
-
- - ssh - -
-
- - - - -
-
-
- bmc-proxy -
-
-
-
- - bmc-proxy - -
-
- - - - - - - - - - -
-
-
- Metal Control Plane -
-
-
-
- - Metal Control Plane - -
-
- - - - -
-
-
- metal-stack -
- Partition -
-
-
-
- - metal-stack... - -
-
- - - - -
-
-
- firewall -
-
-
-
- - firewall - -
-
- - - - - -
-
-
- machine -
-
-
-
- - machine - -
-
- - - - -
-
-
- ssh -
-
-
-
- - ssh - -
-
- - - - - - -
-
-
- bmc-proxy -
-
-
-
- - bmc-proxy - -
-
- - - - -
-
-
- headscale -
-
-
-
- - headscale - -
-
- - - - - - - - - - -
-
-
- tailscaled -
-
-
-
- - tailscaled - -
-
- - - - - - -
-
-
- tailscaled -
-
-
-
- - tailscaled - -
-
- - - - -
-
-
- Internet -
-
-
-
- - Internet - -
-
- - - - -
-
-
- Internet -
-
-
-
- - Internet - -
-
-
- - - - - Viewer does not support full SVG 1.1 - - - -
diff --git a/versioned_docs/version-v0.22.2/contributing/01-Proposals/MEP9/architecture.svg b/versioned_docs/version-v0.22.2/contributing/01-Proposals/MEP9/architecture.svg deleted file mode 100644 index fd268d2f..00000000 --- a/versioned_docs/version-v0.22.2/contributing/01-Proposals/MEP9/architecture.svg +++ /dev/null @@ -1 +0,0 @@ -
Metal Control Plane
Metal Control Plane
metal-stack
Partition
metal-stack...
firewall
firewall
machine
machine
ssh
ssh
bmc-proxy
bmc-proxy
Metal Control Plane
Metal Control Plane
metal-stack
Partition
metal-stack...
firewall
firewall
machine
machine
ssh
ssh
bmc-proxy
bmc-proxy
headscale
headscale
tailscaled
tailscaled
tailscaled
tailscaled
Internet
Internet
Internet
Internet
Text is not SVG - cannot display
\ No newline at end of file diff --git a/versioned_docs/version-v0.22.2/contributing/01-Proposals/_category_.json b/versioned_docs/version-v0.22.2/contributing/01-Proposals/_category_.json deleted file mode 100644 index 2e7fa4bf..00000000 --- a/versioned_docs/version-v0.22.2/contributing/01-Proposals/_category_.json +++ /dev/null @@ -1,4 +0,0 @@ -{ - "position": 1, - "label": "Enhancement Proposals" -} \ No newline at end of file diff --git a/versioned_docs/version-v0.22.2/contributing/01-Proposals/index.md b/versioned_docs/version-v0.22.2/contributing/01-Proposals/index.md deleted file mode 100644 index 0f6eddc3..00000000 --- a/versioned_docs/version-v0.22.2/contributing/01-Proposals/index.md +++ /dev/null @@ -1,69 +0,0 @@ ---- -slug: /enhancement-proposals -title: Enhancement Proposals -sidebar_position: 1 ---- - -# Metal Stack Enhancement Proposals (MEPs) - -This section contains proposals which address substantial modifications to metal-stack. - -Every proposal has a short name which starts with _MEP_ followed by an incremental, unique number. Proposals should be raised as pull requests in the [website](https://github.com/metal-stack/website) repository and can be discussed in Github issues. - -The list of proposals and their current state is listed in the table below. - -Possible states are: - -- `In Discussion` -- `Accepted` -- `Declined` -- `In Progress` -- `Completed` -- `Aborted` - -Once a proposal was accepted, an issue should be raised and the implementation should be done in a separate PR. 
- -| Name | Description | State | Progress | -| :------------------------------------------------------------- | :--------------------------------------------- | :-------------: | :----------------------------------------------------------------: | -| [MEP-1](MEP1/README.md) | Distributed Control Plane Deployment | `Declined` | | -| [MEP-2](MEP2/README.md) | Two Factor Authentication | `Aborted` | | -| [MEP-3](MEP3/README.md) | Machine Re-Installation to preserve local data | `Completed` | | -| [MEP-4](MEP4/README.md) | Multi-tenancy for the metal-api | `In Progress` | [releases#236](https://github.com/metal-stack/releases/issues/236) | -| [MEP-5](MEP5/README.md) | Shared Networks | `Completed` | | -| [MEP-6](MEP6/README.md) | DMZ Networks | `Completed` | | -| [MEP-7](https://github.com/metal-stack/docs-archive/pull/51) | Passing environment variables to machines | `Declined` | | -| [MEP-8](MEP8/README.md) | Configurable Filesystemlayout | `Completed` | | -| [MEP-9](MEP9/README.md) | No Open Ports To the Data Center | `Completed` | | -| [MEP-10](MEP10/README.md) | SONiC Support | `Completed` | | -| [MEP-11](MEP11/README.md) | Auditing of metal-stack resources | `Completed` | | -| [MEP-12](MEP12/README.md) | Rack Spreading | `Completed` | | -| [MEP-13](MEP13/README.md) | IPv6 | `Completed` | | -| [MEP-14](MEP14/README.md) | Independence from external sources | `Completed` | | -| [MEP-15](https://github.com/metal-stack/docs-archive/pull/232) | HAL Improvements | `In Discussion` | [releases#238](https://github.com/metal-stack/releases/issues/238) | -| [MEP-16](MEP16/README.md) | Firewall Support for Cluster API Provider | `Accepted` | [releases#237](https://github.com/metal-stack/releases/issues/237) | -| [MEP-17](MEP17/README.md) | Global Network View | `In Discussion` | | -| [MEP-18](MEP18/README.md) | Autonomous Control Plane | `In Discussion` | | - -## Proposal Process - -1. 
Before starting a new proposal, it is advised to have a quick chat with one of the maintainers. -2. Create a draft pull request in the [website](https://github.com/metal-stack/website) repository with your proposal. Your proposal doesn't have to be finished at this point. -3. Share the PR in the [metal-stack Slack](https://metal-stack.slack.com/) and invite maintainers to review it. -4. The review itself will probably take place in multiple iterations. Don't be discouraged if your proposal is not accepted right away. The goal is to reach consensus. -5. Once your proposal is accepted, create an umbrella issue in the relevant repository or when multiple repositories are involved in the [releases](https://github.com/metal-stack/releases). -6. Other issues should be created in different repositories and linked to the umbrella issue. -7. Unless stated otherwise, the proposer is responsible for the implementation of the proposal. - -## How to Write a Good MEP - -In the first section of your MEP, start with the current situation and the motivation for the change. Summarize your proposal briefly. - -Next follows the main part: describe your proposal in detail. Which parts of of metal-stack are affected? Are there API changes? If yes, describe them and provide examples here. -Try to think of side effects your proposal might have. Try to provide a view on how your proposal affects users of metal-stack. -Highlight breaking changes and think of a migration path for existing users. If your proposal affects multiple components, try to describe the interaction between them. - -After the main part of your proposal, feel free to add additional sections, e.g. about alternatives that were considered, non-goals or future possibilities. - -Depending on the complexity of your proposal, you might want to add a section about the implementation plan or roadmap. - -You can have a look at the existing MEPs for inspiration. As you will notice: not every MEP has the same structure. 
Feel free to structure your MEP in a way that makes sense for your proposal. diff --git a/versioned_docs/version-v0.22.2/contributing/02-planning-meetings.mdx b/versioned_docs/version-v0.22.2/contributing/02-planning-meetings.mdx deleted file mode 100644 index df10177b..00000000 --- a/versioned_docs/version-v0.22.2/contributing/02-planning-meetings.mdx +++ /dev/null @@ -1,120 +0,0 @@ ---- -slug: /planning-meetings -title: Planning Meetings -sidebar_position: 2 ---- - -# Planning Meetings - -Public planning meetings are held **biweekly** on **odd calendar weeks** from **14:00 to 14:30** (Berlin/Europe timezone) on Microsoft Teams. The purpose is to provide an overview of our current projects and priorities, as well as to discuss new topics and issues within the group. - -export function PlanningMeetingDatesTable() { - const today = new Date(); - const dayOfWeek = today.getDay(); - - let daysUntilMonday = 0; - switch (dayOfWeek) { - case 0: - daysUntilMonday = 1; - break; - case 1: - daysUntilMonday = 0; - break; - default: - daysUntilMonday = 8 - dayOfWeek; - } - - const nextMonday = new Date(); - nextMonday.setDate(nextMonday.getDate() + daysUntilMonday) - - let onejan = new Date(today.getFullYear(), 0, 1); - let week = Math.ceil((((nextMonday.getTime() - onejan.getTime()) / 86400000) + onejan.getDay() + 1) / 7); - - if (week % 2 === 0) { - nextMonday.setDate(nextMonday.getDate() + 7) - } - - const blacklist = [ - new Date('2025-12-29'), - ] - - const amount = 8 - const dates = []; - - for (let i = 0; i < amount; i++) { - const nextDate = new Date(nextMonday); - nextDate.setDate(nextDate.getDate() + (i * 14)) - - if (blacklist.find(item => {return item.toDateString() == nextDate.toDateString()}) !== undefined ) { - continue - } - - dates.push(nextDate.toDateString()) - } - - return ( - - - - - - - - - - {dates.map((date, index) => ( - - - - - - ))} - -
DateTimeLink
{date}14:00 – 14:30Join Link
- ) -} - - - -Our [development planning board](https://github.com/orgs/metal-stack/projects/34) can be found on GitHub. - -[//]: <> (The C025PB1EUKC in the slack url references the #devs channel.) -If you want to get an invitation to the event, please drop us a line on our [Slack channel](https://metal-stack.slack.com/archives/C025PB1EUKC). - -Planning meetings are currently not recorded. The meetings are held either in English or German depending on the attendees. - -:::info -Note that anyone can contribute to metal-stack without participating in planning meetings. However, if you want to speed up the review process for your requirements, it might be helpful to attend the meetings. -::: - -## Agenda - -Here is the agenda that we generally want to follow in a planning meeting: - -- Possibility to bring up news that are interesting for every developer of the metal-stack org -- Check `Done` column and archive cards - - Attendees have the chance to briefly present achievements if they want -- Check the `In Progress` column and discuss whether these tasks are still worked on, there were significant blockers or they can be lower-prioritized -- Check new issues labelled with `triage` and prioritize them -- Allow attendees to bring up issues and prioritize them - - Attendees have the chance to briefly present these new issues - -## Idea Backlog - -The backlog contains ideas of what could become part of the roadmap in the future. The list is ordered alphabetically. Therefore, the order does not express the importance or weight of a backlog item. - -We incorporate community feedback into the roadmap. If you think that important points are missing in the backlog, please share your ideas with us. We have a Slack channel. Please check out [metal-stack.io](https://metal-stack.io) for contact information. - -:::danger -By no means this list is a promise of what is being worked on in the near future. It is just a summary of ideas that was agreed on to be "nice to have". 
It is up to the investors, maintainers and the community to choose topics from this list and to implement them or to remove them from the list. -::: - -- Add metal-stack to [Gardener conformance test grid](https://testgrid.k8s.io/gardener-all) -- Autoscaler for metal control plane components -- CI dashboard and public integration testing -- Improved release and deploy processes (GitOps, [Spinnaker](https://spinnaker.io/), [Flux](https://fluxcd.io/)) -- Machine internet without firewalls -- metal-stack dashboard (UI) -- Offer our metal-stack extensions as enterprise products (accounting, cluster-api, S3) (neither of them will ever be required for running metal-stack, they just add extra value for certain enterprises) -- Partition managed by Kubernetes (with Kubelets joining the control plane cluster) -- Public offering / demo playground diff --git a/versioned_docs/version-v0.22.2/contributing/03-contribution-guideline.md b/versioned_docs/version-v0.22.2/contributing/03-contribution-guideline.md deleted file mode 100644 index 2c0526e3..00000000 --- a/versioned_docs/version-v0.22.2/contributing/03-contribution-guideline.md +++ /dev/null @@ -1,145 +0,0 @@ ---- -slug: /contribution-guideline -title: Contribution Guideline -sidebar_position: 3 ---- - -# Contribution Guideline - -This document describes the way we want to contribute code to the projects of metal-stack, which are hosted on [github.com/metal-stack](https://github.com/metal-stack). - -The document is meant to be understood as a general guideline for contributions, but not as burden to be placed on a developer. Use your best judgment when contributing code. Try to be as clean and precise as possible when writing code and try to make your code as maintainable and understandable as possible for other people. - -Even if it should go without saying, we live an open culture of discussion, in which everybody is welcome to participate. 
We treat every contribution with respect and objectiveness with the general aim to write software of quality. - -If you want, feel free to propose changes to this document in a pull request. - -## How Can I Contribute? - -Open a Github issue in the project you would like to contribute. Within the issue, your idea can be discussed. It is also possible to directly create a pull request when the set of changes is relatively small. - -When opening an issue please consider the following aspects: - -1. Create a meaningful issue describing the WHY? of your contribution. -1. Try to set appropriate labels to the issue. For example, attach the `triage` label to your issue if you want it to be discussed in the next [planning meeting](./02-planning-meetings.mdx). It might be useful to attend the meeting if you want to emphasize it being worked on. - -### Pull Requests - -The process described here has several goals: - -- Maintain quality -- Enable a sustainable system to review contributions -- Enable documented and reproducible addition of contributions - -1. Create a repository fork within the context of that issue. Members of the organization may work on the repository directly without a fork, which allows building development artifacts more easily. -1. Develop, document and test your contribution (try not to solve more than one issue in a single pull request). -1. Create a Draft Pull Request to the repository's main branch. -1. Create a meaningful description of the pull request or reference the related issue. The pull request template explains what the content should include, please read it. -1. Ask for merging your contribution by removing the draft marker. Repository maintainers (see [Code Ownership](#code-ownership)) are notified automatically, but you can also reach out to people directly on Slack if you want a review from a specific person. - -## General Objectives - -This section contains language-agnostic topics that all metal-stack projects are trying to follow. 
- -### Code Ownership - -The code base is owned by the entire team and every member is allowed to contribute changes to any of the projects. This is considered as collective code ownership[^1]. - -As a matter of fact, there are persons in a project, which already have experience with the sources. These are defined directly in the repository's [CODEOWNERS](https://docs.github.com/en/repositories/managing-your-repositorys-settings-and-features/customizing-your-repository/about-code-owners) file. If you want to merge changes into the master branch, it is advisable to include code owners into the process of discussion and merging. - -### Microservices - -One major ambition of metal-stack is to follow the idea of [microservices](https://en.wikipedia.org/wiki/Microservices). This way, we want to achieve that we can - -- adapt to changes faster than with monolithic architectures, -- be free of restrictions due to certain choices of technology, -- leverage powerful traits of cloud infrastructures (e.g. high-scalability, high-availability, ...). - -### Programming Languages - -We are generally open to write code in any language that fits best to the function of the software. However, we encourage [golang](https://en.wikipedia.org/wiki/Go_(programming_language)) to be the main language of metal-stack as we think that it makes development faster when not establishing too many different languages in our architecture. Reason for this is that we are striving for consistent behavior of the microservices, similar to what has been described for the Twelve-Factor App (see [12 Factor](https://12factor.net/)). We help enforcing unified behavior by allowing a small layer of shared code for every programming language. We will refer to this shared code as "libraries" for the rest of this document. - -### Artifacts - -Artifacts are always produced by a CI process (i.e. Github Actions). 
- -Container images and [OCI artifacts](https://github.com/opencontainers/image-spec) are published on the Github Container Registry of the metal-stack organization. Please consider using Github Actions workflows utilizing similar actions as the other repositories (e.g. [build-push-action](https://github.com/docker/build-push-action), ...) - -For OCI images, we usually utilize [oras](https://github.com/oras-project/oras) for pushing the artifact to the registry. - -For signing artifacts we use [cosign](https://github.com/sigstore/cosign). The private key for signing artifacts is a CI secret called `COSIGN_PRIVATE_KEY`. - -Binary artifacts or OS images can be uploaded to `images.metal-stack.io` if necessary. - -### APIs - -The preferred way to implement an API is using [Connect RPC](https://connectrpc.com/), which is based on [grpc](https://grpc.io/). For working with the [Protobuf](https://protobuf.dev/) definitions, we utilize [buf](https://github.com/bufbuild/buf). - -The metal-api does still have a [Swagger-based](https://swagger.io/) API exposing traditional REST APIs for end-users. This API framework will become deprecated so it should not be used anymore for new projects. - -#### Versioning - -Artifacts are versioned by tagging the respective repository with a tag starting with the letter `v`. After the letter, there stands a valid [semantic version](https://semver.org/). - -### Documentation - -In order to make it easier for others to understand a project, we document general information and usage instructions in a `README.md` in any project. - -In addition to that, we document a microservice in the [docs](https://github.com/metal-stack/docs) repository. The documentation should contain the reasoning why this service exists and why it was being implemented the way it was being implemented. 
The aim of this procedure is to reduce the time for contributors to comprehend architectural decisions that were made during the process of writing the software and to clarify the general purpose of this service in the entire context of the software. - -## Guidelines - -This chapter describes general guidelines on how to develop and contribute code for a certain programming language. - -### Golang - -Development follows the official guide to: - -- Write clear, idiomatic Go code[^2] -- Learn from mistakes that must not be repeated[^3] -- Apply appropriate names to your artifacts: - - [https://go.dev/talks/2014/names.slide](https://go.dev/talks/2014/names.slide) - - [https://go.dev/blog/package-names](https://go.dev/blog/package-names) - - [https://go.dev/doc/effective_go#names](https://go.dev/doc/effective_go#names) -- Enable others to understand the reasoning of non-trivial code sequences by applying a meaningful documentation. - -#### Development Decisions - -- **Dependency Management** by using Go modules -- **Build and Test Automation** by using [GNU Make](https://man7.org/linux/man-pages/man1/make.1p.html). -- **APIs** should consider using [buf](https://github.com/bufbuild/buf) - -#### Libraries - -metal-stack maintains libraries that you can utilize in your project in order to unify common behavior. The main project that does this is called [metal-lib](https://github.com/metal-stack/metal-lib). - -#### Error Handling with Generated Swagger Clients - -From the server-side you should ensure that you are returning the common error json struct in case of an error as defined in the `metal-lib/httperrors`. Ensure you are using `go-restful >= v2.9.1` and `go-restful-openapi >= v0.13.1` (allows default responses with error codes other than 200). - -### Documentation - -We want to share knowledge and keep things simple. If things cannot kept simple we want to enable everybody to understand them by: - -- Document in short sentences[^4]. 
-- Do not explain the HOW (this is already documented by your code and documenting the obvious is considered a defect). -- Explain the WHY. Add a "to" in your documentation line to force yourself to explain the reasonning (e.g. "` to `"). - -### Python - -Development follows the official guide to: - -- Style Guide for Python Code (PEP 8)[^5] - - The use of an IDE like [PyCharm](https://www.jetbrains.com/pycharm/) helps to write compliant code easily -- Consider [setuptools](https://pythonhosted.org/an_example_pypi_project/setuptools.html) for packaging -- If you want to add a Python microservice to the mix, consider [pyinstaller](https://github.com/pyinstaller/pyinstaller) on Alpine to achieve small image sizes - -[^1]: [https://martinfowler.com/bliki/CodeOwnership.html](https://martinfowler.com/bliki/CodeOwnership.html) - -[^2]: [https://go.dev/doc/effective_go](https://go.dev/doc/effective_go) - -[^3]: [https://github.com/golang/go/wiki/CodeReviewComments](https://github.com/golang/go/wiki/CodeReviewComments) - -[^4]: [https://github.com/golang/go/wiki/CodeReviewComments#comment-sentences](https://github.com/golang/go/wiki/CodeReviewComments#comment-sentences) - -[^5]: [https://www.python.org/dev/peps/pep-0008/](https://www.python.org/dev/peps/pep-0008/) diff --git a/versioned_docs/version-v0.22.2/contributing/04-release-flow.md b/versioned_docs/version-v0.22.2/contributing/04-release-flow.md deleted file mode 100644 index 744d9274..00000000 --- a/versioned_docs/version-v0.22.2/contributing/04-release-flow.md +++ /dev/null @@ -1,100 +0,0 @@ ---- -slug: /release-flow -title: Release Flow -sidebar_position: 4 ---- - -# Releases - -The metal-stack contains of many microservices that depend on each other. The automated release flow is there to ensure that all components work together flawlessly for every metal-stack release. - -Releases and integration tests are published through our [release repository](https://github.com/metal-stack/releases). 
You can also find the [release notes](https://github.com/metal-stack/releases/releases) for this metal-stack version in there. The release notes contain information about new features, upgrade paths and bug fixes. - -If you want, you can sign up at our Slack channel where we are announcing every new release. Often, we provide additional information for metal-stack administrators and adopters at this place, too. - -This document is intended for developers, especially maintainers of metal-stack projects. - -## Release Flow - -The following diagram attempts to describe our current release flow: - -![](release_flow.svg) - -A release is created in the following way: - -- Individual repository maintainers within the metal-stack GitHub Organization can publish a release of their component. -- This release is automatically pushed to the `develop` branch of the release repository by the metal-robot. -- A push triggers a virtual release integration test using the mini-lab environment. This setup launches metal-stack with the `sonic` and `gardener` flavors to validate the different Ansible roles and execute basic operations across the metal-stack layer. -- To contribute components that are not directly part of the release vector, a pull request must be made against the `develop` branch of the release repository. Release maintainers may push directly to the `develop` branch. -- The release maintainers can `/freeze` the `develop` branch, effectively stopping the metal-robot from pushing component releases to this branch. -- The `develop` branch is tagged by a release maintainer with a `-rc.x` suffix to create a __release candidate__. -- The release candidate must pass a large integration test suite on a real environment, which is currently run by FI-TS. It tests the entire machine provisioning engine including the integration with Gardener, the deployment, metal-images and Kubernetes conformance tests. 
-- If the integration tests pass, the PR of the `develop` branch must be approved by at least two release maintainers. -- A release is created via GitHub releases, including all release notes, with a tag on the `main` branch. - -## FAQ - -**Question: I need PR #xyz to go into the release, why did you not include it?** - -Answer: It's not on purpose if we miss a PR to be included into a metal-stack release. Please use the pending pull request from `develop` into `master` as soon as it is open and comment which pull request you want to have included into the release. Also consider attending our planning meetings or contact us in our Slack channel if you have urgent requirements that need to be dealt with. - -**Question: Who is responsible for the releases? Who can freeze a release?** - -Answer: Every repository in metal-stack has a `CODEOWNERS` file pointing to a maintainer team. This is also true for the releases repository. Only release repository maintainers are allowed to `/freeze` a release (meaning the metal-robot does not automatically append new component releases to the release vector anymore). - -**Question: I can't push to the `develop` branch of this repository? How can I request changes to the release vector?** - -Answer: Most changes are automatically integrated by the metal-robot. For manually managed components, please raise a pull request against the `develop` branch. Only release maintainers are allowed to push to `develop` as otherwise it would be possible to mess up the release pipeline. - -**Question: What requirements need to be fulfilled to add a repository to the release vector?** - -Please see the section below named [Requirements for Release Vector Repositories](#requirements-for-release-vector-repositories). 
- -### Requirements for Release Vector Repositories - -Before adding a repository in the metal-stack org to the releases repository, it is advised for the maintainer to fulfill the following points: - -- The following files should be present at the repository root: - - [CODEOWNERS](https://docs.github.com/en/repositories/managing-your-repositorys-settings-and-features/customizing-your-repository/about-code-owners) - - When a repository is created, the metal-robot automatically creates a -maintainers team in our GitHub org. - - The CODEOWNERS file should reference this team. - - The team should contain at least two maintainers. - - `LICENSE` - - This usually should be MIT with "metal-stack" as authors. - - `README.md` -- The `developers-core` team should be given repository access with `write` role, the codeowners team should have the `maintain` role -- Release artifacts should have an SPDX-formatted SBOM attached. - - For container images these are embedded using Buildx. -- The following branch protection rules should be set: - - The mainline should be protected. - - A pull request should be required before merging (required by at least one code owner). - - Status checks should be required to pass. - - Force push should not be allowed on this branch. -- One person from the releases maintainers has to add the repository to the metal-robot in order to pick up the releases, add them to the release vector and generate release notes. - -### How-To Release a Project - -[release-drafter](https://github.com/release-drafter/release-drafter) is preferred in order to generate release notes from merged PRs for your projects. It should be triggered for pushes on your main branch. - -The draft is then used to create a project release. The release has to be published through the GitHub UI as demonstrated in the screenshot below. 
- -**Tagging the repository is not enough as repository tagging does not associate your release notes to your release!** - -![](release.png) - -Some further remarks: - -- Use semver versions with `v` prefix for your tags -- Name your release after your release tag -- The metal-robot only picks up lines from your release notes that start with `-` or `*` (unordered list items) and appends them to the according section in the aggregated release draft -- A tag created through a GitHub UI release does not trigger a `push` event . This means, your pipeline will not start to run with the `push` trigger when publishing through the UI. - - Instead, use the `published` [release event trigger](https://docs.github.com/en/actions/reference/events-that-trigger-workflows#release) for your actions: - - ```yaml - on: - release: - types: - - published - ``` -- In case they are necessary, please do not forget to include `NOTEWORTHY`, `ACTIONS_REQUIRED` or `BREAKING_CHANGE` sections into releases. More information on those release draft sections can be read in a pull request template. 
diff --git a/versioned_docs/version-v0.22.2/contributing/05-community.md b/versioned_docs/version-v0.22.2/contributing/05-community.md deleted file mode 100644 index 61eaf099..00000000 --- a/versioned_docs/version-v0.22.2/contributing/05-community.md +++ /dev/null @@ -1,11 +0,0 @@ ---- -slug: /community -title: Community -sidebar_position: 5 -draft: true ---- - -# Community - -(Slack channel, community events like FOSDEM, Kubernetes Community Days..., blog -articles) diff --git a/versioned_docs/version-v0.22.2/contributing/release.png b/versioned_docs/version-v0.22.2/contributing/release.png deleted file mode 100644 index 598b1182..00000000 Binary files a/versioned_docs/version-v0.22.2/contributing/release.png and /dev/null differ diff --git a/versioned_docs/version-v0.22.2/contributing/release_flow.drawio b/versioned_docs/version-v0.22.2/contributing/release_flow.drawio deleted file mode 100644 index 6ca6b34f..00000000 --- a/versioned_docs/version-v0.22.2/contributing/release_flow.drawio +++ /dev/null @@ -1,721 +0,0 @@ - - - - - - - - - - - -
-
-
- Review release notes -
-
-
-
- - Review release notes - -
-
-
- - - - - - - -
-
-
- projects -
-
-
-
- - projects - -
-
-
- - - - - - - -
-
-
- projects -
-
-
-
- - projects - -
-
-
- - - - - - - - -
-
-
- Organization Webhook -
-
-
-
- - Organization Webhook - -
-
-
- - - - - - - -
-
-
- projects -
-
-
-
- - projects - -
-
-
- - - - - - - - -
-
-
- - Publish release - -
-
-
-
- - Publish release - -
-
-
- - - - - - - - -
-
-
- Maintainer -
-
-
-
- - Maint... - -
-
-
- - - - - - - - - - - - - - - -
-
-
- metal-robot release handler -
-
-
-
- - metal-robot release han... - -
-
-
- - - - - - - - -
-
-
- - no - -
-
-
-
- - no - -
-
-
- - - - - - - - -
-
-
- - yes - -
-
-
-
- - yes - -
-
-
- - - - - - - -
-
-
- version in event newer than release vector version -
-
-
-
- - version in event newer than... - -
-
-
- - - - - - - -
-
-
- - do nothing - -
-
-
-
- - do nothing - -
-
-
- - - - - - - - - - - - -
-
-
- Github Action -
-
-
-
- - Github Action - -
-
-
- - - - - - - -
-
-
- Bump version in release vector and push to - - develop - -
-
-
-
- - Bump version in release vector... - -
-
-
- - - - - - - - - - - -
-
-
- Open pull request from - - develop - - to - - master - -
-
-
-
- - Open pull request from develop... - -
-
-
- - - - - - - -
-
-
- Update aggregated release draft in - - metal-stack/releases - -
-
-
-
- - Update aggregated release draf... - -
-
-
- - - - - - - - - - - - - - - -
-
-
- Integration Testing -
-
-
-
- - Integration Testing - -
-
-
- - - - - - - - - - - -
-
-
- Merge to - - master - -
-
-
-
- - Merge to master - -
-
-
- - - - - - - - - - - - -
-
-
- Review -
-
-
-
- - Review - -
-
-
- - - - - - - - - - - - - - - -
-
-
- Tests suceeded and PR changes reviewed -
-
-
-
- - Tests suceeded and PR chang... - -
-
-
- - - - - - - -
-
-
- - publish results to #integration - -
-
-
-
- - publish results to #integr... - -
-
-
- - - - - - - - - - - - - - - -
-
-
- Release metal-stack -
-
-
-
- - Release metal-stack - -
-
-
- - - - - - - - - - - -
-
-
- - publish to #announcements - -
-
-
-
- - publish to #announcements - -
-
-
- - - - - - - -
-
-
- - - metal-stack/docs - - pull request - -
-
-
-
- - metal-stack/docs pull requ... - -
-
-
- - - - - - - - - - - - -
-
-
- Freeze -
-
-
-
- - Freeze - -
-
-
- - - - - - - - - - - - - - - -
-
-
- Freeze - - develop - - and create a release candidate -
-
-
-
- - Freeze develop and create a rel... - -
-
-
- - - - - - - -
-
-
- Large integration suites -
- - (currently owned by FI-TS, not public) - -
-
-
-
-
- - Large integration suites... - -
-
-
- - - - - - - - -
-
-
- Run -
-
-
-
- - Run - -
-
-
- - - - -
- - - - - Text is not SVG - cannot display - - - -
diff --git a/versioned_docs/version-v0.22.2/contributing/release_flow.svg b/versioned_docs/version-v0.22.2/contributing/release_flow.svg deleted file mode 100644 index 55cdd493..00000000 --- a/versioned_docs/version-v0.22.2/contributing/release_flow.svg +++ /dev/null @@ -1 +0,0 @@ -
Review release notes
Review release notes
projects
projects
projects
projects
Organization Webhook
Organization Webhook
projects
projects
Publish release
Publish release
Maintainer
Maint...
metal-robot release handler
metal-robot release han...
no
no
yes
yes
version in event newer than release vector version
version in event newer than...
do nothing
do nothing
Github Action
Github Action
Bump version in release vector and push todevelop
Bump version in release vector...
Open pull request fromdeveloptomaster
Open pull request from develop...
Update aggregated release draft inmetal-stack/releases
Update aggregated release draf...
Integration Testing
Integration Testing
Merge tomaster
Merge to master
Review
Review
Tests suceeded and PR changes reviewed
Tests suceeded and PR chang...
publish results to #integration
publish results to #integr...
Release metal-stack
Release metal-stack
publish to #announcements
publish to #announcements
metal-stack/docspull request
metal-stack/docs pull requ...
Freeze
Freeze
Freezedevelopand create a release candidate
Freeze develop and create a rel...
Large integration suites
(currently owned by FI-TS, not public)
Large integration suites...
Run
Run
Text is not SVG - cannot display
\ No newline at end of file diff --git a/versioned_docs/version-v0.22.2/docs/02-General/04-flavors-of-metalstack.md b/versioned_docs/version-v0.22.2/docs/02-General/04-flavors-of-metalstack.md index 7da427fc..2277ca6b 100644 --- a/versioned_docs/version-v0.22.2/docs/02-General/04-flavors-of-metalstack.md +++ b/versioned_docs/version-v0.22.2/docs/02-General/04-flavors-of-metalstack.md @@ -14,7 +14,7 @@ As modern infrastructure and cloud native applications are designed with Kuberne Regardless which flavor of metal-stack you use, it is always possible to manually provision machines, networks and ip addresses. This is the most basic way of using metal-stack and is very similar to how traditional bare metal infrastructures are managed. -Using plain metal-stack without additional layer was not a focus in the past. Therefore firewall and role management might be premature. These will be addressed by [MEP-4](../../contributing/01-Proposals/MEP4/README.md) and [MEP-16](../../contributing/01-Proposals/MEP16/README.md) in the future. +Using plain metal-stack without additional layer was not a focus in the past. Therefore firewall and role management might be premature. These will be addressed by [MEP-4](/community/MEP-4-multi-tenancy-for-the-metal-api) and [MEP-16](/community/MEP-16-metal-api-as-an-alternative-configuration-source-for-the-firewall-controller) in the future. ## Gardener diff --git a/versioned_docs/version-v0.22.2/docs/04-For Operators/03-deployment-guide.mdx b/versioned_docs/version-v0.22.2/docs/04-For Operators/03-deployment-guide.mdx index 58ddafd3..6be800cd 100644 --- a/versioned_docs/version-v0.22.2/docs/04-For Operators/03-deployment-guide.mdx +++ b/versioned_docs/version-v0.22.2/docs/04-For Operators/03-deployment-guide.mdx @@ -31,7 +31,7 @@ You can use the [mini-lab](https://github.com/metal-stack/mini-lab) as a templat The metal control plane is typically deployed in a Kubernetes cluster. 
Therefore, this document will assume that you have a Kubernetes cluster ready for getting deployed. Even though it is theoretically possible to deploy metal-stack without Kubernetes, we strongly advise you to use the described method because we believe that Kubernetes gives you a lot of benefits regarding the stability and maintainability of the application deployment. :::tip -For metal-stack it does not matter where your control plane Kubernetes cluster is located. You can of course use a cluster managed by a hyperscaler. This has the advantage of not having to setup Kubernetes by yourself and could even become beneficial in terms of fail-safe operation. However, we also describe a solution of how to setup metal-stack with a self-hosted, [Autonomous Control Plane](../../contributing/01-Proposals/MEP18/README.md) cluster. The only requirement from metal-stack is that your partitions can establish network connections to the metal control plane. If you are interested, you can find a reasoning behind this deployment decision [here](../05-Concepts/01-architecture.mdx#target-deployment-platforms). +For metal-stack it does not matter where your control plane Kubernetes cluster is located. You can of course use a cluster managed by a hyperscaler. This has the advantage of not having to setup Kubernetes by yourself and could even become beneficial in terms of fail-safe operation. However, we also describe a solution of how to setup metal-stack with a self-hosted, [Autonomous Control Plane](/community/MEP-18-autonomous-control-plane) cluster. The only requirement from metal-stack is that your partitions can establish network connections to the metal control plane. If you are interested, you can find a reasoning behind this deployment decision [here](../05-Concepts/01-architecture.mdx#target-deployment-platforms). 
::: Let's start off with a fresh folder for your deployment: diff --git a/versioned_docs/version-v0.22.2/docs/05-Concepts/01-architecture.mdx b/versioned_docs/version-v0.22.2/docs/05-Concepts/01-architecture.mdx index 709960e3..75298df9 100644 --- a/versioned_docs/version-v0.22.2/docs/05-Concepts/01-architecture.mdx +++ b/versioned_docs/version-v0.22.2/docs/05-Concepts/01-architecture.mdx @@ -152,4 +152,4 @@ Thus, for creating a partition as well as a machine or a firewall, the flags `dn In order to be fully offline resilient, make sure to check out `metal-image-cache-sync`. This component provides copies of `metal-images`, `metal-kernel` and `metal-hammer`. -This feature is related to [MEP14](../../contributing/01-Proposals/MEP14/README.md). +This feature is related to [MEP14](/community/MEP-14-independence-from-external-sources). diff --git a/versioned_docs/version-v0.22.2/docs/05-Concepts/02-user-management.md b/versioned_docs/version-v0.22.2/docs/05-Concepts/02-user-management.md index f1ee2778..ba742ee9 100644 --- a/versioned_docs/version-v0.22.2/docs/05-Concepts/02-user-management.md +++ b/versioned_docs/version-v0.22.2/docs/05-Concepts/02-user-management.md @@ -7,7 +7,7 @@ sidebar_position: 2 # User Management At the moment, metal-stack can more or less be seen as a low-level API that does not scope access based on projects and tenants. -Fine-grained access control with full multi-tenancy support is actively worked on in [MEP4](../../contributing/01-Proposals/MEP4/README.md). +Fine-grained access control with full multi-tenancy support is actively worked on in [MEP4](/community/MEP-4-multi-tenancy-for-the-metal-api). Until then projects and tenants can be created, but have no effect on access control. 
diff --git a/versioned_docs/version-v0.22.2/docs/06-For CISOs/Security/01-principles.md b/versioned_docs/version-v0.22.2/docs/06-For CISOs/Security/01-principles.md index 8e7030f5..e327ec4a 100644 --- a/versioned_docs/version-v0.22.2/docs/06-For CISOs/Security/01-principles.md +++ b/versioned_docs/version-v0.22.2/docs/06-For CISOs/Security/01-principles.md @@ -15,7 +15,7 @@ The minimal need to know principle is a security concept that restricts access t ### RBAC :::info -As of now metal-stack does not implement fine-grained Role-Based Access Control (RBAC) within the `metal-api` but this is worked on in [MEP-4](../../../contributing/01-Proposals/MEP4/README.md). +As of now metal-stack does not implement fine-grained Role-Based Access Control (RBAC) within the `metal-api` but this is worked on in [MEP-4](..//community/MEP-4-multi-tenancy-for-the-metal-api). ::: As described in our [User Management](../../05-Concepts/02-user-management.md) concept the [metal-api](https://github.com/metal-stack/metal-api) currently offers three different user roles for authorization: diff --git a/versioned_docs/version-v0.22.2/docs/06-For CISOs/Security/04-communication-matrix.md b/versioned_docs/version-v0.22.2/docs/06-For CISOs/Security/04-communication-matrix.md index 07df2607..24c1bc1d 100644 --- a/versioned_docs/version-v0.22.2/docs/06-For CISOs/Security/04-communication-matrix.md +++ b/versioned_docs/version-v0.22.2/docs/06-For CISOs/Security/04-communication-matrix.md @@ -116,7 +116,7 @@ Please note that every [networking setup](../../05-Concepts/03-Network/01-theory | VLAN | Switches, Firewalls | Layer 2 traffic segmentation. | | VXLAN | Switches, Firewalls | Encapsulate Layer 2 frames in Layer 3 packets for network virtualization. | | EVPN | Switches, Firewalls | Overlay network technology for scalable and flexible network architectures. | -| VPN | Firewalls | Management access [without open SSH ports](../../../contributing/01-Proposals/MEP9/README.md). 
| +| VPN | Firewalls | Management access [without open SSH ports](..//community/MEP-9-no-open-ports-to-the-data-center). | | BGP | Multiple | Routing protocol for dynamic routing and network management. | | SSH | Management Server, Switches | Secure shell access for management and configuration. | | LLDP | Switches, Machines | Link Layer Discovery Protocol for network device discovery. | diff --git a/versioned_docs/version-v0.22.2/docs/06-For CISOs/rbac.md b/versioned_docs/version-v0.22.2/docs/06-For CISOs/rbac.md index 9a87b896..06c902bb 100644 --- a/versioned_docs/version-v0.22.2/docs/06-For CISOs/rbac.md +++ b/versioned_docs/version-v0.22.2/docs/06-For CISOs/rbac.md @@ -31,4 +31,4 @@ To ensure that internal components interact securely with the metal-api, metal-s Users can interact with the metal-api using [metalctl](https://github.com/metal-stack/metalctl), the command-line interface provided by metal-stack. Depending on the required operations, users should authenticate with the appropriate role to match their level of access. -As part of [MEP-4](../../contributing/01-Proposals/MEP4/README.md), significant work is underway to introduce more fine-grained access control mechanisms within metal-stack, enhancing the precision and flexibility of permission management. +As part of [MEP-4](/community/MEP-4-multi-tenancy-for-the-metal-api), significant work is underway to introduce more fine-grained access control mechanisms within metal-stack, enhancing the precision and flexibility of permission management. 
diff --git a/versioned_docs/version-v0.22.2/docs/06-For CISOs/remote-access.md b/versioned_docs/version-v0.22.2/docs/06-For CISOs/remote-access.md index 0b8dbb19..dc24e82f 100644 --- a/versioned_docs/version-v0.22.2/docs/06-For CISOs/remote-access.md +++ b/versioned_docs/version-v0.22.2/docs/06-For CISOs/remote-access.md @@ -6,7 +6,7 @@ title: Remote Access ## Machines and Firewalls -Remote access to machines and firewalls is essential for performing administrative tasks such as incident management, troubleshooting and sometimes for development. Standard SSH access is often insufficient for these purposes. In many cases, direct serial console access is required to fully manage the system. metal-stack follows a security-first approach by not offering direct SSH access to machines. This practice reduces the attack surface and prevents unauthorized access that could lead to system damage. Detailed information can be found in [MEP-9](../../contributing/01-Proposals/MEP9/README.md). Administrators can access machines in two primary ways. +Remote access to machines and firewalls is essential for performing administrative tasks such as incident management, troubleshooting and sometimes for development. Standard SSH access is often insufficient for these purposes. In many cases, direct serial console access is required to fully manage the system. metal-stack follows a security-first approach by not offering direct SSH access to machines. This practice reduces the attack surface and prevents unauthorized access that could lead to system damage. Detailed information can be found in [MEP-9](/community/MEP-9-no-open-ports-to-the-data-center). Administrators can access machines in two primary ways. 
**Out-of-band management via SOL** @@ -26,4 +26,4 @@ This approach uses the [`metal-console`](../08-References/Control%20Plane/metal- Both methods ensure secure and controlled access to machines without exposing them unnecessarily to the network, maintaining the integrity and safety of the infrastructure. -Connecting directly to a machine without a clear plan of action can have unintended consequences and negatively impact stability. For this reason, administrative privileges are required. This restriction ensures that only authorized personnel with the necessary expertise can perform actions that affect the underlying infrastructure. These principles will evolve with the introduction of [MEP-4](../../contributing/01-Proposals/MEP4/README.md). \ No newline at end of file +Connecting directly to a machine without a clear plan of action can have unintended consequences and negatively impact stability. For this reason, administrative privileges are required. This restriction ensures that only authorized personnel with the necessary expertise can perform actions that affect the underlying infrastructure. These principles will evolve with the introduction of [MEP-4](/community/MEP-4-multi-tenancy-for-the-metal-api). 
\ No newline at end of file diff --git a/versioned_docs/version-v0.22.3/contributing/01-Proposals/MEP1/Distributed-API-Working.png b/versioned_docs/version-v0.22.3/contributing/01-Proposals/MEP1/Distributed-API-Working.png deleted file mode 100644 index 899e223d..00000000 Binary files a/versioned_docs/version-v0.22.3/contributing/01-Proposals/MEP1/Distributed-API-Working.png and /dev/null differ diff --git a/versioned_docs/version-v0.22.3/contributing/01-Proposals/MEP1/Distributed-API.png b/versioned_docs/version-v0.22.3/contributing/01-Proposals/MEP1/Distributed-API.png deleted file mode 100644 index 688c7c2e..00000000 Binary files a/versioned_docs/version-v0.22.3/contributing/01-Proposals/MEP1/Distributed-API.png and /dev/null differ diff --git a/versioned_docs/version-v0.22.3/contributing/01-Proposals/MEP1/Distributed-Deployment.png b/versioned_docs/version-v0.22.3/contributing/01-Proposals/MEP1/Distributed-Deployment.png deleted file mode 100644 index 8bba51b8..00000000 Binary files a/versioned_docs/version-v0.22.3/contributing/01-Proposals/MEP1/Distributed-Deployment.png and /dev/null differ diff --git a/versioned_docs/version-v0.22.3/contributing/01-Proposals/MEP1/Distributed.drawio b/versioned_docs/version-v0.22.3/contributing/01-Proposals/MEP1/Distributed.drawio deleted file mode 100644 index f7c6fe79..00000000 --- a/versioned_docs/version-v0.22.3/contributing/01-Proposals/MEP1/Distributed.drawio +++ /dev/null @@ -1 +0,0 @@ 
-7V3bcts2EP0aP8pDgPfH2ImbziRTt+m0zVMHpmCJMUUoFG1L/fqC4kUkAEoULwDkRC8WIBIkF2d3z+6C8JV5u9r+kqD18jOZ4+gKGvPtlfn+CtKPY9M/Wc8u7/FtM+9YJOE87wKHji/hf7joNIre53CON40DU0KiNFw3OwMSxzhIG30oSchr87BHEjWvukYLzHV8CVDE9/4dztNl3uvZxqH/Iw4Xy/LKwCh+eUDB0yIhz3FxvStoPu4/+c8rVI5VHL9Zojl5rXWZH67M24SQNP+22t7iKJNtKbb8vLuWX6v7TnCcdjkhWVoPJP5zF3/w/trOvG+vv70+z4rJe0HRMy4fw4noeDePhA5L5YmC/Afn+3N2pzcfcfSC0zBAh67s4dJd1DwuO3+22U/3O3oAgOtt/Qxnkf29R0kapiGJy6vS288vnP9eSK4aHFIhrrOvz6voLkEr+vXmdRmm+Ms6v89XClLat0xXEW0B+nU/QziTgUFb1SRkjYCswqD4HqEHHN1UU3pLIpLQn2ISZ9fYpAl5qvABise7Q6swymD/F07mKEZFd4FxkA2LonAR00ZAJwkn1RO94CTF29aJBBU8qNphssJpsqOHFCfMbLeAVKFywC0x+VpDcNm3rKHX9opOVGjNohr9gBz6pQDPGUACkEPSBif0OWkfHGEayyfzMqlWj2QaI4kUcCK1eJGajkikYASRrrboX/K79Qh+vYe77+Ef8RMBM4+TaECCp4SgYClDoJz0BDJuF6jFCNQ0O8oTGiPI055D4NsPc8+Yo0cAwMy0OJHhOfUDRZMk6ZIsSIyiD4dexnTUZDdHm+W+H3SwHNTE3YXZne5HwfH8Xea1souucZz3NH+v24+OZqZ1xrKHPDpfCY5QGr40naFI9PtT6a2jXe2ANQnjdFMb+T7rOMDAAoxaGdBvOqkzT6Bf8nsQn256LadXd7whz0mAi9MYQFVi6a+zrsCfDsTd4ZhPhKwL0H3DaborEICeU9LE5x50JcyCCG02mZ9rYBEcQ00upCOPWdAGOt4Cp0eOc8ZG4SCDypOdmkE1ADdTpVGlPGFNtTkTUuXQI/yYNTfUvobx4tO+9d50xrKeVhPHlsDBAyiwns5Uzsg5Krt2Dy9fdpUMVMgOuCh4QLZredg2fedhBji5MQT7bObckZJzBNt498OQ7HKm3Wm4jW0zXsbmEWaL6LdlTqWegLdesv1MC+Hp72T8SSgMxxkoN2yhquUYuZubjDP4nImgI6JohtahRmbVYsxqlcBR5pLKkPMtYb4U6uSgh23xmSTQlw9aRz3apJmNT5FGsIeedrA3j1ExzfMC0G6BnZS0gFieloCivcGYDXQN2oBeURu4mLCNtRXqozZwMWEbSy80kJ0ol6ModLv5GbqNgjIuza9B5OYp9zbjs1hJoRub7qVs4tqofWBzwKkp7UUEcmx6TD2hrQrkb0gDJqq/8PUSnk8r1IAKjXoHdWx2XQMVgAKuwerEoXLIhAd8dw3neBum/2R4vva8ovl137SL1vttgfZ9Y1dr3OMkpM+X+eWiNkmfNR8LOGW7NljWPIy2b+3qLXa8TurVllE/Hca4HVWw4fv5SS/7hmZc2KyxYzNoailNCkYymJHY2HhqNb/kDAS3MgFUpFBdDgL+IDkI2DUHAfXKQcCLyUFwpWMABSuZJHu3i8lCcMVjHaR3Mg8hbY3mzxLyVCVkv3Miwp0MZ9omIg4UtuSsX2vkVsxfB/goVXXnAxGRxeMuImHBEzZDoCxybXKZDdRPV/rj3pSUsuBKz9Jxb15GmoKiTD/g84mKywn9gK9f5GfysbRy0zJF5FcuwD8Z+Zn22GZo2PzwkbmmkV/1opk+oYt5PGzWKPCzWFOrgfD4qPln/fnC4z4AtAv7LNEKddYB/Zilh6PpmNOOrGsKU1H9AVg+g6neBQhQJR0lMXhLVIGIN4QCVhuXwr9TKqTvIm2fzKdYGr6f1vqiZKXxdkPhT6h7f822Or/VZnXU7IE
qa9sN/Hjsy9tTKxmfHvoJlrPB4koCi8nSf8Pyr53Dx5WLHZ75o3V4nacX6ewFT9ch0chWM6badQSW2pVpqUvd11DXpGbj7dELwS3qw754LjspafPhnobJeMC9YK88Jeko8Uo1j+Oe5XL0UzGna0RTsu7JKwSA8WU+u8fKxLs4OKauxrd1lk/zEElvFtpmv7k7OdCe0Hjm4WNLtc8Onwiu7POMzn2WW9DGTHM1U19Q6QCmVDMtWgS0D9mv12WmcfZwiiHS50+bWjOCtNglU1WgVRMWFMXpk70U4vBxOi8spERYM2hoJy1tV64McMqSVqFwhQ/ZvNfhsww6FuNN/RahlHekcxKUyxSrz4G6auOFqtGtejEtKZS31o0tfPVlhTPTsBlEWdkrTwda6HQyX+fuZMc/QQHa9hvltrLzGmc0t7IbbQM6vjKSJd6Uswb2VU31rMG9JELP5l3U83lXSYJSiRmdPPdosablaKDb1VTyywfdPgH0uZaSf5rjhpJbBi3HTvLhaNNOqglF2bWxUs2keGNnTk7Vvs7ti9+02dfZZsEld19noUQhJ1EVlvSsFZ+MBTwZ0gqfu2AmdVIqPM4aaGQHTc7RV1tHXu45ENoMvxRuZjJZRNo+c5KWew4SHrcBgHLRqdkEY0TtFhSRhMf5KrUb8gost1bYY3WK1NnJNxdUNT181nuaEvi4hhf4ULn1UJvTOrcG3r++PYoy+BehDNLy4sO0wWTLtOq1QZMdPUdELOjKnZUittxtUpkVgr2sEKjZoNKSyTBDnSd1aEDUK43DRheGb9cBcvL4MhvZdrx7/PjBWZ+jIq9ZBmo4E7KlkqHgNUH+2tGpF1rVcw4otfwoliX//6mUqH/FJdzuZKI3cxlT/btycqX5EMGmlhdKNaESrtl5lod67l5GnjPC7P+QZIuaFjx+xkRmmw8ML8Jss2km9Ua73GjuMhIgvWz7iMor2q7uCEDHXzbB/vMJg90zcrzFWWIB6OHjVUwpHDqlw/SUf39Kx1QYYMtr6oN/qDoI7ctPFJm4zpnhiUycrdrECZLOGubZ9FO0Mu/3hnxD18SwWtdwZNe+KdatjVws8UTAtaQCF7414JYb2p0mNbZK5Ir23dMXuRy38UT/JmAk5NJmQrLtlw6uLUHr5Wcyx9kR/wM= \ No newline at end of file diff --git a/versioned_docs/version-v0.22.3/contributing/01-Proposals/MEP1/Distributed.png b/versioned_docs/version-v0.22.3/contributing/01-Proposals/MEP1/Distributed.png deleted file mode 100644 index d96ca216..00000000 Binary files a/versioned_docs/version-v0.22.3/contributing/01-Proposals/MEP1/Distributed.png and /dev/null differ diff --git a/versioned_docs/version-v0.22.3/contributing/01-Proposals/MEP1/README.md b/versioned_docs/version-v0.22.3/contributing/01-Proposals/MEP1/README.md deleted file mode 100644 index 0fd4bb63..00000000 --- a/versioned_docs/version-v0.22.3/contributing/01-Proposals/MEP1/README.md +++ /dev/null @@ -1,141 +0,0 @@ ---- -slug: /MEP-1-distributed-metal-control-plane -title: MEP-1 -sidebar_position: 1 ---- - -# Distributed Metal Control Plane - -This enhancement 
proposal was replaced by [MEP18](../MEP18/README.md). - -## Problem Statement - -We face the situation that we argue for running bare metal on-premises because this way the customers can control where and how their software and data are processed and stored. -On the other hand, we have currently decided that our metal-api control plane components run on a kubernetes cluster (in our case on a cluster provided by one of the available hyperscalers). - -Running the control plane on Kubernetes has the following benefits: - -- Ease of deployment -- Get most, if not all, of the required infrastructure services like (probably incomplete): - - IPs - - DNS - - L7-Loadbalancing - - Storage - - S3 Backup - - High Availability - -Using a kubernetes as a service offering from one of the hyperscalers, enables us to focus on using kubernetes instead of maintaining it as well. - -## Goal - -It would be much saner if metal-stack has no, or only minimal dependencies to external services. Imagine a metal-stack deployment in a plant, it would be optimal if we only have to deliver a single rack with servers and networking gear installed and wired, plug that rack to the power supply and a internet uplink and its ready to go. - -Have a second plant which you want to be part of all your plants? Just tell both that they are part of something bigger and metal-api knows of two partitions. - -## Possible Solutions - -We can think of two different solutions to this vision: - -1. Keep the central control plane approach and require some sort of kubernetes deployment accessible from the internet. This has the downside that the user must, provide a managed kubernetes deployment in his own datacenter or uses a hyperscaler. Still not optimal. -1. Install the metal-api and all its dependencies in every partition, replicate or shard the databases to every connected partition, make them know each other. 
Connect the partitions over the internet with some sort of vpn to make the services visible to each other. - -As we can see, the first approach does not really address the problem, therefore i will describe solution #2 in more details. - -## Central/Current setup - -### Stateful services - -Every distributed system suffer from handling state in a scalable, fast and correct way. To start how to cope with the state, we first must identify which state can be seen as partition local only and which state must be synchronous for read, and synchronous for writes across partitions. - -Affected states: - -- masterdata: e.g. tenant and project must be present in every partition, but these are entities which are read often but updates are rare. A write can therefore be visible with a decent delay in a distinct partition with no consequences. -- ipam: the prefixes and ip´s allocated from machines. These entities are also read often and rare updates. But we must differentiate between dirty reads for different types. A machine network is partition local, ips acquired from such a network must by synchronous in the same partition. Ips acquired from global networks such as internet must by synchronous for all partitions, as otherwise a internet ip could be acquired twice. -- vrf ids: they must only be unique in one partition -- image and size configurations: read often, written seldom, so no high requirements on the storage of these entities. -- images: os images are already replicated from a central s3 storage to a per partition s3 service. metal-hammer kernel and initrd are small and pull always from the central s3, can be done similar to os images. -- machine and machine allocation: must be only synchronous in the partition -- switch: must be only synchronous in the partition -- nsq messages: do not need to cross partition boundaries. No need to keep the messages persistent, even the opposite is true, we don't want to have the messages persist for a longer period. 
- -Now we can see that the most critical state to held and synchronize are the IPAM data, because these entities must be guaranteed to be synchronously updated, while being updated frequently. - -Datastores: - -We use three different types of datastores to persist the states of the metal application. - -- rethinkdb is the main datastore for almost all entities managed by metal-api -- postgresql is used for masterdata and ipam data. -- nsq uses disk and memory tho store the messages. - -### Stateless services - -These are the easy part, all of our services which are stateless can be scaled up and down without any impact on functionality. Even the stateful services like masterdata and metal-api rely fully on the underlying datastore and can therefore also be scaled up and down to meet scalability requirements. - -Albeit, most of these services need to be placed behind a loadbalancer which does the L4/L7 balancing across the started/available replicas of the service for the clients talking to it. This is actually provided by kubernetes with either service type loadbalancer or type clusterip. - -One exception is the `metal-console` service which must have the partition in it´s dns name now, because there is no direct network connectivity between the management networks of the partitions. See "Network Setup) - -## Distributed setup - -### State - -In order to replicate certain data which must be available across all partitions we can use on of the existing open source databases which enable such kind of setup. There are a few available out there, the following incomplete list will highlight the pro´s and cons of each. - -- RethinkDB - - We already store most of our data in RethinkDB and it gives already the ability to synchronize the data in a distributed manner with different guarantees for consistency and latency. This is described here: [Scaling, Sharding and replication](https://rethinkdb.com/docs/sharding-and-replication/). 
But because rethinkdb has a rough history and unsure future with the last release took more than a year, we in the team already thought that we eventually must move away from rethinkdb in the future. - -- Postgresql - - Postgres does not have a multi datacenter with replication in both directions, it just can make the remote instance store the same data. - -- CockroachDB - - Is a Postgresql compatible database engine on the wire. CockroachDB gives you both, ACID and geo replication with writes allowed from all connected members. It is even possible to configure [Follow the Workload](https://www.cockroachlabs.com/docs/stable/topology-follow-the-workload) and [Geo Partitioning and Replication](https://www.cockroachlabs.com/docs/v19.2/topology-geo-partitioned-replicas). - -If we migrate all metal-api entities to be stored the same way we store masterdata, we could use cockroachdb to store all metal entities in one ore more databases spread across all partitions and still ensure consistency and high availability. - -A simple setup how this would look like is shown here. - -![Simple CockroachDB setup](Distributed.png) - -go-ipam was modified in a example PR here: [PR 17](https://github.com/metal-stack/go-ipam/pull/17) - -### API Access - -In order to make the metal-api accessible for api users like `cloud-api` or `metalctl` as easy at it is today, some effort has to be taken. One possible approach would be to use a external loadbalancer which spread the requests evenly to all metal-api endpoints in all partitions. Because all data are accessible from all partitions, a api request going to partition A with a request to create a machine in partition B, will still work. If on the other hand partition B is not in a connected state because the interconnection between both partitions is broken, then of course the request will fail. 
- -**IMPORTANT** -The NSQ Message to inform `metal-core` must end in the correct partition - -To provide such a external loadbalancer we have several opportunities: - -- Cloudflare or comparable CDN service. -- BGP Anycast from every partition - -Another setup would place a small gateway behind the metal-api address, which forwards to the metal-api in the partition where the request must be executed. This gateway, `metal-api-router` must inspect the payload, extract the desired partition, and forward the request without any modifications to the metal-api endpoint in this partition. This can be done for all requests, or if we want to optimize, only for write accesses. - -## Network setup - -In order to have the impact to the overall security concept as minimal as possible i would not modify the current network setup. The only modifications which has to be made are: - -- Allow https ingress traffic to all metal-api instances. -- Allow ssh ingress traffic to all metal-console instances. -- Allow CockroachDB Replication between all partitions. -- No NSQ traffic from outside required anymore, except we cant solve the topic above. - -A simple setup how this would look like is shown here, this does not work though because of the forementioned NSQ issue. - -![API and Console Access](Distributed-API.png) - -Therefore we need the `metal-api-router`: - -![Working API and Console Access](Distributed-API-Working.png) - -## Deployment - -The deployment of our components will substantially differ in a partition compared to a the deployment we have actually. Deploying it in kubernetes in the partition would be very difficult to achieve because we have no sane way to deploy kubernetes on physical machines without a underlying API. -I would therefore suggest to deploy our components in the same way we do that for the services running on the management server. Use systemd to start docker containers. 
- -![Deployment](Distributed-Deployment.png) diff --git a/versioned_docs/version-v0.22.3/contributing/01-Proposals/MEP10/README.md b/versioned_docs/version-v0.22.3/contributing/01-Proposals/MEP10/README.md deleted file mode 100644 index 6811cdc0..00000000 --- a/versioned_docs/version-v0.22.3/contributing/01-Proposals/MEP10/README.md +++ /dev/null @@ -1,197 +0,0 @@ ---- -slug: /MEP-10-sonic-support -title: MEP-10 -sidebar_position: 10 ---- - -# SONiC Support - -As writing this proposal, metal-stack only supports Cumulus on Broadcom ASICs. Unfortunately, after the acquisition of -Cumulus Networks by Nvidia, Broadcom decided to cut its relationship with Cumulus, and therefore Cumulus 4.2 is the last -version that supports Broadcom ASICs. Since trashing the existing hardware is not a solution, adding support for a -different network operating system is necessary. - -One of the remaining big players is [SONiC](https://sonic-net.github.io/SONiC/), which Microsoft created to scale the -network of Azure. It's an open-source project and is now part of the [Linux Foundation](https://www.linuxfoundation.org/press/press-release/software-for-open-networking-in-the-cloud-sonic-moves-to-the-linux-foundation). - -For a general introduction to SONiC, please follow the [Architecture](https://github.com/sonic-net/SONiC/wiki/Architecture) official -documentation. - -## ConfigDB - -On a cold start, the content of `/etc/sonic/config_db.json` will be loaded into the Redis database `CONFIG_DB`, and both -contain the switch's configuration except the BGP unnumbered configuration, which still has to be configured directly by -the frr configuration files. The SONiC community is working to remove this exception, but no release date is known. - -## BGP Configuration - -Frr runs inside a container, and a shell script configured it on the container startup. 
For BGP unnumbered, we must set -the configuration variable `docker_routing_config_mode` to `split` to prevent SONiC from overwriting our configuration -files created by `metal-core`. But by using the split mode, the integrated configuration mode of frr is deactivated, and -we have to write our BGP configuration to the daemon-specific files `bgp.conf`, `staticd.conf`, and `zebra.conf` instead -to `frr.conf`. - -```bash -elif [ "$CONFIG_TYPE" == "split" ]; then - echo "no service integrated-vtysh-config" > /etc/frr/vtysh.conf - rm -f /etc/frr/frr.conf -``` - -Reference: [docker-init](https://github.com/sonic-net/sonic-buildimage/blob/202205/dockers/docker-fpm-frr/docker_init.sh#L69) - -Adding support for the integrated configuration mode, we must at least adjust the startup shell script and the supervisor configuration: - -```bash -{% if DEVICE_METADATA.localhost.docker_routing_config_mode is defined and DEVICE_METADATA.localhost.docker_routing_config_mode == "unified" %} -[program:vtysh_b] -command=/usr/bin/vtysh -b -``` - -Reference: [supervisord.conf](https://github.com/sonic-net/sonic-buildimage/blob/202205/dockers/docker-fpm-frr/frr/supervisord/supervisord.conf.j2#L157) - -## Non-BGP Configuration - -For the Non-BGP configuration we have to write it into the Redis database directly or via one of the following interfaces: - -- `config replace ` -- the Mgmt Framework -- the SONiC restapi - -Directly writing into the Redis database isn't a stable interface, and we must determine the create, delete, and update -operations on our own. The last point is also valid for the Mgmt Framework and the SONiC restapi. Furthermore, the -Mgmt Framework doesn't start anymore for several months, and a [potential fix](https://github.com/sonic-net/sonic-buildimage/pull/10893) -is still not merged. And the SONiC restapi isn't enabled by default, and we must build and maintain our own SONiC images. 
- -Using `config replace` would reduce the complexity in the `metal-core` codebase because we don't have to determine the -actual changes between the running and the desired configuration. The approach's drawbacks are using a version of SONiC -that contains the PR [Yang support for VXLAN](https://github.com/sonic-net/sonic-buildimage/pull/7294), and we must provide -the whole new startup configuration to prevent unwanted deconfiguration. - -### Configure Loopback interface and activate VXLAN - -```json -{ - "LOOPBACK_INTERFACE": { - "Loopback0": {}, - "Loopback0|": {} - }, - "VXLAN_TUNNEL": { - "vtep": { - "src_ip": "" - } - } -} -``` - -#### Configure MTU - -```json -{ - "PORT": { - "Ethernet0": { - "mtu": "9000" - } - } -} -``` - -#### Configure PXE Vlan - -```json -{ - "VLAN": { - "Vlan4000": { - "vlanid": "4000" - } - }, - "VLAN_INTERFACE": { - "Vlan4000": {}, - "Vlan4000|": {} - }, - "VLAN_MEMBER": { - "Vlan4000|": { - "tagging_mode": "untagged" - } - }, - "VXLAN_TUNNEL_MAP": { - "vtep|map_104000_Vlan4000": { - "vlan": "Vlan4000", - "vni": "104000" - } - } -} -``` - -#### Configure VRF - -```json -{ - "INTERFACE": { - "Ethernet0": { - "vrf_name": "vrf104001" - } - }, - "VLAN": { - "Vlan4001": { - "vlanid": "4001" - } - }, - "VLAN_INTERFACE": { - "Vlan4001": { - "vrf_name": "vrf104001" - } - }, - "VRF": { - "vrf104001": { - "vni": "104001" - } - }, - "VXLAN_TUNNEL_MAP": { - "vtep|map_104001_Vlan4001": { - "vlan": "Vlan4001", - "vni": "104001" - } - } -} -``` - -## DHCP Relay - -The DHCP relay container only starts if `DEVICE_METADATA.localhost.type` is equal to `ToRRouter`. - -## LLDP - -SONiC always uses the local port subtype for LLDP and sets it to some freely configurable alias field of the interface. - -```python -# Get the port alias. If None or empty string, use port name instead -port_alias = port_table_dict.get("alias") -if not port_alias: - self.log_info("Unable to retrieve port alias for port '{}'. 
Using port name instead.".format(port_name)) - port_alias = port_name - -lldpcli_cmd = "lldpcli configure ports {0} lldp portidsubtype local {1}".format(port_name, port_alias) -``` - -Reference: [lldpmgr](https://github.com/sonic-net/sonic-buildimage/blob/202205/dockers/docker-lldp/lldpmgrd#L153) - -## Mgmt Interface - -The mgmt interface is `eth0`. To configure a static IP address and activate the Mgmt VRF, use: - -```json -{ - "MGMT_INTERFACE": { - "eth0|": { - "gwaddr": "" - } - }, - "MGMT_VRF_CONFIG": { - "vrf_global": { - "mgmtVrfEnabled": "true" - } - } -} -``` - -[IP forwarding is deactivated on `eth0`](https://github.com/sonic-net/sonic-buildimage/blob/202205/files/image_config/sysctl/sysctl-net.conf#L7), and no IP Masquerade is configured. diff --git a/versioned_docs/version-v0.22.3/contributing/01-Proposals/MEP11/README.md b/versioned_docs/version-v0.22.3/contributing/01-Proposals/MEP11/README.md deleted file mode 100644 index 87f48a10..00000000 --- a/versioned_docs/version-v0.22.3/contributing/01-Proposals/MEP11/README.md +++ /dev/null @@ -1,78 +0,0 @@ ---- -slug: /MEP-11-auditing-of-metal-stack-resources -title: MEP-11 -sidebar_position: 11 ---- - -# Auditing of metal-stack resources - -Currently no logs of the ownership of resources like machines, networks, ips and volumes are generated or kept. Though due to legal requirements data centers are required to keep track of this ownership over time to prevent liability issues when opening the platform for external users. - -In this proposal we want to introduce a flexible and low-maintenance approach for auditing on top of [Meilisearch](https://www.meilisearch.com/). - -## Overview - -In general our auditing logs will be collected by a request interceptor or middleware. Every request and response will be processed and eventually logged to Meilisearch. -Meilisearch will be configured to regularly create chunks of the auditing logs. 
These finished chunks will be backed up to a S3 compatible storage with a read-only option enabled. - -Of course sensitive data like session keys or passwords will be redacted before logging. We want to track relevant requests and responses. If auditing the request fails, the request itself will be aborted and will not be processed further. The requests and responses that will be audited will be annotated with a correlation id. - -Transferring the meilisearch auditing data chunks to the S3 compatible storage will be done by a sidecar cronjob that is executed periodically. -To avoid data manipulation the S3 compatible storage will be configured to be read-only. - -## Whitelisting - -To reduce the amount of unnecessary logs we want to introduce a whitelist of resources and operations on those that should be logged. -Other requests will be passed directly to the next middleware or web service without any further processing. - -As we are only interested in mutating endpoints, we ignore all `GET` requests. -The whitelist includes all `POST`, `PUT`, `PATCH` and `DELETE` endpoints of the HTTP middleware except for the following (non-manipulating) route suffixes: - -- `/find` -- `/notify` -- `/try` and `/match` -- `/capacity` -- `/from-hardware` - -Regarding GRPC audit trails, they are not so interesting because only internal clients are using this API. However, we can log the trails of the `Boot` service, which can be interesting to revise the machine lifecycle. - -## Chunking in Meilisearch - -We want our data to be chunked in Meilisearch. To accomplish this, we rotate the index identifier on a scheduled basis. The index identifiers will be derived from the current date and time. - -To keep things simple, we only support hourly, daily and monthly rotation. The eventually prefixed index names will only include relevant parts of date and time like `2021-01`, `2021-01-01` or `2021-01-01_13`. 
- -The metal-api will only write to the current index and switches to the new index on rotation. The metal-api will never read or update data in any indices. - -## Moving chunks to S3 compatible storage - -As Meilisearch will be filled with data over time, we want to move completed chunks to a S3 compatible storage. This will be done by a sidecar cronjob that is executed periodically. Note that the periods of the index rotation and the cronjob execution don't have to match. - -When the backup process gets started, it initiates a [Meilisearch dump](https://www.meilisearch.com/docs/learn/advanced/dumps) of the whole database across all indices. Once the returned task is finished, the dump must be copied from a Meilisearch volume to the S3 compatible storage. After a successful copy, the dump can be deleted. - -Now we want to remove all indices from Meilisearch, except the most recent one. For this, we [get all indices](https://www.meilisearch.com/docs/reference/api/indexes#list-all-indexes), sort them and [delete each index](https://www.meilisearch.com/docs/reference/api/indexes#delete-an-index) except the most recent one to avoid data loss. - -For the actual implementation, we can build upon [backup-restore-sidecar](https://github.com/metal-stack/backup-restore-sidecar). But due to the index rotation and the fact, that older indices need to be deleted, this probably does not fit into the mentioned sidecar. - -## S3 compatible storage - -The dumps of chunks should automatically deleted after a certain amount of time, once we are either no longer allowed or required to keep them. -The default retention time will be 6 months. Ideally already uploaded chunks should be read-only to prevent data manipulation. - -A candidate for the S3 compatible storage is Google Cloud Storage, which allows to configure automatic expiration of objects through a [lifecycle rule](https://cloud.google.com/storage/docs/managing-lifecycles?hl=en#storage-set-lifecycle-config-go). 
- -## Affected components - -- metal-api grpc server needs an auditing interceptor -- metal-api web server needs an auditing filter chain / middleware -- metal-api needs new command line arguments to configure the auditing -- mini-lab needs a Meilisearch instance -- mini-lab may need a local S3 compatible storage -- we need a sidecar to implement the backup to S3 compatible storage -- Consider auditing of volume allocations and freeings outside of metal-stack - -## Alternatives considered - -Instead of using Meilisearch we investigated using an immutable database like [immudb](https://immudb.io/). But immudb does not support chunking of data and due to its immutable nature, we will never be able to free up space of expired data. Even if we are legally allowed or required to delete data, we will not be able to do so with immudb. - -In another variant of the Meilisearch approach the metal-api would also be responsible for copying chunks to the S3 compatible storage and deleting old indices. But separating the concerns allows completely different implementations for every deployment stage. diff --git a/versioned_docs/version-v0.22.3/contributing/01-Proposals/MEP12/README.md b/versioned_docs/version-v0.22.3/contributing/01-Proposals/MEP12/README.md deleted file mode 100644 index 65532c57..00000000 --- a/versioned_docs/version-v0.22.3/contributing/01-Proposals/MEP12/README.md +++ /dev/null @@ -1,34 +0,0 @@ ---- -slug: /MEP-12-rack-spreading -title: MEP-12 -sidebar_position: 12 ---- - -# Rack Spreading - -Currently, when creating a machine through the metal-api, the machine is placed randomly inside a partition. This algorithm does not consider spreading machines across different racks and different chassis. This may lead to the situation that a group of machines (that for example form a cluster) can end up being placed in the same rack and the same chassis. 
- -Spreading a group of machines across racks can enhance availability for scenarios like a rack losing power or a chassis meltdown. - -So, instead of just randomly deciding the placement of a machine candidate, we want to propose a placement strategy that attempts to spread machine candidates across the racks inside a partition. - -Furthermore a followup improvement to guarantee that machines are really spread across multiple racks, even if multiple machines are ordered in parallel, was implemented with [PR490](https://github.com/metal-stack/metal-api/pull/490). - -## Placement Strategy - -Machines in the project are spread across all available racks evenly within a partition (best effort). For this, an additional request to the datastore has to be made in order to find allocated machines within the project in the partition. - -The algorithm will then figure out the least occupied racks and elect a machine candidate randomly from those racks. - -The user can optionally pass placement tags which will be considered for spreading the machines as well (this will for example allow spreading by a cluster id tag inside the same project). - -## API - -```golang -// service/v1/machine.go - -type MachineAllocation struct { - // existing fields are omitted for readability - PlacementTags []string `json:"placement_tags" description:"by default machines are spread across the racks inside a partition for every project. 
if placement tags are provided, the machine candidate has an additional anti-affinity to other machines having the same tags"` -} -``` diff --git a/versioned_docs/version-v0.22.3/contributing/01-Proposals/MEP13/README.md b/versioned_docs/version-v0.22.3/contributing/01-Proposals/MEP13/README.md deleted file mode 100644 index 2dde20f5..00000000 --- a/versioned_docs/version-v0.22.3/contributing/01-Proposals/MEP13/README.md +++ /dev/null @@ -1,111 +0,0 @@ ---- -slug: /MEP-13-dual-stack-support -title: MEP-13 -sidebar_position: 13 ---- - -# Dual-stack Support - -dual-stack support is required to be able to create Kubernetes clusters with either IPv6 single-stack or dual-stack enabled. -With the inherent scarcity of IPv4 addresses, the need to be able to use IPv6 has increased. - -Full IPv6 dual-stack support was added to Kubernetes with v1.23 as stable. - -Gardeners have had full IPv6 dual-stack support since `v1.109`. - -metal-stack manages CIDRs and IP addresses with the [go-ipam](https://github.com/metal-stack/go-ipam) library, which already got full IPv6 support in 2021 (see [https://metal-stack.io/blog/2021/02/ipv6-part1](https://metal-stack.io/blog/2021/02/ipv6-part1)). -But this was only the foundation, more work needs to be done to get full IPv6 support for all aspects managed by metal-stack.io. - -## General Decisions - -For the general decision we do not look at the isolated clusters feature for now as this would make the solution even more complex and we want to introduce IPv6 in smaller steps to the users. - -### Networks - -Currently, metal-stack organizes CIDRs / prefixes into a `network' resource in the metal-api. A network can consist of multiple CIDRs from the same address family. For example, if an operator wants to provide Internet connectivity to provisioned machines, they can start with small network CIDRs. The number of managed network prefixes can then be expanded as needed over time. 
- -With dual-stack we have to choose between two options: Network per address family or networks with both address families. These options are described in the next section. - -#### Network per Address Family - -This means that we allow networks with CIDRs from one address family only, one for IPv4 and one for IPv6. - -The machine creation process will not change if the machine only needs to be either IPv4 or IPv6 addressable. -But if on the other side, the machine need to be able to connect to both address families, the machine creation needs to specify two networks, one for IPv4 and one for IPv6. -Also there will be 2 distinct VRF IDs for every network with a different address family. - -#### Network with both Address Families - -Make a network dual address family capable, meaning that you can add multiple cidrs from both address families to a network. -Then the machine creation will remain the same for single-stack and dual-stack cases, but the ip address allocation will need to specify the address family from which to allocate an ip address when the network is dual-stack. -This does not break the existing API, but allows existing extensions to easily add dual-stack support. -To avoid additional checking of which address families are available on this network during an ip allocation call, we could store the address families in the network. - -#### Decision - -The decision was made to go with the having both address families in a single network entity because we think this is the most flexible way to support dual-stack machines and Kubernetes clusters as well as single-stack with the least amount of modifications on the networking side. 
- -### Examples - -To illustrate the the usage we start by creating a tenant super network which has both address families: - -```yaml ---- -id: tenant-super-network-mini-lab -name: Project Super Network -description: Super network of all project networks -partitionid: mini-lab -prefixes: - - 10.0.0.0/16 - - 2001:db8:0:10::/64 -defaultchildprefixlength: - IPv4: 22 - IPv6: 96 -privatesuper: true -``` - -In order to create this network, we simple call: - -```bash -metalctl network create -f tenant-super.yaml -``` - -This is usually done during the initial setup of the environment. - -Next step is to allocate a tenant network where the machines of a project can be placed: - -```bash -metalctl network allocate --partition mini-lab --project 4b9b17c4-2d7c-4190-ae95-dda44e430fa6 --name my-node-network -``` - -This leads to the following network allocation: - -```yaml -id: 2d2c0350-3f66-4597-ae97-ef6797232212 -name: my-node-network -parentnetworkid: tenant-super-network-mini-lab -partitionid: mini-lab -prefixes: - - 10.0.0.0/22 - - 2001:db8:0:10::/96 -projectid: 4b9b17c4-2d7c-4190-ae95-dda44e430fa6 -vrf: 20 -consumption: - ipv4: - available_ips: 1024 - available_prefixes: 256 - used_ips: 2 - used_prefixes: 0 - ipv6: - available_ips: 2147483647 - available_prefixes: 1073741824 - used_ips: 1 - used_prefixes: 0 -privatesuper: false -``` - -Users can the create IP addresses from these child networks. By default, they retrieve an IPv4 address except a super network only consists of IPv6 prefixes. In the latter case the users acquire an IPv6 address. 
- -```bash -metalctl network ip create --network 2d2c0350-3f66-4597-ae97-ef6797232212 --project 4b9b17c4-2d7c-4190-ae95-dda44e430fa6 -``` diff --git a/versioned_docs/version-v0.22.3/contributing/01-Proposals/MEP14/README.md b/versioned_docs/version-v0.22.3/contributing/01-Proposals/MEP14/README.md deleted file mode 100644 index 47c06434..00000000 --- a/versioned_docs/version-v0.22.3/contributing/01-Proposals/MEP14/README.md +++ /dev/null @@ -1,36 +0,0 @@ ---- -slug: /MEP-14-independence-from-external-sources -title: MEP-14 -sidebar_position: 14 ---- - -# Independence from external sources - -In certain situations some customers may need to operate and create machines without making use of external services like DNS or NTP through the internet. To make this possible, all metal-stack components reaching external services need to be configurable with custom endpoints. - -So far, the following components have been identified as requiring changes: - -- pixiecore -- metal-hammer -- metal-images - -More components are likely to be added to the list during processing. -For DNS and NTP servers it should be possible to provide default values within a partition. They can either be inherited from machines and firewalls or overwritten with own ones. - -## pixiecore - -A NTP server endpoint need to be configured on the pixiecore. This can be achieved by providing it through environment variables on start up. - -## metal-hammer - -If using a self-deployed NTP server, also the metal-hammer need to be configured with it. For backward compatibility, default values from `pool.ntp.org` and `time.google.com` are used. - -## metal-images - -Configurations for the `metal-images` are different for machines and firewalls. - -## metalctl - -In order to pass DNS and NTP servers to partitions and machines while creating them, the flags `dnsservers` and `ntpservers` need to be added. 
- -The implementation of this MEP will make metal-stack possible to create and maintain machines without requiring an internet connection. diff --git a/versioned_docs/version-v0.22.3/contributing/01-Proposals/MEP16/README.md b/versioned_docs/version-v0.22.3/contributing/01-Proposals/MEP16/README.md deleted file mode 100644 index dbfa59d6..00000000 --- a/versioned_docs/version-v0.22.3/contributing/01-Proposals/MEP16/README.md +++ /dev/null @@ -1,332 +0,0 @@ ---- -slug: /MEP-16-metal-api-as-an-alternative-configuration-source-for-the-firewall-controller -title: MEP-16 -sidebar_position: 16 ---- - -# metal-api as an Alternative Configuration Source for the firewall-controller - -In the current situation, a firewall as provisioned by metal-stack is a fully immutable entity. Any modifications on the firewall like changing the firewall ruleset must be done _somehow_ by the user – the metal-api and hence metal-stack is not aware of its current state. - -As part of our [integration with the Gardener project](https://docs.metal-stack.io/stable/overview/kubernetes/#Gardener) we offer a solution called the [firewall-controller](https://github.com/metal-stack/firewall-controller), which is part of our [firewall OS images](https://github.com/metal-stack/metal-images/blob/6318a624861b18a559a9d37299bca5f760eef524/firewall/Dockerfile#L57-L58) and addresses shortcomings of the firewall resource's immutability, which would otherwise be completely impractible to work with. The firewall-controller crashes infinitely if it is not properly configured through the userdata when using the firewall image of metal-stack. - -The firewall-controller approach is tightly coupled to Gardener and it requires the administrator of the Gardener installation to pass a shoot and a seed kubeconfig through machine userdata when creating the firewall. 
How this userdata has to look like is not documented and is just part of another project called the [firewall-controller-manager](https://github.com/metal-stack/firewall-controller-manager) (FCM), which task is to orchestrate rolling updates of firewall machines in a way that network traffic interruption is minimal when updating a firewall or applying a change to an immutable firewall configuration. - -In general, a firewall entity in metal-stack has similarities to the machine entity but it has a fundamental difference: A user gains ownership over a machine after provisioning. They can access it through SSH, modify it at will and this is completely wanted. For firewalls, however, we do not want a user to access the provisioned firewall as the firewall is a privileged part of the infrastructure with access to the underlay network. The underlay can not be tampered with at any given point in time by a user as it can destroy the entire network traffic flow inside a metal-stack partition. - -For this reason, we have a gap in the metal-stack project in terms of a missing solution for people who do not rely on the Gardener integration. We are basically leaving a user with the option to implement an orchestrated recreation of every possible change on the firewall to minimize traffic interruption for the machines sitting behind the firewall or re-implement the firewall-controller to how they want to use it for their use-case. - -Also we do not have a clear distinction in the API between user and metal-stack operator for firewalls. If a user would allocate a firewall it is also possible for the user to inject his own SSH keys and access the firewall and tamper with the underlay network. - -Parts of these problems are probably going to decrease with the work on [MEP-4](../MEP4/README.md) where there will be dedicated APIs for users and administrators of metal-stack including fine-grained access tokens. 
- -With this MEP we want to describe a way to improve this current situation and allow other users that do not rely on the Gardener integration – for whatever motivation they have – to adequately manage firewalls. For this, we propose an alternative configuration for the firewall-controller that is native to metal-stack and more independent of Gardener. - -## Proposal - -The central idea of this proposal is allowing the firewall-controller to use the metal-api as a configuration source. This should serve as an alternative strategy to the currently used FCM `Firewall` resource based approach in the Gardener use-case. -Updates of the firewall rules should be possible through the metal-api. - -The firewall-controller itself should now be able to decide which of the two main strategies should be used for the base configuration: a kubeconfig or the metal-api. This should be possible through a dedicated _firewall-controller-config_. - -Using this config will now allow operators to fine-tune the data sources for all of its dynamic configuration tasks independently. -For example the data source of the core firewall rules could be set either from the `Firewall` resource located in the Gardener `Seed` or the metal-apiserver node network entity, while the CWNPs should be fetched and applied from a given kubeconfig (the `Shoot` Kubeconfig in the Gardener case). -This configuration file is intended to be injected during firewall creation through the userdata along with potential source connection credentials. 
- -```yaml -# the name of the firewall, defaulted to the hostname -name: best-firewall-ever - -sources: - seed: - kubeconfig: /path/to/seed.yaml # current gardener behavior - namespace: shoot--proj--name - shoot: - kubeconfig: /path/to/shoot.yaml # current gardener behavior - namespace: firewall - metal: - url: https://metal-api - hmac: some-hmac - type: Metal-View - projectID: abc - static: - # static should mirror all information provided by the metal or seed/shoot sources - firewall: # optional - controllerURL: https://... - cwnp: - egress: [] - ingress: [] - -# all sub-controllers running on the firewall -# each can be configured independently -controllers: - # this is the base controller - firewall: - source: seed # or: metal, static - - # these are optional: when not provided, they are disabled - selfUpdate: - enabled: true - droptailer: - enabled: true - - # these are optional: when not provided, they are disabled - service: - source: shoot # or: metal, static - cwnp: - source: shoot # or: metal, static - monitor: - source: shoot # currently only shoot is supported -``` - -The existing behavior of the firewall-controller writing into `/etc/nftables/firewall-controller.v4` is not changed. The different controller configuration sources are internally treated in the same way as before. The `static` source can be used to prevent the firewall-controller from crashing and consistently providing a static ruleset. This might be interesting for metal-stack native use cases or environments where the metal-api cannot be accessed. - -There must be one central nftables-rule-file-controller that is notified and triggered by all other controllers that contribute to the nftables configuration. 
- -For example, in order to maintain the existing Gardener integration, the configuration file for the firewall-controller will look like this: - -```yaml -name: shoot--abc--cluster-firewall-def -sources: - seed: - kubeconfig: /etc/firewall-controller/seed.yaml - namespace: shoot--abc--cluster - shoot: - kubeconfig: /etc/firewall-controller/shoot.yaml - namespace: firewall - -controllers: - firewall: - source: seed - - selfUpdate: - enabled: true - droptailer: - enabled: true - - service: - source: shoot - cwnp: - source: shoot - monitor: - source: shoot -``` - -Plain metal-stack users might use a configuration like this: - -```yaml -name: best-firewall-ever - -sources: - metal: - url: https://metal-api - hmac: some-hmac - type: Metal-View - projectID: abc - -controllers: - firewall: - source: metal - selfUpdate: - enabled: true - droptailer: - enabled: true - - cwnp: - # firewall rules stored in firewall entity - # potential improvement would be to attach the rules to the node network entity - # be aware that the firewall and private networks are immutable - # eventually we introduce a firewall ruleset entity - source: metal -``` - -In highly restricted environments that cannot access metal-api the static source could be used: - -```yaml -name: most-restricted-firewall-ever - -sources: - static: - firewall: - controllerURL: https://... - cwnp: - egress: [] - ingress: [] - -controllers: - firewall: - source: static - - cwnp: - source: static -``` - -### Non-Goals - -- Resolving the missing differentiation between users and administrators by letting users pass userdata and SSH keys to the firewall creation. - - This is even more related to [MEP-4](../MEP4/README.md) than this MEP. 
- -### Advantages - -- Offers a native metal-stack solution that improves managing firewalls for users by adding dynamic reconfiguration through the metal-api - - e.g., in the mini-lab, users can now allocate a machine, then an IP address and announce this IP from the machine without having to re-create the firewall but by adding a firewall rule to the metal-api. -- Improve consistency throughout the API (firewall rules would reflect what is persisted in metal-api). -- Other providers like Cluster API can leverage this approach, too. -- It can contribute to solving the shoot migration issue (in Cluster API case the `clusterctl move` for firewall objects) - - For Gardener takes the seed out of the equation (of which the kubeconfig changes during shoot migration) - - However: Things like egress rules, rate limiting, etc. are currently not part of the firewall or network entity in the metal-api. These would need to be added to one of them. -- Potentially resolve the issue that end-users can manipulate accounting data of the firewall through the `FirewallMonitor` - - for this we would need to be able to report traffic data to metal-api - -### Caveats - -- Metal-View access is too broad for firewalls. Mitigated by [MEP-4](../MEP4/README.md). -- Polling of the firewall-controller is bad for performance. Mitigated by [MEP-4](../MEP4/README.md). - -### Firewall Controller Manager - -Currently the firewall-controller-manager expects the creators of a `FirewallDeployment` to use the defaulting webhook that is tailored to the Gardener integration in order to generate `Firewall.spec.userdata` or to override it manually. Currently `Firewall.spec.userdata` will never be set explicitly. - -Instead we'd like to propose `Firewall.spec.userdataContents` which will replace the old `userdata`-string by a typed data structure. The FCM will do the heavy lifting while the `FirewallDeployment` creator decides what should be configured. 
- -```yaml -kind: FirewallDeployment -spec: - template: - spec: - userdataContents: - - path: /etc/firewall-controller/config.yaml - content: | - --- - sources: - static: {} - controllers: - firewall: - source: static - - path: /etc/firewall-controller/seed.yaml - contentFrom: - firewallControllerKubeconfigSecret: - name: seed-kubeconfig - key: kubeconfig - - - path: /etc/firewall-controller/shoot.yaml - contentFrom: - secretRef: - name: shoot-kubeconfig - key: kubeconfig -``` - -### Gardener Extension Provider Metal Stack - -The GEPM should be migrated to the new `Firewall.spec.userdataContents` field. - -### Cluster API Provider Metal Stack - -![architectural overview](firewall-for-capms-overview.svg) - -In Cluster API there are essentially two main clusters: the management cluster and the workload cluster while the CAPMS takes in the role of the GEPM. -Typically a local bootstrap cluster is created in KinD which acts as the management cluster. It creates the workload cluster. Thereafter the ownership of the workload cluster is typically moved (using `clusterctl move`) to a different cluster which will then become the management cluster. -The new management cluster might actually be the workload cluster itself. - -In contrast to Gardener, Cluster API aims to be less opinionated and minimal. It is common practice to not install any non-required components or CRDs into the workload cluster by default. Therefore we cannot expect custom resources like `ClusterwideNetworkPolicy` or `FirewallMonitor` to be installed in the workload cluster but strongly recommend our users to do it. Therefore it's the responsibility of the operator to tell [cluster-api-provider-metal-stack](https://github.com/metal-stack/cluster-api-provider-metal-stack) the kubeconfig for the cluster where these CRDs are installed and defined in. 
- -A viable configuration for a `MetalStackCluster` that generates firewall rules based of `Service` type `LoadBalancer` and `ClusterwideNetworkPolicy` and expects them to be deployed in the workload cluster is shown below. The `FirewallMonitor` will be reported into the same cluster. - -```yaml -kind: MetalStackCluster -metadata: - name: ${CLUSTER_NAME} -spec: - firewallTemplate: - userdataContents: - - path: /etc/firewall-controller/config.yaml - contentFrom: - secretRef: - name: ${CLUSTER_NAME}-firewall-controller-config - key: controllerConfig - - - path: /etc/firewall-controller/workload.yaml - contentFrom: - # this is the kubeconfig generated by kubeadm - secretRef: - name: ${CLUSTER_NAME}-kubeconfig - key: value ---- -kind: Secret -metadata: - name: ${CLUSTER_NAME}-firewall-controller-config -stringData: - controllerConfig: | - --- - name: ${CLUSTER_NAME}-firewall - - sources: - metal: - url: ${METAL_API_URL} - hmac: ${METAL_API_HMAC} - type: ${METAL_API_HMAC_TYPE} - projectID: ${METAL_API_PROJECT_ID} - shoot: - kubeconfig: /etc/firewall-controller/workload.yaml - namespace: firewall - - controllers: - firewall: - source: metal - selfUpdate: - enabled: true - droptailer: - enabled: true - - service: - source: shoot - cwnp: - source: shoot - monitor: - source: shoot -``` - -Here the firewall-controller-config will be referenced by the `MetalStackCluster` as a `Secret`. Please note that the `Secret`s in `userdataContents` will not be fetched and will directly be passed to the `FirewallDeployment`. At first the reconciliation of it in the FCM will fail due to the missing Kubeconfig secret. After the `MetalStackCluster` has been marked as ready, CAPI will create this missing secret. Effectively the firewall and initial control plane node should be created at the same time. - -This approach allows maximum flexibility as intended by Cluster API and is still able to provide robust rolling updates of firewalls. 
- -An advanced use case of this flexibility would be a management cluster, that is in charge of multiple workload clusters. Where one workload cluster acts as a monitoring or tooling cluster, receives logs and the firewall monitor for the other workload clusters. The CWNPs could be defined here, all in a separate namespace. - -#### Cluster API Caveats - -When the cluster is pivoted and reconciles its own firewall, a malfunctioning firewall prevents the cluster from self-healing and requires manual intervention by creating a new firewall. This is an inherent problem of the cluster-api approach. It can be circumvented by using an extra cluster to manage workload clusters. - -In the current form of this approach firewalls and therefore the firewall egress and ingress rules are managed by the cluster operators that manage the cluster-api resources. -Hence it will not be possible to gain a fine-grained control over every cluster operator's choices from a central ruleset at the level of metal-stack firewalls. -In case this control surfaces as a requirement, it would need to be implemented in a firewall external to metal-stack. - -## Roadmap - -In general this proposal is not thought to be implemented in one batch. Instead an incremental approach is required. - -1. Enhance firewall-controller-manager - - - Add `FirewallDeployment.spec.template.spec.userdataContents` - -2. Enhance firewall-controller - - - Reduce coupling between controllers - - Introduce controller config - - Abstract module to write into distinct nftable rules for every controller - - Implement `sources.static`, but not `sources.metal` - - GEPM should set `FirewallDeployment.spec.template.spec.userdataContents` - -3. Allow Cluster API to use the FCM with static ruleset - - - Add `firewall.metal-stack.io/paused` annotation (managed by CAPMS during `clusterctl move`, theoretically useful for Gardener shoot migration as well to avoid shallow deletion). 
- - Reconcile multiple `FirewallDeployment` resources across multiple namespaces. For Gardener the old behavior of reconciling only one namespace should persist. - - Allow setting the `firewall.metal-stack.io/no-controller-connection` annotation through the `FirewallDeployment` (either through the template or inheritance). - - Add `MetalStackCluster.spec.firewallTemplate`. - - Make `MetalStackCluster.spec.nodeNetworkID` optional if `spec.firewallTemplate` given. - -4. Add `sources.metal` as configuration option. - - - Allow updates of firewall rules in the metal-apiserver. - - Depends on [MEP-4](../MEP4/README.md) metal-apiserver progress - -5. Potentially migrate the GEPM to use `sources.metal` diff --git a/versioned_docs/version-v0.22.3/contributing/01-Proposals/MEP16/firewall-for-capms-overview.drawio b/versioned_docs/version-v0.22.3/contributing/01-Proposals/MEP16/firewall-for-capms-overview.drawio deleted file mode 100644 index faea3e3d..00000000 --- a/versioned_docs/version-v0.22.3/contributing/01-Proposals/MEP16/firewall-for-capms-overview.drawio +++ /dev/null @@ -1,4 +0,0 @@ - - - -
handles traffic
Firewall
Firewall Controller
node-exporter
nftables-exporter
droptailer-client
Workload Cluster
droptailer
Configures
Bootstrap or Management Cluster
reconcile
configures
reconcile
Cluster API Provider metal-stack
Metal Stack Cluster CRD
Firewall Deployment CRD
Firewall CRD
Firewall Set CRD
rec
reconcile
reconcile
Firewall Controller Manager
Metal Stack Machine CRD
manages
Admin
Kubeconfig FirewallMonitor
FirewallMonitor CRD
main metal-api
Firewall entity
kubeconfig CWNP
Clusterwide Network Policy CRD
base config
controllerConfig
user-defined
network rules
reports firewall
state
send firewall log lines
controllerConfig
controllerConfig
\ No newline at end of file diff --git a/versioned_docs/version-v0.22.3/contributing/01-Proposals/MEP16/firewall-for-capms-overview.svg b/versioned_docs/version-v0.22.3/contributing/01-Proposals/MEP16/firewall-for-capms-overview.svg deleted file mode 100644 index 853f8175..00000000 --- a/versioned_docs/version-v0.22.3/contributing/01-Proposals/MEP16/firewall-for-capms-overview.svg +++ /dev/null @@ -1 +0,0 @@ -
handles traffic
handles traffic
Firewall
Firewall
Firewall Controller
Firewall Controller
node-exporter
node-exporter
nftables-exporter
nftables-exporter
droptailer-client
droptailer-client
Workload Cluster
Workload Cluster
droptailer
droptailer
Configures
Configures
Bootstrap or Management Cluster
Bootstrap or Management Cluster
reconcile
reconcile
configures
configures
reconcile
reconcile
Cluster API Provider metal-stack
Cluster API Provider...
Metal Stack Cluster CRD
Metal Stack Cluster...
Firewall Deployment CRD
Firewall Deployment...
Firewall CRD
Firewall CRD
Firewall Set CRD
Firewall Set CRD
rec
rec
reconcile
reconcile
reconcile
reconcile
Firewall Controller Manager
Firewall Controller...
Metal Stack Machine CRD
Metal Stack Machine...
manages
manages
Admin
Admin
Kubeconfig FirewallMonitor
Kubeconfig FirewallMonitor
FirewallMonitor CRD
FirewallMonitor CRD
main metal-api
main metal-api
Firewall entity
Firewall entity
kubeconfig CWNP
kubeconfig CWNP
Clusterwide Network PolicyCRD
Clusterwide Network...
base config
base config
controllerConfig
controllerConfig
user-defined
network rules
user-defined...
reports firewall
state
reports firewall...
send firewall log lines
send firewall log lines
controllerConfig
controllerConfig
controllerConfig
controllerConfig
Text is not SVG - cannot display
\ No newline at end of file diff --git a/versioned_docs/version-v0.22.3/contributing/01-Proposals/MEP17/README.md b/versioned_docs/version-v0.22.3/contributing/01-Proposals/MEP17/README.md deleted file mode 100644 index 35f48970..00000000 --- a/versioned_docs/version-v0.22.3/contributing/01-Proposals/MEP17/README.md +++ /dev/null @@ -1,61 +0,0 @@ ---- -slug: /MEP-17-global-network-view -title: MEP-17 -sidebar_position: 17 ---- - -# Global Network View - -> [!IMPORTANT] -> This MEP assumes the implementation of the metal-apiserver as described by [MEP-4](../MEP4/README.md) which is currently work in progress. - -Having a complete view of the network topology is useful when working with deployments or troubleshooting connectivity issues. -Currently, the API doesn't know of any other switches than the leaf switches. -Information about all other switches and their connections must be gathered from Ansible inventories or by accessing the switches via SSH. -Documentation of each partition's network must be kept in-sync with all changes made to the deployment or cabling. -We would like to expand the API's knowledge of the network to the entire underlay including inter-switch connections as well as BGP statistics and health status. - -## Switch Types - -Registering a switch at the API is done by the metal-core. -Apart from that, it also reconciles port and FRR configuration to adapt to the machine provisioning cycle. -This reconfiguration is only necessary on the leaf switches. -To allow deploying the metal-core on other switches than leaves we need a way of telling it what type of switch it is running on so it can act accordingly. -On any non-leaf switches it will only register the switch and report statistic but not change any configuration. -Supported switch types are - -- `leaf` -- `spine` -- `exit` -- `mgmtleaf` -- `mgmtspine` - -## Network Topology - -All switches should periodically report their LLDP neighbors and port configuration. 
-This information can be used to quickly identify common network issues, like MTU mismatch or the like. -Ideally, there would be some graphical representation of the network topology containing only the most important information for a quick overview. -It should contain all switches and machines as nodes and all connections as edges of a graph. -Ports, VRFs, and maybe also IPs should be associated with a connection. - -Apart from the topology graph, there should be a way to display more detailed information about both ports of a connection, like - -- MTU -- speed -- IP -- UP/DOWN status -- VRF -- VLAN -- whether it participates in a BGP session - -## BGP Announcements - -The metal-core should collect all routes it knows about and send them to the API along with a timestamp. -Reported routes should be stored to a redis database along with the switch that reported them and the timestamp of the last time they were reported. -An expiration threshold should be defined and all expired routes should be cleaned up periodically. -Whenever new routes are reported they get merged into the existing ones by the strategy: - -- when new, just add -- when existing, update `last_announced` timestamp - -By querying the BGP announcements we can find out whether an allocated IP is still in use. diff --git a/versioned_docs/version-v0.22.3/contributing/01-Proposals/MEP18/README.md b/versioned_docs/version-v0.22.3/contributing/01-Proposals/MEP18/README.md deleted file mode 100644 index 9c02c0b7..00000000 --- a/versioned_docs/version-v0.22.3/contributing/01-Proposals/MEP18/README.md +++ /dev/null @@ -1,147 +0,0 @@ ---- -slug: /MEP-18-autonomous-control-plane -title: MEP-18 -sidebar_position: 18 ---- - -# Autonomous Control Plane - -As described in the [deployment chapter](../../../docs/04-For%20Operators/03-deployment-guide.mdx), we strongly recommend Kubernetes as the target platform for running the metal-stack control plane. 
- -Kubernetes clusters for this purpose are readily available from hyperscalers, metalstack.cloud, or other cloud providers. Simply using a managed Kubernetes cluster greatly simplifies a metal-stack installation. However, sometimes it might be desirable to host the metal-stack control plane autonomously, without the help of another cloud provider. Reasons for this might include corporate policies that prohibit the use of external data center products, or network constraints. - -The Kubernetes cluster hosting the metal-stack control plane must provide at least the following features: - -- Load balancing (for exposing the APIs) -- Persistent storage (for the databases and key-value stores) -- Access to object storage for automated backups of the stateful sets -- Access to a DNS provider supported by one of the used DNS extensions -- Externally accessible DNS records for obtaining officially signed certificates through DNS challenges - -This metal-stack control plane cluster must also be highly available to prevent a complete loss of control over the managed resources in the data center. -Regular Kubernetes updates to apply security fixes and feature updates must be possible in an automated manner. The Day-2 operational overhead of running this cluster in your own datacenter must be reasonable. - -In this chapter, we propose a solution for setting up a metal-stack environment with an autonomous control plane that is independent of another cloud provider. - -## Use Your Own Dogfood - -The most obvious solution is to just deploy a Kubernetes cluster manually in your own data center by utilizing existing tooling for the deployment: - -- k3s -- kubeadm -- vmware and rancher -- talos -- kubespray -- ... (not a complete list) - -However, all these solutions add another layer of complexity that needs to be maintained and operated by people who also need to learn and understand metal-stack. 
In general, metal-stack in combination with [Gardener](https://gardener.cloud) contains all the necessary tools to provide KaaS, so it makes sense to reuse what is already in place without introducing new dependencies on other products and vendors. - -The only problem here is that Gardener is not yet able to create an initial cluster, which may change with the implementation of [GEP-28](https://github.com/gardener/gardener/blob/master/docs/proposals/28-autonomous-shoot-clusters.md). In the meantime, we suggest using [k3s](https://k3s.io/), which manages the initial metal-stack partition to host the control plane, since the maintenance overhead is acceptable and it is easy to deploy. - -## The Matryoshka Principle - -Instead of directly using the K3s cluster for the production control plane, we propose using it as a minimal control plane cluster which only purpose is to host the production control plane cluster. This layer of indirection brings some reasonable advantages: - -- In the event of an interruption or loss of this minimal control plane cluster, the production control plane remains unaffected, and end users can continue to manage their clusters as normal. -- A dedicated operations team can take care of the Day-2 maintenance of this installation, which can be handy because the tools like k3s are a little different from the rest of the setup (it is likely that more manual maintenance is required than for any other cluster). This would also be true if the initial cluster problem would be solved by the Gardener itself and not using k3s. -- Since the number of shoot clusters to host is static, the resource requirements are minimal and will not change significantly over time. There are no huge resource requirements in terms of cpu, memory and storage. As such, the lack of scalability is not such a big issue. - -So, our proposal is to chain two metal-stack control planes. 
The initial control plane cluster would use k3s and on this cluster we can spin up a cluster for the production control plane with the use of Gardener. - -The following figure shows how the high-level architecture of this setup looks like. A even more simplified illustration of this setup can be looked up in the appendix[^1]. - -![Autonomous Control Plane Architecture](./autonomous-control-plane-images/metal-stack-autonomous-control-plane-full.svg) - -The k3s nodes can either be bare metal machines or virtual machines. When using VMs a single k3s node might be a viable solution, too. These nodes are supposed to be setup manually / partly automated with an operating system like Debian. - -To name the cluster that hosts the initial metal-stack control plane and Gardener we use the term _initial cluster_. The initial cluster creates worker nodes to host the _target cluster_. - -## Initial Cluster - -The initial cluster is kept very small. The physical bare metal machines can be any machines and switches which are supported by metal-stack, but can be smaller in terms of cpu, memory and network speed because these machines must only be capable of running the target cluster for the metal-stack control plane. A typical single socket server with 8-16 cores and 64GB of RAM and two NVMe drives of 1TB would be a good starting point. - -In a typical k3s setup, a stateful set would lose the data once the k3s cluster was terminated and started again. But there is a possibility to define parts of the local storage of the server to be provided to the k3s cluster for the PVCs. With that, k3s could be terminated and started again, for example to update and reboot the host os, or update k3s itself and the data will persist. 
- -Example k3s configuration for persistent storage on the hosts os: - -```yaml -k3s: Cluster -apiVersion: k3s.x-k8s.io/v1alpha4 -name: needle-control-plane -nodes: - - role: control-plane - # add a mount from /path/to/my/files on the host to /files on the node - extraMounts: - - hostPath: /path/to/my/files - containerPath: /files -``` - -Into this cluster metal-stack and Gardener will be deployed. This deployment can be done by a Gitlab runner which is running on this machine. -The mini-lab will be used as a base for this deployment. The current development of [gardener-in-minilab](https://github.com/metal-stack/mini-lab/pull/202) must be extended to host all required extensions to make this a working metal-stack control plane which can manage the machines in the attached bare metal setup. - -In addition to the metal-stack and Gardener deployment, some additional required services are deployed (non-complete list): - -- PowerDNS to serve as a DNS Server for all DNS entries used in the initial and the target cluster, like `api.initial.metal-stack.local`, `gardener-api.initial.metal-stack.local` and the DNS entries for the api servers of the created kubernetes clusters. -- NTP -- Monitoring for the initial cluster and partition -- Optional: OIDC Server for authenticating against the metal-api -- Optional: Container Registry to host all metal-stack and gardener containers -- Optional: Let's Encrypt [boulder](https://github.com/letsencrypt/boulder) as a certificate authority -- ... - -Physical view, minimal setup for a initial cluster with a single physical node: - -![Small Initial Cluster](autonomous-control-plane-images/small-initial-cluster.svg) - -Physical View, bigger ha setup which is spread across two data centers: - -![HA Initial Cluster](autonomous-control-plane-images/ha-initial-cluster.svg) - -### Control Plane High Availability - -Running the initial control plane on a single physical server is not as available as it should be in such a use case. 
It should be possible to survive a loss of this server, because the server could be lost by many events, such as hardware failure, disk corruption or even failure of the datacenter location where this server is deployed. - -Setting up a second server with the same software components is an option, but the problem of data redundancy must be solved, because neither the gardener control plane, nor the metal-stack control plane can be instantiated twice. - -Given that we provide part of the local storage of the server as backing storage for the stateful sets in the k3s cluster, the data stored on the server itself must be replicated to another server and backed up on a regular basis. - -The replication of ETCD can be achieved through [clustered configuration](https://docs.k3s.io/datastore/ha-embedded) of k3s. Components of metal-stack and Gardener can run standalone and already utilize backup-restore mechanism that must be configured accordingly. For two or more bare metal machine used for the initial cluster, a loadbalancing mechanism for the ingress is required. kube-vip could be a possible solution. - -For monitoring a backend like a Victoria Metrics Cluster would allow spearding the monitoring data across the initial cluster nodes. These metrics should also be backed up in object storage. - -### Partition - -The partition which is managed by the initial cluster can be a simple and small hardware setup but yet capable enough to host the target cluster. It would even be a good practice to create separate target clusters on the initial cluster, e.g. one for the metal-stack control plane and one for the Gardener (maybe one more for monitoring). - -It can follow the metal-stack minimal setup which provides about 8-16 small servers connected to a 1G/s or 10G/s network dataplane. Central storage is optional as the persistence of the services running in these clusters is always backed up to a central object storage. 
Operations would be much easier if a central storage is provided. - -## Target Cluster - -The target cluster is the metal-stack environment which serves for end-user production use, the control plane is running in a shoot hosted in the initial cluster. The seed(s) and shoot(s) for end-users are created on the machines provided by the target cluster. -These machines can be of a different type in terms of size, but more importantly, these machines are connected to another network dataplane. Also the management infrastructure is separated from the initial cluster management network. - -## Failure Scenarios - -Everything could fail, everything will fail at some point. But this must kept in mind and nothing bad should happen if only one component at a time fails. -If more than one fails, the restoration to a working state must be easily possible and well documented. - -To ensure all possible breakages are documented, we suggest writing a list which summarizes all failure scenarios that might occur including the remediation. - -Here is an example of how a scenario documentation could look like: - -**Scenario**: Initial cluster is gone, all machines have died -**Impact**: Management of the initial cluster infrastructure not possible anymore, the target cluster continues to run but cannot be managed because the API servers are gone. end-users are not affected by this incident. -**Remediation**: The initial cluster nodes must be provisioned from scratch and re-deployed through the CI mechanism. The backups of the stateful sets are automatically restored during this process. 
- -## Implementation - -As part of this proposal, we provide the following tools and integrations in order to setup an autonomous control plane: - -- Deployment roles for the services like PowerDNS and NTP for the initial cluster -- Stretch goal: Deployment role to setup k3s in clustered configuration for the initial cluster and update it -- Extend the Gardener on mini-lab integration to allow shoot creation in the mini-lab -- Steady integration of the setup (maybe something like [k3d](https://github.com/k3d-io/k3d) in the mini-lab) - -## Appendix - -[^1]: ![metal-stack-chain](autonomous-control-plane-images/metal-stack-chain.svg) diff --git a/versioned_docs/version-v0.22.3/contributing/01-Proposals/MEP18/autonomous-control-plane-images/ha-initial-cluster.drawio b/versioned_docs/version-v0.22.3/contributing/01-Proposals/MEP18/autonomous-control-plane-images/ha-initial-cluster.drawio deleted file mode 100644 index eafcb514..00000000 --- a/versioned_docs/version-v0.22.3/contributing/01-Proposals/MEP18/autonomous-control-plane-images/ha-initial-cluster.drawio +++ /dev/null @@ -1,535 +0,0 @@ - - - - - - - - - - - - - - - - - - - 1 - - - - - 2 - - - - - 3 - - - - - 4 - - - - - 5 - - - - - 6 - - - - - 7 - - - - - 8 - - - - - 9 - - - - - 10 - - - - - 11 - - - - - 12 - - - - - - - - - - - - - -
-
-
- internet-router-management -
-
-
-
- - internet-router-management - -
-
-
- - - - - - - - - -
-
-
- management-switch-and-server -
-
-
-
- - management-switch-and-server - -
-
-
- - - - - - - - - -
-
-
- spine01 -
-
-
-
- - spine01 - -
-
-
- - - - - - - - - -
-
-
- leaf01 -
-
-
-
- - leaf01 - -
-
-
- - - - - - - - - -
-
-
- leaf02 -
-
-
-
- - leaf02 - -
-
-
- - - - - - - - - - - - - -
-
-
- - mirocloud (initial cluster partition nodes) - -
-
-
-
- - mirocloud (initial cluster... - -
-
-
- - - - - - - - - - - - - -
-
-
- - Initial cluster node 01 - -
-
-
-
- - Initial cluster node 01 - -
-
-
- - - - - - - - - - - - - 1 - - - - - 2 - - - - - 3 - - - - - 4 - - - - - 5 - - - - - 6 - - - - - 7 - - - - - 8 - - - - - 9 - - - - - 10 - - - - - 11 - - - - - 12 - - - - - - - - - - - - - -
-
-
- internet-router-management -
-
-
-
- - internet-router-management - -
-
-
- - - - - - - - - -
-
-
- management-switch-and-server -
-
-
-
- - management-switch-and-server - -
-
-
- - - - - - - - - -
-
-
- spine02 -
-
-
-
- - spine02 - -
-
-
- - - - - - - - - -
-
-
- leaf03 -
-
-
-
- - leaf03 - -
-
-
- - - - - - - - - -
-
-
- leaf04 -
-
-
-
- - leaf04 - -
-
-
- - - - - - - - - - - - - -
-
-
- - Initial cluster node 02 - -
-
-
-
- - Initial cluster node 02 - -
-
-
- - - - - - - - - - - - - -
-
-
- - Initial cluster node 03 - -
-
-
-
- - Initial cluster node 03 - -
-
-
- - - - - - - - - - - - - -
-
-
- - mirocloud (initial cluster partition nodes) - -
-
-
-
- - mirocloud (initial cluster... - -
-
-
-
- - - - - Text is not SVG - cannot display - - - -
diff --git a/versioned_docs/version-v0.22.3/contributing/01-Proposals/MEP18/autonomous-control-plane-images/ha-initial-cluster.svg b/versioned_docs/version-v0.22.3/contributing/01-Proposals/MEP18/autonomous-control-plane-images/ha-initial-cluster.svg deleted file mode 100644 index 99261ada..00000000 --- a/versioned_docs/version-v0.22.3/contributing/01-Proposals/MEP18/autonomous-control-plane-images/ha-initial-cluster.svg +++ /dev/null @@ -1 +0,0 @@ -123456789101112
internet-router-management
internet-router-management
management-switch-and-server
management-switch-and-server
spine01
spine01
leaf01
leaf01
leaf02
leaf02
mirocloud (initial cluster partition nodes)
mirocloud (initial cluster...
Initial cluster node 01
Initial cluster node 01
123456789101112
internet-router-management
internet-router-management
management-switch-and-server
management-switch-and-server
spine02
spine02
leaf03
leaf03
leaf04
leaf04
Initial cluster node 02
Initial cluster node 02
Initial cluster node 03
Initial cluster node 03
mirocloud (initial cluster partition nodes)
mirocloud (initial cluster...
Text is not SVG - cannot display
\ No newline at end of file diff --git a/versioned_docs/version-v0.22.3/contributing/01-Proposals/MEP18/autonomous-control-plane-images/metal-stack-autonomous-control-plane-full.drawio b/versioned_docs/version-v0.22.3/contributing/01-Proposals/MEP18/autonomous-control-plane-images/metal-stack-autonomous-control-plane-full.drawio deleted file mode 100644 index aae8a12d..00000000 --- a/versioned_docs/version-v0.22.3/contributing/01-Proposals/MEP18/autonomous-control-plane-images/metal-stack-autonomous-control-plane-full.drawio +++ /dev/null @@ -1,1133 +0,0 @@ - - - - - - - - - - - - - - - - - - - -
-
-
- Initial Cluster -
-
-
-
- - Initial Cluster - -
-
-
- - - - - - - - - - - - - - -
-
-
- metal-roles -
-
-
-
- - metal-roles - -
-
-
- - - - - - - -
-
-
- CI -
-
-
-
- - CI - -
-
-
- - - - - - - -
-
-
- K3s Standalone - - - (on Debian) - - -
-
-
-
- - K3s Standalone (on Debian) - -
-
-
- - - - - - - - - - - - - - - - - -
-
-
- Initial Partition -
-
-
-
- - Initial Partition - -
-
-
- - - - - - - - - - - - - -
-
-
- Target Cluster for metal-stack -
-
-
-
- - Target Cluster for metal-stack - -
-
-
- - - - - - - - - - - - - -
-
-
- Metal Control Plane -
-
-
-
- - Metal Control Plane - -
-
-
- - - - - - - - - - - -
-
-
- provisions -
-
-
-
- - provisions - -
-
-
- - - - - - - - - - - - - -
-
-
- Target Cluster for Gardener -
-
-
-
- - Target Cluster for Gardener - -
-
-
- - - - - - - - - - -
-
-
- Gardener Control Plane -
-
-
-
- - Gardener Control Plane - -
-
-
- - - - - - - - - - - - - - - - - -
-
-
- Monitoring -
-
-
-
- - Monitoring - -
-
-
- - - - - - - - - - - - - - - - -
-
-
- Target Partition -
-
-
-
- - Target Partition - -
-
-
- - - - - - - - - - -
-
-
- Gardener Seeds and End-User Shoots -
-
-
-
- - Gardener Seeds and End-User Shoots - -
-
-
- - - - - - - - - - - -
-
-
- provisions -
-
-
-
- - provisions - -
-
-
- - - - - - - - -
-
-
- metal-roles -
-
-
-
- - metal-roles - -
-
-
- - - - - - - -
-
-
- CI -
-
-
-
- - CI - -
-
-
- - - - - - - - -
-
-
- metal-roles -
-
-
-
- - metal-roles - -
-
-
- - - - - - - - - - -
-
-
- ETCD can be clustered or standalone, backed up by sidecar -
-
-
-
- - ETCD can be clustere... - -
-
-
- - - - - - - - - - -
-
-
- This data will get lost in case local PV gets deleted -
-
-
-
- - This data will get l... - -
-
-
- - - - - - - - - - -
-
-
- We can work with local PVs here, too. -
- backup-restore-sidecar for metal-stack databases, for big ones Postgres clustered. -
-
-
-
- - We can work with local PVs he... - -
-
-
- - - - - - - -
-
-
- ETCD will be deployed in HA configuration on local PVs. -
-
- csi-driver-lvm needs to implement auto deletion of orphaned PVs. -
-
- Seed metrics get lost, but they report to the monitoring in the Metal Control Plane Shoot. -
-
-
-
- - ETCD will be deployed in HA c... - -
-
-
- - - - - - - - - - -
-
-
- More sophisticated storage solutions can be in place. -
-
- (Lightbits, NetApp, ...) -
-
-
-
- - More sophisticated storage so... - -
-
-
- - - - - - - - - - -
-
-
- TODO: Evaluate how to persist these metrics. -
-
-
-
- - TODO: Evaluate how to persist... - -
-
-
- - - - - - - - - - -
-
-
- - 1 VM or -
-
-
- - - 3 Bare Metal Machines - - -
-
-
-
-
- - 1 VM or... - -
-
-
- - - - - - - - - - - - - - -
-
-
- metal-stack -
-
-
-
- - metal-stack - -
-
-
- - - - - - - -
-
-
- metal-api -
-
-
-
- - metal-api - -
-
-
- - - - - - - -
-
-
- metal-db -
-
-
-
- - metal-db - -
-
-
- - - - - - - -
-
-
- ipam-db -
-
-
-
- - ipam-db - -
-
-
- - - - - - - -
-
-
- masterdata-db -
-
-
-
- - masterdata-db - -
-
-
- - - - - - - -
-
-
- headscale-db -
-
-
-
- - headscale-db - -
-
-
- - - - - - - -
-
-
- auditing-db -
-
-
-
- - auditing-db - -
-
-
- - - - - - - -
-
-
- nsqd -
-
-
-
- - nsqd - -
-
-
- - - - - - - - - - - -
-
-
- Gardener -
-
-
-
- - Gardener - -
-
-
- - - - - - - - - - -
-
-
- Virtual Garden -
-
-
-
- - Virtual Garden - -
-
-
- - - - - - - -
-
-
- Gardener Control Plane -
-
-
-
- - Gardener Control Plane - -
-
-
- - - - - - - -
-
-
- gardenlet -
-
-
-
- - gardenlet - -
-
-
- - - - - - - -
-
-
- Garden etcd -
-
-
-
- - Garden etcd - -
-
-
- - - - - - - -
-
-
- Prometheus -
-
-
-
- - Prometheus - -
-
-
- - - - - - - - - - - -
-
-
- Monitoring -
-
-
-
- - Monitoring - -
-
-
- - - - - - - - - - -
-
-
- - Gitlab - -
- - Runner - -
-
-
-
-
- - Gitlab... - -
-
-
- - - - - - - - - - -
-
-
- Services -
-
-
-
- - Services - -
-
-
- - - - - - - -
-
-
- PowerDNS -
-
-
-
- - PowerDNS - -
-
-
- - - - - - - -
-
-
- boulder -
-
-
-
- - boulder - -
-
-
- - - - - - - -
-
-
- NTP -
-
-
-
- - NTP - -
-
-
- - - - - - - -
-
-
- OIDC -
-
-
-
- - OIDC - -
-
-
- - - - - - - -
-
-
- ... -
-
-
-
- - ... - -
-
-
-
- - - - - Text is not SVG - cannot display - - - -
diff --git a/versioned_docs/version-v0.22.3/contributing/01-Proposals/MEP18/autonomous-control-plane-images/metal-stack-autonomous-control-plane-full.svg b/versioned_docs/version-v0.22.3/contributing/01-Proposals/MEP18/autonomous-control-plane-images/metal-stack-autonomous-control-plane-full.svg deleted file mode 100644 index e58e783b..00000000 --- a/versioned_docs/version-v0.22.3/contributing/01-Proposals/MEP18/autonomous-control-plane-images/metal-stack-autonomous-control-plane-full.svg +++ /dev/null @@ -1 +0,0 @@ -
Initial Cluster
Initial Cluster
metal-roles
metal-roles
CI
CI
K3s Standalone(on Debian)
K3s Standalone (on Debian)
Initial Partition
Initial Partition
Target Cluster for metal-stack
Target Cluster for metal-stack
Metal Control Plane
Metal Control Plane
provisions
provisions
Target Cluster for Gardener
Target Cluster for Gardener
Gardener Control Plane
Gardener Control Plane
Monitoring
Monitoring
Target Partition
Target Partition
Gardener Seeds and End-User Shoots
Gardener Seeds and End-User Shoots
provisions
provisions
metal-roles
metal-roles
CI
CI
metal-roles
metal-roles
ETCD can be clustered or standalone, backed up by sidecar
ETCD can be clustere...
This data will get lost in case local PV gets deleted
This data will get l...
We can work with local PVs here, too.
backup-restore-sidecar for metal-stack databases, for big ones Postgres clustered.
We can work with local PVs he...
ETCD will be deployed in HA configuration on local PVs.

csi-driver-lvm needs to implement auto deletion of orphaned PVs.

Seed metrics get lost, but they report to the monitoring in the Metal Control Plane Shoot.
ETCD will be deployed in HA c...
More sophisticated storage solutions can be in place.

(Lightbits, NetApp, ...)
More sophisticated storage so...
TODO: Evaluate how to persist these metrics.
TODO: Evaluate how to persist...
1 VM or
3 Bare Metal Machines
1 VM or...
metal-stack
metal-stack
metal-api
metal-api
metal-db
metal-db
ipam-db
ipam-db
masterdata-db
masterdata-db
headscale-db
headscale-db
auditing-db
auditing-db
nsqd
nsqd
Gardener
Gardener
Virtual Garden
Virtual Garden
Gardener Control Plane
Gardener Control Plane
gardenlet
gardenlet
Garden etcd
Garden etcd
Prometheus
Prometheus
Monitoring
Monitoring
Gitlab
Runner
Gitlab...
Services
Services
PowerDNS
PowerDNS
boulder
boulder
NTP
NTP
OIDC
OIDC
...
...
Text is not SVG - cannot display
\ No newline at end of file diff --git a/versioned_docs/version-v0.22.3/contributing/01-Proposals/MEP18/autonomous-control-plane-images/metal-stack-chain.drawio b/versioned_docs/version-v0.22.3/contributing/01-Proposals/MEP18/autonomous-control-plane-images/metal-stack-chain.drawio deleted file mode 100644 index cd5cf007..00000000 --- a/versioned_docs/version-v0.22.3/contributing/01-Proposals/MEP18/autonomous-control-plane-images/metal-stack-chain.drawio +++ /dev/null @@ -1,404 +0,0 @@ - - - - - - - - - - -
-
-
- Partition 1 -
-
-
-
- - Partition 1 - -
-
- - - - -
-
-
- seeds -
-
-
-
- - seeds - -
-
- - - - -
-
-
- shoots -
-
-
-
- - shoots - -
-
- - - - - - -
-
-
- Partition 2 -
-
-
-
- - Partition 2 - -
-
- - - - -
-
-
- seeds -
-
-
-
- - seeds - -
-
- - - - -
-
-
- shoots -
-
-
-
- - shoots - -
-
- - - - - - -
-
-
- Partition 3 -
-
-
-
- - Partition 3 - -
-
- - - - -
-
-
- seeds -
-
-
-
- - seeds - -
-
- - - - -
-
-
- shoots -
-
-
-
- - shoots - -
-
- - - - - - -
-
-
- Production Control Plane -
-
-
-
- - Production Control Plane - -
-
- - - - -
-
-
- metal-stack -
- kubernetes cluster -
-
-
-
- - metal-stack... - -
-
- - - - -
-
-
- gardener -
- kubernetes cluster -
-
-
-
- - gardener... - -
-
- - - - -
-
-
- - Manages - -
-
-
-
- - Manages - -
-
- - - - - - - - -
-
-
- Control Plane Partition -
-
-
-
- - Control Plane Partition - -
-
- - - - - -
-
-
- backup of stateful sets -
-
-
-
- - backup of stateful sets - -
-
- - - - - - -
-
-
- bare metal machine -
-
-
-
- - bare metal machine - -
-
- - - - -
-
-
- metal-stack -
- and -
- gardener -
- kubernetes cluster -
- running in kind -
-
-
-
- - metal-stack... - -
-
- - - - -
-
-
- - Manages - -
-
-
-
- - Manages - -
-
- - - - - -
-
-
- S3 -
-
-
-
- - S3 - -
-
- - - - -
-
-
- Needle -
-
-
-
- - Needle - -
-
- - - -
-
-
- - Nail - -
-
-
-
- - Nail - -
-
-
- - - - - Text is not SVG - cannot display - - - -
\ No newline at end of file diff --git a/versioned_docs/version-v0.22.3/contributing/01-Proposals/MEP18/autonomous-control-plane-images/metal-stack-chain.svg b/versioned_docs/version-v0.22.3/contributing/01-Proposals/MEP18/autonomous-control-plane-images/metal-stack-chain.svg deleted file mode 100644 index 8f88ba14..00000000 --- a/versioned_docs/version-v0.22.3/contributing/01-Proposals/MEP18/autonomous-control-plane-images/metal-stack-chain.svg +++ /dev/null @@ -1 +0,0 @@ -
Partition 1
Partition 1
seeds
seeds
shoots
shoots
Partition 2
Partition 2
seeds
seeds
shoots
shoots
Partition 3
Partition 3
seeds
seeds
shoots
shoots
Production Control Plane
Production Control Plane
metal-stack
kubernetes cluster
metal-stack...
gardener
kubernetes cluster
gardener...
Manages
Manages
Control Plane Partition
Control Plane Partition
backup of stateful sets
backup of stateful sets
bare metal machine
bare metal machine
metal-stack
and
gardener
kubernetes cluster
running in kind
metal-stack...
Manages
Manages
S3
S3
Needle
Needle 
Nail
Nail
Text is not SVG - cannot display
\ No newline at end of file diff --git a/versioned_docs/version-v0.22.3/contributing/01-Proposals/MEP18/autonomous-control-plane-images/small-initial-cluster.drawio b/versioned_docs/version-v0.22.3/contributing/01-Proposals/MEP18/autonomous-control-plane-images/small-initial-cluster.drawio deleted file mode 100644 index a75ee340..00000000 --- a/versioned_docs/version-v0.22.3/contributing/01-Proposals/MEP18/autonomous-control-plane-images/small-initial-cluster.drawio +++ /dev/null @@ -1,234 +0,0 @@ - - - - - - - - - - - - - - - - - - - 1 - - - - - 2 - - - - - 3 - - - - - 4 - - - - - 5 - - - - - 6 - - - - - 7 - - - - - 8 - - - - - 9 - - - - - 10 - - - - - 11 - - - - - 12 - - - - - - - - - - - - - -
-
-
- internet-router-management -
-
-
-
- - internet-router-management - -
-
-
- - - - - - - - - -
-
-
- management-switch-and-server -
-
-
-
- - management-switch-and-server - -
-
-
- - - - - - - - - -
-
-
- leaf01 -
-
-
-
- - leaf01 - -
-
-
- - - - - - - - - -
-
-
- leaf02 -
-
-
-
- - leaf02 - -
-
-
- - - - - - - - - - - - - -
-
-
- Initial cluster node -
-
-
-
- - Initial cluster node - -
-
-
- - - - - - - - - - - - - -
-
-
- mirocloud (initial cluster partition nodes) -
-
-
-
- - mirocloud (initial cluster... - -
-
-
-
- - - - - Text is not SVG - cannot display - - - -
diff --git a/versioned_docs/version-v0.22.3/contributing/01-Proposals/MEP18/autonomous-control-plane-images/small-initial-cluster.svg b/versioned_docs/version-v0.22.3/contributing/01-Proposals/MEP18/autonomous-control-plane-images/small-initial-cluster.svg deleted file mode 100644 index a9d29f05..00000000 --- a/versioned_docs/version-v0.22.3/contributing/01-Proposals/MEP18/autonomous-control-plane-images/small-initial-cluster.svg +++ /dev/null @@ -1 +0,0 @@ -123456789101112
internet-router-management
internet-router-management
management-switch-and-server
management-switch-and-server
leaf01
leaf01
leaf02
leaf02
Initial cluster node
Initial cluster node
mirocloud (initial cluster partition nodes)
mirocloud (initial cluster...
Text is not SVG - cannot display
\ No newline at end of file diff --git a/versioned_docs/version-v0.22.3/contributing/01-Proposals/MEP2/README.md b/versioned_docs/version-v0.22.3/contributing/01-Proposals/MEP2/README.md deleted file mode 100644 index c7f2360a..00000000 --- a/versioned_docs/version-v0.22.3/contributing/01-Proposals/MEP2/README.md +++ /dev/null @@ -1,7 +0,0 @@ ---- -slug: /MEP-2-two-factor-authentication -title: MEP-2 -sidebar_position: 2 ---- - -# Two Factor Authentication diff --git a/versioned_docs/version-v0.22.3/contributing/01-Proposals/MEP3/README.md b/versioned_docs/version-v0.22.3/contributing/01-Proposals/MEP3/README.md deleted file mode 100644 index 5ce36721..00000000 --- a/versioned_docs/version-v0.22.3/contributing/01-Proposals/MEP3/README.md +++ /dev/null @@ -1,67 +0,0 @@ ---- -slug: /MEP-3-machine-re-installation -title: MEP-3 -sidebar_position: 3 ---- - -# Machine Re-Installation - -In the current metal-api only machine installations are possible, performing a machine upgrade is only possible by creating a new machine and delete the old one. -This has the drawback that in case a lot of data is stored on the local disks, a full restore of the original data must be performed. - -To prevent this, we will introduce a new metal-api endpoint to reinstall the machine with a new image, _without_ actually deleting the data stored on the additional hard disks. - -Storage is a difficult task to get right and reliable. A short analysis of our different storage requirements lead to 3 different scenarios. - -- Storage for the etcd pvs in the seed cluster of every partition. - This is the most important storage in our setup because these etcd pods serve as configuration backend for all customer kubernetes clusters. If they fail, the cluster is down. 
However gardener deploys a backup and restore sidecar into the etcd pod of every customer kubernetes control plane, and if this sidecar detects a corrupt or missing etcd database file(s) it starts automatic restore from the configured backup location. This will take some minutes. If for example a node dies, and gardener creates a new node instead, the csi-lvm created pv is not present on that node. Kubernetes will not schedule the missing etcd pod on this node because it has a local PV configured and is therefore tainted to run only on that node. To let kubernetes create that pod anyhow, someone has to either remove the taint, or delete the pod. If this is done, the pod starts and the restore of the etcd data can start as well. You can see this is a bit too complicated and will take the customer cluster down for a while (not measured yet but in the range of 5-10 minutes). -- Storage in customer clusters. - This was not promised in 2020. We have a intermediate solution with the provisioning of csi-lvm by default into all customer clusters. Albeit this is only local storage and will get deleted if a node dies. -- S3 Storage. - We have two possibilities to cope with storage: - - In place update of the OS with a daemonset - This will be fast and simple, but might fail because the packages being installed are broken right now, or a filesystem gets full, or any other failure you can think of during a os update. Another drawback is that metal-api does not reflect the updated os image. - - metal-api get a machine reinstall endpoint - With this approach we leverage from existing and already proven mechanisms. Reinstall must keep all data except the sata-dom. Gardener currently is not able to do an update with this approach because it can only do `rolling` updates. Therefore a additional `osupdatestrategy` has to be implemented for metal and other providers in gardener to be able to leverage the metal reinstall on the same machineID approach. 
- -If reinstall is implemented, we should focus on the same technology for all scenarios and put ceph via rook.io into the kubernetes clusters as additional StorageClass. It has to be checked whether to use the raw disk or a PV as the underlay block device where ceph stores its data. - -## API and behavior - -The API will get an new endpoint "reinstall" this endpoint takes two arguments: - -- machineID -- image - -No other aspects of the machine can be modified during the re-installation. All data stored in the existing allocation will be preserved, only the image will be modified. -Once this endpoint was called, the machine will get a `reboot` signal with the boot order set to PXE instead of HDD and the network interfaces on the leaf are set to PXE as well. Then the normal installation process starts: - -- unchanged: PXE boot with metal-hammer -- changed: metal-hammer first checks with the machineID in the metal-api (through metal-core) if there is already a allocation present -- changed: if a allocation is present and the allocation has set `reinstall: true`, wipe disk is only executed for the root disk, all other disks are untouched. -- unchanged: the specified image is downloaded and burned, `/install.sh` is executed -- unchanged: successful installation is reported back, network is set the the vrf, boot order is set to HDD. -- unchanged: distribution kernel is booted via kexec - -We can see that the `allocation` requires one additional parameter: `reinstall` and metal-hammer must check for already existing allocation at an earlier stage. - -Components which requires modifications (first guess): - -- metal-hammer: - - check for allocation present earlier - - evaluation of `reinstall` flag set - - wipe of disks depends on that flag - - Bonus: move configuration of disk layout and primary disk detection algorithm (PDDA) from metal-hammer into metal-api. - metal-api **MUST** reject reinstallation if the disk found by PDDA does not have the `/etc/metal` directory! 
-- metal-core: - - probably nothing -- metal-api: - - new endpoint `/machine/reinstall` - - add `Reinstall bool` to data model of `allocation` - - make sure to reset `Reinstall` after reinstallation to prevent endless reinstallation loop -- metalctl: - - implement `reinstall` -- metal-go: - - implement `reinstall` -- gardener (longterm): - - add the `OSUpgradeStrategy` `reinstall` diff --git a/versioned_docs/version-v0.22.3/contributing/01-Proposals/MEP4/README.md b/versioned_docs/version-v0.22.3/contributing/01-Proposals/MEP4/README.md deleted file mode 100644 index 389a02d4..00000000 --- a/versioned_docs/version-v0.22.3/contributing/01-Proposals/MEP4/README.md +++ /dev/null @@ -1,211 +0,0 @@ ---- -slug: /MEP-4-multi-tenancy-for-the-metal-api -title: MEP-4 -sidebar_position: 4 ---- - -# Multi-Tenancy for the metal-api -:::info -This document is work in progress. -::: - -In the past we decided to treat the metal-api as a "low-level API", i.e. the API does not specifically deal with projects and tenants. A user with editor access can for example assign machines to every project he desires, he can see all the machines available and can control them. We tried to keep the metal-api code base as small as possible and we added resource scoping to a "higher-level APIs". From there, a user would be able to only see his own clusters and IP addresses. - -As time passed metal-stack has become an open-source project and people are willing to adopt. Adopters who want to put their own technologies on top of the metal-stack infrastructure don't have those "higher-level APIs" that we implemented closed-source for our user base. So, external adopters most likely need to implement resource scoping on their own. 
- -Introducing multi-tenancy to the metal-api is a serious opportunity to make our product better and more successful as it opens the door for: - -- Becoming a "fully-featured" API -- Narrowing down attack surfaces and possibility of unintended resource modification produced by bugs or human errors -- Discouraging people from implementing their own scoping layers in front of the metal-stack -- Gaining performance through resource scopes -- Letting untrusted / third-parties work with the API - -## Requirements - -These are some general requirements / higher objectives that MEP-4 has to fulfill. - -- Should be able to run with mini-lab without requiring the setup of complex auth backends (dex, LDAP, keycloak, ...) - - Simple to start with, more complex options for production setups -- Fine-grained access permissions (every endpoint maps to a permission) -- Tenant scoping (disallow resource access to resources of other tenants) -- Project scoping (disallow resource access to resources of other projects) -- Access tokens in self-service for technical user access - -## Implementation - -We gathered a lot of knowledge while implementing a multi-tenancy-capable backend for metalstack.cloud. The goal is now to use the same technology and adapt that to the metal-api, this includes: - -- gRPC in combination with connectrpc -- OPA for making auth decisions -- REST HTTP only for OIDC login flows - -### API Definitions - -The API definitions should be located on a Github repository separate from the server implementation. The proposed repository location is: https://github.com/metal-stack/api. - -This repository contains the `proto3` specification of the exposed metal-stack api. This includes the messages, simple validations, services and the access permission to these services. The input parameters for the authorization in the backend are generated from the `proto3` annotations. - -Client implementations for the most relevant languages (go, python) are generated automatically. 
- -This api is divided into end-user and admin access at the top level. The proposed APIs are: - -- `metalstack.api.v2`: For end-user facing services -- `metalstack.admin.v2`: For operators and controllers which need access to unscoped entities - -The methods of the API can have different role scopes (and can be narrowed down further with fine-grained method permissions): - -- `tenant`: Tenant-scoped methods, e.g. project creation (tenant needs to be provided in the request payload) - - Available roles: VIEWER, EDITOR, OWNER -- `project`: Project-scoped methods, e.g. machine creation (project needs to be provided in the request payload) - - Available roles: VIEWER, EDITOR, OWNER -- `admin`: Admin-scoped methods, e.g. unscoped tenant list or switch register - - Available roles: VIEWER, EDITOR - -And has methods with different visibility scopes: - -- `self`: Methods that only the logged in user can access, e.g. show permissions with the presented token -- `public`: Methods that do not require any specific authorization - -### API - -The API server implements the services defined in the API and validates access to a method using OPA with the JWT tokens passed in the requests. The server is implemented using the connectrpc.com framework. - -The API server implements the login flow through OIDC. After successful authentication, the API server derives user permissions from the OIDC provider and issues a new JWT token which is passed on to the user. The tokens including the permissions are stored in a redis compatible backend. - -With these tokens, users can create Access Tokens for CI/CD or other use cases. - -JWT Tokens can be revoked by admins and the user itself. - -### API Server - -Is put into a new github repo which implements the services defined in the `api` repository. It opens an `https` endpoint where the grpc (via connectrpc.com) and oidc services are exposed. 
- -### Migration of the Consumers - -To allow consumers to migrate to the `v2` API gradually, both apis, the new and the old, are deployed in parallel. In the control-plane both apis are deployed side-by-side behind the ingress. `api.example.com` is forwarded to `metal-api` and `metal.example.com` is forwarded to the new `metal-apiserver`. - -The api-server will talk to the existing metal-api during the process of migrating services away to the new grpc api. - -The migration process can be done in the following manner: - -for each resource in the metal-api: - -- create a new proto3 based definition in the `api` repo. -- implement the business logic per service in the new `metal-apiserver` without calling the metal-api. -- clients must be able to talk to `v1` and `v2` backend in parallel -- Deprecate the already migrated service in the swagger route to notify the client that this route should not be used anymore. -- identify all consumers of this resource and replace them to use the grpc instead of the rest api -- move the business logic incl. the backend calls to ipam, metal-db, masterdata-api, nsq for this resource from the metal-api to the `metal-apiserver` - -We will migrate the rethinkdb backend implementation to a generic approach during this effort. - -- Try to enhance the generic rethinkdb interface with `project` scoped methods. - -There are a lot of consumers of metal-api, which need to be migrated: - -- ansible -- firewall-controller -- firewall-controller-manager -- gardener-extension-auth -- gardener-extension-provider-metal - - Do not point the secret bindings to the shared provider secret in the seed anymore. Instead, use individual provider-secret containing project-scoped API access tokens in the Gardener project namespaces. 
-- machine-controller-manager-provider-metal -- metal-ccm -- metal-console -- metal-bmc -- metal-core -- metal-hammer -- metal-image-cache-sync -- metal-images -- metal-metrics-exporter -- metal-networker -- metalctl -- pixie - -## User Scenarios - -This section gathers a collection of workflows from the perspective of a user that we want to provide with the implementation of this proposal. - -### Machine Creation - -A regular user wants to create a machine resource. - -Requirements: Project was created, permissions are present - -- The user can see networks that were provided by the admin. - - ``` - $ metalctl network ls - ID NAME PROJECT PARTITION NAT SHARED PREFIXES IPS - internet Internet Network true false 212.34.83.0/27  ● - tenant-super-network-fra-equ01 Project Super Network fra-equ01 false false 10.128.0.0/14  ● - underlay-fra-equ01 Underlay Network fra-equ01 false false 10.0.0.0/16  ● - ``` - -- The user has to set the project scope first or provide `--project` flags for all commands. - ``` - $ metalctl project set 793bb6cd-8b46-479d-9209-0fedca428fe1 - You are now acting on project 793bb6cd-8b46-479d-9209-0fedca428fe1. - ``` -- The user can create the child network required for machine allocation. - ``` - $ metalctl network allocate --partition fra-equ01 --name test - ``` -- Now, the user sees his own child network. - ``` - $ metalctl network ls - ID NAME PROJECT PARTITION NAT SHARED PREFIXES IPS - internet Internet Network true false 212.34.83.0/27  ● - tenant-super-network-fra-equ01 Project Super Network fra-equ01 false false 10.128.0.0/14  ● - └─╴08b9114b-ec47-4697-b402-a11421788dc6 test 793bb6cd-8b46-479d-9209-0fedca428fe1 fra-equ01 false false 10.128.64.0/22  ● - underlay-fra-equ01 Underlay Network fra-equ01 false false 10.0.0.0/16  ● - ``` -- The user does not see any machines yet. - ``` - $ metalctl machine ls - ``` -- The user can create a machine. 
- ``` - $ metalctl machine create --networks internet,08b9114b-ec47-4697-b402-a11421788dc6 --name test --hostname test --image ubuntu-20.04 --partition fra-equ01 --size c1-xlarge-x86` - ``` -- The machine will now be provisioned. - ``` - $ metalctl machine ls - ID LAST EVENT WHEN AGE HOSTNAME PROJECT SIZE IMAGE PARTITION - 00000000-0000-0000-0000-ac1f6b7befb2 Phoned Home 20s 50d 4h test 793bb6cd-8b46-479d-9209-0fedca428fe1 c1-xlarge-x86 Ubuntu 20.04 20210415 fra-equ01 - ``` - -:::warning -A user **cannot** list all allocated machines for all projects. The user **must** always switch project context first and can only view the machines inside this project. Only admins can see all machines at once. -::: -### Scopes for Resources - -The admins / operators of the metal-stack should be able to provide _global_ resources that users are able to use along with their own resources. In particular, users can view and use _global_ resources, but they are not allowed to create, modify or delete them. - -:::info -When a project ID field is empty on a resource, the resource is considered _global_. -::: - -Where possible, users should be capable of creating their own resource entities. - -| Resource | User | Global | -| :----------------- | :--- | :----- | -| File System Layout | yes | yes | -| Firewall | yes | | -| Firmware | | yes | -| OS Image | | yes | -| Machine | yes | | -| Network (Base) | | yes | -| Network (Children) | yes | | -| IP | yes | | -| Partition | | yes | -| Project | yes | | -| Project Token | yes | | -| Size | | yes | -| Switch | | | -| Tenant | | yes | - -:::info -Example: A user can make use of the file system layouts provided by the admins, but can also create own layouts. Same applies for images. As soon as a user creates own resources, the user takes over the responsibility for the machine provisioning to succeed. 
-::: diff --git a/versioned_docs/version-v0.22.3/contributing/01-Proposals/MEP5/README.md b/versioned_docs/version-v0.22.3/contributing/01-Proposals/MEP5/README.md deleted file mode 100644 index 3b7fc45c..00000000 --- a/versioned_docs/version-v0.22.3/contributing/01-Proposals/MEP5/README.md +++ /dev/null @@ -1,54 +0,0 @@ ---- -slug: /MEP-5-shared-networks -title: MEP-5 -sidebar_position: 5 ---- - -# Shared Networks - -## Why are shared networks needed - -For special purpose machines that serve shared services with performance critical workloads to all machines of a partition (like persistent storage) it would be good to have kind of a "shared network" that is easily accessible. -They do not necessarily need another firewall. This would avoid having two firewalls in the datapath between a machine in a private network and the machines of a shared service. - -## Constraints that need to hold - -- a shared network is usable from all machines that have a firewall in front, that uses it -- a shared network is only usable within a single partition (currently we are constrained in bandwidth and have no routing of 10.0.0.0/8 addresses btw. 
partitions and failure domain should be the partition but this constraint might get lifted in the future) - -networks may be marked as shared after network allocation (but there should be no way back from shared to unshared) - -neither machines nor firewalls may have multiple private, unshared networks configured - -machines must have a single primary network configured - - this might be a shared network - - OR a plain, unshared private network - -firewalls may participate in multiple shared networks - -machines can be allocated with a primary network using auto IP allocation or with `noauto` and a specific IP - -## Should shared networks be private - -**Alternative 1:** If we implemented shared networks by extending functions around plain, private networks we would not have to manage another CIDR (mini point) and it would be possible to create a k8s cluster with a private network, mark the network as `shared` and produce shared services from this k8s cluster. - -**Alternative 2:** If shared networks are implemented as first class networks we could customize the VRF and also accomplish another goal of our roadmap: being able to create machines directly in an external network. - -Together with @majst01 and @Gerrit91 we decided to continue to implement **Alternative 1**. - -## Firewalls accessing a shared network - -Firewalls that access shared networks need to: - -- hide the private network behind an ip address of the shared network if the shared network was configured with `nat=true`. -- import the prefixes of the shared VRF to the private VRF and import the prefixes of the private VRF to the shared VRF so that the communication between the two is working in both directions. As long as no `nat=true` was set on the shared VRF, the original machine ips are visible in both communication directions. 
- -## Setup with shared networks and single consumer - -![Simple Setup](./shared.png) - -## Setup with single shared network and multiple consumers - -![Advanced Setup](./shared_advanced.png) - -## Getting internet access - -Machines contained in a shared network can access the internet with different scenarios: - -- if they have an own firewall: this is internet accessibility, as common (check whether all traffic gets routed through it!) -- if they don't have an own firewall, an external HTTP proxy is needed that has an endpoint exposed as Service Type NodePort diff --git a/versioned_docs/version-v0.22.3/contributing/01-Proposals/MEP5/shared.drawio b/versioned_docs/version-v0.22.3/contributing/01-Proposals/MEP5/shared.drawio deleted file mode 100644 index aa7af045..00000000 --- a/versioned_docs/version-v0.22.3/contributing/01-Proposals/MEP5/shared.drawio +++ /dev/null @@ -1,121 +0,0 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - \ No newline at end of file diff --git a/versioned_docs/version-v0.22.3/contributing/01-Proposals/MEP5/shared.png b/versioned_docs/version-v0.22.3/contributing/01-Proposals/MEP5/shared.png deleted file mode 100644 index b0b47f03..00000000 Binary files a/versioned_docs/version-v0.22.3/contributing/01-Proposals/MEP5/shared.png and /dev/null differ diff --git a/versioned_docs/version-v0.22.3/contributing/01-Proposals/MEP5/shared_advanced.drawio b/versioned_docs/version-v0.22.3/contributing/01-Proposals/MEP5/shared_advanced.drawio deleted file mode 100644 index 6f96eca0..00000000 --- a/versioned_docs/version-v0.22.3/contributing/01-Proposals/MEP5/shared_advanced.drawio +++ /dev/null @@ -1,187 +0,0 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - \ No newline at end of file diff --git a/versioned_docs/version-v0.22.3/contributing/01-Proposals/MEP5/shared_advanced.png b/versioned_docs/version-v0.22.3/contributing/01-Proposals/MEP5/shared_advanced.png deleted file mode 100644 index da989915..00000000 Binary files a/versioned_docs/version-v0.22.3/contributing/01-Proposals/MEP5/shared_advanced.png and /dev/null differ diff --git a/versioned_docs/version-v0.22.3/contributing/01-Proposals/MEP6/README.md b/versioned_docs/version-v0.22.3/contributing/01-Proposals/MEP6/README.md deleted file mode 100644 index edf52a6c..00000000 --- a/versioned_docs/version-v0.22.3/contributing/01-Proposals/MEP6/README.md +++ /dev/null @@ -1,123 +0,0 @@ ---- -slug: /MEP-6-dmz-networks -title: MEP-6 -sidebar_position: 6 ---- - -# DMZ Networks - -## Reasoning - -To fulfill higher levels of security measures the standard metal-stack approach with a single firewall in front of a set of machines might be insufficient. -There are cases where two physically distinct firewalls in front of application workload are mandatory. In traditional network terms this is known as DMZ approach. - -For Kubernetes workloads it makes sense to use the front cluster for ingress, WAF purposes and as outgoing proxy. The clusters may be used for application workload. - -## DMZ network - -- Use a separate DMZ network prefix for every tenant -- This is used as intermediate network btw. 
private networks of a tenant and the internet -- For every partition a distinct DMZ firewall/cluster is needed for a tenant -- For Gardener orchestrated Kubernetes clusters this network must be a publicly reachable internet prefix because shoot clusters need a vpn service that is used for instrumentation from the seed cluster - this will be a requirement as long as the inverse vpn tunnel feature Konnectivity is not available to us. - -## Approach 1: DMZ with publicly reachable internet prefix - -![DMZ Internet](dmz-internet_public.svg) - -A DMZ network with publicly reachable internet prefix will look like this in the metal-api: - -```yaml ---- -description: DMZ-Network -destinationprefixes: - - 0.0.0.0/0 -id: dmz -labels: - network.metal-stack.io/default-external: "" -name: DMZ-Network -parentnetworkid: null -partitionid: "" -prefixes: - - 212.90.30.128/25 -privatesuper: false -projectid: "" -vrf: 104007 -vrfshared: false -nat: true -shared: false -underlay: false -``` - -### DMZ firewall - -The firewall of the DMZ will intersect its private network for attached machines, the DMZ network and the public internet. - -- The private network of the project needs to import - - the default route from the internet network - - the DMZ network -- The internet network must import the DMZ network -- The DMZ network provides the default route for tenant's clusters in a partition. It imports the default route from the internet network - -### Application Firewall - -The firewall of application workloads intersects its private network for attached machines and the DMZ network. - -This is currently supported by the metal-networker and needs no further changes! 
- -## Approach 2: DMZ with private IPs - -![DMZ Internet](dmz-internet_private.svg) - -A DMZ network with private IPs will look like this in the metal-api: - -```yaml ---- -description: DMZ-Network -destinationprefixes: - - 0.0.0.0/0 -id: dmz -labels: - network.metal-stack.io/default-external: "" -name: DMZ-Network -parentnetworkid: tenant-super-network-fra-equ01 -partitionid: fra-equ01 -prefixes: - - 10.90.30.128/25 -privatesuper: false -projectid: "" -vrf: 4711 -vrfshared: false -nat: true -shared: true # it's usable from multiple projects -underlay: false -``` - -### DMZ firewall - -The firewall of the DMZ will intersect its private network for attached machines, the DMZ network and the public internet. - -- The private network of the project needs to import - - the default route from the internet network - - the DMZ network -- The internet network must import the DMZ network (only locally, no-export) -- The DMZ network provides the default route for tenant's clusters in a partition. It imports the default route from the internet network - -### Application Firewall - -The firewall of application workloads intersects its private network for attached machines and the DMZ network. 
- -## Code Changes / Implications - -- `metal-networker` and `metal-ccm` assume that there is only one network providing the default-route -- `metal-networker` needs to - - import the default route from the internet network to the dmz network (DMZ Firewall) - - import the DMZ network to the internet network and adjusting NAT rules (DMZ Firewall) - - import destination prefixes of the DMZ network to the private primary network (DMZ Firewall, Application Firewall) - - import DMZ-IPs of the private primary network to the DMZ network (DMZ Firewall, Application Firewall) -- `metal-api`: destination prefixes of private networks need to be configurable (`allocateNetwork`) -- `gardener-extension-provider-metal`: needs to be able to delete DMZ clusters (but skip the network deletion part) -- the application firewall is not publicly reachable - for debugging purposes a hop over the DMZ firewall is needed - -## Decision - -We decided to follow the second approach with private DMZ networks. diff --git a/versioned_docs/version-v0.22.3/contributing/01-Proposals/MEP6/dmz-internet_private.drawio b/versioned_docs/version-v0.22.3/contributing/01-Proposals/MEP6/dmz-internet_private.drawio deleted file mode 100644 index 7b83bbfc..00000000 --- a/versioned_docs/version-v0.22.3/contributing/01-Proposals/MEP6/dmz-internet_private.drawio +++ /dev/null @@ -1,178 +0,0 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - \ No newline at end of file diff --git a/versioned_docs/version-v0.22.3/contributing/01-Proposals/MEP6/dmz-internet_private.svg b/versioned_docs/version-v0.22.3/contributing/01-Proposals/MEP6/dmz-internet_private.svg deleted file mode 100644 index f5e58204..00000000 --- 
a/versioned_docs/version-v0.22.3/contributing/01-Proposals/MEP6/dmz-internet_private.svg +++ /dev/null @@ -1,3 +0,0 @@ -
Machine
Machine
Firewall DMZ
Firewall DMZ
DMZ VRF
DMZ VRF
Machine
Machine
Firewall A
Firewall A
Private VRF A
Private VRF A
10.0.0.2
10.90.30.129
/0 via Firewall A
10.0.0.2...
VRF A 10.0.0.1
VRF A 10.0.0.1
DMZ Network
10.90.30.128/25
DMZ Network...
Private Network
10.0.0.0/24
Private Network...
import /0
import /0
import 10.0.0.0/24
import 10.0.0.0/24 -
Machine
Machine
Firewall B
Firewall B
Private VRF B
Private VRF B
10.0.1.2
/0 via Firewall B
10.0.1.2...
VRF B 10.0.1.1
VRF B 10.0.1.1
Private Network
10.0.1.0/24
Private Network...
import /0
import /0
import 10.0.1.0/24
import 10.0.1.0/24 -
10.90.30.129 is reachable
/0 via Firewall DMZ
10.0.0.0/24 is reachable
10.0.1.0/24 is reachable
10.90.30.129 is reachable...
Internet
212.1.1.0/27
Internet...
SNAT to 212.1.1.1
SNAT to 212.1.1.1
Internet VRF
Internet VRF
import /0
import /0

import 10.0.0.0/24 no export
import 10.0.1.0/24 no export
import 10.90.30.128/25 no export
import 10.0.0.0/24 no exp...
Viewer does not support full SVG 1.1
\ No newline at end of file diff --git a/versioned_docs/version-v0.22.3/contributing/01-Proposals/MEP6/dmz-internet_public.drawio b/versioned_docs/version-v0.22.3/contributing/01-Proposals/MEP6/dmz-internet_public.drawio deleted file mode 100644 index 544939e5..00000000 --- a/versioned_docs/version-v0.22.3/contributing/01-Proposals/MEP6/dmz-internet_public.drawio +++ /dev/null @@ -1,184 +0,0 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - \ No newline at end of file diff --git a/versioned_docs/version-v0.22.3/contributing/01-Proposals/MEP6/dmz-internet_public.svg b/versioned_docs/version-v0.22.3/contributing/01-Proposals/MEP6/dmz-internet_public.svg deleted file mode 100644 index 5e825081..00000000 --- a/versioned_docs/version-v0.22.3/contributing/01-Proposals/MEP6/dmz-internet_public.svg +++ /dev/null @@ -1,3 +0,0 @@ -
Machine
Machine
Firewall DMZ
Firewall DMZ
DMZ VRF
DMZ VRF
Machine
Machine
Firewall A
Firewall A
Private VRF A
Private VRF A
10.0.0.2
212.1.2.3
/0 via Firewall A
10.0.0.2...
VRF A 10.0.0.1
VRF A 10.0.0.1
DMZ Network
212.1.2.0/27
DMZ Network...
Private Network
10.0.0.0/24
Private Network...
import /0
import /0
import 10.0.0.0/24
import 10.0.0.0/24 -
Machine
Machine
Firewall B
Firewall B
Private VRF B
Private VRF B
10.0.1.2
/0 via Firewall B
10.0.1.2...
VRF B 10.0.1.1
VRF B 10.0.1.1
Private Network
10.0.1.0/24
Private Network...
import /0
import /0
import 10.0.1.0/24
import 10.0.1.0/24 -
212.1.2.3 is reachable
/0 via Firewall DMZ
212.1.2.3 is reachable...
Internet
212.1.1.0/27 212.1.2.0/27
Internet...
SNAT to 212.1.1.1
SNAT to 212.1.1.1
Internet VRF
Internet VRF
import /0
import /0
import 212.1.2.0/27
import 10.0.0.0/24 no redistribute
import 10.0.1.0/24 no redistribute

import 212.1.2.0/27...
SNAT to
212.1.2.1
SNAT to...
SNAT to
212.1.2.2
SNAT to...
Viewer does not support full SVG 1.1
\ No newline at end of file diff --git a/versioned_docs/version-v0.22.3/contributing/01-Proposals/MEP8/README.md b/versioned_docs/version-v0.22.3/contributing/01-Proposals/MEP8/README.md deleted file mode 100644 index 14748fae..00000000 --- a/versioned_docs/version-v0.22.3/contributing/01-Proposals/MEP8/README.md +++ /dev/null @@ -1,503 +0,0 @@ ---- -slug: /MEP-7-configurable-filesystem-layout-for-machine-allocation -title: MEP-7 -sidebar_position: 7 ---- - -# Configurable Filesystem layout for Machine Allocation - -The current implementation uses a hard coded filesystem layout depending on the specified size and image. This is done in the metal-hammer. This worked well in the past because we had a small amount of sizes and images. But we reached a point where this is to restricted for all use cases we have to fulfill. It also forces us to modify the metal-hammer source code to support a new filesystem layout. - -This proposal tries to address this issue by introducing a filesystem layout struct in the metal-api which is then configurable per machine allocation. -The original behavior of automatic filesystem layout decision must still be present, because there must be no API change for existing API consumers. It should be a additional feature during machine allocation. - -## API and behavior - -The API will get a new endpoint `filesystemlayouts`to create/update/delete a set of available `filesystemlayouts`. - -### Constraints - -In order to keep the actual machine allocation api compatible, there must be no difference while allocating a machine. To achieve this every -`filesystemlayout` defines constraints which specifies for which combination of `sizes` and `images` this layout should be used by default. -The specified constraints over all `filesystemlayouts` therefore must be collision free, to be more specific, there must be exactly one layout outcome -for every possible combination of `sizes` and `images`. 
- -The `size` constraint must be a list of the exact size ids, the `image` constraint must be a map of os to semver compatible version constraint. For example: - -- `debian: ">= 10.20210101"` or `debian: "< 10.20210101"` - -The general form of a `image` constraint is a map from `os` to `versionconstraint` where: - -`os` must match the first part of the image without the version. -`versionconstraint` must be the comparator, a space and the version, or simply `*` to match all versions of this `os`. -The comparator must be one of: "=", "!=", ">", "<", ">=", "=>", "<=", "=<", "~", "~>", "^" - -It must also be possible to have a `filesystemlayout` in development or for other special purposes, which can be specified during the machine allocation. -To have such a layout, both constraints `sizes` and `images`must be empty list. - -### Reinstall - -The current reinstall implementation the metal-hammer detects during the installation on which disk the OS was installed and reports back to the metal-api the Report struct which has two properties `primarydisk` and `ospartition`. -Both fields are not required anymore because the logic is now shifted to the `filesystemlayout` definition. If `Disk.WipeOnReinstall` is set to true, this disk will be wiped, default is false and is preserved. - -### Handling of s2-xlarge machines - -These machines are a bit special compared to our `c1-*` machines because they have rotating hard disks for the mass storage purpose. -The downside is that the on board SATA-DOM has the same naming as the HDDs and can not be specified as the first /dev/sda disk because all HDDs are also /dev/sd\* disks. -Therefore we had a special SATA-DOM detection algorithm inside metal-hammer which simply checks for the smallest /dev/sd disk and took this to install the OS. - -This is not possible with the current approach, but we figured out that the SATA-DOM is always `/dev/sde`. 
So we can create a special `filesystemlayout` where the installations is made on this disk. - -### Possible Filesystemlayout hierarchies - -It is only possible to create a filesystem on top of a block device. The creation of a block device can be done on multiple ways, depending on the requirements regarding performance, space and redundancy of the filesystem. -It also depends on the disks available on the server. - -The current approach implements the following hierarchies: - -![filesystems](filesystems.png) - -### Implementation - -```go -// FilesystemLayout to be created on the given machine -type FilesystemLayout struct { - // ID unique layout identifier - ID string - // Description is human readable - Description string - // Filesystems to create on the server - Filesystems []Filesystem - // Disks to configure in the server with their partitions - Disks []Disk - // Raid if not empty, create raid arrays out of the individual disks, to place filesystems onto - Raid []Raid - // VolumeGroups to create - VolumeGroups []VolumeGroup - // LogicalVolumes to create on top of VolumeGroups - LogicalVolumes []LogicalVolume - // Constraints which must match to select this Layout - Constraints FilesystemLayoutConstraints -} - -type FilesystemLayoutConstraints struct { - // Sizes defines the list of sizes this layout applies to - Sizes []string - // Images defines a map from os to versionconstraint - // the combination of os and versionconstraint per size must be conflict free over all filesystemlayouts - Images map[string]string -} - -type RaidLevel string -type Format string -type GPTType string - -// Filesystem defines a single filesystem to be mounted -type Filesystem struct { - // Path defines the mountpoint, if nil, it will not be mounted - Path *string - // Device where the filesystem is created on, must be the full device path seen by the OS - Device string - // Format is the type of filesystem should be created - Format Format - // Label is optional enhances 
readability - Label *string - // MountOptions which might be required - MountOptions []string - // CreateOptions during filesystem creation - CreateOptions []string -} - -// Disk represents a single block device visible from the OS, required -type Disk struct { - // Device is the full device path - Device string - // Partitions to create on this device - Partitions []Partition - // WipeOnReinstall, if set to true the whole disk will be erased if reinstall happens - // during fresh install all disks are wiped - WipeOnReinstall bool -} - -// Raid is optional, if given the devices must match. -// TODO inherit GPTType from underlay device ? -type Raid struct { - // ArrayName of the raid device, most often this will be /dev/md0 and so forth - ArrayName string - // Devices the devices to form a raid device - Devices []Device - // Level the raidlevel to use, can be one of 0,1,5,10 - // TODO what should be support - Level RaidLevel - // CreateOptions required during raid creation, example: --metadata=1.0 for uefi boot partition - CreateOptions []string - // Spares defaults to 0 - Spares int -} - - -// VolumeGroup is optional, if given the devices must match. 
-type VolumeGroup struct { - // Name of the volumegroup without the /dev prefix - Name string - // Devices the devices to form a volumegroup device - Devices []string - // Tags to attach to the volumegroup - Tags []string -} - -// LogicalVolume is a block devices created with lvm on top of a volumegroup -type LogicalVolume struct { - // Name the name of the logical volume, without /dev prefix, will be accessible at /dev/vgname/lvname - Name string - // VolumeGroup the name of the volumegroup - VolumeGroup string - // Size of this LV in mebibytes (MiB) - Size uint64 - // LVMType can be either striped or raid1 - LVMType LVMType -} - -// Partition is a single partition on a device, only GPT partition types are supported -type Partition struct { - // Number of this partition, will be added to the device once partitioned - Number int - // Label to enhance readability - Label *string - // Size given in MebiBytes (MiB) - // if "0" is given the rest of the device will be used, this requires Number to be the highest in this partition - Size string - // GPTType defines the GPT partition type - GPTType *GPTType -} - -const ( - // VFAT is used for the UEFI boot partition - VFAT = Format("vfat") - // EXT3 is usually only used for /boot - EXT3 = Format("ext3") - // EXT4 is the default fs - EXT4 = Format("ext4") - // SWAP is for the swap partition - SWAP = Format("swap") - // None - NONE = Format("none") - - // GPTBoot EFI Boot Partition - GPTBoot = GPTType("ef00") - // GPTLinux Linux Partition - GPTLinux = GPTType("8300") - // GPTLinuxRaid Linux Raid Partition - GPTLinuxRaid = GPTType("fd00") - // GPTLinux Linux Partition - GPTLinuxLVM = GPTType("8e00") - - // LVMTypeLinear append across all physical volumes - LVMTypeLinear = LVMType("linear") - // LVMTypeStriped stripe across all physical volumes - LVMTypeStriped = LVMType("striped") - // LVMTypeStripe mirror with raid across all physical volumes - LVMTypeRaid1 = LVMType("raid1") -) -``` - -Example `metalctl` outputs: - 
-```bash -$ metalctl filesystemlayouts ls -ID DESCRIPTION SIZES IMAGES -default default fs layout c1-large-x86, c1-xlarge-x86 debian >=10, ubuntu >=20.04, centos >=7 -ceph fs layout for ceph s2-large-x86, s2-xlarge-x86 debian >=10, ubuntu >=20.04 -firewall firewall fs layout c1-large-x86, c1-xlarge-x86 firewall >=2 -storage storage fs layout s3-large-x86 centos >=7 -s3 storage fs layout s2-xlarge-x86 debian >=10, ubuntu >=20.04, >=firewall-2 -default-devel devel fs layout -``` - -The `default` layout reflects what is actually implemented in metal-hammer to guarantee backward compatibility. - -```yaml ---- -id: default -constraints: - sizes: - - c1-large-x86 - - c1-xlarge-x86 - images: - debian: ">=10" - ubuntu: ">=20.04" - centos: ">=7" -filesystems: - - path: "/boot/efi" - device: "/dev/sda1" - format: "vfat" - options: "-F 32" - label: "efi" # required to be compatible with old images - - path: "/" - device: "/dev/sda2" - format: "ext4" - label: "root" # required to be compatible with old images - - path: "/var/lib" - device: "/dev/sda3" - format: "ext4" - label: "varlib" # required to be compatible with old images - - path: "/tmp" - device: "tmpfs" - format: "tmpfs" - mountoptions: - [ - "defaults", - "noatime", - "nosuid", - "nodev", - "noexec", - "mode=1777", - "size=512M", - ] -disks: - - device: "/dev/sda" - wipe: true - partitions: - - number: 1 - label: "efi" - size: 500 - type: GPTBoot - - number: 2 - label: "root" - size: 5000 - type: GPTLinux - - number: 3 - label: "varlib" - size: 0 # to end of partition - type: GPTLinux -``` - -The `firewall` layout reuses the built in nvme disk to store the logs, which is way faster and larger than what the sata-dom ssd provides. 
- -```yaml ---- -id: firewall -constraints: - sizes: - - c1-large-x86 - - c1-xlarge-x86 - images: - firewall: ">=2" -filesystems: - - path: "/boot/efi" - device: "/dev/sda1" - format: "vfat" - options: "-F 32" - - path: "/" - device: "/dev/sda2" - format: "ext4" - - path: "/var" - device: "/dev/nvme0n1p1" - format: "ext4" -disks: - - device: "/dev/sda" - wipe: true - partitions: - - number: 1 - label: "efi" - size: 500 - type: GPTBoot - - number: 2 - label: "root" - size: 5000 - type: GPTLinux - - device: "/dev/nvme0n1" - wipe: true - partitions: - - number: 1 - label: "var" - size: 0 - type: GPTLinux -``` - -The `storage` layout will be used for the storage servers, which must have mirrored boot disks. - -```yaml ---- -id: storage -constraints: - sizes: - - s3-large-x86 - images: - centos: ">=7" -filesystems: - - path: "/boot/efi" - device: "/dev/md1" - format: "vfat" - options: "-F32" - - path: "/" - device: "/dev/md2" - format: "ext4" -disks: - - device: "/dev/sda" - wipe: true - partitions: - - number: 1 - label: "efi" - size: 500 - type: GPTLinuxRaid - - number: 2 - label: "root" - size: 5000 - type: GPTLinuxRaid - - device: "/dev/sdb" - wipe: true - partitions: - - number: 1 - label: "efi" - size: 500 - type: GPTLinuxRaid - - number: 2 - label: "root" - size: 5000 - type: GPTLinuxRaid -raid: - - name: "/dev/md1" - level: 1 - devices: - - "/dev/sda1" - - "/dev/sdb1" - options: "--metadata=1.0" - - name: "/dev/md2" - level: 1 - devices: - - "/dev/sda2" - - "/dev/sdb2" - options: "--metadata=1.0" -``` - -The `s3-storage` layout matches the special situation on the s2-xlarge machines. 
- -```yaml ---- -id: s3-storage -constraints: - sizes: - - c1-large-x86 - - s2-xlarge-x86 - images: - debian: ">=10" - ubuntu: ">=20.04" - centos: ">=7" -filesystems: - - path: "/boot/efi" - device: "/dev/sde1" - format: "vfat" - options: "-F 32" - - path: "/" - device: "/dev/sde2" - format: "ext4" - - path: "/var/lib" - device: "/dev/sde3" - format: "ext4" -disks: - - device: "/dev/sde" - wipe: true - partitions: - - number: 1 - label: "efi" - size: 500 - type: GPTBoot - - number: 2 - label: "root" - size: 5000 - type: GPTLinux - - number: 3 - label: "varlib" - size: 0 # to end of partition - type: GPTLinux -``` - -A sample `lvm` layout which puts `/var/lib` as stripe on the nvme device - -```yaml ---- -id: lvm -description: "lvm layout" -constraints: - size: - - s2-xlarge-x86 - images: - debian: ">=10" - ubuntu: ">=20.04" - centos: ">=7" -filesystems: - - path: "/boot/efi" - device: "/dev/sda1" - format: "vfat" - createoptions: - - "-F 32" - label: "efi" - - path: "/" - device: "/dev/sda2" - format: "ext4" - label: "root" - - path: "/var/lib" - device: "/dev/vg00/varlib" - format: "ext4" - label: "varlib" - - path: "/tmp" - device: "tmpfs" - format: "tmpfs" - mountoptions: - [ - "defaults", - "noatime", - "nosuid", - "nodev", - "noexec", - "mode=1777", - "size=512M", - ] -volumegroups: - - name: "vg00" - devices: - - "/dev/nvmne0n1" - - "/dev/nvmne0n2" -logicalvolumes: - - name: "varlib" - volumegroup: "vg00" - size: 200 - lvmtype: "striped" -disks: - - device: "/dev/sda" - wipeonreinstall: true - partitions: - - number: 1 - label: "efi" - size: 500 - gpttype: "ef00" - - number: 2 - label: "root" - size: 5000 - gpttype: "8300" - - device: "/dev/nvmne0n1" - wipeonreinstall: false - - device: "/dev/nvmne0n2" - wipeonreinstall: false -``` - -## Components which requires modifications - -- metal-hammer: - - change implementation from build in hard coded logic - - move logic to create fstab from install.sh to metal-hammer -- metal-api: - - new endpoint 
`filesystemlayouts` - - add optional spec of `filesystemlayout` during `allocation` with validation if given `filesystemlayout` is possible on given size. - - add `allocation.filesystemlayout` in the response, based on either the specified `filesystemlayout` or the calculated one. - - implement `filesystemlayouts` validation for: - - matching to disks in the size - - no overlapping with the sizes/imagefilter specified in `filesystemlayouts` - - all devices specified exists from top to bottom (fs -> disks -> device || fs -> raid -> devices) -- metalctl: - - implement `filesystemlayouts` -- metal-go: - - adopt api changes -- metal-images: - - install mdadm for raid support diff --git a/versioned_docs/version-v0.22.3/contributing/01-Proposals/MEP8/filesystems.drawio b/versioned_docs/version-v0.22.3/contributing/01-Proposals/MEP8/filesystems.drawio deleted file mode 100644 index 0f0c6ab5..00000000 --- a/versioned_docs/version-v0.22.3/contributing/01-Proposals/MEP8/filesystems.drawio +++ /dev/null @@ -1,43 +0,0 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - \ No newline at end of file diff --git a/versioned_docs/version-v0.22.3/contributing/01-Proposals/MEP8/filesystems.png b/versioned_docs/version-v0.22.3/contributing/01-Proposals/MEP8/filesystems.png deleted file mode 100644 index 6d903b7e..00000000 Binary files a/versioned_docs/version-v0.22.3/contributing/01-Proposals/MEP8/filesystems.png and /dev/null differ diff --git a/versioned_docs/version-v0.22.3/contributing/01-Proposals/MEP9/README.md b/versioned_docs/version-v0.22.3/contributing/01-Proposals/MEP9/README.md deleted file mode 100644 index a8cae83d..00000000 --- a/versioned_docs/version-v0.22.3/contributing/01-Proposals/MEP9/README.md +++ /dev/null @@ -1,132 +0,0 @@ ---- -slug: /MEP-9-no-open-ports-to-the-data-center -title: MEP-9 -sidebar_position: 9 ---- - -# No Open Ports To the Data Center - -Our metal-stack partitions typically have open ports for metal-stack 
native services, these are: - -- SSH port on the firewalls -- bmc-reverse-proxy for serial console access through the metal-console - -These open ports are potential security risks. For example, while SSH access is possible only with private key it's still vulnerable to DoS attack. - -Therefore, we want to get rid off these open ports to reduce the attack surface to the data center. - -## Requirements - -- Access to firewall SSH only via VPN -- Easy to update VPN components - -As a next step, we can also consider joining the management servers to the VPN mesh, which would replace typical WireGuard setups for operators to enter resources inside the partition. - -## High Level Design - -[](./architecture.svg) - -> Simplified drawing showing old vs. new architecture. - -### Concerns - -There's few concerns when using WireGuard for implementing VPN: - -1. WireGuard doesn't implement dynamic cipher substitution. Which is important in case one of the crypto methods, used by WireGuard will be broken. The only possible solution for that will be to update WireGuard to a fixed version. -2. Coordination server(Headscale) is a single point of failure. In case it fails, it potentially can disconnect existing members of the network, as WireGuard can't manage dynamic IPs by itself. -3. Headscale is already falls behind Tailscale coordination server implementation. Which can complicate the upgrade to newer version of Tailscale client in case of emergency. - -### Solutions to concerns - -1. Tailscale node software is using userspace implementation of WireGuard -- `wireguard-go`. One of the options is to inject Tailscale client into `metalctl`. And make it available as `metalctl vpn` or similar command. It should be possible to do as `tailscale` node is already available as open sourced Go pkg. That would allow us to control, what version of Tailscale users are using and in case of any critical changes to enforce them to update `metalctl` to use VPN functionality. -2. 
Would it be a considerable risk? We could look into `wg-dynamic` project to cover this problem. -3. At the moment, repository looks well maintained and the metal-stack team already contributes to it. - -## Implementation Details - -### metal-roles - -`metal-roles` will be responsible for deployment of `headscale` server(via new `headscale` role). It also should provide sufficient config to `metal-api` so it establishes connection with `headscale` gRPC server. - -### New `metalctl` commands - -`metalctl` will be responsible for client-side implementation of this MEP. Specifically, it's by using `metalctl` user expected to connect to firewalls. - -- `metalctl vpn` -- section for VPN related commands: - - `metalctl vpn get key [vpn name] --namespace [namespace name]` -- returns auth key to be used with `tailscale` client for establishing connection. - -Extend `metalctl firewall`: - -- `metalctl firewall ssh [ID]` -- connect to firewall via SSH. - -Extend `metalctl machine`: - -- `metalctl machine ssh [ID]` -- connect to machine via SSH. - -`metalctl` will be able to connect to firewall and machines by running `tailscale` in container. - -### metal-api - -Updates to `metal-api` should be made, so that it's able to add firewalls to VPNs. There should be one Tailscale namespace per project. So if multiple firewalls are created in single project, they will join the same namespace. - -Two new flags should be introduced to connect `metal-api` to `headscale` gRPC server: - -- `headscale-addr` -- specifies address of Headscale grpc API. -- `headscale-api-key` -- specifies temporary API key to connect to Headscale. It should be replaced and then rotated by `metal-api`. - -If `metal-api` initialized with `headscale` connection it should automatically join all created firewalls to VPN. - -Add new endpoint, that will be used by `metalctl` to connect to VPN: - -- `/v1/vpn GET` -- requests auth key from `headscale` server. 
- -### metal-hammer - -`metal-hammer` acts as an intermediary for machine configuration between `metal-api` and machine's image. Specifically it writes to `/etc/metal/install.yaml` file, data from which later will be used by image's `install.sh` file. - -To implement VPN support we have to add authentication key and VPN server address to `install.yaml` file. This key will be used to join machine to a VPN. - -### metal-images - -Images `install.sh` script have to be updated to work with authentication key and VPN server address, provided in `install.yaml` file. If this key is present, machine should connect to VPN. - -### metal-networker - -`metal-networker` also have to know if VPN was configured. In that case we need to disable public access to SSH and allow all(?) traffic from WireGuard interface. - -### firewall-controller - -`firewall-controller` have to monitor changes in `Firewall` resource and keep `tailscaled` version up-to-date. - -### Resources - -Update `Firewall` resource to include desired/actual `tailscale` version: - -``` -Firewall: - Spec: - tailscale: - Version: Minimal version - ... - Status: - ... - VPN: - Status: Boolean field - tailscale: - Version: Actual version - ... -``` - -### bmc-reverse-proxy - -TODO - -## References - -1. [WireGuard: Next Generation Secure Network Tunnel](https://www.youtube.com/watch?v=88GyLoZbDNw) -2. [How Tailscale works](https://tailscale.com/blog/how-tailscale-works) -3. [Tailscale is officially SOC 2 compliant](https://tailscale.com/blog/soc2) -4. [Why not Wireguard](https://www.ipfire.org/blog/why-not-wireguard) -5. [Wireguard: Known Limitations](https://www.wireguard.com/known-limitations/) -6. [Wireguard: Things That Might Be Accomplished](https://www.wireguard.com/todo/) -7. 
[Headscale: Tailscale control protocol v2](https://github.com/juanfont/headscale/issues/526) diff --git a/versioned_docs/version-v0.22.3/contributing/01-Proposals/MEP9/architecture.drawio b/versioned_docs/version-v0.22.3/contributing/01-Proposals/MEP9/architecture.drawio deleted file mode 100644 index adb09214..00000000 --- a/versioned_docs/version-v0.22.3/contributing/01-Proposals/MEP9/architecture.drawio +++ /dev/null @@ -1,324 +0,0 @@ - - - - - - - -
-
-
- Metal Control Plane -
-
-
-
- - Metal Control Plane - -
-
- - - - -
-
-
- metal-stack -
- Partition -
-
-
-
- - metal-stack... - -
-
- - - - -
-
-
- firewall -
-
-
-
- - firewall - -
-
- - - - - -
-
-
- machine -
-
-
-
- - machine - -
-
- - - - -
-
-
- ssh -
-
-
-
- - ssh - -
-
- - - - -
-
-
- bmc-proxy -
-
-
-
- - bmc-proxy - -
-
- - - - - - - - - - -
-
-
- Metal Control Plane -
-
-
-
- - Metal Control Plane - -
-
- - - - -
-
-
- metal-stack -
- Partition -
-
-
-
- - metal-stack... - -
-
- - - - -
-
-
- firewall -
-
-
-
- - firewall - -
-
- - - - - -
-
-
- machine -
-
-
-
- - machine - -
-
- - - - -
-
-
- ssh -
-
-
-
- - ssh - -
-
- - - - - - -
-
-
- bmc-proxy -
-
-
-
- - bmc-proxy - -
-
- - - - -
-
-
- headscale -
-
-
-
- - headscale - -
-
- - - - - - - - - - -
-
-
- tailscaled -
-
-
-
- - tailscaled - -
-
- - - - - - -
-
-
- tailscaled -
-
-
-
- - tailscaled - -
-
- - - - -
-
-
- Internet -
-
-
-
- - Internet - -
-
- - - - -
-
-
- Internet -
-
-
-
- - Internet - -
-
-
- - - - - Viewer does not support full SVG 1.1 - - - -
diff --git a/versioned_docs/version-v0.22.3/contributing/01-Proposals/MEP9/architecture.svg b/versioned_docs/version-v0.22.3/contributing/01-Proposals/MEP9/architecture.svg deleted file mode 100644 index fd268d2f..00000000 --- a/versioned_docs/version-v0.22.3/contributing/01-Proposals/MEP9/architecture.svg +++ /dev/null @@ -1 +0,0 @@ -
Metal Control Plane
Metal Control Plane
metal-stack
Partition
metal-stack...
firewall
firewall
machine
machine
ssh
ssh
bmc-proxy
bmc-proxy
Metal Control Plane
Metal Control Plane
metal-stack
Partition
metal-stack...
firewall
firewall
machine
machine
ssh
ssh
bmc-proxy
bmc-proxy
headscale
headscale
tailscaled
tailscaled
tailscaled
tailscaled
Internet
Internet
Internet
Internet
Text is not SVG - cannot display
\ No newline at end of file diff --git a/versioned_docs/version-v0.22.3/contributing/01-Proposals/_category_.json b/versioned_docs/version-v0.22.3/contributing/01-Proposals/_category_.json deleted file mode 100644 index 2e7fa4bf..00000000 --- a/versioned_docs/version-v0.22.3/contributing/01-Proposals/_category_.json +++ /dev/null @@ -1,4 +0,0 @@ -{ - "position": 1, - "label": "Enhancement Proposals" -} \ No newline at end of file diff --git a/versioned_docs/version-v0.22.3/contributing/01-Proposals/index.md b/versioned_docs/version-v0.22.3/contributing/01-Proposals/index.md deleted file mode 100644 index 0f6eddc3..00000000 --- a/versioned_docs/version-v0.22.3/contributing/01-Proposals/index.md +++ /dev/null @@ -1,69 +0,0 @@ ---- -slug: /enhancement-proposals -title: Enhancement Proposals -sidebar_position: 1 ---- - -# Metal Stack Enhancement Proposals (MEPs) - -This section contains proposals which address substantial modifications to metal-stack. - -Every proposal has a short name which starts with _MEP_ followed by an incremental, unique number. Proposals should be raised as pull requests in the [website](https://github.com/metal-stack/website) repository and can be discussed in Github issues. - -The list of proposals and their current state is listed in the table below. - -Possible states are: - -- `In Discussion` -- `Accepted` -- `Declined` -- `In Progress` -- `Completed` -- `Aborted` - -Once a proposal was accepted, an issue should be raised and the implementation should be done in a separate PR. 
- -| Name | Description | State | Progress | -| :------------------------------------------------------------- | :--------------------------------------------- | :-------------: | :----------------------------------------------------------------: | -| [MEP-1](MEP1/README.md) | Distributed Control Plane Deployment | `Declined` | | -| [MEP-2](MEP2/README.md) | Two Factor Authentication | `Aborted` | | -| [MEP-3](MEP3/README.md) | Machine Re-Installation to preserve local data | `Completed` | | -| [MEP-4](MEP4/README.md) | Multi-tenancy for the metal-api | `In Progress` | [releases#236](https://github.com/metal-stack/releases/issues/236) | -| [MEP-5](MEP5/README.md) | Shared Networks | `Completed` | | -| [MEP-6](MEP6/README.md) | DMZ Networks | `Completed` | | -| [MEP-7](https://github.com/metal-stack/docs-archive/pull/51) | Passing environment variables to machines | `Declined` | | -| [MEP-8](MEP8/README.md) | Configurable Filesystemlayout | `Completed` | | -| [MEP-9](MEP9/README.md) | No Open Ports To the Data Center | `Completed` | | -| [MEP-10](MEP10/README.md) | SONiC Support | `Completed` | | -| [MEP-11](MEP11/README.md) | Auditing of metal-stack resources | `Completed` | | -| [MEP-12](MEP12/README.md) | Rack Spreading | `Completed` | | -| [MEP-13](MEP13/README.md) | IPv6 | `Completed` | | -| [MEP-14](MEP14/README.md) | Independence from external sources | `Completed` | | -| [MEP-15](https://github.com/metal-stack/docs-archive/pull/232) | HAL Improvements | `In Discussion` | [releases#238](https://github.com/metal-stack/releases/issues/238) | -| [MEP-16](MEP16/README.md) | Firewall Support for Cluster API Provider | `Accepted` | [releases#237](https://github.com/metal-stack/releases/issues/237) | -| [MEP-17](MEP17/README.md) | Global Network View | `In Discussion` | | -| [MEP-18](MEP18/README.md) | Autonomous Control Plane | `In Discussion` | | - -## Proposal Process - -1. 
Before starting a new proposal, it is advised to have a quick chat with one of the maintainers. -2. Create a draft pull request in the [website](https://github.com/metal-stack/website) repository with your proposal. Your proposal doesn't have to be finished at this point. -3. Share the PR in the [metal-stack Slack](https://metal-stack.slack.com/) and invite maintainers to review it. -4. The review itself will probably take place in multiple iterations. Don't be discouraged if your proposal is not accepted right away. The goal is to reach consensus. -5. Once your proposal is accepted, create an umbrella issue in the relevant repository or when multiple repositories are involved in the [releases](https://github.com/metal-stack/releases). -6. Other issues should be created in different repositories and linked to the umbrella issue. -7. Unless stated otherwise, the proposer is responsible for the implementation of the proposal. - -## How to Write a Good MEP - -In the first section of your MEP, start with the current situation and the motivation for the change. Summarize your proposal briefly. - -Next follows the main part: describe your proposal in detail. Which parts of of metal-stack are affected? Are there API changes? If yes, describe them and provide examples here. -Try to think of side effects your proposal might have. Try to provide a view on how your proposal affects users of metal-stack. -Highlight breaking changes and think of a migration path for existing users. If your proposal affects multiple components, try to describe the interaction between them. - -After the main part of your proposal, feel free to add additional sections, e.g. about alternatives that were considered, non-goals or future possibilities. - -Depending on the complexity of your proposal, you might want to add a section about the implementation plan or roadmap. - -You can have a look at the existing MEPs for inspiration. As you will notice: not every MEP has the same structure. 
Feel free to structure your MEP in a way that makes sense for your proposal. diff --git a/versioned_docs/version-v0.22.3/contributing/02-planning-meetings.mdx b/versioned_docs/version-v0.22.3/contributing/02-planning-meetings.mdx deleted file mode 100644 index df10177b..00000000 --- a/versioned_docs/version-v0.22.3/contributing/02-planning-meetings.mdx +++ /dev/null @@ -1,120 +0,0 @@ ---- -slug: /planning-meetings -title: Planning Meetings -sidebar_position: 2 ---- - -# Planning Meetings - -Public planning meetings are held **biweekly** on **odd calendar weeks** from **14:00 to 14:30** (Berlin/Europe timezone) on Microsoft Teams. The purpose is to provide an overview of our current projects and priorities, as well as to discuss new topics and issues within the group. - -export function PlanningMeetingDatesTable() { - const today = new Date(); - const dayOfWeek = today.getDay(); - - let daysUntilMonday = 0; - switch (dayOfWeek) { - case 0: - daysUntilMonday = 1; - break; - case 1: - daysUntilMonday = 0; - break; - default: - daysUntilMonday = 8 - dayOfWeek; - } - - const nextMonday = new Date(); - nextMonday.setDate(nextMonday.getDate() + daysUntilMonday) - - let onejan = new Date(today.getFullYear(), 0, 1); - let week = Math.ceil((((nextMonday.getTime() - onejan.getTime()) / 86400000) + onejan.getDay() + 1) / 7); - - if (week % 2 === 0) { - nextMonday.setDate(nextMonday.getDate() + 7) - } - - const blacklist = [ - new Date('2025-12-29'), - ] - - const amount = 8 - const dates = []; - - for (let i = 0; i < amount; i++) { - const nextDate = new Date(nextMonday); - nextDate.setDate(nextDate.getDate() + (i * 14)) - - if (blacklist.find(item => {return item.toDateString() == nextDate.toDateString()}) !== undefined ) { - continue - } - - dates.push(nextDate.toDateString()) - } - - return ( - - - - - - - - - - {dates.map((date, index) => ( - - - - - - ))} - -
DateTimeLink
{date}14:00 – 14:30Join Link
- ) -} - - - -Our [development planning board](https://github.com/orgs/metal-stack/projects/34) can be found on GitHub. - -[//]: <> (The C025PB1EUKC in the slack url references the #devs channel.) -If you want to get an invitation to the event, please drop us a line on our [Slack channel](https://metal-stack.slack.com/archives/C025PB1EUKC). - -Planning meetings are currently not recorded. The meetings are held either in English or German depending on the attendees. - -:::info -Note that anyone can contribute to metal-stack without participating in planning meetings. However, if you want to speed up the review process for your requirements, it might be helpful to attend the meetings. -::: - -## Agenda - -Here is the agenda that we generally want to follow in a planning meeting: - -- Possibility to bring up news that are interesting for every developer of the metal-stack org -- Check `Done` column and archive cards - - Attendees have the chance to briefly present achievements if they want -- Check the `In Progress` column and discuss whether these tasks are still worked on, there were significant blockers or they can be lower-prioritized -- Check new issues labelled with `triage` and prioritize them -- Allow attendees to bring up issues and prioritize them - - Attendees have the chance to briefly present these new issues - -## Idea Backlog - -The backlog contains ideas of what could become part of the roadmap in the future. The list is ordered alphabetically. Therefore, the order does not express the importance or weight of a backlog item. - -We incorporate community feedback into the roadmap. If you think that important points are missing in the backlog, please share your ideas with us. We have a Slack channel. Please check out [metal-stack.io](https://metal-stack.io) for contact information. - -:::danger -By no means this list is a promise of what is being worked on in the near future. It is just a summary of ideas that was agreed on to be "nice to have". 
It is up to the investors, maintainers and the community to choose topics from this list and to implement them or to remove them from the list. -::: - -- Add metal-stack to [Gardener conformance test grid](https://testgrid.k8s.io/gardener-all) -- Autoscaler for metal control plane components -- CI dashboard and public integration testing -- Improved release and deploy processes (GitOps, [Spinnaker](https://spinnaker.io/), [Flux](https://fluxcd.io/)) -- Machine internet without firewalls -- metal-stack dashboard (UI) -- Offer our metal-stack extensions as enterprise products (accounting, cluster-api, S3) (neither of them will ever be required for running metal-stack, they just add extra value for certain enterprises) -- Partition managed by Kubernetes (with Kubelets joining the control plane cluster) -- Public offering / demo playground diff --git a/versioned_docs/version-v0.22.3/contributing/03-contribution-guideline.md b/versioned_docs/version-v0.22.3/contributing/03-contribution-guideline.md deleted file mode 100644 index 2c0526e3..00000000 --- a/versioned_docs/version-v0.22.3/contributing/03-contribution-guideline.md +++ /dev/null @@ -1,145 +0,0 @@ ---- -slug: /contribution-guideline -title: Contribution Guideline -sidebar_position: 3 ---- - -# Contribution Guideline - -This document describes the way we want to contribute code to the projects of metal-stack, which are hosted on [github.com/metal-stack](https://github.com/metal-stack). - -The document is meant to be understood as a general guideline for contributions, but not as burden to be placed on a developer. Use your best judgment when contributing code. Try to be as clean and precise as possible when writing code and try to make your code as maintainable and understandable as possible for other people. - -Even if it should go without saying, we live an open culture of discussion, in which everybody is welcome to participate. 
We treat every contribution with respect and objectiveness with the general aim to write software of quality. - -If you want, feel free to propose changes to this document in a pull request. - -## How Can I Contribute? - -Open a Github issue in the project you would like to contribute. Within the issue, your idea can be discussed. It is also possible to directly create a pull request when the set of changes is relatively small. - -When opening an issue please consider the following aspects: - -1. Create a meaningful issue describing the WHY? of your contribution. -1. Try to set appropriate labels to the issue. For example, attach the `triage` label to your issue if you want it to be discussed in the next [planning meeting](./02-planning-meetings.mdx). It might be useful to attend the meeting if you want to emphasize it being worked on. - -### Pull Requests - -The process described here has several goals: - -- Maintain quality -- Enable a sustainable system to review contributions -- Enable documented and reproducible addition of contributions - -1. Create a repository fork within the context of that issue. Members of the organization may work on the repository directly without a fork, which allows building development artifacts more easily. -1. Develop, document and test your contribution (try not to solve more than one issue in a single pull request). -1. Create a Draft Pull Request to the repository's main branch. -1. Create a meaningful description of the pull request or reference the related issue. The pull request template explains what the content should include, please read it. -1. Ask for merging your contribution by removing the draft marker. Repository maintainers (see [Code Ownership](#code-ownership)) are notified automatically, but you can also reach out to people directly on Slack if you want a review from a specific person. - -## General Objectives - -This section contains language-agnostic topics that all metal-stack projects are trying to follow. 
- -### Code Ownership - -The code base is owned by the entire team and every member is allowed to contribute changes to any of the projects. This is considered as collective code ownership[^1]. - -As a matter of fact, there are persons in a project, which already have experience with the sources. These are defined directly in the repository's [CODEOWNERS](https://docs.github.com/en/repositories/managing-your-repositorys-settings-and-features/customizing-your-repository/about-code-owners) file. If you want to merge changes into the master branch, it is advisable to include code owners into the process of discussion and merging. - -### Microservices - -One major ambition of metal-stack is to follow the idea of [microservices](https://en.wikipedia.org/wiki/Microservices). This way, we want to achieve that we can - -- adapt to changes faster than with monolithic architectures, -- be free of restrictions due to certain choices of technology, -- leverage powerful traits of cloud infrastructures (e.g. high-scalability, high-availability, ...). - -### Programming Languages - -We are generally open to write code in any language that fits best to the function of the software. However, we encourage [golang](https://en.wikipedia.org/wiki/Go_(programming_language)) to be the main language of metal-stack as we think that it makes development faster when not establishing too many different languages in our architecture. Reason for this is that we are striving for consistent behavior of the microservices, similar to what has been described for the Twelve-Factor App (see [12 Factor](https://12factor.net/)). We help enforcing unified behavior by allowing a small layer of shared code for every programming language. We will refer to this shared code as "libraries" for the rest of this document. - -### Artifacts - -Artifacts are always produced by a CI process (i.e. Github Actions). 
- -Container images and [OCI artifacts](https://github.com/opencontainers/image-spec) are published on the Github Container Registry of the metal-stack organization. Please consider using Github Actions workflows utilizing similar actions as the other repositories (e.g. [build-push-action](https://github.com/docker/build-push-action), ...) - -For OCI images, we usually utilize [oras](https://github.com/oras-project/oras) for pushing the artifact to the registry. - -For signing artifacts we use [cosign](https://github.com/sigstore/cosign). The private key for signing artifacts is a CI secret called `COSIGN_PRIVATE_KEY`. - -Binary artifacts or OS images can be uploaded to `images.metal-stack.io` if necessary. - -### APIs - -The preferred way to implement an API is using [Connect RPC](https://connectrpc.com/), which is based on [grpc](https://grpc.io/). For working with the [Protobuf](https://protobuf.dev/) definitions, we utilize [buf](https://github.com/bufbuild/buf). - -The metal-api does still have a [Swagger-based](https://swagger.io/) API exposing traditional REST APIs for end-users. This API framework will become deprecated so it should not be used anymore for new projects. - -#### Versioning - -Artifacts are versioned by tagging the respective repository with a tag starting with the letter `v`. After the letter, there stands a valid [semantic version](https://semver.org/). - -### Documentation - -In order to make it easier for others to understand a project, we document general information and usage instructions in a `README.md` in any project. - -In addition to that, we document a microservice in the [docs](https://github.com/metal-stack/docs) repository. The documentation should contain the reasoning why this service exists and why it was being implemented the way it was being implemented. 
The aim of this procedure is to reduce the time for contributors to comprehend architectural decisions that were made during the process of writing the software and to clarify the general purpose of this service in the entire context of the software. - -## Guidelines - -This chapter describes general guidelines on how to develop and contribute code for a certain programming language. - -### Golang - -Development follows the official guide to: - -- Write clear, idiomatic Go code[^2] -- Learn from mistakes that must not be repeated[^3] -- Apply appropriate names to your artifacts: - - [https://go.dev/talks/2014/names.slide](https://go.dev/talks/2014/names.slide) - - [https://go.dev/blog/package-names](https://go.dev/blog/package-names) - - [https://go.dev/doc/effective_go#names](https://go.dev/doc/effective_go#names) -- Enable others to understand the reasoning of non-trivial code sequences by applying a meaningful documentation. - -#### Development Decisions - -- **Dependency Management** by using Go modules -- **Build and Test Automation** by using [GNU Make](https://man7.org/linux/man-pages/man1/make.1p.html). -- **APIs** should consider using [buf](https://github.com/bufbuild/buf) - -#### Libraries - -metal-stack maintains libraries that you can utilize in your project in order to unify common behavior. The main project that does this is called [metal-lib](https://github.com/metal-stack/metal-lib). - -#### Error Handling with Generated Swagger Clients - -From the server-side you should ensure that you are returning the common error json struct in case of an error as defined in the `metal-lib/httperrors`. Ensure you are using `go-restful >= v2.9.1` and `go-restful-openapi >= v0.13.1` (allows default responses with error codes other than 200). - -### Documentation - -We want to share knowledge and keep things simple. If things cannot kept simple we want to enable everybody to understand them by: - -- Document in short sentences[^4]. 
-- Do not explain the HOW (this is already documented by your code and documenting the obvious is considered a defect). -- Explain the WHY. Add a "to" in your documentation line to force yourself to explain the reasonning (e.g. "` to `"). - -### Python - -Development follows the official guide to: - -- Style Guide for Python Code (PEP 8)[^5] - - The use of an IDE like [PyCharm](https://www.jetbrains.com/pycharm/) helps to write compliant code easily -- Consider [setuptools](https://pythonhosted.org/an_example_pypi_project/setuptools.html) for packaging -- If you want to add a Python microservice to the mix, consider [pyinstaller](https://github.com/pyinstaller/pyinstaller) on Alpine to achieve small image sizes - -[^1]: [https://martinfowler.com/bliki/CodeOwnership.html](https://martinfowler.com/bliki/CodeOwnership.html) - -[^2]: [https://go.dev/doc/effective_go](https://go.dev/doc/effective_go) - -[^3]: [https://github.com/golang/go/wiki/CodeReviewComments](https://github.com/golang/go/wiki/CodeReviewComments) - -[^4]: [https://github.com/golang/go/wiki/CodeReviewComments#comment-sentences](https://github.com/golang/go/wiki/CodeReviewComments#comment-sentences) - -[^5]: [https://www.python.org/dev/peps/pep-0008/](https://www.python.org/dev/peps/pep-0008/) diff --git a/versioned_docs/version-v0.22.3/contributing/04-release-flow.md b/versioned_docs/version-v0.22.3/contributing/04-release-flow.md deleted file mode 100644 index 744d9274..00000000 --- a/versioned_docs/version-v0.22.3/contributing/04-release-flow.md +++ /dev/null @@ -1,100 +0,0 @@ ---- -slug: /release-flow -title: Release Flow -sidebar_position: 4 ---- - -# Releases - -The metal-stack contains of many microservices that depend on each other. The automated release flow is there to ensure that all components work together flawlessly for every metal-stack release. - -Releases and integration tests are published through our [release repository](https://github.com/metal-stack/releases). 
You can also find the [release notes](https://github.com/metal-stack/releases/releases) for this metal-stack version in there. The release notes contain information about new features, upgrade paths and bug fixes. - -If you want, you can sign up at our Slack channel where we are announcing every new release. Often, we provide additional information for metal-stack administrators and adopters at this place, too. - -This document is intended for developers, especially maintainers of metal-stack projects. - -## Release Flow - -The following diagram attempts to describe our current release flow: - -![](release_flow.svg) - -A release is created in the following way: - -- Individual repository maintainers within the metal-stack GitHub Organization can publish a release of their component. -- This release is automatically pushed to the `develop` branch of the release repository by the metal-robot. -- A push triggers a virtual release integration test using the mini-lab environment. This setup launches metal-stack with the `sonic` and `gardener` flavors to validate the different Ansible roles and execute basic operations across the metal-stack layer. -- To contribute components that are not directly part of the release vector, a pull request must be made against the `develop` branch of the release repository. Release maintainers may push directly to the `develop` branch. -- The release maintainers can `/freeze` the `develop` branch, effectively stopping the metal-robot from pushing component releases to this branch. -- The `develop` branch is tagged by a release maintainer with a `-rc.x` suffix to create a __release candidate__. -- The release candidate must pass a large integration test suite on a real environment, which is currently run by FI-TS. It tests the entire machine provisioning engine including the integration with Gardener, the deployment, metal-images and Kubernetes conformance tests. 
-- If the integration tests pass, the PR of the `develop` branch must be approved by at least two release maintainers. -- A release is created via GitHub releases, including all release notes, with a tag on the `main` branch. - -## FAQ - -**Question: I need PR #xyz to go into the release, why did you not include it?** - -Answer: It's not on purpose if we miss a PR to be included into a metal-stack release. Please use the pending pull request from `develop` into `master` as soon as it is open and comment which pull request you want to have included into the release. Also consider attending our planning meetings or contact us in our Slack channel if you have urgent requirements that need to be dealt with. - -**Question: Who is responsible for the releases? Who can freeze a release?** - -Answer: Every repository in metal-stack has a `CODEOWNERS` file pointing to a maintainer team. This is also true for the releases repository. Only release repository maintainers are allowed to `/freeze` a release (meaning the metal-robot does not automatically append new component releases to the release vector anymore). - -**Question: I can't push to the `develop` branch of this repository? How can I request changes to the release vector?** - -Answer: Most changes are automatically integrated by the metal-robot. For manually managed components, please raise a pull request against the `develop` branch. Only release maintainers are allowed to push to `develop` as otherwise it would be possible to mess up the release pipeline. - -**Question: What requirements need to be fulfilled to add a repository to the release vector?** - -Please see the section below named [Requirements for Release Vector Repositories](#requirements-for-release-vector-repositories). 
- -### Requirements for Release Vector Repositories - -Before adding a repository in the metal-stack org to the releases repository, it is advised for the maintainer to fulfill the following points: - -- The following files should be present at the repository root: - - [CODEOWNERS](https://docs.github.com/en/repositories/managing-your-repositorys-settings-and-features/customizing-your-repository/about-code-owners) - - When a repository is created, the metal-robot automatically creates a -maintainers team in our GitHub org. - - The CODEOWNERS file should reference this team. - - The team should contain at least two maintainers. - - `LICENSE` - - This usually should be MIT with "metal-stack" as authors. - - `README.md` -- The `developers-core` team should be given repository access with `write` role, the codeowners team should have the `maintain` role -- Release artifacts should have an SPDX-formatted SBOM attached. - - For container images these are embedded using Buildx. -- The following branch protection rules should be set: - - The mainline should be protected. - - A pull request should be required before merging (required by at least one code owner). - - Status checks should be required to pass. - - Force push should not be allowed on this branch. -- One person from the releases maintainers has to add the repository to the metal-robot in order to pick up the releases, add them to the release vector and generate release notes. - -### How-To Release a Project - -[release-drafter](https://github.com/release-drafter/release-drafter) is preferred in order to generate release notes from merged PRs for your projects. It should be triggered for pushes on your main branch. - -The draft is then used to create a project release. The release has to be published through the GitHub UI as demonstrated in the screenshot below. 
- -**Tagging the repository is not enough as repository tagging does not associate your release notes to your release!** - -![](release.png) - -Some further remarks: - -- Use semver versions with `v` prefix for your tags -- Name your release after your release tag -- The metal-robot only picks up lines from your release notes that start with `-` or `*` (unordered list items) and appends them to the according section in the aggregated release draft -- A tag created through a GitHub UI release does not trigger a `push` event . This means, your pipeline will not start to run with the `push` trigger when publishing through the UI. - - Instead, use the `published` [release event trigger](https://docs.github.com/en/actions/reference/events-that-trigger-workflows#release) for your actions: - - ```yaml - on: - release: - types: - - published - ``` -- In case they are necessary, please do not forget to include `NOTEWORTHY`, `ACTIONS_REQUIRED` or `BREAKING_CHANGE` sections into releases. More information on those release draft sections can be read in a pull request template. 
diff --git a/versioned_docs/version-v0.22.3/contributing/05-community.md b/versioned_docs/version-v0.22.3/contributing/05-community.md deleted file mode 100644 index 61eaf099..00000000 --- a/versioned_docs/version-v0.22.3/contributing/05-community.md +++ /dev/null @@ -1,11 +0,0 @@ ---- -slug: /community -title: Community -sidebar_position: 5 -draft: true ---- - -# Community - -(Slack channel, community events like FOSDEM, Kubernetes Community Days..., blog -articles) diff --git a/versioned_docs/version-v0.22.3/contributing/release.png b/versioned_docs/version-v0.22.3/contributing/release.png deleted file mode 100644 index 598b1182..00000000 Binary files a/versioned_docs/version-v0.22.3/contributing/release.png and /dev/null differ diff --git a/versioned_docs/version-v0.22.3/contributing/release_flow.drawio b/versioned_docs/version-v0.22.3/contributing/release_flow.drawio deleted file mode 100644 index 6ca6b34f..00000000 --- a/versioned_docs/version-v0.22.3/contributing/release_flow.drawio +++ /dev/null @@ -1,721 +0,0 @@ - - - - - - - - - - - -
-
-
- Review release notes -
-
-
-
- - Review release notes - -
-
-
- - - - - - - -
-
-
- projects -
-
-
-
- - projects - -
-
-
- - - - - - - -
-
-
- projects -
-
-
-
- - projects - -
-
-
- - - - - - - - -
-
-
- Organization Webhook -
-
-
-
- - Organization Webhook - -
-
-
- - - - - - - -
-
-
- projects -
-
-
-
- - projects - -
-
-
- - - - - - - - -
-
-
- - Publish release - -
-
-
-
- - Publish release - -
-
-
- - - - - - - - -
-
-
- Maintainer -
-
-
-
- - Maint... - -
-
-
- - - - - - - - - - - - - - - -
-
-
- metal-robot release handler -
-
-
-
- - metal-robot release han... - -
-
-
- - - - - - - - -
-
-
- - no - -
-
-
-
- - no - -
-
-
- - - - - - - - -
-
-
- - yes - -
-
-
-
- - yes - -
-
-
- - - - - - - -
-
-
- version in event newer than release vector version -
-
-
-
- - version in event newer than... - -
-
-
- - - - - - - -
-
-
- - do nothing - -
-
-
-
- - do nothing - -
-
-
- - - - - - - - - - - - -
-
-
- Github Action -
-
-
-
- - Github Action - -
-
-
- - - - - - - -
-
-
- Bump version in release vector and push to - - develop - -
-
-
-
- - Bump version in release vector... - -
-
-
- - - - - - - - - - - -
-
-
- Open pull request from - - develop - - to - - master - -
-
-
-
- - Open pull request from develop... - -
-
-
- - - - - - - -
-
-
- Update aggregated release draft in - - metal-stack/releases - -
-
-
-
- - Update aggregated release draf... - -
-
-
- - - - - - - - - - - - - - - -
-
-
- Integration Testing -
-
-
-
- - Integration Testing - -
-
-
- - - - - - - - - - - -
-
-
- Merge to - - master - -
-
-
-
- - Merge to master - -
-
-
- - - - - - - - - - - - -
-
-
- Review -
-
-
-
- - Review - -
-
-
- - - - - - - - - - - - - - - -
-
-
- Tests suceeded and PR changes reviewed -
-
-
-
- - Tests suceeded and PR chang... - -
-
-
- - - - - - - -
-
-
- - publish results to #integration - -
-
-
-
- - publish results to #integr... - -
-
-
- - - - - - - - - - - - - - - -
-
-
- Release metal-stack -
-
-
-
- - Release metal-stack - -
-
-
- - - - - - - - - - - -
-
-
- - publish to #announcements - -
-
-
-
- - publish to #announcements - -
-
-
- - - - - - - -
-
-
- - - metal-stack/docs - - pull request - -
-
-
-
- - metal-stack/docs pull requ... - -
-
-
- - - - - - - - - - - - -
-
-
- Freeze -
-
-
-
- - Freeze - -
-
-
- - - - - - - - - - - - - - - -
-
-
- Freeze - - develop - - and create a release candidate -
-
-
-
- - Freeze develop and create a rel... - -
-
-
- - - - - - - -
-
-
- Large integration suites -
- - (currently owned by FI-TS, not public) - -
-
-
-
-
- - Large integration suites... - -
-
-
- - - - - - - - -
-
-
- Run -
-
-
-
- - Run - -
-
-
- - - - -
- - - - - Text is not SVG - cannot display - - - -
diff --git a/versioned_docs/version-v0.22.3/contributing/release_flow.svg b/versioned_docs/version-v0.22.3/contributing/release_flow.svg deleted file mode 100644 index 55cdd493..00000000 --- a/versioned_docs/version-v0.22.3/contributing/release_flow.svg +++ /dev/null @@ -1 +0,0 @@ -
Review release notes
Review release notes
projects
projects
projects
projects
Organization Webhook
Organization Webhook
projects
projects
Publish release
Publish release
Maintainer
Maint...
metal-robot release handler
metal-robot release han...
no
no
yes
yes
version in event newer than release vector version
version in event newer than...
do nothing
do nothing
Github Action
Github Action
Bump version in release vector and push todevelop
Bump version in release vector...
Open pull request fromdeveloptomaster
Open pull request from develop...
Update aggregated release draft inmetal-stack/releases
Update aggregated release draf...
Integration Testing
Integration Testing
Merge tomaster
Merge to master
Review
Review
Tests suceeded and PR changes reviewed
Tests suceeded and PR chang...
publish results to #integration
publish results to #integr...
Release metal-stack
Release metal-stack
publish to #announcements
publish to #announcements
metal-stack/docspull request
metal-stack/docs pull requ...
Freeze
Freeze
Freezedevelopand create a release candidate
Freeze develop and create a rel...
Large integration suites
(currently owned by FI-TS, not public)
Large integration suites...
Run
Run
Text is not SVG - cannot display
\ No newline at end of file diff --git a/versioned_docs/version-v0.22.3/docs/02-General/04-flavors-of-metalstack.md b/versioned_docs/version-v0.22.3/docs/02-General/04-flavors-of-metalstack.md index 7da427fc..2277ca6b 100644 --- a/versioned_docs/version-v0.22.3/docs/02-General/04-flavors-of-metalstack.md +++ b/versioned_docs/version-v0.22.3/docs/02-General/04-flavors-of-metalstack.md @@ -14,7 +14,7 @@ As modern infrastructure and cloud native applications are designed with Kuberne Regardless which flavor of metal-stack you use, it is always possible to manually provision machines, networks and ip addresses. This is the most basic way of using metal-stack and is very similar to how traditional bare metal infrastructures are managed. -Using plain metal-stack without additional layer was not a focus in the past. Therefore firewall and role management might be premature. These will be addressed by [MEP-4](../../contributing/01-Proposals/MEP4/README.md) and [MEP-16](../../contributing/01-Proposals/MEP16/README.md) in the future. +Using plain metal-stack without additional layer was not a focus in the past. Therefore firewall and role management might be premature. These will be addressed by [MEP-4](/community/MEP-4-multi-tenancy-for-the-metal-api) and [MEP-16](/community/MEP-16-metal-api-as-an-alternative-configuration-source-for-the-firewall-controller) in the future. ## Gardener diff --git a/versioned_docs/version-v0.22.3/docs/04-For Operators/03-deployment-guide.mdx b/versioned_docs/version-v0.22.3/docs/04-For Operators/03-deployment-guide.mdx index 58ddafd3..6be800cd 100644 --- a/versioned_docs/version-v0.22.3/docs/04-For Operators/03-deployment-guide.mdx +++ b/versioned_docs/version-v0.22.3/docs/04-For Operators/03-deployment-guide.mdx @@ -31,7 +31,7 @@ You can use the [mini-lab](https://github.com/metal-stack/mini-lab) as a templat The metal control plane is typically deployed in a Kubernetes cluster. 
Therefore, this document will assume that you have a Kubernetes cluster ready for getting deployed. Even though it is theoretically possible to deploy metal-stack without Kubernetes, we strongly advise you to use the described method because we believe that Kubernetes gives you a lot of benefits regarding the stability and maintainability of the application deployment. :::tip -For metal-stack it does not matter where your control plane Kubernetes cluster is located. You can of course use a cluster managed by a hyperscaler. This has the advantage of not having to setup Kubernetes by yourself and could even become beneficial in terms of fail-safe operation. However, we also describe a solution of how to setup metal-stack with a self-hosted, [Autonomous Control Plane](../../contributing/01-Proposals/MEP18/README.md) cluster. The only requirement from metal-stack is that your partitions can establish network connections to the metal control plane. If you are interested, you can find a reasoning behind this deployment decision [here](../05-Concepts/01-architecture.mdx#target-deployment-platforms). +For metal-stack it does not matter where your control plane Kubernetes cluster is located. You can of course use a cluster managed by a hyperscaler. This has the advantage of not having to setup Kubernetes by yourself and could even become beneficial in terms of fail-safe operation. However, we also describe a solution of how to setup metal-stack with a self-hosted, [Autonomous Control Plane](/community/MEP-18-autonomous-control-plane) cluster. The only requirement from metal-stack is that your partitions can establish network connections to the metal control plane. If you are interested, you can find a reasoning behind this deployment decision [here](../05-Concepts/01-architecture.mdx#target-deployment-platforms). 
::: Let's start off with a fresh folder for your deployment: diff --git a/versioned_docs/version-v0.22.3/docs/05-Concepts/01-architecture.mdx b/versioned_docs/version-v0.22.3/docs/05-Concepts/01-architecture.mdx index 709960e3..75298df9 100644 --- a/versioned_docs/version-v0.22.3/docs/05-Concepts/01-architecture.mdx +++ b/versioned_docs/version-v0.22.3/docs/05-Concepts/01-architecture.mdx @@ -152,4 +152,4 @@ Thus, for creating a partition as well as a machine or a firewall, the flags `dn In order to be fully offline resilient, make sure to check out `metal-image-cache-sync`. This component provides copies of `metal-images`, `metal-kernel` and `metal-hammer`. -This feature is related to [MEP14](../../contributing/01-Proposals/MEP14/README.md). +This feature is related to [MEP14](/community/MEP-14-independence-from-external-sources). diff --git a/versioned_docs/version-v0.22.3/docs/05-Concepts/02-user-management.md b/versioned_docs/version-v0.22.3/docs/05-Concepts/02-user-management.md index f1ee2778..ba742ee9 100644 --- a/versioned_docs/version-v0.22.3/docs/05-Concepts/02-user-management.md +++ b/versioned_docs/version-v0.22.3/docs/05-Concepts/02-user-management.md @@ -7,7 +7,7 @@ sidebar_position: 2 # User Management At the moment, metal-stack can more or less be seen as a low-level API that does not scope access based on projects and tenants. -Fine-grained access control with full multi-tenancy support is actively worked on in [MEP4](../../contributing/01-Proposals/MEP4/README.md). +Fine-grained access control with full multi-tenancy support is actively worked on in [MEP4](/community/MEP-4-multi-tenancy-for-the-metal-api). Until then projects and tenants can be created, but have no effect on access control. 
diff --git a/versioned_docs/version-v0.22.3/docs/06-For CISOs/Security/01-principles.md b/versioned_docs/version-v0.22.3/docs/06-For CISOs/Security/01-principles.md index 8e7030f5..e327ec4a 100644 --- a/versioned_docs/version-v0.22.3/docs/06-For CISOs/Security/01-principles.md +++ b/versioned_docs/version-v0.22.3/docs/06-For CISOs/Security/01-principles.md @@ -15,7 +15,7 @@ The minimal need to know principle is a security concept that restricts access t ### RBAC :::info -As of now metal-stack does not implement fine-grained Role-Based Access Control (RBAC) within the `metal-api` but this is worked on in [MEP-4](../../../contributing/01-Proposals/MEP4/README.md). +As of now metal-stack does not implement fine-grained Role-Based Access Control (RBAC) within the `metal-api` but this is worked on in [MEP-4](/community/MEP-4-multi-tenancy-for-the-metal-api). ::: As described in our [User Management](../../05-Concepts/02-user-management.md) concept the [metal-api](https://github.com/metal-stack/metal-api) currently offers three different user roles for authorization: diff --git a/versioned_docs/version-v0.22.3/docs/06-For CISOs/Security/04-communication-matrix.md b/versioned_docs/version-v0.22.3/docs/06-For CISOs/Security/04-communication-matrix.md index 07df2607..24c1bc1d 100644 --- a/versioned_docs/version-v0.22.3/docs/06-For CISOs/Security/04-communication-matrix.md +++ b/versioned_docs/version-v0.22.3/docs/06-For CISOs/Security/04-communication-matrix.md @@ -116,7 +116,7 @@ Please note that every [networking setup](../../05-Concepts/03-Network/01-theory | VLAN | Switches, Firewalls | Layer 2 traffic segmentation. | | VXLAN | Switches, Firewalls | Encapsulate Layer 2 frames in Layer 3 packets for network virtualization. | | EVPN | Switches, Firewalls | Overlay network technology for scalable and flexible network architectures. | -| VPN | Firewalls | Management access [without open SSH ports](../../../contributing/01-Proposals/MEP9/README.md). 
| +| VPN | Firewalls | Management access [without open SSH ports](/community/MEP-9-no-open-ports-to-the-data-center). | | BGP | Multiple | Routing protocol for dynamic routing and network management. | | SSH | Management Server, Switches | Secure shell access for management and configuration. | | LLDP | Switches, Machines | Link Layer Discovery Protocol for network device discovery. | diff --git a/versioned_docs/version-v0.22.3/docs/06-For CISOs/rbac.md b/versioned_docs/version-v0.22.3/docs/06-For CISOs/rbac.md index 9a87b896..06c902bb 100644 --- a/versioned_docs/version-v0.22.3/docs/06-For CISOs/rbac.md +++ b/versioned_docs/version-v0.22.3/docs/06-For CISOs/rbac.md @@ -31,4 +31,4 @@ To ensure that internal components interact securely with the metal-api, metal-s Users can interact with the metal-api using [metalctl](https://github.com/metal-stack/metalctl), the command-line interface provided by metal-stack. Depending on the required operations, users should authenticate with the appropriate role to match their level of access. -As part of [MEP-4](../../contributing/01-Proposals/MEP4/README.md), significant work is underway to introduce more fine-grained access control mechanisms within metal-stack, enhancing the precision and flexibility of permission management. +As part of [MEP-4](/community/MEP-4-multi-tenancy-for-the-metal-api), significant work is underway to introduce more fine-grained access control mechanisms within metal-stack, enhancing the precision and flexibility of permission management. 
diff --git a/versioned_docs/version-v0.22.3/docs/06-For CISOs/remote-access.md b/versioned_docs/version-v0.22.3/docs/06-For CISOs/remote-access.md index 0b8dbb19..dc24e82f 100644 --- a/versioned_docs/version-v0.22.3/docs/06-For CISOs/remote-access.md +++ b/versioned_docs/version-v0.22.3/docs/06-For CISOs/remote-access.md @@ -6,7 +6,7 @@ title: Remote Access ## Machines and Firewalls -Remote access to machines and firewalls is essential for performing administrative tasks such as incident management, troubleshooting and sometimes for development. Standard SSH access is often insufficient for these purposes. In many cases, direct serial console access is required to fully manage the system. metal-stack follows a security-first approach by not offering direct SSH access to machines. This practice reduces the attack surface and prevents unauthorized access that could lead to system damage. Detailed information can be found in [MEP-9](../../contributing/01-Proposals/MEP9/README.md). Administrators can access machines in two primary ways. +Remote access to machines and firewalls is essential for performing administrative tasks such as incident management, troubleshooting and sometimes for development. Standard SSH access is often insufficient for these purposes. In many cases, direct serial console access is required to fully manage the system. metal-stack follows a security-first approach by not offering direct SSH access to machines. This practice reduces the attack surface and prevents unauthorized access that could lead to system damage. Detailed information can be found in [MEP-9](/community/MEP-9-no-open-ports-to-the-data-center). Administrators can access machines in two primary ways. 
**Out-of-band management via SOL** @@ -26,4 +26,4 @@ This approach uses the [`metal-console`](../08-References/Control%20Plane/metal- Both methods ensure secure and controlled access to machines without exposing them unnecessarily to the network, maintaining the integrity and safety of the infrastructure. -Connecting directly to a machine without a clear plan of action can have unintended consequences and negatively impact stability. For this reason, administrative privileges are required. This restriction ensures that only authorized personnel with the necessary expertise can perform actions that affect the underlying infrastructure. These principles will evolve with the introduction of [MEP-4](../../contributing/01-Proposals/MEP4/README.md). \ No newline at end of file +Connecting directly to a machine without a clear plan of action can have unintended consequences and negatively impact stability. For this reason, administrative privileges are required. This restriction ensures that only authorized personnel with the necessary expertise can perform actions that affect the underlying infrastructure. These principles will evolve with the introduction of [MEP-4](/community/MEP-4-multi-tenancy-for-the-metal-api). 
\ No newline at end of file diff --git a/versioned_docs/version-v0.22.4/contributing/01-Proposals/MEP1/Distributed-API-Working.png b/versioned_docs/version-v0.22.4/contributing/01-Proposals/MEP1/Distributed-API-Working.png deleted file mode 100644 index 899e223d..00000000 Binary files a/versioned_docs/version-v0.22.4/contributing/01-Proposals/MEP1/Distributed-API-Working.png and /dev/null differ diff --git a/versioned_docs/version-v0.22.4/contributing/01-Proposals/MEP1/Distributed-API.png b/versioned_docs/version-v0.22.4/contributing/01-Proposals/MEP1/Distributed-API.png deleted file mode 100644 index 688c7c2e..00000000 Binary files a/versioned_docs/version-v0.22.4/contributing/01-Proposals/MEP1/Distributed-API.png and /dev/null differ diff --git a/versioned_docs/version-v0.22.4/contributing/01-Proposals/MEP1/Distributed-Deployment.png b/versioned_docs/version-v0.22.4/contributing/01-Proposals/MEP1/Distributed-Deployment.png deleted file mode 100644 index 8bba51b8..00000000 Binary files a/versioned_docs/version-v0.22.4/contributing/01-Proposals/MEP1/Distributed-Deployment.png and /dev/null differ diff --git a/versioned_docs/version-v0.22.4/contributing/01-Proposals/MEP1/Distributed.drawio b/versioned_docs/version-v0.22.4/contributing/01-Proposals/MEP1/Distributed.drawio deleted file mode 100644 index f7c6fe79..00000000 --- a/versioned_docs/version-v0.22.4/contributing/01-Proposals/MEP1/Distributed.drawio +++ /dev/null @@ -1 +0,0 @@ 
-7V3bcts2EP0aP8pDgPfH2ImbziRTt+m0zVMHpmCJMUUoFG1L/fqC4kUkAEoULwDkRC8WIBIkF2d3z+6C8JV5u9r+kqD18jOZ4+gKGvPtlfn+CtKPY9M/Wc8u7/FtM+9YJOE87wKHji/hf7joNIre53CON40DU0KiNFw3OwMSxzhIG30oSchr87BHEjWvukYLzHV8CVDE9/4dztNl3uvZxqH/Iw4Xy/LKwCh+eUDB0yIhz3FxvStoPu4/+c8rVI5VHL9Zojl5rXWZH67M24SQNP+22t7iKJNtKbb8vLuWX6v7TnCcdjkhWVoPJP5zF3/w/trOvG+vv70+z4rJe0HRMy4fw4noeDePhA5L5YmC/Afn+3N2pzcfcfSC0zBAh67s4dJd1DwuO3+22U/3O3oAgOtt/Qxnkf29R0kapiGJy6vS288vnP9eSK4aHFIhrrOvz6voLkEr+vXmdRmm+Ms6v89XClLat0xXEW0B+nU/QziTgUFb1SRkjYCswqD4HqEHHN1UU3pLIpLQn2ISZ9fYpAl5qvABise7Q6swymD/F07mKEZFd4FxkA2LonAR00ZAJwkn1RO94CTF29aJBBU8qNphssJpsqOHFCfMbLeAVKFywC0x+VpDcNm3rKHX9opOVGjNohr9gBz6pQDPGUACkEPSBif0OWkfHGEayyfzMqlWj2QaI4kUcCK1eJGajkikYASRrrboX/K79Qh+vYe77+Ef8RMBM4+TaECCp4SgYClDoJz0BDJuF6jFCNQ0O8oTGiPI055D4NsPc8+Yo0cAwMy0OJHhOfUDRZMk6ZIsSIyiD4dexnTUZDdHm+W+H3SwHNTE3YXZne5HwfH8Xea1souucZz3NH+v24+OZqZ1xrKHPDpfCY5QGr40naFI9PtT6a2jXe2ANQnjdFMb+T7rOMDAAoxaGdBvOqkzT6Bf8nsQn256LadXd7whz0mAi9MYQFVi6a+zrsCfDsTd4ZhPhKwL0H3DaborEICeU9LE5x50JcyCCG02mZ9rYBEcQ00upCOPWdAGOt4Cp0eOc8ZG4SCDypOdmkE1ADdTpVGlPGFNtTkTUuXQI/yYNTfUvobx4tO+9d50xrKeVhPHlsDBAyiwns5Uzsg5Krt2Dy9fdpUMVMgOuCh4QLZredg2fedhBji5MQT7bObckZJzBNt498OQ7HKm3Wm4jW0zXsbmEWaL6LdlTqWegLdesv1MC+Hp72T8SSgMxxkoN2yhquUYuZubjDP4nImgI6JohtahRmbVYsxqlcBR5pLKkPMtYb4U6uSgh23xmSTQlw9aRz3apJmNT5FGsIeedrA3j1ExzfMC0G6BnZS0gFieloCivcGYDXQN2oBeURu4mLCNtRXqozZwMWEbSy80kJ0ol6ModLv5GbqNgjIuza9B5OYp9zbjs1hJoRub7qVs4tqofWBzwKkp7UUEcmx6TD2hrQrkb0gDJqq/8PUSnk8r1IAKjXoHdWx2XQMVgAKuwerEoXLIhAd8dw3neBum/2R4vva8ovl137SL1vttgfZ9Y1dr3OMkpM+X+eWiNkmfNR8LOGW7NljWPIy2b+3qLXa8TurVllE/Hca4HVWw4fv5SS/7hmZc2KyxYzNoailNCkYymJHY2HhqNb/kDAS3MgFUpFBdDgL+IDkI2DUHAfXKQcCLyUFwpWMABSuZJHu3i8lCcMVjHaR3Mg8hbY3mzxLyVCVkv3Miwp0MZ9omIg4UtuSsX2vkVsxfB/goVXXnAxGRxeMuImHBEzZDoCxybXKZDdRPV/rj3pSUsuBKz9Jxb15GmoKiTD/g84mKywn9gK9f5GfysbRy0zJF5FcuwD8Z+Zn22GZo2PzwkbmmkV/1opk+oYt5PGzWKPCzWFOrgfD4qPln/fnC4z4AtAv7LNEKddYB/Zilh6PpmNOOrGsKU1H9AVg+g6neBQhQJR0lMXhLVIGIN4QCVhuXwr9TKqTvIm2fzKdYGr6f1vqiZKXxdkPhT6h7f822Or/VZnXU7IE
qa9sN/Hjsy9tTKxmfHvoJlrPB4koCi8nSf8Pyr53Dx5WLHZ75o3V4nacX6ewFT9ch0chWM6badQSW2pVpqUvd11DXpGbj7dELwS3qw754LjspafPhnobJeMC9YK88Jeko8Uo1j+Oe5XL0UzGna0RTsu7JKwSA8WU+u8fKxLs4OKauxrd1lk/zEElvFtpmv7k7OdCe0Hjm4WNLtc8Onwiu7POMzn2WW9DGTHM1U19Q6QCmVDMtWgS0D9mv12WmcfZwiiHS50+bWjOCtNglU1WgVRMWFMXpk70U4vBxOi8spERYM2hoJy1tV64McMqSVqFwhQ/ZvNfhsww6FuNN/RahlHekcxKUyxSrz4G6auOFqtGtejEtKZS31o0tfPVlhTPTsBlEWdkrTwda6HQyX+fuZMc/QQHa9hvltrLzGmc0t7IbbQM6vjKSJd6Uswb2VU31rMG9JELP5l3U83lXSYJSiRmdPPdosablaKDb1VTyywfdPgH0uZaSf5rjhpJbBi3HTvLhaNNOqglF2bWxUs2keGNnTk7Vvs7ti9+02dfZZsEld19noUQhJ1EVlvSsFZ+MBTwZ0gqfu2AmdVIqPM4aaGQHTc7RV1tHXu45ENoMvxRuZjJZRNo+c5KWew4SHrcBgHLRqdkEY0TtFhSRhMf5KrUb8gost1bYY3WK1NnJNxdUNT181nuaEvi4hhf4ULn1UJvTOrcG3r++PYoy+BehDNLy4sO0wWTLtOq1QZMdPUdELOjKnZUittxtUpkVgr2sEKjZoNKSyTBDnSd1aEDUK43DRheGb9cBcvL4MhvZdrx7/PjBWZ+jIq9ZBmo4E7KlkqHgNUH+2tGpF1rVcw4otfwoliX//6mUqH/FJdzuZKI3cxlT/btycqX5EMGmlhdKNaESrtl5lod67l5GnjPC7P+QZIuaFjx+xkRmmw8ML8Jss2km9Ua73GjuMhIgvWz7iMor2q7uCEDHXzbB/vMJg90zcrzFWWIB6OHjVUwpHDqlw/SUf39Kx1QYYMtr6oN/qDoI7ctPFJm4zpnhiUycrdrECZLOGubZ9FO0Mu/3hnxD18SwWtdwZNe+KdatjVws8UTAtaQCF7414JYb2p0mNbZK5Ir23dMXuRy38UT/JmAk5NJmQrLtlw6uLUHr5Wcyx9kR/wM= \ No newline at end of file diff --git a/versioned_docs/version-v0.22.4/contributing/01-Proposals/MEP1/Distributed.png b/versioned_docs/version-v0.22.4/contributing/01-Proposals/MEP1/Distributed.png deleted file mode 100644 index d96ca216..00000000 Binary files a/versioned_docs/version-v0.22.4/contributing/01-Proposals/MEP1/Distributed.png and /dev/null differ diff --git a/versioned_docs/version-v0.22.4/contributing/01-Proposals/MEP1/README.md b/versioned_docs/version-v0.22.4/contributing/01-Proposals/MEP1/README.md deleted file mode 100644 index 0fd4bb63..00000000 --- a/versioned_docs/version-v0.22.4/contributing/01-Proposals/MEP1/README.md +++ /dev/null @@ -1,141 +0,0 @@ ---- -slug: /MEP-1-distributed-metal-control-plane -title: MEP-1 -sidebar_position: 1 ---- - -# Distributed Metal Control Plane - -This enhancement 
proposal was replaced by [MEP18](../MEP18/README.md). - -## Problem Statement - -We face the situation that we argue for running bare metal on-premises because this way the customers can control where and how their software and data are processed and stored. -On the other hand, we have currently decided that our metal-api control plane components run on a kubernetes cluster (in our case on a cluster provided by one of the available hyperscalers). - -Running the control plane on Kubernetes has the following benefits: - -- Ease of deployment -- Get most, if not all, of the required infrastructure services like (probably incomplete): - - IPs - - DNS - - L7-Loadbalancing - - Storage - - S3 Backup - - High Availability - -Using a kubernetes as a service offering from one of the hyperscalers, enables us to focus on using kubernetes instead of maintaining it as well. - -## Goal - -It would be much saner if metal-stack has no, or only minimal dependencies to external services. Imagine a metal-stack deployment in a plant, it would be optimal if we only have to deliver a single rack with servers and networking gear installed and wired, plug that rack to the power supply and a internet uplink and its ready to go. - -Have a second plant which you want to be part of all your plants? Just tell both that they are part of something bigger and metal-api knows of two partitions. - -## Possible Solutions - -We can think of two different solutions to this vision: - -1. Keep the central control plane approach and require some sort of kubernetes deployment accessible from the internet. This has the downside that the user must, provide a managed kubernetes deployment in his own datacenter or uses a hyperscaler. Still not optimal. -1. Install the metal-api and all its dependencies in every partition, replicate or shard the databases to every connected partition, make them know each other. 
Connect the partitions over the internet with some sort of vpn to make the services visible to each other. - -As we can see, the first approach does not really address the problem, therefore i will describe solution #2 in more details. - -## Central/Current setup - -### Stateful services - -Every distributed system suffer from handling state in a scalable, fast and correct way. To start how to cope with the state, we first must identify which state can be seen as partition local only and which state must be synchronous for read, and synchronous for writes across partitions. - -Affected states: - -- masterdata: e.g. tenant and project must be present in every partition, but these are entities which are read often but updates are rare. A write can therefore be visible with a decent delay in a distinct partition with no consequences. -- ipam: the prefixes and ip´s allocated from machines. These entities are also read often and rare updates. But we must differentiate between dirty reads for different types. A machine network is partition local, ips acquired from such a network must by synchronous in the same partition. Ips acquired from global networks such as internet must by synchronous for all partitions, as otherwise a internet ip could be acquired twice. -- vrf ids: they must only be unique in one partition -- image and size configurations: read often, written seldom, so no high requirements on the storage of these entities. -- images: os images are already replicated from a central s3 storage to a per partition s3 service. metal-hammer kernel and initrd are small and pull always from the central s3, can be done similar to os images. -- machine and machine allocation: must be only synchronous in the partition -- switch: must be only synchronous in the partition -- nsq messages: do not need to cross partition boundaries. No need to keep the messages persistent, even the opposite is true, we don't want to have the messages persist for a longer period. 
- -Now we can see that the most critical state to held and synchronize are the IPAM data, because these entities must be guaranteed to be synchronously updated, while being updated frequently. - -Datastores: - -We use three different types of datastores to persist the states of the metal application. - -- rethinkdb is the main datastore for almost all entities managed by metal-api -- postgresql is used for masterdata and ipam data. -- nsq uses disk and memory tho store the messages. - -### Stateless services - -These are the easy part, all of our services which are stateless can be scaled up and down without any impact on functionality. Even the stateful services like masterdata and metal-api rely fully on the underlying datastore and can therefore also be scaled up and down to meet scalability requirements. - -Albeit, most of these services need to be placed behind a loadbalancer which does the L4/L7 balancing across the started/available replicas of the service for the clients talking to it. This is actually provided by kubernetes with either service type loadbalancer or type clusterip. - -One exception is the `metal-console` service which must have the partition in it´s dns name now, because there is no direct network connectivity between the management networks of the partitions. See "Network Setup) - -## Distributed setup - -### State - -In order to replicate certain data which must be available across all partitions we can use on of the existing open source databases which enable such kind of setup. There are a few available out there, the following incomplete list will highlight the pro´s and cons of each. - -- RethinkDB - - We already store most of our data in RethinkDB and it gives already the ability to synchronize the data in a distributed manner with different guarantees for consistency and latency. This is described here: [Scaling, Sharding and replication](https://rethinkdb.com/docs/sharding-and-replication/). 
But because rethinkdb has a rough history and unsure future with the last release took more than a year, we in the team already thought that we eventually must move away from rethinkdb in the future. - -- Postgresql - - Postgres does not have a multi datacenter with replication in both directions, it just can make the remote instance store the same data. - -- CockroachDB - - Is a Postgresql compatible database engine on the wire. CockroachDB gives you both, ACID and geo replication with writes allowed from all connected members. It is even possible to configure [Follow the Workload](https://www.cockroachlabs.com/docs/stable/topology-follow-the-workload) and [Geo Partitioning and Replication](https://www.cockroachlabs.com/docs/v19.2/topology-geo-partitioned-replicas). - -If we migrate all metal-api entities to be stored the same way we store masterdata, we could use cockroachdb to store all metal entities in one ore more databases spread across all partitions and still ensure consistency and high availability. - -A simple setup how this would look like is shown here. - -![Simple CockroachDB setup](Distributed.png) - -go-ipam was modified in a example PR here: [PR 17](https://github.com/metal-stack/go-ipam/pull/17) - -### API Access - -In order to make the metal-api accessible for api users like `cloud-api` or `metalctl` as easy at it is today, some effort has to be taken. One possible approach would be to use a external loadbalancer which spread the requests evenly to all metal-api endpoints in all partitions. Because all data are accessible from all partitions, a api request going to partition A with a request to create a machine in partition B, will still work. If on the other hand partition B is not in a connected state because the interconnection between both partitions is broken, then of course the request will fail. 
- -**IMPORTANT** -The NSQ Message to inform `metal-core` must end in the correct partition - -To provide such a external loadbalancer we have several opportunities: - -- Cloudflare or comparable CDN service. -- BGP Anycast from every partition - -Another setup would place a small gateway behind the metal-api address, which forwards to the metal-api in the partition where the request must be executed. This gateway, `metal-api-router` must inspect the payload, extract the desired partition, and forward the request without any modifications to the metal-api endpoint in this partition. This can be done for all requests, or if we want to optimize, only for write accesses. - -## Network setup - -In order to have the impact to the overall security concept as minimal as possible i would not modify the current network setup. The only modifications which has to be made are: - -- Allow https ingress traffic to all metal-api instances. -- Allow ssh ingress traffic to all metal-console instances. -- Allow CockroachDB Replication between all partitions. -- No NSQ traffic from outside required anymore, except we cant solve the topic above. - -A simple setup how this would look like is shown here, this does not work though because of the forementioned NSQ issue. - -![API and Console Access](Distributed-API.png) - -Therefore we need the `metal-api-router`: - -![Working API and Console Access](Distributed-API-Working.png) - -## Deployment - -The deployment of our components will substantially differ in a partition compared to a the deployment we have actually. Deploying it in kubernetes in the partition would be very difficult to achieve because we have no sane way to deploy kubernetes on physical machines without a underlying API. -I would therefore suggest to deploy our components in the same way we do that for the services running on the management server. Use systemd to start docker containers. 
- -![Deployment](Distributed-Deployment.png) diff --git a/versioned_docs/version-v0.22.4/contributing/01-Proposals/MEP10/README.md b/versioned_docs/version-v0.22.4/contributing/01-Proposals/MEP10/README.md deleted file mode 100644 index 6811cdc0..00000000 --- a/versioned_docs/version-v0.22.4/contributing/01-Proposals/MEP10/README.md +++ /dev/null @@ -1,197 +0,0 @@ ---- -slug: /MEP-10-sonic-support -title: MEP-10 -sidebar_position: 10 ---- - -# SONiC Support - -As writing this proposal, metal-stack only supports Cumulus on Broadcom ASICs. Unfortunately, after the acquisition of -Cumulus Networks by Nvidia, Broadcom decided to cut its relationship with Cumulus, and therefore Cumulus 4.2 is the last -version that supports Broadcom ASICs. Since trashing the existing hardware is not a solution, adding support for a -different network operating system is necessary. - -One of the remaining big players is [SONiC](https://sonic-net.github.io/SONiC/), which Microsoft created to scale the -network of Azure. It's an open-source project and is now part of the [Linux Foundation](https://www.linuxfoundation.org/press/press-release/software-for-open-networking-in-the-cloud-sonic-moves-to-the-linux-foundation). - -For a general introduction to SONiC, please follow the [Architecture](https://github.com/sonic-net/SONiC/wiki/Architecture) official -documentation. - -## ConfigDB - -On a cold start, the content of `/etc/sonic/config_db.json` will be loaded into the Redis database `CONFIG_DB`, and both -contain the switch's configuration except the BGP unnumbered configuration, which still has to be configured directly by -the frr configuration files. The SONiC community is working to remove this exception, but no release date is known. - -## BGP Configuration - -Frr runs inside a container, and a shell script configured it on the container startup. 
For BGP unnumbered, we must set -the configuration variable `docker_routing_config_mode` to `split` to prevent SONiC from overwriting our configuration -files created by `metal-core`. But by using the split mode, the integrated configuration mode of frr is deactivated, and -we have to write our BGP configuration to the daemon-specific files `bgp.conf`, `staticd.conf`, and `zebra.conf` instead -to `frr.conf`. - -```bash -elif [ "$CONFIG_TYPE" == "split" ]; then - echo "no service integrated-vtysh-config" > /etc/frr/vtysh.conf - rm -f /etc/frr/frr.conf -``` - -Reference: [docker-init](https://github.com/sonic-net/sonic-buildimage/blob/202205/dockers/docker-fpm-frr/docker_init.sh#L69) - -Adding support for the integrated configuration mode, we must at least adjust the startup shell script and the supervisor configuration: - -```bash -{% if DEVICE_METADATA.localhost.docker_routing_config_mode is defined and DEVICE_METADATA.localhost.docker_routing_config_mode == "unified" %} -[program:vtysh_b] -command=/usr/bin/vtysh -b -``` - -Reference: [supervisord.conf](https://github.com/sonic-net/sonic-buildimage/blob/202205/dockers/docker-fpm-frr/frr/supervisord/supervisord.conf.j2#L157) - -## Non-BGP Configuration - -For the Non-BGP configuration we have to write it into the Redis database directly or via one of the following interfaces: - -- `config replace ` -- the Mgmt Framework -- the SONiC restapi - -Directly writing into the Redis database isn't a stable interface, and we must determine the create, delete, and update -operations on our own. The last point is also valid for the Mgmt Framework and the SONiC restapi. Furthermore, the -Mgmt Framework doesn't start anymore for several months, and a [potential fix](https://github.com/sonic-net/sonic-buildimage/pull/10893) -is still not merged. And the SONiC restapi isn't enabled by default, and we must build and maintain our own SONiC images. 
- -Using `config replace` would reduce the complexity in the `metal-core` codebase because we don't have to determine the -actual changes between the running and the desired configuration. The approach's drawbacks are using a version of SONiC -that contains the PR [Yang support for VXLAN](https://github.com/sonic-net/sonic-buildimage/pull/7294), and we must provide -the whole new startup configuration to prevent unwanted deconfiguration. - -### Configure Loopback interface and activate VXLAN - -```json -{ - "LOOPBACK_INTERFACE": { - "Loopback0": {}, - "Loopback0|": {} - }, - "VXLAN_TUNNEL": { - "vtep": { - "src_ip": "" - } - } -} -``` - -#### Configure MTU - -```json -{ - "PORT": { - "Ethernet0": { - "mtu": "9000" - } - } -} -``` - -#### Configure PXE Vlan - -```json -{ - "VLAN": { - "Vlan4000": { - "vlanid": "4000" - } - }, - "VLAN_INTERFACE": { - "Vlan4000": {}, - "Vlan4000|": {} - }, - "VLAN_MEMBER": { - "Vlan4000|": { - "tagging_mode": "untagged" - } - }, - "VXLAN_TUNNEL_MAP": { - "vtep|map_104000_Vlan4000": { - "vlan": "Vlan4000", - "vni": "104000" - } - } -} -``` - -#### Configure VRF - -```json -{ - "INTERFACE": { - "Ethernet0": { - "vrf_name": "vrf104001" - } - }, - "VLAN": { - "Vlan4001": { - "vlanid": "4001" - } - }, - "VLAN_INTERFACE": { - "Vlan4001": { - "vrf_name": "vrf104001" - } - }, - "VRF": { - "vrf104001": { - "vni": "104001" - } - }, - "VXLAN_TUNNEL_MAP": { - "vtep|map_104001_Vlan4001": { - "vlan": "Vlan4001", - "vni": "104001" - } - } -} -``` - -## DHCP Relay - -The DHCP relay container only starts if `DEVICE_METADATA.localhost.type` is equal to `ToRRouter`. - -## LLDP - -SONiC always uses the local port subtype for LLDP and sets it to some freely configurable alias field of the interface. - -```python -# Get the port alias. If None or empty string, use port name instead -port_alias = port_table_dict.get("alias") -if not port_alias: - self.log_info("Unable to retrieve port alias for port '{}'. 
Using port name instead.".format(port_name)) - port_alias = port_name - -lldpcli_cmd = "lldpcli configure ports {0} lldp portidsubtype local {1}".format(port_name, port_alias) -``` - -Reference: [lldpmgr](https://github.com/sonic-net/sonic-buildimage/blob/202205/dockers/docker-lldp/lldpmgrd#L153) - -## Mgmt Interface - -The mgmt interface is `eth0`. To configure a static IP address and activate the Mgmt VRF, use: - -```json -{ - "MGMT_INTERFACE": { - "eth0|": { - "gwaddr": "" - } - }, - "MGMT_VRF_CONFIG": { - "vrf_global": { - "mgmtVrfEnabled": "true" - } - } -} -``` - -[IP forwarding is deactivated on `eth0`](https://github.com/sonic-net/sonic-buildimage/blob/202205/files/image_config/sysctl/sysctl-net.conf#L7), and no IP Masquerade is configured. diff --git a/versioned_docs/version-v0.22.4/contributing/01-Proposals/MEP11/README.md b/versioned_docs/version-v0.22.4/contributing/01-Proposals/MEP11/README.md deleted file mode 100644 index 87f48a10..00000000 --- a/versioned_docs/version-v0.22.4/contributing/01-Proposals/MEP11/README.md +++ /dev/null @@ -1,78 +0,0 @@ ---- -slug: /MEP-11-auditing-of-metal-stack-resources -title: MEP-11 -sidebar_position: 11 ---- - -# Auditing of metal-stack resources - -Currently no logs of the ownership of resources like machines, networks, ips and volumes are generated or kept. Though due to legal requirements data centers are required to keep track of this ownership over time to prevent liability issues when opening the platform for external users. - -In this proposal we want to introduce a flexible and low-maintenance approach for auditing on top of [Meilisearch](https://www.meilisearch.com/). - -## Overview - -In general our auditing logs will be collected by a request interceptor or middleware. Every request and response will be processed and eventually logged to Meilisearch. -Meilisearch will be configured to regularly create chunks of the auditing logs. 
These finished chunks will be backed up to a S3 compatible storage with a read-only option enabled. - -Of course sensitive data like session keys or passwords will be redacted before logging. We want to track relevant requests and responses. If auditing the request fails, the request itself will be aborted and will not be processed further. The requests and responses that will be audited will be annotated with a correlation id. - -Transferring the meilisearch auditing data chunks to the S3 compatible storage will be done by a sidecar cronjob that is executed periodically. -To avoid data manipulation the S3 compatible storage will be configured to be read-only. - -## Whitelisting - -To reduce the amount of unnecessary logs we want to introduce a whitelist of resources and operations on those that should be logged. -Other requests will be passed directly to the next middleware or web service without any further processing. - -As we are only interested in mutating endpoints, we ignore all `GET` requests. -The whitelist includes all `POST`, `PUT`, `PATCH` and `DELETE` endpoints of the HTTP middleware except for the following (non-manipulating) route suffixes: - -- `/find` -- `/notify` -- `/try` and `/match` -- `/capacity` -- `/from-hardware` - -Regarding GRPC audit trails, they are not so interesting because only internal clients are using this API. However, we can log the trails of the `Boot` service, which can be interesting to revise the machine lifecycle. - -## Chunking in Meilisearch - -We want our data to be chunked in Meilisearch. To accomplish this, we rotate the index identifier on a scheduled basis. The index identifiers will be derived from the current date and time. - -To keep things simple, we only support hourly, daily and monthly rotation. The eventually prefixed index names will only include relevant parts of date and time like `2021-01`, `2021-01-01` or `2021-01-01_13`. 
- -The metal-api will only write to the current index and switches to the new index on rotation. The metal-api will never read or update data in any indices. - -## Moving chunks to S3 compatible storage - -As Meilisearch will be filled with data over time, we want to move completed chunks to a S3 compatible storage. This will be done by a sidecar cronjob that is executed periodically. Note that the periods of the index rotation and the cronjob execution don't have to match. - -When the backup process gets started, it initiates a [Meilisearch dump](https://www.meilisearch.com/docs/learn/advanced/dumps) of the whole database across all indices. Once the returned task is finished, the dump must be copied from a Meilisearch volume to the S3 compatible storage. After a successful copy, the dump can be deleted. - -Now we want to remove all indices from Meilisearch, except the most recent one. For this, we [get all indices](https://www.meilisearch.com/docs/reference/api/indexes#list-all-indexes), sort them and [delete each index](https://www.meilisearch.com/docs/reference/api/indexes#delete-an-index) except the most recent one to avoid data loss. - -For the actual implementation, we can build upon [backup-restore-sidecar](https://github.com/metal-stack/backup-restore-sidecar). But due to the index rotation and the fact, that older indices need to be deleted, this probably does not fit into the mentioned sidecar. - -## S3 compatible storage - -The dumps of chunks should automatically deleted after a certain amount of time, once we are either no longer allowed or required to keep them. -The default retention time will be 6 months. Ideally already uploaded chunks should be read-only to prevent data manipulation. - -A candidate for the S3 compatible storage is Google Cloud Storage, which allows to configure automatic expiration of objects through a [lifecycle rule](https://cloud.google.com/storage/docs/managing-lifecycles?hl=en#storage-set-lifecycle-config-go). 
- -## Affected components - -- metal-api grpc server needs an auditing interceptor -- metal-api web server needs an auditing filter chain / middleware -- metal-api needs new command line arguments to configure the auditing -- mini-lab needs a Meilisearch instance -- mini-lab may need a local S3 compatible storage -- we need a sidecar to implement the backup to S3 compatible storage -- Consider auditing of volume allocations and freeings outside of metal-stack - -## Alternatives considered - -Instead of using Meilisearch we investigated using an immutable database like [immudb](https://immudb.io/). But immudb does not support chunking of data and due to its immutable nature, we will never be able to free up space of expired data. Even if we are legally allowed or required to delete data, we will not be able to do so with immudb. - -In another variant of the Meilisearch approach the metal-api would also be responsible for copying chunks to the S3 compatible storage and deleting old indices. But separating the concerns allows completely different implementations for every deployment stage. diff --git a/versioned_docs/version-v0.22.4/contributing/01-Proposals/MEP12/README.md b/versioned_docs/version-v0.22.4/contributing/01-Proposals/MEP12/README.md deleted file mode 100644 index 65532c57..00000000 --- a/versioned_docs/version-v0.22.4/contributing/01-Proposals/MEP12/README.md +++ /dev/null @@ -1,34 +0,0 @@ ---- -slug: /MEP-12-rack-spreading -title: MEP-12 -sidebar_position: 12 ---- - -# Rack Spreading - -Currently, when creating a machine through the metal-api, the machine is placed randomly inside a partition. This algorithm does not consider spreading machines across different racks and different chassis. This may lead to the situation that a group of machines (that for example form a cluster) can end up being placed in the same rack and the same chassis. 
- -Spreading a group of machines across racks can enhance availability for scenarios like a rack losing power or a chassis meltdown. - -So, instead of just randomly deciding the placement of a machine candidate, we want to propose a placement strategy that attempts to spread machine candidates across the racks inside a partition. - -Furthermore a followup improvement to guarantee that machines are really spread across multiple racks, even if multiple machines are ordered in parallel, was implemented with [PR490](https://github.com/metal-stack/metal-api/pull/490). - -## Placement Strategy - -Machines in the project are spread across all available racks evenly within a partition (best effort). For this, an additional request to the datastore has to be made in order to find allocated machines within the project in the partition. - -The algorithm will then figure out the least occupied racks and elect a machine candidate randomly from those racks. - -The user can optionally pass placement tags which will be considered for spreading the machines as well (this will for example allow spreading by a cluster id tag inside the same project). - -## API - -```golang -// service/v1/machine.go - -type MachineAllocation struct { - // existing fields are omitted for readability - PlacementTags []string `json:"placement_tags" description:"by default machines are spread across the racks inside a partition for every project. 
if placement tags are provided, the machine candidate has an additional anti-affinity to other machines having the same tags"` -} -``` diff --git a/versioned_docs/version-v0.22.4/contributing/01-Proposals/MEP13/README.md b/versioned_docs/version-v0.22.4/contributing/01-Proposals/MEP13/README.md deleted file mode 100644 index 2dde20f5..00000000 --- a/versioned_docs/version-v0.22.4/contributing/01-Proposals/MEP13/README.md +++ /dev/null @@ -1,111 +0,0 @@ ---- -slug: /MEP-13-dual-stack-support -title: MEP-13 -sidebar_position: 13 ---- - -# Dual-stack Support - -Dual-stack support is required to be able to create Kubernetes clusters with either IPv6 single-stack or dual-stack enabled. -With the inherent scarcity of IPv4 addresses, the need to be able to use IPv6 has increased. - -Full IPv6 dual-stack support was added to Kubernetes with v1.23 as stable. - -Gardener has had full IPv6 dual-stack support since `v1.109`. - -metal-stack manages CIDRs and IP addresses with the [go-ipam](https://github.com/metal-stack/go-ipam) library, which already got full IPv6 support in 2021 (see [https://metal-stack.io/blog/2021/02/ipv6-part1](https://metal-stack.io/blog/2021/02/ipv6-part1)). -But this was only the foundation; more work needs to be done to get full IPv6 support for all aspects managed by metal-stack.io. - -## General Decisions - -For the general decision we do not look at the isolated clusters feature for now as this would make the solution even more complex and we want to introduce IPv6 in smaller steps to the users. - -### Networks - -Currently, metal-stack organizes CIDRs / prefixes into a `network` resource in the metal-api. A network can consist of multiple CIDRs from the same address family. For example, if an operator wants to provide Internet connectivity to provisioned machines, they can start with small network CIDRs. The number of managed network prefixes can then be expanded as needed over time. 
- -With dual-stack we have to choose between two options: Network per address family or networks with both address families. These options are described in the next section. - -#### Network per Address Family - -This means that we allow networks with CIDRs from one address family only, one for IPv4 and one for IPv6. - -The machine creation process will not change if the machine only needs to be either IPv4 or IPv6 addressable. -But if, on the other hand, the machine needs to be able to connect to both address families, the machine creation needs to specify two networks, one for IPv4 and one for IPv6. -Also, there will be two distinct VRF IDs for every network with a different address family. - -#### Network with both Address Families - -Make a network dual address family capable, meaning that you can add multiple CIDRs from both address families to a network. -Then the machine creation will remain the same for single-stack and dual-stack cases, but the IP address allocation will need to specify the address family from which to allocate an IP address when the network is dual-stack. -This does not break the existing API, but allows existing extensions to easily add dual-stack support. -To avoid additional checking of which address families are available on this network during an IP allocation call, we could store the address families in the network. - -#### Decision - -The decision was made to go with having both address families in a single network entity because we think this is the most flexible way to support dual-stack machines and Kubernetes clusters as well as single-stack with the least amount of modifications on the networking side. 
- -### Examples - -To illustrate the usage we start by creating a tenant super network which has both address families: - -```yaml ---- -id: tenant-super-network-mini-lab -name: Project Super Network -description: Super network of all project networks -partitionid: mini-lab -prefixes: - - 10.0.0.0/16 - - 2001:db8:0:10::/64 -defaultchildprefixlength: - IPv4: 22 - IPv6: 96 -privatesuper: true -``` - -In order to create this network, we simply call: - -```bash -metalctl network create -f tenant-super.yaml -``` - -This is usually done during the initial setup of the environment. - -The next step is to allocate a tenant network where the machines of a project can be placed: - -```bash -metalctl network allocate --partition mini-lab --project 4b9b17c4-2d7c-4190-ae95-dda44e430fa6 --name my-node-network -``` - -This leads to the following network allocation: - -```yaml -id: 2d2c0350-3f66-4597-ae97-ef6797232212 -name: my-node-network -parentnetworkid: tenant-super-network-mini-lab -partitionid: mini-lab -prefixes: - - 10.0.0.0/22 - - 2001:db8:0:10::/96 -projectid: 4b9b17c4-2d7c-4190-ae95-dda44e430fa6 -vrf: 20 -consumption: - ipv4: - available_ips: 1024 - available_prefixes: 256 - used_ips: 2 - used_prefixes: 0 - ipv6: - available_ips: 2147483647 - available_prefixes: 1073741824 - used_ips: 1 - used_prefixes: 0 -privatesuper: false -``` - -Users can then create IP addresses from these child networks. By default, they retrieve an IPv4 address unless the super network consists only of IPv6 prefixes. In the latter case the users acquire an IPv6 address. 
- -```bash -metalctl network ip create --network 2d2c0350-3f66-4597-ae97-ef6797232212 --project 4b9b17c4-2d7c-4190-ae95-dda44e430fa6 -``` diff --git a/versioned_docs/version-v0.22.4/contributing/01-Proposals/MEP14/README.md b/versioned_docs/version-v0.22.4/contributing/01-Proposals/MEP14/README.md deleted file mode 100644 index 47c06434..00000000 --- a/versioned_docs/version-v0.22.4/contributing/01-Proposals/MEP14/README.md +++ /dev/null @@ -1,36 +0,0 @@ ---- -slug: /MEP-14-independence-from-external-sources -title: MEP-14 -sidebar_position: 14 ---- - -# Independence from external sources - -In certain situations some customers may need to operate and create machines without making use of external services like DNS or NTP through the internet. To make this possible, all metal-stack components reaching external services need to be configurable with custom endpoints. - -So far, the following components have been identified as requiring changes: - -- pixiecore -- metal-hammer -- metal-images - -More components are likely to be added to the list during processing. -For DNS and NTP servers it should be possible to provide default values within a partition. They can either be inherited from machines and firewalls or overwritten with custom ones. - -## pixiecore - -An NTP server endpoint needs to be configured on the pixiecore. This can be achieved by providing it through environment variables on startup. - -## metal-hammer - -If using a self-deployed NTP server, the metal-hammer also needs to be configured with it. For backward compatibility, default values from `pool.ntp.org` and `time.google.com` are used. - -## metal-images - -Configurations for the `metal-images` are different for machines and firewalls. - -## metalctl - -In order to pass DNS and NTP servers to partitions and machines while creating them, the flags `dnsservers` and `ntpservers` need to be added. 
- -The implementation of this MEP will make it possible for metal-stack to create and maintain machines without requiring an internet connection. diff --git a/versioned_docs/version-v0.22.4/contributing/01-Proposals/MEP16/README.md b/versioned_docs/version-v0.22.4/contributing/01-Proposals/MEP16/README.md deleted file mode 100644 index dbfa59d6..00000000 --- a/versioned_docs/version-v0.22.4/contributing/01-Proposals/MEP16/README.md +++ /dev/null @@ -1,332 +0,0 @@ ---- -slug: /MEP-16-metal-api-as-an-alternative-configuration-source-for-the-firewall-controller -title: MEP-16 -sidebar_position: 16 ---- - -# metal-api as an Alternative Configuration Source for the firewall-controller - -In the current situation, a firewall as provisioned by metal-stack is a fully immutable entity. Any modifications on the firewall like changing the firewall ruleset must be done _somehow_ by the user – the metal-api and hence metal-stack is not aware of its current state. - -As part of our [integration with the Gardener project](https://docs.metal-stack.io/stable/overview/kubernetes/#Gardener) we offer a solution called the [firewall-controller](https://github.com/metal-stack/firewall-controller), which is part of our [firewall OS images](https://github.com/metal-stack/metal-images/blob/6318a624861b18a559a9d37299bca5f760eef524/firewall/Dockerfile#L57-L58) and addresses shortcomings of the firewall resource's immutability, which would otherwise be completely impractical to work with. The firewall-controller crashes infinitely if it is not properly configured through the userdata when using the firewall image of metal-stack. - -The firewall-controller approach is tightly coupled to Gardener and it requires the administrator of the Gardener installation to pass a shoot and a seed kubeconfig through machine userdata when creating the firewall. 
What this userdata has to look like is not documented and is just part of another project called the [firewall-controller-manager](https://github.com/metal-stack/firewall-controller-manager) (FCM), whose task is to orchestrate rolling updates of firewall machines in a way that network traffic interruption is minimal when updating a firewall or applying a change to an immutable firewall configuration. - -In general, a firewall entity in metal-stack has similarities to the machine entity but it has a fundamental difference: A user gains ownership over a machine after provisioning. They can access it through SSH, modify it at will and this is intentional. For firewalls, however, we do not want a user to access the provisioned firewall as the firewall is a privileged part of the infrastructure with access to the underlay network. The underlay must not be tampered with at any given point in time by a user as this could destroy the entire network traffic flow inside a metal-stack partition. - -For this reason, we have a gap in the metal-stack project in terms of a missing solution for people who do not rely on the Gardener integration. We are basically leaving a user with the option to implement an orchestrated recreation of every possible change on the firewall to minimize traffic interruption for the machines sitting behind the firewall or re-implement the firewall-controller to how they want to use it for their use-case. - -Also, we do not have a clear distinction in the API between user and metal-stack operator for firewalls. If a user were to allocate a firewall, it would also be possible for the user to inject their own SSH keys and access the firewall and tamper with the underlay network. - -Parts of these problems are probably going to decrease with the work on [MEP-4](../MEP4/README.md) where there will be dedicated APIs for users and administrators of metal-stack including fine-grained access tokens. 
- -With this MEP we want to describe a way to improve this current situation and allow other users that do not rely on the Gardener integration – for whatever motivation they have – to adequately manage firewalls. For this, we propose an alternative configuration for the firewall-controller that is native to metal-stack and more independent of Gardener. - -## Proposal - -The central idea of this proposal is allowing the firewall-controller to use the metal-api as a configuration source. This should serve as an alternative strategy to the currently used FCM `Firewall` resource based approach in the Gardener use-case. -Updates of the firewall rules should be possible through the metal-api. - -The firewall-controller itself should now be able to decide which of the two main strategies should be used for the base configuration: a kubeconfig or the metal-api. This should be possible through a dedicated _firewall-controller-config_. - -Using this config will now allow operators to fine-tune the data sources for all of its dynamic configuration tasks independently. -For example the data source of the core firewall rules could be set either from the `Firewall` resource located in the Gardener `Seed` or the metal-apiserver node network entity, while the CWNPs should be fetched and applied from a given kubeconfig (the `Shoot` Kubeconfig in the Gardener case). -This configuration file is intended to be injected during firewall creation through the userdata along with potential source connection credentials. 
- -```yaml -# the name of the firewall, defaulted to the hostname -name: best-firewall-ever - -sources: - seed: - kubeconfig: /path/to/seed.yaml # current gardener behavior - namespace: shoot--proj--name - shoot: - kubeconfig: /path/to/shoot.yaml # current gardener behavior - namespace: firewall - metal: - url: https://metal-api - hmac: some-hmac - type: Metal-View - projectID: abc - static: - # static should mirror all information provided by the metal or seed/shoot sources - firewall: # optional - controllerURL: https://... - cwnp: - egress: [] - ingress: [] - -# all sub-controllers running on the firewall -# each can be configured independently -controllers: - # this is the base controller - firewall: - source: seed # or: metal, static - - # these are optional: when not provided, they are disabled - selfUpdate: - enabled: true - droptailer: - enabled: true - - # these are optional: when not provided, they are disabled - service: - source: shoot # or: metal, static - cwnp: - source: shoot # or: metal, static - monitor: - source: shoot # currently only shoot is supported -``` - -The existing behavior of the firewall-controller writing into `/etc/nftables/firewall-controller.v4` is not changed. The different controller configuration sources are internally treated in the same way as before. The `static` source can be used to prevent the firewall-controller from crashing and consistently providing a static ruleset. This might be interesting for metal-stack native use cases or environments where the metal-api cannot be accessed. - -There must be one central nftables-rule-file-controller that is notified and triggered by all other controllers that contribute to the nftables configuration. 
- -For example, in order to maintain the existing Gardener integration, the configuration file for the firewall-controller will look like this: - -```yaml -name: shoot--abc--cluster-firewall-def -sources: - seed: - kubeconfig: /etc/firewall-controller/seed.yaml - namespace: shoot--abc--cluster - shoot: - kubeconfig: /etc/firewall-controller/shoot.yaml - namespace: firewall - -controllers: - firewall: - source: seed - - selfUpdate: - enabled: true - droptailer: - enabled: true - - service: - source: shoot - cwnp: - source: shoot - monitor: - source: shoot -``` - -Plain metal-stack users might use a configuration like this: - -```yaml -name: best-firewall-ever - -sources: - metal: - url: https://metal-api - hmac: some-hmac - type: Metal-View - projectID: abc - -controllers: - firewall: - source: metal - selfUpdate: - enabled: true - droptailer: - enabled: true - - cwnp: - # firewall rules stored in firewall entity - # potential improvement would be to attach the rules to the node network entity - # be aware that the firewall and private networks are immutable - # eventually we introduce a firewall ruleset entity - source: metal -``` - -In highly restricted environments that cannot access metal-api the static source could be used: - -```yaml -name: most-restricted-firewall-ever - -sources: - static: - firewall: - controllerURL: https://... - cwnp: - egress: [] - ingress: [] - -controllers: - firewall: - source: static - - cwnp: - source: static -``` - -### Non-Goals - -- Resolving the missing differentiation between users and administrators by letting users pass userdata and SSH keys to the firewall creation. - - This is even more related to [MEP-4](../MEP4/README.md) than this MEP. 
- -### Advantages - -- Offers a native metal-stack solution that improves managing firewalls for users by adding dynamic reconfiguration through the metal-api - - e.g., in the mini-lab, users can now allocate a machine, then an IP address and announce this IP from the machine without having to re-create the firewall but by adding a firewall rule to the metal-api. -- Improve consistency throughout the API (firewall rules would reflect what is persisted in metal-api). -- Other providers like Cluster API can leverage this approach, too. -- It can contribute to solving the shoot migration issue (in Cluster API case the `clusterctl move` for firewall objects) - - For Gardener takes the seed out of the equation (of which the kubeconfig changes during shoot migration) - - However: Things like egress rules, rate limiting, etc. are currently not part of the firewall or network entity in the metal-api. These would need to be added to one of them. -- Potentially resolve the issue that end-users can manipulate accounting data of the firewall through the `FirewallMonitor` - - for this we would need to be able to report traffic data to metal-api - -### Caveats - -- Metal-View access is too broad for firewalls. Mitigated by [MEP-4](../MEP4/README.md). -- Polling of the firewall-controller is bad for performance. Mitigated by [MEP-4](../MEP4/README.md). - -### Firewall Controller Manager - -Currently the firewall-controller-manager expects the creators of a `FirewallDeployment` to use the defaulting webhook that is tailored to the Gardener integration in order to generate `Firewall.spec.userdata` or to override it manually. Currently `Firewall.spec.userdata` will never be set explicitly. - -Instead we'd like to propose `Firewall.spec.userdataContents` which will replace the old `userdata`-string by a typed data structure. The FCM will do the heavy lifting while the `FirewallDeployment` creator decides what should be configured. 
- -```yaml -kind: FirewallDeployment -spec: - template: - spec: - userdataContents: - - path: /etc/firewall-controller/config.yaml - content: | - --- - sources: - static: {} - controllers: - firewall: - source: static - - path: /etc/firewall-controller/seed.yaml - contentFrom: - firewallControllerKubeconfigSecret: - name: seed-kubeconfig - key: kubeconfig - - - path: /etc/firewall-controller/shoot.yaml - contentFrom: - secretRef: - name: shoot-kubeconfig - key: kubeconfig -``` - -### Gardener Extension Provider Metal Stack - -The GEPM should be migrated to the new `Firewall.spec.userdataContents` field. - -### Cluster API Provider Metal Stack - -![architectural overview](firewall-for-capms-overview.svg) - -In Cluster API there are essentially two main clusters: the management cluster and the workload cluster while the CAPMS takes in the role of the GEPM. -Typically a local bootstrap cluster is created in KinD which acts as the management cluster. It creates the workload cluster. Thereafter the ownership of the workload cluster is typically moved (using `clusterctl move`) to a different cluster which will then become the management cluster. -The new management cluster might actually be the workload cluster itself. - -In contrast to Gardener, Cluster API aims to be less opinionated and minimal. It is common practice to not install any non-required components or CRDs into the workload cluster by default. Therefore we cannot expect custom resources like `ClusterwideNetworkPolicy` or `FirewallMonitor` to be installed in the workload cluster but strongly recommend our users to do it. Therefore it's the responsibility of the operator to tell [cluster-api-provider-metal-stack](https://github.com/metal-stack/cluster-api-provider-metal-stack) the kubeconfig for the cluster where these CRDs are installed and defined in. 
- -A viable configuration for a `MetalStackCluster` that generates firewall rules based of `Service` type `LoadBalancer` and `ClusterwideNetworkPolicy` and expects them to be deployed in the workload cluster is shown below. The `FirewallMonitor` will be reported into the same cluster. - -```yaml -kind: MetalStackCluster -metadata: - name: ${CLUSTER_NAME} -spec: - firewallTemplate: - userdataContents: - - path: /etc/firewall-controller/config.yaml - contentFrom: - secretRef: - name: ${CLUSTER_NAME}-firewall-controller-config - key: controllerConfig - - - path: /etc/firewall-controller/workload.yaml - contentFrom: - # this is the kubeconfig generated by kubeadm - secretRef: - name: ${CLUSTER_NAME}-kubeconfig - key: value ---- -kind: Secret -metadata: - name: ${CLUSTER_NAME}-firewall-controller-config -stringData: - controllerConfig: | - --- - name: ${CLUSTER_NAME}-firewall - - sources: - metal: - url: ${METAL_API_URL} - hmac: ${METAL_API_HMAC} - type: ${METAL_API_HMAC_TYPE} - projectID: ${METAL_API_PROJECT_ID} - shoot: - kubeconfig: /etc/firewall-controller/workload.yaml - namespace: firewall - - controllers: - firewall: - source: metal - selfUpdate: - enabled: true - droptailer: - enabled: true - - service: - source: shoot - cwnp: - source: shoot - monitor: - source: shoot -``` - -Here the firewall-controller-config will be referenced by the `MetalStackCluster` as a `Secret`. Please note that the `Secret`s in `userdataContents` will not be fetched and will directly be passed to the `FirewallDeployment`. At first the reconciliation of it in the FCM will fail due to the missing Kubeconfig secret. After the `MetalStackCluster` has been marked as ready, CAPI will create this missing secret. Effectively the firewall and initial control plane node should be created at the same time. - -This approach allows maximum flexibility as intended by Cluster API and is still able to provide robust rolling updates of firewalls. 
- -An advanced use case of this flexibility would be a management cluster, that is in charge of multiple workload clusters. Where one workload cluster acts as a monitoring or tooling cluster, receives logs and the firewall monitor for the other workload clusters. The CWNPs could be defined here, all in a separate namespace. - -#### Cluster API Caveats - -When the cluster is pivoted and reconciles its own firewall, a malfunctioning firewall prevents the cluster from self-healing and requires manual intervention by creating a new firewall. This is an inherent problem of the cluster-api approach. It can be circumvented by using an extra cluster to manage workload clusters. - -In the current form of this approach firewalls and therefore the firewall egress and ingress rules are managed by the cluster operators that manage the cluster-api resources. -Hence it will not be possible to gain a fine-grained control over every cluster operator's choices from a central ruleset at the level of metal-stack firewalls. -In case this control surfaces as a requirement, it would need to be implemented in a firewall external to metal-stack. - -## Roadmap - -In general this proposal is not thought to be implemented in one batch. Instead an incremental approach is required. - -1. Enhance firewall-controller-manager - - - Add `FirewallDeployment.spec.template.spec.userdataContents` - -2. Enhance firewall-controller - - - Reduce coupling between controllers - - Introduce controller config - - Abstract module to write into distinct nftable rules for every controller - - Implement `sources.static`, but not `sources.metal` - - GEPM should set `FirewallDeployment.spec.template.spec.userdataContents` - -3. Allow Cluster API to use the FCM with static ruleset - - - Add `firewall.metal-stack.io/paused` annotation (managed by CAPMS during `clusterctl move`, theoretically useful for Gardener shoot migration as well to avoid shallow deletion). 
- - Reconcile multiple `FirewallDeployment` resources across multiple namespaces. For Gardener the old behavior of reconciling only one namespace should persist. - - Allow setting the `firewall.metal-stack.io/no-controller-connection` annotation through the `FirewallDeployment` (either through the template or inheritance). - - Add `MetalStackCluster.spec.firewallTemplate`. - - Make `MetalStackCluster.spec.nodeNetworkID` optional if `spec.firewallTemplate` given. - -4. Add `sources.metal` as configuration option. - - - Allow updates of firewall rules in the metal-apiserver. - - Depends on [MEP-4](../MEP4/README.md) metal-apiserver progress - -5. Potentially migrate the GEPM to use `sources.metal` diff --git a/versioned_docs/version-v0.22.4/contributing/01-Proposals/MEP16/firewall-for-capms-overview.drawio b/versioned_docs/version-v0.22.4/contributing/01-Proposals/MEP16/firewall-for-capms-overview.drawio deleted file mode 100644 index faea3e3d..00000000 --- a/versioned_docs/version-v0.22.4/contributing/01-Proposals/MEP16/firewall-for-capms-overview.drawio +++ /dev/null @@ -1,4 +0,0 @@ - - - -
handles traffic
Firewall
Firewall Controller
node-exporter
nftables-exporter
droptailer-client
Workload Cluster
droptailer
Configures
Bootstrap or Management Cluster
reconcile
configures
reconcile
Cluster API Provider metal-stack
Metal Stack Cluster CRD
Firewall Deployment CRD
Firewall CRD
Firewall Set CRD
rec
reconcile
reconcile
Firewall Controller Manager
Metal Stack Machine CRD
manages
Admin
Kubeconfig FirewallMonitor
FirewallMonitor CRD
main metal-api
Firewall entity
kubeconfig CWNP
Clusterwide Network Policy CRD
base config
controllerConfig
user-defined
network rules
reports firewall
state
send firewall log lines
controllerConfig
controllerConfig
\ No newline at end of file diff --git a/versioned_docs/version-v0.22.4/contributing/01-Proposals/MEP16/firewall-for-capms-overview.svg b/versioned_docs/version-v0.22.4/contributing/01-Proposals/MEP16/firewall-for-capms-overview.svg deleted file mode 100644 index 853f8175..00000000 --- a/versioned_docs/version-v0.22.4/contributing/01-Proposals/MEP16/firewall-for-capms-overview.svg +++ /dev/null @@ -1 +0,0 @@ -
handles traffic
handles traffic
Firewall
Firewall
Firewall Controller
Firewall Controller
node-exporter
node-exporter
nftables-exporter
nftables-exporter
droptailer-client
droptailer-client
Workload Cluster
Workload Cluster
droptailer
droptailer
Configures
Configures
Bootstrap or Management Cluster
Bootstrap or Management Cluster
reconcile
reconcile
configures
configures
reconcile
reconcile
Cluster API Provider metal-stack
Cluster API Provider...
Metal Stack Cluster CRD
Metal Stack Cluster...
Firewall Deployment CRD
Firewall Deployment...
Firewall CRD
Firewall CRD
Firewall Set CRD
Firewall Set CRD
rec
rec
reconcile
reconcile
reconcile
reconcile
Firewall Controller Manager
Firewall Controller...
Metal Stack Machine CRD
Metal Stack Machine...
manages
manages
Admin
Admin
Kubeconfig FirewallMonitor
Kubeconfig FirewallMonitor
FirewallMonitor CRD
FirewallMonitor CRD
main metal-api
main metal-api
Firewall entity
Firewall entity
kubeconfig CWNP
kubeconfig CWNP
Clusterwide Network PolicyCRD
Clusterwide Network...
base config
base config
controllerConfig
controllerConfig
user-defined
network rules
user-defined...
reports firewall
state
reports firewall...
send firewall log lines
send firewall log lines
controllerConfig
controllerConfig
controllerConfig
controllerConfig
Text is not SVG - cannot display
\ No newline at end of file diff --git a/versioned_docs/version-v0.22.4/contributing/01-Proposals/MEP17/README.md b/versioned_docs/version-v0.22.4/contributing/01-Proposals/MEP17/README.md deleted file mode 100644 index 35f48970..00000000 --- a/versioned_docs/version-v0.22.4/contributing/01-Proposals/MEP17/README.md +++ /dev/null @@ -1,61 +0,0 @@ ---- -slug: /MEP-17-global-network-view -title: MEP-17 -sidebar_position: 17 ---- - -# Global Network View - -> [!IMPORTANT] -> This MEP assumes the implementation of the metal-apiserver as described by [MEP-4](../MEP4/README.md) which is currently work in progress. - -Having a complete view of the network topology is useful when working with deployments or troubleshooting connectivity issues. -Currently, the API doesn't know of any other switches than the leaf switches. -Information about all other switches and their connections must be gathered from Ansible inventories or by accessing the switches via SSH. -Documentation of each partition's network must be kept in-sync with all changes made to the deployment or cabling. -We would like to expand the API's knowledge of the network to the entire underlay including inter-switch connections as well as BGP statistics and health status. - -## Switch Types - -Registering a switch at the API is done by the metal-core. -Apart from that, it also reconciles port and FRR configuration to adapt to the machine provisioning cycle. -This reconfiguration is only necessary on the leaf switches. -To allow deploying the metal-core on other switches than leaves we need a way of telling it what type of switch it is running on so it can act accordingly. -On any non-leaf switches it will only register the switch and report statistic but not change any configuration. -Supported switch types are - -- `leaf` -- `spine` -- `exit` -- `mgmtleaf` -- `mgmtspine` - -## Network Topology - -All switches should periodically report their LLDP neighbors and port configuration. 
-This information can be used to quickly identify common network issues, like MTU mismatch or the like. -Ideally, there would be some graphical representation of the network topology containing only the most important information for a quick overview. -It should contain all switches and machines as nodes and all connections as edges of a graph. -Ports, VRFs, and maybe also IPs should be associated with a connection. - -Apart from the topology graph, there should be a way to display more detailed information about both ports of a connection, like - -- MTU -- speed -- IP -- UP/DOWN status -- VRF -- VLAN -- whether it participates in a BGP session - -## BGP Announcements - -The metal-core should collect all routes it knows about and send them to the API along with a timestamp. -Reported routes should be stored to a redis database along with the switch that reported them and the timestamp of the last time they were reported. -An expiration threshold should be defined and all expired routes should be cleaned up periodically. -Whenever new routes are reported they get merged into the existing ones by the strategy: - -- when new, just add -- when existing, update `last_announced` timestamp - -By querying the BGP announcements we can find out whether an allocated IP is still in use. diff --git a/versioned_docs/version-v0.22.4/contributing/01-Proposals/MEP18/README.md b/versioned_docs/version-v0.22.4/contributing/01-Proposals/MEP18/README.md deleted file mode 100644 index 9c02c0b7..00000000 --- a/versioned_docs/version-v0.22.4/contributing/01-Proposals/MEP18/README.md +++ /dev/null @@ -1,147 +0,0 @@ ---- -slug: /MEP-18-autonomous-control-plane -title: MEP-18 -sidebar_position: 18 ---- - -# Autonomous Control Plane - -As described in the [deployment chapter](../../../docs/04-For%20Operators/03-deployment-guide.mdx), we strongly recommend Kubernetes as the target platform for running the metal-stack control plane. 
- -Kubernetes clusters for this purpose are readily available from hyperscalers, metalstack.cloud, or other cloud providers. Simply using a managed Kubernetes cluster greatly simplifies a metal-stack installation. However, sometimes it might be desirable to host the metal-stack control plane autonomously, without the help of another cloud provider. Reasons for this might include corporate policies that prohibit the use of external data center products, or network constraints. - -The Kubernetes cluster hosting the metal-stack control plane must provide at least the following features: - -- Load balancing (for exposing the APIs) -- Persistent storage (for the databases and key-value stores) -- Access to object storage for automated backups of the stateful sets -- Access to a DNS provider supported by one of the used DNS extensions -- Externally accessible DNS records for obtaining officially signed certificates through DNS challenges - -This metal-stack control plane cluster must also be highly available to prevent a complete loss of control over the managed resources in the data center. -Regular Kubernetes updates to apply security fixes and feature updates must be possible in an automated manner. The Day-2 operational overhead of running this cluster in your own datacenter must be reasonable. - -In this chapter, we propose a solution for setting up a metal-stack environment with an autonomous control plane that is independent of another cloud provider. - -## Use Your Own Dogfood - -The most obvious solution is to just deploy a Kubernetes cluster manually in your own data center by utilizing existing tooling for the deployment: - -- k3s -- kubeadm -- vmware and rancher -- talos -- kubespray -- ... (not a complete list) - -However, all these solutions add another layer of complexity that needs to be maintained and operated by people who also need to learn and understand metal-stack. 
In general, metal-stack in combination with [Gardener](https://gardener.cloud) contains all the necessary tools to provide KaaS, so it makes sense to reuse what is already in place without introducing new dependencies on other products and vendors. - -The only problem here is that Gardener is not yet able to create an initial cluster, which may change with the implementation of [GEP-28](https://github.com/gardener/gardener/blob/master/docs/proposals/28-autonomous-shoot-clusters.md). In the meantime, we suggest using [k3s](https://k3s.io/), which manages the initial metal-stack partition to host the control plane, since the maintenance overhead is acceptable and it is easy to deploy. - -## The Matryoshka Principle - -Instead of directly using the K3s cluster for the production control plane, we propose using it as a minimal control plane cluster whose only purpose is to host the production control plane cluster. This layer of indirection brings some reasonable advantages: - -- In the event of an interruption or loss of this minimal control plane cluster, the production control plane remains unaffected, and end users can continue to manage their clusters as normal. -- A dedicated operations team can take care of the Day-2 maintenance of this installation, which can be handy because the tools like k3s are a little different from the rest of the setup (it is likely that more manual maintenance is required than for any other cluster). This would also be true if the initial cluster problem were solved by the Gardener itself and not using k3s. -- Since the number of shoot clusters to host is static, the resource requirements are minimal and will not change significantly over time. There are no huge resource requirements in terms of cpu, memory and storage. As such, the lack of scalability is not such a big issue. - -So, our proposal is to chain two metal-stack control planes. 
The initial control plane cluster would use k3s and on this cluster we can spin up a cluster for the production control plane with the use of Gardener. - -The following figure shows what the high-level architecture of this setup looks like. An even more simplified illustration of this setup can be looked up in the appendix[^1]. - -![Autonomous Control Plane Architecture](./autonomous-control-plane-images/metal-stack-autonomous-control-plane-full.svg) - -The k3s nodes can either be bare metal machines or virtual machines. When using VMs a single k3s node might be a viable solution, too. These nodes are supposed to be set up manually / partly automated with an operating system like Debian. - -To name the cluster that hosts the initial metal-stack control plane and Gardener we use the term _initial cluster_. The initial cluster creates worker nodes to host the _target cluster_. - -## Initial Cluster - -The initial cluster is kept very small. The physical bare metal machines can be any machines and switches which are supported by metal-stack, but can be smaller in terms of cpu, memory and network speed because these machines must only be capable of running the target cluster for the metal-stack control plane. A typical single socket server with 8-16 cores and 64GB of RAM and two NVMe drives of 1TB would be a good starting point. - -In a typical k3s setup, a stateful set would lose the data once the k3s cluster was terminated and started again. But there is a possibility to define parts of the local storage of the server to be provided to the k3s cluster for the PVCs. With that, k3s could be terminated and started again, for example to update and reboot the host os, or update k3s itself and the data will persist. 
- -Example k3s configuration for persistent storage on the hosts os: - -```yaml -k3s: Cluster -apiVersion: k3s.x-k8s.io/v1alpha4 -name: needle-control-plane -nodes: - - role: control-plane - # add a mount from /path/to/my/files on the host to /files on the node - extraMounts: - - hostPath: /path/to/my/files - containerPath: /files -``` - -Into this cluster metal-stack and Gardener will be deployed. This deployment can be done by a Gitlab runner which is running on this machine. -The mini-lab will be used as a base for this deployment. The current development of [gardener-in-minilab](https://github.com/metal-stack/mini-lab/pull/202) must be extended to host all required extensions to make this a working metal-stack control plane which can manage the machines in the attached bare metal setup. - -In addition to the metal-stack and Gardener deployment, some additional required services are deployed (non-complete list): - -- PowerDNS to serve as a DNS Server for all DNS entries used in the initial and the target cluster, like `api.initial.metal-stack.local`, `gardener-api.initial.metal-stack.local` and the DNS entries for the api servers of the created kubernetes clusters. -- NTP -- Monitoring for the initial cluster and partition -- Optional: OIDC Server for authenticating against the metal-api -- Optional: Container Registry to host all metal-stack and gardener containers -- Optional: Let's Encrypt [boulder](https://github.com/letsencrypt/boulder) as a certificate authority -- ... - -Physical view, minimal setup for a initial cluster with a single physical node: - -![Small Initial Cluster](autonomous-control-plane-images/small-initial-cluster.svg) - -Physical View, bigger ha setup which is spread across two data centers: - -![HA Initial Cluster](autonomous-control-plane-images/ha-initial-cluster.svg) - -### Control Plane High Availability - -Running the initial control plane on a single physical server is not as available as it should be in such a use case. 
It should be possible to survive a loss of this server, because the server could be lost by many events, such as hardware failure, disk corruption or even failure of the datacenter location where this server is deployed. - -Setting up a second server with the same software components is an option, but the problem of data redundancy must be solved, because neither the gardener control plane, nor the metal-stack control plane can be instantiated twice. - -Given that we provide part of the local storage of the server as backing storage for the stateful sets in the k3s cluster, the data stored on the server itself must be replicated to another server and backed up on a regular basis. - -The replication of ETCD can be achieved through [clustered configuration](https://docs.k3s.io/datastore/ha-embedded) of k3s. Components of metal-stack and Gardener can run standalone and already utilize backup-restore mechanism that must be configured accordingly. For two or more bare metal machines used for the initial cluster, a loadbalancing mechanism for the ingress is required. kube-vip could be a possible solution. - -For monitoring, a backend like a Victoria Metrics Cluster would allow spreading the monitoring data across the initial cluster nodes. These metrics should also be backed up in object storage. - -### Partition - -The partition which is managed by the initial cluster can be a simple and small hardware setup but yet capable enough to host the target cluster. It would even be a good practice to create separate target clusters on the initial cluster, e.g. one for the metal-stack control plane and one for the Gardener (maybe one more for monitoring). - -It can follow the metal-stack minimal setup which provides about 8-16 small servers connected to a 1G/s or 10G/s network dataplane. Central storage is optional as the persistence of the services running in these clusters is always backed up to a central object storage. 
Operations would be much easier if a central storage is provided. - -## Target Cluster - -The target cluster is the metal-stack environment which serves for end-user production use; the control plane is running in a shoot hosted in the initial cluster. The seed(s) and shoot(s) for end-users are created on the machines provided by the target cluster. -These machines can be of a different type in terms of size, but more importantly, these machines are connected to another network dataplane. Also the management infrastructure is separated from the initial cluster management network. - -## Failure Scenarios - -Everything could fail, everything will fail at some point. But this must be kept in mind and nothing bad should happen if only one component at a time fails. -If more than one fails, the restoration to a working state must be easily possible and well documented. - -To ensure all possible breakages are documented, we suggest writing a list which summarizes all failure scenarios that might occur including the remediation. - -Here is an example of what a scenario documentation could look like: - -**Scenario**: Initial cluster is gone, all machines have died -**Impact**: Management of the initial cluster infrastructure not possible anymore, the target cluster continues to run but cannot be managed because the API servers are gone. End-users are not affected by this incident. -**Remediation**: The initial cluster nodes must be provisioned from scratch and re-deployed through the CI mechanism. The backups of the stateful sets are automatically restored during this process. 
- -## Implementation - -As part of this proposal, we provide the following tools and integrations in order to setup an autonomous control plane: - -- Deployment roles for the services like PowerDNS and NTP for the initial cluster -- Stretch goal: Deployment role to setup k3s in clustered configuration for the initial cluster and update it -- Extend the Gardener on mini-lab integration to allow shoot creation in the mini-lab -- Steady integration of the setup (maybe something like [k3d](https://github.com/k3d-io/k3d) in the mini-lab) - -## Appendix - -[^1]: ![metal-stack-chain](autonomous-control-plane-images/metal-stack-chain.svg) diff --git a/versioned_docs/version-v0.22.4/contributing/01-Proposals/MEP18/autonomous-control-plane-images/ha-initial-cluster.drawio b/versioned_docs/version-v0.22.4/contributing/01-Proposals/MEP18/autonomous-control-plane-images/ha-initial-cluster.drawio deleted file mode 100644 index eafcb514..00000000 --- a/versioned_docs/version-v0.22.4/contributing/01-Proposals/MEP18/autonomous-control-plane-images/ha-initial-cluster.drawio +++ /dev/null @@ -1,535 +0,0 @@ - - - - - - - - - - - - - - - - - - - 1 - - - - - 2 - - - - - 3 - - - - - 4 - - - - - 5 - - - - - 6 - - - - - 7 - - - - - 8 - - - - - 9 - - - - - 10 - - - - - 11 - - - - - 12 - - - - - - - - - - - - - -
-
-
- internet-router-management -
-
-
-
- - internet-router-management - -
-
-
- - - - - - - - - -
-
-
- management-switch-and-server -
-
-
-
- - management-switch-and-server - -
-
-
- - - - - - - - - -
-
-
- spine01 -
-
-
-
- - spine01 - -
-
-
- - - - - - - - - -
-
-
- leaf01 -
-
-
-
- - leaf01 - -
-
-
- - - - - - - - - -
-
-
- leaf02 -
-
-
-
- - leaf02 - -
-
-
- - - - - - - - - - - - - -
-
-
- - mirocloud (initial cluster partition nodes) - -
-
-
-
- - mirocloud (initial cluster... - -
-
-
- - - - - - - - - - - - - -
-
-
- - Initial cluster node 01 - -
-
-
-
- - Initial cluster node 01 - -
-
-
- - - - - - - - - - - - - 1 - - - - - 2 - - - - - 3 - - - - - 4 - - - - - 5 - - - - - 6 - - - - - 7 - - - - - 8 - - - - - 9 - - - - - 10 - - - - - 11 - - - - - 12 - - - - - - - - - - - - - -
-
-
- internet-router-management -
-
-
-
- - internet-router-management - -
-
-
- - - - - - - - - -
-
-
- management-switch-and-server -
-
-
-
- - management-switch-and-server - -
-
-
- - - - - - - - - -
-
-
- spine02 -
-
-
-
- - spine02 - -
-
-
- - - - - - - - - -
-
-
- leaf03 -
-
-
-
- - leaf03 - -
-
-
- - - - - - - - - -
-
-
- leaf04 -
-
-
-
- - leaf04 - -
-
-
- - - - - - - - - - - - - -
-
-
- - Initial cluster node 02 - -
-
-
-
- - Initial cluster node 02 - -
-
-
- - - - - - - - - - - - - -
-
-
- - Initial cluster node 03 - -
-
-
-
- - Initial cluster node 03 - -
-
-
- - - - - - - - - - - - - -
-
-
- - mirocloud (initial cluster partition nodes) - -
-
-
-
- - mirocloud (initial cluster... - -
-
-
-
- - - - - Text is not SVG - cannot display - - - -
diff --git a/versioned_docs/version-v0.22.4/contributing/01-Proposals/MEP18/autonomous-control-plane-images/ha-initial-cluster.svg b/versioned_docs/version-v0.22.4/contributing/01-Proposals/MEP18/autonomous-control-plane-images/ha-initial-cluster.svg deleted file mode 100644 index 99261ada..00000000 --- a/versioned_docs/version-v0.22.4/contributing/01-Proposals/MEP18/autonomous-control-plane-images/ha-initial-cluster.svg +++ /dev/null @@ -1 +0,0 @@ -123456789101112
internet-router-management
internet-router-management
management-switch-and-server
management-switch-and-server
spine01
spine01
leaf01
leaf01
leaf02
leaf02
mirocloud (initial cluster partition nodes)
mirocloud (initial cluster...
Initial cluster node 01
Initial cluster node 01
123456789101112
internet-router-management
internet-router-management
management-switch-and-server
management-switch-and-server
spine02
spine02
leaf03
leaf03
leaf04
leaf04
Initial cluster node 02
Initial cluster node 02
Initial cluster node 03
Initial cluster node 03
mirocloud (initial cluster partition nodes)
mirocloud (initial cluster...
Text is not SVG - cannot display
\ No newline at end of file diff --git a/versioned_docs/version-v0.22.4/contributing/01-Proposals/MEP18/autonomous-control-plane-images/metal-stack-autonomous-control-plane-full.drawio b/versioned_docs/version-v0.22.4/contributing/01-Proposals/MEP18/autonomous-control-plane-images/metal-stack-autonomous-control-plane-full.drawio deleted file mode 100644 index aae8a12d..00000000 --- a/versioned_docs/version-v0.22.4/contributing/01-Proposals/MEP18/autonomous-control-plane-images/metal-stack-autonomous-control-plane-full.drawio +++ /dev/null @@ -1,1133 +0,0 @@ - - - - - - - - - - - - - - - - - - - -
-
-
- Initial Cluster -
-
-
-
- - Initial Cluster - -
-
-
- - - - - - - - - - - - - - -
-
-
- metal-roles -
-
-
-
- - metal-roles - -
-
-
- - - - - - - -
-
-
- CI -
-
-
-
- - CI - -
-
-
- - - - - - - -
-
-
- K3s Standalone - - - (on Debian) - - -
-
-
-
- - K3s Standalone (on Debian) - -
-
-
- - - - - - - - - - - - - - - - - -
-
-
- Initial Partition -
-
-
-
- - Initial Partition - -
-
-
- - - - - - - - - - - - - -
-
-
- Target Cluster for metal-stack -
-
-
-
- - Target Cluster for metal-stack - -
-
-
- - - - - - - - - - - - - -
-
-
- Metal Control Plane -
-
-
-
- - Metal Control Plane - -
-
-
- - - - - - - - - - - -
-
-
- provisions -
-
-
-
- - provisions - -
-
-
- - - - - - - - - - - - - -
-
-
- Target Cluster for Gardener -
-
-
-
- - Target Cluster for Gardener - -
-
-
- - - - - - - - - - -
-
-
- Gardener Control Plane -
-
-
-
- - Gardener Control Plane - -
-
-
- - - - - - - - - - - - - - - - - -
-
-
- Monitoring -
-
-
-
- - Monitoring - -
-
-
- - - - - - - - - - - - - - - - -
-
-
- Target Partition -
-
-
-
- - Target Partition - -
-
-
- - - - - - - - - - -
-
-
- Gardener Seeds and End-User Shoots -
-
-
-
- - Gardener Seeds and End-User Shoots - -
-
-
- - - - - - - - - - - -
-
-
- provisions -
-
-
-
- - provisions - -
-
-
- - - - - - - - -
-
-
- metal-roles -
-
-
-
- - metal-roles - -
-
-
- - - - - - - -
-
-
- CI -
-
-
-
- - CI - -
-
-
- - - - - - - - -
-
-
- metal-roles -
-
-
-
- - metal-roles - -
-
-
- - - - - - - - - - -
-
-
- ETCD can be clustered or standalone, backed up by sidecar -
-
-
-
- - ETCD can be clustere... - -
-
-
- - - - - - - - - - -
-
-
- This data will get lost in case local PV gets deleted -
-
-
-
- - This data will get l... - -
-
-
- - - - - - - - - - -
-
-
- We can work with local PVs here, too. -
- backup-restore-sidecar for metal-stack databases, for big ones Postgres clustered. -
-
-
-
- - We can work with local PVs he... - -
-
-
- - - - - - - -
-
-
- ETCD will be deployed in HA configuration on local PVs. -
-
- csi-driver-lvm needs to implement auto deletion of orphaned PVs. -
-
- Seed metrics get lost, but they report to the monitoring in the Metal Control Plane Shoot. -
-
-
-
- - ETCD will be deployed in HA c... - -
-
-
- - - - - - - - - - -
-
-
- More sophisticated storage solutions can be in place. -
-
- (Lightbits, NetApp, ...) -
-
-
-
- - More sophisticated storage so... - -
-
-
- - - - - - - - - - -
-
-
- TODO: Evaluate how to persist these metrics. -
-
-
-
- - TODO: Evaluate how to persist... - -
-
-
- - - - - - - - - - -
-
-
- - 1 VM or -
-
-
- - - 3 Bare Metal Machines - - -
-
-
-
-
- - 1 VM or... - -
-
-
- - - - - - - - - - - - - - -
-
-
- metal-stack -
-
-
-
- - metal-stack - -
-
-
- - - - - - - -
-
-
- metal-api -
-
-
-
- - metal-api - -
-
-
- - - - - - - -
-
-
- metal-db -
-
-
-
- - metal-db - -
-
-
- - - - - - - -
-
-
- ipam-db -
-
-
-
- - ipam-db - -
-
-
- - - - - - - -
-
-
- masterdata-db -
-
-
-
- - masterdata-db - -
-
-
- - - - - - - -
-
-
- headscale-db -
-
-
-
- - headscale-db - -
-
-
- - - - - - - -
-
-
- auditing-db -
-
-
-
- - auditing-db - -
-
-
- - - - - - - -
-
-
- nsqd -
-
-
-
- - nsqd - -
-
-
- - - - - - - - - - - -
-
-
- Gardener -
-
-
-
- - Gardener - -
-
-
- - - - - - - - - - -
-
-
- Virtual Garden -
-
-
-
- - Virtual Garden - -
-
-
- - - - - - - -
-
-
- Gardener Control Plane -
-
-
-
- - Gardener Control Plane - -
-
-
- - - - - - - -
-
-
- gardenlet -
-
-
-
- - gardenlet - -
-
-
- - - - - - - -
-
-
- Garden etcd -
-
-
-
- - Garden etcd - -
-
-
- - - - - - - -
-
-
- Prometheus -
-
-
-
- - Prometheus - -
-
-
- - - - - - - - - - - -
-
-
- Monitoring -
-
-
-
- - Monitoring - -
-
-
- - - - - - - - - - -
-
-
- - Gitlab - -
- - Runner - -
-
-
-
-
- - Gitlab... - -
-
-
- - - - - - - - - - -
-
-
- Services -
-
-
-
- - Services - -
-
-
- - - - - - - -
-
-
- PowerDNS -
-
-
-
- - PowerDNS - -
-
-
- - - - - - - -
-
-
- boulder -
-
-
-
- - boulder - -
-
-
- - - - - - - -
-
-
- NTP -
-
-
-
- - NTP - -
-
-
- - - - - - - -
-
-
- OIDC -
-
-
-
- - OIDC - -
-
-
- - - - - - - -
-
-
- ... -
-
-
-
- - ... - -
-
-
-
- - - - - Text is not SVG - cannot display - - - -
diff --git a/versioned_docs/version-v0.22.4/contributing/01-Proposals/MEP18/autonomous-control-plane-images/metal-stack-autonomous-control-plane-full.svg b/versioned_docs/version-v0.22.4/contributing/01-Proposals/MEP18/autonomous-control-plane-images/metal-stack-autonomous-control-plane-full.svg deleted file mode 100644 index e58e783b..00000000 --- a/versioned_docs/version-v0.22.4/contributing/01-Proposals/MEP18/autonomous-control-plane-images/metal-stack-autonomous-control-plane-full.svg +++ /dev/null @@ -1 +0,0 @@ -
Initial Cluster
Initial Cluster
metal-roles
metal-roles
CI
CI
K3s Standalone(on Debian)
K3s Standalone (on Debian)
Initial Partition
Initial Partition
Target Cluster for metal-stack
Target Cluster for metal-stack
Metal Control Plane
Metal Control Plane
provisions
provisions
Target Cluster for Gardener
Target Cluster for Gardener
Gardener Control Plane
Gardener Control Plane
Monitoring
Monitoring
Target Partition
Target Partition
Gardener Seeds and End-User Shoots
Gardener Seeds and End-User Shoots
provisions
provisions
metal-roles
metal-roles
CI
CI
metal-roles
metal-roles
ETCD can be clustered or standalone, backed up by sidecar
ETCD can be clustere...
This data will get lost in case local PV gets deleted
This data will get l...
We can work with local PVs here, too.
backup-restore-sidecar for metal-stack databases, for big ones Postgres clustered.
We can work with local PVs he...
ETCD will be deployed in HA configuration on local PVs.

csi-driver-lvm needs to implement auto deletion of orphaned PVs.

Seed metrics get lost, but they report to the monitoring in the Metal Control Plane Shoot.
ETCD will be deployed in HA c...
More sophisticated storage solutions can be in place.

(Lightbits, NetApp, ...)
More sophisticated storage so...
TODO: Evaluate how to persist these metrics.
TODO: Evaluate how to persist...
1 VM or
3 Bare Metal Machines
1 VM or...
metal-stack
metal-stack
metal-api
metal-api
metal-db
metal-db
ipam-db
ipam-db
masterdata-db
masterdata-db
headscale-db
headscale-db
auditing-db
auditing-db
nsqd
nsqd
Gardener
Gardener
Virtual Garden
Virtual Garden
Gardener Control Plane
Gardener Control Plane
gardenlet
gardenlet
Garden etcd
Garden etcd
Prometheus
Prometheus
Monitoring
Monitoring
Gitlab
Runner
Gitlab...
Services
Services
PowerDNS
PowerDNS
boulder
boulder
NTP
NTP
OIDC
OIDC
...
...
Text is not SVG - cannot display
\ No newline at end of file diff --git a/versioned_docs/version-v0.22.4/contributing/01-Proposals/MEP18/autonomous-control-plane-images/metal-stack-chain.drawio b/versioned_docs/version-v0.22.4/contributing/01-Proposals/MEP18/autonomous-control-plane-images/metal-stack-chain.drawio deleted file mode 100644 index cd5cf007..00000000 --- a/versioned_docs/version-v0.22.4/contributing/01-Proposals/MEP18/autonomous-control-plane-images/metal-stack-chain.drawio +++ /dev/null @@ -1,404 +0,0 @@ - - - - - - - - - - -
-
-
- Partition 1 -
-
-
-
- - Partition 1 - -
-
- - - - -
-
-
- seeds -
-
-
-
- - seeds - -
-
- - - - -
-
-
- shoots -
-
-
-
- - shoots - -
-
- - - - - - -
-
-
- Partition 2 -
-
-
-
- - Partition 2 - -
-
- - - - -
-
-
- seeds -
-
-
-
- - seeds - -
-
- - - - -
-
-
- shoots -
-
-
-
- - shoots - -
-
- - - - - - -
-
-
- Partition 3 -
-
-
-
- - Partition 3 - -
-
- - - - -
-
-
- seeds -
-
-
-
- - seeds - -
-
- - - - -
-
-
- shoots -
-
-
-
- - shoots - -
-
- - - - - - -
-
-
- Production Control Plane -
-
-
-
- - Production Control Plane - -
-
- - - - -
-
-
- metal-stack -
- kubernetes cluster -
-
-
-
- - metal-stack... - -
-
- - - - -
-
-
- gardener -
- kubernetes cluster -
-
-
-
- - gardener... - -
-
- - - - -
-
-
- - Manages - -
-
-
-
- - Manages - -
-
- - - - - - - - -
-
-
- Control Plane Partition -
-
-
-
- - Control Plane Partition - -
-
- - - - - -
-
-
- backup of stateful sets -
-
-
-
- - backup of stateful sets - -
-
- - - - - - -
-
-
- bare metal machine -
-
-
-
- - bare metal machine - -
-
- - - - -
-
-
- metal-stack -
- and -
- gardener -
- kubernetes cluster -
- running in kind -
-
-
-
- - metal-stack... - -
-
- - - - -
-
-
- - Manages - -
-
-
-
- - Manages - -
-
- - - - - -
-
-
- S3 -
-
-
-
- - S3 - -
-
- - - - -
-
-
- Needle -
-
-
-
- - Needle - -
-
- - - -
-
-
- - Nail - -
-
-
-
- - Nail - -
-
-
- - - - - Text is not SVG - cannot display - - - -
\ No newline at end of file diff --git a/versioned_docs/version-v0.22.4/contributing/01-Proposals/MEP18/autonomous-control-plane-images/metal-stack-chain.svg b/versioned_docs/version-v0.22.4/contributing/01-Proposals/MEP18/autonomous-control-plane-images/metal-stack-chain.svg deleted file mode 100644 index 8f88ba14..00000000 --- a/versioned_docs/version-v0.22.4/contributing/01-Proposals/MEP18/autonomous-control-plane-images/metal-stack-chain.svg +++ /dev/null @@ -1 +0,0 @@ -
Partition 1
Partition 1
seeds
seeds
shoots
shoots
Partition 2
Partition 2
seeds
seeds
shoots
shoots
Partition 3
Partition 3
seeds
seeds
shoots
shoots
Production Control Plane
Production Control Plane
metal-stack
kubernetes cluster
metal-stack...
gardener
kubernetes cluster
gardener...
Manages
Manages
Control Plane Partition
Control Plane Partition
backup of stateful sets
backup of stateful sets
bare metal machine
bare metal machine
metal-stack
and
gardener
kubernetes cluster
running in kind
metal-stack...
Manages
Manages
S3
S3
Needle
Needle 
Nail
Nail
Text is not SVG - cannot display
\ No newline at end of file diff --git a/versioned_docs/version-v0.22.4/contributing/01-Proposals/MEP18/autonomous-control-plane-images/small-initial-cluster.drawio b/versioned_docs/version-v0.22.4/contributing/01-Proposals/MEP18/autonomous-control-plane-images/small-initial-cluster.drawio deleted file mode 100644 index a75ee340..00000000 --- a/versioned_docs/version-v0.22.4/contributing/01-Proposals/MEP18/autonomous-control-plane-images/small-initial-cluster.drawio +++ /dev/null @@ -1,234 +0,0 @@ - - - - - - - - - - - - - - - - - - - 1 - - - - - 2 - - - - - 3 - - - - - 4 - - - - - 5 - - - - - 6 - - - - - 7 - - - - - 8 - - - - - 9 - - - - - 10 - - - - - 11 - - - - - 12 - - - - - - - - - - - - - -
-
-
- internet-router-management -
-
-
-
- - internet-router-management - -
-
-
- - - - - - - - - -
-
-
- management-switch-and-server -
-
-
-
- - management-switch-and-server - -
-
-
- - - - - - - - - -
-
-
- leaf01 -
-
-
-
- - leaf01 - -
-
-
- - - - - - - - - -
-
-
- leaf02 -
-
-
-
- - leaf02 - -
-
-
- - - - - - - - - - - - - -
-
-
- Initial cluster node -
-
-
-
- - Initial cluster node - -
-
-
- - - - - - - - - - - - - -
-
-
- mirocloud (initial cluster partition nodes) -
-
-
-
- - mirocloud (initial cluster... - -
-
-
-
- - - - - Text is not SVG - cannot display - - - -
diff --git a/versioned_docs/version-v0.22.4/contributing/01-Proposals/MEP18/autonomous-control-plane-images/small-initial-cluster.svg b/versioned_docs/version-v0.22.4/contributing/01-Proposals/MEP18/autonomous-control-plane-images/small-initial-cluster.svg deleted file mode 100644 index a9d29f05..00000000 --- a/versioned_docs/version-v0.22.4/contributing/01-Proposals/MEP18/autonomous-control-plane-images/small-initial-cluster.svg +++ /dev/null @@ -1 +0,0 @@ -123456789101112
internet-router-management
internet-router-management
management-switch-and-server
management-switch-and-server
leaf01
leaf01
leaf02
leaf02
Initial cluster node
Initial cluster node
mirocloud (initial cluster partition nodes)
mirocloud (initial cluster...
Text is not SVG - cannot display
\ No newline at end of file diff --git a/versioned_docs/version-v0.22.4/contributing/01-Proposals/MEP2/README.md b/versioned_docs/version-v0.22.4/contributing/01-Proposals/MEP2/README.md deleted file mode 100644 index c7f2360a..00000000 --- a/versioned_docs/version-v0.22.4/contributing/01-Proposals/MEP2/README.md +++ /dev/null @@ -1,7 +0,0 @@ ---- -slug: /MEP-2-two-factor-authentication -title: MEP-2 -sidebar_position: 2 ---- - -# Two Factor Authentication diff --git a/versioned_docs/version-v0.22.4/contributing/01-Proposals/MEP3/README.md b/versioned_docs/version-v0.22.4/contributing/01-Proposals/MEP3/README.md deleted file mode 100644 index 5ce36721..00000000 --- a/versioned_docs/version-v0.22.4/contributing/01-Proposals/MEP3/README.md +++ /dev/null @@ -1,67 +0,0 @@ ---- -slug: /MEP-3-machine-re-installation -title: MEP-3 -sidebar_position: 3 ---- - -# Machine Re-Installation - -In the current metal-api only machine installations are possible, performing a machine upgrade is only possible by creating a new machine and delete the old one. -This has the drawback that in case a lot of data is stored on the local disks, a full restore of the original data must be performed. - -To prevent this, we will introduce a new metal-api endpoint to reinstall the machine with a new image, _without_ actually deleting the data stored on the additional hard disks. - -Storage is a difficult task to get right and reliable. A short analysis of our different storage requirements lead to 3 different scenarios. - -- Storage for the etcd pvs in the seed cluster of every partition. - This is the most important storage in our setup because these etcd pods serve as configuration backend for all customer kubernetes clusters. If they fail, the cluster is down. 
However gardener deploys a backup and restore sidecar into the etcd pod of every customer kubernetes control plane, and if this sidecar detects a corrupt or missing etcd database file(s) it starts automatic restore from the configured backup location. This will take some minutes. If for example a node dies, and gardener creates a new node instead, the csi-lvm created pv is not present on that node. Kubernetes will not schedule the missing etcd pod on this node because it has a local PV configured and is therefore tainted to run only on that node. To let kubernetes create that pod anyhow, someone has to either remove the taint, or delete the pod. If this is done, the pod starts and the restore of the etcd data can start as well. You can see this is a bit too complicated and will take the customer cluster down for a while (not measured yet but in the range of 5-10 minutes). -- Storage in customer clusters. - This was not promised in 2020. We have an intermediate solution with the provisioning of csi-lvm by default into all customer clusters. Albeit this is only local storage and will get deleted if a node dies. -- S3 Storage. - We have two possibilities to cope with storage: - - In place update of the OS with a daemonset - This will be fast and simple, but might fail because the packages being installed are broken right now, or a filesystem gets full, or any other failure you can think of during an os update. Another drawback is that metal-api does not reflect the updated os image. - - metal-api gets a machine reinstall endpoint - With this approach we leverage existing and already proven mechanisms. Reinstall must keep all data except the sata-dom. Gardener currently is not able to do an update with this approach because it can only do `rolling` updates. Therefore an additional `osupdatestrategy` has to be implemented for metal and other providers in gardener to be able to leverage the metal reinstall on the same machineID approach. 
- -If reinstall is implemented, we should focus on the same technology for all scenarios and put ceph via rook.io into the kubernetes clusters as additional StorageClass. It has to be checked whether to use the raw disk or a PV as the underlay block device where ceph stores its data. - -## API and behavior - -The API will get a new endpoint "reinstall"; this endpoint takes two arguments: - -- machineID -- image - -No other aspects of the machine can be modified during the re-installation. All data stored in the existing allocation will be preserved, only the image will be modified. -Once this endpoint was called, the machine will get a `reboot` signal with the boot order set to PXE instead of HDD and the network interfaces on the leaf are set to PXE as well. Then the normal installation process starts: - -- unchanged: PXE boot with metal-hammer -- changed: metal-hammer first checks with the machineID in the metal-api (through metal-core) if there is already an allocation present -- changed: if an allocation is present and the allocation has set `reinstall: true`, wipe disk is only executed for the root disk, all other disks are untouched. -- unchanged: the specified image is downloaded and burned, `/install.sh` is executed -- unchanged: successful installation is reported back, network is set to the vrf, boot order is set to HDD. -- unchanged: distribution kernel is booted via kexec - -We can see that the `allocation` requires one additional parameter: `reinstall` and metal-hammer must check for an already existing allocation at an earlier stage. - -Components which require modifications (first guess): - -- metal-hammer: - - check for allocation present earlier - - evaluation of `reinstall` flag set - - wipe of disks depends on that flag - - Bonus: move configuration of disk layout and primary disk detection algorithm (PDDA) from metal-hammer into metal-api. - metal-api **MUST** reject reinstallation if the disk found by PDDA does not have the `/etc/metal` directory! 
-- metal-core: - - probably nothing -- metal-api: - - new endpoint `/machine/reinstall` - - add `Reinstall bool` to data model of `allocation` - - make sure to reset `Reinstall` after reinstallation to prevent endless reinstallation loop -- metalctl: - - implement `reinstall` -- metal-go: - - implement `reinstall` -- gardener (longterm): - - add the `OSUpgradeStrategy` `reinstall` diff --git a/versioned_docs/version-v0.22.4/contributing/01-Proposals/MEP4/README.md b/versioned_docs/version-v0.22.4/contributing/01-Proposals/MEP4/README.md deleted file mode 100644 index 389a02d4..00000000 --- a/versioned_docs/version-v0.22.4/contributing/01-Proposals/MEP4/README.md +++ /dev/null @@ -1,211 +0,0 @@ ---- -slug: /MEP-4-multi-tenancy-for-the-metal-api -title: MEP-4 -sidebar_position: 4 ---- - -# Multi-Tenancy for the metal-api -:::info -This document is work in progress. -::: - -In the past we decided to treat the metal-api as a "low-level API", i.e. the API does not specifically deal with projects and tenants. A user with editor access can for example assign machines to every project he desires, he can see all the machines available and can control them. We tried to keep the metal-api code base as small as possible and we added resource scoping to a "higher-level APIs". From there, a user would be able to only see his own clusters and IP addresses. - -As time passed metal-stack has become an open-source project and people are willing to adopt. Adopters who want to put their own technologies on top of the metal-stack infrastructure don't have those "higher-level APIs" that we implemented closed-source for our user base. So, external adopters most likely need to implement resource scoping on their own. 
- -Introducing multi-tenancy to the metal-api is a serious chance of making our product better and more successful as it opens the door for: - -- Becoming a "fully-featured" API -- Narrowing down attack surfaces and possibility of unintended resource modification produced by bugs or human errors -- Discouraging people to implement their own scoping layers in front of the metal-stack -- Gaining performance through resource scopes -- Letting untrusted / third-parties work with the API - -## Requirements - -These are some general requirements / higher objectives that MEP-4 has to fulfill. - -- Should be able to run with mini-lab without requiring to setup complex auth backends (dex, LDAP, keycloak, ...) - - Simple to start with, more complex options for production setups -- Fine-grained access permissions (every endpoint maps to a permission) -- Tenant scoping (disallow resource access to resources of other tenants) -- Project scoping (disallow resource access to resources of other projects) -- Access tokens in self-service for technical user access - -## Implementation - -We gathered a lot of knowledge while implementing a multi-tenancy-capable backend for metalstack.cloud. The goal is now to use the same technology and adopt that to the metal-api, this includes: - -- gRPC in combination with connectrpc -- OPA for making auth decisions -- REST HTTP only for OIDC login flows - -### API Definitions - -The API definitions should be located on a separate Github repository separate from the server implementation. The proposed repository location is: https://github.com/metal-stack/api. - -This repository contains the `proto3` specification of the exposed metal-stack api. This includes the messages, simple validations, services and the access permission to these services. The input parameters for the authorization in the backend are generated from the `proto3` annotations. - -Client implementations for the most relevant languages (go, python) are generated automatically. 
- -This api is divided into end-user and admin access at the top level. The proposed APIs are: - -- `metalstack.api.v2`: For end-user facing services -- `metalstack.admin.v2`: For operators and controllers which need access to unscoped entities - -The methods of the API can have different role scopes (and can be narrowed down further with fine-grained method permissions): - -- `tenant`: Tenant-scoped methods, e.g. project creation (tenant needs to be provided in the request payload) - - Available roles: VIEWER, EDITOR, OWNER -- `project`: Project-scoped methods, e.g. machine creation (tenant needs to be provided in the request payload) - - Available roles: VIEWER, EDITOR, OWNER -- `admin` Admin-scoped methods, e.g. unscoped tenant list or switch register - - Available roles: VIEWER, EDITOR - -And has methods with different visibility scopes: - -- `self`: Methods that only the logged in user can access, e.g. show permissions with the presented token -- `public`: Methods that do not require any specific authorization - -### API - -The API server implements the services defined in the API and validates access to a method using OPA with the JWT tokens passed in the requests. The server is implemented using the connectrpc.com framework. - -The API server implements the login flow through OIDC. After successful authentication, the API server derives user permissions from the OIDC provider and issues a new JWT token which is passed on to the user. The tokens including the permissions are stored in a redis compatible backend. - -With these tokens, users can create Access Tokens for CI/CD or other use cases. - -JWT Tokens can be revoked by admins and the user itself. - -### API Server - -Is put into a new github repo which implements the services defined in the `api` repository. It opens a `https` endpoints where the grpc (via connectrpc.com) and oidc services are exposed. 
- -### Migration of the Consumers - -To allow consumers to migrate to the `v2` API gradually, both apis, the new and the old, are deployed in parallel. In the control-plane both apis are deployed side-by-side behind the ingress. `api.example.com` is forwarded to `metal-api` and `metal.example.com` is forwarded to the new `metal-apiserver`. - -The api-server will talk to the existing metal-api during the process of migration services away to the new grpc api. - -The migration process can be done in the following manner: - -for each resource in the metal-api: - -- create a new proto3 based definition in the `api` repo. -- implement the business logic per service in the new `metal-apiserver` without calling the metal-api. -- clients must be able to talk to `v1` and `v2` backend in parallel -- Deprecate the already migrated service in the swagger route to notify the client that this route should not be used anymore. -- identify all consumers of this resource and replace them to use the grpc instead of the rest api -- move the business logic incl. the backend calls to ipam, metal-db, masterdata-api, nsq for this resource from the metal-api to the `metal-apiserver` - -We will migrate the rethinkdb backend implementation to a generic approach during this effort. - -- Try to enhance the generic rethinkdb interface with `project` scoped methods. - -There are a lot of consumers of metal-api, which need to be migrated: - -- ansible -- firewall-controller -- firewall-controller-manager -- gardener-extension-auth -- gardener-extension-provider-metal - - Do not point the secret bindings to a the shared provider secret in the seed anymore. Instead, use individual provider-secret containing project-scoped API access tokens in the Gardener project namespaces. 
-- machine-controller-manager-provider-metal -- metal-ccm -- metal-console -- metal-bmc -- metal-core -- metal-hammer -- metal-image-cache-sync -- metal-images -- metal-metrics-exporter -- metal-networker -- metalctl -- pixie - -## User Scenarios - -This section gathers a collection of workflows from the perspective of a user that we want to provide with the implementation of this proposal. - -### Machine Creation - -A regular user wants to create a machine resource. - -Requirements: Project was created, permissions are present - -- The user can see networks that were provided by the admin. - - ``` - $ metalctl network ls - ID NAME PROJECT PARTITION NAT SHARED PREFIXES IPS - internet Internet Network true false 212.34.83.0/27  ● - tenant-super-network-fra-equ01 Project Super Network fra-equ01 false false 10.128.0.0/14  ● - underlay-fra-equ01 Underlay Network fra-equ01 false false 10.0.0.0/16  ● - ``` - -- The user has to set the project scope first or provide `--project` flags for all commands. - ``` - $ metalctl project set 793bb6cd-8b46-479d-9209-0fedca428fe1 - You are now acting on project 793bb6cd-8b46-479d-9209-0fedca428fe1. - ``` -- The user can create the child network required for machine allocation. - ``` - $ metalctl network allocate --partition fra-equ01 --name test - ``` -- Now, the user sees his own child network. - ``` - $ metalctl network ls - ID NAME PROJECT PARTITION NAT SHARED PREFIXES IPS - internet Internet Network true false 212.34.83.0/27  ● - tenant-super-network-fra-equ01 Project Super Network fra-equ01 false false 10.128.0.0/14  ● - └─╴08b9114b-ec47-4697-b402-a11421788dc6 test 793bb6cd-8b46-479d-9209-0fedca428fe1 fra-equ01 false false 10.128.64.0/22  ● - underlay-fra-equ01 Underlay Network fra-equ01 false false 10.0.0.0/16  ● - ``` -- The user does not see any machines yet. - ``` - $ metalctl machine ls - ``` -- The user can create a machine. 
- ``` - $ metalctl machine create --networks internet,08b9114b-ec47-4697-b402-a11421788dc6 --name test --hostname test --image ubuntu-20.04 --partition fra-equ01 --size c1-xlarge-x86` - ``` -- The machine will now be provisioned. - ``` - $ metalctl machine ls - ID LAST EVENT WHEN AGE HOSTNAME PROJECT SIZE IMAGE PARTITION - 00000000-0000-0000-0000-ac1f6b7befb2 Phoned Home 20s 50d 4h test 793bb6cd-8b46-479d-9209-0fedca428fe1 c1-xlarge-x86 Ubuntu 20.04 20210415 fra-equ01 - ``` - -:::warning -A user **cannot** list all allocated machines for all projects. The user **must** always switch project context first and can only view the machines inside this project. Only admins can see all machines at once. -::: -### Scopes for Resources - -The admins / operators of the metal-stack should be able to provide _global_ resources that users are able to use along with their own resources. In particular, users can view and use _global_ resources, but they are not allowed to create, modify or delete them. - -:::info -When a project ID field is empty on a resource, the resource is considered _global_. -::: - -Where possible, users should be capable of creating their own resource entities. - -| Resource | User | Global | -| :----------------- | :--- | :----- | -| File System Layout | yes | yes | -| Firewall | yes | | -| Firmware | | yes | -| OS Image | | yes | -| Machine | yes | | -| Network (Base) | | yes | -| Network (Children) | yes | | -| IP | yes | | -| Partition | | yes | -| Project | yes | | -| Project Token | yes | | -| Size | | yes | -| Switch | | | -| Tenant | | yes | - -:::info -Example: A user can make use of the file system layouts provided by the admins, but can also create own layouts. Same applies for images. As soon as a user creates own resources, the user takes over the responsibility for the machine provisioning to succeed. 
-::: diff --git a/versioned_docs/version-v0.22.4/contributing/01-Proposals/MEP5/README.md b/versioned_docs/version-v0.22.4/contributing/01-Proposals/MEP5/README.md deleted file mode 100644 index 3b7fc45c..00000000 --- a/versioned_docs/version-v0.22.4/contributing/01-Proposals/MEP5/README.md +++ /dev/null @@ -1,54 +0,0 @@ ---- -slug: /MEP-5-shared-networks -title: MEP-5 -sidebar_position: 5 ---- - -# Shared Networks - -## Why are shared networks needed - -For special purpose machines that serve shared services with performance critical workloads to all machines of a partition (like persistent storage) it would be good to have kind of a "shared network" that is easily accessible. -They do not necessarily need another firewall. This would avoid having two firewalls in the datapath between a machine in a private network and the machines of a shared service. - -## Constraints that need to hold - -- a shared network is usable from all machines that have a firewall in front, that uses it -- a shared network is only usable within a single partition (currently we are constrained in bandwidth and have no routing of 10.0.0.0/8 addresses btw. 
partitions and failure domain should be the partition but this constraint might get lifted in the future) -- networks may be marked as shared after network allocation (but there should be no way back from shared to unshared) -- neither machines nor firewalls may have multiple private, unshared networks configured -- machines must have a single primary network configured - - this might be a shared network - - OR a plain, unshared private network -- firewalls may participate in multiple shared networks -- machines can be allocated with a primary network using auto IP allocation or with `noauto` and a specific IP - -## Should shared networks be private - -**Alternative 1:** If we implemented shared networks by extending functions around plain, private networks we would not have to manage another CIDR (mini point) and it would be possible to create a k8s cluster with a private network, mark the network as `shared` and produce shared services from this k8s cluster. - -**Alternative 2:** If shared networks are implemented as first class networks we could customize the VRF and also accomplish another goal of our roadmap: being able to create machines directly in an external network. - -Together with @majst01 and @Gerrit91 we decided to continue to implement **Alternative 1**. - -## Firewalls accessing a shared network - -Firewalls that access shared networks need to: - -- hide the private network behind an IP address of the shared network if the shared network was configured with `nat=true`. -- import the prefixes of the shared VRF to the private VRF and import the prefixes of the private VRF to the shared VRF so that the communication between the two is working in both directions. As long as no `nat=true` was set on the shared VRF, the original machine IPs are visible in both communication directions. 
- -## Setup with shared networks and single consumer - -![Simple Setup](./shared.png) - -## Setup with single shared network and multiple consumers - -![Advanced Setup](./shared_advanced.png) - -## Getting internet access - -Machines contained in a shared network can access the internet with different scenarios: - -- if they have an own firewall: this is internet accessibility, as common (check whether all traffic gets routed through it!) -- if they don't have an own firewall, an external HTTP proxy is needed that has an endpoint exposed as Service Type NodePort diff --git a/versioned_docs/version-v0.22.4/contributing/01-Proposals/MEP5/shared.drawio b/versioned_docs/version-v0.22.4/contributing/01-Proposals/MEP5/shared.drawio deleted file mode 100644 index aa7af045..00000000 --- a/versioned_docs/version-v0.22.4/contributing/01-Proposals/MEP5/shared.drawio +++ /dev/null @@ -1,121 +0,0 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - \ No newline at end of file diff --git a/versioned_docs/version-v0.22.4/contributing/01-Proposals/MEP5/shared.png b/versioned_docs/version-v0.22.4/contributing/01-Proposals/MEP5/shared.png deleted file mode 100644 index b0b47f03..00000000 Binary files a/versioned_docs/version-v0.22.4/contributing/01-Proposals/MEP5/shared.png and /dev/null differ diff --git a/versioned_docs/version-v0.22.4/contributing/01-Proposals/MEP5/shared_advanced.drawio b/versioned_docs/version-v0.22.4/contributing/01-Proposals/MEP5/shared_advanced.drawio deleted file mode 100644 index 6f96eca0..00000000 --- a/versioned_docs/version-v0.22.4/contributing/01-Proposals/MEP5/shared_advanced.drawio +++ /dev/null @@ -1,187 +0,0 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - \ No newline at end of file diff --git a/versioned_docs/version-v0.22.4/contributing/01-Proposals/MEP5/shared_advanced.png b/versioned_docs/version-v0.22.4/contributing/01-Proposals/MEP5/shared_advanced.png deleted file mode 100644 index da989915..00000000 Binary files a/versioned_docs/version-v0.22.4/contributing/01-Proposals/MEP5/shared_advanced.png and /dev/null differ diff --git a/versioned_docs/version-v0.22.4/contributing/01-Proposals/MEP6/README.md b/versioned_docs/version-v0.22.4/contributing/01-Proposals/MEP6/README.md deleted file mode 100644 index edf52a6c..00000000 --- a/versioned_docs/version-v0.22.4/contributing/01-Proposals/MEP6/README.md +++ /dev/null @@ -1,123 +0,0 @@ ---- -slug: /MEP-6-dmz-networks -title: MEP-6 -sidebar_position: 6 ---- - -# DMZ Networks - -## Reasoning - -To fulfill higher levels of security measures the standard metal-stack approach with a single firewall in front of a set of machines might be insufficient. -There are cases where two physically distinct firewalls in front of application workload are mandatory. In traditional network terms this is known as DMZ approach. - -For Kubernetes workloads it makes sense to use the front cluster for ingress, WAF purposes and as outgoing proxy. The clusters may be used for application workload. - -## DMZ network - -- Use a separate DMZ network prefix for every tenant -- This is used as intermediate network btw. 
private networks of a tenant and the internet -- For every partition a distinct DMZ firewall/cluster is needed for a tenant -- For Gardener orchestrated Kubernetes clusters this network must be a publicly reachable internet prefix because shoot clusters need a vpn service that is used for instrumentation from the seed cluster - this will be a requirement as long as the inverse vpn tunnel feature Konnectivity is not available to us. - -## Approach 1: DMZ with publicly reachable internet prefix - -![DMZ Internet](dmz-internet_public.svg) - -A DMZ network with publicly reachable internet prefix will look like this in the metal-api: - -```yaml ---- -description: DMZ-Network -destinationprefixes: - - 0.0.0.0/0 -id: dmz -labels: - network.metal-stack.io/default-external: "" -name: DMZ-Network -parentnetworkid: null -partitionid: "" -prefixes: - - 212.90.30.128/25 -privatesuper: false -projectid: "" -vrf: 104007 -vrfshared: false -nat: true -shared: false -underlay: false -``` - -### DMZ firewall - -The firewall of the DMZ will intersect its private network for attached machines, the DMZ network and the public internet. - -- The private network of the project needs to import - - the default route from the internet network - - the DMZ network -- The internet network must import the DMZ network -- The DMZ network provides the default route for tenant's clusters in a partition. It imports the default route from the internet network - -### Application Firewall - -The firewall of application workloads intersects its private network for attached machines and the DMZ network. - -This is currently supported by the metal-networker and needs no further changes! 
- -## Approach 2: DMZ with private IPs - -![DMZ Internet](dmz-internet_private.svg) - -A DMZ network with private IPs will look like this in the metal-api: - -```yaml ---- -description: DMZ-Network -destinationprefixes: - - 0.0.0.0/0 -id: dmz -labels: - network.metal-stack.io/default-external: "" -name: DMZ-Network -parentnetworkid: tenant-super-network-fra-equ01 -partitionid: fra-equ01 -prefixes: - - 10.90.30.128/25 -privatesuper: false -projectid: "" -vrf: 4711 -vrfshared: false -nat: true -shared: true # it's usable from multiple projects -underlay: false -``` - -### DMZ firewall - -The firewall of the DMZ will intersect its private network for attached machines, the DMZ network and the public internet. - -- The private network of the project needs to import - - the default route from the internet network - - the DMZ network -- The internet network must import the DMZ network (only locally, no-export) -- The DMZ network provides the default route for tenant's clusters in a partition. It imports the default route from the internet network - -### Application Firewall - -The firewall of application workloads intersects its private network for attached machines and the DMZ network. 
- -## Code Changes / Implications - -- `metal-networker` and `metal-ccm` assume that there is only one network providing the default-route -- `metal-networker` needs to - - import the default route from the internet network to the dmz network (DMZ Firewall) - - import the DMZ network to the internet network and adjusting NAT rules (DMZ Firewall) - - import destination prefixes of the DMZ network to the private primary network (DMZ Firewall, Application Firewall) - - import DMZ-IPs of the private primary network to the DMZ network (DMZ Firewall, Application Firewall) -- `metal-api`: destination prefixes of private networks need to be configurable (`allocateNetwork`) -- `gardener-extension-provider-metal`: needs to be able to delete DMZ clusters (but skip the network deletion part) -- the application firewall is not publicly reachable - for debugging purposes a hop over the DMZ firewall is needed - -## Decision - -We decided to follow the second approach with private DMZ networks. diff --git a/versioned_docs/version-v0.22.4/contributing/01-Proposals/MEP6/dmz-internet_private.drawio b/versioned_docs/version-v0.22.4/contributing/01-Proposals/MEP6/dmz-internet_private.drawio deleted file mode 100644 index 7b83bbfc..00000000 --- a/versioned_docs/version-v0.22.4/contributing/01-Proposals/MEP6/dmz-internet_private.drawio +++ /dev/null @@ -1,178 +0,0 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - \ No newline at end of file diff --git a/versioned_docs/version-v0.22.4/contributing/01-Proposals/MEP6/dmz-internet_private.svg b/versioned_docs/version-v0.22.4/contributing/01-Proposals/MEP6/dmz-internet_private.svg deleted file mode 100644 index f5e58204..00000000 --- 
a/versioned_docs/version-v0.22.4/contributing/01-Proposals/MEP6/dmz-internet_private.svg +++ /dev/null @@ -1,3 +0,0 @@ -
Machine
Machine
Firewall DMZ
Firewall DMZ
DMZ VRF
DMZ VRF
Machine
Machine
Firewall A
Firewall A
Private VRF A
Private VRF A
10.0.0.2
10.90.30.129
/0 via Firewall A
10.0.0.2...
VRF A 10.0.0.1
VRF A 10.0.0.1
DMZ Network
10.90.30.128/25
DMZ Network...
Private Network
10.0.0.0/24
Private Network...
import /0
import /0
import 10.0.0.0/24
import 10.0.0.0/24 -
Machine
Machine
Firewall B
Firewall B
Private VRF B
Private VRF B
10.0.1.2
/0 via Firewall B
10.0.1.2...
VRF B 10.0.1.1
VRF B 10.0.1.1
Private Network
10.0.1.0/24
Private Network...
import /0
import /0
import 10.0.1.0/24
import 10.0.1.0/24 -
10.90.30.129 is reachable
/0 via Firewall DMZ
10.0.0.0/24 is reachable
10.0.1.0/24 is reachable
10.90.30.129 is reachable...
Internet
212.1.1.0/27
Internet...
SNAT to 212.1.1.1
SNAT to 212.1.1.1
Internet VRF
Internet VRF
import /0
import /0

import 10.0.0.0/24 no export
import 10.0.1.0/24 no export
import 10.90.30.128/25 no export
import 10.0.0.0/24 no exp...
Viewer does not support full SVG 1.1
\ No newline at end of file diff --git a/versioned_docs/version-v0.22.4/contributing/01-Proposals/MEP6/dmz-internet_public.drawio b/versioned_docs/version-v0.22.4/contributing/01-Proposals/MEP6/dmz-internet_public.drawio deleted file mode 100644 index 544939e5..00000000 --- a/versioned_docs/version-v0.22.4/contributing/01-Proposals/MEP6/dmz-internet_public.drawio +++ /dev/null @@ -1,184 +0,0 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - \ No newline at end of file diff --git a/versioned_docs/version-v0.22.4/contributing/01-Proposals/MEP6/dmz-internet_public.svg b/versioned_docs/version-v0.22.4/contributing/01-Proposals/MEP6/dmz-internet_public.svg deleted file mode 100644 index 5e825081..00000000 --- a/versioned_docs/version-v0.22.4/contributing/01-Proposals/MEP6/dmz-internet_public.svg +++ /dev/null @@ -1,3 +0,0 @@ -
Machine
Machine
Firewall DMZ
Firewall DMZ
DMZ VRF
DMZ VRF
Machine
Machine
Firewall A
Firewall A
Private VRF A
Private VRF A
10.0.0.2
212.1.2.3
/0 via Firewall A
10.0.0.2...
VRF A 10.0.0.1
VRF A 10.0.0.1
DMZ Network
212.1.2.0/27
DMZ Network...
Private Network
10.0.0.0/24
Private Network...
import /0
import /0
import 10.0.0.0/24
import 10.0.0.0/24 -
Machine
Machine
Firewall B
Firewall B
Private VRF B
Private VRF B
10.0.1.2
/0 via Firewall B
10.0.1.2...
VRF B 10.0.1.1
VRF B 10.0.1.1
Private Network
10.0.1.0/24
Private Network...
import /0
import /0
import 10.0.1.0/24
import 10.0.1.0/24 -
212.1.2.3 is reachable
/0 via Firewall DMZ
212.1.2.3 is reachable...
Internet
212.1.1.0/27 212.1.2.0/27
Internet...
SNAT to 212.1.1.1
SNAT to 212.1.1.1
Internet VRF
Internet VRF
import /0
import /0
import 212.1.2.0/27
import 10.0.0.0/24 no redistribute
import 10.0.1.0/24 no redistribute

import 212.1.2.0/27...
SNAT to
212.1.2.1
SNAT to...
SNAT to
212.1.2.2
SNAT to...
Viewer does not support full SVG 1.1
\ No newline at end of file diff --git a/versioned_docs/version-v0.22.4/contributing/01-Proposals/MEP8/README.md b/versioned_docs/version-v0.22.4/contributing/01-Proposals/MEP8/README.md deleted file mode 100644 index 14748fae..00000000 --- a/versioned_docs/version-v0.22.4/contributing/01-Proposals/MEP8/README.md +++ /dev/null @@ -1,503 +0,0 @@ ---- -slug: /MEP-7-configurable-filesystem-layout-for-machine-allocation -title: MEP-7 -sidebar_position: 7 ---- - -# Configurable Filesystem layout for Machine Allocation - -The current implementation uses a hard coded filesystem layout depending on the specified size and image. This is done in the metal-hammer. This worked well in the past because we had a small amount of sizes and images. But we reached a point where this is to restricted for all use cases we have to fulfill. It also forces us to modify the metal-hammer source code to support a new filesystem layout. - -This proposal tries to address this issue by introducing a filesystem layout struct in the metal-api which is then configurable per machine allocation. -The original behavior of automatic filesystem layout decision must still be present, because there must be no API change for existing API consumers. It should be a additional feature during machine allocation. - -## API and behavior - -The API will get a new endpoint `filesystemlayouts`to create/update/delete a set of available `filesystemlayouts`. - -### Constraints - -In order to keep the actual machine allocation api compatible, there must be no difference while allocating a machine. To achieve this every -`filesystemlayout` defines constraints which specifies for which combination of `sizes` and `images` this layout should be used by default. -The specified constraints over all `filesystemlayouts` therefore must be collision free, to be more specific, there must be exactly one layout outcome -for every possible combination of `sizes` and `images`. 
- -The `size` constraint must be a list of the exact size ids, the `image` constraint must be a map of os to semver compatible version constraint. For example: - -- `debian: ">= 10.20210101"` or `debian: "< 10.20210101"` - -The general form of a `image` constraint is a map from `os` to `versionconstraint` where: - -`os` must match the first part of the image without the version. -`versionconstraint` must be the comparator, a space and the version, or simply `*` to match all versions of this `os`. -The comparator must be one of: "=", "!=", ">", "<", ">=", "=>", "<=", "=<", "~", "~>", "^" - -It must also be possible to have a `filesystemlayout` in development or for other special purposes, which can be specified during the machine allocation. -To have such a layout, both constraints `sizes` and `images`must be empty list. - -### Reinstall - -The current reinstall implementation the metal-hammer detects during the installation on which disk the OS was installed and reports back to the metal-api the Report struct which has two properties `primarydisk` and `ospartition`. -Both fields are not required anymore because the logic is now shifted to the `filesystemlayout` definition. If `Disk.WipeOnReinstall` is set to true, this disk will be wiped, default is false and is preserved. - -### Handling of s2-xlarge machines - -These machines are a bit special compared to our `c1-*` machines because they have rotating hard disks for the mass storage purpose. -The downside is that the on board SATA-DOM has the same naming as the HDDs and can not be specified as the first /dev/sda disk because all HDDs are also /dev/sd\* disks. -Therefore we had a special SATA-DOM detection algorithm inside metal-hammer which simply checks for the smallest /dev/sd disk and took this to install the OS. - -This is not possible with the current approach, but we figured out that the SATA-DOM is always `/dev/sde`. 
So we can create a special `filesystemlayout` where the installations is made on this disk. - -### Possible Filesystemlayout hierarchies - -It is only possible to create a filesystem on top of a block device. The creation of a block device can be done on multiple ways, depending on the requirements regarding performance, space and redundancy of the filesystem. -It also depends on the disks available on the server. - -The current approach implements the following hierarchies: - -![filesystems](filesystems.png) - -### Implementation - -```go -// FilesystemLayout to be created on the given machine -type FilesystemLayout struct { - // ID unique layout identifier - ID string - // Description is human readable - Description string - // Filesystems to create on the server - Filesystems []Filesystem - // Disks to configure in the server with their partitions - Disks []Disk - // Raid if not empty, create raid arrays out of the individual disks, to place filesystems onto - Raid []Raid - // VolumeGroups to create - VolumeGroups []VolumeGroup - // LogicalVolumes to create on top of VolumeGroups - LogicalVolumes []LogicalVolume - // Constraints which must match to select this Layout - Constraints FilesystemLayoutConstraints -} - -type FilesystemLayoutConstraints struct { - // Sizes defines the list of sizes this layout applies to - Sizes []string - // Images defines a map from os to versionconstraint - // the combination of os and versionconstraint per size must be conflict free over all filesystemlayouts - Images map[string]string -} - -type RaidLevel string -type Format string -type GPTType string - -// Filesystem defines a single filesystem to be mounted -type Filesystem struct { - // Path defines the mountpoint, if nil, it will not be mounted - Path *string - // Device where the filesystem is created on, must be the full device path seen by the OS - Device string - // Format is the type of filesystem should be created - Format Format - // Label is optional enhances 
readability - Label *string - // MountOptions which might be required - MountOptions []string - // CreateOptions during filesystem creation - CreateOptions []string -} - -// Disk represents a single block device visible from the OS, required -type Disk struct { - // Device is the full device path - Device string - // Partitions to create on this device - Partitions []Partition - // WipeOnReinstall, if set to true the whole disk will be erased if reinstall happens - // during fresh install all disks are wiped - WipeOnReinstall bool -} - -// Raid is optional, if given the devices must match. -// TODO inherit GPTType from underlay device ? -type Raid struct { - // ArrayName of the raid device, most often this will be /dev/md0 and so forth - ArrayName string - // Devices the devices to form a raid device - Devices []Device - // Level the raidlevel to use, can be one of 0,1,5,10 - // TODO what should be support - Level RaidLevel - // CreateOptions required during raid creation, example: --metadata=1.0 for uefi boot partition - CreateOptions []string - // Spares defaults to 0 - Spares int -} - - -// VolumeGroup is optional, if given the devices must match. 
-type VolumeGroup struct { - // Name of the volumegroup without the /dev prefix - Name string - // Devices the devices to form a volumegroup device - Devices []string - // Tags to attach to the volumegroup - Tags []string -} - -// LogicalVolume is a block devices created with lvm on top of a volumegroup -type LogicalVolume struct { - // Name the name of the logical volume, without /dev prefix, will be accessible at /dev/vgname/lvname - Name string - // VolumeGroup the name of the volumegroup - VolumeGroup string - // Size of this LV in mebibytes (MiB) - Size uint64 - // LVMType can be either striped or raid1 - LVMType LVMType -} - -// Partition is a single partition on a device, only GPT partition types are supported -type Partition struct { - // Number of this partition, will be added to the device once partitioned - Number int - // Label to enhance readability - Label *string - // Size given in MebiBytes (MiB) - // if "0" is given the rest of the device will be used, this requires Number to be the highest in this partition - Size string - // GPTType defines the GPT partition type - GPTType *GPTType -} - -const ( - // VFAT is used for the UEFI boot partition - VFAT = Format("vfat") - // EXT3 is usually only used for /boot - EXT3 = Format("ext3") - // EXT4 is the default fs - EXT4 = Format("ext4") - // SWAP is for the swap partition - SWAP = Format("swap") - // None - NONE = Format("none") - - // GPTBoot EFI Boot Partition - GPTBoot = GPTType("ef00") - // GPTLinux Linux Partition - GPTLinux = GPTType("8300") - // GPTLinuxRaid Linux Raid Partition - GPTLinuxRaid = GPTType("fd00") - // GPTLinux Linux Partition - GPTLinuxLVM = GPTType("8e00") - - // LVMTypeLinear append across all physical volumes - LVMTypeLinear = LVMType("linear") - // LVMTypeStriped stripe across all physical volumes - LVMTypeStriped = LVMType("striped") - // LVMTypeStripe mirror with raid across all physical volumes - LVMTypeRaid1 = LVMType("raid1") -) -``` - -Example `metalctl` outputs: - 
-```bash -$ metalctl filesystemlayouts ls -ID DESCRIPTION SIZES IMAGES -default default fs layout c1-large-x86, c1-xlarge-x86 debian >=10, ubuntu >=20.04, centos >=7 -ceph fs layout for ceph s2-large-x86, s2-xlarge-x86 debian >=10, ubuntu >=20.04 -firewall firewall fs layout c1-large-x86, c1-xlarge-x86 firewall >=2 -storage storage fs layout s3-large-x86 centos >=7 -s3 storage fs layout s2-xlarge-x86 debian >=10, ubuntu >=20.04, >=firewall-2 -default-devel devel fs layout -``` - -The `default` layout reflects what is actually implemented in metal-hammer to guarantee backward compatibility. - -```yaml ---- -id: default -constraints: - sizes: - - c1-large-x86 - - c1-xlarge-x86 - images: - debian: ">=10" - ubuntu: ">=20.04" - centos: ">=7" -filesystems: - - path: "/boot/efi" - device: "/dev/sda1" - format: "vfat" - options: "-F 32" - label: "efi" # required to be compatible with old images - - path: "/" - device: "/dev/sda2" - format: "ext4" - label: "root" # required to be compatible with old images - - path: "/var/lib" - device: "/dev/sda3" - format: "ext4" - label: "varlib" # required to be compatible with old images - - path: "/tmp" - device: "tmpfs" - format: "tmpfs" - mountoptions: - [ - "defaults", - "noatime", - "nosuid", - "nodev", - "noexec", - "mode=1777", - "size=512M", - ] -disks: - - device: "/dev/sda" - wipe: true - partitions: - - number: 1 - label: "efi" - size: 500 - type: GPTBoot - - number: 2 - label: "root" - size: 5000 - type: GPTLinux - - number: 3 - label: "varlib" - size: 0 # to end of partition - type: GPTLinux -``` - -The `firewall` layout reuses the built in nvme disk to store the logs, which is way faster and larger than what the sata-dom ssd provides. 
- -```yaml ---- -id: firewall -constraints: - sizes: - - c1-large-x86 - - c1-xlarge-x86 - images: - firewall: ">=2" -filesystems: - - path: "/boot/efi" - device: "/dev/sda1" - format: "vfat" - options: "-F 32" - - path: "/" - device: "/dev/sda2" - format: "ext4" - - path: "/var" - device: "/dev/nvme0n1p1" - format: "ext4" -disks: - - device: "/dev/sda" - wipe: true - partitions: - - number: 1 - label: "efi" - size: 500 - type: GPTBoot - - number: 2 - label: "root" - size: 5000 - type: GPTLinux - - device: "/dev/nvme0n1" - wipe: true - partitions: - - number: 1 - label: "var" - size: 0 - type: GPTLinux -``` - -The `storage` layout will be used for the storage servers, which must have mirrored boot disks. - -```yaml ---- -id: storage -constraints: - sizes: - - s3-large-x86 - images: - centos: ">=7" -filesystems: - - path: "/boot/efi" - device: "/dev/md1" - format: "vfat" - options: "-F32" - - path: "/" - device: "/dev/md2" - format: "ext4" -disks: - - device: "/dev/sda" - wipe: true - partitions: - - number: 1 - label: "efi" - size: 500 - type: GPTLinuxRaid - - number: 2 - label: "root" - size: 5000 - type: GPTLinuxRaid - - device: "/dev/sdb" - wipe: true - partitions: - - number: 1 - label: "efi" - size: 500 - type: GPTLinuxRaid - - number: 2 - label: "root" - size: 5000 - type: GPTLinuxRaid -raid: - - name: "/dev/md1" - level: 1 - devices: - - "/dev/sda1" - - "/dev/sdb1" - options: "--metadata=1.0" - - name: "/dev/md2" - level: 1 - devices: - - "/dev/sda2" - - "/dev/sdb2" - options: "--metadata=1.0" -``` - -The `s3-storage` layout matches the special situation on the s2-xlarge machines. 
- -```yaml ---- -id: s3-storage -constraints: - sizes: - - c1-large-x86 - - s2-xlarge-x86 - images: - debian: ">=10" - ubuntu: ">=20.04" - centos: ">=7" -filesystems: - - path: "/boot/efi" - device: "/dev/sde1" - format: "vfat" - options: "-F 32" - - path: "/" - device: "/dev/sde2" - format: "ext4" - - path: "/var/lib" - device: "/dev/sde3" - format: "ext4" -disks: - - device: "/dev/sde" - wipe: true - partitions: - - number: 1 - label: "efi" - size: 500 - type: GPTBoot - - number: 2 - label: "root" - size: 5000 - type: GPTLinux - - number: 3 - label: "varlib" - size: 0 # to end of partition - type: GPTLinux -``` - -A sample `lvm` layout which puts `/var/lib` as stripe on the nvme device - -```yaml ---- -id: lvm -description: "lvm layout" -constraints: - size: - - s2-xlarge-x86 - images: - debian: ">=10" - ubuntu: ">=20.04" - centos: ">=7" -filesystems: - - path: "/boot/efi" - device: "/dev/sda1" - format: "vfat" - createoptions: - - "-F 32" - label: "efi" - - path: "/" - device: "/dev/sda2" - format: "ext4" - label: "root" - - path: "/var/lib" - device: "/dev/vg00/varlib" - format: "ext4" - label: "varlib" - - path: "/tmp" - device: "tmpfs" - format: "tmpfs" - mountoptions: - [ - "defaults", - "noatime", - "nosuid", - "nodev", - "noexec", - "mode=1777", - "size=512M", - ] -volumegroups: - - name: "vg00" - devices: - - "/dev/nvmne0n1" - - "/dev/nvmne0n2" -logicalvolumes: - - name: "varlib" - volumegroup: "vg00" - size: 200 - lvmtype: "striped" -disks: - - device: "/dev/sda" - wipeonreinstall: true - partitions: - - number: 1 - label: "efi" - size: 500 - gpttype: "ef00" - - number: 2 - label: "root" - size: 5000 - gpttype: "8300" - - device: "/dev/nvmne0n1" - wipeonreinstall: false - - device: "/dev/nvmne0n2" - wipeonreinstall: false -``` - -## Components which requires modifications - -- metal-hammer: - - change implementation from build in hard coded logic - - move logic to create fstab from install.sh to metal-hammer -- metal-api: - - new endpoint 
`filesystemlayouts` - - add optional spec of `filesystemlayout` during `allocation` with validation if given `filesystemlayout` is possible on given size. - - add `allocation.filesystemlayout` in the response, based on either the specified `filesystemlayout` or the calculated one. - - implement `filesystemlayouts` validation for: - - matching to disks in the size - - no overlapping with the sizes/imagefilter specified in `filesystemlayouts` - - all devices specified exists from top to bottom (fs -> disks -> device || fs -> raid -> devices) -- metalctl: - - implement `filesystemlayouts` -- metal-go: - - adopt api changes -- metal-images: - - install mdadm for raid support diff --git a/versioned_docs/version-v0.22.4/contributing/01-Proposals/MEP8/filesystems.drawio b/versioned_docs/version-v0.22.4/contributing/01-Proposals/MEP8/filesystems.drawio deleted file mode 100644 index 0f0c6ab5..00000000 --- a/versioned_docs/version-v0.22.4/contributing/01-Proposals/MEP8/filesystems.drawio +++ /dev/null @@ -1,43 +0,0 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - \ No newline at end of file diff --git a/versioned_docs/version-v0.22.4/contributing/01-Proposals/MEP8/filesystems.png b/versioned_docs/version-v0.22.4/contributing/01-Proposals/MEP8/filesystems.png deleted file mode 100644 index 6d903b7e..00000000 Binary files a/versioned_docs/version-v0.22.4/contributing/01-Proposals/MEP8/filesystems.png and /dev/null differ diff --git a/versioned_docs/version-v0.22.4/contributing/01-Proposals/MEP9/README.md b/versioned_docs/version-v0.22.4/contributing/01-Proposals/MEP9/README.md deleted file mode 100644 index a8cae83d..00000000 --- a/versioned_docs/version-v0.22.4/contributing/01-Proposals/MEP9/README.md +++ /dev/null @@ -1,132 +0,0 @@ ---- -slug: /MEP-9-no-open-ports-to-the-data-center -title: MEP-9 -sidebar_position: 9 ---- - -# No Open Ports To the Data Center - -Our metal-stack partitions typically have open ports for metal-stack 
native services, these are: - -- SSH port on the firewalls -- bmc-reverse-proxy for serial console access through the metal-console - -These open ports are potential security risks. For example, while SSH access is possible only with private key it's still vulnerable to DoS attack. - -Therefore, we want to get rid off these open ports to reduce the attack surface to the data center. - -## Requirements - -- Access to firewall SSH only via VPN -- Easy to update VPN components - -As a next step, we can also consider joining the management servers to the VPN mesh, which would replace typical WireGuard setups for operators to enter resources inside the partition. - -## High Level Design - -[](./architecture.svg) - -> Simplified drawing showing old vs. new architecture. - -### Concerns - -There's few concerns when using WireGuard for implementing VPN: - -1. WireGuard doesn't implement dynamic cipher substitution. Which is important in case one of the crypto methods, used by WireGuard will be broken. The only possible solution for that will be to update WireGuard to a fixed version. -2. Coordination server(Headscale) is a single point of failure. In case it fails, it potentially can disconnect existing members of the network, as WireGuard can't manage dynamic IPs by itself. -3. Headscale is already falls behind Tailscale coordination server implementation. Which can complicate the upgrade to newer version of Tailscale client in case of emergency. - -### Solutions to concerns - -1. Tailscale node software is using userspace implementation of WireGuard -- `wireguard-go`. One of the options is to inject Tailscale client into `metalctl`. And make it available as `metalctl vpn` or similar command. It should be possible to do as `tailscale` node is already available as open sourced Go pkg. That would allow us to control, what version of Tailscale users are using and in case of any critical changes to enforce them to update `metalctl` to use VPN functionality. -2. 
Would it be a considerable risk? We could look into `wg-dynamic` project to cover this problem. -3. At the moment, repository looks well maintained and the metal-stack team already contributes to it. - -## Implementation Details - -### metal-roles - -`metal-roles` will be responsible for deployment of `headscale` server(via new `headscale` role). It also should provide sufficient config to `metal-api` so it establishes connection with `headscale` gRPC server. - -### New `metalctl` commands - -`metalctl` will be responsible for client-side implementation of this MEP. Specifically, it's by using `metalctl` user expected to connect to firewalls. - -- `metalctl vpn` -- section for VPN related commands: - - `metalctl vpn get key [vpn name] --namespace [namespace name]` -- returns auth key to be used with `tailscale` client for establishing connection. - -Extend `metalctl firewall`: - -- `metalctl firewall ssh [ID]` -- connect to firewall via SSH. - -Extend `metalctl machine`: - -- `metalctl machine ssh [ID]` -- connect to machine via SSH. - -`metalctl` will be able to connect to firewall and machines by running `tailscale` in container. - -### metal-api - -Updates to `metal-api` should be made, so that it's able to add firewalls to VPNs. There should be one Tailscale namespace per project. So if multiple firewalls are created in single project, they will join the same namespace. - -Two new flags should be introduced to connect `metal-api` to `headscale` gRPC server: - -- `headscale-addr` -- specifies address of Headscale grpc API. -- `headscale-api-key` -- specifies temporary API key to connect to Headscale. It should be replaced and then rotated by `metal-api`. - -If `metal-api` initialized with `headscale` connection it should automatically join all created firewalls to VPN. - -Add new endpoint, that will be used by `metalctl` to connect to VPN: - -- `/v1/vpn GET` -- requests auth key from `headscale` server. 
- -### metal-hammer - -`metal-hammer` acts as an intermediary for machine configuration between `metal-api` and machine's image. Specifically it writes to `/etc/metal/install.yaml` file, data from which later will be used by image's `install.sh` file. - -To implement VPN support we have to add authentication key and VPN server address to `install.yaml` file. This key will be used to join machine to a VPN. - -### metal-images - -Images `install.sh` script have to be updated to work with authentication key and VPN server address, provided in `install.yaml` file. If this key is present, machine should connect to VPN. - -### metal-networker - -`metal-networker` also have to know if VPN was configured. In that case we need to disable public access to SSH and allow all(?) traffic from WireGuard interface. - -### firewall-controller - -`firewall-controller` have to monitor changes in `Firewall` resource and keep `tailscaled` version up-to-date. - -### Resources - -Update `Firewall` resource to include desired/actual `tailscale` version: - -``` -Firewall: - Spec: - tailscale: - Version: Minimal version - ... - Status: - ... - VPN: - Status: Boolean field - tailscale: - Version: Actual version - ... -``` - -### bmc-reverse-proxy - -TODO - -## References - -1. [WireGuard: Next Generation Secure Network Tunnel](https://www.youtube.com/watch?v=88GyLoZbDNw) -2. [How Tailscale works](https://tailscale.com/blog/how-tailscale-works) -3. [Tailscale is officially SOC 2 compliant](https://tailscale.com/blog/soc2) -4. [Why not Wireguard](https://www.ipfire.org/blog/why-not-wireguard) -5. [Wireguard: Known Limitations](https://www.wireguard.com/known-limitations/) -6. [Wireguard: Things That Might Be Accomplished](https://www.wireguard.com/todo/) -7. 
[Headscale: Tailscale control protocol v2](https://github.com/juanfont/headscale/issues/526) diff --git a/versioned_docs/version-v0.22.4/contributing/01-Proposals/MEP9/architecture.drawio b/versioned_docs/version-v0.22.4/contributing/01-Proposals/MEP9/architecture.drawio deleted file mode 100644 index adb09214..00000000 --- a/versioned_docs/version-v0.22.4/contributing/01-Proposals/MEP9/architecture.drawio +++ /dev/null @@ -1,324 +0,0 @@ - - - - - - - -
-
-
- Metal Control Plane -
-
-
-
- - Metal Control Plane - -
-
- - - - -
-
-
- metal-stack -
- Partition -
-
-
-
- - metal-stack... - -
-
- - - - -
-
-
- firewall -
-
-
-
- - firewall - -
-
- - - - - -
-
-
- machine -
-
-
-
- - machine - -
-
- - - - -
-
-
- ssh -
-
-
-
- - ssh - -
-
- - - - -
-
-
- bmc-proxy -
-
-
-
- - bmc-proxy - -
-
- - - - - - - - - - -
-
-
- Metal Control Plane -
-
-
-
- - Metal Control Plane - -
-
- - - - -
-
-
- metal-stack -
- Partition -
-
-
-
- - metal-stack... - -
-
- - - - -
-
-
- firewall -
-
-
-
- - firewall - -
-
- - - - - -
-
-
- machine -
-
-
-
- - machine - -
-
- - - - -
-
-
- ssh -
-
-
-
- - ssh - -
-
- - - - - - -
-
-
- bmc-proxy -
-
-
-
- - bmc-proxy - -
-
- - - - -
-
-
- headscale -
-
-
-
- - headscale - -
-
- - - - - - - - - - -
-
-
- tailscaled -
-
-
-
- - tailscaled - -
-
- - - - - - -
-
-
- tailscaled -
-
-
-
- - tailscaled - -
-
- - - - -
-
-
- Internet -
-
-
-
- - Internet - -
-
- - - - -
-
-
- Internet -
-
-
-
- - Internet - -
-
-
- - - - - Viewer does not support full SVG 1.1 - - - -
diff --git a/versioned_docs/version-v0.22.4/contributing/01-Proposals/MEP9/architecture.svg b/versioned_docs/version-v0.22.4/contributing/01-Proposals/MEP9/architecture.svg deleted file mode 100644 index fd268d2f..00000000 --- a/versioned_docs/version-v0.22.4/contributing/01-Proposals/MEP9/architecture.svg +++ /dev/null @@ -1 +0,0 @@ -
Metal Control Plane
Metal Control Plane
metal-stack
Partition
metal-stack...
firewall
firewall
machine
machine
ssh
ssh
bmc-proxy
bmc-proxy
Metal Control Plane
Metal Control Plane
metal-stack
Partition
metal-stack...
firewall
firewall
machine
machine
ssh
ssh
bmc-proxy
bmc-proxy
headscale
headscale
tailscaled
tailscaled
tailscaled
tailscaled
Internet
Internet
Internet
Internet
Text is not SVG - cannot display
\ No newline at end of file diff --git a/versioned_docs/version-v0.22.4/contributing/01-Proposals/_category_.json b/versioned_docs/version-v0.22.4/contributing/01-Proposals/_category_.json deleted file mode 100644 index 2e7fa4bf..00000000 --- a/versioned_docs/version-v0.22.4/contributing/01-Proposals/_category_.json +++ /dev/null @@ -1,4 +0,0 @@ -{ - "position": 1, - "label": "Enhancement Proposals" -} \ No newline at end of file diff --git a/versioned_docs/version-v0.22.4/contributing/01-Proposals/index.md b/versioned_docs/version-v0.22.4/contributing/01-Proposals/index.md deleted file mode 100644 index 0f6eddc3..00000000 --- a/versioned_docs/version-v0.22.4/contributing/01-Proposals/index.md +++ /dev/null @@ -1,69 +0,0 @@ ---- -slug: /enhancement-proposals -title: Enhancement Proposals -sidebar_position: 1 ---- - -# Metal Stack Enhancement Proposals (MEPs) - -This section contains proposals which address substantial modifications to metal-stack. - -Every proposal has a short name which starts with _MEP_ followed by an incremental, unique number. Proposals should be raised as pull requests in the [website](https://github.com/metal-stack/website) repository and can be discussed in Github issues. - -The list of proposals and their current state is listed in the table below. - -Possible states are: - -- `In Discussion` -- `Accepted` -- `Declined` -- `In Progress` -- `Completed` -- `Aborted` - -Once a proposal was accepted, an issue should be raised and the implementation should be done in a separate PR. 
- -| Name | Description | State | Progress | -| :------------------------------------------------------------- | :--------------------------------------------- | :-------------: | :----------------------------------------------------------------: | -| [MEP-1](MEP1/README.md) | Distributed Control Plane Deployment | `Declined` | | -| [MEP-2](MEP2/README.md) | Two Factor Authentication | `Aborted` | | -| [MEP-3](MEP3/README.md) | Machine Re-Installation to preserve local data | `Completed` | | -| [MEP-4](MEP4/README.md) | Multi-tenancy for the metal-api | `In Progress` | [releases#236](https://github.com/metal-stack/releases/issues/236) | -| [MEP-5](MEP5/README.md) | Shared Networks | `Completed` | | -| [MEP-6](MEP6/README.md) | DMZ Networks | `Completed` | | -| [MEP-7](https://github.com/metal-stack/docs-archive/pull/51) | Passing environment variables to machines | `Declined` | | -| [MEP-8](MEP8/README.md) | Configurable Filesystemlayout | `Completed` | | -| [MEP-9](MEP9/README.md) | No Open Ports To the Data Center | `Completed` | | -| [MEP-10](MEP10/README.md) | SONiC Support | `Completed` | | -| [MEP-11](MEP11/README.md) | Auditing of metal-stack resources | `Completed` | | -| [MEP-12](MEP12/README.md) | Rack Spreading | `Completed` | | -| [MEP-13](MEP13/README.md) | IPv6 | `Completed` | | -| [MEP-14](MEP14/README.md) | Independence from external sources | `Completed` | | -| [MEP-15](https://github.com/metal-stack/docs-archive/pull/232) | HAL Improvements | `In Discussion` | [releases#238](https://github.com/metal-stack/releases/issues/238) | -| [MEP-16](MEP16/README.md) | Firewall Support for Cluster API Provider | `Accepted` | [releases#237](https://github.com/metal-stack/releases/issues/237) | -| [MEP-17](MEP17/README.md) | Global Network View | `In Discussion` | | -| [MEP-18](MEP18/README.md) | Autonomous Control Plane | `In Discussion` | | - -## Proposal Process - -1. 
Before starting a new proposal, it is advised to have a quick chat with one of the maintainers. -2. Create a draft pull request in the [website](https://github.com/metal-stack/website) repository with your proposal. Your proposal doesn't have to be finished at this point. -3. Share the PR in the [metal-stack Slack](https://metal-stack.slack.com/) and invite maintainers to review it. -4. The review itself will probably take place in multiple iterations. Don't be discouraged if your proposal is not accepted right away. The goal is to reach consensus. -5. Once your proposal is accepted, create an umbrella issue in the relevant repository or when multiple repositories are involved in the [releases](https://github.com/metal-stack/releases). -6. Other issues should be created in different repositories and linked to the umbrella issue. -7. Unless stated otherwise, the proposer is responsible for the implementation of the proposal. - -## How to Write a Good MEP - -In the first section of your MEP, start with the current situation and the motivation for the change. Summarize your proposal briefly. - -Next follows the main part: describe your proposal in detail. Which parts of of metal-stack are affected? Are there API changes? If yes, describe them and provide examples here. -Try to think of side effects your proposal might have. Try to provide a view on how your proposal affects users of metal-stack. -Highlight breaking changes and think of a migration path for existing users. If your proposal affects multiple components, try to describe the interaction between them. - -After the main part of your proposal, feel free to add additional sections, e.g. about alternatives that were considered, non-goals or future possibilities. - -Depending on the complexity of your proposal, you might want to add a section about the implementation plan or roadmap. - -You can have a look at the existing MEPs for inspiration. As you will notice: not every MEP has the same structure. 
Feel free to structure your MEP in a way that makes sense for your proposal. diff --git a/versioned_docs/version-v0.22.4/contributing/02-planning-meetings.mdx b/versioned_docs/version-v0.22.4/contributing/02-planning-meetings.mdx deleted file mode 100644 index df10177b..00000000 --- a/versioned_docs/version-v0.22.4/contributing/02-planning-meetings.mdx +++ /dev/null @@ -1,120 +0,0 @@ ---- -slug: /planning-meetings -title: Planning Meetings -sidebar_position: 2 ---- - -# Planning Meetings - -Public planning meetings are held **biweekly** on **odd calendar weeks** from **14:00 to 14:30** (Berlin/Europe timezone) on Microsoft Teams. The purpose is to provide an overview of our current projects and priorities, as well as to discuss new topics and issues within the group. - -export function PlanningMeetingDatesTable() { - const today = new Date(); - const dayOfWeek = today.getDay(); - - let daysUntilMonday = 0; - switch (dayOfWeek) { - case 0: - daysUntilMonday = 1; - break; - case 1: - daysUntilMonday = 0; - break; - default: - daysUntilMonday = 8 - dayOfWeek; - } - - const nextMonday = new Date(); - nextMonday.setDate(nextMonday.getDate() + daysUntilMonday) - - let onejan = new Date(today.getFullYear(), 0, 1); - let week = Math.ceil((((nextMonday.getTime() - onejan.getTime()) / 86400000) + onejan.getDay() + 1) / 7); - - if (week % 2 === 0) { - nextMonday.setDate(nextMonday.getDate() + 7) - } - - const blacklist = [ - new Date('2025-12-29'), - ] - - const amount = 8 - const dates = []; - - for (let i = 0; i < amount; i++) { - const nextDate = new Date(nextMonday); - nextDate.setDate(nextDate.getDate() + (i * 14)) - - if (blacklist.find(item => {return item.toDateString() == nextDate.toDateString()}) !== undefined ) { - continue - } - - dates.push(nextDate.toDateString()) - } - - return ( - - - - - - - - - - {dates.map((date, index) => ( - - - - - - ))} - -
DateTimeLink
{date}14:00 – 14:30Join Link
- ) -} - - - -Our [development planning board](https://github.com/orgs/metal-stack/projects/34) can be found on GitHub. - -[//]: <> (The C025PB1EUKC in the slack url references the #devs channel.) -If you want to get an invitation to the event, please drop us a line on our [Slack channel](https://metal-stack.slack.com/archives/C025PB1EUKC). - -Planning meetings are currently not recorded. The meetings are held either in English or German depending on the attendees. - -:::info -Note that anyone can contribute to metal-stack without participating in planning meetings. However, if you want to speed up the review process for your requirements, it might be helpful to attend the meetings. -::: - -## Agenda - -Here is the agenda that we generally want to follow in a planning meeting: - -- Possibility to bring up news that are interesting for every developer of the metal-stack org -- Check `Done` column and archive cards - - Attendees have the chance to briefly present achievements if they want -- Check the `In Progress` column and discuss whether these tasks are still worked on, there were significant blockers or they can be lower-prioritized -- Check new issues labelled with `triage` and prioritize them -- Allow attendees to bring up issues and prioritize them - - Attendees have the chance to briefly present these new issues - -## Idea Backlog - -The backlog contains ideas of what could become part of the roadmap in the future. The list is ordered alphabetically. Therefore, the order does not express the importance or weight of a backlog item. - -We incorporate community feedback into the roadmap. If you think that important points are missing in the backlog, please share your ideas with us. We have a Slack channel. Please check out [metal-stack.io](https://metal-stack.io) for contact information. - -:::danger -By no means this list is a promise of what is being worked on in the near future. It is just a summary of ideas that was agreed on to be "nice to have". 
It is up to the investors, maintainers and the community to choose topics from this list and to implement them or to remove them from the list. -::: - -- Add metal-stack to [Gardener conformance test grid](https://testgrid.k8s.io/gardener-all) -- Autoscaler for metal control plane components -- CI dashboard and public integration testing -- Improved release and deploy processes (GitOps, [Spinnaker](https://spinnaker.io/), [Flux](https://fluxcd.io/)) -- Machine internet without firewalls -- metal-stack dashboard (UI) -- Offer our metal-stack extensions as enterprise products (accounting, cluster-api, S3) (neither of them will ever be required for running metal-stack, they just add extra value for certain enterprises) -- Partition managed by Kubernetes (with Kubelets joining the control plane cluster) -- Public offering / demo playground diff --git a/versioned_docs/version-v0.22.4/contributing/03-contribution-guideline.md b/versioned_docs/version-v0.22.4/contributing/03-contribution-guideline.md deleted file mode 100644 index 2c0526e3..00000000 --- a/versioned_docs/version-v0.22.4/contributing/03-contribution-guideline.md +++ /dev/null @@ -1,145 +0,0 @@ ---- -slug: /contribution-guideline -title: Contribution Guideline -sidebar_position: 3 ---- - -# Contribution Guideline - -This document describes the way we want to contribute code to the projects of metal-stack, which are hosted on [github.com/metal-stack](https://github.com/metal-stack). - -The document is meant to be understood as a general guideline for contributions, but not as burden to be placed on a developer. Use your best judgment when contributing code. Try to be as clean and precise as possible when writing code and try to make your code as maintainable and understandable as possible for other people. - -Even if it should go without saying, we live an open culture of discussion, in which everybody is welcome to participate. 
We treat every contribution with respect and objectiveness with the general aim to write software of quality. - -If you want, feel free to propose changes to this document in a pull request. - -## How Can I Contribute? - -Open a Github issue in the project you would like to contribute. Within the issue, your idea can be discussed. It is also possible to directly create a pull request when the set of changes is relatively small. - -When opening an issue please consider the following aspects: - -1. Create a meaningful issue describing the WHY? of your contribution. -1. Try to set appropriate labels to the issue. For example, attach the `triage` label to your issue if you want it to be discussed in the next [planning meeting](./02-planning-meetings.mdx). It might be useful to attend the meeting if you want to emphasize it being worked on. - -### Pull Requests - -The process described here has several goals: - -- Maintain quality -- Enable a sustainable system to review contributions -- Enable documented and reproducible addition of contributions - -1. Create a repository fork within the context of that issue. Members of the organization may work on the repository directly without a fork, which allows building development artifacts more easily. -1. Develop, document and test your contribution (try not to solve more than one issue in a single pull request). -1. Create a Draft Pull Request to the repository's main branch. -1. Create a meaningful description of the pull request or reference the related issue. The pull request template explains what the content should include, please read it. -1. Ask for merging your contribution by removing the draft marker. Repository maintainers (see [Code Ownership](#code-ownership)) are notified automatically, but you can also reach out to people directly on Slack if you want a review from a specific person. - -## General Objectives - -This section contains language-agnostic topics that all metal-stack projects are trying to follow. 
- -### Code Ownership - -The code base is owned by the entire team and every member is allowed to contribute changes to any of the projects. This is considered as collective code ownership[^1]. - -As a matter of fact, there are persons in a project, which already have experience with the sources. These are defined directly in the repository's [CODEOWNERS](https://docs.github.com/en/repositories/managing-your-repositorys-settings-and-features/customizing-your-repository/about-code-owners) file. If you want to merge changes into the master branch, it is advisable to include code owners into the process of discussion and merging. - -### Microservices - -One major ambition of metal-stack is to follow the idea of [microservices](https://en.wikipedia.org/wiki/Microservices). This way, we want to achieve that we can - -- adapt to changes faster than with monolithic architectures, -- be free of restrictions due to certain choices of technology, -- leverage powerful traits of cloud infrastructures (e.g. high-scalability, high-availability, ...). - -### Programming Languages - -We are generally open to write code in any language that fits best to the function of the software. However, we encourage [golang](https://en.wikipedia.org/wiki/Go_(programming_language)) to be the main language of metal-stack as we think that it makes development faster when not establishing too many different languages in our architecture. Reason for this is that we are striving for consistent behavior of the microservices, similar to what has been described for the Twelve-Factor App (see [12 Factor](https://12factor.net/)). We help enforcing unified behavior by allowing a small layer of shared code for every programming language. We will refer to this shared code as "libraries" for the rest of this document. - -### Artifacts - -Artifacts are always produced by a CI process (i.e. Github Actions). 
- -Container images and [OCI artifacts](https://github.com/opencontainers/image-spec) are published on the Github Container Registry of the metal-stack organization. Please consider using Github Actions workflows utilizing similar actions as the other repositories (e.g. [build-push-action](https://github.com/docker/build-push-action), ...) - -For OCI images, we usually utilize [oras](https://github.com/oras-project/oras) for pushing the artifact to the registry. - -For signing artifacts we use [cosign](https://github.com/sigstore/cosign). The private key for signing artifacts is a CI secret called `COSIGN_PRIVATE_KEY`. - -Binary artifacts or OS images can be uploaded to `images.metal-stack.io` if necessary. - -### APIs - -The preferred way to implement an API is using [Connect RPC](https://connectrpc.com/), which is based on [grpc](https://grpc.io/). For working with the [Protobuf](https://protobuf.dev/) definitions, we utilize [buf](https://github.com/bufbuild/buf). - -The metal-api does still have a [Swagger-based](https://swagger.io/) API exposing traditional REST APIs for end-users. This API framework will become deprecated so it should not be used anymore for new projects. - -#### Versioning - -Artifacts are versioned by tagging the respective repository with a tag starting with the letter `v`. After the letter, there stands a valid [semantic version](https://semver.org/). - -### Documentation - -In order to make it easier for others to understand a project, we document general information and usage instructions in a `README.md` in any project. - -In addition to that, we document a microservice in the [docs](https://github.com/metal-stack/docs) repository. The documentation should contain the reasoning why this service exists and why it was being implemented the way it was being implemented. 
The aim of this procedure is to reduce the time for contributors to comprehend architectural decisions that were made during the process of writing the software and to clarify the general purpose of this service in the entire context of the software. - -## Guidelines - -This chapter describes general guidelines on how to develop and contribute code for a certain programming language. - -### Golang - -Development follows the official guide to: - -- Write clear, idiomatic Go code[^2] -- Learn from mistakes that must not be repeated[^3] -- Apply appropriate names to your artifacts: - - [https://go.dev/talks/2014/names.slide](https://go.dev/talks/2014/names.slide) - - [https://go.dev/blog/package-names](https://go.dev/blog/package-names) - - [https://go.dev/doc/effective_go#names](https://go.dev/doc/effective_go#names) -- Enable others to understand the reasoning of non-trivial code sequences by applying a meaningful documentation. - -#### Development Decisions - -- **Dependency Management** by using Go modules -- **Build and Test Automation** by using [GNU Make](https://man7.org/linux/man-pages/man1/make.1p.html). -- **APIs** should consider using [buf](https://github.com/bufbuild/buf) - -#### Libraries - -metal-stack maintains libraries that you can utilize in your project in order to unify common behavior. The main project that does this is called [metal-lib](https://github.com/metal-stack/metal-lib). - -#### Error Handling with Generated Swagger Clients - -From the server-side you should ensure that you are returning the common error json struct in case of an error as defined in the `metal-lib/httperrors`. Ensure you are using `go-restful >= v2.9.1` and `go-restful-openapi >= v0.13.1` (allows default responses with error codes other than 200). - -### Documentation - -We want to share knowledge and keep things simple. If things cannot be kept simple we want to enable everybody to understand them by: - -- Document in short sentences[^4]. 
-- Do not explain the HOW (this is already documented by your code and documenting the obvious is considered a defect). -- Explain the WHY. Add a "to" in your documentation line to force yourself to explain the reasoning (e.g. "` to `"). - -### Python - -Development follows the official guide to: - -- Style Guide for Python Code (PEP 8)[^5] - - The use of an IDE like [PyCharm](https://www.jetbrains.com/pycharm/) helps to write compliant code easily -- Consider [setuptools](https://pythonhosted.org/an_example_pypi_project/setuptools.html) for packaging -- If you want to add a Python microservice to the mix, consider [pyinstaller](https://github.com/pyinstaller/pyinstaller) on Alpine to achieve small image sizes - -[^1]: [https://martinfowler.com/bliki/CodeOwnership.html](https://martinfowler.com/bliki/CodeOwnership.html) - -[^2]: [https://go.dev/doc/effective_go](https://go.dev/doc/effective_go) - -[^3]: [https://github.com/golang/go/wiki/CodeReviewComments](https://github.com/golang/go/wiki/CodeReviewComments) - -[^4]: [https://github.com/golang/go/wiki/CodeReviewComments#comment-sentences](https://github.com/golang/go/wiki/CodeReviewComments#comment-sentences) - -[^5]: [https://www.python.org/dev/peps/pep-0008/](https://www.python.org/dev/peps/pep-0008/) diff --git a/versioned_docs/version-v0.22.4/contributing/04-release-flow.md b/versioned_docs/version-v0.22.4/contributing/04-release-flow.md deleted file mode 100644 index 62021ebf..00000000 --- a/versioned_docs/version-v0.22.4/contributing/04-release-flow.md +++ /dev/null @@ -1,110 +0,0 @@ ---- -slug: /release-flow -title: Release Flow -sidebar_position: 4 ---- - -# Releases - -The metal-stack consists of many microservices that depend on each other. The automated release flow is there to ensure that all components work together flawlessly for every metal-stack release. - -Releases and integration tests are published through our [release repository](https://github.com/metal-stack/releases). 
You can also find the [release notes](https://github.com/metal-stack/releases/releases) for this metal-stack version in there. The release notes contain information about new features, upgrade paths and bug fixes. - -If you want, you can sign up at our Slack channel where we are announcing every new release. Often, we provide additional information for metal-stack administrators and adopters at this place, too. - -This document is intended for developers, especially maintainers of metal-stack projects. - -## Release Flow - -The following diagram attempts to describe our current release flow: - -![](release_flow.svg) - -A release is created in the following way: - -- Individual repository maintainers within the metal-stack GitHub Organization can publish a release of their component. -- This release is automatically pushed to the `develop` branch of the release repository by the metal-robot. -- A push triggers a virtual release integration test using the mini-lab environment. This setup launches metal-stack with the `sonic` and `gardener` flavors to validate the different Ansible roles and execute basic operations across the metal-stack layer. -- To contribute components that are not directly part of the release vector, a pull request must be made against the `develop` branch of the release repository. Release maintainers may push directly to the `develop` branch. -- The release maintainers can `/freeze` the `develop` branch, effectively stopping the metal-robot from pushing component releases to this branch. -- The `develop` branch is tagged by a release maintainer with a `-rc.x` suffix to create a __release candidate__. -- The release candidate must pass a large integration test suite on a real environment, which is currently run by FI-TS. It tests the entire machine provisioning engine including the integration with Gardener, the deployment, metal-images and Kubernetes conformance tests. 
-- If the integration tests pass, the PR of the `develop` branch must be approved by at least two release maintainers. -- A release is created via GitHub releases, including all release notes, with a tag on the `main` branch. - -## FAQ - -**Question: I need PR #xyz to go into the release, why did you not include it?** - -Answer: It's not on purpose if we miss a PR to be included into a metal-stack release. Please use the pending pull request from `develop` into `master` as soon as it is open and comment which pull request you want to have included into the release. Also consider attending our planning meetings or contact us in our Slack channel if you have urgent requirements that need to be dealt with. - -**Question: Who is responsible for the releases? Who can freeze a release?** - -Answer: Every repository in metal-stack has a `CODEOWNERS` file pointing to a maintainer team. This is also true for the releases repository. Only release repository maintainers are allowed to `/freeze` a release (meaning the metal-robot does not automatically append new component releases to the release vector anymore). - -**Question: I can't push to the `develop` branch of this repository? How can I request changes to the release vector?** - -Answer: Most changes are automatically integrated by the metal-robot. For manually managed components, please raise a pull request against the `develop` branch. Only release maintainers are allowed to push to `develop` as otherwise it would be possible to mess up the release pipeline. - -**Question: What requirements need to be fulfilled to add a repository to the release vector?** - -Please see the section below named [Requirements for Release Vector Repositories](#requirements-for-release-vector-repositories). 
- -### Requirements for Release Vector Repositories - -Before adding a repository in the metal-stack org to the releases repository, it is advised for the maintainer to fulfill the following points: - -- The following files should be present at the repository root: - - [CODEOWNERS](https://docs.github.com/en/repositories/managing-your-repositorys-settings-and-features/customizing-your-repository/about-code-owners) - - When a repository is created, the metal-robot automatically creates a -maintainers team in our GitHub org. - - The CODEOWNERS file should reference this team. - - The team should contain at least two maintainers. - - `LICENSE` - - This usually should be MIT with "metal-stack" as authors. - - `README.md` -- The `developers-core` team should be given repository access with `write` role, the codeowners team should have the `maintain` role -- Release artifacts should have an SPDX-formatted SBOM attached. - - For container images these are embedded using Buildx. -- The following branch protection rules should be set: - - The mainline should be protected. - - A pull request should be required before merging (required by at least one code owner). - - Status checks should be required to pass. - - Force push should not be allowed on this branch. -- One person from the releases maintainers has to add the repository to the metal-robot in order to pick up the releases, add them to the release vector and generate release notes. - -### How-To Release a Project - -[release-drafter](https://github.com/release-drafter/release-drafter) is preferred in order to generate release notes from merged PRs for your projects. It should be triggered for pushes on your main branch. - -The draft is then used to create a project release. The release has to be published through the GitHub UI as demonstrated in the screenshot below. 
- -**Tagging the repository is not enough as repository tagging does not associate your release notes to your release!** - -![](release.png) - -Some further remarks: - -- Use semver versions with `v` prefix for your tags -- Name your release after your release tag -- The metal-robot only picks up lines from your release notes that start with `-` or `*` (unordered list items) and appends them to the according section in the aggregated release draft -- A tag created through a GitHub UI release does not trigger a `push` event. This means your pipeline will not start to run with the `push` trigger when publishing through the UI. - - Instead, use the `published` [release event trigger](https://docs.github.com/en/actions/reference/events-that-trigger-workflows#release) for your actions: - - ```yaml - on: - release: - types: - - published - ``` -- In case they are necessary, please do not forget to include `NOTEWORTHY`, `ACTIONS_REQUIRED` or `BREAKING_CHANGE` sections into releases. More information on those release draft sections can be read in a pull request template. - -### Pre-Releases - -Most metal-stack repositories are installed through the metal-stack release vector. Therefore, it is safe to release them and wait for the release integration suite to return results. - -However, there are certain repositories that have an external user base and can be used without a running metal-stack installation. Examples include [csi-driver-lvm](https://github.com/metal-stack/csi-driver-lvm) and [go-ipam](https://github.com/metal-stack/go-ipam). - -In the latter case, maintainers should create pre-releases using the GitHub feature "Set as a pre-release" if necessary. Additionally, maintainers should use an `-rc.x` tag to indicate that this component version is a pre-release. If the metal-stack integration tests do not add any substantial test coverage and if the component is thoroughly tested, a release candidate can be skipped. 
- -Once these components have been integration-tested, they can be released as the latest version with a valid tag on the same Git hash. In this case, the component in the release vector can be updated to the release version without running the integration suite again. If necessary, comment in the releases repository to execute this action (let a maintainer unfreeze the release pull request). diff --git a/versioned_docs/version-v0.22.4/contributing/05-oci-artifacts.md b/versioned_docs/version-v0.22.4/contributing/05-oci-artifacts.md deleted file mode 100644 index f9e46796..00000000 --- a/versioned_docs/version-v0.22.4/contributing/05-oci-artifacts.md +++ /dev/null @@ -1,39 +0,0 @@ ---- -slug: /oci-artifacts -title: OCI Artifacts -sidebar_position: 5 ---- - -# OCI Artifacts - -Certain artifacts of metal-stack are not shipped as Docker containers but in a more generic registry container format following the [OCI](https://opencontainers.org/) specification. Examples for these artifacts are the metal-stack release vectors as defined by the [releases](https://github.com/metal-stack/releases) repository or ansible-roles that can be used for deploying metal-stack. - -The OCI artifacts have an expected format convention, which is described on this page. - -## Release Vector Artifacts - -This OCI artifact expects a layer with the artifact type `application/vnd.metal-stack.release-vector.v1` including one gzipped tar file called `release.tar.gz`, which should be marked with custom media type `application/vnd.metal-stack.release-vector.v1.tar+gzip`. - -Inside the tar file, there is a `release.yaml` file that contains a metal-stack release vector. 
- -The metal-stack release vector has a free format but by default expects an `ansible-roles` key at the root, mapping the role names to OCI artifacts and versions, like: - -``` -ansible-roles: - : - oci: - version: - # e.g.: - ansible-common: - oci: ghcr.io/metal-stack/ansible-common - repository: https://github.com/metal-stack/ansible-common - version: v0.7.2 -``` - -If this convention is not followed, it is not possible to install ansible-roles through the `metal_stack_release_vector` image as provided by the metal-deployment-base deployment base image. - -## Ansible Roles - -This OCI artifact expects a layer with the artifact type `application/vnd.metal-stack.release-vector.v1` including one gzipped tar file called `ansible-role.tar.gz`, which should be marked with custom media type `application/vnd.metal-stack.ansible-role.v1.tar+gzip`. - -Inside the tar file, there is **one folder** containing the ansible-role to install. Please do not include multiple folders as otherwise the `metal_stack_release_vector` module cannot alias role names, which is sometimes required for deployments. 
diff --git a/versioned_docs/version-v0.22.4/contributing/06-community.md b/versioned_docs/version-v0.22.4/contributing/06-community.md deleted file mode 100644 index 98a65b28..00000000 --- a/versioned_docs/version-v0.22.4/contributing/06-community.md +++ /dev/null @@ -1,11 +0,0 @@ ---- -slug: /community -title: Community -sidebar_position: 6 -draft: true ---- - -# Community - -(Slack channel, community events like FOSDEM, Kubernetes Community Days..., blog -articles) diff --git a/versioned_docs/version-v0.22.4/contributing/release.png b/versioned_docs/version-v0.22.4/contributing/release.png deleted file mode 100644 index 598b1182..00000000 Binary files a/versioned_docs/version-v0.22.4/contributing/release.png and /dev/null differ diff --git a/versioned_docs/version-v0.22.4/contributing/release_flow.drawio b/versioned_docs/version-v0.22.4/contributing/release_flow.drawio deleted file mode 100644 index 6ca6b34f..00000000 --- a/versioned_docs/version-v0.22.4/contributing/release_flow.drawio +++ /dev/null @@ -1,721 +0,0 @@ - - - - - - - - - - - -
-
-
- Review release notes -
-
-
-
- - Review release notes - -
-
-
- - - - - - - -
-
-
- projects -
-
-
-
- - projects - -
-
-
- - - - - - - -
-
-
- projects -
-
-
-
- - projects - -
-
-
- - - - - - - - -
-
-
- Organization Webhook -
-
-
-
- - Organization Webhook - -
-
-
- - - - - - - -
-
-
- projects -
-
-
-
- - projects - -
-
-
- - - - - - - - -
-
-
- - Publish release - -
-
-
-
- - Publish release - -
-
-
- - - - - - - - -
-
-
- Maintainer -
-
-
-
- - Maint... - -
-
-
- - - - - - - - - - - - - - - -
-
-
- metal-robot release handler -
-
-
-
- - metal-robot release han... - -
-
-
- - - - - - - - -
-
-
- - no - -
-
-
-
- - no - -
-
-
- - - - - - - - -
-
-
- - yes - -
-
-
-
- - yes - -
-
-
- - - - - - - -
-
-
- version in event newer than release vector version -
-
-
-
- - version in event newer than... - -
-
-
- - - - - - - -
-
-
- - do nothing - -
-
-
-
- - do nothing - -
-
-
- - - - - - - - - - - - -
-
-
- Github Action -
-
-
-
- - Github Action - -
-
-
- - - - - - - -
-
-
- Bump version in release vector and push to - - develop - -
-
-
-
- - Bump version in release vector... - -
-
-
- - - - - - - - - - - -
-
-
- Open pull request from - - develop - - to - - master - -
-
-
-
- - Open pull request from develop... - -
-
-
- - - - - - - -
-
-
- Update aggregated release draft in - - metal-stack/releases - -
-
-
-
- - Update aggregated release draf... - -
-
-
- - - - - - - - - - - - - - - -
-
-
- Integration Testing -
-
-
-
- - Integration Testing - -
-
-
- - - - - - - - - - - -
-
-
- Merge to - - master - -
-
-
-
- - Merge to master - -
-
-
- - - - - - - - - - - - -
-
-
- Review -
-
-
-
- - Review - -
-
-
- - - - - - - - - - - - - - - -
-
-
- Tests suceeded and PR changes reviewed -
-
-
-
- - Tests suceeded and PR chang... - -
-
-
- - - - - - - -
-
-
- - publish results to #integration - -
-
-
-
- - publish results to #integr... - -
-
-
- - - - - - - - - - - - - - - -
-
-
- Release metal-stack -
-
-
-
- - Release metal-stack - -
-
-
- - - - - - - - - - - -
-
-
- - publish to #announcements - -
-
-
-
- - publish to #announcements - -
-
-
- - - - - - - -
-
-
- - - metal-stack/docs - - pull request - -
-
-
-
- - metal-stack/docs pull requ... - -
-
-
- - - - - - - - - - - - -
-
-
- Freeze -
-
-
-
- - Freeze - -
-
-
- - - - - - - - - - - - - - - -
-
-
- Freeze - - develop - - and create a release candidate -
-
-
-
- - Freeze develop and create a rel... - -
-
-
- - - - - - - -
-
-
- Large integration suites -
- - (currently owned by FI-TS, not public) - -
-
-
-
-
- - Large integration suites... - -
-
-
- - - - - - - - -
-
-
- Run -
-
-
-
- - Run - -
-
-
- - - - -
- - - - - Text is not SVG - cannot display - - - -
diff --git a/versioned_docs/version-v0.22.4/contributing/release_flow.svg b/versioned_docs/version-v0.22.4/contributing/release_flow.svg deleted file mode 100644 index 55cdd493..00000000 --- a/versioned_docs/version-v0.22.4/contributing/release_flow.svg +++ /dev/null @@ -1 +0,0 @@ -
Review release notes
Review release notes
projects
projects
projects
projects
Organization Webhook
Organization Webhook
projects
projects
Publish release
Publish release
Maintainer
Maint...
metal-robot release handler
metal-robot release han...
no
no
yes
yes
version in event newer than release vector version
version in event newer than...
do nothing
do nothing
Github Action
Github Action
Bump version in release vector and push todevelop
Bump version in release vector...
Open pull request fromdeveloptomaster
Open pull request from develop...
Update aggregated release draft inmetal-stack/releases
Update aggregated release draf...
Integration Testing
Integration Testing
Merge tomaster
Merge to master
Review
Review
Tests suceeded and PR changes reviewed
Tests suceeded and PR chang...
publish results to #integration
publish results to #integr...
Release metal-stack
Release metal-stack
publish to #announcements
publish to #announcements
metal-stack/docspull request
metal-stack/docs pull requ...
Freeze
Freeze
Freezedevelopand create a release candidate
Freeze develop and create a rel...
Large integration suites
(currently owned by FI-TS, not public)
Large integration suites...
Run
Run
Text is not SVG - cannot display
\ No newline at end of file diff --git a/versioned_docs/version-v0.22.4/docs/02-General/04-flavors-of-metalstack.md b/versioned_docs/version-v0.22.4/docs/02-General/04-flavors-of-metalstack.md index 7da427fc..2277ca6b 100644 --- a/versioned_docs/version-v0.22.4/docs/02-General/04-flavors-of-metalstack.md +++ b/versioned_docs/version-v0.22.4/docs/02-General/04-flavors-of-metalstack.md @@ -14,7 +14,7 @@ As modern infrastructure and cloud native applications are designed with Kuberne Regardless which flavor of metal-stack you use, it is always possible to manually provision machines, networks and ip addresses. This is the most basic way of using metal-stack and is very similar to how traditional bare metal infrastructures are managed. -Using plain metal-stack without additional layer was not a focus in the past. Therefore firewall and role management might be premature. These will be addressed by [MEP-4](../../contributing/01-Proposals/MEP4/README.md) and [MEP-16](../../contributing/01-Proposals/MEP16/README.md) in the future. +Using plain metal-stack without additional layer was not a focus in the past. Therefore firewall and role management might be premature. These will be addressed by [MEP-4](/community/MEP-4-multi-tenancy-for-the-metal-api) and [MEP-16](/community/MEP-16-metal-api-as-an-alternative-configuration-source-for-the-firewall-controller) in the future. ## Gardener diff --git a/versioned_docs/version-v0.22.4/docs/04-For Operators/03-deployment-guide.mdx b/versioned_docs/version-v0.22.4/docs/04-For Operators/03-deployment-guide.mdx index fc575ad4..ce58e0e0 100644 --- a/versioned_docs/version-v0.22.4/docs/04-For Operators/03-deployment-guide.mdx +++ b/versioned_docs/version-v0.22.4/docs/04-For Operators/03-deployment-guide.mdx @@ -31,7 +31,7 @@ You can use the [mini-lab](https://github.com/metal-stack/mini-lab) as a templat The metal control plane is typically deployed in a Kubernetes cluster. 
Therefore, this document will assume that you have a Kubernetes cluster ready for getting deployed. Even though it is theoretically possible to deploy metal-stack without Kubernetes, we strongly advise you to use the described method because we believe that Kubernetes gives you a lot of benefits regarding the stability and maintainability of the application deployment. :::tip -For metal-stack it does not matter where your control plane Kubernetes cluster is located. You can of course use a cluster managed by a hyperscaler. This has the advantage of not having to setup Kubernetes by yourself and could even become beneficial in terms of fail-safe operation. However, we also describe a solution of how to setup metal-stack with a self-hosted, [Autonomous Control Plane](../../contributing/01-Proposals/MEP18/README.md) cluster. The only requirement from metal-stack is that your partitions can establish network connections to the metal control plane. If you are interested, you can find a reasoning behind this deployment decision [here](../05-Concepts/01-architecture.mdx#target-deployment-platforms). +For metal-stack it does not matter where your control plane Kubernetes cluster is located. You can of course use a cluster managed by a hyperscaler. This has the advantage of not having to setup Kubernetes by yourself and could even become beneficial in terms of fail-safe operation. However, we also describe a solution of how to setup metal-stack with a self-hosted, [Autonomous Control Plane](/community/MEP-18-autonomous-control-plane) cluster. The only requirement from metal-stack is that your partitions can establish network connections to the metal control plane. If you are interested, you can find a reasoning behind this deployment decision [here](../05-Concepts/01-architecture.mdx#target-deployment-platforms). 
::: Let's start off with a fresh folder for your deployment: @@ -75,7 +75,7 @@ At the end of this section we are gonna end up with the following files and fold ### Releases and Ansible Role Dependencies -As metal-stack consists of many microservices all having individual versions, we have come up with a [releases](https://github.com/metal-stack/releases) repository. It contains a YAML file (we often call it release vector) describing the fitting versions of all components for every release of metal-stack. Ansible role dependencies are also part of a metal-stack release. Both the metal-stack release vector and the metal-stack ansible-roles are shipped as OCI artifacts following a specific format that's described [here](../../contributing/05-oci-artifacts.md). These artifacts are signed with the CI token of the metal-stack Github organization and can be verified using [cosign](https://github.com/sigstore/cosign). +As metal-stack consists of many microservices all having individual versions, we have come up with a [releases](https://github.com/metal-stack/releases) repository. It contains a YAML file (we often call it release vector) describing the fitting versions of all components for every release of metal-stack. Ansible role dependencies are also part of a metal-stack release. Both the metal-stack release vector and the metal-stack ansible-roles are shipped as OCI artifacts following a specific format that's described [here](/community/oci-artifacts). These artifacts are signed with the CI token of the metal-stack Github organization and can be verified using [cosign](https://github.com/sigstore/cosign). In order to download the release vector and the referenced ansible-roles prior to a deployment, we provide a small helper module called `metal_stack_release_vector` as part of the [metal-deployment-base](https://github.com/metal-stack/metal-deployment-base) deployment image. 
Its main tasks are: diff --git a/versioned_docs/version-v0.22.4/docs/05-Concepts/01-architecture.mdx b/versioned_docs/version-v0.22.4/docs/05-Concepts/01-architecture.mdx index 709960e3..75298df9 100644 --- a/versioned_docs/version-v0.22.4/docs/05-Concepts/01-architecture.mdx +++ b/versioned_docs/version-v0.22.4/docs/05-Concepts/01-architecture.mdx @@ -152,4 +152,4 @@ Thus, for creating a partition as well as a machine or a firewall, the flags `dn In order to be fully offline resilient, make sure to check out `metal-image-cache-sync`. This component provides copies of `metal-images`, `metal-kernel` and `metal-hammer`. -This feature is related to [MEP14](../../contributing/01-Proposals/MEP14/README.md). +This feature is related to [MEP14](/community/MEP-14-independence-from-external-sources). diff --git a/versioned_docs/version-v0.22.4/docs/05-Concepts/02-user-management.md b/versioned_docs/version-v0.22.4/docs/05-Concepts/02-user-management.md index f1ee2778..ba742ee9 100644 --- a/versioned_docs/version-v0.22.4/docs/05-Concepts/02-user-management.md +++ b/versioned_docs/version-v0.22.4/docs/05-Concepts/02-user-management.md @@ -7,7 +7,7 @@ sidebar_position: 2 # User Management At the moment, metal-stack can more or less be seen as a low-level API that does not scope access based on projects and tenants. -Fine-grained access control with full multi-tenancy support is actively worked on in [MEP4](../../contributing/01-Proposals/MEP4/README.md). +Fine-grained access control with full multi-tenancy support is actively worked on in [MEP4](/community/MEP-4-multi-tenancy-for-the-metal-api). Until then projects and tenants can be created, but have no effect on access control. 
diff --git a/versioned_docs/version-v0.22.4/docs/06-For CISOs/Security/01-principles.md b/versioned_docs/version-v0.22.4/docs/06-For CISOs/Security/01-principles.md index 8e7030f5..e327ec4a 100644 --- a/versioned_docs/version-v0.22.4/docs/06-For CISOs/Security/01-principles.md +++ b/versioned_docs/version-v0.22.4/docs/06-For CISOs/Security/01-principles.md @@ -15,7 +15,7 @@ The minimal need to know principle is a security concept that restricts access t ### RBAC :::info -As of now metal-stack does not implement fine-grained Role-Based Access Control (RBAC) within the `metal-api` but this is worked on in [MEP-4](../../../contributing/01-Proposals/MEP4/README.md). +As of now metal-stack does not implement fine-grained Role-Based Access Control (RBAC) within the `metal-api` but this is worked on in [MEP-4](/community/MEP-4-multi-tenancy-for-the-metal-api). ::: As described in our [User Management](../../05-Concepts/02-user-management.md) concept the [metal-api](https://github.com/metal-stack/metal-api) currently offers three different user roles for authorization: diff --git a/versioned_docs/version-v0.22.4/docs/06-For CISOs/Security/04-communication-matrix.md b/versioned_docs/version-v0.22.4/docs/06-For CISOs/Security/04-communication-matrix.md index 07df2607..24c1bc1d 100644 --- a/versioned_docs/version-v0.22.4/docs/06-For CISOs/Security/04-communication-matrix.md +++ b/versioned_docs/version-v0.22.4/docs/06-For CISOs/Security/04-communication-matrix.md @@ -116,7 +116,7 @@ Please note that every [networking setup](../../05-Concepts/03-Network/01-theory | VLAN | Switches, Firewalls | Layer 2 traffic segmentation. | | VXLAN | Switches, Firewalls | Encapsulate Layer 2 frames in Layer 3 packets for network virtualization. | | EVPN | Switches, Firewalls | Overlay network technology for scalable and flexible network architectures. | -| VPN | Firewalls | Management access [without open SSH ports](../../../contributing/01-Proposals/MEP9/README.md). 
| +| VPN | Firewalls | Management access [without open SSH ports](/community/MEP-9-no-open-ports-to-the-data-center). | | BGP | Multiple | Routing protocol for dynamic routing and network management. | | SSH | Management Server, Switches | Secure shell access for management and configuration. | | LLDP | Switches, Machines | Link Layer Discovery Protocol for network device discovery. | diff --git a/versioned_docs/version-v0.22.4/docs/06-For CISOs/rbac.md b/versioned_docs/version-v0.22.4/docs/06-For CISOs/rbac.md index 9a87b896..06c902bb 100644 --- a/versioned_docs/version-v0.22.4/docs/06-For CISOs/rbac.md +++ b/versioned_docs/version-v0.22.4/docs/06-For CISOs/rbac.md @@ -31,4 +31,4 @@ To ensure that internal components interact securely with the metal-api, metal-s Users can interact with the metal-api using [metalctl](https://github.com/metal-stack/metalctl), the command-line interface provided by metal-stack. Depending on the required operations, users should authenticate with the appropriate role to match their level of access. -As part of [MEP-4](../../contributing/01-Proposals/MEP4/README.md), significant work is underway to introduce more fine-grained access control mechanisms within metal-stack, enhancing the precision and flexibility of permission management. +As part of [MEP-4](/community/MEP-4-multi-tenancy-for-the-metal-api), significant work is underway to introduce more fine-grained access control mechanisms within metal-stack, enhancing the precision and flexibility of permission management. 
diff --git a/versioned_docs/version-v0.22.4/docs/06-For CISOs/remote-access.md b/versioned_docs/version-v0.22.4/docs/06-For CISOs/remote-access.md index 0b8dbb19..dc24e82f 100644 --- a/versioned_docs/version-v0.22.4/docs/06-For CISOs/remote-access.md +++ b/versioned_docs/version-v0.22.4/docs/06-For CISOs/remote-access.md @@ -6,7 +6,7 @@ title: Remote Access ## Machines and Firewalls -Remote access to machines and firewalls is essential for performing administrative tasks such as incident management, troubleshooting and sometimes for development. Standard SSH access is often insufficient for these purposes. In many cases, direct serial console access is required to fully manage the system. metal-stack follows a security-first approach by not offering direct SSH access to machines. This practice reduces the attack surface and prevents unauthorized access that could lead to system damage. Detailed information can be found in [MEP-9](../../contributing/01-Proposals/MEP9/README.md). Administrators can access machines in two primary ways. +Remote access to machines and firewalls is essential for performing administrative tasks such as incident management, troubleshooting and sometimes for development. Standard SSH access is often insufficient for these purposes. In many cases, direct serial console access is required to fully manage the system. metal-stack follows a security-first approach by not offering direct SSH access to machines. This practice reduces the attack surface and prevents unauthorized access that could lead to system damage. Detailed information can be found in [MEP-9](/community/MEP-9-no-open-ports-to-the-data-center). Administrators can access machines in two primary ways. 
**Out-of-band management via SOL** @@ -26,4 +26,4 @@ This approach uses the [`metal-console`](../08-References/Control%20Plane/metal- Both methods ensure secure and controlled access to machines without exposing them unnecessarily to the network, maintaining the integrity and safety of the infrastructure. -Connecting directly to a machine without a clear plan of action can have unintended consequences and negatively impact stability. For this reason, administrative privileges are required. This restriction ensures that only authorized personnel with the necessary expertise can perform actions that affect the underlying infrastructure. These principles will evolve with the introduction of [MEP-4](../../contributing/01-Proposals/MEP4/README.md). \ No newline at end of file +Connecting directly to a machine without a clear plan of action can have unintended consequences and negatively impact stability. For this reason, administrative privileges are required. This restriction ensures that only authorized personnel with the necessary expertise can perform actions that affect the underlying infrastructure. These principles will evolve with the introduction of [MEP-4](/community/MEP-4-multi-tenancy-for-the-metal-api). 
\ No newline at end of file diff --git a/versioned_sidebars/version-v0.21.10-sidebars.json b/versioned_sidebars/version-v0.21.10-sidebars.json index aaf562b8..b99db936 100644 --- a/versioned_sidebars/version-v0.21.10-sidebars.json +++ b/versioned_sidebars/version-v0.21.10-sidebars.json @@ -4,11 +4,5 @@ "type": "autogenerated", "dirName": "docs" } - ], - "contributing": [ - { - "type": "autogenerated", - "dirName": "contributing" - } ] } diff --git a/versioned_sidebars/version-v0.21.11-sidebars.json b/versioned_sidebars/version-v0.21.11-sidebars.json index aaf562b8..b99db936 100644 --- a/versioned_sidebars/version-v0.21.11-sidebars.json +++ b/versioned_sidebars/version-v0.21.11-sidebars.json @@ -4,11 +4,5 @@ "type": "autogenerated", "dirName": "docs" } - ], - "contributing": [ - { - "type": "autogenerated", - "dirName": "contributing" - } ] } diff --git a/versioned_sidebars/version-v0.21.8-sidebars.json b/versioned_sidebars/version-v0.21.8-sidebars.json index aaf562b8..b99db936 100644 --- a/versioned_sidebars/version-v0.21.8-sidebars.json +++ b/versioned_sidebars/version-v0.21.8-sidebars.json @@ -4,11 +4,5 @@ "type": "autogenerated", "dirName": "docs" } - ], - "contributing": [ - { - "type": "autogenerated", - "dirName": "contributing" - } ] } diff --git a/versioned_sidebars/version-v0.21.9-sidebars.json b/versioned_sidebars/version-v0.21.9-sidebars.json index aaf562b8..b99db936 100644 --- a/versioned_sidebars/version-v0.21.9-sidebars.json +++ b/versioned_sidebars/version-v0.21.9-sidebars.json @@ -4,11 +4,5 @@ "type": "autogenerated", "dirName": "docs" } - ], - "contributing": [ - { - "type": "autogenerated", - "dirName": "contributing" - } ] } diff --git a/versioned_sidebars/version-v0.22.0-sidebars.json b/versioned_sidebars/version-v0.22.0-sidebars.json index aaf562b8..b99db936 100644 --- a/versioned_sidebars/version-v0.22.0-sidebars.json +++ b/versioned_sidebars/version-v0.22.0-sidebars.json @@ -4,11 +4,5 @@ "type": "autogenerated", "dirName": "docs" } - ], - 
"contributing": [ - { - "type": "autogenerated", - "dirName": "contributing" - } ] } diff --git a/versioned_sidebars/version-v0.22.1-sidebars.json b/versioned_sidebars/version-v0.22.1-sidebars.json index aaf562b8..b99db936 100644 --- a/versioned_sidebars/version-v0.22.1-sidebars.json +++ b/versioned_sidebars/version-v0.22.1-sidebars.json @@ -4,11 +4,5 @@ "type": "autogenerated", "dirName": "docs" } - ], - "contributing": [ - { - "type": "autogenerated", - "dirName": "contributing" - } ] } diff --git a/versioned_sidebars/version-v0.22.2-sidebars.json b/versioned_sidebars/version-v0.22.2-sidebars.json index aaf562b8..b99db936 100644 --- a/versioned_sidebars/version-v0.22.2-sidebars.json +++ b/versioned_sidebars/version-v0.22.2-sidebars.json @@ -4,11 +4,5 @@ "type": "autogenerated", "dirName": "docs" } - ], - "contributing": [ - { - "type": "autogenerated", - "dirName": "contributing" - } ] } diff --git a/versioned_sidebars/version-v0.22.3-sidebars.json b/versioned_sidebars/version-v0.22.3-sidebars.json index aaf562b8..b99db936 100644 --- a/versioned_sidebars/version-v0.22.3-sidebars.json +++ b/versioned_sidebars/version-v0.22.3-sidebars.json @@ -4,11 +4,5 @@ "type": "autogenerated", "dirName": "docs" } - ], - "contributing": [ - { - "type": "autogenerated", - "dirName": "contributing" - } ] } diff --git a/versioned_sidebars/version-v0.22.4-sidebars.json b/versioned_sidebars/version-v0.22.4-sidebars.json index aaf562b8..b99db936 100644 --- a/versioned_sidebars/version-v0.22.4-sidebars.json +++ b/versioned_sidebars/version-v0.22.4-sidebars.json @@ -4,11 +4,5 @@ "type": "autogenerated", "dirName": "docs" } - ], - "contributing": [ - { - "type": "autogenerated", - "dirName": "contributing" - } ] }