From dc291589cfdc8a8b7af9fa82163e103960d2fefa Mon Sep 17 00:00:00 2001 From: rodrodsilo Date: Mon, 16 Feb 2026 10:00:04 +0200 Subject: [PATCH 001/115] Adjusting more the cpu to fit better on DO hosts --- root/values_medium.yaml | 29 +++++++++++++++++++++++------ 1 file changed, 23 insertions(+), 6 deletions(-) diff --git a/root/values_medium.yaml b/root/values_medium.yaml index 1b0a5d02..b69b7e4f 100644 --- a/root/values_medium.yaml +++ b/root/values_medium.yaml @@ -66,8 +66,8 @@ apps: replicas: 1 resources: limits: - cpu: "2000m" - memory: "4Gi" + cpu: "1000m" + memory: "2Gi" requests: cpu: "500m" memory: "1Gi" @@ -76,8 +76,8 @@ apps: redis: resources: limits: - cpu: "1000m" - memory: "2Gi" + cpu: "500m" + memory: "1Gi" requests: cpu: "250m" memory: "512Mi" @@ -85,8 +85,8 @@ apps: replicas: 1 resources: limits: - cpu: "1000m" - memory: "2Gi" + cpu: "500m" + memory: "1Gi" requests: cpu: "250m" memory: "512Mi" @@ -182,3 +182,20 @@ apps: storageClassName: direct accessModes: - ReadWriteOnce + + otel-lgtm-stack: + valuesObject: + collectors: + resources: + metrics: + limits: + memory: 4Gi + cpu: '1' + + opentelemetry-operator: + valuesObject: + manager: + resources: + requests: + cpu: "250m" + memory: "512Mi" From 375aff7ea214e2218c051e08bbfe7cebd0394615 Mon Sep 17 00:00:00 2001 From: brownzebra Date: Tue, 17 Feb 2026 14:12:04 +0200 Subject: [PATCH 002/115] feat: update docs to current cluster-forge state --- PRD.md | 697 +++++++++++++++++++----- README.md | 200 ++++--- docs/bootstrap_guide.md | 477 ++++++++++++++-- docs/cluster_size_configuration.md | 281 +++++----- docs/secret-management-user-guide.md | 274 ---------- docs/secrets_management_architecture.md | 455 ---------------- docs/values_inheritance_pattern.md | 206 +++++-- 7 files changed, 1393 insertions(+), 1197 deletions(-) delete mode 100644 docs/secret-management-user-guide.md delete mode 100644 docs/secrets_management_architecture.md diff --git a/PRD.md b/PRD.md index 98777449..9afefc7f 100644 --- 
a/PRD.md +++ b/PRD.md @@ -2,168 +2,615 @@ ## Executive Summary -**Cluster-Forge** is a Kubernetes platform automation tool designed to bundle various third-party, community, -and in-house components into a single, streamlined stack that can be deployed in Kubernetes clusters. -By automating the deployment process, Cluster-Forge simplifies the creation of consistent, ready-to-use clusters -with all essential services pre-configured and integrated. +**Cluster-Forge** is a Kubernetes platform automation tool that bundles third-party, community, and in-house components into a single, GitOps-managed stack deployable in Kubernetes clusters. It automates the deployment of a complete AI/ML compute platform built on AMD Enterprise AI Suite components, delivering consistent, production-ready clusters with all essential services pre-configured and integrated. + +The platform uses ArgoCD's app-of-apps pattern with a sophisticated bootstrap process that establishes GitOps infrastructure (ArgoCD, Gitea, OpenBao) before deploying the complete application stack. 
## Target Users -- **Infrastructure Engineers** -- **Platform Engineers** -- **DevOps Engineers** -- **Cloud Native Engineers** -- **Site Reliability Engineers** -- **AI/ML Engineers** +- **AI/ML Engineers** - Unified platform for model training, serving, and orchestration +- **Platform Engineers** - Infrastructure automation with GitOps patterns +- **DevOps Engineers** - Consistent deployment across environments +- **Infrastructure Engineers** - Multi-cluster management and operations +- **Site Reliability Engineers** - Observability and reliability tooling +- **Research Teams** - Ephemeral test clusters for experimentation ## Product Architecture -### Dual Repository GitOps Pattern +### Bootstrap-First Deployment Model -Cluster-Forge implements a sophisticated GitOps deployment pattern supporting both external GitHub deployment and local cluster-native deployment: +Cluster-Forge uses a three-phase bootstrap process that establishes GitOps infrastructure before deploying applications: -- **External Mode** (`values.yaml`): Traditional GitOps with GitHub dependency -- **Local Mode** (`values_cf.yaml`): Self-contained GitOps with local Gitea and separate configuration repository +**Phase 1: Pre-Cleanup** +- Detects and removes previous installations when gitea-init-job completed successfully +- Deletes Gitea resources, OpenBao init jobs, and temporary files +- Ensures clean state for fresh deployments -See [Values Inheritance Pattern](docs/values_inheritance_pattern.md) for detailed documentation. +**Phase 2: GitOps Foundation Bootstrap** (Manual Helm Templates) +1. **ArgoCD** (v8.3.5) - GitOps controller deployed via helm template + kubectl apply +2. **OpenBao** (v0.18.2) - Secrets management with init job to configure vault, policies, and initial secrets +3. 
**Gitea** (v12.3.0) - Git server with init job to create cluster-forge and cluster-values repositories -### Size-Aware Configuration +**Phase 3: App-of-Apps Deployment** (ArgoCD-Managed) +- Creates cluster-forge Application pointing to root/ helm chart +- ArgoCD syncs and manages all remaining applications from enabledApps list +- Applications deployed in wave order (-5 to 0) based on dependencies -Cluster-Forge provides three pre-configured cluster profiles with streamlined inheritance: +### Dual Repository GitOps Pattern -- **Small Clusters** (1-5 users): Development/testing with minimal resources -- **Medium Clusters** (5-20 users): Team production workloads -- **Large Clusters** (10s-100s users): Enterprise scale with full features +Cluster-Forge supports flexible GitOps repository configurations: -Size-specific configurations follow DRY principles, inheriting from base configuration and only overriding differences. See [Cluster Size Configuration](docs/cluster_size_configuration.md) for details. 
+**Local Mode (Default)** - Self-contained cluster-native GitOps: +- `clusterForge.repoUrl`: Points to local Gitea (http://gitea-http.cf-gitea.svc:3000/cluster-org/cluster-forge.git) +- `externalValues.enabled: true`: Separate cluster-values repository for configuration +- Initialization handled by gitea-init-job which clones and pushes repositories from initial-cf-values ConfigMap +- Zero external dependencies once bootstrapped -### Workflow +**External Mode** - Traditional GitHub-based GitOps: +- Set `clusterForge.repoUrl` to external GitHub repository +- Use `--dev` flag with bootstrap.sh to configure targetRevision for feature branch development +- Supports custom branch selection for testing and development -Cluster-Forge deploys all necessary components within the cluster using GitOps-controller [ArgoCD](https://argo-cd.readthedocs.io/) -and [app-of-apps pattern](https://argo-cd.readthedocs.io/en/stable/operator-manual/cluster-bootstrapping/#app-of-apps-pattern) where Cluster-Forge itself acts as an app of apps. +### Size-Aware Configuration -### Components +Three cluster profiles with inheritance-based resource optimization: + +**Small Clusters** (1-5 users, dev/test): +- Single replica deployments (ArgoCD, Redis, etc.) 
+- Reduced resource limits (ArgoCD controller: 2 CPU, 4Gi RAM) +- Adds kyverno-policies-storage-local-path for RWX→RWO PVC mutation +- MinIO tenant: 250Gi storage, single server +- Suitable for: Local workstations, development environments + +**Medium Clusters** (5-20 users, team production): +- Single replica with moderate resource allocation +- Same storage policies as small (local-path support) +- ArgoCD controller: 2 CPU, 4Gi RAM +- Default configuration for balanced performance +- Suitable for: Small teams, staging environments + +**Large Clusters** (10s-100s users, enterprise scale): +- OpenBao HA: 3 replicas with Raft consensus +- No local-path policies (assumes distributed storage like Longhorn) +- MinIO tenant: 500Gi storage +- Production-grade resource allocation +- Suitable for: Production deployments, multi-tenant environments + +Size configurations use YAML merge semantics where size-specific values override base values.yaml settings. + +### App-of-Apps Architecture + +Cluster-Forge root chart generates ArgoCD Application manifests from: +- `enabledApps[]` - List of applications to deploy +- `apps.` - Configuration for each application including: + - `path` - Relative path in sources/ directory + - `namespace` - Target Kubernetes namespace + - `syncWave` - Deployment order (-5 to 0) + - `valuesObject` - Inline Helm values + - `helmParameters` - Templated Helm parameters (e.g., domain injection) + - `ignoreDifferences` - ArgoCD diff exclusions + +The cluster-forge Application uses multi-source feature when externalValues.enabled=true: +- Source 1: cluster-forge repo (root/ helm chart) +- Source 2: cluster-values repo (custom values.yaml) +- Merges: base values.yaml + size values + external cluster-values/values.yaml + +### Component Categories + +**Layer 1: GitOps Foundation** (Sync Wave -4 to -3) +- ArgoCD 8.3.5 - GitOps continuous deployment controller +- Gitea 12.3.0 - Self-hosted Git server with SQLite backend +- OpenBao 0.18.2 - Vault-compatible 
secrets management +- External Secrets 0.15.1 - Secrets synchronization operator + +**Layer 2: Core Infrastructure** (Sync Wave -5 to -2) + +*Networking:* +- Gateway API v1.3.0 - Kubernetes standard ingress API +- KGateway v2.1.0-main - Gateway API implementation with custom WebSocket support +- MetalLB v0.15.2 - Bare metal load balancer +- Cert-Manager v1.18.2 - Automated TLS certificate management + +*Policy & Security:* +- Kyverno 3.5.1 - Policy engine for admission control +- Kyverno Config - OIDC integration, policy configurations +- Kyverno Policies Base - Core security policies +- Kyverno Policies Storage-Local-Path - Access mode mutation (small/medium only) +- Cluster-Auth 0.5.0 - Kubernetes RBAC integration + +*Storage & Database:* +- CNPG Operator 0.26.0 - CloudNativePG PostgreSQL operator +- MinIO Operator 7.1.1 - S3-compatible object storage operator +- MinIO Tenant 7.1.1 - Tenant deployment with default-bucket and models buckets + +**Layer 3: Observability** (Sync Wave -5 to -2) +- Prometheus Operator CRDs 23.0.0 - Metrics infrastructure +- OpenTelemetry Operator 0.93.1 - Telemetry collection with contrib collector +- OTEL-LGTM Stack v1.0.7 - Integrated observability (Loki, Grafana, Tempo, Mimir) + - Storage: 50Gi each for tempo/loki/mimir, 10Gi grafana + - Metrics collector: 8Gi RAM, 2 CPU + - Logs collector daemonset: 2Gi RAM, 1 CPU + +**Layer 4: Identity & Access** (Sync Wave -1 to 0) +- Keycloak (keycloak-old chart) - Enterprise IAM with AIRM realm + - Custom extensions via init containers (SilogenExtensionPackage.jar) + - Realm import with domain-group-authenticator + - Client secrets for: AIRM, K8s, MinIO, Gitea, ArgoCD + +**Layer 5: AI/ML Compute Stack** (Sync Wave -3 to 0) + +*GPU & Scheduling:* +- AMD GPU Operator v1.4.1 - GPU device plugin and drivers +- KubeRay Operator 1.4.2 - Ray distributed computing framework +- Kueue 0.13.0 - Job queueing with multi-framework support + - Integrations: batch/job, Ray, MPIJob, PyTorchJob, TensorFlow, 
Jobset, AppWrapper, Pod, Deployment +- AppWrapper v1.1.2 - Application-level resource scheduling +- KEDA 2.18.1 - Event-driven autoscaling +- Kedify-OTEL v0.0.6 - KEDA telemetry integration + +*ML Serving & Inference:* +- KServe v0.16.0 - Model serving platform (Standard deployment mode) +- KServe CRDs v0.16.0 - Model serving custom resources + +*Workflow & Messaging:* +- Kaiwo v0.2.0-rc11 - AI workload orchestration +- Kaiwo CRDs v0.2.0-rc11 - Workflow custom resources +- RabbitMQ v2.15.0 - Message broker for async processing + +**Layer 6: AIRM Application** (Sync Wave 0) +- AIRM 0.3.2 - AMD Resource Manager application suite +- AIM Cluster Model Source - Cluster resource models for AIRM + +### Repository Structure + +``` +cluster-forge/ +├── scripts/ +│ ├── bootstrap.sh # Main bootstrap orchestration +│ ├── init-gitea-job/ # Helm chart for Gitea initialization +│ ├── init-openbao-job/ # Helm chart for OpenBao initialization +│ └── utils/ # Backup/restore utilities +│ ├── export_databases.sh +│ ├── export_rabbitmq.sh +│ ├── import_databases.sh +│ ├── import_rabbitmq.sh +│ └── mirror_minio.sh +├── root/ +│ ├── Chart.yaml # ClusterForge root helm chart metadata +│ ├── values.yaml # Base configuration +│ ├── values_small.yaml # Small cluster overrides +│ ├── values_medium.yaml # Medium cluster overrides +│ ├── values_large.yaml # Large cluster overrides +│ └── templates/ +│ ├── _helpers.yaml # Template helper functions +│ ├── cluster-apps.yaml # Generates ArgoCD Application per enabledApp +│ └── cluster-forge.yaml # Self-managing ClusterForge Application +├── sources/ # Versioned helm charts and configurations +│ ├── / +│ │ ├── / # Upstream helm chart or Kustomize +│ │ ├── source.yaml # Source metadata (optional) +│ │ └── values_ha.yaml # HA overrides (optional) +│ └── / # Configuration helm charts +│ └── templates/ # ConfigMaps, Secrets, ExternalSecrets +├── docs/ # Architecture and operational documentation +└── sbom/ # Software bill of materials tooling +``` + +## 
Key Features + +### Single-Command Bootstrap + +The bootstrap.sh script orchestrates complete cluster setup: + +```bash +./scripts/bootstrap.sh [--CLUSTER_SIZE=small|medium|large] [--dev] +``` + +**Bootstrap Process:** +1. **Validation** - Checks domain, cluster size, values files, yq tool availability +2. **Pre-cleanup** - Removes previous installations if gitea-init-job completed +3. **Values Merge** - Combines base + size-specific values with domain injection +4. **Namespace Creation** - Creates argocd, cf-gitea, cf-openbao namespaces +5. **ArgoCD Deployment** - helm template + kubectl apply with server-side apply +6. **OpenBao Deployment** - helm template + kubectl apply, waits for pod ready +7. **OpenBao Init Job** - Configures vault policies, auth methods, initial secrets +8. **Gitea Deployment** - helm template + kubectl apply, waits for rollout +9. **Gitea Init Job** - Creates cluster-org, clones/pushes cluster-forge and cluster-values repos +10. **ClusterForge App** - Creates root Application with merged values +11. 
**Cleanup** - Removes temporary values files + +**Development Mode** (--dev flag): +- Prompts for git branch selection (current, custom, or abort) +- Sets targetRevision for ArgoCD applications +- Enables feature branch testing without committing to main + +### Self-Contained GitOps + +Once bootstrapped, the cluster is fully self-sufficient: + +**Local Git Server (Gitea):** +- Stores cluster-forge repository (platform code) +- Stores cluster-values repository (environment-specific configuration) +- Provides Git UI at https://gitea.{domain} +- Admin credentials in gitea-admin-credentials secret +- SQLite backend for lightweight operation + +**Local Secrets Management (OpenBao):** +- Vault-compatible secrets engine +- Initialized with policies for each component +- Kubernetes auth method configured +- External Secrets Operator integration +- Secrets for: Keycloak clients, AIRM, database credentials, API keys + +**Configuration as Code:** +- All platform configuration in cluster-values repo +- Changes trigger ArgoCD sync automatically +- Full audit trail through Git history +- Rollback capability via Git revert + +### Values Inheritance System + +Three-layer configuration merge: + +1. **Base Layer** (values.yaml) - Common defaults for all sizes +2. **Size Layer** (values_{size}.yaml) - Size-specific overrides +3. **External Layer** (cluster-values/values.yaml) - Environment customization + +```yaml +# Bootstrap merges: base <- size <- external +VALUES=$(yq eval-all '. as $item ireduce ({}; . 
* $item)' \ + values.yaml values_medium.yaml cluster-values/values.yaml) +``` + +**Size-Specific Behaviors:** + +Small/Medium add storage policies: +```yaml +enabledApps: + - kyverno-policies-storage-local-path # RWX→RWO mutation for local-path +``` + +Large enables HA components: +```yaml +apps: + openbao: + valuesObject: + server: + ha: + enabled: true + replicas: 3 +``` + +### Component Version Management + +**Versioned Sources Structure:** +``` +sources/argocd/ + ├── 8.3.5/ # Upstream helm chart + ├── source.yaml # Source metadata (upstream repo, version) + └── values_ha.yaml # Optional HA overrides +``` + +**Configuration Companions:** +Each major component has -config variant: +- argocd-config: OIDC integration, RBAC policies, ExternalSecrets +- gitea-config: Keycloak OAuth, repository templates +- openbao-config: Policy definitions, secret paths, initialization scripts +- minio-tenant-config: Bucket policies, user credentials, gateway routes + +### Secrets Management Architecture + +**Three-Tier Secrets System:** + +1. **OpenBao (Source of Truth)** + - KV v2 secrets engine at secret/ + - Policies per namespace: argocd-policy, airm-policy, gitea-policy, etc. + - Kubernetes auth method for pod authentication + +2. **External Secrets Operator (Synchronization)** + - ExternalSecret resources in each namespace + - SecretStore points to OpenBao with serviceAccountRef + - Automatic sync from OpenBao → Kubernetes Secrets + - Example: argocd-oidc-creds ExternalSecret → OIDC client secret + +3. 
**Kubernetes Secrets (Consumption)** + - Standard Kubernetes Secret objects + - Referenced by pods via env, volumeMounts + - Automatically updated when OpenBao source changes + +**Bootstrap Secret Flow:** +- bootstrap.sh generates initial passwords with `openssl rand -hex 16` +- openbao-init-job writes secrets to OpenBao +- External Secrets Operator syncs to Kubernetes Secrets +- Applications consume via secret references + +### Modular Policy System + +Kyverno policies organized by concern: + +**Base Policies** (kyverno-policies-base): +- Core security policies +- Resource quotas +- Label requirements + +**Storage Policies** (kyverno-policies-storage-local-path): +- Access mode mutation: ReadWriteMany → ReadWriteOnce +- Only enabled for small/medium clusters with local-path storage +- Prevents PVC creation failures on non-distributed storage + +**Custom Policies:** +- AIRM-specific policies included in airm chart +- Custom validations and mutations per application + +### Backup and Restore Utilities + +**Database Export/Import:** +```bash +scripts/utils/export_databases.sh # PostgreSQL dumps from CNPG +scripts/utils/import_databases.sh # Restore PostgreSQL databases +``` + +**Message Queue:** +```bash +scripts/utils/export_rabbitmq.sh # RabbitMQ definitions and messages +scripts/utils/import_rabbitmq.sh # Restore queues and exchanges +``` + +**Object Storage:** +```bash +scripts/utils/mirror_minio.sh # MinIO bucket synchronization +``` + +### Observability Stack + +**Integrated LGTM Platform:** +- **Loki** - Log aggregation with 50Gi storage +- **Grafana** - Visualization dashboards with 10Gi storage +- **Tempo** - Distributed tracing with 50Gi storage +- **Mimir** - Prometheus metrics with 50Gi storage + +**Automatic Collection:** +- Metrics collector deployment: 8Gi RAM, 2 CPU limits +- Logs collector daemonset: 2Gi RAM, 1 CPU per node +- OpenTelemetry contrib collector for advanced telemetry +- Node exporter and kube-state-metrics enabled by default + 
+**Service Endpoints:** +- Grafana UI: Port 3000 +- OTLP gRPC: Port 4317 +- OTLP HTTP: Port 4318 +- Prometheus: Port 9090 +- Loki: Port 3100 + +### AI/ML Workload Support + +**Multi-Framework Job Integration:** + +Kueue manages scheduling for: +- Kubernetes batch/job +- Ray (RayJob, RayCluster) +- Kubeflow (MPIJob, PyTorchJob, TFJob, XGBoostJob, JAXJob, PaddleJob) +- AppWrapper for multi-pod applications +- Pod, Deployment, StatefulSet + +**Resource Management:** +- Kueue ClusterQueues for resource pools +- LocalQueues per namespace +- ResourceFlavors for GPU/CPU quotas +- Cohort sharing across teams + +**Model Serving:** +- KServe Standard deployment mode +- InferenceService CRD for models +- Auto-scaling with KEDA +- S3 model storage via MinIO + +**GPU Support:** +- AMD GPU Operator for device plugin +- Automatic driver installation +- GPU metrics in Prometheus +- Scheduling via Kueue resource flavors -Cluster-Forge repository file structure has 3 main folders: +## Technical Requirements -- **scripts** - bash scripts to [bootstrap](docs/bootstrap_guide.md) necessary prerequisite components for Cluster-Forge & install it -- **root** - core component, root helm chart for app-of-apps that creates all other ArgoCD applications into k8s cluster -- **sources** - folder that contains third-party, community and in-house helm charts & kubernetes manifests that represent cluster components -- **docs** - comprehensive documentation covering architecture, configuration, and operational guides +### Prerequisites + +**Kubernetes Cluster:** +- Kubernetes 1.33+ (configurable via bootstrap.sh KUBE_VERSION) +- kubectl with cluster-admin access +- Working storage class (local-path for small/medium, distributed for large) +- Sufficient resources per cluster size + +**Networking:** +- Domain name or wildcard DNS (*.example.com or *.{ip}.nip.io) +- Ingress capability (Gateway API + KGateway deployed by ClusterForge) +- External LoadBalancer or MetalLB (deployed by ClusterForge) + 
+**TLS Certificates:** +- cluster-tls secret in kgateway-system namespace +- Can be self-signed for development +- Production should use Cert-Manager with ACME + +**Required Tools:** +- yq v4+ (YAML processor) +- helm 3.0+ +- kubectl +- openssl (for password generation) +- git (for --dev mode) + +### Resource Requirements + +**Small Cluster:** +- 3-5 worker nodes +- 8 CPU, 16Gi RAM minimum per node +- 250Gi+ total storage +- Local-path or hostPath storage class + +**Medium Cluster:** +- 5-10 worker nodes +- 16 CPU, 32Gi RAM minimum per node +- 500Gi+ total storage +- Local-path or distributed storage + +**Large Cluster:** +- 10+ worker nodes +- 32 CPU, 64Gi RAM minimum per node +- 1Ti+ total storage +- Distributed storage required (Longhorn, Ceph, etc.) -So using the bootstrap script user deploys ArgoCD GitOps-controller and root application which then deploys other components into the cluster. +### Functional Requirements -Here are some key components that are being deployed: +**FR1: AIRM Platform Delivery** +- Deploy AMD Resource Manager (AIRM) 0.3.2 with UI and API +- Provide model serving with KServe v0.16.0 +- Support distributed computing via KubeRay Operator 1.4.2 +- Enable workflow orchestration through Kaiwo v0.2.0-rc11 +- Integrate AMD GPU Operator v1.4.1 for GPU resources -#### Layer 1: GitOps Foundation (Bootstrap) -- **ArgoCD** - GitOps controller for continuous deployment -- **Gitea** - Git repository server for source management -- **OpenBao** - Vault-compatible secret management system +**FR2: GitOps Operations** +- Bootstrap ArgoCD 8.3.5 with single command +- Manage 40+ components as ArgoCD Applications +- Support multi-source Applications for values separation +- Enable local Gitea 12.3.0 for cluster-native GitOps +- Provide developer mode for branch-based testing + +**FR3: Size-Aware Deployment** +- Support small/medium/large configurations via --CLUSTER_SIZE flag +- Automatically merge size-specific values with base configuration +- 
Enable/disable components based on cluster size (e.g., HA modes) +- Apply appropriate policies per size (storage access modes) + +**FR4: Secrets Management** +- Initialize OpenBao 0.18.2 with vault policies +- Configure External Secrets Operator 0.15.1 integration +- Generate and store all component credentials +- Sync secrets from OpenBao to Kubernetes automatically + +**FR5: Dependency Orchestration** +- Deploy components in wave order (-5 to 0) +- Bootstrap foundation before app-of-apps (ArgoCD, OpenBao, Gitea) +- Wait for component health before proceeding +- Use ignoreDifferences for known drift patterns -#### Layer 2: Core Infrastructure -**Networking & Security:** -- **Gateway API + KGateway** - Modern ingress and traffic management -- **Cert-Manager** - Automated TLS certificate management -- **MetalLB** - Load balancer for bare metal environments -- **External Secrets Operator** - External secret integration -- **Cilium** - Network security and observability -- **Kyverno** - Policy engine with modular policy system (see [Kyverno Modular Design](docs/kyverno_modular_design.md)) +### Non-Functional Requirements -**Storage & Database:** -- **CNPG Operator** - Cloud-native PostgreSQL management -- **MinIO Operator + Tenant** - S3-compatible object storage -- **Longhorn** - Distributed block storage +**Performance:** +- Complete bootstrap in under 15 minutes (small cluster) +- ArgoCD sync time under 5 minutes for full stack +- Gitea init job completes in under 2 minutes + +**Reliability:** +- OpenBao HA with 3 replicas and Raft (large clusters) +- ArgoCD automated sync with self-heal +- Server-side apply to prevent field manager conflicts + +**Maintainability:** +- Single values file per cluster size +- DRY principle for configuration inheritance +- Versioned sources for reproducible deployments +- SBOM generation for supply chain security + +**Usability:** +- Single-command deployment +- Helpful error messages with validation +- Progress indication during 
bootstrap +- Access URLs displayed on completion + +## Development and Customization + +### Adding New Components + +1. Add chart to sources/{component}/{version}/ +2. Define app configuration in values.yaml: +```yaml +apps: + my-component: + path: my-component/1.0.0 + namespace: my-namespace + syncWave: -1 + valuesObject: + # component values +``` +3. Add to enabledApps list +4. Test with --dev mode + +### Custom Cluster Values + +Create cluster-values repository with custom values.yaml: +```yaml +# Override any base configuration +global: + domain: custom.example.com + +apps: + argocd: + valuesObject: + server: + replicas: 3 # Custom override +``` + +### Size Configuration + +Modify values_{size}.yaml to adjust resources: +- Change replica counts +- Adjust CPU/memory limits +- Enable/disable HA modes +- Add size-specific enabledApps -#### Layer 3: Observability & Monitoring -- **Prometheus** - Metrics collection and alerting -- **Grafana** - Visualization and dashboarding -- **Prometheus Operator CRDs** - Metrics collection infrastructure -- **OpenTelemetry Operator** - Distributed tracing and telemetry -- **OTEL-LGTM Stack** - Unified observability platform (Loki, Grafana, Tempo, Mimir) +## Documentation -#### Layer 4: AI/ML Compute Stack -**GPU & Compute:** -- **AMD GPU Operator** - GPU device management and drivers -- **KubeRay Operator** - Ray distributed computing framework -- **KServe + CRDs** - Kubernetes-native model serving -- **Kueue** - Advanced job queueing system -- **AppWrapper** - Application scheduling and resource management -- **KEDA** - Event-driven autoscaling +Detailed documentation in `/docs`: -**Workflow & Orchestration:** -- **Kaiwo + CRDs** - Workflow management system -- **RabbitMQ** - Message broker for async processing +- [Bootstrap Guide](docs/bootstrap_guide.md) - Deployment walkthrough +- [Cluster Size Configuration](docs/cluster_size_configuration.md) - Size planning +- [Values Inheritance 
Pattern](docs/values_inheritance_pattern.md) - GitOps configuration +- [Kyverno Modular Design](docs/kyverno_modular_design.md) - Policy architecture +- [Kyverno Access Mode Policy](docs/kyverno_access_mode_policy.md) - Storage policies +- [Backup and Restore](docs/backup_and_restore.md) - Data protection -#### Layer 5: Identity & Access -- **Keycloak** - Enterprise identity and access management -- **Cluster-Auth** - Kubernetes RBAC integration +## Software Bill of Materials (SBOM) -#### Layer 6: AIRM App -- **AIRM API** - The central API layer for AMD Resource Manager, handling authentication, access control, and cluster coordination. -- **AIRM UI** - The frontend interface to interact with resource management features, integrated with the AIRM API and authentication services. -- **AIRM Dispatcher** - The agent responsible for dispatching compute workloads to registered Kubernetes clusters and managing their lifecycle. +ClusterForge includes comprehensive SBOM tooling in `/sbom`: -## Technical Requirements +**SBOM Files:** +- `components.yaml` - Canonical list of all components with versions, licenses, and metadata +- `SBOM-QUICK-GUIDE.md` - Guide for SBOM generation and validation -### Prerequisites & Dependencies +**Validation Scripts:** +- `validate-components-sync.sh` - Ensures components.yaml matches actual sources/ +- `validate-enabled-apps.sh` - Validates enabledApps lists reference defined components +- `validate-metadata.sh` - Checks required metadata fields +- `validate-sync.sh` - Full validation suite -#### External Dependencies -- **Kubernetes cluster** with kubectl access -- **Working storage class** for persistent volumes -- **Domain name configuration** for external access -- **cluster-tls secret** in kgateway-system namespace -- **Network connectivity** for image pulls and external services +**Generation Scripts:** +- `generate-sbom.sh` - Generates SPDX/CycloneDX SBOM documents +- `generate-compare-components.sh` - Compares component versions +- 
`update_licenses.sh` - Updates license information -#### Required Tools -- **Helm 3.0+** - Package management -- **kubectl** - Kubernetes CLI tool -- **OpenSSL** - Certificate and secret generation +## Version Information -### Functional Requirements +**Current Release:** v1.8.0-rc2 -**FR1: AIRM Platform Delivery** -- Deploy complete AI/ML platform with web UI and API -- Provide model serving capabilities with KServe integration -- Support distributed computing with Ray operator -- Enable workflow orchestration through Kaiwo -- Integrate GPU resource management +**Key Component Versions:** +- ArgoCD: 8.3.5 +- Gitea: 12.3.0 +- OpenBao: 0.18.2 +- Keycloak: keycloak-old chart +- KServe: v0.16.0 +- Kaiwo: v0.2.0-rc11 +- AIRM: 0.3.2 +- Kueue: 0.13.0 +- AMD GPU Operator: v1.4.1 +- OTEL-LGTM Stack: v1.0.7 -**FR2: GitOps Operations** -- Bootstrap ArgoCD foundation with single script -- Manage all components as ArgoCD Applications -- Support both external GitHub and local Gitea repositories -- Enable continuous deployment and sync capabilities -- Provide developer access to cluster configuration via Git - -**FR3: Size-Aware Deployment** -- Support small, medium, and large cluster configurations -- Implement automatic resource scaling based on cluster size -- Provide appropriate storage and access mode configurations per size -- Enable cluster-specific policy enforcement (e.g., [Kyverno Access Mode Policy](docs/kyverno_access_mode_policy.md)) - -**FR4: Dependency Management** -- Deploy components in correct dependency order -- Validate component health before proceeding -- Handle complex inter-component dependencies automatically -- Support component customization through values files +## Support and Contribution -### Non-Functional Requirements +**Repository:** https://github.com/silogen/cluster-forge -- Single-command bootstrap deployment -- Complete platform deployment in under 30 minutes -- Provide HA-configuration for all critical components -- Support air-gapped 
deployment scenarios -- Maintain configuration version control through Git -- Enable seamless transition from external to local repository management +**Issue Tracking:** Use GitHub Issues for bug reports and feature requests -## Documentation +**Maintainers:** ClusterForge Team -Comprehensive documentation is available in the `/docs` folder: +## License -- [Bootstrap Guide](docs/bootstrap_guide.md) - Step-by-step deployment instructions -- [Cluster Size Configuration](docs/cluster_size_configuration.md) - Small/medium/large cluster setup -- [Values Inheritance Pattern](docs/values_inheritance_pattern.md) - GitOps repository configuration -- [Kyverno Modular Design](docs/kyverno_modular_design.md) - Policy system architecture -- [Kyverno Access Mode Policy](docs/kyverno_access_mode_policy.md) - Storage compatibility policies -- [Secrets Management Architecture](docs/secrets_management_architecture.md) - Security implementation -- [Backup and Restore](docs/backup_and_restore.md) - Data protection procedures \ No newline at end of file +See [LICENSE](LICENSE) and [NOTICE](NOTICE) files for licensing information. \ No newline at end of file diff --git a/README.md b/README.md index 6b9c608e..1eea0360 100644 --- a/README.md +++ b/README.md @@ -1,26 +1,28 @@ # Cluster-Forge -**A helper tool that deploys [AMD Enterprise AI Suite](https://enterprise-ai.docs.amd.com/en/latest/) into Kubernetes cluster.** +**A Kubernetes platform automation tool that deploys [AMD Enterprise AI Suite](https://enterprise-ai.docs.amd.com/en/latest/) with complete GitOps infrastructure.** ## Overview -**Cluster-Forge** is a tool designed to bundle various third-party, community, and in-house components into a single, streamlined stack that can be deployed in Kubernetes clusters. By automating the deployment process, Cluster-Forge simplifies the creation of consistent, ready-to-use clusters. 
+**Cluster-Forge** bundles third-party, community, and in-house components into a single, GitOps-managed stack deployable in Kubernetes clusters. It automates the deployment of a complete AI/ML compute platform with all essential services pre-configured and integrated. -This tool is ideal for scenarios such as: +Using a bootstrap-first deployment model, Cluster-Forge establishes GitOps infrastructure (ArgoCD, Gitea, OpenBao) before deploying the complete application stack via ArgoCD's app-of-apps pattern. -- **Ephemeral test clusters** - Create temporary environments quickly -- **CI/CD pipeline clusters** - Ensure consistent testing environments -- **Multiple production clusters** - Manage a fleet of clusters efficiently -- **Reproducible environments** - Ensure consistency across deployments +**Ideal for:** + +- **AI/ML Engineers** - Unified platform for model training, serving, and orchestration +- **Platform Engineers** - Infrastructure automation with GitOps patterns +- **DevOps Teams** - Consistent deployment across development, staging, and production +- **Research Teams** - Ephemeral test clusters for experimentation ## 🚀 Quick Start -### Basic Deployment +### Single-Command Deployment ```bash -./scripts/bootstrap.sh +./scripts/bootstrap.sh [--CLUSTER_SIZE=small|medium|large] [--dev] ``` -### Size-Aware Deployment +### Size-Aware Deployment Examples ```bash # Small cluster (1-5 users, development/testing) ./scripts/bootstrap.sh dev.example.com --CLUSTER_SIZE=small @@ -30,101 +32,135 @@ This tool is ideal for scenarios such as: # Large cluster (10s-100s users, enterprise scale) ./scripts/bootstrap.sh prod.example.com --CLUSTER_SIZE=large + +# Development mode (feature branch testing) +./scripts/bootstrap.sh dev.example.com --CLUSTER_SIZE=small --dev ``` For detailed deployment instructions, see the [Bootstrap Guide](docs/bootstrap_guide.md). 
-## 📋 Workflow +## 📋 Architecture + +### Bootstrap-First Deployment + +Cluster-Forge uses a three-phase bootstrap process: + +**Phase 1: Pre-Cleanup** +- Detects and removes previous installations when applicable +- Ensures clean state for fresh deployments -Cluster-Forge deploys all necessary components within the cluster using GitOps-controller [ArgoCD](https://argo-cd.readthedocs.io/) -and [app-of-apps pattern](https://argo-cd.readthedocs.io/en/stable/operator-manual/cluster-bootstrapping/#app-of-apps-pattern) where Cluster-Forge acts as an app of apps. +**Phase 2: GitOps Foundation Bootstrap** (Manual Helm Templates) +1. **ArgoCD** (v8.3.5) - GitOps controller deployed via helm template +2. **OpenBao** (v0.18.2) - Secrets management with initialization job +3. **Gitea** (v12.3.0) - Git server with initialization job -### GitOps Architecture +**Phase 3: App-of-Apps Deployment** (ArgoCD-Managed) +- Creates cluster-forge Application pointing to root/ helm chart +- ArgoCD syncs all remaining applications from enabledApps list +- Applications deployed in wave order (-5 to 0) based on dependencies -Cluster-Forge supports two deployment modes: -- **External Mode**: Traditional GitOps with GitHub dependency -- **Local Mode**: Self-contained GitOps with local Gitea +### Dual Repository GitOps Pattern -See [Values Inheritance Pattern](docs/values_inheritance_pattern.md) for detailed architecture documentation. +**Local Mode (Default)** - Self-contained cluster-native GitOps: +- Uses local Gitea for both cluster-forge and cluster-values repositories +- Zero external dependencies once bootstrapped +- Initialization handled by gitea-init-job + +**External Mode** - Traditional GitHub-based GitOps: +- Points to external GitHub repository +- Use `--dev` flag for feature branch development +- Supports custom branch selection for testing + +See [Values Inheritance Pattern](docs/values_inheritance_pattern.md) for detailed architecture. 
## 🛠️ Components ### Layer 1: GitOps Foundation -- **ArgoCD** - GitOps controller for continuous deployment -- **Gitea** - Git repository server for source management -- **OpenBao** - Vault-compatible secret management system +- **ArgoCD 8.3.5** - GitOps continuous deployment controller +- **Gitea 12.3.0** - Self-hosted Git server with SQLite backend +- **OpenBao 0.18.2** - Vault-compatible secrets management +- **External Secrets 0.15.1** - Secrets synchronization operator ### Layer 2: Core Infrastructure + **Networking & Security:** -- **Gateway API + KGateway** - Modern ingress and traffic management -- **Cert-Manager** - Automated TLS certificate management -- **MetalLB** - Load balancer for bare metal environments -- **External Secrets Operator** - External secret integration -- **Cilium** - Network security and observability -- **Kyverno** - Policy engine with modular policy system +- **Gateway API v1.3.0** - Kubernetes standard ingress API +- **KGateway v2.1.0-main** - Gateway API implementation with WebSocket support +- **MetalLB v0.15.2** - Bare metal load balancer +- **Cert-Manager v1.18.2** - Automated TLS certificate management +- **Kyverno 3.5.1** - Policy engine with modular policy system **Storage & Database:** -- **Longhorn** - Distributed block storage -- **CNPG Operator** - Cloud-native PostgreSQL management -- **MinIO Operator + Tenant** - S3-compatible object storage - -### Layer 3: Observability & Monitoring -- **Prometheus** - Metrics collection and alerting -- **Grafana** - Visualization and dashboarding -- **OpenTelemetry Operator** - Distributed tracing and telemetry -- **OTEL-LGTM Stack** - Unified observability platform (Loki, Grafana, Tempo, Mimir) - -### Layer 4: AI/ML Compute Stack -**GPU & Compute:** -- **AMD GPU Operator** - GPU device management and drivers -- **KubeRay Operator** - Ray distributed computing framework -- **KServe** - Kubernetes-native model serving -- **Kueue** - Advanced job queueing system -- **AppWrapper** - 
Application scheduling and resource management -- **KEDA** - Event-driven autoscaling - -**Workflow & Orchestration:** -- **Kaiwo** - Workflow management system -- **RabbitMQ** - Message broker for async processing - -### Layer 5: Identity & Access -- **Keycloak** - Enterprise identity and access management -- **Cluster-Auth** - Kubernetes RBAC integration - -### Layer 6: AIRM App -- **AIRM API** - Central API layer for AMD Resource Manager -- **AIRM UI** - Frontend interface for resource management -- **AIRM Dispatcher** - Compute workload dispatching agent - -## 💾 Storage Classes - -Storage classes are provided by default with Longhorn. These can be customized as needed. - -| Purpose | StorageClass | Access Mode | Locality | -|---------|--------------|-------------|----------| -| GPU Job | mlstorage | RWO | LOCAL/remote | -| GPU Job | default | RWO | LOCAL/remote | -| Advanced usage | direct | RWO | LOCAL | -| Multi-container | multinode | RWX | ANYWHERE | - -## 📄 Configuration +- **CNPG Operator 0.26.0** - CloudNativePG PostgreSQL operator +- **MinIO Operator 7.1.1** - S3-compatible object storage operator +- **MinIO Tenant 7.1.1** - Tenant deployment with default-bucket and models buckets -### Cluster Sizing +### Layer 3: Observability +- **Prometheus Operator CRDs 23.0.0** - Metrics infrastructure +- **OpenTelemetry Operator 0.93.1** - Telemetry collection +- **OTEL-LGTM Stack v1.0.7** - Integrated observability (Loki, Grafana, Tempo, Mimir) + +### Layer 4: Identity & Access +- **Keycloak** (keycloak-old chart) - Enterprise IAM with AIRM realm +- **Cluster-Auth 0.5.0** - Kubernetes RBAC integration -Cluster-Forge provides three pre-configured cluster profiles: +### Layer 5: AI/ML Compute Stack -- **Small**: Minimal resources, local-path storage, RWX→RWO access mode conversion -- **Medium**: Balanced resources, local-path storage, RWX→RWO access mode conversion -- **Large**: Full enterprise features, Longhorn storage, native RWX support +**GPU & Scheduling:** 
+- **AMD GPU Operator v1.4.1** - GPU device plugin and drivers
+- **KubeRay Operator 1.4.2** - Ray distributed computing framework
+- **Kueue 0.13.0** - Job queueing with multi-framework support
+- **AppWrapper v1.1.2** - Application-level resource scheduling
+- **KEDA 2.18.1** - Event-driven autoscaling
+
+**ML Serving & Inference:**
+- **KServe v0.16.0** - Model serving platform (Standard deployment mode)
+
+**Workflow & Messaging:**
+- **Kaiwo v0.2.0-rc11** - AI workload orchestration
+- **RabbitMQ v2.15.0** - Message broker for async processing
+
+### Layer 6: AIRM Application
+- **AIRM 0.3.2** - AMD Resource Manager application suite
+- **AIM Cluster Model Source** - Cluster resource models for AIRM
+
+## 🔧 Configuration
+
+### Cluster Sizing
+
+Three cluster profiles with inheritance-based resource optimization:
+
+**Small Clusters** (1-5 users, dev/test):
+- Single replica deployments
+- Reduced resource limits (ArgoCD controller: 2 CPU, 4Gi RAM)
+- Adds kyverno-policies-storage-local-path for RWX→RWO PVC mutation
+- MinIO tenant: 250Gi storage
+- Suitable for: Local workstations, development environments
+
+**Medium Clusters** (5-20 users, team production):
+- Single replica with moderate resource allocation
+- Same storage policies as small (local-path support)
+- ArgoCD controller: 2 CPU, 4Gi RAM
+- Default configuration for balanced performance
+- Suitable for: Small teams, staging environments
+
+**Large Clusters** (10s-100s users, enterprise scale):
+- OpenBao HA: 3 replicas with Raft consensus
+- No local-path policies (assumes distributed storage)
+- MinIO tenant: 500Gi storage
+- Production-grade resource allocation
+- Suitable for: Production deployments, multi-tenant environments
 
 See [Cluster Size Configuration](docs/cluster_size_configuration.md) for detailed specifications.
### Values Files Configuration follows a streamlined inheritance pattern: -- **Base**: 52 common applications with alpha-sorted enabledApps +- **Base**: Common applications with alpha-sorted enabledApps - **Size-specific**: Only override differences from base (DRY principle) -- **Runtime**: Domain and cluster-specific parameters +- **Runtime**: Domain and cluster-specific parameters injected during bootstrap + +The bootstrap script uses YAML merge semantics where size-specific values override base values.yaml settings. ## 📚 Documentation @@ -135,11 +171,13 @@ Comprehensive documentation is available in the `/docs` folder: | **Getting Started** | [Bootstrap Guide](docs/bootstrap_guide.md) | | **Configuration** | [Cluster Size Configuration](docs/cluster_size_configuration.md) | | **Architecture** | [Values Inheritance Pattern](docs/values_inheritance_pattern.md) | -| **Security** | [Kyverno Modular Design](docs/kyverno_modular_design.md) | -| **Policies** | [Kyverno Access Mode Policy](docs/kyverno_access_mode_policy.md) | -| **Secrets** | [Secrets Management Architecture](docs/secrets_management_architecture.md) | +| **Policy System** | [Kyverno Modular Design](docs/kyverno_modular_design.md) | +| **Storage Policies** | [Kyverno Access Mode Policy](docs/kyverno_access_mode_policy.md) | | **Operations** | [Backup and Restore](docs/backup_and_restore.md) | +Additional documentation: +- **SBOM**: See `/sbom` folder for software bill of materials generation and validation + ## 📝 License Cluster-Forge is licensed under the Apache License, Version 2.0. See the [LICENSE](LICENSE) file for details. diff --git a/docs/bootstrap_guide.md b/docs/bootstrap_guide.md index b5d29a88..29ad3081 100644 --- a/docs/bootstrap_guide.md +++ b/docs/bootstrap_guide.md @@ -1,78 +1,230 @@ -# Bootstrap Script +# Bootstrap Guide -This script bootstraps a complete GitOps environment with ArgoCD, OpenBao (secret management), and Gitea (Git repository) on a Kubernetes cluster. 
+This guide explains how to bootstrap a complete GitOps environment using Cluster-Forge's three-phase deployment model. The bootstrap process establishes ArgoCD, OpenBao (secret management), and Gitea (Git repository) before deploying the full application stack.
 
 ## Prerequisites
 
-- Kubernetes cluster (running and accessible via `kubectl`)
+- Kubernetes cluster (1.33+ recommended, running and accessible via `kubectl`)
 - Tools installed:
-  - `kubectl`
-  - `helm`
-  - `openssl`
-  - `yq`
+  - `kubectl` with cluster-admin access
+  - `helm` (3.0+)
+  - `openssl` (for password generation)
+  - `yq` (v4+)
+  - `git` (for --dev mode)
 
 ## Usage
 
 ```bash
-./bootstrap.sh [values_file]
+./scripts/bootstrap.sh <domain> [--CLUSTER_SIZE=small|medium|large] [--dev]
 ```
 
-**Examples:**
+### Arguments
+
+- **domain** (required): Cluster domain for all services (e.g., `example.com`, `192.168.1.100.nip.io`)
+
+### Options
+
+- **--CLUSTER_SIZE** `[small|medium|large]`: Cluster size configuration (default: `medium`)
+- **--dev**: Enable development mode for feature branch testing
+- **--help**, **-h**: Show usage information
+
+### Examples
+
 ```bash
-# Using default values_cf.yaml
-./bootstrap.sh plat-dev-1.silogen.ai
+# Basic usage with default medium cluster size
+./scripts/bootstrap.sh 192.168.1.100.nip.io
+
+# Large cluster
+./scripts/bootstrap.sh example.com --CLUSTER_SIZE=large
 
-# Using custom values file
-./bootstrap.sh plat-dev-1.silogen.ai custom_values.yaml
+# Development mode for feature branch testing
+./scripts/bootstrap.sh dev.example.com --CLUSTER_SIZE=small --dev
 ```
 
-## What Does It Do?
+## How It Works + +The bootstrap script uses a three-phase deployment model: + +### Phase 1: Pre-Cleanup +- Detects previous installations by checking for completed gitea-init-job +- Removes Gitea resources to enable fresh deployment +- Deletes OpenBao initialization jobs and temporary files +- Ensures clean state for new bootstrap -The script performs the following steps in sequence: +### Phase 2: GitOps Foundation Bootstrap (Manual Helm Templates) -### 1. Domain Configuration -- Validates that a domain argument is provided -- Sets the values file to use (defaults to `values_cf.yaml` if not specified) -- Uses the global domain value to render [root](../root) helm chart -- This domain is used for all service endpoints (Gitea, ArgoCD, etc.) +**1. Configuration Preparation** +- Validates required domain argument +- Validates cluster size (small, medium, or large) +- Merges base `values.yaml` with size-specific overrides `values_.yaml` +- Sets `global.domain` and `global.clusterSize` in merged configuration +- In dev mode: prompts for git branch selection and configures targetRevision -### 2. Namespace Creation +**2. Namespace Creation** Creates three namespaces for core components: - `argocd` - GitOps controller - `cf-gitea` - Git repository server - `cf-openbao` - Secret management system -### 3. ArgoCD Bootstrap -- Deploys ArgoCD -- Waits for all ArgoCD components to be ready +**3. ArgoCD Bootstrap** +- Extracts ArgoCD values from merged configuration +- Deploys ArgoCD using `helm template` with server-side apply +- Uses `--field-manager=argocd-controller` to match ArgoCD's self-management +- Waits for all ArgoCD components to be ready: + - application-controller StatefulSet + - applicationset-controller Deployment + - redis Deployment + - repo-server Deployment -### 4. OpenBao Bootstrap -- Deploys OpenBao -- Waits for the first pod (`openbao-0`) to be running +**4. 
OpenBao Bootstrap** +- Extracts OpenBao values from merged configuration +- Deploys OpenBao using `helm template` with server-side apply +- Waits for `openbao-0` pod to be running - Runs initialization job (`openbao-init-job`) which: - - Initializes & configures OpenBao Raft cluster - - Unseals all pods - - Creates root credentials - -### 5. Gitea Bootstrap -- Creates gitea-admin credentials secret -- Creates ConfigMap with initial cluster forge values -- Deploys & configures Gitea + - Initializes OpenBao Raft cluster + - Unseals all pods (3 for large clusters with HA) + - Configures Vault policies for each namespace + - Creates Kubernetes auth method + - Stores initialization keys and secrets + +**5. Gitea Bootstrap** +- Generates random admin password using `openssl rand -hex 16` +- Creates `initial-cf-values` ConfigMap with merged configuration +- Creates `gitea-admin-credentials` secret +- Extracts Gitea values from merged configuration +- Deploys Gitea using `helm template` - Waits for Gitea deployment to be ready - Runs initialization job (`gitea-init-job`) which: - - Creates cluster-org organization - - Creates cluster-forge as a mirror repo - - Creates cluster-values as a repo with cluster configuration + - Creates admin API token + - Creates `cluster-org` organization + - Clones and pushes cluster-forge repository from initial-cf-values ConfigMap + - Creates cluster-values repository with configuration + - In dev mode: sets targetRevision to selected branch + +### Phase 3: App-of-Apps Deployment (ArgoCD-Managed) + +**6. 
ClusterForge Application Deployment** +- Renders root helm chart with merged configuration +- Creates `cluster-forge` Application resource in ArgoCD +- When `externalValues.enabled: true`, uses multi-source feature: + - Source 1: cluster-forge repo (root/ helm chart) + - Source 2: cluster-values repo (custom values.yaml) +- ArgoCD deploys all enabled applications based on configuration +- Applications deployed in wave order (-5 to 0) based on dependencies + +**7. Cleanup** +- Removes temporary merged values files from /tmp/ + +## Cluster Configuration + +### Values Files Structure + +ClusterForge uses a layered configuration approach with YAML merge semantics: + +1. **Base values** (`root/values.yaml`): + - Contains all app definitions + - Defines default configuration for all apps + - Specifies `enabledApps` list (alpha-sorted) + - Configured with: + - `clusterForge.repoUrl` - Points to Gitea service URL (local mode) or GitHub (external mode) + - `clusterForge.targetRevision` - Version/branch to deploy + - `externalValues.enabled: true` - Enables dual-repository pattern + - `externalValues.repoUrl` - Points to cluster-values repo in Gitea + - `global.domain` - Set by bootstrap script + - `global.clusterSize` - Set by bootstrap script + +2. **Size-specific values** (`root/values_.yaml`): + - Override base values for specific cluster sizes + - Define resource limits and requests + - Single node (small and medium) RWO local-path storage + - Multinode (large) RWX storage + - Modify replica counts and HA settings + - Add size-specific enabled apps (e.g., `kyverno-policies-storage-local-path` for small/medium) + - Available sizes: `small`, `medium`, `large` + - Uses DRY principle - only contains differences from base + +3. 
**External values** (`cluster-values/values.yaml` in Gitea): + - Created during bootstrap in the `cluster-values` repository + - Contains cluster-specific overrides + - Can be modified post-bootstrap for customizations + - Structure: + ```yaml + clusterForge: + repoURL: http://gitea-http.cf-gitea.svc:3000/cluster-org/cluster-forge.git + path: root + targetRevision: main # or feature branch in dev mode + + global: + clusterSize: medium # Set by --CLUSTER_SIZE flag + domain: example.com # Set by domain argument + ``` + +### Value Merging Order + +When ArgoCD renders the cluster-forge application, values are merged in this order (later values override earlier): + +1. Base `values.yaml` +2. Size-specific `values_.yaml` +3. External `cluster-values/values.yaml` from Gitea + +### Cluster Sizes + +Each cluster size is optimized for different resource constraints: + +- **Small**: Development/testing environments, minimal resources +- **Medium** (default): Production-ready, balanced configuration +- **Large**: High-availability, maximum performance + +Size-specific configurations typically adjust: +- Component replica counts (ArgoCD, PostgreSQL, etc.) +- Resource limits and requests (CPU, memory) +- Storage sizes (PVC, retention periods) +- High-availability features (Redis HA, multiple replicas) + +## ClusterForge App-of-Apps Model + +The bootstrap script creates the root `cluster-forge` Application in ArgoCD, which implements an app-of-apps pattern. + +### Application Structure + +The `cluster-forge` Application is defined in [root/templates/cluster-forge.yaml](../root/templates/cluster-forge.yaml): -### 6. ArgoCD Application Deployment -- Creates root cluster-forge app that manages all other apps +### Child Applications -## Access to main components +The root chart renders individual Application resources for each app listed in `enabledApps` using the template in [root/templates/cluster-apps.yaml](../root/templates/cluster-apps.yaml). 
+ +Each child application includes: +- **Namespace**: Target namespace for the application +- **Path**: Location of helm chart or manifests in `sources/` +- **Values**: Configuration from `apps..valuesObject` or `valuesFile` +- **Sync wave**: Deployment order (lower numbers deploy first) +- **Sync policy**: Automated with prune and self-heal enabled +- **Ignore differences**: Optional resource-specific ignore rules + +Example child application configuration in values: + +```yaml +apps: + argocd: + path: argocd/8.3.5 + namespace: argocd + syncWave: -3 + valuesObject: + # ArgoCD-specific values + helmParameters: + - name: global.domain + value: "argocd.{{ .Values.global.domain }}" +``` + +## Access to Main Components 1. **ArgoCD:** ```bash - # Initial admin user password + # Initial admin password kubectl -n argocd get secret argocd-initial-admin-secret -o jsonpath="{.data.password}" | base64 -d + + # Access URL (replace with your domain) + echo "https://argocd.${DOMAIN}" ``` 2. **Gitea:** @@ -82,36 +234,245 @@ Creates three namespaces for core components: # Admin password kubectl -n cf-gitea get secret gitea-admin-credentials -o jsonpath="{.data.password}" | base64 -d + + # API token (created by init job) + kubectl -n cf-gitea get secret gitea-admin-token -o jsonpath="{.data.token}" | base64 -d + + # Access URL (replace with your domain) + echo "https://gitea.${DOMAIN}" ``` 3. **OpenBao:** ```bash # Root token kubectl -n cf-openbao get secret openbao-keys -o jsonpath='{.data.root_token}' | base64 -d + + # Unseal keys (stored in openbao-keys secret) + kubectl -n cf-openbao get secret openbao-keys -o jsonpath='{.data.unseal_keys_b64}' | base64 -d ``` -4. **Devuser secret:** +4. 
**Keycloak (deployed by ArgoCD):** ```bash - # Devuser secret - kubectl -n keycloak get secret airm-devuser-credentials -o jsonpath="{.data.KEYCLOAK_INITIAL_DEVUSER_PASSWORD}"| base64 -d + # Admin password + kubectl -n keycloak get secret keycloak-credentials -o jsonpath="{.data.KEYCLOAK_INITIAL_ADMIN_PASSWORD}" | base64 -d + + # Dev user password + kubectl -n keycloak get secret airm-devuser-credentials -o jsonpath="{.data.KEYCLOAK_INITIAL_DEVUSER_PASSWORD}" | base64 -d + ``` + +## Development Mode + +Development mode is designed for testing ClusterForge changes from a feature branch in a lightweight environment. Unlike production mode, dev mode uses a minimal configuration without local Gitea mirrors or full secret management infrastructure. + +### Key Differences from Production + +| Feature | Production Mode | Development Mode | +|---------|----------------|------------------| +| **Cluster Size** | Medium/Large recommended | **Small required** | +| **Repository Source** | Local Gitea mirrors | Direct GitHub access | +| **Gitea Deployment** | ✓ Deployed | ✗ **Not deployed** | +| **OpenBao Deployment** | ✓ Full deployment | ✗ **Minimal/not deployed** | +| **External Values** | ✓ Enabled (cluster-values repo) | ✗ Disabled (inline values) | +| **Resource Requirements** | Higher | **Minimal** | +| **Use Case** | Production clusters | Feature testing, development | + +### Prerequisites for Dev Mode + +1. **Small cluster configuration required** - Dev mode is optimized for resource-constrained environments +2. **GitHub access** - Direct connection to GitHub repository (no local Gitea mirror) +3. **Minimal infrastructure** - Skips Gitea and heavy secret management components + +### Enabling Development Mode + +**Note:** Dev mode must use small cluster size to work properly: + +```bash +./bootstrap.sh --dev dev.example.com --CLUSTER_SIZE=small +``` + +The script will: +1. Detect your current git branch +2. 
Prompt you to confirm using it or specify a custom branch: ``` + Development mode enabled - ArgoCD will point to live GitHub repository + Current git branch: feature-xyz + + Use current branch 'feature-xyz' for targetRevision? [Y/n/custom_branch]: + ``` +3. Configure ClusterForge to point directly to GitHub (bypassing Gitea) +4. Set `externalValues.enabled: false` to use inline configuration +5. Deploy only ArgoCD and essential components + +### Development Workflow + +1. **Create feature branch:** + ```bash + git checkout -b feature-xyz + ``` + +2. **Make your changes** to apps, values, or configurations + +3. **Commit and push to GitHub:** + ```bash + git add . + git commit -m "Add new feature" + git push origin feature-xyz + ``` + +4. **Bootstrap cluster in dev mode (small cluster only):** + ```bash + ./bootstrap.sh --dev dev.example.com --CLUSTER_SIZE=small + # Confirm using 'feature-xyz' branch when prompted + ``` + +5. **Iterate:** Any subsequent changes pushed to `feature-xyz` will automatically sync to the cluster directly from GitHub via ArgoCD + +### Configuration for Dev Mode + +In dev mode, the cluster-forge Application uses a single-source configuration: + +```yaml +externalValues: + enabled: false # No separate cluster-values repo + +clusterForge: + repoUrl: "https://github.com/silogen/cluster-forge.git" # Direct GitHub access + targetRevision: feature-xyz # Your feature branch +``` + +This bypasses the need for: +- Local Gitea deployment and mirrors +- Separate cluster-values repository +- Full OpenBao secret management infrastructure + +### When to Use Dev Mode + +**Use dev mode when:** +- Testing new features or configurations +- Developing on resource-constrained clusters +- Rapid iteration without infrastructure overhead +- Working with feature branches before merging + +**Use production mode when:** +- Deploying to production environments +- Requiring full secret management (OpenBao) +- Needing local Git mirrors for air-gapped scenarios +- 
Medium/Large cluster configurations + +## Troubleshooting + +**Note:** Some troubleshooting steps below only apply to production mode deployments that include Gitea and OpenBao. + +### Bootstrap Fails at Gitea Init + +*Production mode only* + +If the Gitea initialization job fails during repository migration: + +```bash +# Check job logs +kubectl logs -n cf-gitea job/gitea-init-job + +# The job automatically retries migration up to 5 times +# If it continues failing, check Gitea pod logs +kubectl logs -n cf-gitea deploy/gitea -c gitea +``` + +### OpenBao Init Job Fails + +*Production mode only* + +If OpenBao initialization fails: + +```bash +# Check init job logs +kubectl logs -n cf-openbao job/openbao-init-job + +# Verify OpenBao is running +kubectl get pods -n cf-openbao + +# Re-run bootstrap (pre-cleanup will handle the retry) +./bootstrap.sh your-domain.com +``` + +### ArgoCD Applications Not Syncing + +If applications aren't deploying: + +```bash +# Check cluster-forge app status +kubectl get application cluster-forge -n argocd -o yaml + +# Check individual app status +kubectl get applications -n argocd + +# View app details in ArgoCD UI +# https://argocd.your-domain.com +``` + +### Merged Values Inspection + +The bootstrap script creates temporary merged values at `/tmp/merged_values.yaml` for debugging. You can inspect this file during bootstrap to see the final merged configuration. + +## Post-Bootstrap Customization + +### Production Mode (with Gitea) + +After bootstrap completes in production mode, you can customize the cluster by modifying the `cluster-values` repository in Gitea: + +1. **Access Gitea** at `https://gitea.${DOMAIN}` +2. **Navigate to** `cluster-org/cluster-values` repository +3. **Edit** `values.yaml` to add/override configuration +4. **Commit** changes +5. 
**ArgoCD** will automatically detect and apply changes + +Example customizations in `cluster-values/values.yaml`: + +```yaml +# Override app-specific values +apps: + keycloak: + valuesObject: + replicas: 2 + resources: + requests: + memory: "1Gi" + +# Disable specific apps +enabledApps: + - argocd + - gitea + # ... list only apps you want enabled + +# Add custom global values +global: + myCustomValue: "something" +``` + +### Development Mode (without Gitea) + +In development mode, there is no local Gitea repository. Customization is done by: -4. **Keycloak admin secret:** +1. **Editing values files** in your local cluster-forge repository +2. **Committing changes** to your feature branch +3. **Pushing to GitHub:** ```bash - # Devuser secret - kubectl -n keycloak get secret keycloak-credentials -o jsonpath="{.data.KEYCLOAK_INITIAL_ADMIN_PASSWORD}"| base64 -d + git add root/values.yaml # or values_small.yaml + git commit -m "Update configuration" + git push origin feature-xyz ``` +4. **ArgoCD syncs automatically** from GitHub -## Development +Since `externalValues.enabled: false` in dev mode, all configuration is in the main values files (`values.yaml`, `values_small.yaml`). -For development purposes there is a way to sync all apps directly from cluster-forge GitHub repo bypassing gitea. Here is the possible development flow: +## File Cleanup -- Create feature-branch with your changes -- Modify `values_dev.yaml` file with the following parameters: - - `clusterForge.targetRevision` - feature-branch name - - `global.domain` - domain name -- Commit & push changes to your feature-branch -- Run `scripts/bootstrap_dev.sh` -- Wait for cluster apps to be ready -- From this point forward, any changes you push to your feature branch will be automatically synchronized to the cluster by ArgoCD. 
+The bootstrap script automatically cleans up temporary files at the end: +- `/tmp/merged_values.yaml` +- `/tmp/argocd_values.yaml` +- `/tmp/argocd_size_values.yaml` +- `/tmp/openbao_values.yaml` +- `/tmp/openbao_size_values.yaml` +- `/tmp/gitea_values.yaml` +- `/tmp/gitea_size_values.yaml` diff --git a/docs/cluster_size_configuration.md b/docs/cluster_size_configuration.md index 1d094ebc..3c91524d 100644 --- a/docs/cluster_size_configuration.md +++ b/docs/cluster_size_configuration.md @@ -1,25 +1,26 @@ -# ClusterForge Size-Based Configuration +# Cluster-Forge Size-Based Configuration -This document describes the cluster size-based configuration system for ClusterForge applications, enabling optimal resource allocation based on cluster scale. +This document describes the cluster size-based configuration system for Cluster-Forge, enabling optimal resource allocation based on cluster scale. ## Overview -ClusterForge now supports three cluster sizes, each with optimized resource allocations for the applications deployed on top of ClusterBloom: +Cluster-Forge supports three cluster sizes, each with optimized resource allocations: - **Small**: Developer/single-user setups (1-5 users) - **Medium**: Team clusters (5-20 users) - **Large**: Production/enterprise scale (10s-100s users) +Size configurations use YAML merge semantics where size-specific values override base values.yaml settings. 
+ ## File Structure ``` cluster-forge/ ├── root/ -│ ├── values.yaml # Base configuration (all applications enabled) +│ ├── values.yaml # Base configuration (all applications) │ ├── values_small.yaml # Small cluster overrides │ ├── values_medium.yaml # Medium cluster overrides -│ ├── values_large.yaml # Large cluster overrides -│ └── values_dev.yaml # Development environment overrides +│ └── values_large.yaml # Large cluster overrides └── scripts/ └── bootstrap.sh # Main bootstrap script with size support ``` @@ -30,202 +31,188 @@ cluster-forge/ **Target**: Developer Cluster / Single-User Setup (1-5 users) **Infrastructure**: -- **Nodes**: 1 all-in-one or 2 nodes (1×CP + 1×GPU worker) +- **Nodes**: 1-2 nodes (single all-in-one or 1 control plane + 1 worker) - **CPU**: 8-32 vCPU total - **Memory**: 32-128 GB RAM total -- **GPU**: 1-4 GPUs, no partitioning needed -- **Storage**: 1-4 TB total NVMe, Internal S3: 0.5-2 TB +- **GPU**: 1-4 GPUs (optional) +- **Storage**: 250Gi+ total, local-path StorageClass - **Networking**: 1 GbE acceptable **Application Configuration**: -- **ArgoCD**: Single replica, minimal resources -- **MinIO**: Single server, 500GB storage -- **OpenBao**: Single instance (no HA) -- **Prometheus**: 7d retention, 10GB storage -- **Grafana**: Single replica, 1GB storage +- **ArgoCD**: Single replica, 2 CPU / 4Gi RAM limits +- **MinIO Tenant**: 250Gi storage, single server +- **OpenBao**: Single instance (no HA), 5Gi storage +- **Storage Policies**: Includes `kyverno-policies-storage-local-path` for RWX→RWO conversion +- **Component Replicas**: All single replica deployments -**Use Cases**: Development, testing, proof-of-concept +**Use Cases**: Development, testing, proof-of-concept, local workstations ### Medium Cluster (`values_medium.yaml`) **Target**: Team Cluster (5-20 users) **Infrastructure**: -- **Nodes**: 1-3 nodes (Option A: 1×CP + 1-2 GPU workers, Option B: 3×CP + GPU workers) -- **CPU**: 32-64 vCPU per GPU node -- **Memory**: 128-256 GB RAM 
per GPU node -- **GPU**: Up to 8 GPUs total, partitioning optional -- **Storage**: 4-16 TB total NVMe, Internal S3: 2-10 TB +- **Nodes**: 3-5 nodes +- **CPU**: 32-64 vCPU per node +- **Memory**: 128-256 GB RAM per node +- **GPU**: Up to 8 GPUs total (optional) +- **Storage**: 500Gi+ total, local-path or distributed storage - **Networking**: 10 GbE recommended **Application Configuration**: -- **ArgoCD**: 2 replicas with HA Redis -- **MinIO**: 3 servers, 6TB total (3×2TB), datasets bucket -- **OpenBao**: 3 replicas with Raft HA -- **Enhanced resources** for team collaboration +- **ArgoCD**: Single replica, 2 CPU / 4Gi RAM limits +- **MinIO Tenant**: 250Gi storage, single server +- **OpenBao**: Single instance (no HA), 5Gi storage +- **Storage Policies**: Includes `kyverno-policies-storage-local-path` for RWX→RWO conversion +- **Component Replicas**: Balanced single replica configuration -**Use Cases**: Production workloads, staging environments +**Use Cases**: Team production workloads, staging environments, CI/CD ### Large Cluster (`values_large.yaml`) -**Target**: Production-Path / Scale-Out (10s-100s users) +**Target**: Production-Path / Enterprise Scale (10s-100s users) **Infrastructure**: -- **Nodes**: 3-5 dedicated CP servers + 3-6 GPU nodes (scale to 100s) -- **CPU**: Workers: 32-96 vCPU, CP nodes: 8-16 vCPU -- **Memory**: Workers: 256-1024 GB, CP nodes: 32-64 GB +- **Nodes**: 10+ nodes (3-5 dedicated control plane + GPU workers) +- **CPU**: Workers: 32-96 vCPU, Control plane: 8-16 vCPU +- **Memory**: Workers: 256-1024 GB, Control plane: 32-64 GB - **GPU**: 8+ GPUs baseline, mixed families, heterogeneous -- **Storage**: 10-100+ TB NVMe, External HA S3 (recommended) -- **Networking**: 25 GbE or more, optional separate storage network +- **Storage**: 1Ti+ total, distributed storage required +- **Networking**: 25 GbE or more recommended **Application Configuration**: -- **ArgoCD**: 3 replicas with enhanced PDB -- **MinIO**: External HA S3 recommended -- 
**OpenBao**: Full HA with enhanced security -- **Full observability stack** with extended retention +- **ArgoCD**: Single replica, production-ready resources +- **MinIO Tenant**: 500Gi storage, single server (external HA S3 recommended) +- **OpenBao**: 3 replicas with Raft HA consensus +- **Storage Policies**: No local-path policies (assumes distributed storage) +- **OTEL LGTM Stack**: 50Gi storage per component (Tempo, Loki, Mimir), 10Gi Grafana +- **Component Replicas**: Production-grade, HA where applicable -**Use Cases**: Large-scale production, enterprise deployments +**Use Cases**: Large-scale production, enterprise deployments, multi-tenant environments ## Usage ### Using the Bootstrap Script -The bootstrap script automatically selects the appropriate size configuration: +The bootstrap script automatically applies the appropriate size configuration: ```bash -# Basic usage (auto-detects cluster size) +# Default (medium cluster) ./scripts/bootstrap.sh example.com # Explicitly specify cluster size -./scripts/bootstrap.sh example.com --size small -./scripts/bootstrap.sh example.com --size medium -./scripts/bootstrap.sh example.com --size large - -# CI mode (no interactive prompts) -./scripts/bootstrap.sh example.com --size medium --ci +./scripts/bootstrap.sh example.com --CLUSTER_SIZE=small +./scripts/bootstrap.sh example.com --CLUSTER_SIZE=medium +./scripts/bootstrap.sh example.com --CLUSTER_SIZE=large ``` -### Size Detection Logic +### Configuration Merge Logic -The bootstrap script uses multiple methods to determine cluster size: +The script combines configurations using YAML merge semantics: +1. **Base**: `values.yaml` (all applications, common defaults) +2. **Size-specific**: `values_[size].yaml` (overrides and size-specific additions) -1. **Explicit `--size` parameter** (highest priority) -2. **CLUSTER_SIZE from bloom-config ConfigMap** (if available) -3. 
**Auto-detection based on node count** (fallback to small) +Later values override earlier ones, allowing size files to contain only the differences (DRY principle). -### Configuration Merge Logic +## Key Configuration Differences + +### Storage Strategy -The script combines configurations in this order: -1. **Base**: `values.yaml` (all applications enabled) -2. **Size-specific**: `values_[size].yaml` (resource overrides) -3. **Environment-specific**: `values_dev.yaml' (if specified) +| Size | Storage Approach | RWX Support | Kyverno Policy | +|------|-----------------|-------------|----------------| +| Small | local-path | ❌ (mutated to RWO) | `kyverno-policies-storage-local-path` | +| Medium | local-path or distributed | ❌ (mutated to RWO) | `kyverno-policies-storage-local-path` | +| Large | Distributed storage | ✅ Native RWX | No local-path policy | -## Application-Specific Configurations +### High Availability -### ArgoCD Scaling +| Component | Small | Medium | Large | +|-----------|-------|--------|-------| +| OpenBao | Single instance | Single instance | 3 replicas (Raft HA) | +| ArgoCD | Single replica | Single replica | Single replica | +| Redis | Single instance | Single instance | Single instance | +| Gitea | Single replica | Single replica | Single replica | -| Size | Controller Replicas | Repo Server Replicas | Redis HA | Resources | -|------|--------------------|--------------------|----------|-----------| -| Small | 1 | 1 | Disabled | Minimal | -| Medium | 2 | 2 | Enabled | Standard | -| Large | 3 | 3 | Enhanced | High + PDB | +### Observability Stack +| Size | Stack | Storage per Component | Notes | +|------|-------|----------------------|-------| +| Small | Basic | Minimal | Resource-constrained | +| Medium | Basic | Moderate | Team-scale monitoring | +| Large | OTEL LGTM | 50Gi (Tempo/Loki/Mimir), 10Gi (Grafana) | Full observability platform | + +| Application | Small | Medium | Large | Notes | +|-------------|-------|--------|-------|-------| +| 
Gitea | Base config | Base config | SQLite, no PostgreSQL/Valkey | Lightweight for all sizes | +| Keycloak | Base config | Base config | 1 replica, optimized resources | CPU: 250-500m, Mem: 512Mi-2Gi | +| Kueue | 1 replica | 1 replica | 1 replica | Workload queue controller | +| KEDA | Base config | Base config | Base config | Event-driven autoscaling | +| KServe | Base config | Base config | Base config | ML model serving | +| Kyverno | Base policies | Base + storage-local-path | Base policies only | Policy engine | ### MinIO Tenant Scaling -| Size | Servers | Storage per Server | Total Storage | Buckets | -|------|---------|-------------------|---------------|---------| -| Small | 1 | 500Gi | 500GB | Basic (default, models) | -| Medium | 3 | 2Ti | 6TB | + datasets | -| Large | External | - | 10-100+ TB | Full enterprise | +| Size | Servers | Storage | Buckets | Notes | +|------|---------|---------|---------|-------| +| Small | 1 | 250Gi | default-bucket, models | Single server, local-path storage | +| Medium | 1 | 250Gi | default-bucket, models | Single server, local-path or distributed | +| Large | 1 | 500Gi | default-bucket, models | Single server, external HA S3 recommended | ### OpenBao Scaling | Size | Mode | Replicas | Storage | HA Method | |------|------|----------|---------|-----------| -| Small | Standalone | 1 | 1Gi | None | -| Medium | HA | 3 | Standard | Raft | -| Large | HA | 3+ | Enhanced | Raft + external | - -## Advanced Configuration - -### Combining Size with Environment - -```bash -# Small development cluster -./scripts/bootstrap.sh dev.example.com --size small - -# Large production cluster with HA -./scripts/bootstrap.sh prod.example.com --size large -``` - -### Custom Overrides - -You can add additional override files: - -```bash -# Custom GPU configuration for large cluster -./scripts/bootstrap.sh gpu.example.com --size large -f custom-gpu-values.yaml -``` - -### Environment Variables - -The script supports environment variables: -- 
`CLUSTER_SIZE`: Override detected size -- `DOMAIN`: Set domain if not provided as argument -- `CI_MODE`: Enable CI mode (equivalent to `--ci`) - -## Validation - -The bootstrap script validates: -- **Node count** against cluster size requirements -- **Resource availability** for the selected size -- **Application compatibility** with cluster capabilities - -## Migration Between Sizes - -To change cluster size: - -1. **Update the size parameter**: Re-run bootstrap with new `--size` -2. **Resource validation**: Ensure cluster meets new requirements -3. **Application scaling**: ArgoCD will handle application updates -4. **Storage considerations**: May require storage expansion for larger sizes +| Small | Standalone | 1 | 5Gi | None | +| Medium | Standalone | 1 | 5Gi | None | +| Large | HA | 3 | 10Gi (default) | Raft consensus | ## Benefits 1. **Resource Optimization**: Right-sized configurations prevent over/under-provisioning -2. **Cost Efficiency**: Small clusters use minimal resources -3. **Scalability**: Easy to migrate between sizes as needs grow -4. **Consistency**: Standardized configurations across deployments -5. **Automation**: Bootstrap script handles complexity - -## Troubleshooting - -### Size Detection Issues -```bash -# Check current size detection -kubectl get configmap bloom-config -n default -o yaml - -# Force size override -./scripts/bootstrap.sh example.com --size medium -``` - -### Resource Constraints -```bash -# Validate node resources -kubectl describe nodes - -# Check for resource contention -kubectl top nodes -kubectl top pods --all-namespaces + - Small: Minimal replicas, basic resources + - Medium: Balanced configuration for team use + - Large: Production-grade with HA features + +2. **Storage Strategy**: Automatic policy application + - Small/Medium: Kyverno RWX→RWO mutation for local-path compatibility + - Large: Native RWX support with distributed storage + +3. 
**Cost Efficiency**: Progressive resource allocation + - Single replicas for small/medium clusters + - HA only enabled where needed (large clusters) + - DRY configuration principle reduces maintenance + +4. **Scalability**: Easy path from development to production + - Consistent application structure across sizes + - Configuration inheritance reduces duplication + - Clear upgrade path between sizes + +5. **Automation**: Bootstrap script handles all complexity + - Automatic value file merging + - Size-appropriate policy application + - Validation of configurations + +## Customization + +### Adding Custom Overrides + +Modify size-specific values files to adjust resources: + +```yaml +# values_large.yaml example +apps: + openbao: + valuesObject: + server: + ha: + enabled: true + replicas: 3 # HA for large clusters ``` -### Application Scaling Issues -```bash -# Check ArgoCD application status -kubectl get applications -n argocd - -# View specific application details -kubectl describe application -n argocd -``` +### Enabling/Disabling Applications ---- +Control which applications are deployed per size: -**This is the way** - A scalable configuration system that adapts ClusterForge applications to cluster capacity, ensuring optimal performance across all deployment sizes! \ No newline at end of file +```yaml +# values_small.yaml +enabledApps: + # Inherits base apps, adds storage policy + - kyverno-policies-storage-local-path +``` \ No newline at end of file diff --git a/docs/secret-management-user-guide.md b/docs/secret-management-user-guide.md deleted file mode 100644 index aefadaec..00000000 --- a/docs/secret-management-user-guide.md +++ /dev/null @@ -1,274 +0,0 @@ -# Secret Management User Guide - -This guide provides practical instructions for end-users to manage secrets in the cluster-forge OpenBao system. 
- -## Overview - -The cluster-forge secret management system uses a **declarative, GitOps-based approach** where secrets are defined in configuration files and automatically created by a CronJob that runs every 5 minutes. - -**How it works:** -- **For existing components**: All application secrets are already defined and automatically managed -- **For new components**: When you add a new application that needs secrets, you define them in the configuration file, commit the changes, and they're automatically created in OpenBao. Your new component can then fetch these secrets via External Secrets Operator using ExternalSecret resources that reference the OpenBao paths. - -**Example workflow for new components:** -1. Add your application deployment files -2. Define required secrets in `openbao-secret-definitions.yaml` -3. Create ExternalSecret resources to fetch the secrets from OpenBao -4. Your application pods automatically receive the secrets as Kubernetes Secret mounts - -## Quick Start: Adding a New Secret - -### 1. Edit the Secret Definition File - -Navigate to and edit: -``` -sources/openbao-config/0.1.0/templates/openbao-secret-definitions.yaml -``` - -### 2. Add Your Secret Definition - -Add a new line following this format: -``` -SECRET_PATH|TYPE|VALUE|BYTES -``` - -**Examples:** -```bash -# Random 32-byte password for your application -secrets/my-app-database-password|random||32 - -# Static API key -secrets/my-app-api-key|static|your-fixed-api-key-here|0 - -# Domain-based URL (uses templating) -secrets/my-app-callback-url|static|https://my-app.{{ .Values.domain }}/callback|0 -``` - -### 3. Commit and Push Changes - -```bash -git add sources/openbao-config/0.1.0/templates/openbao-secret-definitions.yaml -git commit -m "feat: add secrets for my-app" -git push origin main -``` - -### 4. Wait for Automatic Creation - -The GitOps pipeline will automatically create your secrets. 
**Total time: ~20-25 minutes** - -**Pipeline stages:** -- **GitHub → Gitea sync**: ~15 minutes (Gitea syncs every 15 minutes) -- **ArgoCD deployment**: ~3 minutes (ArgoCD detects and deploys changes) -- **Secret creation**: ~0-5 minutes (CronJob runs every 5 minutes) - -**Monitor progress:** -```bash -# Check ArgoCD sync status -kubectl get application openbao-config -n argocd - -# Check recent CronJob executions -kubectl get jobs -n cf-openbao -l job-name=openbao-secret-manager --sort-by=.metadata.creationTimestamp -``` - -**✅ Your secrets are ready when:** The CronJob shows a successful completion and you can verify the secret exists in OpenBao. - -## Secret Definition Format Reference - -### Format Specification - -``` -SECRET_PATH|TYPE|VALUE|BYTES -``` - -### Field Descriptions - -| Field | Description | Examples | -|-------|-------------|----------| -| **SECRET_PATH** | Path where secret will be stored in OpenBao | `secrets/my-app-password` | -| **TYPE** | Secret type: `static` or `random` | `random`, `static` | -| **VALUE** | Used only for static secrets (supports templating) | `my-api-key`, `https://api.{{ .Values.domain }}/v1` | -| **BYTES** | Used only for random secrets (length in bytes) | `16`, `32`, `64` | - -### Secret Types - -**Random Secrets:** -```bash -# Format: secrets/path|random||BYTES -secrets/my-app-password|random||16 # 16-byte password -secrets/api-key|random||32 # 32-byte API key -``` - -**Static Secrets:** -```bash -# Format: secrets/path|static|VALUE|0 -secrets/my-api-url|static|https://api.example.com|0 # Fixed value -secrets/my-callback|static|https://app.{{ .Values.domain }}|0 # Domain templating -``` - -## Working with Secrets - -### Viewing Secret Values - -```bash -# Check if secret exists in OpenBao -kubectl exec -n cf-openbao openbao-0 -- bao kv get secrets/my-app-password - -# View secret value (requires access) -kubectl exec -n cf-openbao openbao-0 -- bao kv get -field=value secrets/my-app-password -``` - -**Note**: Secrets are 
never updated automatically once created to prevent breaking applications. - -## Using Secrets in Applications - -### 1. Create an ExternalSecret Resource - -Create a file like `my-app-external-secret.yaml`: - -```yaml -apiVersion: external-secrets.io/v1beta1 -kind: ExternalSecret -metadata: - name: my-app-secrets - namespace: my-namespace -spec: - refreshInterval: 60s - secretStoreRef: - name: openbao-secret-store - kind: ClusterSecretStore - target: - name: my-app-secret - creationPolicy: Owner - data: - - secretKey: password - remoteRef: - key: secrets/my-app-password - property: value - - secretKey: api-key - remoteRef: - key: secrets/my-app-api-key - property: value -``` - -### 2. Use the Secret in Your Pod - -```yaml -apiVersion: v1 -kind: Pod -metadata: - name: my-app - namespace: my-namespace -spec: - containers: - - name: app - image: my-app:latest - env: - - name: DATABASE_PASSWORD - valueFrom: - secretKeyRef: - name: my-app-secret - key: password - - name: API_KEY - valueFrom: - secretKeyRef: - name: my-app-secret - key: api-key -``` - -## Current Secret Inventory - -For a complete and up-to-date list of all secrets in the system, refer to the **source of truth**: - -``` -sources/openbao-config/0.1.0/templates/openbao-secret-definitions.yaml -``` - -This file contains all currently defined secrets organized by category: -- **Cluster-Wide**: Domain configuration -- **AIRM Application**: Database, RabbitMQ, and UI authentication secrets -- **Keycloak**: Admin passwords and database credentials -- **MinIO**: Storage access keys and console credentials -- **Infrastructure**: Client secrets for Kubernetes, Gitea, and ArgoCD -- **AIWB Application**: Database and authentication secrets - -**To view current secrets:** -```bash -# View the complete secret definitions file -cat sources/openbao-config/0.1.0/templates/openbao-secret-definitions.yaml - -# Or check specific secrets in OpenBao -kubectl exec -n cf-openbao openbao-0 -- bao kv list secrets/ -``` - -## 
Troubleshooting - -### Secret Not Created After 25 Minutes - -1. **Check ArgoCD sync status:** - ```bash - kubectl get application openbao-config -n argocd - ``` - -2. **Check CronJob execution:** - ```bash - kubectl get cronjob openbao-secret-manager -n cf-openbao - kubectl get jobs -n cf-openbao -l job-name=openbao-secret-manager - ``` - -3. **Check CronJob logs:** - ```bash - # Get the most recent job - JOB=$(kubectl get jobs -n cf-openbao -l job-name=openbao-secret-manager --sort-by=.metadata.creationTimestamp -o jsonpath='{.items[-1].metadata.name}') - kubectl logs job/$JOB -n cf-openbao - ``` - -### Secret Definition Format Errors - -**Error**: CronJob fails with parsing errors - -**Solution**: Check your secret definition format: -- Ensure exactly 4 fields separated by `|` -- No extra spaces around the `|` separators -- For random secrets: VALUE field should be empty -- For static secrets: BYTES field should be `0` - -**Example of incorrect format:** -```bash -# Wrong - extra spaces -secrets/my-secret | random | | 32 - -# Wrong - missing field -secrets/my-secret|random|32 - -# Correct -secrets/my-secret|random||32 -``` - -### Common Issues - -**ExternalSecret not syncing:** -```bash -kubectl get externalsecret my-app-secrets -n my-namespace -kubectl describe externalsecret my-app-secrets -n my-namespace -``` - -**Secret not found in OpenBao:** -```bash -kubectl exec -n cf-openbao openbao-0 -- bao kv get secrets/my-app-password -``` - -## Best Practices - -**Naming:** Use descriptive, hierarchical names like `secrets/my-app-database-password` - -**Security:** Never commit actual secret values to git. Use random secrets for passwords/tokens. - -**Organization:** Group related secrets with consistent prefixes (e.g., `secrets/airm-*`) - -**Change Management:** Test in development first, existing secrets are never updated automatically. 
- -## Getting Help - -**For issues:** Check troubleshooting section, ArgoCD/CronJob logs, or see [secrets management architecture documentation](secrets_management_architecture.md) - -**For architectural details:** See [secrets management architecture documentation](secrets_management_architecture.md) for comprehensive system overview \ No newline at end of file diff --git a/docs/secrets_management_architecture.md b/docs/secrets_management_architecture.md deleted file mode 100644 index 4531ed76..00000000 --- a/docs/secrets_management_architecture.md +++ /dev/null @@ -1,455 +0,0 @@ -# Secrets Management Architecture - -## Overview - -This document describes the comprehensive secrets management architecture used in cluster-forge. The system is built around OpenBao (open-source Vault fork) as the central secrets vault, with External Secrets Operator enabling seamless integration with Kubernetes workloads. - -## Architecture Diagram - -```mermaid -graph TB - %% Styling - classDef vault fill:#4B8BBE,stroke:#306998,stroke-width:3px,color:#fff - classDef k8s fill:#326CE5,stroke:#00308F,stroke-width:2px,color:#fff - classDef app fill:#00D084,stroke:#00A86B,stroke-width:2px,color:#fff - classDef external fill:#FF6B6B,stroke:#C92A2A,stroke-width:2px,color:#fff - classDef cron fill:#FFA500,stroke:#FF8C00,stroke-width:2px,color:#fff - - %% OpenBao Core - subgraph OpenBao["OpenBao Vault Cluster (cf-openbao namespace)"] - BAO0[OpenBao-0
Leader]:::vault - BAO1[OpenBao-1
Follower]:::vault - BAO2[OpenBao-2
Follower]:::vault - - BAO0 -.Raft Replication.-> BAO1 - BAO0 -.Raft Replication.-> BAO2 - - subgraph Storage["Raft Integrated Storage"] - RAFT[Persistent Volumes
Raft Consensus]:::vault - end - - subgraph Auth["Authentication"] - USERPASS[UserPass Auth
readonly-user]:::vault - end - - subgraph Secrets["Secret Engines"] - KV2[KV v2: secrets/*
Generated Credentials]:::vault - APIKEY[KV v2: apikey-groups/*
API Keys]:::vault - RANDOM[sys/tools/random
Password Generator]:::vault - end - - BAO0 --> RAFT - BAO1 --> RAFT - BAO2 --> RAFT - BAO0 --> Auth - BAO0 --> Secrets - end - - %% Bootstrap Process - subgraph Bootstrap["Bootstrap Process (bootstrap.sh)"] - INIT[1. Init OpenBao
Generate Keys]:::cron - UNSEAL[2. Unseal All Pods
Join Raft Cluster]:::cron - SETUP[3. Setup Auth & Engines
Create read-policy]:::cron - GENSEC[4. Generate Secrets
Random Passwords]:::cron - - INIT --> UNSEAL - UNSEAL --> SETUP - SETUP --> GENSEC - end - - %% Unseal Automation - subgraph UnsealAuto["Automated Unseal (CronJob)"] - CRONJOB[openbao-unseal-job
Runs every 5 minutes]:::cron - UNSEALSCRIPT[Unseal Script
Checks sealed pods]:::cron - - CRONJOB --> UNSEALSCRIPT - end - - %% Kubernetes Secrets - subgraph K8sSecrets["Kubernetes Secrets Storage"] - BAOKEYS[openbao-keys
root_token, unseal_key]:::k8s - BAOUSER[openbao-user
readonly credentials]:::k8s - GITEAADMIN[gitea-admin-credentials
bootstrap admin]:::k8s - end - - %% External Secrets Operator - subgraph ESO["External Secrets Operator (external-secrets namespace)"] - ESOCTRL[ES Controller]:::external - ESOWH[ES Webhook]:::external - ESOCERT[ES Cert Controller]:::external - - ESOCTRL -.Watches.-> ESOWH - ESOCERT -.Manages.-> ESOWH - end - - %% ClusterSecretStores - subgraph CSS["ClusterSecretStores"] - CSS1[openbao-secret-store
UserPass Auth
path: secrets/]:::external - CSS2[k8s-secret-store
K8s SA Auth
backend: cf-es-backend]:::external - CSS3[airm-secret-store
Points to OpenBao]:::external - CSS4[k8srealm-secret-store
For Keycloak]:::external - CSS5[fake-secret-store
Testing/Defaults]:::external - end - - %% Application ExternalSecrets - subgraph AppSecrets["Application ExternalSecrets"] - ES1[keycloak-credentials]:::app - ES2[airm-realm-credentials]:::app - ES3[k8s-realm-credentials]:::app - ES4[minio-tenant secrets]:::app - ES5[cnpg database credentials]:::app - ES6[rabbitmq credentials]:::app - end - - %% Applications - subgraph Apps["Applications"] - KC[Keycloak
Identity Provider]:::app - GITEA[Gitea
Git Server]:::app - MINIO[MinIO
Object Storage]:::app - CNPG[CloudNativePG
Databases]:::app - RABBIT[RabbitMQ
Message Queue]:::app - end - - %% Flow connections - Bootstrap --> BAO0 - Bootstrap --> K8sSecrets - - K8sSecrets --> UnsealAuto - UnsealAuto --> BAO0 - UnsealAuto --> BAO1 - UnsealAuto --> BAO2 - - BAO0 --> CSS1 - BAO0 --> CSS3 - BAO0 --> CSS4 - K8sSecrets --> CSS2 - - CSS1 -.Authenticates via.-> USERPASS - CSS1 -.Reads from.-> KV2 - - ESO --> CSS1 - ESO --> CSS2 - ESO --> CSS3 - ESO --> CSS4 - ESO --> CSS5 - - CSS1 --> AppSecrets - CSS2 --> AppSecrets - CSS3 --> AppSecrets - CSS4 --> AppSecrets - - AppSecrets --> KC - AppSecrets --> GITEA - AppSecrets --> MINIO - AppSecrets --> CNPG - AppSecrets --> RABBIT - - BAOUSER -.Contains credentials for.-> CSS1 - BAOKEYS -.Unseals.-> BAO0 - BAOKEYS -.Unseals.-> BAO1 - BAOKEYS -.Unseals.-> BAO2 - - %% Secret Generation Flow - RANDOM -.Generates.-> KV2 - RANDOM -.Generates.-> APIKEY -``` - -## Key Components - -### 1. OpenBao Vault Cluster - -**Deployment Model:** -- 3-node cluster in High Availability (HA) mode -- Raft integrated storage (no external dependencies) -- Each pod runs in `cf-openbao` namespace -- Auto-unseal via CronJob every 5 minutes - -**Configuration:** -```yaml -Storage: Raft integrated -UI: Enabled -Auth Methods: userpass -Secret Engines: - - secrets/ (KV v2) - Application secrets - - apikey-groups/ (KV v2) - API key management - - sys/tools/random - Password generation -``` - -### 2. Bootstrap Process - -**init-openbao.sh:** -1. Checks if OpenBao is already initialized -2. Initializes with key-shares=1, key-threshold=1 (single key setup) -3. Stores `root_token` and `unseal_key` in K8s secret `openbao-keys` -4. Unseals all 3 pods -5. Forms Raft cluster (pods join via HTTP) - -**setup-openbao.sh:** -1. Enables KV v2 engines at `secrets/` and `apikey-groups/` -2. Enables `userpass` authentication -3. Creates `read-policy` for read-only access -4. Creates `readonly-user` with read-only permissions -5. 
Stores readonly credentials in K8s secret `openbao-user` - -**manage-secrets.sh (NEW - Unified Secret Management):** -Replaces the old hardcoded `generate-secrets.sh` with a declarative, config-driven approach: -1. Reads secret definitions from `openbao-secret-definitions.yaml` ConfigMap -2. Supports two secret types: - - `static`: Fixed values with domain templating support (e.g., `{{ .Values.domain }}`) - - `random`: Generated using OpenBao's random tool with specified byte length -3. Uses format: `SECRET_PATH|TYPE|VALUE|BYTES` (e.g., `secrets/my-app-password|random||32`) -4. Idempotent operation - skips existing secrets, only creates missing ones -5. Handles domain templating with `envsubst` for static values -6. Special handling for `cluster-auth-openbao-token` in init mode -7. Used by both bootstrap process and ongoing CronJob management -8. Comprehensive error handling and progress reporting instead of generate-secrets.sh. - -### 3. Automated Unseal Mechanism - -**CronJob Configuration:** -- Schedule: Every 5 minutes (`*/5 * * * *`) -- Runs in `cf-openbao` namespace -- Service Account: `openbao-unseal-job-sa` -- Permissions: Get pods, exec into pods, read secrets - -**Unseal Logic:** -1. Retrieves `unseal_key` from `openbao-keys` secret -2. Finds all running OpenBao pods that are sealed -3. Executes `bao operator unseal` on each sealed pod -4. Handles pod restarts and cluster member changes - -### 4. Automated Secret Management System - -**Declarative Secret Definition System:** -- **Location**: `sources/openbao-config/0.1.0/templates/openbao-secret-definitions.yaml` -- **Format**: Structured ConfigMap with pipe-delimited entries: `SECRET_PATH|TYPE|VALUE|BYTES` -- **Configuration Management**: Deployed as Helm chart enabling GitOps-based secret management -- **Domain Templating**: Static values support `{{ .Values.domain }}` templating for environment-specific configuration - -**Secret Types Supported:** -1. 
**Static Secrets**: - - Format: `secrets/cluster-domain|static|{{ .Values.domain }}|0` - - Use case: Fixed values, URLs, domain references - - Supports Helm templating for dynamic values -2. **Random Secrets**: - - Format: `secrets/my-app-password|random||32` - - Use case: Generated passwords, tokens, API keys - - Byte length specified in fourth field - -**CronJob-Based Management:** -- **Schedule**: Every 5 minutes (`*/5 * * * *`) -- **Purpose**: Ensures all defined secrets exist in OpenBao without manual intervention -- **Behavior**: Idempotent - only creates missing secrets, skips existing ones -- **Template**: `sources/openbao-config/0.1.0/templates/openbao-secret-manager-cronjob.yaml` -- **Service Account**: `openbao-secret-manager-sa` with minimal required permissions -- **Timeout**: 5-minute active deadline with single retry on failure - -**Configuration Management Features:** -- **Checksum Annotations**: Forces pod recreation when ConfigMap changes -- **Resource Limits**: Memory: 256Mi, CPU: 500m for controlled resource usage -- **Environment Variables**: Domain templating via Helm values injection -- **Volume Mounts**: Scripts from `openbao-secret-manager-scripts`, config from `openbao-secrets-config` - -**Adding New Secrets Workflow:** -1. Edit `sources/openbao-config/0.1.0/templates/openbao-secret-definitions.yaml` -2. Add line following format: `secrets/my-app-password|random||32` -3. Commit and push to main branch -4. ArgoCD syncs the configuration within ~3 minutes -5. CronJob automatically creates the secret within ~5 minutes -6. 
Total time from commit to secret availability: ~8 minutes - -**For detailed user guide:** See [secret management user guide](secret-management-user-guide.md) for step-by-step instructions and examples - -**Examples from Current Configuration:** -``` -# Database credentials -secrets/airm-cnpg-user-password|random||16 - -# Static domain-based URLs -secrets/minio-openid-url|static|https://kc.{{ .Values.domain }}/realms/airm/.well-known/openid-configuration|0 - -# Fixed API keys -secrets/minio-api-access-key|static|api-default-user|0 -``` - -### 5. External Secrets Operator - -**Components:** -- **Controller**: Watches ExternalSecret resources and syncs from backends -- **Webhook**: Validates ExternalSecret/SecretStore resources -- **Cert Controller**: Manages TLS certificates for webhooks - -**ClusterSecretStore Types:** - -1. **openbao-secret-store** - - Provider: OpenBao (vault) - - Auth: UserPass (readonly-user) - - Path: secrets/ - - Used by: Most application secrets - -2. **k8s-secret-store** - - Provider: Kubernetes - - Auth: Service Account (external-secrets-readonly) - - Backend: cf-es-backend namespace - - Used by: Cross-namespace secret sharing - -3. **airm-secret-store** / **k8srealm-secret-store** - - Domain-specific stores for AIRM and K8s realm - - Point to OpenBao with specific paths - -4. **fake-secret-store** - - Provider: Fake (hardcoded values) - - Used for: Testing and default values - -### 6. Secret Flow Architecture - -```mermaid -flowchart TD - %% Styling - classDef bootstrap fill:#FFE066,stroke:#FFB800,stroke-width:3px,color:#000 - classDef vault fill:#4B8BBE,stroke:#306998,stroke-width:3px,color:#fff - classDef config fill:#9B59B6,stroke:#8E44AD,stroke-width:2px,color:#fff - classDef k8s fill:#326CE5,stroke:#00308F,stroke-width:2px,color:#fff - classDef app fill:#00D084,stroke:#00A86B,stroke-width:2px,color:#fff - classDef external fill:#FF6B6B,stroke:#C92A2A,stroke-width:2px,color:#fff - - %% Main flow - A[Bootstrap Script
1. Deploy openbao-config
2. Initialize OpenBao]:::bootstrap - B[OpenBao Vault Cluster
3 replicas
Unseals every 5min]:::vault - C[Automated Management
CronJob every 5min
- Reads config
- Creates missing
- Skips existing]:::config - D["Secret Definition
ConfigMap Helm
- Format: PATH|TYPE|...
- Domain templating
- GitOps managed"]:::config - E[KV v2 Engine
secrets/* in OpenBao]:::vault - F[ClusterSecretStore
openbao-secret-store]:::external - G[ExternalSecret
Resources]:::k8s - H[Application Pod
mounts secret]:::app - - %% Flow connections with labels - A -->|1. Deploys| B - A -->|1. Creates| D - D -->|3. Monitors definitions| C - C -->|2. Config-driven secret creation| B - B -->|4. Secrets stored| E - E -->|5. External Secrets reads| F - F -->|6. Sync to K8s| G - G -->|7. Creates K8s Secret| H - - %% Feedback loop - C -.->|Monitors ConfigMap| D -``` - -### 7. Secret Categories - -**Identity & Authentication:** -- Keycloak admin password -- OAuth client secrets (Gitea, ArgoCD, AIRM UI) -- Realm credentials (AIRM, K8s) - -**Database Credentials:** -- PostgreSQL superuser & user credentials (AIRM, Keycloak, Catalog) -- Generated via OpenBao random tool -- Managed by CloudNativePG operator - -**Storage & Messaging:** -- MinIO root password, API keys, console keys -- MinIO OpenID Connect URLs -- RabbitMQ user credentials - -**Cluster Infrastructure:** -- Cluster admin tokens -- OpenBao root token (stored in K8s) -- Domain configuration - -### 8. Security Model - -**Encryption at Rest:** -- OpenBao data encrypted in Raft storage -- Kubernetes secrets encrypted if cluster encryption is enabled - -**Access Control:** -- **Root Token**: Stored in K8s secret, used only during bootstrap -- **Readonly User**: Limited to read operations on secrets path -- **Service Accounts**: Scoped to specific namespaces - -**Network Security:** -- OpenBao accessible only within cluster (ClusterIP) -- TLS disabled for internal communication (cluster-internal) -- External Secrets uses internal service DNS - -**Secret Rotation:** -- OpenBao supports secret versioning (KV v2) -- Applications can reference specific versions -- Old versions retained for rollback - -### 9. Disaster Recovery - -**Backup Strategy:** -- OpenBao unseal key stored in `openbao-keys` K8s secret -- Root token stored in `openbao-keys` K8s secret -- Raft storage on persistent volumes - -**Recovery Process:** -1. Restore persistent volumes with Raft data -2. Deploy OpenBao pods -3. Unseal using stored unseal key -4. 
Verify cluster health via `bao operator raft list-peers` - -**Important Notes:** -- Single unseal key (key-shares=1) - simplified but less secure -- For production, use Shamir's Secret Sharing (key-shares=5, threshold=3) -- Consider auto-unseal with cloud KMS for production - -### 10. Integration Points - -**Gitea Configuration:** -- Admin credentials generated during bootstrap -- OAuth client secret from OpenBao -- Integrated with Keycloak via OIDC - -**Keycloak Realms:** -- Two realms: `airm` and `k8s` -- Client secrets managed in OpenBao -- Realm templates with placeholder substitution - -**CloudNativePG:** -- Superuser and application user credentials -- Secrets created before cluster bootstrap -- Automatic database initialization - -**MinIO Tenant:** -- Console and API credentials separate -- OIDC integration with Keycloak -- Auto-configured with OpenBao secrets - -## Monitoring & Observability - -**Health Checks:** -- OpenBao: `bao status` via exec probe -- External Secrets: Controller logs and metrics -- Secret Sync: ExternalSecret CR status conditions - -**Common Issues:** -- **Sealed Vault**: Check CronJob execution and unseal key -- **Secret Sync Failure**: Verify ClusterSecretStore authentication -- **Missing Secrets**: Check OpenBao path and ExternalSecret remoteRef - -## Best Practices - -1. **Never commit unseal keys or root tokens** to version control -2. **Rotate readonly user credentials** periodically -3. **Monitor ExternalSecret sync errors** for failed secret updates -4. **Use specific secret versions** in production for stability -5. **Test secret rotation** in staging before production -6. **Backup `openbao-keys` secret** to secure external location -7. **Enable audit logging** in OpenBao for compliance -8. 
**Use namespaced SecretStores** for tenant isolation when possible - -## Future Enhancements - -- [ ] Implement auto-unseal with cloud KMS -- [ ] Add secret rotation automation -- [ ] Enable OpenBao audit logging -- [ ] Implement Shamir's Secret Sharing (N-of-M keys) -- [ ] Add monitoring/alerting for unsealed state -- [ ] Integrate with cert-manager for TLS -- [ ] Add RBAC policies for fine-grained access -- [ ] Implement secret versioning strategy diff --git a/docs/values_inheritance_pattern.md b/docs/values_inheritance_pattern.md index 51fb529d..0e4a7642 100644 --- a/docs/values_inheritance_pattern.md +++ b/docs/values_inheritance_pattern.md @@ -2,111 +2,203 @@ ## Overview -ClusterForge implements a sophisticated GitOps deployment pattern that supports both external GitHub deployment and local cluster-native deployment through dual values files and repository configurations. +Cluster-Forge implements a sophisticated dual-repository GitOps deployment pattern that supports both external GitHub deployment and local cluster-native deployment through separate configuration and application repositories. 
## Two Deployment Modes -### External Mode (`values.yaml`) +### Local Mode (Default) ```yaml clusterForge: - repoUrl: "https://github.com/silogen/cluster-forge.git" - targetRevision: v1.7.1 - valuesFile: values.yaml + repoUrl: "http://gitea-http.cf-gitea.svc:3000/cluster-org/cluster-forge.git" + targetRevision: main externalValues: - enabled: false # Uses single external source + enabled: true # Uses multi-source pattern + repoUrl: "http://gitea-http.cf-gitea.svc:3000/cluster-org/cluster-values.git" + targetRevision: main ``` -**Purpose**: Traditional GitOps with external GitHub dependency -**Use Cases**: Initial deployment, CI/CD pipelines, production releases -**Network**: Requires external internet access +**Purpose**: Self-contained cluster-native GitOps with local Gitea +**Use Cases**: Air-gapped environments, autonomous operation, production deployments +**Network**: Self-contained within cluster network +**Features**: +- Local Gitea serves both cluster-forge and cluster-values repositories +- Initialization handled by gitea-init-job during bootstrap +- Zero external dependencies once bootstrapped +- Full configuration version control within cluster -### Local Mode (`values_cf.yaml`) +### External Mode ```yaml clusterForge: - repoUrl: "http://gitea-http.cf-gitea.svc:3000/cluster-org/cluster-forge.git" - targetRevision: main + repoUrl: "https://github.com/silogen/cluster-forge.git" + targetRevision: v1.8.0-rc2 externalValues: - enabled: true # Uses local multi-source - repoUrl: "http://gitea-http.cf-gitea.svc:3000/cluster-org/cluster-values.git" - targetRevision: main - path: values_cf.yaml + enabled: false # Single source from GitHub ``` -**Purpose**: Self-contained GitOps with local Gitea and separate configuration repository -**Use Cases**: Air-gapped environments, developer clusters, autonomous operation -**Network**: Self-contained within cluster network +**Purpose**: Traditional GitOps with external GitHub dependency +**Use Cases**: Initial deployment, 
CI/CD pipelines, feature branch testing +**Network**: Requires external internet access +**Features**: +- Direct GitHub access for application deployment +- Use `--dev` flag for feature branch development +- Supports custom branch selection for testing ## Size-Specific Inheritance -ClusterForge uses Helm's multi-values file support for cluster size configuration: +Cluster-Forge uses YAML merge semantics for cluster size configuration: ```bash -helm template -f values.yaml -f values_medium.yaml +# Bootstrap merges values using yq eval-all +yq eval-all '. as $item ireduce ({}; . * $item)' \ + values.yaml values_medium.yaml ``` ### Inheritance Hierarchy -1. **Base**: `values.yaml` or `values_cf.yaml` (52 common applications) +1. **Base**: `values.yaml` (common applications and defaults) 2. **Size Override**: `values_small.yaml`, `values_medium.yaml`, or `values_large.yaml` -3. **Runtime**: Domain and cluster-specific parameters +3. **External**: `cluster-values/values.yaml` from Gitea (when externalValues.enabled: true) +4. 
**Runtime**: Domain and cluster-specific parameters injected during bootstrap + +### DRY Principle in Size Files + +Size files only contain differences from base (Don't Repeat Yourself): + +**Base values.yaml**: +- Complete application definitions for all apps +- Alpha-sorted `enabledApps` list +- Common defaults applicable to all sizes + +**Size-specific values**: +- Only resource overrides that differ from base +- Size-specific enabledApps additions (e.g., storage policies) +- HA configurations for large clusters + +**Example**: +```yaml +# values_small.yaml - only differences +enabledApps: + - kyverno-policies-storage-local-path # Added to base list + +apps: + argocd: + valuesObject: + controller: + resources: + limits: + cpu: 2000m # Override from base + memory: 4Gi +``` + +| Cluster Size | Apps from Base | Additional Apps | Configuration Overrides | +|--------------|----------------|-----------------|------------------------| +| **Small** | All base apps | +1 (storage policy) | Minimal resources, single replicas | +| **Medium** | All base apps | +1 (storage policy) | Balanced resources, single replicas | +| **Large** | All base apps | +0 (no additions) | Production resources, OpenBao HA (3 replicas) | + +## Bootstrap and GitOps Workflow + +### Bootstrap Process + +The bootstrap script establishes the GitOps foundation: + +**Phase 1: Pre-Cleanup** +- Removes previous installations when applicable + +**Phase 2: GitOps Foundation Bootstrap** +1. ArgoCD deployment (helm template) +2. OpenBao deployment and initialization +3. 
Gitea deployment and initialization + - Creates cluster-org organization + - Clones cluster-forge from initial-cf-values ConfigMap + - Creates cluster-values repository + +**Phase 3: App-of-Apps Deployment** +- Creates cluster-forge Application in ArgoCD +- Uses multi-source when externalValues.enabled: true +- ArgoCD manages all remaining applications -### Size File Structure -- **Base files**: Complete application definitions and 52 enabledApps -- **Size files**: Only contain differences from base (DRY principle) -- **Large clusters**: No size file needed (inherit everything from base) +### Multi-Source GitOps Pattern -| Cluster Size | Apps from Base | Additional Apps | Total Apps | -|--------------|----------------|-----------------|-----------| -| **Small** | 52 (inherited) | +1 (storage policy) | **53 apps** | -| **Medium** | 52 (inherited) | +1 (storage policy) | **53 apps** | -| **Large** | 52 (inherited) | +0 (no additions) | **52 apps** | +When using local mode (`externalValues.enabled: true`), ArgoCD uses two separate repositories: -## Repository Transition Pattern +**Source 1: Application Source** (`cluster-forge`) +- Helm charts and manifests in `sources/` directory +- Application definitions in `root/` chart +- Component versions and configurations -### Bootstrap Workflow -1. **External Bootstrap**: Deploy from GitHub for initial setup -2. **Local Transition**: Switch to local Gitea for autonomous operation -3. **Developer Access**: Local Git workflows for cluster configuration -4. 
**Upstream Sync**: Periodic synchronization with main project +**Source 2: Configuration Source** (`cluster-values`) +- Custom `values.yaml` for environment-specific overrides +- Domain and cluster-specific settings +- Independent versioning from application code -### Multi-Source GitOps -When using `values_cf.yaml`, ArgoCD uses two separate repositories: -- **Application Source**: `cluster-org/cluster-forge` (Helm charts and manifests) -- **Configuration Source**: `cluster-org/cluster-values` (values.yaml customizations) +This separation enables: +- Different update cadences for infrastructure vs. configuration +- Easy configuration rollback without affecting application versions +- Clear ownership separation -This separation enables independent versioning of infrastructure vs. settings. +### Value Merge Order + +When ArgoCD renders applications with multi-source: + +1. **Base values** from `cluster-forge/root/values.yaml` +2. **Size-specific** from `cluster-forge/root/values_.yaml` +3. **External overrides** from `cluster-values/values.yaml` +4. 
**Runtime parameters** (domain, targetRevision) injected by bootstrap ## Developer Workflow -### Local Configuration Management +### Local Configuration Management (Local Mode) + ```bash -# Clone local configuration repository +# Clone local configuration repository from Gitea git clone http://gitea.cluster.example.com/cluster-org/cluster-values.git cd cluster-values # Modify cluster configurations -vim values_cf.yaml -git add values_cf.yaml +vim values.yaml +git add values.yaml git commit -m "Update cluster configuration" git push -# ArgoCD automatically deploys the changes +# ArgoCD automatically detects and syncs the changes +``` + +### Feature Branch Testing (External Mode with --dev) + +```bash +# Create feature branch in cluster-forge repository +git checkout -b feature/new-capability +# Make changes to applications or configurations +git commit -am "Add new capability" +git push origin feature/new-capability + +# Bootstrap with development mode +./scripts/bootstrap.sh dev.example.com --CLUSTER_SIZE=small --dev +# Script prompts for branch selection +# ArgoCD points directly to GitHub feature branch + +# Iterate: push changes to feature branch, ArgoCD syncs automatically ``` ### Configuration Version Control -- All cluster configuration changes tracked in Git history -- Pull request workflow for configuration reviews -- Automatic deployment through ArgoCD sync -- Rollback capabilities through Git revert + +Benefits of the dual-repository pattern: +- **Full Git history**: Track all cluster configuration changes +- **Pull request workflow**: Review configuration changes before deployment +- **Automatic deployment**: ArgoCD syncs on Git push +- **Rollback capabilities**: Revert via Git history +- **Separation of concerns**: Infrastructure code vs. environment configuration ## Benefits -1. **🎯 Deployment Flexibility**: External dependency → local autonomy transition +1. **🎯 Deployment Flexibility**: Support for both external and local GitOps modes 2. 
**🔄 Version Control**: Full Git history for all cluster configuration changes -3. **🛡️ Air-Gap Ready**: Works in secure, isolated environments -4. **👥 Developer Experience**: Local Git access for cluster configuration -5. **📦 Upstream Sync**: Can receive updates from main project +3. **🛡️ Air-Gap Ready**: Works in secure, isolated environments with local Gitea +4. **👥 Developer Experience**: Local Git access for cluster configuration management +5. **📦 Multi-Source Pattern**: Separate application code from configuration 6. **🔧 Maintainability**: DRY principle eliminates configuration redundancy +7. **🚀 Bootstrap Automation**: Single command establishes complete GitOps infrastructure -This architectural pattern enables clusters to evolve from external dependency to local autonomy while maintaining all benefits of declarative configuration management. \ No newline at end of file +This architectural pattern enables clusters to operate with full GitOps benefits while maintaining flexibility for different deployment scenarios from development to air-gapped production environments. \ No newline at end of file From ca9d0d9771c1280cabd3bedc5b9be0667390626f Mon Sep 17 00:00:00 2001 From: brownzebra Date: Tue, 17 Feb 2026 14:28:07 +0200 Subject: [PATCH 003/115] fix: resources in prd --- PRD.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/PRD.md b/PRD.md index 9afefc7f..fba259d0 100644 --- a/PRD.md +++ b/PRD.md @@ -264,13 +264,13 @@ VALUES=$(yq eval-all '. as $item ireduce ({}; . 
* $item)' \ **Size-Specific Behaviors:** -Small/Medium add storage policies: +Small/Medium are single-node and have storage class mutation policies: ```yaml enabledApps: - kyverno-policies-storage-local-path # RWX→RWO mutation for local-path ``` -Large enables HA components: +Large enables Multi-Node and HA components: ```yaml apps: openbao: @@ -441,7 +441,7 @@ Kueue manages scheduling for: ### Resource Requirements **Small Cluster:** -- 3-5 worker nodes +- single-node - 8 CPU, 16Gi RAM minimum per node - 250Gi+ total storage - Local-path or hostPath storage class From e703f6bf04861a368d7efc31536a2dc877a67f84 Mon Sep 17 00:00:00 2001 From: brownzebra Date: Wed, 18 Feb 2026 09:59:45 +0200 Subject: [PATCH 004/115] chore: removing dev mode from docs --- PRD.md | 22 ++--- README.md | 6 +- docs/bootstrap_guide.md | 130 +---------------------------- docs/values_inheritance_pattern.md | 18 ---- 4 files changed, 8 insertions(+), 168 deletions(-) diff --git a/PRD.md b/PRD.md index fba259d0..82cb7b8d 100644 --- a/PRD.md +++ b/PRD.md @@ -48,7 +48,6 @@ Cluster-Forge supports flexible GitOps repository configurations: **External Mode** - Traditional GitHub-based GitOps: - Set `clusterForge.repoUrl` to external GitHub repository -- Use `--dev` flag with bootstrap.sh to configure targetRevision for feature branch development - Supports custom branch selection for testing and development ### Size-Aware Configuration @@ -203,7 +202,7 @@ cluster-forge/ The bootstrap.sh script orchestrates complete cluster setup: ```bash -./scripts/bootstrap.sh [--CLUSTER_SIZE=small|medium|large] [--dev] +./scripts/bootstrap.sh [--CLUSTER_SIZE=small|medium|large] ``` **Bootstrap Process:** @@ -219,11 +218,6 @@ The bootstrap.sh script orchestrates complete cluster setup: 10. **ClusterForge App** - Creates root Application with merged values 11. 
**Cleanup** - Removes temporary values files -**Development Mode** (--dev flag): -- Prompts for git branch selection (current, custom, or abort) -- Sets targetRevision for ArgoCD applications -- Enables feature branch testing without committing to main - ### Self-Contained GitOps Once bootstrapped, the cluster is fully self-sufficient: @@ -436,27 +430,23 @@ Kueue manages scheduling for: - helm 3.0+ - kubectl - openssl (for password generation) -- git (for --dev mode) ### Resource Requirements **Small Cluster:** -- single-node -- 8 CPU, 16Gi RAM minimum per node +- single node - 250Gi+ total storage - Local-path or hostPath storage class **Medium Cluster:** -- 5-10 worker nodes -- 16 CPU, 32Gi RAM minimum per node +- single node - 500Gi+ total storage - Local-path or distributed storage **Large Cluster:** -- 10+ worker nodes -- 32 CPU, 64Gi RAM minimum per node +- multinode, HA / 3 node control plane - 1Ti+ total storage -- Distributed storage required (Longhorn, Ceph, etc.) +- Distributed storage required (Storage appliances / cloud / Longhorn, Ceph, etc.) ### Functional Requirements @@ -472,7 +462,6 @@ Kueue manages scheduling for: - Manage 40+ components as ArgoCD Applications - Support multi-source Applications for values separation - Enable local Gitea 12.3.0 for cluster-native GitOps -- Provide developer mode for branch-based testing **FR3: Size-Aware Deployment** - Support small/medium/large configurations via --CLUSTER_SIZE flag @@ -532,7 +521,6 @@ apps: # component values ``` 3. Add to enabledApps list -4. 
Test with --dev mode ### Custom Cluster Values diff --git a/README.md b/README.md index 1eea0360..3058fb03 100644 --- a/README.md +++ b/README.md @@ -19,7 +19,7 @@ Using a bootstrap-first deployment model, Cluster-Forge establishes GitOps infra ### Single-Command Deployment ```bash -./scripts/bootstrap.sh [--CLUSTER_SIZE=small|medium|large] [--dev] +./scripts/bootstrap.sh [--CLUSTER_SIZE=small|medium|large] ``` ### Size-Aware Deployment Examples @@ -32,9 +32,6 @@ Using a bootstrap-first deployment model, Cluster-Forge establishes GitOps infra # Large cluster (10s-100s users, enterprise scale) ./scripts/bootstrap.sh prod.example.com --CLUSTER_SIZE=large - -# Development mode (feature branch testing) -./scripts/bootstrap.sh dev.example.com --CLUSTER_SIZE=small --dev ``` For detailed deployment instructions, see the [Bootstrap Guide](docs/bootstrap_guide.md). @@ -68,7 +65,6 @@ Cluster-Forge uses a three-phase bootstrap process: **External Mode** - Traditional GitHub-based GitOps: - Points to external GitHub repository -- Use `--dev` flag for feature branch development - Supports custom branch selection for testing See [Values Inheritance Pattern](docs/values_inheritance_pattern.md) for detailed architecture. 
diff --git a/docs/bootstrap_guide.md b/docs/bootstrap_guide.md index 29ad3081..4b6670c6 100644 --- a/docs/bootstrap_guide.md +++ b/docs/bootstrap_guide.md @@ -10,12 +10,11 @@ This guide explains how to bootstrap a complete GitOps environment using Cluster - `helm` (3.0+) - `openssl` (for password generation) - `yq` (v4+) - - `git` (for --dev mode) ## Usage ```bash -./scripts/bootstrap.sh [--CLUSTER_SIZE=small|medium|large] [--dev] +./scripts/bootstrap.sh [--CLUSTER_SIZE=small|medium|large] ``` ### Arguments @@ -25,7 +24,6 @@ This guide explains how to bootstrap a complete GitOps environment using Cluster ### Options - **--CLUSTER_SIZE** `[small|medium|large]`: Cluster size configuration (default: `medium`) -- **--dev**: Enable development mode for feature branch testing - **--help**, **-h**: Show usage information ### Examples @@ -36,9 +34,6 @@ This guide explains how to bootstrap a complete GitOps environment using Cluster # Large cluster ./scripts/bootstrap.sh example.com --CLUSTER_SIZE=large - -# Development mode for feature branch testing -./scripts/bootstrap.sh dev.example.com --CLUSTER_SIZE=small --dev ``` ## How It Works @@ -58,7 +53,6 @@ The bootstrap script uses a three-phase deployment model: - Validates cluster size (small, medium, or large) - Merges base `values.yaml` with size-specific overrides `values_.yaml` - Sets `global.domain` and `global.clusterSize` in merged configuration -- In dev mode: prompts for git branch selection and configures targetRevision **2. 
Namespace Creation** Creates three namespaces for core components: @@ -99,7 +93,6 @@ Creates three namespaces for core components: - Creates `cluster-org` organization - Clones and pushes cluster-forge repository from initial-cf-values ConfigMap - Creates cluster-values repository with configuration - - In dev mode: sets targetRevision to selected branch ### Phase 3: App-of-Apps Deployment (ArgoCD-Managed) @@ -152,7 +145,7 @@ ClusterForge uses a layered configuration approach with YAML merge semantics: clusterForge: repoURL: http://gitea-http.cf-gitea.svc:3000/cluster-org/cluster-forge.git path: root - targetRevision: main # or feature branch in dev mode + targetRevision: main global: clusterSize: medium # Set by --CLUSTER_SIZE flag @@ -260,113 +253,10 @@ apps: kubectl -n keycloak get secret airm-devuser-credentials -o jsonpath="{.data.KEYCLOAK_INITIAL_DEVUSER_PASSWORD}" | base64 -d ``` -## Development Mode - -Development mode is designed for testing ClusterForge changes from a feature branch in a lightweight environment. Unlike production mode, dev mode uses a minimal configuration without local Gitea mirrors or full secret management infrastructure. - -### Key Differences from Production - -| Feature | Production Mode | Development Mode | -|---------|----------------|------------------| -| **Cluster Size** | Medium/Large recommended | **Small required** | -| **Repository Source** | Local Gitea mirrors | Direct GitHub access | -| **Gitea Deployment** | ✓ Deployed | ✗ **Not deployed** | -| **OpenBao Deployment** | ✓ Full deployment | ✗ **Minimal/not deployed** | -| **External Values** | ✓ Enabled (cluster-values repo) | ✗ Disabled (inline values) | -| **Resource Requirements** | Higher | **Minimal** | -| **Use Case** | Production clusters | Feature testing, development | - -### Prerequisites for Dev Mode - -1. **Small cluster configuration required** - Dev mode is optimized for resource-constrained environments -2. 
**GitHub access** - Direct connection to GitHub repository (no local Gitea mirror) -3. **Minimal infrastructure** - Skips Gitea and heavy secret management components - -### Enabling Development Mode - -**Note:** Dev mode must use small cluster size to work properly: - -```bash -./bootstrap.sh --dev dev.example.com --CLUSTER_SIZE=small -``` - -The script will: -1. Detect your current git branch -2. Prompt you to confirm using it or specify a custom branch: - ``` - Development mode enabled - ArgoCD will point to live GitHub repository - Current git branch: feature-xyz - - Use current branch 'feature-xyz' for targetRevision? [Y/n/custom_branch]: - ``` -3. Configure ClusterForge to point directly to GitHub (bypassing Gitea) -4. Set `externalValues.enabled: false` to use inline configuration -5. Deploy only ArgoCD and essential components - -### Development Workflow - -1. **Create feature branch:** - ```bash - git checkout -b feature-xyz - ``` - -2. **Make your changes** to apps, values, or configurations - -3. **Commit and push to GitHub:** - ```bash - git add . - git commit -m "Add new feature" - git push origin feature-xyz - ``` - -4. **Bootstrap cluster in dev mode (small cluster only):** - ```bash - ./bootstrap.sh --dev dev.example.com --CLUSTER_SIZE=small - # Confirm using 'feature-xyz' branch when prompted - ``` - -5. 
**Iterate:** Any subsequent changes pushed to `feature-xyz` will automatically sync to the cluster directly from GitHub via ArgoCD - -### Configuration for Dev Mode - -In dev mode, the cluster-forge Application uses a single-source configuration: - -```yaml -externalValues: - enabled: false # No separate cluster-values repo - -clusterForge: - repoUrl: "https://github.com/silogen/cluster-forge.git" # Direct GitHub access - targetRevision: feature-xyz # Your feature branch -``` - -This bypasses the need for: -- Local Gitea deployment and mirrors -- Separate cluster-values repository -- Full OpenBao secret management infrastructure - -### When to Use Dev Mode - -**Use dev mode when:** -- Testing new features or configurations -- Developing on resource-constrained clusters -- Rapid iteration without infrastructure overhead -- Working with feature branches before merging - -**Use production mode when:** -- Deploying to production environments -- Requiring full secret management (OpenBao) -- Needing local Git mirrors for air-gapped scenarios -- Medium/Large cluster configurations - ## Troubleshooting -**Note:** Some troubleshooting steps below only apply to production mode deployments that include Gitea and OpenBao. - ### Bootstrap Fails at Gitea Init -*Production mode only* - If the Gitea initialization job fails during repository migration: ```bash @@ -449,22 +339,6 @@ global: myCustomValue: "something" ``` -### Development Mode (without Gitea) - -In development mode, there is no local Gitea repository. Customization is done by: - -1. **Editing values files** in your local cluster-forge repository -2. **Committing changes** to your feature branch -3. **Pushing to GitHub:** - ```bash - git add root/values.yaml # or values_small.yaml - git commit -m "Update configuration" - git push origin feature-xyz - ``` -4. 
**ArgoCD syncs automatically** from GitHub - -Since `externalValues.enabled: false` in dev mode, all configuration is in the main values files (`values.yaml`, `values_small.yaml`). - ## File Cleanup The bootstrap script automatically cleans up temporary files at the end: diff --git a/docs/values_inheritance_pattern.md b/docs/values_inheritance_pattern.md index 0e4a7642..131a26e6 100644 --- a/docs/values_inheritance_pattern.md +++ b/docs/values_inheritance_pattern.md @@ -42,7 +42,6 @@ externalValues: **Network**: Requires external internet access **Features**: - Direct GitHub access for application deployment -- Use `--dev` flag for feature branch development - Supports custom branch selection for testing ## Size-Specific Inheritance @@ -165,23 +164,6 @@ git push # ArgoCD automatically detects and syncs the changes ``` -### Feature Branch Testing (External Mode with --dev) - -```bash -# Create feature branch in cluster-forge repository -git checkout -b feature/new-capability -# Make changes to applications or configurations -git commit -am "Add new capability" -git push origin feature/new-capability - -# Bootstrap with development mode -./scripts/bootstrap.sh dev.example.com --CLUSTER_SIZE=small --dev -# Script prompts for branch selection -# ArgoCD points directly to GitHub feature branch - -# Iterate: push changes to feature branch, ArgoCD syncs automatically -``` - ### Configuration Version Control Benefits of the dual-repository pattern: From ddc7761b7d557693326bdf73d5d6ae6d919de2e5 Mon Sep 17 00:00:00 2001 From: brownzebra Date: Wed, 18 Feb 2026 10:14:07 +0200 Subject: [PATCH 005/115] fix: bump airm version to 0.3.3 --- sources/airm/0.3.3/.helmignore | 27 ++ sources/airm/0.3.3/Chart.yaml | 35 ++ .../airm/0.3.3/charts/airm-api/.helmignore | 27 ++ sources/airm/0.3.3/charts/airm-api/Chart.yaml | 29 ++ sources/airm/0.3.3/charts/airm-api/README.md | 124 ++++++ .../0.3.3/charts/airm-api/files/configure.sh | 374 ++++++++++++++++++ 
.../charts/airm-api/templates/_helpers.tpl | 15 + .../airm-api/templates/airm-app-backend.yaml | 268 +++++++++++++ .../airm-api/templates/airm-app-frontend.yaml | 85 ++++ .../airm-api/templates/airm-cert-issuer.yaml | 26 ++ .../templates/airm-cluster-roles.yaml | 34 ++ .../airm-cluster-runtime-config.yaml | 19 + .../airm-api/templates/airm-cluster.yaml | 47 +++ .../templates/airm-configure-job.yaml | 172 ++++++++ .../charts/airm-api/templates/airm-es.yaml | 215 ++++++++++ .../airm-api/templates/airm-httproute.yaml | 81 ++++ .../templates/airm-rabbitmq-cluster.yaml | 69 ++++ .../templates/airm-vllm-collector.yaml | 93 +++++ .../airm/0.3.3/charts/airm-api/values.yaml | 166 ++++++++ .../0.3.3/charts/airm-dispatcher/.helmignore | 27 ++ .../0.3.3/charts/airm-dispatcher/Chart.yaml | 29 ++ .../0.3.3/charts/airm-dispatcher/README.md | 54 +++ .../templates/airm-cluster-policies.yaml | 352 +++++++++++++++++ .../templates/airm-cluster-roles.yaml | 164 ++++++++ .../templates/airm-dispatcher-app.yaml | 343 ++++++++++++++++ .../templates/kyverno-cluster-role.yaml | 35 ++ .../0.3.3/charts/airm-dispatcher/values.yaml | 27 ++ sources/airm/0.3.3/values.yaml | 3 + 28 files changed, 2940 insertions(+) create mode 100644 sources/airm/0.3.3/.helmignore create mode 100644 sources/airm/0.3.3/Chart.yaml create mode 100644 sources/airm/0.3.3/charts/airm-api/.helmignore create mode 100644 sources/airm/0.3.3/charts/airm-api/Chart.yaml create mode 100644 sources/airm/0.3.3/charts/airm-api/README.md create mode 100644 sources/airm/0.3.3/charts/airm-api/files/configure.sh create mode 100644 sources/airm/0.3.3/charts/airm-api/templates/_helpers.tpl create mode 100644 sources/airm/0.3.3/charts/airm-api/templates/airm-app-backend.yaml create mode 100644 sources/airm/0.3.3/charts/airm-api/templates/airm-app-frontend.yaml create mode 100644 sources/airm/0.3.3/charts/airm-api/templates/airm-cert-issuer.yaml create mode 100644 sources/airm/0.3.3/charts/airm-api/templates/airm-cluster-roles.yaml create 
mode 100644 sources/airm/0.3.3/charts/airm-api/templates/airm-cluster-runtime-config.yaml create mode 100644 sources/airm/0.3.3/charts/airm-api/templates/airm-cluster.yaml create mode 100644 sources/airm/0.3.3/charts/airm-api/templates/airm-configure-job.yaml create mode 100644 sources/airm/0.3.3/charts/airm-api/templates/airm-es.yaml create mode 100644 sources/airm/0.3.3/charts/airm-api/templates/airm-httproute.yaml create mode 100644 sources/airm/0.3.3/charts/airm-api/templates/airm-rabbitmq-cluster.yaml create mode 100644 sources/airm/0.3.3/charts/airm-api/templates/airm-vllm-collector.yaml create mode 100644 sources/airm/0.3.3/charts/airm-api/values.yaml create mode 100644 sources/airm/0.3.3/charts/airm-dispatcher/.helmignore create mode 100644 sources/airm/0.3.3/charts/airm-dispatcher/Chart.yaml create mode 100644 sources/airm/0.3.3/charts/airm-dispatcher/README.md create mode 100644 sources/airm/0.3.3/charts/airm-dispatcher/templates/airm-cluster-policies.yaml create mode 100644 sources/airm/0.3.3/charts/airm-dispatcher/templates/airm-cluster-roles.yaml create mode 100644 sources/airm/0.3.3/charts/airm-dispatcher/templates/airm-dispatcher-app.yaml create mode 100644 sources/airm/0.3.3/charts/airm-dispatcher/templates/kyverno-cluster-role.yaml create mode 100644 sources/airm/0.3.3/charts/airm-dispatcher/values.yaml create mode 100644 sources/airm/0.3.3/values.yaml diff --git a/sources/airm/0.3.3/.helmignore b/sources/airm/0.3.3/.helmignore new file mode 100644 index 00000000..64c2af83 --- /dev/null +++ b/sources/airm/0.3.3/.helmignore @@ -0,0 +1,27 @@ +# Copyright © Advanced Micro Devices, Inc., or its affiliates. +# +# SPDX-License-Identifier: MIT + +# Patterns to ignore when building packages. +# This supports shell glob matching, relative path matching, and +# negation (prefixed with !). Only one pattern per line. 
+.DS_Store +# Common VCS dirs +.git/ +.gitignore +.bzr/ +.bzrignore +.hg/ +.hgignore +.svn/ +# Common backup files +*.swp +*.bak +*.tmp +*.orig +*~ +# Various IDEs +.project +.idea/ +*.tmproj +.vscode/ diff --git a/sources/airm/0.3.3/Chart.yaml b/sources/airm/0.3.3/Chart.yaml new file mode 100644 index 00000000..ba5cf2b4 --- /dev/null +++ b/sources/airm/0.3.3/Chart.yaml @@ -0,0 +1,35 @@ +# Copyright © Advanced Micro Devices, Inc., or its affiliates. +# +# SPDX-License-Identifier: MIT + +--- +apiVersion: v2 +name: airm +description: A Helm chart for AIRM full stack, including API, UI and dispatcher + +# A chart can be either an 'application' or a 'library' chart. +# +# Application charts are a collection of templates that can be packaged into versioned archives +# to be deployed. +# +# Library charts provide useful utilities or functions for the chart developer. They're included as +# a dependency of application charts to inject those utilities and functions into the rendering +# pipeline. Library charts do not define any templates and therefore cannot be deployed. +type: application + +# This is the chart version. This version number should be incremented each time you make changes +# to the chart and its templates, including the app version. +# Versions are expected to follow Semantic Versioning (https://semver.org/) +version: 0.3.3 + +# This is the version number of the application being deployed. This version number should be +# incremented each time you make changes to the application. Versions are not expected to +# follow Semantic Versioning. They should reflect the version the application is using. +# It is recommended to use it with quotes. 
+appVersion: "1.0.0" + +dependencies: + - name: airm-api + version: 0.3.3 + - name: airm-dispatcher + version: 0.3.3 diff --git a/sources/airm/0.3.3/charts/airm-api/.helmignore b/sources/airm/0.3.3/charts/airm-api/.helmignore new file mode 100644 index 00000000..64c2af83 --- /dev/null +++ b/sources/airm/0.3.3/charts/airm-api/.helmignore @@ -0,0 +1,27 @@ +# Copyright © Advanced Micro Devices, Inc., or its affiliates. +# +# SPDX-License-Identifier: MIT + +# Patterns to ignore when building packages. +# This supports shell glob matching, relative path matching, and +# negation (prefixed with !). Only one pattern per line. +.DS_Store +# Common VCS dirs +.git/ +.gitignore +.bzr/ +.bzrignore +.hg/ +.hgignore +.svn/ +# Common backup files +*.swp +*.bak +*.tmp +*.orig +*~ +# Various IDEs +.project +.idea/ +*.tmproj +.vscode/ diff --git a/sources/airm/0.3.3/charts/airm-api/Chart.yaml b/sources/airm/0.3.3/charts/airm-api/Chart.yaml new file mode 100644 index 00000000..53989d09 --- /dev/null +++ b/sources/airm/0.3.3/charts/airm-api/Chart.yaml @@ -0,0 +1,29 @@ +# Copyright © Advanced Micro Devices, Inc., or its affiliates. +# +# SPDX-License-Identifier: MIT + +--- +apiVersion: v2 +name: airm-api +description: A Helm chart for AIRM API and UI + +# A chart can be either an 'application' or a 'library' chart. +# +# Application charts are a collection of templates that can be packaged into versioned archives +# to be deployed. +# +# Library charts provide useful utilities or functions for the chart developer. They're included as +# a dependency of application charts to inject those utilities and functions into the rendering +# pipeline. Library charts do not define any templates and therefore cannot be deployed. +type: application + +# This is the chart version. This version number should be incremented each time you make changes +# to the chart and its templates, including the app version. 
+# Versions are expected to follow Semantic Versioning (https://semver.org/) +version: 0.3.3 + +# This is the version number of the application being deployed. This version number should be +# incremented each time you make changes to the application. Versions are not expected to +# follow Semantic Versioning. They should reflect the version the application is using. +# It is recommended to use it with quotes. +appVersion: "1.0.0" diff --git a/sources/airm/0.3.3/charts/airm-api/README.md b/sources/airm/0.3.3/charts/airm-api/README.md new file mode 100644 index 00000000..a16ec9da --- /dev/null +++ b/sources/airm/0.3.3/charts/airm-api/README.md @@ -0,0 +1,124 @@ + + +# HELM CHARTS + +Simple instructions to deploy AIRM UI and API applications using helm chart + +### 1. Requirements + +The following external components must be available in the Kubernetes cluster before the helm chart can be installed. + +- API Gateway implementation (e.g. KGateway) +- Keycloak with the expected `airm` realm installed +- Valid S3 compatible file storage service (e.g. MinIO) +- RabbitMQ operator +- Cert Manager operator +- External Secret operator +- CNPG operator +- OTEL LGTM stack installed on the cluster + +### 2. Install + +``` +cd helm/airm/charts + +# 1. Create output template just to validate (the public domain could be app-dev.silogen.ai, staging.silogen.ai, etc.) +helm template airm-api ./airm-api -n airm --create-namespace --set airm.appDomain= > airm-api-helm-generated.yaml + +# 2. Run chart install +helm install airm-api ./airm-api -n airm --create-namespace --set airm.appDomain= + +# 3. Delete chart if needed +helm delete airm-api -n airm + +# 4. Upgrade when bumping versions +helm upgrade -n airm --set airm.appDomain= airm-api ./airm-api +``` + +--- + +### 3. 
Helm Settings + +| Field Path | Description | Type | Example / Default | +|-------------------------------------------------------------------------------|-----------------------------------------------------------------| ------ |---------------------------------------------------------------------------------------------------| +| secretgenerator.image.repository | Docker image repository for secret generator | string | `ghcr.io/silogen/kubectl` | +| secretgenerator.image.tag | Docker image tag | string | `latest` | +| secretgenerator.image.pullPolicy | Image pull policy | string | `IfNotPresent` | +| kgateway.namespace | Namespace for kgateway resources | string | `kgateway-system` | +| kgateway.gatewayName | Gateway name | string | `https` | +| kgateway.airmapi.servicePort | Service port for airmapi | int | `80` | +| kgateway.airmapi.prefixValue | URL prefix for airmapi service | string | `airmapi` | +| kgateway.airmui.servicePort | Service port for airmui | int | `80` | +| kgateway.airmui.prefixValue | URL prefix for airmui service | string | `airmui` | +| aims.otelCollector.exporters.otlphttp.endpoint | Open Telemetry collector endpoint url for inference metrics | string | `http://lgtm-stack.otel-lgtm-stack.svc:4318` | +| aims.otelCollector.image | Base image for Open Telemetry Collector | string | `ghcr.io/open-telemetry/opentelemetry-collector-releases/opentelemetry-collector-contrib:0.113.0` | +| aims.otelCollector.receivers.prometheus.config.scrape_configs.scrape_interval | Inference metrics scraping interval | string | `20s` | +| airm.includeDemoSetup | Include the demo organization and project setup when installing | bool | `true` | +| airm.appDomain | Public IP or domain for airm | string | `PUBLIC-IP` | +| airm.externalSecretStore.airm.name | Secret store name for airm | string | `airm-secret-store` | +| airm.externalSecretStore.minio.name | Secret store name for minio | string | `k8s-secret-store` | +| airm.externalSecretStore.keycloak.name | Secret 
store name for keycloak | string | `keycloak-secret-store` | +| airm.keycloak.publicUrl | Public URL to access keycloak | string | `https://kc.{{ .Values.airm.appDomain }}` | +| airm.keycloak.internalUrl | Internal URL to access keycloak | string | `http://keycloak.keycloak.svc.cluster.local:8080` | +| airm.keycloak.clientId | Client ID to access keycloak | string | `354a0fa1-35ac-4a6d-9c4d-d661129c2cd0` | +| airm.keycloak.realm | Keycloak realm for authentication | string | `airm` | +| airm.postgresql.cnpg.image | PostgreSQL container image | string | `ghcr.io/cloudnative-pg/postgresql:17` | +| airm.postgresql.cnpg.instance | Number of PostgreSQL instances | int | `1` | +| airm.postgresql.cnpg.resources.limits.cpu | CPU limit for PostgreSQL container | string | `"2"` | +| airm.postgresql.cnpg.resources.limits.memory | Memory limit for PostgreSQL container | string | `1Gi` | +| airm.postgresql.cnpg.resources.requests.cpu | CPU request for PostgreSQL container | string | `"1"` | +| airm.postgresql.cnpg.resources.requests.memory | Memory request for PostgreSQL container | string | `512Mi` | +| airm.postgresql.cnpg.storage.size | Storage size for PostgreSQL | string | `50Gi` | +| airm.postgresql.cnpg.storage.storageClass | Storage class for PostgreSQL | string | `default` | +| airm.postgresql.cnpg.walStorage.size | WAL storage size for PostgreSQL | string | `50Gi` | +| airm.postgresql.cnpg.walStorage.storageClass | WAL storage class for PostgreSQL | string | `default` | +| airm.rabbitmq.replicas | Number of replicas for the RabbitMQ cluster | int | `1` | +| airm.rabbitmq.resources.limits.cpu | CPU limit for for the RabbitMQ cluster | string | `1` | +| airm.rabbitmq.resources.limits.memory | Memory limit for for the RabbitMQ cluster | string | `1Gi` | +| airm.rabbitmq.resources.requests.cpu | CPU request for the RabbitMQ cluster | string | `500m` | +| airm.rabbitmq.resources.requests.memory | Memory request for the RabbitMQ cluster | string | `1Gi` | +| 
airm.rabbitmq.persistence.storage | Persistent storage size for the RabbitMQ cluster | string | `20Gi` | +| airm.rabbitmq.persistence.storageClassName | Storage class name for the RabbitMQ cluster | string | `default` | +| airm.rabbitmq.backup.enabled | Enable RabbitMQ backup | bool | `false` | +| airm.rabbitmq.backup.image | RabbitMQ backup container image | string | `amdenterpriseai/rabbitmq-backup:0.1` | +| airm.rabbitmq.backup.resources.limits.memory | Memory limit for cron job of RabbitMQ backup | string | `512Mi` | +| airm.rabbitmq.backup.resources.requests.cpu | CPU request for cron job of RabbitMQ backup | string | `250m` | +| airm.rabbitmq.backup.resources.requests.memory | Memory request for cron job of RabbitMQ backup | string | `256Mi` | +| airm.frontend.image.repository | Frontend image repository | string | `amdenterpriseai/airm-ui` | +| airm.frontend.image.tag | Frontend image tag | string | `v2025.08-rc.21` | +| airm.frontend.image.pullPolicy | Frontend image pull policy | string | `IfNotPresent` | +| airm.frontend.servicePort | Frontend service port | int | `80` | +| airm.frontend.resources.limits.memory | Memory limit for frontend | string | `4Gi` | +| airm.frontend.resources.requests.cpu | CPU request for frontend | string | `500m` | +| airm.frontend.resources.requests.memory | Memory request for frontend | string | `4Gi` | +| airm.backend.image.repository | Backend API image repository | string | `amdenterpriseai/airm-api` | +| airm.backend.image.tag | Backend API image tag | string | `v2025.08-rc.21` | +| airm.backend.image.pullPolicy | Backend API image pull policy | string | `IfNotPresent` | +| airm.backend.servicePort | Backend API service port | int | `80` | +| airm.backend.servicePortMetrics | Backend API metrics service port | int | `9009` | +| airm.backend.env.dbPort | Database port | int | `5432` | +| airm.backend.env.rabbitmqPort | RabbitMQ port | int | `5672` | +| airm.backend.env.minioUrl | Minio service URL | string | 
`http://minio.minio-tenant-default.svc.cluster.local:80` | +| airm.backend.env.minioBucket | Minio bucket name | string | `default-bucket` | +| airm.backend.env.prometheusUrl | Prometheus service URL | string | `http://lgtm-stack.otel-lgtm-stack.svc.cluster.local:9090` | +| airm.backend.env.clusterAuthUrl | Cluster auth service URL | string | `http://cluster-auth.cluster-auth.svc.cluster.local:8081` | +| airm.backend.resources.limits.memory | Memory limit for backend API container | string | `1Gi` | +| airm.backend.resources.requests.cpu | CPU request for backend API container | string | `500m` | +| airm.backend.resources.requests.memory | Memory request for backend API container | string | `1Gi` | +| airm.backend.securityContext.allowPrivilegeEscalation | Security context: allow privilege escalation | bool | `false` | +| airm.backend.securityContext.runAsNonRoot | Security context: run container as non-root | bool | `true` | +| airm.backend.securityContext.runAsUser | Security context: user ID to run container as | int | `1000` | +| airm.backend.securityContext.seccompProfile.type | Security context: seccomp profile type | string | `RuntimeDefault` | +| airm.utilities.netcat.image.repository | Netcat image repository | string | `busybox` | +| airm.utilities.netcat.image.tag | Netcat image tag | string | `1.37.0` | +| airm.utilities.netcat.image.pullPolicy | Netcat image pull policy | string | `IfNotPresent` | +| airm.utilities.curl.image.repository | Curl image repository | string | `curlimages/curl` | +| airm.utilities.curl.image.tag | Curl image tag | string | `8.16.0` | +| airm.utilities.curl.image.pullPolicy | Curl image pull policy | string | `IfNotPresent` | +| airm.utilities.liquibase.image.repository | Liquibase image repository | string | `docker.io/liquibase/liquibase` | +| airm.utilities.liquibase.image.tag | Liquibase image tag | string | `4.31` | +| airm.utilities.liquibase.image.pullPolicy | Liquibase image pull policy | string | `IfNotPresent` | 
diff --git a/sources/airm/0.3.3/charts/airm-api/files/configure.sh b/sources/airm/0.3.3/charts/airm-api/files/configure.sh new file mode 100644 index 00000000..69a3f59d --- /dev/null +++ b/sources/airm/0.3.3/charts/airm-api/files/configure.sh @@ -0,0 +1,374 @@ +#!/bin/bash + +# Copyright © Advanced Micro Devices, Inc., or its affiliates. +# +# SPDX-License-Identifier: MIT + +##################################################################################### +echo "" +echo "Run configure script block..." +echo "" + +# --- Configuration Variables --- +# Get values from bloom configmap mounted as env + +# NOTE: ORG_NAME is hardcoded to demo because gpu operator metrics has same org name hardcoded there +# Otherwise the following line can be uncommented to consider the real org name from domain config +# ORG_NAME=$(echo $NEW_DOMAIN_NAME | awk -F '.' '{ print $2 }') +ORG_NAME="demo" +ORG_DOMAINS="[\"${NEW_DOMAIN_NAME}\"]" +CLUSTER_WORKLOADS_BASE_URL="https://workspaces.${NEW_DOMAIN_NAME}/" +CLUSTER_KUBE_API_URL="https://k8s.${NEW_DOMAIN_NAME}" +USER_EMAIL="devuser@${NEW_DOMAIN_NAME}" +PROJECT_NAME="demo" +PROJECT_DESCRIPTION="demo" +CLUSTER_NAME="demo-cluster" +TIMEOUT=300 +SLEEP_INTERVAL=5 + +# --- Input Validation --- +echo "Validating environment variables..." +echo "KEYCLOAK_CLIENT_ID: ${KEYCLOAK_CLIENT_ID}" +echo "NEW_DOMAIN_NAME: ${NEW_DOMAIN_NAME}" +echo "AIRM_API_URL: ${AIRM_API_URL}" + +function check_env_variable() { + if [ -z "${!1}" ]; then + echo "ERROR: $1 environment variable is not set." 
+ exit 1 + fi +} + +function check_success() { + if [ "$1" -ne 0 ]; then + echo "ERROR: $2" + exit 1 + fi +} + +check_env_variable "AIRM_API_URL" +check_env_variable "KEYCLOAK_URL" +check_env_variable "KEYCLOAK_REALM" +check_env_variable "KEYCLOAK_CLIENT_SECRET" +check_env_variable "KEYCLOAK_CLIENT_ID" +check_env_variable "KEYCLOAK_ADMIN_CLIENT_ID" +check_env_variable "KEYCLOAK_ADMIN_CLIENT_SECRET" +check_env_variable "USER_PASSWORD" + +function refresh_token() { + TOKEN=$(curl -s -d "client_id=${KEYCLOAK_CLIENT_ID}" -d "username=${USER_EMAIL}" -d "password=${USER_PASSWORD}" -d 'grant_type=password' -d "client_secret=${KEYCLOAK_CLIENT_SECRET}" "${KEYCLOAK_URL}/realms/${KEYCLOAK_REALM}/protocol/openid-connect/token" | jq -r '.access_token') + if [ -z "$TOKEN" ] || [ "$TOKEN" == "null" ]; then + echo "ERROR: Failed to obtain access token from Keycloak." + exit 1 + fi +} + +function create_org() { + # Try to get ORG_ID by name + ORG_ID=$(curl -s -X GET "${AIRM_API_URL}/v1/organizations" \ + -H "Authorization: Bearer ${TOKEN}" \ + -H 'Content-Type: application/json' | jq -r --arg name "$ORG_NAME" '.organizations[] | select(.name==$name) | .id') + + # If not found, create the org and fetch the ID again + if [ -z "$ORG_ID" ] || [ "$ORG_ID" == "null" ]; then + ORG_RESP=$(curl -s -o /dev/null -X POST -w "%{http_code}" "${AIRM_API_URL}/v1/organizations" \ + -H "Authorization: Bearer ${TOKEN}" \ + -H 'Content-Type: application/json' \ + -d "{ \"name\": \"$ORG_NAME\", \"domains\": $ORG_DOMAINS }") + echo "$ORG_RESP" + check_success "$([[ "$ORG_RESP" == "200" || "$ORG_RESP" == "201" ]] && echo 0 || echo 1)" "Failed to create organization" + + ORG_ID=$(curl -s -X GET "${AIRM_API_URL}/v1/organizations" \ + -H "Authorization: Bearer ${TOKEN}" \ + -H 'Content-Type: application/json' | jq -r --arg name "$ORG_NAME" '.organizations[] | select(.name==$name) | .id') + fi + + if [ -z "$ORG_ID" ] || [ "$ORG_ID" == "null" ]; then + echo "ERROR: Failed to create or retrieve organization 
ID." + exit 1 + else + echo "ORG_ID=${ORG_ID}" + fi +} + +function add_user_to_org() { + # Check if user exists in org + USER_EXISTS=$(curl -s -X GET "${AIRM_API_URL}/v1/users" -H 'accept: application/json' -H "Authorization: Bearer ${TOKEN}" -H 'Content-Type: application/json' | jq -r --arg email "$USER_EMAIL" '.data? // [] | .[] | select(.email==$email) | .email') + # Add user to org if they don't exist + if [ -z "$USER_EXISTS" ] || [ "$USER_EXISTS" == "null" ]; then + echo "$USER_EXISTS" + echo "User '$USER_EMAIL' not found in organization. Adding..." + ADD_USER_RESP=$(curl -w "%{http_code}" -X 'POST' "${AIRM_API_URL}/v1/organizations/${ORG_ID}/users" -H 'accept: application/json' -H "Authorization: Bearer ${TOKEN}" -H 'Content-Type: application/json' -d '{ "email": "'"$USER_EMAIL"'", "roles": ["Platform Administrator"]}') + echo "$ADD_USER_RESP" + check_success "$([[ "$ADD_USER_RESP" == "200" || "$ADD_USER_RESP" == "201" || "$ADD_USER_RESP" == "null201" ]] && echo 0 || echo 1)" "Failed to add user to organization" + else + echo "User '$USER_EMAIL' already exists in organization." + fi +} + +function create_project() { + PROJECT_ID=$(curl -s -X GET "${AIRM_API_URL}/v1/projects" -H 'accept: application/json' -H "Authorization: Bearer ${TOKEN}" | jq -r '.projects[] | select(.name=="'$PROJECT_NAME'") | .id') + + for (( i=0; i<=TIMEOUT; i+=SLEEP_INTERVAL )); do + CLUSTER_STATUS=$(curl -s -X GET "${AIRM_API_URL}/v1/clusters/$CLUSTER_ID" \ + -H "Authorization: Bearer ${TOKEN}" \ + -H 'Content-Type: application/json' | jq -r '.status') + + if [ "$CLUSTER_STATUS" == "healthy" ]; then + echo "Cluster is healthy!" + break # Exit the loop if the cluster is healthy + fi + echo "Cluster status: $CLUSTER_STATUS. Waiting $SLEEP_INTERVAL seconds... ($i/$TIMEOUT seconds elapsed)" + sleep $SLEEP_INTERVAL + done + + if [ "$CLUSTER_STATUS" != "healthy" ]; then + echo "ERROR: Cluster did not become healthy within $TIMEOUT seconds." 
+ exit 1 + fi + + if [ -z "$PROJECT_ID" ] || [ "$PROJECT_ID" == "null" ]; then + echo "Projects '$PROJECT_NAME' not found. Creating..." + PROJECT_ID=$(curl -X 'POST' \ + "${AIRM_API_URL}/v1/projects" \ + -H 'accept: application/json' \ + -H "Authorization: Bearer ${TOKEN}" \ + -H 'Content-Type: application/json' \ + -d '{ + "name": "'"$PROJECT_NAME"'", + "description": "'"$PROJECT_DESCRIPTION"'", + "cluster_id": "'"$CLUSTER_ID"'", + "quota": { + "cpu_milli_cores": 0, + "memory_bytes": 0, + "ephemeral_storage_bytes": 0, + "gpu_count": 0 + } + }' | jq -r '.id') + echo "$PROJECT_ID" + check_success "$([[ "$PROJECT_ID" != "null" ]] && echo 0 || echo 1)" "Failed to create project" + else + echo "Project '$PROJECT_NAME' already exists with ID: $PROJECT_ID" + fi +} + +function add_minio_secret_and_storage_to_project() { + for (( i=0; i<=TIMEOUT; i+=SLEEP_INTERVAL )); do + PROJECT_STATUS=$(curl -s -X GET "${AIRM_API_URL}/v1/projects/$PROJECT_ID" \ + -H "Authorization: Bearer ${TOKEN}" \ + -H 'Content-Type: application/json' | jq -r '.status') + + if [ "$PROJECT_STATUS" == "Ready" ]; then + echo "Project is ready!" + break # Exit the loop if the project is ready + fi + echo "Project status: $PROJECT_STATUS. Waiting $SLEEP_INTERVAL seconds... ($i/$TIMEOUT seconds elapsed)" + sleep $SLEEP_INTERVAL + done + + SECRET_NAME="minio-credentials-fetcher" + STORAGE_NAME="minio-storage" + + SECRET_IN_PROJECT=$(curl -X 'GET' \ + "${AIRM_API_URL}/v1/projects/${PROJECT_ID}/secrets" \ + -H 'accept: application/json' \ + -H "Authorization: Bearer ${TOKEN}" | jq -r '.project_secrets[] | select(.secret.name=="'"$SECRET_NAME"'") | .id') + EXTERNAL_SECRET_API_VERSION="v1beta1" + EXTERNAL_SECRET_MANIFEST=$(cat < /dev/null 2>&1; then + echo "AIRM API is ready!" + break + else + echo "Waiting for AIRM API..." + sleep 10 + fi + done + + echo "All dependencies are ready!" 
+ securityContext: + allowPrivilegeEscalation: false + runAsNonRoot: false + runAsUser: 0 + capabilities: + drop: + - ALL + add: + - SETUID + - SETGID + - CHOWN + - DAC_OVERRIDE + - FOWNER + - FSETID + seccompProfile: + type: RuntimeDefault + containers: + - name: configure + image: "{{ .Values.airm.utilities.clusterTool.image.repository }}:{{ .Values.airm.utilities.clusterTool.image.tag }}" + imagePullPolicy: "{{ .Values.airm.utilities.clusterTool.image.pullPolicy }}" + command: ["/bin/bash"] + args: ["/scripts/configure.sh"] + env: + - name: DEBIAN_FRONTEND + value: "noninteractive" + - name: ORG_NAME + value: "demo" + - name: NEW_DOMAIN_NAME + value: "{{ .Values.airm.appDomain }}" + - name: KEYCLOAK_CLIENT_ID + value: "{{ .Values.airm.keycloak.clientId }}" + - name: KEYCLOAK_CLIENT_SECRET + valueFrom: + secretKeyRef: + key: KEYCLOAK_SECRET + name: "{{ .Release.Name }}-keycloak-ui-creds" + - name: USER_EMAIL + value: "devuser@{{ .Values.airm.appDomain }}" + - name: KEYCLOAK_URL + value: "{{ .Values.airm.keycloak.internalUrl }}" + - name: KEYCLOAK_REALM + value: "{{ .Values.airm.keycloak.realm }}" + - name: KEYCLOAK_ADMIN_CLIENT_ID + valueFrom: + secretKeyRef: + key: client-id + name: "{{ .Release.Name }}-keycloak-admin-client" + - name: KEYCLOAK_ADMIN_CLIENT_SECRET + valueFrom: + secretKeyRef: + key: client-secret + name: "{{ .Release.Name }}-keycloak-admin-client" + - name: AIRM_API_URL + value: "http://{{ .Release.Name }}-api.{{ .Release.Namespace }}.svc.cluster.local" + - name: USER_PASSWORD + valueFrom: + secretKeyRef: + key: USER_PASSWORD + name: "{{ .Release.Name }}-user-credentials" + volumeMounts: + - name: configure-script + mountPath: /scripts + securityContext: + allowPrivilegeEscalation: false + runAsNonRoot: false + runAsUser: 0 + capabilities: + drop: + - ALL + add: + - SETUID + - SETGID + - CHOWN + - DAC_OVERRIDE + - FOWNER + - FSETID + seccompProfile: + type: RuntimeDefault + volumes: + - name: configure-script + configMap: + name: "{{ 
.Release.Name }}-configure-script" + defaultMode: 0755 + +{{- end }} diff --git a/sources/airm/0.3.3/charts/airm-api/templates/airm-es.yaml b/sources/airm/0.3.3/charts/airm-api/templates/airm-es.yaml new file mode 100644 index 00000000..4dd18aeb --- /dev/null +++ b/sources/airm/0.3.3/charts/airm-api/templates/airm-es.yaml @@ -0,0 +1,215 @@ +# Copyright © Advanced Micro Devices, Inc., or its affiliates. +# +# SPDX-License-Identifier: MIT + +--- +apiVersion: external-secrets.io/v1beta1 +kind: ExternalSecret +metadata: + name: "{{ .Release.Name }}-cnpg-superuser" + namespace: "{{ .Release.Namespace }}" + annotations: + helm.sh/hook: pre-install +spec: + data: + - remoteRef: + key: airm-cnpg-superuser-username + property: value + secretKey: username + - remoteRef: + key: airm-cnpg-superuser-password + property: value + secretKey: password + secretStoreRef: + kind: ClusterSecretStore + name: {{ .Values.airm.externalSecretStore.airm.name }} + target: + name: "{{ .Release.Name }}-cnpg-superuser" + template: + type: Opaque +--- +apiVersion: external-secrets.io/v1beta1 +kind: ExternalSecret +metadata: + name: "{{ .Release.Name }}-cnpg-user" + namespace: "{{ .Release.Namespace }}" + annotations: + helm.sh/hook: pre-install +spec: + data: + - remoteRef: + key: airm-cnpg-user-username + property: value + secretKey: username + - remoteRef: + key: airm-cnpg-user-password + property: value + secretKey: password + secretStoreRef: + kind: ClusterSecretStore + name: {{ .Values.airm.externalSecretStore.airm.name }} + target: + name: "{{ .Release.Name }}-cnpg-user" + template: + type: Opaque +--- +apiVersion: external-secrets.io/v1beta1 +kind: ExternalSecret +metadata: + name: "{{ .Release.Name }}-keycloak-admin-client" + namespace: "{{ .Release.Namespace }}" + annotations: + helm.sh/hook: pre-install +spec: + data: + - remoteRef: + key: airm-keycloak-admin-client-id + property: value + secretKey: client-id + - remoteRef: + key: airm-keycloak-admin-client-secret + property: value + 
secretKey: client-secret + secretStoreRef: + kind: ClusterSecretStore + name: {{ .Values.airm.externalSecretStore.keycloak.name }} + target: + name: "{{ .Release.Name }}-keycloak-admin-client" + template: + type: Opaque +--- +apiVersion: external-secrets.io/v1beta1 +kind: ExternalSecret +metadata: + name: "{{ .Release.Name }}-rabbitmq-admin" + namespace: "{{ .Release.Namespace }}" + annotations: + helm.sh/hook: pre-install +spec: + data: + - remoteRef: + key: airm-rabbitmq-user-username + property: value + secretKey: username + - remoteRef: + key: airm-rabbitmq-user-password + property: value + secretKey: password + secretStoreRef: + kind: ClusterSecretStore + name: {{ .Values.airm.externalSecretStore.airm.name }} + target: + name: "{{ .Release.Name }}-rabbitmq-admin" + template: + data: + default_user.conf: | + default_user = {{ "{{ .username }}" }} + default_pass = {{ "{{ .password }}" }} + password: '{{ "{{ .password }}" }}' + username: '{{ "{{ .username }}" }}' + type: Opaque +--- +apiVersion: external-secrets.io/v1beta1 +kind: ExternalSecret +metadata: + name: "{{ .Release.Name }}-api-minio-credentials" + namespace: "{{ .Release.Namespace }}" + annotations: + helm.sh/hook: pre-install +spec: + data: + - remoteRef: + key: minio-api-access-key + property: value + secretKey: minio-access-key + - remoteRef: + key: minio-api-secret-key + property: value + secretKey: minio-secret-key + secretStoreRef: + kind: ClusterSecretStore + name: {{ .Values.airm.externalSecretStore.minio.name }} + target: + name: "{{ .Release.Name }}-api-minio-credentials" +--- +apiVersion: external-secrets.io/v1beta1 +kind: ExternalSecret +metadata: + name: "{{ .Release.Name }}-secrets-airm" + namespace: "{{ .Release.Namespace }}" + annotations: + helm.sh/hook: pre-install +spec: + data: + - remoteRef: + key: airm-ui-auth-nextauth-secret + property: value + secretKey: NEXTAUTH_SECRET + refreshInterval: 15s + secretStoreRef: + kind: ClusterSecretStore + name: {{ 
.Values.airm.externalSecretStore.airm.name }} + target: + name: "{{ .Release.Name }}-secrets-airm" +--- +apiVersion: external-secrets.io/v1beta1 +kind: ExternalSecret +metadata: + name: "{{ .Release.Name }}-keycloak-ui-client" + namespace: "{{ .Release.Namespace }}" + annotations: + helm.sh/hook: pre-install +spec: + data: + - remoteRef: + key: airm-ui-keycloak-secret + property: value + secretKey: KEYCLOAK_SECRET + secretStoreRef: + kind: ClusterSecretStore + name: {{ .Values.airm.externalSecretStore.keycloak.name }} + target: + name: "{{ .Release.Name }}-keycloak-ui-creds" + template: + type: Opaque +--- +apiVersion: external-secrets.io/v1beta1 +kind: ExternalSecret +metadata: + name: "{{ .Release.Name }}-cluster-auth-secrets" + namespace: "{{ .Release.Namespace }}" + annotations: + helm.sh/hook: pre-install +spec: + secretStoreRef: + kind: ClusterSecretStore + name: {{ .Values.airm.externalSecretStore.airm.name }} + refreshInterval: 5m + target: + name: "{{ .Release.Name }}-cluster-auth-admin" + data: + - secretKey: admin-token + remoteRef: + key: cluster-auth-admin-token + property: value +--- +apiVersion: external-secrets.io/v1beta1 +kind: ExternalSecret +metadata: + name: "{{ .Release.Name }}-user-credentials" + namespace: "{{ .Release.Namespace }}" + annotations: + helm.sh/hook: pre-install +spec: + data: + - remoteRef: + key: keycloak-initial-devuser-password + property: value + secretKey: USER_PASSWORD + secretStoreRef: + kind: ClusterSecretStore + name: {{ .Values.airm.externalSecretStore.airm.name }} + target: + name: "{{ .Release.Name }}-user-credentials" + template: + type: Opaque diff --git a/sources/airm/0.3.3/charts/airm-api/templates/airm-httproute.yaml b/sources/airm/0.3.3/charts/airm-api/templates/airm-httproute.yaml new file mode 100644 index 00000000..3393d6a5 --- /dev/null +++ b/sources/airm/0.3.3/charts/airm-api/templates/airm-httproute.yaml @@ -0,0 +1,81 @@ +# Copyright © Advanced Micro Devices, Inc., or its affiliates. 
+# +# SPDX-License-Identifier: MIT + +--- +apiVersion: gateway.networking.k8s.io/v1 +kind: HTTPRoute +metadata: + name: '{{ .Release.Name }}api-route' + namespace: '{{ .Release.Namespace }}' +spec: + parentRefs: + - group: gateway.networking.k8s.io + kind: Gateway + name: {{ .Values.kgateway.gatewayName }} + namespace: {{ .Values.kgateway.namespace }} + rules: + - backendRefs: + - group: "" + kind: Service + name: '{{ .Release.Name }}-api' + port: {{ .Values.kgateway.airmapi.servicePort }} + weight: 1 + matches: + - headers: + - name: Host + type: RegularExpression + value: {{ .Values.kgateway.airmapi.prefixValue }}\..* + path: + type: RegularExpression + value: .*/stream.* + timeouts: + backendRequest: {{ .Values.kgateway.airmapi.timeouts.stream.backendRequest }} + request: {{ .Values.kgateway.airmapi.timeouts.stream.request }} + - backendRefs: + - group: "" + kind: Service + name: '{{ .Release.Name }}-api' + port: {{ .Values.kgateway.airmapi.servicePort }} + weight: 1 + matches: + - headers: + - name: Host + type: RegularExpression + value: {{ .Values.kgateway.airmapi.prefixValue }}\..* + path: + type: PathPrefix + value: / + timeouts: + backendRequest: {{ .Values.kgateway.airmapi.timeouts.nonStream.backendRequest }} + request: {{ .Values.kgateway.airmapi.timeouts.nonStream.request }} +--- +apiVersion: gateway.networking.k8s.io/v1 +kind: HTTPRoute +metadata: + name: '{{ .Release.Name }}ui-route' + namespace: '{{ .Release.Namespace }}' +spec: + parentRefs: + - group: gateway.networking.k8s.io + kind: Gateway + name: {{ .Values.kgateway.gatewayName }} + namespace: {{ .Values.kgateway.namespace }} + rules: + - backendRefs: + - group: "" + kind: Service + name: '{{ .Release.Name }}-ui' + port: {{ .Values.kgateway.airmui.servicePort }} + weight: 1 + matches: + - headers: + - name: Host + type: RegularExpression + value: {{ .Values.kgateway.airmui.prefixValue }}\..* + path: + type: PathPrefix + value: / + timeouts: + backendRequest: {{ 
.Values.kgateway.airmui.timeouts.backendRequest }} + request: {{ .Values.kgateway.airmui.timeouts.request }} diff --git a/sources/airm/0.3.3/charts/airm-api/templates/airm-rabbitmq-cluster.yaml b/sources/airm/0.3.3/charts/airm-api/templates/airm-rabbitmq-cluster.yaml new file mode 100644 index 00000000..3db2ff07 --- /dev/null +++ b/sources/airm/0.3.3/charts/airm-api/templates/airm-rabbitmq-cluster.yaml @@ -0,0 +1,69 @@ +# Copyright © Advanced Micro Devices, Inc., or its affiliates. +# +# SPDX-License-Identifier: MIT + +--- +apiVersion: rabbitmq.com/v1beta1 +kind: RabbitmqCluster +metadata: + name: '{{ .Release.Name }}-rabbitmq' + namespace: '{{ .Release.Namespace }}' +spec: + persistence: + {{- toYaml .Values.airm.rabbitmq.persistence | nindent 4 }} + replicas: {{ .Values.airm.rabbitmq.replicas }} + resources: + {{- toYaml .Values.airm.rabbitmq.resources | nindent 4 }} + secretBackend: + externalSecret: + name: '{{ .Release.Name }}-rabbitmq-admin' + tls: + secretName: '{{ .Release.Name }}-tls-secret' +--- +{{- if .Values.airm.rabbitmq.backup.enabled -}} + +apiVersion: batch/v1 +kind: CronJob +metadata: + name: '{{ .Release.Name }}-rabbitmq-backup-cron' + namespace: '{{ .Release.Namespace }}' +spec: + concurrencyPolicy: Forbid + jobTemplate: + spec: + template: + spec: + containers: + - env: + - name: RABBITMQ_URL + value: 'http://{{ .Release.Name }}-rabbitmq.{{ .Release.Namespace }}.svc.cluster.local:15672' + - name: RABBITMQ_USER + valueFrom: + secretKeyRef: + key: username + name: '{{ .Release.Name }}-rabbitmq-admin' + - name: RABBITMQ_PASSWORD + valueFrom: + secretKeyRef: + key: password + name: '{{ .Release.Name }}-rabbitmq-admin' + - name: S3_HOST + value: "{{ .Values.airm.backend.env.minioUrl }}" + - name: S3_ACCESS_KEY + valueFrom: + secretKeyRef: + key: minio-access-key + name: '{{ .Release.Name }}-api-minio-credentials' + - name: S3_SECRET_KEY + valueFrom: + secretKeyRef: + key: minio-secret-key + name: '{{ .Release.Name }}-api-minio-credentials' + image: 
'{{ .Values.airm.rabbitmq.backup.image }}' + name: rabbitmq-backup-cron + resources: + {{- toYaml .Values.airm.rabbitmq.backup.resources | nindent 16 }} + restartPolicy: OnFailure + schedule: 0 * * * * + +{{- end }} diff --git a/sources/airm/0.3.3/charts/airm-api/templates/airm-vllm-collector.yaml b/sources/airm/0.3.3/charts/airm-api/templates/airm-vllm-collector.yaml new file mode 100644 index 00000000..f12aa532 --- /dev/null +++ b/sources/airm/0.3.3/charts/airm-api/templates/airm-vllm-collector.yaml @@ -0,0 +1,93 @@ +# Copyright © Advanced Micro Devices, Inc., or its affiliates. +# +# SPDX-License-Identifier: MIT + +--- +apiVersion: opentelemetry.io/v1beta1 +kind: OpenTelemetryCollector +metadata: + name: "{{ .Release.Name }}-{{ .Values.aims.otelCollector.name }}" + namespace: "{{ .Release.Namespace }}" +spec: + mode: daemonset + image: "{{ .Values.aims.otelCollector.image }}" + config: + receivers: + prometheus: + config: + scrape_configs: + - job_name: "vllm" + metrics_path: /metrics + scrape_interval: "{{ .Values.aims.otelCollector.receivers.prometheus.config.scrape_configs.scrape_interval }}" + kubernetes_sd_configs: + - role: pod + relabel_configs: + # Only scrape pods with the workload-id label + - source_labels: + [__meta_kubernetes_pod_label_airm_silogen_ai_workload_id] + action: keep + regex: .+ + # Only scrape pods with app label starting with isvc. 
+ - source_labels: [__meta_kubernetes_pod_label_app] + action: keep + regex: isvc\..* + # Set the workload_id from the label + - source_labels: + [__meta_kubernetes_pod_label_airm_silogen_ai_workload_id] + target_label: workload_id + # Set service name from app label + - source_labels: [__meta_kubernetes_pod_label_app] + target_label: service + # Set service instance id from pod name + - source_labels: [__meta_kubernetes_pod_name] + target_label: service_instance_id + # Set the scrape target to port 8000 + - source_labels: [__meta_kubernetes_pod_ip] + target_label: __address__ + replacement: $1:8000 + otlp: + protocols: + grpc: {} + http: {} + + processors: + resource: + attributes: + - key: airm.silogen.ai/workload-id + from_attribute: workload_id + action: upsert + - key: service.instance.id + from_attribute: service_instance_id + action: upsert + - key: service.name + from_attribute: service + action: upsert + + transform: + metric_statements: + - context: datapoint + statements: + - set(attributes["workload_id"], resource.attributes["airm.silogen.ai/workload-id"]) where attributes["workload_id"] == nil + - set(attributes["service_instance_id"], resource.attributes["service.instance.id"]) where attributes["service_instance_id"] == nil + - set(attributes["service"], resource.attributes["service.name"]) where attributes["service"] == nil + + exporters: + otlphttp: + endpoint: "{{ .Values.aims.otelCollector.exporters.otlphttp.endpoint }}" + + service: + pipelines: + metrics: + receivers: [prometheus] + processors: [resource, transform] + exporters: [otlphttp] + + traces: + receivers: [otlp] + processors: [resource] + exporters: [otlphttp] + + logs: + receivers: [otlp] + processors: [resource] + exporters: [otlphttp] diff --git a/sources/airm/0.3.3/charts/airm-api/values.yaml b/sources/airm/0.3.3/charts/airm-api/values.yaml new file mode 100644 index 00000000..78a9791f --- /dev/null +++ b/sources/airm/0.3.3/charts/airm-api/values.yaml @@ -0,0 +1,166 @@ +# Copyright 
© Advanced Micro Devices, Inc., or its affiliates. +# +# SPDX-License-Identifier: MIT + +secretgenerator: + image: + repository: amdenterpriseai/cluster-tool + tag: latest + pullPolicy: IfNotPresent + +kgateway: + namespace: kgateway-system + gatewayName: https + airmapi: + servicePort: 80 + prefixValue: airmapi + timeouts: + stream: + backendRequest: 30m + request: 30m + nonStream: + backendRequest: 10m + request: 10m + airmui: + servicePort: 80 + prefixValue: airmui + timeouts: + backendRequest: 1m + request: 1m + keycloak: + prefixValue: kc + +aims: + otelCollector: + image: ghcr.io/open-telemetry/opentelemetry-collector-releases/opentelemetry-collector-contrib:0.113.0 + receivers: + prometheus: + config: + scrape_configs: + scrape_interval: 20s + exporters: + otlphttp: + endpoint: "http://lgtm-stack.otel-lgtm-stack.svc:4318" + name: "vllm-collector" + +airm: + appDomain: PUBLIC-IP + includeDemoSetup: true + + externalSecretStore: + airm: + name: openbao-secret-store + minio: + name: openbao-secret-store + keycloak: + name: openbao-secret-store + + postgresql: + enabled: true + cnpg: + image: ghcr.io/cloudnative-pg/postgresql:17 + instance: 1 + resources: + limits: + cpu: "2" + memory: 1Gi + requests: + cpu: "1" + memory: 512Mi + storage: + size: 50Gi + storageClass: default + walStorage: + size: 50Gi + storageClass: default + + rabbitmq: + replicas: 1 + resources: + requests: + cpu: 500m + memory: 1Gi + limits: + cpu: '1' + memory: 1Gi + persistence: + storage: 20Gi + storageClassName: default + backup: + enabled: false + image: amdenterpriseai/rabbitmq-backup:0.1 + resources: + limits: + memory: 512Mi + requests: + cpu: 250m + memory: 256Mi + + keycloak: + internalUrl: http://keycloak.keycloak.svc.cluster.local:8080 + clientId: "354a0fa1-35ac-4a6d-9c4d-d661129c2cd0" + realm: airm + + frontend: + image: + repository: amdenterpriseai/airm-ui + tag: 0.3.3 + pullPolicy: IfNotPresent + servicePort: 80 + resources: + limits: + memory: 4Gi + requests: + cpu: 500m + 
memory: 4Gi + + backend: + image: + repository: amdenterpriseai/airm-api + tag: 0.3.3 + pullPolicy: IfNotPresent + + servicePort: 80 + servicePortMetrics: 9009 + env: + dbPort: 5432 + rabbitmqPort: 5672 + minioUrl: http://minio.minio-tenant-default.svc.cluster.local:80 + minioBucket: default-bucket + prometheusUrl: http://lgtm-stack.otel-lgtm-stack.svc.cluster.local:9090 + clusterAuthUrl: http://cluster-auth.cluster-auth.svc.cluster.local:8081 + + resources: + limits: + memory: 1Gi + requests: + cpu: 500m + memory: 1Gi + securityContext: + allowPrivilegeEscalation: false + runAsNonRoot: true + runAsUser: 1000 + seccompProfile: + type: RuntimeDefault + + utilities: + netcat: + image: + repository: busybox + tag: 1.37.0 + pullPolicy: IfNotPresent + curl: + image: + repository: curlimages/curl + tag: 8.16.0 + pullPolicy: IfNotPresent + liquibase: + image: + repository: docker.io/liquibase/liquibase + tag: 4.31 + pullPolicy: IfNotPresent + clusterTool: + image: + repository: amdenterpriseai/cluster-tool + tag: latest + pullPolicy: IfNotPresent diff --git a/sources/airm/0.3.3/charts/airm-dispatcher/.helmignore b/sources/airm/0.3.3/charts/airm-dispatcher/.helmignore new file mode 100644 index 00000000..64c2af83 --- /dev/null +++ b/sources/airm/0.3.3/charts/airm-dispatcher/.helmignore @@ -0,0 +1,27 @@ +# Copyright © Advanced Micro Devices, Inc., or its affiliates. +# +# SPDX-License-Identifier: MIT + +# Patterns to ignore when building packages. +# This supports shell glob matching, relative path matching, and +# negation (prefixed with !). Only one pattern per line. 
+.DS_Store +# Common VCS dirs +.git/ +.gitignore +.bzr/ +.bzrignore +.hg/ +.hgignore +.svn/ +# Common backup files +*.swp +*.bak +*.tmp +*.orig +*~ +# Various IDEs +.project +.idea/ +*.tmproj +.vscode/ diff --git a/sources/airm/0.3.3/charts/airm-dispatcher/Chart.yaml b/sources/airm/0.3.3/charts/airm-dispatcher/Chart.yaml new file mode 100644 index 00000000..4fbdee97 --- /dev/null +++ b/sources/airm/0.3.3/charts/airm-dispatcher/Chart.yaml @@ -0,0 +1,29 @@ +# Copyright © Advanced Micro Devices, Inc., or its affiliates. +# +# SPDX-License-Identifier: MIT + +--- +apiVersion: v2 +name: airm-dispatcher +description: A Helm chart for AIRM Dispatcher + +# A chart can be either an 'application' or a 'library' chart. +# +# Application charts are a collection of templates that can be packaged into versioned archives +# to be deployed. +# +# Library charts provide useful utilities or functions for the chart developer. They're included as +# a dependency of application charts to inject those utilities and functions into the rendering +# pipeline. Library charts do not define any templates and therefore cannot be deployed. +type: application + +# This is the chart version. This version number should be incremented each time you make changes +# to the chart and its templates, including the app version. +# Versions are expected to follow Semantic Versioning (https://semver.org/) +version: 0.3.3 + +# This is the version number of the application being deployed. This version number should be +# incremented each time you make changes to the application. Versions are not expected to +# follow Semantic Versioning. They should reflect the version the application is using. +# It is recommended to use it with quotes. 
+appVersion: "1.0.0" diff --git a/sources/airm/0.3.3/charts/airm-dispatcher/README.md b/sources/airm/0.3.3/charts/airm-dispatcher/README.md new file mode 100644 index 00000000..0b85c706 --- /dev/null +++ b/sources/airm/0.3.3/charts/airm-dispatcher/README.md @@ -0,0 +1,54 @@ + + +# HELM CHARTS + +Simple instructions to deploy AIRM dispatcher application using helm chart. +The dispatcher can be run on a compute cluster, which may or may not be the same as the one hosting the AIRM API and UI. + +### 1. Requirements + +The following external components must be available in the Kubernetes cluster before the helm chart can be installed. + +- Accessible RabbitMQ cluster (must be the same cluster used by AIRM API). +- Kaiwo installed on the cluster (along with all its dependencies) + +### 2. Install + +``` +cd helm/airm/charts + +# 1. Create output template just to validate (the public domain could be app-dev.silogen.ai, staging.silogen.ai, etc.) +helm template airm-dispatcher ./airm-dispatcher -n airm --create-namespace > airm-dispatcher-helm-generated.yaml + +# 2. Run chart install +helm install airm-dispatcher ./airm-dispatcher -n airm --create-namespace + +# 3. Delete chart if needed +helm delete airm-dispatcher -n airm + +# 4. Upgrade when bumping versions +helm upgrade airm-dispatcher ./airm-dispatcher -n airm +``` + +--- + +### 3.
Helm Settings + +| Field Path | Description | Type | Example / Default | +|---------------------------------------------|--------------------------------------------------------------|---------|-----------------------------------| +| airm.dispatcher.image.repository | Dispatcher image repository | string | `amdenterpriseai/airm-dispatcher` | +| airm.dispatcher.image.tag | Dispatcher image tag | string | `v2025.08-rc.21` | +| airm.dispatcher.image.pullPolicy | Dispatcher image pull policy | string | `IfNotPresent` | +| airm.dispatcher.servicePort | Dispatcher service port | int | `80` | +| airm.utilities.netcat.image.repository | Netcat image repository | string | `busybox` | +| airm.utilities.netcat.image.tag | Netcat image tag | string | `1.37.0` | +| airm.utilities.netcat.image.pullPolicy | Netcat image pull policy | string | `IfNotPresent` | +| airm.utilities.curl.image.repository | Curl image repository | string | `curlimages/curl` | +| airm.utilities.curl.image.tag | Curl image tag | string | `8.16.0` | +| airm.utilities.curl.image.pullPolicy | Curl image pull policy | string | `IfNotPresent` | +| airm.additionalClusterRoles.platformAdmin | Additional cluster roles for the Platform Administrator role | array | `[]` | +| airm.additionalClusterRoles.projectMember | Additional cluster roles for the Project Member role | array | `[]` | diff --git a/sources/airm/0.3.3/charts/airm-dispatcher/templates/airm-cluster-policies.yaml b/sources/airm/0.3.3/charts/airm-dispatcher/templates/airm-cluster-policies.yaml new file mode 100644 index 00000000..caf92aa6 --- /dev/null +++ b/sources/airm/0.3.3/charts/airm-dispatcher/templates/airm-cluster-policies.yaml @@ -0,0 +1,352 @@ +# Copyright © Advanced Micro Devices, Inc., or its affiliates. 
+# +# SPDX-License-Identifier: MIT + +apiVersion: kyverno.io/v1 +kind: ClusterPolicy +metadata: + name: airm-project-namespace-rolebinding +spec: + background: false + rules: + - name: generate-project-namespace-rolebinding + match: + any: + - resources: + kinds: + - Namespace + operations: + - CREATE + preconditions: + any: + - key: '{{`{{request.object.metadata.labels."airm.silogen.ai/project-id" || '''' }}`}}' + operator: NotEquals + value: "" + skipBackgroundRequests: true + generate: + apiVersion: rbac.authorization.k8s.io/v1 + kind: RoleBinding + name: "project-member-role-binding" + namespace: "{{`{{request.object.metadata.name}}`}}" + synchronize: true + data: + roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: airm-project-member + subjects: + - kind: Group + # Add this in for backwards compatibility + name: "oidc{{`{{request.object.metadata.name}}`}}" + apiGroup: rbac.authorization.k8s.io + - kind: Group + # The kubernetes cluster apply an OIDC prefix of 'oidc:', so we adjust the groups to expect that + name: "oidc:{{`{{request.object.metadata.name}}`}}" + apiGroup: rbac.authorization.k8s.io +--- +# Kyverno policy that enforces that workloads submitted to a namespace managed by AIRMan have the +# correct kueue lables and field set, so that they are bound by the quota of the namespace +apiVersion: kyverno.io/v1 +kind: ClusterPolicy +metadata: + name: airm-quota-enforcement-for-workloads +spec: + background: false + rules: + - name: set-queue-name-from-namespace-default + match: + resources: + kinds: + - Deployment + - StatefulSet + - Pod + namespaceSelector: + matchExpressions: + - key: airm.silogen.ai/project-id + operator: Exists + preconditions: + all: + - key: '{{`{{request.object.metadata.labels."kueue.x-k8s.io/queue-name" || '''' }}`}}' + operator: NotEquals + value: "{{`{{request.namespace }}`}}" + skipBackgroundRequests: true + mutate: + patchStrategicMerge: + metadata: + labels: + kueue.x-k8s.io/queue-name: 
"{{`{{request.namespace }}`}}" + + - name: set-queue-name-from-namespace-jobs + match: + resources: + kinds: + - Job # https://kueue.sigs.k8s.io/docs/tasks/run/jobs/ + namespaceSelector: + matchExpressions: + - key: airm.silogen.ai/project-id + operator: Exists + preconditions: + all: + - key: '{{`{{request.object.metadata.labels."kueue.x-k8s.io/queue-name" || '''' }}`}}' + operator: NotEquals + value: "{{`{{request.namespace }}`}}" + skipBackgroundRequests: true + mutate: + patchStrategicMerge: + metadata: + labels: + kueue.x-k8s.io/queue-name: "{{`{{request.namespace }}`}}" + spec: + suspend: true + + - name: set-queue-name-from-namespace-cronjobs + match: + resources: + kinds: + - CronJob # https://kueue.sigs.k8s.io/docs/tasks/run/run_cronjobs/ + namespaceSelector: + matchExpressions: + - key: airm.silogen.ai/project-id + operator: Exists + preconditions: + all: + - key: '{{`{{request.object.spec.jobTemplate.metadata.labels."kueue.x-k8s.io/queue-name" || '''' }}`}}' + operator: NotEquals + value: "{{`{{request.namespace }}`}}" + skipBackgroundRequests: true + mutate: + patchStrategicMerge: + spec: + jobTemplate: + metadata: + labels: + kueue.x-k8s.io/queue-name: "{{`{{request.namespace }}`}}" + spec: + suspend: true + + - name: set-queue-name-from-namespace-kaiwo + match: + resources: + kinds: + - KaiwoJob + - KaiwoService + - AIMService + namespaceSelector: + matchExpressions: + - key: airm.silogen.ai/project-id + operator: Exists + preconditions: + all: + - key: "{{`{{request.object.spec.clusterQueue || '' }}`}}" + operator: NotEquals + value: "{{`{{request.namespace }}`}}" + skipBackgroundRequests: true + mutate: + patchStrategicMerge: + spec: + clusterQueue: "{{`{{request.namespace }}`}}" +--- +apiVersion: kyverno.io/v1 +kind: ClusterPolicy +metadata: + name: airm-workload-tracking-policy +spec: + background: false + rules: + # For all supported types, if airm.silogen.ai/workload-id and airm.silogen.ai/component-id are not set, we assume + # it has been 
submitted from outside of AIRMan. In that case, we set airm.silogen.ai/auto-discovered: true, so it can + # be tracked upstream. We also set airm.silogen.ai/discovered-component-type so that we can identify the type of component + # that was originally tracked, and ignore children created by it. See remove-auto-discovered-annotations-inherited-from-parent + # We also try to capture the user who submitted the workload, and consume it in the application + + # Please note that ReplicaSet is not supported because it is filtered away by Kyverno by default: https://github.com/kyverno/kyverno/blob/main/charts/kyverno/values.yaml#L270 + - name: add-discovery-annotations-for-supported-types + match: + resources: + kinds: + - Job + - Deployment + - StatefulSet + - DaemonSet + - CronJob + - KaiwoJob + - KaiwoService + - AIMService + - Pod + namespaceSelector: + matchExpressions: + - key: airm.silogen.ai/project-id + operator: Exists + preconditions: + all: + - key: '{{`{{request.object.metadata.labels."airm.silogen.ai/workload-id" || '''' }}`}}' + operator: Equals + value: "" + - key: '{{`{{request.object.metadata.labels."airm.silogen.ai/component-id" || '''' }}`}}' + operator: Equals + value: "" + - key: '{{`{{request.object.metadata.annotations."airm.silogen.ai/auto-discovered" || '''' }}`}}' + operator: Equals + value: "" + skipBackgroundRequests: true + mutate: + patchStrategicMerge: + metadata: + annotations: + airm.silogen.ai/submitter: "{{`{{request.userInfo.username }}`}}" + airm.silogen.ai/auto-discovered: "true" + airm.silogen.ai/discovered-component-type: "{{`{{request.object.kind }}`}}" + # For all supported types, if airm.silogen.ai/auto-discovered is set and the airm.silogen.ai/discovered-component-type + # doesn't match the kind of the current component, we assume this type has been created by a parent which is also + # supported by AIRMan and we don't need to track this type upstream, so we unset the airm.silogen.ai/auto-discovered annotation.
+ # This is mostly to account for KaiwoJob, KaiwoService, AIMService which propagate annotations to pods. + - name: remove-auto-discovered-annotations-inherited-from-parent + match: + resources: + kinds: + - Job + - Deployment + - StatefulSet + - DaemonSet + - CronJob + - KaiwoJob + - KaiwoService + - AIMService + - Pod + namespaceSelector: + matchExpressions: + - key: airm.silogen.ai/project-id + operator: Exists + preconditions: + all: + - key: '{{`{{request.object.metadata.annotations."airm.silogen.ai/auto-discovered" || '''' }}`}}' + operator: Equals + value: "true" + - key: '{{`{{request.object.metadata.annotations."airm.silogen.ai/discovered-component-type" || '''' }}`}}' + operator: NotEquals + value: "{{`{{request.object.kind }}`}}" + skipBackgroundRequests: true + mutate: + patchStrategicMerge: + metadata: + annotations: + airm.silogen.ai/auto-discovered: "false" + # For all supported types, if airm.silogen.ai/project-id does not match that of the namespace label, overwrite it + # with the expected value, to avoid metrics getting mixed up between projects. 
+ - name: set-project-id-from-namespace-label + match: + resources: + kinds: + - Job + - Deployment + - StatefulSet + - DaemonSet + - CronJob + - KaiwoJob + - KaiwoService + - AIMService + - Pod + namespaceSelector: + matchExpressions: + - key: airm.silogen.ai/project-id + operator: Exists + context: + - name: ns_labels + apiCall: + urlPath: "/api/v1/namespaces/{{`{{request.namespace }}`}}" + method: GET + jmesPath: "metadata.labels" + preconditions: + all: + - key: '{{`{{request.object.metadata.labels."airm.silogen.ai/project-id" || '''' }}`}}' + operator: NotEquals + value: '{{`{{ns_labels."airm.silogen.ai/project-id" }}`}}' + skipBackgroundRequests: true + mutate: + patchStrategicMerge: + metadata: + labels: + airm.silogen.ai/project-id: '{{`{{ns_labels."airm.silogen.ai/project-id" }}`}}' + # Add the airm.silogen.ai/workload-id and airm.silogen.ai/component-id labels to types that expect them at + # metadata.labels. The expectation is that these are propagated to the subsequent pods that are created. + + # If the resource is spawned off by a CRD, it will not know about the labels on the previous version of the object, + # so we also check request.oldObject for the labels to try and preserve them if they were already set. 
+ - name: add-workload-and-component-id-default + match: + resources: + kinds: + - Pod + - KaiwoJob + - KaiwoService + - AIMService + namespaceSelector: + matchExpressions: + - key: airm.silogen.ai/project-id + operator: Exists + skipBackgroundRequests: true + mutate: + patchStrategicMerge: + metadata: + labels: + airm.silogen.ai/workload-id: '{{`{{request.object.metadata.labels."airm.silogen.ai/workload-id" || request.oldObject.metadata.labels."airm.silogen.ai/workload-id" || request.uid }}`}}' + airm.silogen.ai/component-id: '{{`{{request.object.metadata.labels."airm.silogen.ai/component-id" || request.oldObject.metadata.labels."airm.silogen.ai/component-id" || request.uid }}`}}' + # Add the airm.silogen.ai/workload-id and airm.silogen.ai/component-id labels to objects with templates and + # also add it to spec.template.metadata.labels to ensure that the pods created by them contain the labels as well + - name: add-workload-and-component-id-to-objects-with-template + match: + resources: + kinds: + - Job + - Deployment + - StatefulSet + - DaemonSet + namespaceSelector: + matchExpressions: + - key: airm.silogen.ai/project-id + operator: Exists + skipBackgroundRequests: true + mutate: + patchStrategicMerge: + metadata: + labels: + airm.silogen.ai/workload-id: '{{`{{request.object.metadata.labels."airm.silogen.ai/workload-id" || request.oldObject.metadata.labels."airm.silogen.ai/workload-id" || request.uid }}`}}' + airm.silogen.ai/component-id: '{{`{{request.object.metadata.labels."airm.silogen.ai/component-id" || request.oldObject.metadata.labels."airm.silogen.ai/component-id" || request.uid }}`}}' + spec: + template: + metadata: + labels: + airm.silogen.ai/workload-id: '{{`{{request.object.metadata.labels."airm.silogen.ai/workload-id" || request.oldObject.metadata.labels."airm.silogen.ai/workload-id" || request.uid }}`}}' + airm.silogen.ai/component-id: '{{`{{request.object.metadata.labels."airm.silogen.ai/component-id" || 
request.oldObject.metadata.labels."airm.silogen.ai/component-id" || request.uid }}`}}' + + # Add the airm.silogen.ai/workload-id and airm.silogen.ai/component-id labels to cronjob and + # also add it to spec.jobTemplate.metadata.labels to ensure that the pods created by the cronjob + # contain it as well + - name: add-workload-and-component-id-cronjobs + match: + resources: + kinds: + - CronJob + namespaceSelector: + matchExpressions: + - key: airm.silogen.ai/project-id + operator: Exists + skipBackgroundRequests: true + mutate: + patchStrategicMerge: + metadata: + labels: + airm.silogen.ai/workload-id: '{{`{{request.object.metadata.labels."airm.silogen.ai/workload-id" || request.oldObject.metadata.labels."airm.silogen.ai/workload-id" || request.uid }}`}}' + airm.silogen.ai/component-id: '{{`{{request.object.metadata.labels."airm.silogen.ai/component-id" || request.oldObject.metadata.labels."airm.silogen.ai/component-id" || request.uid }}`}}' + spec: + jobTemplate: + metadata: + labels: + airm.silogen.ai/workload-id: '{{`{{request.object.metadata.labels."airm.silogen.ai/workload-id" || request.oldObject.metadata.labels."airm.silogen.ai/workload-id" || request.uid }}`}}' + airm.silogen.ai/component-id: '{{`{{request.object.metadata.labels."airm.silogen.ai/component-id" || request.oldObject.metadata.labels."airm.silogen.ai/component-id" || request.uid }}`}}' + spec: + template: + metadata: + labels: + airm.silogen.ai/workload-id: '{{`{{request.object.metadata.labels."airm.silogen.ai/workload-id" || request.oldObject.metadata.labels."airm.silogen.ai/workload-id" || request.uid }}`}}' + airm.silogen.ai/component-id: '{{`{{request.object.metadata.labels."airm.silogen.ai/component-id" || request.oldObject.metadata.labels."airm.silogen.ai/component-id" || request.uid }}`}}' diff --git a/sources/airm/0.3.3/charts/airm-dispatcher/templates/airm-cluster-roles.yaml b/sources/airm/0.3.3/charts/airm-dispatcher/templates/airm-cluster-roles.yaml new file mode 100644 index 
00000000..2461e894 --- /dev/null +++ b/sources/airm/0.3.3/charts/airm-dispatcher/templates/airm-cluster-roles.yaml @@ -0,0 +1,164 @@ +# Copyright © Advanced Micro Devices, Inc., or its affiliates. +# +# SPDX-License-Identifier: MIT + +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: airm-platform-admin +rules: + - apiGroups: [""] + resources: + [ + "pods", + "events", + "services", + "configmaps", + "persistentvolumes", + "persistentvolumeclaims", + "namespaces", + "serviceaccounts", + ] + verbs: ["get", "list", "watch", "delete"] + - apiGroups: [""] + resources: ["pods/log", "pods/exec", "pods/attach", "pods/portforward"] + verbs: ["*"] + - apiGroups: ["apps"] + resources: ["deployments", "deployments/scale", "replicasets", "statefulsets", "daemonsets"] + verbs: ["get", "list", "watch", "delete"] + - apiGroups: ["batch"] + resources: ["jobs", "cronjobs"] + verbs: ["get", "list", "watch", "delete"] + - apiGroups: ["networking.k8s.io"] + resources: ["ingresses", "networkpolicies"] + verbs: ["get", "list", "watch", "delete"] + - apiGroups: ["kaiwo.silogen.ai"] + resources: ["kaiwojobs", "kaiwoservices"] + verbs: ["get", "list", "watch", "delete"] + - apiGroups: ["aim.silogen.ai"] + resources: ["aimservices", "aimmodels", "aimmodelcaches", "aimservicetemplates", "aimruntimeconfigs", "aimtemplatecaches"] + verbs: ["get", "list", "watch", "delete"] + - apiGroups: ["gateway.networking.k8s.io"] + resources: ["httproutes", "gateways"] + verbs: ["get", "list", "watch", "delete"] + - apiGroups: ["config.kaiwo.silogen.ai"] + resources: ["kaiwoconfigs"] + verbs: ["*"] + - apiGroups: ["kaiwo.silogen.ai"] + resources: ["kaiwoqueueconfigs"] + verbs: ["*"] + - apiGroups: ["aim.silogen.ai"] + resources: [ "aimclustermodels", "aimclusterservicetemplates", "aimclusterruntimeconfigs", "aimclustermodelsources" ] + verbs: ["*"] + - apiGroups: [""] + resources: ["secrets"] + verbs: ["get", "list", "watch", "create", "delete"] + - apiGroups: [""] + resources: 
["nodes"] + verbs: ["get", "list", "watch"] + - apiGroups: ["rbac.authorization.k8s.io"] + resources: ["roles", "clusterroles", "rolebindings", "clusterrolebindings"] + verbs: ["get", "list", "watch"] + - apiGroups: ["external-secrets.io"] + resources: ["externalsecrets"] + verbs: ["get", "list", "watch", "create", "delete"] + - apiGroups: ["external-secrets.io"] + resources: ["clustersecretstores"] + verbs: ["get", "list", "watch"] + - apiGroups: ["storage.k8s.io"] + resources: ["storageclasses"] + verbs: ["get", "list", "watch"] + - apiGroups: ["kueue.x-k8s.io"] + resources: ["clusterqueues", "resourceflavors", "localqueues", "workloadpriorityclasses"] + verbs: ["get", "list", "watch"] + - apiGroups: ["apiextensions.k8s.io"] + resources: ["customresourcedefinitions"] + verbs: ["get", "list"] +{{- range .Values.airm.additionalClusterRoles.platformAdmin }} + - apiGroups: + {{ .apiGroups | toYaml | nindent 6 }} + resources: + {{ .resources | toYaml | nindent 6 }} + verbs: + {{ .verbs | toYaml | nindent 6 }} +{{- end }} +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: airm-platform-admin-role-binding +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: airm-platform-admin +subjects: + - kind: Group + # Add this in for backwards compatibility + name: "oidcairm-role:Platform Administrator" + apiGroup: rbac.authorization.k8s.io + - kind: Group + # The Kubernetes cluster applies an OIDC prefix of 'oidc:', so we adjust the group to expect that + name: "oidc:airm-role:Platform Administrator" + apiGroup: rbac.authorization.k8s.io +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: airm-project-member +rules: + - apiGroups: [""] + resources: + [ + "pods", + "pods/log", + "pods/exec", + "pods/attach", + "pods/portforward", + "events", + "services", + "configmaps", + "persistentvolumes", + "persistentvolumeclaims", + ] + verbs: ["*"] + - apiGroups: ["apps"] + resources:
["deployments", "replicasets", "statefulsets", "daemonsets"] + verbs: ["*"] + - apiGroups: ["batch"] + resources: ["jobs", "cronjobs"] + verbs: ["*"] + - apiGroups: ["networking.k8s.io"] + resources: ["ingresses", "networkpolicies"] + verbs: ["*"] + - apiGroups: ["kaiwo.silogen.ai"] + resources: ["kaiwojobs", "kaiwoservices"] + verbs: ["*"] + - apiGroups: ["aim.silogen.ai"] + resources: ["aimservices", "aimmodels", "aimmodelcaches", "aimservicetemplates", "aimruntimeconfigs", "aimtemplatecaches"] + verbs: ["*"] + - apiGroups: ["gateway.networking.k8s.io"] + resources: ["httproutes", "gateways"] + verbs: ["*"] + - apiGroups: [""] + resources: ["secrets"] + verbs: ["list", "watch", "create"] + - apiGroups: [""] + resources: ["nodes"] + verbs: ["get", "list", "watch"] + - apiGroups: ["rbac.authorization.k8s.io"] + resources: ["roles", "clusterroles", "rolebindings", "clusterrolebindings"] + verbs: ["get", "list", "watch"] + - apiGroups: ["external-secrets.io"] + resources: ["clustersecretstores", "externalsecrets"] + verbs: ["get", "list", "watch"] + - apiGroups: ["storage.k8s.io"] + resources: ["storageclasses"] + verbs: ["get", "list", "watch"] +{{- range .Values.airm.additionalClusterRoles.projectMember }} + - apiGroups: + {{ .apiGroups | toYaml | nindent 6 }} + resources: + {{ .resources | toYaml | nindent 6 }} + verbs: + {{ .verbs | toYaml | nindent 6 }} +{{- end }} diff --git a/sources/airm/0.3.3/charts/airm-dispatcher/templates/airm-dispatcher-app.yaml b/sources/airm/0.3.3/charts/airm-dispatcher/templates/airm-dispatcher-app.yaml new file mode 100644 index 00000000..8a3489ef --- /dev/null +++ b/sources/airm/0.3.3/charts/airm-dispatcher/templates/airm-dispatcher-app.yaml @@ -0,0 +1,343 @@ +# Copyright © Advanced Micro Devices, Inc., or its affiliates. 
+# +# SPDX-License-Identifier: MIT + +--- +apiVersion: batch/v1 +kind: CronJob +metadata: + name: "{{ .Release.Name }}-cluster-nodes-dispatcher" + namespace: "{{ .Release.Namespace }}" +spec: + concurrencyPolicy: Replace + jobTemplate: + spec: + template: + spec: + containers: + - command: + - curl + - -X + - POST + - http://{{ .Release.Name }}-dispatcher:{{ .Values.airm.dispatcher.servicePort }}/v1/clusters/nodes + image: "{{ .Values.airm.utilities.curl.image.repository }}:{{ .Values.airm.utilities.curl.image.tag }}" + imagePullPolicy: "{{ .Values.airm.utilities.curl.image.pullPolicy }}" + name: airm-cluster-nodes-cron + resources: + limits: + memory: 100Mi + requests: + cpu: 50m + memory: 100Mi + restartPolicy: OnFailure + schedule: 0 * * * * +--- +apiVersion: batch/v1 +kind: CronJob +metadata: + name: "{{ .Release.Name }}-aim-models-dispatcher" + namespace: "{{ .Release.Namespace }}" +spec: + concurrencyPolicy: Forbid + schedule: "*/5 * * * *" + suspend: false + jobTemplate: + spec: + template: + metadata: + annotations: + sidecar.istio.io/inject: "false" + spec: + restartPolicy: OnFailure + initContainers: + - name: check-dispatcher-is-ready + command: + [ + "/bin/sh", + "-c", + "while ! 
nc -z \"$ENDPOINT_URL_TO_CHECK\" \"$ENDPOINT_PORT_TO_CHECK\"; do echo \"Waiting for Airm Dispatcher at ${ENDPOINT_URL_TO_CHECK}:${ENDPOINT_PORT_TO_CHECK}...\"; sleep 3; done; echo \"Airm Dispatcher is accepting connections at ${ENDPOINT_URL_TO_CHECK}:${ENDPOINT_PORT_TO_CHECK}.\"; sleep 3; exit 0" + ] + image: "{{ .Values.airm.utilities.netcat.image.repository }}:{{ .Values.airm.utilities.netcat.image.tag }}" + imagePullPolicy: "{{ .Values.airm.utilities.netcat.image.pullPolicy }}" + env: + - name: ENDPOINT_URL_TO_CHECK + value: "{{ .Release.Name }}-dispatcher.{{ .Release.Namespace }}.svc.cluster.local" + - name: ENDPOINT_PORT_TO_CHECK + value: "{{ .Values.airm.dispatcher.servicePort }}" + securityContext: + allowPrivilegeEscalation: false + runAsNonRoot: false + runAsUser: 0 + capabilities: + drop: + - ALL + add: + - SETUID + - SETGID + - CHOWN + - DAC_OVERRIDE + - FOWNER + - FSETID + seccompProfile: + type: RuntimeDefault + containers: + - name: airm-aim-models-cron + command: + - curl + - -X + - POST + - http://{{ .Release.Name }}-dispatcher:{{ .Values.airm.dispatcher.servicePort }}/v1/aims/cluster-models + image: "{{ .Values.airm.utilities.curl.image.repository }}:{{ .Values.airm.utilities.curl.image.tag }}" + imagePullPolicy: "{{ .Values.airm.utilities.curl.image.pullPolicy }}" + resources: + requests: + memory: "100Mi" + cpu: "50m" + limits: + memory: "100Mi" +--- +apiVersion: batch/v1 +kind: CronJob +metadata: + name: "{{ .Release.Name }}-heartbeat-dispatcher" + namespace: "{{ .Release.Namespace }}" +spec: + concurrencyPolicy: Replace + schedule: "*/1 * * * *" + suspend: false + jobTemplate: + spec: + template: + metadata: + annotations: + sidecar.istio.io/inject: "false" + spec: + restartPolicy: OnFailure + initContainers: + - name: check-dispatcher-is-ready + command: + [ + "/bin/sh", + "-c", + "while ! 
nc -z \"$ENDPOINT_URL_TO_CHECK\" \"$ENDPOINT_PORT_TO_CHECK\"; do echo \"Waiting for Airm Dispatcher at ${ENDPOINT_URL_TO_CHECK}:${ENDPOINT_PORT_TO_CHECK}...\"; sleep 3; done; echo \"Airm Dispatcher is accepting connections at ${ENDPOINT_URL_TO_CHECK}:${ENDPOINT_PORT_TO_CHECK}.\"; sleep 3; exit 0" + ] + image: "{{ .Values.airm.utilities.netcat.image.repository }}:{{ .Values.airm.utilities.netcat.image.tag }}" + imagePullPolicy: "{{ .Values.airm.utilities.netcat.image.pullPolicy }}" + env: + - name: ENDPOINT_URL_TO_CHECK + value: "{{ .Release.Name }}-dispatcher.{{ .Release.Namespace }}.svc.cluster.local" + - name: ENDPOINT_PORT_TO_CHECK + value: "{{ .Values.airm.dispatcher.servicePort }}" + securityContext: + allowPrivilegeEscalation: false + runAsNonRoot: false + runAsUser: 0 + capabilities: + drop: + - ALL + add: + - SETUID + - SETGID + - CHOWN + - DAC_OVERRIDE + - FOWNER + - FSETID + seccompProfile: + type: RuntimeDefault + containers: + - name: airm-heartbeat-cron + command: + - curl + - -X + - POST + - http://{{ .Release.Name }}-dispatcher:{{ .Values.airm.dispatcher.servicePort }}/v1/heartbeats + image: "{{ .Values.airm.utilities.curl.image.repository }}:{{ .Values.airm.utilities.curl.image.tag }}" + imagePullPolicy: "{{ .Values.airm.utilities.curl.image.pullPolicy }}" + resources: + requests: + memory: "100Mi" + cpu: "50m" + limits: + memory: "100Mi" +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: "{{ .Release.Name }}-dispatcher" + namespace: "{{ .Release.Namespace }}" +spec: + replicas: 1 + selector: + matchLabels: + app: "{{ .Release.Name }}-dispatcher" + template: + metadata: + labels: + app: "{{ .Release.Name }}-dispatcher" + spec: + serviceAccountName: "{{ .Release.Name }}-dispatcher-sa" + {{- with .Values.airm.dispatcher.imagePullSecrets }} + imagePullSecrets: + {{- toYaml . | nindent 8 }} + {{- end }} + initContainers: + - name: check-rabbitmq-is-ready + command: + [ + "/bin/sh", + "-c", + "while ! 
nc -z \"$ENDPOINT_URL_TO_CHECK\" \"$ENDPOINT_PORT_TO_CHECK\"; do echo \"Waiting for Airm RabbitMQ at ${ENDPOINT_URL_TO_CHECK}:${ENDPOINT_PORT_TO_CHECK}...\"; sleep 3; done; echo \"Airm RabbitMQ is accepting connections at ${ENDPOINT_URL_TO_CHECK}:${ENDPOINT_PORT_TO_CHECK}.\"; sleep 3; exit 0" + ] + image: "{{ .Values.airm.utilities.netcat.image.repository }}:{{ .Values.airm.utilities.netcat.image.tag }}" + imagePullPolicy: "{{ .Values.airm.utilities.netcat.image.pullPolicy }}" + env: + - name: ENDPOINT_URL_TO_CHECK + value: "{{ .Release.Name }}-rabbitmq.{{ .Release.Namespace }}.svc.cluster.local" + - name: ENDPOINT_PORT_TO_CHECK + value: "{{ .Values.airm.dispatcher.env.rabbitmqPort }}" + securityContext: + allowPrivilegeEscalation: false + runAsNonRoot: false + runAsUser: 0 + capabilities: + drop: + - ALL + add: + - SETUID + - SETGID + - CHOWN + - DAC_OVERRIDE + - FOWNER + - FSETID + seccompProfile: + type: RuntimeDefault + containers: + - name: airm-dispatcher + image: "{{ .Values.airm.dispatcher.image.repository }}:{{ .Values.airm.dispatcher.image.tag }}" + imagePullPolicy: "{{ .Values.airm.dispatcher.image.pullPolicy }}" + ports: + - containerPort: 8080 + env: + - name: KUBE_CLUSTER_NAME + value: demo-cluster + - name: ORG_NAME + value: demo + - name: RABBITMQ_HOST + value: "{{ .Release.Name }}-rabbitmq.{{ .Release.Namespace }}.svc.cluster.local" + - name: RABBITMQ_PORT + value: "{{ .Values.airm.dispatcher.env.rabbitmqPort }}" + - name: RABBITMQ_AIRM_COMMON_VHOST + value: "vh_airm_common" + - name: RABBITMQ_AIRM_COMMON_QUEUE + value: "airm_common" + - name: RABBITMQ_USER + valueFrom: + secretKeyRef: + name: "{{ .Release.Name }}-rabbitmq-common-vhost-user" + key: username + - name: RABBITMQ_PASSWORD + valueFrom: + secretKeyRef: + name: "{{ .Release.Name }}-rabbitmq-common-vhost-user" + key: password + livenessProbe: + httpGet: + path: /v1/health + port: 8080 + initialDelaySeconds: 10 + periodSeconds: 5 + timeoutSeconds: 2 + failureThreshold: 3 + successThreshold: 
1 + readinessProbe: + httpGet: + path: /v1/health + port: 8080 + initialDelaySeconds: 10 + periodSeconds: 5 + timeoutSeconds: 2 + failureThreshold: 3 + successThreshold: 1 + resources: + requests: + memory: "1Gi" + cpu: "500m" + limits: + memory: "1Gi" + securityContext: + runAsUser: 0 +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: "{{ .Release.Name }}-dispatcher-cluster-access-binding" +subjects: + - kind: ServiceAccount + name: "{{ .Release.Name }}-dispatcher-sa" + namespace: "{{ .Release.Namespace }}" +roleRef: + kind: ClusterRole + name: "{{ .Release.Name }}-dispatcher-cluster-access-role" + apiGroup: rbac.authorization.k8s.io +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: "{{ .Release.Name }}-dispatcher-cluster-access-role" +rules: + - apiGroups: [""] + resources: ["services", "namespaces", "configmaps", "pods"] + verbs: ["get", "list", "watch", "create", "delete", "update", "patch"] + - apiGroups: [""] + resources: ["secrets"] + verbs: ["get", "list", "watch", "create", "delete", "update", "patch"] + - apiGroups: [""] + resources: ["nodes"] + verbs: ["get", "list", "watch"] + - apiGroups: ["apps"] + resources: ["deployments", "statefulsets", "daemonsets"] + verbs: ["get", "list", "watch", "create", "delete", "update", "patch"] + - apiGroups: ["batch"] + resources: ["jobs", "cronjobs"] + verbs: ["get", "list", "watch", "create", "delete", "update", "patch"] + - apiGroups: ["kaiwo.silogen.ai"] + resources: ["kaiwojobs", "kaiwoservices", "kaiwoqueueconfigs"] + verbs: ["get", "list", "watch", "create", "delete", "update", "patch"] + - apiGroups: ["gateway.networking.k8s.io"] + resources: ["httproutes"] + verbs: ["get", "list", "watch", "create", "delete", "update", "patch"] + - apiGroups: ["networking.k8s.io"] + resources: ["ingresses"] + verbs: ["get", "list", "watch", "create", "delete", "update", "patch"] + - apiGroups: ["external-secrets.io"] + resources: ["externalsecrets"] + 
verbs: ["get", "list", "watch", "create", "delete", "update", "patch"] + - apiGroups: ["aim.silogen.ai"] + resources: ["aimservices", "aimclustermodels"] + verbs: ["get", "list", "watch", "create", "delete", "update", "patch"] + - apiGroups: ["apiextensions.k8s.io"] + resources: ["customresourcedefinitions"] + verbs: ["get", "list"] +--- +apiVersion: v1 +kind: ServiceAccount +metadata: + name: "{{ .Release.Name }}-dispatcher-sa" + namespace: "{{ .Release.Namespace }}" + +--- +apiVersion: v1 +kind: Service +metadata: + name: "{{ .Release.Name }}-dispatcher" + namespace: "{{ .Release.Namespace }}" + labels: + app: "{{ .Release.Name }}-dispatcher" +spec: + ports: + - name: web + port: {{ .Values.airm.dispatcher.servicePort }} + targetPort: 8080 + type: ClusterIP + selector: + app: "{{ .Release.Name }}-dispatcher" diff --git a/sources/airm/0.3.3/charts/airm-dispatcher/templates/kyverno-cluster-role.yaml b/sources/airm/0.3.3/charts/airm-dispatcher/templates/kyverno-cluster-role.yaml new file mode 100644 index 00000000..e930efd0 --- /dev/null +++ b/sources/airm/0.3.3/charts/airm-dispatcher/templates/kyverno-cluster-role.yaml @@ -0,0 +1,35 @@ +# Copyright © Advanced Micro Devices, Inc., or its affiliates. +# +# SPDX-License-Identifier: MIT + +# These are additional cluster roles needed by kyverno background controller to be able to +# create rolebindings in namespaces. 
+apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: kyverno:airm-policy-roles + labels: + rbac.kyverno.io/aggregate-to-background-controller: "true" +rules: + - apiGroups: ["rbac.authorization.k8s.io"] + resources: ["clusterroles", "rolebindings"] + verbs: ["get", "list", "watch"] + - apiGroups: ["rbac.authorization.k8s.io"] # allow kyverno to bind clusterroles via rolebindings + resources: ["clusterroles"] + verbs: ["bind"] +--- +# These are additional cluster roles needed by kyverno reports controller to be able to +# manage custom resources for reporting +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: kyverno:airm-reports-policy-roles + labels: + rbac.kyverno.io/aggregate-to-reports-controller: "true" +rules: + - apiGroups: ["kaiwo.silogen.ai"] + resources: ["kaiwojobs", "kaiwoservices"] + verbs: ["get", "list", "watch"] + - apiGroups: ["aim.silogen.ai"] + resources: ["aimservices"] + verbs: ["get", "list", "watch"] diff --git a/sources/airm/0.3.3/charts/airm-dispatcher/values.yaml b/sources/airm/0.3.3/charts/airm-dispatcher/values.yaml new file mode 100644 index 00000000..670cf399 --- /dev/null +++ b/sources/airm/0.3.3/charts/airm-dispatcher/values.yaml @@ -0,0 +1,27 @@ +# Copyright © Advanced Micro Devices, Inc., or its affiliates. 
+# +# SPDX-License-Identifier: MIT + +airm: + dispatcher: + image: + repository: amdenterpriseai/airm-dispatcher + tag: 0.3.3 + pullPolicy: IfNotPresent + servicePort: 80 + env: + rabbitmqPort: 5672 + utilities: + netcat: + image: + repository: busybox + tag: 1.37.0 + pullPolicy: IfNotPresent + curl: + image: + repository: curlimages/curl + tag: 8.16.0 + pullPolicy: IfNotPresent + additionalClusterRoles: + platformAdmin: [] + projectMember: [] diff --git a/sources/airm/0.3.3/values.yaml b/sources/airm/0.3.3/values.yaml new file mode 100644 index 00000000..69346880 --- /dev/null +++ b/sources/airm/0.3.3/values.yaml @@ -0,0 +1,3 @@ +# Copyright © Advanced Micro Devices, Inc., or its affiliates. +# +# SPDX-License-Identifier: MIT From 5bbe9fe2054481b2ed60fceea0effb2a3d863f16 Mon Sep 17 00:00:00 2001 From: brownzebra Date: Wed, 18 Feb 2026 10:21:34 +0200 Subject: [PATCH 006/115] chore: bump version in values file --- root/values.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/root/values.yaml b/root/values.yaml index bb6c2c1f..b3ff1c9b 100644 --- a/root/values.yaml +++ b/root/values.yaml @@ -723,7 +723,7 @@ apps: - /spec/accessModes # AMD Resource Manager (AIRM) airm: - path: airm/0.3.2 + path: airm/0.3.3 namespace: airm valuesFile: values.yaml helmParameters: From d0ba511fb3bbb3e8f51269d72dcc4ac1e63d57af Mon Sep 17 00:00:00 2001 From: brownzebra Date: Wed, 18 Feb 2026 10:23:36 +0200 Subject: [PATCH 007/115] chore: update cmponents for sbom --- sbom/components.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sbom/components.yaml b/sbom/components.yaml index 97d608a6..8ca11e67 100644 --- a/sbom/components.yaml +++ b/sbom/components.yaml @@ -10,7 +10,7 @@ components: license: MIT License licenseUrl: https://github.com/silogen/kaiwo/blob/main/LICENSE airm: - path: airm/0.3.2 + path: airm/0.3.3 valuesFile: values.yaml sourceUrl: https://github.com/silogen/cluster-forge/tree/main/sources/airm projectUrl: 
https://github.com/silogen/cluster-forge/tree/main/sources/airm From f8e976048ed17798495fbc9ae2c0e7b69cc0b613 Mon Sep 17 00:00:00 2001 From: pwistbac Date: Wed, 18 Feb 2026 17:34:48 +0200 Subject: [PATCH 008/115] fix: Copy the tls to the argocd namespace and add this as a rootCA to the argocd config (#597) --- root/values.yaml | 1 + sources/argocd-config/es-tls-secret.yaml | 73 ++++++++++++++++++++++++ 2 files changed, 74 insertions(+) create mode 100644 sources/argocd-config/es-tls-secret.yaml diff --git a/root/values.yaml b/root/values.yaml index b3ff1c9b..ee1179a3 100644 --- a/root/values.yaml +++ b/root/values.yaml @@ -101,6 +101,7 @@ apps: issuer: https://kc.{{ .Values.global.domain }}/realms/airm clientID: argocd clientSecret: $$argocd-oidc-creds:client_secret + rootCA: $cluster-tls:cert requestedScopes: ["openid", "profile", "email", "groups"] syncWave: -3 argocd-config: diff --git a/sources/argocd-config/es-tls-secret.yaml b/sources/argocd-config/es-tls-secret.yaml new file mode 100644 index 00000000..f70de573 --- /dev/null +++ b/sources/argocd-config/es-tls-secret.yaml @@ -0,0 +1,73 @@ +apiVersion: v1 +kind: ServiceAccount +metadata: + name: argocd-tls-secret-svc-account + namespace: argocd +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: argocd-external-secrets-tls-role +rules: +- apiGroups: [""] + resources: ["secrets"] + verbs: ["get", "list", "watch", "create", "update", "patch", "delete"] +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: argocd-external-secrets-tls-role-binding +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: argocd-external-secrets-tls-role +subjects: +- kind: ServiceAccount + name: argocd-tls-secret-svc-account + namespace: argocd +--- +apiVersion: external-secrets.io/v1beta1 +kind: ClusterSecretStore +metadata: + name: argocd-tls-secret-store +spec: + provider: + kubernetes: + remoteNamespace: kgateway-system + server: + 
caProvider: + type: ConfigMap + name: kube-root-ca.crt + key: ca.crt + namespace: kgateway-system + auth: + serviceAccount: + name: argocd-tls-secret-svc-account +--- +apiVersion: external-secrets.io/v1beta1 +kind: ExternalSecret +metadata: + name: cluster-tls + namespace: argocd +spec: + refreshInterval: 1h + secretStoreRef: + name: argocd-tls-secret-store + kind: ClusterSecretStore + target: + name: cluster-tls + template: + metadata: + labels: + app.kubernetes.io/part-of: argocd + data: + - secretKey: cert + remoteRef: + key: cluster-tls + property: tls.crt + - secretKey: key + remoteRef: + key: cluster-tls + property: tls.key +--- + From 7dc274ab8f414e7a35624cfbd34b6f4fb088bc8f Mon Sep 17 00:00:00 2001 From: Daniel Vaskivaara Date: Tue, 17 Feb 2026 13:29:23 +0200 Subject: [PATCH 009/115] feat: add --target-revision flag with git ancestry validation Add support for specifying target revision when bootstrapping cluster-forge: - New -r/--target-revision flag for bootstrap script - Git ancestry validation ensures only v1.8.0-rc1+ compatible revisions - Support for branches, tags, and commit hashes - Handles both local and remote branches (origin/branch-name) - Enhanced argument parsing with backwards compatibility - Simplified validation removes pre-v1.8.0 support complexity The flag allows developers to bootstrap from feature branches while ensuring compatibility with the v1.8.0+ architecture requirements. --- root/templates/_helpers.yaml | 1 + root/templates/cluster-forge.yaml | 17 +- scripts/bootstrap.sh | 332 ++++++++++++++++++----------- scripts/init-gitea-job/values.yaml | 12 +- 4 files changed, 231 insertions(+), 131 deletions(-) diff --git a/root/templates/_helpers.yaml b/root/templates/_helpers.yaml index adb29d25..096b29d5 100644 --- a/root/templates/_helpers.yaml +++ b/root/templates/_helpers.yaml @@ -1,3 +1,4 @@ +{{/* Renders a value that contains template. 
Usage: {{ include "common.tplvalues.render" ( dict "value" .Values.path.to.the.Value "context" $) }} diff --git a/root/templates/cluster-forge.yaml b/root/templates/cluster-forge.yaml index ce7cee38..9946a802 100644 --- a/root/templates/cluster-forge.yaml +++ b/root/templates/cluster-forge.yaml @@ -6,24 +6,28 @@ metadata: namespace: argocd spec: project: default - {{- if .Values.externalValues.enabled }} +{{- if .Values.externalValues.enabled }} # helm-chart & values file from 2 different git repos + # Uses the SAME targetRevision for both chart templates AND values + # This ensures version consistency - v1.7.0 uses v1.7.0 templates (no clusterSize) + # and v1.8.0+ uses v1.8.0+ templates (with clusterSize support) sources: - repoURL: {{ .Values.clusterForge.repoUrl }} targetRevision: {{ .Values.clusterForge.targetRevision }} path: root helm: - # here we want the base values.yaml and the custom values file from external repo - # the path to the custom values file is relative to the root of the external values repo valueFiles: - {{ .Values.externalValues.path }} + {{- if .Values.global.clusterSize }} - {{ .Values.global.clusterSize }} + {{- end }} - $cluster-values/values.yaml - repoURL: {{ .Values.externalValues.repoUrl }} targetRevision: {{ .Values.externalValues.targetRevision }} ref: cluster-values - {{ else }} +{{- else }} # helm-chart & values file within the same git repo + # Uses targetRevision for both chart templates AND values source: repoURL: {{ .Values.clusterForge.repoUrl }} targetRevision: {{ .Values.clusterForge.targetRevision }} @@ -31,7 +35,10 @@ spec: helm: valueFiles: - {{ .Values.clusterForge.valuesFile }} - {{- end }} + {{- if .Values.global.clusterSize }} + - {{ .Values.global.clusterSize }} + {{- end }} +{{- end }} destination: server: https://kubernetes.default.svc namespace: argocd diff --git a/scripts/bootstrap.sh b/scripts/bootstrap.sh index c0a8c461..2c01dea1 100755 --- a/scripts/bootstrap.sh +++ b/scripts/bootstrap.sh @@ -5,49 +5,78 @@ 
set -euo pipefail SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" # Initialize variables -DOMAIN="" -VALUES_FILE="values.yaml" +LATEST_RELEASE="v1.8.0" +TARGET_REVISION="$LATEST_RELEASE" + CLUSTER_SIZE="medium" # Default to medium +DOMAIN="" KUBE_VERSION=1.33 - -DEV_MODE=false -TARGET_REVISION="main" +VALUES_FILE="values.yaml" # Parse arguments while [[ $# -gt 0 ]]; do case $1 in - --CLUSTER_SIZE) - if [ -z "$2" ]; then - echo "ERROR: --CLUSTER_SIZE requires an argument" - exit 1 - fi - CLUSTER_SIZE="$2" - shift 2 - ;; - --CLUSTER_SIZE=*) - CLUSTER_SIZE="${1#*=}" - shift - ;; - --dev) - DEV_MODE=true - shift - ;; + --CLUSTER-SIZE|--cluster-size|-s) + if [ -z "$2" ]; then + echo "ERROR: --cluster-size requires an argument" + exit 1 + fi + CLUSTER_SIZE="$2" + shift 2 + ;; + --CLUSTER-SIZE=*) + CLUSTER_SIZE="${1#*=}" + shift + ;; + --cluster-size=*) + CLUSTER_SIZE="${1#*=}" + shift + ;; + -s=*) + CLUSTER_SIZE="${1#*=}" + shift + ;; + --TARGET-REVISION|--target-revision|-r) + if [ -z "$2" ]; then + echo "WARNING: defaulting to --target-revision=$LATEST_RELEASE (no value specified)" + TARGET_REVISION="$LATEST_RELEASE" + shift + else + TARGET_REVISION="$2" + shift 2 + fi + ;; + --TARGET-REVISION=*) + TARGET_REVISION="${1#*=}" + shift + ;; + --target-revision=*) + TARGET_REVISION="${1#*=}" + shift + ;; + -r=*) + TARGET_REVISION="${1#*=}" + shift + ;; --help|-h) - echo "Usage: $0 [options] [values_file]" - echo "" - echo "Arguments:" - echo " domain Required. Cluster domain (e.g., example.com)" - echo " values_file Optional. Values file to use (default: values.yaml)" - echo "" - echo "Options:" - echo " --CLUSTER_SIZE Optional. 
Cluster size [small|medium|large] (default: medium)" - echo " --dev Enable developer mode (sets Gitea repos to feature branch or custom value)" - echo "" - echo "" - echo "Examples:" - echo " $0 myIP.nip.io" - echo " $0 example.com values_custom.yaml --CLUSTER_SIZE=large" - echo " $0 --dev dev.example.com --CLUSTER_SIZE=small" + cat < [values_file] + + Arguments: + domain Required. Cluster domain (e.g., example.com) + values_file Optional. Values .yaml file to use, default: root/values.yaml + + Options: + -r, --target-revision cluster-forge git revision to seed into cluster-values/values.yaml file + options: [tag|commit_hash|branch_name], default: $LATEST_RELEASE + -s, --cluster-size options: [small|medium|large], default: medium + + Examples: + $0 compute.amd.com values_custom.yaml --cluster-size=large + $0 112.100.97.17.nip.io + $0 dev.example.com --cluster-size=small --target-revision=$LATEST_RELEASE + $0 dev.example.com -s=small -r=$LATEST_RELEASE +HELP_OUTPUT exit 0 ;; --*) @@ -71,6 +100,56 @@ while [[ $# -gt 0 ]]; do esac done +validate_target_revision() { + # Always allow main and the latest release + if [ "$TARGET_REVISION" = "main" ] || [ "$TARGET_REVISION" = "$LATEST_RELEASE" ]; then + return 0 + fi + + # Check if it's a valid v1.8.0+ semantic version pattern + if [[ "$TARGET_REVISION" =~ ^v1\.8\. ]] || [[ "$TARGET_REVISION" =~ ^v1\.([9-9]|[1-9][0-9]+)\. ]] || [[ "$TARGET_REVISION" =~ ^v[2-9]\. 
]]; then + return 0 + fi + + # For branches/commits, check git ancestry to see if v1.8.0-rc1 or later is in the history + echo "Checking git ancestry for target revision: $TARGET_REVISION" + + # Check if the target revision exists in git (try local first, then remote) + RESOLVED_REVISION="" + if git rev-parse --verify "$TARGET_REVISION" >/dev/null 2>&1; then + RESOLVED_REVISION="$TARGET_REVISION" + elif git rev-parse --verify "origin/$TARGET_REVISION" >/dev/null 2>&1; then + RESOLVED_REVISION="origin/$TARGET_REVISION" + echo "Found target revision as remote branch: origin/$TARGET_REVISION" + else + echo "ERROR: Target revision '$TARGET_REVISION' does not exist in git" + echo "Available branches: $(git branch -a | grep -v HEAD | sed 's/^[ *]*//' | tr '\n' ' ')" + exit 1 + fi + + # Check if v1.8.0-rc1 or any later version is an ancestor of the target revision + # We'll check for v1.8.0-rc1 as the minimum supported version + MIN_SUPPORTED_TAG="v1.8.0-rc1" + + # Check if the minimum supported tag exists + if git rev-parse --verify "$MIN_SUPPORTED_TAG" >/dev/null 2>&1; then + # Check if MIN_SUPPORTED_TAG is an ancestor of RESOLVED_REVISION + if git merge-base --is-ancestor "$MIN_SUPPORTED_TAG" "$RESOLVED_REVISION" 2>/dev/null; then + echo "Target revision '$TARGET_REVISION' is based on or after $MIN_SUPPORTED_TAG - supported" + return 0 + else + echo "ERROR: Target revision '$TARGET_REVISION' is not based on $MIN_SUPPORTED_TAG or later" + echo "The --target-revision flag only supports revisions based on $MIN_SUPPORTED_TAG and later versions" + echo "Supported: v1.8.0+, main, branches forked from v1.8.0-rc1+, or $LATEST_RELEASE" + exit 1 + fi + else + echo "WARNING: Minimum supported tag '$MIN_SUPPORTED_TAG' not found in git" + echo "Proceeding with target revision '$TARGET_REVISION' (ancestry check skipped)" + return 0 + fi +} + # Validate required arguments if [ -z "$DOMAIN" ]; then echo "ERROR: Domain argument is required" @@ -96,50 +175,37 @@ if [ ! 
-f "${SCRIPT_DIR}/../root/${VALUES_FILE}" ]; then exit 1 fi -# Check if size-specific values file exists (optional overlay) -SIZE_VALUES_FILE="values_${CLUSTER_SIZE}.yaml" -if [ ! -f "${SCRIPT_DIR}/../root/${SIZE_VALUES_FILE}" ]; then - echo "WARNING: Size-specific values file not found: ${SCRIPT_DIR}/../root/${SIZE_VALUES_FILE}" - echo "Proceeding with base values file only: ${VALUES_FILE}" - SIZE_VALUES_FILE="" -fi - -get_target_revision() { - if [ "$DEV_MODE" = true ]; then - CURRENT_BRANCH=$(git rev-parse --abbrev-ref HEAD 2>/dev/null || echo "main") - echo "" - echo "Development mode enabled - ArgoCD will point to live GitHub repository" - echo "Current git branch: $CURRENT_BRANCH" - echo "" - read -p "Use current branch '$CURRENT_BRANCH' for targetRevision? [Y/n/custom_branch]: " choice - - case "$choice" in - n|N|no|No|NO) - echo "Exiting. Please checkout the branch you want to use and run again." - exit 0 - ;; - [Cc]ustom*|custom*) - read -p "Enter custom branch name: " custom_branch - if [ -n "$custom_branch" ]; then - TARGET_REVISION="$custom_branch" - else - echo "ERROR: Custom branch name cannot be empty" - exit 1 - fi - ;; - y|Y|yes|Yes|YES|"") - TARGET_REVISION="$CURRENT_BRANCH" - ;; - *) - # Treat any other input as a custom branch name - TARGET_REVISION="$choice" - ;; - esac - echo "Using targetRevision: $TARGET_REVISION" +# Check if size-specific values file exists +setup_values_files() { + SIZE_VALUES_FILE="values_${CLUSTER_SIZE}.yaml" + + if [ ! 
-f "${SOURCE_ROOT}/root/${SIZE_VALUES_FILE}" ]; then + echo "WARNING: Size-specific values file not found: ${SOURCE_ROOT}/root/${SIZE_VALUES_FILE}" + echo "Proceeding with base values file only: ${VALUES_FILE}" + SIZE_VALUES_FILE="" + else + echo "Using size-specific values file: ${SIZE_VALUES_FILE}" fi } +display_target_revision() { + # Check if TARGET_REVISION was explicitly set via command line flag + # by comparing against the default value + if [ "$TARGET_REVISION" != "$LATEST_RELEASE" ]; then + echo "Using specified targetRevision: $TARGET_REVISION" + else + echo "Using default targetRevision: $TARGET_REVISION" + fi +} + +# Since we only support v1.8.0+, always use local sources +setup_sources() { + SOURCE_ROOT="${SCRIPT_DIR}/.." + echo "Using local sources for target revision: $TARGET_REVISION" +} + pre_cleanup() { + echo "" echo "=== Pre-cleanup: Checking for previous runs ===" # Check if gitea-init-job exists and completed successfully @@ -171,12 +237,16 @@ pre_cleanup() { /tmp/openbao_values.yaml /tmp/openbao_size_values.yaml \ /tmp/gitea_values.yaml /tmp/gitea_size_values.yaml - echo "Pre-cleanup complete" + echo "=== Pre-cleanup complete ===" echo "" } -# Handle dev mode branch selection -get_target_revision +display_target_revision + +# Validate target revision and setup sources +validate_target_revision +setup_sources +setup_values_files # Run pre-cleanup pre_cleanup @@ -188,7 +258,7 @@ echo "Cluster size: $CLUSTER_SIZE" if [ -n "$SIZE_VALUES_FILE" ]; then echo "Size overlay: $SIZE_VALUES_FILE" fi -echo "============================" +echo "=== Starting Bootstrap Process ===" # Check for yq command availability if command -v yq >/dev/null 2>&1; then @@ -200,8 +270,12 @@ else exit 1 fi -# Update the global.clusterSize in the base values file with full filename -$YQ_CMD -i ".global.clusterSize = \"values_${CLUSTER_SIZE}.yaml\"" "${SCRIPT_DIR}/../root/${VALUES_FILE}" +# Update the global.clusterSize in the base values file with mapped filename +if [ -n 
"$SIZE_VALUES_FILE" ]; then + $YQ_CMD -i ".global.clusterSize = \"${SIZE_VALUES_FILE}\"" "${SOURCE_ROOT}/root/${VALUES_FILE}" +else + $YQ_CMD -i ".global.clusterSize = \"values_${CLUSTER_SIZE}.yaml\"" "${SOURCE_ROOT}/root/${VALUES_FILE}" +fi # Function to merge values files early for use throughout the script merge_values_files() { @@ -209,12 +283,12 @@ merge_values_files() { if [ -n "$SIZE_VALUES_FILE" ]; then # Merge base values with size-specific overrides VALUES=$($YQ_CMD eval-all '. as $item ireduce ({}; . * $item)' \ - ${SCRIPT_DIR}/../root/${VALUES_FILE} \ - ${SCRIPT_DIR}/../root/${SIZE_VALUES_FILE} | \ + ${SOURCE_ROOT}/root/${VALUES_FILE} \ + ${SOURCE_ROOT}/root/${SIZE_VALUES_FILE} | \ $YQ_CMD eval ".global.domain = \"${DOMAIN}\"") else # Use base values only - VALUES=$(cat ${SCRIPT_DIR}/../root/${VALUES_FILE} | $YQ_CMD ".global.domain = \"${DOMAIN}\"") + VALUES=$(cat ${SOURCE_ROOT}/root/${VALUES_FILE} | $YQ_CMD ".global.domain = \"${DOMAIN}\"") fi # Write merged values to temp file for use throughout script @@ -233,21 +307,33 @@ get_openbao_value() { $YQ_CMD eval ".apps.openbao.valuesObject.${path}" /tmp/merged_values.yaml } +# Extract version information from app paths +extract_app_versions() { + ARGOCD_VERSION=$($YQ_CMD eval '.apps.argocd.path' /tmp/merged_values.yaml | cut -d'/' -f2) + OPENBAO_VERSION=$($YQ_CMD eval '.apps.openbao.path' /tmp/merged_values.yaml | cut -d'/' -f2) + GITEA_VERSION=$($YQ_CMD eval '.apps.gitea.path' /tmp/merged_values.yaml | cut -d'/' -f2) + + echo "Extracted versions - ArgoCD: $ARGOCD_VERSION, OpenBao: $OPENBAO_VERSION, Gitea: $GITEA_VERSION" +} + # Merge values files early so all subsequent operations can use the merged config merge_values_files +# Extract version information from merged values +extract_app_versions + # Create namespaces kubectl create ns argocd --dry-run=client -o yaml | kubectl apply -f - kubectl create ns cf-gitea --dry-run=client -o yaml | kubectl apply -f - kubectl create ns cf-openbao --dry-run=client 
-o yaml | kubectl apply -f - -# ArgoCD bootstrap -echo "Bootstrapping ArgoCD..." +echo "" +echo "=== ArgoCD Bootstrap ===" # Extract ArgoCD values from merged config and write to temp values file -$YQ_CMD eval '.apps.argocd.valuesObject' ${SCRIPT_DIR}/../root/${VALUES_FILE} > /tmp/argocd_values.yaml -$YQ_CMD eval '.apps.argocd.valuesObject' ${SCRIPT_DIR}/../root/${SIZE_VALUES_FILE} > /tmp/argocd_size_values.yaml +$YQ_CMD eval '.apps.argocd.valuesObject' ${SOURCE_ROOT}/root/${VALUES_FILE} > /tmp/argocd_values.yaml +$YQ_CMD eval '.apps.argocd.valuesObject' ${SOURCE_ROOT}/root/${SIZE_VALUES_FILE} > /tmp/argocd_size_values.yaml # Use server-side apply to match ArgoCD's self-management strategy -helm template --release-name argocd ${SCRIPT_DIR}/../sources/argocd/8.3.5 --namespace argocd \ +helm template --release-name argocd ${SOURCE_ROOT}/sources/argocd/${ARGOCD_VERSION} --namespace argocd \ -f /tmp/argocd_values.yaml \ -f /tmp/argocd_size_values.yaml \ --set global.domain="argocd.${DOMAIN}" \ @@ -257,13 +343,13 @@ kubectl rollout status deploy/argocd-applicationset-controller -n argocd kubectl rollout status deploy/argocd-redis -n argocd kubectl rollout status deploy/argocd-repo-server -n argocd -# OpenBao bootstrap -echo "Bootstrapping OpenBao..." 
+echo "" +echo "=== OpenBao Bootstrap ===" # Extract OpenBao values from merged config -$YQ_CMD eval '.apps.openbao.valuesObject' ${SCRIPT_DIR}/../root/${VALUES_FILE} > /tmp/openbao_values.yaml -$YQ_CMD eval '.apps.openbao.valuesObject' ${SCRIPT_DIR}/../root/${SIZE_VALUES_FILE} > /tmp/openbao_size_values.yaml +$YQ_CMD eval '.apps.openbao.valuesObject' ${SOURCE_ROOT}/root/${VALUES_FILE} > /tmp/openbao_values.yaml +$YQ_CMD eval '.apps.openbao.valuesObject' ${SOURCE_ROOT}/root/${SIZE_VALUES_FILE} > /tmp/openbao_size_values.yaml # Use server-side apply to match ArgoCD's field management strategy -helm template --release-name openbao ${SCRIPT_DIR}/../sources/openbao/0.18.2 --namespace cf-openbao \ +helm template --release-name openbao ${SOURCE_ROOT}/sources/openbao/${OPENBAO_VERSION} --namespace cf-openbao \ -f /tmp/openbao_values.yaml \ -f /tmp/openbao_size_values.yaml \ --set ui.enabled=true \ @@ -272,24 +358,24 @@ kubectl wait --for=jsonpath='{.status.phase}'=Running pod/openbao-0 -n cf-openba # Create initial secrets config for init job (separate from ArgoCD-managed version) echo "Creating initial OpenBao secrets configuration..." -cat ${SCRIPT_DIR}/../sources/openbao-config/0.1.0/templates/openbao-secret-manager-cm.yaml | \ +cat ${SOURCE_ROOT}/sources/openbao-config/0.1.0/templates/openbao-secret-manager-cm.yaml | \ sed "s|name: openbao-secret-manager-scripts|name: openbao-secret-manager-scripts-init|g" | kubectl apply -f - # Create initial secrets config for init job (separate from ArgoCD-managed version) echo "Creating initial OpenBao secrets configuration..." 
-cat ${SCRIPT_DIR}/../sources/openbao-config/0.1.0/templates/openbao-secret-definitions.yaml | \ +cat ${SOURCE_ROOT}/sources/openbao-config/0.1.0/templates/openbao-secret-definitions.yaml | \ sed "s|{{ .Values.domain }}|${DOMAIN}|g" | \ sed "s|name: openbao-secrets-config|name: openbao-secrets-init-config|g" | kubectl apply -f - # Pass OpenBao configuration to init script -helm template --release-name openbao-init ${SCRIPT_DIR}/init-openbao-job \ +helm template --release-name openbao-init ${SOURCE_ROOT}/scripts/init-openbao-job \ -f /tmp/openbao_values.yaml \ --set domain="${DOMAIN}" \ --kube-version=${KUBE_VERSION} | kubectl apply -f - kubectl wait --for=condition=complete --timeout=300s job/openbao-init-job -n cf-openbao -# Gitea bootstrap -echo "Bootstrapping Gitea..." +echo "" +echo "=== Gitea Bootstrap ===" generate_password() { openssl rand -hex 16 | tr 'a-f' 'A-F' | head -c 32 } @@ -304,11 +390,11 @@ kubectl create secret generic gitea-admin-credentials \ --from-literal=password=$(generate_password) \ --dry-run=client -o yaml | kubectl apply -f - -$YQ_CMD eval '.apps.gitea.valuesObject' ${SCRIPT_DIR}/../root/${VALUES_FILE} > /tmp/gitea_values.yaml -$YQ_CMD eval '.apps.gitea.valuesObject' ${SCRIPT_DIR}/../root/${SIZE_VALUES_FILE} > /tmp/gitea_size_values.yaml +$YQ_CMD eval '.apps.gitea.valuesObject' ${SOURCE_ROOT}/root/${VALUES_FILE} > /tmp/gitea_values.yaml +$YQ_CMD eval '.apps.gitea.valuesObject' ${SOURCE_ROOT}/root/${SIZE_VALUES_FILE} > /tmp/gitea_size_values.yaml # Bootstrap Gitea -helm template --release-name gitea ${SCRIPT_DIR}/../sources/gitea/12.3.0 --namespace cf-gitea \ +helm template --release-name gitea ${SOURCE_ROOT}/sources/gitea/${GITEA_VERSION} --namespace cf-gitea \ -f /tmp/gitea_values.yaml \ -f /tmp/gitea_size_values.yaml \ --set gitea.config.server.ROOT_URL="https://gitea.${DOMAIN}/" \ @@ -316,35 +402,35 @@ helm template --release-name gitea ${SCRIPT_DIR}/../sources/gitea/12.3.0 --names kubectl rollout status deploy/gitea -n cf-gitea # 
Gitea Init Job -helm template --release-name gitea-init ${SCRIPT_DIR}/init-gitea-job \ +helm template --release-name gitea-init ${SOURCE_ROOT}/scripts/init-gitea-job \ + --set clusterSize="${SIZE_VALUES_FILE:-values_${CLUSTER_SIZE}.yaml}" \ --set domain="${DOMAIN}" \ - --set clusterSize="values_${CLUSTER_SIZE}.yaml" \ + --set targetRevision="${TARGET_REVISION}" \ --kube-version=${KUBE_VERSION} \ | kubectl apply -f - kubectl wait --for=condition=complete --timeout=300s job/gitea-init-job -n cf-gitea -# Create cluster-forge app-of-apps with merged configuration -echo "Creating ClusterForge app-of-apps (size: $CLUSTER_SIZE)..." -helm template ${SCRIPT_DIR}/../root \ +echo "" +echo "=== Creating ClusterForge App-of-Apps ===" +echo "Cluster size: $CLUSTER_SIZE" +helm template ${SOURCE_ROOT}/root \ -f /tmp/merged_values.yaml \ --kube-version=${KUBE_VERSION} | kubectl apply -f - -echo "" -echo "=== ClusterForge Bootstrap Complete ===" -echo "Domain: $DOMAIN" -echo "Cluster size: $CLUSTER_SIZE" -echo "Access ArgoCD at: https://argocd.${DOMAIN}" -echo "Access Gitea at: https://gitea.${DOMAIN}" -echo "" -if [ "$DEV_MODE" = true ]; then - echo "Mode: Development using non-main targetRevision" -fi -echo "Target revision: $TARGET_REVISION" -echo "Access ArgoCD at: https://argocd.${DOMAIN}" -echo "Access Gitea at: https://gitea.${DOMAIN}" -echo "" -echo "This is the way!" +echo <<__SUMMARY__ + + === ClusterForge Bootstrap Complete ===" + + Domain: $DOMAIN + Cluster size: $CLUSTER_SIZE + Target revision: $TARGET_REVISION + + Access ArgoCD at: https://argocd.${DOMAIN} + Access Gitea at: https://gitea.${DOMAIN} + + This is the way! +__SUMMARY__ # Cleanup temporary files echo "Cleaning up temporary files..." 
diff --git a/scripts/init-gitea-job/values.yaml b/scripts/init-gitea-job/values.yaml index 2790dc92..1813868f 100644 --- a/scripts/init-gitea-job/values.yaml +++ b/scripts/init-gitea-job/values.yaml @@ -1,3 +1,9 @@ -domain: # to be filled by bootstrap script -clusterSize: values_medium.yaml # to be filled by bootstrap script -targetRevision: v1.8.0-rc2 +# small|medium|large (injected by bootstrap script) +clusterSize: null + +# Domain for the cluster (injected by bootstrap script) +# Example: "compute.amd.com" +domain: null + +# Git revision to deploy (injected by bootstrap script) +targetRevision: null \ No newline at end of file From b5a0898d0d169799b4ebbe0c4ae161ba97485956 Mon Sep 17 00:00:00 2001 From: Daniel Vaskivaara Date: Wed, 18 Feb 2026 20:45:21 +0200 Subject: [PATCH 010/115] fix: set targetRevision within cluster-values --- scripts/bootstrap.sh | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/scripts/bootstrap.sh b/scripts/bootstrap.sh index 2c01dea1..5f9d8962 100755 --- a/scripts/bootstrap.sh +++ b/scripts/bootstrap.sh @@ -258,6 +258,17 @@ echo "Cluster size: $CLUSTER_SIZE" if [ -n "$SIZE_VALUES_FILE" ]; then echo "Size overlay: $SIZE_VALUES_FILE" fi +echo "Target revision: $TARGET_REVISION" +echo "" +echo "⚠️ This will bootstrap ClusterForge on your cluster with the above configuration." +echo " Existing ArgoCD, OpenBao, and Gitea resources may be modified or replaced." +echo "" +read -p "Continue with bootstrap? [Y/n]: " -r +echo "" +if [[ $REPLY =~ ^[Nn]$ ]]; then + echo "Bootstrap cancelled by user." 
+ exit 0 +fi echo "=== Starting Bootstrap Process ===" # Check for yq command availability @@ -277,6 +288,10 @@ else $YQ_CMD -i ".global.clusterSize = \"values_${CLUSTER_SIZE}.yaml\"" "${SOURCE_ROOT}/root/${VALUES_FILE}" fi +# Note: clusterForge.targetRevision will be set by the gitea-init-job +# in the cluster-values repository (which overwrites the base values as the final values file) +echo "Target revision $TARGET_REVISION will be set in cluster-values repo by gitea-init-job" + # Function to merge values files early for use throughout the script merge_values_files() { echo "Merging values files..." @@ -291,6 +306,10 @@ merge_values_files() { VALUES=$(cat ${SOURCE_ROOT}/root/${VALUES_FILE} | $YQ_CMD ".global.domain = \"${DOMAIN}\"") fi + # Apply the target revision override (matching what cluster-values repo will contain) + echo "Applying targetRevision override: $TARGET_REVISION" + VALUES=$(echo "$VALUES" | $YQ_CMD eval ".clusterForge.targetRevision = \"${TARGET_REVISION}\"") + # Write merged values to temp file for use throughout script echo "$VALUES" > /tmp/merged_values.yaml echo "Merged values written to /tmp/merged_values.yaml" From ded11b689604d526772439b68ebd6c7a7321fe55 Mon Sep 17 00:00:00 2001 From: Daniel Vaskivaara Date: Wed, 18 Feb 2026 21:07:51 +0200 Subject: [PATCH 011/115] chore: revert defunct feature relics --- root/templates/cluster-forge.yaml | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) diff --git a/root/templates/cluster-forge.yaml b/root/templates/cluster-forge.yaml index 9946a802..4bf42558 100644 --- a/root/templates/cluster-forge.yaml +++ b/root/templates/cluster-forge.yaml @@ -9,8 +9,6 @@ spec: {{- if .Values.externalValues.enabled }} # helm-chart & values file from 2 different git repos # Uses the SAME targetRevision for both chart templates AND values - # This ensures version consistency - v1.7.0 uses v1.7.0 templates (no clusterSize) - # and v1.8.0+ uses v1.8.0+ templates (with clusterSize support) sources: - repoURL: 
{{ .Values.clusterForge.repoUrl }} targetRevision: {{ .Values.clusterForge.targetRevision }} @@ -18,16 +16,13 @@ spec: helm: valueFiles: - {{ .Values.externalValues.path }} - {{- if .Values.global.clusterSize }} - {{ .Values.global.clusterSize }} - {{- end }} - $cluster-values/values.yaml - repoURL: {{ .Values.externalValues.repoUrl }} targetRevision: {{ .Values.externalValues.targetRevision }} ref: cluster-values {{- else }} # helm-chart & values file within the same git repo - # Uses targetRevision for both chart templates AND values source: repoURL: {{ .Values.clusterForge.repoUrl }} targetRevision: {{ .Values.clusterForge.targetRevision }} @@ -35,9 +30,7 @@ spec: helm: valueFiles: - {{ .Values.clusterForge.valuesFile }} - {{- if .Values.global.clusterSize }} - {{ .Values.global.clusterSize }} - {{- end }} {{- end }} destination: server: https://kubernetes.default.svc @@ -45,4 +38,4 @@ spec: syncPolicy: automated: prune: true - selfHeal: true + selfHeal: true \ No newline at end of file From fda6ed43f46eedf9b04ab7c07e7ed71ac633b855 Mon Sep 17 00:00:00 2001 From: woojae-siloai Date: Thu, 19 Feb 2026 11:15:24 +0200 Subject: [PATCH 012/115] docs: add one line at bootstrap_guide --- docs/bootstrap_guide.md | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/bootstrap_guide.md b/docs/bootstrap_guide.md index 4b6670c6..6874bafe 100644 --- a/docs/bootstrap_guide.md +++ b/docs/bootstrap_guide.md @@ -41,6 +41,7 @@ This guide explains how to bootstrap a complete GitOps environment using Cluster The bootstrap script uses a three-phase deployment model: ### Phase 1: Pre-Cleanup +- The pre_cleanup function performs selective cleanup, only affects cf-gitea and cf-openbao namespaces - Detects previous installations by checking for completed gitea-init-job - Removes Gitea resources to enable fresh deployment - Deletes OpenBao initialization jobs and temporary files From a244207b94193f81b2084b51ea023f782eb6e372 Mon Sep 17 00:00:00 2001 From: brownzebra Date: Thu, 19 Feb 2026 
12:38:00 +0200 Subject: [PATCH 013/115] chore: bump airm to 0.3.4 --- sources/airm/0.3.4/.helmignore | 27 ++ sources/airm/0.3.4/Chart.yaml | 35 ++ .../airm/0.3.4/charts/airm-api/.helmignore | 27 ++ sources/airm/0.3.4/charts/airm-api/Chart.yaml | 29 ++ sources/airm/0.3.4/charts/airm-api/README.md | 124 ++++++ .../0.3.4/charts/airm-api/files/configure.sh | 374 ++++++++++++++++++ .../charts/airm-api/templates/_helpers.tpl | 15 + .../airm-api/templates/airm-app-backend.yaml | 268 +++++++++++++ .../airm-api/templates/airm-app-frontend.yaml | 85 ++++ .../airm-api/templates/airm-cert-issuer.yaml | 26 ++ .../templates/airm-cluster-roles.yaml | 34 ++ .../airm-cluster-runtime-config.yaml | 19 + .../airm-api/templates/airm-cluster.yaml | 47 +++ .../templates/airm-configure-job.yaml | 172 ++++++++ .../charts/airm-api/templates/airm-es.yaml | 215 ++++++++++ .../airm-api/templates/airm-httproute.yaml | 81 ++++ .../templates/airm-rabbitmq-cluster.yaml | 69 ++++ .../templates/airm-vllm-collector.yaml | 93 +++++ .../airm/0.3.4/charts/airm-api/values.yaml | 166 ++++++++ .../0.3.4/charts/airm-dispatcher/.helmignore | 27 ++ .../0.3.4/charts/airm-dispatcher/Chart.yaml | 29 ++ .../0.3.4/charts/airm-dispatcher/README.md | 54 +++ .../templates/airm-cluster-policies.yaml | 352 +++++++++++++++++ .../templates/airm-cluster-roles.yaml | 164 ++++++++ .../templates/airm-dispatcher-app.yaml | 343 ++++++++++++++++ .../templates/kyverno-cluster-role.yaml | 35 ++ .../0.3.4/charts/airm-dispatcher/values.yaml | 27 ++ sources/airm/0.3.4/values.yaml | 3 + 28 files changed, 2940 insertions(+) create mode 100644 sources/airm/0.3.4/.helmignore create mode 100644 sources/airm/0.3.4/Chart.yaml create mode 100644 sources/airm/0.3.4/charts/airm-api/.helmignore create mode 100644 sources/airm/0.3.4/charts/airm-api/Chart.yaml create mode 100644 sources/airm/0.3.4/charts/airm-api/README.md create mode 100644 sources/airm/0.3.4/charts/airm-api/files/configure.sh create mode 100644 
sources/airm/0.3.4/charts/airm-api/templates/_helpers.tpl create mode 100644 sources/airm/0.3.4/charts/airm-api/templates/airm-app-backend.yaml create mode 100644 sources/airm/0.3.4/charts/airm-api/templates/airm-app-frontend.yaml create mode 100644 sources/airm/0.3.4/charts/airm-api/templates/airm-cert-issuer.yaml create mode 100644 sources/airm/0.3.4/charts/airm-api/templates/airm-cluster-roles.yaml create mode 100644 sources/airm/0.3.4/charts/airm-api/templates/airm-cluster-runtime-config.yaml create mode 100644 sources/airm/0.3.4/charts/airm-api/templates/airm-cluster.yaml create mode 100644 sources/airm/0.3.4/charts/airm-api/templates/airm-configure-job.yaml create mode 100644 sources/airm/0.3.4/charts/airm-api/templates/airm-es.yaml create mode 100644 sources/airm/0.3.4/charts/airm-api/templates/airm-httproute.yaml create mode 100644 sources/airm/0.3.4/charts/airm-api/templates/airm-rabbitmq-cluster.yaml create mode 100644 sources/airm/0.3.4/charts/airm-api/templates/airm-vllm-collector.yaml create mode 100644 sources/airm/0.3.4/charts/airm-api/values.yaml create mode 100644 sources/airm/0.3.4/charts/airm-dispatcher/.helmignore create mode 100644 sources/airm/0.3.4/charts/airm-dispatcher/Chart.yaml create mode 100644 sources/airm/0.3.4/charts/airm-dispatcher/README.md create mode 100644 sources/airm/0.3.4/charts/airm-dispatcher/templates/airm-cluster-policies.yaml create mode 100644 sources/airm/0.3.4/charts/airm-dispatcher/templates/airm-cluster-roles.yaml create mode 100644 sources/airm/0.3.4/charts/airm-dispatcher/templates/airm-dispatcher-app.yaml create mode 100644 sources/airm/0.3.4/charts/airm-dispatcher/templates/kyverno-cluster-role.yaml create mode 100644 sources/airm/0.3.4/charts/airm-dispatcher/values.yaml create mode 100644 sources/airm/0.3.4/values.yaml diff --git a/sources/airm/0.3.4/.helmignore b/sources/airm/0.3.4/.helmignore new file mode 100644 index 00000000..64c2af83 --- /dev/null +++ b/sources/airm/0.3.4/.helmignore @@ -0,0 +1,27 @@ +# 
Copyright © Advanced Micro Devices, Inc., or its affiliates. +# +# SPDX-License-Identifier: MIT + +# Patterns to ignore when building packages. +# This supports shell glob matching, relative path matching, and +# negation (prefixed with !). Only one pattern per line. +.DS_Store +# Common VCS dirs +.git/ +.gitignore +.bzr/ +.bzrignore +.hg/ +.hgignore +.svn/ +# Common backup files +*.swp +*.bak +*.tmp +*.orig +*~ +# Various IDEs +.project +.idea/ +*.tmproj +.vscode/ diff --git a/sources/airm/0.3.4/Chart.yaml b/sources/airm/0.3.4/Chart.yaml new file mode 100644 index 00000000..4879c7c6 --- /dev/null +++ b/sources/airm/0.3.4/Chart.yaml @@ -0,0 +1,35 @@ +# Copyright © Advanced Micro Devices, Inc., or its affiliates. +# +# SPDX-License-Identifier: MIT + +--- +apiVersion: v2 +name: airm +description: A Helm chart for AIRM full stack, including API, UI and dispatcher + +# A chart can be either an 'application' or a 'library' chart. +# +# Application charts are a collection of templates that can be packaged into versioned archives +# to be deployed. +# +# Library charts provide useful utilities or functions for the chart developer. They're included as +# a dependency of application charts to inject those utilities and functions into the rendering +# pipeline. Library charts do not define any templates and therefore cannot be deployed. +type: application + +# This is the chart version. This version number should be incremented each time you make changes +# to the chart and its templates, including the app version. +# Versions are expected to follow Semantic Versioning (https://semver.org/) +version: 0.3.4 + +# This is the version number of the application being deployed. This version number should be +# incremented each time you make changes to the application. Versions are not expected to +# follow Semantic Versioning. They should reflect the version the application is using. +# It is recommended to use it with quotes. 
+appVersion: "1.0.0" + +dependencies: + - name: airm-api + version: 0.3.4 + - name: airm-dispatcher + version: 0.3.4 diff --git a/sources/airm/0.3.4/charts/airm-api/.helmignore b/sources/airm/0.3.4/charts/airm-api/.helmignore new file mode 100644 index 00000000..64c2af83 --- /dev/null +++ b/sources/airm/0.3.4/charts/airm-api/.helmignore @@ -0,0 +1,27 @@ +# Copyright © Advanced Micro Devices, Inc., or its affiliates. +# +# SPDX-License-Identifier: MIT + +# Patterns to ignore when building packages. +# This supports shell glob matching, relative path matching, and +# negation (prefixed with !). Only one pattern per line. +.DS_Store +# Common VCS dirs +.git/ +.gitignore +.bzr/ +.bzrignore +.hg/ +.hgignore +.svn/ +# Common backup files +*.swp +*.bak +*.tmp +*.orig +*~ +# Various IDEs +.project +.idea/ +*.tmproj +.vscode/ diff --git a/sources/airm/0.3.4/charts/airm-api/Chart.yaml b/sources/airm/0.3.4/charts/airm-api/Chart.yaml new file mode 100644 index 00000000..4bddec9c --- /dev/null +++ b/sources/airm/0.3.4/charts/airm-api/Chart.yaml @@ -0,0 +1,29 @@ +# Copyright © Advanced Micro Devices, Inc., or its affiliates. +# +# SPDX-License-Identifier: MIT + +--- +apiVersion: v2 +name: airm-api +description: A Helm chart for AIRM API and UI + +# A chart can be either an 'application' or a 'library' chart. +# +# Application charts are a collection of templates that can be packaged into versioned archives +# to be deployed. +# +# Library charts provide useful utilities or functions for the chart developer. They're included as +# a dependency of application charts to inject those utilities and functions into the rendering +# pipeline. Library charts do not define any templates and therefore cannot be deployed. +type: application + +# This is the chart version. This version number should be incremented each time you make changes +# to the chart and its templates, including the app version. 
+# Versions are expected to follow Semantic Versioning (https://semver.org/) +version: 0.3.4 + +# This is the version number of the application being deployed. This version number should be +# incremented each time you make changes to the application. Versions are not expected to +# follow Semantic Versioning. They should reflect the version the application is using. +# It is recommended to use it with quotes. +appVersion: "1.0.0" diff --git a/sources/airm/0.3.4/charts/airm-api/README.md b/sources/airm/0.3.4/charts/airm-api/README.md new file mode 100644 index 00000000..a16ec9da --- /dev/null +++ b/sources/airm/0.3.4/charts/airm-api/README.md @@ -0,0 +1,124 @@ + + +# HELM CHARTS + +Simple instructions to deploy AIRM UI and API applications using helm chart + +### 1. Requirements + +The following external components must be available in the Kubernetes cluster before the helm chart can be installed. + +- API Gateway implementation (e.g. KGateway) +- Keycloak with the expected `airm` realm installed +- Valid S3 compatible file storage service (e.g. MinIO) +- RabbitMQ operator +- Cert Manager operator +- External Secret operator +- CNPG operator +- OTEL LGTM stack installed on the cluster + +### 2. Install + +``` +cd helm/airm/charts + +# 1. Create output template just to validate (the public domain could be app-dev.silogen.ai, staging.silogen.ai, etc.) +helm template airm-api ./airm-api -n airm --create-namespace --set airm.appDomain= > airm-api-helm-generated.yaml + +# 2. Run chart install +helm install airm-api ./airm-api -n airm --create-namespace --set airm.appDomain= + +# 3. Delete chart if needed +helm delete airm-api -n airm + +# 4. Upgrade when bumping versions +helm upgrade -n airm --set airm.appDomain= airm-api ./airm-api +``` + +--- + +### 3. 
Helm Settings + +| Field Path | Description | Type | Example / Default | +|-------------------------------------------------------------------------------|-----------------------------------------------------------------| ------ |---------------------------------------------------------------------------------------------------| +| secretgenerator.image.repository | Docker image repository for secret generator | string | `ghcr.io/silogen/kubectl` | +| secretgenerator.image.tag | Docker image tag | string | `latest` | +| secretgenerator.image.pullPolicy | Image pull policy | string | `IfNotPresent` | +| kgateway.namespace | Namespace for kgateway resources | string | `kgateway-system` | +| kgateway.gatewayName | Gateway name | string | `https` | +| kgateway.airmapi.servicePort | Service port for airmapi | int | `80` | +| kgateway.airmapi.prefixValue | URL prefix for airmapi service | string | `airmapi` | +| kgateway.airmui.servicePort | Service port for airmui | int | `80` | +| kgateway.airmui.prefixValue | URL prefix for airmui service | string | `airmui` | +| aims.otelCollector.exporters.otlphttp.endpoint | Open Telemetry collector endpoint url for inference metrics | string | `http://lgtm-stack.otel-lgtm-stack.svc:4318` | +| aims.otelCollector.image | Base image for Open Telemetry Collector | string | `ghcr.io/open-telemetry/opentelemetry-collector-releases/opentelemetry-collector-contrib:0.113.0` | +| aims.otelCollector.receivers.prometheus.config.scrape_configs.scrape_interval | Inference metrics scraping interval | string | `20s` | +| airm.includeDemoSetup | Include the demo organization and project setup when installing | bool | `true` | +| airm.appDomain | Public IP or domain for airm | string | `PUBLIC-IP` | +| airm.externalSecretStore.airm.name | Secret store name for airm | string | `airm-secret-store` | +| airm.externalSecretStore.minio.name | Secret store name for minio | string | `k8s-secret-store` | +| airm.externalSecretStore.keycloak.name | Secret 
store name for keycloak | string | `keycloak-secret-store` | +| airm.keycloak.publicUrl | Public URL to access keycloak | string | `https://kc.{{ .Values.airm.appDomain }}` | +| airm.keycloak.internalUrl | Internal URL to access keycloak | string | `http://keycloak.keycloak.svc.cluster.local:8080` | +| airm.keycloak.clientId | Client ID to access keycloak | string | `354a0fa1-35ac-4a6d-9c4d-d661129c2cd0` | +| airm.keycloak.realm | Keycloak realm for authentication | string | `airm` | +| airm.postgresql.cnpg.image | PostgreSQL container image | string | `ghcr.io/cloudnative-pg/postgresql:17` | +| airm.postgresql.cnpg.instance | Number of PostgreSQL instances | int | `1` | +| airm.postgresql.cnpg.resources.limits.cpu | CPU limit for PostgreSQL container | string | `"2"` | +| airm.postgresql.cnpg.resources.limits.memory | Memory limit for PostgreSQL container | string | `1Gi` | +| airm.postgresql.cnpg.resources.requests.cpu | CPU request for PostgreSQL container | string | `"1"` | +| airm.postgresql.cnpg.resources.requests.memory | Memory request for PostgreSQL container | string | `512Mi` | +| airm.postgresql.cnpg.storage.size | Storage size for PostgreSQL | string | `50Gi` | +| airm.postgresql.cnpg.storage.storageClass | Storage class for PostgreSQL | string | `default` | +| airm.postgresql.cnpg.walStorage.size | WAL storage size for PostgreSQL | string | `50Gi` | +| airm.postgresql.cnpg.walStorage.storageClass | WAL storage class for PostgreSQL | string | `default` | +| airm.rabbitmq.replicas | Number of replicas for the RabbitMQ cluster | int | `1` | +| airm.rabbitmq.resources.limits.cpu | CPU limit for for the RabbitMQ cluster | string | `1` | +| airm.rabbitmq.resources.limits.memory | Memory limit for for the RabbitMQ cluster | string | `1Gi` | +| airm.rabbitmq.resources.requests.cpu | CPU request for the RabbitMQ cluster | string | `500m` | +| airm.rabbitmq.resources.requests.memory | Memory request for the RabbitMQ cluster | string | `1Gi` | +| 
airm.rabbitmq.persistence.storage | Persistent storage size for the RabbitMQ cluster | string | `20Gi` | +| airm.rabbitmq.persistence.storageClassName | Storage class name for the RabbitMQ cluster | string | `default` | +| airm.rabbitmq.backup.enabled | Enable RabbitMQ backup | bool | `false` | +| airm.rabbitmq.backup.image | RabbitMQ backup container image | string | `amdenterpriseai/rabbitmq-backup:0.1` | +| airm.rabbitmq.backup.resources.limits.memory | Memory limit for cron job of RabbitMQ backup | string | `512Mi` | +| airm.rabbitmq.backup.resources.requests.cpu | CPU request for cron job of RabbitMQ backup | string | `250m` | +| airm.rabbitmq.backup.resources.requests.memory | Memory request for cron job of RabbitMQ backup | string | `256Mi` | +| airm.frontend.image.repository | Frontend image repository | string | `amdenterpriseai/airm-ui` | +| airm.frontend.image.tag | Frontend image tag | string | `v2025.08-rc.21` | +| airm.frontend.image.pullPolicy | Frontend image pull policy | string | `IfNotPresent` | +| airm.frontend.servicePort | Frontend service port | int | `80` | +| airm.frontend.resources.limits.memory | Memory limit for frontend | string | `4Gi` | +| airm.frontend.resources.requests.cpu | CPU request for frontend | string | `500m` | +| airm.frontend.resources.requests.memory | Memory request for frontend | string | `4Gi` | +| airm.backend.image.repository | Backend API image repository | string | `amdenterpriseai/airm-api` | +| airm.backend.image.tag | Backend API image tag | string | `v2025.08-rc.21` | +| airm.backend.image.pullPolicy | Backend API image pull policy | string | `IfNotPresent` | +| airm.backend.servicePort | Backend API service port | int | `80` | +| airm.backend.servicePortMetrics | Backend API metrics service port | int | `9009` | +| airm.backend.env.dbPort | Database port | int | `5432` | +| airm.backend.env.rabbitmqPort | RabbitMQ port | int | `5672` | +| airm.backend.env.minioUrl | Minio service URL | string | 
`http://minio.minio-tenant-default.svc.cluster.local:80` | +| airm.backend.env.minioBucket | Minio bucket name | string | `default-bucket` | +| airm.backend.env.prometheusUrl | Prometheus service URL | string | `http://lgtm-stack.otel-lgtm-stack.svc.cluster.local:9090` | +| airm.backend.env.clusterAuthUrl | Cluster auth service URL | string | `http://cluster-auth.cluster-auth.svc.cluster.local:8081` | +| airm.backend.resources.limits.memory | Memory limit for backend API container | string | `1Gi` | +| airm.backend.resources.requests.cpu | CPU request for backend API container | string | `500m` | +| airm.backend.resources.requests.memory | Memory request for backend API container | string | `1Gi` | +| airm.backend.securityContext.allowPrivilegeEscalation | Security context: allow privilege escalation | bool | `false` | +| airm.backend.securityContext.runAsNonRoot | Security context: run container as non-root | bool | `true` | +| airm.backend.securityContext.runAsUser | Security context: user ID to run container as | int | `1000` | +| airm.backend.securityContext.seccompProfile.type | Security context: seccomp profile type | string | `RuntimeDefault` | +| airm.utilities.netcat.image.repository | Netcat image repository | string | `busybox` | +| airm.utilities.netcat.image.tag | Netcat image tag | string | `1.37.0` | +| airm.utilities.netcat.image.pullPolicy | Netcat image pull policy | string | `IfNotPresent` | +| airm.utilities.curl.image.repository | Curl image repository | string | `curlimages/curl` | +| airm.utilities.curl.image.tag | Curl image tag | string | `8.16.0` | +| airm.utilities.curl.image.pullPolicy | Curl image pull policy | string | `IfNotPresent` | +| airm.utilities.liquibase.image.repository | Liquibase image repository | string | `docker.io/liquibase/liquibase` | +| airm.utilities.liquibase.image.tag | Liquibase image tag | string | `4.31` | +| airm.utilities.liquibase.image.pullPolicy | Liquibase image pull policy | string | `IfNotPresent` | 
diff --git a/sources/airm/0.3.4/charts/airm-api/files/configure.sh b/sources/airm/0.3.4/charts/airm-api/files/configure.sh new file mode 100644 index 00000000..69a3f59d --- /dev/null +++ b/sources/airm/0.3.4/charts/airm-api/files/configure.sh @@ -0,0 +1,374 @@ +#!/bin/bash + +# Copyright © Advanced Micro Devices, Inc., or its affiliates. +# +# SPDX-License-Identifier: MIT + +##################################################################################### +echo "" +echo "Run configure script block..." +echo "" + +# --- Configuration Variables --- +# Get values from bloom configmap mounted as env + +# NOTE: ORG_NAME is hardcoded to demo because gpu operator metrics has same org name hardcoded there +# Otherwise the following line can be uncommented to consider the real org name from domain config +# ORG_NAME=$(echo $NEW_DOMAIN_NAME | awk -F '.' '{ print $2 }') +ORG_NAME="demo" +ORG_DOMAINS="[\"${NEW_DOMAIN_NAME}\"]" +CLUSTER_WORKLOADS_BASE_URL="https://workspaces.${NEW_DOMAIN_NAME}/" +CLUSTER_KUBE_API_URL="https://k8s.${NEW_DOMAIN_NAME}" +USER_EMAIL="devuser@${NEW_DOMAIN_NAME}" +PROJECT_NAME="demo" +PROJECT_DESCRIPTION="demo" +CLUSTER_NAME="demo-cluster" +TIMEOUT=300 +SLEEP_INTERVAL=5 + +# --- Input Validation --- +echo "Validating environment variables..." +echo "KEYCLOAK_CLIENT_ID: ${KEYCLOAK_CLIENT_ID}" +echo "NEW_DOMAIN_NAME: ${NEW_DOMAIN_NAME}" +echo "AIRM_API_URL: ${AIRM_API_URL}" + +function check_env_variable() { + if [ -z "${!1}" ]; then + echo "ERROR: $1 environment variable is not set." 
+ exit 1 + fi +} + +function check_success() { + if [ "$1" -ne 0 ]; then + echo "ERROR: $2" + exit 1 + fi +} + +check_env_variable "AIRM_API_URL" +check_env_variable "KEYCLOAK_URL" +check_env_variable "KEYCLOAK_REALM" +check_env_variable "KEYCLOAK_CLIENT_SECRET" +check_env_variable "KEYCLOAK_CLIENT_ID" +check_env_variable "KEYCLOAK_ADMIN_CLIENT_ID" +check_env_variable "KEYCLOAK_ADMIN_CLIENT_SECRET" +check_env_variable "USER_PASSWORD" + +function refresh_token() { + TOKEN=$(curl -s -d "client_id=${KEYCLOAK_CLIENT_ID}" -d "username=${USER_EMAIL}" -d "password=${USER_PASSWORD}" -d 'grant_type=password' -d "client_secret=${KEYCLOAK_CLIENT_SECRET}" "${KEYCLOAK_URL}/realms/${KEYCLOAK_REALM}/protocol/openid-connect/token" | jq -r '.access_token') + if [ -z "$TOKEN" ] || [ "$TOKEN" == "null" ]; then + echo "ERROR: Failed to obtain access token from Keycloak." + exit 1 + fi +} + +function create_org() { + # Try to get ORG_ID by name + ORG_ID=$(curl -s -X GET "${AIRM_API_URL}/v1/organizations" \ + -H "Authorization: Bearer ${TOKEN}" \ + -H 'Content-Type: application/json' | jq -r --arg name "$ORG_NAME" '.organizations[] | select(.name==$name) | .id') + + # If not found, create the org and fetch the ID again + if [ -z "$ORG_ID" ] || [ "$ORG_ID" == "null" ]; then + ORG_RESP=$(curl -s -o /dev/null -X POST -w "%{http_code}" "${AIRM_API_URL}/v1/organizations" \ + -H "Authorization: Bearer ${TOKEN}" \ + -H 'Content-Type: application/json' \ + -d "{ \"name\": \"$ORG_NAME\", \"domains\": $ORG_DOMAINS }") + echo "$ORG_RESP" + check_success "$([[ "$ORG_RESP" == "200" || "$ORG_RESP" == "201" ]] && echo 0 || echo 1)" "Failed to create organization" + + ORG_ID=$(curl -s -X GET "${AIRM_API_URL}/v1/organizations" \ + -H "Authorization: Bearer ${TOKEN}" \ + -H 'Content-Type: application/json' | jq -r --arg name "$ORG_NAME" '.organizations[] | select(.name==$name) | .id') + fi + + if [ -z "$ORG_ID" ] || [ "$ORG_ID" == "null" ]; then + echo "ERROR: Failed to create or retrieve organization 
ID." + exit 1 + else + echo "ORG_ID=${ORG_ID}" + fi +} + +function add_user_to_org() { + # Check if user exists in org + USER_EXISTS=$(curl -s -X GET "${AIRM_API_URL}/v1/users" -H 'accept: application/json' -H "Authorization: Bearer ${TOKEN}" -H 'Content-Type: application/json' | jq -r --arg email "$USER_EMAIL" '.data? // [] | .[] | select(.email==$email) | .email') + # Add user to org if they don't exist + if [ -z "$USER_EXISTS" ] || [ "$USER_EXISTS" == "null" ]; then + echo "$USER_EXISTS" + echo "User '$USER_EMAIL' not found in organization. Adding..." + ADD_USER_RESP=$(curl -w "%{http_code}" -X 'POST' "${AIRM_API_URL}/v1/organizations/${ORG_ID}/users" -H 'accept: application/json' -H "Authorization: Bearer ${TOKEN}" -H 'Content-Type: application/json' -d '{ "email": "'"$USER_EMAIL"'", "roles": ["Platform Administrator"]}') + echo "$ADD_USER_RESP" + check_success "$([[ "$ADD_USER_RESP" == "200" || "$ADD_USER_RESP" == "201" || "$ADD_USER_RESP" == "null201" ]] && echo 0 || echo 1)" "Failed to add user to organization" + else + echo "User '$USER_EMAIL' already exists in organization." + fi +} + +function create_project() { + PROJECT_ID=$(curl -s -X GET "${AIRM_API_URL}/v1/projects" -H 'accept: application/json' -H "Authorization: Bearer ${TOKEN}" | jq -r '.projects[] | select(.name=="'$PROJECT_NAME'") | .id') + + for (( i=0; i<=TIMEOUT; i+=SLEEP_INTERVAL )); do + CLUSTER_STATUS=$(curl -s -X GET "${AIRM_API_URL}/v1/clusters/$CLUSTER_ID" \ + -H "Authorization: Bearer ${TOKEN}" \ + -H 'Content-Type: application/json' | jq -r '.status') + + if [ "$CLUSTER_STATUS" == "healthy" ]; then + echo "Cluster is healthy!" + break # Exit the loop if the cluster is healthy + fi + echo "Cluster status: $CLUSTER_STATUS. Waiting $SLEEP_INTERVAL seconds... ($i/$TIMEOUT seconds elapsed)" + sleep $SLEEP_INTERVAL + done + + if [ "$CLUSTER_STATUS" != "healthy" ]; then + echo "ERROR: Cluster did not become healthy within $TIMEOUT seconds." 
+ exit 1 + fi + + if [ -z "$PROJECT_ID" ] || [ "$PROJECT_ID" == "null" ]; then + echo "Projects '$PROJECT_NAME' not found. Creating..." + PROJECT_ID=$(curl -X 'POST' \ + "${AIRM_API_URL}/v1/projects" \ + -H 'accept: application/json' \ + -H "Authorization: Bearer ${TOKEN}" \ + -H 'Content-Type: application/json' \ + -d '{ + "name": "'"$PROJECT_NAME"'", + "description": "'"$PROJECT_DESCRIPTION"'", + "cluster_id": "'"$CLUSTER_ID"'", + "quota": { + "cpu_milli_cores": 0, + "memory_bytes": 0, + "ephemeral_storage_bytes": 0, + "gpu_count": 0 + } + }' | jq -r '.id') + echo "$PROJECT_ID" + check_success "$([[ "$PROJECT_ID" != "null" ]] && echo 0 || echo 1)" "Failed to create project" + else + echo "Project '$PROJECT_NAME' already exists with ID: $PROJECT_ID" + fi +} + +function add_minio_secret_and_storage_to_project() { + for (( i=0; i<=TIMEOUT; i+=SLEEP_INTERVAL )); do + PROJECT_STATUS=$(curl -s -X GET "${AIRM_API_URL}/v1/projects/$PROJECT_ID" \ + -H "Authorization: Bearer ${TOKEN}" \ + -H 'Content-Type: application/json' | jq -r '.status') + + if [ "$PROJECT_STATUS" == "Ready" ]; then + echo "Project is ready!" + break # Exit the loop if the project is ready + fi + echo "Project status: $PROJECT_STATUS. Waiting $SLEEP_INTERVAL seconds... ($i/$TIMEOUT seconds elapsed)" + sleep $SLEEP_INTERVAL + done + + SECRET_NAME="minio-credentials-fetcher" + STORAGE_NAME="minio-storage" + + SECRET_IN_PROJECT=$(curl -X 'GET' \ + "${AIRM_API_URL}/v1/projects/${PROJECT_ID}/secrets" \ + -H 'accept: application/json' \ + -H "Authorization: Bearer ${TOKEN}" | jq -r '.project_secrets[] | select(.secret.name=="'"$SECRET_NAME"'") | .id') + EXTERNAL_SECRET_API_VERSION="v1beta1" + EXTERNAL_SECRET_MANIFEST=$(cat < /dev/null 2>&1; then + echo "AIRM API is ready!" + break + else + echo "Waiting for AIRM API..." + sleep 10 + fi + done + + echo "All dependencies are ready!" 
+ securityContext: + allowPrivilegeEscalation: false + runAsNonRoot: false + runAsUser: 0 + capabilities: + drop: + - ALL + add: + - SETUID + - SETGID + - CHOWN + - DAC_OVERRIDE + - FOWNER + - FSETID + seccompProfile: + type: RuntimeDefault + containers: + - name: configure + image: "{{ .Values.airm.utilities.clusterTool.image.repository }}:{{ .Values.airm.utilities.clusterTool.image.tag }}" + imagePullPolicy: "{{ .Values.airm.utilities.clusterTool.image.pullPolicy }}" + command: ["/bin/bash"] + args: ["/scripts/configure.sh"] + env: + - name: DEBIAN_FRONTEND + value: "noninteractive" + - name: ORG_NAME + value: "demo" + - name: NEW_DOMAIN_NAME + value: "{{ .Values.airm.appDomain }}" + - name: KEYCLOAK_CLIENT_ID + value: "{{ .Values.airm.keycloak.clientId }}" + - name: KEYCLOAK_CLIENT_SECRET + valueFrom: + secretKeyRef: + key: KEYCLOAK_SECRET + name: "{{ .Release.Name }}-keycloak-ui-creds" + - name: USER_EMAIL + value: "devuser@{{ .Values.airm.appDomain }}" + - name: KEYCLOAK_URL + value: "{{ .Values.airm.keycloak.internalUrl }}" + - name: KEYCLOAK_REALM + value: "{{ .Values.airm.keycloak.realm }}" + - name: KEYCLOAK_ADMIN_CLIENT_ID + valueFrom: + secretKeyRef: + key: client-id + name: "{{ .Release.Name }}-keycloak-admin-client" + - name: KEYCLOAK_ADMIN_CLIENT_SECRET + valueFrom: + secretKeyRef: + key: client-secret + name: "{{ .Release.Name }}-keycloak-admin-client" + - name: AIRM_API_URL + value: "http://{{ .Release.Name }}-api.{{ .Release.Namespace }}.svc.cluster.local" + - name: USER_PASSWORD + valueFrom: + secretKeyRef: + key: USER_PASSWORD + name: "{{ .Release.Name }}-user-credentials" + volumeMounts: + - name: configure-script + mountPath: /scripts + securityContext: + allowPrivilegeEscalation: false + runAsNonRoot: false + runAsUser: 0 + capabilities: + drop: + - ALL + add: + - SETUID + - SETGID + - CHOWN + - DAC_OVERRIDE + - FOWNER + - FSETID + seccompProfile: + type: RuntimeDefault + volumes: + - name: configure-script + configMap: + name: "{{ 
.Release.Name }}-configure-script" + defaultMode: 0755 + +{{- end }} diff --git a/sources/airm/0.3.4/charts/airm-api/templates/airm-es.yaml b/sources/airm/0.3.4/charts/airm-api/templates/airm-es.yaml new file mode 100644 index 00000000..4dd18aeb --- /dev/null +++ b/sources/airm/0.3.4/charts/airm-api/templates/airm-es.yaml @@ -0,0 +1,215 @@ +# Copyright © Advanced Micro Devices, Inc., or its affiliates. +# +# SPDX-License-Identifier: MIT + +--- +apiVersion: external-secrets.io/v1beta1 +kind: ExternalSecret +metadata: + name: "{{ .Release.Name }}-cnpg-superuser" + namespace: "{{ .Release.Namespace }}" + annotations: + helm.sh/hook: pre-install +spec: + data: + - remoteRef: + key: airm-cnpg-superuser-username + property: value + secretKey: username + - remoteRef: + key: airm-cnpg-superuser-password + property: value + secretKey: password + secretStoreRef: + kind: ClusterSecretStore + name: {{ .Values.airm.externalSecretStore.airm.name }} + target: + name: "{{ .Release.Name }}-cnpg-superuser" + template: + type: Opaque +--- +apiVersion: external-secrets.io/v1beta1 +kind: ExternalSecret +metadata: + name: "{{ .Release.Name }}-cnpg-user" + namespace: "{{ .Release.Namespace }}" + annotations: + helm.sh/hook: pre-install +spec: + data: + - remoteRef: + key: airm-cnpg-user-username + property: value + secretKey: username + - remoteRef: + key: airm-cnpg-user-password + property: value + secretKey: password + secretStoreRef: + kind: ClusterSecretStore + name: {{ .Values.airm.externalSecretStore.airm.name }} + target: + name: "{{ .Release.Name }}-cnpg-user" + template: + type: Opaque +--- +apiVersion: external-secrets.io/v1beta1 +kind: ExternalSecret +metadata: + name: "{{ .Release.Name }}-keycloak-admin-client" + namespace: "{{ .Release.Namespace }}" + annotations: + helm.sh/hook: pre-install +spec: + data: + - remoteRef: + key: airm-keycloak-admin-client-id + property: value + secretKey: client-id + - remoteRef: + key: airm-keycloak-admin-client-secret + property: value + 
secretKey: client-secret + secretStoreRef: + kind: ClusterSecretStore + name: {{ .Values.airm.externalSecretStore.keycloak.name }} + target: + name: "{{ .Release.Name }}-keycloak-admin-client" + template: + type: Opaque +--- +apiVersion: external-secrets.io/v1beta1 +kind: ExternalSecret +metadata: + name: "{{ .Release.Name }}-rabbitmq-admin" + namespace: "{{ .Release.Namespace }}" + annotations: + helm.sh/hook: pre-install +spec: + data: + - remoteRef: + key: airm-rabbitmq-user-username + property: value + secretKey: username + - remoteRef: + key: airm-rabbitmq-user-password + property: value + secretKey: password + secretStoreRef: + kind: ClusterSecretStore + name: {{ .Values.airm.externalSecretStore.airm.name }} + target: + name: "{{ .Release.Name }}-rabbitmq-admin" + template: + data: + default_user.conf: | + default_user = {{ "{{ .username }}" }} + default_pass = {{ "{{ .password }}" }} + password: '{{ "{{ .password }}" }}' + username: '{{ "{{ .username }}" }}' + type: Opaque +--- +apiVersion: external-secrets.io/v1beta1 +kind: ExternalSecret +metadata: + name: "{{ .Release.Name }}-api-minio-credentials" + namespace: "{{ .Release.Namespace }}" + annotations: + helm.sh/hook: pre-install +spec: + data: + - remoteRef: + key: minio-api-access-key + property: value + secretKey: minio-access-key + - remoteRef: + key: minio-api-secret-key + property: value + secretKey: minio-secret-key + secretStoreRef: + kind: ClusterSecretStore + name: {{ .Values.airm.externalSecretStore.minio.name }} + target: + name: "{{ .Release.Name }}-api-minio-credentials" +--- +apiVersion: external-secrets.io/v1beta1 +kind: ExternalSecret +metadata: + name: "{{ .Release.Name }}-secrets-airm" + namespace: "{{ .Release.Namespace }}" + annotations: + helm.sh/hook: pre-install +spec: + data: + - remoteRef: + key: airm-ui-auth-nextauth-secret + property: value + secretKey: NEXTAUTH_SECRET + refreshInterval: 15s + secretStoreRef: + kind: ClusterSecretStore + name: {{ 
.Values.airm.externalSecretStore.airm.name }} + target: + name: "{{ .Release.Name }}-secrets-airm" +--- +apiVersion: external-secrets.io/v1beta1 +kind: ExternalSecret +metadata: + name: "{{ .Release.Name }}-keycloak-ui-client" + namespace: "{{ .Release.Namespace }}" + annotations: + helm.sh/hook: pre-install +spec: + data: + - remoteRef: + key: airm-ui-keycloak-secret + property: value + secretKey: KEYCLOAK_SECRET + secretStoreRef: + kind: ClusterSecretStore + name: {{ .Values.airm.externalSecretStore.keycloak.name }} + target: + name: "{{ .Release.Name }}-keycloak-ui-creds" + template: + type: Opaque +--- +apiVersion: external-secrets.io/v1beta1 +kind: ExternalSecret +metadata: + name: "{{ .Release.Name }}-cluster-auth-secrets" + namespace: "{{ .Release.Namespace }}" + annotations: + helm.sh/hook: pre-install +spec: + secretStoreRef: + kind: ClusterSecretStore + name: {{ .Values.airm.externalSecretStore.airm.name }} + refreshInterval: 5m + target: + name: "{{ .Release.Name }}-cluster-auth-admin" + data: + - secretKey: admin-token + remoteRef: + key: cluster-auth-admin-token + property: value +--- +apiVersion: external-secrets.io/v1beta1 +kind: ExternalSecret +metadata: + name: "{{ .Release.Name }}-user-credentials" + namespace: "{{ .Release.Namespace }}" + annotations: + helm.sh/hook: pre-install +spec: + data: + - remoteRef: + key: keycloak-initial-devuser-password + property: value + secretKey: USER_PASSWORD + secretStoreRef: + kind: ClusterSecretStore + name: {{ .Values.airm.externalSecretStore.airm.name }} + target: + name: "{{ .Release.Name }}-user-credentials" + template: + type: Opaque diff --git a/sources/airm/0.3.4/charts/airm-api/templates/airm-httproute.yaml b/sources/airm/0.3.4/charts/airm-api/templates/airm-httproute.yaml new file mode 100644 index 00000000..3393d6a5 --- /dev/null +++ b/sources/airm/0.3.4/charts/airm-api/templates/airm-httproute.yaml @@ -0,0 +1,81 @@ +# Copyright © Advanced Micro Devices, Inc., or its affiliates. 
+# +# SPDX-License-Identifier: MIT + +--- +apiVersion: gateway.networking.k8s.io/v1 +kind: HTTPRoute +metadata: + name: '{{ .Release.Name }}api-route' + namespace: '{{ .Release.Namespace }}' +spec: + parentRefs: + - group: gateway.networking.k8s.io + kind: Gateway + name: {{ .Values.kgateway.gatewayName }} + namespace: {{ .Values.kgateway.namespace }} + rules: + - backendRefs: + - group: "" + kind: Service + name: '{{ .Release.Name }}-api' + port: {{ .Values.kgateway.airmapi.servicePort }} + weight: 1 + matches: + - headers: + - name: Host + type: RegularExpression + value: {{ .Values.kgateway.airmapi.prefixValue }}\..* + path: + type: RegularExpression + value: .*/stream.* + timeouts: + backendRequest: {{ .Values.kgateway.airmapi.timeouts.stream.backendRequest }} + request: {{ .Values.kgateway.airmapi.timeouts.stream.request }} + - backendRefs: + - group: "" + kind: Service + name: '{{ .Release.Name }}-api' + port: {{ .Values.kgateway.airmapi.servicePort }} + weight: 1 + matches: + - headers: + - name: Host + type: RegularExpression + value: {{ .Values.kgateway.airmapi.prefixValue }}\..* + path: + type: PathPrefix + value: / + timeouts: + backendRequest: {{ .Values.kgateway.airmapi.timeouts.nonStream.backendRequest }} + request: {{ .Values.kgateway.airmapi.timeouts.nonStream.request }} +--- +apiVersion: gateway.networking.k8s.io/v1 +kind: HTTPRoute +metadata: + name: '{{ .Release.Name }}ui-route' + namespace: '{{ .Release.Namespace }}' +spec: + parentRefs: + - group: gateway.networking.k8s.io + kind: Gateway + name: {{ .Values.kgateway.gatewayName }} + namespace: {{ .Values.kgateway.namespace }} + rules: + - backendRefs: + - group: "" + kind: Service + name: '{{ .Release.Name }}-ui' + port: {{ .Values.kgateway.airmui.servicePort }} + weight: 1 + matches: + - headers: + - name: Host + type: RegularExpression + value: {{ .Values.kgateway.airmui.prefixValue }}\..* + path: + type: PathPrefix + value: / + timeouts: + backendRequest: {{ 
.Values.kgateway.airmui.timeouts.backendRequest }} + request: {{ .Values.kgateway.airmui.timeouts.request }} diff --git a/sources/airm/0.3.4/charts/airm-api/templates/airm-rabbitmq-cluster.yaml b/sources/airm/0.3.4/charts/airm-api/templates/airm-rabbitmq-cluster.yaml new file mode 100644 index 00000000..3db2ff07 --- /dev/null +++ b/sources/airm/0.3.4/charts/airm-api/templates/airm-rabbitmq-cluster.yaml @@ -0,0 +1,69 @@ +# Copyright © Advanced Micro Devices, Inc., or its affiliates. +# +# SPDX-License-Identifier: MIT + +--- +apiVersion: rabbitmq.com/v1beta1 +kind: RabbitmqCluster +metadata: + name: '{{ .Release.Name }}-rabbitmq' + namespace: '{{ .Release.Namespace }}' +spec: + persistence: + {{- toYaml .Values.airm.rabbitmq.persistence | nindent 4 }} + replicas: {{ .Values.airm.rabbitmq.replicas }} + resources: + {{- toYaml .Values.airm.rabbitmq.resources | nindent 4 }} + secretBackend: + externalSecret: + name: '{{ .Release.Name }}-rabbitmq-admin' + tls: + secretName: '{{ .Release.Name }}-tls-secret' +--- +{{- if .Values.airm.rabbitmq.backup.enabled -}} + +apiVersion: batch/v1 +kind: CronJob +metadata: + name: '{{ .Release.Name }}-rabbitmq-backup-cron' + namespace: '{{ .Release.Namespace }}' +spec: + concurrencyPolicy: Forbid + jobTemplate: + spec: + template: + spec: + containers: + - env: + - name: RABBITMQ_URL + value: 'http://{{ .Release.Name }}-rabbitmq.{{ .Release.Namespace }}.svc.cluster.local:15672' + - name: RABBITMQ_USER + valueFrom: + secretKeyRef: + key: username + name: '{{ .Release.Name }}-rabbitmq-admin' + - name: RABBITMQ_PASSWORD + valueFrom: + secretKeyRef: + key: password + name: '{{ .Release.Name }}-rabbitmq-admin' + - name: S3_HOST + value: "{{ .Values.airm.backend.env.minioUrl }}" + - name: S3_ACCESS_KEY + valueFrom: + secretKeyRef: + key: minio-access-key + name: '{{ .Release.Name }}-api-minio-credentials' + - name: S3_SECRET_KEY + valueFrom: + secretKeyRef: + key: minio-secret-key + name: '{{ .Release.Name }}-api-minio-credentials' + image: 
'{{ .Values.airm.rabbitmq.backup.image }}' + name: rabbitmq-backup-cron + resources: + {{- toYaml .Values.airm.rabbitmq.backup.resources | nindent 16 }} + restartPolicy: OnFailure + schedule: 0 * * * * + +{{- end }} diff --git a/sources/airm/0.3.4/charts/airm-api/templates/airm-vllm-collector.yaml b/sources/airm/0.3.4/charts/airm-api/templates/airm-vllm-collector.yaml new file mode 100644 index 00000000..f12aa532 --- /dev/null +++ b/sources/airm/0.3.4/charts/airm-api/templates/airm-vllm-collector.yaml @@ -0,0 +1,93 @@ +# Copyright © Advanced Micro Devices, Inc., or its affiliates. +# +# SPDX-License-Identifier: MIT + +--- +apiVersion: opentelemetry.io/v1beta1 +kind: OpenTelemetryCollector +metadata: + name: "{{ .Release.Name }}-{{ .Values.aims.otelCollector.name }}" + namespace: "{{ .Release.Namespace }}" +spec: + mode: daemonset + image: "{{ .Values.aims.otelCollector.image }}" + config: + receivers: + prometheus: + config: + scrape_configs: + - job_name: "vllm" + metrics_path: /metrics + scrape_interval: "{{ .Values.aims.otelCollector.receivers.prometheus.config.scrape_configs.scrape_interval }}" + kubernetes_sd_configs: + - role: pod + relabel_configs: + # Only scrape pods with the workload-id label + - source_labels: + [__meta_kubernetes_pod_label_airm_silogen_ai_workload_id] + action: keep + regex: .+ + # Only scrape pods with app label starting with isvc. 
+ - source_labels: [__meta_kubernetes_pod_label_app] + action: keep + regex: isvc\..* + # Set the workload_id from the label + - source_labels: + [__meta_kubernetes_pod_label_airm_silogen_ai_workload_id] + target_label: workload_id + # Set service name from app label + - source_labels: [__meta_kubernetes_pod_label_app] + target_label: service + # Set service instance id from pod name + - source_labels: [__meta_kubernetes_pod_name] + target_label: service_instance_id + # Set the scrape target to port 8000 + - source_labels: [__meta_kubernetes_pod_ip] + target_label: __address__ + replacement: $1:8000 + otlp: + protocols: + grpc: {} + http: {} + + processors: + resource: + attributes: + - key: airm.silogen.ai/workload-id + from_attribute: workload_id + action: upsert + - key: service.instance.id + from_attribute: service_instance_id + action: upsert + - key: service.name + from_attribute: service + action: upsert + + transform: + metric_statements: + - context: datapoint + statements: + - set(attributes["workload_id"], resource.attributes["airm.silogen.ai/workload-id"]) where attributes["workload_id"] == nil + - set(attributes["service_instance_id"], resource.attributes["service.instance.id"]) where attributes["service_instance_id"] == nil + - set(attributes["service"], resource.attributes["service.name"]) where attributes["service"] == nil + + exporters: + otlphttp: + endpoint: "{{ .Values.aims.otelCollector.exporters.otlphttp.endpoint }}" + + service: + pipelines: + metrics: + receivers: [prometheus] + processors: [resource, transform] + exporters: [otlphttp] + + traces: + receivers: [otlp] + processors: [resource] + exporters: [otlphttp] + + logs: + receivers: [otlp] + processors: [resource] + exporters: [otlphttp] diff --git a/sources/airm/0.3.4/charts/airm-api/values.yaml b/sources/airm/0.3.4/charts/airm-api/values.yaml new file mode 100644 index 00000000..9ee63f06 --- /dev/null +++ b/sources/airm/0.3.4/charts/airm-api/values.yaml @@ -0,0 +1,166 @@ +# Copyright 
© Advanced Micro Devices, Inc., or its affiliates. +# +# SPDX-License-Identifier: MIT + +secretgenerator: + image: + repository: amdenterpriseai/cluster-tool + tag: latest + pullPolicy: IfNotPresent + +kgateway: + namespace: kgateway-system + gatewayName: https + airmapi: + servicePort: 80 + prefixValue: airmapi + timeouts: + stream: + backendRequest: 30m + request: 30m + nonStream: + backendRequest: 10m + request: 10m + airmui: + servicePort: 80 + prefixValue: airmui + timeouts: + backendRequest: 1m + request: 1m + keycloak: + prefixValue: kc + +aims: + otelCollector: + image: ghcr.io/open-telemetry/opentelemetry-collector-releases/opentelemetry-collector-contrib:0.113.0 + receivers: + prometheus: + config: + scrape_configs: + scrape_interval: 20s + exporters: + otlphttp: + endpoint: "http://lgtm-stack.otel-lgtm-stack.svc:4318" + name: "vllm-collector" + +airm: + appDomain: PUBLIC-IP + includeDemoSetup: true + + externalSecretStore: + airm: + name: openbao-secret-store + minio: + name: openbao-secret-store + keycloak: + name: openbao-secret-store + + postgresql: + enabled: true + cnpg: + image: ghcr.io/cloudnative-pg/postgresql:17 + instance: 1 + resources: + limits: + cpu: "2" + memory: 1Gi + requests: + cpu: "1" + memory: 512Mi + storage: + size: 50Gi + storageClass: default + walStorage: + size: 50Gi + storageClass: default + + rabbitmq: + replicas: 1 + resources: + requests: + cpu: 500m + memory: 1Gi + limits: + cpu: '1' + memory: 1Gi + persistence: + storage: 20Gi + storageClassName: default + backup: + enabled: false + image: amdenterpriseai/rabbitmq-backup:0.1 + resources: + limits: + memory: 512Mi + requests: + cpu: 250m + memory: 256Mi + + keycloak: + internalUrl: http://keycloak.keycloak.svc.cluster.local:8080 + clientId: "354a0fa1-35ac-4a6d-9c4d-d661129c2cd0" + realm: airm + + frontend: + image: + repository: amdenterpriseai/airm-ui + tag: 0.3.4 + pullPolicy: IfNotPresent + servicePort: 80 + resources: + limits: + memory: 4Gi + requests: + cpu: 500m + 
memory: 4Gi + + backend: + image: + repository: amdenterpriseai/airm-api + tag: 0.3.4 + pullPolicy: IfNotPresent + + servicePort: 80 + servicePortMetrics: 9009 + env: + dbPort: 5432 + rabbitmqPort: 5672 + minioUrl: http://minio.minio-tenant-default.svc.cluster.local:80 + minioBucket: default-bucket + prometheusUrl: http://lgtm-stack.otel-lgtm-stack.svc.cluster.local:9090 + clusterAuthUrl: http://cluster-auth.cluster-auth.svc.cluster.local:8081 + + resources: + limits: + memory: 1Gi + requests: + cpu: 500m + memory: 1Gi + securityContext: + allowPrivilegeEscalation: false + runAsNonRoot: true + runAsUser: 1000 + seccompProfile: + type: RuntimeDefault + + utilities: + netcat: + image: + repository: busybox + tag: 1.37.0 + pullPolicy: IfNotPresent + curl: + image: + repository: curlimages/curl + tag: 8.16.0 + pullPolicy: IfNotPresent + liquibase: + image: + repository: docker.io/liquibase/liquibase + tag: 4.31 + pullPolicy: IfNotPresent + clusterTool: + image: + repository: amdenterpriseai/cluster-tool + tag: latest + pullPolicy: IfNotPresent diff --git a/sources/airm/0.3.4/charts/airm-dispatcher/.helmignore b/sources/airm/0.3.4/charts/airm-dispatcher/.helmignore new file mode 100644 index 00000000..64c2af83 --- /dev/null +++ b/sources/airm/0.3.4/charts/airm-dispatcher/.helmignore @@ -0,0 +1,27 @@ +# Copyright © Advanced Micro Devices, Inc., or its affiliates. +# +# SPDX-License-Identifier: MIT + +# Patterns to ignore when building packages. +# This supports shell glob matching, relative path matching, and +# negation (prefixed with !). Only one pattern per line. 
+.DS_Store +# Common VCS dirs +.git/ +.gitignore +.bzr/ +.bzrignore +.hg/ +.hgignore +.svn/ +# Common backup files +*.swp +*.bak +*.tmp +*.orig +*~ +# Various IDEs +.project +.idea/ +*.tmproj +.vscode/ diff --git a/sources/airm/0.3.4/charts/airm-dispatcher/Chart.yaml b/sources/airm/0.3.4/charts/airm-dispatcher/Chart.yaml new file mode 100644 index 00000000..16fb1b13 --- /dev/null +++ b/sources/airm/0.3.4/charts/airm-dispatcher/Chart.yaml @@ -0,0 +1,29 @@ +# Copyright © Advanced Micro Devices, Inc., or its affiliates. +# +# SPDX-License-Identifier: MIT + +--- +apiVersion: v2 +name: airm-dispatcher +description: A Helm chart for AIRM Dispatcher + +# A chart can be either an 'application' or a 'library' chart. +# +# Application charts are a collection of templates that can be packaged into versioned archives +# to be deployed. +# +# Library charts provide useful utilities or functions for the chart developer. They're included as +# a dependency of application charts to inject those utilities and functions into the rendering +# pipeline. Library charts do not define any templates and therefore cannot be deployed. +type: application + +# This is the chart version. This version number should be incremented each time you make changes +# to the chart and its templates, including the app version. +# Versions are expected to follow Semantic Versioning (https://semver.org/) +version: 0.3.4 + +# This is the version number of the application being deployed. This version number should be +# incremented each time you make changes to the application. Versions are not expected to +# follow Semantic Versioning. They should reflect the version the application is using. +# It is recommended to use it with quotes. 
+appVersion: "1.0.0" diff --git a/sources/airm/0.3.4/charts/airm-dispatcher/README.md b/sources/airm/0.3.4/charts/airm-dispatcher/README.md new file mode 100644 index 00000000..0b85c706 --- /dev/null +++ b/sources/airm/0.3.4/charts/airm-dispatcher/README.md @@ -0,0 +1,54 @@ + + +# HELM CHARTS + +Simple instructions to deploy AIRM dispatcher application using helm chart. +The dispatcher can be run on a compute cluster, which may or may not be the same as the one hosting the AIRM API and UI. + +### 1. Requirements + +The following external components must be available in the Kubernetes cluster before the helm chart can be installed. + +- Accessible RabbitMQ cluster (must be the same cluster used by AIRM API). +- Kaiwo installed on the cluster (along with all its dependencies) + +### 2. Install + +``` +cd helm/airm/charts + +# 1. Create output template just to validate (the public domain could be app-dev.silogen.ai, staging.silogen.ai, etc.) +helm template airm-dispatcher ./airm-dispatcher -n airm --create-namespace > airm-dispatcher-helm-generated.yaml + +# 2. Run chart install +helm install airm-dispatcher ./airm-dispatcher -n airm --create-namespace + +# 3. Delete chart if needed +helm delete airm-dispatcher -n airm + +# 4. Upgrade when bumping versions +helm upgrade -n airm --set airm-dispatcher ./airm-dispatcher +``` + +--- + +### 3. 
Helm Settings + +| Field Path | Description | Type | Example / Default | +|---------------------------------------------|--------------------------------------------------------------|---------|-----------------------------------| +| airm.dispatcher.image.repository | Dispatcher image repository | string | `amdenterpriseai/airm-dispatcher` | +| airm.dispatcher.image.tag | Dispatcher image tag | string | `v2025.08-rc.21` | +| airm.dispatcher.image.pullPolicy | Dispatcher image pull policy | string | `IfNotPresent` | +| airm.dispatcher.servicePort | Dispatcher service port | int | `80` | +| airm.utilities.netcat.image.repository | Netcat image repository | string | `busybox` | +| airm.utilities.netcat.image.tag | Netcat image tag | string | `1.37.0` | +| airm.utilities.netcat.image.pullPolicy | Netcat image pull policy | string | `IfNotPresent` | +| airm.utilities.curl.image.repository | Curl image repository | string | `curlimages/curl` | +| airm.utilities.curl.image.tag | Curl image tag | string | `8.16.0` | +| airm.utilities.curl.image.pullPolicy | Curl image pull policy | string | `IfNotPresent` | +| airm.additionalClusterRoles.platformAdmin | Additional cluster roles for the Platform Administrator role | array | `[]` | +| airm.additionalClusterRoles.projectMember | Additional cluster roles for the Project Member role | array | `[]` | diff --git a/sources/airm/0.3.4/charts/airm-dispatcher/templates/airm-cluster-policies.yaml b/sources/airm/0.3.4/charts/airm-dispatcher/templates/airm-cluster-policies.yaml new file mode 100644 index 00000000..caf92aa6 --- /dev/null +++ b/sources/airm/0.3.4/charts/airm-dispatcher/templates/airm-cluster-policies.yaml @@ -0,0 +1,352 @@ +# Copyright © Advanced Micro Devices, Inc., or its affiliates. 
+# +# SPDX-License-Identifier: MIT + +apiVersion: kyverno.io/v1 +kind: ClusterPolicy +metadata: + name: airm-project-namespace-rolebinding +spec: + background: false + rules: + - name: generate-project-namespace-rolebinding + match: + any: + - resources: + kinds: + - Namespace + operations: + - CREATE + preconditions: + any: + - key: '{{`{{request.object.metadata.labels."airm.silogen.ai/project-id" || '''' }}`}}' + operator: NotEquals + value: "" + skipBackgroundRequests: true + generate: + apiVersion: rbac.authorization.k8s.io/v1 + kind: RoleBinding + name: "project-member-role-binding" + namespace: "{{`{{request.object.metadata.name}}`}}" + synchronize: true + data: + roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: airm-project-member + subjects: + - kind: Group + # Add this in for backwards compatibility + name: "oidc{{`{{request.object.metadata.name}}`}}" + apiGroup: rbac.authorization.k8s.io + - kind: Group + # The kubernetes cluster apply an OIDC prefix of 'oidc:', so we adjust the groups to expect that + name: "oidc:{{`{{request.object.metadata.name}}`}}" + apiGroup: rbac.authorization.k8s.io +--- +# Kyverno policy that enforces that workloads submitted to a namespace managed by AIRMan have the +# correct kueue lables and field set, so that they are bound by the quota of the namespace +apiVersion: kyverno.io/v1 +kind: ClusterPolicy +metadata: + name: airm-quota-enforcement-for-workloads +spec: + background: false + rules: + - name: set-queue-name-from-namespace-default + match: + resources: + kinds: + - Deployment + - StatefulSet + - Pod + namespaceSelector: + matchExpressions: + - key: airm.silogen.ai/project-id + operator: Exists + preconditions: + all: + - key: '{{`{{request.object.metadata.labels."kueue.x-k8s.io/queue-name" || '''' }}`}}' + operator: NotEquals + value: "{{`{{request.namespace }}`}}" + skipBackgroundRequests: true + mutate: + patchStrategicMerge: + metadata: + labels: + kueue.x-k8s.io/queue-name: 
"{{`{{request.namespace }}`}}" + + - name: set-queue-name-from-namespace-jobs + match: + resources: + kinds: + - Job # https://kueue.sigs.k8s.io/docs/tasks/run/jobs/ + namespaceSelector: + matchExpressions: + - key: airm.silogen.ai/project-id + operator: Exists + preconditions: + all: + - key: '{{`{{request.object.metadata.labels."kueue.x-k8s.io/queue-name" || '''' }}`}}' + operator: NotEquals + value: "{{`{{request.namespace }}`}}" + skipBackgroundRequests: true + mutate: + patchStrategicMerge: + metadata: + labels: + kueue.x-k8s.io/queue-name: "{{`{{request.namespace }}`}}" + spec: + suspend: true + + - name: set-queue-name-from-namespace-cronjobs + match: + resources: + kinds: + - CronJob # https://kueue.sigs.k8s.io/docs/tasks/run/run_cronjobs/ + namespaceSelector: + matchExpressions: + - key: airm.silogen.ai/project-id + operator: Exists + preconditions: + all: + - key: '{{`{{request.object.spec.jobTemplate.metadata.labels."kueue.x-k8s.io/queue-name" || '''' }}`}}' + operator: NotEquals + value: "{{`{{request.namespace }}`}}" + skipBackgroundRequests: true + mutate: + patchStrategicMerge: + spec: + jobTemplate: + metadata: + labels: + kueue.x-k8s.io/queue-name: "{{`{{request.namespace }}`}}" + spec: + suspend: true + + - name: set-queue-name-from-namespace-kaiwo + match: + resources: + kinds: + - KaiwoJob + - KaiwoService + - AIMService + namespaceSelector: + matchExpressions: + - key: airm.silogen.ai/project-id + operator: Exists + preconditions: + all: + - key: "{{`{{request.object.spec.clusterQueue || '' }}`}}" + operator: NotEquals + value: "{{`{{request.namespace }}`}}" + skipBackgroundRequests: true + mutate: + patchStrategicMerge: + spec: + clusterQueue: "{{`{{request.namespace }}`}}" +--- +apiVersion: kyverno.io/v1 +kind: ClusterPolicy +metadata: + name: airm-workload-tracking-policy +spec: + background: false + rules: + # For all supported types, if airm.silogen.ai/workload-id and airm.silogen.ai/component-id are not set, we assume + # it has been 
submitted from outside of AIRMan. In that case, we set airm.silogen.ai/auto-discovered: true, so it can + # be tracked upstream. We also set airm.silogen.ai/discovered-component-type so that we can identify the type of component + # that was originally tracked, and ignore children created by it. See remove-auto-discovered-annotations-inherited-from-parent + # We also try to capture the user who submitted the workload, and consume it in the application + + # Please note that ReplicaSet is not supported because by default it is filtered away by Kyverno by default: https://github.com/kyverno/kyverno/blob/main/charts/kyverno/values.yaml#L270 + - name: add-discovery-annotations-for-supported-types + match: + resources: + kinds: + - Job + - Deployment + - StatefulSet + - DaemonSet + - CronJob + - KaiwoJob + - KaiwoService + - AIMService + - Pod + namespaceSelector: + matchExpressions: + - key: airm.silogen.ai/project-id + operator: Exists + preconditions: + all: + - key: '{{`{{request.object.metadata.labels."airm.silogen.ai/workload-id" || '''' }}`}}' + operator: Equals + value: "" + - key: '{{`{{request.object.metadata.labels."airm.silogen.ai/component-id" || '''' }}`}}' + operator: Equals + value: "" + - key: '{{`{{request.object.metadata.annotations."airm.silogen.ai/auto-discovered" || '''' }}`}}' + operator: Equals + value: "" + skipBackgroundRequests: true + mutate: + patchStrategicMerge: + metadata: + annotations: + airm.silogen.ai/submitter: "{{`{{request.userInfo.username }}`}}" + airm.silogen.ai/auto-discovered: "true" + airm.silogen.ai/discovered-component-type: "{{`{{request.object.kind }}`}}" + # For all supported types, if airm.silogen.ai/auto-discovered is set and the airm.silogen.ai/discovered-component-type + # doesnt match the kind of the current component, we assume this type has been created by a parent which is also + # supported by AIRMan and we dont need to track this type upstream, so we unset the airm.silogen.ai/auto-discovered annotation. 
+ # This is mostly to account for KaiwoJob, KaiwoService, AIMService which propagate annotations to pods. + - name: remove-auto-discovered-annotations-inherited-from-parent + match: + resources: + kinds: + - Job + - Deployment + - StatefulSet + - DaemonSet + - CronJob + - KaiwoJob + - KaiwoService + - AIMService + - Pod + namespaceSelector: + matchExpressions: + - key: airm.silogen.ai/project-id + operator: Exists + preconditions: + all: + - key: '{{`{{request.object.metadata.annotations."airm.silogen.ai/auto-discovered" || '''' }}`}}' + operator: Equals + value: "true" + - key: '{{`{{request.object.metadata.annotations."airm.silogen.ai/discovered-component-type" || '''' }}`}}' + operator: NotEquals + value: "{{`{{request.object.kind }}`}}" + skipBackgroundRequests: true + mutate: + patchStrategicMerge: + metadata: + annotations: + airm.silogen.ai/auto-discovered: "false" + # For all supported types, if airm.silogen.ai/project-id does not match that of the namespace label, overwrite it + # with the expected value, to avoid metrics getting mixed up between projects. 
+ - name: set-project-id-from-namespace-label + match: + resources: + kinds: + - Job + - Deployment + - StatefulSet + - DaemonSet + - CronJob + - KaiwoJob + - KaiwoService + - AIMService + - Pod + namespaceSelector: + matchExpressions: + - key: airm.silogen.ai/project-id + operator: Exists + context: + - name: ns_labels + apiCall: + urlPath: "/api/v1/namespaces/{{`{{request.namespace }}`}}" + method: GET + jmesPath: "metadata.labels" + preconditions: + all: + - key: '{{`{{request.object.metadata.labels."airm.silogen.ai/project-id" || '''' }}`}}' + operator: NotEquals + value: '{{`{{ns_labels."airm.silogen.ai/project-id" }}`}}' + skipBackgroundRequests: true + mutate: + patchStrategicMerge: + metadata: + labels: + airm.silogen.ai/project-id: '{{`{{ns_labels."airm.silogen.ai/project-id" }}`}}' + # Add the airm.silogen.ai/workload-id and airm.silogen.ai/component-id labels to types that expect them at + # metadata.labels. The expectation is that these are propagated to the subsequent pods that are created. + + # If the resource is spawned off by a CRD, it will not know about the labels on the previous version of the object, + # so we also check request.oldObject for the labels to try and preserve them if they were already set. 
+ - name: add-workload-and-component-id-default + match: + resources: + kinds: + - Pod + - KaiwoJob + - KaiwoService + - AIMService + namespaceSelector: + matchExpressions: + - key: airm.silogen.ai/project-id + operator: Exists + skipBackgroundRequests: true + mutate: + patchStrategicMerge: + metadata: + labels: + airm.silogen.ai/workload-id: '{{`{{request.object.metadata.labels."airm.silogen.ai/workload-id" || request.oldObject.metadata.labels."airm.silogen.ai/workload-id" || request.uid }}`}}' + airm.silogen.ai/component-id: '{{`{{request.object.metadata.labels."airm.silogen.ai/component-id" || request.oldObject.metadata.labels."airm.silogen.ai/component-id" || request.uid }}`}}' + # Add the airm.silogen.ai/workload-id and airm.silogen.ai/component-id labels to objects with templates and + # also add it to spec.template.metadata.labels to ensure that the pods created by them contain the labels as well + - name: add-workload-and-component-id-to-objects-with-template + match: + resources: + kinds: + - Job + - Deployment + - StatefulSet + - DaemonSet + namespaceSelector: + matchExpressions: + - key: airm.silogen.ai/project-id + operator: Exists + skipBackgroundRequests: true + mutate: + patchStrategicMerge: + metadata: + labels: + airm.silogen.ai/workload-id: '{{`{{request.object.metadata.labels."airm.silogen.ai/workload-id" || request.oldObject.metadata.labels."airm.silogen.ai/workload-id" || request.uid }}`}}' + airm.silogen.ai/component-id: '{{`{{request.object.metadata.labels."airm.silogen.ai/component-id" || request.oldObject.metadata.labels."airm.silogen.ai/component-id" || request.uid }}`}}' + spec: + template: + metadata: + labels: + airm.silogen.ai/workload-id: '{{`{{request.object.metadata.labels."airm.silogen.ai/workload-id" || request.oldObject.metadata.labels."airm.silogen.ai/workload-id" || request.uid }}`}}' + airm.silogen.ai/component-id: '{{`{{request.object.metadata.labels."airm.silogen.ai/component-id" || 
request.oldObject.metadata.labels."airm.silogen.ai/component-id" || request.uid }}`}}' + + # Add the airm.silogen.ai/workload-id and airm.silogen.ai/component-id labels to cronjob and + # also add it to spec.jobTemplate.metadata.labels to ensure that the pods created by the cronjob + # contain it as well + - name: add-workload-and-component-id-cronjobs + match: + resources: + kinds: + - CronJob + namespaceSelector: + matchExpressions: + - key: airm.silogen.ai/project-id + operator: Exists + skipBackgroundRequests: true + mutate: + patchStrategicMerge: + metadata: + labels: + airm.silogen.ai/workload-id: '{{`{{request.object.metadata.labels."airm.silogen.ai/workload-id" || request.oldObject.metadata.labels."airm.silogen.ai/workload-id" || request.uid }}`}}' + airm.silogen.ai/component-id: '{{`{{request.object.metadata.labels."airm.silogen.ai/component-id" || request.oldObject.metadata.labels."airm.silogen.ai/component-id" || request.uid }}`}}' + spec: + jobTemplate: + metadata: + labels: + airm.silogen.ai/workload-id: '{{`{{request.object.metadata.labels."airm.silogen.ai/workload-id" || request.oldObject.metadata.labels."airm.silogen.ai/workload-id" || request.uid }}`}}' + airm.silogen.ai/component-id: '{{`{{request.object.metadata.labels."airm.silogen.ai/component-id" || request.oldObject.metadata.labels."airm.silogen.ai/component-id" || request.uid }}`}}' + spec: + template: + metadata: + labels: + airm.silogen.ai/workload-id: '{{`{{request.object.metadata.labels."airm.silogen.ai/workload-id" || request.oldObject.metadata.labels."airm.silogen.ai/workload-id" || request.uid }}`}}' + airm.silogen.ai/component-id: '{{`{{request.object.metadata.labels."airm.silogen.ai/component-id" || request.oldObject.metadata.labels."airm.silogen.ai/component-id" || request.uid }}`}}' diff --git a/sources/airm/0.3.4/charts/airm-dispatcher/templates/airm-cluster-roles.yaml b/sources/airm/0.3.4/charts/airm-dispatcher/templates/airm-cluster-roles.yaml new file mode 100644 index 
00000000..2461e894 --- /dev/null +++ b/sources/airm/0.3.4/charts/airm-dispatcher/templates/airm-cluster-roles.yaml @@ -0,0 +1,164 @@ +# Copyright © Advanced Micro Devices, Inc., or its affiliates. +# +# SPDX-License-Identifier: MIT + +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: airm-platform-admin +rules: + - apiGroups: [""] + resources: + [ + "pods", + "events", + "services", + "configmaps", + "persistentvolumes", + "persistentvolumeclaims", + "namespaces", + "serviceaccounts", + ] + verbs: ["get", "list", "watch", "delete"] + - apiGroups: [""] + resources: ["pods/log", "pods/exec", "pods/attach", "pods/portforward"] + verbs: ["*"] + - apiGroups: ["apps"] + resources: ["deployments", "deployments/scale", "replicasets", "statefulsets", "daemonsets"] + verbs: ["get", "list", "watch", "delete"] + - apiGroups: ["batch"] + resources: ["jobs", "cronjobs"] + verbs: ["get", "list", "watch", "delete"] + - apiGroups: ["networking.k8s.io"] + resources: ["ingresses", "networkpolicies"] + verbs: ["get", "list", "watch", "delete"] + - apiGroups: ["kaiwo.silogen.ai"] + resources: ["kaiwojobs", "kaiwoservices"] + verbs: ["get", "list", "watch", "delete"] + - apiGroups: ["aim.silogen.ai"] + resources: ["aimservices", "aimmodels", "aimmodelcaches", "aimservicetemplates", "aimruntimeconfigs", "aimtemplatecaches"] + verbs: ["get", "list", "watch", "delete"] + - apiGroups: ["gateway.networking.k8s.io"] + resources: ["httproutes", "gateways"] + verbs: ["get", "list", "watch", "delete"] + - apiGroups: ["config.kaiwo.silogen.ai"] + resources: ["kaiwoconfigs"] + verbs: ["*"] + - apiGroups: ["kaiwo.silogen.ai"] + resources: ["kaiwoqueueconfigs"] + verbs: ["*"] + - apiGroups: ["aim.silogen.ai"] + resources: [ "aimclustermodels", "aimclusterservicetemplates", "aimclusterruntimeconfigs", "aimclustermodelsources" ] + verbs: ["*"] + - apiGroups: [""] + resources: ["secrets"] + verbs: ["get", "list", "watch", "create", "delete"] + - apiGroups: [""] + resources: 
["nodes"] + verbs: ["get", "list", "watch"] + - apiGroups: ["rbac.authorization.k8s.io"] + resources: ["roles", "clusterroles", "rolebindings", "clusterrolebindings"] + verbs: ["get", "list", "watch"] + - apiGroups: ["external-secrets.io"] + resources: ["externalsecrets"] + verbs: ["get", "list", "watch", "create", "delete"] + - apiGroups: ["external-secrets.io"] + resources: ["clustersecretstores"] + verbs: ["get", "list", "watch"] + - apiGroups: ["storage.k8s.io"] + resources: ["storageclasses"] + verbs: ["get", "list", "watch"] + - apiGroups: ["kueue.x-k8s.io"] + resources: ["clusterqueues", "resourceflavors", "localqueues", "workloadpriorityclasses"] + verbs: ["get", "list", "watch"] + - apiGroups: ["apiextensions.k8s.io"] + resources: ["customresourcedefinitions"] + verbs: ["get", "list"] +{{- range .Values.airm.additionalClusterRoles.platformAdmin }} + - apiGroups: + {{ .apiGroups | toYaml | nindent 6 }} + resources: + {{ .resources | toYaml | nindent 6 }} + verbs: + {{ .verbs | toYaml | nindent 6 }} +{{- end }} +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: airm-platform-admin-role-binding +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: airm-platform-admin +subjects: + - kind: Group + # Add this in for backwards compatibility + name: "oidcairm-role:Platform Administrator" + apiGroup: rbac.authorization.k8s.io + - kind: Group + # The kubernetes cluster apply an OIDC prefix of 'oidc':, so we adjust the group to expect that + name: "oidc:airm-role:Platform Administrator" + apiGroup: rbac.authorization.k8s.io +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: airm-project-member +rules: + - apiGroups: [""] + resources: + [ + "pods", + "pods/log", + "pods/exec", + "pods/attach", + "pods/portforward", + "events", + "services", + "configmaps", + "persistentvolumes", + "persistentvolumeclaims", + ] + verbs: ["*"] + - apiGroups: ["apps"] + resources: 
["deployments", "replicasets", "statefulsets", "daemonsets"] + verbs: ["*"] + - apiGroups: ["batch"] + resources: ["jobs", "cronjobs"] + verbs: ["*"] + - apiGroups: ["networking.k8s.io"] + resources: ["ingresses", "networkpolicies"] + verbs: ["*"] + - apiGroups: ["kaiwo.silogen.ai"] + resources: ["kaiwojobs", "kaiwoservices"] + verbs: ["*"] + - apiGroups: ["aim.silogen.ai"] + resources: ["aimservices", "aimmodels", "aimmodelcaches", "aimservicetemplates", "aimruntimeconfigs", "aimtemplatecaches"] + verbs: ["*"] + - apiGroups: ["gateway.networking.k8s.io"] + resources: ["httproutes", "gateways"] + verbs: ["*"] + - apiGroups: [""] + resources: ["secrets"] + verbs: ["list", "watch", "create"] + - apiGroups: [""] + resources: ["nodes"] + verbs: ["get", "list", "watch"] + - apiGroups: ["rbac.authorization.k8s.io"] + resources: ["roles", "clusterroles", "rolebindings", "clusterrolebindings"] + verbs: ["get", "list", "watch"] + - apiGroups: ["external-secrets.io"] + resources: ["clustersecretstores", "externalsecrets"] + verbs: ["get", "list", "watch"] + - apiGroups: ["storage.k8s.io"] + resources: ["storageclasses"] + verbs: ["get", "list", "watch"] +{{- range .Values.airm.additionalClusterRoles.projectMember }} + - apiGroups: + {{ .apiGroups | toYaml | nindent 6 }} + resources: + {{ .resources | toYaml | nindent 6 }} + verbs: + {{ .verbs | toYaml | nindent 6 }} +{{- end }} diff --git a/sources/airm/0.3.4/charts/airm-dispatcher/templates/airm-dispatcher-app.yaml b/sources/airm/0.3.4/charts/airm-dispatcher/templates/airm-dispatcher-app.yaml new file mode 100644 index 00000000..8a3489ef --- /dev/null +++ b/sources/airm/0.3.4/charts/airm-dispatcher/templates/airm-dispatcher-app.yaml @@ -0,0 +1,343 @@ +# Copyright © Advanced Micro Devices, Inc., or its affiliates. 
+# +# SPDX-License-Identifier: MIT + +--- +apiVersion: batch/v1 +kind: CronJob +metadata: + name: "{{ .Release.Name }}-cluster-nodes-dispatcher" + namespace: "{{ .Release.Namespace }}" +spec: + concurrencyPolicy: Replace + jobTemplate: + spec: + template: + spec: + containers: + - command: + - curl + - -X + - POST + - http://{{ .Release.Name }}-dispatcher:{{ .Values.airm.dispatcher.servicePort }}/v1/clusters/nodes + image: "{{ .Values.airm.utilities.curl.image.repository }}:{{ .Values.airm.utilities.curl.image.tag }}" + imagePullPolicy: "{{ .Values.airm.utilities.curl.image.pullPolicy }}" + name: airm-cluster-nodes-cron + resources: + limits: + memory: 100Mi + requests: + cpu: 50m + memory: 100Mi + restartPolicy: OnFailure + schedule: 0 * * * * +--- +apiVersion: batch/v1 +kind: CronJob +metadata: + name: "{{ .Release.Name }}-aim-models-dispatcher" + namespace: "{{ .Release.Namespace }}" +spec: + concurrencyPolicy: Forbid + schedule: "*/5 * * * *" + suspend: false + jobTemplate: + spec: + template: + metadata: + annotations: + sidecar.istio.io/inject: "false" + spec: + restartPolicy: OnFailure + initContainers: + - name: check-dispatcher-is-ready + command: + [ + "/bin/sh", + "-c", + "while ! 
nc -z \"$ENDPOINT_URL_TO_CHECK\" \"$ENDPOINT_PORT_TO_CHECK\"; do echo \"Waiting for Airm Dispatcher at ${ENDPOINT_URL_TO_CHECK}:${ENDPOINT_PORT_TO_CHECK}...\"; sleep 3; done; echo \"Airm Dispatcher is accepting connections at ${ENDPOINT_URL_TO_CHECK}:${ENDPOINT_PORT_TO_CHECK}.\"; sleep 3; exit 0" + ] + image: "{{ .Values.airm.utilities.netcat.image.repository }}:{{ .Values.airm.utilities.netcat.image.tag }}" + imagePullPolicy: "{{ .Values.airm.utilities.netcat.image.pullPolicy }}" + env: + - name: ENDPOINT_URL_TO_CHECK + value: "{{ .Release.Name }}-dispatcher.{{ .Release.Namespace }}.svc.cluster.local" + - name: ENDPOINT_PORT_TO_CHECK + value: "{{ .Values.airm.dispatcher.servicePort }}" + securityContext: + allowPrivilegeEscalation: false + runAsNonRoot: false + runAsUser: 0 + capabilities: + drop: + - ALL + add: + - SETUID + - SETGID + - CHOWN + - DAC_OVERRIDE + - FOWNER + - FSETID + seccompProfile: + type: RuntimeDefault + containers: + - name: airm-aim-models-cron + command: + - curl + - -X + - POST + - http://{{ .Release.Name }}-dispatcher:{{ .Values.airm.dispatcher.servicePort }}/v1/aims/cluster-models + image: "{{ .Values.airm.utilities.curl.image.repository }}:{{ .Values.airm.utilities.curl.image.tag }}" + imagePullPolicy: "{{ .Values.airm.utilities.curl.image.pullPolicy }}" + resources: + requests: + memory: "100Mi" + cpu: "50m" + limits: + memory: "100Mi" +--- +apiVersion: batch/v1 +kind: CronJob +metadata: + name: "{{ .Release.Name }}-heartbeat-dispatcher" + namespace: "{{ .Release.Namespace }}" +spec: + concurrencyPolicy: Replace + schedule: "*/1 * * * *" + suspend: false + jobTemplate: + spec: + template: + metadata: + annotations: + sidecar.istio.io/inject: "false" + spec: + restartPolicy: OnFailure + initContainers: + - name: check-dispatcher-is-ready + command: + [ + "/bin/sh", + "-c", + "while ! 
nc -z \"$ENDPOINT_URL_TO_CHECK\" \"$ENDPOINT_PORT_TO_CHECK\"; do echo \"Waiting for Airm Dispatcher at ${ENDPOINT_URL_TO_CHECK}:${ENDPOINT_PORT_TO_CHECK}...\"; sleep 3; done; echo \"Airm Dispatcher is accepting connections at ${ENDPOINT_URL_TO_CHECK}:${ENDPOINT_PORT_TO_CHECK}.\"; sleep 3; exit 0" + ] + image: "{{ .Values.airm.utilities.netcat.image.repository }}:{{ .Values.airm.utilities.netcat.image.tag }}" + imagePullPolicy: "{{ .Values.airm.utilities.netcat.image.pullPolicy }}" + env: + - name: ENDPOINT_URL_TO_CHECK + value: "{{ .Release.Name }}-dispatcher.{{ .Release.Namespace }}.svc.cluster.local" + - name: ENDPOINT_PORT_TO_CHECK + value: "{{ .Values.airm.dispatcher.servicePort }}" + securityContext: + allowPrivilegeEscalation: false + runAsNonRoot: false + runAsUser: 0 + capabilities: + drop: + - ALL + add: + - SETUID + - SETGID + - CHOWN + - DAC_OVERRIDE + - FOWNER + - FSETID + seccompProfile: + type: RuntimeDefault + containers: + - name: airm-heartbeat-cron + command: + - curl + - -X + - POST + - http://{{ .Release.Name }}-dispatcher:{{ .Values.airm.dispatcher.servicePort }}/v1/heartbeats + image: "{{ .Values.airm.utilities.curl.image.repository }}:{{ .Values.airm.utilities.curl.image.tag }}" + imagePullPolicy: "{{ .Values.airm.utilities.curl.image.pullPolicy }}" + resources: + requests: + memory: "100Mi" + cpu: "50m" + limits: + memory: "100Mi" +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: "{{ .Release.Name }}-dispatcher" + namespace: "{{ .Release.Namespace }}" +spec: + replicas: 1 + selector: + matchLabels: + app: "{{ .Release.Name }}-dispatcher" + template: + metadata: + labels: + app: "{{ .Release.Name }}-dispatcher" + spec: + serviceAccountName: "{{ .Release.Name }}-dispatcher-sa" + {{- with .Values.airm.dispatcher.imagePullSecrets }} + imagePullSecrets: + {{- toYaml . | nindent 8 }} + {{- end }} + initContainers: + - name: check-rabbitmq-is-ready + command: + [ + "/bin/sh", + "-c", + "while ! 
nc -z \"$ENDPOINT_URL_TO_CHECK\" \"$ENDPOINT_PORT_TO_CHECK\"; do echo \"Waiting for Airm RabbitMQ at ${ENDPOINT_URL_TO_CHECK}:${ENDPOINT_PORT_TO_CHECK}...\"; sleep 3; done; echo \"Airm RabbitMQ is accepting connections at ${ENDPOINT_URL_TO_CHECK}:${ENDPOINT_PORT_TO_CHECK}.\"; sleep 3; exit 0" + ] + image: "{{ .Values.airm.utilities.netcat.image.repository }}:{{ .Values.airm.utilities.netcat.image.tag }}" + imagePullPolicy: "{{ .Values.airm.utilities.netcat.image.pullPolicy }}" + env: + - name: ENDPOINT_URL_TO_CHECK + value: "{{ .Release.Name }}-rabbitmq.{{ .Release.Namespace }}.svc.cluster.local" + - name: ENDPOINT_PORT_TO_CHECK + value: "{{ .Values.airm.dispatcher.env.rabbitmqPort }}" + securityContext: + allowPrivilegeEscalation: false + runAsNonRoot: false + runAsUser: 0 + capabilities: + drop: + - ALL + add: + - SETUID + - SETGID + - CHOWN + - DAC_OVERRIDE + - FOWNER + - FSETID + seccompProfile: + type: RuntimeDefault + containers: + - name: airm-dispatcher + image: "{{ .Values.airm.dispatcher.image.repository }}:{{ .Values.airm.dispatcher.image.tag }}" + imagePullPolicy: "{{ .Values.airm.dispatcher.image.pullPolicy }}" + ports: + - containerPort: 8080 + env: + - name: KUBE_CLUSTER_NAME + value: demo-cluster + - name: ORG_NAME + value: demo + - name: RABBITMQ_HOST + value: "{{ .Release.Name }}-rabbitmq.{{ .Release.Namespace }}.svc.cluster.local" + - name: RABBITMQ_PORT + value: "{{ .Values.airm.dispatcher.env.rabbitmqPort }}" + - name: RABBITMQ_AIRM_COMMON_VHOST + value: "vh_airm_common" + - name: RABBITMQ_AIRM_COMMON_QUEUE + value: "airm_common" + - name: RABBITMQ_USER + valueFrom: + secretKeyRef: + name: "{{ .Release.Name }}-rabbitmq-common-vhost-user" + key: username + - name: RABBITMQ_PASSWORD + valueFrom: + secretKeyRef: + name: "{{ .Release.Name }}-rabbitmq-common-vhost-user" + key: password + livenessProbe: + httpGet: + path: /v1/health + port: 8080 + initialDelaySeconds: 10 + periodSeconds: 5 + timeoutSeconds: 2 + failureThreshold: 3 + successThreshold: 
1 + readinessProbe: + httpGet: + path: /v1/health + port: 8080 + initialDelaySeconds: 10 + periodSeconds: 5 + timeoutSeconds: 2 + failureThreshold: 3 + successThreshold: 1 + resources: + requests: + memory: "1Gi" + cpu: "500m" + limits: + memory: "1Gi" + securityContext: + runAsUser: 0 +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: "{{ .Release.Name }}-dispatcher-cluster-access-binding" +subjects: + - kind: ServiceAccount + name: "{{ .Release.Name }}-dispatcher-sa" + namespace: "{{ .Release.Namespace }}" +roleRef: + kind: ClusterRole + name: "{{ .Release.Name }}-dispatcher-cluster-access-role" + apiGroup: rbac.authorization.k8s.io +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: "{{ .Release.Name }}-dispatcher-cluster-access-role" +rules: + - apiGroups: [""] + resources: ["services", "namespaces", "configmaps", "pods"] + verbs: ["get", "list", "watch", "create", "delete", "update", "patch"] + - apiGroups: [""] + resources: ["secrets"] + verbs: ["get", "list", "watch", "create", "delete", "update", "patch"] + - apiGroups: [""] + resources: ["nodes"] + verbs: ["get", "list", "watch"] + - apiGroups: ["apps"] + resources: ["deployments", "statefulsets", "daemonsets"] + verbs: ["get", "list", "watch", "create", "delete", "update", "patch"] + - apiGroups: ["batch"] + resources: ["jobs", "cronjobs"] + verbs: ["get", "list", "watch", "create", "delete", "update", "patch"] + - apiGroups: ["kaiwo.silogen.ai"] + resources: ["kaiwojobs", "kaiwoservices", "kaiwoqueueconfigs"] + verbs: ["get", "list", "watch", "create", "delete", "update", "patch"] + - apiGroups: ["gateway.networking.k8s.io"] + resources: ["httproutes"] + verbs: ["get", "list", "watch", "create", "delete", "update", "patch"] + - apiGroups: ["networking.k8s.io"] + resources: ["ingresses"] + verbs: ["get", "list", "watch", "create", "delete", "update", "patch"] + - apiGroups: ["external-secrets.io"] + resources: ["externalsecrets"] + 
verbs: ["get", "list", "watch", "create", "delete", "update", "patch"] + - apiGroups: ["aim.silogen.ai"] + resources: ["aimservices", "aimclustermodels"] + verbs: ["get", "list", "watch", "create", "delete", "update", "patch"] + - apiGroups: ["apiextensions.k8s.io"] + resources: ["customresourcedefinitions"] + verbs: ["get", "list"] +--- +apiVersion: v1 +kind: ServiceAccount +metadata: + name: "{{ .Release.Name }}-dispatcher-sa" + namespace: "{{ .Release.Namespace }}" + +--- +apiVersion: v1 +kind: Service +metadata: + name: "{{ .Release.Name }}-dispatcher" + namespace: "{{ .Release.Namespace }}" + labels: + app: "{{ .Release.Name }}-dispatcher" +spec: + ports: + - name: web + port: {{ .Values.airm.dispatcher.servicePort }} + targetPort: 8080 + type: ClusterIP + selector: + app: "{{ .Release.Name }}-dispatcher" diff --git a/sources/airm/0.3.4/charts/airm-dispatcher/templates/kyverno-cluster-role.yaml b/sources/airm/0.3.4/charts/airm-dispatcher/templates/kyverno-cluster-role.yaml new file mode 100644 index 00000000..e930efd0 --- /dev/null +++ b/sources/airm/0.3.4/charts/airm-dispatcher/templates/kyverno-cluster-role.yaml @@ -0,0 +1,35 @@ +# Copyright © Advanced Micro Devices, Inc., or its affiliates. +# +# SPDX-License-Identifier: MIT + +# These are additional cluster roles needed by kyverno background controller to be able to +# create rolebindings in namespaces. 
+apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: kyverno:airm-policy-roles + labels: + rbac.kyverno.io/aggregate-to-background-controller: "true" +rules: + - apiGroups: ["rbac.authorization.k8s.io"] + resources: ["clusterroles", "rolebindings"] + verbs: ["get", "list", "watch"] + - apiGroups: ["rbac.authorization.k8s.io"] # allow kyverno to bind clusterroles via rolebindings + resources: ["clusterroles"] + verbs: ["bind"] +--- +# These are additional cluster roles needed by kyverno reports controller to be able to +# manage custom resources for reporting +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: kyverno:airm-reports-policy-roles + labels: + rbac.kyverno.io/aggregate-to-reports-controller: "true" +rules: + - apiGroups: ["kaiwo.silogen.ai"] + resources: ["kaiwojobs", "kaiwoservices"] + verbs: ["get", "list", "watch"] + - apiGroups: ["aim.silogen.ai"] + resources: ["aimservices"] + verbs: ["get", "list", "watch"] diff --git a/sources/airm/0.3.4/charts/airm-dispatcher/values.yaml b/sources/airm/0.3.4/charts/airm-dispatcher/values.yaml new file mode 100644 index 00000000..acb0eb0f --- /dev/null +++ b/sources/airm/0.3.4/charts/airm-dispatcher/values.yaml @@ -0,0 +1,27 @@ +# Copyright © Advanced Micro Devices, Inc., or its affiliates. 
+# +# SPDX-License-Identifier: MIT + +airm: + dispatcher: + image: + repository: amdenterpriseai/airm-dispatcher + tag: 0.3.4 + pullPolicy: IfNotPresent + servicePort: 80 + env: + rabbitmqPort: 5672 + utilities: + netcat: + image: + repository: busybox + tag: 1.37.0 + pullPolicy: IfNotPresent + curl: + image: + repository: curlimages/curl + tag: 8.16.0 + pullPolicy: IfNotPresent + additionalClusterRoles: + platformAdmin: [] + projectMember: [] diff --git a/sources/airm/0.3.4/values.yaml b/sources/airm/0.3.4/values.yaml new file mode 100644 index 00000000..69346880 --- /dev/null +++ b/sources/airm/0.3.4/values.yaml @@ -0,0 +1,3 @@ +# Copyright © Advanced Micro Devices, Inc., or its affiliates. +# +# SPDX-License-Identifier: MIT From fe4d359a37f0d74f80ff4c7f25655f20546ae68d Mon Sep 17 00:00:00 2001 From: brownzebra Date: Thu, 19 Feb 2026 18:58:01 +0200 Subject: [PATCH 014/115] bump-airm-version-values-sbom --- root/values.yaml | 2 +- sbom/components.yaml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/root/values.yaml b/root/values.yaml index ee1179a3..f4542b0d 100644 --- a/root/values.yaml +++ b/root/values.yaml @@ -724,7 +724,7 @@ apps: - /spec/accessModes # AMD Resource Manager (AIRM) airm: - path: airm/0.3.3 + path: airm/0.3.4 namespace: airm valuesFile: values.yaml helmParameters: diff --git a/sbom/components.yaml b/sbom/components.yaml index 8ca11e67..821ab6d0 100644 --- a/sbom/components.yaml +++ b/sbom/components.yaml @@ -10,7 +10,7 @@ components: license: MIT License licenseUrl: https://github.com/silogen/kaiwo/blob/main/LICENSE airm: - path: airm/0.3.3 + path: airm/0.3.4 valuesFile: values.yaml sourceUrl: https://github.com/silogen/cluster-forge/tree/main/sources/airm projectUrl: https://github.com/silogen/cluster-forge/tree/main/sources/airm From 54c379efe221b6d6779c6c34977180894af367a7 Mon Sep 17 00:00:00 2001 From: brownzebra Date: Thu, 19 Feb 2026 17:00:52 +0000 Subject: [PATCH 015/115] Update version to v1.8.0-rc3 [actions skip] 
--- root/values.yaml | 2 +- scripts/init-gitea-job/values.yaml | 4 +--- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/root/values.yaml b/root/values.yaml index f4542b0d..e6394a97 100644 --- a/root/values.yaml +++ b/root/values.yaml @@ -1,6 +1,6 @@ clusterForge: repoUrl: "http://gitea-http.cf-gitea.svc:3000/cluster-org/cluster-forge.git" - targetRevision: v1.8.0-rc2 + targetRevision: v1.8.0-rc3 # source helm values file from separate git repo externalValues: enabled: true diff --git a/scripts/init-gitea-job/values.yaml b/scripts/init-gitea-job/values.yaml index 1813868f..888293da 100644 --- a/scripts/init-gitea-job/values.yaml +++ b/scripts/init-gitea-job/values.yaml @@ -1,9 +1,7 @@ # small|medium|large (injected by bootstrap script) clusterSize: null - # Domain for the cluster (injected by bootstrap script) # Example: "compute.amd.com" domain: null - # Git revision to deploy (injected by bootstrap script) -targetRevision: null \ No newline at end of file +targetRevision: v1.8.0-rc3 From 04810dffaa86a75e78759992e90c0a55d9c69819 Mon Sep 17 00:00:00 2001 From: brownzebra Date: Thu, 19 Feb 2026 18:11:17 +0000 Subject: [PATCH 016/115] Update version to v1.8.0-rc4 [actions skip] --- root/values.yaml | 2 +- scripts/init-gitea-job/values.yaml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/root/values.yaml b/root/values.yaml index e6394a97..7a20e4c5 100644 --- a/root/values.yaml +++ b/root/values.yaml @@ -1,6 +1,6 @@ clusterForge: repoUrl: "http://gitea-http.cf-gitea.svc:3000/cluster-org/cluster-forge.git" - targetRevision: v1.8.0-rc3 + targetRevision: v1.8.0-rc4 # source helm values file from separate git repo externalValues: enabled: true diff --git a/scripts/init-gitea-job/values.yaml b/scripts/init-gitea-job/values.yaml index 888293da..17075359 100644 --- a/scripts/init-gitea-job/values.yaml +++ b/scripts/init-gitea-job/values.yaml @@ -4,4 +4,4 @@ clusterSize: null # Example: "compute.amd.com" domain: null # Git revision to deploy 
(injected by bootstrap script) -targetRevision: v1.8.0-rc3 +targetRevision: v1.8.0-rc4 From 85cef488c0604e4e8b2ab8a2d51863ba956bd725 Mon Sep 17 00:00:00 2001 From: brownzebra Date: Fri, 20 Feb 2026 13:52:05 +0200 Subject: [PATCH 017/115] chore: bump airm to 0.3.5 --- sources/airm/0.3.5/.helmignore | 27 ++ sources/airm/0.3.5/Chart.yaml | 35 ++ .../airm/0.3.5/charts/airm-api/.helmignore | 27 ++ sources/airm/0.3.5/charts/airm-api/Chart.yaml | 29 ++ sources/airm/0.3.5/charts/airm-api/README.md | 124 ++++++ .../0.3.5/charts/airm-api/files/configure.sh | 374 ++++++++++++++++++ .../charts/airm-api/templates/_helpers.tpl | 15 + .../airm-api/templates/airm-app-backend.yaml | 268 +++++++++++++ .../airm-api/templates/airm-app-frontend.yaml | 85 ++++ .../airm-api/templates/airm-cert-issuer.yaml | 26 ++ .../templates/airm-cluster-roles.yaml | 34 ++ .../airm-cluster-runtime-config.yaml | 19 + .../airm-api/templates/airm-cluster.yaml | 47 +++ .../templates/airm-configure-job.yaml | 172 ++++++++ .../charts/airm-api/templates/airm-es.yaml | 215 ++++++++++ .../airm-api/templates/airm-httproute.yaml | 81 ++++ .../templates/airm-rabbitmq-cluster.yaml | 69 ++++ .../templates/airm-vllm-collector.yaml | 93 +++++ .../airm/0.3.5/charts/airm-api/values.yaml | 166 ++++++++ .../0.3.5/charts/airm-dispatcher/.helmignore | 27 ++ .../0.3.5/charts/airm-dispatcher/Chart.yaml | 29 ++ .../0.3.5/charts/airm-dispatcher/README.md | 54 +++ .../templates/airm-cluster-policies.yaml | 352 +++++++++++++++++ .../templates/airm-cluster-roles.yaml | 164 ++++++++ .../templates/airm-dispatcher-app.yaml | 343 ++++++++++++++++ .../templates/kyverno-cluster-role.yaml | 35 ++ .../0.3.5/charts/airm-dispatcher/values.yaml | 27 ++ sources/airm/0.3.5/values.yaml | 3 + 28 files changed, 2940 insertions(+) create mode 100644 sources/airm/0.3.5/.helmignore create mode 100644 sources/airm/0.3.5/Chart.yaml create mode 100644 sources/airm/0.3.5/charts/airm-api/.helmignore create mode 100644 
sources/airm/0.3.5/charts/airm-api/Chart.yaml create mode 100644 sources/airm/0.3.5/charts/airm-api/README.md create mode 100644 sources/airm/0.3.5/charts/airm-api/files/configure.sh create mode 100644 sources/airm/0.3.5/charts/airm-api/templates/_helpers.tpl create mode 100644 sources/airm/0.3.5/charts/airm-api/templates/airm-app-backend.yaml create mode 100644 sources/airm/0.3.5/charts/airm-api/templates/airm-app-frontend.yaml create mode 100644 sources/airm/0.3.5/charts/airm-api/templates/airm-cert-issuer.yaml create mode 100644 sources/airm/0.3.5/charts/airm-api/templates/airm-cluster-roles.yaml create mode 100644 sources/airm/0.3.5/charts/airm-api/templates/airm-cluster-runtime-config.yaml create mode 100644 sources/airm/0.3.5/charts/airm-api/templates/airm-cluster.yaml create mode 100644 sources/airm/0.3.5/charts/airm-api/templates/airm-configure-job.yaml create mode 100644 sources/airm/0.3.5/charts/airm-api/templates/airm-es.yaml create mode 100644 sources/airm/0.3.5/charts/airm-api/templates/airm-httproute.yaml create mode 100644 sources/airm/0.3.5/charts/airm-api/templates/airm-rabbitmq-cluster.yaml create mode 100644 sources/airm/0.3.5/charts/airm-api/templates/airm-vllm-collector.yaml create mode 100644 sources/airm/0.3.5/charts/airm-api/values.yaml create mode 100644 sources/airm/0.3.5/charts/airm-dispatcher/.helmignore create mode 100644 sources/airm/0.3.5/charts/airm-dispatcher/Chart.yaml create mode 100644 sources/airm/0.3.5/charts/airm-dispatcher/README.md create mode 100644 sources/airm/0.3.5/charts/airm-dispatcher/templates/airm-cluster-policies.yaml create mode 100644 sources/airm/0.3.5/charts/airm-dispatcher/templates/airm-cluster-roles.yaml create mode 100644 sources/airm/0.3.5/charts/airm-dispatcher/templates/airm-dispatcher-app.yaml create mode 100644 sources/airm/0.3.5/charts/airm-dispatcher/templates/kyverno-cluster-role.yaml create mode 100644 sources/airm/0.3.5/charts/airm-dispatcher/values.yaml create mode 100644 
sources/airm/0.3.5/values.yaml diff --git a/sources/airm/0.3.5/.helmignore b/sources/airm/0.3.5/.helmignore new file mode 100644 index 00000000..64c2af83 --- /dev/null +++ b/sources/airm/0.3.5/.helmignore @@ -0,0 +1,27 @@ +# Copyright © Advanced Micro Devices, Inc., or its affiliates. +# +# SPDX-License-Identifier: MIT + +# Patterns to ignore when building packages. +# This supports shell glob matching, relative path matching, and +# negation (prefixed with !). Only one pattern per line. +.DS_Store +# Common VCS dirs +.git/ +.gitignore +.bzr/ +.bzrignore +.hg/ +.hgignore +.svn/ +# Common backup files +*.swp +*.bak +*.tmp +*.orig +*~ +# Various IDEs +.project +.idea/ +*.tmproj +.vscode/ diff --git a/sources/airm/0.3.5/Chart.yaml b/sources/airm/0.3.5/Chart.yaml new file mode 100644 index 00000000..ea8fa28e --- /dev/null +++ b/sources/airm/0.3.5/Chart.yaml @@ -0,0 +1,35 @@ +# Copyright © Advanced Micro Devices, Inc., or its affiliates. +# +# SPDX-License-Identifier: MIT + +--- +apiVersion: v2 +name: airm +description: A Helm chart for AIRM full stack, including API, UI and dispatcher + +# A chart can be either an 'application' or a 'library' chart. +# +# Application charts are a collection of templates that can be packaged into versioned archives +# to be deployed. +# +# Library charts provide useful utilities or functions for the chart developer. They're included as +# a dependency of application charts to inject those utilities and functions into the rendering +# pipeline. Library charts do not define any templates and therefore cannot be deployed. +type: application + +# This is the chart version. This version number should be incremented each time you make changes +# to the chart and its templates, including the app version. +# Versions are expected to follow Semantic Versioning (https://semver.org/) +version: 0.3.5 + +# This is the version number of the application being deployed. 
This version number should be +# incremented each time you make changes to the application. Versions are not expected to +# follow Semantic Versioning. They should reflect the version the application is using. +# It is recommended to use it with quotes. +appVersion: "1.0.0" + +dependencies: + - name: airm-api + version: 0.3.5 + - name: airm-dispatcher + version: 0.3.5 diff --git a/sources/airm/0.3.5/charts/airm-api/.helmignore b/sources/airm/0.3.5/charts/airm-api/.helmignore new file mode 100644 index 00000000..64c2af83 --- /dev/null +++ b/sources/airm/0.3.5/charts/airm-api/.helmignore @@ -0,0 +1,27 @@ +# Copyright © Advanced Micro Devices, Inc., or its affiliates. +# +# SPDX-License-Identifier: MIT + +# Patterns to ignore when building packages. +# This supports shell glob matching, relative path matching, and +# negation (prefixed with !). Only one pattern per line. +.DS_Store +# Common VCS dirs +.git/ +.gitignore +.bzr/ +.bzrignore +.hg/ +.hgignore +.svn/ +# Common backup files +*.swp +*.bak +*.tmp +*.orig +*~ +# Various IDEs +.project +.idea/ +*.tmproj +.vscode/ diff --git a/sources/airm/0.3.5/charts/airm-api/Chart.yaml b/sources/airm/0.3.5/charts/airm-api/Chart.yaml new file mode 100644 index 00000000..d8cf0c72 --- /dev/null +++ b/sources/airm/0.3.5/charts/airm-api/Chart.yaml @@ -0,0 +1,29 @@ +# Copyright © Advanced Micro Devices, Inc., or its affiliates. +# +# SPDX-License-Identifier: MIT + +--- +apiVersion: v2 +name: airm-api +description: A Helm chart for AIRM API and UI + +# A chart can be either an 'application' or a 'library' chart. +# +# Application charts are a collection of templates that can be packaged into versioned archives +# to be deployed. +# +# Library charts provide useful utilities or functions for the chart developer. They're included as +# a dependency of application charts to inject those utilities and functions into the rendering +# pipeline. Library charts do not define any templates and therefore cannot be deployed. 
+type: application + +# This is the chart version. This version number should be incremented each time you make changes +# to the chart and its templates, including the app version. +# Versions are expected to follow Semantic Versioning (https://semver.org/) +version: 0.3.5 + +# This is the version number of the application being deployed. This version number should be +# incremented each time you make changes to the application. Versions are not expected to +# follow Semantic Versioning. They should reflect the version the application is using. +# It is recommended to use it with quotes. +appVersion: "1.0.0" diff --git a/sources/airm/0.3.5/charts/airm-api/README.md b/sources/airm/0.3.5/charts/airm-api/README.md new file mode 100644 index 00000000..a16ec9da --- /dev/null +++ b/sources/airm/0.3.5/charts/airm-api/README.md @@ -0,0 +1,124 @@ + + +# HELM CHARTS + +Simple instructions to deploy AIRM UI and API applications using helm chart + +### 1. Requirements + +The following external components must be available in the Kubernetes cluster before the helm chart can be installed. + +- API Gateway implementation (e.g. KGateway) +- Keycloak with the expected `airm` realm installed +- Valid S3 compatible file storage service (e.g. MinIO) +- RabbitMQ operator +- Cert Manager operator +- External Secret operator +- CNPG operator +- OTEL LGTM stack installed on the cluster + +### 2. Install + +``` +cd helm/airm/charts + +# 1. Create output template just to validate (the public domain could be app-dev.silogen.ai, staging.silogen.ai, etc.) +helm template airm-api ./airm-api -n airm --create-namespace --set airm.appDomain= > airm-api-helm-generated.yaml + +# 2. Run chart install +helm install airm-api ./airm-api -n airm --create-namespace --set airm.appDomain= + +# 3. Delete chart if needed +helm delete airm-api -n airm + +# 4. Upgrade when bumping versions +helm upgrade -n airm --set airm.appDomain= airm-api ./airm-api +``` + +--- + +### 3. 
Helm Settings + +| Field Path | Description | Type | Example / Default | +|-------------------------------------------------------------------------------|-----------------------------------------------------------------| ------ |---------------------------------------------------------------------------------------------------| +| secretgenerator.image.repository | Docker image repository for secret generator | string | `ghcr.io/silogen/kubectl` | +| secretgenerator.image.tag | Docker image tag | string | `latest` | +| secretgenerator.image.pullPolicy | Image pull policy | string | `IfNotPresent` | +| kgateway.namespace | Namespace for kgateway resources | string | `kgateway-system` | +| kgateway.gatewayName | Gateway name | string | `https` | +| kgateway.airmapi.servicePort | Service port for airmapi | int | `80` | +| kgateway.airmapi.prefixValue | URL prefix for airmapi service | string | `airmapi` | +| kgateway.airmui.servicePort | Service port for airmui | int | `80` | +| kgateway.airmui.prefixValue | URL prefix for airmui service | string | `airmui` | +| aims.otelCollector.exporters.otlphttp.endpoint | Open Telemetry collector endpoint url for inference metrics | string | `http://lgtm-stack.otel-lgtm-stack.svc:4318` | +| aims.otelCollector.image | Base image for Open Telemetry Collector | string | `ghcr.io/open-telemetry/opentelemetry-collector-releases/opentelemetry-collector-contrib:0.113.0` | +| aims.otelCollector.receivers.prometheus.config.scrape_configs.scrape_interval | Inference metrics scraping interval | string | `20s` | +| airm.includeDemoSetup | Include the demo organization and project setup when installing | bool | `true` | +| airm.appDomain | Public IP or domain for airm | string | `PUBLIC-IP` | +| airm.externalSecretStore.airm.name | Secret store name for airm | string | `airm-secret-store` | +| airm.externalSecretStore.minio.name | Secret store name for minio | string | `k8s-secret-store` | +| airm.externalSecretStore.keycloak.name | Secret 
store name for keycloak | string | `keycloak-secret-store` | +| airm.keycloak.publicUrl | Public URL to access keycloak | string | `https://kc.{{ .Values.airm.appDomain }}` | +| airm.keycloak.internalUrl | Internal URL to access keycloak | string | `http://keycloak.keycloak.svc.cluster.local:8080` | +| airm.keycloak.clientId | Client ID to access keycloak | string | `354a0fa1-35ac-4a6d-9c4d-d661129c2cd0` | +| airm.keycloak.realm | Keycloak realm for authentication | string | `airm` | +| airm.postgresql.cnpg.image | PostgreSQL container image | string | `ghcr.io/cloudnative-pg/postgresql:17` | +| airm.postgresql.cnpg.instance | Number of PostgreSQL instances | int | `1` | +| airm.postgresql.cnpg.resources.limits.cpu | CPU limit for PostgreSQL container | string | `"2"` | +| airm.postgresql.cnpg.resources.limits.memory | Memory limit for PostgreSQL container | string | `1Gi` | +| airm.postgresql.cnpg.resources.requests.cpu | CPU request for PostgreSQL container | string | `"1"` | +| airm.postgresql.cnpg.resources.requests.memory | Memory request for PostgreSQL container | string | `512Mi` | +| airm.postgresql.cnpg.storage.size | Storage size for PostgreSQL | string | `50Gi` | +| airm.postgresql.cnpg.storage.storageClass | Storage class for PostgreSQL | string | `default` | +| airm.postgresql.cnpg.walStorage.size | WAL storage size for PostgreSQL | string | `50Gi` | +| airm.postgresql.cnpg.walStorage.storageClass | WAL storage class for PostgreSQL | string | `default` | +| airm.rabbitmq.replicas | Number of replicas for the RabbitMQ cluster | int | `1` | +| airm.rabbitmq.resources.limits.cpu | CPU limit for for the RabbitMQ cluster | string | `1` | +| airm.rabbitmq.resources.limits.memory | Memory limit for for the RabbitMQ cluster | string | `1Gi` | +| airm.rabbitmq.resources.requests.cpu | CPU request for the RabbitMQ cluster | string | `500m` | +| airm.rabbitmq.resources.requests.memory | Memory request for the RabbitMQ cluster | string | `1Gi` | +| 
airm.rabbitmq.persistence.storage | Persistent storage size for the RabbitMQ cluster | string | `20Gi` | +| airm.rabbitmq.persistence.storageClassName | Storage class name for the RabbitMQ cluster | string | `default` | +| airm.rabbitmq.backup.enabled | Enable RabbitMQ backup | bool | `false` | +| airm.rabbitmq.backup.image | RabbitMQ backup container image | string | `amdenterpriseai/rabbitmq-backup:0.1` | +| airm.rabbitmq.backup.resources.limits.memory | Memory limit for cron job of RabbitMQ backup | string | `512Mi` | +| airm.rabbitmq.backup.resources.requests.cpu | CPU request for cron job of RabbitMQ backup | string | `250m` | +| airm.rabbitmq.backup.resources.requests.memory | Memory request for cron job of RabbitMQ backup | string | `256Mi` | +| airm.frontend.image.repository | Frontend image repository | string | `amdenterpriseai/airm-ui` | +| airm.frontend.image.tag | Frontend image tag | string | `v2025.08-rc.21` | +| airm.frontend.image.pullPolicy | Frontend image pull policy | string | `IfNotPresent` | +| airm.frontend.servicePort | Frontend service port | int | `80` | +| airm.frontend.resources.limits.memory | Memory limit for frontend | string | `4Gi` | +| airm.frontend.resources.requests.cpu | CPU request for frontend | string | `500m` | +| airm.frontend.resources.requests.memory | Memory request for frontend | string | `4Gi` | +| airm.backend.image.repository | Backend API image repository | string | `amdenterpriseai/airm-api` | +| airm.backend.image.tag | Backend API image tag | string | `v2025.08-rc.21` | +| airm.backend.image.pullPolicy | Backend API image pull policy | string | `IfNotPresent` | +| airm.backend.servicePort | Backend API service port | int | `80` | +| airm.backend.servicePortMetrics | Backend API metrics service port | int | `9009` | +| airm.backend.env.dbPort | Database port | int | `5432` | +| airm.backend.env.rabbitmqPort | RabbitMQ port | int | `5672` | +| airm.backend.env.minioUrl | Minio service URL | string | 
`http://minio.minio-tenant-default.svc.cluster.local:80` | +| airm.backend.env.minioBucket | Minio bucket name | string | `default-bucket` | +| airm.backend.env.prometheusUrl | Prometheus service URL | string | `http://lgtm-stack.otel-lgtm-stack.svc.cluster.local:9090` | +| airm.backend.env.clusterAuthUrl | Cluster auth service URL | string | `http://cluster-auth.cluster-auth.svc.cluster.local:8081` | +| airm.backend.resources.limits.memory | Memory limit for backend API container | string | `1Gi` | +| airm.backend.resources.requests.cpu | CPU request for backend API container | string | `500m` | +| airm.backend.resources.requests.memory | Memory request for backend API container | string | `1Gi` | +| airm.backend.securityContext.allowPrivilegeEscalation | Security context: allow privilege escalation | bool | `false` | +| airm.backend.securityContext.runAsNonRoot | Security context: run container as non-root | bool | `true` | +| airm.backend.securityContext.runAsUser | Security context: user ID to run container as | int | `1000` | +| airm.backend.securityContext.seccompProfile.type | Security context: seccomp profile type | string | `RuntimeDefault` | +| airm.utilities.netcat.image.repository | Netcat image repository | string | `busybox` | +| airm.utilities.netcat.image.tag | Netcat image tag | string | `1.37.0` | +| airm.utilities.netcat.image.pullPolicy | Netcat image pull policy | string | `IfNotPresent` | +| airm.utilities.curl.image.repository | Curl image repository | string | `curlimages/curl` | +| airm.utilities.curl.image.tag | Curl image tag | string | `8.16.0` | +| airm.utilities.curl.image.pullPolicy | Curl image pull policy | string | `IfNotPresent` | +| airm.utilities.liquibase.image.repository | Liquibase image repository | string | `docker.io/liquibase/liquibase` | +| airm.utilities.liquibase.image.tag | Liquibase image tag | string | `4.31` | +| airm.utilities.liquibase.image.pullPolicy | Liquibase image pull policy | string | `IfNotPresent` | 
diff --git a/sources/airm/0.3.5/charts/airm-api/files/configure.sh b/sources/airm/0.3.5/charts/airm-api/files/configure.sh new file mode 100644 index 00000000..69a3f59d --- /dev/null +++ b/sources/airm/0.3.5/charts/airm-api/files/configure.sh @@ -0,0 +1,374 @@ +#!/bin/bash + +# Copyright © Advanced Micro Devices, Inc., or its affiliates. +# +# SPDX-License-Identifier: MIT + +##################################################################################### +echo "" +echo "Run configure script block..." +echo "" + +# --- Configuration Variables --- +# Get values from bloom configmap mounted as env + +# NOTE: ORG_NAME is hardcoded to demo because gpu operator metrics has same org name hardcoded there +# Otherwise the following line can be uncommented to consider the real org name from domain config +# ORG_NAME=$(echo $NEW_DOMAIN_NAME | awk -F '.' '{ print $2 }') +ORG_NAME="demo" +ORG_DOMAINS="[\"${NEW_DOMAIN_NAME}\"]" +CLUSTER_WORKLOADS_BASE_URL="https://workspaces.${NEW_DOMAIN_NAME}/" +CLUSTER_KUBE_API_URL="https://k8s.${NEW_DOMAIN_NAME}" +USER_EMAIL="devuser@${NEW_DOMAIN_NAME}" +PROJECT_NAME="demo" +PROJECT_DESCRIPTION="demo" +CLUSTER_NAME="demo-cluster" +TIMEOUT=300 +SLEEP_INTERVAL=5 + +# --- Input Validation --- +echo "Validating environment variables..." +echo "KEYCLOAK_CLIENT_ID: ${KEYCLOAK_CLIENT_ID}" +echo "NEW_DOMAIN_NAME: ${NEW_DOMAIN_NAME}" +echo "AIRM_API_URL: ${AIRM_API_URL}" + +function check_env_variable() { + if [ -z "${!1}" ]; then + echo "ERROR: $1 environment variable is not set." 
+ exit 1 + fi +} + +function check_success() { + if [ "$1" -ne 0 ]; then + echo "ERROR: $2" + exit 1 + fi +} + +check_env_variable "AIRM_API_URL" +check_env_variable "KEYCLOAK_URL" +check_env_variable "KEYCLOAK_REALM" +check_env_variable "KEYCLOAK_CLIENT_SECRET" +check_env_variable "KEYCLOAK_CLIENT_ID" +check_env_variable "KEYCLOAK_ADMIN_CLIENT_ID" +check_env_variable "KEYCLOAK_ADMIN_CLIENT_SECRET" +check_env_variable "USER_PASSWORD" + +function refresh_token() { + TOKEN=$(curl -s -d "client_id=${KEYCLOAK_CLIENT_ID}" -d "username=${USER_EMAIL}" -d "password=${USER_PASSWORD}" -d 'grant_type=password' -d "client_secret=${KEYCLOAK_CLIENT_SECRET}" "${KEYCLOAK_URL}/realms/${KEYCLOAK_REALM}/protocol/openid-connect/token" | jq -r '.access_token') + if [ -z "$TOKEN" ] || [ "$TOKEN" == "null" ]; then + echo "ERROR: Failed to obtain access token from Keycloak." + exit 1 + fi +} + +function create_org() { + # Try to get ORG_ID by name + ORG_ID=$(curl -s -X GET "${AIRM_API_URL}/v1/organizations" \ + -H "Authorization: Bearer ${TOKEN}" \ + -H 'Content-Type: application/json' | jq -r --arg name "$ORG_NAME" '.organizations[] | select(.name==$name) | .id') + + # If not found, create the org and fetch the ID again + if [ -z "$ORG_ID" ] || [ "$ORG_ID" == "null" ]; then + ORG_RESP=$(curl -s -o /dev/null -X POST -w "%{http_code}" "${AIRM_API_URL}/v1/organizations" \ + -H "Authorization: Bearer ${TOKEN}" \ + -H 'Content-Type: application/json' \ + -d "{ \"name\": \"$ORG_NAME\", \"domains\": $ORG_DOMAINS }") + echo "$ORG_RESP" + check_success "$([[ "$ORG_RESP" == "200" || "$ORG_RESP" == "201" ]] && echo 0 || echo 1)" "Failed to create organization" + + ORG_ID=$(curl -s -X GET "${AIRM_API_URL}/v1/organizations" \ + -H "Authorization: Bearer ${TOKEN}" \ + -H 'Content-Type: application/json' | jq -r --arg name "$ORG_NAME" '.organizations[] | select(.name==$name) | .id') + fi + + if [ -z "$ORG_ID" ] || [ "$ORG_ID" == "null" ]; then + echo "ERROR: Failed to create or retrieve organization 
ID." + exit 1 + else + echo "ORG_ID=${ORG_ID}" + fi +} + +function add_user_to_org() { + # Check if user exists in org + USER_EXISTS=$(curl -s -X GET "${AIRM_API_URL}/v1/users" -H 'accept: application/json' -H "Authorization: Bearer ${TOKEN}" -H 'Content-Type: application/json' | jq -r --arg email "$USER_EMAIL" '.data? // [] | .[] | select(.email==$email) | .email') + # Add user to org if they don't exist + if [ -z "$USER_EXISTS" ] || [ "$USER_EXISTS" == "null" ]; then + echo "$USER_EXISTS" + echo "User '$USER_EMAIL' not found in organization. Adding..." + ADD_USER_RESP=$(curl -w "%{http_code}" -X 'POST' "${AIRM_API_URL}/v1/organizations/${ORG_ID}/users" -H 'accept: application/json' -H "Authorization: Bearer ${TOKEN}" -H 'Content-Type: application/json' -d '{ "email": "'"$USER_EMAIL"'", "roles": ["Platform Administrator"]}') + echo "$ADD_USER_RESP" + check_success "$([[ "$ADD_USER_RESP" == "200" || "$ADD_USER_RESP" == "201" || "$ADD_USER_RESP" == "null201" ]] && echo 0 || echo 1)" "Failed to add user to organization" + else + echo "User '$USER_EMAIL' already exists in organization." + fi +} + +function create_project() { + PROJECT_ID=$(curl -s -X GET "${AIRM_API_URL}/v1/projects" -H 'accept: application/json' -H "Authorization: Bearer ${TOKEN}" | jq -r '.projects[] | select(.name=="'$PROJECT_NAME'") | .id') + + for (( i=0; i<=TIMEOUT; i+=SLEEP_INTERVAL )); do + CLUSTER_STATUS=$(curl -s -X GET "${AIRM_API_URL}/v1/clusters/$CLUSTER_ID" \ + -H "Authorization: Bearer ${TOKEN}" \ + -H 'Content-Type: application/json' | jq -r '.status') + + if [ "$CLUSTER_STATUS" == "healthy" ]; then + echo "Cluster is healthy!" + break # Exit the loop if the cluster is healthy + fi + echo "Cluster status: $CLUSTER_STATUS. Waiting $SLEEP_INTERVAL seconds... ($i/$TIMEOUT seconds elapsed)" + sleep $SLEEP_INTERVAL + done + + if [ "$CLUSTER_STATUS" != "healthy" ]; then + echo "ERROR: Cluster did not become healthy within $TIMEOUT seconds." 
+ exit 1 + fi + + if [ -z "$PROJECT_ID" ] || [ "$PROJECT_ID" == "null" ]; then + echo "Projects '$PROJECT_NAME' not found. Creating..." + PROJECT_ID=$(curl -X 'POST' \ + "${AIRM_API_URL}/v1/projects" \ + -H 'accept: application/json' \ + -H "Authorization: Bearer ${TOKEN}" \ + -H 'Content-Type: application/json' \ + -d '{ + "name": "'"$PROJECT_NAME"'", + "description": "'"$PROJECT_DESCRIPTION"'", + "cluster_id": "'"$CLUSTER_ID"'", + "quota": { + "cpu_milli_cores": 0, + "memory_bytes": 0, + "ephemeral_storage_bytes": 0, + "gpu_count": 0 + } + }' | jq -r '.id') + echo "$PROJECT_ID" + check_success "$([[ "$PROJECT_ID" != "null" ]] && echo 0 || echo 1)" "Failed to create project" + else + echo "Project '$PROJECT_NAME' already exists with ID: $PROJECT_ID" + fi +} + +function add_minio_secret_and_storage_to_project() { + for (( i=0; i<=TIMEOUT; i+=SLEEP_INTERVAL )); do + PROJECT_STATUS=$(curl -s -X GET "${AIRM_API_URL}/v1/projects/$PROJECT_ID" \ + -H "Authorization: Bearer ${TOKEN}" \ + -H 'Content-Type: application/json' | jq -r '.status') + + if [ "$PROJECT_STATUS" == "Ready" ]; then + echo "Project is ready!" + break # Exit the loop if the project is ready + fi + echo "Project status: $PROJECT_STATUS. Waiting $SLEEP_INTERVAL seconds... ($i/$TIMEOUT seconds elapsed)" + sleep $SLEEP_INTERVAL + done + + SECRET_NAME="minio-credentials-fetcher" + STORAGE_NAME="minio-storage" + + SECRET_IN_PROJECT=$(curl -X 'GET' \ + "${AIRM_API_URL}/v1/projects/${PROJECT_ID}/secrets" \ + -H 'accept: application/json' \ + -H "Authorization: Bearer ${TOKEN}" | jq -r '.project_secrets[] | select(.secret.name=="'"$SECRET_NAME"'") | .id') + EXTERNAL_SECRET_API_VERSION="v1beta1" + EXTERNAL_SECRET_MANIFEST=$(cat < /dev/null 2>&1; then + echo "AIRM API is ready!" + break + else + echo "Waiting for AIRM API..." + sleep 10 + fi + done + + echo "All dependencies are ready!" 
+ securityContext: + allowPrivilegeEscalation: false + runAsNonRoot: false + runAsUser: 0 + capabilities: + drop: + - ALL + add: + - SETUID + - SETGID + - CHOWN + - DAC_OVERRIDE + - FOWNER + - FSETID + seccompProfile: + type: RuntimeDefault + containers: + - name: configure + image: "{{ .Values.airm.utilities.clusterTool.image.repository }}:{{ .Values.airm.utilities.clusterTool.image.tag }}" + imagePullPolicy: "{{ .Values.airm.utilities.clusterTool.image.pullPolicy }}" + command: ["/bin/bash"] + args: ["/scripts/configure.sh"] + env: + - name: DEBIAN_FRONTEND + value: "noninteractive" + - name: ORG_NAME + value: "demo" + - name: NEW_DOMAIN_NAME + value: "{{ .Values.airm.appDomain }}" + - name: KEYCLOAK_CLIENT_ID + value: "{{ .Values.airm.keycloak.clientId }}" + - name: KEYCLOAK_CLIENT_SECRET + valueFrom: + secretKeyRef: + key: KEYCLOAK_SECRET + name: "{{ .Release.Name }}-keycloak-ui-creds" + - name: USER_EMAIL + value: "devuser@{{ .Values.airm.appDomain }}" + - name: KEYCLOAK_URL + value: "{{ .Values.airm.keycloak.internalUrl }}" + - name: KEYCLOAK_REALM + value: "{{ .Values.airm.keycloak.realm }}" + - name: KEYCLOAK_ADMIN_CLIENT_ID + valueFrom: + secretKeyRef: + key: client-id + name: "{{ .Release.Name }}-keycloak-admin-client" + - name: KEYCLOAK_ADMIN_CLIENT_SECRET + valueFrom: + secretKeyRef: + key: client-secret + name: "{{ .Release.Name }}-keycloak-admin-client" + - name: AIRM_API_URL + value: "http://{{ .Release.Name }}-api.{{ .Release.Namespace }}.svc.cluster.local" + - name: USER_PASSWORD + valueFrom: + secretKeyRef: + key: USER_PASSWORD + name: "{{ .Release.Name }}-user-credentials" + volumeMounts: + - name: configure-script + mountPath: /scripts + securityContext: + allowPrivilegeEscalation: false + runAsNonRoot: false + runAsUser: 0 + capabilities: + drop: + - ALL + add: + - SETUID + - SETGID + - CHOWN + - DAC_OVERRIDE + - FOWNER + - FSETID + seccompProfile: + type: RuntimeDefault + volumes: + - name: configure-script + configMap: + name: "{{ 
.Release.Name }}-configure-script" + defaultMode: 0755 + +{{- end }} diff --git a/sources/airm/0.3.5/charts/airm-api/templates/airm-es.yaml b/sources/airm/0.3.5/charts/airm-api/templates/airm-es.yaml new file mode 100644 index 00000000..4dd18aeb --- /dev/null +++ b/sources/airm/0.3.5/charts/airm-api/templates/airm-es.yaml @@ -0,0 +1,215 @@ +# Copyright © Advanced Micro Devices, Inc., or its affiliates. +# +# SPDX-License-Identifier: MIT + +--- +apiVersion: external-secrets.io/v1beta1 +kind: ExternalSecret +metadata: + name: "{{ .Release.Name }}-cnpg-superuser" + namespace: "{{ .Release.Namespace }}" + annotations: + helm.sh/hook: pre-install +spec: + data: + - remoteRef: + key: airm-cnpg-superuser-username + property: value + secretKey: username + - remoteRef: + key: airm-cnpg-superuser-password + property: value + secretKey: password + secretStoreRef: + kind: ClusterSecretStore + name: {{ .Values.airm.externalSecretStore.airm.name }} + target: + name: "{{ .Release.Name }}-cnpg-superuser" + template: + type: Opaque +--- +apiVersion: external-secrets.io/v1beta1 +kind: ExternalSecret +metadata: + name: "{{ .Release.Name }}-cnpg-user" + namespace: "{{ .Release.Namespace }}" + annotations: + helm.sh/hook: pre-install +spec: + data: + - remoteRef: + key: airm-cnpg-user-username + property: value + secretKey: username + - remoteRef: + key: airm-cnpg-user-password + property: value + secretKey: password + secretStoreRef: + kind: ClusterSecretStore + name: {{ .Values.airm.externalSecretStore.airm.name }} + target: + name: "{{ .Release.Name }}-cnpg-user" + template: + type: Opaque +--- +apiVersion: external-secrets.io/v1beta1 +kind: ExternalSecret +metadata: + name: "{{ .Release.Name }}-keycloak-admin-client" + namespace: "{{ .Release.Namespace }}" + annotations: + helm.sh/hook: pre-install +spec: + data: + - remoteRef: + key: airm-keycloak-admin-client-id + property: value + secretKey: client-id + - remoteRef: + key: airm-keycloak-admin-client-secret + property: value + 
secretKey: client-secret + secretStoreRef: + kind: ClusterSecretStore + name: {{ .Values.airm.externalSecretStore.keycloak.name }} + target: + name: "{{ .Release.Name }}-keycloak-admin-client" + template: + type: Opaque +--- +apiVersion: external-secrets.io/v1beta1 +kind: ExternalSecret +metadata: + name: "{{ .Release.Name }}-rabbitmq-admin" + namespace: "{{ .Release.Namespace }}" + annotations: + helm.sh/hook: pre-install +spec: + data: + - remoteRef: + key: airm-rabbitmq-user-username + property: value + secretKey: username + - remoteRef: + key: airm-rabbitmq-user-password + property: value + secretKey: password + secretStoreRef: + kind: ClusterSecretStore + name: {{ .Values.airm.externalSecretStore.airm.name }} + target: + name: "{{ .Release.Name }}-rabbitmq-admin" + template: + data: + default_user.conf: | + default_user = {{ "{{ .username }}" }} + default_pass = {{ "{{ .password }}" }} + password: '{{ "{{ .password }}" }}' + username: '{{ "{{ .username }}" }}' + type: Opaque +--- +apiVersion: external-secrets.io/v1beta1 +kind: ExternalSecret +metadata: + name: "{{ .Release.Name }}-api-minio-credentials" + namespace: "{{ .Release.Namespace }}" + annotations: + helm.sh/hook: pre-install +spec: + data: + - remoteRef: + key: minio-api-access-key + property: value + secretKey: minio-access-key + - remoteRef: + key: minio-api-secret-key + property: value + secretKey: minio-secret-key + secretStoreRef: + kind: ClusterSecretStore + name: {{ .Values.airm.externalSecretStore.minio.name }} + target: + name: "{{ .Release.Name }}-api-minio-credentials" +--- +apiVersion: external-secrets.io/v1beta1 +kind: ExternalSecret +metadata: + name: "{{ .Release.Name }}-secrets-airm" + namespace: "{{ .Release.Namespace }}" + annotations: + helm.sh/hook: pre-install +spec: + data: + - remoteRef: + key: airm-ui-auth-nextauth-secret + property: value + secretKey: NEXTAUTH_SECRET + refreshInterval: 15s + secretStoreRef: + kind: ClusterSecretStore + name: {{ 
.Values.airm.externalSecretStore.airm.name }} + target: + name: "{{ .Release.Name }}-secrets-airm" +--- +apiVersion: external-secrets.io/v1beta1 +kind: ExternalSecret +metadata: + name: "{{ .Release.Name }}-keycloak-ui-client" + namespace: "{{ .Release.Namespace }}" + annotations: + helm.sh/hook: pre-install +spec: + data: + - remoteRef: + key: airm-ui-keycloak-secret + property: value + secretKey: KEYCLOAK_SECRET + secretStoreRef: + kind: ClusterSecretStore + name: {{ .Values.airm.externalSecretStore.keycloak.name }} + target: + name: "{{ .Release.Name }}-keycloak-ui-creds" + template: + type: Opaque +--- +apiVersion: external-secrets.io/v1beta1 +kind: ExternalSecret +metadata: + name: "{{ .Release.Name }}-cluster-auth-secrets" + namespace: "{{ .Release.Namespace }}" + annotations: + helm.sh/hook: pre-install +spec: + secretStoreRef: + kind: ClusterSecretStore + name: {{ .Values.airm.externalSecretStore.airm.name }} + refreshInterval: 5m + target: + name: "{{ .Release.Name }}-cluster-auth-admin" + data: + - secretKey: admin-token + remoteRef: + key: cluster-auth-admin-token + property: value +--- +apiVersion: external-secrets.io/v1beta1 +kind: ExternalSecret +metadata: + name: "{{ .Release.Name }}-user-credentials" + namespace: "{{ .Release.Namespace }}" + annotations: + helm.sh/hook: pre-install +spec: + data: + - remoteRef: + key: keycloak-initial-devuser-password + property: value + secretKey: USER_PASSWORD + secretStoreRef: + kind: ClusterSecretStore + name: {{ .Values.airm.externalSecretStore.airm.name }} + target: + name: "{{ .Release.Name }}-user-credentials" + template: + type: Opaque diff --git a/sources/airm/0.3.5/charts/airm-api/templates/airm-httproute.yaml b/sources/airm/0.3.5/charts/airm-api/templates/airm-httproute.yaml new file mode 100644 index 00000000..3393d6a5 --- /dev/null +++ b/sources/airm/0.3.5/charts/airm-api/templates/airm-httproute.yaml @@ -0,0 +1,81 @@ +# Copyright © Advanced Micro Devices, Inc., or its affiliates. 
+# +# SPDX-License-Identifier: MIT + +--- +apiVersion: gateway.networking.k8s.io/v1 +kind: HTTPRoute +metadata: + name: '{{ .Release.Name }}api-route' + namespace: '{{ .Release.Namespace }}' +spec: + parentRefs: + - group: gateway.networking.k8s.io + kind: Gateway + name: {{ .Values.kgateway.gatewayName }} + namespace: {{ .Values.kgateway.namespace }} + rules: + - backendRefs: + - group: "" + kind: Service + name: '{{ .Release.Name }}-api' + port: {{ .Values.kgateway.airmapi.servicePort }} + weight: 1 + matches: + - headers: + - name: Host + type: RegularExpression + value: {{ .Values.kgateway.airmapi.prefixValue }}\..* + path: + type: RegularExpression + value: .*/stream.* + timeouts: + backendRequest: {{ .Values.kgateway.airmapi.timeouts.stream.backendRequest }} + request: {{ .Values.kgateway.airmapi.timeouts.stream.request }} + - backendRefs: + - group: "" + kind: Service + name: '{{ .Release.Name }}-api' + port: {{ .Values.kgateway.airmapi.servicePort }} + weight: 1 + matches: + - headers: + - name: Host + type: RegularExpression + value: {{ .Values.kgateway.airmapi.prefixValue }}\..* + path: + type: PathPrefix + value: / + timeouts: + backendRequest: {{ .Values.kgateway.airmapi.timeouts.nonStream.backendRequest }} + request: {{ .Values.kgateway.airmapi.timeouts.nonStream.request }} +--- +apiVersion: gateway.networking.k8s.io/v1 +kind: HTTPRoute +metadata: + name: '{{ .Release.Name }}ui-route' + namespace: '{{ .Release.Namespace }}' +spec: + parentRefs: + - group: gateway.networking.k8s.io + kind: Gateway + name: {{ .Values.kgateway.gatewayName }} + namespace: {{ .Values.kgateway.namespace }} + rules: + - backendRefs: + - group: "" + kind: Service + name: '{{ .Release.Name }}-ui' + port: {{ .Values.kgateway.airmui.servicePort }} + weight: 1 + matches: + - headers: + - name: Host + type: RegularExpression + value: {{ .Values.kgateway.airmui.prefixValue }}\..* + path: + type: PathPrefix + value: / + timeouts: + backendRequest: {{ 
.Values.kgateway.airmui.timeouts.backendRequest }} + request: {{ .Values.kgateway.airmui.timeouts.request }} diff --git a/sources/airm/0.3.5/charts/airm-api/templates/airm-rabbitmq-cluster.yaml b/sources/airm/0.3.5/charts/airm-api/templates/airm-rabbitmq-cluster.yaml new file mode 100644 index 00000000..3db2ff07 --- /dev/null +++ b/sources/airm/0.3.5/charts/airm-api/templates/airm-rabbitmq-cluster.yaml @@ -0,0 +1,69 @@ +# Copyright © Advanced Micro Devices, Inc., or its affiliates. +# +# SPDX-License-Identifier: MIT + +--- +apiVersion: rabbitmq.com/v1beta1 +kind: RabbitmqCluster +metadata: + name: '{{ .Release.Name }}-rabbitmq' + namespace: '{{ .Release.Namespace }}' +spec: + persistence: + {{- toYaml .Values.airm.rabbitmq.persistence | nindent 4 }} + replicas: {{ .Values.airm.rabbitmq.replicas }} + resources: + {{- toYaml .Values.airm.rabbitmq.resources | nindent 4 }} + secretBackend: + externalSecret: + name: '{{ .Release.Name }}-rabbitmq-admin' + tls: + secretName: '{{ .Release.Name }}-tls-secret' +--- +{{- if .Values.airm.rabbitmq.backup.enabled -}} + +apiVersion: batch/v1 +kind: CronJob +metadata: + name: '{{ .Release.Name }}-rabbitmq-backup-cron' + namespace: '{{ .Release.Namespace }}' +spec: + concurrencyPolicy: Forbid + jobTemplate: + spec: + template: + spec: + containers: + - env: + - name: RABBITMQ_URL + value: 'http://{{ .Release.Name }}-rabbitmq.{{ .Release.Namespace }}.svc.cluster.local:15672' + - name: RABBITMQ_USER + valueFrom: + secretKeyRef: + key: username + name: '{{ .Release.Name }}-rabbitmq-admin' + - name: RABBITMQ_PASSWORD + valueFrom: + secretKeyRef: + key: password + name: '{{ .Release.Name }}-rabbitmq-admin' + - name: S3_HOST + value: "{{ .Values.airm.backend.env.minioUrl }}" + - name: S3_ACCESS_KEY + valueFrom: + secretKeyRef: + key: minio-access-key + name: '{{ .Release.Name }}-api-minio-credentials' + - name: S3_SECRET_KEY + valueFrom: + secretKeyRef: + key: minio-secret-key + name: '{{ .Release.Name }}-api-minio-credentials' + image: 
'{{ .Values.airm.rabbitmq.backup.image }}' + name: rabbitmq-backup-cron + resources: + {{- toYaml .Values.airm.rabbitmq.backup.resources | nindent 16 }} + restartPolicy: OnFailure + schedule: 0 * * * * + +{{- end }} diff --git a/sources/airm/0.3.5/charts/airm-api/templates/airm-vllm-collector.yaml b/sources/airm/0.3.5/charts/airm-api/templates/airm-vllm-collector.yaml new file mode 100644 index 00000000..f12aa532 --- /dev/null +++ b/sources/airm/0.3.5/charts/airm-api/templates/airm-vllm-collector.yaml @@ -0,0 +1,93 @@ +# Copyright © Advanced Micro Devices, Inc., or its affiliates. +# +# SPDX-License-Identifier: MIT + +--- +apiVersion: opentelemetry.io/v1beta1 +kind: OpenTelemetryCollector +metadata: + name: "{{ .Release.Name }}-{{ .Values.aims.otelCollector.name }}" + namespace: "{{ .Release.Namespace }}" +spec: + mode: daemonset + image: "{{ .Values.aims.otelCollector.image }}" + config: + receivers: + prometheus: + config: + scrape_configs: + - job_name: "vllm" + metrics_path: /metrics + scrape_interval: "{{ .Values.aims.otelCollector.receivers.prometheus.config.scrape_configs.scrape_interval }}" + kubernetes_sd_configs: + - role: pod + relabel_configs: + # Only scrape pods with the workload-id label + - source_labels: + [__meta_kubernetes_pod_label_airm_silogen_ai_workload_id] + action: keep + regex: .+ + # Only scrape pods with app label starting with isvc. 
+ - source_labels: [__meta_kubernetes_pod_label_app] + action: keep + regex: isvc\..* + # Set the workload_id from the label + - source_labels: + [__meta_kubernetes_pod_label_airm_silogen_ai_workload_id] + target_label: workload_id + # Set service name from app label + - source_labels: [__meta_kubernetes_pod_label_app] + target_label: service + # Set service instance id from pod name + - source_labels: [__meta_kubernetes_pod_name] + target_label: service_instance_id + # Set the scrape target to port 8000 + - source_labels: [__meta_kubernetes_pod_ip] + target_label: __address__ + replacement: $1:8000 + otlp: + protocols: + grpc: {} + http: {} + + processors: + resource: + attributes: + - key: airm.silogen.ai/workload-id + from_attribute: workload_id + action: upsert + - key: service.instance.id + from_attribute: service_instance_id + action: upsert + - key: service.name + from_attribute: service + action: upsert + + transform: + metric_statements: + - context: datapoint + statements: + - set(attributes["workload_id"], resource.attributes["airm.silogen.ai/workload-id"]) where attributes["workload_id"] == nil + - set(attributes["service_instance_id"], resource.attributes["service.instance.id"]) where attributes["service_instance_id"] == nil + - set(attributes["service"], resource.attributes["service.name"]) where attributes["service"] == nil + + exporters: + otlphttp: + endpoint: "{{ .Values.aims.otelCollector.exporters.otlphttp.endpoint }}" + + service: + pipelines: + metrics: + receivers: [prometheus] + processors: [resource, transform] + exporters: [otlphttp] + + traces: + receivers: [otlp] + processors: [resource] + exporters: [otlphttp] + + logs: + receivers: [otlp] + processors: [resource] + exporters: [otlphttp] diff --git a/sources/airm/0.3.5/charts/airm-api/values.yaml b/sources/airm/0.3.5/charts/airm-api/values.yaml new file mode 100644 index 00000000..4213af3d --- /dev/null +++ b/sources/airm/0.3.5/charts/airm-api/values.yaml @@ -0,0 +1,166 @@ +# Copyright 
© Advanced Micro Devices, Inc., or its affiliates. +# +# SPDX-License-Identifier: MIT + +secretgenerator: + image: + repository: amdenterpriseai/cluster-tool + tag: latest + pullPolicy: IfNotPresent + +kgateway: + namespace: kgateway-system + gatewayName: https + airmapi: + servicePort: 80 + prefixValue: airmapi + timeouts: + stream: + backendRequest: 30m + request: 30m + nonStream: + backendRequest: 10m + request: 10m + airmui: + servicePort: 80 + prefixValue: airmui + timeouts: + backendRequest: 1m + request: 1m + keycloak: + prefixValue: kc + +aims: + otelCollector: + image: ghcr.io/open-telemetry/opentelemetry-collector-releases/opentelemetry-collector-contrib:0.113.0 + receivers: + prometheus: + config: + scrape_configs: + scrape_interval: 20s + exporters: + otlphttp: + endpoint: "http://lgtm-stack.otel-lgtm-stack.svc:4318" + name: "vllm-collector" + +airm: + appDomain: PUBLIC-IP + includeDemoSetup: true + + externalSecretStore: + airm: + name: openbao-secret-store + minio: + name: openbao-secret-store + keycloak: + name: openbao-secret-store + + postgresql: + enabled: true + cnpg: + image: ghcr.io/cloudnative-pg/postgresql:17 + instance: 1 + resources: + limits: + cpu: "2" + memory: 1Gi + requests: + cpu: "1" + memory: 512Mi + storage: + size: 50Gi + storageClass: default + walStorage: + size: 50Gi + storageClass: default + + rabbitmq: + replicas: 1 + resources: + requests: + cpu: 500m + memory: 1Gi + limits: + cpu: '1' + memory: 1Gi + persistence: + storage: 20Gi + storageClassName: default + backup: + enabled: false + image: amdenterpriseai/rabbitmq-backup:0.1 + resources: + limits: + memory: 512Mi + requests: + cpu: 250m + memory: 256Mi + + keycloak: + internalUrl: http://keycloak.keycloak.svc.cluster.local:8080 + clientId: "354a0fa1-35ac-4a6d-9c4d-d661129c2cd0" + realm: airm + + frontend: + image: + repository: amdenterpriseai/airm-ui + tag: 0.3.5 + pullPolicy: IfNotPresent + servicePort: 80 + resources: + limits: + memory: 4Gi + requests: + cpu: 500m + 
memory: 4Gi + + backend: + image: + repository: amdenterpriseai/airm-api + tag: 0.3.5 + pullPolicy: IfNotPresent + + servicePort: 80 + servicePortMetrics: 9009 + env: + dbPort: 5432 + rabbitmqPort: 5672 + minioUrl: http://minio.minio-tenant-default.svc.cluster.local:80 + minioBucket: default-bucket + prometheusUrl: http://lgtm-stack.otel-lgtm-stack.svc.cluster.local:9090 + clusterAuthUrl: http://cluster-auth.cluster-auth.svc.cluster.local:8081 + + resources: + limits: + memory: 1Gi + requests: + cpu: 500m + memory: 1Gi + securityContext: + allowPrivilegeEscalation: false + runAsNonRoot: true + runAsUser: 1000 + seccompProfile: + type: RuntimeDefault + + utilities: + netcat: + image: + repository: busybox + tag: 1.37.0 + pullPolicy: IfNotPresent + curl: + image: + repository: curlimages/curl + tag: 8.16.0 + pullPolicy: IfNotPresent + liquibase: + image: + repository: docker.io/liquibase/liquibase + tag: 4.31 + pullPolicy: IfNotPresent + clusterTool: + image: + repository: amdenterpriseai/cluster-tool + tag: latest + pullPolicy: IfNotPresent diff --git a/sources/airm/0.3.5/charts/airm-dispatcher/.helmignore b/sources/airm/0.3.5/charts/airm-dispatcher/.helmignore new file mode 100644 index 00000000..64c2af83 --- /dev/null +++ b/sources/airm/0.3.5/charts/airm-dispatcher/.helmignore @@ -0,0 +1,27 @@ +# Copyright © Advanced Micro Devices, Inc., or its affiliates. +# +# SPDX-License-Identifier: MIT + +# Patterns to ignore when building packages. +# This supports shell glob matching, relative path matching, and +# negation (prefixed with !). Only one pattern per line. 
+.DS_Store +# Common VCS dirs +.git/ +.gitignore +.bzr/ +.bzrignore +.hg/ +.hgignore +.svn/ +# Common backup files +*.swp +*.bak +*.tmp +*.orig +*~ +# Various IDEs +.project +.idea/ +*.tmproj +.vscode/ diff --git a/sources/airm/0.3.5/charts/airm-dispatcher/Chart.yaml b/sources/airm/0.3.5/charts/airm-dispatcher/Chart.yaml new file mode 100644 index 00000000..96b660fd --- /dev/null +++ b/sources/airm/0.3.5/charts/airm-dispatcher/Chart.yaml @@ -0,0 +1,29 @@ +# Copyright © Advanced Micro Devices, Inc., or its affiliates. +# +# SPDX-License-Identifier: MIT + +--- +apiVersion: v2 +name: airm-dispatcher +description: A Helm chart for AIRM Dispatcher + +# A chart can be either an 'application' or a 'library' chart. +# +# Application charts are a collection of templates that can be packaged into versioned archives +# to be deployed. +# +# Library charts provide useful utilities or functions for the chart developer. They're included as +# a dependency of application charts to inject those utilities and functions into the rendering +# pipeline. Library charts do not define any templates and therefore cannot be deployed. +type: application + +# This is the chart version. This version number should be incremented each time you make changes +# to the chart and its templates, including the app version. +# Versions are expected to follow Semantic Versioning (https://semver.org/) +version: 0.3.5 + +# This is the version number of the application being deployed. This version number should be +# incremented each time you make changes to the application. Versions are not expected to +# follow Semantic Versioning. They should reflect the version the application is using. +# It is recommended to use it with quotes. 
+appVersion: "1.0.0" diff --git a/sources/airm/0.3.5/charts/airm-dispatcher/README.md b/sources/airm/0.3.5/charts/airm-dispatcher/README.md new file mode 100644 index 00000000..0b85c706 --- /dev/null +++ b/sources/airm/0.3.5/charts/airm-dispatcher/README.md @@ -0,0 +1,54 @@ + + +# HELM CHARTS + +Simple instructions to deploy AIRM dispatcher application using helm chart. +The dispatcher can be run on a compute cluster, which may or may not be the same as the one hosting the AIRM API and UI. + +### 1. Requirements + +The following external components must be available in the Kubernetes cluster before the helm chart can be installed. + +- Accessible RabbitMQ cluster (must be the same cluster used by AIRM API). +- Kaiwo installed on the cluster (along with all its dependencies) + +### 2. Install + +``` +cd helm/airm/charts + +# 1. Create output template just to validate (the public domain could be app-dev.silogen.ai, staging.silogen.ai, etc.) +helm template airm-dispatcher ./airm-dispatcher -n airm --create-namespace > airm-dispatcher-helm-generated.yaml + +# 2. Run chart install +helm install airm-dispatcher ./airm-dispatcher -n airm --create-namespace + +# 3. Delete chart if needed +helm delete airm-dispatcher -n airm + +# 4. Upgrade when bumping versions +helm upgrade -n airm --set airm-dispatcher ./airm-dispatcher +``` + +--- + +### 3. 
Helm Settings + +| Field Path | Description | Type | Example / Default | +|---------------------------------------------|--------------------------------------------------------------|---------|-----------------------------------| +| airm.dispatcher.image.repository | Dispatcher image repository | string | `amdenterpriseai/airm-dispatcher` | +| airm.dispatcher.image.tag | Dispatcher image tag | string | `v2025.08-rc.21` | +| airm.dispatcher.image.pullPolicy | Dispatcher image pull policy | string | `IfNotPresent` | +| airm.dispatcher.servicePort | Dispatcher service port | int | `80` | +| airm.utilities.netcat.image.repository | Netcat image repository | string | `busybox` | +| airm.utilities.netcat.image.tag | Netcat image tag | string | `1.37.0` | +| airm.utilities.netcat.image.pullPolicy | Netcat image pull policy | string | `IfNotPresent` | +| airm.utilities.curl.image.repository | Curl image repository | string | `curlimages/curl` | +| airm.utilities.curl.image.tag | Curl image tag | string | `8.16.0` | +| airm.utilities.curl.image.pullPolicy | Curl image pull policy | string | `IfNotPresent` | +| airm.additionalClusterRoles.platformAdmin | Additional cluster roles for the Platform Administrator role | array | `[]` | +| airm.additionalClusterRoles.projectMember | Additional cluster roles for the Project Member role | array | `[]` | diff --git a/sources/airm/0.3.5/charts/airm-dispatcher/templates/airm-cluster-policies.yaml b/sources/airm/0.3.5/charts/airm-dispatcher/templates/airm-cluster-policies.yaml new file mode 100644 index 00000000..caf92aa6 --- /dev/null +++ b/sources/airm/0.3.5/charts/airm-dispatcher/templates/airm-cluster-policies.yaml @@ -0,0 +1,352 @@ +# Copyright © Advanced Micro Devices, Inc., or its affiliates. 
+# +# SPDX-License-Identifier: MIT + +apiVersion: kyverno.io/v1 +kind: ClusterPolicy +metadata: + name: airm-project-namespace-rolebinding +spec: + background: false + rules: + - name: generate-project-namespace-rolebinding + match: + any: + - resources: + kinds: + - Namespace + operations: + - CREATE + preconditions: + any: + - key: '{{`{{request.object.metadata.labels."airm.silogen.ai/project-id" || '''' }}`}}' + operator: NotEquals + value: "" + skipBackgroundRequests: true + generate: + apiVersion: rbac.authorization.k8s.io/v1 + kind: RoleBinding + name: "project-member-role-binding" + namespace: "{{`{{request.object.metadata.name}}`}}" + synchronize: true + data: + roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: airm-project-member + subjects: + - kind: Group + # Add this in for backwards compatibility + name: "oidc{{`{{request.object.metadata.name}}`}}" + apiGroup: rbac.authorization.k8s.io + - kind: Group + # The kubernetes cluster apply an OIDC prefix of 'oidc:', so we adjust the groups to expect that + name: "oidc:{{`{{request.object.metadata.name}}`}}" + apiGroup: rbac.authorization.k8s.io +--- +# Kyverno policy that enforces that workloads submitted to a namespace managed by AIRMan have the +# correct kueue lables and field set, so that they are bound by the quota of the namespace +apiVersion: kyverno.io/v1 +kind: ClusterPolicy +metadata: + name: airm-quota-enforcement-for-workloads +spec: + background: false + rules: + - name: set-queue-name-from-namespace-default + match: + resources: + kinds: + - Deployment + - StatefulSet + - Pod + namespaceSelector: + matchExpressions: + - key: airm.silogen.ai/project-id + operator: Exists + preconditions: + all: + - key: '{{`{{request.object.metadata.labels."kueue.x-k8s.io/queue-name" || '''' }}`}}' + operator: NotEquals + value: "{{`{{request.namespace }}`}}" + skipBackgroundRequests: true + mutate: + patchStrategicMerge: + metadata: + labels: + kueue.x-k8s.io/queue-name: 
"{{`{{request.namespace }}`}}" + + - name: set-queue-name-from-namespace-jobs + match: + resources: + kinds: + - Job # https://kueue.sigs.k8s.io/docs/tasks/run/jobs/ + namespaceSelector: + matchExpressions: + - key: airm.silogen.ai/project-id + operator: Exists + preconditions: + all: + - key: '{{`{{request.object.metadata.labels."kueue.x-k8s.io/queue-name" || '''' }}`}}' + operator: NotEquals + value: "{{`{{request.namespace }}`}}" + skipBackgroundRequests: true + mutate: + patchStrategicMerge: + metadata: + labels: + kueue.x-k8s.io/queue-name: "{{`{{request.namespace }}`}}" + spec: + suspend: true + + - name: set-queue-name-from-namespace-cronjobs + match: + resources: + kinds: + - CronJob # https://kueue.sigs.k8s.io/docs/tasks/run/run_cronjobs/ + namespaceSelector: + matchExpressions: + - key: airm.silogen.ai/project-id + operator: Exists + preconditions: + all: + - key: '{{`{{request.object.spec.jobTemplate.metadata.labels."kueue.x-k8s.io/queue-name" || '''' }}`}}' + operator: NotEquals + value: "{{`{{request.namespace }}`}}" + skipBackgroundRequests: true + mutate: + patchStrategicMerge: + spec: + jobTemplate: + metadata: + labels: + kueue.x-k8s.io/queue-name: "{{`{{request.namespace }}`}}" + spec: + suspend: true + + - name: set-queue-name-from-namespace-kaiwo + match: + resources: + kinds: + - KaiwoJob + - KaiwoService + - AIMService + namespaceSelector: + matchExpressions: + - key: airm.silogen.ai/project-id + operator: Exists + preconditions: + all: + - key: "{{`{{request.object.spec.clusterQueue || '' }}`}}" + operator: NotEquals + value: "{{`{{request.namespace }}`}}" + skipBackgroundRequests: true + mutate: + patchStrategicMerge: + spec: + clusterQueue: "{{`{{request.namespace }}`}}" +--- +apiVersion: kyverno.io/v1 +kind: ClusterPolicy +metadata: + name: airm-workload-tracking-policy +spec: + background: false + rules: + # For all supported types, if airm.silogen.ai/workload-id and airm.silogen.ai/component-id are not set, we assume + # it has been 
submitted from outside of AIRMan. In that case, we set airm.silogen.ai/auto-discovered: true, so it can + # be tracked upstream. We also set airm.silogen.ai/discovered-component-type so that we can identify the type of component + # that was originally tracked, and ignore children created by it. See remove-auto-discovered-annotations-inherited-from-parent + # We also try to capture the user who submitted the workload, and consume it in the application + + # Please note that ReplicaSet is not supported because by default it is filtered away by Kyverno by default: https://github.com/kyverno/kyverno/blob/main/charts/kyverno/values.yaml#L270 + - name: add-discovery-annotations-for-supported-types + match: + resources: + kinds: + - Job + - Deployment + - StatefulSet + - DaemonSet + - CronJob + - KaiwoJob + - KaiwoService + - AIMService + - Pod + namespaceSelector: + matchExpressions: + - key: airm.silogen.ai/project-id + operator: Exists + preconditions: + all: + - key: '{{`{{request.object.metadata.labels."airm.silogen.ai/workload-id" || '''' }}`}}' + operator: Equals + value: "" + - key: '{{`{{request.object.metadata.labels."airm.silogen.ai/component-id" || '''' }}`}}' + operator: Equals + value: "" + - key: '{{`{{request.object.metadata.annotations."airm.silogen.ai/auto-discovered" || '''' }}`}}' + operator: Equals + value: "" + skipBackgroundRequests: true + mutate: + patchStrategicMerge: + metadata: + annotations: + airm.silogen.ai/submitter: "{{`{{request.userInfo.username }}`}}" + airm.silogen.ai/auto-discovered: "true" + airm.silogen.ai/discovered-component-type: "{{`{{request.object.kind }}`}}" + # For all supported types, if airm.silogen.ai/auto-discovered is set and the airm.silogen.ai/discovered-component-type + # doesnt match the kind of the current component, we assume this type has been created by a parent which is also + # supported by AIRMan and we dont need to track this type upstream, so we unset the airm.silogen.ai/auto-discovered annotation. 
+ # This is mostly to account for KaiwoJob, KaiwoService, AIMService which propagate annotations to pods. + - name: remove-auto-discovered-annotations-inherited-from-parent + match: + resources: + kinds: + - Job + - Deployment + - StatefulSet + - DaemonSet + - CronJob + - KaiwoJob + - KaiwoService + - AIMService + - Pod + namespaceSelector: + matchExpressions: + - key: airm.silogen.ai/project-id + operator: Exists + preconditions: + all: + - key: '{{`{{request.object.metadata.annotations."airm.silogen.ai/auto-discovered" || '''' }}`}}' + operator: Equals + value: "true" + - key: '{{`{{request.object.metadata.annotations."airm.silogen.ai/discovered-component-type" || '''' }}`}}' + operator: NotEquals + value: "{{`{{request.object.kind }}`}}" + skipBackgroundRequests: true + mutate: + patchStrategicMerge: + metadata: + annotations: + airm.silogen.ai/auto-discovered: "false" + # For all supported types, if airm.silogen.ai/project-id does not match that of the namespace label, overwrite it + # with the expected value, to avoid metrics getting mixed up between projects. 
+ - name: set-project-id-from-namespace-label + match: + resources: + kinds: + - Job + - Deployment + - StatefulSet + - DaemonSet + - CronJob + - KaiwoJob + - KaiwoService + - AIMService + - Pod + namespaceSelector: + matchExpressions: + - key: airm.silogen.ai/project-id + operator: Exists + context: + - name: ns_labels + apiCall: + urlPath: "/api/v1/namespaces/{{`{{request.namespace }}`}}" + method: GET + jmesPath: "metadata.labels" + preconditions: + all: + - key: '{{`{{request.object.metadata.labels."airm.silogen.ai/project-id" || '''' }}`}}' + operator: NotEquals + value: '{{`{{ns_labels."airm.silogen.ai/project-id" }}`}}' + skipBackgroundRequests: true + mutate: + patchStrategicMerge: + metadata: + labels: + airm.silogen.ai/project-id: '{{`{{ns_labels."airm.silogen.ai/project-id" }}`}}' + # Add the airm.silogen.ai/workload-id and airm.silogen.ai/component-id labels to types that expect them at + # metadata.labels. The expectation is that these are propagated to the subsequent pods that are created. + + # If the resource is spawned off by a CRD, it will not know about the labels on the previous version of the object, + # so we also check request.oldObject for the labels to try and preserve them if they were already set. 
+ - name: add-workload-and-component-id-default + match: + resources: + kinds: + - Pod + - KaiwoJob + - KaiwoService + - AIMService + namespaceSelector: + matchExpressions: + - key: airm.silogen.ai/project-id + operator: Exists + skipBackgroundRequests: true + mutate: + patchStrategicMerge: + metadata: + labels: + airm.silogen.ai/workload-id: '{{`{{request.object.metadata.labels."airm.silogen.ai/workload-id" || request.oldObject.metadata.labels."airm.silogen.ai/workload-id" || request.uid }}`}}' + airm.silogen.ai/component-id: '{{`{{request.object.metadata.labels."airm.silogen.ai/component-id" || request.oldObject.metadata.labels."airm.silogen.ai/component-id" || request.uid }}`}}' + # Add the airm.silogen.ai/workload-id and airm.silogen.ai/component-id labels to objects with templates and + # also add it to spec.template.metadata.labels to ensure that the pods created by them contain the labels as well + - name: add-workload-and-component-id-to-objects-with-template + match: + resources: + kinds: + - Job + - Deployment + - StatefulSet + - DaemonSet + namespaceSelector: + matchExpressions: + - key: airm.silogen.ai/project-id + operator: Exists + skipBackgroundRequests: true + mutate: + patchStrategicMerge: + metadata: + labels: + airm.silogen.ai/workload-id: '{{`{{request.object.metadata.labels."airm.silogen.ai/workload-id" || request.oldObject.metadata.labels."airm.silogen.ai/workload-id" || request.uid }}`}}' + airm.silogen.ai/component-id: '{{`{{request.object.metadata.labels."airm.silogen.ai/component-id" || request.oldObject.metadata.labels."airm.silogen.ai/component-id" || request.uid }}`}}' + spec: + template: + metadata: + labels: + airm.silogen.ai/workload-id: '{{`{{request.object.metadata.labels."airm.silogen.ai/workload-id" || request.oldObject.metadata.labels."airm.silogen.ai/workload-id" || request.uid }}`}}' + airm.silogen.ai/component-id: '{{`{{request.object.metadata.labels."airm.silogen.ai/component-id" || 
request.oldObject.metadata.labels."airm.silogen.ai/component-id" || request.uid }}`}}' + + # Add the airm.silogen.ai/workload-id and airm.silogen.ai/component-id labels to cronjob and + # also add it to spec.jobTemplate.metadata.labels to ensure that the pods created by the cronjob + # contain it as well + - name: add-workload-and-component-id-cronjobs + match: + resources: + kinds: + - CronJob + namespaceSelector: + matchExpressions: + - key: airm.silogen.ai/project-id + operator: Exists + skipBackgroundRequests: true + mutate: + patchStrategicMerge: + metadata: + labels: + airm.silogen.ai/workload-id: '{{`{{request.object.metadata.labels."airm.silogen.ai/workload-id" || request.oldObject.metadata.labels."airm.silogen.ai/workload-id" || request.uid }}`}}' + airm.silogen.ai/component-id: '{{`{{request.object.metadata.labels."airm.silogen.ai/component-id" || request.oldObject.metadata.labels."airm.silogen.ai/component-id" || request.uid }}`}}' + spec: + jobTemplate: + metadata: + labels: + airm.silogen.ai/workload-id: '{{`{{request.object.metadata.labels."airm.silogen.ai/workload-id" || request.oldObject.metadata.labels."airm.silogen.ai/workload-id" || request.uid }}`}}' + airm.silogen.ai/component-id: '{{`{{request.object.metadata.labels."airm.silogen.ai/component-id" || request.oldObject.metadata.labels."airm.silogen.ai/component-id" || request.uid }}`}}' + spec: + template: + metadata: + labels: + airm.silogen.ai/workload-id: '{{`{{request.object.metadata.labels."airm.silogen.ai/workload-id" || request.oldObject.metadata.labels."airm.silogen.ai/workload-id" || request.uid }}`}}' + airm.silogen.ai/component-id: '{{`{{request.object.metadata.labels."airm.silogen.ai/component-id" || request.oldObject.metadata.labels."airm.silogen.ai/component-id" || request.uid }}`}}' diff --git a/sources/airm/0.3.5/charts/airm-dispatcher/templates/airm-cluster-roles.yaml b/sources/airm/0.3.5/charts/airm-dispatcher/templates/airm-cluster-roles.yaml new file mode 100644 index 
00000000..2461e894 --- /dev/null +++ b/sources/airm/0.3.5/charts/airm-dispatcher/templates/airm-cluster-roles.yaml @@ -0,0 +1,164 @@ +# Copyright © Advanced Micro Devices, Inc., or its affiliates. +# +# SPDX-License-Identifier: MIT + +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: airm-platform-admin +rules: + - apiGroups: [""] + resources: + [ + "pods", + "events", + "services", + "configmaps", + "persistentvolumes", + "persistentvolumeclaims", + "namespaces", + "serviceaccounts", + ] + verbs: ["get", "list", "watch", "delete"] + - apiGroups: [""] + resources: ["pods/log", "pods/exec", "pods/attach", "pods/portforward"] + verbs: ["*"] + - apiGroups: ["apps"] + resources: ["deployments", "deployments/scale", "replicasets", "statefulsets", "daemonsets"] + verbs: ["get", "list", "watch", "delete"] + - apiGroups: ["batch"] + resources: ["jobs", "cronjobs"] + verbs: ["get", "list", "watch", "delete"] + - apiGroups: ["networking.k8s.io"] + resources: ["ingresses", "networkpolicies"] + verbs: ["get", "list", "watch", "delete"] + - apiGroups: ["kaiwo.silogen.ai"] + resources: ["kaiwojobs", "kaiwoservices"] + verbs: ["get", "list", "watch", "delete"] + - apiGroups: ["aim.silogen.ai"] + resources: ["aimservices", "aimmodels", "aimmodelcaches", "aimservicetemplates", "aimruntimeconfigs", "aimtemplatecaches"] + verbs: ["get", "list", "watch", "delete"] + - apiGroups: ["gateway.networking.k8s.io"] + resources: ["httproutes", "gateways"] + verbs: ["get", "list", "watch", "delete"] + - apiGroups: ["config.kaiwo.silogen.ai"] + resources: ["kaiwoconfigs"] + verbs: ["*"] + - apiGroups: ["kaiwo.silogen.ai"] + resources: ["kaiwoqueueconfigs"] + verbs: ["*"] + - apiGroups: ["aim.silogen.ai"] + resources: [ "aimclustermodels", "aimclusterservicetemplates", "aimclusterruntimeconfigs", "aimclustermodelsources" ] + verbs: ["*"] + - apiGroups: [""] + resources: ["secrets"] + verbs: ["get", "list", "watch", "create", "delete"] + - apiGroups: [""] + resources: 
["nodes"] + verbs: ["get", "list", "watch"] + - apiGroups: ["rbac.authorization.k8s.io"] + resources: ["roles", "clusterroles", "rolebindings", "clusterrolebindings"] + verbs: ["get", "list", "watch"] + - apiGroups: ["external-secrets.io"] + resources: ["externalsecrets"] + verbs: ["get", "list", "watch", "create", "delete"] + - apiGroups: ["external-secrets.io"] + resources: ["clustersecretstores"] + verbs: ["get", "list", "watch"] + - apiGroups: ["storage.k8s.io"] + resources: ["storageclasses"] + verbs: ["get", "list", "watch"] + - apiGroups: ["kueue.x-k8s.io"] + resources: ["clusterqueues", "resourceflavors", "localqueues", "workloadpriorityclasses"] + verbs: ["get", "list", "watch"] + - apiGroups: ["apiextensions.k8s.io"] + resources: ["customresourcedefinitions"] + verbs: ["get", "list"] +{{- range .Values.airm.additionalClusterRoles.platformAdmin }} + - apiGroups: + {{ .apiGroups | toYaml | nindent 6 }} + resources: + {{ .resources | toYaml | nindent 6 }} + verbs: + {{ .verbs | toYaml | nindent 6 }} +{{- end }} +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: airm-platform-admin-role-binding +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: airm-platform-admin +subjects: + - kind: Group + # Add this in for backwards compatibility + name: "oidcairm-role:Platform Administrator" + apiGroup: rbac.authorization.k8s.io + - kind: Group + # The kubernetes cluster apply an OIDC prefix of 'oidc':, so we adjust the group to expect that + name: "oidc:airm-role:Platform Administrator" + apiGroup: rbac.authorization.k8s.io +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: airm-project-member +rules: + - apiGroups: [""] + resources: + [ + "pods", + "pods/log", + "pods/exec", + "pods/attach", + "pods/portforward", + "events", + "services", + "configmaps", + "persistentvolumes", + "persistentvolumeclaims", + ] + verbs: ["*"] + - apiGroups: ["apps"] + resources: 
["deployments", "replicasets", "statefulsets", "daemonsets"] + verbs: ["*"] + - apiGroups: ["batch"] + resources: ["jobs", "cronjobs"] + verbs: ["*"] + - apiGroups: ["networking.k8s.io"] + resources: ["ingresses", "networkpolicies"] + verbs: ["*"] + - apiGroups: ["kaiwo.silogen.ai"] + resources: ["kaiwojobs", "kaiwoservices"] + verbs: ["*"] + - apiGroups: ["aim.silogen.ai"] + resources: ["aimservices", "aimmodels", "aimmodelcaches", "aimservicetemplates", "aimruntimeconfigs", "aimtemplatecaches"] + verbs: ["*"] + - apiGroups: ["gateway.networking.k8s.io"] + resources: ["httproutes", "gateways"] + verbs: ["*"] + - apiGroups: [""] + resources: ["secrets"] + verbs: ["list", "watch", "create"] + - apiGroups: [""] + resources: ["nodes"] + verbs: ["get", "list", "watch"] + - apiGroups: ["rbac.authorization.k8s.io"] + resources: ["roles", "clusterroles", "rolebindings", "clusterrolebindings"] + verbs: ["get", "list", "watch"] + - apiGroups: ["external-secrets.io"] + resources: ["clustersecretstores", "externalsecrets"] + verbs: ["get", "list", "watch"] + - apiGroups: ["storage.k8s.io"] + resources: ["storageclasses"] + verbs: ["get", "list", "watch"] +{{- range .Values.airm.additionalClusterRoles.projectMember }} + - apiGroups: + {{ .apiGroups | toYaml | nindent 6 }} + resources: + {{ .resources | toYaml | nindent 6 }} + verbs: + {{ .verbs | toYaml | nindent 6 }} +{{- end }} diff --git a/sources/airm/0.3.5/charts/airm-dispatcher/templates/airm-dispatcher-app.yaml b/sources/airm/0.3.5/charts/airm-dispatcher/templates/airm-dispatcher-app.yaml new file mode 100644 index 00000000..8a3489ef --- /dev/null +++ b/sources/airm/0.3.5/charts/airm-dispatcher/templates/airm-dispatcher-app.yaml @@ -0,0 +1,343 @@ +# Copyright © Advanced Micro Devices, Inc., or its affiliates. 
+# +# SPDX-License-Identifier: MIT + +--- +apiVersion: batch/v1 +kind: CronJob +metadata: + name: "{{ .Release.Name }}-cluster-nodes-dispatcher" + namespace: "{{ .Release.Namespace }}" +spec: + concurrencyPolicy: Replace + jobTemplate: + spec: + template: + spec: + containers: + - command: + - curl + - -X + - POST + - http://{{ .Release.Name }}-dispatcher:{{ .Values.airm.dispatcher.servicePort }}/v1/clusters/nodes + image: "{{ .Values.airm.utilities.curl.image.repository }}:{{ .Values.airm.utilities.curl.image.tag }}" + imagePullPolicy: "{{ .Values.airm.utilities.curl.image.pullPolicy }}" + name: airm-cluster-nodes-cron + resources: + limits: + memory: 100Mi + requests: + cpu: 50m + memory: 100Mi + restartPolicy: OnFailure + schedule: 0 * * * * +--- +apiVersion: batch/v1 +kind: CronJob +metadata: + name: "{{ .Release.Name }}-aim-models-dispatcher" + namespace: "{{ .Release.Namespace }}" +spec: + concurrencyPolicy: Forbid + schedule: "*/5 * * * *" + suspend: false + jobTemplate: + spec: + template: + metadata: + annotations: + sidecar.istio.io/inject: "false" + spec: + restartPolicy: OnFailure + initContainers: + - name: check-dispatcher-is-ready + command: + [ + "/bin/sh", + "-c", + "while ! 
nc -z \"$ENDPOINT_URL_TO_CHECK\" \"$ENDPOINT_PORT_TO_CHECK\"; do echo \"Waiting for Airm Dispatcher at ${ENDPOINT_URL_TO_CHECK}:${ENDPOINT_PORT_TO_CHECK}...\"; sleep 3; done; echo \"Airm Dispatcher is accepting connections at ${ENDPOINT_URL_TO_CHECK}:${ENDPOINT_PORT_TO_CHECK}.\"; sleep 3; exit 0" + ] + image: "{{ .Values.airm.utilities.netcat.image.repository }}:{{ .Values.airm.utilities.netcat.image.tag }}" + imagePullPolicy: "{{ .Values.airm.utilities.netcat.image.pullPolicy }}" + env: + - name: ENDPOINT_URL_TO_CHECK + value: "{{ .Release.Name }}-dispatcher.{{ .Release.Namespace }}.svc.cluster.local" + - name: ENDPOINT_PORT_TO_CHECK + value: "{{ .Values.airm.dispatcher.servicePort }}" + securityContext: + allowPrivilegeEscalation: false + runAsNonRoot: false + runAsUser: 0 + capabilities: + drop: + - ALL + add: + - SETUID + - SETGID + - CHOWN + - DAC_OVERRIDE + - FOWNER + - FSETID + seccompProfile: + type: RuntimeDefault + containers: + - name: airm-aim-models-cron + command: + - curl + - -X + - POST + - http://{{ .Release.Name }}-dispatcher:{{ .Values.airm.dispatcher.servicePort }}/v1/aims/cluster-models + image: "{{ .Values.airm.utilities.curl.image.repository }}:{{ .Values.airm.utilities.curl.image.tag }}" + imagePullPolicy: "{{ .Values.airm.utilities.curl.image.pullPolicy }}" + resources: + requests: + memory: "100Mi" + cpu: "50m" + limits: + memory: "100Mi" +--- +apiVersion: batch/v1 +kind: CronJob +metadata: + name: "{{ .Release.Name }}-heartbeat-dispatcher" + namespace: "{{ .Release.Namespace }}" +spec: + concurrencyPolicy: Replace + schedule: "*/1 * * * *" + suspend: false + jobTemplate: + spec: + template: + metadata: + annotations: + sidecar.istio.io/inject: "false" + spec: + restartPolicy: OnFailure + initContainers: + - name: check-dispatcher-is-ready + command: + [ + "/bin/sh", + "-c", + "while ! 
nc -z \"$ENDPOINT_URL_TO_CHECK\" \"$ENDPOINT_PORT_TO_CHECK\"; do echo \"Waiting for Airm Dispatcher at ${ENDPOINT_URL_TO_CHECK}:${ENDPOINT_PORT_TO_CHECK}...\"; sleep 3; done; echo \"Airm Dispatcher is accepting connections at ${ENDPOINT_URL_TO_CHECK}:${ENDPOINT_PORT_TO_CHECK}.\"; sleep 3; exit 0" + ] + image: "{{ .Values.airm.utilities.netcat.image.repository }}:{{ .Values.airm.utilities.netcat.image.tag }}" + imagePullPolicy: "{{ .Values.airm.utilities.netcat.image.pullPolicy }}" + env: + - name: ENDPOINT_URL_TO_CHECK + value: "{{ .Release.Name }}-dispatcher.{{ .Release.Namespace }}.svc.cluster.local" + - name: ENDPOINT_PORT_TO_CHECK + value: "{{ .Values.airm.dispatcher.servicePort }}" + securityContext: + allowPrivilegeEscalation: false + runAsNonRoot: false + runAsUser: 0 + capabilities: + drop: + - ALL + add: + - SETUID + - SETGID + - CHOWN + - DAC_OVERRIDE + - FOWNER + - FSETID + seccompProfile: + type: RuntimeDefault + containers: + - name: airm-heartbeat-cron + command: + - curl + - -X + - POST + - http://{{ .Release.Name }}-dispatcher:{{ .Values.airm.dispatcher.servicePort }}/v1/heartbeats + image: "{{ .Values.airm.utilities.curl.image.repository }}:{{ .Values.airm.utilities.curl.image.tag }}" + imagePullPolicy: "{{ .Values.airm.utilities.curl.image.pullPolicy }}" + resources: + requests: + memory: "100Mi" + cpu: "50m" + limits: + memory: "100Mi" +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: "{{ .Release.Name }}-dispatcher" + namespace: "{{ .Release.Namespace }}" +spec: + replicas: 1 + selector: + matchLabels: + app: "{{ .Release.Name }}-dispatcher" + template: + metadata: + labels: + app: "{{ .Release.Name }}-dispatcher" + spec: + serviceAccountName: "{{ .Release.Name }}-dispatcher-sa" + {{- with .Values.airm.dispatcher.imagePullSecrets }} + imagePullSecrets: + {{- toYaml . | nindent 8 }} + {{- end }} + initContainers: + - name: check-rabbitmq-is-ready + command: + [ + "/bin/sh", + "-c", + "while ! 
nc -z \"$ENDPOINT_URL_TO_CHECK\" \"$ENDPOINT_PORT_TO_CHECK\"; do echo \"Waiting for Airm RabbitMQ at ${ENDPOINT_URL_TO_CHECK}:${ENDPOINT_PORT_TO_CHECK}...\"; sleep 3; done; echo \"Airm RabbitMQ is accepting connections at ${ENDPOINT_URL_TO_CHECK}:${ENDPOINT_PORT_TO_CHECK}.\"; sleep 3; exit 0" + ] + image: "{{ .Values.airm.utilities.netcat.image.repository }}:{{ .Values.airm.utilities.netcat.image.tag }}" + imagePullPolicy: "{{ .Values.airm.utilities.netcat.image.pullPolicy }}" + env: + - name: ENDPOINT_URL_TO_CHECK + value: "{{ .Release.Name }}-rabbitmq.{{ .Release.Namespace }}.svc.cluster.local" + - name: ENDPOINT_PORT_TO_CHECK + value: "{{ .Values.airm.dispatcher.env.rabbitmqPort }}" + securityContext: + allowPrivilegeEscalation: false + runAsNonRoot: false + runAsUser: 0 + capabilities: + drop: + - ALL + add: + - SETUID + - SETGID + - CHOWN + - DAC_OVERRIDE + - FOWNER + - FSETID + seccompProfile: + type: RuntimeDefault + containers: + - name: airm-dispatcher + image: "{{ .Values.airm.dispatcher.image.repository }}:{{ .Values.airm.dispatcher.image.tag }}" + imagePullPolicy: "{{ .Values.airm.dispatcher.image.pullPolicy }}" + ports: + - containerPort: 8080 + env: + - name: KUBE_CLUSTER_NAME + value: demo-cluster + - name: ORG_NAME + value: demo + - name: RABBITMQ_HOST + value: "{{ .Release.Name }}-rabbitmq.{{ .Release.Namespace }}.svc.cluster.local" + - name: RABBITMQ_PORT + value: "{{ .Values.airm.dispatcher.env.rabbitmqPort }}" + - name: RABBITMQ_AIRM_COMMON_VHOST + value: "vh_airm_common" + - name: RABBITMQ_AIRM_COMMON_QUEUE + value: "airm_common" + - name: RABBITMQ_USER + valueFrom: + secretKeyRef: + name: "{{ .Release.Name }}-rabbitmq-common-vhost-user" + key: username + - name: RABBITMQ_PASSWORD + valueFrom: + secretKeyRef: + name: "{{ .Release.Name }}-rabbitmq-common-vhost-user" + key: password + livenessProbe: + httpGet: + path: /v1/health + port: 8080 + initialDelaySeconds: 10 + periodSeconds: 5 + timeoutSeconds: 2 + failureThreshold: 3 + successThreshold: 
1 + readinessProbe: + httpGet: + path: /v1/health + port: 8080 + initialDelaySeconds: 10 + periodSeconds: 5 + timeoutSeconds: 2 + failureThreshold: 3 + successThreshold: 1 + resources: + requests: + memory: "1Gi" + cpu: "500m" + limits: + memory: "1Gi" + securityContext: + runAsUser: 0 +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: "{{ .Release.Name }}-dispatcher-cluster-access-binding" +subjects: + - kind: ServiceAccount + name: "{{ .Release.Name }}-dispatcher-sa" + namespace: "{{ .Release.Namespace }}" +roleRef: + kind: ClusterRole + name: "{{ .Release.Name }}-dispatcher-cluster-access-role" + apiGroup: rbac.authorization.k8s.io +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: "{{ .Release.Name }}-dispatcher-cluster-access-role" +rules: + - apiGroups: [""] + resources: ["services", "namespaces", "configmaps", "pods"] + verbs: ["get", "list", "watch", "create", "delete", "update", "patch"] + - apiGroups: [""] + resources: ["secrets"] + verbs: ["get", "list", "watch", "create", "delete", "update", "patch"] + - apiGroups: [""] + resources: ["nodes"] + verbs: ["get", "list", "watch"] + - apiGroups: ["apps"] + resources: ["deployments", "statefulsets", "daemonsets"] + verbs: ["get", "list", "watch", "create", "delete", "update", "patch"] + - apiGroups: ["batch"] + resources: ["jobs", "cronjobs"] + verbs: ["get", "list", "watch", "create", "delete", "update", "patch"] + - apiGroups: ["kaiwo.silogen.ai"] + resources: ["kaiwojobs", "kaiwoservices", "kaiwoqueueconfigs"] + verbs: ["get", "list", "watch", "create", "delete", "update", "patch"] + - apiGroups: ["gateway.networking.k8s.io"] + resources: ["httproutes"] + verbs: ["get", "list", "watch", "create", "delete", "update", "patch"] + - apiGroups: ["networking.k8s.io"] + resources: ["ingresses"] + verbs: ["get", "list", "watch", "create", "delete", "update", "patch"] + - apiGroups: ["external-secrets.io"] + resources: ["externalsecrets"] + 
verbs: ["get", "list", "watch", "create", "delete", "update", "patch"] + - apiGroups: ["aim.silogen.ai"] + resources: ["aimservices", "aimclustermodels"] + verbs: ["get", "list", "watch", "create", "delete", "update", "patch"] + - apiGroups: ["apiextensions.k8s.io"] + resources: ["customresourcedefinitions"] + verbs: ["get", "list"] +--- +apiVersion: v1 +kind: ServiceAccount +metadata: + name: "{{ .Release.Name }}-dispatcher-sa" + namespace: "{{ .Release.Namespace }}" + +--- +apiVersion: v1 +kind: Service +metadata: + name: "{{ .Release.Name }}-dispatcher" + namespace: "{{ .Release.Namespace }}" + labels: + app: "{{ .Release.Name }}-dispatcher" +spec: + ports: + - name: web + port: {{ .Values.airm.dispatcher.servicePort }} + targetPort: 8080 + type: ClusterIP + selector: + app: "{{ .Release.Name }}-dispatcher" diff --git a/sources/airm/0.3.5/charts/airm-dispatcher/templates/kyverno-cluster-role.yaml b/sources/airm/0.3.5/charts/airm-dispatcher/templates/kyverno-cluster-role.yaml new file mode 100644 index 00000000..e930efd0 --- /dev/null +++ b/sources/airm/0.3.5/charts/airm-dispatcher/templates/kyverno-cluster-role.yaml @@ -0,0 +1,35 @@ +# Copyright © Advanced Micro Devices, Inc., or its affiliates. +# +# SPDX-License-Identifier: MIT + +# These are additional cluster roles needed by kyverno background controller to be able to +# create rolebindings in namespaces. 
+apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: kyverno:airm-policy-roles + labels: + rbac.kyverno.io/aggregate-to-background-controller: "true" +rules: + - apiGroups: ["rbac.authorization.k8s.io"] + resources: ["clusterroles", "rolebindings"] + verbs: ["get", "list", "watch"] + - apiGroups: ["rbac.authorization.k8s.io"] # allow kyverno to bind clusterroles via rolebindings + resources: ["clusterroles"] + verbs: ["bind"] +--- +# These are additional cluster roles needed by kyverno reports controller to be able to +# manage custom resources for reporting +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: kyverno:airm-reports-policy-roles + labels: + rbac.kyverno.io/aggregate-to-reports-controller: "true" +rules: + - apiGroups: ["kaiwo.silogen.ai"] + resources: ["kaiwojobs", "kaiwoservices"] + verbs: ["get", "list", "watch"] + - apiGroups: ["aim.silogen.ai"] + resources: ["aimservices"] + verbs: ["get", "list", "watch"] diff --git a/sources/airm/0.3.5/charts/airm-dispatcher/values.yaml b/sources/airm/0.3.5/charts/airm-dispatcher/values.yaml new file mode 100644 index 00000000..6babd81c --- /dev/null +++ b/sources/airm/0.3.5/charts/airm-dispatcher/values.yaml @@ -0,0 +1,27 @@ +# Copyright © Advanced Micro Devices, Inc., or its affiliates. 
+# +# SPDX-License-Identifier: MIT + +airm: + dispatcher: + image: + repository: amdenterpriseai/airm-dispatcher + tag: 0.3.5 + pullPolicy: IfNotPresent + servicePort: 80 + env: + rabbitmqPort: 5672 + utilities: + netcat: + image: + repository: busybox + tag: 1.37.0 + pullPolicy: IfNotPresent + curl: + image: + repository: curlimages/curl + tag: 8.16.0 + pullPolicy: IfNotPresent + additionalClusterRoles: + platformAdmin: [] + projectMember: [] diff --git a/sources/airm/0.3.5/values.yaml b/sources/airm/0.3.5/values.yaml new file mode 100644 index 00000000..69346880 --- /dev/null +++ b/sources/airm/0.3.5/values.yaml @@ -0,0 +1,3 @@ +# Copyright © Advanced Micro Devices, Inc., or its affiliates. +# +# SPDX-License-Identifier: MIT From 72088e1c8cc84f8b5cee75d4fe12d37a354a596c Mon Sep 17 00:00:00 2001 From: brownzebra Date: Fri, 20 Feb 2026 14:34:53 +0200 Subject: [PATCH 018/115] cleanup-workflow-to-not-commit-target-revision --- .github/workflows/release-pipeline.yaml | 36 ++++++++++++++----------- 1 file changed, 20 insertions(+), 16 deletions(-) diff --git a/.github/workflows/release-pipeline.yaml b/.github/workflows/release-pipeline.yaml index 5ba3500f..46773285 100644 --- a/.github/workflows/release-pipeline.yaml +++ b/.github/workflows/release-pipeline.yaml @@ -42,27 +42,31 @@ jobs: fi echo "next=$VERSION" >> $GITHUB_OUTPUT - - name: Update helm values file - uses: mikefarah/yq@master + - name: Validate LATEST_RELEASE matches release version env: - GIT_TAG: ${{ steps.semver.outputs.next }} - with: - cmd: | - yq -i '.clusterForge.targetRevision = env(GIT_TAG)' root/values.yaml - yq -i '.targetRevision = env(GIT_TAG)' scripts/init-gitea-job/values.yaml - - - name: Commit and push changes - uses: stefanzweifel/git-auto-commit-action@v4 - env: - GIT_TAG: ${{ steps.semver.outputs.next }} - with: - commit_message: 'Update version to ${{ env.GIT_TAG }} [actions skip]' + VERSION: ${{ steps.semver.outputs.next }} + run: | + # Extract LATEST_RELEASE from bootstrap.sh + 
LATEST_RELEASE=$(grep '^LATEST_RELEASE=' scripts/bootstrap.sh | cut -d'"' -f2 | sed 's/^v//') + + # Extract base version (before -rc or -alpha, etc.) + RELEASE_BASE=$(echo "$VERSION" | sed 's/^v//' | sed 's/-rc[0-9]*$//' | sed 's/-alpha[0-9]*$//' | sed 's/-beta[0-9]*$//') + LATEST_BASE=$(echo "$LATEST_RELEASE" | sed 's/-rc[0-9]*$//' | sed 's/-alpha[0-9]*$//' | sed 's/-beta[0-9]*$//') + + echo "Release version: $VERSION (base: $RELEASE_BASE)" + echo "LATEST_RELEASE in bootstrap.sh: $LATEST_RELEASE (base: $LATEST_BASE)" + + if [[ "$RELEASE_BASE" != "$LATEST_BASE" ]]; then + echo "::warning::LATEST_RELEASE base version ($LATEST_BASE) in scripts/bootstrap.sh does not match release version base ($RELEASE_BASE)" + echo "::warning::Consider updating LATEST_RELEASE in scripts/bootstrap.sh to match the release being created" + else + echo "✓ LATEST_RELEASE base version matches release version base" + fi - name: Create GitHub Release env: GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} VERSION: ${{ steps.semver.outputs.next }} - EXTRA_ARGS: ${{ steps.version.outputs.extra_args }} run: | # Prepare release artifact tar -zcvf "release-enterprise-ai-${VERSION}.tar.gz" --transform 's,^,cluster-forge/,' root/ scripts/ sources @@ -134,4 +138,4 @@ jobs: GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} SBOM_NAME: ${{ steps.generate_sbom.outputs.sbom_name }} run: | - gh release upload ${VERSION} ${SBOM_NAME} --clobber + gh release upload ${VERSION} ${SBOM_NAME} --clobber \ No newline at end of file From e39c6ec5734b58520f3b5ff3e5f8c454c2cffbc1 Mon Sep 17 00:00:00 2001 From: brownzebra Date: Fri, 20 Feb 2026 14:44:26 +0200 Subject: [PATCH 019/115] target-revision-main-update-airm-version --- root/values.yaml | 4 ++-- sbom/components.yaml | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/root/values.yaml b/root/values.yaml index 7a20e4c5..6716fcec 100644 --- a/root/values.yaml +++ b/root/values.yaml @@ -1,6 +1,6 @@ clusterForge: repoUrl: 
"http://gitea-http.cf-gitea.svc:3000/cluster-org/cluster-forge.git" - targetRevision: v1.8.0-rc4 + targetRevision: main # source helm values file from separate git repo externalValues: enabled: true @@ -724,7 +724,7 @@ apps: - /spec/accessModes # AMD Resource Manager (AIRM) airm: - path: airm/0.3.4 + path: airm/0.3.5 namespace: airm valuesFile: values.yaml helmParameters: diff --git a/sbom/components.yaml b/sbom/components.yaml index 821ab6d0..b1d8b9d5 100644 --- a/sbom/components.yaml +++ b/sbom/components.yaml @@ -10,7 +10,7 @@ components: license: MIT License licenseUrl: https://github.com/silogen/kaiwo/blob/main/LICENSE airm: - path: airm/0.3.4 + path: airm/0.3.5 valuesFile: values.yaml sourceUrl: https://github.com/silogen/cluster-forge/tree/main/sources/airm projectUrl: https://github.com/silogen/cluster-forge/tree/main/sources/airm From 56c00b9e35add4e5578388b2151e14b05b3f5af7 Mon Sep 17 00:00:00 2001 From: brownzebra Date: Fri, 20 Feb 2026 14:50:24 +0200 Subject: [PATCH 020/115] simplifty-target-revision-bootstrap --- scripts/bootstrap.sh | 62 -------------------------------------------- 1 file changed, 62 deletions(-) diff --git a/scripts/bootstrap.sh b/scripts/bootstrap.sh index 5f9d8962..d843dd93 100755 --- a/scripts/bootstrap.sh +++ b/scripts/bootstrap.sh @@ -100,56 +100,6 @@ HELP_OUTPUT esac done -validate_target_revision() { - # Always allow main and the latest release - if [ "$TARGET_REVISION" = "main" ] || [ "$TARGET_REVISION" = "$LATEST_RELEASE" ]; then - return 0 - fi - - # Check if it's a valid v1.8.0+ semantic version pattern - if [[ "$TARGET_REVISION" =~ ^v1\.8\. ]] || [[ "$TARGET_REVISION" =~ ^v1\.([9-9]|[1-9][0-9]+)\. ]] || [[ "$TARGET_REVISION" =~ ^v[2-9]\. 
]]; then - return 0 - fi - - # For branches/commits, check git ancestry to see if v1.8.0-rc1 or later is in the history - echo "Checking git ancestry for target revision: $TARGET_REVISION" - - # Check if the target revision exists in git (try local first, then remote) - RESOLVED_REVISION="" - if git rev-parse --verify "$TARGET_REVISION" >/dev/null 2>&1; then - RESOLVED_REVISION="$TARGET_REVISION" - elif git rev-parse --verify "origin/$TARGET_REVISION" >/dev/null 2>&1; then - RESOLVED_REVISION="origin/$TARGET_REVISION" - echo "Found target revision as remote branch: origin/$TARGET_REVISION" - else - echo "ERROR: Target revision '$TARGET_REVISION' does not exist in git" - echo "Available branches: $(git branch -a | grep -v HEAD | sed 's/^[ *]*//' | tr '\n' ' ')" - exit 1 - fi - - # Check if v1.8.0-rc1 or any later version is an ancestor of the target revision - # We'll check for v1.8.0-rc1 as the minimum supported version - MIN_SUPPORTED_TAG="v1.8.0-rc1" - - # Check if the minimum supported tag exists - if git rev-parse --verify "$MIN_SUPPORTED_TAG" >/dev/null 2>&1; then - # Check if MIN_SUPPORTED_TAG is an ancestor of RESOLVED_REVISION - if git merge-base --is-ancestor "$MIN_SUPPORTED_TAG" "$RESOLVED_REVISION" 2>/dev/null; then - echo "Target revision '$TARGET_REVISION' is based on or after $MIN_SUPPORTED_TAG - supported" - return 0 - else - echo "ERROR: Target revision '$TARGET_REVISION' is not based on $MIN_SUPPORTED_TAG or later" - echo "The --target-revision flag only supports revisions based on $MIN_SUPPORTED_TAG and later versions" - echo "Supported: v1.8.0+, main, branches forked from v1.8.0-rc1+, or $LATEST_RELEASE" - exit 1 - fi - else - echo "WARNING: Minimum supported tag '$MIN_SUPPORTED_TAG' not found in git" - echo "Proceeding with target revision '$TARGET_REVISION' (ancestry check skipped)" - return 0 - fi -} - # Validate required arguments if [ -z "$DOMAIN" ]; then echo "ERROR: Domain argument is required" @@ -242,9 +192,6 @@ pre_cleanup() { } 
display_target_revision - -# Validate target revision and setup sources -validate_target_revision setup_sources setup_values_files @@ -260,15 +207,6 @@ if [ -n "$SIZE_VALUES_FILE" ]; then fi echo "Target revision: $TARGET_REVISION" echo "" -echo "⚠️ This will bootstrap ClusterForge on your cluster with the above configuration." -echo " Existing ArgoCD, OpenBao, and Gitea resources may be modified or replaced." -echo "" -read -p "Continue with bootstrap? [Y/n]: " -r -echo "" -if [[ $REPLY =~ ^[Nn]$ ]]; then - echo "Bootstrap cancelled by user." - exit 0 -fi echo "=== Starting Bootstrap Process ===" # Check for yq command availability From fe019cd29e70f033a6cd5f6b4922065574c8e053 Mon Sep 17 00:00:00 2001 From: rodrodsilo Date: Tue, 24 Feb 2026 09:14:58 +0200 Subject: [PATCH 021/115] feature: Improve liveness condition --- .../v1.0.7/templates/lgtm-stack.yaml | 86 ++++++++++++++++++- 1 file changed, 85 insertions(+), 1 deletion(-) diff --git a/sources/otel-lgtm-stack/v1.0.7/templates/lgtm-stack.yaml b/sources/otel-lgtm-stack/v1.0.7/templates/lgtm-stack.yaml index f42e8c26..7424b8c0 100644 --- a/sources/otel-lgtm-stack/v1.0.7/templates/lgtm-stack.yaml +++ b/sources/otel-lgtm-stack/v1.0.7/templates/lgtm-stack.yaml @@ -78,6 +78,82 @@ metadata: name: grafana-sidecar namespace: {{ .Release.Namespace }} --- +# ConfigMap for liveness/readiness check script +apiVersion: v1 +kind: ConfigMap +metadata: + name: lgtm-check-scripts + namespace: {{ .Release.Namespace }} +data: + check-ps.sh: | + #!/bin/bash + # Get processes using /proc filesystem (runs inside container) + PROCESS_LIST="" + for pid in /proc/[0-9]*; do + cmdline=$(cat "$$pid/cmdline" 2>/dev/null | tr "\0" " ") + if [ -n "$$cmdline" ]; then + PROCESS_LIST="$${PROCESS_LIST}$${cmdline}"$$'\n' + fi + done + + RED='\033[0;31m' + GREEN='\033[0;32m' + YELLOW='\033[1;33m' + NC='\033[0m' + + if [[ -z "$$PROCESS_LIST" ]]; then + echo -e "$${RED}[FAILED] Could not retrieve process list from container$${NC}" + exit 1 + fi + + # 
Define required processes + REQUIRED_PROCESSES=( + "/bin/bash /otel-lgtm/run-all.sh" + "/bin/bash ./run-loki.sh" + "/bin/bash ./run-grafana.sh" + "./bin/grafana server" + "/bin/bash ./run-otelcol.sh" + "/bin/bash ./run-prometheus.sh" + "./otelcol-contrib/otelcol-contrib --feature-gates service.profilesSupport --config=file:./otelcol-config.yaml" + "./loki/loki --config.file=./loki-config.yaml" + "/bin/bash ./run-tempo.sh" + "./prometheus/prometheus --web.enable-remote-write-receiver --web.enable-otlp-receiver --enable-feature=exemplar-storage --enable-feature=native-histograms --storage.tsdb.path=/data/prometheus --config.file=./prometheus.yaml" + "/data/grafana/plugins/grafana-llm-app/gpx_llm_linux_amd64" + ) + + # Check each process and build list of missing ones + MISSING_COUNT=0 + echo "Expected processes ($${#REQUIRED_PROCESSES[@]} total):" + echo "---" + + for REQUIRED in "$${REQUIRED_PROCESSES[@]}"; do + if echo "$$PROCESS_LIST" | grep -qF "$$REQUIRED"; then + echo -e "$${GREEN} [RUNNING] $$REQUIRED$${NC}" + else + echo -e "$${RED} [MISSING] $$REQUIRED$${NC}" + MISSING_COUNT=$$((MISSING_COUNT + 1)) + fi + done + + echo "" + + if [[ $$MISSING_COUNT -eq 0 ]]; then + echo -e "$${GREEN} [OK] All required processes are running$${NC}" + else + echo -e "$${RED} [WARNING] $$MISSING_COUNT of $${#REQUIRED_PROCESSES[@]} processes are missing$${NC}" + echo "" + echo -e "$${YELLOW} This may indicate the LGTM stack is not fully operational.$${NC}" + echo -e "$${YELLOW} Consider restarting the pod.$${NC}" + fi + + echo "" + + if [[ $$MISSING_COUNT -gt 0 ]]; then + echo -e "$${RED}[FAILED] Some required processes are not running$${NC}" + exit 1 + fi + exit 0 +--- # Source: grafana/templates/configmap-dashboard-provider.yaml ###### apiVersion: v1 kind: ConfigMap @@ -235,7 +311,8 @@ spec: - | curl -f http://localhost:9090/-/ready && curl -f http://localhost:3100/ready && - curl -f http://localhost:3000/api/health + curl -f http://localhost:3000/api/health && + 
/scripts/check-ps.sh initialDelaySeconds: 120 periodSeconds: 30 timeoutSeconds: 15 @@ -266,6 +343,9 @@ spec: - name: sc-dashboard-provider ## mountPath: "/otel-lgtm/grafana/conf/provisioning/dashboards/sc-dashboardproviders.yaml" subPath: provider.yaml + - name: check-scripts + mountPath: /scripts + readOnly: true volumes: - name: tempo-data persistentVolumeClaim: @@ -288,3 +368,7 @@ spec: - name: sc-dashboard-provider ## configMap: name: grafana-config-dashboards + - name: check-scripts + configMap: + name: lgtm-check-scripts + defaultMode: 0755 From 2ecd90663301fc320d9654550c104640ef256fd4 Mon Sep 17 00:00:00 2001 From: rodrodsilo Date: Tue, 24 Feb 2026 19:46:42 +0200 Subject: [PATCH 022/115] fix: minor fix --- sources/otel-lgtm-stack/v1.0.7/templates/lgtm-stack.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sources/otel-lgtm-stack/v1.0.7/templates/lgtm-stack.yaml b/sources/otel-lgtm-stack/v1.0.7/templates/lgtm-stack.yaml index 7424b8c0..2f44a7de 100644 --- a/sources/otel-lgtm-stack/v1.0.7/templates/lgtm-stack.yaml +++ b/sources/otel-lgtm-stack/v1.0.7/templates/lgtm-stack.yaml @@ -371,4 +371,4 @@ spec: - name: check-scripts configMap: name: lgtm-check-scripts - defaultMode: 0755 + defaultMode: 493 From 098c5a377ffd0a137efa1db95d825043895b4eac Mon Sep 17 00:00:00 2001 From: rodrodsilo Date: Wed, 25 Feb 2026 09:47:34 +0200 Subject: [PATCH 023/115] fix: correcting script for sh shell --- .../v1.0.7/templates/lgtm-stack.yaml | 39 ++++++++++--------- 1 file changed, 20 insertions(+), 19 deletions(-) diff --git a/sources/otel-lgtm-stack/v1.0.7/templates/lgtm-stack.yaml b/sources/otel-lgtm-stack/v1.0.7/templates/lgtm-stack.yaml index 2f44a7de..cb76e752 100644 --- a/sources/otel-lgtm-stack/v1.0.7/templates/lgtm-stack.yaml +++ b/sources/otel-lgtm-stack/v1.0.7/templates/lgtm-stack.yaml @@ -86,13 +86,13 @@ metadata: namespace: {{ .Release.Namespace }} data: check-ps.sh: | - #!/bin/bash + #!/bin/sh # Get processes using /proc filesystem (runs inside 
container) PROCESS_LIST="" for pid in /proc/[0-9]*; do - cmdline=$(cat "$$pid/cmdline" 2>/dev/null | tr "\0" " ") - if [ -n "$$cmdline" ]; then - PROCESS_LIST="$${PROCESS_LIST}$${cmdline}"$$'\n' + cmdline=$(cat "$pid/cmdline" 2>/dev/null | tr "\0" " ") + if [ -n "$cmdline" ]; then + PROCESS_LIST="${PROCESS_LIST}${cmdline}"$'\n' fi done @@ -101,8 +101,8 @@ data: YELLOW='\033[1;33m' NC='\033[0m' - if [[ -z "$$PROCESS_LIST" ]]; then - echo -e "$${RED}[FAILED] Could not retrieve process list from container$${NC}" + if [[ -z "$PROCESS_LIST" ]]; then + echo -e "${RED}[FAILED] Could not retrieve process list from container${NC}" exit 1 fi @@ -123,36 +123,37 @@ data: # Check each process and build list of missing ones MISSING_COUNT=0 - echo "Expected processes ($${#REQUIRED_PROCESSES[@]} total):" + echo "Expected processes (${#REQUIRED_PROCESSES[@]} total):" echo "---" - for REQUIRED in "$${REQUIRED_PROCESSES[@]}"; do - if echo "$$PROCESS_LIST" | grep -qF "$$REQUIRED"; then - echo -e "$${GREEN} [RUNNING] $$REQUIRED$${NC}" + for REQUIRED in "${REQUIRED_PROCESSES[@]}"; do + if echo "$PROCESS_LIST" | grep -qF "$REQUIRED"; then + echo -e "${GREEN} [RUNNING] $REQUIRED${NC}" else - echo -e "$${RED} [MISSING] $$REQUIRED$${NC}" - MISSING_COUNT=$$((MISSING_COUNT + 1)) + echo -e "${RED} [MISSING] $REQUIRED${NC}" + MISSING_COUNT=$((MISSING_COUNT + 1)) fi done echo "" - if [[ $$MISSING_COUNT -eq 0 ]]; then - echo -e "$${GREEN} [OK] All required processes are running$${NC}" + if [[ $MISSING_COUNT -eq 0 ]]; then + echo -e "${GREEN} [OK] All required processes are running${NC}" else - echo -e "$${RED} [WARNING] $$MISSING_COUNT of $${#REQUIRED_PROCESSES[@]} processes are missing$${NC}" + echo -e "${RED} [WARNING] $MISSING_COUNT of ${#REQUIRED_PROCESSES[@]} processes are missing${NC}" echo "" - echo -e "$${YELLOW} This may indicate the LGTM stack is not fully operational.$${NC}" - echo -e "$${YELLOW} Consider restarting the pod.$${NC}" + echo -e "${YELLOW} This may indicate the LGTM stack 
is not fully operational.${NC}" + echo -e "${YELLOW} Consider restarting the pod.${NC}" fi echo "" - if [[ $$MISSING_COUNT -gt 0 ]]; then - echo -e "$${RED}[FAILED] Some required processes are not running$${NC}" + if [[ $MISSING_COUNT -gt 0 ]]; then + echo -e "${RED}[FAILED] Some required processes are not running${NC}" exit 1 fi exit 0 + --- # Source: grafana/templates/configmap-dashboard-provider.yaml ###### apiVersion: v1 From db83b73c7eb5e25659125f66f994d3c70b2a2870 Mon Sep 17 00:00:00 2001 From: Daniel Vaskivaara Date: Wed, 25 Feb 2026 12:58:50 +0200 Subject: [PATCH 024/115] fix: remove kyverno local-path policies from kyverno-config and values.yaml ignoreDifferences ref --- root/values.yaml | 12 --- .../local-path-access-mode-mutation.yaml | 95 ------------------- 2 files changed, 107 deletions(-) delete mode 100644 sources/kyverno-config/local-path-access-mode-mutation.yaml diff --git a/root/values.yaml b/root/values.yaml index 6716fcec..88d702fe 100644 --- a/root/values.yaml +++ b/root/values.yaml @@ -512,18 +512,6 @@ apps: path: kyverno-config namespace: kyverno syncWave: -2 - ignoreDifferences: - - group: "kyverno.io" - kind: "ClusterPolicy" - name: "local-path-access-mode-mutation" - jsonPointers: - - /spec/rules/0/skipBackgroundRequests - - group: "kyverno.io" - kind: "ClusterPolicy" - name: "local-path-access-mode-warning" - jsonPointers: - - /spec/rules/0/skipBackgroundRequests - - /spec/rules/0/validate/allowExistingViolations kyverno-policies-base: namespace: kyverno path: kyverno-policies/base diff --git a/sources/kyverno-config/local-path-access-mode-mutation.yaml b/sources/kyverno-config/local-path-access-mode-mutation.yaml deleted file mode 100644 index 1e1e9369..00000000 --- a/sources/kyverno-config/local-path-access-mode-mutation.yaml +++ /dev/null @@ -1,95 +0,0 @@ ---- -# Kyverno ClusterPolicy to mutate PVC access modes for local-path compatibility -# This policy is ONLY deployed to small and medium clusters via enabledApps configuration -# 
Large clusters use Longhorn and do NOT deploy this policy at all -apiVersion: kyverno.io/v1 -kind: ClusterPolicy -metadata: - name: local-path-access-mode-mutation - annotations: - policies.kyverno.io/title: "Local-Path Access Mode Mutation" - policies.kyverno.io/category: "Storage" - policies.kyverno.io/severity: "medium" - policies.kyverno.io/subject: "PersistentVolumeClaim" - policies.kyverno.io/minversion: "1.6.0" - policies.kyverno.io/description: >- - This policy automatically converts ReadWriteMany (RWX) and ReadOnlyMany (ROX) - access modes to ReadWriteOnce (RWO) for clusters using local-path provisioner. - This prevents PVC creation failures since local-path only supports RWO and RWOP. - NOTE: This policy is only deployed to small/medium clusters, never to large clusters. -spec: - admission: true - background: false - validationFailureAction: Enforce - rules: - - name: convert-rwx-rox-to-rwo - match: - resources: - kinds: - - PersistentVolumeClaim - preconditions: - any: - # Apply if PVC requests unsupported access modes - - key: "ReadWriteMany" - operator: AnyIn - value: "{{ request.object.spec.accessModes || [] }}" - - key: "ReadOnlyMany" - operator: AnyIn - value: "{{ request.object.spec.accessModes || [] }}" - mutate: - patchStrategicMerge: - spec: - # Replace access modes with RWO (only supported mode for local-path) - accessModes: - - ReadWriteOnce - metadata: - annotations: - +(kyverno.io/mutation-applied): "local-path-rwx-to-rwo" - +(kyverno.io/policy-reason): "local-path provisioner only supports ReadWriteOnce and ReadWriteOncePod" - ---- -# Validation policy to warn about access mode changes -apiVersion: kyverno.io/v1 -kind: ClusterPolicy -metadata: - name: local-path-access-mode-warning - annotations: - policies.kyverno.io/title: "Local-Path Access Mode Warning" - policies.kyverno.io/category: "Storage" - policies.kyverno.io/severity: "low" - policies.kyverno.io/subject: "PersistentVolumeClaim" - policies.kyverno.io/description: >- - This policy 
generates warnings when PVCs request RWX/ROX access modes - that will be converted to RWO due to local-path provisioner limitations. - NOTE: This policy is only deployed to small/medium clusters, never to large clusters. -spec: - admission: true - background: false - validationFailureAction: Audit # Warning only, don't block - rules: - - name: warn-access-mode-conversion - match: - resources: - kinds: - - PersistentVolumeClaim - preconditions: - any: - # Warn for unsupported access modes that will be converted - - key: "ReadWriteMany" - operator: AnyIn - value: "{{ request.object.spec.accessModes || [] }}" - - key: "ReadOnlyMany" - operator: AnyIn - value: "{{ request.object.spec.accessModes || [] }}" - validate: - message: >- - WARNING: The requested access mode(s) {{ request.object.spec.accessModes | join(',') }} - are not supported by the local-path provisioner used in small/medium clusters. - The access mode has been automatically converted to ReadWriteOnce (RWO). - For ReadWriteMany support, consider using a large cluster with Longhorn storage. 
- deny: - conditions: - # This condition is always false, so it only generates a warning - - key: "false" - operator: Equals - value: "true" \ No newline at end of file From 7bf3f96e53d34daa6a16879893628a15905ba624 Mon Sep 17 00:00:00 2001 From: brownzebra Date: Wed, 25 Feb 2026 13:57:43 +0200 Subject: [PATCH 025/115] put the access mode mutation into templates dir --- .../storage-local-path/{ => templates}/access-mode-mutation.yaml | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename sources/kyverno-policies/storage-local-path/{ => templates}/access-mode-mutation.yaml (100%) diff --git a/sources/kyverno-policies/storage-local-path/access-mode-mutation.yaml b/sources/kyverno-policies/storage-local-path/templates/access-mode-mutation.yaml similarity index 100% rename from sources/kyverno-policies/storage-local-path/access-mode-mutation.yaml rename to sources/kyverno-policies/storage-local-path/templates/access-mode-mutation.yaml From fa5e8b608456bad2109c78e921264b68a09fd12e Mon Sep 17 00:00:00 2001 From: brownzebra Date: Wed, 25 Feb 2026 14:54:11 +0200 Subject: [PATCH 026/115] escaping special characters in kyverno template --- .../templates/access-mode-mutation.yaml | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/sources/kyverno-policies/storage-local-path/templates/access-mode-mutation.yaml b/sources/kyverno-policies/storage-local-path/templates/access-mode-mutation.yaml index c01a9bcc..20bab9db 100644 --- a/sources/kyverno-policies/storage-local-path/templates/access-mode-mutation.yaml +++ b/sources/kyverno-policies/storage-local-path/templates/access-mode-mutation.yaml @@ -32,10 +32,10 @@ spec: # Apply if PVC requests unsupported access modes - key: "ReadWriteMany" operator: AnyIn - value: "{{ request.object.spec.accessModes || [] }}" + value: "{{ "{{" }} request.object.spec.accessModes || [] {{ "}}" }}" - key: "ReadOnlyMany" operator: AnyIn - value: "{{ request.object.spec.accessModes || [] }}" + value: "{{ "{{" }} 
request.object.spec.accessModes || [] {{ "}}" }}" mutate: patchStrategicMerge: spec: @@ -44,7 +44,7 @@ spec: - ReadWriteOnce metadata: annotations: - +(kyverno.io/original-access-modes): "{{ request.object.spec.accessModes && request.object.spec.accessModes | join(',') || 'undefined' }}" + +(kyverno.io/original-access-modes): "{{ "{{" }} request.object.spec.accessModes && request.object.spec.accessModes | join(',') || 'undefined' {{ "}}" }}" +(kyverno.io/mutation-applied): "local-path-rwx-to-rwo" +(kyverno.io/policy-reason): "local-path provisioner only supports ReadWriteOnce and ReadWriteOncePod" @@ -78,13 +78,13 @@ spec: # Warn for unsupported access modes that will be converted - key: "ReadWriteMany" operator: AnyIn - value: "{{ request.object.spec.accessModes || [] }}" + value: "{{ "{{" }} request.object.spec.accessModes || [] {{ "}}" }}" - key: "ReadOnlyMany" operator: AnyIn - value: "{{ request.object.spec.accessModes || [] }}" + value: "{{ "{{" }} request.object.spec.accessModes || [] {{ "}}" }}" validate: message: >- - WARNING: The requested access mode(s) {{ request.object.spec.accessModes && request.object.spec.accessModes | join(',') || 'undefined' }} + WARNING: The requested access mode(s) {{ "{{" }} request.object.spec.accessModes && request.object.spec.accessModes | join(',') || 'undefined' {{ "}}" }} are not supported by the local-path provisioner used in small/medium clusters. The access mode has been automatically converted to ReadWriteOnce (RWO). For ReadWriteMany support, consider using a large cluster with Longhorn storage. 
From a4867e82a9450059a0e4bf31e2ba1f56b1a01eaf Mon Sep 17 00:00:00 2001 From: brownzebra Date: Wed, 25 Feb 2026 15:49:15 +0200 Subject: [PATCH 027/115] ignore background requests and esixting violations fields in upgrade situations --- root/values_medium.yaml | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/root/values_medium.yaml b/root/values_medium.yaml index b69b7e4f..192856ea 100644 --- a/root/values_medium.yaml +++ b/root/values_medium.yaml @@ -55,7 +55,12 @@ apps: kyverno-policies-storage-local-path: namespace: kyverno path: kyverno-policies/storage-local-path - ignoreDifferences: [] + ignoreDifferences: + - group: kyverno.io + kind: ClusterPolicy + jsonPointers: + - /spec/rules/0/skipBackgroundRequests + - /spec/rules/0/validate/allowExistingViolations syncWave: -2 argocd: From 4bb6777b80d722544b7a879da5fdbc16831631cc Mon Sep 17 00:00:00 2001 From: Daniel Vaskivaara Date: Thu, 26 Feb 2026 07:32:03 +0200 Subject: [PATCH 028/115] fix(values_small.yaml): was missing base kyverno policies --- root/values_small.yaml | 55 +++++++++++++++++++++--------------------- 1 file changed, 28 insertions(+), 27 deletions(-) diff --git a/root/values_small.yaml b/root/values_small.yaml index 9af1e1de..2d212e89 100644 --- a/root/values_small.yaml +++ b/root/values_small.yaml @@ -3,49 +3,50 @@ # Medium & Small clusters add local-path storage policy for RWX→RWO conversion enabledApps: - - kyverno-policies-storage-local-path # Local-path storage policies (small/medium only) - aim-cluster-model-source + - airm + - amd-gpu-operator + - amd-gpu-operator-config + - appwrapper - argocd - argocd-config - cert-manager - - openbao - - openbao-config + - cluster-auth + - cluster-auth-config + - cnpg-operator - external-secrets - external-secrets-config + - gateway-api - gitea - gitea-config - - gateway-api - - metallb - - kgateway-crds - - kgateway - - kgateway-config - - prometheus-crds - - opentelemetry-operator - - otel-lgtm-stack - - cnpg-operator - - 
cluster-auth - - cluster-auth-config - - keycloak - - kyverno - - kyverno-config - - amd-gpu-operator - - amd-gpu-operator-config - - kuberay-operator + - kaiwo + - kaiwo-config + - kaiwo-crds - keda - kedify-otel - - kserve-crds + - keycloak + - kgateway + - kgateway-config + - kgateway-crds - kserve - - rabbitmq + - kserve-crds + - kuberay-operator - kueue - kueue-config - - appwrapper + - kyverno + - kyverno-config + - kyverno-policies-base # applicable to all cluster sizes + - kyverno-policies-storage-local-path # small & medium cluster sizes only + - metallb - minio-operator - minio-tenant - minio-tenant-config - - kaiwo-crds - - kaiwo - - kaiwo-config - - airm + - openbao + - openbao-config + - opentelemetry-operator + - otel-lgtm-stack + - prometheus-crds + - rabbitmq apps: # Modular Kyverno policy applications (only the storage-local-path addition) From 5343776253b2e10c3fb83ac508eb5c5d59f2155b Mon Sep 17 00:00:00 2001 From: Daniel Vaskivaara Date: Thu, 26 Feb 2026 07:38:02 +0200 Subject: [PATCH 029/115] fix: app dependency sequence via waves otel -> keda -> kedify-otel -> kserve refactor: multiply existing syncWaves by 10 to create flexibility for custom sequeneces --- root/values.yaml | 76 ++++++++++++++++++++--------------------- root/values_medium.yaml | 2 +- 2 files changed, 39 insertions(+), 39 deletions(-) diff --git a/root/values.yaml b/root/values.yaml index 6716fcec..d381661e 100644 --- a/root/values.yaml +++ b/root/values.yaml @@ -103,11 +103,11 @@ apps: clientSecret: $$argocd-oidc-creds:client_secret rootCA: $cluster-tls:cert requestedScopes: ["openid", "profile", "email", "groups"] - syncWave: -3 + syncWave: -30 argocd-config: path: argocd-config namespace: argocd - syncWave: -2 + syncWave: -20 ignoreDifferences: - group: external-secrets.io kind: ExternalSecret @@ -118,7 +118,7 @@ apps: cert-manager: namespace: cert-manager path: cert-manager/v1.18.2 - syncWave: -4 + syncWave: -40 valuesObject: installCRDs: true openbao: @@ -146,7 +146,7 @@ 
apps: replicas: 1 ui: enabled: true - syncWave: -4 + syncWave: -40 ignoreDifferences: - group: "apps" kind: "Deployment" @@ -164,16 +164,16 @@ apps: helmParameters: - name: domain value: "{{ .Values.global.domain }}" - syncWave: -2 + syncWave: -20 external-secrets: path: external-secrets/0.15.1 namespace: external-secrets valuesFile: values.yaml - syncWave: -4 + syncWave: -40 external-secrets-config: path: external-secrets-config namespace: external-secrets - syncWave: -2 + syncWave: -20 gitea: path: gitea/12.3.0 namespace: cf-gitea @@ -212,7 +212,7 @@ apps: value: "{{ .Values.global.domain }}" - name: gitea.config.server.ROOT_URL value: "https://gitea.{{ .Values.global.domain }}" - syncWave: -3 + syncWave: -30 gitea-config: path: gitea-config namespace: cf-gitea @@ -222,21 +222,21 @@ apps: value: "https://kc.{{ .Values.global.domain }}" - name: keycloak.realm value: "airm" - syncWave: -2 + syncWave: -20 # Network apps gateway-api: path: gateway-api/v1.3.0 namespace: default - syncWave: -5 + syncWave: -50 metallb: path: metallb/v0.15.2 namespace: default - syncWave: -4 + syncWave: -40 kgateway-crds: path: kgateway-crds/v2.1.0-main namespace: kgateway-system valuesFile: values.yaml - syncWave: -3 + syncWave: -30 kgateway: path: kgateway/v2.1.0-main namespace: kgateway-system @@ -246,7 +246,7 @@ apps: registry: "ghcr.io" repository: silogen/kgateway-v2.1.0-main-websocket tag: "0.0.1" - syncWave: -2 + syncWave: -20 kgateway-config: path: kgateway-config namespace: kgateway-system @@ -254,13 +254,13 @@ apps: helmParameters: - name: domain value: "{{ .Values.global.domain }}" - syncWave: -2 + syncWave: -20 # Monitoring prometheus-crds: path: prometheus-operator-crds/23.0.0 namespace: prometheus-system valuesFile: values.yaml - syncWave: -5 + syncWave: -50 opentelemetry-operator: path: opentelemetry-operator/0.93.1 namespace: opentelemetry-operator-system @@ -271,7 +271,7 @@ apps: collectorImage: repository: 
ghcr.io/open-telemetry/opentelemetry-collector-releases/opentelemetry-collector-contrib tag: "0.140.0" - syncWave: -3 + syncWave: -30 otel-lgtm-stack: path: otel-lgtm-stack/v1.0.7 namespace: otel-lgtm-stack @@ -345,23 +345,23 @@ apps: helmParameters: - name: cluster.name value: "{{ .Values.global.domain }}" - syncWave: -2 + syncWave: -20 # Databases cnpg-operator: path: cnpg-operator/0.26.0 namespace: cnpg-system valuesFile: values.yaml - syncWave: -3 + syncWave: -30 # Access control cluster-auth: path: cluster-auth/0.5.0 namespace: cluster-auth valuesFile: values.yaml - syncWave: -2 + syncWave: -20 cluster-auth-config: path: cluster-auth-config namespace: cluster-auth - syncWave: -2 + syncWave: -20 ignoreDifferences: - group: external-secrets.io kind: ExternalSecret @@ -495,7 +495,7 @@ apps: helmParameters: - name: domain value: "{{ .Values.global.domain }}" - syncWave: -1 + syncWave: -10 ignoreDifferences: - group: external-secrets.io kind: ExternalSecret @@ -507,11 +507,11 @@ apps: path: kyverno/3.5.1 namespace: kyverno valuesFile: values.yaml - syncWave: -3 + syncWave: -30 kyverno-config: path: kyverno-config namespace: kyverno - syncWave: -2 + syncWave: -20 ignoreDifferences: - group: "kyverno.io" kind: "ClusterPolicy" @@ -527,7 +527,7 @@ apps: kyverno-policies-base: namespace: kyverno path: kyverno-policies/base - syncWave: -2 + syncWave: -20 # GPU amd-gpu-operator: path: amd-gpu-operator/v1.4.1 @@ -536,7 +536,7 @@ apps: crds: defaultCR: install: false - syncWave: -1 + syncWave: -10 amd-gpu-operator-config: path: amd-gpu-operator-config namespace: kube-amd-gpu @@ -545,13 +545,13 @@ apps: path: kuberay-operator/1.4.2 namespace: default valuesFile: values.yaml - syncWave: -1 + syncWave: -10 # Autoscaling keda: path: keda/2.18.1 namespace: keda valuesFile: values.yaml - syncWave: -1 + syncWave: -10 kedify-otel: path: kedify-otel/v0.0.6 namespace: keda @@ -559,13 +559,13 @@ apps: # Cluster-forge specific values for kedify-otel validatingAdmissionPolicy: enabled: 
false - syncWave: 0 + syncWave: -5 # ML/AI kserve-crds: path: kserve-crds/v0.16.0 namespace: kserve-system valuesFile: values.yaml - syncWave: -3 + syncWave: -30 kserve: path: kserve/v0.16.0 namespace: kserve-system @@ -573,12 +573,12 @@ apps: kserve: controller: deploymentMode: "Standard" - syncWave: -1 + syncWave: 0 # Queues rabbitmq: path: rabbitmq/v2.15.0 namespace: rabbitmq-system - syncWave: -1 + syncWave: -10 kueue: path: kueue/0.13.0 namespace: kueue-system @@ -632,21 +632,21 @@ apps: - "pod" - "deployment" - "statefulset" - syncWave: -1 + syncWave: -10 kueue-config: path: kueue-config namespace: kueue-system - syncWave: -1 + syncWave: -10 appwrapper: path: appwrapper/v1.1.2 namespace: appwrapper-system - syncWave: -1 + syncWave: -10 # Storage minio-operator: path: minio-operator/7.1.1 namespace: minio-operator valuesFile: values.yaml - syncWave: -1 + syncWave: -10 minio-tenant: path: minio-tenant/7.1.1 namespace: minio-tenant-default @@ -697,16 +697,16 @@ apps: aim-cluster-model-source: path: aim-cluster-model-source namespace: kaiwo-system - syncWave: -2 + syncWave: -20 kaiwo-crds: path: kaiwo-crds/v0.2.0-rc11 namespace: kaiwo-system - syncWave: -2 + syncWave: -20 kaiwo: path: kaiwo/v0.2.0-rc11 namespace: kaiwo-system valuesFile: values.yaml - syncWave: -1 + syncWave: -10 kaiwo-config: path: kaiwo-config namespace: kaiwo-system diff --git a/root/values_medium.yaml b/root/values_medium.yaml index b69b7e4f..769bdabc 100644 --- a/root/values_medium.yaml +++ b/root/values_medium.yaml @@ -56,7 +56,7 @@ apps: namespace: kyverno path: kyverno-policies/storage-local-path ignoreDifferences: [] - syncWave: -2 + syncWave: -20 argocd: valuesObject: From 3f478f26a4d5056d670a7cbf1ab7546b416d6bc8 Mon Sep 17 00:00:00 2001 From: Daniel Vaskivaara Date: Thu, 26 Feb 2026 07:42:02 +0200 Subject: [PATCH 030/115] fix: add ignoreDifferences section for keify-otel --- root/values.yaml | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/root/values.yaml 
b/root/values.yaml index d381661e..d6d34780 100644 --- a/root/values.yaml +++ b/root/values.yaml @@ -560,6 +560,17 @@ apps: validatingAdmissionPolicy: enabled: false syncWave: -5 + ignoreDifferences: + - group: "" + kind: "Service" + name: "keda-otel-scaler" + jqPathExpressions: + - ".status" + - group: "apps" + kind: "Deployment" + jqPathExpressions: + - ".status.readyReplicas" + - ".status.availableReplicas" # ML/AI kserve-crds: path: kserve-crds/v0.16.0 From 73fe6ef3fbe8b6a92fedb309d6cb563a81f27be3 Mon Sep 17 00:00:00 2001 From: Daniel Vaskivaara Date: Thu, 26 Feb 2026 07:50:03 +0200 Subject: [PATCH 031/115] feat: add custom keda/scaledObject health check at argo app level --- root/values.yaml | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/root/values.yaml b/root/values.yaml index d6d34780..aa667d19 100644 --- a/root/values.yaml +++ b/root/values.yaml @@ -70,6 +70,30 @@ apps: hs.status = "Healthy" hs.message = "Always Healthy - See https://github.com/argoproj/argo-cd/pull/24284" return hs + resource.customizations.health.keda.sh_ScaledObject: | + hs = {} + if obj.status ~= nil then + if obj.status.conditions ~= nil then + for _, condition in ipairs(obj.status.conditions) do + if condition.type == "Ready" then + if condition.status == "True" then + hs.status = "Healthy" + hs.message = "ScaledObject is ready" + else + hs.status = "Degraded" + hs.message = condition.reason or "ScaledObject not ready" + end + return hs + end + end + end + hs.status = "Progressing" + hs.message = "ScaledObject status unknown" + else + hs.status = "Progressing" + hs.message = "ScaledObject status unknown" + end + return hs params: server.insecure: true rbac: From d25d7aafa6a66edadf40502280fb999f7034ecf4 Mon Sep 17 00:00:00 2001 From: Daniel Vaskivaara Date: Thu, 26 Feb 2026 13:49:16 +0200 Subject: [PATCH 032/115] fix: remove trailing pipe --- .../storage-local-path/templates/access-mode-mutation.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) 
diff --git a/sources/kyverno-policies/storage-local-path/templates/access-mode-mutation.yaml b/sources/kyverno-policies/storage-local-path/templates/access-mode-mutation.yaml index 20bab9db..24cc6f09 100644 --- a/sources/kyverno-policies/storage-local-path/templates/access-mode-mutation.yaml +++ b/sources/kyverno-policies/storage-local-path/templates/access-mode-mutation.yaml @@ -44,7 +44,7 @@ spec: - ReadWriteOnce metadata: annotations: - +(kyverno.io/original-access-modes): "{{ "{{" }} request.object.spec.accessModes && request.object.spec.accessModes | join(',') || 'undefined' {{ "}}" }}" + +(kyverno.io/original-access-modes): "{{ "{{" }} join(',', request.object.spec.accessModes) {{ "}}" }}" +(kyverno.io/mutation-applied): "local-path-rwx-to-rwo" +(kyverno.io/policy-reason): "local-path provisioner only supports ReadWriteOnce and ReadWriteOncePod" @@ -84,7 +84,7 @@ spec: value: "{{ "{{" }} request.object.spec.accessModes || [] {{ "}}" }}" validate: message: >- - WARNING: The requested access mode(s) {{ "{{" }} request.object.spec.accessModes && request.object.spec.accessModes | join(',') || 'undefined' {{ "}}" }} + WARNING: The requested access mode(s) {{ "{{" }} join(',', request.object.spec.accessModes) {{ "}}" }} are not supported by the local-path provisioner used in small/medium clusters. The access mode has been automatically converted to ReadWriteOnce (RWO). For ReadWriteMany support, consider using a large cluster with Longhorn storage. 
From fbf2e1ff7974f7d3f5d6fc0c2bcd35b83e6462a2 Mon Sep 17 00:00:00 2001 From: Daniel Vaskivaara Date: Thu, 26 Feb 2026 13:44:32 +0200 Subject: [PATCH 033/115] feat: add modelcache resource contraint policy (wiring still pending) --- .../clusterPolicy.yaml | 31 +++++++++++++++++++ 1 file changed, 31 insertions(+) create mode 100644 sources/kyverno-policies/modelcache-resource-constraints/clusterPolicy.yaml diff --git a/sources/kyverno-policies/modelcache-resource-constraints/clusterPolicy.yaml b/sources/kyverno-policies/modelcache-resource-constraints/clusterPolicy.yaml new file mode 100644 index 00000000..f8e2573e --- /dev/null +++ b/sources/kyverno-policies/modelcache-resource-constraints/clusterPolicy.yaml @@ -0,0 +1,31 @@ +apiVersion: kyverno.io/v1 +kind: ClusterPolicy +metadata: + name: modelcache-download-resource-limits +spec: + rules: + - name: add-resource-constraints-to-download-jobs + match: + any: + - resources: + kinds: + - Job + selector: + matchLabels: + app.kubernetes.io/managed-by: modelcache-controller + mutate: + patchStrategicMerge: + spec: + template: + spec: + containers: + - (name): "*" + resources: + limits: + memory: "32Gi" + requests: + memory: "4Gi" + cpu: "2" + env: + - name: HF_XET_HIGH_PERFORMANCE + value: "0" \ No newline at end of file From f0364d76a742bc3f08f0ed3e546a76a232ce4faa Mon Sep 17 00:00:00 2001 From: Daniel Vaskivaara Date: Thu, 26 Feb 2026 16:23:20 +0200 Subject: [PATCH 034/115] refactor: move enabledApps into large for greater clarity, instead of values.yaml having the array setup for large; fix: add kyverno policy for modelcache resource constraints --- root/values.yaml | 47 +++--------------------------------------- root/values_large.yaml | 46 +++++++++++++++++++++++++++++++++++++++++ 2 files changed, 49 insertions(+), 44 deletions(-) diff --git a/root/values.yaml b/root/values.yaml index aa667d19..70383b37 100644 --- a/root/values.yaml +++ b/root/values.yaml @@ -10,50 +10,9 @@ externalValues: global: domain: # to be 
filled by bootstrap script clusterSize: # to be filled by bootstrap script (small, medium, large) -enabledApps: - - aim-cluster-model-source - - airm - - amd-gpu-operator - - amd-gpu-operator-config - - appwrapper - - argocd - - argocd-config - - cert-manager - - cluster-auth - - cluster-auth-config - - cnpg-operator - - external-secrets - - external-secrets-config - - gateway-api - - gitea - - gitea-config - - kaiwo - - kaiwo-config - - kaiwo-crds - - keda - - kedify-otel - - keycloak - - kgateway - - kgateway-config - - kgateway-crds - - kserve - - kserve-crds - - kueue - - kueue-config - - kuberay-operator - - kyverno - - kyverno-config - - kyverno-policies-base - - metallb - - minio-operator - - minio-tenant - - minio-tenant-config - - openbao - - openbao-config - - opentelemetry-operator - - otel-lgtm-stack - - prometheus-crds - - rabbitmq + +# enabledApps: defined within cluster_[small|medium|large].yaml + apps: # Core apps argocd: diff --git a/root/values_large.yaml b/root/values_large.yaml index 6efb6d52..b1bd8c9a 100644 --- a/root/values_large.yaml +++ b/root/values_large.yaml @@ -6,6 +6,52 @@ # LARGE CLUSTER: All apps enabled (inherited from base values.yaml) # Uses Longhorn storage with native RWX support - no access mode mutation needed +enabledApps: + - aim-cluster-model-source + - airm + - amd-gpu-operator + - amd-gpu-operator-config + - appwrapper + - argocd + - argocd-config + - cert-manager + - cluster-auth + - cluster-auth-config + - cnpg-operator + - external-secrets + - external-secrets-config + - gateway-api + - gitea + - gitea-config + - kaiwo + - kaiwo-config + - kaiwo-crds + - keda + - kedify-otel + - keycloak + - kgateway + - kgateway-config + - kgateway-crds + - kserve + - kserve-crds + - kueue + - kueue-config + - kuberay-operator + - kyverno + - kyverno-config + - kyverno-policies-base + - kyverno-policies-modelcache-resource-constraints + - metallb + - minio-operator + - minio-tenant + - minio-tenant-config + - openbao + - 
openbao-config + - opentelemetry-operator + - otel-lgtm-stack + - prometheus-crds + - rabbitmq + apps: # Git - Gitea (Production: single replica, sqlite3) gitea: From 33bd42474f9ad923f645febbde478f08b2c2f38f Mon Sep 17 00:00:00 2001 From: Daniel Vaskivaara Date: Thu, 26 Feb 2026 16:28:24 +0200 Subject: [PATCH 035/115] feat: add Chart.yaml and templates folder for new kyverno policy --- .../Chart.yaml | 19 +++++++++++++++++++ .../{ => templates}/clusterPolicy.yaml | 0 2 files changed, 19 insertions(+) create mode 100644 sources/kyverno-policies/modelcache-resource-constraints/Chart.yaml rename sources/kyverno-policies/modelcache-resource-constraints/{ => templates}/clusterPolicy.yaml (100%) diff --git a/sources/kyverno-policies/modelcache-resource-constraints/Chart.yaml b/sources/kyverno-policies/modelcache-resource-constraints/Chart.yaml new file mode 100644 index 00000000..a2cd1caa --- /dev/null +++ b/sources/kyverno-policies/modelcache-resource-constraints/Chart.yaml @@ -0,0 +1,19 @@ +apiVersion: v2 +name: kyverno-policies-modelcache-resource-constraints +description: Kyverno policies for modelcache resource constraints (large cluster size only) +type: application +version: 1.0.0 +appVersion: "1.0.0" +keywords: + - kyverno + - policies + - modelcache + - resource constraints +maintainers: + - name: ClusterForge Team +home: https://github.com/silogen/cluster-forge +sources: + - https://github.com/silogen/cluster-forge +annotations: + cluster-forge.io/target-sizes: "large" + cluster-forge.io/description: "Restricts download resource usage from modelcache" \ No newline at end of file diff --git a/sources/kyverno-policies/modelcache-resource-constraints/clusterPolicy.yaml b/sources/kyverno-policies/modelcache-resource-constraints/templates/clusterPolicy.yaml similarity index 100% rename from sources/kyverno-policies/modelcache-resource-constraints/clusterPolicy.yaml rename to sources/kyverno-policies/modelcache-resource-constraints/templates/clusterPolicy.yaml From 
cded60774ff23c6e2920873ac08c792ae3e80150 Mon Sep 17 00:00:00 2001 From: Daniel Vaskivaara Date: Thu, 26 Feb 2026 16:31:26 +0200 Subject: [PATCH 036/115] fix: add app definition --- root/values_large.yaml | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/root/values_large.yaml b/root/values_large.yaml index b1bd8c9a..7ec7f964 100644 --- a/root/values_large.yaml +++ b/root/values_large.yaml @@ -53,6 +53,17 @@ enabledApps: - rabbitmq apps: + kyverno-policies-modelcache-resource-constraints: + namespace: kyverno + path: kyverno-policies/modelcache-resource-constraints + ignoreDifferences: [] + syncWave: -20 + ignoreDifferences: + - group: kyverno.io + kind: ClusterPolicy + jsonPointers: + - /spec/rules/0/skipBackgroundRequests + - /spec/rules/0/validate/allowExistingViolations # Git - Gitea (Production: single replica, sqlite3) gitea: valuesObject: From ae881169926ec601c225ed4ab1a64353383e7ce5 Mon Sep 17 00:00:00 2001 From: Daniel Vaskivaara Date: Thu, 26 Feb 2026 20:23:58 +0200 Subject: [PATCH 037/115] fix: new base policy (resource constraints on modelcache) --- root/values.yaml | 47 +++++++++++- root/values_large.yaml | 46 ------------ .../base/templates/dynamic-pvc-creation.yaml | 73 +++++++++++++++++++ .../modelcache-resource-constraints.yaml} | 0 .../Chart.yaml | 19 ----- 5 files changed, 117 insertions(+), 68 deletions(-) create mode 100644 sources/kyverno-policies/base/templates/dynamic-pvc-creation.yaml rename sources/kyverno-policies/{modelcache-resource-constraints/templates/clusterPolicy.yaml => base/templates/modelcache-resource-constraints.yaml} (100%) delete mode 100644 sources/kyverno-policies/modelcache-resource-constraints/Chart.yaml diff --git a/root/values.yaml b/root/values.yaml index 70383b37..aa667d19 100644 --- a/root/values.yaml +++ b/root/values.yaml @@ -10,9 +10,50 @@ externalValues: global: domain: # to be filled by bootstrap script clusterSize: # to be filled by bootstrap script (small, medium, large) - -# enabledApps: defined 
within cluster_[small|medium|large].yaml - +enabledApps: + - aim-cluster-model-source + - airm + - amd-gpu-operator + - amd-gpu-operator-config + - appwrapper + - argocd + - argocd-config + - cert-manager + - cluster-auth + - cluster-auth-config + - cnpg-operator + - external-secrets + - external-secrets-config + - gateway-api + - gitea + - gitea-config + - kaiwo + - kaiwo-config + - kaiwo-crds + - keda + - kedify-otel + - keycloak + - kgateway + - kgateway-config + - kgateway-crds + - kserve + - kserve-crds + - kueue + - kueue-config + - kuberay-operator + - kyverno + - kyverno-config + - kyverno-policies-base + - metallb + - minio-operator + - minio-tenant + - minio-tenant-config + - openbao + - openbao-config + - opentelemetry-operator + - otel-lgtm-stack + - prometheus-crds + - rabbitmq apps: # Core apps argocd: diff --git a/root/values_large.yaml b/root/values_large.yaml index 7ec7f964..812d3c94 100644 --- a/root/values_large.yaml +++ b/root/values_large.yaml @@ -6,52 +6,6 @@ # LARGE CLUSTER: All apps enabled (inherited from base values.yaml) # Uses Longhorn storage with native RWX support - no access mode mutation needed -enabledApps: - - aim-cluster-model-source - - airm - - amd-gpu-operator - - amd-gpu-operator-config - - appwrapper - - argocd - - argocd-config - - cert-manager - - cluster-auth - - cluster-auth-config - - cnpg-operator - - external-secrets - - external-secrets-config - - gateway-api - - gitea - - gitea-config - - kaiwo - - kaiwo-config - - kaiwo-crds - - keda - - kedify-otel - - keycloak - - kgateway - - kgateway-config - - kgateway-crds - - kserve - - kserve-crds - - kueue - - kueue-config - - kuberay-operator - - kyverno - - kyverno-config - - kyverno-policies-base - - kyverno-policies-modelcache-resource-constraints - - metallb - - minio-operator - - minio-tenant - - minio-tenant-config - - openbao - - openbao-config - - opentelemetry-operator - - otel-lgtm-stack - - prometheus-crds - - rabbitmq - apps: 
kyverno-policies-modelcache-resource-constraints: namespace: kyverno diff --git a/sources/kyverno-policies/base/templates/dynamic-pvc-creation.yaml b/sources/kyverno-policies/base/templates/dynamic-pvc-creation.yaml new file mode 100644 index 00000000..600a95b1 --- /dev/null +++ b/sources/kyverno-policies/base/templates/dynamic-pvc-creation.yaml @@ -0,0 +1,73 @@ +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: kyverno-pvc-generator +rules: + - apiGroups: + - "" + resources: + - persistentvolumeclaims + verbs: + - create +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: kyverno-pvc-generator-binding +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: kyverno-pvc-generator +subjects: + - kind: ServiceAccount + name: kyverno-background-controller + namespace: kyverno +--- +apiVersion: kyverno.io/v1 +kind: ClusterPolicy +metadata: + name: dynamic-pvc-creation +spec: + admission: true + background: false + emitWarning: false + rules: + - generate: + apiVersion: v1 + data: + spec: + accessModes: + - ReadWriteMany + resources: + requests: + storage: '{{ request.object.metadata.annotations."pvc.silogen.ai/user-pvc-size" + }}' + storageClassName: '{{ request.object.metadata.annotations."pvc.silogen.ai/user-pvc-storage-class-name" + }}' + kind: PersistentVolumeClaim + name: pvc-{{ request.object.metadata.annotations."pvc.silogen.ai/user-pvc-uid" }} + namespace: '{{ request.object.metadata.namespace }}' + synchronize: false + match: + resources: + kinds: + - Deployment + - Pod + name: create-pvc-if-annotated + preconditions: + all: + - key: '{{ request.object.metadata.annotations."pvc.silogen.ai/user-pvc-auto-create" + }}' + operator: Equals + value: "true" + - key: '{{ request.object.metadata.annotations."pvc.silogen.ai/user-pvc-size" + }}' + operator: NotEquals + value: "" + - key: '{{ request.object.metadata.annotations."pvc.silogen.ai/user-pvc-storage-class-name" + }}' + 
operator: NotEquals + value: "" + skipBackgroundRequests: true + validationFailureAction: Enforce \ No newline at end of file diff --git a/sources/kyverno-policies/modelcache-resource-constraints/templates/clusterPolicy.yaml b/sources/kyverno-policies/base/templates/modelcache-resource-constraints.yaml similarity index 100% rename from sources/kyverno-policies/modelcache-resource-constraints/templates/clusterPolicy.yaml rename to sources/kyverno-policies/base/templates/modelcache-resource-constraints.yaml diff --git a/sources/kyverno-policies/modelcache-resource-constraints/Chart.yaml b/sources/kyverno-policies/modelcache-resource-constraints/Chart.yaml deleted file mode 100644 index a2cd1caa..00000000 --- a/sources/kyverno-policies/modelcache-resource-constraints/Chart.yaml +++ /dev/null @@ -1,19 +0,0 @@ -apiVersion: v2 -name: kyverno-policies-modelcache-resource-constraints -description: Kyverno policies for modelcache resource constraints (large cluster size only) -type: application -version: 1.0.0 -appVersion: "1.0.0" -keywords: - - kyverno - - policies - - modelcache - - resource constraints -maintainers: - - name: ClusterForge Team -home: https://github.com/silogen/cluster-forge -sources: - - https://github.com/silogen/cluster-forge -annotations: - cluster-forge.io/target-sizes: "large" - cluster-forge.io/description: "Restricts download resource usage from modelcache" \ No newline at end of file From 1d899e4bf4c2cc528edf371132b73de99a4152d1 Mon Sep 17 00:00:00 2001 From: Daniel Vaskivaara Date: Thu, 26 Feb 2026 20:26:19 +0200 Subject: [PATCH 038/115] fix: rm large relic --- root/values_large.yaml | 11 ----------- 1 file changed, 11 deletions(-) diff --git a/root/values_large.yaml b/root/values_large.yaml index 812d3c94..6efb6d52 100644 --- a/root/values_large.yaml +++ b/root/values_large.yaml @@ -7,17 +7,6 @@ # Uses Longhorn storage with native RWX support - no access mode mutation needed apps: - kyverno-policies-modelcache-resource-constraints: - namespace: 
kyverno - path: kyverno-policies/modelcache-resource-constraints - ignoreDifferences: [] - syncWave: -20 - ignoreDifferences: - - group: kyverno.io - kind: ClusterPolicy - jsonPointers: - - /spec/rules/0/skipBackgroundRequests - - /spec/rules/0/validate/allowExistingViolations # Git - Gitea (Production: single replica, sqlite3) gitea: valuesObject: From b10a02a227f645303599b6148211322f5538f205 Mon Sep 17 00:00:00 2001 From: woojae-siloai Date: Mon, 2 Mar 2026 15:09:42 +0200 Subject: [PATCH 039/115] add cluster-policy to kaiwo --- .../v0.2.0-rc10/templates/cluster-policy.yaml | 35 +++++++++++++++++++ 1 file changed, 35 insertions(+) create mode 100644 sources/kaiwo/v0.2.0-rc10/templates/cluster-policy.yaml diff --git a/sources/kaiwo/v0.2.0-rc10/templates/cluster-policy.yaml b/sources/kaiwo/v0.2.0-rc10/templates/cluster-policy.yaml new file mode 100644 index 00000000..03c60fb3 --- /dev/null +++ b/sources/kaiwo/v0.2.0-rc10/templates/cluster-policy.yaml @@ -0,0 +1,35 @@ +apiVersion: kyverno.io/v1 +kind: ClusterPolicy +metadata: + name: modelcache-download-resource-limits +spec: + rules: + - name: add-resource-constraints-to-download-jobs + match: + any: + - resources: + kinds: + - Job + selector: + matchLabels: + app.kubernetes.io/managed-by: modelcache-controller + mutate: + patchStrategicMerge: + spec: + template: + spec: + containers: + - (name): "model-download" + resources: + limits: + memory: "32Gi" + requests: + memory: "4Gi" + cpu: "2" + env: + - name: HF_XET_HIGH_PERFORMANCE + value: "0" + - name: HF_XET_NUM_CONCURRENT_RANGE_GETS + value: "8" + - name: HF_XET_RECONSTRUCT_WRITE_SEQUENTIALLY + value: "1" From ae1441c2521d99299830ef68a6dc1cff056def7d Mon Sep 17 00:00:00 2001 From: woojae-siloai Date: Mon, 2 Mar 2026 15:21:43 +0200 Subject: [PATCH 040/115] move cluster-policy for kaiwo to kyverno-policies --- .../base/templates/modelcache-resource-constraints.yaml | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git 
a/sources/kyverno-policies/base/templates/modelcache-resource-constraints.yaml b/sources/kyverno-policies/base/templates/modelcache-resource-constraints.yaml index f8e2573e..03c60fb3 100644 --- a/sources/kyverno-policies/base/templates/modelcache-resource-constraints.yaml +++ b/sources/kyverno-policies/base/templates/modelcache-resource-constraints.yaml @@ -19,7 +19,7 @@ spec: template: spec: containers: - - (name): "*" + - (name): "model-download" resources: limits: memory: "32Gi" @@ -28,4 +28,8 @@ spec: cpu: "2" env: - name: HF_XET_HIGH_PERFORMANCE - value: "0" \ No newline at end of file + value: "0" + - name: HF_XET_NUM_CONCURRENT_RANGE_GETS + value: "8" + - name: HF_XET_RECONSTRUCT_WRITE_SEQUENTIALLY + value: "1" From c2e10f48792604ec8c5ab1cf671238892a2113ad Mon Sep 17 00:00:00 2001 From: woojae-siloai Date: Mon, 2 Mar 2026 15:22:45 +0200 Subject: [PATCH 041/115] delete cluster-policy from kaiwo --- .../v0.2.0-rc10/templates/cluster-policy.yaml | 35 ------------------- 1 file changed, 35 deletions(-) delete mode 100644 sources/kaiwo/v0.2.0-rc10/templates/cluster-policy.yaml diff --git a/sources/kaiwo/v0.2.0-rc10/templates/cluster-policy.yaml b/sources/kaiwo/v0.2.0-rc10/templates/cluster-policy.yaml deleted file mode 100644 index 03c60fb3..00000000 --- a/sources/kaiwo/v0.2.0-rc10/templates/cluster-policy.yaml +++ /dev/null @@ -1,35 +0,0 @@ -apiVersion: kyverno.io/v1 -kind: ClusterPolicy -metadata: - name: modelcache-download-resource-limits -spec: - rules: - - name: add-resource-constraints-to-download-jobs - match: - any: - - resources: - kinds: - - Job - selector: - matchLabels: - app.kubernetes.io/managed-by: modelcache-controller - mutate: - patchStrategicMerge: - spec: - template: - spec: - containers: - - (name): "model-download" - resources: - limits: - memory: "32Gi" - requests: - memory: "4Gi" - cpu: "2" - env: - - name: HF_XET_HIGH_PERFORMANCE - value: "0" - - name: HF_XET_NUM_CONCURRENT_RANGE_GETS - value: "8" - - name: 
HF_XET_RECONSTRUCT_WRITE_SEQUENTIALLY - value: "1" From 64dd39f51eec2f4b8e70b0cedf41155333502d3e Mon Sep 17 00:00:00 2001 From: Daniel Vaskivaara Date: Mon, 2 Mar 2026 22:49:51 +0200 Subject: [PATCH 042/115] fix: remove duplicate ignoreDifferences (and scope for entire array, not just element 0) --- root/values_medium.yaml | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/root/values_medium.yaml b/root/values_medium.yaml index 83e7ea95..e0ab43fc 100644 --- a/root/values_medium.yaml +++ b/root/values_medium.yaml @@ -1,8 +1,5 @@ -# MEDIUM CLUSTER: All apps enabled (inherited from base values.yaml) -# Add Kyverno policy for local-path access mode mutation - -# Medium & Small clusters add local-path storage policy for RWX→RWO conversion # Medium & Small clusters add local-path storage policy for RWX→RWO conversion + enabledApps: - aim-cluster-model-source - airm @@ -55,14 +52,13 @@ apps: kyverno-policies-storage-local-path: namespace: kyverno path: kyverno-policies/storage-local-path - ignoreDifferences: [] syncWave: -20 ignoreDifferences: - group: kyverno.io kind: ClusterPolicy jsonPointers: - - /spec/rules/0/skipBackgroundRequests - - /spec/rules/0/validate/allowExistingViolations + - /spec/rules/*/skipBackgroundRequests + - /spec/rules/*/validate/allowExistingViolations argocd: valuesObject: applicationSet: From 844dc6f019eca9d63520b2a3c77472ad242b006f Mon Sep 17 00:00:00 2001 From: Daniel Vaskivaara Date: Fri, 27 Feb 2026 12:17:12 +0200 Subject: [PATCH 043/115] refactor: bootsrap without yq --- scripts/bootstrap_v2.sh | 322 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 322 insertions(+) create mode 100644 scripts/bootstrap_v2.sh diff --git a/scripts/bootstrap_v2.sh b/scripts/bootstrap_v2.sh new file mode 100644 index 00000000..026f5ce1 --- /dev/null +++ b/scripts/bootstrap_v2.sh @@ -0,0 +1,322 @@ +#!/bin/bash + +set -euo pipefail + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" + +# Initialize variables 
+LATEST_RELEASE="v1.8.0" +TARGET_REVISION="$LATEST_RELEASE" + +CLUSTER_SIZE="medium" # Default to medium +DOMAIN="" +KUBE_VERSION=1.33 +VALUES_FILE="values.yaml" + +# Parse arguments +while [[ $# -gt 0 ]]; do + case $1 in + --CLUSTER-SIZE|--cluster-size|-s) + if [ -z "$2" ]; then + echo "ERROR: --cluster-size requires an argument" + exit 1 + fi + CLUSTER_SIZE="$2" + shift 2 + ;; + --CLUSTER-SIZE=*) + CLUSTER_SIZE="${1#*=}" + shift + ;; + --cluster-size=*) + CLUSTER_SIZE="${1#*=}" + shift + ;; + -s=*) + CLUSTER_SIZE="${1#*=}" + shift + ;; + --TARGET-REVISION|--target-revision|-r) + if [ -z "$2" ]; then + echo "WARNING: defaulting to --target-revision=$LATEST_RELEASE (no value specified)" + TARGET_REVISION="$LATEST_RELEASE" + shift + else + TARGET_REVISION="$2" + shift 2 + fi + ;; + --TARGET-REVISION=*) + TARGET_REVISION="${1#*=}" + shift + ;; + --target-revision=*) + TARGET_REVISION="${1#*=}" + shift + ;; + -r=*) + TARGET_REVISION="${1#*=}" + shift + ;; + --help|-h) + cat < [values_file] + + Arguments: + domain Required. Cluster domain (e.g., example.com) + values_file Optional. 
Values .yaml file to use, default: root/values.yaml + + Options: + -r, --target-revision cluster-forge git revision to seed into cluster-values/values.yaml file + options: [tag|commit_hash|branch_name], default: $LATEST_RELEASE + -s, --cluster-size options: [small|medium|large], default: medium + + Examples: + $0 compute.amd.com values_custom.yaml --cluster-size=large + $0 112.100.97.17.nip.io + $0 dev.example.com --cluster-size=small --target-revision=$LATEST_RELEASE + $0 dev.example.com -s=small -r=$LATEST_RELEASE +HELP_OUTPUT + exit 0 + ;; + --*) + echo "ERROR: Unknown option: $1" + echo "Use --help for usage information" + exit 1 + ;; + *) + # Positional arguments + if [ -z "$DOMAIN" ]; then + DOMAIN="$1" + elif [ "$VALUES_FILE" = "values.yaml" ]; then + VALUES_FILE="$1" + else + echo "ERROR: Too many arguments: $1" + echo "Usage: $0 [--CLUSTER_SIZE=small|medium|large] [--dev] [values_file]" + exit 1 + fi + shift + ;; + esac +done + +# Validate required arguments +if [ -z "$DOMAIN" ]; then + echo "ERROR: Domain argument is required" + echo "Usage: $0 [values_file] [--CLUSTER_SIZE=small|medium|large]" + echo "Use --help for more details" + exit 1 +fi + +# Validate cluster size +case "$CLUSTER_SIZE" in + small|medium|large) + ;; + *) + echo "ERROR: Invalid cluster size '$CLUSTER_SIZE'" + echo "Valid sizes: small, medium, large" + exit 1 + ;; +esac + +# Validate values file exists +if [ ! -f "${SCRIPT_DIR}/../root/${VALUES_FILE}" ]; then + echo "ERROR: Values file not found: ${SCRIPT_DIR}/../root/${VALUES_FILE}" + exit 1 +fi + +# Check if size-specific values file exists +setup_values_files() { + SIZE_VALUES_FILE="values_${CLUSTER_SIZE}.yaml" + + if [ ! 
-f "${SOURCE_ROOT}/root/${SIZE_VALUES_FILE}" ]; then + echo "WARNING: Size-specific values file not found: ${SOURCE_ROOT}/root/${SIZE_VALUES_FILE}" + echo "Proceeding with base values file only: ${VALUES_FILE}" + SIZE_VALUES_FILE="" + else + echo "Using size-specific values file: ${SIZE_VALUES_FILE}" + fi +} + +display_target_revision() { + # Check if TARGET_REVISION was explicitly set via command line flag + # by comparing against the default value + if [ "$TARGET_REVISION" != "$LATEST_RELEASE" ]; then + echo "Using specified targetRevision: $TARGET_REVISION" + else + echo "Using default targetRevision: $TARGET_REVISION" + fi +} + +# Since we only support v1.8.0+, always use local sources +setup_sources() { + SOURCE_ROOT="${SCRIPT_DIR}/.." + echo "Using local sources for target revision: $TARGET_REVISION" +} + +pre_cleanup() { + echo "" + echo "=== Pre-cleanup: Checking for previous runs ===" + + # Check if gitea-init-job exists and completed successfully + if kubectl get job gitea-init-job -n cf-gitea >/dev/null 2>&1; then + if kubectl get job gitea-init-job -n cf-gitea -o jsonpath='{.status.conditions[?(@.type=="Complete")].status}' 2>/dev/null | grep -q "True"; then + echo "Found completed gitea-init-job - removing Gitea to start fresh" + + # Delete all Gitea resources + kubectl delete job gitea-init-job -n cf-gitea --ignore-not-found=true + kubectl delete deployment gitea -n cf-gitea --ignore-not-found=true + kubectl delete statefulset gitea -n cf-gitea --ignore-not-found=true + kubectl delete service gitea -n cf-gitea --ignore-not-found=true + kubectl delete service gitea-http -n cf-gitea --ignore-not-found=true + kubectl delete service gitea-ssh -n cf-gitea --ignore-not-found=true + kubectl delete pvc -n cf-gitea -l app.kubernetes.io/name=gitea --ignore-not-found=true + kubectl delete configmap initial-cf-values -n cf-gitea --ignore-not-found=true + kubectl delete secret gitea-admin-credentials -n cf-gitea --ignore-not-found=true + kubectl delete ingress -n 
cf-gitea -l app.kubernetes.io/name=gitea --ignore-not-found=true + + echo "Gitea resources deleted" + fi + fi + + # Always delete openbao-init-job to allow re-initialization + kubectl delete job openbao-init-job -n cf-openbao --ignore-not-found=true + + # Clean up any bootstrap manifest files from previous runs + rm -f /tmp/cluster-forge-bootstrap.yaml + + echo "=== Pre-cleanup complete ===" + echo "" +} + +# NEW: ArgoCD-Native Template Rendering Function +render_cluster_forge_manifests() { + echo "" + echo "=== Rendering ClusterForge Manifests ===" + echo "Domain: $DOMAIN" + echo "Base values: $VALUES_FILE" + echo "Cluster size: $CLUSTER_SIZE" + echo "Target revision: $TARGET_REVISION" + echo "" + + local helm_args=( + "cluster-forge" "${SOURCE_ROOT}/root" + "--namespace" "argocd" + "--values" "${SOURCE_ROOT}/root/${VALUES_FILE}" + ) + + # Add size-specific values if they exist + if [ -n "$SIZE_VALUES_FILE" ]; then + helm_args+=( + "--values" "${SOURCE_ROOT}/root/${SIZE_VALUES_FILE}" + ) + echo "Size overlay: $SIZE_VALUES_FILE" + fi + + # Set runtime configuration + helm_args+=( + "--set" "global.domain=$DOMAIN" + "--set" "global.clusterSize=values_${CLUSTER_SIZE}.yaml" + "--set" "externalValues.enabled=true" + "--set" "clusterForge.repoUrl=http://gitea-http.cf-gitea.svc:3000/cluster-org/cluster-forge.git" + "--set" "clusterForge.targetRevision=$TARGET_REVISION" + "--set" "externalValues.repoUrl=http://gitea-http.cf-gitea.svc:3000/cluster-org/cluster-values.git" + "--set" "externalValues.targetRevision=main" + "--kube-version" "$KUBE_VERSION" + ) + + echo "🚀 Rendering all manifests using ArgoCD-native templating..." + + # Render all manifests in one go - no yq, no temp files, no manual extraction! 
+ helm template "${helm_args[@]}" > /tmp/cluster-forge-bootstrap.yaml + + echo "✅ All manifests rendered to /tmp/cluster-forge-bootstrap.yaml" + echo "" +} + +# NEW: Smart Application of Sync Waves +apply_manifests_by_sync_wave() { + echo "=== Applying Manifests by Sync Wave ===" + + # Create required namespaces first + echo "Creating namespaces..." + kubectl create ns argocd --dry-run=client -o yaml | kubectl apply -f - + kubectl create ns cf-gitea --dry-run=client -o yaml | kubectl apply -f - + kubectl create ns cf-openbao --dry-run=client -o yaml | kubectl apply -f - + + # Apply manifests - ArgoCD will handle sync waves naturally + echo "🎯 Applying all ClusterForge manifests..." + kubectl apply -f /tmp/cluster-forge-bootstrap.yaml + + echo "" + echo "🎉 Bootstrap manifests applied!" + echo "ArgoCD will now orchestrate the deployment using sync waves." + echo "" + echo "=== Monitoring Key Components ===" + + # Wait for ArgoCD to be ready (it should be in the bootstrap manifests) + if kubectl get statefulset argocd-application-controller -n argocd >/dev/null 2>&1; then + echo "⏳ Waiting for ArgoCD Application Controller..." + kubectl rollout status statefulset/argocd-application-controller -n argocd --timeout=300s + echo "✅ ArgoCD Application Controller ready" + fi + + if kubectl get deployment argocd-repo-server -n argocd >/dev/null 2>&1; then + echo "⏳ Waiting for ArgoCD Repo Server..." 
+ kubectl rollout status deploy/argocd-repo-server -n argocd --timeout=300s + echo "✅ ArgoCD Repo Server ready" + fi + + # Monitor progress of core applications + echo "" + echo "📊 Core applications will be deployed by ArgoCD in sync wave order:" + echo " Wave -5: CRDs and operators" + echo " Wave -4: Core infrastructure (ArgoCD, OpenBao, External Secrets)" + echo " Wave -3: Network and storage" + echo " Wave -2: Configuration and secrets management" + echo " Wave -1: Application dependencies" + echo " Wave 0: Applications" + echo "" +} + +# NEW: Post-Bootstrap Status Check +show_bootstrap_summary() { + echo "=== ClusterForge Bootstrap Complete ===" + echo "" + echo "Domain: $DOMAIN" + echo "Cluster size: $CLUSTER_SIZE" + echo "Target revision: $TARGET_REVISION" + echo "" + echo "🌐 Access URLs:" + echo " ArgoCD: https://argocd.${DOMAIN}" + echo " Gitea: https://gitea.${DOMAIN}" + echo "" + echo "📋 Next steps:" + echo " 1. Monitor ArgoCD applications: kubectl get apps -n argocd" + echo " 2. Check sync status: kubectl get apps -n argocd -o wide" + echo " 3. View ArgoCD UI for detailed deployment progress" + echo "" + echo "🧹 Cleanup: Bootstrap manifest saved at /tmp/cluster-forge-bootstrap.yaml" + echo "" + echo "This is the way! 
🚀" +} + +# Main execution flow +main() { + display_target_revision + setup_sources + setup_values_files + + # Run pre-cleanup (removing till refined) + # pre_cleanup + + # NEW APPROACH: Single ArgoCD-native rendering and application + render_cluster_forge_manifests + apply_manifests_by_sync_wave + + # Show final status + show_bootstrap_summary +} + +# Execute main function +main \ No newline at end of file From 8d84f717efb5ed2330590b10e9f22ad71284bdec Mon Sep 17 00:00:00 2001 From: Daniel Vaskivaara Date: Fri, 27 Feb 2026 13:02:50 +0200 Subject: [PATCH 044/115] feat: remove forceful bootstrap of cluster-forge child apps --- scripts/bootstrap_v2.sh | 142 ++++++++++++++++++++++++++++------------ 1 file changed, 101 insertions(+), 41 deletions(-) mode change 100644 => 100755 scripts/bootstrap_v2.sh diff --git a/scripts/bootstrap_v2.sh b/scripts/bootstrap_v2.sh old mode 100644 new mode 100755 index 026f5ce1..33c93f44 --- a/scripts/bootstrap_v2.sh +++ b/scripts/bootstrap_v2.sh @@ -67,15 +67,22 @@ while [[ $# -gt 0 ]]; do values_file Optional. 
Values .yaml file to use, default: root/values.yaml Options: - -r, --target-revision cluster-forge git revision to seed into cluster-values/values.yaml file + -r, --target-revision cluster-forge git revision for ArgoCD to sync from options: [tag|commit_hash|branch_name], default: $LATEST_RELEASE + IMPORTANT: Only apps enabled in target revision will be deployed -s, --cluster-size options: [small|medium|large], default: medium Examples: $0 compute.amd.com values_custom.yaml --cluster-size=large $0 112.100.97.17.nip.io - $0 dev.example.com --cluster-size=small --target-revision=$LATEST_RELEASE - $0 dev.example.com -s=small -r=$LATEST_RELEASE + $0 dev.example.com --cluster-size=small --target-revision=v1.8.0 + $0 dev.example.com -s=small -r=feature-branch + + Target Revision Behavior: + • Bootstrap will deploy ArgoCD + cluster-forge parent app + • ArgoCD will sync ALL apps from the specified target revision + • Only apps enabled in target revision will be deployed + • Apps disabled in target revision will be pruned if they exist HELP_OUTPUT exit 0 ;; @@ -183,7 +190,7 @@ pre_cleanup() { kubectl delete job openbao-init-job -n cf-openbao --ignore-not-found=true # Clean up any bootstrap manifest files from previous runs - rm -f /tmp/cluster-forge-bootstrap.yaml + rm -f /tmp/cluster-forge-bootstrap.yaml /tmp/cluster-forge-parent-app.yaml echo "=== Pre-cleanup complete ===" echo "" @@ -234,49 +241,98 @@ render_cluster_forge_manifests() { echo "" } -# NEW: Smart Application of Sync Waves -apply_manifests_by_sync_wave() { - echo "=== Applying Manifests by Sync Wave ===" +# NEW: Render and Apply Only Essential Components for ArgoCD Takeover +render_cluster_forge_parent_app() { + echo "" + echo "=== Rendering cluster-forge Parent App ===" + echo "Using existing template: root/templates/cluster-forge.yaml" + echo "Target revision: $TARGET_REVISION" + echo "Cluster size: $CLUSTER_SIZE" - # Create required namespaces first - echo "Creating namespaces..." 
- kubectl create ns argocd --dry-run=client -o yaml | kubectl apply -f - - kubectl create ns cf-gitea --dry-run=client -o yaml | kubectl apply -f - - kubectl create ns cf-openbao --dry-run=client -o yaml | kubectl apply -f - + # Create minimal values for just rendering the cluster-forge parent app + local temp_values=$(mktemp) + cat > "$temp_values" < /tmp/cluster-forge-parent-app.yaml + + # Cleanup temp file + rm -f "$temp_values" + + echo "✅ cluster-forge parent app rendered to /tmp/cluster-forge-parent-app.yaml" +} - # Apply manifests - ArgoCD will handle sync waves naturally - echo "🎯 Applying all ClusterForge manifests..." - kubectl apply -f /tmp/cluster-forge-bootstrap.yaml +bootstrap_argocd_managed_approach() { + echo "=== ArgoCD-Managed Bootstrap ===" + echo "" + echo "🎯 Strategy: Let ArgoCD manage everything from target revision: $TARGET_REVISION" + echo " This ensures only apps enabled in target revision are deployed" + echo "" + # Create argocd namespace first + echo "Creating ArgoCD namespace..." + kubectl create ns argocd --dry-run=client -o yaml | kubectl apply -f - + + # Step 1: Deploy ArgoCD itself from local render (needed to bootstrap) echo "" - echo "🎉 Bootstrap manifests applied!" - echo "ArgoCD will now orchestrate the deployment using sync waves." + echo "📦 Step 1: Deploying ArgoCD..." + awk '/name: argocd$/,/^---$/{if(/^---$/ && NR>1) exit; print}' /tmp/cluster-forge-bootstrap.yaml | \ + kubectl apply -f - + + # Wait for ArgoCD to be ready + echo "⏳ Waiting for ArgoCD to become ready..." + kubectl rollout status statefulset/argocd-application-controller -n argocd --timeout=300s + kubectl rollout status deploy/argocd-repo-server -n argocd --timeout=300s + kubectl rollout status deploy/argocd-redis -n argocd --timeout=300s + echo "✅ ArgoCD ready!" 
+ + # Step 2: Render and apply the cluster-forge parent app using the template echo "" - echo "=== Monitoring Key Components ===" - - # Wait for ArgoCD to be ready (it should be in the bootstrap manifests) - if kubectl get statefulset argocd-application-controller -n argocd >/dev/null 2>&1; then - echo "⏳ Waiting for ArgoCD Application Controller..." - kubectl rollout status statefulset/argocd-application-controller -n argocd --timeout=300s - echo "✅ ArgoCD Application Controller ready" - fi + echo "📦 Step 2: Rendering and applying cluster-forge parent application..." + echo " Using template: root/templates/cluster-forge.yaml" + echo " This will sync from target revision and manage all child apps" - if kubectl get deployment argocd-repo-server -n argocd >/dev/null 2>&1; then - echo "⏳ Waiting for ArgoCD Repo Server..." - kubectl rollout status deploy/argocd-repo-server -n argocd --timeout=300s - echo "✅ ArgoCD Repo Server ready" - fi + # Render the cluster-forge parent app using the existing template + render_cluster_forge_parent_app + + # Apply the rendered cluster-forge parent app + kubectl apply -f /tmp/cluster-forge-parent-app.yaml - # Monitor progress of core applications + echo "✅ cluster-forge parent application applied!" + + echo "" + echo "🎉 ArgoCD-Managed Bootstrap Complete!" echo "" - echo "📊 Core applications will be deployed by ArgoCD in sync wave order:" - echo " Wave -5: CRDs and operators" - echo " Wave -4: Core infrastructure (ArgoCD, OpenBao, External Secrets)" - echo " Wave -3: Network and storage" - echo " Wave -2: Configuration and secrets management" - echo " Wave -1: Application dependencies" - echo " Wave 0: Applications" + echo "📋 What happens now:" + echo " 1. ✅ ArgoCD is running and managing the cluster" + echo " 2. 🎯 cluster-forge app will sync from: $TARGET_REVISION" + echo " 3. 📦 ONLY apps enabled in $TARGET_REVISION will be deployed" + echo " 4. ⚡ Sync waves will ensure proper deployment order" + echo " 5. 
🔄 ArgoCD will automatically prune apps disabled in target revision" echo "" + echo "🚀 The cluster will now converge to the exact state defined in target revision!" } # NEW: Post-Bootstrap Status Check @@ -295,8 +351,12 @@ show_bootstrap_summary() { echo " 1. Monitor ArgoCD applications: kubectl get apps -n argocd" echo " 2. Check sync status: kubectl get apps -n argocd -o wide" echo " 3. View ArgoCD UI for detailed deployment progress" + echo " 4. ArgoCD will sync apps from target revision: $TARGET_REVISION" + echo " (Only apps enabled in that revision will be deployed)" echo "" - echo "🧹 Cleanup: Bootstrap manifest saved at /tmp/cluster-forge-bootstrap.yaml" + echo "🧹 Cleanup: Bootstrap manifests saved at:" + echo " - /tmp/cluster-forge-bootstrap.yaml (all apps rendered)" + echo " - /tmp/cluster-forge-parent-app.yaml (parent app only)" echo "" echo "This is the way! 🚀" } @@ -310,9 +370,9 @@ main() { # Run pre-cleanup (removing till refined) # pre_cleanup - # NEW APPROACH: Single ArgoCD-native rendering and application + # NEW APPROACH: Render locally, but only bootstrap ArgoCD + parent app render_cluster_forge_manifests - apply_manifests_by_sync_wave + bootstrap_argocd_managed_approach # Show final status show_bootstrap_summary From 72cb3c822a973aafbfab01432060961e0f98c112 Mon Sep 17 00:00:00 2001 From: Daniel Vaskivaara Date: Fri, 27 Feb 2026 13:05:18 +0200 Subject: [PATCH 045/115] qa: verify new bootstrap by removing component on branch (AIRM) --- root/values.yaml | 2 +- root/values_medium.yaml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/root/values.yaml b/root/values.yaml index aa667d19..89f3f158 100644 --- a/root/values.yaml +++ b/root/values.yaml @@ -12,7 +12,7 @@ global: clusterSize: # to be filled by bootstrap script (small, medium, large) enabledApps: - aim-cluster-model-source - - airm + #- airm - amd-gpu-operator - amd-gpu-operator-config - appwrapper diff --git a/root/values_medium.yaml b/root/values_medium.yaml index 
83e7ea95..d6d1964f 100644 --- a/root/values_medium.yaml +++ b/root/values_medium.yaml @@ -5,7 +5,7 @@ # Medium & Small clusters add local-path storage policy for RWX→RWO conversion enabledApps: - aim-cluster-model-source - - airm + #- airm - amd-gpu-operator - amd-gpu-operator-config - appwrapper From a9232350c0fec09e8727e9f1dc8a0a9a8fcdedb1 Mon Sep 17 00:00:00 2001 From: Daniel Vaskivaara Date: Fri, 27 Feb 2026 13:20:00 +0200 Subject: [PATCH 046/115] fix: integrate Opoenbao and Gitea in to bootstrap_v2 --- scripts/bootstrap_v2.sh | 298 +++++++++++++++++++++++++++++++++++++--- 1 file changed, 282 insertions(+), 16 deletions(-) diff --git a/scripts/bootstrap_v2.sh b/scripts/bootstrap_v2.sh index 33c93f44..80b20afd 100755 --- a/scripts/bootstrap_v2.sh +++ b/scripts/bootstrap_v2.sh @@ -79,7 +79,10 @@ while [[ $# -gt 0 ]]; do $0 dev.example.com -s=small -r=feature-branch Target Revision Behavior: - • Bootstrap will deploy ArgoCD + cluster-forge parent app + • Bootstrap will deploy ArgoCD + essential infrastructure (Gitea, OpenBao) + • Gitea will be initialized to provide git repositories for ArgoCD + • OpenBao will be initialized to provide secrets management + • cluster-forge parent app will then be deployed to manage remaining apps • ArgoCD will sync ALL apps from the specified target revision • Only apps enabled in target revision will be deployed • Apps disabled in target revision will be pruned if they exist @@ -190,7 +193,8 @@ pre_cleanup() { kubectl delete job openbao-init-job -n cf-openbao --ignore-not-found=true # Clean up any bootstrap manifest files from previous runs - rm -f /tmp/cluster-forge-bootstrap.yaml /tmp/cluster-forge-parent-app.yaml + rm -f /tmp/cluster-forge-bootstrap.yaml /tmp/cluster-forge-parent-app.yaml \ + /tmp/argocd-app.yaml /tmp/gitea-app.yaml /tmp/openbao-app.yaml echo "=== Pre-cleanup complete ===" echo "" @@ -289,6 +293,7 @@ bootstrap_argocd_managed_approach() { echo "" echo "🎯 Strategy: Let ArgoCD manage everything from target 
revision: $TARGET_REVISION" echo " This ensures only apps enabled in target revision are deployed" + echo " Note: Using yq for reliable ArgoCD application extraction only" echo "" # Create argocd namespace first @@ -298,8 +303,35 @@ bootstrap_argocd_managed_approach() { # Step 1: Deploy ArgoCD itself from local render (needed to bootstrap) echo "" echo "📦 Step 1: Deploying ArgoCD..." - awk '/name: argocd$/,/^---$/{if(/^---$/ && NR>1) exit; print}' /tmp/cluster-forge-bootstrap.yaml | \ - kubectl apply -f - + + # Use yq to reliably extract the ArgoCD application (acceptable for ArgoCD setup) + echo "Extracting ArgoCD application using yq (for bootstrap reliability only)..." + + # Check if yq command is available + if command -v yq >/dev/null 2>&1; then + YQ_CMD="yq" + elif [ -f "$HOME/yq" ]; then + YQ_CMD="$HOME/yq" + else + echo "ERROR: yq command not found. Please install yq or place it in $HOME/yq" + exit 1 + fi + + # Extract ArgoCD application using yq + $YQ_CMD eval 'select(.kind == "Application" and .metadata.name == "argocd")' /tmp/cluster-forge-bootstrap.yaml > /tmp/argocd-app.yaml + + # Verify we got a valid ArgoCD application + if [ -s /tmp/argocd-app.yaml ] && grep -q "kind: Application" /tmp/argocd-app.yaml; then + echo "✅ Extracted ArgoCD application using yq" + kubectl apply -f /tmp/argocd-app.yaml + else + echo "ERROR: Could not extract ArgoCD application from rendered manifests" + echo "Available applications:" + $YQ_CMD eval '.metadata.name' /tmp/cluster-forge-bootstrap.yaml | grep -v "null" | head -10 + exit 1 + fi + + rm -f /tmp/argocd-app.yaml # Wait for ArgoCD to be ready echo "⏳ Waiting for ArgoCD to become ready..." @@ -308,19 +340,26 @@ bootstrap_argocd_managed_approach() { kubectl rollout status deploy/argocd-redis -n argocd --timeout=300s echo "✅ ArgoCD ready!" 
- # Step 2: Render and apply the cluster-forge parent app using the template + # Step 2: Deploy essential infrastructure first (Gitea, then OpenBao) echo "" - echo "📦 Step 2: Rendering and applying cluster-forge parent application..." - echo " Using template: root/templates/cluster-forge.yaml" - echo " This will sync from target revision and manage all child apps" + echo "📦 Step 2: Deploying essential infrastructure..." + echo " Note: Gitea must be deployed first to provide git repositories for ArgoCD" - # Render the cluster-forge parent app using the existing template - render_cluster_forge_parent_app - - # Apply the rendered cluster-forge parent app - kubectl apply -f /tmp/cluster-forge-parent-app.yaml + # Extract and deploy Gitea first (it's needed for git repositories) + echo " Extracting and deploying Gitea..." + $YQ_CMD eval 'select(.kind == "Application" and .metadata.name == "gitea")' /tmp/cluster-forge-bootstrap.yaml > /tmp/gitea-app.yaml + kubectl apply -f /tmp/gitea-app.yaml + + # Extract and deploy OpenBao (needed for secrets) + echo " Extracting and deploying OpenBao..." + $YQ_CMD eval 'select(.kind == "Application" and .metadata.name == "openbao")' /tmp/cluster-forge-bootstrap.yaml > /tmp/openbao-app.yaml + kubectl apply -f /tmp/openbao-app.yaml + + echo "✅ Essential infrastructure applications deployed" - echo "✅ cluster-forge parent application applied!" + # Step 3: Wait for infrastructure and initialize + echo "" + echo "📦 Step 3: Initializing essential infrastructure..." echo "" echo "🎉 ArgoCD-Managed Bootstrap Complete!" @@ -331,8 +370,233 @@ bootstrap_argocd_managed_approach() { echo " 3. 📦 ONLY apps enabled in $TARGET_REVISION will be deployed" echo " 4. ⚡ Sync waves will ensure proper deployment order" echo " 5. 
🔄 ArgoCD will automatically prune apps disabled in target revision" + # Wait for Gitea to be deployed and initialize (critical for git repositories) + wait_for_gitea_and_initialize + + # Wait for OpenBao to be deployed and initialize (critical for secrets) + wait_for_openbao_and_initialize + + # Step 4: Now apply the cluster-forge parent app (after git repositories exist) echo "" - echo "🚀 The cluster will now converge to the exact state defined in target revision!" + echo "📦 Step 4: Applying cluster-forge parent application..." + echo " Now that git repositories exist, ArgoCD can manage all remaining apps" + + # Render the cluster-forge parent app using the existing template + render_cluster_forge_parent_app + + # Apply the rendered cluster-forge parent app + kubectl apply -f /tmp/cluster-forge-parent-app.yaml + + echo "✅ cluster-forge parent application applied!" + echo "🚀 ArgoCD will now manage all remaining applications from target revision: $TARGET_REVISION" +} + +# Wait for Gitea to be deployed by ArgoCD and run initialization +wait_for_gitea_and_initialize() { + echo "" + echo "=== Gitea Initialization ===" + echo "Waiting for ArgoCD to deploy Gitea..." + + # Wait for Gitea Deployment to exist (deployed by ArgoCD) + echo "⏳ Waiting for Gitea Deployment to be created by ArgoCD..." + local timeout=300 + local elapsed=0 + while [ $elapsed -lt $timeout ]; do + if kubectl get deployment gitea -n cf-gitea >/dev/null 2>&1; then + echo "✅ Gitea Deployment found" + break + fi + sleep 5 + elapsed=$((elapsed + 5)) + echo " Waiting for ArgoCD to create Gitea Deployment... ($elapsed/$timeout seconds)" + done + + if [ $elapsed -ge $timeout ]; then + echo "⚠️ WARNING: Gitea Deployment not found after $timeout seconds" + echo " ArgoCD may still be syncing. Gitea init will be skipped." + echo " WARNING: Without Gitea initialization, ArgoCD may not have git repositories!" + return + fi + + # Wait for Gitea to be running + echo "⏳ Waiting for Gitea deployment to be ready..." 
+ if kubectl rollout status deploy/gitea -n cf-gitea --timeout=300s; then + echo "✅ Gitea is running" + else + echo "⚠️ WARNING: Gitea deployment not ready within timeout" + echo " Gitea initialization will be skipped." + echo " WARNING: ArgoCD may not have access to git repositories!" + return + fi + + # Now run the Gitea initialization (extracted from original bootstrap.sh) + echo "" + echo "📦 Running Gitea initialization..." + + # Check if yq is available for value extraction (needed for init) + if command -v yq >/dev/null 2>&1; then + YQ_CMD="yq" + elif [ -f "$HOME/yq" ]; then + YQ_CMD="$HOME/yq" + else + echo "ERROR: yq not found. Gitea initialization requires yq." + echo "Without Gitea initialization, ArgoCD will not have git repositories!" + exit 1 + fi + + # Generate admin password function (from original bootstrap.sh) + generate_password() { + openssl rand -hex 16 | tr 'a-f' 'A-F' | head -c 32 + } + + # Extract Gitea values for initialization + echo "Extracting Gitea values for initialization..." + $YQ_CMD eval '.apps.gitea.valuesObject' "${SOURCE_ROOT}/root/${VALUES_FILE}" > /tmp/gitea_values.yaml + $YQ_CMD eval '.apps.gitea.valuesObject' "${SOURCE_ROOT}/root/${SIZE_VALUES_FILE}" > /tmp/gitea_size_values.yaml + + # Create merged values configmap (needed by gitea-init-job) + echo "Creating initial-cf-values configmap..." + + # Recreate merged values like original bootstrap (needed for gitea-init-job) + local SIZE_VALUES_FILE="values_${CLUSTER_SIZE}.yaml" + if [ -n "$SIZE_VALUES_FILE" ]; then + # Merge base values with size-specific overrides + VALUES=$($YQ_CMD eval-all '. as $item ireduce ({}; . 
* $item)' \ + "${SOURCE_ROOT}/root/${VALUES_FILE}" \ + "${SOURCE_ROOT}/root/${SIZE_VALUES_FILE}" | \ + $YQ_CMD eval ".global.domain = \"${DOMAIN}\"") + else + # Use base values only + VALUES=$(cat "${SOURCE_ROOT}/root/${VALUES_FILE}" | $YQ_CMD ".global.domain = \"${DOMAIN}\"") + fi + + # Apply the target revision override + VALUES=$(echo "$VALUES" | $YQ_CMD eval ".clusterForge.targetRevision = \"${TARGET_REVISION}\"") + + kubectl create configmap initial-cf-values --from-literal=initial-cf-values="$VALUES" --dry-run=client -o yaml | kubectl apply -n cf-gitea -f - + + # Create Gitea admin credentials + echo "Creating Gitea admin credentials..." + kubectl create secret generic gitea-admin-credentials \ + --namespace=cf-gitea \ + --from-literal=username=silogen-admin \ + --from-literal=password=$(generate_password) \ + --dry-run=client -o yaml | kubectl apply -f - + + # Run Gitea initialization job + echo "Deploying Gitea initialization job..." + helm template --release-name gitea-init "${SOURCE_ROOT}/scripts/init-gitea-job" \ + --set clusterSize="${SIZE_VALUES_FILE}" \ + --set domain="${DOMAIN}" \ + --set targetRevision="${TARGET_REVISION}" \ + --kube-version="${KUBE_VERSION}" | kubectl apply -f - + + # Wait for initialization to complete + echo "⏳ Waiting for Gitea initialization to complete..." + if kubectl wait --for=condition=complete --timeout=300s job/gitea-init-job -n cf-gitea; then + echo "✅ Gitea initialization completed successfully" + echo "📦 Git repositories are now available for ArgoCD" + else + echo "❌ ERROR: Gitea initialization timed out or failed" + echo " This is CRITICAL - ArgoCD needs git repositories to function!" 
+ echo " Check job status: kubectl describe job gitea-init-job -n cf-gitea" + echo " Check logs: kubectl logs -l job-name=gitea-init-job -n cf-gitea" + exit 1 + fi + + # Cleanup temporary files + rm -f /tmp/gitea_values.yaml /tmp/gitea_size_values.yaml + + echo "📦 Gitea initialization phase complete" +} + +# Wait for OpenBao to be deployed by ArgoCD and run initialization +wait_for_openbao_and_initialize() { + echo "" + echo "=== OpenBao Initialization ===" + echo "Waiting for ArgoCD to deploy OpenBao..." + + # Wait for OpenBao StatefulSet to exist (deployed by ArgoCD) + echo "⏳ Waiting for OpenBao StatefulSet to be created by ArgoCD..." + local timeout=300 + local elapsed=0 + while [ $elapsed -lt $timeout ]; do + if kubectl get statefulset openbao -n cf-openbao >/dev/null 2>&1; then + echo "✅ OpenBao StatefulSet found" + break + fi + sleep 5 + elapsed=$((elapsed + 5)) + echo " Waiting for ArgoCD to create OpenBao StatefulSet... ($elapsed/$timeout seconds)" + done + + if [ $elapsed -ge $timeout ]; then + echo "⚠️ WARNING: OpenBao StatefulSet not found after $timeout seconds" + echo " ArgoCD may still be syncing. OpenBao init will be skipped." + echo " You may need to run OpenBao initialization manually later." + return + fi + + # Wait for OpenBao to be running + echo "⏳ Waiting for OpenBao pod to be ready..." + if kubectl wait --for=jsonpath='{.status.phase}'=Running pod/openbao-0 -n cf-openbao --timeout=300s; then + echo "✅ OpenBao is running" + else + echo "⚠️ WARNING: OpenBao pod not ready within timeout" + echo " OpenBao initialization will be skipped." + return + fi + + # Now run the OpenBao initialization (extracted from original bootstrap.sh) + echo "" + echo "🔐 Running OpenBao initialization..." + + # Check if yq is available for value extraction (needed for init) + if command -v yq >/dev/null 2>&1; then + YQ_CMD="yq" + elif [ -f "$HOME/yq" ]; then + YQ_CMD="$HOME/yq" + else + echo "WARNING: yq not found. Skipping OpenBao initialization." 
+ echo "You may need to initialize OpenBao manually." + return + fi + + # Extract OpenBao values for initialization (reusing existing logic) + echo "Extracting OpenBao values for initialization..." + $YQ_CMD eval '.apps.openbao.valuesObject' "${SOURCE_ROOT}/root/${VALUES_FILE}" > /tmp/openbao_values.yaml + $YQ_CMD eval '.apps.openbao.valuesObject' "${SOURCE_ROOT}/root/${SIZE_VALUES_FILE}" > /tmp/openbao_size_values.yaml + + # Create initial secrets config for init job (separate from ArgoCD-managed version) + echo "Creating initial OpenBao secrets configuration..." + cat "${SOURCE_ROOT}/sources/openbao-config/0.1.0/templates/openbao-secret-manager-cm.yaml" | \ + sed "s|name: openbao-secret-manager-scripts|name: openbao-secret-manager-scripts-init|g" | kubectl apply -f - + + cat "${SOURCE_ROOT}/sources/openbao-config/0.1.0/templates/openbao-secret-definitions.yaml" | \ + sed "s|{{ .Values.domain }}|${DOMAIN}|g" | \ + sed "s|name: openbao-secrets-config|name: openbao-secrets-init-config|g" | kubectl apply -f - + + # Run OpenBao initialization job + echo "Deploying OpenBao initialization job..." + helm template --release-name openbao-init "${SOURCE_ROOT}/scripts/init-openbao-job" \ + -f /tmp/openbao_values.yaml \ + --set domain="${DOMAIN}" \ + --kube-version="${KUBE_VERSION}" | kubectl apply -f - + + # Wait for initialization to complete + echo "⏳ Waiting for OpenBao initialization to complete..." 
+ if kubectl wait --for=condition=complete --timeout=300s job/openbao-init-job -n cf-openbao; then + echo "✅ OpenBao initialization completed successfully" + else + echo "⚠️ WARNING: OpenBao initialization timed out or failed" + echo " Check job status: kubectl describe job openbao-init-job -n cf-openbao" + echo " Check logs: kubectl logs -l job-name=openbao-init-job -n cf-openbao" + fi + + # Cleanup temporary files + rm -f /tmp/openbao_values.yaml /tmp/openbao_size_values.yaml + + echo "🔐 OpenBao initialization phase complete" } # NEW: Post-Bootstrap Status Check @@ -351,8 +615,10 @@ show_bootstrap_summary() { echo " 1. Monitor ArgoCD applications: kubectl get apps -n argocd" echo " 2. Check sync status: kubectl get apps -n argocd -o wide" echo " 3. View ArgoCD UI for detailed deployment progress" - echo " 4. ArgoCD will sync apps from target revision: $TARGET_REVISION" + echo " 4. ArgoCD is syncing apps from target revision: $TARGET_REVISION" echo " (Only apps enabled in that revision will be deployed)" + echo " 5. Gitea provides git repositories: https://gitea.${DOMAIN}" + echo " 6. Essential infrastructure (Gitea, OpenBao) is initialized" echo "" echo "🧹 Cleanup: Bootstrap manifests saved at:" echo " - /tmp/cluster-forge-bootstrap.yaml (all apps rendered)" From 7193db6e1631b374cda1799d1447b3407c08fe2e Mon Sep 17 00:00:00 2001 From: Daniel Vaskivaara Date: Fri, 27 Feb 2026 15:18:54 +0200 Subject: [PATCH 047/115] fix: gitea bootstraping --- scripts/bootstrap_v2.sh | 149 +++++++++++++++++++++++++++++----------- 1 file changed, 108 insertions(+), 41 deletions(-) diff --git a/scripts/bootstrap_v2.sh b/scripts/bootstrap_v2.sh index 80b20afd..c8ab57ad 100755 --- a/scripts/bootstrap_v2.sh +++ b/scripts/bootstrap_v2.sh @@ -340,26 +340,24 @@ bootstrap_argocd_managed_approach() { kubectl rollout status deploy/argocd-redis -n argocd --timeout=300s echo "✅ ArgoCD ready!" 
- # Step 2: Deploy essential infrastructure first (Gitea, then OpenBao) + # Step 2: Deploy Gitea directly (cannot be deployed by ArgoCD initially) echo "" - echo "📦 Step 2: Deploying essential infrastructure..." - echo " Note: Gitea must be deployed first to provide git repositories for ArgoCD" - - # Extract and deploy Gitea first (it's needed for git repositories) - echo " Extracting and deploying Gitea..." - $YQ_CMD eval 'select(.kind == "Application" and .metadata.name == "gitea")' /tmp/cluster-forge-bootstrap.yaml > /tmp/gitea-app.yaml - kubectl apply -f /tmp/gitea-app.yaml + echo "📦 Step 2: Deploying Gitea directly..." + echo " Note: Gitea must be deployed directly to provide git repositories for ArgoCD" + + deploy_gitea_directly - # Extract and deploy OpenBao (needed for secrets) + # Step 3: Deploy OpenBao via ArgoCD (it can be managed by ArgoCD) + echo "" + echo "📦 Step 3: Deploying OpenBao via ArgoCD..." echo " Extracting and deploying OpenBao..." $YQ_CMD eval 'select(.kind == "Application" and .metadata.name == "openbao")' /tmp/cluster-forge-bootstrap.yaml > /tmp/openbao-app.yaml kubectl apply -f /tmp/openbao-app.yaml - - echo "✅ Essential infrastructure applications deployed" + echo "✅ OpenBao application deployed" - # Step 3: Wait for infrastructure and initialize + # Step 4: Initialize infrastructure echo "" - echo "📦 Step 3: Initializing essential infrastructure..." + echo "📦 Step 4: Initializing essential infrastructure..." echo "" echo "🎉 ArgoCD-Managed Bootstrap Complete!" @@ -391,44 +389,113 @@ bootstrap_argocd_managed_approach() { echo "🚀 ArgoCD will now manage all remaining applications from target revision: $TARGET_REVISION" } -# Wait for Gitea to be deployed by ArgoCD and run initialization -wait_for_gitea_and_initialize() { +# Deploy Gitea directly using helm (cannot rely on ArgoCD initially) +deploy_gitea_directly() { echo "" - echo "=== Gitea Initialization ===" - echo "Waiting for ArgoCD to deploy Gitea..." 
+ echo "=== Direct Gitea Deployment ===" + echo "Gitea must be deployed directly since ArgoCD needs git repositories to function" - # Wait for Gitea Deployment to exist (deployed by ArgoCD) - echo "⏳ Waiting for Gitea Deployment to be created by ArgoCD..." - local timeout=300 - local elapsed=0 - while [ $elapsed -lt $timeout ]; do - if kubectl get deployment gitea -n cf-gitea >/dev/null 2>&1; then - echo "✅ Gitea Deployment found" - break - fi - sleep 5 - elapsed=$((elapsed + 5)) - echo " Waiting for ArgoCD to create Gitea Deployment... ($elapsed/$timeout seconds)" - done + # Create cf-gitea namespace first + echo "Creating cf-gitea namespace..." + kubectl create ns cf-gitea --dry-run=client -o yaml | kubectl apply -f - - if [ $elapsed -ge $timeout ]; then - echo "⚠️ WARNING: Gitea Deployment not found after $timeout seconds" - echo " ArgoCD may still be syncing. Gitea init will be skipped." - echo " WARNING: Without Gitea initialization, ArgoCD may not have git repositories!" - return + # Check if yq is available for value extraction + if command -v yq >/dev/null 2>&1; then + YQ_CMD="yq" + elif [ -f "$HOME/yq" ]; then + YQ_CMD="$HOME/yq" + else + echo "ERROR: yq command not found. Please install yq or place it in $HOME/yq" + exit 1 fi - # Wait for Gitea to be running + # Extract Gitea version from values + echo "Extracting Gitea configuration..." + + # Create merged values for version extraction (similar to original bootstrap.sh) + local SIZE_VALUES_FILE="values_${CLUSTER_SIZE}.yaml" + if [ -n "$SIZE_VALUES_FILE" ] && [ -f "${SOURCE_ROOT}/root/${SIZE_VALUES_FILE}" ]; then + # Merge base values with size-specific overrides + $YQ_CMD eval-all '. as $item ireduce ({}; . 
* $item)' \ + "${SOURCE_ROOT}/root/${VALUES_FILE}" \ + "${SOURCE_ROOT}/root/${SIZE_VALUES_FILE}" | \ + $YQ_CMD eval ".global.domain = \"${DOMAIN}\"" > /tmp/merged_values.yaml + else + # Use base values only + cat "${SOURCE_ROOT}/root/${VALUES_FILE}" | $YQ_CMD ".global.domain = \"${DOMAIN}\"" > /tmp/merged_values.yaml + fi + + # Extract Gitea version from merged values + GITEA_VERSION=$($YQ_CMD eval '.apps.gitea.path' /tmp/merged_values.yaml | cut -d'/' -f2) + + if [ -z "$GITEA_VERSION" ]; then + echo "ERROR: Could not extract Gitea version from values" + exit 1 + fi + + echo "Using Gitea version: $GITEA_VERSION" + + # Extract Gitea-specific values for helm template + $YQ_CMD eval '.apps.gitea.valuesObject' "${SOURCE_ROOT}/root/${VALUES_FILE}" > /tmp/gitea_values.yaml + + if [ -f "${SOURCE_ROOT}/root/${SIZE_VALUES_FILE}" ]; then + $YQ_CMD eval '.apps.gitea.valuesObject' "${SOURCE_ROOT}/root/${SIZE_VALUES_FILE}" > /tmp/gitea_size_values.yaml + else + # Create empty file if size-specific values don't exist + echo "{}" > /tmp/gitea_size_values.yaml + fi + + # Deploy Gitea directly using helm template + echo "🚀 Deploying Gitea using helm template..." + helm template --release-name gitea "${SOURCE_ROOT}/sources/gitea/${GITEA_VERSION}" --namespace cf-gitea \ + -f /tmp/gitea_values.yaml \ + -f /tmp/gitea_size_values.yaml \ + --set gitea.config.server.ROOT_URL="https://gitea.${DOMAIN}/" \ + --kube-version="${KUBE_VERSION}" | kubectl apply -f - + + # Wait for Gitea deployment to be ready echo "⏳ Waiting for Gitea deployment to be ready..." if kubectl rollout status deploy/gitea -n cf-gitea --timeout=300s; then - echo "✅ Gitea is running" + echo "✅ Gitea deployment is ready" else - echo "⚠️ WARNING: Gitea deployment not ready within timeout" - echo " Gitea initialization will be skipped." - echo " WARNING: ArgoCD may not have access to git repositories!" 
- return + echo "❌ ERROR: Gitea deployment failed to become ready" + echo "Check deployment status: kubectl get deployment gitea -n cf-gitea" + echo "Check logs: kubectl logs -l app=gitea -n cf-gitea" + exit 1 + fi + + # Cleanup temporary files + rm -f /tmp/gitea_values.yaml /tmp/gitea_size_values.yaml /tmp/merged_values.yaml + + echo "✅ Gitea deployed directly and ready for initialization" +} + +# Initialize Gitea after direct deployment +wait_for_gitea_and_initialize() { + echo "" + echo "=== Gitea Initialization ===" + echo "Gitea has been deployed directly - proceeding with initialization..." + + # Verify Gitea deployment exists (should already be deployed directly) + if ! kubectl get deployment gitea -n cf-gitea >/dev/null 2>&1; then + echo "❌ ERROR: Gitea deployment not found" + echo " This should not happen - deploy_gitea_directly should have created it" + exit 1 fi + echo "✅ Gitea Deployment confirmed" + + # Gitea should already be ready from direct deployment, but double-check + echo "⏳ Verifying Gitea is ready..." + if ! kubectl rollout status deploy/gitea -n cf-gitea --timeout=60s; then + echo "❌ ERROR: Gitea deployment not ready" + echo " Check deployment status: kubectl get deployment gitea -n cf-gitea" + echo " Check logs: kubectl logs -l app=gitea -n cf-gitea" + exit 1 + fi + + echo "✅ Gitea is ready for initialization" + # Now run the Gitea initialization (extracted from original bootstrap.sh) echo "" echo "📦 Running Gitea initialization..." 
From e3c1ba5cbbf8c1593e940d35863b9759da114cea Mon Sep 17 00:00:00 2001 From: Daniel Vaskivaara Date: Fri, 27 Feb 2026 15:27:10 +0200 Subject: [PATCH 048/115] fix: openbao needed before Gitea --- scripts/bootstrap_v2.sh | 67 ++++++++++++++++++++++++----------------- 1 file changed, 39 insertions(+), 28 deletions(-) diff --git a/scripts/bootstrap_v2.sh b/scripts/bootstrap_v2.sh index c8ab57ad..9fd38310 100755 --- a/scripts/bootstrap_v2.sh +++ b/scripts/bootstrap_v2.sh @@ -340,24 +340,33 @@ bootstrap_argocd_managed_approach() { kubectl rollout status deploy/argocd-redis -n argocd --timeout=300s echo "✅ ArgoCD ready!" - # Step 2: Deploy Gitea directly (cannot be deployed by ArgoCD initially) + # Step 2: Deploy OpenBao via ArgoCD (must be initialized before Gitea for secrets) echo "" - echo "📦 Step 2: Deploying Gitea directly..." - echo " Note: Gitea must be deployed directly to provide git repositories for ArgoCD" - - deploy_gitea_directly - - # Step 3: Deploy OpenBao via ArgoCD (it can be managed by ArgoCD) - echo "" - echo "📦 Step 3: Deploying OpenBao via ArgoCD..." + echo "📦 Step 2: Deploying OpenBao via ArgoCD..." + echo " Note: OpenBao must be initialized first to provide secrets for ExternalSecrets" echo " Extracting and deploying OpenBao..." $YQ_CMD eval 'select(.kind == "Application" and .metadata.name == "openbao")' /tmp/cluster-forge-bootstrap.yaml > /tmp/openbao-app.yaml kubectl apply -f /tmp/openbao-app.yaml echo "✅ OpenBao application deployed" - # Step 4: Initialize infrastructure + # Wait for OpenBao to be deployed and initialize (critical for secrets before Gitea) + wait_for_openbao_and_initialize + + # Additional wait for ExternalSecrets to be ready after OpenBao initialization + echo "⏳ Allowing time for ExternalSecrets controller to sync with initialized OpenBao..." 
+ sleep 15 + echo "✅ OpenBao initialization and ExternalSecrets sync complete" + + # Step 3: Deploy Gitea directly (now that OpenBao secrets are available) echo "" - echo "📦 Step 4: Initializing essential infrastructure..." + echo "📦 Step 3: Deploying Gitea directly..." + echo " Note: Gitea must be deployed directly to provide git repositories for ArgoCD" + echo " OpenBao is now initialized, so ExternalSecrets can pull credentials" + + deploy_gitea_directly + + # Wait for Gitea to be deployed and initialize (critical for git repositories) + wait_for_gitea_and_initialize echo "" echo "🎉 ArgoCD-Managed Bootstrap Complete!" @@ -368,11 +377,6 @@ bootstrap_argocd_managed_approach() { echo " 3. 📦 ONLY apps enabled in $TARGET_REVISION will be deployed" echo " 4. ⚡ Sync waves will ensure proper deployment order" echo " 5. 🔄 ArgoCD will automatically prune apps disabled in target revision" - # Wait for Gitea to be deployed and initialize (critical for git repositories) - wait_for_gitea_and_initialize - - # Wait for OpenBao to be deployed and initialize (critical for secrets) - wait_for_openbao_and_initialize # Step 4: Now apply the cluster-forge parent app (after git repositories exist) echo "" @@ -399,6 +403,24 @@ deploy_gitea_directly() { echo "Creating cf-gitea namespace..." kubectl create ns cf-gitea --dry-run=client -o yaml | kubectl apply -f - + # Wait briefly for any ExternalSecrets to be processed now that OpenBao is initialized + echo "⏳ Waiting for ExternalSecrets to sync from OpenBao (if any)..." + sleep 10 + + # Generate password function (needed for manual secret creation) + generate_password() { + openssl rand -hex 16 | tr 'a-f' 'A-F' | head -c 32 + } + + # Create gitea-admin-credentials secret manually (ensures it exists regardless of ExternalSecret status) + echo "🔐 Creating gitea-admin-credentials secret..." 
+ kubectl create secret generic gitea-admin-credentials \ + --namespace=cf-gitea \ + --from-literal=username=silogen-admin \ + --from-literal=password=$(generate_password) \ + --dry-run=client -o yaml | kubectl apply -f - + echo "✅ gitea-admin-credentials secret created" + # Check if yq is available for value extraction if command -v yq >/dev/null 2>&1; then YQ_CMD="yq" @@ -511,11 +533,6 @@ wait_for_gitea_and_initialize() { exit 1 fi - # Generate admin password function (from original bootstrap.sh) - generate_password() { - openssl rand -hex 16 | tr 'a-f' 'A-F' | head -c 32 - } - # Extract Gitea values for initialization echo "Extracting Gitea values for initialization..." $YQ_CMD eval '.apps.gitea.valuesObject' "${SOURCE_ROOT}/root/${VALUES_FILE}" > /tmp/gitea_values.yaml @@ -542,13 +559,7 @@ wait_for_gitea_and_initialize() { kubectl create configmap initial-cf-values --from-literal=initial-cf-values="$VALUES" --dry-run=client -o yaml | kubectl apply -n cf-gitea -f - - # Create Gitea admin credentials - echo "Creating Gitea admin credentials..." - kubectl create secret generic gitea-admin-credentials \ - --namespace=cf-gitea \ - --from-literal=username=silogen-admin \ - --from-literal=password=$(generate_password) \ - --dry-run=client -o yaml | kubectl apply -f - + # Note: gitea-admin-credentials secret already created in deploy_gitea_directly function # Run Gitea initialization job echo "Deploying Gitea initialization job..." 
From a983c6d6a1bf0d1d165e05307189085461d5fe37 Mon Sep 17 00:00:00 2001 From: Daniel Vaskivaara Date: Fri, 27 Feb 2026 15:43:27 +0200 Subject: [PATCH 049/115] fix: openbao deployment --- scripts/bootstrap_v2.sh | 224 +++++++++++++++++++++------------------- 1 file changed, 119 insertions(+), 105 deletions(-) diff --git a/scripts/bootstrap_v2.sh b/scripts/bootstrap_v2.sh index 9fd38310..349aea67 100755 --- a/scripts/bootstrap_v2.sh +++ b/scripts/bootstrap_v2.sh @@ -340,22 +340,13 @@ bootstrap_argocd_managed_approach() { kubectl rollout status deploy/argocd-redis -n argocd --timeout=300s echo "✅ ArgoCD ready!" - # Step 2: Deploy OpenBao via ArgoCD (must be initialized before Gitea for secrets) + # Step 2: Deploy OpenBao directly (must be initialized before Gitea for secrets) echo "" - echo "📦 Step 2: Deploying OpenBao via ArgoCD..." - echo " Note: OpenBao must be initialized first to provide secrets for ExternalSecrets" - echo " Extracting and deploying OpenBao..." - $YQ_CMD eval 'select(.kind == "Application" and .metadata.name == "openbao")' /tmp/cluster-forge-bootstrap.yaml > /tmp/openbao-app.yaml - kubectl apply -f /tmp/openbao-app.yaml - echo "✅ OpenBao application deployed" - - # Wait for OpenBao to be deployed and initialize (critical for secrets before Gitea) - wait_for_openbao_and_initialize + echo "📦 Step 2: Deploying OpenBao directly..." + echo " Note: OpenBao must be deployed directly to provide secrets for ExternalSecrets" + echo " OpenBao cannot be deployed via ArgoCD as it's a fundamental dependency" - # Additional wait for ExternalSecrets to be ready after OpenBao initialization - echo "⏳ Allowing time for ExternalSecrets controller to sync with initialized OpenBao..." 
- sleep 15 - echo "✅ OpenBao initialization and ExternalSecrets sync complete" + deploy_openbao_directly # Step 3: Deploy Gitea directly (now that OpenBao secrets are available) echo "" @@ -393,6 +384,116 @@ bootstrap_argocd_managed_approach() { echo "🚀 ArgoCD will now manage all remaining applications from target revision: $TARGET_REVISION" } +# Deploy OpenBao directly using helm (cannot rely on ArgoCD for fundamental dependencies) +deploy_openbao_directly() { + echo "" + echo "=== Direct OpenBao Deployment ===" + echo "OpenBao must be deployed directly since it provides secrets for ExternalSecrets" + + # Create cf-openbao namespace first + echo "Creating cf-openbao namespace..." + kubectl create ns cf-openbao --dry-run=client -o yaml | kubectl apply -f - + + # Check if yq is available for value extraction + if command -v yq >/dev/null 2>&1; then + YQ_CMD="yq" + elif [ -f "$HOME/yq" ]; then + YQ_CMD="$HOME/yq" + else + echo "ERROR: yq command not found. Please install yq or place it in $HOME/yq" + exit 1 + fi + + # Create merged values for version extraction (similar to original bootstrap.sh) + local SIZE_VALUES_FILE="values_${CLUSTER_SIZE}.yaml" + if [ -n "$SIZE_VALUES_FILE" ] && [ -f "${SOURCE_ROOT}/root/${SIZE_VALUES_FILE}" ]; then + # Merge base values with size-specific overrides + $YQ_CMD eval-all '. as $item ireduce ({}; . 
* $item)' \ + "${SOURCE_ROOT}/root/${VALUES_FILE}" \ + "${SOURCE_ROOT}/root/${SIZE_VALUES_FILE}" | \ + $YQ_CMD eval ".global.domain = \"${DOMAIN}\"" > /tmp/merged_values.yaml + else + # Use base values only + cat "${SOURCE_ROOT}/root/${VALUES_FILE}" | $YQ_CMD ".global.domain = \"${DOMAIN}\"" > /tmp/merged_values.yaml + fi + + # Extract OpenBao version from merged values + OPENBAO_VERSION=$($YQ_CMD eval '.apps.openbao.path' /tmp/merged_values.yaml | cut -d'/' -f2) + + if [ -z "$OPENBAO_VERSION" ]; then + echo "ERROR: Could not extract OpenBao version from values" + exit 1 + fi + + echo "Using OpenBao version: $OPENBAO_VERSION" + + # Extract OpenBao-specific values for helm template + $YQ_CMD eval '.apps.openbao.valuesObject' "${SOURCE_ROOT}/root/${VALUES_FILE}" > /tmp/openbao_values.yaml + + if [ -f "${SOURCE_ROOT}/root/${SIZE_VALUES_FILE}" ]; then + $YQ_CMD eval '.apps.openbao.valuesObject' "${SOURCE_ROOT}/root/${SIZE_VALUES_FILE}" > /tmp/openbao_size_values.yaml + else + # Create empty file if size-specific values don't exist + echo "{}" > /tmp/openbao_size_values.yaml + fi + + # Deploy OpenBao directly using helm template (following original bootstrap.sh pattern) + echo "🚀 Deploying OpenBao using helm template..." + helm template --release-name openbao "${SOURCE_ROOT}/sources/openbao/${OPENBAO_VERSION}" --namespace cf-openbao \ + -f /tmp/openbao_values.yaml \ + -f /tmp/openbao_size_values.yaml \ + --set ui.enabled=true \ + --kube-version="${KUBE_VERSION}" | kubectl apply --server-side --field-manager=argocd-controller --force-conflicts -f - + + # Wait for OpenBao pod to be running + echo "⏳ Waiting for OpenBao pod to be ready..." 
+ if kubectl wait --for=jsonpath='{.status.phase}'=Running pod/openbao-0 -n cf-openbao --timeout=300s; then + echo "✅ OpenBao pod is running" + else + echo "❌ ERROR: OpenBao pod failed to start" + echo "Check pod status: kubectl get pod openbao-0 -n cf-openbao" + echo "Check logs: kubectl logs openbao-0 -n cf-openbao" + exit 1 + fi + + # Create initial secrets config for init job (following original bootstrap.sh pattern) + echo "Creating initial OpenBao secrets configuration..." + cat "${SOURCE_ROOT}/sources/openbao-config/0.1.0/templates/openbao-secret-manager-cm.yaml" | \ + sed "s|name: openbao-secret-manager-scripts|name: openbao-secret-manager-scripts-init|g" | kubectl apply -f - + + cat "${SOURCE_ROOT}/sources/openbao-config/0.1.0/templates/openbao-secret-definitions.yaml" | \ + sed "s|{{ .Values.domain }}|${DOMAIN}|g" | \ + sed "s|name: openbao-secrets-config|name: openbao-secrets-init-config|g" | kubectl apply -f - + + # Run OpenBao initialization job + echo "Deploying OpenBao initialization job..." + helm template --release-name openbao-init "${SOURCE_ROOT}/scripts/init-openbao-job" \ + -f /tmp/openbao_values.yaml \ + --set domain="${DOMAIN}" \ + --kube-version="${KUBE_VERSION}" | kubectl apply -f - + + # Wait for initialization to complete + echo "⏳ Waiting for OpenBao initialization to complete..." + if kubectl wait --for=condition=complete --timeout=300s job/openbao-init-job -n cf-openbao; then + echo "✅ OpenBao initialization completed successfully" + else + echo "❌ ERROR: OpenBao initialization timed out or failed" + echo "Check job status: kubectl describe job openbao-init-job -n cf-openbao" + echo "Check logs: kubectl logs -l job-name=openbao-init-job -n cf-openbao" + exit 1 + fi + + # Allow time for ExternalSecrets controller to sync with initialized OpenBao + echo "⏳ Allowing time for ExternalSecrets controller to sync with initialized OpenBao..." 
+ sleep 15 + echo "✅ OpenBao initialization and ExternalSecrets sync complete" + + # Cleanup temporary files + rm -f /tmp/openbao_values.yaml /tmp/openbao_size_values.yaml /tmp/merged_values.yaml + + echo "✅ OpenBao deployed directly and initialized successfully" +} + # Deploy Gitea directly using helm (cannot rely on ArgoCD initially) deploy_gitea_directly() { echo "" @@ -588,95 +689,6 @@ wait_for_gitea_and_initialize() { echo "📦 Gitea initialization phase complete" } -# Wait for OpenBao to be deployed by ArgoCD and run initialization -wait_for_openbao_and_initialize() { - echo "" - echo "=== OpenBao Initialization ===" - echo "Waiting for ArgoCD to deploy OpenBao..." - - # Wait for OpenBao StatefulSet to exist (deployed by ArgoCD) - echo "⏳ Waiting for OpenBao StatefulSet to be created by ArgoCD..." - local timeout=300 - local elapsed=0 - while [ $elapsed -lt $timeout ]; do - if kubectl get statefulset openbao -n cf-openbao >/dev/null 2>&1; then - echo "✅ OpenBao StatefulSet found" - break - fi - sleep 5 - elapsed=$((elapsed + 5)) - echo " Waiting for ArgoCD to create OpenBao StatefulSet... ($elapsed/$timeout seconds)" - done - - if [ $elapsed -ge $timeout ]; then - echo "⚠️ WARNING: OpenBao StatefulSet not found after $timeout seconds" - echo " ArgoCD may still be syncing. OpenBao init will be skipped." - echo " You may need to run OpenBao initialization manually later." - return - fi - - # Wait for OpenBao to be running - echo "⏳ Waiting for OpenBao pod to be ready..." - if kubectl wait --for=jsonpath='{.status.phase}'=Running pod/openbao-0 -n cf-openbao --timeout=300s; then - echo "✅ OpenBao is running" - else - echo "⚠️ WARNING: OpenBao pod not ready within timeout" - echo " OpenBao initialization will be skipped." - return - fi - - # Now run the OpenBao initialization (extracted from original bootstrap.sh) - echo "" - echo "🔐 Running OpenBao initialization..." 
- - # Check if yq is available for value extraction (needed for init) - if command -v yq >/dev/null 2>&1; then - YQ_CMD="yq" - elif [ -f "$HOME/yq" ]; then - YQ_CMD="$HOME/yq" - else - echo "WARNING: yq not found. Skipping OpenBao initialization." - echo "You may need to initialize OpenBao manually." - return - fi - - # Extract OpenBao values for initialization (reusing existing logic) - echo "Extracting OpenBao values for initialization..." - $YQ_CMD eval '.apps.openbao.valuesObject' "${SOURCE_ROOT}/root/${VALUES_FILE}" > /tmp/openbao_values.yaml - $YQ_CMD eval '.apps.openbao.valuesObject' "${SOURCE_ROOT}/root/${SIZE_VALUES_FILE}" > /tmp/openbao_size_values.yaml - - # Create initial secrets config for init job (separate from ArgoCD-managed version) - echo "Creating initial OpenBao secrets configuration..." - cat "${SOURCE_ROOT}/sources/openbao-config/0.1.0/templates/openbao-secret-manager-cm.yaml" | \ - sed "s|name: openbao-secret-manager-scripts|name: openbao-secret-manager-scripts-init|g" | kubectl apply -f - - - cat "${SOURCE_ROOT}/sources/openbao-config/0.1.0/templates/openbao-secret-definitions.yaml" | \ - sed "s|{{ .Values.domain }}|${DOMAIN}|g" | \ - sed "s|name: openbao-secrets-config|name: openbao-secrets-init-config|g" | kubectl apply -f - - - # Run OpenBao initialization job - echo "Deploying OpenBao initialization job..." - helm template --release-name openbao-init "${SOURCE_ROOT}/scripts/init-openbao-job" \ - -f /tmp/openbao_values.yaml \ - --set domain="${DOMAIN}" \ - --kube-version="${KUBE_VERSION}" | kubectl apply -f - - - # Wait for initialization to complete - echo "⏳ Waiting for OpenBao initialization to complete..." 
- if kubectl wait --for=condition=complete --timeout=300s job/openbao-init-job -n cf-openbao; then - echo "✅ OpenBao initialization completed successfully" - else - echo "⚠️ WARNING: OpenBao initialization timed out or failed" - echo " Check job status: kubectl describe job openbao-init-job -n cf-openbao" - echo " Check logs: kubectl logs -l job-name=openbao-init-job -n cf-openbao" - fi - - # Cleanup temporary files - rm -f /tmp/openbao_values.yaml /tmp/openbao_size_values.yaml - - echo "🔐 OpenBao initialization phase complete" -} - # NEW: Post-Bootstrap Status Check show_bootstrap_summary() { echo "=== ClusterForge Bootstrap Complete ===" @@ -688,6 +700,7 @@ show_bootstrap_summary() { echo "🌐 Access URLs:" echo " ArgoCD: https://argocd.${DOMAIN}" echo " Gitea: https://gitea.${DOMAIN}" + echo " OpenBao: https://openbao.${DOMAIN}" echo "" echo "📋 Next steps:" echo " 1. Monitor ArgoCD applications: kubectl get apps -n argocd" @@ -695,8 +708,9 @@ show_bootstrap_summary() { echo " 3. View ArgoCD UI for detailed deployment progress" echo " 4. ArgoCD is syncing apps from target revision: $TARGET_REVISION" echo " (Only apps enabled in that revision will be deployed)" - echo " 5. Gitea provides git repositories: https://gitea.${DOMAIN}" - echo " 6. Essential infrastructure (Gitea, OpenBao) is initialized" + echo " 5. Gitea provides git repositories: https://gitea.${DOMAIN}" + echo " 6. OpenBao provides secrets management: https://openbao.${DOMAIN}" + echo " 7. 
Essential infrastructure (OpenBao, Gitea) is initialized" echo "" echo "🧹 Cleanup: Bootstrap manifests saved at:" echo " - /tmp/cluster-forge-bootstrap.yaml (all apps rendered)" From 6a051a0cce7948c2402191cda5514297643d4bdc Mon Sep 17 00:00:00 2001 From: Daniel Vaskivaara Date: Fri, 27 Feb 2026 15:48:52 +0200 Subject: [PATCH 050/115] fix: argo crd ordering error --- scripts/bootstrap_v2.sh | 104 +++++++++++++++++++++++++++------------- 1 file changed, 71 insertions(+), 33 deletions(-) diff --git a/scripts/bootstrap_v2.sh b/scripts/bootstrap_v2.sh index 349aea67..1e7b3acd 100755 --- a/scripts/bootstrap_v2.sh +++ b/scripts/bootstrap_v2.sh @@ -288,26 +288,13 @@ EOF echo "✅ cluster-forge parent app rendered to /tmp/cluster-forge-parent-app.yaml" } -bootstrap_argocd_managed_approach() { - echo "=== ArgoCD-Managed Bootstrap ===" - echo "" - echo "🎯 Strategy: Let ArgoCD manage everything from target revision: $TARGET_REVISION" - echo " This ensures only apps enabled in target revision are deployed" - echo " Note: Using yq for reliable ArgoCD application extraction only" +# Deploy ArgoCD directly using helm (following original bootstrap.sh pattern) +deploy_argocd_directly() { echo "" - - # Create argocd namespace first - echo "Creating ArgoCD namespace..." - kubectl create ns argocd --dry-run=client -o yaml | kubectl apply -f - - - # Step 1: Deploy ArgoCD itself from local render (needed to bootstrap) - echo "" - echo "📦 Step 1: Deploying ArgoCD..." - - # Use yq to reliably extract the ArgoCD application (acceptable for ArgoCD setup) - echo "Extracting ArgoCD application using yq (for bootstrap reliability only)..." 
+ echo "=== Direct ArgoCD Deployment ===" + echo "ArgoCD must be deployed directly before it can manage other applications" - # Check if yq command is available + # Check if yq is available for value extraction if command -v yq >/dev/null 2>&1; then YQ_CMD="yq" elif [ -f "$HOME/yq" ]; then @@ -317,28 +304,79 @@ bootstrap_argocd_managed_approach() { exit 1 fi - # Extract ArgoCD application using yq - $YQ_CMD eval 'select(.kind == "Application" and .metadata.name == "argocd")' /tmp/cluster-forge-bootstrap.yaml > /tmp/argocd-app.yaml - - # Verify we got a valid ArgoCD application - if [ -s /tmp/argocd-app.yaml ] && grep -q "kind: Application" /tmp/argocd-app.yaml; then - echo "✅ Extracted ArgoCD application using yq" - kubectl apply -f /tmp/argocd-app.yaml + # Create merged values for version extraction (similar to original bootstrap.sh) + local SIZE_VALUES_FILE="values_${CLUSTER_SIZE}.yaml" + if [ -n "$SIZE_VALUES_FILE" ] && [ -f "${SOURCE_ROOT}/root/${SIZE_VALUES_FILE}" ]; then + # Merge base values with size-specific overrides + $YQ_CMD eval-all '. as $item ireduce ({}; . 
* $item)' \ + "${SOURCE_ROOT}/root/${VALUES_FILE}" \ + "${SOURCE_ROOT}/root/${SIZE_VALUES_FILE}" | \ + $YQ_CMD eval ".global.domain = \"${DOMAIN}\"" > /tmp/merged_values.yaml else - echo "ERROR: Could not extract ArgoCD application from rendered manifests" - echo "Available applications:" - $YQ_CMD eval '.metadata.name' /tmp/cluster-forge-bootstrap.yaml | grep -v "null" | head -10 + # Use base values only + cat "${SOURCE_ROOT}/root/${VALUES_FILE}" | $YQ_CMD ".global.domain = \"${DOMAIN}\"" > /tmp/merged_values.yaml + fi + + # Extract ArgoCD version from merged values + ARGOCD_VERSION=$($YQ_CMD eval '.apps.argocd.path' /tmp/merged_values.yaml | cut -d'/' -f2) + + if [ -z "$ARGOCD_VERSION" ]; then + echo "ERROR: Could not extract ArgoCD version from values" exit 1 fi - rm -f /tmp/argocd-app.yaml - - # Wait for ArgoCD to be ready + echo "Using ArgoCD version: $ARGOCD_VERSION" + + # Extract ArgoCD-specific values for helm template + $YQ_CMD eval '.apps.argocd.valuesObject' "${SOURCE_ROOT}/root/${VALUES_FILE}" > /tmp/argocd_values.yaml + + if [ -f "${SOURCE_ROOT}/root/${SIZE_VALUES_FILE}" ]; then + $YQ_CMD eval '.apps.argocd.valuesObject' "${SOURCE_ROOT}/root/${SIZE_VALUES_FILE}" > /tmp/argocd_size_values.yaml + else + # Create empty file if size-specific values don't exist + echo "{}" > /tmp/argocd_size_values.yaml + fi + + # Deploy ArgoCD directly using helm template (following original bootstrap.sh pattern) + echo "🚀 Deploying ArgoCD using helm template..." + helm template --release-name argocd "${SOURCE_ROOT}/sources/argocd/${ARGOCD_VERSION}" --namespace argocd \ + -f /tmp/argocd_values.yaml \ + -f /tmp/argocd_size_values.yaml \ + --set global.domain="argocd.${DOMAIN}" \ + --kube-version="${KUBE_VERSION}" | kubectl apply --server-side --field-manager=argocd-controller --force-conflicts -f - + + # Wait for ArgoCD components to be ready echo "⏳ Waiting for ArgoCD to become ready..." 
kubectl rollout status statefulset/argocd-application-controller -n argocd --timeout=300s - kubectl rollout status deploy/argocd-repo-server -n argocd --timeout=300s + kubectl rollout status deploy/argocd-applicationset-controller -n argocd --timeout=300s kubectl rollout status deploy/argocd-redis -n argocd --timeout=300s - echo "✅ ArgoCD ready!" + kubectl rollout status deploy/argocd-repo-server -n argocd --timeout=300s + + # Cleanup temporary files + rm -f /tmp/argocd_values.yaml /tmp/argocd_size_values.yaml /tmp/merged_values.yaml + + echo "✅ ArgoCD deployed directly and ready to manage applications" +} + +bootstrap_argocd_managed_approach() { + echo "=== Direct Infrastructure + ArgoCD-Managed Apps ===" + echo "" + echo "🎯 Strategy: Deploy fundamental infrastructure directly, then let ArgoCD manage apps" + echo " 1. Deploy ArgoCD, OpenBao, Gitea directly (fundamental dependencies)" + echo " 2. ArgoCD then manages remaining apps from target revision: $TARGET_REVISION" + echo " 3. This ensures only apps enabled in target revision are deployed" + echo "" + + # Create argocd namespace first + echo "Creating ArgoCD namespace..." + kubectl create ns argocd --dry-run=client -o yaml | kubectl apply -f - + + # Step 1: Deploy ArgoCD directly (cannot use ArgoCD Application before ArgoCD exists) + echo "" + echo "📦 Step 1: Deploying ArgoCD directly..." 
+ echo " Note: ArgoCD must be deployed directly before it can manage applications" + + deploy_argocd_directly # Step 2: Deploy OpenBao directly (must be initialized before Gitea for secrets) echo "" From b70834a397e51abeea977c215af229909d714604 Mon Sep 17 00:00:00 2001 From: Daniel Vaskivaara Date: Fri, 27 Feb 2026 21:29:54 +0200 Subject: [PATCH 051/115] fix: add back AIRM as enabled app after successful test --- root/values.yaml | 2 +- root/values_medium.yaml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/root/values.yaml b/root/values.yaml index 89f3f158..aa667d19 100644 --- a/root/values.yaml +++ b/root/values.yaml @@ -12,7 +12,7 @@ global: clusterSize: # to be filled by bootstrap script (small, medium, large) enabledApps: - aim-cluster-model-source - #- airm + - airm - amd-gpu-operator - amd-gpu-operator-config - appwrapper diff --git a/root/values_medium.yaml b/root/values_medium.yaml index d6d1964f..83e7ea95 100644 --- a/root/values_medium.yaml +++ b/root/values_medium.yaml @@ -5,7 +5,7 @@ # Medium & Small clusters add local-path storage policy for RWX→RWO conversion enabledApps: - aim-cluster-model-source - #- airm + - airm - amd-gpu-operator - amd-gpu-operator-config - appwrapper From e62f54d403d25b7e2024adcd50fb4ce41c24f40a Mon Sep 17 00:00:00 2001 From: Daniel Vaskivaara Date: Fri, 27 Feb 2026 21:31:43 +0200 Subject: [PATCH 052/115] feat(bootstrap): only deploy cluster-forge Argo app, children sync via Argocd; rm openbao explicit bootstrap and allow argo to handle --- scripts/bootstrap.sh | 129 +++++++++++++++++++++++++------------------ 1 file changed, 76 insertions(+), 53 deletions(-) diff --git a/scripts/bootstrap.sh b/scripts/bootstrap.sh index d843dd93..58c5d0aa 100755 --- a/scripts/bootstrap.sh +++ b/scripts/bootstrap.sh @@ -67,7 +67,7 @@ while [[ $# -gt 0 ]]; do values_file Optional. 
Values .yaml file to use, default: root/values.yaml Options: - -r, --target-revision cluster-forge git revision to seed into cluster-values/values.yaml file + -r, --target-revision cluster-forge git revision for ArgoCD to sync from options: [tag|commit_hash|branch_name], default: $LATEST_RELEASE -s, --cluster-size options: [small|medium|large], default: medium @@ -76,6 +76,12 @@ while [[ $# -gt 0 ]]; do $0 112.100.97.17.nip.io $0 dev.example.com --cluster-size=small --target-revision=$LATEST_RELEASE $0 dev.example.com -s=small -r=$LATEST_RELEASE + + Bootstrap Behavior: + • Bootstrap deploys ArgoCD + Gitea directly (essential infrastructure) + • cluster-forge parent app then deployed to manage remaining apps + • ArgoCD syncs ALL apps from specified target revision + • OpenBao and other apps deploy via ArgoCD (not directly) HELP_OUTPUT exit 0 ;; @@ -267,10 +273,9 @@ get_openbao_value() { # Extract version information from app paths extract_app_versions() { ARGOCD_VERSION=$($YQ_CMD eval '.apps.argocd.path' /tmp/merged_values.yaml | cut -d'/' -f2) - OPENBAO_VERSION=$($YQ_CMD eval '.apps.openbao.path' /tmp/merged_values.yaml | cut -d'/' -f2) GITEA_VERSION=$($YQ_CMD eval '.apps.gitea.path' /tmp/merged_values.yaml | cut -d'/' -f2) - echo "Extracted versions - ArgoCD: $ARGOCD_VERSION, OpenBao: $OPENBAO_VERSION, Gitea: $GITEA_VERSION" + echo "Extracted versions - ArgoCD: $ARGOCD_VERSION, Gitea: $GITEA_VERSION" } # Merge values files early so all subsequent operations can use the merged config @@ -279,10 +284,10 @@ merge_values_files # Extract version information from merged values extract_app_versions -# Create namespaces +# Create namespaces for direct deployments only kubectl create ns argocd --dry-run=client -o yaml | kubectl apply -f - kubectl create ns cf-gitea --dry-run=client -o yaml | kubectl apply -f - -kubectl create ns cf-openbao --dry-run=client -o yaml | kubectl apply -f - +# Note: cf-openbao namespace will be created by ArgoCD when it deploys OpenBao echo "" 
echo "=== ArgoCD Bootstrap ===" @@ -301,36 +306,8 @@ kubectl rollout status deploy/argocd-redis -n argocd kubectl rollout status deploy/argocd-repo-server -n argocd echo "" -echo "=== OpenBao Bootstrap ===" -# Extract OpenBao values from merged config -$YQ_CMD eval '.apps.openbao.valuesObject' ${SOURCE_ROOT}/root/${VALUES_FILE} > /tmp/openbao_values.yaml -$YQ_CMD eval '.apps.openbao.valuesObject' ${SOURCE_ROOT}/root/${SIZE_VALUES_FILE} > /tmp/openbao_size_values.yaml -# Use server-side apply to match ArgoCD's field management strategy -helm template --release-name openbao ${SOURCE_ROOT}/sources/openbao/${OPENBAO_VERSION} --namespace cf-openbao \ - -f /tmp/openbao_values.yaml \ - -f /tmp/openbao_size_values.yaml \ - --set ui.enabled=true \ - --kube-version=${KUBE_VERSION} | kubectl apply --server-side --field-manager=argocd-controller --force-conflicts -f - -kubectl wait --for=jsonpath='{.status.phase}'=Running pod/openbao-0 -n cf-openbao --timeout=100s - -# Create initial secrets config for init job (separate from ArgoCD-managed version) -echo "Creating initial OpenBao secrets configuration..." -cat ${SOURCE_ROOT}/sources/openbao-config/0.1.0/templates/openbao-secret-manager-cm.yaml | \ - sed "s|name: openbao-secret-manager-scripts|name: openbao-secret-manager-scripts-init|g" | kubectl apply -f - - -# Create initial secrets config for init job (separate from ArgoCD-managed version) -echo "Creating initial OpenBao secrets configuration..." 
-cat ${SOURCE_ROOT}/sources/openbao-config/0.1.0/templates/openbao-secret-definitions.yaml | \ - sed "s|{{ .Values.domain }}|${DOMAIN}|g" | \ - sed "s|name: openbao-secrets-config|name: openbao-secrets-init-config|g" | kubectl apply -f - - -# Pass OpenBao configuration to init script -helm template --release-name openbao-init ${SOURCE_ROOT}/scripts/init-openbao-job \ - -f /tmp/openbao_values.yaml \ - --set domain="${DOMAIN}" \ - --kube-version=${KUBE_VERSION} | kubectl apply -f - -kubectl wait --for=condition=complete --timeout=300s job/openbao-init-job -n cf-openbao - +echo "=== Skipping OpenBao Direct Deployment ===" +echo "OpenBao will be deployed via ArgoCD after cluster-forge parent app is applied" echo "" echo "=== Gitea Bootstrap ===" generate_password() { @@ -369,26 +346,72 @@ helm template --release-name gitea-init ${SOURCE_ROOT}/scripts/init-gitea-job \ kubectl wait --for=condition=complete --timeout=300s job/gitea-init-job -n cf-gitea echo "" -echo "=== Creating ClusterForge App-of-Apps ===" +echo "=== Creating ClusterForge Parent App-of-Apps ===" echo "Cluster size: $CLUSTER_SIZE" -helm template ${SOURCE_ROOT}/root \ - -f /tmp/merged_values.yaml \ - --kube-version=${KUBE_VERSION} | kubectl apply -f - - -echo <<__SUMMARY__ - - === ClusterForge Bootstrap Complete ===" - - Domain: $DOMAIN - Cluster size: $CLUSTER_SIZE - Target revision: $TARGET_REVISION - - Access ArgoCD at: https://argocd.${DOMAIN} - Access Gitea at: https://gitea.${DOMAIN} - - This is the way! 
+echo "Target revision: $TARGET_REVISION" + +# Create minimal values for rendering only the cluster-forge parent app +cat > /tmp/cluster_forge_values.yaml < Date: Fri, 27 Feb 2026 21:37:05 +0200 Subject: [PATCH 053/115] feat: rm yq dependency and check; fix: rm partially implemented cleanup function --- scripts/bootstrap.sh | 12 +----------- 1 file changed, 1 insertion(+), 11 deletions(-) diff --git a/scripts/bootstrap.sh b/scripts/bootstrap.sh index 58c5d0aa..d6629c45 100755 --- a/scripts/bootstrap.sh +++ b/scripts/bootstrap.sh @@ -202,7 +202,7 @@ setup_sources setup_values_files # Run pre-cleanup -pre_cleanup +# pre_cleanup echo "=== ClusterForge Bootstrap ===" echo "Domain: $DOMAIN" @@ -215,16 +215,6 @@ echo "Target revision: $TARGET_REVISION" echo "" echo "=== Starting Bootstrap Process ===" -# Check for yq command availability -if command -v yq >/dev/null 2>&1; then - YQ_CMD="yq" -elif [ -f "$HOME/yq" ]; then - YQ_CMD="$HOME/yq" -else - echo "ERROR: yq command not found. Please install yq or place it in $HOME/yq" - exit 1 -fi - # Update the global.clusterSize in the base values file with mapped filename if [ -n "$SIZE_VALUES_FILE" ]; then $YQ_CMD -i ".global.clusterSize = \"${SIZE_VALUES_FILE}\"" "${SOURCE_ROOT}/root/${VALUES_FILE}" From 7276dd59be623258a7d4799cdf264a8bd03320e9 Mon Sep 17 00:00:00 2001 From: Daniel Vaskivaara Date: Fri, 27 Feb 2026 21:56:27 +0200 Subject: [PATCH 054/115] feat: separate values for argo and gitea, so yq can completely be removed from bootstrap.sh chore: rm script for validate-cluster-sizes (out of date) and unused chore: rm redundant values in values_large.yaml so diff vs. 
base is clearer --- root/values.yaml | 8 ++ root/values_argocd.yaml | 58 ++++++++++ root/values_gitea.yaml | 31 ++++++ root/values_large.yaml | 86 --------------- scripts/bootstrap.sh | 97 ++++++----------- scripts/validate-cluster-sizes.sh | 170 ------------------------------ 6 files changed, 127 insertions(+), 323 deletions(-) create mode 100644 root/values_argocd.yaml create mode 100644 root/values_gitea.yaml delete mode 100755 scripts/validate-cluster-sizes.sh diff --git a/root/values.yaml b/root/values.yaml index aa667d19..754ec14f 100644 --- a/root/values.yaml +++ b/root/values.yaml @@ -397,6 +397,14 @@ apps: path: keycloak-old namespace: keycloak valuesObject: + replicaCount: 1 + resources: + limits: + cpu: "500m" + memory: "2Gi" + requests: + cpu: "250m" + memory: "512Mi" podLabels: app: keycloak auth: diff --git a/root/values_argocd.yaml b/root/values_argocd.yaml new file mode 100644 index 00000000..649c37a0 --- /dev/null +++ b/root/values_argocd.yaml @@ -0,0 +1,58 @@ +# Dedicated ArgoCD values for bootstrap +# Extracted from the argocd.valuesObject section of values.yaml +applicationSet: + replicas: 1 +configs: + cm: + create: true + resource.customizations.health.opentelemetry.io_OpenTelemetryCollector: | + hs = {} + hs.status = "Healthy" + hs.message = "Always Healthy - See https://github.com/argoproj/argo-cd/pull/24284" + return hs + resource.customizations.health.keda.sh_ScaledObject: | + hs = {} + if obj.status ~= nil then + if obj.status.conditions ~= nil then + for _, condition in ipairs(obj.status.conditions) do + if condition.type == "Ready" then + if condition.status == "True" then + hs.status = "Healthy" + hs.message = "ScaledObject is ready" + else + hs.status = "Degraded" + hs.message = condition.reason or "ScaledObject not ready" + end + return hs + end + end + end + hs.status = "Progressing" + hs.message = "ScaledObject status unknown" + else + hs.status = "Progressing" + hs.message = "ScaledObject status unknown" + end + return hs + 
params: + server.insecure: true + rbac: + create: true + policy.csv: | + g, argocd-users, role:admin +controller: + replicas: 1 +redis: + enabled: true +redis-ha: + enabled: false +repoServer: + replicas: 1 + autoscaling: + enabled: false +server: + replicas: 1 + autoscaling: + enabled: false +global: + domain: # to be set by bootstrap script \ No newline at end of file diff --git a/root/values_gitea.yaml b/root/values_gitea.yaml new file mode 100644 index 00000000..f2abd256 --- /dev/null +++ b/root/values_gitea.yaml @@ -0,0 +1,31 @@ +# Dedicated Gitea values for bootstrap +# Extracted from the gitea.valuesObject section of values.yaml +clusterDomain: # to be set by bootstrap script +strategy: + type: "Recreate" +gitea: + admin: + existingSecret: gitea-admin-credentials + config: + server: + ROOT_URL: # to be set by bootstrap script + database: + DB_TYPE: sqlite3 + session: + PROVIDER: memory + cache: + ADAPTER: memory + queue: + TYPE: level +valkey-cluster: + enabled: false +valkey: + enabled: false +postgresql: + enabled: false +postgresql-ha: + enabled: false +persistence: + enabled: true +test: + enabled: false \ No newline at end of file diff --git a/root/values_large.yaml b/root/values_large.yaml index 6efb6d52..fd5909aa 100644 --- a/root/values_large.yaml +++ b/root/values_large.yaml @@ -1,41 +1,4 @@ -# LARGE CLUSTER CONFIGURATION (Production: app-dev.silogen.ai) -# Actual Hardware: 4 nodes (HA control plane and at least one worker node) -# -# This configuration reflects a production deployment -# -# LARGE CLUSTER: All apps enabled (inherited from base values.yaml) -# Uses Longhorn storage with native RWX support - no access mode mutation needed - apps: - # Git - Gitea (Production: single replica, sqlite3) - gitea: - valuesObject: - postgresql-ha: - enabled: false - valkey-cluster: - enabled: false - # Core apps - ArgoCD (Production: single replicas, appropriate for 4-node cluster) - argocd: - valuesObject: - # Production config: Single replicas (no HA for 
4-node cluster) - applicationSet: - replicas: 1 - controller: - replicas: 1 - redis: - enabled: true - redis-ha: - enabled: false - repoServer: - replicas: 1 - autoscaling: - enabled: false - server: - replicas: 1 - autoscaling: - enabled: false - - # Storage - MinIO Tenant (Production: single server configuration) minio-tenant: valuesObject: tenant: @@ -45,8 +8,6 @@ apps: size: 500Gi storageClassName: direct volumesPerServer: 1 - - # Secrets - OpenBao (Production: 3 replicas, 10Gi storage) openbao: valuesObject: server: @@ -55,50 +16,3 @@ apps: replicas: 3 raft: enabled: true - - # Workload scheduling - Kueue (Production: single replica) - kueue: - valuesObject: - controllerManager: - replicas: 1 - - # Authentication - Keycloak (Production: single replica) - keycloak: - valuesObject: - replicaCount: 1 - resources: - limits: - cpu: "500m" - memory: "2Gi" - requests: - cpu: "250m" - memory: "512Mi" - - # Monitoring - OTEL LGTM Stack (Production configuration) - otel-lgtm-stack: - valuesObject: - # Production: Standard monitoring resources - lgtm: - storage: - # Production storage sizes (default storage class) - extra: 50Gi - grafana: 10Gi - loki: 50Gi - mimir: 50Gi - tempo: 50Gi - collectors: - resources: - logs: - limits: - cpu: '1' - memory: 2Gi - requests: - cpu: 200m - memory: 400Mi - metrics: - limits: - cpu: '2' - memory: 8Gi - requests: - cpu: 500m - memory: 1Gi diff --git a/scripts/bootstrap.sh b/scripts/bootstrap.sh index d6629c45..670646c5 100755 --- a/scripts/bootstrap.sh +++ b/scripts/bootstrap.sh @@ -215,63 +215,24 @@ echo "Target revision: $TARGET_REVISION" echo "" echo "=== Starting Bootstrap Process ===" -# Update the global.clusterSize in the base values file with mapped filename -if [ -n "$SIZE_VALUES_FILE" ]; then - $YQ_CMD -i ".global.clusterSize = \"${SIZE_VALUES_FILE}\"" "${SOURCE_ROOT}/root/${VALUES_FILE}" -else - $YQ_CMD -i ".global.clusterSize = \"values_${CLUSTER_SIZE}.yaml\"" "${SOURCE_ROOT}/root/${VALUES_FILE}" -fi - -# Note: 
clusterForge.targetRevision will be set by the gitea-init-job -# in the cluster-values repository (which overwrites the base values as the final values file) -echo "Target revision $TARGET_REVISION will be set in cluster-values repo by gitea-init-job" - -# Function to merge values files early for use throughout the script -merge_values_files() { - echo "Merging values files..." - if [ -n "$SIZE_VALUES_FILE" ]; then - # Merge base values with size-specific overrides - VALUES=$($YQ_CMD eval-all '. as $item ireduce ({}; . * $item)' \ - ${SOURCE_ROOT}/root/${VALUES_FILE} \ - ${SOURCE_ROOT}/root/${SIZE_VALUES_FILE} | \ - $YQ_CMD eval ".global.domain = \"${DOMAIN}\"") - else - # Use base values only - VALUES=$(cat ${SOURCE_ROOT}/root/${VALUES_FILE} | $YQ_CMD ".global.domain = \"${DOMAIN}\"") - fi - - # Apply the target revision override (matching what cluster-values repo will contain) - echo "Applying targetRevision override: $TARGET_REVISION" - VALUES=$(echo "$VALUES" | $YQ_CMD eval ".clusterForge.targetRevision = \"${TARGET_REVISION}\"") - - # Write merged values to temp file for use throughout script - echo "$VALUES" > /tmp/merged_values.yaml - echo "Merged values written to /tmp/merged_values.yaml" -} - -# Helper functions to extract values from merged configuration -get_argocd_value() { - local path="$1" - $YQ_CMD eval ".apps.argocd.valuesObject.${path}" /tmp/merged_values.yaml -} - -get_openbao_value() { - local path="$1" - $YQ_CMD eval ".apps.openbao.valuesObject.${path}" /tmp/merged_values.yaml -} - -# Extract version information from app paths +# Extract version information from app paths using sed/awk (no yq needed) extract_app_versions() { - ARGOCD_VERSION=$($YQ_CMD eval '.apps.argocd.path' /tmp/merged_values.yaml | cut -d'/' -f2) - GITEA_VERSION=$($YQ_CMD eval '.apps.gitea.path' /tmp/merged_values.yaml | cut -d'/' -f2) + # Extract ArgoCD version from path like "sources/argocd/8.3.5" + ARGOCD_VERSION=$(grep -A 5 "^ argocd:" "${SOURCE_ROOT}/root/${VALUES_FILE}" 
| \ + grep "path:" | sed 's/.*argocd\///' | sed 's/ *$//') + + # Extract Gitea version from path like "sources/gitea/12.3.0" + GITEA_VERSION=$(grep -A 5 "^ gitea:" "${SOURCE_ROOT}/root/${VALUES_FILE}" | \ + grep "path:" | sed 's/.*gitea\///' | sed 's/ *$//') echo "Extracted versions - ArgoCD: $ARGOCD_VERSION, Gitea: $GITEA_VERSION" } -# Merge values files early so all subsequent operations can use the merged config -merge_values_files +# Note: clusterForge.targetRevision will be set by the gitea-init-job +# in the cluster-values repository (which overwrites the base values as the final values file) +echo "Target revision $TARGET_REVISION will be set in cluster-values repo by gitea-init-job" -# Extract version information from merged values +# Extract version information from values extract_app_versions # Create namespaces for direct deployments only @@ -281,13 +242,9 @@ kubectl create ns cf-gitea --dry-run=client -o yaml | kubectl apply -f - echo "" echo "=== ArgoCD Bootstrap ===" -# Extract ArgoCD values from merged config and write to temp values file -$YQ_CMD eval '.apps.argocd.valuesObject' ${SOURCE_ROOT}/root/${VALUES_FILE} > /tmp/argocd_values.yaml -$YQ_CMD eval '.apps.argocd.valuesObject' ${SOURCE_ROOT}/root/${SIZE_VALUES_FILE} > /tmp/argocd_size_values.yaml -# Use server-side apply to match ArgoCD's self-management strategy +# Deploy ArgoCD using dedicated values file (no yq extraction needed) helm template --release-name argocd ${SOURCE_ROOT}/sources/argocd/${ARGOCD_VERSION} --namespace argocd \ - -f /tmp/argocd_values.yaml \ - -f /tmp/argocd_size_values.yaml \ + -f ${SOURCE_ROOT}/root/values_argocd.yaml \ --set global.domain="argocd.${DOMAIN}" \ --kube-version=${KUBE_VERSION} | kubectl apply --server-side --field-manager=argocd-controller --force-conflicts -f - kubectl rollout status statefulset/argocd-application-controller -n argocd @@ -304,23 +261,29 @@ generate_password() { openssl rand -hex 16 | tr 'a-f' 'A-F' | head -c 32 } -# Create 
initial-cf-values configmap with merged values -echo "Creating initial-cf-values configmap from merged configuration..." -kubectl create configmap initial-cf-values --from-literal=initial-cf-values="$(cat /tmp/merged_values.yaml)" --dry-run=client -o yaml | kubectl apply -n cf-gitea -f - - +# Create gitea admin credentials secret kubectl create secret generic gitea-admin-credentials \ --namespace=cf-gitea \ --from-literal=username=silogen-admin \ --from-literal=password=$(generate_password) \ --dry-run=client -o yaml | kubectl apply -f - -$YQ_CMD eval '.apps.gitea.valuesObject' ${SOURCE_ROOT}/root/${VALUES_FILE} > /tmp/gitea_values.yaml -$YQ_CMD eval '.apps.gitea.valuesObject' ${SOURCE_ROOT}/root/${SIZE_VALUES_FILE} > /tmp/gitea_size_values.yaml +# Create initial-cf-values configmap with basic values for gitea-init-job +# Use simple shell variables instead of merged YAML +cat > /tmp/simple_values.yaml << EOF +global: + domain: ${DOMAIN} + clusterSize: values_${CLUSTER_SIZE}.yaml +clusterForge: + targetRevision: ${TARGET_REVISION} +EOF + +kubectl create configmap initial-cf-values --from-literal=initial-cf-values="$(cat /tmp/simple_values.yaml)" --dry-run=client -o yaml | kubectl apply -n cf-gitea -f - -# Bootstrap Gitea +# Bootstrap Gitea using dedicated values file (no yq extraction needed) helm template --release-name gitea ${SOURCE_ROOT}/sources/gitea/${GITEA_VERSION} --namespace cf-gitea \ - -f /tmp/gitea_values.yaml \ - -f /tmp/gitea_size_values.yaml \ + -f ${SOURCE_ROOT}/root/values_gitea.yaml \ + --set clusterDomain="${DOMAIN}" \ --set gitea.config.server.ROOT_URL="https://gitea.${DOMAIN}/" \ --kube-version=${KUBE_VERSION} | kubectl apply -f - kubectl rollout status deploy/gitea -n cf-gitea @@ -404,4 +367,4 @@ __SUMMARY__ # Cleanup temporary files echo "Cleaning up temporary files..." 
-rm -f /tmp/merged_values.yaml /tmp/argocd_values.yaml \ No newline at end of file +rm -f /tmp/simple_values.yaml /tmp/cluster_forge_values.yaml \ No newline at end of file diff --git a/scripts/validate-cluster-sizes.sh b/scripts/validate-cluster-sizes.sh deleted file mode 100755 index 31a54649..00000000 --- a/scripts/validate-cluster-sizes.sh +++ /dev/null @@ -1,170 +0,0 @@ -#!/bin/bash -# ClusterForge Size Configuration Validation Script -# ============================================================================= -# This script validates the YAML structure and shows how size configurations work -# for ClusterForge applications without requiring Helm to be installed. - -set -euo pipefail - -SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" -PROJECT_ROOT="$SCRIPT_DIR/.." - -# Colors for output -GREEN='\033[0;32m' -YELLOW='\033[1;33m' -BLUE='\033[0;34m' -NC='\033[0m' - -log_info() { - echo -e "${BLUE}[INFO]${NC} $1" -} - -log_success() { - echo -e "${GREEN}[SUCCESS]${NC} $1" -} - -log_section() { - echo -e "${YELLOW}[SECTION]${NC} $1" -} - -# Check YAML syntax using available tools -check_yaml() { - local file="$1" - local filename=$(basename "$file") - - # Check if file exists - if [ ! -f "$file" ]; then - echo "❌ $filename: File not found" - return 1 - fi - - # Check if file is readable - if [ ! -r "$file" ]; then - echo "❌ $filename: File not readable" - return 1 - fi - - # Try different validation methods - local validation_method="" - local temp_output="" - - # Method 1: Try yq v4+ syntax - if command -v yq &> /dev/null; then - if temp_output=$(yq eval '.' "$file" 2>&1); then - validation_method="yq v4" - elif temp_output=$(yq . 
"$file" 2>&1); then - validation_method="yq v3" - elif temp_output=$(yq r "$file" 2>&1); then - validation_method="yq v2" - fi - fi - - # Method 2: Try python if yq failed - if [ -z "$validation_method" ] && command -v python3 &> /dev/null; then - if temp_output=$(python3 -c "import yaml; yaml.safe_load(open('$file', 'r'))" 2>&1); then - validation_method="python3" - fi - fi - - # Method 3: Try python2 if python3 failed - if [ -z "$validation_method" ] && command -v python &> /dev/null; then - if temp_output=$(python -c "import yaml; yaml.safe_load(open('$file', 'r'))" 2>&1); then - validation_method="python2" - fi - fi - - # If validation succeeded with any method - if [ -n "$validation_method" ]; then - log_success "$filename: Valid YAML syntax (validated with $validation_method)" - return 0 - fi - - # All validation methods failed - fall back to basic checks - log_info "$filename: Cannot validate YAML syntax (no working validator found)" - - # Check for common YAML issues - if grep -q $'\t' "$file"; then - echo "❌ $filename: Contains tabs (YAML requires spaces)" - return 1 - fi - - # Check for basic structure (allow comments at start) - if grep -m 1 "^[a-zA-Z]" "$file" >/dev/null 2>&1; then - log_success "$filename: Basic structure OK (install yq/python for full validation)" - return 0 - else - echo "❌ $filename: No valid YAML content found" - return 1 - fi -} - -# Show key differences between configurations -show_config_differences() { - local size="$1" - - log_section "Key differences for $size cluster:" - - case "$size" in - small) - echo " - ArgoCD: Single replica, no HA Redis" - echo " - MinIO: 1 server, 500GB storage" - echo " - OpenBao: Single instance (no HA)" - echo " - Prometheus: 7d retention, minimal resources" - echo " - Target: 1-5 users, development/testing" - ;; - medium) - echo " - ArgoCD: 2 replicas with HA Redis" - echo " - MinIO: 3 servers, 6TB total storage" - echo " - OpenBao: 3 replicas with Raft HA" - echo " - Enhanced resources for team 
collaboration" - echo " - Target: 5-20 users, production workloads" - ;; - large) - echo " - ArgoCD: 3 replicas with enhanced PDB" - echo " - MinIO: External HA S3 recommended" - echo " - OpenBao: Full HA with enhanced security" - echo " - Full observability stack with extended retention" - echo " - Target: 10s-100s users, enterprise scale" - ;; - esac -} - -main() { - log_info "Validating ClusterForge configuration files..." - echo - - # Validate base configuration - log_section "Base Configuration" - check_yaml "$PROJECT_ROOT/root/values.yaml" - echo - - # Validate size-specific configurations - for size in small medium large; do - log_section "$size Cluster Configuration" - check_yaml "$PROJECT_ROOT/root/values_$size.yaml" - show_config_differences "$size" - echo - done - - log_section "Configuration Summary" - echo "✅ Base values.yaml: All ClusterForge applications enabled" - echo "✅ values_small.yaml: Minimal resources for 1-5 users (dev/test)" - echo "✅ values_medium.yaml: Balanced setup for 5-20 users (teams)" - echo "✅ values_large.yaml: Enterprise features for 10s-100s users" - echo - - log_section "Usage Examples" - echo " # Small cluster (development/testing):" - echo " ./scripts/bootstrap.sh dev.example.com --CLUSTER_SIZE=small" - echo - echo " # Medium cluster (team production - default):" - echo " ./scripts/bootstrap.sh team.example.com" - echo - echo " # Large cluster (enterprise scale):" - echo " ./scripts/bootstrap.sh prod.example.com --CLUSTER_SIZE=large" - echo - - log_success "All ClusterForge size configurations are valid! This is the way." 
-} - -main "$@" \ No newline at end of file From ebc4e116b3525b36266bed0ee6419596d5bc5547 Mon Sep 17 00:00:00 2001 From: Daniel Vaskivaara Date: Fri, 27 Feb 2026 22:03:03 +0200 Subject: [PATCH 055/115] chore: rm bootstrap v2, as ideas have been merged into base bootstrap.sh --- scripts/bootstrap_v2.sh | 778 ---------------------------------------- 1 file changed, 778 deletions(-) delete mode 100755 scripts/bootstrap_v2.sh diff --git a/scripts/bootstrap_v2.sh b/scripts/bootstrap_v2.sh deleted file mode 100755 index 1e7b3acd..00000000 --- a/scripts/bootstrap_v2.sh +++ /dev/null @@ -1,778 +0,0 @@ -#!/bin/bash - -set -euo pipefail - -SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" - -# Initialize variables -LATEST_RELEASE="v1.8.0" -TARGET_REVISION="$LATEST_RELEASE" - -CLUSTER_SIZE="medium" # Default to medium -DOMAIN="" -KUBE_VERSION=1.33 -VALUES_FILE="values.yaml" - -# Parse arguments -while [[ $# -gt 0 ]]; do - case $1 in - --CLUSTER-SIZE|--cluster-size|-s) - if [ -z "$2" ]; then - echo "ERROR: --cluster-size requires an argument" - exit 1 - fi - CLUSTER_SIZE="$2" - shift 2 - ;; - --CLUSTER-SIZE=*) - CLUSTER_SIZE="${1#*=}" - shift - ;; - --cluster-size=*) - CLUSTER_SIZE="${1#*=}" - shift - ;; - -s=*) - CLUSTER_SIZE="${1#*=}" - shift - ;; - --TARGET-REVISION|--target-revision|-r) - if [ -z "$2" ]; then - echo "WARNING: defaulting to --target-revision=$LATEST_RELEASE (no value specified)" - TARGET_REVISION="$LATEST_RELEASE" - shift - else - TARGET_REVISION="$2" - shift 2 - fi - ;; - --TARGET-REVISION=*) - TARGET_REVISION="${1#*=}" - shift - ;; - --target-revision=*) - TARGET_REVISION="${1#*=}" - shift - ;; - -r=*) - TARGET_REVISION="${1#*=}" - shift - ;; - --help|-h) - cat < [values_file] - - Arguments: - domain Required. Cluster domain (e.g., example.com) - values_file Optional. 
Values .yaml file to use, default: root/values.yaml - - Options: - -r, --target-revision cluster-forge git revision for ArgoCD to sync from - options: [tag|commit_hash|branch_name], default: $LATEST_RELEASE - IMPORTANT: Only apps enabled in target revision will be deployed - -s, --cluster-size options: [small|medium|large], default: medium - - Examples: - $0 compute.amd.com values_custom.yaml --cluster-size=large - $0 112.100.97.17.nip.io - $0 dev.example.com --cluster-size=small --target-revision=v1.8.0 - $0 dev.example.com -s=small -r=feature-branch - - Target Revision Behavior: - • Bootstrap will deploy ArgoCD + essential infrastructure (Gitea, OpenBao) - • Gitea will be initialized to provide git repositories for ArgoCD - • OpenBao will be initialized to provide secrets management - • cluster-forge parent app will then be deployed to manage remaining apps - • ArgoCD will sync ALL apps from the specified target revision - • Only apps enabled in target revision will be deployed - • Apps disabled in target revision will be pruned if they exist -HELP_OUTPUT - exit 0 - ;; - --*) - echo "ERROR: Unknown option: $1" - echo "Use --help for usage information" - exit 1 - ;; - *) - # Positional arguments - if [ -z "$DOMAIN" ]; then - DOMAIN="$1" - elif [ "$VALUES_FILE" = "values.yaml" ]; then - VALUES_FILE="$1" - else - echo "ERROR: Too many arguments: $1" - echo "Usage: $0 [--CLUSTER_SIZE=small|medium|large] [--dev] [values_file]" - exit 1 - fi - shift - ;; - esac -done - -# Validate required arguments -if [ -z "$DOMAIN" ]; then - echo "ERROR: Domain argument is required" - echo "Usage: $0 [values_file] [--CLUSTER_SIZE=small|medium|large]" - echo "Use --help for more details" - exit 1 -fi - -# Validate cluster size -case "$CLUSTER_SIZE" in - small|medium|large) - ;; - *) - echo "ERROR: Invalid cluster size '$CLUSTER_SIZE'" - echo "Valid sizes: small, medium, large" - exit 1 - ;; -esac - -# Validate values file exists -if [ ! 
-f "${SCRIPT_DIR}/../root/${VALUES_FILE}" ]; then - echo "ERROR: Values file not found: ${SCRIPT_DIR}/../root/${VALUES_FILE}" - exit 1 -fi - -# Check if size-specific values file exists -setup_values_files() { - SIZE_VALUES_FILE="values_${CLUSTER_SIZE}.yaml" - - if [ ! -f "${SOURCE_ROOT}/root/${SIZE_VALUES_FILE}" ]; then - echo "WARNING: Size-specific values file not found: ${SOURCE_ROOT}/root/${SIZE_VALUES_FILE}" - echo "Proceeding with base values file only: ${VALUES_FILE}" - SIZE_VALUES_FILE="" - else - echo "Using size-specific values file: ${SIZE_VALUES_FILE}" - fi -} - -display_target_revision() { - # Check if TARGET_REVISION was explicitly set via command line flag - # by comparing against the default value - if [ "$TARGET_REVISION" != "$LATEST_RELEASE" ]; then - echo "Using specified targetRevision: $TARGET_REVISION" - else - echo "Using default targetRevision: $TARGET_REVISION" - fi -} - -# Since we only support v1.8.0+, always use local sources -setup_sources() { - SOURCE_ROOT="${SCRIPT_DIR}/.." 
- echo "Using local sources for target revision: $TARGET_REVISION" -} - -pre_cleanup() { - echo "" - echo "=== Pre-cleanup: Checking for previous runs ===" - - # Check if gitea-init-job exists and completed successfully - if kubectl get job gitea-init-job -n cf-gitea >/dev/null 2>&1; then - if kubectl get job gitea-init-job -n cf-gitea -o jsonpath='{.status.conditions[?(@.type=="Complete")].status}' 2>/dev/null | grep -q "True"; then - echo "Found completed gitea-init-job - removing Gitea to start fresh" - - # Delete all Gitea resources - kubectl delete job gitea-init-job -n cf-gitea --ignore-not-found=true - kubectl delete deployment gitea -n cf-gitea --ignore-not-found=true - kubectl delete statefulset gitea -n cf-gitea --ignore-not-found=true - kubectl delete service gitea -n cf-gitea --ignore-not-found=true - kubectl delete service gitea-http -n cf-gitea --ignore-not-found=true - kubectl delete service gitea-ssh -n cf-gitea --ignore-not-found=true - kubectl delete pvc -n cf-gitea -l app.kubernetes.io/name=gitea --ignore-not-found=true - kubectl delete configmap initial-cf-values -n cf-gitea --ignore-not-found=true - kubectl delete secret gitea-admin-credentials -n cf-gitea --ignore-not-found=true - kubectl delete ingress -n cf-gitea -l app.kubernetes.io/name=gitea --ignore-not-found=true - - echo "Gitea resources deleted" - fi - fi - - # Always delete openbao-init-job to allow re-initialization - kubectl delete job openbao-init-job -n cf-openbao --ignore-not-found=true - - # Clean up any bootstrap manifest files from previous runs - rm -f /tmp/cluster-forge-bootstrap.yaml /tmp/cluster-forge-parent-app.yaml \ - /tmp/argocd-app.yaml /tmp/gitea-app.yaml /tmp/openbao-app.yaml - - echo "=== Pre-cleanup complete ===" - echo "" -} - -# NEW: ArgoCD-Native Template Rendering Function -render_cluster_forge_manifests() { - echo "" - echo "=== Rendering ClusterForge Manifests ===" - echo "Domain: $DOMAIN" - echo "Base values: $VALUES_FILE" - echo "Cluster size: 
$CLUSTER_SIZE" - echo "Target revision: $TARGET_REVISION" - echo "" - - local helm_args=( - "cluster-forge" "${SOURCE_ROOT}/root" - "--namespace" "argocd" - "--values" "${SOURCE_ROOT}/root/${VALUES_FILE}" - ) - - # Add size-specific values if they exist - if [ -n "$SIZE_VALUES_FILE" ]; then - helm_args+=( - "--values" "${SOURCE_ROOT}/root/${SIZE_VALUES_FILE}" - ) - echo "Size overlay: $SIZE_VALUES_FILE" - fi - - # Set runtime configuration - helm_args+=( - "--set" "global.domain=$DOMAIN" - "--set" "global.clusterSize=values_${CLUSTER_SIZE}.yaml" - "--set" "externalValues.enabled=true" - "--set" "clusterForge.repoUrl=http://gitea-http.cf-gitea.svc:3000/cluster-org/cluster-forge.git" - "--set" "clusterForge.targetRevision=$TARGET_REVISION" - "--set" "externalValues.repoUrl=http://gitea-http.cf-gitea.svc:3000/cluster-org/cluster-values.git" - "--set" "externalValues.targetRevision=main" - "--kube-version" "$KUBE_VERSION" - ) - - echo "🚀 Rendering all manifests using ArgoCD-native templating..." - - # Render all manifests in one go - no yq, no temp files, no manual extraction! 
- helm template "${helm_args[@]}" > /tmp/cluster-forge-bootstrap.yaml - - echo "✅ All manifests rendered to /tmp/cluster-forge-bootstrap.yaml" - echo "" -} - -# NEW: Render and Apply Only Essential Components for ArgoCD Takeover -render_cluster_forge_parent_app() { - echo "" - echo "=== Rendering cluster-forge Parent App ===" - echo "Using existing template: root/templates/cluster-forge.yaml" - echo "Target revision: $TARGET_REVISION" - echo "Cluster size: $CLUSTER_SIZE" - - # Create minimal values for just rendering the cluster-forge parent app - local temp_values=$(mktemp) - cat > "$temp_values" < /tmp/cluster-forge-parent-app.yaml - - # Cleanup temp file - rm -f "$temp_values" - - echo "✅ cluster-forge parent app rendered to /tmp/cluster-forge-parent-app.yaml" -} - -# Deploy ArgoCD directly using helm (following original bootstrap.sh pattern) -deploy_argocd_directly() { - echo "" - echo "=== Direct ArgoCD Deployment ===" - echo "ArgoCD must be deployed directly before it can manage other applications" - - # Check if yq is available for value extraction - if command -v yq >/dev/null 2>&1; then - YQ_CMD="yq" - elif [ -f "$HOME/yq" ]; then - YQ_CMD="$HOME/yq" - else - echo "ERROR: yq command not found. Please install yq or place it in $HOME/yq" - exit 1 - fi - - # Create merged values for version extraction (similar to original bootstrap.sh) - local SIZE_VALUES_FILE="values_${CLUSTER_SIZE}.yaml" - if [ -n "$SIZE_VALUES_FILE" ] && [ -f "${SOURCE_ROOT}/root/${SIZE_VALUES_FILE}" ]; then - # Merge base values with size-specific overrides - $YQ_CMD eval-all '. as $item ireduce ({}; . 
* $item)' \ - "${SOURCE_ROOT}/root/${VALUES_FILE}" \ - "${SOURCE_ROOT}/root/${SIZE_VALUES_FILE}" | \ - $YQ_CMD eval ".global.domain = \"${DOMAIN}\"" > /tmp/merged_values.yaml - else - # Use base values only - cat "${SOURCE_ROOT}/root/${VALUES_FILE}" | $YQ_CMD ".global.domain = \"${DOMAIN}\"" > /tmp/merged_values.yaml - fi - - # Extract ArgoCD version from merged values - ARGOCD_VERSION=$($YQ_CMD eval '.apps.argocd.path' /tmp/merged_values.yaml | cut -d'/' -f2) - - if [ -z "$ARGOCD_VERSION" ]; then - echo "ERROR: Could not extract ArgoCD version from values" - exit 1 - fi - - echo "Using ArgoCD version: $ARGOCD_VERSION" - - # Extract ArgoCD-specific values for helm template - $YQ_CMD eval '.apps.argocd.valuesObject' "${SOURCE_ROOT}/root/${VALUES_FILE}" > /tmp/argocd_values.yaml - - if [ -f "${SOURCE_ROOT}/root/${SIZE_VALUES_FILE}" ]; then - $YQ_CMD eval '.apps.argocd.valuesObject' "${SOURCE_ROOT}/root/${SIZE_VALUES_FILE}" > /tmp/argocd_size_values.yaml - else - # Create empty file if size-specific values don't exist - echo "{}" > /tmp/argocd_size_values.yaml - fi - - # Deploy ArgoCD directly using helm template (following original bootstrap.sh pattern) - echo "🚀 Deploying ArgoCD using helm template..." - helm template --release-name argocd "${SOURCE_ROOT}/sources/argocd/${ARGOCD_VERSION}" --namespace argocd \ - -f /tmp/argocd_values.yaml \ - -f /tmp/argocd_size_values.yaml \ - --set global.domain="argocd.${DOMAIN}" \ - --kube-version="${KUBE_VERSION}" | kubectl apply --server-side --field-manager=argocd-controller --force-conflicts -f - - - # Wait for ArgoCD components to be ready - echo "⏳ Waiting for ArgoCD to become ready..." 
- kubectl rollout status statefulset/argocd-application-controller -n argocd --timeout=300s - kubectl rollout status deploy/argocd-applicationset-controller -n argocd --timeout=300s - kubectl rollout status deploy/argocd-redis -n argocd --timeout=300s - kubectl rollout status deploy/argocd-repo-server -n argocd --timeout=300s - - # Cleanup temporary files - rm -f /tmp/argocd_values.yaml /tmp/argocd_size_values.yaml /tmp/merged_values.yaml - - echo "✅ ArgoCD deployed directly and ready to manage applications" -} - -bootstrap_argocd_managed_approach() { - echo "=== Direct Infrastructure + ArgoCD-Managed Apps ===" - echo "" - echo "🎯 Strategy: Deploy fundamental infrastructure directly, then let ArgoCD manage apps" - echo " 1. Deploy ArgoCD, OpenBao, Gitea directly (fundamental dependencies)" - echo " 2. ArgoCD then manages remaining apps from target revision: $TARGET_REVISION" - echo " 3. This ensures only apps enabled in target revision are deployed" - echo "" - - # Create argocd namespace first - echo "Creating ArgoCD namespace..." - kubectl create ns argocd --dry-run=client -o yaml | kubectl apply -f - - - # Step 1: Deploy ArgoCD directly (cannot use ArgoCD Application before ArgoCD exists) - echo "" - echo "📦 Step 1: Deploying ArgoCD directly..." - echo " Note: ArgoCD must be deployed directly before it can manage applications" - - deploy_argocd_directly - - # Step 2: Deploy OpenBao directly (must be initialized before Gitea for secrets) - echo "" - echo "📦 Step 2: Deploying OpenBao directly..." - echo " Note: OpenBao must be deployed directly to provide secrets for ExternalSecrets" - echo " OpenBao cannot be deployed via ArgoCD as it's a fundamental dependency" - - deploy_openbao_directly - - # Step 3: Deploy Gitea directly (now that OpenBao secrets are available) - echo "" - echo "📦 Step 3: Deploying Gitea directly..." 
- echo " Note: Gitea must be deployed directly to provide git repositories for ArgoCD" - echo " OpenBao is now initialized, so ExternalSecrets can pull credentials" - - deploy_gitea_directly - - # Wait for Gitea to be deployed and initialize (critical for git repositories) - wait_for_gitea_and_initialize - - echo "" - echo "🎉 ArgoCD-Managed Bootstrap Complete!" - echo "" - echo "📋 What happens now:" - echo " 1. ✅ ArgoCD is running and managing the cluster" - echo " 2. 🎯 cluster-forge app will sync from: $TARGET_REVISION" - echo " 3. 📦 ONLY apps enabled in $TARGET_REVISION will be deployed" - echo " 4. ⚡ Sync waves will ensure proper deployment order" - echo " 5. 🔄 ArgoCD will automatically prune apps disabled in target revision" - - # Step 4: Now apply the cluster-forge parent app (after git repositories exist) - echo "" - echo "📦 Step 4: Applying cluster-forge parent application..." - echo " Now that git repositories exist, ArgoCD can manage all remaining apps" - - # Render the cluster-forge parent app using the existing template - render_cluster_forge_parent_app - - # Apply the rendered cluster-forge parent app - kubectl apply -f /tmp/cluster-forge-parent-app.yaml - - echo "✅ cluster-forge parent application applied!" - echo "🚀 ArgoCD will now manage all remaining applications from target revision: $TARGET_REVISION" -} - -# Deploy OpenBao directly using helm (cannot rely on ArgoCD for fundamental dependencies) -deploy_openbao_directly() { - echo "" - echo "=== Direct OpenBao Deployment ===" - echo "OpenBao must be deployed directly since it provides secrets for ExternalSecrets" - - # Create cf-openbao namespace first - echo "Creating cf-openbao namespace..." - kubectl create ns cf-openbao --dry-run=client -o yaml | kubectl apply -f - - - # Check if yq is available for value extraction - if command -v yq >/dev/null 2>&1; then - YQ_CMD="yq" - elif [ -f "$HOME/yq" ]; then - YQ_CMD="$HOME/yq" - else - echo "ERROR: yq command not found. 
Please install yq or place it in $HOME/yq" - exit 1 - fi - - # Create merged values for version extraction (similar to original bootstrap.sh) - local SIZE_VALUES_FILE="values_${CLUSTER_SIZE}.yaml" - if [ -n "$SIZE_VALUES_FILE" ] && [ -f "${SOURCE_ROOT}/root/${SIZE_VALUES_FILE}" ]; then - # Merge base values with size-specific overrides - $YQ_CMD eval-all '. as $item ireduce ({}; . * $item)' \ - "${SOURCE_ROOT}/root/${VALUES_FILE}" \ - "${SOURCE_ROOT}/root/${SIZE_VALUES_FILE}" | \ - $YQ_CMD eval ".global.domain = \"${DOMAIN}\"" > /tmp/merged_values.yaml - else - # Use base values only - cat "${SOURCE_ROOT}/root/${VALUES_FILE}" | $YQ_CMD ".global.domain = \"${DOMAIN}\"" > /tmp/merged_values.yaml - fi - - # Extract OpenBao version from merged values - OPENBAO_VERSION=$($YQ_CMD eval '.apps.openbao.path' /tmp/merged_values.yaml | cut -d'/' -f2) - - if [ -z "$OPENBAO_VERSION" ]; then - echo "ERROR: Could not extract OpenBao version from values" - exit 1 - fi - - echo "Using OpenBao version: $OPENBAO_VERSION" - - # Extract OpenBao-specific values for helm template - $YQ_CMD eval '.apps.openbao.valuesObject' "${SOURCE_ROOT}/root/${VALUES_FILE}" > /tmp/openbao_values.yaml - - if [ -f "${SOURCE_ROOT}/root/${SIZE_VALUES_FILE}" ]; then - $YQ_CMD eval '.apps.openbao.valuesObject' "${SOURCE_ROOT}/root/${SIZE_VALUES_FILE}" > /tmp/openbao_size_values.yaml - else - # Create empty file if size-specific values don't exist - echo "{}" > /tmp/openbao_size_values.yaml - fi - - # Deploy OpenBao directly using helm template (following original bootstrap.sh pattern) - echo "🚀 Deploying OpenBao using helm template..." 
- helm template --release-name openbao "${SOURCE_ROOT}/sources/openbao/${OPENBAO_VERSION}" --namespace cf-openbao \ - -f /tmp/openbao_values.yaml \ - -f /tmp/openbao_size_values.yaml \ - --set ui.enabled=true \ - --kube-version="${KUBE_VERSION}" | kubectl apply --server-side --field-manager=argocd-controller --force-conflicts -f - - - # Wait for OpenBao pod to be running - echo "⏳ Waiting for OpenBao pod to be ready..." - if kubectl wait --for=jsonpath='{.status.phase}'=Running pod/openbao-0 -n cf-openbao --timeout=300s; then - echo "✅ OpenBao pod is running" - else - echo "❌ ERROR: OpenBao pod failed to start" - echo "Check pod status: kubectl get pod openbao-0 -n cf-openbao" - echo "Check logs: kubectl logs openbao-0 -n cf-openbao" - exit 1 - fi - - # Create initial secrets config for init job (following original bootstrap.sh pattern) - echo "Creating initial OpenBao secrets configuration..." - cat "${SOURCE_ROOT}/sources/openbao-config/0.1.0/templates/openbao-secret-manager-cm.yaml" | \ - sed "s|name: openbao-secret-manager-scripts|name: openbao-secret-manager-scripts-init|g" | kubectl apply -f - - - cat "${SOURCE_ROOT}/sources/openbao-config/0.1.0/templates/openbao-secret-definitions.yaml" | \ - sed "s|{{ .Values.domain }}|${DOMAIN}|g" | \ - sed "s|name: openbao-secrets-config|name: openbao-secrets-init-config|g" | kubectl apply -f - - - # Run OpenBao initialization job - echo "Deploying OpenBao initialization job..." - helm template --release-name openbao-init "${SOURCE_ROOT}/scripts/init-openbao-job" \ - -f /tmp/openbao_values.yaml \ - --set domain="${DOMAIN}" \ - --kube-version="${KUBE_VERSION}" | kubectl apply -f - - - # Wait for initialization to complete - echo "⏳ Waiting for OpenBao initialization to complete..." 
- if kubectl wait --for=condition=complete --timeout=300s job/openbao-init-job -n cf-openbao; then - echo "✅ OpenBao initialization completed successfully" - else - echo "❌ ERROR: OpenBao initialization timed out or failed" - echo "Check job status: kubectl describe job openbao-init-job -n cf-openbao" - echo "Check logs: kubectl logs -l job-name=openbao-init-job -n cf-openbao" - exit 1 - fi - - # Allow time for ExternalSecrets controller to sync with initialized OpenBao - echo "⏳ Allowing time for ExternalSecrets controller to sync with initialized OpenBao..." - sleep 15 - echo "✅ OpenBao initialization and ExternalSecrets sync complete" - - # Cleanup temporary files - rm -f /tmp/openbao_values.yaml /tmp/openbao_size_values.yaml /tmp/merged_values.yaml - - echo "✅ OpenBao deployed directly and initialized successfully" -} - -# Deploy Gitea directly using helm (cannot rely on ArgoCD initially) -deploy_gitea_directly() { - echo "" - echo "=== Direct Gitea Deployment ===" - echo "Gitea must be deployed directly since ArgoCD needs git repositories to function" - - # Create cf-gitea namespace first - echo "Creating cf-gitea namespace..." - kubectl create ns cf-gitea --dry-run=client -o yaml | kubectl apply -f - - - # Wait briefly for any ExternalSecrets to be processed now that OpenBao is initialized - echo "⏳ Waiting for ExternalSecrets to sync from OpenBao (if any)..." - sleep 10 - - # Generate password function (needed for manual secret creation) - generate_password() { - openssl rand -hex 16 | tr 'a-f' 'A-F' | head -c 32 - } - - # Create gitea-admin-credentials secret manually (ensures it exists regardless of ExternalSecret status) - echo "🔐 Creating gitea-admin-credentials secret..." 
- kubectl create secret generic gitea-admin-credentials \ - --namespace=cf-gitea \ - --from-literal=username=silogen-admin \ - --from-literal=password=$(generate_password) \ - --dry-run=client -o yaml | kubectl apply -f - - echo "✅ gitea-admin-credentials secret created" - - # Check if yq is available for value extraction - if command -v yq >/dev/null 2>&1; then - YQ_CMD="yq" - elif [ -f "$HOME/yq" ]; then - YQ_CMD="$HOME/yq" - else - echo "ERROR: yq command not found. Please install yq or place it in $HOME/yq" - exit 1 - fi - - # Extract Gitea version from values - echo "Extracting Gitea configuration..." - - # Create merged values for version extraction (similar to original bootstrap.sh) - local SIZE_VALUES_FILE="values_${CLUSTER_SIZE}.yaml" - if [ -n "$SIZE_VALUES_FILE" ] && [ -f "${SOURCE_ROOT}/root/${SIZE_VALUES_FILE}" ]; then - # Merge base values with size-specific overrides - $YQ_CMD eval-all '. as $item ireduce ({}; . * $item)' \ - "${SOURCE_ROOT}/root/${VALUES_FILE}" \ - "${SOURCE_ROOT}/root/${SIZE_VALUES_FILE}" | \ - $YQ_CMD eval ".global.domain = \"${DOMAIN}\"" > /tmp/merged_values.yaml - else - # Use base values only - cat "${SOURCE_ROOT}/root/${VALUES_FILE}" | $YQ_CMD ".global.domain = \"${DOMAIN}\"" > /tmp/merged_values.yaml - fi - - # Extract Gitea version from merged values - GITEA_VERSION=$($YQ_CMD eval '.apps.gitea.path' /tmp/merged_values.yaml | cut -d'/' -f2) - - if [ -z "$GITEA_VERSION" ]; then - echo "ERROR: Could not extract Gitea version from values" - exit 1 - fi - - echo "Using Gitea version: $GITEA_VERSION" - - # Extract Gitea-specific values for helm template - $YQ_CMD eval '.apps.gitea.valuesObject' "${SOURCE_ROOT}/root/${VALUES_FILE}" > /tmp/gitea_values.yaml - - if [ -f "${SOURCE_ROOT}/root/${SIZE_VALUES_FILE}" ]; then - $YQ_CMD eval '.apps.gitea.valuesObject' "${SOURCE_ROOT}/root/${SIZE_VALUES_FILE}" > /tmp/gitea_size_values.yaml - else - # Create empty file if size-specific values don't exist - echo "{}" > 
/tmp/gitea_size_values.yaml - fi - - # Deploy Gitea directly using helm template - echo "🚀 Deploying Gitea using helm template..." - helm template --release-name gitea "${SOURCE_ROOT}/sources/gitea/${GITEA_VERSION}" --namespace cf-gitea \ - -f /tmp/gitea_values.yaml \ - -f /tmp/gitea_size_values.yaml \ - --set gitea.config.server.ROOT_URL="https://gitea.${DOMAIN}/" \ - --kube-version="${KUBE_VERSION}" | kubectl apply -f - - - # Wait for Gitea deployment to be ready - echo "⏳ Waiting for Gitea deployment to be ready..." - if kubectl rollout status deploy/gitea -n cf-gitea --timeout=300s; then - echo "✅ Gitea deployment is ready" - else - echo "❌ ERROR: Gitea deployment failed to become ready" - echo "Check deployment status: kubectl get deployment gitea -n cf-gitea" - echo "Check logs: kubectl logs -l app=gitea -n cf-gitea" - exit 1 - fi - - # Cleanup temporary files - rm -f /tmp/gitea_values.yaml /tmp/gitea_size_values.yaml /tmp/merged_values.yaml - - echo "✅ Gitea deployed directly and ready for initialization" -} - -# Initialize Gitea after direct deployment -wait_for_gitea_and_initialize() { - echo "" - echo "=== Gitea Initialization ===" - echo "Gitea has been deployed directly - proceeding with initialization..." - - # Verify Gitea deployment exists (should already be deployed directly) - if ! kubectl get deployment gitea -n cf-gitea >/dev/null 2>&1; then - echo "❌ ERROR: Gitea deployment not found" - echo " This should not happen - deploy_gitea_directly should have created it" - exit 1 - fi - - echo "✅ Gitea Deployment confirmed" - - # Gitea should already be ready from direct deployment, but double-check - echo "⏳ Verifying Gitea is ready..." - if ! 
kubectl rollout status deploy/gitea -n cf-gitea --timeout=60s; then - echo "❌ ERROR: Gitea deployment not ready" - echo " Check deployment status: kubectl get deployment gitea -n cf-gitea" - echo " Check logs: kubectl logs -l app=gitea -n cf-gitea" - exit 1 - fi - - echo "✅ Gitea is ready for initialization" - - # Now run the Gitea initialization (extracted from original bootstrap.sh) - echo "" - echo "📦 Running Gitea initialization..." - - # Check if yq is available for value extraction (needed for init) - if command -v yq >/dev/null 2>&1; then - YQ_CMD="yq" - elif [ -f "$HOME/yq" ]; then - YQ_CMD="$HOME/yq" - else - echo "ERROR: yq not found. Gitea initialization requires yq." - echo "Without Gitea initialization, ArgoCD will not have git repositories!" - exit 1 - fi - - # Extract Gitea values for initialization - echo "Extracting Gitea values for initialization..." - $YQ_CMD eval '.apps.gitea.valuesObject' "${SOURCE_ROOT}/root/${VALUES_FILE}" > /tmp/gitea_values.yaml - $YQ_CMD eval '.apps.gitea.valuesObject' "${SOURCE_ROOT}/root/${SIZE_VALUES_FILE}" > /tmp/gitea_size_values.yaml - - # Create merged values configmap (needed by gitea-init-job) - echo "Creating initial-cf-values configmap..." - - # Recreate merged values like original bootstrap (needed for gitea-init-job) - local SIZE_VALUES_FILE="values_${CLUSTER_SIZE}.yaml" - if [ -n "$SIZE_VALUES_FILE" ]; then - # Merge base values with size-specific overrides - VALUES=$($YQ_CMD eval-all '. as $item ireduce ({}; . 
* $item)' \ - "${SOURCE_ROOT}/root/${VALUES_FILE}" \ - "${SOURCE_ROOT}/root/${SIZE_VALUES_FILE}" | \ - $YQ_CMD eval ".global.domain = \"${DOMAIN}\"") - else - # Use base values only - VALUES=$(cat "${SOURCE_ROOT}/root/${VALUES_FILE}" | $YQ_CMD ".global.domain = \"${DOMAIN}\"") - fi - - # Apply the target revision override - VALUES=$(echo "$VALUES" | $YQ_CMD eval ".clusterForge.targetRevision = \"${TARGET_REVISION}\"") - - kubectl create configmap initial-cf-values --from-literal=initial-cf-values="$VALUES" --dry-run=client -o yaml | kubectl apply -n cf-gitea -f - - - # Note: gitea-admin-credentials secret already created in deploy_gitea_directly function - - # Run Gitea initialization job - echo "Deploying Gitea initialization job..." - helm template --release-name gitea-init "${SOURCE_ROOT}/scripts/init-gitea-job" \ - --set clusterSize="${SIZE_VALUES_FILE}" \ - --set domain="${DOMAIN}" \ - --set targetRevision="${TARGET_REVISION}" \ - --kube-version="${KUBE_VERSION}" | kubectl apply -f - - - # Wait for initialization to complete - echo "⏳ Waiting for Gitea initialization to complete..." - if kubectl wait --for=condition=complete --timeout=300s job/gitea-init-job -n cf-gitea; then - echo "✅ Gitea initialization completed successfully" - echo "📦 Git repositories are now available for ArgoCD" - else - echo "❌ ERROR: Gitea initialization timed out or failed" - echo " This is CRITICAL - ArgoCD needs git repositories to function!" 
- echo " Check job status: kubectl describe job gitea-init-job -n cf-gitea" - echo " Check logs: kubectl logs -l job-name=gitea-init-job -n cf-gitea" - exit 1 - fi - - # Cleanup temporary files - rm -f /tmp/gitea_values.yaml /tmp/gitea_size_values.yaml - - echo "📦 Gitea initialization phase complete" -} - -# NEW: Post-Bootstrap Status Check -show_bootstrap_summary() { - echo "=== ClusterForge Bootstrap Complete ===" - echo "" - echo "Domain: $DOMAIN" - echo "Cluster size: $CLUSTER_SIZE" - echo "Target revision: $TARGET_REVISION" - echo "" - echo "🌐 Access URLs:" - echo " ArgoCD: https://argocd.${DOMAIN}" - echo " Gitea: https://gitea.${DOMAIN}" - echo " OpenBao: https://openbao.${DOMAIN}" - echo "" - echo "📋 Next steps:" - echo " 1. Monitor ArgoCD applications: kubectl get apps -n argocd" - echo " 2. Check sync status: kubectl get apps -n argocd -o wide" - echo " 3. View ArgoCD UI for detailed deployment progress" - echo " 4. ArgoCD is syncing apps from target revision: $TARGET_REVISION" - echo " (Only apps enabled in that revision will be deployed)" - echo " 5. Gitea provides git repositories: https://gitea.${DOMAIN}" - echo " 6. OpenBao provides secrets management: https://openbao.${DOMAIN}" - echo " 7. Essential infrastructure (OpenBao, Gitea) is initialized" - echo "" - echo "🧹 Cleanup: Bootstrap manifests saved at:" - echo " - /tmp/cluster-forge-bootstrap.yaml (all apps rendered)" - echo " - /tmp/cluster-forge-parent-app.yaml (parent app only)" - echo "" - echo "This is the way! 
🚀" -} - -# Main execution flow -main() { - display_target_revision - setup_sources - setup_values_files - - # Run pre-cleanup (removing till refined) - # pre_cleanup - - # NEW APPROACH: Render locally, but only bootstrap ArgoCD + parent app - render_cluster_forge_manifests - bootstrap_argocd_managed_approach - - # Show final status - show_bootstrap_summary -} - -# Execute main function -main \ No newline at end of file From 1868d4fbb21915aeedcd359940bbafadf9e3895c Mon Sep 17 00:00:00 2001 From: Daniel Vaskivaara Date: Fri, 27 Feb 2026 22:32:12 +0200 Subject: [PATCH 056/115] chore: streamline openbao manifest applies --- root/values_openbao.yaml | 23 +++++++++++++++ scripts/bootstrap.sh | 64 ++++++++++++++++++++++++++++++++-------- 2 files changed, 75 insertions(+), 12 deletions(-) create mode 100644 root/values_openbao.yaml diff --git a/root/values_openbao.yaml b/root/values_openbao.yaml new file mode 100644 index 00000000..1dde2688 --- /dev/null +++ b/root/values_openbao.yaml @@ -0,0 +1,23 @@ +# Dedicated OpenBao values for bootstrap +# Extracted from the openbao.valuesObject section of values.yaml +injector: + enabled: false +server: + affinity: | + podAntiAffinity: + preferredDuringSchedulingIgnoredDuringExecution: + - weight: 100 + podAffinityTerm: + labelSelector: + matchLabels: + app.kubernetes.io/name: openbao + app.kubernetes.io/instance: openbao + component: server + topologyKey: kubernetes.io/hostname + ha: + enabled: false + raft: + enabled: false + replicas: 1 +ui: + enabled: true \ No newline at end of file diff --git a/scripts/bootstrap.sh b/scripts/bootstrap.sh index 670646c5..05676f5c 100755 --- a/scripts/bootstrap.sh +++ b/scripts/bootstrap.sh @@ -78,10 +78,10 @@ while [[ $# -gt 0 ]]; do $0 dev.example.com -s=small -r=$LATEST_RELEASE Bootstrap Behavior: - • Bootstrap deploys ArgoCD + Gitea directly (essential infrastructure) + • Bootstrap deploys ArgoCD + OpenBao + Gitea directly (essential infrastructure) • cluster-forge parent app then deployed 
to manage remaining apps - • ArgoCD syncs ALL apps from specified target revision - • OpenBao and other apps deploy via ArgoCD (not directly) + • ArgoCD syncs remaining apps from specified target revision + • Direct deployment ensures proper initialization order and timing HELP_OUTPUT exit 0 ;; @@ -221,11 +221,15 @@ extract_app_versions() { ARGOCD_VERSION=$(grep -A 5 "^ argocd:" "${SOURCE_ROOT}/root/${VALUES_FILE}" | \ grep "path:" | sed 's/.*argocd\///' | sed 's/ *$//') + # Extract OpenBao version from path like "sources/openbao/0.18.2" + OPENBAO_VERSION=$(grep -A 5 "^ openbao:" "${SOURCE_ROOT}/root/${VALUES_FILE}" | \ + grep "path:" | sed 's/.*openbao\///' | sed 's/ *$//') + # Extract Gitea version from path like "sources/gitea/12.3.0" GITEA_VERSION=$(grep -A 5 "^ gitea:" "${SOURCE_ROOT}/root/${VALUES_FILE}" | \ grep "path:" | sed 's/.*gitea\///' | sed 's/ *$//') - echo "Extracted versions - ArgoCD: $ARGOCD_VERSION, Gitea: $GITEA_VERSION" + echo "Extracted versions - ArgoCD: $ARGOCD_VERSION, OpenBao: $OPENBAO_VERSION, Gitea: $GITEA_VERSION" } # Note: clusterForge.targetRevision will be set by the gitea-init-job @@ -253,8 +257,43 @@ kubectl rollout status deploy/argocd-redis -n argocd kubectl rollout status deploy/argocd-repo-server -n argocd echo "" -echo "=== Skipping OpenBao Direct Deployment ===" -echo "OpenBao will be deployed via ArgoCD after cluster-forge parent app is applied" +echo "=== OpenBao Bootstrap ===" +echo "Deploying OpenBao directly to ensure initialization before dependent apps" + +# Create cf-openbao namespace +kubectl create ns cf-openbao --dry-run=client -o yaml | kubectl apply -f - + +# Deploy OpenBao using dedicated values file (no yq extraction needed) +helm template --release-name openbao ${SOURCE_ROOT}/sources/openbao/${OPENBAO_VERSION} --namespace cf-openbao \ + -f ${SOURCE_ROOT}/root/values_openbao.yaml \ + --set ui.enabled=true \ + --kube-version=${KUBE_VERSION} | kubectl apply --server-side --field-manager=argocd-controller 
--force-conflicts -f - + +# Wait for OpenBao pod to be running +echo "⏳ Waiting for OpenBao pod to be ready..." +kubectl wait --for=jsonpath='{.status.phase}'=Running pod/openbao-0 -n cf-openbao --timeout=300s + +# Deploy OpenBao initialization job directly (critical for bootstrap) +echo "🔐 Deploying OpenBao initialization job..." +helm template --release-name openbao-init ${SOURCE_ROOT}/scripts/init-openbao-job \ + -f ${SOURCE_ROOT}/root/values_openbao.yaml \ + --set domain="${DOMAIN}" \ + --kube-version=${KUBE_VERSION} | kubectl apply -f - + +# Wait for initialization to complete +echo "⏳ Waiting for OpenBao initialization to complete..." +kubectl wait --for=condition=complete --timeout=300s job/openbao-init-job -n cf-openbao + +# Deploy OpenBao configuration (CronJobs) directly after initialization +echo "🔧 Deploying OpenBao configuration (CronJobs for ongoing management)..." + +# Deploy the entire openbao-config chart efficiently +helm template --release-name openbao-config "${SOURCE_ROOT}/sources/openbao-config/0.1.0" \ + --namespace cf-openbao \ + --set domain="${DOMAIN}" \ + --kube-version="${KUBE_VERSION}" | kubectl apply -f - + +echo "✅ OpenBao deployed, initialized, and configured directly" echo "" echo "=== Gitea Bootstrap ===" generate_password() { @@ -347,15 +386,16 @@ Target revision: $TARGET_REVISION 🌐 Access URLs: ArgoCD: https://argocd.${DOMAIN} + OpenBao: https://openbao.${DOMAIN} Gitea: https://gitea.${DOMAIN} 📋 What happens now: - 1. ✅ ArgoCD is running and managing the cluster - 2. ✅ Gitea provides git repositories for ArgoCD - 3. 🎯 cluster-forge app will sync from: $TARGET_REVISION - 4. 📦 ArgoCD will deploy ALL enabled apps from target revision - 5. 🔄 OpenBao and other apps deploy via ArgoCD (not directly) - 6. ⚡ Sync waves ensure proper deployment order + 1. ✅ ArgoCD is running and managing the cluster + 2. ✅ OpenBao provides secrets management and is fully initialized + 3. ✅ Gitea provides git repositories for ArgoCD + 4. 
🎯 cluster-forge app will sync from: $TARGET_REVISION + 5. 📦 ArgoCD will deploy remaining enabled apps from target revision + 6. ⚡ Sync waves ensure proper deployment order for remaining apps 📋 Next steps: 1. Monitor ArgoCD applications: kubectl get apps -n argocd From b43683f4b74315e0fb1dd02eb9deea4ad50ba337 Mon Sep 17 00:00:00 2001 From: Daniel Vaskivaara Date: Fri, 27 Feb 2026 22:37:33 +0200 Subject: [PATCH 057/115] fix: openbao deployment flow --- scripts/bootstrap.sh | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/scripts/bootstrap.sh b/scripts/bootstrap.sh index 05676f5c..aa7fcc2b 100755 --- a/scripts/bootstrap.sh +++ b/scripts/bootstrap.sh @@ -273,6 +273,20 @@ helm template --release-name openbao ${SOURCE_ROOT}/sources/openbao/${OPENBAO_VE echo "⏳ Waiting for OpenBao pod to be ready..." kubectl wait --for=jsonpath='{.status.phase}'=Running pod/openbao-0 -n cf-openbao --timeout=300s +# Create the special init ConfigMaps that the init job expects +echo "🔧 Creating init-specific ConfigMaps for OpenBao initialization..." + +# Create the init version of secret manager scripts +cat "${SOURCE_ROOT}/sources/openbao-config/0.1.0/templates/openbao-secret-manager-cm.yaml" | \ + sed "s|name: openbao-secret-manager-scripts|name: openbao-secret-manager-scripts-init|g" | \ + kubectl apply -f - + +# Create the init version of secret definitions +cat "${SOURCE_ROOT}/sources/openbao-config/0.1.0/templates/openbao-secret-definitions.yaml" | \ + sed "s|{{ .Values.domain }}|${DOMAIN}|g" | \ + sed "s|name: openbao-secrets-config|name: openbao-secrets-init-config|g" | \ + kubectl apply -f - + # Deploy OpenBao initialization job directly (critical for bootstrap) echo "🔐 Deploying OpenBao initialization job..." 
helm template --release-name openbao-init ${SOURCE_ROOT}/scripts/init-openbao-job \ From 56bb57ab9ff0473dc1ae9e92b2a74cd914a96b35 Mon Sep 17 00:00:00 2001 From: Daniel Vaskivaara Date: Fri, 27 Feb 2026 22:45:30 +0200 Subject: [PATCH 058/115] chore: rm partial cleanup function --- scripts/bootstrap.sh | 68 +++++++++++++------------------------------- 1 file changed, 20 insertions(+), 48 deletions(-) diff --git a/scripts/bootstrap.sh b/scripts/bootstrap.sh index aa7fcc2b..8ce5b982 100755 --- a/scripts/bootstrap.sh +++ b/scripts/bootstrap.sh @@ -78,7 +78,8 @@ while [[ $# -gt 0 ]]; do $0 dev.example.com -s=small -r=$LATEST_RELEASE Bootstrap Behavior: - • Bootstrap deploys ArgoCD + OpenBao + Gitea directly (essential infrastructure) + • Bootstrap deploys essential infrastructure directly: + ArgoCD + Gateway API CRDs + OpenBao + Gitea • cluster-forge parent app then deployed to manage remaining apps • ArgoCD syncs remaining apps from specified target revision • Direct deployment ensures proper initialization order and timing @@ -160,50 +161,10 @@ setup_sources() { echo "Using local sources for target revision: $TARGET_REVISION" } -pre_cleanup() { - echo "" - echo "=== Pre-cleanup: Checking for previous runs ===" - - # Check if gitea-init-job exists and completed successfully - if kubectl get job gitea-init-job -n cf-gitea >/dev/null 2>&1; then - if kubectl get job gitea-init-job -n cf-gitea -o jsonpath='{.status.conditions[?(@.type=="Complete")].status}' 2>/dev/null | grep -q "True"; then - echo "Found completed gitea-init-job - removing Gitea to start fresh" - - # Delete all Gitea resources - kubectl delete job gitea-init-job -n cf-gitea --ignore-not-found=true - kubectl delete deployment gitea -n cf-gitea --ignore-not-found=true - kubectl delete statefulset gitea -n cf-gitea --ignore-not-found=true - kubectl delete service gitea -n cf-gitea --ignore-not-found=true - kubectl delete service gitea-http -n cf-gitea --ignore-not-found=true - kubectl delete service gitea-ssh 
-n cf-gitea --ignore-not-found=true - kubectl delete pvc -n cf-gitea -l app.kubernetes.io/name=gitea --ignore-not-found=true - kubectl delete configmap initial-cf-values -n cf-gitea --ignore-not-found=true - kubectl delete secret gitea-admin-credentials -n cf-gitea --ignore-not-found=true - kubectl delete ingress -n cf-gitea -l app.kubernetes.io/name=gitea --ignore-not-found=true - - echo "Gitea resources deleted" - fi - fi - - # Always delete openbao-init-job to allow re-initialization - kubectl delete job openbao-init-job -n cf-openbao --ignore-not-found=true - - # Delete temporary files - rm -f /tmp/merged_values.yaml /tmp/argocd_values.yaml /tmp/argocd_size_values.yaml \ - /tmp/openbao_values.yaml /tmp/openbao_size_values.yaml \ - /tmp/gitea_values.yaml /tmp/gitea_size_values.yaml - - echo "=== Pre-cleanup complete ===" - echo "" -} - display_target_revision setup_sources setup_values_files -# Run pre-cleanup -# pre_cleanup - echo "=== ClusterForge Bootstrap ===" echo "Domain: $DOMAIN" echo "Base values: $VALUES_FILE" @@ -229,7 +190,11 @@ extract_app_versions() { GITEA_VERSION=$(grep -A 5 "^ gitea:" "${SOURCE_ROOT}/root/${VALUES_FILE}" | \ grep "path:" | sed 's/.*gitea\///' | sed 's/ *$//') - echo "Extracted versions - ArgoCD: $ARGOCD_VERSION, OpenBao: $OPENBAO_VERSION, Gitea: $GITEA_VERSION" + # Extract Gateway API version from path like "sources/gateway-api/v1.3.0" + GATEWAY_API_VERSION=$(grep -A 5 "^ gateway-api:" "${SOURCE_ROOT}/root/${VALUES_FILE}" | \ + grep "path:" | head -1 | sed 's/.*gateway-api\///' | sed 's/ *$//') + + echo "Extracted versions - ArgoCD: $ARGOCD_VERSION, OpenBao: $OPENBAO_VERSION, Gitea: $GITEA_VERSION, Gateway-API: $GATEWAY_API_VERSION" } # Note: clusterForge.targetRevision will be set by the gitea-init-job @@ -263,6 +228,12 @@ echo "Deploying OpenBao directly to ensure initialization before dependent apps" # Create cf-openbao namespace kubectl create ns cf-openbao --dry-run=client -o yaml | kubectl apply -f - +# Deploy Gateway API 
CRDs (required for OpenBao HTTPRoute) +echo "🌐 Deploying Gateway API CRDs (required for routing)..." +echo " Note: ArgoCD will later adopt these CRDs into management" +helm template --release-name gateway-api "${SOURCE_ROOT}/sources/gateway-api/${GATEWAY_API_VERSION}" \ + --kube-version="${KUBE_VERSION}" | kubectl apply --server-side --field-manager=argocd-controller --force-conflicts -f - + # Deploy OpenBao using dedicated values file (no yq extraction needed) helm template --release-name openbao ${SOURCE_ROOT}/sources/openbao/${OPENBAO_VERSION} --namespace cf-openbao \ -f ${SOURCE_ROOT}/root/values_openbao.yaml \ @@ -404,12 +375,13 @@ Target revision: $TARGET_REVISION Gitea: https://gitea.${DOMAIN} 📋 What happens now: - 1. ✅ ArgoCD is running and managing the cluster - 2. ✅ OpenBao provides secrets management and is fully initialized - 3. ✅ Gitea provides git repositories for ArgoCD - 4. 🎯 cluster-forge app will sync from: $TARGET_REVISION - 5. 📦 ArgoCD will deploy remaining enabled apps from target revision - 6. ⚡ Sync waves ensure proper deployment order for remaining apps + 1. ✅ ArgoCD is running and managing the cluster + 2. ✅ Gateway API CRDs are installed for routing + 3. ✅ OpenBao provides secrets management and is fully initialized + 4. ✅ Gitea provides git repositories for ArgoCD + 5. 🎯 cluster-forge app will sync from: $TARGET_REVISION + 6. 📦 ArgoCD will deploy remaining enabled apps from target revision + 7. ⚡ Sync waves ensure proper deployment order for remaining apps 📋 Next steps: 1. 
Monitor ArgoCD applications: kubectl get apps -n argocd From 54b7a77f4a6a9856d360e58f50d61304e80c60fd Mon Sep 17 00:00:00 2001 From: Daniel Vaskivaara Date: Fri, 27 Feb 2026 23:03:17 +0200 Subject: [PATCH 059/115] refactor: simplify bootstrap script --- scripts/bootstrap.sh | 244 +++++++++++-------------------------------- 1 file changed, 61 insertions(+), 183 deletions(-) diff --git a/scripts/bootstrap.sh b/scripts/bootstrap.sh index 8ce5b982..e9598f16 100755 --- a/scripts/bootstrap.sh +++ b/scripts/bootstrap.sh @@ -74,12 +74,11 @@ while [[ $# -gt 0 ]]; do Examples: $0 compute.amd.com values_custom.yaml --cluster-size=large $0 112.100.97.17.nip.io - $0 dev.example.com --cluster-size=small --target-revision=$LATEST_RELEASE - $0 dev.example.com -s=small -r=$LATEST_RELEASE + $0 dev.example.com --cluster-size=small --target-revision=v1.8.0 + $0 dev.example.com -s=small -r=feature-branch Bootstrap Behavior: - • Bootstrap deploys essential infrastructure directly: - ArgoCD + Gateway API CRDs + OpenBao + Gitea + • Bootstrap deploys ArgoCD + OpenBao + Gitea directly (essential infrastructure) • cluster-forge parent app then deployed to manage remaining apps • ArgoCD syncs remaining apps from specified target revision • Direct deployment ensures proper initialization order and timing @@ -132,234 +131,108 @@ if [ ! -f "${SCRIPT_DIR}/../root/${VALUES_FILE}" ]; then exit 1 fi -# Check if size-specific values file exists -setup_values_files() { - SIZE_VALUES_FILE="values_${CLUSTER_SIZE}.yaml" - - if [ ! 
-f "${SOURCE_ROOT}/root/${SIZE_VALUES_FILE}" ]; then - echo "WARNING: Size-specific values file not found: ${SOURCE_ROOT}/root/${SIZE_VALUES_FILE}" - echo "Proceeding with base values file only: ${VALUES_FILE}" - SIZE_VALUES_FILE="" - else - echo "Using size-specific values file: ${SIZE_VALUES_FILE}" - fi -} - -display_target_revision() { - # Check if TARGET_REVISION was explicitly set via command line flag - # by comparing against the default value - if [ "$TARGET_REVISION" != "$LATEST_RELEASE" ]; then - echo "Using specified targetRevision: $TARGET_REVISION" - else - echo "Using default targetRevision: $TARGET_REVISION" - fi -} - -# Since we only support v1.8.0+, always use local sources -setup_sources() { - SOURCE_ROOT="${SCRIPT_DIR}/.." - echo "Using local sources for target revision: $TARGET_REVISION" -} - -display_target_revision -setup_sources -setup_values_files +SOURCE_ROOT="${SCRIPT_DIR}/.." +SIZE_VALUES_FILE="values_${CLUSTER_SIZE}.yaml" echo "=== ClusterForge Bootstrap ===" echo "Domain: $DOMAIN" echo "Base values: $VALUES_FILE" echo "Cluster size: $CLUSTER_SIZE" -if [ -n "$SIZE_VALUES_FILE" ]; then - echo "Size overlay: $SIZE_VALUES_FILE" -fi echo "Target revision: $TARGET_REVISION" -echo "" -echo "=== Starting Bootstrap Process ===" - -# Extract version information from app paths using sed/awk (no yq needed) -extract_app_versions() { - # Extract ArgoCD version from path like "sources/argocd/8.3.5" - ARGOCD_VERSION=$(grep -A 5 "^ argocd:" "${SOURCE_ROOT}/root/${VALUES_FILE}" | \ - grep "path:" | sed 's/.*argocd\///' | sed 's/ *$//') - - # Extract OpenBao version from path like "sources/openbao/0.18.2" - OPENBAO_VERSION=$(grep -A 5 "^ openbao:" "${SOURCE_ROOT}/root/${VALUES_FILE}" | \ - grep "path:" | sed 's/.*openbao\///' | sed 's/ *$//') - - # Extract Gitea version from path like "sources/gitea/12.3.0" - GITEA_VERSION=$(grep -A 5 "^ gitea:" "${SOURCE_ROOT}/root/${VALUES_FILE}" | \ - grep "path:" | sed 's/.*gitea\///' | sed 's/ *$//') - - # Extract 
Gateway API version from path like "sources/gateway-api/v1.3.0" - GATEWAY_API_VERSION=$(grep -A 5 "^ gateway-api:" "${SOURCE_ROOT}/root/${VALUES_FILE}" | \ - grep "path:" | head -1 | sed 's/.*gateway-api\///' | sed 's/ *$//') - - echo "Extracted versions - ArgoCD: $ARGOCD_VERSION, OpenBao: $OPENBAO_VERSION, Gitea: $GITEA_VERSION, Gateway-API: $GATEWAY_API_VERSION" -} - -# Note: clusterForge.targetRevision will be set by the gitea-init-job -# in the cluster-values repository (which overwrites the base values as the final values file) -echo "Target revision $TARGET_REVISION will be set in cluster-values repo by gitea-init-job" -# Extract version information from values -extract_app_versions +helm template cluster-forge "${SOURCE_ROOT}/root" \ + --show-only templates/cluster-forge.yaml \ + -f "${SOURCE_ROOT}/root/${VALUES_FILE}" \ + -f "${SOURCE_ROOT}/root/${SIZE_VALUES_FILE}" \ + --set global.domain="${DOMAIN}" \ + --set clusterForge.targetRevision="${TARGET_REVISION}" \ + --set externalValues.repoUrl="http://gitea-http.cf-gitea.svc:3000/cluster-org/cluster-values.git" \ + --set clusterForge.repoUrl="http://gitea-http.cf-gitea.svc:3000/cluster-org/cluster-forge.git" \ + --namespace argocd \ + --kube-version "${KUBE_VERSION}" | kubectl apply -f - +echo "" -# Create namespaces for direct deployments only +# Create namespaces kubectl create ns argocd --dry-run=client -o yaml | kubectl apply -f - kubectl create ns cf-gitea --dry-run=client -o yaml | kubectl apply -f - -# Note: cf-openbao namespace will be created by ArgoCD when it deploys OpenBao +kubectl create ns cf-openbao --dry-run=client -o yaml | kubectl apply -f - -echo "" +# ArgoCD bootstrap echo "=== ArgoCD Bootstrap ===" -# Deploy ArgoCD using dedicated values file (no yq extraction needed) -helm template --release-name argocd ${SOURCE_ROOT}/sources/argocd/${ARGOCD_VERSION} --namespace argocd \ +helm template --release-name argocd ${SOURCE_ROOT}/sources/argocd/8.3.5 --namespace argocd \ -f 
${SOURCE_ROOT}/root/values_argocd.yaml \ --set global.domain="argocd.${DOMAIN}" \ --kube-version=${KUBE_VERSION} | kubectl apply --server-side --field-manager=argocd-controller --force-conflicts -f - -kubectl rollout status statefulset/argocd-application-controller -n argocd -kubectl rollout status deploy/argocd-applicationset-controller -n argocd -kubectl rollout status deploy/argocd-redis -n argocd -kubectl rollout status deploy/argocd-repo-server -n argocd +kubectl rollout status statefulset/argocd-application-controller -n argocd --timeout=300s +kubectl rollout status deploy/argocd-applicationset-controller -n argocd --timeout=300s +kubectl rollout status deploy/argocd-redis -n argocd --timeout=300s +kubectl rollout status deploy/argocd-repo-server -n argocd --timeout=300s -echo "" +# OpenBao bootstrap echo "=== OpenBao Bootstrap ===" -echo "Deploying OpenBao directly to ensure initialization before dependent apps" - -# Create cf-openbao namespace -kubectl create ns cf-openbao --dry-run=client -o yaml | kubectl apply -f - - -# Deploy Gateway API CRDs (required for OpenBao HTTPRoute) -echo "🌐 Deploying Gateway API CRDs (required for routing)..." 
-echo " Note: ArgoCD will later adopt these CRDs into management" -helm template --release-name gateway-api "${SOURCE_ROOT}/sources/gateway-api/${GATEWAY_API_VERSION}" \ - --kube-version="${KUBE_VERSION}" | kubectl apply --server-side --field-manager=argocd-controller --force-conflicts -f - - -# Deploy OpenBao using dedicated values file (no yq extraction needed) -helm template --release-name openbao ${SOURCE_ROOT}/sources/openbao/${OPENBAO_VERSION} --namespace cf-openbao \ +helm template --release-name openbao ${SOURCE_ROOT}/sources/openbao/0.18.2 --namespace cf-openbao \ -f ${SOURCE_ROOT}/root/values_openbao.yaml \ --set ui.enabled=true \ --kube-version=${KUBE_VERSION} | kubectl apply --server-side --field-manager=argocd-controller --force-conflicts -f - - -# Wait for OpenBao pod to be running -echo "⏳ Waiting for OpenBao pod to be ready..." kubectl wait --for=jsonpath='{.status.phase}'=Running pod/openbao-0 -n cf-openbao --timeout=300s -# Create the special init ConfigMaps that the init job expects -echo "🔧 Creating init-specific ConfigMaps for OpenBao initialization..." - -# Create the init version of secret manager scripts -cat "${SOURCE_ROOT}/sources/openbao-config/0.1.0/templates/openbao-secret-manager-cm.yaml" | \ - sed "s|name: openbao-secret-manager-scripts|name: openbao-secret-manager-scripts-init|g" | \ - kubectl apply -f - - -# Create the init version of secret definitions -cat "${SOURCE_ROOT}/sources/openbao-config/0.1.0/templates/openbao-secret-definitions.yaml" | \ - sed "s|{{ .Values.domain }}|${DOMAIN}|g" | \ - sed "s|name: openbao-secrets-config|name: openbao-secrets-init-config|g" | \ - kubectl apply -f - - -# Deploy OpenBao initialization job directly (critical for bootstrap) -echo "🔐 Deploying OpenBao initialization job..." 
helm template --release-name openbao-init ${SOURCE_ROOT}/scripts/init-openbao-job \ -f ${SOURCE_ROOT}/root/values_openbao.yaml \ --set domain="${DOMAIN}" \ --kube-version=${KUBE_VERSION} | kubectl apply -f - - -# Wait for initialization to complete -echo "⏳ Waiting for OpenBao initialization to complete..." kubectl wait --for=condition=complete --timeout=300s job/openbao-init-job -n cf-openbao -# Deploy OpenBao configuration (CronJobs) directly after initialization -echo "🔧 Deploying OpenBao configuration (CronJobs for ongoing management)..." - -# Deploy the entire openbao-config chart efficiently -helm template --release-name openbao-config "${SOURCE_ROOT}/sources/openbao-config/0.1.0" \ - --namespace cf-openbao \ - --set domain="${DOMAIN}" \ - --kube-version="${KUBE_VERSION}" | kubectl apply -f - - -echo "✅ OpenBao deployed, initialized, and configured directly" -echo "" +# Gitea bootstrap echo "=== Gitea Bootstrap ===" generate_password() { openssl rand -hex 16 | tr 'a-f' 'A-F' | head -c 32 } -# Create gitea admin credentials secret -kubectl create secret generic gitea-admin-credentials \ - --namespace=cf-gitea \ - --from-literal=username=silogen-admin \ - --from-literal=password=$(generate_password) \ - --dry-run=client -o yaml | kubectl apply -f - - -# Create initial-cf-values configmap with basic values for gitea-init-job -# Use simple shell variables instead of merged YAML +# Create initial-cf-values configmap (simple values for gitea-init-job) cat > /tmp/simple_values.yaml << EOF global: domain: ${DOMAIN} - clusterSize: values_${CLUSTER_SIZE}.yaml + clusterSize: ${SIZE_VALUES_FILE} clusterForge: targetRevision: ${TARGET_REVISION} EOF kubectl create configmap initial-cf-values --from-literal=initial-cf-values="$(cat /tmp/simple_values.yaml)" --dry-run=client -o yaml | kubectl apply -n cf-gitea -f - -# Bootstrap Gitea using dedicated values file (no yq extraction needed) -helm template --release-name gitea ${SOURCE_ROOT}/sources/gitea/${GITEA_VERSION} 
--namespace cf-gitea \ +kubectl create secret generic gitea-admin-credentials \ + --namespace=cf-gitea \ + --from-literal=username=silogen-admin \ + --from-literal=password=$(generate_password) \ + --dry-run=client -o yaml | kubectl apply -f - + +helm template --release-name gitea ${SOURCE_ROOT}/sources/gitea/12.3.0 --namespace cf-gitea \ -f ${SOURCE_ROOT}/root/values_gitea.yaml \ --set clusterDomain="${DOMAIN}" \ --set gitea.config.server.ROOT_URL="https://gitea.${DOMAIN}/" \ --kube-version=${KUBE_VERSION} | kubectl apply -f - -kubectl rollout status deploy/gitea -n cf-gitea +kubectl rollout status deploy/gitea -n cf-gitea --timeout=300s -# Gitea Init Job helm template --release-name gitea-init ${SOURCE_ROOT}/scripts/init-gitea-job \ - --set clusterSize="${SIZE_VALUES_FILE:-values_${CLUSTER_SIZE}.yaml}" \ + --set clusterSize="${SIZE_VALUES_FILE}" \ --set domain="${DOMAIN}" \ --set targetRevision="${TARGET_REVISION}" \ - --kube-version=${KUBE_VERSION} \ - | kubectl apply -f - - + --kube-version=${KUBE_VERSION} | kubectl apply -f - kubectl wait --for=condition=complete --timeout=300s job/gitea-init-job -n cf-gitea -echo "" -echo "=== Creating ClusterForge Parent App-of-Apps ===" -echo "Cluster size: $CLUSTER_SIZE" +# Create cluster-forge parent app only (not all apps) +echo "=== Creating ClusterForge Parent App ===" echo "Target revision: $TARGET_REVISION" -# Create minimal values for rendering only the cluster-forge parent app -cat > /tmp/cluster_forge_values.yaml < Date: Fri, 27 Feb 2026 23:06:14 +0200 Subject: [PATCH 060/115] fix: ordering issue --- scripts/bootstrap.sh | 51 ++------------------------------------------ 1 file changed, 2 insertions(+), 49 deletions(-) diff --git a/scripts/bootstrap.sh b/scripts/bootstrap.sh index e9598f16..dad4fbd8 100755 --- a/scripts/bootstrap.sh +++ b/scripts/bootstrap.sh @@ -140,16 +140,6 @@ echo "Base values: $VALUES_FILE" echo "Cluster size: $CLUSTER_SIZE" echo "Target revision: $TARGET_REVISION" -helm template cluster-forge 
"${SOURCE_ROOT}/root" \ - --show-only templates/cluster-forge.yaml \ - -f "${SOURCE_ROOT}/root/${VALUES_FILE}" \ - -f "${SOURCE_ROOT}/root/${SIZE_VALUES_FILE}" \ - --set global.domain="${DOMAIN}" \ - --set clusterForge.targetRevision="${TARGET_REVISION}" \ - --set externalValues.repoUrl="http://gitea-http.cf-gitea.svc:3000/cluster-org/cluster-values.git" \ - --set clusterForge.repoUrl="http://gitea-http.cf-gitea.svc:3000/cluster-org/cluster-forge.git" \ - --namespace argocd \ - --kube-version "${KUBE_VERSION}" | kubectl apply -f - echo "" # Create namespaces @@ -219,6 +209,8 @@ helm template --release-name gitea-init ${SOURCE_ROOT}/scripts/init-gitea-job \ --kube-version=${KUBE_VERSION} | kubectl apply -f - kubectl wait --for=condition=complete --timeout=300s job/gitea-init-job -n cf-gitea + + # Create cluster-forge parent app only (not all apps) echo "=== Creating ClusterForge Parent App ===" echo "Target revision: $TARGET_REVISION" @@ -233,42 +225,3 @@ helm template cluster-forge "${SOURCE_ROOT}/root" \ --set clusterForge.repoUrl="http://gitea-http.cf-gitea.svc:3000/cluster-org/cluster-forge.git" \ --namespace argocd \ --kube-version "${KUBE_VERSION}" | kubectl apply -f - - -cat <<__SUMMARY__ - -=== ClusterForge Bootstrap Complete === - -Domain: $DOMAIN -Cluster size: $CLUSTER_SIZE -Target revision: $TARGET_REVISION - -🌐 Access URLs: - ArgoCD: https://argocd.${DOMAIN} - OpenBao: https://openbao.${DOMAIN} - Gitea: https://gitea.${DOMAIN} - - Credentials: - ArgoCD admin username: admin - ArgoCD admin password: (check argocd-initial-admin-secret in argocd namespace) - OpenBao token: (check openbao-initial-admin-secret in cf-openbao namespace - Gitea admin username: silogen-admin - Gitea admin password: (check gitea-admin-credentials secret in cf-gitea namespace) - -📋 What happens now: - 1. ✅ ArgoCD is running and managing the cluster - 2. ✅ OpenBao provides secrets management and is fully initialized - 3. 
✅ Gitea provides git source of truth ArgoCD (unless cluster size is small) - 4. 🎯 cluster-forge app will sync from: $TARGET_REVISION - 5. 📦 ArgoCD will deploy remaining enabled apps from target revision - 6. ⚡ Sync waves ensure proper deployment order for remaining apps - -📋 Next steps: - 1. Monitor ArgoCD applications: kubectl get apps -n argocd - 2. Check sync status: kubectl get apps -n argocd -o wide - 3. View ArgoCD UI for detailed deployment progress - -This is the way! 🚀 -__SUMMARY__ - -# Cleanup temporary files -rm -f /tmp/simple_values.yaml From 95818fab97828db60df32e07c12ed35dc825290d Mon Sep 17 00:00:00 2001 From: Daniel Vaskivaara Date: Sat, 28 Feb 2026 22:46:38 +0200 Subject: [PATCH 061/115] feat: allow parent cluster-forge app to bootstrap openbao (thereby simplify bootstrap logic) --- root/values.yaml | 12 +- root/values_argocd.yaml | 58 ----- root/values_gitea.yaml | 31 --- root/values_openbao.yaml | 23 -- scripts/bootstrap.sh | 486 +++++++++++++++++++++++---------------- 5 files changed, 300 insertions(+), 310 deletions(-) delete mode 100644 root/values_argocd.yaml delete mode 100644 root/values_gitea.yaml delete mode 100644 root/values_openbao.yaml diff --git a/root/values.yaml b/root/values.yaml index 754ec14f..c3c9f1d3 100644 --- a/root/values.yaml +++ b/root/values.yaml @@ -49,6 +49,7 @@ enabledApps: - minio-tenant - minio-tenant-config - openbao + - openbao-init - openbao-config - opentelemetry-operator - otel-lgtm-stack @@ -170,7 +171,7 @@ apps: replicas: 1 ui: enabled: true - syncWave: -40 + syncWave: -70 ignoreDifferences: - group: "apps" kind: "Deployment" @@ -181,6 +182,15 @@ apps: name: "openbao" jsonPointers: - /spec/volumeClaimTemplates + openbao-init: + path: ../scripts/init-openbao-job + namespace: cf-openbao + valuesObject: + domain: # to be filled by cluster-forge app + helmParameters: + - name: domain + value: "{{ .Values.global.domain }}" + syncWave: -60 openbao-config: path: openbao-config/0.1.0 namespace: cf-openbao diff --git 
a/root/values_argocd.yaml b/root/values_argocd.yaml deleted file mode 100644 index 649c37a0..00000000 --- a/root/values_argocd.yaml +++ /dev/null @@ -1,58 +0,0 @@ -# Dedicated ArgoCD values for bootstrap -# Extracted from the argocd.valuesObject section of values.yaml -applicationSet: - replicas: 1 -configs: - cm: - create: true - resource.customizations.health.opentelemetry.io_OpenTelemetryCollector: | - hs = {} - hs.status = "Healthy" - hs.message = "Always Healthy - See https://github.com/argoproj/argo-cd/pull/24284" - return hs - resource.customizations.health.keda.sh_ScaledObject: | - hs = {} - if obj.status ~= nil then - if obj.status.conditions ~= nil then - for _, condition in ipairs(obj.status.conditions) do - if condition.type == "Ready" then - if condition.status == "True" then - hs.status = "Healthy" - hs.message = "ScaledObject is ready" - else - hs.status = "Degraded" - hs.message = condition.reason or "ScaledObject not ready" - end - return hs - end - end - end - hs.status = "Progressing" - hs.message = "ScaledObject status unknown" - else - hs.status = "Progressing" - hs.message = "ScaledObject status unknown" - end - return hs - params: - server.insecure: true - rbac: - create: true - policy.csv: | - g, argocd-users, role:admin -controller: - replicas: 1 -redis: - enabled: true -redis-ha: - enabled: false -repoServer: - replicas: 1 - autoscaling: - enabled: false -server: - replicas: 1 - autoscaling: - enabled: false -global: - domain: # to be set by bootstrap script \ No newline at end of file diff --git a/root/values_gitea.yaml b/root/values_gitea.yaml deleted file mode 100644 index f2abd256..00000000 --- a/root/values_gitea.yaml +++ /dev/null @@ -1,31 +0,0 @@ -# Dedicated Gitea values for bootstrap -# Extracted from the gitea.valuesObject section of values.yaml -clusterDomain: # to be set by bootstrap script -strategy: - type: "Recreate" -gitea: - admin: - existingSecret: gitea-admin-credentials - config: - server: - ROOT_URL: # to be set by 
bootstrap script - database: - DB_TYPE: sqlite3 - session: - PROVIDER: memory - cache: - ADAPTER: memory - queue: - TYPE: level -valkey-cluster: - enabled: false -valkey: - enabled: false -postgresql: - enabled: false -postgresql-ha: - enabled: false -persistence: - enabled: true -test: - enabled: false \ No newline at end of file diff --git a/root/values_openbao.yaml b/root/values_openbao.yaml deleted file mode 100644 index 1dde2688..00000000 --- a/root/values_openbao.yaml +++ /dev/null @@ -1,23 +0,0 @@ -# Dedicated OpenBao values for bootstrap -# Extracted from the openbao.valuesObject section of values.yaml -injector: - enabled: false -server: - affinity: | - podAntiAffinity: - preferredDuringSchedulingIgnoredDuringExecution: - - weight: 100 - podAffinityTerm: - labelSelector: - matchLabels: - app.kubernetes.io/name: openbao - app.kubernetes.io/instance: openbao - component: server - topologyKey: kubernetes.io/hostname - ha: - enabled: false - raft: - enabled: false - replicas: 1 -ui: - enabled: true \ No newline at end of file diff --git a/scripts/bootstrap.sh b/scripts/bootstrap.sh index dad4fbd8..1d4fb8d7 100755 --- a/scripts/bootstrap.sh +++ b/scripts/bootstrap.sh @@ -2,226 +2,318 @@ set -euo pipefail -SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" - -# Initialize variables LATEST_RELEASE="v1.8.0" -TARGET_REVISION="$LATEST_RELEASE" +# Initialize variables CLUSTER_SIZE="medium" # Default to medium +DEFAULT_TIMEOUT="5m" DOMAIN="" KUBE_VERSION=1.33 +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +TARGET_REVISION="$LATEST_RELEASE" +TEMPLATE_ONLY=false VALUES_FILE="values.yaml" +APPS="" -# Parse arguments -while [[ $# -gt 0 ]]; do - case $1 in - --CLUSTER-SIZE|--cluster-size|-s) - if [ -z "$2" ]; then - echo "ERROR: --cluster-size requires an argument" - exit 1 - fi - CLUSTER_SIZE="$2" - shift 2 - ;; - --CLUSTER-SIZE=*) - CLUSTER_SIZE="${1#*=}" - shift - ;; - --cluster-size=*) - CLUSTER_SIZE="${1#*=}" - shift +parse_args() { + # Parse 
arguments + while [[ $# -gt 0 ]]; do + case $1 in + --CLUSTER-SIZE|--cluster-size|-s) + if [ -z "$2" ]; then + echo "ERROR: --cluster-size requires an argument" + exit 1 + fi + CLUSTER_SIZE="$2" + shift 2 + ;; + --CLUSTER-SIZE=*) + CLUSTER_SIZE="${1#*=}" + shift + ;; + --cluster-size=*) + CLUSTER_SIZE="${1#*=}" + shift + ;; + -s=*) + CLUSTER_SIZE="${1#*=}" + shift + ;; + --TARGET-REVISION|--target-revision|-r) + if [ -z "$2" ]; then + echo "WARNING: defaulting to --target-revision=$LATEST_RELEASE (no value specified)" + TARGET_REVISION="$LATEST_RELEASE" + shift + else + TARGET_REVISION="$2" + shift 2 + fi + ;; + --TARGET-REVISION=*) + TARGET_REVISION="${1#*=}" + shift + ;; + --target-revision=*) + TARGET_REVISION="${1#*=}" + shift + ;; + -r=*) + TARGET_REVISION="${1#*=}" + shift + ;; + --template-only|-t) + TEMPLATE_ONLY=true + shift + ;; + --apps=*) + APPS="${1#*=}" + TEMPLATE_ONLY=true + shift + ;; + --help|-h) + cat < [values_file] + + Arguments: + domain Required. Cluster domain (e.g., example.com) + values_file Optional. 
Values .yaml file to use, default: root/values.yaml + + Options: + --apps=APP1,APP2 Render only specified components (implies --template-only) + options: namespaces, argocd, gitea, cluster-forge, or any cluster-forge child app (see values.yaml for app names) + --cluster-size, -s options: [small|medium|large], default: medium + --target-revision, -r cluster-forge git revision for ArgoCD to sync from + options: [tag|commit_hash|branch_name], default: $LATEST_RELEASE + --template-only, -t Output YAML manifests to stdout instead of applying to cluster + + + Examples: + $0 compute.amd.com values_custom.yaml --cluster-size=large + $0 112.100.97.17.nip.io + $0 dev.example.com --cluster-size=small --target-revision=v1.8.0 + $0 dev.example.com -s=small -r=feature-branch + + Bootstrap Behavior: + • Bootstrap deploys ArgoCD + Gitea directly (essential infrastructure) + • cluster-forge parent app then deployed to manage remaining apps including OpenBao + • ArgoCD syncs remaining apps from specified target revision with proper syncWave ordering + • Direct deployment ensures proper initialization order and timing +HELP_OUTPUT + exit 0 ;; - -s=*) - CLUSTER_SIZE="${1#*=}" - shift + --*) + echo "ERROR: Unknown option: $1" + echo "Use --help for usage information" + exit 1 ;; - --TARGET-REVISION|--target-revision|-r) - if [ -z "$2" ]; then - echo "WARNING: defaulting to --target-revision=$LATEST_RELEASE (no value specified)" - TARGET_REVISION="$LATEST_RELEASE" - shift + *) + # Positional arguments + if [ -z "$DOMAIN" ]; then + DOMAIN="$1" + elif [ "$VALUES_FILE" = "values.yaml" ]; then + VALUES_FILE="$1" else - TARGET_REVISION="$2" - shift 2 + echo "ERROR: Too many arguments: $1" + echo "Usage: $0 [--CLUSTER_SIZE=small|medium|large] [--dev] [values_file]" + exit 1 fi - ;; - --TARGET-REVISION=*) - TARGET_REVISION="${1#*=}" - shift - ;; - --target-revision=*) - TARGET_REVISION="${1#*=}" shift ;; - -r=*) - TARGET_REVISION="${1#*=}" - shift - ;; - --help|-h) - cat < [values_file] - - 
Arguments: - domain Required. Cluster domain (e.g., example.com) - values_file Optional. Values .yaml file to use, default: root/values.yaml - - Options: - -r, --target-revision cluster-forge git revision for ArgoCD to sync from - options: [tag|commit_hash|branch_name], default: $LATEST_RELEASE - -s, --cluster-size options: [small|medium|large], default: medium - - Examples: - $0 compute.amd.com values_custom.yaml --cluster-size=large - $0 112.100.97.17.nip.io - $0 dev.example.com --cluster-size=small --target-revision=v1.8.0 - $0 dev.example.com -s=small -r=feature-branch - - Bootstrap Behavior: - • Bootstrap deploys ArgoCD + OpenBao + Gitea directly (essential infrastructure) - • cluster-forge parent app then deployed to manage remaining apps - • ArgoCD syncs remaining apps from specified target revision - • Direct deployment ensures proper initialization order and timing -HELP_OUTPUT - exit 0 - ;; - --*) - echo "ERROR: Unknown option: $1" - echo "Use --help for usage information" + esac + done +} + +validate_args() { + # Validate required arguments + if [ -z "$DOMAIN" ]; then + echo "ERROR: Domain argument is required" + echo "Usage: $0 [values_file] [--CLUSTER_SIZE=small|medium|large]" + echo "Use --help for more details" exit 1 + fi + + # Validate cluster size + case "$CLUSTER_SIZE" in + small|medium|large) ;; *) - # Positional arguments - if [ -z "$DOMAIN" ]; then - DOMAIN="$1" - elif [ "$VALUES_FILE" = "values.yaml" ]; then - VALUES_FILE="$1" - else - echo "ERROR: Too many arguments: $1" - echo "Usage: $0 [--CLUSTER_SIZE=small|medium|large] [--dev] [values_file]" - exit 1 - fi - shift + echo "ERROR: Invalid cluster size '$CLUSTER_SIZE'" + echo "Valid sizes: small, medium, large" + exit 1 ;; esac -done - -# Validate required arguments -if [ -z "$DOMAIN" ]; then - echo "ERROR: Domain argument is required" - echo "Usage: $0 [values_file] [--CLUSTER_SIZE=small|medium|large]" - echo "Use --help for more details" - exit 1 -fi - -# Validate cluster size -case 
"$CLUSTER_SIZE" in - small|medium|large) - ;; - *) - echo "ERROR: Invalid cluster size '$CLUSTER_SIZE'" - echo "Valid sizes: small, medium, large" - exit 1 - ;; -esac - -# Validate values file exists -if [ ! -f "${SCRIPT_DIR}/../root/${VALUES_FILE}" ]; then - echo "ERROR: Values file not found: ${SCRIPT_DIR}/../root/${VALUES_FILE}" - exit 1 -fi - -SOURCE_ROOT="${SCRIPT_DIR}/.." -SIZE_VALUES_FILE="values_${CLUSTER_SIZE}.yaml" - -echo "=== ClusterForge Bootstrap ===" -echo "Domain: $DOMAIN" -echo "Base values: $VALUES_FILE" -echo "Cluster size: $CLUSTER_SIZE" -echo "Target revision: $TARGET_REVISION" - -echo "" + + # Validate values file exists + if [ ! -f "${SCRIPT_DIR}/../root/${VALUES_FILE}" ]; then + echo "ERROR: Values file not found: ${SCRIPT_DIR}/../root/${VALUES_FILE}" + exit 1 + fi + + SOURCE_ROOT="${SCRIPT_DIR}/.." + SIZE_VALUES_FILE="values_${CLUSTER_SIZE}.yaml" +} + +print_summary() { + # Don't print summary if just outputting templates + # if [ "$TEMPLATE_ONLY" = true ]; then + # return + # fi + + cat < /tmp/argocd_bootstrap_values.yaml << EOF +global: + domain: argocd.${DOMAIN} +EOF + + # Extract and merge ArgoCD values from the apps structure + yq eval '.apps.argocd.valuesObject' "${SOURCE_ROOT}/root/${VALUES_FILE}" >> /tmp/argocd_bootstrap_values.yaml + if [ -f "${SOURCE_ROOT}/root/${SIZE_VALUES_FILE}" ]; then + if yq eval '.apps.argocd.valuesObject // ""' "${SOURCE_ROOT}/root/${SIZE_VALUES_FILE}" | grep -q .; then + yq eval '.apps.argocd.valuesObject' "${SOURCE_ROOT}/root/${SIZE_VALUES_FILE}" | yq eval-all 'select(fileIndex == 0) * select(fileIndex == 1)' /tmp/argocd_bootstrap_values.yaml - > /tmp/argocd_bootstrap_values_merged.yaml + mv /tmp/argocd_bootstrap_values_merged.yaml /tmp/argocd_bootstrap_values.yaml + fi + fi +} # ArgoCD bootstrap -echo "=== ArgoCD Bootstrap ===" -helm template --release-name argocd ${SOURCE_ROOT}/sources/argocd/8.3.5 --namespace argocd \ - -f ${SOURCE_ROOT}/root/values_argocd.yaml \ - --set 
global.domain="argocd.${DOMAIN}" \ - --kube-version=${KUBE_VERSION} | kubectl apply --server-side --field-manager=argocd-controller --force-conflicts -f - -kubectl rollout status statefulset/argocd-application-controller -n argocd --timeout=300s -kubectl rollout status deploy/argocd-applicationset-controller -n argocd --timeout=300s -kubectl rollout status deploy/argocd-redis -n argocd --timeout=300s -kubectl rollout status deploy/argocd-repo-server -n argocd --timeout=300s - -# OpenBao bootstrap -echo "=== OpenBao Bootstrap ===" -helm template --release-name openbao ${SOURCE_ROOT}/sources/openbao/0.18.2 --namespace cf-openbao \ - -f ${SOURCE_ROOT}/root/values_openbao.yaml \ - --set ui.enabled=true \ - --kube-version=${KUBE_VERSION} | kubectl apply --server-side --field-manager=argocd-controller --force-conflicts -f - -kubectl wait --for=jsonpath='{.status.phase}'=Running pod/openbao-0 -n cf-openbao --timeout=300s - -helm template --release-name openbao-init ${SOURCE_ROOT}/scripts/init-openbao-job \ - -f ${SOURCE_ROOT}/root/values_openbao.yaml \ - --set domain="${DOMAIN}" \ - --kube-version=${KUBE_VERSION} | kubectl apply -f - -kubectl wait --for=condition=complete --timeout=300s job/openbao-init-job -n cf-openbao - -# Gitea bootstrap -echo "=== Gitea Bootstrap ===" -generate_password() { - openssl rand -hex 16 | tr 'a-f' 'A-F' | head -c 32 +bootstrap_argocd() { + echo "=== ArgoCD Bootstrap ===" + extract_argocd_values + helm template --release-name argocd ${SOURCE_ROOT}/sources/argocd/8.3.5 --namespace argocd \ + --values /tmp/argocd_bootstrap_values.yaml \ + --kube-version=${KUBE_VERSION} | apply_or_template --server-side --field-manager=argocd-controller --force-conflicts -f - + if [ "$TEMPLATE_ONLY" = false ]; then + kubectl rollout status statefulset/argocd-application-controller -n argocd --timeout="${DEFAULT_TIMEOUT}" + kubectl rollout status deploy/argocd-applicationset-controller -n argocd --timeout="${DEFAULT_TIMEOUT}" + kubectl rollout status 
deploy/argocd-redis -n argocd --timeout="${DEFAULT_TIMEOUT}" + kubectl rollout status deploy/argocd-repo-server -n argocd --timeout="${DEFAULT_TIMEOUT}" + fi } -# Create initial-cf-values configmap (simple values for gitea-init-job) -cat > /tmp/simple_values.yaml << EOF -global: - domain: ${DOMAIN} - clusterSize: ${SIZE_VALUES_FILE} -clusterForge: - targetRevision: ${TARGET_REVISION} +# OpenBao is now deployed by ArgoCD with syncWave -70/-60 + +# Extract Gitea values using yq +extract_gitea_values() { + # Create temporary values file for Gitea bootstrap + cat > /tmp/gitea_bootstrap_values.yaml << EOF +clusterDomain: ${DOMAIN} +gitea: + config: + server: + ROOT_URL: https://gitea.${DOMAIN}/ EOF + + # Extract and merge Gitea values from the apps structure + yq eval '.apps.gitea.valuesObject' "${SOURCE_ROOT}/root/${VALUES_FILE}" >> /tmp/gitea_bootstrap_values.yaml + if [ -f "${SOURCE_ROOT}/root/${SIZE_VALUES_FILE}" ]; then + if yq eval '.apps.gitea.valuesObject // ""' "${SOURCE_ROOT}/root/${SIZE_VALUES_FILE}" | grep -q .; then + yq eval '.apps.gitea.valuesObject' "${SOURCE_ROOT}/root/${SIZE_VALUES_FILE}" | yq eval-all 'select(fileIndex == 0) * select(fileIndex == 1)' /tmp/gitea_bootstrap_values.yaml - > /tmp/gitea_bootstrap_values_merged.yaml + mv /tmp/gitea_bootstrap_values_merged.yaml /tmp/gitea_bootstrap_values.yaml + fi + fi +} + +bootstrap_gitea() { + # Gitea bootstrap + echo "=== Gitea Bootstrap ===" + generate_password() { + openssl rand -hex 16 | tr 'a-f' 'A-F' | head -c 32 + } + + # Create initial-cf-values configmap (simple values for gitea-init-job) + cat > /tmp/simple_values.yaml << EOF + global: + domain: ${DOMAIN} + clusterSize: ${SIZE_VALUES_FILE} + clusterForge: + targetRevision: ${TARGET_REVISION} +EOF + + kubectl create configmap initial-cf-values --from-literal=initial-cf-values="$(cat /tmp/simple_values.yaml)" --dry-run=client -o yaml | apply_or_template -n cf-gitea -f - + + kubectl create secret generic gitea-admin-credentials \ + 
--namespace=cf-gitea \ + --from-literal=username=silogen-admin \ + --from-literal=password=$(generate_password) \ + --dry-run=client -o yaml | apply_or_template -f - + + extract_gitea_values + helm template --release-name gitea ${SOURCE_ROOT}/sources/gitea/12.3.0 --namespace cf-gitea \ + --values /tmp/gitea_bootstrap_values.yaml \ + --kube-version=${KUBE_VERSION} | apply_or_template -f - + + helm template --release-name gitea-init ${SOURCE_ROOT}/scripts/init-gitea-job \ + --set clusterSize="${SIZE_VALUES_FILE}" \ + --set domain="${DOMAIN}" \ + --set targetRevision="${TARGET_REVISION}" \ + --kube-version=${KUBE_VERSION} | apply_or_template -f - + if [ "$TEMPLATE_ONLY" = false ]; then + kubectl rollout status deploy/gitea -n cf-gitea --timeout="${DEFAULT_TIMEOUT}" + kubectl wait --for=condition=complete --timeout="${DEFAULT_TIMEOUT}" job/gitea-init-job -n cf-gitea + fi +} + +apply_cluster_forge_parent_app() { + # Create cluster-forge parent app only (not all apps) + echo "=== Creating ClusterForge Parent App ===" + echo "Target revision: $TARGET_REVISION" + + helm template cluster-forge "${SOURCE_ROOT}/root" \ + --show-only templates/cluster-forge.yaml \ + --values "${SOURCE_ROOT}/root/${VALUES_FILE}" \ + --values "${SOURCE_ROOT}/root/${SIZE_VALUES_FILE}" \ + --set global.domain="${DOMAIN}" \ + --set clusterForge.targetRevision="${TARGET_REVISION}" \ + --set externalValues.repoUrl="http://gitea-http.cf-gitea.svc:3000/cluster-org/cluster-values.git" \ + --set clusterForge.repoUrl="http://gitea-http.cf-gitea.svc:3000/cluster-org/cluster-forge.git" \ + --namespace argocd \ + --kube-version "${KUBE_VERSION}" | apply_or_template -f - +} + +main() { + parse_args "$@" + validate_args + print_summary + should_run namespaces && create_namespaces + should_run argocd && bootstrap_argocd + should_run gitea && bootstrap_gitea + should_run cluster-forge && apply_cluster_forge_parent_app +} -kubectl create configmap initial-cf-values --from-literal=initial-cf-values="$(cat 
/tmp/simple_values.yaml)" --dry-run=client -o yaml | kubectl apply -n cf-gitea -f - - -kubectl create secret generic gitea-admin-credentials \ - --namespace=cf-gitea \ - --from-literal=username=silogen-admin \ - --from-literal=password=$(generate_password) \ - --dry-run=client -o yaml | kubectl apply -f - - -helm template --release-name gitea ${SOURCE_ROOT}/sources/gitea/12.3.0 --namespace cf-gitea \ - -f ${SOURCE_ROOT}/root/values_gitea.yaml \ - --set clusterDomain="${DOMAIN}" \ - --set gitea.config.server.ROOT_URL="https://gitea.${DOMAIN}/" \ - --kube-version=${KUBE_VERSION} | kubectl apply -f - -kubectl rollout status deploy/gitea -n cf-gitea --timeout=300s - -helm template --release-name gitea-init ${SOURCE_ROOT}/scripts/init-gitea-job \ - --set clusterSize="${SIZE_VALUES_FILE}" \ - --set domain="${DOMAIN}" \ - --set targetRevision="${TARGET_REVISION}" \ - --kube-version=${KUBE_VERSION} | kubectl apply -f - -kubectl wait --for=condition=complete --timeout=300s job/gitea-init-job -n cf-gitea - - - -# Create cluster-forge parent app only (not all apps) -echo "=== Creating ClusterForge Parent App ===" -echo "Target revision: $TARGET_REVISION" - -helm template cluster-forge "${SOURCE_ROOT}/root" \ - --show-only templates/cluster-forge.yaml \ - -f "${SOURCE_ROOT}/root/${VALUES_FILE}" \ - -f "${SOURCE_ROOT}/root/${SIZE_VALUES_FILE}" \ - --set global.domain="${DOMAIN}" \ - --set clusterForge.targetRevision="${TARGET_REVISION}" \ - --set externalValues.repoUrl="http://gitea-http.cf-gitea.svc:3000/cluster-org/cluster-values.git" \ - --set clusterForge.repoUrl="http://gitea-http.cf-gitea.svc:3000/cluster-org/cluster-forge.git" \ - --namespace argocd \ - --kube-version "${KUBE_VERSION}" | kubectl apply -f - +main "$@" \ No newline at end of file From 91f4a9cb98a21b495a746a9717532371bf9a3c58 Mon Sep 17 00:00:00 2001 From: Daniel Vaskivaara Date: Sat, 28 Feb 2026 22:54:56 +0200 Subject: [PATCH 062/115] feat(bootstrap.sh): check for dependencies with guidance for resolving 
--- scripts/bootstrap.sh | 127 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 127 insertions(+) diff --git a/scripts/bootstrap.sh b/scripts/bootstrap.sh index 1d4fb8d7..28dd671f 100755 --- a/scripts/bootstrap.sh +++ b/scripts/bootstrap.sh @@ -14,6 +14,125 @@ TARGET_REVISION="$LATEST_RELEASE" TEMPLATE_ONLY=false VALUES_FILE="values.yaml" APPS="" +SKIP_DEPENDENCY_CHECK=false + +# Check for required dependencies +check_dependencies() { + local missing_deps=() + local all_good=true + + echo "=== Checking Dependencies ===" + + # Define required programs with installation instructions + declare -A REQUIRED_PROGRAMS=( + ["kubectl"]="Kubernetes CLI - https://kubernetes.io/docs/tasks/tools/install-kubectl/" + ["helm"]="Helm package manager - https://helm.sh/docs/intro/install/" + ["yq"]="YAML/JSON processor - https://github.com/mikefarah/yq#install" + ["openssl"]="OpenSSL for password generation - Usually pre-installed or via package manager" + ) + + # Define optional programs (used by shell builtins but good to check) + declare -A OPTIONAL_PROGRAMS=( + ["cat"]="cat command - Usually pre-installed" + ["grep"]="grep command - Usually pre-installed" + ["tr"]="tr command - Usually pre-installed" + ["head"]="head command - Usually pre-installed" + ) + + # Check required programs with version info + for program in "${!REQUIRED_PROGRAMS[@]}"; do + if command -v "$program" >/dev/null 2>&1; then + case "$program" in + "kubectl") + version=$(kubectl version --client 2>/dev/null | head -n1 | cut -d' ' -f3 2>/dev/null || echo "unknown") + printf " ✓ %-12s %s (%s)\n" "$program" "$(command -v "$program")" "$version" + ;; + "helm") + version=$(helm version --short --client 2>/dev/null | cut -d'+' -f1 2>/dev/null || echo "unknown") + printf " ✓ %-12s %s (%s)\n" "$program" "$(command -v "$program")" "$version" + ;; + "yq") + version=$(yq --version 2>/dev/null | head -n1 | cut -d' ' -f4 2>/dev/null || echo "unknown") + printf " ✓ %-12s %s (%s)\n" "$program" "$(command -v 
"$program")" "$version" + ;; + *) + printf " ✓ %-12s %s\n" "$program" "$(command -v "$program")" + ;; + esac + else + printf " ✗ %-12s MISSING\n" "$program" + missing_deps+=("$program") + all_good=false + fi + done + + # Check optional programs (warn but don't fail) + for program in "${!OPTIONAL_PROGRAMS[@]}"; do + if command -v "$program" >/dev/null 2>&1; then + printf " ✓ %-12s %s\n" "$program" "$(command -v "$program")" + else + printf " ! %-12s MISSING (usually pre-installed)\n" "$program" + fi + done + + # If any required dependencies are missing, show installation instructions + if [ "$all_good" = false ]; then + echo "" + echo "ERROR: Missing required dependencies!" + echo "" + echo "Please install the following programs:" + echo "" + + for dep in "${missing_deps[@]}"; do + echo " $dep: ${REQUIRED_PROGRAMS[$dep]}" + echo "" + + # Provide platform-specific installation hints + case "$dep" in + "kubectl") + echo " # Linux:" + echo " curl -LO \"https://dl.k8s.io/release/\$(curl -L -s https://dl.k8s.io/release/stable.txt)/bin/linux/amd64/kubectl\"" + echo " sudo install -o root -g root -m 0755 kubectl /usr/local/bin/kubectl" + echo "" + echo " # macOS:" + echo " brew install kubectl" + echo "" + echo " # Or download from: https://kubernetes.io/docs/tasks/tools/install-kubectl/" + ;; + "helm") + echo " # Linux/macOS:" + echo " curl https://raw.githubusercontent.com/helm/helm/main/scripts/get-helm-3 | bash" + echo "" + echo " # Or via package manager:" + echo " # Linux: snap install helm --classic" + echo " # macOS: brew install helm" + ;; + "yq") + echo " # Linux:" + echo " sudo wget -qO /usr/local/bin/yq https://github.com/mikefarah/yq/releases/latest/download/yq_linux_amd64" + echo " sudo chmod +x /usr/local/bin/yq" + echo "" + echo " # macOS:" + echo " brew install yq" + ;; + "openssl") + echo " # Linux:" + echo " # Ubuntu/Debian: sudo apt-get install openssl" + echo " # RHEL/CentOS: sudo yum install openssl" + echo "" + echo " # macOS: Usually pre-installed, 
or: brew install openssl" + ;; + esac + echo "" + done + + echo "After installing the missing dependencies, please run this script again." + exit 1 + fi + + echo " ✓ All required dependencies are available!" + echo "" +} parse_args() { # Parse arguments @@ -65,6 +184,10 @@ parse_args() { TEMPLATE_ONLY=true shift ;; + --skip-deps) + SKIP_DEPENDENCY_CHECK=true + shift + ;; --apps=*) APPS="${1#*=}" TEMPLATE_ONLY=true @@ -85,6 +208,7 @@ parse_args() { --target-revision, -r cluster-forge git revision for ArgoCD to sync from options: [tag|commit_hash|branch_name], default: $LATEST_RELEASE --template-only, -t Output YAML manifests to stdout instead of applying to cluster + --skip-deps Skip dependency checking (for advanced users) Examples: @@ -308,6 +432,9 @@ apply_cluster_forge_parent_app() { main() { parse_args "$@" + if [ "$SKIP_DEPENDENCY_CHECK" = false ]; then + check_dependencies + fi validate_args print_summary should_run namespaces && create_namespaces From 4a5f5b0fc6b537d987a393ca9c4be6b9d27b3cc2 Mon Sep 17 00:00:00 2001 From: Daniel Vaskivaara Date: Sat, 28 Feb 2026 23:00:59 +0200 Subject: [PATCH 063/115] feat: enable child app rendering --- root/templates/cluster-apps.yaml | 2 +- scripts/bootstrap.sh | 111 +++++++++++++++++++++++++++++-- 2 files changed, 108 insertions(+), 5 deletions(-) diff --git a/root/templates/cluster-apps.yaml b/root/templates/cluster-apps.yaml index 54d1a2d9..b0d79d11 100644 --- a/root/templates/cluster-apps.yaml +++ b/root/templates/cluster-apps.yaml @@ -30,7 +30,7 @@ spec: {{- end }} {{- if .valuesObject }} values: | -{{ .valuesObject | toYaml | nindent 8 }} + {{ .valuesObject | toYaml | nindent 8 }} {{- end }} {{- if .helmParameters }} parameters: diff --git a/scripts/bootstrap.sh b/scripts/bootstrap.sh index 28dd671f..442d9d1f 100755 --- a/scripts/bootstrap.sh +++ b/scripts/bootstrap.sh @@ -413,6 +413,52 @@ EOF fi } +# Render specific cluster-forge child apps (for --apps filtering) +render_cluster_forge_child_apps() { + echo "=== 
Rendering ClusterForge Child Apps: ${APPS} ===" + + # Create a temporary values file with only the requested apps enabled + local temp_values="/tmp/filtered_values.yaml" + cat > "$temp_values" << EOF +global: + domain: ${DOMAIN} +enabledApps: [] +apps: {} +EOF + + # Copy specific app configurations from the main values + local IFS=',' + for app in $APPS; do + # Add to enabledApps list + yq eval ".enabledApps += [\"$app\"]" -i "$temp_values" + + # Copy app configuration if it exists in values.yaml + if yq eval ".apps | has(\"$app\")" "${SOURCE_ROOT}/root/${VALUES_FILE}" 2>/dev/null | grep -q "true"; then + yq eval ".apps[\"$app\"] = load(\"${SOURCE_ROOT}/root/${VALUES_FILE}\").apps[\"$app\"]" -i "$temp_values" + fi + + # Merge size-specific configuration if it exists + if [ -f "${SOURCE_ROOT}/root/${SIZE_VALUES_FILE}" ]; then + if yq eval ".apps | has(\"$app\")" "${SOURCE_ROOT}/root/${SIZE_VALUES_FILE}" 2>/dev/null | grep -q "true"; then + yq eval ".apps[\"$app\"] = (.apps[\"$app\"] // {}) * load(\"${SOURCE_ROOT}/root/${SIZE_VALUES_FILE}\").apps[\"$app\"]" -i "$temp_values" + fi + fi + done + + # Render only the cluster-apps template with filtered values + helm template cluster-forge "${SOURCE_ROOT}/root" \ + --show-only templates/cluster-apps.yaml \ + --values "$temp_values" \ + --set clusterForge.targetRevision="${TARGET_REVISION}" \ + --set externalValues.repoUrl="http://gitea-http.cf-gitea.svc:3000/cluster-org/cluster-values.git" \ + --set clusterForge.repoUrl="http://gitea-http.cf-gitea.svc:3000/cluster-org/cluster-forge.git" \ + --namespace argocd \ + --kube-version "${KUBE_VERSION}" | apply_or_template -f - + + # Clean up + rm -f "$temp_values" +} + apply_cluster_forge_parent_app() { # Create cluster-forge parent app only (not all apps) echo "=== Creating ClusterForge Parent App ===" @@ -430,6 +476,15 @@ apply_cluster_forge_parent_app() { --kube-version "${KUBE_VERSION}" | apply_or_template -f - } +# Check if requested apps are cluster-forge child apps 
+is_cluster_forge_child_app() { + local app="$1" + # Check if the app is defined in the values.yaml apps section + local app_config=$(yq eval ".apps[\"$app\"]" "${SOURCE_ROOT}/root/${VALUES_FILE}" 2>/dev/null) + [ "$app_config" != "null" ] && return 0 + return 1 +} + main() { parse_args "$@" if [ "$SKIP_DEPENDENCY_CHECK" = false ]; then @@ -437,10 +492,58 @@ main() { fi validate_args print_summary - should_run namespaces && create_namespaces - should_run argocd && bootstrap_argocd - should_run gitea && bootstrap_gitea - should_run cluster-forge && apply_cluster_forge_parent_app + + # If specific apps are requested, check if they're cluster-forge child apps + if [ -n "$APPS" ]; then + local has_bootstrap_apps=false + local has_child_apps=false + local child_apps="" + + IFS=',' read -ra APP_ARRAY <<< "$APPS" + for app in "${APP_ARRAY[@]}"; do + case "$app" in + namespaces|argocd|gitea|cluster-forge) + has_bootstrap_apps=true + ;; + *) + if is_cluster_forge_child_app "$app"; then + has_child_apps=true + if [ -z "$child_apps" ]; then + child_apps="$app" + else + child_apps="$child_apps,$app" + fi + else + echo "WARNING: Unknown app '$app'. 
Available bootstrap apps: namespaces, argocd, gitea, cluster-forge" + echo "Or specify any cluster-forge child app from values.yaml" + fi + ;; + esac + done + + # Handle bootstrap apps + if [ "$has_bootstrap_apps" = true ]; then + should_run namespaces && create_namespaces + should_run argocd && bootstrap_argocd + should_run gitea && bootstrap_gitea + should_run cluster-forge && apply_cluster_forge_parent_app + fi + + # Handle cluster-forge child apps + if [ "$has_child_apps" = true ]; then + # Temporarily set APPS to only child apps for the render function + local original_apps="$APPS" + APPS="$child_apps" + render_cluster_forge_child_apps + APPS="$original_apps" + fi + else + # Default behavior - run all bootstrap components + should_run namespaces && create_namespaces + should_run argocd && bootstrap_argocd + should_run gitea && bootstrap_gitea + should_run cluster-forge && apply_cluster_forge_parent_app + fi } main "$@" \ No newline at end of file From 7d3537d9d80e02a7d0ec66675b0a64f95fd6f733 Mon Sep 17 00:00:00 2001 From: Daniel Vaskivaara Date: Sat, 28 Feb 2026 23:09:14 +0200 Subject: [PATCH 064/115] fix: gitea-init-job race condition --- .../templates/cf-init-gitea-cm.yaml | 357 +++++++++++++----- .../templates/cf-init-gitea-job.yaml | 2 + 2 files changed, 264 insertions(+), 95 deletions(-) diff --git a/scripts/init-gitea-job/templates/cf-init-gitea-cm.yaml b/scripts/init-gitea-job/templates/cf-init-gitea-cm.yaml index 36286875..78b4de82 100644 --- a/scripts/init-gitea-job/templates/cf-init-gitea-cm.yaml +++ b/scripts/init-gitea-job/templates/cf-init-gitea-cm.yaml @@ -4,7 +4,7 @@ metadata: name: gitea-init-scripts namespace: cf-gitea data: - init-gitea.sh: | + init-gitea.sh: |+ #!/bin/bash set -e @@ -12,73 +12,181 @@ data: GITEA_URL="${GITEA_URL:-http://gitea-http.cf-gitea.svc:3000}" GITEA_ADMIN_USER="${GITEA_ADMIN_USER:-silogen-admin}" + # Retry function with exponential backoff + retry_with_backoff() { + local max_attempts=$1 + local delay=$2 + local 
operation_name=$3 + shift 3 + local attempt=1 + + while [ $attempt -le $max_attempts ]; do + echo "[$operation_name] Attempt $attempt/$max_attempts..." + + if "$@"; then + echo "[$operation_name] Success on attempt $attempt" + return 0 + else + if [ $attempt -eq $max_attempts ]; then + echo "[$operation_name] Failed after $max_attempts attempts" + return 1 + fi + + echo "[$operation_name] Failed, waiting ${delay}s before retry..." + sleep $delay + delay=$((delay * 2)) # Exponential backoff + attempt=$((attempt + 1)) + fi + done + } + + # Wait for Gitea deployment to be ready + wait_for_gitea_ready() { + echo "Waiting for Gitea deployment to be ready..." + + # Wait for deployment rollout to complete + if ! kubectl rollout status deploy/gitea -n cf-gitea --timeout=300s; then + echo "ERROR: Gitea deployment failed to become ready" + return 1 + fi + + # Wait for Gitea service to be responsive + local gitea_ready=false + local attempts=0 + local max_attempts=30 + + while [ $attempts -lt $max_attempts ] && [ "$gitea_ready" = false ]; do + echo "Checking if Gitea API is responsive (attempt $((attempts + 1))/$max_attempts)..." + + if curl -s --connect-timeout 5 --max-time 10 "$GITEA_URL/api/v1/version" >/dev/null 2>&1; then + gitea_ready=true + echo "Gitea API is ready!" + else + echo "Gitea API not ready yet, waiting 10s..." 
+ sleep 10 + attempts=$((attempts + 1)) + fi + done + + if [ "$gitea_ready" = false ]; then + echo "ERROR: Gitea API did not become ready after $((max_attempts * 10)) seconds" + return 1 + fi + + return 0 + } + + # Function to create admin access token with retry + create_admin_token() { + kubectl -n cf-gitea exec deploy/gitea -c gitea -- \ + gitea admin user generate-access-token --raw --username "$GITEA_ADMIN_USER" --token-name "api-token-$(date +%s)" --scopes all + } + + echo "=== Gitea Initialization Started ===" + echo "Domain: $DOMAIN" + echo "Gitea URL: $GITEA_URL" + echo "Admin User: $GITEA_ADMIN_USER" + echo "" + + # Step -1: Wait for Gitea to be ready + if ! wait_for_gitea_ready; then + echo "FATAL: Gitea is not ready, aborting initialization" + exit 1 + fi + echo "Step 0: Create admin access token" - GITEA_TOKEN=$(kubectl -n cf-gitea exec deploy/gitea -c gitea -- \ - gitea admin user generate-access-token --raw --username "$GITEA_ADMIN_USER" --token-name "api-token-$(date +%s)" --scopes all - ) - kubectl create secret generic gitea-admin-token --from-literal=token="${GITEA_TOKEN}" --namespace=cf-gitea --dry-run=client -o yaml | kubectl apply -f - + if retry_with_backoff 5 3 "Create admin token" create_admin_token; then + GITEA_TOKEN=$(kubectl -n cf-gitea exec deploy/gitea -c gitea -- \ + gitea admin user generate-access-token --raw --username "$GITEA_ADMIN_USER" --token-name "api-token-$(date +%s)" --scopes all + ) + kubectl create secret generic gitea-admin-token --from-literal=token="${GITEA_TOKEN}" --namespace=cf-gitea --dry-run=client -o yaml | kubectl apply -f - + echo "Admin token created successfully" + else + echo "FATAL: Failed to create admin token" + exit 1 + fi + + # Function to create organization + create_organization() { + local response=$(curl -s -w "%{http_code}" -o /tmp/org_response.json -X POST "${GITEA_URL}/api/v1/orgs" \ + -H "Authorization: token ${GITEA_TOKEN}" \ + -H "Content-Type: application/json" \ + -d '{ + "username": 
"cluster-org", + "full_name": "Cluster Organization", + "description": "Organization for cluster management repositories", + "visibility": "public" + }') + + case $response in + 201|409) # 201=created, 409=already exists + echo "Organization 'cluster-org' ready" + return 0 + ;; + *) + echo "Failed to create organization (HTTP $response)" + cat /tmp/org_response.json 2>/dev/null || true + return 1 + ;; + esac + } echo "Step 1: Creating organization 'cluster-org'..." - curl -X POST "${GITEA_URL}/api/v1/orgs" \ - -H "Authorization: token ${GITEA_TOKEN}" \ - -H "Content-Type: application/json" \ - -d '{ - "username": "cluster-org", - "full_name": "Cluster Organization", - "description": "Organization for cluster management repositories", - "visibility": "public" - }' || echo "Failed to create organization, might already exist" + if ! retry_with_backoff 3 5 "Create organization" create_organization; then + echo "FATAL: Failed to create organization after retries" + exit 1 + fi + + # Function to check if repository exists + check_repo_exists() { + curl -s -f -H "Authorization: token ${GITEA_TOKEN}" "${GITEA_URL}/api/v1/repos/cluster-org/cluster-forge" >/dev/null 2>&1 + } + + # Function to migrate repository + migrate_repository() { + local HTTP_CODE=$(curl -s -w "%{http_code}" -o /tmp/migration_response.json -X POST "${GITEA_URL}/api/v1/repos/migrate" \ + -H "Authorization: token ${GITEA_TOKEN}" \ + -H "Content-Type: application/json" \ + -d '{ + "clone_addr": "https://github.com/silogen/cluster-forge.git", + "repo_name": "cluster-forge", + "repo_owner": "cluster-org", + "service": "git", + "mirror": true, + "mirror_interval": "15m", + "private": false + }') + + case $HTTP_CODE in + 201) + echo "Repository migration completed successfully" + return 0 + ;; + 409) + echo "Repository already exists" + return 0 + ;; + *) + echo "Migration failed with HTTP $HTTP_CODE" + cat /tmp/migration_response.json 2>/dev/null || true + + # Clean up failed repository + echo "Cleaning up 
failed repository..." + curl -s -X DELETE "${GITEA_URL}/api/v1/repos/cluster-org/cluster-forge" \ + -H "Authorization: token ${GITEA_TOKEN}" >/dev/null 2>&1 + return 1 + ;; + esac + } echo "Step 2: Creating repository 'cluster-forge' as mirror..." - if curl -s -f -H "Authorization: token ${GITEA_TOKEN}" "${GITEA_URL}/api/v1/repos/cluster-org/cluster-forge" >/dev/null 2>&1; then + if check_repo_exists; then echo "Repository 'cluster-forge' already exists" else - MAX_ATTEMPTS=5 - ATTEMPT=1 - SUCCESS=false - - while [ $ATTEMPT -le $MAX_ATTEMPTS ] && [ "$SUCCESS" = false ]; do - echo "Migration attempt $ATTEMPT/$MAX_ATTEMPTS..." - - HTTP_CODE=$(curl -s -w "%{http_code}" -o /tmp/migration_response.json -X POST "${GITEA_URL}/api/v1/repos/migrate" \ - -H "Authorization: token ${GITEA_TOKEN}" \ - -H "Content-Type: application/json" \ - -d '{ - "clone_addr": "https://github.com/silogen/cluster-forge.git", - "repo_name": "cluster-forge", - "repo_owner": "cluster-org", - "service": "git", - "mirror": true, - "mirror_interval": "15m", - "private": false - }') - - case $HTTP_CODE in - 201) - echo "Repository migration completed successfully" - SUCCESS=true - ;; - *) - echo "Attempt $ATTEMPT failed with HTTP $HTTP_CODE, retrying..." - echo "Cleaning up failed repository..." - sleep 1 - curl -s -X DELETE "${GITEA_URL}/api/v1/repos/cluster-org/cluster-forge" \ - -H "Authorization: token ${GITEA_TOKEN}" >/dev/null 2>&1 - echo "Failed repository deleted..." - ;; - esac - - if [ "$SUCCESS" = false ] && [ $ATTEMPT -lt $MAX_ATTEMPTS ]; then - sleep 5 - fi - - ATTEMPT=$((ATTEMPT + 1)) - done - - if [ "$SUCCESS" = false ]; then - echo "ERROR: Failed to create mirror repository after $MAX_ATTEMPTS attempts" - exit 1 - fi + if ! 
retry_with_backoff 5 5 "Migrate repository" migrate_repository; then + echo "FATAL: Failed to create mirror repository after retries" + exit 1 + fi fi # set mirror default branch (--dev mode) @@ -89,19 +197,39 @@ data: -d '{"default_branch": "{{ .Values.targetRevision }}"}' fi + # Function to create cluster-values repository + create_cluster_values_repo() { + local response=$(curl -s -w "%{http_code}" -o /tmp/repo_response.json -X POST "${GITEA_URL}/api/v1/orgs/cluster-org/repos" \ + -H "Authorization: token ${GITEA_TOKEN}" \ + -H "Content-Type: application/json" \ + -d '{ + "name": "cluster-values", + "description": "Cluster configuration values repository", + "private": false, + "auto_init": true + }') + + case $response in + 201|409) # 201=created, 409=already exists + echo "Repository 'cluster-values' ready" + return 0 + ;; + *) + echo "Failed to create cluster-values repository (HTTP $response)" + cat /tmp/repo_response.json 2>/dev/null || true + return 1 + ;; + esac + } + echo "Step 3: Creating repository 'cluster-values'..." - curl -X POST "${GITEA_URL}/api/v1/orgs/cluster-org/repos" \ - -H "Authorization: token ${GITEA_TOKEN}" \ - -H "Content-Type: application/json" \ - -d '{ - "name": "cluster-values", - "description": "Cluster configuration values repository", - "private": false, - "auto_init": true - }' || echo "Repository might already exist" - - echo "Step 4: Creating user 'devuser'..." - curl -X POST "${GITEA_URL}/api/v1/admin/users" \ + if ! retry_with_backoff 3 5 "Create cluster-values repo" create_cluster_values_repo; then + echo "FATAL: Failed to create cluster-values repository after retries" + exit 1 + fi + + echo "Step 4: Creating user 'devuser' (optional)..." 
+ response=$(curl -s -w "%{http_code}" -o /tmp/user_response.json -X POST "${GITEA_URL}/api/v1/admin/users" \ -H "Authorization: token ${GITEA_TOKEN}" \ -H "Content-Type: application/json" \ -d '{ @@ -111,20 +239,36 @@ data: "full_name": "Dev User", "must_change_password": false, "send_notify": false - }' || echo "User creation failed, might already exist or insufficient permissions" + }') + + case $response in + 201) echo "User 'devuser' created successfully" ;; + 422) echo "User 'devuser' already exists" ;; + *) echo "User creation failed (HTTP $response) - continuing anyway" ;; + esac echo "Step 5: Getting organization 'cluster-org' owners team id..." OWNERS_TEAM_ID=$(curl -s -H "Authorization: token ${GITEA_TOKEN}" -H "Content-Type: application/json" \ "${GITEA_URL}/api/v1/orgs/cluster-org/teams" | \ - jq -r '.[] | select(.name == "Owners") | .id') + jq -r '.[] | select(.name == "Owners") | .id' 2>/dev/null) - echo "Step 6: Adding user 'devuser' to organization 'cluster-org' owners..." - curl -X PUT "${GITEA_URL}/api/v1/teams/${OWNERS_TEAM_ID}/members/devuser" \ - -H "Authorization: token ${GITEA_TOKEN}" \ - -H "Content-Type: application/json" || echo "Failed to add user to organization" + if [ -n "$OWNERS_TEAM_ID" ] && [ "$OWNERS_TEAM_ID" != "null" ]; then + echo "Step 6: Adding user 'devuser' to organization 'cluster-org' owners..." + response=$(curl -s -w "%{http_code}" -o /dev/null -X PUT "${GITEA_URL}/api/v1/teams/${OWNERS_TEAM_ID}/members/devuser" \ + -H "Authorization: token ${GITEA_TOKEN}" \ + -H "Content-Type: application/json") + + case $response in + 200|204) echo "User 'devuser' added to organization successfully" ;; + *) echo "Failed to add user to organization (HTTP $response) - continuing anyway" ;; + esac + else + echo "Could not find owners team ID - skipping user organization assignment" + fi - echo "Step 7: Creating values.yaml file with cluster-forge reference in cluster-values repo..." 
- cat > /tmp/values.yaml << 'EOF' + # Function to create values.yaml file + create_values_file() { + cat > /tmp/values.yaml << 'VALUESEOF' clusterForge: repoURL: http://gitea-http.cf-gitea.svc:3000/cluster-org/cluster-forge.git path: root @@ -134,18 +278,41 @@ data: global: clusterSize: {{ .Values.clusterSize }} domain: DOMAIN_PLACEHOLDER - - EOF - - sed -i "s/DOMAIN_PLACEHOLDER/${DOMAIN}/g" /tmp/values.yaml - - curl -X POST "${GITEA_URL}/api/v1/repos/cluster-org/cluster-values/contents/values.yaml" \ - -H "Authorization: token ${GITEA_TOKEN}" \ - -H "Content-Type: application/json" \ - -d '{ - "message": "Initialize cluster values configuration", - "content": "'$(base64 -w 0 < /tmp/values.yaml)'", - "branch": "main" - }' || echo "Failed to create values.yaml file" + VALUESEOF + + sed -i "s/DOMAIN_PLACEHOLDER/${DOMAIN}/g" /tmp/values.yaml + + local encoded_content=$(base64 -w 0 < /tmp/values.yaml) + local response=$(curl -s -w "%{http_code}" -o /tmp/values_response.json -X POST "${GITEA_URL}/api/v1/repos/cluster-org/cluster-values/contents/values.yaml" \ + -H "Authorization: token ${GITEA_TOKEN}" \ + -H "Content-Type: application/json" \ + -d "{ + \"message\": \"Initialize cluster values configuration\", + \"content\": \"$encoded_content\", + \"branch\": \"main\" + }") + + case $response in + 201) + echo "Values.yaml file created successfully" + return 0 + ;; + 422) + echo "Values.yaml file already exists" + return 0 + ;; + *) + echo "Failed to create values.yaml file (HTTP $response)" + cat /tmp/values_response.json 2>/dev/null || true + return 1 + ;; + esac + } - echo "Setup completed successfully!" + echo "Step 7: Creating values.yaml file with cluster-forge reference in cluster-values repo..." + if retry_with_backoff 3 5 "Create values.yaml file" create_values_file; then + echo "=== Gitea Setup Completed Successfully! 
===" + else + echo "WARNING: Failed to create values.yaml file, but core setup is complete" + echo "=== Gitea Setup Completed with Warnings ===" + fi diff --git a/scripts/init-gitea-job/templates/cf-init-gitea-job.yaml b/scripts/init-gitea-job/templates/cf-init-gitea-job.yaml index 0da19d31..15c12aa9 100644 --- a/scripts/init-gitea-job/templates/cf-init-gitea-job.yaml +++ b/scripts/init-gitea-job/templates/cf-init-gitea-job.yaml @@ -4,6 +4,8 @@ metadata: name: gitea-init-job namespace: cf-gitea spec: + backoffLimit: 2 + activeDeadlineSeconds: 1200 # 20 minutes total timeout template: spec: restartPolicy: Never From 51fb6c0e7d78c9fcc4f2be9c4f3bbff10d8bc786 Mon Sep 17 00:00:00 2001 From: Daniel Vaskivaara Date: Sat, 28 Feb 2026 23:11:28 +0200 Subject: [PATCH 065/115] fix: inject cluster size values file --- scripts/bootstrap.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/scripts/bootstrap.sh b/scripts/bootstrap.sh index 442d9d1f..7492c54d 100755 --- a/scripts/bootstrap.sh +++ b/scripts/bootstrap.sh @@ -469,6 +469,7 @@ apply_cluster_forge_parent_app() { --values "${SOURCE_ROOT}/root/${VALUES_FILE}" \ --values "${SOURCE_ROOT}/root/${SIZE_VALUES_FILE}" \ --set global.domain="${DOMAIN}" \ + --set global.clusterSize="${SIZE_VALUES_FILE}" \ --set clusterForge.targetRevision="${TARGET_REVISION}" \ --set externalValues.repoUrl="http://gitea-http.cf-gitea.svc:3000/cluster-org/cluster-values.git" \ --set clusterForge.repoUrl="http://gitea-http.cf-gitea.svc:3000/cluster-org/cluster-forge.git" \ From b133d92da21a3f41349c5a9bab1ffa0ae33acde2 Mon Sep 17 00:00:00 2001 From: Daniel Vaskivaara Date: Sun, 1 Mar 2026 09:04:05 +0200 Subject: [PATCH 066/115] fix: openbao health check --- root/values.yaml | 35 +++++++++++++ .../templates/cf-init-openbao-cm.yaml | 50 ++++++++++++++----- 2 files changed, 72 insertions(+), 13 deletions(-) diff --git a/root/values.yaml b/root/values.yaml index c3c9f1d3..2d62670e 100644 --- a/root/values.yaml +++ b/root/values.yaml @@ -95,6 +95,41 @@ 
apps: hs.message = "ScaledObject status unknown" end return hs + resource.customizations.health.apps_StatefulSet: | + -- Custom health check for OpenBao StatefulSet + -- Ensures both StatefulSet is ready AND initialization is complete + if obj.metadata.name == "openbao" and obj.metadata.namespace == "cf-openbao" then + -- Check StatefulSet readiness + if obj.status and obj.status.readyReplicas and obj.status.replicas then + if obj.status.readyReplicas == obj.status.replicas then + -- StatefulSet is ready, now check if initialization completed + local kubectl = io.popen("kubectl get secret openbao-user -n cf-openbao --ignore-not-found -o name 2>/dev/null") + local secret_exists = kubectl:read("*line") + kubectl:close() + + if secret_exists and secret_exists ~= "" then + hs = {} + hs.status = "Healthy" + hs.message = "OpenBao StatefulSet ready and initialization complete" + return hs + else + hs = {} + hs.status = "Progressing" + hs.message = "OpenBao StatefulSet ready but initialization still in progress" + return hs + end + end + end + + -- Default StatefulSet health check + hs = {} + hs.status = "Progressing" + hs.message = "OpenBao StatefulSet not ready" + return hs + else + -- Default StatefulSet health check for non-OpenBao StatefulSets + return nil + end params: server.insecure: true rbac: diff --git a/scripts/init-openbao-job/templates/cf-init-openbao-cm.yaml b/scripts/init-openbao-job/templates/cf-init-openbao-cm.yaml index a3de50c3..bc1db2d9 100644 --- a/scripts/init-openbao-job/templates/cf-init-openbao-cm.yaml +++ b/scripts/init-openbao-job/templates/cf-init-openbao-cm.yaml @@ -55,8 +55,9 @@ data: echo "No HA replicas found (single-node deployment)." fi else - echo "ERROR: openbao-keys secret not found. Cannot unseal replicas." - exit 1 + echo "WARNING: openbao-keys secret not found, but OpenBao appears to be running." + echo "This might indicate the cluster was initialized externally or the secret was deleted." + echo "HA replica setup will be skipped." 
fi echo "OpenBao is fully operational. Skipping initialization." @@ -65,19 +66,42 @@ data: if [ "$OPENBAO_0_INITIALIZED" = "false" ]; then echo "Initializing OpenBao on openbao-0..." - INIT_OUTPUT=$(kubectl exec openbao-0 -- bao operator init -format=json -key-shares=1 -key-threshold=1) - echo $INIT_OUTPUT > /tmp/bao-keys.json; - - echo "Saving unseal keys and root token to openbao-keys k8s secret..." - BAO_ROOT_TOKEN=$(jq -r '.root_token' /tmp/bao-keys.json); - BAO_UNSEAL_KEY=$(jq -r '.unseal_keys_b64[0]' /tmp/bao-keys.json); - kubectl create secret generic openbao-keys -n cf-openbao \ - --from-literal=root_token="$BAO_ROOT_TOKEN" \ - --from-literal=unseal_key="$BAO_UNSEAL_KEY" \ - --dry-run=client -o yaml | kubectl apply -f - + if INIT_OUTPUT=$(kubectl exec openbao-0 -- bao operator init -format=json -key-shares=1 -key-threshold=1); then + echo "OpenBao initialization successful" + echo $INIT_OUTPUT > /tmp/bao-keys.json + + echo "Saving unseal keys and root token to openbao-keys k8s secret..." + BAO_ROOT_TOKEN=$(jq -r '.root_token' /tmp/bao-keys.json) + BAO_UNSEAL_KEY=$(jq -r '.unseal_keys_b64[0]' /tmp/bao-keys.json) + + if [ -z "$BAO_ROOT_TOKEN" ] || [ "$BAO_ROOT_TOKEN" = "null" ] || [ -z "$BAO_UNSEAL_KEY" ] || [ "$BAO_UNSEAL_KEY" = "null" ]; then + echo "ERROR: Failed to extract root token or unseal key from initialization output" + echo "Init output: $INIT_OUTPUT" + exit 1 + fi + + kubectl create secret generic openbao-keys -n cf-openbao \ + --from-literal=root_token="$BAO_ROOT_TOKEN" \ + --from-literal=unseal_key="$BAO_UNSEAL_KEY" \ + --dry-run=client -o yaml | kubectl apply -f - + echo "openbao-keys secret created successfully" + else + echo "ERROR: OpenBao initialization failed" + kubectl exec openbao-0 -- bao operator init -format=json -key-shares=1 -key-threshold=1 || true + exit 1 + fi else echo "OpenBao openbao-0 is initialized but sealed. Getting unseal key..." 
- BAO_UNSEAL_KEY=$(kubectl get secret openbao-keys -n cf-openbao -o jsonpath='{.data.unseal_key}' | base64 -d) + if kubectl get secret openbao-keys -n cf-openbao &>/dev/null; then + BAO_UNSEAL_KEY=$(kubectl get secret openbao-keys -n cf-openbao -o jsonpath='{.data.unseal_key}' | base64 -d) + else + echo "ERROR: OpenBao is initialized but openbao-keys secret is missing." + echo "This indicates a previous initialization failure or the secret was deleted." + echo "Manual intervention required - either:" + echo "1. Delete the OpenBao StatefulSet to start fresh, or" + echo "2. Manually unseal OpenBao if you have the keys" + exit 1 + fi fi echo "Unsealing openbao-0..." From 3201b05b10222ea8fdf3e3035a50b33eff896616 Mon Sep 17 00:00:00 2001 From: Daniel Vaskivaara Date: Sun, 1 Mar 2026 10:06:45 +0200 Subject: [PATCH 067/115] qa: add xtrace flag for debugging --- scripts/bootstrap.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/bootstrap.sh b/scripts/bootstrap.sh index 7492c54d..e3b050cc 100755 --- a/scripts/bootstrap.sh +++ b/scripts/bootstrap.sh @@ -1,4 +1,4 @@ -#!/bin/bash +#!/bin/bash -x set -euo pipefail From d4cd9958d01945e8eca17d7268c9ad56f1a19305 Mon Sep 17 00:00:00 2001 From: Daniel Vaskivaara Date: Sun, 1 Mar 2026 10:19:44 +0200 Subject: [PATCH 068/115] fix: gitea-init cm --- .../templates/cf-init-gitea-cm.yaml | 38 +++++++++++++++---- 1 file changed, 31 insertions(+), 7 deletions(-) diff --git a/scripts/init-gitea-job/templates/cf-init-gitea-cm.yaml b/scripts/init-gitea-job/templates/cf-init-gitea-cm.yaml index 78b4de82..294d5b40 100644 --- a/scripts/init-gitea-job/templates/cf-init-gitea-cm.yaml +++ b/scripts/init-gitea-job/templates/cf-init-gitea-cm.yaml @@ -78,8 +78,31 @@ data: # Function to create admin access token with retry create_admin_token() { - kubectl -n cf-gitea exec deploy/gitea -c gitea -- \ - gitea admin user generate-access-token --raw --username "$GITEA_ADMIN_USER" --token-name "api-token-$(date +%s)" --scopes all + 
# Check if we already have a token secret + if kubectl get secret gitea-admin-token -n cf-gitea &>/dev/null; then + echo "Admin token secret already exists, using existing token" + return 0 + fi + + # Generate a unique token name + local token_name="api-token-$(date +%s)-$$" + + # Create the token and capture the result + local token_output + if token_output=$(kubectl -n cf-gitea exec deploy/gitea -c gitea -- \ + gitea admin user generate-access-token --raw --username "$GITEA_ADMIN_USER" --token-name "$token_name" --scopes all 2>/dev/null); then + + # Store the token in a global variable for use after retry + GITEA_TOKEN="$token_output" + + # Create the secret immediately + kubectl create secret generic gitea-admin-token --from-literal=token="${GITEA_TOKEN}" --namespace=cf-gitea --dry-run=client -o yaml | kubectl apply -f - + echo "Admin token '$token_name' created and secret stored" + return 0 + else + echo "Failed to create admin token '$token_name'" + return 1 + fi } echo "=== Gitea Initialization Started ===" @@ -96,11 +119,12 @@ data: echo "Step 0: Create admin access token" if retry_with_backoff 5 3 "Create admin token" create_admin_token; then - GITEA_TOKEN=$(kubectl -n cf-gitea exec deploy/gitea -c gitea -- \ - gitea admin user generate-access-token --raw --username "$GITEA_ADMIN_USER" --token-name "api-token-$(date +%s)" --scopes all - ) - kubectl create secret generic gitea-admin-token --from-literal=token="${GITEA_TOKEN}" --namespace=cf-gitea --dry-run=client -o yaml | kubectl apply -f - - echo "Admin token created successfully" + # Token is already created and stored in GITEA_TOKEN by the create_admin_token function + if [ -z "$GITEA_TOKEN" ]; then + # If GITEA_TOKEN is empty, try to get it from the existing secret + GITEA_TOKEN=$(kubectl get secret gitea-admin-token -n cf-gitea -o jsonpath='{.data.token}' | base64 -d) + fi + echo "Admin token ready for use" else echo "FATAL: Failed to create admin token" exit 1 From 
456466dcbeb7f86d757d6dc5ab18cea9c9560d93 Mon Sep 17 00:00:00 2001 From: Daniel Vaskivaara Date: Sun, 1 Mar 2026 10:35:04 +0200 Subject: [PATCH 069/115] fix: openbao health cheks; sync wave gap restructure --- root/values.yaml | 78 ++++++++++++++++++++++++++++-------------------- 1 file changed, 46 insertions(+), 32 deletions(-) diff --git a/root/values.yaml b/root/values.yaml index 2d62670e..b1200459 100644 --- a/root/values.yaml +++ b/root/values.yaml @@ -84,10 +84,37 @@ apps: hs.status = "Degraded" hs.message = condition.reason or "ScaledObject not ready" end + return hs + resource.customizations.health.batch_Job: | + -- Custom health check for Jobs, especially openbao-init + hs = {} + if obj.status ~= nil then + if obj.status.conditions ~= nil then + for _, condition in ipairs(obj.status.conditions) do + if condition.type == "Complete" and condition.status == "True" then + hs.status = "Healthy" + hs.message = "Job completed successfully" + return hs + elseif condition.type == "Failed" and condition.status == "True" then + hs.status = "Degraded" + hs.message = "Job failed" return hs end end end + -- Check for active jobs + if obj.status.active and obj.status.active > 0 then + hs.status = "Progressing" + hs.message = "Job is running" + return hs + end + end + hs.status = "Progressing" + hs.message = "Job status unknown" + return hs + end + end + end hs.status = "Progressing" hs.message = "ScaledObject status unknown" else @@ -97,39 +124,26 @@ apps: return hs resource.customizations.health.apps_StatefulSet: | -- Custom health check for OpenBao StatefulSet - -- Ensures both StatefulSet is ready AND initialization is complete - if obj.metadata.name == "openbao" and obj.metadata.namespace == "cf-openbao" then - -- Check StatefulSet readiness - if obj.status and obj.status.readyReplicas and obj.status.replicas then + -- Uses standard StatefulSet readiness for now - openbao-init job handles actual initialization + hs = {} + if obj.status ~= nil then + if 
obj.status.readyReplicas ~= nil and obj.status.replicas ~= nil then if obj.status.readyReplicas == obj.status.replicas then - -- StatefulSet is ready, now check if initialization completed - local kubectl = io.popen("kubectl get secret openbao-user -n cf-openbao --ignore-not-found -o name 2>/dev/null") - local secret_exists = kubectl:read("*line") - kubectl:close() - - if secret_exists and secret_exists ~= "" then - hs = {} - hs.status = "Healthy" - hs.message = "OpenBao StatefulSet ready and initialization complete" - return hs - else - hs = {} - hs.status = "Progressing" - hs.message = "OpenBao StatefulSet ready but initialization still in progress" - return hs - end + hs.status = "Healthy" + hs.message = "StatefulSet is ready" + else + hs.status = "Progressing" + hs.message = "Waiting for StatefulSet replicas to be ready" end + else + hs.status = "Progressing" + hs.message = "Waiting for StatefulSet status" end - - -- Default StatefulSet health check - hs = {} - hs.status = "Progressing" - hs.message = "OpenBao StatefulSet not ready" - return hs else - -- Default StatefulSet health check for non-OpenBao StatefulSets - return nil + hs.status = "Progressing" + hs.message = "Waiting for StatefulSet status" end + return hs params: server.insecure: true rbac: @@ -225,7 +239,7 @@ apps: helmParameters: - name: domain value: "{{ .Values.global.domain }}" - syncWave: -60 + syncWave: -50 openbao-config: path: openbao-config/0.1.0 namespace: cf-openbao @@ -233,7 +247,7 @@ apps: helmParameters: - name: domain value: "{{ .Values.global.domain }}" - syncWave: -20 + syncWave: -15 external-secrets: path: external-secrets/0.15.1 namespace: external-secrets @@ -242,7 +256,7 @@ apps: external-secrets-config: path: external-secrets-config namespace: external-secrets - syncWave: -20 + syncWave: -10 gitea: path: gitea/12.3.0 namespace: cf-gitea @@ -300,7 +314,7 @@ apps: metallb: path: metallb/v0.15.2 namespace: default - syncWave: -40 + syncWave: 10 kgateway-crds: path: 
kgateway-crds/v2.1.0-main namespace: kgateway-system From 7a8154a5f3fec313a6f4c4bda0ecd60a0d0d9951 Mon Sep 17 00:00:00 2001 From: Daniel Vaskivaara Date: Sun, 1 Mar 2026 18:50:40 +0200 Subject: [PATCH 070/115] fix: openbao config timing and dep chain --- root/values.yaml | 6 +++--- .../0.1.0/templates/openbao-unseal-cronjob.yaml | 13 +++++++++++++ 2 files changed, 16 insertions(+), 3 deletions(-) diff --git a/root/values.yaml b/root/values.yaml index b1200459..7c2001f1 100644 --- a/root/values.yaml +++ b/root/values.yaml @@ -181,7 +181,7 @@ apps: argocd-config: path: argocd-config namespace: argocd - syncWave: -20 + syncWave: 5 ignoreDifferences: - group: external-secrets.io kind: ExternalSecret @@ -247,7 +247,7 @@ apps: helmParameters: - name: domain value: "{{ .Values.global.domain }}" - syncWave: -15 + syncWave: -25 external-secrets: path: external-secrets/0.15.1 namespace: external-secrets @@ -444,7 +444,7 @@ apps: cluster-auth-config: path: cluster-auth-config namespace: cluster-auth - syncWave: -20 + syncWave: 5 ignoreDifferences: - group: external-secrets.io kind: ExternalSecret diff --git a/sources/openbao-config/0.1.0/templates/openbao-unseal-cronjob.yaml b/sources/openbao-config/0.1.0/templates/openbao-unseal-cronjob.yaml index 9c1447e3..5f8d8887 100644 --- a/sources/openbao-config/0.1.0/templates/openbao-unseal-cronjob.yaml +++ b/sources/openbao-config/0.1.0/templates/openbao-unseal-cronjob.yaml @@ -17,6 +17,19 @@ spec: spec: restartPolicy: Never serviceAccountName: openbao-unseal-job-sa + initContainers: + - name: wait-for-keys + image: ghcr.io/silogen/cluster-tool:latest + command: ["/bin/bash"] + args: + - -c + - | + echo "Waiting for openbao-keys secret to exist..." + while ! kubectl get secret openbao-keys -n cf-openbao >/dev/null 2>&1; do + echo "Secret openbao-keys not found, waiting 10 seconds..." 
+ sleep 10 + done + echo "Secret openbao-keys found, proceeding with unseal job" containers: - name: openbao-init image: ghcr.io/silogen/cluster-tool:latest From b5dd7f99a6575c03bef55161c32865a4a2376436 Mon Sep 17 00:00:00 2001 From: Daniel Vaskivaara Date: Sun, 1 Mar 2026 20:38:22 +0200 Subject: [PATCH 071/115] fix: add openbao-init to enabled apps --- root/values_large.yaml | 46 +++++++++++++++++++++++++++++++++++++++++ root/values_medium.yaml | 1 + root/values_small.yaml | 1 + 3 files changed, 48 insertions(+) diff --git a/root/values_large.yaml b/root/values_large.yaml index fd5909aa..cdaf0544 100644 --- a/root/values_large.yaml +++ b/root/values_large.yaml @@ -1,3 +1,49 @@ +enabledApps: + - aim-cluster-model-source + - airm + - amd-gpu-operator + - amd-gpu-operator-config + - appwrapper + - argocd + - argocd-config + - cert-manager + - cluster-auth + - cluster-auth-config + - cnpg-operator + - external-secrets + - external-secrets-config + - gateway-api + - gitea + - gitea-config + - kaiwo + - kaiwo-config + - kaiwo-crds + - keda + - kedify-otel + - keycloak + - kgateway + - kgateway-config + - kgateway-crds + - kserve + - kserve-crds + - kueue + - kueue-config + - kuberay-operator + - kyverno + - kyverno-config + - kyverno-policies-base + - metallb + - minio-operator + - minio-tenant + - minio-tenant-config + - openbao + - openbao-init + - openbao-config + - opentelemetry-operator + - otel-lgtm-stack + - prometheus-crds + - rabbitmq + apps: minio-tenant: valuesObject: diff --git a/root/values_medium.yaml b/root/values_medium.yaml index 83e7ea95..98872ee6 100644 --- a/root/values_medium.yaml +++ b/root/values_medium.yaml @@ -44,6 +44,7 @@ enabledApps: - minio-tenant-config - openbao - openbao-config + - openbao-init - opentelemetry-operator - otel-lgtm-stack - prometheus-crds diff --git a/root/values_small.yaml b/root/values_small.yaml index 2d212e89..c3a7282c 100644 --- a/root/values_small.yaml +++ b/root/values_small.yaml @@ -43,6 +43,7 @@ enabledApps: - 
minio-tenant-config - openbao - openbao-config + - openbao-init - opentelemetry-operator - otel-lgtm-stack - prometheus-crds From 8ed5cc746b41540bde549a6123c078099bf691c0 Mon Sep 17 00:00:00 2001 From: Daniel Vaskivaara Date: Sun, 1 Mar 2026 20:53:34 +0200 Subject: [PATCH 072/115] feat: update SBOM scripts to collate from all cluster sizes and not depend on root/values.yaml (allowing the top-level enabledApps to be removed as well) --- root/values.yaml | 47 +----------- sbom/SBOM-QUICK-GUIDE.md | 34 +++++---- sbom/components.yaml | 27 +++++-- sbom/generate-compare-components.sh | 112 +++++++++++++++++++++++----- sbom/validate-components-sync.sh | 78 +++++++++++++++---- sbom/validate-enabled-apps.sh | 98 ++++++++++++++++++++---- 6 files changed, 281 insertions(+), 115 deletions(-) diff --git a/root/values.yaml b/root/values.yaml index 7c2001f1..aa16b3ad 100644 --- a/root/values.yaml +++ b/root/values.yaml @@ -10,51 +10,8 @@ externalValues: global: domain: # to be filled by bootstrap script clusterSize: # to be filled by bootstrap script (small, medium, large) -enabledApps: - - aim-cluster-model-source - - airm - - amd-gpu-operator - - amd-gpu-operator-config - - appwrapper - - argocd - - argocd-config - - cert-manager - - cluster-auth - - cluster-auth-config - - cnpg-operator - - external-secrets - - external-secrets-config - - gateway-api - - gitea - - gitea-config - - kaiwo - - kaiwo-config - - kaiwo-crds - - keda - - kedify-otel - - keycloak - - kgateway - - kgateway-config - - kgateway-crds - - kserve - - kserve-crds - - kueue - - kueue-config - - kuberay-operator - - kyverno - - kyverno-config - - kyverno-policies-base - - metallb - - minio-operator - - minio-tenant - - minio-tenant-config - - openbao - - openbao-init - - openbao-config - - opentelemetry-operator - - otel-lgtm-stack - - prometheus-crds - - rabbitmq +# enabledApps list removed - each cluster size (small/medium/large) defines its own apps +# This eliminates the override/clobbering issue and makes
cluster sizes more independent apps: # Core apps argocd: diff --git a/sbom/SBOM-QUICK-GUIDE.md b/sbom/SBOM-QUICK-GUIDE.md index bc007fd5..322d31bd 100644 --- a/sbom/SBOM-QUICK-GUIDE.md +++ b/sbom/SBOM-QUICK-GUIDE.md @@ -4,10 +4,10 @@ Scripts to manage component metadata for automated Software Bill of Materials (S ## Essential Workflow -When `root/values.yaml` has a new tool, you need to run these commands manually: +When any cluster size configuration (`values_small.yaml`, `values_medium.yaml`, `values_large.yaml`) has new apps, you need to run these commands manually: ```bash -# 1. Generate/sync components from enabledApps +# 1. Generate/sync components from enabledApps across all cluster sizes ./generate-compare-components.sh # 2. Manually fill out sourceUrl and projectUrl in components.yaml @@ -23,20 +23,22 @@ When `root/values.yaml` has a new tool, you need to run these commands manually: ## Quick Start ### Adding a New Component -1. **Update values.yaml**: Add to `enabledApps` list AND add app definition in `apps` section -2. **Run workflow**: Execute the 4 commands above -3. **Commit changes**: All files should be ready for PR +1. **Update cluster size files**: Add to `enabledApps` list in relevant cluster size files (`values_small.yaml`, `values_medium.yaml`, `values_large.yaml`) +2. **Add app definition**: Add app definition in the `apps` section of appropriate values file (typically base `values.yaml` or cluster-specific file) +3. **Run workflow**: Execute the 4 commands above +4. **Commit changes**: All files should be ready for PR ### Removing a Component -1. **Remove from enabledApps**: Remove from `enabledApps` list in `root/values.yaml` +1. **Remove from enabledApps**: Remove from `enabledApps` list in relevant cluster size files 2. **Regenerate**: Run `./generate-compare-components.sh` (automatically removes from components.yaml) 3. 
**Validate**: Run `./validate-sync.sh` to confirm removal ## Scripts ### Generation Scripts -**`generate-compare-components.sh`** - Syncs `components.yaml` with enabled apps from `root/values.yaml` -- Processes only apps listed in `enabledApps` (excludes `-config` apps) +**`generate-compare-components.sh`** - Syncs `components.yaml` with enabled apps from all cluster sizes +- Collects apps from `values.yaml`, `values_small.yaml`, `values_medium.yaml`, `values_large.yaml` +- Processes only apps listed in `enabledApps` across all configurations (excludes `-config` apps) - Includes pre-validation to catch configuration issues early - Preserves existing metadata (sourceUrl, projectUrl, license fields) - Creates timestamped backups when needed @@ -51,8 +53,8 @@ When `root/values.yaml` has a new tool, you need to run these commands manually: - Use this for complete validation before commits **Individual Validators** (for targeted debugging): -- **`validate-enabled-apps.sh`** - Checks enabledApps have corresponding app definitions -- **`validate-components-sync.sh`** - Verifies components.yaml reflects current enabledApps +- **`validate-enabled-apps.sh`** - Checks enabledApps across all cluster sizes have corresponding app definitions +- **`validate-components-sync.sh`** - Verifies components.yaml reflects current enabledApps from all cluster configurations - **`validate-metadata.sh`** - Ensures all required metadata fields are populated ## Validation Workflow @@ -61,13 +63,14 @@ The new modular validation system ensures data consistency: ``` 1. EnabledApps Consistency Check - ├── Validates all enabledApps have app definitions + ├── Validates all enabledApps across cluster sizes have app definitions + ├── Collects from values.yaml, values_small.yaml, values_medium.yaml, values_large.yaml └── Filters out -config apps appropriately 2. 
Components Sync Check - ├── Verifies components.yaml matches enabledApps + ├── Verifies components.yaml matches enabledApps from all cluster configurations ├── Checks for missing/extra components - └── Validates path/valuesFile consistency + └── Validates path/valuesFile consistency across cluster files 3. Metadata Completeness Check ├── Ensures sourceUrl and projectUrl are populated @@ -92,7 +95,8 @@ The GitHub workflow `.github/workflows/pr-component-validation.yaml` now include ## Important Notes -- **EnabledApps is the source of truth**: Components are generated only for apps in the `enabledApps` list +- **EnabledApps across cluster sizes is the source of truth**: Components are generated from apps in `enabledApps` lists across all cluster configurations +- **No base enabledApps**: The base `values.yaml` no longer contains enabledApps to avoid override conflicts - **Manual metadata required**: `sourceUrl` and `projectUrl` must be added manually (requires human knowledge) - **Scripts are idempotent**: Safe to run multiple times - **Validation before commit**: Always run `./validate-sync.sh` before creating PRs @@ -101,7 +105,7 @@ The GitHub workflow `.github/workflows/pr-component-validation.yaml` now include ## Troubleshooting **Error: "Enabled app has no definition"** -→ Add the app definition to the `apps` section in `root/values.yaml` +→ Add the app definition to the `apps` section in `root/values.yaml` or appropriate cluster size file **Error: "Component missing/extra"** → Run `./generate-compare-components.sh` to sync components.yaml diff --git a/sbom/components.yaml b/sbom/components.yaml index b1d8b9d5..a125043b 100644 --- a/sbom/components.yaml +++ b/sbom/components.yaml @@ -1,5 +1,6 @@ # Generated components metadata for SBOM creation -# This file contains simplified component information for apps in enabledApps +# This file contains simplified component information for apps across all cluster sizes +# Collected from: values.yaml, values_small.yaml, 
values_medium.yaml, values_large.yaml # Apps with "config" suffix are excluded from this SBOM components: @@ -131,12 +132,6 @@ components: projectUrl: https://github.com/kserve/kserve license: Apache License 2.0 licenseUrl: https://github.com/kserve/kserve/blob/master/LICENSE - kueue: - path: kueue/0.13.0 - sourceUrl: oci://registry.k8s.io/kueue/charts/kueue - projectUrl: https://github.com/kubernetes-sigs/kueue - license: Apache License 2.0 - licenseUrl: https://github.com/kubernetes-sigs/kueue/blob/main/LICENSE kuberay-operator: path: kuberay-operator/1.4.2 valuesFile: values.yaml @@ -144,6 +139,12 @@ components: projectUrl: https://github.com/ray-project/kuberay license: Apache License 2.0 licenseUrl: https://github.com/ray-project/kuberay/blob/master/LICENSE + kueue: + path: kueue/0.13.0 + sourceUrl: oci://registry.k8s.io/kueue/charts/kueue + projectUrl: https://github.com/kubernetes-sigs/kueue + license: Apache License 2.0 + licenseUrl: https://github.com/kubernetes-sigs/kueue/blob/main/LICENSE kyverno: path: kyverno/3.5.1 valuesFile: values.yaml @@ -157,6 +158,12 @@ components: projectUrl: https://github.com/kyverno/kyverno license: Apache License 2.0 licenseUrl: https://github.com/kyverno/kyverno/blob/main/LICENSE + kyverno-policies-storage-local-path: + path: kyverno-policies/storage-local-path + sourceUrl: https://github.com/silogen/cluster-forge/tree/main/sources/kyverno-policies/storage-local-path + projectUrl: https://github.com/silogen/cluster-forge/ + license: Apache License 2.0 + licenseUrl: https://github.com/silogen/cluster-forge/blob/main/LICENSE metallb: path: metallb/v0.15.2 sourceUrl: https://raw.githubusercontent.com/metallb/metallb/v0.15.2/config/manifests/metallb-native.yaml @@ -182,6 +189,12 @@ components: projectUrl: https://github.com/openbao/openbao license: Mozilla Public License 2.0 licenseUrl: https://github.com/openbao/openbao/blob/main/LICENSE + openbao-init: + path: ../scripts/init-openbao-job + sourceUrl: 
https://github.com/silogen/cluster-forge/tree/main/scripts/init-openbao-job + projectUrl: https://github.com/silogen/cluster-forge/ + license: Apache License 2.0 + licenseUrl: https://github.com/silogen/cluster-forge/blob/main/LICENSE opentelemetry-operator: path: opentelemetry-operator/0.93.1 sourceUrl: https://open-telemetry.github.io/opentelemetry-helm-charts diff --git a/sbom/generate-compare-components.sh b/sbom/generate-compare-components.sh index 26644d5e..507d4cd9 100755 --- a/sbom/generate-compare-components.sh +++ b/sbom/generate-compare-components.sh @@ -2,16 +2,20 @@ set -euo pipefail -# Script to update components.yaml from enabledApps in values.yaml +# Script to update components.yaml from enabledApps across all cluster sizes +# Collects components from values.yaml, values_small.yaml, values_medium.yaml, values_large.yaml # Only updates if there are new items or changes to existing ones # Preserves existing sourceUrl and projectUrl values # Only includes apps that are in the enabledApps list (excluding -config apps) -VALUES_FILE="../root/values.yaml" +BASE_VALUES_FILE="../root/values.yaml" +SMALL_VALUES_FILE="../root/values_small.yaml" +MEDIUM_VALUES_FILE="../root/values_medium.yaml" +LARGE_VALUES_FILE="../root/values_large.yaml" OUTPUT_FILE="./components.yaml" TEMP_FILE="./components.yaml.tmp" -echo "⚙️ Generating/Updating components.yaml from enabledApps..." +echo "⚙️ Generating/Updating components.yaml from enabledApps across all cluster sizes..." # Self-validation: Check enabledApps consistency before processing (fail-fast) echo "🔍 Pre-validation: Checking enabledApps consistency..." @@ -30,14 +34,50 @@ fi echo "" echo "Checking for updates to components.yaml..." -# Check if values.yaml exists -if [[ ! 
-f "$VALUES_FILE" ]]; then - echo "❌ Error: $VALUES_FILE not found" - exit 1 +# Function to collect enabled apps from a values file +collect_enabled_apps() { + local values_file="$1" + if [[ -f "$values_file" ]]; then + yq eval '.enabledApps[]' "$values_file" 2>/dev/null || echo "" + else + echo "" + fi +} + +# Collect enabled apps from all cluster size configurations +echo "🔍 Collecting enabled apps from all cluster configurations..." +all_enabled_apps="" + +# Collect from base values.yaml (if enabledApps exists) +base_apps=$(collect_enabled_apps "$BASE_VALUES_FILE") +if [[ -n "$base_apps" ]]; then + echo " 📄 Found apps in values.yaml: $(echo "$base_apps" | wc -l) apps" + all_enabled_apps="$all_enabled_apps$base_apps"$'\n' fi -# Get all enabled app names that don't end with -config from values.yaml -enabled_apps=$(yq eval '.enabledApps[]' "$VALUES_FILE" 2>/dev/null || echo "") +# Collect from small cluster values +small_apps=$(collect_enabled_apps "$SMALL_VALUES_FILE") +if [[ -n "$small_apps" ]]; then + echo " 📄 Found apps in values_small.yaml: $(echo "$small_apps" | wc -l) apps" + all_enabled_apps="$all_enabled_apps$small_apps"$'\n' +fi + +# Collect from medium cluster values +medium_apps=$(collect_enabled_apps "$MEDIUM_VALUES_FILE") +if [[ -n "$medium_apps" ]]; then + echo " 📄 Found apps in values_medium.yaml: $(echo "$medium_apps" | wc -l) apps" + all_enabled_apps="$all_enabled_apps$medium_apps"$'\n' +fi + +# Collect from large cluster values +large_apps=$(collect_enabled_apps "$LARGE_VALUES_FILE") +if [[ -n "$large_apps" ]]; then + echo " 📄 Found apps in values_large.yaml: $(echo "$large_apps" | wc -l) apps" + all_enabled_apps="$all_enabled_apps$large_apps"$'\n' +fi + +# Get unique enabled apps (remove duplicates and empty lines) +enabled_apps=$(echo "$all_enabled_apps" | sort -u | grep -v '^$' || echo "") if [[ -z "$enabled_apps" ]]; then echo "Warning: No enabled apps found in enabledApps list" @@ -71,9 +111,21 @@ else # Check each enabled app from 
values.yaml for app in $app_names; do - # Get current values from values.yaml apps section - current_path=$(yq eval ".apps.\"$app\".path" "$VALUES_FILE") - current_values_file=$(yq eval ".apps.\"$app\".valuesFile // \"null\"" "$VALUES_FILE") + # Get current values from apps section (check all cluster files) + current_path="" + current_values_file="null" + + # Try to find the app definition in any of the cluster configuration files + for values_file in "$BASE_VALUES_FILE" "$SMALL_VALUES_FILE" "$MEDIUM_VALUES_FILE" "$LARGE_VALUES_FILE"; do + if [[ -f "$values_file" ]]; then + app_path=$(yq eval ".apps.\"$app\".path // \"null\"" "$values_file" 2>/dev/null || echo "null") + if [[ "$app_path" != "null" ]]; then + current_path="$app_path" + current_values_file=$(yq eval ".apps.\"$app\".valuesFile // \"null\"" "$values_file") + break + fi + fi + done # Check if app exists in components.yaml existing_app=$(yq eval ".components.\"$app\" // \"null\"" "$OUTPUT_FILE") @@ -124,7 +176,8 @@ echo "Updating $OUTPUT_FILE..." 
# Create components.yaml header cat > "$TEMP_FILE" << 'EOF' # Generated components metadata for SBOM creation -# This file contains simplified component information for apps in enabledApps +# This file contains simplified component information for apps across all cluster sizes +# Collected from: values.yaml, values_small.yaml, values_medium.yaml, values_large.yaml # Apps with "config" suffix are excluded from this SBOM components: @@ -134,12 +187,23 @@ EOF for app in $app_names; do echo " $app:" >> "$TEMP_FILE" - # Get path from values.yaml - path=$(yq eval ".apps.\"$app\".path" "$VALUES_FILE") - echo " path: $path" >> "$TEMP_FILE" + # Get path and valuesFile from any cluster configuration file + path="" + values_file="null" + + # Try to find the app definition in any of the cluster configuration files + for config_file in "$BASE_VALUES_FILE" "$SMALL_VALUES_FILE" "$MEDIUM_VALUES_FILE" "$LARGE_VALUES_FILE"; do + if [[ -f "$config_file" ]]; then + app_path=$(yq eval ".apps.\"$app\".path // \"null\"" "$config_file" 2>/dev/null || echo "null") + if [[ "$app_path" != "null" ]]; then + path="$app_path" + values_file=$(yq eval ".apps.\"$app\".valuesFile // \"null\"" "$config_file") + break + fi + fi + done - # Get valuesFile from values.yaml if it exists - values_file=$(yq eval ".apps.\"$app\".valuesFile // \"null\"" "$VALUES_FILE") + echo " path: $path" >> "$TEMP_FILE" if [[ "$values_file" != "null" ]]; then echo " valuesFile: $values_file" >> "$TEMP_FILE" fi @@ -193,7 +257,17 @@ echo "$app_names" | wc -l | xargs echo "Total components:" echo "" echo "Components with valuesFile:" for app in $app_names; do - values_file=$(yq eval ".apps.\"$app\".valuesFile // \"null\"" "$VALUES_FILE") + # Check all cluster configuration files for valuesFile + values_file="null" + for config_file in "$BASE_VALUES_FILE" "$SMALL_VALUES_FILE" "$MEDIUM_VALUES_FILE" "$LARGE_VALUES_FILE"; do + if [[ -f "$config_file" ]]; then + app_path=$(yq eval ".apps.\"$app\".path // \"null\"" "$config_file" 
2>/dev/null || echo "null") + if [[ "$app_path" != "null" ]]; then + values_file=$(yq eval ".apps.\"$app\".valuesFile // \"null\"" "$config_file") + break + fi + fi + done if [[ "$values_file" != "null" ]]; then echo " - $app" fi diff --git a/sbom/validate-components-sync.sh b/sbom/validate-components-sync.sh index 476e1e15..4d39c706 100755 --- a/sbom/validate-components-sync.sh +++ b/sbom/validate-components-sync.sh @@ -2,10 +2,13 @@ set -euo pipefail -# validate-components-sync.sh - Validate components.yaml sync with enabledApps +# validate-components-sync.sh - Validate components.yaml sync with enabledApps across all cluster sizes # Checks that components.yaml reflects current enabledApps and path consistency -VALUES_FILE="../root/values.yaml" +BASE_VALUES_FILE="../root/values.yaml" +SMALL_VALUES_FILE="../root/values_small.yaml" +MEDIUM_VALUES_FILE="../root/values_medium.yaml" +LARGE_VALUES_FILE="../root/values_large.yaml" COMPONENTS_FILE="./components.yaml" echo "🔄 Validating components.yaml reflects enabledApps..." @@ -17,14 +20,37 @@ if [[ ! -f "$COMPONENTS_FILE" ]]; then exit 1 fi -# Check if values.yaml exists -if [[ ! 
-f "$VALUES_FILE" ]]; then - echo "❌ Error: $VALUES_FILE not found" - exit 1 +# Function to collect enabled apps from a values file +collect_enabled_apps() { + local values_file="$1" + if [[ -f "$values_file" ]]; then + yq eval '.enabledApps[]' "$values_file" 2>/dev/null || echo "" + else + echo "" + fi +} + +# Collect enabled apps from all cluster size configurations +all_enabled_apps="" + +# Collect from base values.yaml (if enabledApps exists) +base_apps=$(collect_enabled_apps "$BASE_VALUES_FILE") +if [[ -n "$base_apps" ]]; then + all_enabled_apps="$all_enabled_apps$base_apps"$'\n' fi -# Get enabled apps (filtered, same as generation script) -enabled_apps=$(yq eval '.enabledApps[]' "$VALUES_FILE" 2>/dev/null || echo "") +# Collect from cluster size values +for size_file in "$SMALL_VALUES_FILE" "$MEDIUM_VALUES_FILE" "$LARGE_VALUES_FILE"; do + if [[ -f "$size_file" ]]; then + size_apps=$(collect_enabled_apps "$size_file") + if [[ -n "$size_apps" ]]; then + all_enabled_apps="$all_enabled_apps$size_apps"$'\n' + fi + fi +done + +# Get unique enabled apps (remove duplicates and empty lines) +enabled_apps=$(echo "$all_enabled_apps" | sort -u | grep -v '^$' || echo "") enabled_apps_filtered=$(echo "$enabled_apps" | grep -v -- '-config$' || echo "") # Get components in components.yaml @@ -66,29 +92,49 @@ if [ ${#missing_components[@]} -ne 0 ] || [ ${#extra_components[@]} -ne 0 ]; the exit 1 fi -# Check path consistency between values.yaml and components.yaml +# Check path consistency between cluster configuration files and components.yaml echo "⚙️ Checking path/valuesFile consistency..." 
path_mismatches=() while IFS= read -r app; do [[ -z "$app" ]] && continue - # Get paths from both files - values_path=$(yq eval ".apps.\"$app\".path" "$VALUES_FILE" 2>/dev/null || echo "null") + # Find the app definition in any of the cluster configuration files + values_path="" + for config_file in "$BASE_VALUES_FILE" "$SMALL_VALUES_FILE" "$MEDIUM_VALUES_FILE" "$LARGE_VALUES_FILE"; do + if [[ -f "$config_file" ]]; then + app_path=$(yq eval ".apps.\"$app\".path // \"null\"" "$config_file" 2>/dev/null || echo "null") + if [[ "$app_path" != "null" ]]; then + values_path="$app_path" + break + fi + fi + done + component_path=$(yq eval ".components.\"$app\".path" "$COMPONENTS_FILE" 2>/dev/null || echo "null") if [[ "$values_path" != "$component_path" ]]; then - path_mismatches+=("$app: values.yaml='$values_path' vs components.yaml='$component_path'") - echo "❌ Path mismatch for '$app': values.yaml='$values_path' vs components.yaml='$component_path'" + path_mismatches+=("$app: cluster-configs='$values_path' vs components.yaml='$component_path'") + echo "❌ Path mismatch for '$app': cluster-configs='$values_path' vs components.yaml='$component_path'" fi # Check valuesFile consistency - values_file_values=$(yq eval ".apps.\"$app\".valuesFile // \"null\"" "$VALUES_FILE" 2>/dev/null || echo "null") + values_file_values="null" + for config_file in "$BASE_VALUES_FILE" "$SMALL_VALUES_FILE" "$MEDIUM_VALUES_FILE" "$LARGE_VALUES_FILE"; do + if [[ -f "$config_file" ]]; then + app_path_check=$(yq eval ".apps.\"$app\".path // \"null\"" "$config_file" 2>/dev/null || echo "null") + if [[ "$app_path_check" != "null" ]]; then + values_file_values=$(yq eval ".apps.\"$app\".valuesFile // \"null\"" "$config_file" 2>/dev/null || echo "null") + break + fi + fi + done + values_file_components=$(yq eval ".components.\"$app\".valuesFile // \"null\"" "$COMPONENTS_FILE" 2>/dev/null || echo "null") if [[ "$values_file_values" != "$values_file_components" ]]; then - path_mismatches+=("$app 
valuesFile: values.yaml='$values_file_values' vs components.yaml='$values_file_components'") - echo "❌ ValuesFile mismatch for '$app': values.yaml='$values_file_values' vs components.yaml='$values_file_components'" + path_mismatches+=("$app valuesFile: cluster-configs='$values_file_values' vs components.yaml='$values_file_components'") + echo "❌ ValuesFile mismatch for '$app': cluster-configs='$values_file_values' vs components.yaml='$values_file_components'" fi done <<< "$enabled_apps_filtered" diff --git a/sbom/validate-enabled-apps.sh b/sbom/validate-enabled-apps.sh index 53a78685..c1128ed6 100755 --- a/sbom/validate-enabled-apps.sh +++ b/sbom/validate-enabled-apps.sh @@ -2,21 +2,60 @@ set -euo pipefail -# validate-enabled-apps.sh - Validate enabledApps consistency -# Checks that all apps in enabledApps have corresponding definitions in apps section +# validate-enabled-apps.sh - Validate enabledApps consistency across all cluster sizes +# Checks that all apps in enabledApps from all cluster configurations have corresponding definitions in apps section -VALUES_FILE="../root/values.yaml" +BASE_VALUES_FILE="../root/values.yaml" +SMALL_VALUES_FILE="../root/values_small.yaml" +MEDIUM_VALUES_FILE="../root/values_medium.yaml" +LARGE_VALUES_FILE="../root/values_large.yaml" -echo "📋 Validating enabledApps have app definitions..." +echo "📋 Validating enabledApps have app definitions across all cluster sizes..." -# Check if values.yaml exists -if [[ ! -f "$VALUES_FILE" ]]; then - echo "❌ Error: $VALUES_FILE not found" - exit 1 +# Function to collect enabled apps from a values file +collect_enabled_apps() { + local values_file="$1" + if [[ -f "$values_file" ]]; then + yq eval '.enabledApps[]' "$values_file" 2>/dev/null || echo "" + else + echo "" + fi +} + +# Collect enabled apps from all cluster size configurations +echo "🔍 Collecting enabled apps from all cluster configurations..." 
+all_enabled_apps="" + +# Collect from base values.yaml (if enabledApps exists) +base_apps=$(collect_enabled_apps "$BASE_VALUES_FILE") +if [[ -n "$base_apps" ]]; then + echo " 📄 Found apps in values.yaml: $(echo "$base_apps" | wc -l) apps" + all_enabled_apps="$all_enabled_apps$base_apps"$'\n' fi -# Get all enabled apps -enabled_apps=$(yq eval '.enabledApps[]' "$VALUES_FILE" 2>/dev/null || echo "") +# Collect from small cluster values +small_apps=$(collect_enabled_apps "$SMALL_VALUES_FILE") +if [[ -n "$small_apps" ]]; then + echo " 📄 Found apps in values_small.yaml: $(echo "$small_apps" | wc -l) apps" + all_enabled_apps="$all_enabled_apps$small_apps"$'\n' +fi + +# Collect from medium cluster values +medium_apps=$(collect_enabled_apps "$MEDIUM_VALUES_FILE") +if [[ -n "$medium_apps" ]]; then + echo " 📄 Found apps in values_medium.yaml: $(echo "$medium_apps" | wc -l) apps" + all_enabled_apps="$all_enabled_apps$medium_apps"$'\n' +fi + +# Collect from large cluster values +large_apps=$(collect_enabled_apps "$LARGE_VALUES_FILE") +if [[ -n "$large_apps" ]]; then + echo " 📄 Found apps in values_large.yaml: $(echo "$large_apps" | wc -l) apps" + all_enabled_apps="$all_enabled_apps$large_apps"$'\n' +fi + +# Get unique enabled apps (remove duplicates and empty lines) +enabled_apps=$(echo "$all_enabled_apps" | sort -u | grep -v '^$' || echo "") if [[ -z "$enabled_apps" ]]; then echo "ℹ️ No enabled apps found in enabledApps list" @@ -31,11 +70,44 @@ if [[ -z "$enabled_apps_filtered" ]]; then exit 0 fi -# Get all defined apps in apps section -defined_apps=$(yq eval '.apps | keys | .[]' "$VALUES_FILE" 2>/dev/null || echo "") +# Function to collect app definitions from a values file +collect_app_definitions() { + local values_file="$1" + if [[ -f "$values_file" ]]; then + yq eval '.apps | keys | .[]' "$values_file" 2>/dev/null || echo "" + else + echo "" + fi +} + +# Collect app definitions from all cluster size configurations +echo "🔍 Collecting app definitions from all cluster 
configurations..." +all_defined_apps="" + +# Collect from base values.yaml +base_defined_apps=$(collect_app_definitions "$BASE_VALUES_FILE") +if [[ -n "$base_defined_apps" ]]; then + echo " 📄 Found app definitions in values.yaml: $(echo "$base_defined_apps" | wc -l) apps" + all_defined_apps="$all_defined_apps$base_defined_apps"$'\n' +fi + +# Collect from cluster size values +for size_file in "$SMALL_VALUES_FILE" "$MEDIUM_VALUES_FILE" "$LARGE_VALUES_FILE"; do + if [[ -f "$size_file" ]]; then + size_defined_apps=$(collect_app_definitions "$size_file") + if [[ -n "$size_defined_apps" ]]; then + size_name=$(basename "$size_file") + echo " 📄 Found app definitions in $size_name: $(echo "$size_defined_apps" | wc -l) apps" + all_defined_apps="$all_defined_apps$size_defined_apps"$'\n' + fi + fi +done + +# Get unique defined apps (remove duplicates and empty lines) +defined_apps=$(echo "$all_defined_apps" | sort -u | grep -v '^$' || echo "") if [[ -z "$defined_apps" ]]; then - echo "❌ Error: No app definitions found in apps section" + echo "❌ Error: No app definitions found in any cluster configuration files" exit 1 fi From 76985cbab573fc533b6f852d238801fc241f9e75 Mon Sep 17 00:00:00 2001 From: Daniel Vaskivaara Date: Sun, 1 Mar 2026 21:03:30 +0200 Subject: [PATCH 073/115] feat: improved error messaging, dependency validation, and progress visibility --- scripts/bootstrap.sh | 65 ++++++++++++++-- scripts/debug-cluster-state.sh | 133 +++++++++++++++++++++++++++++++++ 2 files changed, 191 insertions(+), 7 deletions(-) create mode 100755 scripts/debug-cluster-state.sh diff --git a/scripts/bootstrap.sh b/scripts/bootstrap.sh index e3b050cc..ce6a7ce5 100755 --- a/scripts/bootstrap.sh +++ b/scripts/bootstrap.sh @@ -1,20 +1,20 @@ -#!/bin/bash -x +#!/bin/bash set -euo pipefail LATEST_RELEASE="v1.8.0" # Initialize variables +APPS="" CLUSTER_SIZE="medium" # Default to medium DEFAULT_TIMEOUT="5m" DOMAIN="" KUBE_VERSION=1.33 SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" 
+SKIP_DEPENDENCY_CHECK=false TARGET_REVISION="$LATEST_RELEASE" TEMPLATE_ONLY=false VALUES_FILE="values.yaml" -APPS="" -SKIP_DEPENDENCY_CHECK=false # Check for required dependencies check_dependencies() { @@ -464,6 +464,29 @@ apply_cluster_forge_parent_app() { echo "=== Creating ClusterForge Parent App ===" echo "Target revision: $TARGET_REVISION" + if [ "$TEMPLATE_ONLY" = false ]; then + # Check if ArgoCD Application CRDs are available before proceeding + echo "🔍 Checking ArgoCD Application CRDs availability..." + if ! kubectl get crd applications.argoproj.io >/dev/null 2>&1; then + echo "❌ Error: ArgoCD Application CRDs not found!" + echo " ArgoCD must be deployed before creating Applications." + echo " Please ensure ArgoCD bootstrap completed successfully." + echo "" + echo "💡 Suggestion: Re-run bootstrap without --apps filter to deploy ArgoCD first:" + echo " $0 $DOMAIN --cluster-size=$CLUSTER_SIZE --target-revision=$TARGET_REVISION" + exit 1 + fi + echo "✅ ArgoCD Application CRDs found" + + # Verify ArgoCD namespace exists + if ! kubectl get namespace argocd >/dev/null 2>&1; then + echo "❌ Error: ArgoCD namespace 'argocd' not found!" + echo " Please ensure ArgoCD was deployed first." + exit 1 + fi + echo "✅ ArgoCD namespace found" + fi + helm template cluster-forge "${SOURCE_ROOT}/root" \ --show-only templates/cluster-forge.yaml \ --values "${SOURCE_ROOT}/root/${VALUES_FILE}" \ @@ -540,10 +563,38 @@ main() { fi else # Default behavior - run all bootstrap components - should_run namespaces && create_namespaces - should_run argocd && bootstrap_argocd - should_run gitea && bootstrap_gitea - should_run cluster-forge && apply_cluster_forge_parent_app + echo "🚀 Running full bootstrap sequence..." 
+ echo "📋 Bootstrap order: namespaces → argocd → gitea → cluster-forge" + + if should_run namespaces; then + echo "📦 Step 1/4: Creating namespaces" + create_namespaces + else + echo "⏭️ Step 1/4: Skipping namespaces" + fi + + if should_run argocd; then + echo "📦 Step 2/4: Bootstrapping ArgoCD" + bootstrap_argocd + else + echo "⏭️ Step 2/4: Skipping ArgoCD" + fi + + if should_run gitea; then + echo "📦 Step 3/4: Bootstrapping Gitea" + bootstrap_gitea + else + echo "⏭️ Step 3/4: Skipping Gitea" + fi + + if should_run cluster-forge; then + echo "📦 Step 4/4: Creating ClusterForge parent app" + apply_cluster_forge_parent_app + else + echo "⏭️ Step 4/4: Skipping ClusterForge" + fi + + echo "✅ Bootstrap sequence completed" fi } diff --git a/scripts/debug-cluster-state.sh b/scripts/debug-cluster-state.sh new file mode 100755 index 00000000..3b836999 --- /dev/null +++ b/scripts/debug-cluster-state.sh @@ -0,0 +1,133 @@ +#!/bin/bash + +# Debug script to check current cluster state for bootstrap troubleshooting + +set -euo pipefail + +echo "🔍 ClusterForge Bootstrap Debug Report" +echo "========================================" +echo + +# Check basic cluster connectivity +echo "📡 Cluster Connectivity:" +if kubectl auth can-i get pods >/dev/null 2>&1; then + echo " ✅ Kubectl access: Working" +else + echo " ❌ Kubectl access: Failed" + echo " Please check kubeconfig and cluster connectivity" + exit 1 +fi + +# Check namespaces +echo +echo "📦 Namespaces:" +for ns in argocd cf-gitea cf-openbao; do + if kubectl get namespace "$ns" >/dev/null 2>&1; then + echo " ✅ $ns: Exists" + else + echo " ❌ $ns: Missing" + fi +done + +# Check ArgoCD CRDs +echo +echo "🔧 ArgoCD CRDs:" +if kubectl get crd applications.argoproj.io >/dev/null 2>&1; then + echo " ✅ applications.argoproj.io: Available" +else + echo " ❌ applications.argoproj.io: Missing" + echo " ArgoCD must be deployed first" +fi + +# Check ArgoCD deployment +echo +echo "⚙️ ArgoCD Deployment:" +if kubectl get namespace argocd >/dev/null 
2>&1; then + argocd_pods=$(kubectl get pods -n argocd --no-headers 2>/dev/null | wc -l || echo "0") + ready_pods=$(kubectl get pods -n argocd --no-headers 2>/dev/null | grep -c "Running" || echo "0") + echo " 📊 Pods: $ready_pods/$argocd_pods running" + + if [ "$argocd_pods" -gt 0 ]; then + echo " 📋 Pod Status:" + kubectl get pods -n argocd --no-headers 2>/dev/null | while read pod status ready age; do + if [ "$status" = "Running" ]; then + echo " ✅ $pod: $status" + else + echo " ❌ $pod: $status" + fi + done + fi +else + echo " ❌ ArgoCD namespace not found" +fi + +# Check Gitea deployment +echo +echo "📚 Gitea Deployment:" +if kubectl get namespace cf-gitea >/dev/null 2>&1; then + gitea_pods=$(kubectl get pods -n cf-gitea --no-headers 2>/dev/null | wc -l || echo "0") + ready_gitea=$(kubectl get pods -n cf-gitea --no-headers 2>/dev/null | grep -c "Running" || echo "0") + echo " 📊 Pods: $ready_gitea/$gitea_pods running" +else + echo " ❌ Gitea namespace not found" +fi + +# Check OpenBao deployment +echo +echo "🔐 OpenBao Deployment:" +if kubectl get namespace cf-openbao >/dev/null 2>&1; then + openbao_pods=$(kubectl get pods -n cf-openbao --no-headers 2>/dev/null | wc -l || echo "0") + ready_openbao=$(kubectl get pods -n cf-openbao --no-headers 2>/dev/null | grep -c "Running" || echo "0") + echo " 📊 Pods: $ready_openbao/$openbao_pods running" + + # Check specifically for openbao-0 readiness + if kubectl get pod openbao-0 -n cf-openbao >/dev/null 2>&1; then + ready_status=$(kubectl get pod openbao-0 -n cf-openbao -o jsonpath='{.status.conditions[?(@.type=="Ready")].status}' || echo "Unknown") + if [ "$ready_status" = "True" ]; then + echo " ✅ openbao-0: Ready" + else + echo " ❌ openbao-0: Not Ready" + echo " This is the original issue - OpenBao readiness probe failing" + fi + fi +else + echo " ❌ OpenBao namespace not found" +fi + +# Check ArgoCD Applications +echo +echo "📱 ArgoCD Applications:" +if kubectl get crd applications.argoproj.io >/dev/null 2>&1 && kubectl get 
namespace argocd >/dev/null 2>&1; then + apps=$(kubectl get applications -n argocd --no-headers 2>/dev/null | wc -l || echo "0") + if [ "$apps" -gt 0 ]; then + echo " 📊 Found $apps applications:" + kubectl get applications -n argocd --no-headers 2>/dev/null | while read name health sync; do + if [ "$health" = "Healthy" ] && [ "$sync" = "Synced" ]; then + echo " ✅ $name: $health/$sync" + else + echo " ❌ $name: $health/$sync" + fi + done + else + echo " 📊 No applications found" + echo " This suggests cluster-forge parent app was not created" + fi +else + echo " ❌ Cannot check applications (ArgoCD not ready)" +fi + +echo +echo "📋 Recommendations:" +if ! kubectl get crd applications.argoproj.io >/dev/null 2>&1; then + echo " 🔄 Re-run bootstrap to deploy ArgoCD first" + echo " ./scripts/bootstrap.sh --cluster-size=" +elif [ "$apps" -eq 0 ]; then + echo " 🔄 Create cluster-forge parent application" + echo " ./scripts/bootstrap.sh --cluster-size= --apps=cluster-forge" +else + echo " ✅ Bootstrap appears to be in progress" + echo " Check individual application sync status in ArgoCD UI" +fi + +echo +echo "🔚 Debug report complete" \ No newline at end of file From 633632c6f8f60cda6ae199067e629b7047c0f028 Mon Sep 17 00:00:00 2001 From: Daniel Vaskivaara Date: Sun, 1 Mar 2026 22:42:36 +0200 Subject: [PATCH 074/115] fix: openbao cm refs; simplify some redundant bootstrap checks --- root/values.yaml | 2 +- scripts/bootstrap.sh | 23 +------------------ .../templates/cf-init-openbao-job.yaml | 4 ++-- 3 files changed, 4 insertions(+), 25 deletions(-) diff --git a/root/values.yaml b/root/values.yaml index aa16b3ad..13e1083f 100644 --- a/root/values.yaml +++ b/root/values.yaml @@ -204,7 +204,7 @@ apps: helmParameters: - name: domain value: "{{ .Values.global.domain }}" - syncWave: -25 + syncWave: -60 external-secrets: path: external-secrets/0.15.1 namespace: external-secrets diff --git a/scripts/bootstrap.sh b/scripts/bootstrap.sh index ce6a7ce5..4fdc520d 100755 --- 
a/scripts/bootstrap.sh +++ b/scripts/bootstrap.sh @@ -464,28 +464,7 @@ apply_cluster_forge_parent_app() { echo "=== Creating ClusterForge Parent App ===" echo "Target revision: $TARGET_REVISION" - if [ "$TEMPLATE_ONLY" = false ]; then - # Check if ArgoCD Application CRDs are available before proceeding - echo "🔍 Checking ArgoCD Application CRDs availability..." - if ! kubectl get crd applications.argoproj.io >/dev/null 2>&1; then - echo "❌ Error: ArgoCD Application CRDs not found!" - echo " ArgoCD must be deployed before creating Applications." - echo " Please ensure ArgoCD bootstrap completed successfully." - echo "" - echo "💡 Suggestion: Re-run bootstrap without --apps filter to deploy ArgoCD first:" - echo " $0 $DOMAIN --cluster-size=$CLUSTER_SIZE --target-revision=$TARGET_REVISION" - exit 1 - fi - echo "✅ ArgoCD Application CRDs found" - - # Verify ArgoCD namespace exists - if ! kubectl get namespace argocd >/dev/null 2>&1; then - echo "❌ Error: ArgoCD namespace 'argocd' not found!" - echo " Please ensure ArgoCD was deployed first." 
- exit 1 - fi - echo "✅ ArgoCD namespace found" - fi + helm template cluster-forge "${SOURCE_ROOT}/root" \ --show-only templates/cluster-forge.yaml \ diff --git a/scripts/init-openbao-job/templates/cf-init-openbao-job.yaml b/scripts/init-openbao-job/templates/cf-init-openbao-job.yaml index 31f0d6f7..c01b7d57 100644 --- a/scripts/init-openbao-job/templates/cf-init-openbao-job.yaml +++ b/scripts/init-openbao-job/templates/cf-init-openbao-job.yaml @@ -52,8 +52,8 @@ spec: defaultMode: 0755 - name: secrets-config configMap: - name: openbao-secrets-init-config + name: openbao-secrets-config - name: secret-manager configMap: - name: openbao-secret-manager-scripts-init + name: openbao-secret-manager-scripts defaultMode: 0755 From 71bd3e5b3eb267fa8b1c4ec80330c5d4c193cb24 Mon Sep 17 00:00:00 2001 From: Daniel Vaskivaara Date: Sun, 1 Mar 2026 22:51:30 +0200 Subject: [PATCH 075/115] fix: rm sessionAffinity warning for Gitea --- sources/gitea/12.3.0/templates/gitea/http-svc.yaml | 3 +++ sources/gitea/12.3.0/templates/gitea/ssh-svc.yaml | 3 +++ sources/gitea/12.3.0/values.yaml | 2 ++ 3 files changed, 8 insertions(+) diff --git a/sources/gitea/12.3.0/templates/gitea/http-svc.yaml b/sources/gitea/12.3.0/templates/gitea/http-svc.yaml index 28bd2182..9638d2f4 100644 --- a/sources/gitea/12.3.0/templates/gitea/http-svc.yaml +++ b/sources/gitea/12.3.0/templates/gitea/http-svc.yaml @@ -43,6 +43,9 @@ spec: {{- if and .Values.service.http.clusterIP (eq .Values.service.http.type "ClusterIP") }} clusterIP: {{ .Values.service.http.clusterIP }} {{- end }} + {{- if and .Values.service.http.sessionAffinity (ne .Values.service.http.clusterIP "None") (ne .Values.service.http.sessionAffinity "None") }} + sessionAffinity: {{ .Values.service.http.sessionAffinity }} + {{- end }} ports: - name: http port: {{ .Values.service.http.port }} diff --git a/sources/gitea/12.3.0/templates/gitea/ssh-svc.yaml b/sources/gitea/12.3.0/templates/gitea/ssh-svc.yaml index b2046fe1..1676aea6 100644 --- 
a/sources/gitea/12.3.0/templates/gitea/ssh-svc.yaml +++ b/sources/gitea/12.3.0/templates/gitea/ssh-svc.yaml @@ -29,6 +29,9 @@ spec: {{- if and .Values.service.ssh.clusterIP (eq .Values.service.ssh.type "ClusterIP") }} clusterIP: {{ .Values.service.ssh.clusterIP }} {{- end }} + {{- if and .Values.service.ssh.sessionAffinity (ne .Values.service.ssh.clusterIP "None") (ne .Values.service.ssh.sessionAffinity "None") }} + sessionAffinity: {{ .Values.service.ssh.sessionAffinity }} + {{- end }} {{- if .Values.service.ssh.externalIPs }} externalIPs: {{- toYaml .Values.service.ssh.externalIPs | nindent 4 }} diff --git a/sources/gitea/12.3.0/values.yaml b/sources/gitea/12.3.0/values.yaml index 4f323cb4..b09138bb 100644 --- a/sources/gitea/12.3.0/values.yaml +++ b/sources/gitea/12.3.0/values.yaml @@ -115,6 +115,7 @@ service: type: ClusterIP port: 3000 clusterIP: None + sessionAffinity: "None" loadBalancerIP: nodePort: externalTrafficPolicy: @@ -143,6 +144,7 @@ service: type: ClusterIP port: 22 clusterIP: None + sessionAffinity: "None" loadBalancerIP: nodePort: externalTrafficPolicy: From abfdb6f8645e6624adcf9762853200c7f8edf99b Mon Sep 17 00:00:00 2001 From: Daniel Vaskivaara Date: Sun, 1 Mar 2026 23:14:07 +0200 Subject: [PATCH 076/115] fix: KC resourcePreset for medium --- root/values_medium.yaml | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/root/values_medium.yaml b/root/values_medium.yaml index 98872ee6..a71a5b2f 100644 --- a/root/values_medium.yaml +++ b/root/values_medium.yaml @@ -56,7 +56,6 @@ apps: kyverno-policies-storage-local-path: namespace: kyverno path: kyverno-policies/storage-local-path - ignoreDifferences: [] syncWave: -20 ignoreDifferences: - group: kyverno.io @@ -189,6 +188,12 @@ apps: accessModes: - ReadWriteOnce + keycloak: + valuesObject: + # Increase memory resources for Keycloak to prevent OOMKilled during initialization + # Medium preset provides 1536Mi memory limit vs small preset's 768Mi + resourcesPreset: "medium" + 
otel-lgtm-stack: valuesObject: collectors: From d5255ad59131f89516bbb6ca5a312418d347fed2 Mon Sep 17 00:00:00 2001 From: Daniel Vaskivaara Date: Mon, 2 Mar 2026 10:36:16 +0200 Subject: [PATCH 077/115] fix: remove non-manifest printing when using --apps / --template-only flag --- docs/bootstrap_guide.md | 60 +++++++++++++++++++++++++++++++++++++++++ scripts/bootstrap.sh | 49 +++++++++++++++++++++------------ 2 files changed, 92 insertions(+), 17 deletions(-) diff --git a/docs/bootstrap_guide.md b/docs/bootstrap_guide.md index 6874bafe..0d250a19 100644 --- a/docs/bootstrap_guide.md +++ b/docs/bootstrap_guide.md @@ -23,7 +23,13 @@ This guide explains how to bootstrap a complete GitOps environment using Cluster ### Options +- **--apps=APP1,APP2**: Deploy only specified components (default: applies to cluster) + - options: `namespaces`, `argocd`, `gitea`, `cluster-forge`, or any cluster-forge child app (see values.yaml for app names) + - Use with `--template-only` to render instead of applying - **--CLUSTER_SIZE** `[small|medium|large]`: Cluster size configuration (default: `medium`) +- **--template-only**, **-t**: Output YAML manifests to stdout instead of applying to cluster +- **--target-revision**, **-r**: cluster-forge git revision for ArgoCD to sync from +- **--skip-deps**: Skip dependency checking (for advanced users) - **--help**, **-h**: Show usage information ### Examples @@ -34,6 +40,15 @@ This guide explains how to bootstrap a complete GitOps environment using Cluster # Large cluster ./scripts/bootstrap.sh example.com --CLUSTER_SIZE=large + +# Deploy only specific components +./scripts/bootstrap.sh example.com --apps=openbao,openbao-init + +# Render templates for debugging (doesn't apply) +./scripts/bootstrap.sh example.com --apps=gitea --template-only + +# Deploy from specific git branch +./scripts/bootstrap.sh example.com --target-revision=feature-branch ``` ## How It Works @@ -340,6 +355,51 @@ global: myCustomValue: "something" ``` +## Selective Component 
Deployment + +The `--apps` flag allows you to deploy only specific components instead of the full stack. This is useful for: + +- **Development workflows**: Deploy only the components you're working on +- **Troubleshooting**: Deploy components individually to isolate issues +- **Testing**: Validate specific component configurations +- **Incremental deployment**: Add components to an existing cluster + +### Bootstrap Components + +These are the core infrastructure components deployed manually via helm template: + +- `namespaces` - Creates required namespaces (argocd, cf-gitea, cf-openbao) +- `argocd` - GitOps controller for managing all other components +- `gitea` - Self-hosted Git server for cluster-forge and cluster-values repositories +- `cluster-forge` - ArgoCD parent application that manages all child apps + +### Cluster-Forge Child Apps + +Any application listed in `enabledApps` from values.yaml can be deployed individually: + +```bash +# Deploy only OpenBao components +./scripts/bootstrap.sh example.com --apps=openbao,openbao-init,openbao-config + +# Deploy only monitoring stack +./scripts/bootstrap.sh example.com --apps=prometheus-crds,otel-lgtm-stack,opentelemetry-operator + +# Deploy identity management +./scripts/bootstrap.sh example.com --apps=keycloak,cluster-auth,cluster-auth-config +``` + +### Template-Only Mode + +Combine with `--template-only` to render manifests without applying: + +```bash +# Generate YAML for debugging +./scripts/bootstrap.sh example.com --apps=keycloak --template-only > keycloak-manifests.yaml + +# View what would be deployed +./scripts/bootstrap.sh example.com --apps=openbao,openbao-init --template-only | kubectl diff -f - +``` + ## File Cleanup The bootstrap script automatically cleans up temporary files at the end: diff --git a/scripts/bootstrap.sh b/scripts/bootstrap.sh index 4fdc520d..a32952c6 100755 --- a/scripts/bootstrap.sh +++ b/scripts/bootstrap.sh @@ -18,10 +18,13 @@ VALUES_FILE="values.yaml" # Check for required 
dependencies check_dependencies() { + local silent="${1:-false}" local missing_deps=() local all_good=true - echo "=== Checking Dependencies ===" + if [ "$silent" != "true" ]; then + echo "=== Checking Dependencies ===" + fi # Define required programs with installation instructions declare -A REQUIRED_PROGRAMS=( @@ -45,22 +48,22 @@ check_dependencies() { case "$program" in "kubectl") version=$(kubectl version --client 2>/dev/null | head -n1 | cut -d' ' -f3 2>/dev/null || echo "unknown") - printf " ✓ %-12s %s (%s)\n" "$program" "$(command -v "$program")" "$version" + [ "$silent" != "true" ] && printf " ✓ %-12s %s (%s)\n" "$program" "$(command -v "$program")" "$version" ;; "helm") version=$(helm version --short --client 2>/dev/null | cut -d'+' -f1 2>/dev/null || echo "unknown") - printf " ✓ %-12s %s (%s)\n" "$program" "$(command -v "$program")" "$version" + [ "$silent" != "true" ] && printf " ✓ %-12s %s (%s)\n" "$program" "$(command -v "$program")" "$version" ;; "yq") version=$(yq --version 2>/dev/null | head -n1 | cut -d' ' -f4 2>/dev/null || echo "unknown") - printf " ✓ %-12s %s (%s)\n" "$program" "$(command -v "$program")" "$version" + [ "$silent" != "true" ] && printf " ✓ %-12s %s (%s)\n" "$program" "$(command -v "$program")" "$version" ;; *) - printf " ✓ %-12s %s\n" "$program" "$(command -v "$program")" + [ "$silent" != "true" ] && printf " ✓ %-12s %s\n" "$program" "$(command -v "$program")" ;; esac else - printf " ✗ %-12s MISSING\n" "$program" + [ "$silent" != "true" ] && printf " ✗ %-12s MISSING\n" "$program" missing_deps+=("$program") all_good=false fi @@ -69,9 +72,9 @@ check_dependencies() { # Check optional programs (warn but don't fail) for program in "${!OPTIONAL_PROGRAMS[@]}"; do if command -v "$program" >/dev/null 2>&1; then - printf " ✓ %-12s %s\n" "$program" "$(command -v "$program")" + [ "$silent" != "true" ] && printf " ✓ %-12s %s\n" "$program" "$(command -v "$program")" else - printf " ! 
%-12s MISSING (usually pre-installed)\n" "$program" + [ "$silent" != "true" ] && printf " ! %-12s MISSING (usually pre-installed)\n" "$program" fi done @@ -130,8 +133,10 @@ check_dependencies() { exit 1 fi - echo " ✓ All required dependencies are available!" - echo "" + if [ "$silent" != "true" ]; then + echo " ✓ All required dependencies are available!" + echo "" + fi } parse_args() { @@ -190,7 +195,6 @@ parse_args() { ;; --apps=*) APPS="${1#*=}" - TEMPLATE_ONLY=true shift ;; --help|-h) @@ -202,8 +206,9 @@ parse_args() { values_file Optional. Values .yaml file to use, default: root/values.yaml Options: - --apps=APP1,APP2 Render only specified components (implies --template-only) + --apps=APP1,APP2 Deploy only specified components options: namespaces, argocd, gitea, cluster-forge, or any cluster-forge child app (see values.yaml for app names) + Use with --template-only to render instead of applying --cluster-size, -s options: [small|medium|large], default: medium --target-revision, -r cluster-forge git revision for ArgoCD to sync from options: [tag|commit_hash|branch_name], default: $LATEST_RELEASE @@ -216,6 +221,8 @@ parse_args() { $0 112.100.97.17.nip.io $0 dev.example.com --cluster-size=small --target-revision=v1.8.0 $0 dev.example.com -s=small -r=feature-branch + $0 example.com --apps=openbao,openbao-init + $0 example.com --apps=keycloak -t Bootstrap Behavior: • Bootstrap deploys ArgoCD + Gitea directly (essential infrastructure) @@ -415,7 +422,6 @@ EOF # Render specific cluster-forge child apps (for --apps filtering) render_cluster_forge_child_apps() { - echo "=== Rendering ClusterForge Child Apps: ${APPS} ===" # Create a temporary values file with only the requested apps enabled local temp_values="/tmp/filtered_values.yaml" @@ -490,11 +496,20 @@ is_cluster_forge_child_app() { main() { parse_args "$@" - if [ "$SKIP_DEPENDENCY_CHECK" = false ]; then - check_dependencies + # Use silent dependency check when using --apps for cleaner output + if [ -z "$APPS" ]; 
then + if [ "$SKIP_DEPENDENCY_CHECK" = false ]; then + check_dependencies + fi + validate_args + print_summary + else + # For --apps mode, check deps silently and skip verbose output + if [ "$SKIP_DEPENDENCY_CHECK" = false ]; then + check_dependencies true + fi + validate_args fi - validate_args - print_summary # If specific apps are requested, check if they're cluster-forge child apps if [ -n "$APPS" ]; then From 1ec0892e805da39c074a17d46530022ea2979bfe Mon Sep 17 00:00:00 2001 From: Daniel Vaskivaara Date: Mon, 2 Mar 2026 11:09:45 +0200 Subject: [PATCH 078/115] qa: revert init-gitea-job scripts to see if needed with restructuring --- .../templates/cf-init-gitea-cm.yaml | 381 +++++------------- .../templates/cf-init-gitea-job.yaml | 2 - 2 files changed, 95 insertions(+), 288 deletions(-) diff --git a/scripts/init-gitea-job/templates/cf-init-gitea-cm.yaml b/scripts/init-gitea-job/templates/cf-init-gitea-cm.yaml index 294d5b40..36286875 100644 --- a/scripts/init-gitea-job/templates/cf-init-gitea-cm.yaml +++ b/scripts/init-gitea-job/templates/cf-init-gitea-cm.yaml @@ -4,7 +4,7 @@ metadata: name: gitea-init-scripts namespace: cf-gitea data: - init-gitea.sh: |+ + init-gitea.sh: | #!/bin/bash set -e @@ -12,205 +12,73 @@ data: GITEA_URL="${GITEA_URL:-http://gitea-http.cf-gitea.svc:3000}" GITEA_ADMIN_USER="${GITEA_ADMIN_USER:-silogen-admin}" - # Retry function with exponential backoff - retry_with_backoff() { - local max_attempts=$1 - local delay=$2 - local operation_name=$3 - shift 3 - local attempt=1 - - while [ $attempt -le $max_attempts ]; do - echo "[$operation_name] Attempt $attempt/$max_attempts..." - - if "$@"; then - echo "[$operation_name] Success on attempt $attempt" - return 0 - else - if [ $attempt -eq $max_attempts ]; then - echo "[$operation_name] Failed after $max_attempts attempts" - return 1 - fi - - echo "[$operation_name] Failed, waiting ${delay}s before retry..." 
- sleep $delay - delay=$((delay * 2)) # Exponential backoff - attempt=$((attempt + 1)) - fi - done - } - - # Wait for Gitea deployment to be ready - wait_for_gitea_ready() { - echo "Waiting for Gitea deployment to be ready..." - - # Wait for deployment rollout to complete - if ! kubectl rollout status deploy/gitea -n cf-gitea --timeout=300s; then - echo "ERROR: Gitea deployment failed to become ready" - return 1 - fi - - # Wait for Gitea service to be responsive - local gitea_ready=false - local attempts=0 - local max_attempts=30 - - while [ $attempts -lt $max_attempts ] && [ "$gitea_ready" = false ]; do - echo "Checking if Gitea API is responsive (attempt $((attempts + 1))/$max_attempts)..." - - if curl -s --connect-timeout 5 --max-time 10 "$GITEA_URL/api/v1/version" >/dev/null 2>&1; then - gitea_ready=true - echo "Gitea API is ready!" - else - echo "Gitea API not ready yet, waiting 10s..." - sleep 10 - attempts=$((attempts + 1)) - fi - done - - if [ "$gitea_ready" = false ]; then - echo "ERROR: Gitea API did not become ready after $((max_attempts * 10)) seconds" - return 1 - fi - - return 0 - } - - # Function to create admin access token with retry - create_admin_token() { - # Check if we already have a token secret - if kubectl get secret gitea-admin-token -n cf-gitea &>/dev/null; then - echo "Admin token secret already exists, using existing token" - return 0 - fi - - # Generate a unique token name - local token_name="api-token-$(date +%s)-$$" - - # Create the token and capture the result - local token_output - if token_output=$(kubectl -n cf-gitea exec deploy/gitea -c gitea -- \ - gitea admin user generate-access-token --raw --username "$GITEA_ADMIN_USER" --token-name "$token_name" --scopes all 2>/dev/null); then - - # Store the token in a global variable for use after retry - GITEA_TOKEN="$token_output" - - # Create the secret immediately - kubectl create secret generic gitea-admin-token --from-literal=token="${GITEA_TOKEN}" --namespace=cf-gitea 
--dry-run=client -o yaml | kubectl apply -f - - echo "Admin token '$token_name' created and secret stored" - return 0 - else - echo "Failed to create admin token '$token_name'" - return 1 - fi - } - - echo "=== Gitea Initialization Started ===" - echo "Domain: $DOMAIN" - echo "Gitea URL: $GITEA_URL" - echo "Admin User: $GITEA_ADMIN_USER" - echo "" - - # Step -1: Wait for Gitea to be ready - if ! wait_for_gitea_ready; then - echo "FATAL: Gitea is not ready, aborting initialization" - exit 1 - fi - echo "Step 0: Create admin access token" - if retry_with_backoff 5 3 "Create admin token" create_admin_token; then - # Token is already created and stored in GITEA_TOKEN by the create_admin_token function - if [ -z "$GITEA_TOKEN" ]; then - # If GITEA_TOKEN is empty, try to get it from the existing secret - GITEA_TOKEN=$(kubectl get secret gitea-admin-token -n cf-gitea -o jsonpath='{.data.token}' | base64 -d) - fi - echo "Admin token ready for use" - else - echo "FATAL: Failed to create admin token" - exit 1 - fi - - # Function to create organization - create_organization() { - local response=$(curl -s -w "%{http_code}" -o /tmp/org_response.json -X POST "${GITEA_URL}/api/v1/orgs" \ - -H "Authorization: token ${GITEA_TOKEN}" \ - -H "Content-Type: application/json" \ - -d '{ - "username": "cluster-org", - "full_name": "Cluster Organization", - "description": "Organization for cluster management repositories", - "visibility": "public" - }') - - case $response in - 201|409) # 201=created, 409=already exists - echo "Organization 'cluster-org' ready" - return 0 - ;; - *) - echo "Failed to create organization (HTTP $response)" - cat /tmp/org_response.json 2>/dev/null || true - return 1 - ;; - esac - } + GITEA_TOKEN=$(kubectl -n cf-gitea exec deploy/gitea -c gitea -- \ + gitea admin user generate-access-token --raw --username "$GITEA_ADMIN_USER" --token-name "api-token-$(date +%s)" --scopes all + ) + kubectl create secret generic gitea-admin-token 
--from-literal=token="${GITEA_TOKEN}" --namespace=cf-gitea --dry-run=client -o yaml | kubectl apply -f - echo "Step 1: Creating organization 'cluster-org'..." - if ! retry_with_backoff 3 5 "Create organization" create_organization; then - echo "FATAL: Failed to create organization after retries" - exit 1 - fi - - # Function to check if repository exists - check_repo_exists() { - curl -s -f -H "Authorization: token ${GITEA_TOKEN}" "${GITEA_URL}/api/v1/repos/cluster-org/cluster-forge" >/dev/null 2>&1 - } - - # Function to migrate repository - migrate_repository() { - local HTTP_CODE=$(curl -s -w "%{http_code}" -o /tmp/migration_response.json -X POST "${GITEA_URL}/api/v1/repos/migrate" \ - -H "Authorization: token ${GITEA_TOKEN}" \ - -H "Content-Type: application/json" \ - -d '{ - "clone_addr": "https://github.com/silogen/cluster-forge.git", - "repo_name": "cluster-forge", - "repo_owner": "cluster-org", - "service": "git", - "mirror": true, - "mirror_interval": "15m", - "private": false - }') - - case $HTTP_CODE in - 201) - echo "Repository migration completed successfully" - return 0 - ;; - 409) - echo "Repository already exists" - return 0 - ;; - *) - echo "Migration failed with HTTP $HTTP_CODE" - cat /tmp/migration_response.json 2>/dev/null || true - - # Clean up failed repository - echo "Cleaning up failed repository..." - curl -s -X DELETE "${GITEA_URL}/api/v1/repos/cluster-org/cluster-forge" \ - -H "Authorization: token ${GITEA_TOKEN}" >/dev/null 2>&1 - return 1 - ;; - esac - } + curl -X POST "${GITEA_URL}/api/v1/orgs" \ + -H "Authorization: token ${GITEA_TOKEN}" \ + -H "Content-Type: application/json" \ + -d '{ + "username": "cluster-org", + "full_name": "Cluster Organization", + "description": "Organization for cluster management repositories", + "visibility": "public" + }' || echo "Failed to create organization, might already exist" echo "Step 2: Creating repository 'cluster-forge' as mirror..." 
- if check_repo_exists; then + if curl -s -f -H "Authorization: token ${GITEA_TOKEN}" "${GITEA_URL}/api/v1/repos/cluster-org/cluster-forge" >/dev/null 2>&1; then echo "Repository 'cluster-forge' already exists" else - if ! retry_with_backoff 5 5 "Migrate repository" migrate_repository; then - echo "FATAL: Failed to create mirror repository after retries" - exit 1 - fi + MAX_ATTEMPTS=5 + ATTEMPT=1 + SUCCESS=false + + while [ $ATTEMPT -le $MAX_ATTEMPTS ] && [ "$SUCCESS" = false ]; do + echo "Migration attempt $ATTEMPT/$MAX_ATTEMPTS..." + + HTTP_CODE=$(curl -s -w "%{http_code}" -o /tmp/migration_response.json -X POST "${GITEA_URL}/api/v1/repos/migrate" \ + -H "Authorization: token ${GITEA_TOKEN}" \ + -H "Content-Type: application/json" \ + -d '{ + "clone_addr": "https://github.com/silogen/cluster-forge.git", + "repo_name": "cluster-forge", + "repo_owner": "cluster-org", + "service": "git", + "mirror": true, + "mirror_interval": "15m", + "private": false + }') + + case $HTTP_CODE in + 201) + echo "Repository migration completed successfully" + SUCCESS=true + ;; + *) + echo "Attempt $ATTEMPT failed with HTTP $HTTP_CODE, retrying..." + echo "Cleaning up failed repository..." + sleep 1 + curl -s -X DELETE "${GITEA_URL}/api/v1/repos/cluster-org/cluster-forge" \ + -H "Authorization: token ${GITEA_TOKEN}" >/dev/null 2>&1 + echo "Failed repository deleted..." 
+ ;; + esac + + if [ "$SUCCESS" = false ] && [ $ATTEMPT -lt $MAX_ATTEMPTS ]; then + sleep 5 + fi + + ATTEMPT=$((ATTEMPT + 1)) + done + + if [ "$SUCCESS" = false ]; then + echo "ERROR: Failed to create mirror repository after $MAX_ATTEMPTS attempts" + exit 1 + fi fi # set mirror default branch (--dev mode) @@ -221,39 +89,19 @@ data: -d '{"default_branch": "{{ .Values.targetRevision }}"}' fi - # Function to create cluster-values repository - create_cluster_values_repo() { - local response=$(curl -s -w "%{http_code}" -o /tmp/repo_response.json -X POST "${GITEA_URL}/api/v1/orgs/cluster-org/repos" \ - -H "Authorization: token ${GITEA_TOKEN}" \ - -H "Content-Type: application/json" \ - -d '{ - "name": "cluster-values", - "description": "Cluster configuration values repository", - "private": false, - "auto_init": true - }') - - case $response in - 201|409) # 201=created, 409=already exists - echo "Repository 'cluster-values' ready" - return 0 - ;; - *) - echo "Failed to create cluster-values repository (HTTP $response)" - cat /tmp/repo_response.json 2>/dev/null || true - return 1 - ;; - esac - } - echo "Step 3: Creating repository 'cluster-values'..." - if ! retry_with_backoff 3 5 "Create cluster-values repo" create_cluster_values_repo; then - echo "FATAL: Failed to create cluster-values repository after retries" - exit 1 - fi - - echo "Step 4: Creating user 'devuser' (optional)..." - response=$(curl -s -w "%{http_code}" -o /tmp/user_response.json -X POST "${GITEA_URL}/api/v1/admin/users" \ + curl -X POST "${GITEA_URL}/api/v1/orgs/cluster-org/repos" \ + -H "Authorization: token ${GITEA_TOKEN}" \ + -H "Content-Type: application/json" \ + -d '{ + "name": "cluster-values", + "description": "Cluster configuration values repository", + "private": false, + "auto_init": true + }' || echo "Repository might already exist" + + echo "Step 4: Creating user 'devuser'..." 
+ curl -X POST "${GITEA_URL}/api/v1/admin/users" \ -H "Authorization: token ${GITEA_TOKEN}" \ -H "Content-Type: application/json" \ -d '{ @@ -263,36 +111,20 @@ data: "full_name": "Dev User", "must_change_password": false, "send_notify": false - }') - - case $response in - 201) echo "User 'devuser' created successfully" ;; - 422) echo "User 'devuser' already exists" ;; - *) echo "User creation failed (HTTP $response) - continuing anyway" ;; - esac + }' || echo "User creation failed, might already exist or insufficient permissions" echo "Step 5: Getting organization 'cluster-org' owners team id..." OWNERS_TEAM_ID=$(curl -s -H "Authorization: token ${GITEA_TOKEN}" -H "Content-Type: application/json" \ "${GITEA_URL}/api/v1/orgs/cluster-org/teams" | \ - jq -r '.[] | select(.name == "Owners") | .id' 2>/dev/null) + jq -r '.[] | select(.name == "Owners") | .id') - if [ -n "$OWNERS_TEAM_ID" ] && [ "$OWNERS_TEAM_ID" != "null" ]; then - echo "Step 6: Adding user 'devuser' to organization 'cluster-org' owners..." - response=$(curl -s -w "%{http_code}" -o /dev/null -X PUT "${GITEA_URL}/api/v1/teams/${OWNERS_TEAM_ID}/members/devuser" \ - -H "Authorization: token ${GITEA_TOKEN}" \ - -H "Content-Type: application/json") - - case $response in - 200|204) echo "User 'devuser' added to organization successfully" ;; - *) echo "Failed to add user to organization (HTTP $response) - continuing anyway" ;; - esac - else - echo "Could not find owners team ID - skipping user organization assignment" - fi + echo "Step 6: Adding user 'devuser' to organization 'cluster-org' owners..." + curl -X PUT "${GITEA_URL}/api/v1/teams/${OWNERS_TEAM_ID}/members/devuser" \ + -H "Authorization: token ${GITEA_TOKEN}" \ + -H "Content-Type: application/json" || echo "Failed to add user to organization" - # Function to create values.yaml file - create_values_file() { - cat > /tmp/values.yaml << 'VALUESEOF' + echo "Step 7: Creating values.yaml file with cluster-forge reference in cluster-values repo..." 
+ cat > /tmp/values.yaml << 'EOF' clusterForge: repoURL: http://gitea-http.cf-gitea.svc:3000/cluster-org/cluster-forge.git path: root @@ -302,41 +134,18 @@ data: global: clusterSize: {{ .Values.clusterSize }} domain: DOMAIN_PLACEHOLDER - VALUESEOF - - sed -i "s/DOMAIN_PLACEHOLDER/${DOMAIN}/g" /tmp/values.yaml - - local encoded_content=$(base64 -w 0 < /tmp/values.yaml) - local response=$(curl -s -w "%{http_code}" -o /tmp/values_response.json -X POST "${GITEA_URL}/api/v1/repos/cluster-org/cluster-values/contents/values.yaml" \ - -H "Authorization: token ${GITEA_TOKEN}" \ - -H "Content-Type: application/json" \ - -d "{ - \"message\": \"Initialize cluster values configuration\", - \"content\": \"$encoded_content\", - \"branch\": \"main\" - }") - - case $response in - 201) - echo "Values.yaml file created successfully" - return 0 - ;; - 422) - echo "Values.yaml file already exists" - return 0 - ;; - *) - echo "Failed to create values.yaml file (HTTP $response)" - cat /tmp/values_response.json 2>/dev/null || true - return 1 - ;; - esac - } + + EOF - echo "Step 7: Creating values.yaml file with cluster-forge reference in cluster-values repo..." - if retry_with_backoff 3 5 "Create values.yaml file" create_values_file; then - echo "=== Gitea Setup Completed Successfully! ===" - else - echo "WARNING: Failed to create values.yaml file, but core setup is complete" - echo "=== Gitea Setup Completed with Warnings ===" - fi + sed -i "s/DOMAIN_PLACEHOLDER/${DOMAIN}/g" /tmp/values.yaml + + curl -X POST "${GITEA_URL}/api/v1/repos/cluster-org/cluster-values/contents/values.yaml" \ + -H "Authorization: token ${GITEA_TOKEN}" \ + -H "Content-Type: application/json" \ + -d '{ + "message": "Initialize cluster values configuration", + "content": "'$(base64 -w 0 < /tmp/values.yaml)'", + "branch": "main" + }' || echo "Failed to create values.yaml file" + + echo "Setup completed successfully!" 
diff --git a/scripts/init-gitea-job/templates/cf-init-gitea-job.yaml b/scripts/init-gitea-job/templates/cf-init-gitea-job.yaml index 15c12aa9..0da19d31 100644 --- a/scripts/init-gitea-job/templates/cf-init-gitea-job.yaml +++ b/scripts/init-gitea-job/templates/cf-init-gitea-job.yaml @@ -4,8 +4,6 @@ metadata: name: gitea-init-job namespace: cf-gitea spec: - backoffLimit: 2 - activeDeadlineSeconds: 1200 # 20 minutes total timeout template: spec: restartPolicy: Never From d73d7987eaded146e6baa662d2d6c167fb215918 Mon Sep 17 00:00:00 2001 From: Daniel Vaskivaara Date: Mon, 2 Mar 2026 11:12:45 +0200 Subject: [PATCH 079/115] qa: rm unneeded debug script --- scripts/debug-cluster-state.sh | 133 --------------------------------- 1 file changed, 133 deletions(-) delete mode 100755 scripts/debug-cluster-state.sh diff --git a/scripts/debug-cluster-state.sh b/scripts/debug-cluster-state.sh deleted file mode 100755 index 3b836999..00000000 --- a/scripts/debug-cluster-state.sh +++ /dev/null @@ -1,133 +0,0 @@ -#!/bin/bash - -# Debug script to check current cluster state for bootstrap troubleshooting - -set -euo pipefail - -echo "🔍 ClusterForge Bootstrap Debug Report" -echo "========================================" -echo - -# Check basic cluster connectivity -echo "📡 Cluster Connectivity:" -if kubectl auth can-i get pods >/dev/null 2>&1; then - echo " ✅ Kubectl access: Working" -else - echo " ❌ Kubectl access: Failed" - echo " Please check kubeconfig and cluster connectivity" - exit 1 -fi - -# Check namespaces -echo -echo "📦 Namespaces:" -for ns in argocd cf-gitea cf-openbao; do - if kubectl get namespace "$ns" >/dev/null 2>&1; then - echo " ✅ $ns: Exists" - else - echo " ❌ $ns: Missing" - fi -done - -# Check ArgoCD CRDs -echo -echo "🔧 ArgoCD CRDs:" -if kubectl get crd applications.argoproj.io >/dev/null 2>&1; then - echo " ✅ applications.argoproj.io: Available" -else - echo " ❌ applications.argoproj.io: Missing" - echo " ArgoCD must be deployed first" -fi - -# Check ArgoCD 
deployment -echo -echo "⚙️ ArgoCD Deployment:" -if kubectl get namespace argocd >/dev/null 2>&1; then - argocd_pods=$(kubectl get pods -n argocd --no-headers 2>/dev/null | wc -l || echo "0") - ready_pods=$(kubectl get pods -n argocd --no-headers 2>/dev/null | grep -c "Running" || echo "0") - echo " 📊 Pods: $ready_pods/$argocd_pods running" - - if [ "$argocd_pods" -gt 0 ]; then - echo " 📋 Pod Status:" - kubectl get pods -n argocd --no-headers 2>/dev/null | while read pod status ready age; do - if [ "$status" = "Running" ]; then - echo " ✅ $pod: $status" - else - echo " ❌ $pod: $status" - fi - done - fi -else - echo " ❌ ArgoCD namespace not found" -fi - -# Check Gitea deployment -echo -echo "📚 Gitea Deployment:" -if kubectl get namespace cf-gitea >/dev/null 2>&1; then - gitea_pods=$(kubectl get pods -n cf-gitea --no-headers 2>/dev/null | wc -l || echo "0") - ready_gitea=$(kubectl get pods -n cf-gitea --no-headers 2>/dev/null | grep -c "Running" || echo "0") - echo " 📊 Pods: $ready_gitea/$gitea_pods running" -else - echo " ❌ Gitea namespace not found" -fi - -# Check OpenBao deployment -echo -echo "🔐 OpenBao Deployment:" -if kubectl get namespace cf-openbao >/dev/null 2>&1; then - openbao_pods=$(kubectl get pods -n cf-openbao --no-headers 2>/dev/null | wc -l || echo "0") - ready_openbao=$(kubectl get pods -n cf-openbao --no-headers 2>/dev/null | grep -c "Running" || echo "0") - echo " 📊 Pods: $ready_openbao/$openbao_pods running" - - # Check specifically for openbao-0 readiness - if kubectl get pod openbao-0 -n cf-openbao >/dev/null 2>&1; then - ready_status=$(kubectl get pod openbao-0 -n cf-openbao -o jsonpath='{.status.conditions[?(@.type=="Ready")].status}' || echo "Unknown") - if [ "$ready_status" = "True" ]; then - echo " ✅ openbao-0: Ready" - else - echo " ❌ openbao-0: Not Ready" - echo " This is the original issue - OpenBao readiness probe failing" - fi - fi -else - echo " ❌ OpenBao namespace not found" -fi - -# Check ArgoCD Applications -echo -echo "📱 ArgoCD 
Applications:" -if kubectl get crd applications.argoproj.io >/dev/null 2>&1 && kubectl get namespace argocd >/dev/null 2>&1; then - apps=$(kubectl get applications -n argocd --no-headers 2>/dev/null | wc -l || echo "0") - if [ "$apps" -gt 0 ]; then - echo " 📊 Found $apps applications:" - kubectl get applications -n argocd --no-headers 2>/dev/null | while read name health sync; do - if [ "$health" = "Healthy" ] && [ "$sync" = "Synced" ]; then - echo " ✅ $name: $health/$sync" - else - echo " ❌ $name: $health/$sync" - fi - done - else - echo " 📊 No applications found" - echo " This suggests cluster-forge parent app was not created" - fi -else - echo " ❌ Cannot check applications (ArgoCD not ready)" -fi - -echo -echo "📋 Recommendations:" -if ! kubectl get crd applications.argoproj.io >/dev/null 2>&1; then - echo " 🔄 Re-run bootstrap to deploy ArgoCD first" - echo " ./scripts/bootstrap.sh --cluster-size=" -elif [ "$apps" -eq 0 ]; then - echo " 🔄 Create cluster-forge parent application" - echo " ./scripts/bootstrap.sh --cluster-size= --apps=cluster-forge" -else - echo " ✅ Bootstrap appears to be in progress" - echo " Check individual application sync status in ArgoCD UI" -fi - -echo -echo "🔚 Debug report complete" \ No newline at end of file From cbc73c17a6f915fde63f44e2bebe81b5f8bee137 Mon Sep 17 00:00:00 2001 From: Daniel Vaskivaara Date: Mon, 2 Mar 2026 11:21:01 +0200 Subject: [PATCH 080/115] refactor: move Gitea sessionAffinity warning from vendor source to values.yaml --- root/values.yaml | 5 +++++ sources/gitea/12.3.0/templates/gitea/http-svc.yaml | 3 --- sources/gitea/12.3.0/templates/gitea/ssh-svc.yaml | 3 --- sources/gitea/12.3.0/values.yaml | 2 -- 4 files changed, 5 insertions(+), 8 deletions(-) diff --git a/root/values.yaml b/root/values.yaml index 13e1083f..a438b5dc 100644 --- a/root/values.yaml +++ b/root/values.yaml @@ -247,6 +247,11 @@ apps: enabled: true test: enabled: false + service: + http: + sessionAffinity: "None" + ssh: + sessionAffinity: "None" 
helmParameters: - name: clusterDomain value: "{{ .Values.global.domain }}" diff --git a/sources/gitea/12.3.0/templates/gitea/http-svc.yaml b/sources/gitea/12.3.0/templates/gitea/http-svc.yaml index 9638d2f4..28bd2182 100644 --- a/sources/gitea/12.3.0/templates/gitea/http-svc.yaml +++ b/sources/gitea/12.3.0/templates/gitea/http-svc.yaml @@ -43,9 +43,6 @@ spec: {{- if and .Values.service.http.clusterIP (eq .Values.service.http.type "ClusterIP") }} clusterIP: {{ .Values.service.http.clusterIP }} {{- end }} - {{- if and .Values.service.http.sessionAffinity (ne .Values.service.http.clusterIP "None") (ne .Values.service.http.sessionAffinity "None") }} - sessionAffinity: {{ .Values.service.http.sessionAffinity }} - {{- end }} ports: - name: http port: {{ .Values.service.http.port }} diff --git a/sources/gitea/12.3.0/templates/gitea/ssh-svc.yaml b/sources/gitea/12.3.0/templates/gitea/ssh-svc.yaml index 1676aea6..b2046fe1 100644 --- a/sources/gitea/12.3.0/templates/gitea/ssh-svc.yaml +++ b/sources/gitea/12.3.0/templates/gitea/ssh-svc.yaml @@ -29,9 +29,6 @@ spec: {{- if and .Values.service.ssh.clusterIP (eq .Values.service.ssh.type "ClusterIP") }} clusterIP: {{ .Values.service.ssh.clusterIP }} {{- end }} - {{- if and .Values.service.ssh.sessionAffinity (ne .Values.service.ssh.clusterIP "None") (ne .Values.service.ssh.sessionAffinity "None") }} - sessionAffinity: {{ .Values.service.ssh.sessionAffinity }} - {{- end }} {{- if .Values.service.ssh.externalIPs }} externalIPs: {{- toYaml .Values.service.ssh.externalIPs | nindent 4 }} diff --git a/sources/gitea/12.3.0/values.yaml b/sources/gitea/12.3.0/values.yaml index b09138bb..4f323cb4 100644 --- a/sources/gitea/12.3.0/values.yaml +++ b/sources/gitea/12.3.0/values.yaml @@ -115,7 +115,6 @@ service: type: ClusterIP port: 3000 clusterIP: None - sessionAffinity: "None" loadBalancerIP: nodePort: externalTrafficPolicy: @@ -144,7 +143,6 @@ service: type: ClusterIP port: 22 clusterIP: None - sessionAffinity: "None" loadBalancerIP: 
nodePort: externalTrafficPolicy: From e3a803a1acaceb0763093bbe2d1a1c7fff8f9b20 Mon Sep 17 00:00:00 2001 From: Daniel Vaskivaara Date: Mon, 2 Mar 2026 11:37:31 +0200 Subject: [PATCH 081/115] refactor: alpha-sort apps --- root/values.yaml | 915 +++++++++++++++---------------- root/values_large.yaml | 41 +- root/values_large_original.yaml | 64 +++ root/values_medium.yaml | 208 ++++--- root/values_medium_original.yaml | 212 +++++++ root/values_original.yaml | 808 +++++++++++++++++++++++++++ root/values_small.yaml | 167 +++--- root/values_small_original.yaml | 189 +++++++ 8 files changed, 1912 insertions(+), 692 deletions(-) create mode 100644 root/values_large_original.yaml create mode 100644 root/values_medium_original.yaml create mode 100644 root/values_original.yaml create mode 100644 root/values_small_original.yaml diff --git a/root/values.yaml b/root/values.yaml index a438b5dc..42904a61 100644 --- a/root/values.yaml +++ b/root/values.yaml @@ -1,46 +1,97 @@ clusterForge: repoUrl: "http://gitea-http.cf-gitea.svc:3000/cluster-org/cluster-forge.git" targetRevision: main -# source helm values file from separate git repo externalValues: enabled: true + path: values.yaml repoUrl: "http://gitea-http.cf-gitea.svc:3000/cluster-org/cluster-values.git" targetRevision: main - path: values.yaml global: - domain: # to be filled by bootstrap script clusterSize: # to be filled by bootstrap script (small, medium, large) -# enabledApps list removed - each cluster size (small/medium/large) defines its own apps -# This eliminates the override/clobbering issue and makes cluster sizes more independent + domain: # to be filled by bootstrap script + apps: - # Core apps + aim-cluster-model-source: + namespace: kaiwo-system + path: aim-cluster-model-source + syncWave: -20 + airm: + helmParameters: + - name: airm-api.airm.appDomain + value: "{{ .Values.global.domain }}" + ignoreDifferences: + - group: external-secrets.io + jqPathExpressions: + - ".spec.data[].remoteRef.conversionStrategy" + 
- ".spec.data[].remoteRef.decodingStrategy" + - ".spec.data[].remoteRef.metadataPolicy" + kind: ExternalSecret + - group: kyverno.io + jqPathExpressions: + - ".spec.rules" + kind: ClusterPolicy + namespace: airm + path: airm/0.3.5 + syncWave: 0 + valuesFile: values.yaml + amd-gpu-operator: + namespace: kube-amd-gpu + path: amd-gpu-operator/v1.4.1 + syncWave: -10 + valuesObject: + crds: + defaultCR: + install: false + amd-gpu-operator-config: + namespace: kube-amd-gpu + path: amd-gpu-operator-config + syncWave: 0 + appwrapper: + namespace: appwrapper-system + path: appwrapper/v1.1.2 + syncWave: -10 argocd: - path: argocd/8.3.5 + helmParameters: + - name: global.domain + value: "argocd.{{ .Values.global.domain }}" + - name: configs.cm.oidc\.config + value: | + name: Keycloak + issuer: https://kc.{{ .Values.global.domain }}/realms/airm + clientID: argocd + clientSecret: $$argocd-oidc-creds:client_secret + rootCA: $cluster-tls:cert + requestedScopes: ["openid", "profile", "email", "groups"] namespace: argocd + path: argocd/8.3.5 + syncWave: -30 valuesObject: applicationSet: replicas: 1 configs: cm: create: true - resource.customizations.health.opentelemetry.io_OpenTelemetryCollector: | - hs = {} - hs.status = "Healthy" - hs.message = "Always Healthy - See https://github.com/argoproj/argo-cd/pull/24284" - return hs - resource.customizations.health.keda.sh_ScaledObject: | + resource.customizations.health.apps_StatefulSet: | + -- Custom health check for OpenBao StatefulSet + -- Uses standard StatefulSet readiness for now - openbao-init job handles actual initialization hs = {} if obj.status ~= nil then - if obj.status.conditions ~= nil then - for _, condition in ipairs(obj.status.conditions) do - if condition.type == "Ready" then - if condition.status == "True" then - hs.status = "Healthy" - hs.message = "ScaledObject is ready" - else - hs.status = "Degraded" - hs.message = condition.reason or "ScaledObject not ready" - end + if obj.status.readyReplicas ~= nil and 
obj.status.replicas ~= nil then + if obj.status.readyReplicas == obj.status.replicas then + hs.status = "Healthy" + hs.message = "StatefulSet is ready" + else + hs.status = "Progressing" + hs.message = "Waiting for StatefulSet replicas to be ready" + end + else + hs.status = "Progressing" + hs.message = "Waiting for StatefulSet status" + end + else + hs.status = "Progressing" + hs.message = "Waiting for StatefulSet status" + end return hs resource.customizations.health.batch_Job: | -- Custom health check for Jobs, especially openbao-init @@ -79,27 +130,24 @@ apps: hs.message = "ScaledObject status unknown" end return hs - resource.customizations.health.apps_StatefulSet: | - -- Custom health check for OpenBao StatefulSet - -- Uses standard StatefulSet readiness for now - openbao-init job handles actual initialization + resource.customizations.health.keda.sh_ScaledObject: | hs = {} if obj.status ~= nil then - if obj.status.readyReplicas ~= nil and obj.status.replicas ~= nil then - if obj.status.readyReplicas == obj.status.replicas then - hs.status = "Healthy" - hs.message = "StatefulSet is ready" - else - hs.status = "Progressing" - hs.message = "Waiting for StatefulSet replicas to be ready" - end - else - hs.status = "Progressing" - hs.message = "Waiting for StatefulSet status" - end - else - hs.status = "Progressing" - hs.message = "Waiting for StatefulSet status" - end + if obj.status.conditions ~= nil then + for _, condition in ipairs(obj.status.conditions) do + if condition.type == "Ready" then + if condition.status == "True" then + hs.status = "Healthy" + hs.message = "ScaledObject is ready" + else + hs.status = "Degraded" + hs.message = condition.reason or "ScaledObject not ready" + end + return hs + resource.customizations.health.opentelemetry.io_OpenTelemetryCollector: | + hs = {} + hs.status = "Healthy" + hs.message = "Always Healthy - See https://github.com/argoproj/argo-cd/pull/24284" return hs params: server.insecure: true @@ -109,330 +157,216 @@ apps: 
g, argocd-users, role:admin controller: replicas: 1 + global: + domain: # to be filled by cluster-forge app redis: enabled: true redis-ha: enabled: false repoServer: - replicas: 1 autoscaling: enabled: false - server: replicas: 1 + server: autoscaling: enabled: false - global: - domain: # to be filled by cluster-forge app - helmParameters: - - name: global.domain - value: "argocd.{{ .Values.global.domain }}" - - name: configs.cm.oidc\.config - value: | - name: Keycloak - issuer: https://kc.{{ .Values.global.domain }}/realms/airm - clientID: argocd - clientSecret: $$argocd-oidc-creds:client_secret - rootCA: $cluster-tls:cert - requestedScopes: ["openid", "profile", "email", "groups"] - syncWave: -30 + replicas: 1 argocd-config: - path: argocd-config - namespace: argocd - syncWave: 5 ignoreDifferences: - group: external-secrets.io - kind: ExternalSecret jqPathExpressions: - ".spec.data[].remoteRef.conversionStrategy" - ".spec.data[].remoteRef.decodingStrategy" - ".spec.data[].remoteRef.metadataPolicy" + kind: ExternalSecret + namespace: argocd + path: argocd-config + syncWave: 5 cert-manager: namespace: cert-manager path: cert-manager/v1.18.2 syncWave: -40 valuesObject: installCRDs: true - openbao: - path: openbao/0.18.2 - namespace: cf-openbao - valuesObject: - injector: - enabled: false - server: - affinity: | - podAntiAffinity: - preferredDuringSchedulingIgnoredDuringExecution: - - weight: 100 - podAffinityTerm: - labelSelector: - matchLabels: - app.kubernetes.io/name: openbao - app.kubernetes.io/instance: openbao - component: server - topologyKey: kubernetes.io/hostname - ha: - enabled: false - raft: - enabled: false - replicas: 1 - ui: - enabled: true - syncWave: -70 + cluster-auth: + namespace: cluster-auth + path: cluster-auth/0.5.0 + syncWave: -20 + valuesFile: values.yaml + cluster-auth-config: ignoreDifferences: - - group: "apps" - kind: "Deployment" - jsonPointers: - - /spec/replicas - - group: "apps" - kind: "StatefulSet" - name: "openbao" - jsonPointers: 
- - /spec/volumeClaimTemplates - openbao-init: - path: ../scripts/init-openbao-job - namespace: cf-openbao - valuesObject: - domain: # to be filled by cluster-forge app - helmParameters: - - name: domain - value: "{{ .Values.global.domain }}" - syncWave: -50 - openbao-config: - path: openbao-config/0.1.0 - namespace: cf-openbao + - group: external-secrets.io + jqPathExpressions: + - ".spec.data[].remoteRef.conversionStrategy" + - ".spec.data[].remoteRef.decodingStrategy" + - ".spec.data[].remoteRef.metadataPolicy" + kind: ExternalSecret + namespace: cluster-auth + path: cluster-auth-config + syncWave: 5 + cnpg-operator: + namespace: cnpg-system + path: cnpg-operator/0.26.0 + syncWave: -30 valuesFile: values.yaml - helmParameters: - - name: domain - value: "{{ .Values.global.domain }}" - syncWave: -60 external-secrets: - path: external-secrets/0.15.1 namespace: external-secrets - valuesFile: values.yaml + path: external-secrets/0.15.1 syncWave: -40 + valuesFile: values.yaml external-secrets-config: - path: external-secrets-config namespace: external-secrets + path: external-secrets-config syncWave: -10 + gateway-api: + namespace: default + path: gateway-api/v1.3.0 + syncWave: -50 gitea: - path: gitea/12.3.0 + helmParameters: + - name: clusterDomain + value: "{{ .Values.global.domain }}" + - name: gitea.config.server.ROOT_URL + value: "https://gitea.{{ .Values.global.domain }}" namespace: cf-gitea + path: gitea/12.3.0 + syncWave: -30 valuesObject: clusterDomain: # to be filled by cluster-forge app - strategy: - type: "Recreate" gitea: admin: existingSecret: gitea-admin-credentials config: - server: - ROOT_URL: # to be filled by cluster-forge app - database: - DB_TYPE: sqlite3 - session: - PROVIDER: memory cache: ADAPTER: memory + database: + DB_TYPE: sqlite3 queue: TYPE: level - valkey-cluster: - enabled: false - valkey: - enabled: false + server: + ROOT_URL: # to be filled by cluster-forge app + session: + PROVIDER: memory + persistence: + enabled: true postgresql: 
enabled: false postgresql-ha: enabled: false - persistence: - enabled: true - test: - enabled: false service: http: sessionAffinity: "None" ssh: sessionAffinity: "None" - helmParameters: - - name: clusterDomain - value: "{{ .Values.global.domain }}" - - name: gitea.config.server.ROOT_URL - value: "https://gitea.{{ .Values.global.domain }}" - syncWave: -30 + strategy: + type: "Recreate" + test: + enabled: false + valkey: + enabled: false + valkey-cluster: + enabled: false gitea-config: - path: gitea-config - namespace: cf-gitea - valuesFile: values.yaml helmParameters: - - name: keycloak.url - value: "https://kc.{{ .Values.global.domain }}" - name: keycloak.realm value: "airm" + - name: keycloak.url + value: "https://kc.{{ .Values.global.domain }}" + namespace: cf-gitea + path: gitea-config syncWave: -20 - # Network apps - gateway-api: - path: gateway-api/v1.3.0 - namespace: default - syncWave: -50 - metallb: - path: metallb/v0.15.2 - namespace: default - syncWave: 10 - kgateway-crds: - path: kgateway-crds/v2.1.0-main - namespace: kgateway-system valuesFile: values.yaml - syncWave: -30 - kgateway: - path: kgateway/v2.1.0-main - namespace: kgateway-system - valuesObject: - controller: - image: - registry: "ghcr.io" - repository: silogen/kgateway-v2.1.0-main-websocket - tag: "0.0.1" - syncWave: -20 - kgateway-config: - path: kgateway-config - namespace: kgateway-system + kaiwo: + namespace: kaiwo-system + path: kaiwo/v0.2.0-rc11 + syncWave: -10 valuesFile: values.yaml - helmParameters: - - name: domain - value: "{{ .Values.global.domain }}" + kaiwo-config: + ignoreDifferences: + - group: external-secrets.io + jqPathExpressions: + - ".spec.data[].remoteRef.conversionStrategy" + - ".spec.data[].remoteRef.decodingStrategy" + - ".spec.data[].remoteRef.metadataPolicy" + kind: ExternalSecret + - group: "" + jsonPointers: + - /spec/accessModes + kind: "PersistentVolumeClaim" + namespace: kaiwo-system + path: kaiwo-config + syncWave: 0 + kaiwo-crds: + namespace: kaiwo-system 
+ path: kaiwo-crds/v0.2.0-rc11 syncWave: -20 - # Monitoring - prometheus-crds: - path: prometheus-operator-crds/23.0.0 - namespace: prometheus-system + keda: + namespace: keda + path: keda/2.18.1 + syncWave: -10 valuesFile: values.yaml - syncWave: -50 - opentelemetry-operator: - path: opentelemetry-operator/0.93.1 - namespace: opentelemetry-operator-system - valuesObject: - # Cluster-forge specific values for opentelemetry-operator - # Sets the collector image to use contrib version (required for kaiwo/kedify-otel) - manager: - collectorImage: - repository: ghcr.io/open-telemetry/opentelemetry-collector-releases/opentelemetry-collector-contrib - tag: "0.140.0" - syncWave: -30 - otel-lgtm-stack: - path: otel-lgtm-stack/v1.0.7 - namespace: otel-lgtm-stack + kedify-otel: + ignoreDifferences: + - group: "" + jqPathExpressions: + - ".status" + kind: "Service" + name: "keda-otel-scaler" + - group: "apps" + jqPathExpressions: + - ".status.availableReplicas" + - ".status.readyReplicas" + kind: "Deployment" + namespace: keda + path: kedify-otel/v0.0.6 + syncWave: -5 valuesObject: - # Cluster-forge specific configuration for OpenTelemetry LGTM Stack - # This file overrides values.yaml for cluster-forge deployments - # Cluster identification - will be populated by root/values.yaml helmParameters - cluster: - name: # to be filled by cluster-forge app based on domain - # Component enablement (cluster-forge defaults) - dashboards: - enabled: true - nodeExporter: - enabled: true - kubeStateMetrics: - enabled: true - # Storage configuration optimized for cluster-forge - lgtm: - storage: - # Tempo storage for traces - tempo: 50Gi - # Loki storage for logs - loki: 50Gi - # Grafana storage for dashboards/config - grafana: 10Gi - # Mimir/Prometheus storage for metrics - mimir: 50Gi - # Loki additional storage - extra: 50Gi - # LGTM stack main deployment resources - resources: - limits: - memory: 8Gi - requests: - memory: 2Gi - cpu: "1" - # Resource configuration optimized for 
cluster-forge - collectors: - resources: - # Metrics collector (deployment mode) - metrics: - limits: - memory: 8Gi - cpu: "2" - requests: - memory: 1Gi - cpu: 500m - # Logs collector (daemonset mode) - logs: - limits: - memory: 2Gi - cpu: "1" - requests: - memory: 400Mi - cpu: 200m - # Service configuration - services: - # Main LGTM stack service ports - lgtm: - grafana: 3000 - otelGrpc: 4317 - otelHttp: 4318 - prometheus: 9090 - loki: 3100 - # Kube state metrics service port - kubeStateMetrics: - http: 8080 - # Node exporter service port - nodeExporter: - metrics: 9100 + validatingAdmissionPolicy: + enabled: false + keycloak: helmParameters: - - name: cluster.name + - name: domain value: "{{ .Values.global.domain }}" - syncWave: -20 - # Databases - cnpg-operator: - path: cnpg-operator/0.26.0 - namespace: cnpg-system - valuesFile: values.yaml - syncWave: -30 - # Access control - cluster-auth: - path: cluster-auth/0.5.0 - namespace: cluster-auth - valuesFile: values.yaml - syncWave: -20 - cluster-auth-config: - path: cluster-auth-config - namespace: cluster-auth - syncWave: 5 ignoreDifferences: - group: external-secrets.io - kind: ExternalSecret jqPathExpressions: - ".spec.data[].remoteRef.conversionStrategy" - ".spec.data[].remoteRef.decodingStrategy" - ".spec.data[].remoteRef.metadataPolicy" - keycloak: - path: keycloak-old + kind: ExternalSecret namespace: keycloak + path: keycloak-old + syncWave: -10 valuesObject: - replicaCount: 1 - resources: - limits: - cpu: "500m" - memory: "2Gi" - requests: - cpu: "250m" - memory: "512Mi" - podLabels: - app: keycloak auth: adminUser: admin existingSecret: "keycloak-credentials" passwordSecretKey: "KEYCLOAK_INITIAL_ADMIN_PASSWORD" extraStartupArgs: "--cache=ispn --features=scripts,admin-fine-grained-authz,token-exchange --import-realm" + extraVolumeMounts: + - mountPath: /opt/keycloak/providers + name: keycloak-package-volume + - mountPath: /opt/keycloak/data/import + name: keycloak-realm-volume + extraVolumes: + - 
configMap: + items: + - key: keycloak-scripts.json + path: META-INF/keycloak-scripts.json + - key: domain-group-authenticator.js + path: domain-group-authenticator.js + name: keycloak-scripts + name: keycloak-script-volume + - emptyDir: {} + name: keycloak-package-volume + - configMap: + name: keycloak-realm-templates-7kgh2hc6b2 + name: keycloak-airm-realm-template-volume + - emptyDir: {} + name: keycloak-realm-volume + - configMap: + name: keycloak-realm-templates-k8s + name: keycloak-k8s-realm-template-volume initContainers: - command: - /bin/sh @@ -521,136 +455,64 @@ apps: name: keycloak-k8s-realm-template-volume - mountPath: /opt/realms name: keycloak-realm-volume - extraVolumes: - - configMap: - name: keycloak-scripts - items: - - key: keycloak-scripts.json - path: META-INF/keycloak-scripts.json - - key: domain-group-authenticator.js - path: domain-group-authenticator.js - name: keycloak-script-volume - - emptyDir: {} - name: keycloak-package-volume - - configMap: - name: keycloak-realm-templates-7kgh2hc6b2 - name: keycloak-airm-realm-template-volume - - emptyDir: {} - name: keycloak-realm-volume - - configMap: - name: keycloak-realm-templates-k8s - name: keycloak-k8s-realm-template-volume - extraVolumeMounts: - - mountPath: /opt/keycloak/providers - name: keycloak-package-volume - - mountPath: /opt/keycloak/data/import - name: keycloak-realm-volume - helmParameters: - - name: domain - value: "{{ .Values.global.domain }}" - syncWave: -10 - ignoreDifferences: - - group: external-secrets.io - kind: ExternalSecret - jqPathExpressions: - - ".spec.data[].remoteRef.conversionStrategy" - - ".spec.data[].remoteRef.decodingStrategy" - - ".spec.data[].remoteRef.metadataPolicy" - kyverno: - path: kyverno/3.5.1 - namespace: kyverno - valuesFile: values.yaml - syncWave: -30 - kyverno-config: - path: kyverno-config - namespace: kyverno - syncWave: -20 - ignoreDifferences: - - group: "kyverno.io" - kind: "ClusterPolicy" - name: "local-path-access-mode-mutation" - 
jsonPointers: - - /spec/rules/0/skipBackgroundRequests - - group: "kyverno.io" - kind: "ClusterPolicy" - name: "local-path-access-mode-warning" - jsonPointers: - - /spec/rules/0/skipBackgroundRequests - - /spec/rules/0/validate/allowExistingViolations - kyverno-policies-base: - namespace: kyverno - path: kyverno-policies/base + podLabels: + app: keycloak + replicaCount: 1 + resources: + limits: + cpu: "500m" + memory: "2Gi" + requests: + cpu: "250m" + memory: "512Mi" + kgateway: + namespace: kgateway-system + path: kgateway/v2.1.0-main syncWave: -20 - # GPU - amd-gpu-operator: - path: amd-gpu-operator/v1.4.1 - namespace: kube-amd-gpu - valuesObject: - crds: - defaultCR: - install: false - syncWave: -10 - amd-gpu-operator-config: - path: amd-gpu-operator-config - namespace: kube-amd-gpu - syncWave: 0 - kuberay-operator: - path: kuberay-operator/1.4.2 - namespace: default - valuesFile: values.yaml - syncWave: -10 - # Autoscaling - keda: - path: keda/2.18.1 - namespace: keda - valuesFile: values.yaml - syncWave: -10 - kedify-otel: - path: kedify-otel/v0.0.6 - namespace: keda valuesObject: - # Cluster-forge specific values for kedify-otel - validatingAdmissionPolicy: - enabled: false - syncWave: -5 - ignoreDifferences: - - group: "" - kind: "Service" - name: "keda-otel-scaler" - jqPathExpressions: - - ".status" - - group: "apps" - kind: "Deployment" - jqPathExpressions: - - ".status.readyReplicas" - - ".status.availableReplicas" - # ML/AI - kserve-crds: - path: kserve-crds/v0.16.0 - namespace: kserve-system + controller: + image: + registry: "ghcr.io" + repository: silogen/kgateway-v2.1.0-main-websocket + tag: "0.0.1" + kgateway-config: + helmParameters: + - name: domain + value: "{{ .Values.global.domain }}" + namespace: kgateway-system + path: kgateway-config + syncWave: -20 valuesFile: values.yaml + kgateway-crds: + namespace: kgateway-system + path: kgateway-crds/v2.1.0-main syncWave: -30 + valuesFile: values.yaml kserve: - path: kserve/v0.16.0 namespace: 
kserve-system + path: kserve/v0.16.0 + syncWave: 0 valuesObject: kserve: controller: deploymentMode: "Standard" - syncWave: 0 - # Queues - rabbitmq: - path: rabbitmq/v2.15.0 - namespace: rabbitmq-system + kserve-crds: + namespace: kserve-system + path: kserve-crds/v0.16.0 + syncWave: -30 + valuesFile: values.yaml + kuberay-operator: + namespace: default + path: kuberay-operator/1.4.2 syncWave: -10 + valuesFile: values.yaml kueue: - path: kueue/0.13.0 namespace: kueue-system + path: kueue/0.13.0 + syncWave: -10 valuesObject: controllerManager: replicas: 1 - mutatingWebhook: - reinvocationPolicy: IfNeeded managerConfig: controllerManagerConfigYaml: |- apiVersion: config.kueue.x-k8s.io/v1beta1 @@ -696,113 +558,216 @@ apps: - "pod" - "deployment" - "statefulset" - syncWave: -10 + mutatingWebhook: + reinvocationPolicy: IfNeeded kueue-config: - path: kueue-config namespace: kueue-system + path: kueue-config syncWave: -10 - appwrapper: - path: appwrapper/v1.1.2 - namespace: appwrapper-system - syncWave: -10 - # Storage + kyverno: + namespace: kyverno + path: kyverno/3.5.1 + syncWave: -30 + valuesFile: values.yaml + kyverno-config: + ignoreDifferences: + - group: "kyverno.io" + jsonPointers: + - /spec/rules/0/skipBackgroundRequests + kind: "ClusterPolicy" + name: "local-path-access-mode-mutation" + - group: "kyverno.io" + jsonPointers: + - /spec/rules/0/skipBackgroundRequests + - /spec/rules/0/validate/allowExistingViolations + kind: "ClusterPolicy" + name: "local-path-access-mode-warning" + namespace: kyverno + path: kyverno-config + syncWave: -20 + kyverno-policies-base: + namespace: kyverno + path: kyverno-policies/base + syncWave: -20 + metallb: + namespace: default + path: metallb/v0.15.2 + syncWave: 10 minio-operator: - path: minio-operator/7.1.1 namespace: minio-operator - valuesFile: values.yaml + path: minio-operator/7.1.1 syncWave: -10 + valuesFile: values.yaml minio-tenant: - path: minio-tenant/7.1.1 namespace: minio-tenant-default + path: minio-tenant/7.1.1 + 
syncWave: 0 valuesObject: tenant: - name: default-minio-tenant - configSecret: - name: default-minio-tenant-env-configuration - existingSecret: true - pools: - - servers: 1 - name: pool-0 - volumesPerServer: 1 - size: 250Gi # Reduced from 500Gi for workstation - storageClassName: direct buckets: - name: default-bucket objectLock: true - name: models objectLock: true - users: - - name: default-user certificate: - requestAutoCert: false externalCaCertSecret: - name: cluster-tls type: kubernetes.io/secret/v1 + requestAutoCert: false + configSecret: + existingSecret: true + name: default-minio-tenant-env-configuration env: - name: MINIO_PROMETHEUS_AUTH_TYPE value: "public" - syncWave: 0 + name: default-minio-tenant + pools: + - name: pool-0 + servers: 1 + size: 250Gi # Reduced from 500Gi for workstation + storageClassName: direct + volumesPerServer: 1 + users: + - name: default-user minio-tenant-config: - path: minio-tenant-config - namespace: minio-tenant-default - valuesFile: values.yaml helmParameters: - name: domain value: "{{ .Values.global.domain }}" - syncWave: 0 ignoreDifferences: - group: external-secrets.io - kind: ExternalSecret jqPathExpressions: - ".spec.data[].remoteRef.conversionStrategy" - ".spec.data[].remoteRef.decodingStrategy" - ".spec.data[].remoteRef.metadataPolicy" - # Kaiwo (Kubernetes AI Workload Orchestrator) - aim-cluster-model-source: - path: aim-cluster-model-source - namespace: kaiwo-system - syncWave: -20 - kaiwo-crds: - path: kaiwo-crds/v0.2.0-rc11 - namespace: kaiwo-system - syncWave: -20 - kaiwo: - path: kaiwo/v0.2.0-rc11 - namespace: kaiwo-system - valuesFile: values.yaml - syncWave: -10 - kaiwo-config: - path: kaiwo-config - namespace: kaiwo-system + kind: ExternalSecret + namespace: minio-tenant-default + path: minio-tenant-config syncWave: 0 + valuesFile: values.yaml + openbao: ignoreDifferences: - - group: external-secrets.io - kind: ExternalSecret - jqPathExpressions: - - ".spec.data[].remoteRef.conversionStrategy" - - 
".spec.data[].remoteRef.decodingStrategy" - - ".spec.data[].remoteRef.metadataPolicy" - - group: "" - kind: "PersistentVolumeClaim" + - group: "apps" jsonPointers: - - /spec/accessModes - # AMD Resource Manager (AIRM) - airm: - path: airm/0.3.5 - namespace: airm + - /spec/replicas + kind: "Deployment" + - group: "apps" + jsonPointers: + - /spec/volumeClaimTemplates + kind: "StatefulSet" + name: "openbao" + namespace: cf-openbao + path: openbao/0.18.2 + syncWave: -70 + valuesObject: + injector: + enabled: false + server: + affinity: | + podAntiAffinity: + preferredDuringSchedulingIgnoredDuringExecution: + - weight: 100 + podAffinityTerm: + labelSelector: + matchLabels: + app.kubernetes.io/name: openbao + app.kubernetes.io/instance: openbao + component: server + topologyKey: kubernetes.io/hostname + ha: + enabled: false + raft: + enabled: false + replicas: 1 + ui: + enabled: true + openbao-config: + helmParameters: + - name: domain + value: "{{ .Values.global.domain }}" + namespace: cf-openbao + path: openbao-config/0.1.0 + syncWave: -60 valuesFile: values.yaml + openbao-init: helmParameters: - - name: airm-api.airm.appDomain + - name: domain value: "{{ .Values.global.domain }}" - syncWave: 0 - ignoreDifferences: - - group: external-secrets.io - kind: ExternalSecret - jqPathExpressions: - - ".spec.data[].remoteRef.conversionStrategy" - - ".spec.data[].remoteRef.decodingStrategy" - - ".spec.data[].remoteRef.metadataPolicy" - - group: kyverno.io - kind: ClusterPolicy - jqPathExpressions: - - ".spec.rules" + namespace: cf-openbao + path: ../scripts/init-openbao-job + syncWave: -50 + valuesObject: + domain: # to be filled by cluster-forge app + opentelemetry-operator: + namespace: opentelemetry-operator-system + path: opentelemetry-operator/0.93.1 + syncWave: -30 + valuesObject: + manager: + collectorImage: + repository: ghcr.io/open-telemetry/opentelemetry-collector-releases/opentelemetry-collector-contrib + tag: "0.140.0" + otel-lgtm-stack: + helmParameters: + - name: 
cluster.name + value: "{{ .Values.global.domain }}" + namespace: otel-lgtm-stack + path: otel-lgtm-stack/v1.0.7 + syncWave: -20 + valuesObject: + cluster: + name: # to be filled by cluster-forge app based on domain + collectors: + resources: + logs: + limits: + cpu: "1" + memory: 2Gi + requests: + cpu: 200m + memory: 400Mi + metrics: + limits: + cpu: "2" + memory: 8Gi + requests: + cpu: 500m + memory: 1Gi + dashboards: + enabled: true + kubeStateMetrics: + enabled: true + lgtm: + resources: + limits: + memory: 8Gi + requests: + cpu: "1" + memory: 2Gi + storage: + extra: 50Gi + grafana: 10Gi + loki: 50Gi + mimir: 50Gi + tempo: 50Gi + nodeExporter: + enabled: true + services: + kubeStateMetrics: + http: 8080 + lgtm: + grafana: 3000 + loki: 3100 + otelGrpc: 4317 + otelHttp: 4318 + prometheus: 9090 + nodeExporter: + metrics: 9100 + prometheus-crds: + namespace: prometheus-system + path: prometheus-operator-crds/23.0.0 + syncWave: -50 + valuesFile: values.yaml + rabbitmq: + namespace: rabbitmq-system + path: rabbitmq/v2.15.0 + syncWave: -10 diff --git a/root/values_large.yaml b/root/values_large.yaml index cdaf0544..d9d59843 100644 --- a/root/values_large.yaml +++ b/root/values_large.yaml @@ -1,3 +1,21 @@ +apps: + minio-tenant: + valuesObject: + tenant: + pools: + - name: pool-0 + servers: 1 + size: 500Gi + storageClassName: direct + volumesPerServer: 1 + openbao: + valuesObject: + server: + ha: + enabled: true + raft: + enabled: true + replicas: 3 enabledApps: - aim-cluster-model-source - airm @@ -37,28 +55,9 @@ enabledApps: - minio-tenant - minio-tenant-config - openbao - - openbao-init - openbao-config + - openbao-init - opentelemetry-operator - otel-lgtm-stack - prometheus-crds - - rabbitmq - -apps: - minio-tenant: - valuesObject: - tenant: - pools: - - name: pool-0 - servers: 1 - size: 500Gi - storageClassName: direct - volumesPerServer: 1 - openbao: - valuesObject: - server: - ha: - enabled: true - replicas: 3 - raft: - enabled: true + - rabbitmq \ No newline at 
end of file diff --git a/root/values_large_original.yaml b/root/values_large_original.yaml new file mode 100644 index 00000000..cdaf0544 --- /dev/null +++ b/root/values_large_original.yaml @@ -0,0 +1,64 @@ +enabledApps: + - aim-cluster-model-source + - airm + - amd-gpu-operator + - amd-gpu-operator-config + - appwrapper + - argocd + - argocd-config + - cert-manager + - cluster-auth + - cluster-auth-config + - cnpg-operator + - external-secrets + - external-secrets-config + - gateway-api + - gitea + - gitea-config + - kaiwo + - kaiwo-config + - kaiwo-crds + - keda + - kedify-otel + - keycloak + - kgateway + - kgateway-config + - kgateway-crds + - kserve + - kserve-crds + - kueue + - kueue-config + - kuberay-operator + - kyverno + - kyverno-config + - kyverno-policies-base + - metallb + - minio-operator + - minio-tenant + - minio-tenant-config + - openbao + - openbao-init + - openbao-config + - opentelemetry-operator + - otel-lgtm-stack + - prometheus-crds + - rabbitmq + +apps: + minio-tenant: + valuesObject: + tenant: + pools: + - name: pool-0 + servers: 1 + size: 500Gi + storageClassName: direct + volumesPerServer: 1 + openbao: + valuesObject: + server: + ha: + enabled: true + replicas: 3 + raft: + enabled: true diff --git a/root/values_medium.yaml b/root/values_medium.yaml index a71a5b2f..b0fabeca 100644 --- a/root/values_medium.yaml +++ b/root/values_medium.yaml @@ -3,66 +3,7 @@ # Medium & Small clusters add local-path storage policy for RWX→RWO conversion # Medium & Small clusters add local-path storage policy for RWX→RWO conversion -enabledApps: - - aim-cluster-model-source - - airm - - amd-gpu-operator - - amd-gpu-operator-config - - appwrapper - - argocd - - argocd-config - - cert-manager - - cluster-auth - - cluster-auth-config - - cnpg-operator - - external-secrets - - external-secrets-config - - gateway-api - - gitea - - gitea-config - - kaiwo - - kaiwo-config - - kaiwo-crds - - keda - - kedify-otel - - keycloak - - kgateway - - kgateway-config - - 
kgateway-crds - - kserve - - kserve-crds - - kueue - - kueue-config - - kuberay-operator - - kyverno - - kyverno-config - - kyverno-policies-base - - kyverno-policies-storage-local-path - - metallb - - minio-operator - - minio-tenant - - minio-tenant-config - - openbao - - openbao-config - - openbao-init - - opentelemetry-operator - - otel-lgtm-stack - - prometheus-crds - - rabbitmq - - apps: - # Modular Kyverno policy applications (only the storage-local-path addition) - kyverno-policies-storage-local-path: - namespace: kyverno - path: kyverno-policies/storage-local-path - syncWave: -20 - ignoreDifferences: - - group: kyverno.io - kind: ClusterPolicy - jsonPointers: - - /spec/rules/0/skipBackgroundRequests - - /spec/rules/0/validate/allowExistingViolations argocd: valuesObject: applicationSet: @@ -104,7 +45,37 @@ apps: requests: cpu: "125m" memory: "256Mi" - + grafana: + valuesObject: + persistence: + accessModes: + - ReadWriteOnce + enabled: true + size: 5Gi + storageClassName: direct + replicas: 1 + resources: + limits: + cpu: "1000m" + memory: "2Gi" + requests: + cpu: "250m" + memory: "512Mi" + keycloak: + valuesObject: + # Increase memory resources for Keycloak to prevent OOMKilled during initialization + # Medium preset provides 1536Mi memory limit vs small preset's 768Mi + resourcesPreset: "medium" + kyverno-policies-storage-local-path: + ignoreDifferences: + - group: kyverno.io + jsonPointers: + - /spec/rules/0/skipBackgroundRequests + - /spec/rules/0/validate/allowExistingViolations + kind: ClusterPolicy + namespace: kyverno + path: kyverno-policies/storage-local-path + syncWave: -20 minio-tenant: valuesObject: tenant: @@ -128,18 +99,17 @@ apps: requests: cpu: "1000m" memory: "2Gi" - openbao: valuesObject: server: + dataStorage: + size: 5Gi + storageClass: direct ha: enabled: false - replicas: 1 raft: enabled: false - dataStorage: - size: 5Gi - storageClass: direct + replicas: 1 resources: limits: cpu: "1000m" @@ -147,13 +117,25 @@ apps: requests: cpu: 
"250m" memory: "512Mi" - + opentelemetry-operator: + valuesObject: + manager: + resources: + requests: + cpu: "250m" + memory: "512Mi" + otel-lgtm-stack: + valuesObject: + collectors: + resources: + metrics: + cpu: '1' + limits: + memory: 4Gi prometheus: valuesObject: prometheus: prometheusSpec: - retention: 15d - retentionSize: 20GB resources: limits: cpu: "2000m" @@ -161,52 +143,60 @@ apps: requests: cpu: "500m" memory: "1Gi" + retention: 15d + retentionSize: 20GB storageSpec: volumeClaimTemplate: spec: accessModes: - ReadWriteOnce - storageClassName: direct resources: requests: storage: 25Gi - - grafana: - valuesObject: - replicas: 1 - resources: - limits: - cpu: "1000m" - memory: "2Gi" - requests: - cpu: "250m" - memory: "512Mi" - persistence: - enabled: true - size: 5Gi - storageClassName: direct - accessModes: - - ReadWriteOnce - - keycloak: - valuesObject: - # Increase memory resources for Keycloak to prevent OOMKilled during initialization - # Medium preset provides 1536Mi memory limit vs small preset's 768Mi - resourcesPreset: "medium" - - otel-lgtm-stack: - valuesObject: - collectors: - resources: - metrics: - limits: - memory: 4Gi - cpu: '1' - - opentelemetry-operator: - valuesObject: - manager: - resources: - requests: - cpu: "250m" - memory: "512Mi" + storageClassName: direct +enabledApps: + - aim-cluster-model-source + - airm + - amd-gpu-operator + - amd-gpu-operator-config + - appwrapper + - argocd + - argocd-config + - cert-manager + - cluster-auth + - cluster-auth-config + - cnpg-operator + - external-secrets + - external-secrets-config + - gateway-api + - gitea + - gitea-config + - kaiwo + - kaiwo-config + - kaiwo-crds + - keda + - kedify-otel + - keycloak + - kgateway + - kgateway-config + - kgateway-crds + - kserve + - kserve-crds + - kueue + - kueue-config + - kuberay-operator + - kyverno + - kyverno-config + - kyverno-policies-base + - kyverno-policies-storage-local-path + - metallb + - minio-operator + - minio-tenant + - minio-tenant-config + 
- openbao + - openbao-config + - openbao-init + - opentelemetry-operator + - otel-lgtm-stack + - prometheus-crds + - rabbitmq \ No newline at end of file diff --git a/root/values_medium_original.yaml b/root/values_medium_original.yaml new file mode 100644 index 00000000..a71a5b2f --- /dev/null +++ b/root/values_medium_original.yaml @@ -0,0 +1,212 @@ +# MEDIUM CLUSTER: All apps enabled (inherited from base values.yaml) +# Add Kyverno policy for local-path access mode mutation + +# Medium & Small clusters add local-path storage policy for RWX→RWO conversion +# Medium & Small clusters add local-path storage policy for RWX→RWO conversion +enabledApps: + - aim-cluster-model-source + - airm + - amd-gpu-operator + - amd-gpu-operator-config + - appwrapper + - argocd + - argocd-config + - cert-manager + - cluster-auth + - cluster-auth-config + - cnpg-operator + - external-secrets + - external-secrets-config + - gateway-api + - gitea + - gitea-config + - kaiwo + - kaiwo-config + - kaiwo-crds + - keda + - kedify-otel + - keycloak + - kgateway + - kgateway-config + - kgateway-crds + - kserve + - kserve-crds + - kueue + - kueue-config + - kuberay-operator + - kyverno + - kyverno-config + - kyverno-policies-base + - kyverno-policies-storage-local-path + - metallb + - minio-operator + - minio-tenant + - minio-tenant-config + - openbao + - openbao-config + - openbao-init + - opentelemetry-operator + - otel-lgtm-stack + - prometheus-crds + - rabbitmq + + +apps: + # Modular Kyverno policy applications (only the storage-local-path addition) + kyverno-policies-storage-local-path: + namespace: kyverno + path: kyverno-policies/storage-local-path + syncWave: -20 + ignoreDifferences: + - group: kyverno.io + kind: ClusterPolicy + jsonPointers: + - /spec/rules/0/skipBackgroundRequests + - /spec/rules/0/validate/allowExistingViolations + argocd: + valuesObject: + applicationSet: + replicas: 1 + controller: + replicas: 1 + resources: + limits: + cpu: "1000m" + memory: "2Gi" + requests: + cpu: 
"500m" + memory: "1Gi" + redis-ha: + enabled: false + redis: + resources: + limits: + cpu: "500m" + memory: "1Gi" + requests: + cpu: "250m" + memory: "512Mi" + repoServer: + replicas: 1 + resources: + limits: + cpu: "500m" + memory: "1Gi" + requests: + cpu: "250m" + memory: "512Mi" + server: + replicas: 1 + resources: + limits: + cpu: "500m" + memory: "1Gi" + requests: + cpu: "125m" + memory: "256Mi" + + minio-tenant: + valuesObject: + tenant: + buckets: + - name: default-bucket + objectLock: true + - name: models + objectLock: true + - name: datasets + objectLock: false + pools: + - name: pool-0 + servers: 1 + size: 2Ti + storageClassName: direct + volumesPerServer: 2 + resources: + limits: + cpu: "4000m" + memory: "8Gi" + requests: + cpu: "1000m" + memory: "2Gi" + + openbao: + valuesObject: + server: + ha: + enabled: false + replicas: 1 + raft: + enabled: false + dataStorage: + size: 5Gi + storageClass: direct + resources: + limits: + cpu: "1000m" + memory: "2Gi" + requests: + cpu: "250m" + memory: "512Mi" + + prometheus: + valuesObject: + prometheus: + prometheusSpec: + retention: 15d + retentionSize: 20GB + resources: + limits: + cpu: "2000m" + memory: "4Gi" + requests: + cpu: "500m" + memory: "1Gi" + storageSpec: + volumeClaimTemplate: + spec: + accessModes: + - ReadWriteOnce + storageClassName: direct + resources: + requests: + storage: 25Gi + + grafana: + valuesObject: + replicas: 1 + resources: + limits: + cpu: "1000m" + memory: "2Gi" + requests: + cpu: "250m" + memory: "512Mi" + persistence: + enabled: true + size: 5Gi + storageClassName: direct + accessModes: + - ReadWriteOnce + + keycloak: + valuesObject: + # Increase memory resources for Keycloak to prevent OOMKilled during initialization + # Medium preset provides 1536Mi memory limit vs small preset's 768Mi + resourcesPreset: "medium" + + otel-lgtm-stack: + valuesObject: + collectors: + resources: + metrics: + limits: + memory: 4Gi + cpu: '1' + + opentelemetry-operator: + valuesObject: + manager: + 
resources: + requests: + cpu: "250m" + memory: "512Mi" diff --git a/root/values_original.yaml b/root/values_original.yaml new file mode 100644 index 00000000..a438b5dc --- /dev/null +++ b/root/values_original.yaml @@ -0,0 +1,808 @@ +clusterForge: + repoUrl: "http://gitea-http.cf-gitea.svc:3000/cluster-org/cluster-forge.git" + targetRevision: main +# source helm values file from separate git repo +externalValues: + enabled: true + repoUrl: "http://gitea-http.cf-gitea.svc:3000/cluster-org/cluster-values.git" + targetRevision: main + path: values.yaml +global: + domain: # to be filled by bootstrap script + clusterSize: # to be filled by bootstrap script (small, medium, large) +# enabledApps list removed - each cluster size (small/medium/large) defines its own apps +# This eliminates the override/clobbering issue and makes cluster sizes more independent +apps: + # Core apps + argocd: + path: argocd/8.3.5 + namespace: argocd + valuesObject: + applicationSet: + replicas: 1 + configs: + cm: + create: true + resource.customizations.health.opentelemetry.io_OpenTelemetryCollector: | + hs = {} + hs.status = "Healthy" + hs.message = "Always Healthy - See https://github.com/argoproj/argo-cd/pull/24284" + return hs + resource.customizations.health.keda.sh_ScaledObject: | + hs = {} + if obj.status ~= nil then + if obj.status.conditions ~= nil then + for _, condition in ipairs(obj.status.conditions) do + if condition.type == "Ready" then + if condition.status == "True" then + hs.status = "Healthy" + hs.message = "ScaledObject is ready" + else + hs.status = "Degraded" + hs.message = condition.reason or "ScaledObject not ready" + end + return hs + resource.customizations.health.batch_Job: | + -- Custom health check for Jobs, especially openbao-init + hs = {} + if obj.status ~= nil then + if obj.status.conditions ~= nil then + for _, condition in ipairs(obj.status.conditions) do + if condition.type == "Complete" and condition.status == "True" then + hs.status = "Healthy" + hs.message 
= "Job completed successfully" + return hs + elseif condition.type == "Failed" and condition.status == "True" then + hs.status = "Degraded" + hs.message = "Job failed" + return hs + end + end + end + -- Check for active jobs + if obj.status.active and obj.status.active > 0 then + hs.status = "Progressing" + hs.message = "Job is running" + return hs + end + end + hs.status = "Progressing" + hs.message = "Job status unknown" + return hs + end + end + end + hs.status = "Progressing" + hs.message = "ScaledObject status unknown" + else + hs.status = "Progressing" + hs.message = "ScaledObject status unknown" + end + return hs + resource.customizations.health.apps_StatefulSet: | + -- Custom health check for OpenBao StatefulSet + -- Uses standard StatefulSet readiness for now - openbao-init job handles actual initialization + hs = {} + if obj.status ~= nil then + if obj.status.readyReplicas ~= nil and obj.status.replicas ~= nil then + if obj.status.readyReplicas == obj.status.replicas then + hs.status = "Healthy" + hs.message = "StatefulSet is ready" + else + hs.status = "Progressing" + hs.message = "Waiting for StatefulSet replicas to be ready" + end + else + hs.status = "Progressing" + hs.message = "Waiting for StatefulSet status" + end + else + hs.status = "Progressing" + hs.message = "Waiting for StatefulSet status" + end + return hs + params: + server.insecure: true + rbac: + create: true + policy.csv: | + g, argocd-users, role:admin + controller: + replicas: 1 + redis: + enabled: true + redis-ha: + enabled: false + repoServer: + replicas: 1 + autoscaling: + enabled: false + server: + replicas: 1 + autoscaling: + enabled: false + global: + domain: # to be filled by cluster-forge app + helmParameters: + - name: global.domain + value: "argocd.{{ .Values.global.domain }}" + - name: configs.cm.oidc\.config + value: | + name: Keycloak + issuer: https://kc.{{ .Values.global.domain }}/realms/airm + clientID: argocd + clientSecret: $$argocd-oidc-creds:client_secret + rootCA: 
$cluster-tls:cert + requestedScopes: ["openid", "profile", "email", "groups"] + syncWave: -30 + argocd-config: + path: argocd-config + namespace: argocd + syncWave: 5 + ignoreDifferences: + - group: external-secrets.io + kind: ExternalSecret + jqPathExpressions: + - ".spec.data[].remoteRef.conversionStrategy" + - ".spec.data[].remoteRef.decodingStrategy" + - ".spec.data[].remoteRef.metadataPolicy" + cert-manager: + namespace: cert-manager + path: cert-manager/v1.18.2 + syncWave: -40 + valuesObject: + installCRDs: true + openbao: + path: openbao/0.18.2 + namespace: cf-openbao + valuesObject: + injector: + enabled: false + server: + affinity: | + podAntiAffinity: + preferredDuringSchedulingIgnoredDuringExecution: + - weight: 100 + podAffinityTerm: + labelSelector: + matchLabels: + app.kubernetes.io/name: openbao + app.kubernetes.io/instance: openbao + component: server + topologyKey: kubernetes.io/hostname + ha: + enabled: false + raft: + enabled: false + replicas: 1 + ui: + enabled: true + syncWave: -70 + ignoreDifferences: + - group: "apps" + kind: "Deployment" + jsonPointers: + - /spec/replicas + - group: "apps" + kind: "StatefulSet" + name: "openbao" + jsonPointers: + - /spec/volumeClaimTemplates + openbao-init: + path: ../scripts/init-openbao-job + namespace: cf-openbao + valuesObject: + domain: # to be filled by cluster-forge app + helmParameters: + - name: domain + value: "{{ .Values.global.domain }}" + syncWave: -50 + openbao-config: + path: openbao-config/0.1.0 + namespace: cf-openbao + valuesFile: values.yaml + helmParameters: + - name: domain + value: "{{ .Values.global.domain }}" + syncWave: -60 + external-secrets: + path: external-secrets/0.15.1 + namespace: external-secrets + valuesFile: values.yaml + syncWave: -40 + external-secrets-config: + path: external-secrets-config + namespace: external-secrets + syncWave: -10 + gitea: + path: gitea/12.3.0 + namespace: cf-gitea + valuesObject: + clusterDomain: # to be filled by cluster-forge app + strategy: + 
type: "Recreate" + gitea: + admin: + existingSecret: gitea-admin-credentials + config: + server: + ROOT_URL: # to be filled by cluster-forge app + database: + DB_TYPE: sqlite3 + session: + PROVIDER: memory + cache: + ADAPTER: memory + queue: + TYPE: level + valkey-cluster: + enabled: false + valkey: + enabled: false + postgresql: + enabled: false + postgresql-ha: + enabled: false + persistence: + enabled: true + test: + enabled: false + service: + http: + sessionAffinity: "None" + ssh: + sessionAffinity: "None" + helmParameters: + - name: clusterDomain + value: "{{ .Values.global.domain }}" + - name: gitea.config.server.ROOT_URL + value: "https://gitea.{{ .Values.global.domain }}" + syncWave: -30 + gitea-config: + path: gitea-config + namespace: cf-gitea + valuesFile: values.yaml + helmParameters: + - name: keycloak.url + value: "https://kc.{{ .Values.global.domain }}" + - name: keycloak.realm + value: "airm" + syncWave: -20 + # Network apps + gateway-api: + path: gateway-api/v1.3.0 + namespace: default + syncWave: -50 + metallb: + path: metallb/v0.15.2 + namespace: default + syncWave: 10 + kgateway-crds: + path: kgateway-crds/v2.1.0-main + namespace: kgateway-system + valuesFile: values.yaml + syncWave: -30 + kgateway: + path: kgateway/v2.1.0-main + namespace: kgateway-system + valuesObject: + controller: + image: + registry: "ghcr.io" + repository: silogen/kgateway-v2.1.0-main-websocket + tag: "0.0.1" + syncWave: -20 + kgateway-config: + path: kgateway-config + namespace: kgateway-system + valuesFile: values.yaml + helmParameters: + - name: domain + value: "{{ .Values.global.domain }}" + syncWave: -20 + # Monitoring + prometheus-crds: + path: prometheus-operator-crds/23.0.0 + namespace: prometheus-system + valuesFile: values.yaml + syncWave: -50 + opentelemetry-operator: + path: opentelemetry-operator/0.93.1 + namespace: opentelemetry-operator-system + valuesObject: + # Cluster-forge specific values for opentelemetry-operator + # Sets the collector image to use 
contrib version (required for kaiwo/kedify-otel) + manager: + collectorImage: + repository: ghcr.io/open-telemetry/opentelemetry-collector-releases/opentelemetry-collector-contrib + tag: "0.140.0" + syncWave: -30 + otel-lgtm-stack: + path: otel-lgtm-stack/v1.0.7 + namespace: otel-lgtm-stack + valuesObject: + # Cluster-forge specific configuration for OpenTelemetry LGTM Stack + # This file overrides values.yaml for cluster-forge deployments + # Cluster identification - will be populated by root/values.yaml helmParameters + cluster: + name: # to be filled by cluster-forge app based on domain + # Component enablement (cluster-forge defaults) + dashboards: + enabled: true + nodeExporter: + enabled: true + kubeStateMetrics: + enabled: true + # Storage configuration optimized for cluster-forge + lgtm: + storage: + # Tempo storage for traces + tempo: 50Gi + # Loki storage for logs + loki: 50Gi + # Grafana storage for dashboards/config + grafana: 10Gi + # Mimir/Prometheus storage for metrics + mimir: 50Gi + # Loki additional storage + extra: 50Gi + # LGTM stack main deployment resources + resources: + limits: + memory: 8Gi + requests: + memory: 2Gi + cpu: "1" + # Resource configuration optimized for cluster-forge + collectors: + resources: + # Metrics collector (deployment mode) + metrics: + limits: + memory: 8Gi + cpu: "2" + requests: + memory: 1Gi + cpu: 500m + # Logs collector (daemonset mode) + logs: + limits: + memory: 2Gi + cpu: "1" + requests: + memory: 400Mi + cpu: 200m + # Service configuration + services: + # Main LGTM stack service ports + lgtm: + grafana: 3000 + otelGrpc: 4317 + otelHttp: 4318 + prometheus: 9090 + loki: 3100 + # Kube state metrics service port + kubeStateMetrics: + http: 8080 + # Node exporter service port + nodeExporter: + metrics: 9100 + helmParameters: + - name: cluster.name + value: "{{ .Values.global.domain }}" + syncWave: -20 + # Databases + cnpg-operator: + path: cnpg-operator/0.26.0 + namespace: cnpg-system + valuesFile: values.yaml + 
syncWave: -30 + # Access control + cluster-auth: + path: cluster-auth/0.5.0 + namespace: cluster-auth + valuesFile: values.yaml + syncWave: -20 + cluster-auth-config: + path: cluster-auth-config + namespace: cluster-auth + syncWave: 5 + ignoreDifferences: + - group: external-secrets.io + kind: ExternalSecret + jqPathExpressions: + - ".spec.data[].remoteRef.conversionStrategy" + - ".spec.data[].remoteRef.decodingStrategy" + - ".spec.data[].remoteRef.metadataPolicy" + keycloak: + path: keycloak-old + namespace: keycloak + valuesObject: + replicaCount: 1 + resources: + limits: + cpu: "500m" + memory: "2Gi" + requests: + cpu: "250m" + memory: "512Mi" + podLabels: + app: keycloak + auth: + adminUser: admin + existingSecret: "keycloak-credentials" + passwordSecretKey: "KEYCLOAK_INITIAL_ADMIN_PASSWORD" + extraStartupArgs: "--cache=ispn --features=scripts,admin-fine-grained-authz,token-exchange --import-realm" + initContainers: + - command: + - /bin/sh + - -c + - | + cd /opt/scripts + zip -r /opt/keycloak/providers/SilogenExtensionPackage.jar . 
+ image: ghcr.io/silogen/keycloak-init:0.1 + name: init-auth-extensions + volumeMounts: + - mountPath: /opt/keycloak/providers + name: keycloak-package-volume + - mountPath: /opt/scripts + name: keycloak-script-volume + - command: + - /bin/sh + - -c + - | + if [ -f "/opt/realm_templates/airm/airm-realm.json" ]; then + cp /opt/realm_templates/airm/airm-realm.json /opt/realms/airm-realm.json + sed -i -e "s/__AIRM_FRONTEND_CLIENT_SECRET__/$AIRM_FRONTEND_CLIENT_SECRET/g" /opt/realms/airm-realm.json + sed -i -e "s/__AIRM_ADMIN_CLIENT_ID__/$AIRM_ADMIN_CLIENT_ID/g" /opt/realms/airm-realm.json + sed -i -e "s/__AIRM_ADMIN_CLIENT_SECRET__/$AIRM_ADMIN_CLIENT_SECRET/g" /opt/realms/airm-realm.json + sed -i -e "s/__AIRM_CI_CLIENT_SECRET__/$AIRM_CI_CLIENT_SECRET/g" /opt/realms/airm-realm.json + + sed -i -e "s/__K8S_CLIENT_SECRET__/$K8S_CLIENT_SECRET/g" /opt/realms/airm-realm.json + sed -i -e "s/__MINIO_CLIENT_SECRET__/$MINIO_CLIENT_SECRET/g" /opt/realms/airm-realm.json + sed -i -e "s/__GITEA_CLIENT_SECRET__/$GITEA_CLIENT_SECRET/g" /opt/realms/airm-realm.json + sed -i -e "s/__ARGOCD_CLIENT_SECRET__/$ARGOCD_CLIENT_SECRET/g" /opt/realms/airm-realm.json + sed -i -e "s/__DEVUSER_INITIAL_PASSWORD__/$DEVUSER_INITIAL_PASSWORD/g" /opt/realms/airm-realm.json + else + echo "Warning: /opt/realm_templates/airm/airm-realm.json not found, skipping airm realm setup" + fi + env: + - name: AIRM_FRONTEND_CLIENT_SECRET + valueFrom: + secretKeyRef: + key: FRONTEND_CLIENT_SECRET + name: airm-realm-credentials + - name: AIRM_ADMIN_CLIENT_ID + valueFrom: + secretKeyRef: + key: ADMIN_CLIENT_ID + name: airm-realm-credentials + - name: AIRM_ADMIN_CLIENT_SECRET + valueFrom: + secretKeyRef: + key: ADMIN_CLIENT_SECRET + name: airm-realm-credentials + - name: AIRM_CI_CLIENT_SECRET + valueFrom: + secretKeyRef: + key: CI_CLIENT_SECRET + name: airm-realm-credentials + - name: K8S_CLIENT_SECRET + valueFrom: + secretKeyRef: + key: K8S_CLIENT_SECRET + name: airm-realm-credentials + - name: MINIO_CLIENT_SECRET + 
valueFrom: + secretKeyRef: + key: MINIO_CLIENT_SECRET + name: airm-realm-credentials + - name: GITEA_CLIENT_SECRET + valueFrom: + secretKeyRef: + key: GITEA_CLIENT_SECRET + name: airm-realm-credentials + - name: ARGOCD_CLIENT_SECRET + valueFrom: + secretKeyRef: + key: ARGOCD_CLIENT_SECRET + name: airm-realm-credentials + - name: DEVUSER_INITIAL_PASSWORD + valueFrom: + secretKeyRef: + key: KEYCLOAK_INITIAL_DEVUSER_PASSWORD + name: airm-realm-credentials + image: ghcr.io/silogen/keycloak-init:0.1 + name: init-realm-scripts + volumeMounts: + - mountPath: /opt/realm_templates/airm + name: keycloak-airm-realm-template-volume + - mountPath: /opt/realm_templates/k8s + name: keycloak-k8s-realm-template-volume + - mountPath: /opt/realms + name: keycloak-realm-volume + extraVolumes: + - configMap: + name: keycloak-scripts + items: + - key: keycloak-scripts.json + path: META-INF/keycloak-scripts.json + - key: domain-group-authenticator.js + path: domain-group-authenticator.js + name: keycloak-script-volume + - emptyDir: {} + name: keycloak-package-volume + - configMap: + name: keycloak-realm-templates-7kgh2hc6b2 + name: keycloak-airm-realm-template-volume + - emptyDir: {} + name: keycloak-realm-volume + - configMap: + name: keycloak-realm-templates-k8s + name: keycloak-k8s-realm-template-volume + extraVolumeMounts: + - mountPath: /opt/keycloak/providers + name: keycloak-package-volume + - mountPath: /opt/keycloak/data/import + name: keycloak-realm-volume + helmParameters: + - name: domain + value: "{{ .Values.global.domain }}" + syncWave: -10 + ignoreDifferences: + - group: external-secrets.io + kind: ExternalSecret + jqPathExpressions: + - ".spec.data[].remoteRef.conversionStrategy" + - ".spec.data[].remoteRef.decodingStrategy" + - ".spec.data[].remoteRef.metadataPolicy" + kyverno: + path: kyverno/3.5.1 + namespace: kyverno + valuesFile: values.yaml + syncWave: -30 + kyverno-config: + path: kyverno-config + namespace: kyverno + syncWave: -20 + ignoreDifferences: + - group: 
"kyverno.io" + kind: "ClusterPolicy" + name: "local-path-access-mode-mutation" + jsonPointers: + - /spec/rules/0/skipBackgroundRequests + - group: "kyverno.io" + kind: "ClusterPolicy" + name: "local-path-access-mode-warning" + jsonPointers: + - /spec/rules/0/skipBackgroundRequests + - /spec/rules/0/validate/allowExistingViolations + kyverno-policies-base: + namespace: kyverno + path: kyverno-policies/base + syncWave: -20 + # GPU + amd-gpu-operator: + path: amd-gpu-operator/v1.4.1 + namespace: kube-amd-gpu + valuesObject: + crds: + defaultCR: + install: false + syncWave: -10 + amd-gpu-operator-config: + path: amd-gpu-operator-config + namespace: kube-amd-gpu + syncWave: 0 + kuberay-operator: + path: kuberay-operator/1.4.2 + namespace: default + valuesFile: values.yaml + syncWave: -10 + # Autoscaling + keda: + path: keda/2.18.1 + namespace: keda + valuesFile: values.yaml + syncWave: -10 + kedify-otel: + path: kedify-otel/v0.0.6 + namespace: keda + valuesObject: + # Cluster-forge specific values for kedify-otel + validatingAdmissionPolicy: + enabled: false + syncWave: -5 + ignoreDifferences: + - group: "" + kind: "Service" + name: "keda-otel-scaler" + jqPathExpressions: + - ".status" + - group: "apps" + kind: "Deployment" + jqPathExpressions: + - ".status.readyReplicas" + - ".status.availableReplicas" + # ML/AI + kserve-crds: + path: kserve-crds/v0.16.0 + namespace: kserve-system + valuesFile: values.yaml + syncWave: -30 + kserve: + path: kserve/v0.16.0 + namespace: kserve-system + valuesObject: + kserve: + controller: + deploymentMode: "Standard" + syncWave: 0 + # Queues + rabbitmq: + path: rabbitmq/v2.15.0 + namespace: rabbitmq-system + syncWave: -10 + kueue: + path: kueue/0.13.0 + namespace: kueue-system + valuesObject: + controllerManager: + replicas: 1 + mutatingWebhook: + reinvocationPolicy: IfNeeded + managerConfig: + controllerManagerConfigYaml: |- + apiVersion: config.kueue.x-k8s.io/v1beta1 + kind: Configuration + health: + healthProbeBindAddress: :8081 + 
metrics: + bindAddress: :8443 + # enableClusterQueueResources: true + webhook: + port: 9443 + leaderElection: + leaderElect: true + resourceName: c1f6bfd2.kueue.x-k8s.io + controller: + groupKindConcurrency: + Job.batch: 5 + Pod: 5 + Workload.kueue.x-k8s.io: 5 + LocalQueue.kueue.x-k8s.io: 1 + Cohort.kueue.x-k8s.io: 1 + ClusterQueue.kueue.x-k8s.io: 1 + ResourceFlavor.kueue.x-k8s.io: 1 + clientConnection: + qps: 50 + burst: 100 + managedJobsNamespaceSelector: + matchLabels: + kueue-managed: "true" + integrations: + frameworks: + - "batch/job" + - "kubeflow.org/mpijob" + - "ray.io/rayjob" + - "ray.io/raycluster" + - "jobset.x-k8s.io/jobset" + - "kubeflow.org/paddlejob" + - "kubeflow.org/pytorchjob" + - "kubeflow.org/tfjob" + - "kubeflow.org/xgboostjob" + - "kubeflow.org/jaxjob" + - "workload.codeflare.dev/appwrapper" + - "pod" + - "deployment" + - "statefulset" + syncWave: -10 + kueue-config: + path: kueue-config + namespace: kueue-system + syncWave: -10 + appwrapper: + path: appwrapper/v1.1.2 + namespace: appwrapper-system + syncWave: -10 + # Storage + minio-operator: + path: minio-operator/7.1.1 + namespace: minio-operator + valuesFile: values.yaml + syncWave: -10 + minio-tenant: + path: minio-tenant/7.1.1 + namespace: minio-tenant-default + valuesObject: + tenant: + name: default-minio-tenant + configSecret: + name: default-minio-tenant-env-configuration + existingSecret: true + pools: + - servers: 1 + name: pool-0 + volumesPerServer: 1 + size: 250Gi # Reduced from 500Gi for workstation + storageClassName: direct + buckets: + - name: default-bucket + objectLock: true + - name: models + objectLock: true + users: + - name: default-user + certificate: + requestAutoCert: false + externalCaCertSecret: + - name: cluster-tls + type: kubernetes.io/secret/v1 + env: + - name: MINIO_PROMETHEUS_AUTH_TYPE + value: "public" + syncWave: 0 + minio-tenant-config: + path: minio-tenant-config + namespace: minio-tenant-default + valuesFile: values.yaml + helmParameters: + - name: 
domain + value: "{{ .Values.global.domain }}" + syncWave: 0 + ignoreDifferences: + - group: external-secrets.io + kind: ExternalSecret + jqPathExpressions: + - ".spec.data[].remoteRef.conversionStrategy" + - ".spec.data[].remoteRef.decodingStrategy" + - ".spec.data[].remoteRef.metadataPolicy" + # Kaiwo (Kubernetes AI Workload Orchestrator) + aim-cluster-model-source: + path: aim-cluster-model-source + namespace: kaiwo-system + syncWave: -20 + kaiwo-crds: + path: kaiwo-crds/v0.2.0-rc11 + namespace: kaiwo-system + syncWave: -20 + kaiwo: + path: kaiwo/v0.2.0-rc11 + namespace: kaiwo-system + valuesFile: values.yaml + syncWave: -10 + kaiwo-config: + path: kaiwo-config + namespace: kaiwo-system + syncWave: 0 + ignoreDifferences: + - group: external-secrets.io + kind: ExternalSecret + jqPathExpressions: + - ".spec.data[].remoteRef.conversionStrategy" + - ".spec.data[].remoteRef.decodingStrategy" + - ".spec.data[].remoteRef.metadataPolicy" + - group: "" + kind: "PersistentVolumeClaim" + jsonPointers: + - /spec/accessModes + # AMD Resource Manager (AIRM) + airm: + path: airm/0.3.5 + namespace: airm + valuesFile: values.yaml + helmParameters: + - name: airm-api.airm.appDomain + value: "{{ .Values.global.domain }}" + syncWave: 0 + ignoreDifferences: + - group: external-secrets.io + kind: ExternalSecret + jqPathExpressions: + - ".spec.data[].remoteRef.conversionStrategy" + - ".spec.data[].remoteRef.decodingStrategy" + - ".spec.data[].remoteRef.metadataPolicy" + - group: kyverno.io + kind: ClusterPolicy + jqPathExpressions: + - ".spec.rules" diff --git a/root/values_small.yaml b/root/values_small.yaml index c3a7282c..d1ee57f4 100644 --- a/root/values_small.yaml +++ b/root/values_small.yaml @@ -2,67 +2,7 @@ # Add Kyverno policy for local-path access mode mutation # Medium & Small clusters add local-path storage policy for RWX→RWO conversion -enabledApps: - - aim-cluster-model-source - - airm - - amd-gpu-operator - - amd-gpu-operator-config - - appwrapper - - argocd - - 
argocd-config - - cert-manager - - cluster-auth - - cluster-auth-config - - cnpg-operator - - external-secrets - - external-secrets-config - - gateway-api - - gitea - - gitea-config - - kaiwo - - kaiwo-config - - kaiwo-crds - - keda - - kedify-otel - - keycloak - - kgateway - - kgateway-config - - kgateway-crds - - kserve - - kserve-crds - - kuberay-operator - - kueue - - kueue-config - - kyverno - - kyverno-config - - kyverno-policies-base # applicable to all cluster sizes - - kyverno-policies-storage-local-path # small & medium cluster sizes only - - metallb - - minio-operator - - minio-tenant - - minio-tenant-config - - openbao - - openbao-config - - openbao-init - - opentelemetry-operator - - otel-lgtm-stack - - prometheus-crds - - rabbitmq - apps: - # Modular Kyverno policy applications (only the storage-local-path addition) - kyverno-policies-storage-local-path: - namespace: kyverno - path: kyverno-policies/storage-local-path - source: clusterForge - syncOptions: - - CreateNamespace=true - ignoreDifferences: [] - wave: 26 # Deploy after base policies - syncWave: - - group: kyverno.io - kind: ClusterPolicy - argocd: valuesObject: applicationSet: @@ -104,7 +44,33 @@ apps: requests: cpu: "125m" memory: "256Mi" - + grafana: + valuesObject: + persistence: + accessModes: + - ReadWriteOnce + enabled: true + size: 5Gi + storageClassName: local-path + replicas: 1 + resources: + limits: + cpu: "1000m" + memory: "2Gi" + requests: + cpu: "250m" + memory: "512Mi" + kyverno-policies-storage-local-path: + ignoreDifferences: [] + namespace: kyverno + path: kyverno-policies/storage-local-path + source: clusterForge + syncOptions: + - CreateNamespace=true + syncWave: + - group: kyverno.io + kind: ClusterPolicy + wave: 26 # Deploy after base policies minio-tenant: valuesObject: tenant: @@ -128,18 +94,17 @@ apps: requests: cpu: "1000m" memory: "2Gi" - openbao: valuesObject: server: + dataStorage: + size: 5Gi + storageClass: local-path ha: enabled: false - replicas: 1 raft: 
enabled: false - dataStorage: - size: 5Gi - storageClass: local-path + replicas: 1 resources: limits: cpu: "1000m" @@ -147,13 +112,10 @@ apps: requests: cpu: "250m" memory: "512Mi" - prometheus: valuesObject: prometheus: prometheusSpec: - retention: 15d - retentionSize: 20GB resources: limits: cpu: "2000m" @@ -161,29 +123,60 @@ apps: requests: cpu: "500m" memory: "1Gi" + retention: 15d + retentionSize: 20GB storageSpec: volumeClaimTemplate: spec: accessModes: - ReadWriteOnce - storageClassName: local-path resources: requests: storage: 25Gi - - grafana: - valuesObject: - replicas: 1 - resources: - limits: - cpu: "1000m" - memory: "2Gi" - requests: - cpu: "250m" - memory: "512Mi" - persistence: - enabled: true - size: 5Gi - storageClassName: local-path - accessModes: - - ReadWriteOnce + storageClassName: local-path +enabledApps: + - aim-cluster-model-source + - airm + - amd-gpu-operator + - amd-gpu-operator-config + - appwrapper + - argocd + - argocd-config + - cert-manager + - cluster-auth + - cluster-auth-config + - cnpg-operator + - external-secrets + - external-secrets-config + - gateway-api + - gitea + - gitea-config + - kaiwo + - kaiwo-config + - kaiwo-crds + - keda + - kedify-otel + - keycloak + - kgateway + - kgateway-config + - kgateway-crds + - kserve + - kserve-crds + - kuberay-operator + - kueue + - kueue-config + - kyverno + - kyverno-config + - kyverno-policies-base # applicable to all cluster sizes + - kyverno-policies-storage-local-path # small & medium cluster sizes only + - metallb + - minio-operator + - minio-tenant + - minio-tenant-config + - openbao + - openbao-config + - openbao-init + - opentelemetry-operator + - otel-lgtm-stack + - prometheus-crds + - rabbitmq \ No newline at end of file diff --git a/root/values_small_original.yaml b/root/values_small_original.yaml new file mode 100644 index 00000000..c3a7282c --- /dev/null +++ b/root/values_small_original.yaml @@ -0,0 +1,189 @@ +# SMALL CLUSTER: All apps enabled (inherited from base 
values.yaml) +# Add Kyverno policy for local-path access mode mutation + +# Medium & Small clusters add local-path storage policy for RWX→RWO conversion +enabledApps: + - aim-cluster-model-source + - airm + - amd-gpu-operator + - amd-gpu-operator-config + - appwrapper + - argocd + - argocd-config + - cert-manager + - cluster-auth + - cluster-auth-config + - cnpg-operator + - external-secrets + - external-secrets-config + - gateway-api + - gitea + - gitea-config + - kaiwo + - kaiwo-config + - kaiwo-crds + - keda + - kedify-otel + - keycloak + - kgateway + - kgateway-config + - kgateway-crds + - kserve + - kserve-crds + - kuberay-operator + - kueue + - kueue-config + - kyverno + - kyverno-config + - kyverno-policies-base # applicable to all cluster sizes + - kyverno-policies-storage-local-path # small & medium cluster sizes only + - metallb + - minio-operator + - minio-tenant + - minio-tenant-config + - openbao + - openbao-config + - openbao-init + - opentelemetry-operator + - otel-lgtm-stack + - prometheus-crds + - rabbitmq + +apps: + # Modular Kyverno policy applications (only the storage-local-path addition) + kyverno-policies-storage-local-path: + namespace: kyverno + path: kyverno-policies/storage-local-path + source: clusterForge + syncOptions: + - CreateNamespace=true + ignoreDifferences: [] + wave: 26 # Deploy after base policies + syncWave: + - group: kyverno.io + kind: ClusterPolicy + + argocd: + valuesObject: + applicationSet: + replicas: 1 + controller: + replicas: 1 + resources: + limits: + cpu: "2000m" + memory: "4Gi" + requests: + cpu: "500m" + memory: "1Gi" + redis-ha: + enabled: false + redis: + resources: + limits: + cpu: "1000m" + memory: "2Gi" + requests: + cpu: "250m" + memory: "512Mi" + repoServer: + replicas: 1 + resources: + limits: + cpu: "1000m" + memory: "2Gi" + requests: + cpu: "250m" + memory: "512Mi" + server: + replicas: 1 + resources: + limits: + cpu: "500m" + memory: "1Gi" + requests: + cpu: "125m" + memory: "256Mi" + + minio-tenant: 
+ valuesObject: + tenant: + buckets: + - name: default-bucket + objectLock: true + - name: models + objectLock: true + - name: datasets + objectLock: false + pools: + - name: pool-0 + servers: 1 + size: 2Ti + storageClassName: local-path + volumesPerServer: 2 + resources: + limits: + cpu: "4000m" + memory: "8Gi" + requests: + cpu: "1000m" + memory: "2Gi" + + openbao: + valuesObject: + server: + ha: + enabled: false + replicas: 1 + raft: + enabled: false + dataStorage: + size: 5Gi + storageClass: local-path + resources: + limits: + cpu: "1000m" + memory: "2Gi" + requests: + cpu: "250m" + memory: "512Mi" + + prometheus: + valuesObject: + prometheus: + prometheusSpec: + retention: 15d + retentionSize: 20GB + resources: + limits: + cpu: "2000m" + memory: "4Gi" + requests: + cpu: "500m" + memory: "1Gi" + storageSpec: + volumeClaimTemplate: + spec: + accessModes: + - ReadWriteOnce + storageClassName: local-path + resources: + requests: + storage: 25Gi + + grafana: + valuesObject: + replicas: 1 + resources: + limits: + cpu: "1000m" + memory: "2Gi" + requests: + cpu: "250m" + memory: "512Mi" + persistence: + enabled: true + size: 5Gi + storageClassName: local-path + accessModes: + - ReadWriteOnce From 9ef87d60eca0d73d70a37643a39743c9d7150ba1 Mon Sep 17 00:00:00 2001 From: Daniel Vaskivaara Date: Mon, 2 Mar 2026 11:37:55 +0200 Subject: [PATCH 082/115] rm values backup file --- root/values_original.yaml | 808 -------------------------------------- 1 file changed, 808 deletions(-) delete mode 100644 root/values_original.yaml diff --git a/root/values_original.yaml b/root/values_original.yaml deleted file mode 100644 index a438b5dc..00000000 --- a/root/values_original.yaml +++ /dev/null @@ -1,808 +0,0 @@ -clusterForge: - repoUrl: "http://gitea-http.cf-gitea.svc:3000/cluster-org/cluster-forge.git" - targetRevision: main -# source helm values file from separate git repo -externalValues: - enabled: true - repoUrl: 
"http://gitea-http.cf-gitea.svc:3000/cluster-org/cluster-values.git" - targetRevision: main - path: values.yaml -global: - domain: # to be filled by bootstrap script - clusterSize: # to be filled by bootstrap script (small, medium, large) -# enabledApps list removed - each cluster size (small/medium/large) defines its own apps -# This eliminates the override/clobbering issue and makes cluster sizes more independent -apps: - # Core apps - argocd: - path: argocd/8.3.5 - namespace: argocd - valuesObject: - applicationSet: - replicas: 1 - configs: - cm: - create: true - resource.customizations.health.opentelemetry.io_OpenTelemetryCollector: | - hs = {} - hs.status = "Healthy" - hs.message = "Always Healthy - See https://github.com/argoproj/argo-cd/pull/24284" - return hs - resource.customizations.health.keda.sh_ScaledObject: | - hs = {} - if obj.status ~= nil then - if obj.status.conditions ~= nil then - for _, condition in ipairs(obj.status.conditions) do - if condition.type == "Ready" then - if condition.status == "True" then - hs.status = "Healthy" - hs.message = "ScaledObject is ready" - else - hs.status = "Degraded" - hs.message = condition.reason or "ScaledObject not ready" - end - return hs - resource.customizations.health.batch_Job: | - -- Custom health check for Jobs, especially openbao-init - hs = {} - if obj.status ~= nil then - if obj.status.conditions ~= nil then - for _, condition in ipairs(obj.status.conditions) do - if condition.type == "Complete" and condition.status == "True" then - hs.status = "Healthy" - hs.message = "Job completed successfully" - return hs - elseif condition.type == "Failed" and condition.status == "True" then - hs.status = "Degraded" - hs.message = "Job failed" - return hs - end - end - end - -- Check for active jobs - if obj.status.active and obj.status.active > 0 then - hs.status = "Progressing" - hs.message = "Job is running" - return hs - end - end - hs.status = "Progressing" - hs.message = "Job status unknown" - return hs - 
end - end - end - hs.status = "Progressing" - hs.message = "ScaledObject status unknown" - else - hs.status = "Progressing" - hs.message = "ScaledObject status unknown" - end - return hs - resource.customizations.health.apps_StatefulSet: | - -- Custom health check for OpenBao StatefulSet - -- Uses standard StatefulSet readiness for now - openbao-init job handles actual initialization - hs = {} - if obj.status ~= nil then - if obj.status.readyReplicas ~= nil and obj.status.replicas ~= nil then - if obj.status.readyReplicas == obj.status.replicas then - hs.status = "Healthy" - hs.message = "StatefulSet is ready" - else - hs.status = "Progressing" - hs.message = "Waiting for StatefulSet replicas to be ready" - end - else - hs.status = "Progressing" - hs.message = "Waiting for StatefulSet status" - end - else - hs.status = "Progressing" - hs.message = "Waiting for StatefulSet status" - end - return hs - params: - server.insecure: true - rbac: - create: true - policy.csv: | - g, argocd-users, role:admin - controller: - replicas: 1 - redis: - enabled: true - redis-ha: - enabled: false - repoServer: - replicas: 1 - autoscaling: - enabled: false - server: - replicas: 1 - autoscaling: - enabled: false - global: - domain: # to be filled by cluster-forge app - helmParameters: - - name: global.domain - value: "argocd.{{ .Values.global.domain }}" - - name: configs.cm.oidc\.config - value: | - name: Keycloak - issuer: https://kc.{{ .Values.global.domain }}/realms/airm - clientID: argocd - clientSecret: $$argocd-oidc-creds:client_secret - rootCA: $cluster-tls:cert - requestedScopes: ["openid", "profile", "email", "groups"] - syncWave: -30 - argocd-config: - path: argocd-config - namespace: argocd - syncWave: 5 - ignoreDifferences: - - group: external-secrets.io - kind: ExternalSecret - jqPathExpressions: - - ".spec.data[].remoteRef.conversionStrategy" - - ".spec.data[].remoteRef.decodingStrategy" - - ".spec.data[].remoteRef.metadataPolicy" - cert-manager: - namespace: 
cert-manager - path: cert-manager/v1.18.2 - syncWave: -40 - valuesObject: - installCRDs: true - openbao: - path: openbao/0.18.2 - namespace: cf-openbao - valuesObject: - injector: - enabled: false - server: - affinity: | - podAntiAffinity: - preferredDuringSchedulingIgnoredDuringExecution: - - weight: 100 - podAffinityTerm: - labelSelector: - matchLabels: - app.kubernetes.io/name: openbao - app.kubernetes.io/instance: openbao - component: server - topologyKey: kubernetes.io/hostname - ha: - enabled: false - raft: - enabled: false - replicas: 1 - ui: - enabled: true - syncWave: -70 - ignoreDifferences: - - group: "apps" - kind: "Deployment" - jsonPointers: - - /spec/replicas - - group: "apps" - kind: "StatefulSet" - name: "openbao" - jsonPointers: - - /spec/volumeClaimTemplates - openbao-init: - path: ../scripts/init-openbao-job - namespace: cf-openbao - valuesObject: - domain: # to be filled by cluster-forge app - helmParameters: - - name: domain - value: "{{ .Values.global.domain }}" - syncWave: -50 - openbao-config: - path: openbao-config/0.1.0 - namespace: cf-openbao - valuesFile: values.yaml - helmParameters: - - name: domain - value: "{{ .Values.global.domain }}" - syncWave: -60 - external-secrets: - path: external-secrets/0.15.1 - namespace: external-secrets - valuesFile: values.yaml - syncWave: -40 - external-secrets-config: - path: external-secrets-config - namespace: external-secrets - syncWave: -10 - gitea: - path: gitea/12.3.0 - namespace: cf-gitea - valuesObject: - clusterDomain: # to be filled by cluster-forge app - strategy: - type: "Recreate" - gitea: - admin: - existingSecret: gitea-admin-credentials - config: - server: - ROOT_URL: # to be filled by cluster-forge app - database: - DB_TYPE: sqlite3 - session: - PROVIDER: memory - cache: - ADAPTER: memory - queue: - TYPE: level - valkey-cluster: - enabled: false - valkey: - enabled: false - postgresql: - enabled: false - postgresql-ha: - enabled: false - persistence: - enabled: true - test: - enabled: 
false - service: - http: - sessionAffinity: "None" - ssh: - sessionAffinity: "None" - helmParameters: - - name: clusterDomain - value: "{{ .Values.global.domain }}" - - name: gitea.config.server.ROOT_URL - value: "https://gitea.{{ .Values.global.domain }}" - syncWave: -30 - gitea-config: - path: gitea-config - namespace: cf-gitea - valuesFile: values.yaml - helmParameters: - - name: keycloak.url - value: "https://kc.{{ .Values.global.domain }}" - - name: keycloak.realm - value: "airm" - syncWave: -20 - # Network apps - gateway-api: - path: gateway-api/v1.3.0 - namespace: default - syncWave: -50 - metallb: - path: metallb/v0.15.2 - namespace: default - syncWave: 10 - kgateway-crds: - path: kgateway-crds/v2.1.0-main - namespace: kgateway-system - valuesFile: values.yaml - syncWave: -30 - kgateway: - path: kgateway/v2.1.0-main - namespace: kgateway-system - valuesObject: - controller: - image: - registry: "ghcr.io" - repository: silogen/kgateway-v2.1.0-main-websocket - tag: "0.0.1" - syncWave: -20 - kgateway-config: - path: kgateway-config - namespace: kgateway-system - valuesFile: values.yaml - helmParameters: - - name: domain - value: "{{ .Values.global.domain }}" - syncWave: -20 - # Monitoring - prometheus-crds: - path: prometheus-operator-crds/23.0.0 - namespace: prometheus-system - valuesFile: values.yaml - syncWave: -50 - opentelemetry-operator: - path: opentelemetry-operator/0.93.1 - namespace: opentelemetry-operator-system - valuesObject: - # Cluster-forge specific values for opentelemetry-operator - # Sets the collector image to use contrib version (required for kaiwo/kedify-otel) - manager: - collectorImage: - repository: ghcr.io/open-telemetry/opentelemetry-collector-releases/opentelemetry-collector-contrib - tag: "0.140.0" - syncWave: -30 - otel-lgtm-stack: - path: otel-lgtm-stack/v1.0.7 - namespace: otel-lgtm-stack - valuesObject: - # Cluster-forge specific configuration for OpenTelemetry LGTM Stack - # This file overrides values.yaml for cluster-forge 
deployments - # Cluster identification - will be populated by root/values.yaml helmParameters - cluster: - name: # to be filled by cluster-forge app based on domain - # Component enablement (cluster-forge defaults) - dashboards: - enabled: true - nodeExporter: - enabled: true - kubeStateMetrics: - enabled: true - # Storage configuration optimized for cluster-forge - lgtm: - storage: - # Tempo storage for traces - tempo: 50Gi - # Loki storage for logs - loki: 50Gi - # Grafana storage for dashboards/config - grafana: 10Gi - # Mimir/Prometheus storage for metrics - mimir: 50Gi - # Loki additional storage - extra: 50Gi - # LGTM stack main deployment resources - resources: - limits: - memory: 8Gi - requests: - memory: 2Gi - cpu: "1" - # Resource configuration optimized for cluster-forge - collectors: - resources: - # Metrics collector (deployment mode) - metrics: - limits: - memory: 8Gi - cpu: "2" - requests: - memory: 1Gi - cpu: 500m - # Logs collector (daemonset mode) - logs: - limits: - memory: 2Gi - cpu: "1" - requests: - memory: 400Mi - cpu: 200m - # Service configuration - services: - # Main LGTM stack service ports - lgtm: - grafana: 3000 - otelGrpc: 4317 - otelHttp: 4318 - prometheus: 9090 - loki: 3100 - # Kube state metrics service port - kubeStateMetrics: - http: 8080 - # Node exporter service port - nodeExporter: - metrics: 9100 - helmParameters: - - name: cluster.name - value: "{{ .Values.global.domain }}" - syncWave: -20 - # Databases - cnpg-operator: - path: cnpg-operator/0.26.0 - namespace: cnpg-system - valuesFile: values.yaml - syncWave: -30 - # Access control - cluster-auth: - path: cluster-auth/0.5.0 - namespace: cluster-auth - valuesFile: values.yaml - syncWave: -20 - cluster-auth-config: - path: cluster-auth-config - namespace: cluster-auth - syncWave: 5 - ignoreDifferences: - - group: external-secrets.io - kind: ExternalSecret - jqPathExpressions: - - ".spec.data[].remoteRef.conversionStrategy" - - ".spec.data[].remoteRef.decodingStrategy" - - 
".spec.data[].remoteRef.metadataPolicy" - keycloak: - path: keycloak-old - namespace: keycloak - valuesObject: - replicaCount: 1 - resources: - limits: - cpu: "500m" - memory: "2Gi" - requests: - cpu: "250m" - memory: "512Mi" - podLabels: - app: keycloak - auth: - adminUser: admin - existingSecret: "keycloak-credentials" - passwordSecretKey: "KEYCLOAK_INITIAL_ADMIN_PASSWORD" - extraStartupArgs: "--cache=ispn --features=scripts,admin-fine-grained-authz,token-exchange --import-realm" - initContainers: - - command: - - /bin/sh - - -c - - | - cd /opt/scripts - zip -r /opt/keycloak/providers/SilogenExtensionPackage.jar . - image: ghcr.io/silogen/keycloak-init:0.1 - name: init-auth-extensions - volumeMounts: - - mountPath: /opt/keycloak/providers - name: keycloak-package-volume - - mountPath: /opt/scripts - name: keycloak-script-volume - - command: - - /bin/sh - - -c - - | - if [ -f "/opt/realm_templates/airm/airm-realm.json" ]; then - cp /opt/realm_templates/airm/airm-realm.json /opt/realms/airm-realm.json - sed -i -e "s/__AIRM_FRONTEND_CLIENT_SECRET__/$AIRM_FRONTEND_CLIENT_SECRET/g" /opt/realms/airm-realm.json - sed -i -e "s/__AIRM_ADMIN_CLIENT_ID__/$AIRM_ADMIN_CLIENT_ID/g" /opt/realms/airm-realm.json - sed -i -e "s/__AIRM_ADMIN_CLIENT_SECRET__/$AIRM_ADMIN_CLIENT_SECRET/g" /opt/realms/airm-realm.json - sed -i -e "s/__AIRM_CI_CLIENT_SECRET__/$AIRM_CI_CLIENT_SECRET/g" /opt/realms/airm-realm.json - - sed -i -e "s/__K8S_CLIENT_SECRET__/$K8S_CLIENT_SECRET/g" /opt/realms/airm-realm.json - sed -i -e "s/__MINIO_CLIENT_SECRET__/$MINIO_CLIENT_SECRET/g" /opt/realms/airm-realm.json - sed -i -e "s/__GITEA_CLIENT_SECRET__/$GITEA_CLIENT_SECRET/g" /opt/realms/airm-realm.json - sed -i -e "s/__ARGOCD_CLIENT_SECRET__/$ARGOCD_CLIENT_SECRET/g" /opt/realms/airm-realm.json - sed -i -e "s/__DEVUSER_INITIAL_PASSWORD__/$DEVUSER_INITIAL_PASSWORD/g" /opt/realms/airm-realm.json - else - echo "Warning: /opt/realm_templates/airm/airm-realm.json not found, skipping airm realm setup" - fi - env: - - 
name: AIRM_FRONTEND_CLIENT_SECRET - valueFrom: - secretKeyRef: - key: FRONTEND_CLIENT_SECRET - name: airm-realm-credentials - - name: AIRM_ADMIN_CLIENT_ID - valueFrom: - secretKeyRef: - key: ADMIN_CLIENT_ID - name: airm-realm-credentials - - name: AIRM_ADMIN_CLIENT_SECRET - valueFrom: - secretKeyRef: - key: ADMIN_CLIENT_SECRET - name: airm-realm-credentials - - name: AIRM_CI_CLIENT_SECRET - valueFrom: - secretKeyRef: - key: CI_CLIENT_SECRET - name: airm-realm-credentials - - name: K8S_CLIENT_SECRET - valueFrom: - secretKeyRef: - key: K8S_CLIENT_SECRET - name: airm-realm-credentials - - name: MINIO_CLIENT_SECRET - valueFrom: - secretKeyRef: - key: MINIO_CLIENT_SECRET - name: airm-realm-credentials - - name: GITEA_CLIENT_SECRET - valueFrom: - secretKeyRef: - key: GITEA_CLIENT_SECRET - name: airm-realm-credentials - - name: ARGOCD_CLIENT_SECRET - valueFrom: - secretKeyRef: - key: ARGOCD_CLIENT_SECRET - name: airm-realm-credentials - - name: DEVUSER_INITIAL_PASSWORD - valueFrom: - secretKeyRef: - key: KEYCLOAK_INITIAL_DEVUSER_PASSWORD - name: airm-realm-credentials - image: ghcr.io/silogen/keycloak-init:0.1 - name: init-realm-scripts - volumeMounts: - - mountPath: /opt/realm_templates/airm - name: keycloak-airm-realm-template-volume - - mountPath: /opt/realm_templates/k8s - name: keycloak-k8s-realm-template-volume - - mountPath: /opt/realms - name: keycloak-realm-volume - extraVolumes: - - configMap: - name: keycloak-scripts - items: - - key: keycloak-scripts.json - path: META-INF/keycloak-scripts.json - - key: domain-group-authenticator.js - path: domain-group-authenticator.js - name: keycloak-script-volume - - emptyDir: {} - name: keycloak-package-volume - - configMap: - name: keycloak-realm-templates-7kgh2hc6b2 - name: keycloak-airm-realm-template-volume - - emptyDir: {} - name: keycloak-realm-volume - - configMap: - name: keycloak-realm-templates-k8s - name: keycloak-k8s-realm-template-volume - extraVolumeMounts: - - mountPath: /opt/keycloak/providers - name: 
keycloak-package-volume - - mountPath: /opt/keycloak/data/import - name: keycloak-realm-volume - helmParameters: - - name: domain - value: "{{ .Values.global.domain }}" - syncWave: -10 - ignoreDifferences: - - group: external-secrets.io - kind: ExternalSecret - jqPathExpressions: - - ".spec.data[].remoteRef.conversionStrategy" - - ".spec.data[].remoteRef.decodingStrategy" - - ".spec.data[].remoteRef.metadataPolicy" - kyverno: - path: kyverno/3.5.1 - namespace: kyverno - valuesFile: values.yaml - syncWave: -30 - kyverno-config: - path: kyverno-config - namespace: kyverno - syncWave: -20 - ignoreDifferences: - - group: "kyverno.io" - kind: "ClusterPolicy" - name: "local-path-access-mode-mutation" - jsonPointers: - - /spec/rules/0/skipBackgroundRequests - - group: "kyverno.io" - kind: "ClusterPolicy" - name: "local-path-access-mode-warning" - jsonPointers: - - /spec/rules/0/skipBackgroundRequests - - /spec/rules/0/validate/allowExistingViolations - kyverno-policies-base: - namespace: kyverno - path: kyverno-policies/base - syncWave: -20 - # GPU - amd-gpu-operator: - path: amd-gpu-operator/v1.4.1 - namespace: kube-amd-gpu - valuesObject: - crds: - defaultCR: - install: false - syncWave: -10 - amd-gpu-operator-config: - path: amd-gpu-operator-config - namespace: kube-amd-gpu - syncWave: 0 - kuberay-operator: - path: kuberay-operator/1.4.2 - namespace: default - valuesFile: values.yaml - syncWave: -10 - # Autoscaling - keda: - path: keda/2.18.1 - namespace: keda - valuesFile: values.yaml - syncWave: -10 - kedify-otel: - path: kedify-otel/v0.0.6 - namespace: keda - valuesObject: - # Cluster-forge specific values for kedify-otel - validatingAdmissionPolicy: - enabled: false - syncWave: -5 - ignoreDifferences: - - group: "" - kind: "Service" - name: "keda-otel-scaler" - jqPathExpressions: - - ".status" - - group: "apps" - kind: "Deployment" - jqPathExpressions: - - ".status.readyReplicas" - - ".status.availableReplicas" - # ML/AI - kserve-crds: - path: kserve-crds/v0.16.0 - 
namespace: kserve-system - valuesFile: values.yaml - syncWave: -30 - kserve: - path: kserve/v0.16.0 - namespace: kserve-system - valuesObject: - kserve: - controller: - deploymentMode: "Standard" - syncWave: 0 - # Queues - rabbitmq: - path: rabbitmq/v2.15.0 - namespace: rabbitmq-system - syncWave: -10 - kueue: - path: kueue/0.13.0 - namespace: kueue-system - valuesObject: - controllerManager: - replicas: 1 - mutatingWebhook: - reinvocationPolicy: IfNeeded - managerConfig: - controllerManagerConfigYaml: |- - apiVersion: config.kueue.x-k8s.io/v1beta1 - kind: Configuration - health: - healthProbeBindAddress: :8081 - metrics: - bindAddress: :8443 - # enableClusterQueueResources: true - webhook: - port: 9443 - leaderElection: - leaderElect: true - resourceName: c1f6bfd2.kueue.x-k8s.io - controller: - groupKindConcurrency: - Job.batch: 5 - Pod: 5 - Workload.kueue.x-k8s.io: 5 - LocalQueue.kueue.x-k8s.io: 1 - Cohort.kueue.x-k8s.io: 1 - ClusterQueue.kueue.x-k8s.io: 1 - ResourceFlavor.kueue.x-k8s.io: 1 - clientConnection: - qps: 50 - burst: 100 - managedJobsNamespaceSelector: - matchLabels: - kueue-managed: "true" - integrations: - frameworks: - - "batch/job" - - "kubeflow.org/mpijob" - - "ray.io/rayjob" - - "ray.io/raycluster" - - "jobset.x-k8s.io/jobset" - - "kubeflow.org/paddlejob" - - "kubeflow.org/pytorchjob" - - "kubeflow.org/tfjob" - - "kubeflow.org/xgboostjob" - - "kubeflow.org/jaxjob" - - "workload.codeflare.dev/appwrapper" - - "pod" - - "deployment" - - "statefulset" - syncWave: -10 - kueue-config: - path: kueue-config - namespace: kueue-system - syncWave: -10 - appwrapper: - path: appwrapper/v1.1.2 - namespace: appwrapper-system - syncWave: -10 - # Storage - minio-operator: - path: minio-operator/7.1.1 - namespace: minio-operator - valuesFile: values.yaml - syncWave: -10 - minio-tenant: - path: minio-tenant/7.1.1 - namespace: minio-tenant-default - valuesObject: - tenant: - name: default-minio-tenant - configSecret: - name: default-minio-tenant-env-configuration - 
existingSecret: true - pools: - - servers: 1 - name: pool-0 - volumesPerServer: 1 - size: 250Gi # Reduced from 500Gi for workstation - storageClassName: direct - buckets: - - name: default-bucket - objectLock: true - - name: models - objectLock: true - users: - - name: default-user - certificate: - requestAutoCert: false - externalCaCertSecret: - - name: cluster-tls - type: kubernetes.io/secret/v1 - env: - - name: MINIO_PROMETHEUS_AUTH_TYPE - value: "public" - syncWave: 0 - minio-tenant-config: - path: minio-tenant-config - namespace: minio-tenant-default - valuesFile: values.yaml - helmParameters: - - name: domain - value: "{{ .Values.global.domain }}" - syncWave: 0 - ignoreDifferences: - - group: external-secrets.io - kind: ExternalSecret - jqPathExpressions: - - ".spec.data[].remoteRef.conversionStrategy" - - ".spec.data[].remoteRef.decodingStrategy" - - ".spec.data[].remoteRef.metadataPolicy" - # Kaiwo (Kubernetes AI Workload Orchestrator) - aim-cluster-model-source: - path: aim-cluster-model-source - namespace: kaiwo-system - syncWave: -20 - kaiwo-crds: - path: kaiwo-crds/v0.2.0-rc11 - namespace: kaiwo-system - syncWave: -20 - kaiwo: - path: kaiwo/v0.2.0-rc11 - namespace: kaiwo-system - valuesFile: values.yaml - syncWave: -10 - kaiwo-config: - path: kaiwo-config - namespace: kaiwo-system - syncWave: 0 - ignoreDifferences: - - group: external-secrets.io - kind: ExternalSecret - jqPathExpressions: - - ".spec.data[].remoteRef.conversionStrategy" - - ".spec.data[].remoteRef.decodingStrategy" - - ".spec.data[].remoteRef.metadataPolicy" - - group: "" - kind: "PersistentVolumeClaim" - jsonPointers: - - /spec/accessModes - # AMD Resource Manager (AIRM) - airm: - path: airm/0.3.5 - namespace: airm - valuesFile: values.yaml - helmParameters: - - name: airm-api.airm.appDomain - value: "{{ .Values.global.domain }}" - syncWave: 0 - ignoreDifferences: - - group: external-secrets.io - kind: ExternalSecret - jqPathExpressions: - - ".spec.data[].remoteRef.conversionStrategy" - 
- ".spec.data[].remoteRef.decodingStrategy" - - ".spec.data[].remoteRef.metadataPolicy" - - group: kyverno.io - kind: ClusterPolicy - jqPathExpressions: - - ".spec.rules" From 4198a2bc9a13468e663220b27d5469f5b3ca960c Mon Sep 17 00:00:00 2001 From: Daniel Vaskivaara Date: Mon, 2 Mar 2026 11:58:09 +0200 Subject: [PATCH 083/115] fix remove main as hard-coded revision, since bootstrap.sh always clobbers (only update one place!) --- root/values.yaml | 6 +++--- scripts/bootstrap.sh | 27 +++++++++++++++++---------- 2 files changed, 20 insertions(+), 13 deletions(-) diff --git a/root/values.yaml b/root/values.yaml index 42904a61..217576a8 100644 --- a/root/values.yaml +++ b/root/values.yaml @@ -1,14 +1,14 @@ clusterForge: repoUrl: "http://gitea-http.cf-gitea.svc:3000/cluster-org/cluster-forge.git" - targetRevision: main + targetRevision: # injected via scripts/bootstrap.sh; tag, branch, or commit externalValues: enabled: true path: values.yaml repoUrl: "http://gitea-http.cf-gitea.svc:3000/cluster-org/cluster-values.git" targetRevision: main global: - clusterSize: # to be filled by bootstrap script (small, medium, large) - domain: # to be filled by bootstrap script + clusterSize: # injected via scripts/bootstrap.sh + domain: # injected via scripts/bootstrap.sh apps: aim-cluster-model-source: diff --git a/scripts/bootstrap.sh b/scripts/bootstrap.sh index a32952c6..f0f584ae 100755 --- a/scripts/bootstrap.sh +++ b/scripts/bootstrap.sh @@ -387,16 +387,22 @@ bootstrap_gitea() { openssl rand -hex 16 | tr 'a-f' 'A-F' | head -c 32 } - # Create initial-cf-values configmap (simple values for gitea-init-job) - cat > /tmp/simple_values.yaml << EOF - global: - domain: ${DOMAIN} - clusterSize: ${SIZE_VALUES_FILE} - clusterForge: - targetRevision: ${TARGET_REVISION} -EOF + # Create initial-cf-values configmap (complete values for gitea-init-job) + # Use the complete root values.yaml with filled placeholders instead of simplified version + cp "${SOURCE_ROOT}/root/${VALUES_FILE}" 
/tmp/complete_values.yaml + + # Fill in placeholder values using yq (these are used by gitea-init job) + yq eval ".global.domain = \"${DOMAIN}\"" -i /tmp/complete_values.yaml + yq eval ".global.clusterSize = \"${SIZE_VALUES_FILE}\"" -i /tmp/complete_values.yaml + yq eval ".clusterForge.targetRevision = \"${TARGET_REVISION}\"" -i /tmp/complete_values.yaml + + # Merge with size-specific values if they exist + if [ -f "${SOURCE_ROOT}/root/${SIZE_VALUES_FILE}" ]; then + yq eval-all 'select(fileIndex == 0) * select(fileIndex == 1)' /tmp/complete_values.yaml "${SOURCE_ROOT}/root/${SIZE_VALUES_FILE}" > /tmp/complete_values_merged.yaml + mv /tmp/complete_values_merged.yaml /tmp/complete_values.yaml + fi - kubectl create configmap initial-cf-values --from-literal=initial-cf-values="$(cat /tmp/simple_values.yaml)" --dry-run=client -o yaml | apply_or_template -n cf-gitea -f - + kubectl create configmap initial-cf-values --from-literal=initial-cf-values="$(cat /tmp/complete_values.yaml)" --dry-run=client -o yaml | apply_or_template -n cf-gitea -f - kubectl create secret generic gitea-admin-credentials \ --namespace=cf-gitea \ @@ -479,8 +485,9 @@ apply_cluster_forge_parent_app() { --set global.domain="${DOMAIN}" \ --set global.clusterSize="${SIZE_VALUES_FILE}" \ --set clusterForge.targetRevision="${TARGET_REVISION}" \ + --set externalValues.targetRevision="${TARGET_REVISION}" \ --set externalValues.repoUrl="http://gitea-http.cf-gitea.svc:3000/cluster-org/cluster-values.git" \ - --set clusterForge.repoUrl="http://gitea-http.cf-gitea.svc:3000/cluster-org/cluster-forge.git" \ + --set clusterForge.repoUrl="http://gitea-http.cf-gitea.svc:3000/cluster-forge.git" \ --namespace argocd \ --kube-version "${KUBE_VERSION}" | apply_or_template -f - } From 4a88c30effe4ec49617f2e30125e0ffb69307a9e Mon Sep 17 00:00:00 2001 From: Daniel Vaskivaara Date: Mon, 2 Mar 2026 12:01:27 +0200 Subject: [PATCH 084/115] perf(Keycloak): improve use of heap memory mgmt (solves OOM issues despite 2GB memory 
limit) --- root/values.yaml | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/root/values.yaml b/root/values.yaml index 217576a8..681aa34d 100644 --- a/root/values.yaml +++ b/root/values.yaml @@ -342,6 +342,14 @@ apps: adminUser: admin existingSecret: "keycloak-credentials" passwordSecretKey: "KEYCLOAK_INITIAL_ADMIN_PASSWORD" + extraEnvVars: + - name: JAVA_OPTS_APPEND + value: >- + -XX:MaxRAMPercentage=65.0 + -XX:InitialRAMPercentage=50.0 + -XX:MaxMetaspaceSize=512m + -XX:+ExitOnOutOfMemoryError + -Djava.awt.headless=true extraStartupArgs: "--cache=ispn --features=scripts,admin-fine-grained-authz,token-exchange --import-realm" extraVolumeMounts: - mountPath: /opt/keycloak/providers From 634ed924fee346bdb0c3d1abf6506c88101307b2 Mon Sep 17 00:00:00 2001 From: Daniel Vaskivaara Date: Mon, 2 Mar 2026 12:12:37 +0200 Subject: [PATCH 085/115] docs: update to reflect restructuring --- PRD.md | 76 +++++++++++++++++++++--------- README.md | 22 +++++---- docs/bootstrap_guide.md | 41 ++++++++-------- docs/cluster_size_configuration.md | 6 +-- docs/kyverno_access_mode_policy.md | 14 +++--- docs/kyverno_modular_design.md | 6 +-- docs/values_inheritance_pattern.md | 6 +-- 7 files changed, 104 insertions(+), 67 deletions(-) diff --git a/PRD.md b/PRD.md index 82cb7b8d..7883a3f2 100644 --- a/PRD.md +++ b/PRD.md @@ -28,13 +28,13 @@ Cluster-Forge uses a three-phase bootstrap process that establishes GitOps infra **Phase 2: GitOps Foundation Bootstrap** (Manual Helm Templates) 1. **ArgoCD** (v8.3.5) - GitOps controller deployed via helm template + kubectl apply -2. **OpenBao** (v0.18.2) - Secrets management with init job to configure vault, policies, and initial secrets -3. **Gitea** (v12.3.0) - Git server with init job to create cluster-forge and cluster-values repositories +2. 
**Gitea** (v12.3.0) - Git server with init job to create cluster-forge and cluster-values repositories **Phase 3: App-of-Apps Deployment** (ArgoCD-Managed) - Creates cluster-forge Application pointing to root/ helm chart -- ArgoCD syncs and manages all remaining applications from enabledApps list -- Applications deployed in wave order (-5 to 0) based on dependencies +- ArgoCD syncs all remaining applications including OpenBao from enabledApps list +- Applications deployed in wave order (-70 to 0) based on dependencies +- OpenBao (v0.18.2) deployed via ArgoCD with openbao-init job for vault configuration ### Dual Repository GitOps Pattern @@ -96,11 +96,11 @@ The cluster-forge Application uses multi-source feature when externalValues.enab ### Component Categories -**Layer 1: GitOps Foundation** (Sync Wave -4 to -3) -- ArgoCD 8.3.5 - GitOps continuous deployment controller -- Gitea 12.3.0 - Self-hosted Git server with SQLite backend -- OpenBao 0.18.2 - Vault-compatible secrets management -- External Secrets 0.15.1 - Secrets synchronization operator +**Layer 1: GitOps Foundation** (Bootstrap + Sync Wave -70 to -30) +- ArgoCD 8.3.5 - GitOps continuous deployment controller (bootstrap) +- Gitea 12.3.0 - Self-hosted Git server with SQLite backend (bootstrap) +- OpenBao 0.18.2 - Vault-compatible secrets management (ArgoCD-managed, sync wave -70) +- External Secrets 0.15.1 - Secrets synchronization operator (sync wave -40) **Layer 2: Core Infrastructure** (Sync Wave -5 to -2) @@ -199,24 +199,57 @@ cluster-forge/ ### Single-Command Bootstrap -The bootstrap.sh script orchestrates complete cluster setup: +The bootstrap.sh script orchestrates complete cluster setup with flexible options: ```bash -./scripts/bootstrap.sh [--CLUSTER_SIZE=small|medium|large] +./scripts/bootstrap.sh [options] ``` +**Available Options:** +- `--cluster-size=[small|medium|large]` - Cluster size configuration (default: medium) +- `--apps=APP1,APP2` - Deploy only specified components + - Bootstrap apps: 
`namespaces`, `argocd`, `gitea`, `cluster-forge` + - Child apps: Any app from enabledApps list (e.g., `openbao`, `keycloak`, `keda`) +- `--target-revision=BRANCH` - cluster-forge git revision for ArgoCD (default: latest release tag) +- `--template-only` or `-t` - Output YAML manifests instead of applying to cluster +- `--skip-deps` - Skip dependency checking for advanced users + **Bootstrap Process:** -1. **Validation** - Checks domain, cluster size, values files, yq tool availability +1. **Validation** - Checks domain, cluster size, values files, required tool availability 2. **Pre-cleanup** - Removes previous installations if gitea-init-job completed 3. **Values Merge** - Combines base + size-specific values with domain injection -4. **Namespace Creation** - Creates argocd, cf-gitea, cf-openbao namespaces +4. **Namespace Creation** - Creates argocd, cf-gitea namespaces 5. **ArgoCD Deployment** - helm template + kubectl apply with server-side apply -6. **OpenBao Deployment** - helm template + kubectl apply, waits for pod ready -7. **OpenBao Init Job** - Configures vault policies, auth methods, initial secrets -8. **Gitea Deployment** - helm template + kubectl apply, waits for rollout -9. **Gitea Init Job** - Creates cluster-org, clones/pushes cluster-forge and cluster-values repos -10. **ClusterForge App** - Creates root Application with merged values -11. **Cleanup** - Removes temporary values files +6. **Gitea Deployment** - helm template + kubectl apply, waits for rollout +7. **Gitea Init Job** - Creates cluster-org, clones/pushes cluster-forge and cluster-values repos +8. **ClusterForge App** - Creates root Application that manages all remaining components via ArgoCD +9. 
**Component Deployment** - ArgoCD syncs all enabledApps including OpenBao, secrets, and application stack + +### Selective Component Deployment + +The `--apps` flag enables targeted deployment for development and troubleshooting: + +**Bootstrap Components** (deployed via helm template): +- `namespaces` - Core namespaces (argocd, cf-gitea) +- `argocd` - GitOps controller +- `gitea` - Local Git server +- `cluster-forge` - Root ArgoCD Application + +**Child Components** (deployed via ArgoCD sync): +- Any application from enabledApps list +- Examples: `openbao,openbao-init`, `keycloak`, `keda,kedify-otel` + +**Usage Examples:** +```bash +# Deploy only core GitOps foundation +./scripts/bootstrap.sh example.com --apps=namespaces,argocd,gitea,cluster-forge + +# Deploy only secrets management +./scripts/bootstrap.sh example.com --apps=openbao,openbao-init,openbao-config + +# Render manifests for debugging +./scripts/bootstrap.sh example.com --apps=keycloak --template-only +``` ### Self-Contained GitOps @@ -314,8 +347,9 @@ Each major component has -config variant: **Bootstrap Secret Flow:** - bootstrap.sh generates initial passwords with `openssl rand -hex 16` -- openbao-init-job writes secrets to OpenBao -- External Secrets Operator syncs to Kubernetes Secrets +- ArgoCD deploys OpenBao via cluster-forge Application +- openbao-init-job (sync wave -50) writes secrets to OpenBao +- External Secrets Operator (sync wave -40) syncs to Kubernetes Secrets - Applications consume via secret references ### Modular Policy System diff --git a/README.md b/README.md index 3058fb03..4b9cceeb 100644 --- a/README.md +++ b/README.md @@ -19,19 +19,25 @@ Using a bootstrap-first deployment model, Cluster-Forge establishes GitOps infra ### Single-Command Deployment ```bash -./scripts/bootstrap.sh [--CLUSTER_SIZE=small|medium|large] +./scripts/bootstrap.sh [--cluster-size=small|medium|large] ``` ### Size-Aware Deployment Examples ```bash # Small cluster (1-5 users, development/testing) 
-./scripts/bootstrap.sh dev.example.com --CLUSTER_SIZE=small +./scripts/bootstrap.sh dev.example.com --cluster-size=small # Medium cluster (5-20 users, team production) [DEFAULT] -./scripts/bootstrap.sh team.example.com --CLUSTER_SIZE=medium +./scripts/bootstrap.sh team.example.com --cluster-size=medium # Large cluster (10s-100s users, enterprise scale) -./scripts/bootstrap.sh prod.example.com --CLUSTER_SIZE=large +./scripts/bootstrap.sh prod.example.com --cluster-size=large + +# Deploy only specific components +./scripts/bootstrap.sh dev.example.com --apps=argocd,gitea,cluster-forge + +# Deploy from specific branch/tag +./scripts/bootstrap.sh prod.example.com --target-revision=v1.8.0 ``` For detailed deployment instructions, see the [Bootstrap Guide](docs/bootstrap_guide.md). @@ -48,13 +54,13 @@ Cluster-Forge uses a three-phase bootstrap process: **Phase 2: GitOps Foundation Bootstrap** (Manual Helm Templates) 1. **ArgoCD** (v8.3.5) - GitOps controller deployed via helm template -2. **OpenBao** (v0.18.2) - Secrets management with initialization job -3. **Gitea** (v12.3.0) - Git server with initialization job +2. 
**Gitea** (v12.3.0) - Git server with initialization job **Phase 3: App-of-Apps Deployment** (ArgoCD-Managed) - Creates cluster-forge Application pointing to root/ helm chart -- ArgoCD syncs all remaining applications from enabledApps list -- Applications deployed in wave order (-5 to 0) based on dependencies +- ArgoCD syncs all remaining applications including OpenBao from enabledApps list +- Applications deployed in wave order (-70 to 0) based on dependencies +- OpenBao (v0.18.2) managed via ArgoCD with openbao-init job ### Dual Repository GitOps Pattern diff --git a/docs/bootstrap_guide.md b/docs/bootstrap_guide.md index 0d250a19..fe64f52e 100644 --- a/docs/bootstrap_guide.md +++ b/docs/bootstrap_guide.md @@ -1,6 +1,6 @@ # Bootstrap Guide -This guide explains how to bootstrap a complete GitOps environment using Cluster-Forge's three-phase deployment model. The bootstrap process establishes ArgoCD, OpenBao (secret management), and Gitea (Git repository) before deploying the full application stack. +This guide explains how to bootstrap a complete GitOps environment using Cluster-Forge's three-phase deployment model. The bootstrap process establishes ArgoCD and Gitea (Git repository) as foundation components, then creates the cluster-forge Application which manages all remaining components including OpenBao and the full application stack via ArgoCD. 
## Prerequisites @@ -14,7 +14,7 @@ This guide explains how to bootstrap a complete GitOps environment using Cluster ## Usage ```bash -./scripts/bootstrap.sh [--CLUSTER_SIZE=small|medium|large] +./scripts/bootstrap.sh [--cluster-size=small|medium|large] ``` ### Arguments @@ -26,7 +26,7 @@ This guide explains how to bootstrap a complete GitOps environment using Cluster - **--apps=APP1,APP2**: Deploy only specified components (default: applies to cluster) - options: `namespaces`, `argocd`, `gitea`, `cluster-forge`, or any cluster-forge child app (see values.yaml for app names) - Use with `--template-only` to render instead of applying -- **--CLUSTER_SIZE** `[small|medium|large]`: Cluster size configuration (default: `medium`) +- **--cluster-size** `[small|medium|large]`: Cluster size configuration (default: `medium`) - **--template-only**, **-t**: Output YAML manifests to stdout instead of applying to cluster - **--target-revision**, **-r**: cluster-forge git revision for ArgoCD to sync from - **--skip-deps**: Skip dependency checking (for advanced users) @@ -39,7 +39,7 @@ This guide explains how to bootstrap a complete GitOps environment using Cluster ./scripts/bootstrap.sh 192.168.1.100.nip.io # Large cluster -./scripts/bootstrap.sh example.com --CLUSTER_SIZE=large +./scripts/bootstrap.sh example.com --cluster-size=large # Deploy only specific components ./scripts/bootstrap.sh example.com --apps=openbao,openbao-init @@ -71,10 +71,9 @@ The bootstrap script uses a three-phase deployment model: - Sets `global.domain` and `global.clusterSize` in merged configuration **2. Namespace Creation** -Creates three namespaces for core components: +Creates two namespaces for bootstrap components: - `argocd` - GitOps controller - `cf-gitea` - Git repository server -- `cf-openbao` - Secret management system **3. 
ArgoCD Bootstrap** - Extracts ArgoCD values from merged configuration @@ -86,18 +85,7 @@ Creates three namespaces for core components: - redis Deployment - repo-server Deployment -**4. OpenBao Bootstrap** -- Extracts OpenBao values from merged configuration -- Deploys OpenBao using `helm template` with server-side apply -- Waits for `openbao-0` pod to be running -- Runs initialization job (`openbao-init-job`) which: - - Initializes OpenBao Raft cluster - - Unseals all pods (3 for large clusters with HA) - - Configures Vault policies for each namespace - - Creates Kubernetes auth method - - Stores initialization keys and secrets - -**5. Gitea Bootstrap** +**4. Gitea Bootstrap** - Generates random admin password using `openssl rand -hex 16` - Creates `initial-cf-values` ConfigMap with merged configuration - Creates `gitea-admin-credentials` secret @@ -112,14 +100,23 @@ Creates three namespaces for core components: ### Phase 3: App-of-Apps Deployment (ArgoCD-Managed) -**6. ClusterForge Application Deployment** +**5. ClusterForge Application Deployment** - Renders root helm chart with merged configuration - Creates `cluster-forge` Application resource in ArgoCD +- ArgoCD syncs all remaining components in wave order: + - Wave -70: OpenBao (secrets management) + - Wave -60: OpenBao configuration + - Wave -50: OpenBao initialization job + - Wave -40: External Secrets, Cert-Manager + - Wave -30 to 0: All other applications + +**Key Improvement**: OpenBao is now managed by ArgoCD rather than bootstrapped separately, simplifying the bootstrap process while maintaining proper dependency ordering through sync waves. 
+ - When `externalValues.enabled: true`, uses multi-source feature: - Source 1: cluster-forge repo (root/ helm chart) - Source 2: cluster-values repo (custom values.yaml) -- ArgoCD deploys all enabled applications based on configuration -- Applications deployed in wave order (-5 to 0) based on dependencies +- ArgoCD manages the complete application lifecycle +- Proper dependency ordering ensures OpenBao is ready before applications that depend on secrets **7. Cleanup** - Removes temporary merged values files from /tmp/ @@ -164,7 +161,7 @@ ClusterForge uses a layered configuration approach with YAML merge semantics: targetRevision: main global: - clusterSize: medium # Set by --CLUSTER_SIZE flag + clusterSize: medium # Set by --cluster-size flag domain: example.com # Set by domain argument ``` diff --git a/docs/cluster_size_configuration.md b/docs/cluster_size_configuration.md index 3c91524d..fb652774 100644 --- a/docs/cluster_size_configuration.md +++ b/docs/cluster_size_configuration.md @@ -99,9 +99,9 @@ The bootstrap script automatically applies the appropriate size configuration: ./scripts/bootstrap.sh example.com # Explicitly specify cluster size -./scripts/bootstrap.sh example.com --CLUSTER_SIZE=small -./scripts/bootstrap.sh example.com --CLUSTER_SIZE=medium -./scripts/bootstrap.sh example.com --CLUSTER_SIZE=large +./scripts/bootstrap.sh example.com --cluster-size=small +./scripts/bootstrap.sh example.com --cluster-size=medium +./scripts/bootstrap.sh example.com --cluster-size=large ``` ### Configuration Merge Logic diff --git a/docs/kyverno_access_mode_policy.md b/docs/kyverno_access_mode_policy.md index 81379df0..16bece97 100644 --- a/docs/kyverno_access_mode_policy.md +++ b/docs/kyverno_access_mode_policy.md @@ -110,9 +110,9 @@ cluster-forge/ # The policy is deployed or not based on values_*.yaml configuration # Large clusters simply don't include the policy in enabledApps -./scripts/bootstrap.sh example.com --CLUSTER_SIZE=small # Policy deployed 
-./scripts/bootstrap.sh example.com --CLUSTER_SIZE=medium # Policy deployed -./scripts/bootstrap.sh example.com --CLUSTER_SIZE=large # Policy NOT deployed +./scripts/bootstrap.sh example.com --cluster-size=small # Policy deployed +./scripts/bootstrap.sh example.com --cluster-size=medium # Policy deployed +./scripts/bootstrap.sh example.com --cluster-size=large # Policy NOT deployed ``` ## Usage Examples @@ -121,7 +121,7 @@ cluster-forge/ #### Small/Medium Cluster ```bash -./scripts/bootstrap.sh dev.example.com --CLUSTER_SIZE=small +./scripts/bootstrap.sh dev.example.com --cluster-size=small ``` **Result**: - Kyverno policy **deployed** @@ -130,7 +130,7 @@ cluster-forge/ #### Large Cluster ```bash -./scripts/bootstrap.sh prod.example.com --CLUSTER_SIZE=large +./scripts/bootstrap.sh prod.example.com --cluster-size=large ``` **Result**: - Kyverno policy **NOT deployed at all** @@ -217,7 +217,7 @@ kubectl get applications -n argocd -o yaml | grep "local-path-access-mode-polici # If found, the wrong values_*.yaml was used # Redeploy with correct size: -./scripts/bootstrap.sh --CLUSTER_SIZE=large + ./scripts/bootstrap.sh --cluster-size=large ``` #### Policy NOT Working on Small/Medium Clusters @@ -262,7 +262,7 @@ kubectl logs -n kyverno -l app.kubernetes.io/name=kyverno ### Upgrading from Small/Medium to Large 1. **Deploy large cluster configuration**: ```bash - ./scripts/bootstrap.sh --CLUSTER_SIZE=large +./scripts/bootstrap.sh --cluster-size=large ``` 2. **Policy automatically removed** (not in enabledApps) 3. 
**Deploy Longhorn** for native RWX support diff --git a/docs/kyverno_modular_design.md b/docs/kyverno_modular_design.md index 92e8ebeb..9925e4d5 100644 --- a/docs/kyverno_modular_design.md +++ b/docs/kyverno_modular_design.md @@ -125,13 +125,13 @@ enabledApps: ```bash # Small cluster - Main + local-path policies -./scripts/bootstrap.sh dev.example.com --CLUSTER_SIZE=small +./scripts/bootstrap.sh dev.example.com --cluster-size=small # Medium cluster - Main + local-path policies -./scripts/bootstrap.sh team.example.com --CLUSTER_SIZE=medium +./scripts/bootstrap.sh team.example.com --cluster-size=medium # Large cluster - Exactly same as main branch -./scripts/bootstrap.sh prod.example.com --CLUSTER_SIZE=large +./scripts/bootstrap.sh prod.example.com --cluster-size=large ``` ### 🔍 **Policy Verification** diff --git a/docs/values_inheritance_pattern.md b/docs/values_inheritance_pattern.md index 131a26e6..634fcd3c 100644 --- a/docs/values_inheritance_pattern.md +++ b/docs/values_inheritance_pattern.md @@ -10,12 +10,12 @@ Cluster-Forge implements a sophisticated dual-repository GitOps deployment patte ```yaml clusterForge: repoUrl: "http://gitea-http.cf-gitea.svc:3000/cluster-org/cluster-forge.git" - targetRevision: main + targetRevision: # filled by bootstrap script --target-revision externalValues: enabled: true # Uses multi-source pattern repoUrl: "http://gitea-http.cf-gitea.svc:3000/cluster-org/cluster-values.git" - targetRevision: main + targetRevision: main # always main for local cluster overrides ``` **Purpose**: Self-contained cluster-native GitOps with local Gitea @@ -31,7 +31,7 @@ externalValues: ```yaml clusterForge: repoUrl: "https://github.com/silogen/cluster-forge.git" - targetRevision: v1.8.0-rc2 + targetRevision: # filled by bootstrap script --target-revision (e.g., v1.8.0, feature-branch) externalValues: enabled: false # Single source from GitHub From 8070fe61b7642a216e77330f3b1bdca5a5021376 Mon Sep 17 00:00:00 2001 From: Daniel Vaskivaara Date: Mon, 2 
Mar 2026 13:28:29 +0200 Subject: [PATCH 086/115] fix: don't clobber cluster-values revision --- scripts/bootstrap.sh | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/scripts/bootstrap.sh b/scripts/bootstrap.sh index f0f584ae..30a44c06 100755 --- a/scripts/bootstrap.sh +++ b/scripts/bootstrap.sh @@ -482,12 +482,9 @@ apply_cluster_forge_parent_app() { --show-only templates/cluster-forge.yaml \ --values "${SOURCE_ROOT}/root/${VALUES_FILE}" \ --values "${SOURCE_ROOT}/root/${SIZE_VALUES_FILE}" \ - --set global.domain="${DOMAIN}" \ --set global.clusterSize="${SIZE_VALUES_FILE}" \ + --set global.domain="${DOMAIN}" \ --set clusterForge.targetRevision="${TARGET_REVISION}" \ - --set externalValues.targetRevision="${TARGET_REVISION}" \ - --set externalValues.repoUrl="http://gitea-http.cf-gitea.svc:3000/cluster-org/cluster-values.git" \ - --set clusterForge.repoUrl="http://gitea-http.cf-gitea.svc:3000/cluster-forge.git" \ --namespace argocd \ --kube-version "${KUBE_VERSION}" | apply_or_template -f - } From 4bb510f8c659f515622eae4648c667c88fd99b32 Mon Sep 17 00:00:00 2001 From: Daniel Vaskivaara Date: Mon, 2 Mar 2026 13:29:00 +0200 Subject: [PATCH 087/115] ux: have gitea-init-job retry with same job so you don't see multiple failed pods on bootstrap --- .../init-gitea-job/templates/cf-init-gitea-cm.yaml | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/scripts/init-gitea-job/templates/cf-init-gitea-cm.yaml b/scripts/init-gitea-job/templates/cf-init-gitea-cm.yaml index 36286875..f7201567 100644 --- a/scripts/init-gitea-job/templates/cf-init-gitea-cm.yaml +++ b/scripts/init-gitea-job/templates/cf-init-gitea-cm.yaml @@ -12,6 +12,20 @@ data: GITEA_URL="${GITEA_URL:-http://gitea-http.cf-gitea.svc:3000}" GITEA_ADMIN_USER="${GITEA_ADMIN_USER:-silogen-admin}" + echo "Waiting for Gitea service to be ready..." 
+ for i in {1..30}; do + if nslookup gitea-http.cf-gitea.svc >/dev/null 2>&1; then + echo "DNS resolution successful for gitea-http.cf-gitea.svc" + break + fi + if [ $i -eq 30 ]; then + echo "ERROR: DNS resolution failed after 30 attempts" + exit 1 + fi + echo "Waiting for DNS resolution... attempt $i/30" + sleep 2 + done + echo "Step 0: Create admin access token" GITEA_TOKEN=$(kubectl -n cf-gitea exec deploy/gitea -c gitea -- \ gitea admin user generate-access-token --raw --username "$GITEA_ADMIN_USER" --token-name "api-token-$(date +%s)" --scopes all From 2341e856731b8e81e756843a708255b4252cdc65 Mon Sep 17 00:00:00 2001 From: Daniel Vaskivaara Date: Mon, 2 Mar 2026 13:33:53 +0200 Subject: [PATCH 088/115] perf: improve gitea init job readiness check --- scripts/init-gitea-job/templates/cf-init-gitea-cm.yaml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/scripts/init-gitea-job/templates/cf-init-gitea-cm.yaml b/scripts/init-gitea-job/templates/cf-init-gitea-cm.yaml index f7201567..fde53a2f 100644 --- a/scripts/init-gitea-job/templates/cf-init-gitea-cm.yaml +++ b/scripts/init-gitea-job/templates/cf-init-gitea-cm.yaml @@ -14,15 +14,15 @@ data: echo "Waiting for Gitea service to be ready..." for i in {1..30}; do - if nslookup gitea-http.cf-gitea.svc >/dev/null 2>&1; then - echo "DNS resolution successful for gitea-http.cf-gitea.svc" + if curl -s --max-time 5 "${GITEA_URL}" >/dev/null 2>&1; then + echo "Gitea service is ready and responding" break fi if [ $i -eq 30 ]; then - echo "ERROR: DNS resolution failed after 30 attempts" + echo "ERROR: Gitea service not ready after 30 attempts" exit 1 fi - echo "Waiting for DNS resolution... attempt $i/30" + echo "Waiting for Gitea to be ready... 
attempt $i/30" sleep 2 done From 382079c8ccc4700cc9ca589047975fe42749b85e Mon Sep 17 00:00:00 2001 From: Daniel Vaskivaara Date: Mon, 2 Mar 2026 13:37:29 +0200 Subject: [PATCH 089/115] ux: rm gitea sessionAffinity warning --- root/values.yaml | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/root/values.yaml b/root/values.yaml index 681aa34d..c507ee54 100644 --- a/root/values.yaml +++ b/root/values.yaml @@ -254,10 +254,8 @@ apps: postgresql-ha: enabled: false service: - http: - sessionAffinity: "None" - ssh: - sessionAffinity: "None" + http: {} + ssh: {} strategy: type: "Recreate" test: From 093bb880a8c52191d59dedaebf719b90340b1b10 Mon Sep 17 00:00:00 2001 From: Daniel Vaskivaara Date: Mon, 2 Mar 2026 13:59:48 +0200 Subject: [PATCH 090/115] perf: tweak minio create user cronjob, as got OOMKilled --- .../templates/minio-create-user-cronjob.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sources/minio-tenant-config/templates/minio-create-user-cronjob.yaml b/sources/minio-tenant-config/templates/minio-create-user-cronjob.yaml index 952023b8..75bb5b1c 100644 --- a/sources/minio-tenant-config/templates/minio-create-user-cronjob.yaml +++ b/sources/minio-tenant-config/templates/minio-create-user-cronjob.yaml @@ -18,10 +18,10 @@ spec: name: mc resources: requests: - memory: "32Mi" + memory: "64Mi" cpu: "100m" limits: - memory: "128Mi" + memory: "256Mi" cpu: "500m" volumeMounts: - mountPath: /tmp/minio-config From e4b4dbc4429031fdf0ab4223351f8151ff1ad375 Mon Sep 17 00:00:00 2001 From: Daniel Vaskivaara Date: Mon, 2 Mar 2026 22:49:51 +0200 Subject: [PATCH 091/115] fix: remove duplicate ignoreDifferences (and scope for entire array, not just element 0) --- root/values_medium.yaml | 119 ++++++++++++++++++++-------------------- 1 file changed, 59 insertions(+), 60 deletions(-) diff --git a/root/values_medium.yaml b/root/values_medium.yaml index b0fabeca..aad172c8 100644 --- a/root/values_medium.yaml +++ b/root/values_medium.yaml 
@@ -1,9 +1,64 @@ -# MEDIUM CLUSTER: All apps enabled (inherited from base values.yaml) -# Add Kyverno policy for local-path access mode mutation - -# Medium & Small clusters add local-path storage policy for RWX→RWO conversion # Medium & Small clusters add local-path storage policy for RWX→RWO conversion + +enabledApps: + - aim-cluster-model-source + - airm + - amd-gpu-operator + - amd-gpu-operator-config + - appwrapper + - argocd + - argocd-config + - cert-manager + - cluster-auth + - cluster-auth-config + - cnpg-operator + - external-secrets + - external-secrets-config + - gateway-api + - gitea + - gitea-config + - kaiwo + - kaiwo-config + - kaiwo-crds + - keda + - kedify-otel + - keycloak + - kgateway + - kgateway-config + - kgateway-crds + - kserve + - kserve-crds + - kueue + - kueue-config + - kuberay-operator + - kyverno + - kyverno-config + - kyverno-policies-base + - kyverno-policies-storage-local-path + - metallb + - minio-operator + - minio-tenant + - minio-tenant-config + - openbao + - openbao-config + - opentelemetry-operator + - otel-lgtm-stack + - prometheus-crds + - rabbitmq + + apps: + # Modular Kyverno policy applications (only the storage-local-path addition) + kyverno-policies-storage-local-path: + namespace: kyverno + path: kyverno-policies/storage-local-path + syncWave: -20 + ignoreDifferences: + - group: kyverno.io + kind: ClusterPolicy + jsonPointers: + - /spec/rules/*/skipBackgroundRequests + - /spec/rules/*/validate/allowExistingViolations argocd: valuesObject: applicationSet: @@ -66,16 +121,6 @@ apps: # Increase memory resources for Keycloak to prevent OOMKilled during initialization # Medium preset provides 1536Mi memory limit vs small preset's 768Mi resourcesPreset: "medium" - kyverno-policies-storage-local-path: - ignoreDifferences: - - group: kyverno.io - jsonPointers: - - /spec/rules/0/skipBackgroundRequests - - /spec/rules/0/validate/allowExistingViolations - kind: ClusterPolicy - namespace: kyverno - path: 
kyverno-policies/storage-local-path - syncWave: -20 minio-tenant: valuesObject: tenant: @@ -154,49 +199,3 @@ apps: requests: storage: 25Gi storageClassName: direct -enabledApps: - - aim-cluster-model-source - - airm - - amd-gpu-operator - - amd-gpu-operator-config - - appwrapper - - argocd - - argocd-config - - cert-manager - - cluster-auth - - cluster-auth-config - - cnpg-operator - - external-secrets - - external-secrets-config - - gateway-api - - gitea - - gitea-config - - kaiwo - - kaiwo-config - - kaiwo-crds - - keda - - kedify-otel - - keycloak - - kgateway - - kgateway-config - - kgateway-crds - - kserve - - kserve-crds - - kueue - - kueue-config - - kuberay-operator - - kyverno - - kyverno-config - - kyverno-policies-base - - kyverno-policies-storage-local-path - - metallb - - minio-operator - - minio-tenant - - minio-tenant-config - - openbao - - openbao-config - - openbao-init - - opentelemetry-operator - - otel-lgtm-stack - - prometheus-crds - - rabbitmq \ No newline at end of file From 8fbfa7bc8c4f897ec29c794c38fa10b36032d639 Mon Sep 17 00:00:00 2001 From: brownzebra Date: Tue, 3 Mar 2026 11:19:16 +0200 Subject: [PATCH 092/115] EAI-1238 deprecate scripts/init-openbao-job/values.yaml --- scripts/init-openbao-job/values.yaml | 19 ------------------- 1 file changed, 19 deletions(-) delete mode 100644 scripts/init-openbao-job/values.yaml diff --git a/scripts/init-openbao-job/values.yaml b/scripts/init-openbao-job/values.yaml deleted file mode 100644 index 4c682279..00000000 --- a/scripts/init-openbao-job/values.yaml +++ /dev/null @@ -1,19 +0,0 @@ -domain: # to be filled by bootstrap script - -# OpenBao server configuration from merged values -server: - ha: - enabled: false # from merged config - replicas: 1 # from merged config - raft: - enabled: false # from merged config - dataStorage: - size: 5Gi # from merged config - storageClass: "" # from merged config - resources: # from merged config - requests: - cpu: "250m" - memory: "512Mi" - limits: - cpu: 
"1000m" - memory: "2Gi" \ No newline at end of file From 9b20d3c5ad221860bcd4937e5cae169559e3b925 Mon Sep 17 00:00:00 2001 From: Daniel Vaskivaara Date: Mon, 2 Mar 2026 22:26:34 +0200 Subject: [PATCH 093/115] feat: implement AIRM_IMAGE_REPOSITORY env when running bootstrap.sh (passed from bloom or just set inline) --- scripts/bootstrap.sh | 19 ++++++++++++++----- .../templates/cf-init-gitea-cm.yaml | 18 ++++++++++++++++++ scripts/init-gitea-job/values.yaml | 3 +++ 3 files changed, 35 insertions(+), 5 deletions(-) diff --git a/scripts/bootstrap.sh b/scripts/bootstrap.sh index 30a44c06..d84f434c 100755 --- a/scripts/bootstrap.sh +++ b/scripts/bootstrap.sh @@ -415,11 +415,20 @@ bootstrap_gitea() { --values /tmp/gitea_bootstrap_values.yaml \ --kube-version=${KUBE_VERSION} | apply_or_template -f - - helm template --release-name gitea-init ${SOURCE_ROOT}/scripts/init-gitea-job \ - --set clusterSize="${SIZE_VALUES_FILE}" \ - --set domain="${DOMAIN}" \ - --set targetRevision="${TARGET_REVISION}" \ - --kube-version=${KUBE_VERSION} | apply_or_template -f - + # Gitea Init Job + HELM_ARGS="--release-name gitea-init ${SOURCE_ROOT}/scripts/init-gitea-job \ + --set clusterSize=${SIZE_VALUES_FILE:-values_${CLUSTER_SIZE}.yaml} \ + --set domain=${DOMAIN} \ + --set targetRevision=${TARGET_REVISION} \ + --kube-version=${KUBE_VERSION}" + + # Only add airmImageRepository if AIRM_IMAGE_REPOSITORY is set and non-empty + if [ -n "${AIRM_IMAGE_REPOSITORY:-}" ]; then + HELM_ARGS="${HELM_ARGS} --set airmImageRepository=${AIRM_IMAGE_REPOSITORY}" + fi + + helm template ${HELM_ARGS} | kubectl apply -f - + if [ "$TEMPLATE_ONLY" = false ]; then kubectl rollout status deploy/gitea -n cf-gitea --timeout="${DEFAULT_TIMEOUT}" kubectl wait --for=condition=complete --timeout="${DEFAULT_TIMEOUT}" job/gitea-init-job -n cf-gitea diff --git a/scripts/init-gitea-job/templates/cf-init-gitea-cm.yaml b/scripts/init-gitea-job/templates/cf-init-gitea-cm.yaml index fde53a2f..c0a45bba 100644 --- 
a/scripts/init-gitea-job/templates/cf-init-gitea-cm.yaml +++ b/scripts/init-gitea-job/templates/cf-init-gitea-cm.yaml @@ -148,6 +148,24 @@ data: global: clusterSize: {{ .Values.clusterSize }} domain: DOMAIN_PLACEHOLDER + {{- if .Values.airmImageRepository }} + + # AIRM Image Repository Configuration + airm-api: + airm: + backend: + image: + repository: {{ .Values.airmImageRepository }}/airm-api + frontend: + image: + repository: {{ .Values.airmImageRepository }}/airm-ui + + airm-dispatcher: + airm: + dispatcher: + image: + repository: {{ .Values.airmImageRepository }}/airm-dispatcher + {{- end }} EOF diff --git a/scripts/init-gitea-job/values.yaml b/scripts/init-gitea-job/values.yaml index 17075359..25886155 100644 --- a/scripts/init-gitea-job/values.yaml +++ b/scripts/init-gitea-job/values.yaml @@ -5,3 +5,6 @@ clusterSize: null domain: null # Git revision to deploy (injected by bootstrap script) targetRevision: v1.8.0-rc4 +# Base image repository for AIRM components (injected by bootstrap script when specified) +# Example: "ghcr.io/silogen" +airmImageRepository: null From 45176abb6779184e0eb53cb8f71950aee34a9a75 Mon Sep 17 00:00:00 2001 From: Daniel Vaskivaara Date: Tue, 3 Mar 2026 12:59:36 +0200 Subject: [PATCH 094/115] docs: document AIRM_IMAGE_REPOSITORY feature --- PRD.md | 2 ++ README.md | 1 + docs/bootstrap_guide.md | 27 +++++++++++++++++++++++++++ docs/values_inheritance_pattern.md | 24 ++++++++++++++++++++++++ 4 files changed, 54 insertions(+) diff --git a/PRD.md b/PRD.md index 7883a3f2..cd725ba2 100644 --- a/PRD.md +++ b/PRD.md @@ -158,6 +158,7 @@ The cluster-forge Application uses multi-source feature when externalValues.enab **Layer 6: AIRM Application** (Sync Wave 0) - AIRM 0.3.2 - AMD Resource Manager application suite +- Configurable image repositories for custom registries and air-gapped deployments - AIM Cluster Model Source - Cluster resource models for AIRM ### Repository Structure @@ -486,6 +487,7 @@ Kueue manages scheduling for: **FR1: AIRM 
Platform Delivery** - Deploy AMD Resource Manager (AIRM) 0.3.2 with UI and API +- Support configurable image repositories via `airmImageRepository` bootstrap parameter - Provide model serving with KServe v0.16.0 - Support distributed computing via KubeRay Operator 1.4.2 - Enable workflow orchestration through Kaiwo v0.2.0-rc11 diff --git a/README.md b/README.md index 4b9cceeb..c62736ed 100644 --- a/README.md +++ b/README.md @@ -125,6 +125,7 @@ See [Values Inheritance Pattern](docs/values_inheritance_pattern.md) for detaile ### Layer 6: AIRM Application - **AIRM 0.3.2** - AMD Resource Manager application suite - **AIM Cluster Model Source** - Cluster resource models for AIRM +- **Configurable Image Repositories** - Supports custom container registries via cluster-bloom `AIRM_IMAGE_REPOSITORY` parameter ## � Configuration diff --git a/docs/bootstrap_guide.md b/docs/bootstrap_guide.md index fe64f52e..67e4ce77 100644 --- a/docs/bootstrap_guide.md +++ b/docs/bootstrap_guide.md @@ -32,6 +32,10 @@ This guide explains how to bootstrap a complete GitOps environment using Cluster - **--skip-deps**: Skip dependency checking (for advanced users) - **--help**, **-h**: Show usage information +### Environment Variables + +- **AIRM_IMAGE_REPOSITORY**: Optional base repository URL for AIRM container images. When set, overrides default AIRM image repositories in the cluster-values configuration. 
Example: `ghcr.io/mycompany` + ### Examples ```bash @@ -49,6 +53,13 @@ This guide explains how to bootstrap a complete GitOps environment using Cluster # Deploy from specific git branch ./scripts/bootstrap.sh example.com --target-revision=feature-branch +./scripts/bootstrap.sh example.com --CLUSTER_SIZE=large + +# Custom AIRM image repository +AIRM_IMAGE_REPOSITORY=ghcr.io/mycompany ./scripts/bootstrap.sh example.com + +# Air-gapped deployment with local registry +AIRM_IMAGE_REPOSITORY=harbor.internal.com/airm ./scripts/bootstrap.sh 192.168.1.100.nip.io --CLUSTER_SIZE=small ``` ## How It Works @@ -163,6 +174,22 @@ ClusterForge uses a layered configuration approach with YAML merge semantics: global: clusterSize: medium # Set by --cluster-size flag domain: example.com # Set by domain argument + + # AIRM Image Repository Configuration (optional, only when AIRM_IMAGE_REPOSITORY is set) + airm-api: + airm: + backend: + image: + repository: ghcr.io/mycompany/airm-api + frontend: + image: + repository: ghcr.io/mycompany/airm-ui + + airm-dispatcher: + airm: + dispatcher: + image: + repository: ghcr.io/mycompany/airm-dispatcher ``` ### Value Merging Order diff --git a/docs/values_inheritance_pattern.md b/docs/values_inheritance_pattern.md index 634fcd3c..960667d1 100644 --- a/docs/values_inheritance_pattern.md +++ b/docs/values_inheritance_pattern.md @@ -164,6 +164,30 @@ git push # ArgoCD automatically detects and syncs the changes ``` +**Example: AIRM Image Repository Configuration** + +To configure custom AIRM image repositories post-bootstrap, modify `cluster-values/values.yaml`: + +```yaml +# Custom AIRM image repositories for private registry +airm-api: + airm: + backend: + image: + repository: harbor.mycompany.com/airm/airm-api + frontend: + image: + repository: harbor.mycompany.com/airm/airm-ui + +airm-dispatcher: + airm: + dispatcher: + image: + repository: harbor.mycompany.com/airm/airm-dispatcher +``` + +This allows deployment from private registries, air-gapped 
environments, or custom built images. + ### Configuration Version Control Benefits of the dual-repository pattern: From 1ad26f8a8acc8905db5bbe8ae1d97fe9a19716fe Mon Sep 17 00:00:00 2001 From: Daniel Vaskivaara Date: Tue, 3 Mar 2026 13:49:01 +0200 Subject: [PATCH 095/115] feat: refactor AIRM_IMAGE_REPOSITORY as command arg as opposed to env injection --- scripts/bootstrap.sh | 40 ++++++++++++++++++++++++++-------------- 1 file changed, 26 insertions(+), 14 deletions(-) diff --git a/scripts/bootstrap.sh b/scripts/bootstrap.sh index d84f434c..b60e46a8 100755 --- a/scripts/bootstrap.sh +++ b/scripts/bootstrap.sh @@ -197,23 +197,36 @@ parse_args() { APPS="${1#*=}" shift ;; + --airm-image-repository) + if [ -z "$2" ]; then + echo "ERROR: --airm-image-repository requires an argument" + exit 1 + fi + AIRM_IMAGE_REPOSITORY="$2" + shift 2 + ;; + --airm-image-repository=*) + AIRM_IMAGE_REPOSITORY="${1#*=}" + shift + ;; --help|-h) cat < [values_file] Arguments: - domain Required. Cluster domain (e.g., example.com) - values_file Optional. Values .yaml file to use, default: root/values.yaml + domain REQUIRED. Cluster domain (e.g., myIp.nip.io) + values_file Optional. Values .yaml file to use, default: root/values.yaml Options: - --apps=APP1,APP2 Deploy only specified components - options: namespaces, argocd, gitea, cluster-forge, or any cluster-forge child app (see values.yaml for app names) - Use with --template-only to render instead of applying - --cluster-size, -s options: [small|medium|large], default: medium - --target-revision, -r cluster-forge git revision for ArgoCD to sync from - options: [tag|commit_hash|branch_name], default: $LATEST_RELEASE - --template-only, -t Output YAML manifests to stdout instead of applying to cluster - --skip-deps Skip dependency checking (for advanced users) + --airm-image-repository=url Custom AIRM image repository for gitea-init job (e.g., ghcr.io/silogen, requires regcreds) + --apps=app1[,app2,...] 
Deploy (kubectl apply) specified components only + options: namespaces, argocd, gitea, cluster-forge, or any cluster-forge child app (see values.yaml for app names) + + --cluster-size=[size], -s [size] can be one of small|medium|large, default: medium + --help, -h Show this help message and exit + --skip-deps Skip dependency checking (not recommended) + --target-revision, -r Git revision for ArgoCD to sync from, [tag|commit_hash|branch_name], default: $LATEST_RELEASE + --template-only, -t Output YAML manifests to stdout instead of applying to cluster Examples: @@ -225,10 +238,9 @@ parse_args() { $0 example.com --apps=keycloak -t Bootstrap Behavior: - • Bootstrap deploys ArgoCD + Gitea directly (essential infrastructure) - • cluster-forge parent app then deployed to manage remaining apps including OpenBao - • ArgoCD syncs remaining apps from specified target revision with proper syncWave ordering - • Direct deployment ensures proper initialization order and timing + • deploys ArgoCD + Gitea directly (essential infrastructure) + • apply the cluster-forge application manifest (parent app only) + • ArgoCD syncs remaining apps from specified target revision, respecting syncWaves and dependencies HELP_OUTPUT exit 0 ;; From 466b1eb4dd93f561a5b297eaa0d84dfcb6e719d6 Mon Sep 17 00:00:00 2001 From: Daniel Vaskivaara Date: Tue, 3 Mar 2026 15:16:03 +0200 Subject: [PATCH 096/115] fix: revert to dedicated openbao deploy before cluster-forge parent application --- root/values_large_original.yaml | 64 ---------- root/values_medium_original.yaml | 212 ------------------------------- root/values_small_original.yaml | 189 --------------------------- scripts/bootstrap.sh | 92 +++++++++++--- 4 files changed, 76 insertions(+), 481 deletions(-) delete mode 100644 root/values_large_original.yaml delete mode 100644 root/values_medium_original.yaml delete mode 100644 root/values_small_original.yaml diff --git a/root/values_large_original.yaml b/root/values_large_original.yaml deleted file
mode 100644 index cdaf0544..00000000 --- a/root/values_large_original.yaml +++ /dev/null @@ -1,64 +0,0 @@ -enabledApps: - - aim-cluster-model-source - - airm - - amd-gpu-operator - - amd-gpu-operator-config - - appwrapper - - argocd - - argocd-config - - cert-manager - - cluster-auth - - cluster-auth-config - - cnpg-operator - - external-secrets - - external-secrets-config - - gateway-api - - gitea - - gitea-config - - kaiwo - - kaiwo-config - - kaiwo-crds - - keda - - kedify-otel - - keycloak - - kgateway - - kgateway-config - - kgateway-crds - - kserve - - kserve-crds - - kueue - - kueue-config - - kuberay-operator - - kyverno - - kyverno-config - - kyverno-policies-base - - metallb - - minio-operator - - minio-tenant - - minio-tenant-config - - openbao - - openbao-init - - openbao-config - - opentelemetry-operator - - otel-lgtm-stack - - prometheus-crds - - rabbitmq - -apps: - minio-tenant: - valuesObject: - tenant: - pools: - - name: pool-0 - servers: 1 - size: 500Gi - storageClassName: direct - volumesPerServer: 1 - openbao: - valuesObject: - server: - ha: - enabled: true - replicas: 3 - raft: - enabled: true diff --git a/root/values_medium_original.yaml b/root/values_medium_original.yaml deleted file mode 100644 index a71a5b2f..00000000 --- a/root/values_medium_original.yaml +++ /dev/null @@ -1,212 +0,0 @@ -# MEDIUM CLUSTER: All apps enabled (inherited from base values.yaml) -# Add Kyverno policy for local-path access mode mutation - -# Medium & Small clusters add local-path storage policy for RWX→RWO conversion -# Medium & Small clusters add local-path storage policy for RWX→RWO conversion -enabledApps: - - aim-cluster-model-source - - airm - - amd-gpu-operator - - amd-gpu-operator-config - - appwrapper - - argocd - - argocd-config - - cert-manager - - cluster-auth - - cluster-auth-config - - cnpg-operator - - external-secrets - - external-secrets-config - - gateway-api - - gitea - - gitea-config - - kaiwo - - kaiwo-config - - kaiwo-crds - - keda - - 
kedify-otel - - keycloak - - kgateway - - kgateway-config - - kgateway-crds - - kserve - - kserve-crds - - kueue - - kueue-config - - kuberay-operator - - kyverno - - kyverno-config - - kyverno-policies-base - - kyverno-policies-storage-local-path - - metallb - - minio-operator - - minio-tenant - - minio-tenant-config - - openbao - - openbao-config - - openbao-init - - opentelemetry-operator - - otel-lgtm-stack - - prometheus-crds - - rabbitmq - - -apps: - # Modular Kyverno policy applications (only the storage-local-path addition) - kyverno-policies-storage-local-path: - namespace: kyverno - path: kyverno-policies/storage-local-path - syncWave: -20 - ignoreDifferences: - - group: kyverno.io - kind: ClusterPolicy - jsonPointers: - - /spec/rules/0/skipBackgroundRequests - - /spec/rules/0/validate/allowExistingViolations - argocd: - valuesObject: - applicationSet: - replicas: 1 - controller: - replicas: 1 - resources: - limits: - cpu: "1000m" - memory: "2Gi" - requests: - cpu: "500m" - memory: "1Gi" - redis-ha: - enabled: false - redis: - resources: - limits: - cpu: "500m" - memory: "1Gi" - requests: - cpu: "250m" - memory: "512Mi" - repoServer: - replicas: 1 - resources: - limits: - cpu: "500m" - memory: "1Gi" - requests: - cpu: "250m" - memory: "512Mi" - server: - replicas: 1 - resources: - limits: - cpu: "500m" - memory: "1Gi" - requests: - cpu: "125m" - memory: "256Mi" - - minio-tenant: - valuesObject: - tenant: - buckets: - - name: default-bucket - objectLock: true - - name: models - objectLock: true - - name: datasets - objectLock: false - pools: - - name: pool-0 - servers: 1 - size: 2Ti - storageClassName: direct - volumesPerServer: 2 - resources: - limits: - cpu: "4000m" - memory: "8Gi" - requests: - cpu: "1000m" - memory: "2Gi" - - openbao: - valuesObject: - server: - ha: - enabled: false - replicas: 1 - raft: - enabled: false - dataStorage: - size: 5Gi - storageClass: direct - resources: - limits: - cpu: "1000m" - memory: "2Gi" - requests: - cpu: "250m" - 
memory: "512Mi" - - prometheus: - valuesObject: - prometheus: - prometheusSpec: - retention: 15d - retentionSize: 20GB - resources: - limits: - cpu: "2000m" - memory: "4Gi" - requests: - cpu: "500m" - memory: "1Gi" - storageSpec: - volumeClaimTemplate: - spec: - accessModes: - - ReadWriteOnce - storageClassName: direct - resources: - requests: - storage: 25Gi - - grafana: - valuesObject: - replicas: 1 - resources: - limits: - cpu: "1000m" - memory: "2Gi" - requests: - cpu: "250m" - memory: "512Mi" - persistence: - enabled: true - size: 5Gi - storageClassName: direct - accessModes: - - ReadWriteOnce - - keycloak: - valuesObject: - # Increase memory resources for Keycloak to prevent OOMKilled during initialization - # Medium preset provides 1536Mi memory limit vs small preset's 768Mi - resourcesPreset: "medium" - - otel-lgtm-stack: - valuesObject: - collectors: - resources: - metrics: - limits: - memory: 4Gi - cpu: '1' - - opentelemetry-operator: - valuesObject: - manager: - resources: - requests: - cpu: "250m" - memory: "512Mi" diff --git a/root/values_small_original.yaml b/root/values_small_original.yaml deleted file mode 100644 index c3a7282c..00000000 --- a/root/values_small_original.yaml +++ /dev/null @@ -1,189 +0,0 @@ -# SMALL CLUSTER: All apps enabled (inherited from base values.yaml) -# Add Kyverno policy for local-path access mode mutation - -# Medium & Small clusters add local-path storage policy for RWX→RWO conversion -enabledApps: - - aim-cluster-model-source - - airm - - amd-gpu-operator - - amd-gpu-operator-config - - appwrapper - - argocd - - argocd-config - - cert-manager - - cluster-auth - - cluster-auth-config - - cnpg-operator - - external-secrets - - external-secrets-config - - gateway-api - - gitea - - gitea-config - - kaiwo - - kaiwo-config - - kaiwo-crds - - keda - - kedify-otel - - keycloak - - kgateway - - kgateway-config - - kgateway-crds - - kserve - - kserve-crds - - kuberay-operator - - kueue - - kueue-config - - kyverno - - 
kyverno-config - - kyverno-policies-base # applicable to all cluster sizes - - kyverno-policies-storage-local-path # small & medium cluster sizes only - - metallb - - minio-operator - - minio-tenant - - minio-tenant-config - - openbao - - openbao-config - - openbao-init - - opentelemetry-operator - - otel-lgtm-stack - - prometheus-crds - - rabbitmq - -apps: - # Modular Kyverno policy applications (only the storage-local-path addition) - kyverno-policies-storage-local-path: - namespace: kyverno - path: kyverno-policies/storage-local-path - source: clusterForge - syncOptions: - - CreateNamespace=true - ignoreDifferences: [] - wave: 26 # Deploy after base policies - syncWave: - - group: kyverno.io - kind: ClusterPolicy - - argocd: - valuesObject: - applicationSet: - replicas: 1 - controller: - replicas: 1 - resources: - limits: - cpu: "2000m" - memory: "4Gi" - requests: - cpu: "500m" - memory: "1Gi" - redis-ha: - enabled: false - redis: - resources: - limits: - cpu: "1000m" - memory: "2Gi" - requests: - cpu: "250m" - memory: "512Mi" - repoServer: - replicas: 1 - resources: - limits: - cpu: "1000m" - memory: "2Gi" - requests: - cpu: "250m" - memory: "512Mi" - server: - replicas: 1 - resources: - limits: - cpu: "500m" - memory: "1Gi" - requests: - cpu: "125m" - memory: "256Mi" - - minio-tenant: - valuesObject: - tenant: - buckets: - - name: default-bucket - objectLock: true - - name: models - objectLock: true - - name: datasets - objectLock: false - pools: - - name: pool-0 - servers: 1 - size: 2Ti - storageClassName: local-path - volumesPerServer: 2 - resources: - limits: - cpu: "4000m" - memory: "8Gi" - requests: - cpu: "1000m" - memory: "2Gi" - - openbao: - valuesObject: - server: - ha: - enabled: false - replicas: 1 - raft: - enabled: false - dataStorage: - size: 5Gi - storageClass: local-path - resources: - limits: - cpu: "1000m" - memory: "2Gi" - requests: - cpu: "250m" - memory: "512Mi" - - prometheus: - valuesObject: - prometheus: - prometheusSpec: - retention: 
15d - retentionSize: 20GB - resources: - limits: - cpu: "2000m" - memory: "4Gi" - requests: - cpu: "500m" - memory: "1Gi" - storageSpec: - volumeClaimTemplate: - spec: - accessModes: - - ReadWriteOnce - storageClassName: local-path - resources: - requests: - storage: 25Gi - - grafana: - valuesObject: - replicas: 1 - resources: - limits: - cpu: "1000m" - memory: "2Gi" - requests: - cpu: "250m" - memory: "512Mi" - persistence: - enabled: true - size: 5Gi - storageClassName: local-path - accessModes: - - ReadWriteOnce diff --git a/scripts/bootstrap.sh b/scripts/bootstrap.sh index b60e46a8..2ee36cb9 100755 --- a/scripts/bootstrap.sh +++ b/scripts/bootstrap.sh @@ -220,7 +220,7 @@ parse_args() { Options: --airm-image-repository=url Custom AIRM image repository for gitea-init job (e.g., ghcr.io/silogen, requires regcreds) --apps=app1[,app2,...] Deploy (kubectl apply) specified components only - options: namespaces, argocd, gitea, cluster-forge, or any cluster-forge child app (see values.yaml for app names) + options: namespaces, argocd, openbao, gitea, cluster-forge, or any cluster-forge child app (see values.yaml for app names) @@ -234,11 +234,11 @@ parse_args() { $0 112.100.97.17.nip.io $0 dev.example.com --cluster-size=small --target-revision=v1.8.0 $0 dev.example.com -s=small -r=feature-branch - $0 example.com --apps=openbao,openbao-init + $0 example.com --apps=openbao $0 example.com --apps=keycloak -t Bootstrap Behavior: - • deploys ArgoCD + Gitea directly (essential infrastructure) + • deploys ArgoCD + OpenBao + Gitea directly (essential infrastructure) • apply the cluster-forge application manifest (parent app only) • ArgoCD syncs remaining apps from specified target revision, respecting syncWaves and dependencies HELP_OUTPUT @@ -331,7 +331,7 @@ apply_or_template() { # Create namespaces create_namespaces() { - for ns in argocd cf-gitea; do
+ for ns in argocd cf-gitea cf-openbao; do kubectl create ns "$ns" --dry-run=client -o yaml | apply_or_template -f - done } @@ -369,7 +369,59 @@ bootstrap_argocd() { fi } -# OpenBao is now deployed by ArgoCD with syncWave -70/-60 +# Extract OpenBao values using yq +extract_openbao_values() { + # Create temporary values file for OpenBao bootstrap + cat > /tmp/openbao_bootstrap_values.yaml << EOF +# OpenBao bootstrap values +EOF + + # Extract and merge OpenBao values from the apps structure + yq eval '.apps.openbao.valuesObject' "${SOURCE_ROOT}/root/${VALUES_FILE}" >> /tmp/openbao_bootstrap_values.yaml + if [ -f "${SOURCE_ROOT}/root/${SIZE_VALUES_FILE}" ]; then + if yq eval '.apps.openbao.valuesObject // ""' "${SOURCE_ROOT}/root/${SIZE_VALUES_FILE}" | grep -q .; then + yq eval '.apps.openbao.valuesObject' "${SOURCE_ROOT}/root/${SIZE_VALUES_FILE}" | yq eval-all 'select(fileIndex == 0) * select(fileIndex == 1)' /tmp/openbao_bootstrap_values.yaml - > /tmp/openbao_bootstrap_values_merged.yaml + mv /tmp/openbao_bootstrap_values_merged.yaml /tmp/openbao_bootstrap_values.yaml + fi + fi +} + +bootstrap_openbao() { + echo "=== OpenBao Bootstrap ===" + + # Get OpenBao version from app path + OPENBAO_VERSION=$(yq eval '.apps.openbao.path' "${SOURCE_ROOT}/root/${VALUES_FILE}" | cut -d'/' -f2) + echo "OpenBao version: $OPENBAO_VERSION" + + extract_openbao_values + + # Use server-side apply to match ArgoCD's field management strategy + helm template --release-name openbao ${SOURCE_ROOT}/sources/openbao/${OPENBAO_VERSION} --namespace cf-openbao \ + --values /tmp/openbao_bootstrap_values.yaml \ + --set ui.enabled=true \ + --kube-version=${KUBE_VERSION} | apply_or_template --server-side --field-manager=argocd-controller --force-conflicts -f - + + if [ "$TEMPLATE_ONLY" = false ]; then + kubectl wait --for=jsonpath='{.status.phase}'=Running pod/openbao-0 -n cf-openbao --timeout=100s + + # Create initial secrets config for init job (separate from ArgoCD-managed version) + echo "Creating 
initial OpenBao secrets configuration..." + cat ${SOURCE_ROOT}/sources/openbao-config/0.1.0/templates/openbao-secret-manager-cm.yaml | \ + sed "s|name: openbao-secret-manager-scripts|name: openbao-secret-manager-scripts-init|g" | kubectl apply -f - + + # Create initial secrets config for init job (separate from ArgoCD-managed version) + cat ${SOURCE_ROOT}/sources/openbao-config/0.1.0/templates/openbao-secret-definitions.yaml | \ + sed "s|{{ .Values.domain }}|${DOMAIN}|g" | \ + sed "s|name: openbao-secrets-config|name: openbao-secrets-init-config|g" | kubectl apply -f - + + # Pass OpenBao configuration to init script + helm template --release-name openbao-init ${SOURCE_ROOT}/scripts/init-openbao-job \ + --values /tmp/openbao_bootstrap_values.yaml \ + --set domain="${DOMAIN}" \ + --kube-version=${KUBE_VERSION} | kubectl apply -f - + kubectl wait --for=condition=complete --timeout=300s job/openbao-init-job -n cf-openbao + fi +} # Extract Gitea values using yq extract_gitea_values() { @@ -545,7 +597,7 @@ main() { IFS=',' read -ra APP_ARRAY <<< "$APPS" for app in "${APP_ARRAY[@]}"; do case "$app" in - namespaces|argocd|gitea|cluster-forge) + namespaces|argocd|openbao|gitea|cluster-forge) has_bootstrap_apps=true ;; *) @@ -557,7 +609,7 @@ main() { child_apps="$child_apps,$app" fi else - echo "WARNING: Unknown app '$app'. Available bootstrap apps: namespaces, argocd, gitea, cluster-forge" + echo "WARNING: Unknown app '$app'. 
Available bootstrap apps: namespaces, argocd, openbao, gitea, cluster-forge" echo "Or specify any cluster-forge child app from values.yaml" fi ;; @@ -568,6 +620,7 @@ main() { if [ "$has_bootstrap_apps" = true ]; then should_run namespaces && create_namespaces should_run argocd && bootstrap_argocd + should_run openbao && bootstrap_openbao should_run gitea && bootstrap_gitea should_run cluster-forge && apply_cluster_forge_parent_app fi @@ -583,34 +636,41 @@ main() { else # Default behavior - run all bootstrap components echo "🚀 Running full bootstrap sequence..." - echo "📋 Bootstrap order: namespaces → argocd → gitea → cluster-forge" + echo "📋 Bootstrap order: namespaces → argocd → openbao → gitea → cluster-forge" if should_run namespaces; then - echo "📦 Step 1/4: Creating namespaces" + echo "📦 Step 1/5: Creating namespaces" create_namespaces else - echo "⏭️ Step 1/4: Skipping namespaces" + echo "⏭️ Step 1/5: Skipping namespaces" fi if should_run argocd; then - echo "📦 Step 2/4: Bootstrapping ArgoCD" + echo "📦 Step 2/5: Bootstrapping ArgoCD" bootstrap_argocd else - echo "⏭️ Step 2/4: Skipping ArgoCD" + echo "⏭️ Step 2/5: Skipping ArgoCD" + fi + + if should_run openbao; then + echo "📦 Step 3/5: Bootstrapping OpenBao" + bootstrap_openbao + else + echo "⏭️ Step 3/5: Skipping OpenBao" fi if should_run gitea; then - echo "📦 Step 3/4: Bootstrapping Gitea" + echo "📦 Step 4/5: Bootstrapping Gitea" bootstrap_gitea else - echo "⏭️ Step 3/4: Skipping Gitea" + echo "⏭️ Step 4/5: Skipping Gitea" fi if should_run cluster-forge; then - echo "📦 Step 4/4: Creating ClusterForge parent app" + echo "📦 Step 5/5: Creating ClusterForge parent app" apply_cluster_forge_parent_app else - echo "⏭️ Step 4/4: Skipping ClusterForge" + echo "⏭️ Step 5/5: Skipping ClusterForge" fi echo "✅ Bootstrap sequence completed" From 4812ac5e3a59959b536676b6e3c9d754fb88086f Mon Sep 17 00:00:00 2001 From: Daniel Vaskivaara Date: Tue, 3 Mar 2026 20:08:11 +0200 Subject: [PATCH 097/115] fix: revert openbao and 
gitea scripts to match main, preserving --airm-image-repository support and app filtering --- scripts/bootstrap.sh | 79 +++++++------------ .../templates/cf-init-openbao-cm.yaml | 50 +++--------- .../templates/cf-init-openbao-job.yaml | 4 +- scripts/init-openbao-job/values.yaml | 19 +++++ 4 files changed, 64 insertions(+), 88 deletions(-) create mode 100644 scripts/init-openbao-job/values.yaml diff --git a/scripts/bootstrap.sh b/scripts/bootstrap.sh index 2ee36cb9..0a839957 100755 --- a/scripts/bootstrap.sh +++ b/scripts/bootstrap.sh @@ -369,38 +369,26 @@ bootstrap_argocd() { fi } -# Extract OpenBao values using yq -extract_openbao_values() { - # Create temporary values file for OpenBao bootstrap - cat > /tmp/openbao_bootstrap_values.yaml << EOF -# OpenBao bootstrap values -EOF - - # Extract and merge OpenBao values from the apps structure - yq eval '.apps.openbao.valuesObject' "${SOURCE_ROOT}/root/${VALUES_FILE}" >> /tmp/openbao_bootstrap_values.yaml - if [ -f "${SOURCE_ROOT}/root/${SIZE_VALUES_FILE}" ]; then - if yq eval '.apps.openbao.valuesObject // ""' "${SOURCE_ROOT}/root/${SIZE_VALUES_FILE}" | grep -q .; then - yq eval '.apps.openbao.valuesObject' "${SOURCE_ROOT}/root/${SIZE_VALUES_FILE}" | yq eval-all 'select(fileIndex == 0) * select(fileIndex == 1)' /tmp/openbao_bootstrap_values.yaml - > /tmp/openbao_bootstrap_values_merged.yaml - mv /tmp/openbao_bootstrap_values_merged.yaml /tmp/openbao_bootstrap_values.yaml - fi - fi -} + bootstrap_openbao() { echo "=== OpenBao Bootstrap ===" - # Get OpenBao version from app path + # Get OpenBao version from app path - using same method as main OPENBAO_VERSION=$(yq eval '.apps.openbao.path' "${SOURCE_ROOT}/root/${VALUES_FILE}" | cut -d'/' -f2) echo "OpenBao version: $OPENBAO_VERSION" - extract_openbao_values + # Extract OpenBao values from merged config - matching main approach + yq eval '.apps.openbao.valuesObject' ${SOURCE_ROOT}/root/${VALUES_FILE} > /tmp/openbao_values.yaml + yq eval '.apps.openbao.valuesObject' 
${SOURCE_ROOT}/root/${SIZE_VALUES_FILE} > /tmp/openbao_size_values.yaml # Use server-side apply to match ArgoCD's field management strategy helm template --release-name openbao ${SOURCE_ROOT}/sources/openbao/${OPENBAO_VERSION} --namespace cf-openbao \ - --values /tmp/openbao_bootstrap_values.yaml \ + -f /tmp/openbao_values.yaml \ + -f /tmp/openbao_size_values.yaml \ --set ui.enabled=true \ --kube-version=${KUBE_VERSION} | apply_or_template --server-side --field-manager=argocd-controller --force-conflicts -f - - + if [ "$TEMPLATE_ONLY" = false ]; then kubectl wait --for=jsonpath='{.status.phase}'=Running pod/openbao-0 -n cf-openbao --timeout=100s @@ -409,48 +397,32 @@ bootstrap_openbao() { cat ${SOURCE_ROOT}/sources/openbao-config/0.1.0/templates/openbao-secret-manager-cm.yaml | \ sed "s|name: openbao-secret-manager-scripts|name: openbao-secret-manager-scripts-init|g" | kubectl apply -f - - # Create initial secrets config for init job (separate from ArgoCD-managed version) + # Create initial secrets config for init job (separate from ArgoCD-managed version) cat ${SOURCE_ROOT}/sources/openbao-config/0.1.0/templates/openbao-secret-definitions.yaml | \ sed "s|{{ .Values.domain }}|${DOMAIN}|g" | \ sed "s|name: openbao-secrets-config|name: openbao-secrets-init-config|g" | kubectl apply -f - # Pass OpenBao configuration to init script helm template --release-name openbao-init ${SOURCE_ROOT}/scripts/init-openbao-job \ - --values /tmp/openbao_bootstrap_values.yaml \ + -f /tmp/openbao_values.yaml \ --set domain="${DOMAIN}" \ --kube-version=${KUBE_VERSION} | kubectl apply -f - kubectl wait --for=condition=complete --timeout=300s job/openbao-init-job -n cf-openbao fi } -# Extract Gitea values using yq -extract_gitea_values() { - # Create temporary values file for Gitea bootstrap - cat > /tmp/gitea_bootstrap_values.yaml << EOF -clusterDomain: ${DOMAIN} -gitea: - config: - server: - ROOT_URL: https://gitea.${DOMAIN}/ -EOF - - # Extract and merge Gitea values from the apps 
structure - yq eval '.apps.gitea.valuesObject' "${SOURCE_ROOT}/root/${VALUES_FILE}" >> /tmp/gitea_bootstrap_values.yaml - if [ -f "${SOURCE_ROOT}/root/${SIZE_VALUES_FILE}" ]; then - if yq eval '.apps.gitea.valuesObject // ""' "${SOURCE_ROOT}/root/${SIZE_VALUES_FILE}" | grep -q .; then - yq eval '.apps.gitea.valuesObject' "${SOURCE_ROOT}/root/${SIZE_VALUES_FILE}" | yq eval-all 'select(fileIndex == 0) * select(fileIndex == 1)' /tmp/gitea_bootstrap_values.yaml - > /tmp/gitea_bootstrap_values_merged.yaml - mv /tmp/gitea_bootstrap_values_merged.yaml /tmp/gitea_bootstrap_values.yaml - fi - fi -} + bootstrap_gitea() { - # Gitea bootstrap echo "=== Gitea Bootstrap ===" generate_password() { openssl rand -hex 16 | tr 'a-f' 'A-F' | head -c 32 } + # Get Gitea version from app path - matching main approach + GITEA_VERSION=$(yq eval '.apps.gitea.path' "${SOURCE_ROOT}/root/${VALUES_FILE}" | cut -d'/' -f2) + echo "Gitea version: $GITEA_VERSION" + # Create initial-cf-values configmap (complete values for gitea-init-job) # Use the complete root values.yaml with filled placeholders instead of simplified version cp "${SOURCE_ROOT}/root/${VALUES_FILE}" /tmp/complete_values.yaml @@ -474,12 +446,22 @@ bootstrap_gitea() { --from-literal=password=$(generate_password) \ --dry-run=client -o yaml | apply_or_template -f - - extract_gitea_values - helm template --release-name gitea ${SOURCE_ROOT}/sources/gitea/12.3.0 --namespace cf-gitea \ - --values /tmp/gitea_bootstrap_values.yaml \ + # Extract Gitea values like main does + yq eval '.apps.gitea.valuesObject' ${SOURCE_ROOT}/root/${VALUES_FILE} > /tmp/gitea_values.yaml + yq eval '.apps.gitea.valuesObject' ${SOURCE_ROOT}/root/${SIZE_VALUES_FILE} > /tmp/gitea_size_values.yaml + + # Bootstrap Gitea - matching main approach + helm template --release-name gitea ${SOURCE_ROOT}/sources/gitea/${GITEA_VERSION} --namespace cf-gitea \ + -f /tmp/gitea_values.yaml \ + -f /tmp/gitea_size_values.yaml \ + --set 
gitea.config.server.ROOT_URL="https://gitea.${DOMAIN}/" \ --kube-version=${KUBE_VERSION} | apply_or_template -f - - # Gitea Init Job + if [ "$TEMPLATE_ONLY" = false ]; then + kubectl rollout status deploy/gitea -n cf-gitea --timeout="${DEFAULT_TIMEOUT}" + fi + + # Gitea Init Job - preserve AIRM repository functionality HELM_ARGS="--release-name gitea-init ${SOURCE_ROOT}/scripts/init-gitea-job \ --set clusterSize=${SIZE_VALUES_FILE:-values_${CLUSTER_SIZE}.yaml} \ --set domain=${DOMAIN} \ @@ -491,10 +473,9 @@ bootstrap_gitea() { HELM_ARGS="${HELM_ARGS} --set airmImageRepository=${AIRM_IMAGE_REPOSITORY}" fi - helm template ${HELM_ARGS} | kubectl apply -f - + helm template ${HELM_ARGS} | apply_or_template -f - if [ "$TEMPLATE_ONLY" = false ]; then - kubectl rollout status deploy/gitea -n cf-gitea --timeout="${DEFAULT_TIMEOUT}" kubectl wait --for=condition=complete --timeout="${DEFAULT_TIMEOUT}" job/gitea-init-job -n cf-gitea fi } diff --git a/scripts/init-openbao-job/templates/cf-init-openbao-cm.yaml b/scripts/init-openbao-job/templates/cf-init-openbao-cm.yaml index bc1db2d9..a3de50c3 100644 --- a/scripts/init-openbao-job/templates/cf-init-openbao-cm.yaml +++ b/scripts/init-openbao-job/templates/cf-init-openbao-cm.yaml @@ -55,9 +55,8 @@ data: echo "No HA replicas found (single-node deployment)." fi else - echo "WARNING: openbao-keys secret not found, but OpenBao appears to be running." - echo "This might indicate the cluster was initialized externally or the secret was deleted." - echo "HA replica setup will be skipped." + echo "ERROR: openbao-keys secret not found. Cannot unseal replicas." + exit 1 fi echo "OpenBao is fully operational. Skipping initialization." @@ -66,42 +65,19 @@ data: if [ "$OPENBAO_0_INITIALIZED" = "false" ]; then echo "Initializing OpenBao on openbao-0..." 
- if INIT_OUTPUT=$(kubectl exec openbao-0 -- bao operator init -format=json -key-shares=1 -key-threshold=1); then - echo "OpenBao initialization successful" - echo $INIT_OUTPUT > /tmp/bao-keys.json - - echo "Saving unseal keys and root token to openbao-keys k8s secret..." - BAO_ROOT_TOKEN=$(jq -r '.root_token' /tmp/bao-keys.json) - BAO_UNSEAL_KEY=$(jq -r '.unseal_keys_b64[0]' /tmp/bao-keys.json) - - if [ -z "$BAO_ROOT_TOKEN" ] || [ "$BAO_ROOT_TOKEN" = "null" ] || [ -z "$BAO_UNSEAL_KEY" ] || [ "$BAO_UNSEAL_KEY" = "null" ]; then - echo "ERROR: Failed to extract root token or unseal key from initialization output" - echo "Init output: $INIT_OUTPUT" - exit 1 - fi - - kubectl create secret generic openbao-keys -n cf-openbao \ - --from-literal=root_token="$BAO_ROOT_TOKEN" \ - --from-literal=unseal_key="$BAO_UNSEAL_KEY" \ - --dry-run=client -o yaml | kubectl apply -f - - echo "openbao-keys secret created successfully" - else - echo "ERROR: OpenBao initialization failed" - kubectl exec openbao-0 -- bao operator init -format=json -key-shares=1 -key-threshold=1 || true - exit 1 - fi + INIT_OUTPUT=$(kubectl exec openbao-0 -- bao operator init -format=json -key-shares=1 -key-threshold=1) + echo $INIT_OUTPUT > /tmp/bao-keys.json; + + echo "Saving unseal keys and root token to openbao-keys k8s secret..." + BAO_ROOT_TOKEN=$(jq -r '.root_token' /tmp/bao-keys.json); + BAO_UNSEAL_KEY=$(jq -r '.unseal_keys_b64[0]' /tmp/bao-keys.json); + kubectl create secret generic openbao-keys -n cf-openbao \ + --from-literal=root_token="$BAO_ROOT_TOKEN" \ + --from-literal=unseal_key="$BAO_UNSEAL_KEY" \ + --dry-run=client -o yaml | kubectl apply -f - else echo "OpenBao openbao-0 is initialized but sealed. Getting unseal key..." - if kubectl get secret openbao-keys -n cf-openbao &>/dev/null; then - BAO_UNSEAL_KEY=$(kubectl get secret openbao-keys -n cf-openbao -o jsonpath='{.data.unseal_key}' | base64 -d) - else - echo "ERROR: OpenBao is initialized but openbao-keys secret is missing." 
- echo "This indicates a previous initialization failure or the secret was deleted." - echo "Manual intervention required - either:" - echo "1. Delete the OpenBao StatefulSet to start fresh, or" - echo "2. Manually unseal OpenBao if you have the keys" - exit 1 - fi + BAO_UNSEAL_KEY=$(kubectl get secret openbao-keys -n cf-openbao -o jsonpath='{.data.unseal_key}' | base64 -d) fi echo "Unsealing openbao-0..." diff --git a/scripts/init-openbao-job/templates/cf-init-openbao-job.yaml b/scripts/init-openbao-job/templates/cf-init-openbao-job.yaml index c01b7d57..31f0d6f7 100644 --- a/scripts/init-openbao-job/templates/cf-init-openbao-job.yaml +++ b/scripts/init-openbao-job/templates/cf-init-openbao-job.yaml @@ -52,8 +52,8 @@ spec: defaultMode: 0755 - name: secrets-config configMap: - name: openbao-secrets-config + name: openbao-secrets-init-config - name: secret-manager configMap: - name: openbao-secret-manager-scripts + name: openbao-secret-manager-scripts-init defaultMode: 0755 diff --git a/scripts/init-openbao-job/values.yaml b/scripts/init-openbao-job/values.yaml new file mode 100644 index 00000000..4c682279 --- /dev/null +++ b/scripts/init-openbao-job/values.yaml @@ -0,0 +1,19 @@ +domain: # to be filled by bootstrap script + +# OpenBao server configuration from merged values +server: + ha: + enabled: false # from merged config + replicas: 1 # from merged config + raft: + enabled: false # from merged config + dataStorage: + size: 5Gi # from merged config + storageClass: "" # from merged config + resources: # from merged config + requests: + cpu: "250m" + memory: "512Mi" + limits: + cpu: "1000m" + memory: "2Gi" \ No newline at end of file From 607e5e287d6a6752cc0f50eb6ded5f3e1a4d7d01 Mon Sep 17 00:00:00 2001 From: Daniel Vaskivaara Date: Tue, 3 Mar 2026 20:16:02 +0200 Subject: [PATCH 098/115] fix: add proper values files checks --- scripts/bootstrap.sh | 37 +++++++++++++++++++++++++++++++------ 1 file changed, 31 insertions(+), 6 deletions(-) diff --git 
a/scripts/bootstrap.sh b/scripts/bootstrap.sh index 0a839957..38dbb29c 100755 --- a/scripts/bootstrap.sh +++ b/scripts/bootstrap.sh @@ -293,7 +293,20 @@ validate_args() { fi SOURCE_ROOT="${SCRIPT_DIR}/.." - SIZE_VALUES_FILE="values_${CLUSTER_SIZE}.yaml" + setup_values_files +} + +# Check if size-specific values file exists - matching main approach +setup_values_files() { + SIZE_VALUES_FILE="values_${CLUSTER_SIZE}.yaml" + + if [ ! -f "${SOURCE_ROOT}/root/${SIZE_VALUES_FILE}" ]; then + echo "WARNING: Size-specific values file not found: ${SOURCE_ROOT}/root/${SIZE_VALUES_FILE}" + echo "Proceeding with base values file only: ${VALUES_FILE}" + SIZE_VALUES_FILE="" + else + echo "Using size-specific values file: ${SIZE_VALUES_FILE}" + fi } print_summary() { @@ -346,7 +359,7 @@ EOF # Extract and merge ArgoCD values from the apps structure yq eval '.apps.argocd.valuesObject' "${SOURCE_ROOT}/root/${VALUES_FILE}" >> /tmp/argocd_bootstrap_values.yaml - if [ -f "${SOURCE_ROOT}/root/${SIZE_VALUES_FILE}" ]; then + if [ -n "${SIZE_VALUES_FILE}" ] && [ -f "${SOURCE_ROOT}/root/${SIZE_VALUES_FILE}" ]; then if yq eval '.apps.argocd.valuesObject // ""' "${SOURCE_ROOT}/root/${SIZE_VALUES_FILE}" | grep -q .; then yq eval '.apps.argocd.valuesObject' "${SOURCE_ROOT}/root/${SIZE_VALUES_FILE}" | yq eval-all 'select(fileIndex == 0) * select(fileIndex == 1)' /tmp/argocd_bootstrap_values.yaml - > /tmp/argocd_bootstrap_values_merged.yaml mv /tmp/argocd_bootstrap_values_merged.yaml /tmp/argocd_bootstrap_values.yaml @@ -380,7 +393,11 @@ bootstrap_openbao() { # Extract OpenBao values from merged config - matching main approach yq eval '.apps.openbao.valuesObject' ${SOURCE_ROOT}/root/${VALUES_FILE} > /tmp/openbao_values.yaml - yq eval '.apps.openbao.valuesObject' ${SOURCE_ROOT}/root/${SIZE_VALUES_FILE} > /tmp/openbao_size_values.yaml + if [ -n "${SIZE_VALUES_FILE}" ] && [ -f "${SOURCE_ROOT}/root/${SIZE_VALUES_FILE}" ]; then + yq eval '.apps.openbao.valuesObject' 
${SOURCE_ROOT}/root/${SIZE_VALUES_FILE} > /tmp/openbao_size_values.yaml + else + echo "# No size-specific values" > /tmp/openbao_size_values.yaml + fi # Use server-side apply to match ArgoCD's field management strategy helm template --release-name openbao ${SOURCE_ROOT}/sources/openbao/${OPENBAO_VERSION} --namespace cf-openbao \ @@ -429,11 +446,15 @@ bootstrap_gitea() { # Fill in placeholder values using yq (these are used by gitea-init job) yq eval ".global.domain = \"${DOMAIN}\"" -i /tmp/complete_values.yaml - yq eval ".global.clusterSize = \"${SIZE_VALUES_FILE}\"" -i /tmp/complete_values.yaml + if [ -n "${SIZE_VALUES_FILE}" ]; then + yq eval ".global.clusterSize = \"${SIZE_VALUES_FILE}\"" -i /tmp/complete_values.yaml + else + yq eval ".global.clusterSize = \"values_${CLUSTER_SIZE}.yaml\"" -i /tmp/complete_values.yaml + fi yq eval ".clusterForge.targetRevision = \"${TARGET_REVISION}\"" -i /tmp/complete_values.yaml # Merge with size-specific values if they exist - if [ -f "${SOURCE_ROOT}/root/${SIZE_VALUES_FILE}" ]; then + if [ -n "${SIZE_VALUES_FILE}" ] && [ -f "${SOURCE_ROOT}/root/${SIZE_VALUES_FILE}" ]; then yq eval-all 'select(fileIndex == 0) * select(fileIndex == 1)' /tmp/complete_values.yaml "${SOURCE_ROOT}/root/${SIZE_VALUES_FILE}" > /tmp/complete_values_merged.yaml mv /tmp/complete_values_merged.yaml /tmp/complete_values.yaml fi @@ -448,7 +469,11 @@ bootstrap_gitea() { # Extract Gitea values like main does yq eval '.apps.gitea.valuesObject' ${SOURCE_ROOT}/root/${VALUES_FILE} > /tmp/gitea_values.yaml - yq eval '.apps.gitea.valuesObject' ${SOURCE_ROOT}/root/${SIZE_VALUES_FILE} > /tmp/gitea_size_values.yaml + if [ -n "${SIZE_VALUES_FILE}" ] && [ -f "${SOURCE_ROOT}/root/${SIZE_VALUES_FILE}" ]; then + yq eval '.apps.gitea.valuesObject' ${SOURCE_ROOT}/root/${SIZE_VALUES_FILE} > /tmp/gitea_size_values.yaml + else + echo "# No size-specific values" > /tmp/gitea_size_values.yaml + fi # Bootstrap Gitea - matching main approach helm template --release-name gitea 
${SOURCE_ROOT}/sources/gitea/${GITEA_VERSION} --namespace cf-gitea \ From 2e56f4fcff318cf35f1d69fd020e4af085bacde5 Mon Sep 17 00:00:00 2001 From: Daniel Vaskivaara Date: Tue, 3 Mar 2026 20:21:09 +0200 Subject: [PATCH 099/115] qa: add error handling and yq fix --- scripts/bootstrap.sh | 32 ++++++++++++++++++++++++++------ 1 file changed, 26 insertions(+), 6 deletions(-) diff --git a/scripts/bootstrap.sh b/scripts/bootstrap.sh index 38dbb29c..b68e8500 100755 --- a/scripts/bootstrap.sh +++ b/scripts/bootstrap.sh @@ -387,16 +387,26 @@ bootstrap_argocd() { bootstrap_openbao() { echo "=== OpenBao Bootstrap ===" + # Debug output for troubleshooting + echo "Debug: SOURCE_ROOT='${SOURCE_ROOT}'" + echo "Debug: VALUES_FILE='${VALUES_FILE}'" + echo "Debug: SIZE_VALUES_FILE='${SIZE_VALUES_FILE}'" + echo "Debug: CLUSTER_SIZE='${CLUSTER_SIZE}'" + # Get OpenBao version from app path - using same method as main OPENBAO_VERSION=$(yq eval '.apps.openbao.path' "${SOURCE_ROOT}/root/${VALUES_FILE}" | cut -d'/' -f2) echo "OpenBao version: $OPENBAO_VERSION" # Extract OpenBao values from merged config - matching main approach - yq eval '.apps.openbao.valuesObject' ${SOURCE_ROOT}/root/${VALUES_FILE} > /tmp/openbao_values.yaml + echo "Extracting OpenBao values..." + yq eval '.apps.openbao.valuesObject' "${SOURCE_ROOT}/root/${VALUES_FILE}" > /tmp/openbao_values.yaml || { echo "ERROR: Failed to extract OpenBao values from ${VALUES_FILE}"; exit 1; } + if [ -n "${SIZE_VALUES_FILE}" ] && [ -f "${SOURCE_ROOT}/root/${SIZE_VALUES_FILE}" ]; then - yq eval '.apps.openbao.valuesObject' ${SOURCE_ROOT}/root/${SIZE_VALUES_FILE} > /tmp/openbao_size_values.yaml + echo "Extracting OpenBao size-specific values from ${SIZE_VALUES_FILE}..." 
+ yq eval '.apps.openbao.valuesObject' "${SOURCE_ROOT}/root/${SIZE_VALUES_FILE}" > /tmp/openbao_size_values.yaml || { echo "ERROR: Failed to extract OpenBao size values from ${SIZE_VALUES_FILE}"; exit 1; } else - echo "# No size-specific values" > /tmp/openbao_size_values.yaml + echo "No size-specific values file, creating empty placeholder..." + printf "# No size-specific values\n" > /tmp/openbao_size_values.yaml fi # Use server-side apply to match ArgoCD's field management strategy @@ -436,6 +446,12 @@ bootstrap_gitea() { openssl rand -hex 16 | tr 'a-f' 'A-F' | head -c 32 } + # Debug output for troubleshooting + echo "Debug: SOURCE_ROOT='${SOURCE_ROOT}'" + echo "Debug: VALUES_FILE='${VALUES_FILE}'" + echo "Debug: SIZE_VALUES_FILE='${SIZE_VALUES_FILE}'" + echo "Debug: CLUSTER_SIZE='${CLUSTER_SIZE}'" + # Get Gitea version from app path - matching main approach GITEA_VERSION=$(yq eval '.apps.gitea.path' "${SOURCE_ROOT}/root/${VALUES_FILE}" | cut -d'/' -f2) echo "Gitea version: $GITEA_VERSION" @@ -468,11 +484,15 @@ bootstrap_gitea() { --dry-run=client -o yaml | apply_or_template -f - # Extract Gitea values like main does - yq eval '.apps.gitea.valuesObject' ${SOURCE_ROOT}/root/${VALUES_FILE} > /tmp/gitea_values.yaml + echo "Extracting Gitea values..." + yq eval '.apps.gitea.valuesObject' "${SOURCE_ROOT}/root/${VALUES_FILE}" > /tmp/gitea_values.yaml || { echo "ERROR: Failed to extract Gitea values from ${VALUES_FILE}"; exit 1; } + if [ -n "${SIZE_VALUES_FILE}" ] && [ -f "${SOURCE_ROOT}/root/${SIZE_VALUES_FILE}" ]; then - yq eval '.apps.gitea.valuesObject' ${SOURCE_ROOT}/root/${SIZE_VALUES_FILE} > /tmp/gitea_size_values.yaml + echo "Extracting Gitea size-specific values from ${SIZE_VALUES_FILE}..." 
+ yq eval '.apps.gitea.valuesObject' "${SOURCE_ROOT}/root/${SIZE_VALUES_FILE}" > /tmp/gitea_size_values.yaml || { echo "ERROR: Failed to extract Gitea size values from ${SIZE_VALUES_FILE}"; exit 1; } else - echo "# No size-specific values" > /tmp/gitea_size_values.yaml + echo "No size-specific values file, creating empty placeholder..." + printf "# No size-specific values\n" > /tmp/gitea_size_values.yaml fi # Bootstrap Gitea - matching main approach From 7f05e92dc2fad5f5c16b44633a5da0d5922828f4 Mon Sep 17 00:00:00 2001 From: Daniel Vaskivaara Date: Tue, 3 Mar 2026 20:22:51 +0200 Subject: [PATCH 100/115] qa: update check logic for apps.openbao --- scripts/bootstrap.sh | 24 ++++++++++++++++++++++-- 1 file changed, 22 insertions(+), 2 deletions(-) diff --git a/scripts/bootstrap.sh b/scripts/bootstrap.sh index b68e8500..515b1553 100755 --- a/scripts/bootstrap.sh +++ b/scripts/bootstrap.sh @@ -403,7 +403,17 @@ bootstrap_openbao() { if [ -n "${SIZE_VALUES_FILE}" ] && [ -f "${SOURCE_ROOT}/root/${SIZE_VALUES_FILE}" ]; then echo "Extracting OpenBao size-specific values from ${SIZE_VALUES_FILE}..." - yq eval '.apps.openbao.valuesObject' "${SOURCE_ROOT}/root/${SIZE_VALUES_FILE}" > /tmp/openbao_size_values.yaml || { echo "ERROR: Failed to extract OpenBao size values from ${SIZE_VALUES_FILE}"; exit 1; } + echo "Checking if openbao section exists in size values file..." + if yq eval '.apps.openbao' "${SOURCE_ROOT}/root/${SIZE_VALUES_FILE}" >/dev/null 2>&1 && [ "$(yq eval '.apps.openbao' "${SOURCE_ROOT}/root/${SIZE_VALUES_FILE}")" != "null" ]; then + echo "OpenBao section found, extracting values..." 
+ yq eval '.apps.openbao.valuesObject' "${SOURCE_ROOT}/root/${SIZE_VALUES_FILE}" > /tmp/openbao_size_values.yaml || { + echo "WARNING: Failed to extract OpenBao valuesObject from ${SIZE_VALUES_FILE}, using empty values" + printf "# OpenBao valuesObject not found in size file\n" > /tmp/openbao_size_values.yaml + } + else + echo "No OpenBao section in size values file, creating empty placeholder..." + printf "# No OpenBao section in size-specific values\n" > /tmp/openbao_size_values.yaml + fi else echo "No size-specific values file, creating empty placeholder..." printf "# No size-specific values\n" > /tmp/openbao_size_values.yaml @@ -489,7 +499,17 @@ bootstrap_gitea() { if [ -n "${SIZE_VALUES_FILE}" ] && [ -f "${SOURCE_ROOT}/root/${SIZE_VALUES_FILE}" ]; then echo "Extracting Gitea size-specific values from ${SIZE_VALUES_FILE}..." - yq eval '.apps.gitea.valuesObject' "${SOURCE_ROOT}/root/${SIZE_VALUES_FILE}" > /tmp/gitea_size_values.yaml || { echo "ERROR: Failed to extract Gitea size values from ${SIZE_VALUES_FILE}"; exit 1; } + echo "Checking if gitea section exists in size values file..." + if yq eval '.apps.gitea' "${SOURCE_ROOT}/root/${SIZE_VALUES_FILE}" >/dev/null 2>&1 && [ "$(yq eval '.apps.gitea' "${SOURCE_ROOT}/root/${SIZE_VALUES_FILE}")" != "null" ]; then + echo "Gitea section found, extracting values..." + yq eval '.apps.gitea.valuesObject' "${SOURCE_ROOT}/root/${SIZE_VALUES_FILE}" > /tmp/gitea_size_values.yaml || { + echo "WARNING: Failed to extract Gitea valuesObject from ${SIZE_VALUES_FILE}, using empty values" + printf "# Gitea valuesObject not found in size file\n" > /tmp/gitea_size_values.yaml + } + else + echo "No Gitea section in size values file, creating empty placeholder..." + printf "# No Gitea section in size-specific values\n" > /tmp/gitea_size_values.yaml + fi else echo "No size-specific values file, creating empty placeholder..." 
printf "# No size-specific values\n" > /tmp/gitea_size_values.yaml From b558b911cb002c1514a4c338fb09d682471346b3 Mon Sep 17 00:00:00 2001 From: Daniel Vaskivaara Date: Tue, 3 Mar 2026 20:25:17 +0200 Subject: [PATCH 101/115] fix: resolve potential issue with previous run temp file permissions --- scripts/bootstrap.sh | 23 +++++++++++++++-------- 1 file changed, 15 insertions(+), 8 deletions(-) diff --git a/scripts/bootstrap.sh b/scripts/bootstrap.sh index 515b1553..8f633a05 100755 --- a/scripts/bootstrap.sh +++ b/scripts/bootstrap.sh @@ -397,32 +397,36 @@ bootstrap_openbao() { OPENBAO_VERSION=$(yq eval '.apps.openbao.path' "${SOURCE_ROOT}/root/${VALUES_FILE}" | cut -d'/' -f2) echo "OpenBao version: $OPENBAO_VERSION" + # Create a unique temp directory to avoid permission issues + TEMP_DIR=$(mktemp -d -t cf-bootstrap.XXXXXX) || { echo "ERROR: Cannot create temp directory"; exit 1; } + echo "Using temp directory: $TEMP_DIR" + # Extract OpenBao values from merged config - matching main approach echo "Extracting OpenBao values..." - yq eval '.apps.openbao.valuesObject' "${SOURCE_ROOT}/root/${VALUES_FILE}" > /tmp/openbao_values.yaml || { echo "ERROR: Failed to extract OpenBao values from ${VALUES_FILE}"; exit 1; } + yq eval '.apps.openbao.valuesObject' "${SOURCE_ROOT}/root/${VALUES_FILE}" > "${TEMP_DIR}/openbao_values.yaml" || { echo "ERROR: Failed to extract OpenBao values from ${VALUES_FILE}"; exit 1; } if [ -n "${SIZE_VALUES_FILE}" ] && [ -f "${SOURCE_ROOT}/root/${SIZE_VALUES_FILE}" ]; then echo "Extracting OpenBao size-specific values from ${SIZE_VALUES_FILE}..." echo "Checking if openbao section exists in size values file..." if yq eval '.apps.openbao' "${SOURCE_ROOT}/root/${SIZE_VALUES_FILE}" >/dev/null 2>&1 && [ "$(yq eval '.apps.openbao' "${SOURCE_ROOT}/root/${SIZE_VALUES_FILE}")" != "null" ]; then echo "OpenBao section found, extracting values..." 
- yq eval '.apps.openbao.valuesObject' "${SOURCE_ROOT}/root/${SIZE_VALUES_FILE}" > /tmp/openbao_size_values.yaml || { + yq eval '.apps.openbao.valuesObject' "${SOURCE_ROOT}/root/${SIZE_VALUES_FILE}" > "${TEMP_DIR}/openbao_size_values.yaml" || { echo "WARNING: Failed to extract OpenBao valuesObject from ${SIZE_VALUES_FILE}, using empty values" - printf "# OpenBao valuesObject not found in size file\n" > /tmp/openbao_size_values.yaml + printf "# OpenBao valuesObject not found in size file\n" > "${TEMP_DIR}/openbao_size_values.yaml" } else echo "No OpenBao section in size values file, creating empty placeholder..." - printf "# No OpenBao section in size-specific values\n" > /tmp/openbao_size_values.yaml + printf "# No OpenBao section in size-specific values\n" > "${TEMP_DIR}/openbao_size_values.yaml" fi else echo "No size-specific values file, creating empty placeholder..." - printf "# No size-specific values\n" > /tmp/openbao_size_values.yaml + printf "# No size-specific values\n" > "${TEMP_DIR}/openbao_size_values.yaml" fi # Use server-side apply to match ArgoCD's field management strategy helm template --release-name openbao ${SOURCE_ROOT}/sources/openbao/${OPENBAO_VERSION} --namespace cf-openbao \ - -f /tmp/openbao_values.yaml \ - -f /tmp/openbao_size_values.yaml \ + -f "${TEMP_DIR}/openbao_values.yaml" \ + -f "${TEMP_DIR}/openbao_size_values.yaml" \ --set ui.enabled=true \ --kube-version=${KUBE_VERSION} | apply_or_template --server-side --field-manager=argocd-controller --force-conflicts -f - @@ -441,11 +445,14 @@ bootstrap_openbao() { # Pass OpenBao configuration to init script helm template --release-name openbao-init ${SOURCE_ROOT}/scripts/init-openbao-job \ - -f /tmp/openbao_values.yaml \ + -f "${TEMP_DIR}/openbao_values.yaml" \ --set domain="${DOMAIN}" \ --kube-version=${KUBE_VERSION} | kubectl apply -f - kubectl wait --for=condition=complete --timeout=300s job/openbao-init-job -n cf-openbao fi + + # Cleanup temp directory + rm -rf "${TEMP_DIR}" } From 
82d9064c60ca0784899ba86ed961523f3779770c Mon Sep 17 00:00:00 2001 From: Daniel Vaskivaara Date: Tue, 3 Mar 2026 20:30:57 +0200 Subject: [PATCH 102/115] fix: same temp folder approach as now working openbao --- scripts/bootstrap.sh | 37 ++++++++++++++++++++++--------------- 1 file changed, 22 insertions(+), 15 deletions(-) diff --git a/scripts/bootstrap.sh b/scripts/bootstrap.sh index 8f633a05..54570e47 100755 --- a/scripts/bootstrap.sh +++ b/scripts/bootstrap.sh @@ -473,26 +473,30 @@ bootstrap_gitea() { GITEA_VERSION=$(yq eval '.apps.gitea.path' "${SOURCE_ROOT}/root/${VALUES_FILE}" | cut -d'/' -f2) echo "Gitea version: $GITEA_VERSION" + # Create a unique temp directory to avoid permission issues + TEMP_DIR=$(mktemp -d -t cf-gitea-bootstrap.XXXXXX) || { echo "ERROR: Cannot create temp directory"; exit 1; } + echo "Using temp directory: $TEMP_DIR" + # Create initial-cf-values configmap (complete values for gitea-init-job) # Use the complete root values.yaml with filled placeholders instead of simplified version - cp "${SOURCE_ROOT}/root/${VALUES_FILE}" /tmp/complete_values.yaml + cp "${SOURCE_ROOT}/root/${VALUES_FILE}" "${TEMP_DIR}/complete_values.yaml" # Fill in placeholder values using yq (these are used by gitea-init job) - yq eval ".global.domain = \"${DOMAIN}\"" -i /tmp/complete_values.yaml + yq eval ".global.domain = \"${DOMAIN}\"" -i "${TEMP_DIR}/complete_values.yaml" if [ -n "${SIZE_VALUES_FILE}" ]; then - yq eval ".global.clusterSize = \"${SIZE_VALUES_FILE}\"" -i /tmp/complete_values.yaml + yq eval ".global.clusterSize = \"${SIZE_VALUES_FILE}\"" -i "${TEMP_DIR}/complete_values.yaml" else - yq eval ".global.clusterSize = \"values_${CLUSTER_SIZE}.yaml\"" -i /tmp/complete_values.yaml + yq eval ".global.clusterSize = \"values_${CLUSTER_SIZE}.yaml\"" -i "${TEMP_DIR}/complete_values.yaml" fi - yq eval ".clusterForge.targetRevision = \"${TARGET_REVISION}\"" -i /tmp/complete_values.yaml + yq eval ".clusterForge.targetRevision = \"${TARGET_REVISION}\"" -i 
"${TEMP_DIR}/complete_values.yaml" # Merge with size-specific values if they exist if [ -n "${SIZE_VALUES_FILE}" ] && [ -f "${SOURCE_ROOT}/root/${SIZE_VALUES_FILE}" ]; then - yq eval-all 'select(fileIndex == 0) * select(fileIndex == 1)' /tmp/complete_values.yaml "${SOURCE_ROOT}/root/${SIZE_VALUES_FILE}" > /tmp/complete_values_merged.yaml - mv /tmp/complete_values_merged.yaml /tmp/complete_values.yaml + yq eval-all 'select(fileIndex == 0) * select(fileIndex == 1)' "${TEMP_DIR}/complete_values.yaml" "${SOURCE_ROOT}/root/${SIZE_VALUES_FILE}" > "${TEMP_DIR}/complete_values_merged.yaml" + mv "${TEMP_DIR}/complete_values_merged.yaml" "${TEMP_DIR}/complete_values.yaml" fi - kubectl create configmap initial-cf-values --from-literal=initial-cf-values="$(cat /tmp/complete_values.yaml)" --dry-run=client -o yaml | apply_or_template -n cf-gitea -f - + kubectl create configmap initial-cf-values --from-literal=initial-cf-values="$(cat "${TEMP_DIR}/complete_values.yaml")" --dry-run=client -o yaml | apply_or_template -n cf-gitea -f - kubectl create secret generic gitea-admin-credentials \ --namespace=cf-gitea \ @@ -502,30 +506,30 @@ bootstrap_gitea() { # Extract Gitea values like main does echo "Extracting Gitea values..." - yq eval '.apps.gitea.valuesObject' "${SOURCE_ROOT}/root/${VALUES_FILE}" > /tmp/gitea_values.yaml || { echo "ERROR: Failed to extract Gitea values from ${VALUES_FILE}"; exit 1; } + yq eval '.apps.gitea.valuesObject' "${SOURCE_ROOT}/root/${VALUES_FILE}" > "${TEMP_DIR}/gitea_values.yaml" || { echo "ERROR: Failed to extract Gitea values from ${VALUES_FILE}"; exit 1; } if [ -n "${SIZE_VALUES_FILE}" ] && [ -f "${SOURCE_ROOT}/root/${SIZE_VALUES_FILE}" ]; then echo "Extracting Gitea size-specific values from ${SIZE_VALUES_FILE}..." echo "Checking if gitea section exists in size values file..." 
if yq eval '.apps.gitea' "${SOURCE_ROOT}/root/${SIZE_VALUES_FILE}" >/dev/null 2>&1 && [ "$(yq eval '.apps.gitea' "${SOURCE_ROOT}/root/${SIZE_VALUES_FILE}")" != "null" ]; then echo "Gitea section found, extracting values..." - yq eval '.apps.gitea.valuesObject' "${SOURCE_ROOT}/root/${SIZE_VALUES_FILE}" > /tmp/gitea_size_values.yaml || { + yq eval '.apps.gitea.valuesObject' "${SOURCE_ROOT}/root/${SIZE_VALUES_FILE}" > "${TEMP_DIR}/gitea_size_values.yaml" || { echo "WARNING: Failed to extract Gitea valuesObject from ${SIZE_VALUES_FILE}, using empty values" - printf "# Gitea valuesObject not found in size file\n" > /tmp/gitea_size_values.yaml + printf "# Gitea valuesObject not found in size file\n" > "${TEMP_DIR}/gitea_size_values.yaml" } else echo "No Gitea section in size values file, creating empty placeholder..." - printf "# No Gitea section in size-specific values\n" > /tmp/gitea_size_values.yaml + printf "# No Gitea section in size-specific values\n" > "${TEMP_DIR}/gitea_size_values.yaml" fi else echo "No size-specific values file, creating empty placeholder..." 
- printf "# No size-specific values\n" > /tmp/gitea_size_values.yaml + printf "# No size-specific values\n" > "${TEMP_DIR}/gitea_size_values.yaml" fi # Bootstrap Gitea - matching main approach helm template --release-name gitea ${SOURCE_ROOT}/sources/gitea/${GITEA_VERSION} --namespace cf-gitea \ - -f /tmp/gitea_values.yaml \ - -f /tmp/gitea_size_values.yaml \ + -f "${TEMP_DIR}/gitea_values.yaml" \ + -f "${TEMP_DIR}/gitea_size_values.yaml" \ --set gitea.config.server.ROOT_URL="https://gitea.${DOMAIN}/" \ --kube-version=${KUBE_VERSION} | apply_or_template -f - @@ -550,6 +554,9 @@ bootstrap_gitea() { if [ "$TEMPLATE_ONLY" = false ]; then kubectl wait --for=condition=complete --timeout="${DEFAULT_TIMEOUT}" job/gitea-init-job -n cf-gitea fi + + # Cleanup temp directory + rm -rf "${TEMP_DIR}" } # Render specific cluster-forge child apps (for --apps filtering) From 731b47757b9928fdbbd5278c502de351fdbec3f5 Mon Sep 17 00:00:00 2001 From: Daniel Vaskivaara Date: Tue, 3 Mar 2026 21:07:45 +0200 Subject: [PATCH 103/115] fix: airm image seeding for cluster-values --- .../templates/cf-init-gitea-cm.yaml | 22 +++++++------------ 1 file changed, 8 insertions(+), 14 deletions(-) diff --git a/scripts/init-gitea-job/templates/cf-init-gitea-cm.yaml b/scripts/init-gitea-job/templates/cf-init-gitea-cm.yaml index c0a45bba..e623f67b 100644 --- a/scripts/init-gitea-job/templates/cf-init-gitea-cm.yaml +++ b/scripts/init-gitea-job/templates/cf-init-gitea-cm.yaml @@ -151,20 +151,14 @@ data: {{- if .Values.airmImageRepository }} # AIRM Image Repository Configuration - airm-api: - airm: - backend: - image: - repository: {{ .Values.airmImageRepository }}/airm-api - frontend: - image: - repository: {{ .Values.airmImageRepository }}/airm-ui - - airm-dispatcher: - airm: - dispatcher: - image: - repository: {{ .Values.airmImageRepository }}/airm-dispatcher + airm: + helmParameters: + - name: airm-api.airm.backend.image.repository + value: {{ .Values.airmImageRepository }}/airm-api + - name: 
airm-ui.airm.frontend.image.repository + value: {{ .Values.airmImageRepository }}/airm-ui + - name: airm-dispatcher.airm.dispatcher.image.repository + value: {{ .Values.airmImageRepository }}/airm-dispatcher {{- end }} EOF From 466d70a74f518232479102dd1dbe6fc799746658 Mon Sep 17 00:00:00 2001 From: Daniel Vaskivaara Date: Tue, 3 Mar 2026 21:24:16 +0200 Subject: [PATCH 104/115] fix: apps key in cluster-values.yaml --- .../templates/cf-init-gitea-cm.yaml | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/scripts/init-gitea-job/templates/cf-init-gitea-cm.yaml b/scripts/init-gitea-job/templates/cf-init-gitea-cm.yaml index e623f67b..63864171 100644 --- a/scripts/init-gitea-job/templates/cf-init-gitea-cm.yaml +++ b/scripts/init-gitea-job/templates/cf-init-gitea-cm.yaml @@ -151,14 +151,15 @@ data: {{- if .Values.airmImageRepository }} # AIRM Image Repository Configuration - airm: - helmParameters: - - name: airm-api.airm.backend.image.repository - value: {{ .Values.airmImageRepository }}/airm-api - - name: airm-ui.airm.frontend.image.repository - value: {{ .Values.airmImageRepository }}/airm-ui - - name: airm-dispatcher.airm.dispatcher.image.repository - value: {{ .Values.airmImageRepository }}/airm-dispatcher + apps: + airm: + helmParameters: + - name: airm-api.airm.backend.image.repository + value: {{ .Values.airmImageRepository }}/airm-api + - name: airm-ui.airm.frontend.image.repository + value: {{ .Values.airmImageRepository }}/airm-ui + - name: airm-dispatcher.airm.dispatcher.image.repository + value: {{ .Values.airmImageRepository }}/airm-dispatcher {{- end }} EOF From 66af8d09ce0de93a81fac7b394313f07af06ee67 Mon Sep 17 00:00:00 2001 From: Daniel Vaskivaara Date: Tue, 3 Mar 2026 21:36:59 +0200 Subject: [PATCH 105/115] fix: add regcreds for airm images --- .../templates/cf-init-gitea-cm.yaml | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/scripts/init-gitea-job/templates/cf-init-gitea-cm.yaml 
b/scripts/init-gitea-job/templates/cf-init-gitea-cm.yaml index 63864171..d9024a12 100644 --- a/scripts/init-gitea-job/templates/cf-init-gitea-cm.yaml +++ b/scripts/init-gitea-job/templates/cf-init-gitea-cm.yaml @@ -154,14 +154,28 @@ data: apps: airm: helmParameters: + # BACKEND (airm-api) - name: airm-api.airm.backend.image.repository value: {{ .Values.airmImageRepository }}/airm-api + - name: airm-api.airm.backend.imagePullSecrets[0].name + value: regcred + + # FRONTEND (airm-ui) - name: airm-ui.airm.frontend.image.repository value: {{ .Values.airmImageRepository }}/airm-ui + - name: airm-api.airm.frontend.imagePullSecrets[0].name + value: regcred + + # DISPATCHER (airm-dispatcher) - name: airm-dispatcher.airm.dispatcher.image.repository value: {{ .Values.airmImageRepository }}/airm-dispatcher - {{- end }} + - name: airm-dispatcher.airm.dispatcher.imagePullSecrets[0].name + value: regcred + # AGENT (airm-agent) + - name: agent.airm.imagePullSecrets[0].name + value: regcred + {{- end }} EOF sed -i "s/DOMAIN_PLACEHOLDER/${DOMAIN}/g" /tmp/values.yaml From 56273d4996fb045c18408b37a5f93da192639a8f Mon Sep 17 00:00:00 2001 From: Daniel Vaskivaara Date: Tue, 3 Mar 2026 21:59:28 +0200 Subject: [PATCH 106/115] fix: cp error in airm images --- scripts/init-gitea-job/templates/cf-init-gitea-cm.yaml | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/scripts/init-gitea-job/templates/cf-init-gitea-cm.yaml b/scripts/init-gitea-job/templates/cf-init-gitea-cm.yaml index d9024a12..bddf674f 100644 --- a/scripts/init-gitea-job/templates/cf-init-gitea-cm.yaml +++ b/scripts/init-gitea-job/templates/cf-init-gitea-cm.yaml @@ -159,19 +159,16 @@ data: value: {{ .Values.airmImageRepository }}/airm-api - name: airm-api.airm.backend.imagePullSecrets[0].name value: regcred - # FRONTEND (airm-ui) - name: airm-ui.airm.frontend.image.repository value: {{ .Values.airmImageRepository }}/airm-ui - - name: airm-api.airm.frontend.imagePullSecrets[0].name + - name: 
airm-ui.airm.frontend.imagePullSecrets[0].name value: regcred - # DISPATCHER (airm-dispatcher) - name: airm-dispatcher.airm.dispatcher.image.repository value: {{ .Values.airmImageRepository }}/airm-dispatcher - name: airm-dispatcher.airm.dispatcher.imagePullSecrets[0].name value: regcred - # AGENT (airm-agent) - name: agent.airm.imagePullSecrets[0].name value: regcred From 17ae2946eb113b5ebd44b2f3108971d3fe25b146 Mon Sep 17 00:00:00 2001 From: Daniel Vaskivaara Date: Tue, 3 Mar 2026 22:21:00 +0200 Subject: [PATCH 107/115] fix: value quotes for aimr images --- root/values_large.yaml | 39 ++++---- root/values_small.yaml | 95 +++++++++---------- .../templates/cf-init-gitea-cm.yaml | 10 +- 3 files changed, 71 insertions(+), 73 deletions(-) diff --git a/root/values_large.yaml b/root/values_large.yaml index d9d59843..0ff528d0 100644 --- a/root/values_large.yaml +++ b/root/values_large.yaml @@ -1,21 +1,3 @@ -apps: - minio-tenant: - valuesObject: - tenant: - pools: - - name: pool-0 - servers: 1 - size: 500Gi - storageClassName: direct - volumesPerServer: 1 - openbao: - valuesObject: - server: - ha: - enabled: true - raft: - enabled: true - replicas: 3 enabledApps: - aim-cluster-model-source - airm @@ -60,4 +42,23 @@ enabledApps: - opentelemetry-operator - otel-lgtm-stack - prometheus-crds - - rabbitmq \ No newline at end of file + - rabbitmq + +apps: + minio-tenant: + valuesObject: + tenant: + pools: + - name: pool-0 + servers: 1 + size: 500Gi + storageClassName: direct + volumesPerServer: 1 + openbao: + valuesObject: + server: + ha: + enabled: true + raft: + enabled: true + replicas: 3 diff --git a/root/values_small.yaml b/root/values_small.yaml index d1ee57f4..2b9ae69b 100644 --- a/root/values_small.yaml +++ b/root/values_small.yaml @@ -1,7 +1,50 @@ -# SMALL CLUSTER: All apps enabled (inherited from base values.yaml) -# Add Kyverno policy for local-path access mode mutation +enabledApps: + - aim-cluster-model-source + - airm + - amd-gpu-operator + - 
amd-gpu-operator-config + - appwrapper + - argocd + - argocd-config + - cert-manager + - cluster-auth + - cluster-auth-config + - cnpg-operator + - external-secrets + - external-secrets-config + - gateway-api + - gitea + - gitea-config + - kaiwo + - kaiwo-config + - kaiwo-crds + - keda + - kedify-otel + - keycloak + - kgateway + - kgateway-config + - kgateway-crds + - kserve + - kserve-crds + - kuberay-operator + - kueue + - kueue-config + - kyverno + - kyverno-config + - kyverno-policies-base # applicable to all cluster sizes + - kyverno-policies-storage-local-path # small & medium cluster sizes only + - metallb + - minio-operator + - minio-tenant + - minio-tenant-config + - openbao + - openbao-config + - openbao-init + - opentelemetry-operator + - otel-lgtm-stack + - prometheus-crds + - rabbitmq -# Medium & Small clusters add local-path storage policy for RWX→RWO conversion apps: argocd: valuesObject: @@ -134,49 +177,3 @@ apps: requests: storage: 25Gi storageClassName: local-path -enabledApps: - - aim-cluster-model-source - - airm - - amd-gpu-operator - - amd-gpu-operator-config - - appwrapper - - argocd - - argocd-config - - cert-manager - - cluster-auth - - cluster-auth-config - - cnpg-operator - - external-secrets - - external-secrets-config - - gateway-api - - gitea - - gitea-config - - kaiwo - - kaiwo-config - - kaiwo-crds - - keda - - kedify-otel - - keycloak - - kgateway - - kgateway-config - - kgateway-crds - - kserve - - kserve-crds - - kuberay-operator - - kueue - - kueue-config - - kyverno - - kyverno-config - - kyverno-policies-base # applicable to all cluster sizes - - kyverno-policies-storage-local-path # small & medium cluster sizes only - - metallb - - minio-operator - - minio-tenant - - minio-tenant-config - - openbao - - openbao-config - - openbao-init - - opentelemetry-operator - - otel-lgtm-stack - - prometheus-crds - - rabbitmq \ No newline at end of file diff --git a/scripts/init-gitea-job/templates/cf-init-gitea-cm.yaml 
b/scripts/init-gitea-job/templates/cf-init-gitea-cm.yaml index bddf674f..2c22ab2e 100644 --- a/scripts/init-gitea-job/templates/cf-init-gitea-cm.yaml +++ b/scripts/init-gitea-job/templates/cf-init-gitea-cm.yaml @@ -156,17 +156,17 @@ data: helmParameters: # BACKEND (airm-api) - name: airm-api.airm.backend.image.repository - value: {{ .Values.airmImageRepository }}/airm-api + value: "{{ .Values.airmImageRepository }}/airm-api" - name: airm-api.airm.backend.imagePullSecrets[0].name value: regcred # FRONTEND (airm-ui) - - name: airm-ui.airm.frontend.image.repository - value: {{ .Values.airmImageRepository }}/airm-ui - - name: airm-ui.airm.frontend.imagePullSecrets[0].name + - name: airm-api.airm.frontend.image.repository + value: "{{ .Values.airmImageRepository }}/airm-ui" + - name: airm-api.airm.frontend.imagePullSecrets[0].name value: regcred # DISPATCHER (airm-dispatcher) - name: airm-dispatcher.airm.dispatcher.image.repository - value: {{ .Values.airmImageRepository }}/airm-dispatcher + value: "{{ .Values.airmImageRepository }}/airm-dispatcher" - name: airm-dispatcher.airm.dispatcher.imagePullSecrets[0].name value: regcred # AGENT (airm-agent) From a843ff665178b8bffc23dc79aa22f4f88359b5cd Mon Sep 17 00:00:00 2001 From: Daniel Vaskivaara Date: Wed, 4 Mar 2026 10:06:31 +0200 Subject: [PATCH 108/115] fix: add domain when using AIRM_IMAGE_REPOSITORY --- scripts/init-gitea-job/templates/cf-init-gitea-cm.yaml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/scripts/init-gitea-job/templates/cf-init-gitea-cm.yaml b/scripts/init-gitea-job/templates/cf-init-gitea-cm.yaml index 2c22ab2e..04debc7c 100644 --- a/scripts/init-gitea-job/templates/cf-init-gitea-cm.yaml +++ b/scripts/init-gitea-job/templates/cf-init-gitea-cm.yaml @@ -154,6 +154,9 @@ data: apps: airm: helmParameters: + # APP-DOMAIN (defined in base, but would be overridden here if omitted) + - name: airm-api.airm.appDomain + value: "{{ .Values.global.domain }}" # BACKEND (airm-api) - name: 
airm-api.airm.backend.image.repository value: "{{ .Values.airmImageRepository }}/airm-api" From 249d550a1458e0a2a1b904f55c65bf8daefa7f8c Mon Sep 17 00:00:00 2001 From: Daniel Vaskivaara Date: Wed, 4 Mar 2026 10:38:40 +0200 Subject: [PATCH 109/115] fix: init-gitea to ref domain without global --- scripts/init-gitea-job/templates/cf-init-gitea-cm.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/init-gitea-job/templates/cf-init-gitea-cm.yaml b/scripts/init-gitea-job/templates/cf-init-gitea-cm.yaml index 04debc7c..991c81cb 100644 --- a/scripts/init-gitea-job/templates/cf-init-gitea-cm.yaml +++ b/scripts/init-gitea-job/templates/cf-init-gitea-cm.yaml @@ -156,7 +156,7 @@ data: helmParameters: # APP-DOMAIN (defined in base, but would be overridden here if omitted) - name: airm-api.airm.appDomain - value: "{{ .Values.global.domain }}" + value: "{{ .Values.domain }}" # BACKEND (airm-api) - name: airm-api.airm.backend.image.repository value: "{{ .Values.airmImageRepository }}/airm-api" From 9215c99b76395547be80c1e0108b1eb51b4daec6 Mon Sep 17 00:00:00 2001 From: Daniel Vaskivaara Date: Wed, 4 Mar 2026 11:14:10 +0200 Subject: [PATCH 110/115] fix: adjust syncWaves in light of cluster-auth erroring on restarting kgateway --- root/values.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/root/values.yaml b/root/values.yaml index c507ee54..b8d979d1 100644 --- a/root/values.yaml +++ b/root/values.yaml @@ -191,7 +191,7 @@ apps: cluster-auth: namespace: cluster-auth path: cluster-auth/0.5.0 - syncWave: -20 + syncWave: -25 valuesFile: values.yaml cluster-auth-config: ignoreDifferences: @@ -487,7 +487,7 @@ apps: value: "{{ .Values.global.domain }}" namespace: kgateway-system path: kgateway-config - syncWave: -20 + syncWave: -15 valuesFile: values.yaml kgateway-crds: namespace: kgateway-system From b225e21b80c3ba55b61ee82662d13f5e9b46ad6e Mon Sep 17 00:00:00 2001 From: Daniel Vaskivaara Date: Wed, 4 Mar 2026 12:29:09 +0200 Subject: 
[PATCH 111/115] fix: argo health checks --- root/values.yaml | 22 +++++------ sources/argocd/values_ha.yaml | 73 +++++++++++++++++++++++++++++++++++ 2 files changed, 84 insertions(+), 11 deletions(-) diff --git a/root/values.yaml b/root/values.yaml index b8d979d1..44630018 100644 --- a/root/values.yaml +++ b/root/values.yaml @@ -94,7 +94,7 @@ apps: end return hs resource.customizations.health.batch_Job: | - -- Custom health check for Jobs, especially openbao-init + -- Custom health check for Jobs hs = {} if obj.status ~= nil then if obj.status.conditions ~= nil then @@ -119,16 +119,6 @@ apps: end hs.status = "Progressing" hs.message = "Job status unknown" - return hs - end - end - end - hs.status = "Progressing" - hs.message = "ScaledObject status unknown" - else - hs.status = "Progressing" - hs.message = "ScaledObject status unknown" - end return hs resource.customizations.health.keda.sh_ScaledObject: | hs = {} @@ -143,6 +133,16 @@ apps: hs.status = "Degraded" hs.message = condition.reason or "ScaledObject not ready" end + return hs + end + end + end + hs.status = "Progressing" + hs.message = "ScaledObject status unknown" + else + hs.status = "Progressing" + hs.message = "ScaledObject status unknown" + end return hs resource.customizations.health.opentelemetry.io_OpenTelemetryCollector: | hs = {} diff --git a/sources/argocd/values_ha.yaml b/sources/argocd/values_ha.yaml index 7e9aab69..87d0a808 100644 --- a/sources/argocd/values_ha.yaml +++ b/sources/argocd/values_ha.yaml @@ -9,6 +9,79 @@ configs: hs.status = "Healthy" hs.message = "Always Healthy - See https://github.com/argoproj/argo-cd/pull/24284" return hs + resource.customizations.health.batch_Job: | + -- Custom health check for Jobs + hs = {} + if obj.status ~= nil then + if obj.status.conditions ~= nil then + for _, condition in ipairs(obj.status.conditions) do + if condition.type == "Complete" and condition.status == "True" then + hs.status = "Healthy" + hs.message = "Job completed successfully" + return 
hs + elseif condition.type == "Failed" and condition.status == "True" then + hs.status = "Degraded" + hs.message = "Job failed" + return hs + end + end + end + -- Check for active jobs + if obj.status.active and obj.status.active > 0 then + hs.status = "Progressing" + hs.message = "Job is running" + return hs + end + end + hs.status = "Progressing" + hs.message = "Job status unknown" + return hs + resource.customizations.health.apps_StatefulSet: | + -- Custom health check for OpenBao StatefulSet + -- Uses standard StatefulSet readiness for now - openbao-init job handles actual initialization + hs = {} + if obj.status ~= nil then + if obj.status.readyReplicas ~= nil and obj.status.replicas ~= nil then + if obj.status.readyReplicas == obj.status.replicas then + hs.status = "Healthy" + hs.message = "StatefulSet is ready" + else + hs.status = "Progressing" + hs.message = "Waiting for StatefulSet replicas to be ready" + end + else + hs.status = "Progressing" + hs.message = "Waiting for StatefulSet status" + end + else + hs.status = "Progressing" + hs.message = "Waiting for StatefulSet status" + end + return hs + resource.customizations.health.keda.sh_ScaledObject: | + hs = {} + if obj.status ~= nil then + if obj.status.conditions ~= nil then + for _, condition in ipairs(obj.status.conditions) do + if condition.type == "Ready" then + if condition.status == "True" then + hs.status = "Healthy" + hs.message = "ScaledObject is ready" + else + hs.status = "Degraded" + hs.message = condition.reason or "ScaledObject not ready" + end + return hs + end + end + end + hs.status = "Progressing" + hs.message = "ScaledObject status unknown" + else + hs.status = "Progressing" + hs.message = "ScaledObject status unknown" + end + return hs rbac: policy.csv: | g, argocd-users, role:admin From e28b7672cf3800e72b6c6dc7e4f105a3ceb008c8 Mon Sep 17 00:00:00 2001 From: Daniel Vaskivaara Date: Wed, 4 Mar 2026 12:34:25 +0200 Subject: [PATCH 112/115] revert value_ha.yaml (not in use) --- 
sources/argocd/values_ha.yaml | 73 ----------------------------------- 1 file changed, 73 deletions(-) diff --git a/sources/argocd/values_ha.yaml b/sources/argocd/values_ha.yaml index 87d0a808..7e9aab69 100644 --- a/sources/argocd/values_ha.yaml +++ b/sources/argocd/values_ha.yaml @@ -9,79 +9,6 @@ configs: hs.status = "Healthy" hs.message = "Always Healthy - See https://github.com/argoproj/argo-cd/pull/24284" return hs - resource.customizations.health.batch_Job: | - -- Custom health check for Jobs - hs = {} - if obj.status ~= nil then - if obj.status.conditions ~= nil then - for _, condition in ipairs(obj.status.conditions) do - if condition.type == "Complete" and condition.status == "True" then - hs.status = "Healthy" - hs.message = "Job completed successfully" - return hs - elseif condition.type == "Failed" and condition.status == "True" then - hs.status = "Degraded" - hs.message = "Job failed" - return hs - end - end - end - -- Check for active jobs - if obj.status.active and obj.status.active > 0 then - hs.status = "Progressing" - hs.message = "Job is running" - return hs - end - end - hs.status = "Progressing" - hs.message = "Job status unknown" - return hs - resource.customizations.health.apps_StatefulSet: | - -- Custom health check for OpenBao StatefulSet - -- Uses standard StatefulSet readiness for now - openbao-init job handles actual initialization - hs = {} - if obj.status ~= nil then - if obj.status.readyReplicas ~= nil and obj.status.replicas ~= nil then - if obj.status.readyReplicas == obj.status.replicas then - hs.status = "Healthy" - hs.message = "StatefulSet is ready" - else - hs.status = "Progressing" - hs.message = "Waiting for StatefulSet replicas to be ready" - end - else - hs.status = "Progressing" - hs.message = "Waiting for StatefulSet status" - end - else - hs.status = "Progressing" - hs.message = "Waiting for StatefulSet status" - end - return hs - resource.customizations.health.keda.sh_ScaledObject: | - hs = {} - if obj.status ~= nil then 
- if obj.status.conditions ~= nil then - for _, condition in ipairs(obj.status.conditions) do - if condition.type == "Ready" then - if condition.status == "True" then - hs.status = "Healthy" - hs.message = "ScaledObject is ready" - else - hs.status = "Degraded" - hs.message = condition.reason or "ScaledObject not ready" - end - return hs - end - end - end - hs.status = "Progressing" - hs.message = "ScaledObject status unknown" - else - hs.status = "Progressing" - hs.message = "ScaledObject status unknown" - end - return hs rbac: policy.csv: | g, argocd-users, role:admin From 7207dc75f14e8dd24dc8dc9495ce9367b1ab948f Mon Sep 17 00:00:00 2001 From: Daniel Vaskivaara Date: Wed, 4 Mar 2026 13:40:20 +0200 Subject: [PATCH 113/115] Update bootstrap_guide.md remove deprecated ENV var for AIRM IMAGE (implemented as --airm-image-repository --- docs/bootstrap_guide.md | 4 ---- 1 file changed, 4 deletions(-) diff --git a/docs/bootstrap_guide.md b/docs/bootstrap_guide.md index 67e4ce77..e6f3e523 100644 --- a/docs/bootstrap_guide.md +++ b/docs/bootstrap_guide.md @@ -32,10 +32,6 @@ This guide explains how to bootstrap a complete GitOps environment using Cluster - **--skip-deps**: Skip dependency checking (for advanced users) - **--help**, **-h**: Show usage information -### Environment Variables - -- **AIRM_IMAGE_REPOSITORY**: Optional base repository URL for AIRM container images. When set, overrides default AIRM image repositories in the cluster-values configuration. 
Example: `ghcr.io/mycompany` - ### Examples ```bash From 5eeba3281fdb8e9d17b731dbbb7552985299a07c Mon Sep 17 00:00:00 2001 From: Daniel Vaskivaara Date: Wed, 4 Mar 2026 14:07:24 +0200 Subject: [PATCH 114/115] docs: update v1.8.0 changes from recent updates --- PRD.md | 63 ++++++++++++++++++------------ docs/bootstrap_guide.md | 15 ++++--- docs/cluster_size_configuration.md | 16 ++++---- 3 files changed, 57 insertions(+), 37 deletions(-) diff --git a/PRD.md b/PRD.md index cd725ba2..9488a4a9 100644 --- a/PRD.md +++ b/PRD.md @@ -56,16 +56,18 @@ Three cluster profiles with inheritance-based resource optimization: **Small Clusters** (1-5 users, dev/test): - Single replica deployments (ArgoCD, Redis, etc.) -- Reduced resource limits (ArgoCD controller: 2 CPU, 4Gi RAM) +- Reduced resource limits (ArgoCD controller: 2 CPU, 2Gi RAM) - Adds kyverno-policies-storage-local-path for RWX→RWO PVC mutation -- MinIO tenant: 250Gi storage, single server +- MinIO tenant: 2Ti storage, single server +- Mix of local-path and direct storage classes - Suitable for: Local workstations, development environments **Medium Clusters** (5-20 users, team production): - Single replica with moderate resource allocation - Same storage policies as small (local-path support) -- ArgoCD controller: 2 CPU, 4Gi RAM -- Default configuration for balanced performance +- ArgoCD controller: 1 CPU, 2Gi RAM +- MinIO tenant: 2Ti storage +- Uses direct storage class consistently - Suitable for: Small teams, staging environments **Large Clusters** (10s-100s users, enterprise scale): @@ -73,6 +75,7 @@ Three cluster profiles with inheritance-based resource optimization: - No local-path policies (assumes distributed storage like Longhorn) - MinIO tenant: 500Gi storage - Production-grade resource allocation +- Uses direct storage class for all persistent volumes - Suitable for: Production deployments, multi-tenant environments Size configurations use YAML merge semantics where size-specific values override base 
values.yaml settings. @@ -80,15 +83,20 @@ Size configurations use YAML merge semantics where size-specific values override ### App-of-Apps Architecture Cluster-Forge root chart generates ArgoCD Application manifests from: -- `enabledApps[]` - List of applications to deploy +- `enabledApps[]` - List of applications to deploy (defined in size-specific values files) - `apps.` - Configuration for each application including: - `path` - Relative path in sources/ directory - `namespace` - Target Kubernetes namespace - - `syncWave` - Deployment order (-5 to 0) + - `syncWave` - Deployment order (-70 to 0) - `valuesObject` - Inline Helm values - `helmParameters` - Templated Helm parameters (e.g., domain injection) - `ignoreDifferences` - ArgoCD diff exclusions +**Size-Specific Application Sets:** +- **Small clusters**: 46 enabled applications including storage-local-path policies +- **Medium clusters**: 47 enabled applications including storage-local-path policies and openbao-init +- **Large clusters**: 45 enabled applications excluding storage-local-path policies + The cluster-forge Application uses multi-source feature when externalValues.enabled=true: - Source 1: cluster-forge repo (root/ helm chart) - Source 2: cluster-values repo (custom values.yaml) @@ -157,8 +165,8 @@ The cluster-forge Application uses multi-source feature when externalValues.enab - RabbitMQ v2.15.0 - Message broker for async processing **Layer 6: AIRM Application** (Sync Wave 0) -- AIRM 0.3.2 - AMD Resource Manager application suite -- Configurable image repositories for custom registries and air-gapped deployments +- AIRM 0.3.5 - AMD Resource Manager application suite +- Configurable image repositories for custom registries and air-gapped deployments via --airm-image-repository flag - AIM Cluster Model Source - Cluster resource models for AIRM ### Repository Structure @@ -209,22 +217,24 @@ The bootstrap.sh script orchestrates complete cluster setup with flexible option **Available Options:** - 
`--cluster-size=[small|medium|large]` - Cluster size configuration (default: medium) - `--apps=APP1,APP2` - Deploy only specified components - - Bootstrap apps: `namespaces`, `argocd`, `gitea`, `cluster-forge` - - Child apps: Any app from enabledApps list (e.g., `openbao`, `keycloak`, `keda`) + - Bootstrap apps: `namespaces`, `argocd`, `openbao`, `gitea`, `cluster-forge` + - Child apps: Any app from enabledApps list (e.g., `keycloak`, `keda`, `airm`) - `--target-revision=BRANCH` - cluster-forge git revision for ArgoCD (default: latest release tag) - `--template-only` or `-t` - Output YAML manifests instead of applying to cluster - `--skip-deps` - Skip dependency checking for advanced users +- `--airm-image-repository=REPO` - Custom AIRM container image repository for air-gapped deployments **Bootstrap Process:** -1. **Validation** - Checks domain, cluster size, values files, required tool availability +1. **Validation** - Checks domain, cluster size, values files, required tool availability (kubectl, helm, yq with version checking) 2. **Pre-cleanup** - Removes previous installations if gitea-init-job completed 3. **Values Merge** - Combines base + size-specific values with domain injection -4. **Namespace Creation** - Creates argocd, cf-gitea namespaces -5. **ArgoCD Deployment** - helm template + kubectl apply with server-side apply -6. **Gitea Deployment** - helm template + kubectl apply, waits for rollout -7. **Gitea Init Job** - Creates cluster-org, clones/pushes cluster-forge and cluster-values repos -8. **ClusterForge App** - Creates root Application that manages all remaining components via ArgoCD -9. **Component Deployment** - ArgoCD syncs all enabledApps including OpenBao, secrets, and application stack +4. **Namespace Creation** - Creates argocd, cf-gitea, openbao namespaces +5. **ArgoCD Deployment** - helm template + kubectl apply with server-side apply using --field-manager=argocd-controller +6. 
**OpenBao Bootstrap** - Separate bootstrap phase for secrets management foundation +7. **Gitea Deployment** - helm template + kubectl apply, waits for rollout +8. **Gitea Init Job** - Creates cluster-org, clones/pushes cluster-forge and cluster-values repos with AIRM image repository support +9. **ClusterForge App** - Creates root Application that manages all remaining components via ArgoCD +10. **Component Deployment** - ArgoCD syncs all enabledApps including secrets and application stack ### Selective Component Deployment @@ -250,6 +260,9 @@ The `--apps` flag enables targeted deployment for development and troubleshootin # Render manifests for debugging ./scripts/bootstrap.sh example.com --apps=keycloak --template-only + +# Deploy with custom AIRM image repository for air-gapped environments +./scripts/bootstrap.sh example.com --airm-image-repository=registry.internal.com/airm ``` ### Self-Contained GitOps @@ -461,11 +474,13 @@ Kueue manages scheduling for: - Production should use Cert-Manager with ACME **Required Tools:** -- yq v4+ (YAML processor) -- helm 3.0+ -- kubectl +- yq v4+ (YAML processor) with automatic version checking +- helm 3.0+ with automatic version checking +- kubectl with automatic version checking - openssl (for password generation) +Bootstrap script provides comprehensive dependency validation with platform-specific installation instructions for missing tools. 
+ ### Resource Requirements **Small Cluster:** @@ -486,8 +501,8 @@ Kueue manages scheduling for: ### Functional Requirements **FR1: AIRM Platform Delivery** -- Deploy AMD Resource Manager (AIRM) 0.3.2 with UI and API -- Support configurable image repositories via `airmImageRepository` bootstrap parameter +- Deploy AMD Resource Manager (AIRM) 0.3.5 with UI and API +- Support configurable image repositories via `--airm-image-repository` bootstrap parameter - Provide model serving with KServe v0.16.0 - Support distributed computing via KubeRay Operator 1.4.2 - Enable workflow orchestration through Kaiwo v0.2.0-rc11 @@ -613,7 +628,7 @@ ClusterForge includes comprehensive SBOM tooling in `/sbom`: ## Version Information -**Current Release:** v1.8.0-rc2 +**Current Release:** v1.8.0 **Key Component Versions:** - ArgoCD: 8.3.5 @@ -622,7 +637,7 @@ ClusterForge includes comprehensive SBOM tooling in `/sbom`: - Keycloak: keycloak-old chart - KServe: v0.16.0 - Kaiwo: v0.2.0-rc11 -- AIRM: 0.3.2 +- AIRM: 0.3.5 - Kueue: 0.13.0 - AMD GPU Operator: v1.4.1 - OTEL-LGTM Stack: v1.0.7 diff --git a/docs/bootstrap_guide.md b/docs/bootstrap_guide.md index e6f3e523..287c3973 100644 --- a/docs/bootstrap_guide.md +++ b/docs/bootstrap_guide.md @@ -1,6 +1,6 @@ # Bootstrap Guide -This guide explains how to bootstrap a complete GitOps environment using Cluster-Forge's three-phase deployment model. The bootstrap process establishes ArgoCD and Gitea (Git repository) as foundation components, then creates the cluster-forge Application which manages all remaining components including OpenBao and the full application stack via ArgoCD. +This guide explains how to bootstrap a complete GitOps environment using Cluster-Forge's five-step deployment model. The bootstrap process establishes ArgoCD, OpenBao, and Gitea as foundation components, then creates the cluster-forge Application which manages all remaining components via ArgoCD. 
## Prerequisites @@ -24,12 +24,14 @@ This guide explains how to bootstrap a complete GitOps environment using Cluster ### Options - **--apps=APP1,APP2**: Deploy only specified components (default: applies to cluster) - - options: `namespaces`, `argocd`, `gitea`, `cluster-forge`, or any cluster-forge child app (see values.yaml for app names) + - Bootstrap apps: `namespaces`, `argocd`, `openbao`, `gitea`, `cluster-forge` + - Child apps: Any app from enabledApps list (see values_{size}.yaml for app names) - Use with `--template-only` to render instead of applying - **--cluster-size** `[small|medium|large]`: Cluster size configuration (default: `medium`) - **--template-only**, **-t**: Output YAML manifests to stdout instead of applying to cluster - **--target-revision**, **-r**: cluster-forge git revision for ArgoCD to sync from - **--skip-deps**: Skip dependency checking (for advanced users) +- **--airm-image-repository=REPO**: Custom AIRM container image repository for air-gapped deployments - **--help**, **-h**: Show usage information ### Examples @@ -52,15 +54,15 @@ This guide explains how to bootstrap a complete GitOps environment using Cluster ./scripts/bootstrap.sh example.com --CLUSTER_SIZE=large # Custom AIRM image repository -AIRM_IMAGE_REPOSITORY=ghcr.io/mycompany ./scripts/bootstrap.sh example.com +./scripts/bootstrap.sh example.com --airm-image-repository=ghcr.io/mycompany # Air-gapped deployment with local registry -AIRM_IMAGE_REPOSITORY=harbor.internal.com/airm ./scripts/bootstrap.sh 192.168.1.100.nip.io --CLUSTER_SIZE=small +./scripts/bootstrap.sh 192.168.1.100.nip.io --cluster-size=small --airm-image-repository=harbor.internal.com/airm ``` ## How It Works -The bootstrap script uses a three-phase deployment model: +The bootstrap script uses a five-step deployment model: ### Phase 1: Pre-Cleanup - The pre_cleanup function performs selective cleanup, only affects cf-gitea and cf-openbao namespaces @@ -78,9 +80,10 @@ The bootstrap script uses a three-phase 
deployment model: - Sets `global.domain` and `global.clusterSize` in merged configuration **2. Namespace Creation** -Creates two namespaces for bootstrap components: +Creates three namespaces for bootstrap components: - `argocd` - GitOps controller - `cf-gitea` - Git repository server +- `cf-openbao` - Secrets management system **3. ArgoCD Bootstrap** - Extracts ArgoCD values from merged configuration diff --git a/docs/cluster_size_configuration.md b/docs/cluster_size_configuration.md index fb652774..11da7231 100644 --- a/docs/cluster_size_configuration.md +++ b/docs/cluster_size_configuration.md @@ -35,14 +35,15 @@ cluster-forge/ - **CPU**: 8-32 vCPU total - **Memory**: 32-128 GB RAM total - **GPU**: 1-4 GPUs (optional) -- **Storage**: 250Gi+ total, local-path StorageClass +- **Storage**: 2Ti+ total, local-path StorageClass - **Networking**: 1 GbE acceptable **Application Configuration**: -- **ArgoCD**: Single replica, 2 CPU / 4Gi RAM limits -- **MinIO Tenant**: 250Gi storage, single server +- **ArgoCD**: Single replica, 2 CPU / 2Gi RAM limits +- **MinIO Tenant**: 2Ti storage, single server - **OpenBao**: Single instance (no HA), 5Gi storage - **Storage Policies**: Includes `kyverno-policies-storage-local-path` for RWX→RWO conversion +- **Storage Classes**: Mix of local-path and direct storage classes - **Component Replicas**: All single replica deployments **Use Cases**: Development, testing, proof-of-concept, local workstations @@ -59,10 +60,11 @@ cluster-forge/ - **Networking**: 10 GbE recommended **Application Configuration**: -- **ArgoCD**: Single replica, 2 CPU / 4Gi RAM limits -- **MinIO Tenant**: 250Gi storage, single server +- **ArgoCD**: Single replica, 1 CPU / 2Gi RAM limits +- **MinIO Tenant**: 2Ti storage, single server - **OpenBao**: Single instance (no HA), 5Gi storage - **Storage Policies**: Includes `kyverno-policies-storage-local-path` for RWX→RWO conversion +- **Storage Classes**: Direct storage class consistently - **Component Replicas**: 
Balanced single replica configuration **Use Cases**: Team production workloads, staging environments, CI/CD @@ -151,8 +153,8 @@ Later values override earlier ones, allowing size files to contain only the diff | Size | Servers | Storage | Buckets | Notes | |------|---------|---------|---------|-------| -| Small | 1 | 250Gi | default-bucket, models | Single server, local-path storage | -| Medium | 1 | 250Gi | default-bucket, models | Single server, local-path or distributed | +| Small | 1 | 2Ti | default-bucket, models | Single server, local-path storage | +| Medium | 1 | 2Ti | default-bucket, models | Single server, direct storage | | Large | 1 | 500Gi | default-bucket, models | Single server, external HA S3 recommended | ### OpenBao Scaling From 30a7d2c75db0b417903d7b8bf01508cf89aaf2e9 Mon Sep 17 00:00:00 2001 From: Daniel Vaskivaara Date: Wed, 4 Mar 2026 14:24:03 +0200 Subject: [PATCH 115/115] fix(bootstrap.sh): --template-only flag to remove non-template output --- scripts/bootstrap.sh | 144 +++++++++++++++++++++++-------------------- 1 file changed, 76 insertions(+), 68 deletions(-) diff --git a/scripts/bootstrap.sh b/scripts/bootstrap.sh index 54570e47..c2b76d15 100755 --- a/scripts/bootstrap.sh +++ b/scripts/bootstrap.sh @@ -16,6 +16,18 @@ TARGET_REVISION="$LATEST_RELEASE" TEMPLATE_ONLY=false VALUES_FILE="values.yaml" +# Helper function to print messages only when not in template mode +log_info() { + if [ "$TEMPLATE_ONLY" = false ]; then + echo "$@" + fi +} + +# Generate a secure random password +generate_password() { + openssl rand -hex 16 +} + # Check for required dependencies check_dependencies() { local silent="${1:-false}" @@ -301,11 +313,11 @@ setup_values_files() { SIZE_VALUES_FILE="values_${CLUSTER_SIZE}.yaml" if [ ! 
-f "${SOURCE_ROOT}/root/${SIZE_VALUES_FILE}" ]; then - echo "WARNING: Size-specific values file not found: ${SOURCE_ROOT}/root/${SIZE_VALUES_FILE}" - echo "Proceeding with base values file only: ${VALUES_FILE}" + log_info "WARNING: Size-specific values file not found: ${SOURCE_ROOT}/root/${SIZE_VALUES_FILE}" + log_info "Proceeding with base values file only: ${VALUES_FILE}" SIZE_VALUES_FILE="" else - echo "Using size-specific values file: ${SIZE_VALUES_FILE}" + log_info "Using size-specific values file: ${SIZE_VALUES_FILE}" fi } @@ -369,7 +381,7 @@ EOF # ArgoCD bootstrap bootstrap_argocd() { - echo "=== ArgoCD Bootstrap ===" + log_info "=== ArgoCD Bootstrap ===" extract_argocd_values helm template --release-name argocd ${SOURCE_ROOT}/sources/argocd/8.3.5 --namespace argocd \ --values /tmp/argocd_bootstrap_values.yaml \ @@ -385,41 +397,40 @@ bootstrap_argocd() { bootstrap_openbao() { - echo "=== OpenBao Bootstrap ===" + log_info "=== OpenBao Bootstrap ===" - # Debug output for troubleshooting - echo "Debug: SOURCE_ROOT='${SOURCE_ROOT}'" - echo "Debug: VALUES_FILE='${VALUES_FILE}'" - echo "Debug: SIZE_VALUES_FILE='${SIZE_VALUES_FILE}'" - echo "Debug: CLUSTER_SIZE='${CLUSTER_SIZE}'" + log_info "Debug: SOURCE_ROOT='${SOURCE_ROOT}'" + log_info "Debug: VALUES_FILE='${VALUES_FILE}'" + log_info "Debug: SIZE_VALUES_FILE='${SIZE_VALUES_FILE}'" + log_info "Debug: CLUSTER_SIZE='${CLUSTER_SIZE}'" # Get OpenBao version from app path - using same method as main OPENBAO_VERSION=$(yq eval '.apps.openbao.path' "${SOURCE_ROOT}/root/${VALUES_FILE}" | cut -d'/' -f2) - echo "OpenBao version: $OPENBAO_VERSION" - - # Create a unique temp directory to avoid permission issues - TEMP_DIR=$(mktemp -d -t cf-bootstrap.XXXXXX) || { echo "ERROR: Cannot create temp directory"; exit 1; } - echo "Using temp directory: $TEMP_DIR" - - # Extract OpenBao values from merged config - matching main approach - echo "Extracting OpenBao values..." 
+ log_info "OpenBao version: $OPENBAO_VERSION" + + # Create a temporary directory for processing OpenBao values + TEMP_DIR=$(mktemp -d -t cf-bootstrap.XXXXXX) || { log_info "ERROR: Cannot create temp directory"; exit 1; } + log_info "Using temp directory: $TEMP_DIR" + + # Extract OpenBao values from base configuration + log_info "Extracting OpenBao values..." yq eval '.apps.openbao.valuesObject' "${SOURCE_ROOT}/root/${VALUES_FILE}" > "${TEMP_DIR}/openbao_values.yaml" || { echo "ERROR: Failed to extract OpenBao values from ${VALUES_FILE}"; exit 1; } if [ -n "${SIZE_VALUES_FILE}" ] && [ -f "${SOURCE_ROOT}/root/${SIZE_VALUES_FILE}" ]; then - echo "Extracting OpenBao size-specific values from ${SIZE_VALUES_FILE}..." - echo "Checking if openbao section exists in size values file..." - if yq eval '.apps.openbao' "${SOURCE_ROOT}/root/${SIZE_VALUES_FILE}" >/dev/null 2>&1 && [ "$(yq eval '.apps.openbao' "${SOURCE_ROOT}/root/${SIZE_VALUES_FILE}")" != "null" ]; then - echo "OpenBao section found, extracting values..." - yq eval '.apps.openbao.valuesObject' "${SOURCE_ROOT}/root/${SIZE_VALUES_FILE}" > "${TEMP_DIR}/openbao_size_values.yaml" || { - echo "WARNING: Failed to extract OpenBao valuesObject from ${SIZE_VALUES_FILE}, using empty values" + log_info "Extracting OpenBao size-specific values from ${SIZE_VALUES_FILE}..." + log_info "Checking if openbao section exists in size values file..." + if yq eval 'has("apps") and .apps.openbao' "${SOURCE_ROOT}/root/${SIZE_VALUES_FILE}" | grep -q true; then + log_info "OpenBao section found, extracting values..." + if ! yq eval '.apps.openbao.valuesObject' "${SOURCE_ROOT}/root/${SIZE_VALUES_FILE}" > "${TEMP_DIR}/openbao_size_values.yaml"; then + log_info "WARNING: Failed to extract OpenBao valuesObject from ${SIZE_VALUES_FILE}, using empty values" printf "# OpenBao valuesObject not found in size file\n" > "${TEMP_DIR}/openbao_size_values.yaml" - } + fi else - echo "No OpenBao section in size values file, creating empty placeholder..." 
+ log_info "No OpenBao section in size values file, creating empty placeholder..." printf "# No OpenBao section in size-specific values\n" > "${TEMP_DIR}/openbao_size_values.yaml" fi else - echo "No size-specific values file, creating empty placeholder..." + log_info "No size-specific values file, creating empty placeholder..." printf "# No size-specific values\n" > "${TEMP_DIR}/openbao_size_values.yaml" fi @@ -434,7 +445,7 @@ bootstrap_openbao() { kubectl wait --for=jsonpath='{.status.phase}'=Running pod/openbao-0 -n cf-openbao --timeout=100s # Create initial secrets config for init job (separate from ArgoCD-managed version) - echo "Creating initial OpenBao secrets configuration..." + log_info "Creating initial OpenBao secrets configuration..." cat ${SOURCE_ROOT}/sources/openbao-config/0.1.0/templates/openbao-secret-manager-cm.yaml | \ sed "s|name: openbao-secret-manager-scripts|name: openbao-secret-manager-scripts-init|g" | kubectl apply -f - @@ -458,24 +469,21 @@ bootstrap_openbao() { bootstrap_gitea() { - echo "=== Gitea Bootstrap ===" - generate_password() { - openssl rand -hex 16 | tr 'a-f' 'A-F' | head -c 32 - } + log_info "=== Gitea Bootstrap ===" - # Debug output for troubleshooting - echo "Debug: SOURCE_ROOT='${SOURCE_ROOT}'" - echo "Debug: VALUES_FILE='${VALUES_FILE}'" - echo "Debug: SIZE_VALUES_FILE='${SIZE_VALUES_FILE}'" - echo "Debug: CLUSTER_SIZE='${CLUSTER_SIZE}'" + # Print debug information + log_info "Debug: SOURCE_ROOT='${SOURCE_ROOT}'" + log_info "Debug: VALUES_FILE='${VALUES_FILE}'" + log_info "Debug: SIZE_VALUES_FILE='${SIZE_VALUES_FILE}'" + log_info "Debug: CLUSTER_SIZE='${CLUSTER_SIZE}'" # Get Gitea version from app path - matching main approach GITEA_VERSION=$(yq eval '.apps.gitea.path' "${SOURCE_ROOT}/root/${VALUES_FILE}" | cut -d'/' -f2) - echo "Gitea version: $GITEA_VERSION" - - # Create a unique temp directory to avoid permission issues - TEMP_DIR=$(mktemp -d -t cf-gitea-bootstrap.XXXXXX) || { echo "ERROR: Cannot create temp directory"; 
exit 1; } - echo "Using temp directory: $TEMP_DIR" + log_info "Gitea version: $GITEA_VERSION" + + # Create a temporary directory for processing Gitea values + TEMP_DIR=$(mktemp -d -t cf-gitea-bootstrap.XXXXXX) || { log_info "ERROR: Cannot create temp directory"; exit 1; } + log_info "Using temp directory: $TEMP_DIR" # Create initial-cf-values configmap (complete values for gitea-init-job) # Use the complete root values.yaml with filled placeholders instead of simplified version @@ -505,24 +513,24 @@ bootstrap_gitea() { --dry-run=client -o yaml | apply_or_template -f - # Extract Gitea values like main does - echo "Extracting Gitea values..." - yq eval '.apps.gitea.valuesObject' "${SOURCE_ROOT}/root/${VALUES_FILE}" > "${TEMP_DIR}/gitea_values.yaml" || { echo "ERROR: Failed to extract Gitea values from ${VALUES_FILE}"; exit 1; } + log_info "Extracting Gitea values..." + yq eval '.apps.gitea.valuesObject' "${SOURCE_ROOT}/root/${VALUES_FILE}" > "${TEMP_DIR}/gitea_values.yaml" || { log_info "ERROR: Failed to extract Gitea values from ${VALUES_FILE}"; exit 1; } if [ -n "${SIZE_VALUES_FILE}" ] && [ -f "${SOURCE_ROOT}/root/${SIZE_VALUES_FILE}" ]; then - echo "Extracting Gitea size-specific values from ${SIZE_VALUES_FILE}..." - echo "Checking if gitea section exists in size values file..." + log_info "Extracting Gitea size-specific values from ${SIZE_VALUES_FILE}..." + log_info "Checking if gitea section exists in size values file..." if yq eval '.apps.gitea' "${SOURCE_ROOT}/root/${SIZE_VALUES_FILE}" >/dev/null 2>&1 && [ "$(yq eval '.apps.gitea' "${SOURCE_ROOT}/root/${SIZE_VALUES_FILE}")" != "null" ]; then - echo "Gitea section found, extracting values..." + log_info "Gitea section found, extracting values..." 
yq eval '.apps.gitea.valuesObject' "${SOURCE_ROOT}/root/${SIZE_VALUES_FILE}" > "${TEMP_DIR}/gitea_size_values.yaml" || { - echo "WARNING: Failed to extract Gitea valuesObject from ${SIZE_VALUES_FILE}, using empty values" + log_info "WARNING: Failed to extract Gitea valuesObject from ${SIZE_VALUES_FILE}, using empty values" printf "# Gitea valuesObject not found in size file\n" > "${TEMP_DIR}/gitea_size_values.yaml" } else - echo "No Gitea section in size values file, creating empty placeholder..." + log_info "No Gitea section in size values file, creating empty placeholder..." printf "# No Gitea section in size-specific values\n" > "${TEMP_DIR}/gitea_size_values.yaml" fi else - echo "No size-specific values file, creating empty placeholder..." + log_info "No size-specific values file, creating empty placeholder..." printf "# No size-specific values\n" > "${TEMP_DIR}/gitea_size_values.yaml" fi @@ -606,8 +614,8 @@ EOF apply_cluster_forge_parent_app() { # Create cluster-forge parent app only (not all apps) - echo "=== Creating ClusterForge Parent App ===" - echo "Target revision: $TARGET_REVISION" + log_info "=== Creating ClusterForge Parent App ===" + log_info "Target revision: $TARGET_REVISION" @@ -633,15 +641,15 @@ is_cluster_forge_child_app() { main() { parse_args "$@" - # Use silent dependency check when using --apps for cleaner output - if [ -z "$APPS" ]; then + # Use silent dependency check when using --apps or template mode for cleaner output + if [ -z "$APPS" ] && [ "$TEMPLATE_ONLY" = false ]; then if [ "$SKIP_DEPENDENCY_CHECK" = false ]; then check_dependencies fi validate_args print_summary else - # For --apps mode, check deps silently and skip verbose output + # For --apps mode or template mode, check deps silently and skip verbose output if [ "$SKIP_DEPENDENCY_CHECK" = false ]; then check_dependencies true fi @@ -695,45 +703,45 @@ main() { fi else # Default behavior - run all bootstrap components - echo "🚀 Running full bootstrap sequence..." 
- echo "📋 Bootstrap order: namespaces → argocd → openbao → gitea → cluster-forge" + log_info "🚀 Running full bootstrap sequence..." + log_info "📋 Bootstrap order: namespaces → argocd → openbao → gitea → cluster-forge" if should_run namespaces; then - echo "📦 Step 1/5: Creating namespaces" + log_info "📦 Step 1/5: Creating namespaces" create_namespaces else - echo "⏭️ Step 1/5: Skipping namespaces" + log_info "⏭️ Step 1/5: Skipping namespaces" fi if should_run argocd; then - echo "📦 Step 2/5: Bootstrapping ArgoCD" + log_info "📦 Step 2/5: Bootstrapping ArgoCD" bootstrap_argocd else - echo "⏭️ Step 2/5: Skipping ArgoCD" + log_info "⏭️ Step 2/5: Skipping ArgoCD" fi if should_run openbao; then - echo "📦 Step 3/5: Bootstrapping OpenBao" + log_info "📦 Step 3/5: Bootstrapping OpenBao" bootstrap_openbao else - echo "⏭️ Step 3/5: Skipping OpenBao" + log_info "⏭️ Step 3/5: Skipping OpenBao" fi if should_run gitea; then - echo "📦 Step 4/5: Bootstrapping Gitea" + log_info "📦 Step 4/5: Bootstrapping Gitea" bootstrap_gitea else - echo "⏭️ Step 4/5: Skipping Gitea" + log_info "⏭️ Step 4/5: Skipping Gitea" fi if should_run cluster-forge; then - echo "📦 Step 5/5: Creating ClusterForge parent app" + log_info "📦 Step 5/5: Creating ClusterForge parent app" apply_cluster_forge_parent_app else - echo "⏭️ Step 5/5: Skipping ClusterForge" + log_info "⏭️ Step 5/5: Skipping ClusterForge" fi - echo "✅ Bootstrap sequence completed" + log_info "✅ Bootstrap sequence completed" fi }