diff --git a/.github/workflows/release-pipeline.yaml b/.github/workflows/release-pipeline.yaml index 5ba3500f..46773285 100644 --- a/.github/workflows/release-pipeline.yaml +++ b/.github/workflows/release-pipeline.yaml @@ -42,27 +42,31 @@ jobs: fi echo "next=$VERSION" >> $GITHUB_OUTPUT - - name: Update helm values file - uses: mikefarah/yq@master + - name: Validate LATEST_RELEASE matches release version env: - GIT_TAG: ${{ steps.semver.outputs.next }} - with: - cmd: | - yq -i '.clusterForge.targetRevision = env(GIT_TAG)' root/values.yaml - yq -i '.targetRevision = env(GIT_TAG)' scripts/init-gitea-job/values.yaml - - - name: Commit and push changes - uses: stefanzweifel/git-auto-commit-action@v4 - env: - GIT_TAG: ${{ steps.semver.outputs.next }} - with: - commit_message: 'Update version to ${{ env.GIT_TAG }} [actions skip]' + VERSION: ${{ steps.semver.outputs.next }} + run: | + # Extract LATEST_RELEASE from bootstrap.sh + LATEST_RELEASE=$(grep '^LATEST_RELEASE=' scripts/bootstrap.sh | cut -d'"' -f2 | sed 's/^v//') + + # Extract base version (before -rc or -alpha, etc.) 
+ RELEASE_BASE=$(echo "$VERSION" | sed 's/^v//' | sed 's/-rc[0-9]*$//' | sed 's/-alpha[0-9]*$//' | sed 's/-beta[0-9]*$//') + LATEST_BASE=$(echo "$LATEST_RELEASE" | sed 's/-rc[0-9]*$//' | sed 's/-alpha[0-9]*$//' | sed 's/-beta[0-9]*$//') + + echo "Release version: $VERSION (base: $RELEASE_BASE)" + echo "LATEST_RELEASE in bootstrap.sh: $LATEST_RELEASE (base: $LATEST_BASE)" + + if [[ "$RELEASE_BASE" != "$LATEST_BASE" ]]; then + echo "::warning::LATEST_RELEASE base version ($LATEST_BASE) in scripts/bootstrap.sh does not match release version base ($RELEASE_BASE)" + echo "::warning::Consider updating LATEST_RELEASE in scripts/bootstrap.sh to match the release being created" + else + echo "✓ LATEST_RELEASE base version matches release version base" + fi - name: Create GitHub Release env: GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} VERSION: ${{ steps.semver.outputs.next }} - EXTRA_ARGS: ${{ steps.version.outputs.extra_args }} run: | # Prepare release artifact tar -zcvf "release-enterprise-ai-${VERSION}.tar.gz" --transform 's,^,cluster-forge/,' root/ scripts/ sources @@ -134,4 +138,4 @@ jobs: GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} SBOM_NAME: ${{ steps.generate_sbom.outputs.sbom_name }} run: | - gh release upload ${VERSION} ${SBOM_NAME} --clobber + gh release upload ${VERSION} ${SBOM_NAME} --clobber \ No newline at end of file diff --git a/PRD.md b/PRD.md index 98777449..9488a4a9 100644 --- a/PRD.md +++ b/PRD.md @@ -2,168 +2,654 @@ ## Executive Summary -**Cluster-Forge** is a Kubernetes platform automation tool designed to bundle various third-party, community, -and in-house components into a single, streamlined stack that can be deployed in Kubernetes clusters. -By automating the deployment process, Cluster-Forge simplifies the creation of consistent, ready-to-use clusters -with all essential services pre-configured and integrated. 
+**Cluster-Forge** is a Kubernetes platform automation tool that bundles third-party, community, and in-house components into a single, GitOps-managed stack deployable in Kubernetes clusters. It automates the deployment of a complete AI/ML compute platform built on AMD Enterprise AI Suite components, delivering consistent, production-ready clusters with all essential services pre-configured and integrated. + +The platform uses ArgoCD's app-of-apps pattern with a sophisticated bootstrap process that establishes GitOps infrastructure (ArgoCD, Gitea, OpenBao) before deploying the complete application stack. ## Target Users -- **Infrastructure Engineers** -- **Platform Engineers** -- **DevOps Engineers** -- **Cloud Native Engineers** -- **Site Reliability Engineers** -- **AI/ML Engineers** +- **AI/ML Engineers** - Unified platform for model training, serving, and orchestration +- **Platform Engineers** - Infrastructure automation with GitOps patterns +- **DevOps Engineers** - Consistent deployment across environments +- **Infrastructure Engineers** - Multi-cluster management and operations +- **Site Reliability Engineers** - Observability and reliability tooling +- **Research Teams** - Ephemeral test clusters for experimentation ## Product Architecture +### Bootstrap-First Deployment Model + +Cluster-Forge uses a three-phase bootstrap process that establishes GitOps infrastructure before deploying applications: + +**Phase 1: Pre-Cleanup** +- Detects and removes previous installations when gitea-init-job completed successfully +- Deletes Gitea resources, OpenBao init jobs, and temporary files +- Ensures clean state for fresh deployments + +**Phase 2: GitOps Foundation Bootstrap** (Manual Helm Templates) +1. **ArgoCD** (v8.3.5) - GitOps controller deployed via helm template + kubectl apply +2. 
**Gitea** (v12.3.0) - Git server with init job to create cluster-forge and cluster-values repositories + +**Phase 3: App-of-Apps Deployment** (ArgoCD-Managed) +- Creates cluster-forge Application pointing to root/ helm chart +- ArgoCD syncs all remaining applications including OpenBao from enabledApps list +- Applications deployed in wave order (-70 to 0) based on dependencies +- OpenBao (v0.18.2) deployed via ArgoCD with openbao-init job for vault configuration + ### Dual Repository GitOps Pattern -Cluster-Forge implements a sophisticated GitOps deployment pattern supporting both external GitHub deployment and local cluster-native deployment: +Cluster-Forge supports flexible GitOps repository configurations: -- **External Mode** (`values.yaml`): Traditional GitOps with GitHub dependency -- **Local Mode** (`values_cf.yaml`): Self-contained GitOps with local Gitea and separate configuration repository +**Local Mode (Default)** - Self-contained cluster-native GitOps: +- `clusterForge.repoUrl`: Points to local Gitea (http://gitea-http.cf-gitea.svc:3000/cluster-org/cluster-forge.git) +- `externalValues.enabled: true`: Separate cluster-values repository for configuration +- Initialization handled by gitea-init-job which clones and pushes repositories from initial-cf-values ConfigMap +- Zero external dependencies once bootstrapped -See [Values Inheritance Pattern](docs/values_inheritance_pattern.md) for detailed documentation. +**External Mode** - Traditional GitHub-based GitOps: +- Set `clusterForge.repoUrl` to external GitHub repository +- Supports custom branch selection for testing and development ### Size-Aware Configuration -Cluster-Forge provides three pre-configured cluster profiles with streamlined inheritance: +Three cluster profiles with inheritance-based resource optimization: + +**Small Clusters** (1-5 users, dev/test): +- Single replica deployments (ArgoCD, Redis, etc.) 
+- Reduced resource limits (ArgoCD controller: 2 CPU, 2Gi RAM) +- Adds kyverno-policies-storage-local-path for RWX→RWO PVC mutation +- MinIO tenant: 2Ti storage, single server +- Mix of local-path and direct storage classes +- Suitable for: Local workstations, development environments + +**Medium Clusters** (5-20 users, team production): +- Single replica with moderate resource allocation +- Same storage policies as small (local-path support) +- ArgoCD controller: 1 CPU, 2Gi RAM +- MinIO tenant: 2Ti storage +- Uses direct storage class consistently +- Suitable for: Small teams, staging environments + +**Large Clusters** (10s-100s users, enterprise scale): +- OpenBao HA: 3 replicas with Raft consensus +- No local-path policies (assumes distributed storage like Longhorn) +- MinIO tenant: 500Gi storage +- Production-grade resource allocation +- Uses direct storage class for all persistent volumes +- Suitable for: Production deployments, multi-tenant environments + +Size configurations use YAML merge semantics where size-specific values override base values.yaml settings. 
+ +### App-of-Apps Architecture + +Cluster-Forge root chart generates ArgoCD Application manifests from: +- `enabledApps[]` - List of applications to deploy (defined in size-specific values files) +- `apps.` - Configuration for each application including: + - `path` - Relative path in sources/ directory + - `namespace` - Target Kubernetes namespace + - `syncWave` - Deployment order (-70 to 0) + - `valuesObject` - Inline Helm values + - `helmParameters` - Templated Helm parameters (e.g., domain injection) + - `ignoreDifferences` - ArgoCD diff exclusions + +**Size-Specific Application Sets:** +- **Small clusters**: 46 enabled applications including storage-local-path policies +- **Medium clusters**: 47 enabled applications including storage-local-path policies and openbao-init +- **Large clusters**: 45 enabled applications excluding storage-local-path policies + +The cluster-forge Application uses multi-source feature when externalValues.enabled=true: +- Source 1: cluster-forge repo (root/ helm chart) +- Source 2: cluster-values repo (custom values.yaml) +- Merges: base values.yaml + size values + external cluster-values/values.yaml + +### Component Categories + +**Layer 1: GitOps Foundation** (Bootstrap + Sync Wave -70 to -30) +- ArgoCD 8.3.5 - GitOps continuous deployment controller (bootstrap) +- Gitea 12.3.0 - Self-hosted Git server with SQLite backend (bootstrap) +- OpenBao 0.18.2 - Vault-compatible secrets management (ArgoCD-managed, sync wave -70) +- External Secrets 0.15.1 - Secrets synchronization operator (sync wave -40) + +**Layer 2: Core Infrastructure** (Sync Wave -5 to -2) + +*Networking:* +- Gateway API v1.3.0 - Kubernetes standard ingress API +- KGateway v2.1.0-main - Gateway API implementation with custom WebSocket support +- MetalLB v0.15.2 - Bare metal load balancer +- Cert-Manager v1.18.2 - Automated TLS certificate management + +*Policy & Security:* +- Kyverno 3.5.1 - Policy engine for admission control +- Kyverno Config - OIDC integration, 
policy configurations +- Kyverno Policies Base - Core security policies +- Kyverno Policies Storage-Local-Path - Access mode mutation (small/medium only) +- Cluster-Auth 0.5.0 - Kubernetes RBAC integration + +*Storage & Database:* +- CNPG Operator 0.26.0 - CloudNativePG PostgreSQL operator +- MinIO Operator 7.1.1 - S3-compatible object storage operator +- MinIO Tenant 7.1.1 - Tenant deployment with default-bucket and models buckets + +**Layer 3: Observability** (Sync Wave -5 to -2) +- Prometheus Operator CRDs 23.0.0 - Metrics infrastructure +- OpenTelemetry Operator 0.93.1 - Telemetry collection with contrib collector +- OTEL-LGTM Stack v1.0.7 - Integrated observability (Loki, Grafana, Tempo, Mimir) + - Storage: 50Gi each for tempo/loki/mimir, 10Gi grafana + - Metrics collector: 8Gi RAM, 2 CPU + - Logs collector daemonset: 2Gi RAM, 1 CPU + +**Layer 4: Identity & Access** (Sync Wave -1 to 0) +- Keycloak (keycloak-old chart) - Enterprise IAM with AIRM realm + - Custom extensions via init containers (SilogenExtensionPackage.jar) + - Realm import with domain-group-authenticator + - Client secrets for: AIRM, K8s, MinIO, Gitea, ArgoCD + +**Layer 5: AI/ML Compute Stack** (Sync Wave -3 to 0) + +*GPU & Scheduling:* +- AMD GPU Operator v1.4.1 - GPU device plugin and drivers +- KubeRay Operator 1.4.2 - Ray distributed computing framework +- Kueue 0.13.0 - Job queueing with multi-framework support + - Integrations: batch/job, Ray, MPIJob, PyTorchJob, TensorFlow, Jobset, AppWrapper, Pod, Deployment +- AppWrapper v1.1.2 - Application-level resource scheduling +- KEDA 2.18.1 - Event-driven autoscaling +- Kedify-OTEL v0.0.6 - KEDA telemetry integration + +*ML Serving & Inference:* +- KServe v0.16.0 - Model serving platform (Standard deployment mode) +- KServe CRDs v0.16.0 - Model serving custom resources + +*Workflow & Messaging:* +- Kaiwo v0.2.0-rc11 - AI workload orchestration +- Kaiwo CRDs v0.2.0-rc11 - Workflow custom resources +- RabbitMQ v2.15.0 - Message broker for async 
processing + +**Layer 6: AIRM Application** (Sync Wave 0) +- AIRM 0.3.5 - AMD Resource Manager application suite +- Configurable image repositories for custom registries and air-gapped deployments via --airm-image-repository flag +- AIM Cluster Model Source - Cluster resource models for AIRM + +### Repository Structure + +``` +cluster-forge/ +├── scripts/ +│ ├── bootstrap.sh # Main bootstrap orchestration +│ ├── init-gitea-job/ # Helm chart for Gitea initialization +│ ├── init-openbao-job/ # Helm chart for OpenBao initialization +│ └── utils/ # Backup/restore utilities +│ ├── export_databases.sh +│ ├── export_rabbitmq.sh +│ ├── import_databases.sh +│ ├── import_rabbitmq.sh +│ └── mirror_minio.sh +├── root/ +│ ├── Chart.yaml # ClusterForge root helm chart metadata +│ ├── values.yaml # Base configuration +│ ├── values_small.yaml # Small cluster overrides +│ ├── values_medium.yaml # Medium cluster overrides +│ ├── values_large.yaml # Large cluster overrides +│ └── templates/ +│ ├── _helpers.yaml # Template helper functions +│ ├── cluster-apps.yaml # Generates ArgoCD Application per enabledApp +│ └── cluster-forge.yaml # Self-managing ClusterForge Application +├── sources/ # Versioned helm charts and configurations +│ ├── / +│ │ ├── / # Upstream helm chart or Kustomize +│ │ ├── source.yaml # Source metadata (optional) +│ │ └── values_ha.yaml # HA overrides (optional) +│ └── / # Configuration helm charts +│ └── templates/ # ConfigMaps, Secrets, ExternalSecrets +├── docs/ # Architecture and operational documentation +└── sbom/ # Software bill of materials tooling +``` + +## Key Features + +### Single-Command Bootstrap + +The bootstrap.sh script orchestrates complete cluster setup with flexible options: + +```bash +./scripts/bootstrap.sh [options] +``` + +**Available Options:** +- `--cluster-size=[small|medium|large]` - Cluster size configuration (default: medium) +- `--apps=APP1,APP2` - Deploy only specified components + - Bootstrap apps: `namespaces`, `argocd`, 
`openbao`, `gitea`, `cluster-forge` + - Child apps: Any app from enabledApps list (e.g., `keycloak`, `keda`, `airm`) +- `--target-revision=BRANCH` - cluster-forge git revision for ArgoCD (default: latest release tag) +- `--template-only` or `-t` - Output YAML manifests instead of applying to cluster +- `--skip-deps` - Skip dependency checking for advanced users +- `--airm-image-repository=REPO` - Custom AIRM container image repository for air-gapped deployments + +**Bootstrap Process:** +1. **Validation** - Checks domain, cluster size, values files, required tool availability (kubectl, helm, yq with version checking) +2. **Pre-cleanup** - Removes previous installations if gitea-init-job completed +3. **Values Merge** - Combines base + size-specific values with domain injection +4. **Namespace Creation** - Creates argocd, cf-gitea, openbao namespaces +5. **ArgoCD Deployment** - helm template + kubectl apply with server-side apply using --field-manager=argocd-controller +6. **OpenBao Bootstrap** - Separate bootstrap phase for secrets management foundation +7. **Gitea Deployment** - helm template + kubectl apply, waits for rollout +8. **Gitea Init Job** - Creates cluster-org, clones/pushes cluster-forge and cluster-values repos with AIRM image repository support +9. **ClusterForge App** - Creates root Application that manages all remaining components via ArgoCD +10. 
**Component Deployment** - ArgoCD syncs all enabledApps including secrets and application stack + +### Selective Component Deployment + +The `--apps` flag enables targeted deployment for development and troubleshooting: + +**Bootstrap Components** (deployed via helm template): +- `namespaces` - Core namespaces (argocd, cf-gitea) +- `argocd` - GitOps controller +- `gitea` - Local Git server +- `cluster-forge` - Root ArgoCD Application + +**Child Components** (deployed via ArgoCD sync): +- Any application from enabledApps list +- Examples: `openbao,openbao-init`, `keycloak`, `keda,kedify-otel` + +**Usage Examples:** +```bash +# Deploy only core GitOps foundation +./scripts/bootstrap.sh example.com --apps=namespaces,argocd,gitea,cluster-forge + +# Deploy only secrets management +./scripts/bootstrap.sh example.com --apps=openbao,openbao-init,openbao-config + +# Render manifests for debugging +./scripts/bootstrap.sh example.com --apps=keycloak --template-only + +# Deploy with custom AIRM image repository for air-gapped environments +./scripts/bootstrap.sh example.com --airm-image-repository=registry.internal.com/airm +``` + +### Self-Contained GitOps + +Once bootstrapped, the cluster is fully self-sufficient: + +**Local Git Server (Gitea):** +- Stores cluster-forge repository (platform code) +- Stores cluster-values repository (environment-specific configuration) +- Provides Git UI at https://gitea.{domain} +- Admin credentials in gitea-admin-credentials secret +- SQLite backend for lightweight operation + +**Local Secrets Management (OpenBao):** +- Vault-compatible secrets engine +- Initialized with policies for each component +- Kubernetes auth method configured +- External Secrets Operator integration +- Secrets for: Keycloak clients, AIRM, database credentials, API keys + +**Configuration as Code:** +- All platform configuration in cluster-values repo +- Changes trigger ArgoCD sync automatically +- Full audit trail through Git history +- Rollback capability via Git 
revert + +### Values Inheritance System + +Three-layer configuration merge: + +1. **Base Layer** (values.yaml) - Common defaults for all sizes +2. **Size Layer** (values_{size}.yaml) - Size-specific overrides +3. **External Layer** (cluster-values/values.yaml) - Environment customization + +```yaml +# Bootstrap merges: base <- size <- external +VALUES=$(yq eval-all '. as $item ireduce ({}; . * $item)' \ + values.yaml values_medium.yaml cluster-values/values.yaml) +``` + +**Size-Specific Behaviors:** + +Small/Medium are single-node and have storage class mutation policies: +```yaml +enabledApps: + - kyverno-policies-storage-local-path # RWX→RWO mutation for local-path +``` + +Large enables Multi-Node and HA components: +```yaml +apps: + openbao: + valuesObject: + server: + ha: + enabled: true + replicas: 3 +``` + +### Component Version Management + +**Versioned Sources Structure:** +``` +sources/argocd/ + ├── 8.3.5/ # Upstream helm chart + ├── source.yaml # Source metadata (upstream repo, version) + └── values_ha.yaml # Optional HA overrides +``` + +**Configuration Companions:** +Each major component has -config variant: +- argocd-config: OIDC integration, RBAC policies, ExternalSecrets +- gitea-config: Keycloak OAuth, repository templates +- openbao-config: Policy definitions, secret paths, initialization scripts +- minio-tenant-config: Bucket policies, user credentials, gateway routes + +### Secrets Management Architecture + +**Three-Tier Secrets System:** + +1. **OpenBao (Source of Truth)** + - KV v2 secrets engine at secret/ + - Policies per namespace: argocd-policy, airm-policy, gitea-policy, etc. + - Kubernetes auth method for pod authentication + +2. **External Secrets Operator (Synchronization)** + - ExternalSecret resources in each namespace + - SecretStore points to OpenBao with serviceAccountRef + - Automatic sync from OpenBao → Kubernetes Secrets + - Example: argocd-oidc-creds ExternalSecret → OIDC client secret + +3. 
**Kubernetes Secrets (Consumption)** + - Standard Kubernetes Secret objects + - Referenced by pods via env, volumeMounts + - Automatically updated when OpenBao source changes + +**Bootstrap Secret Flow:** +- bootstrap.sh generates initial passwords with `openssl rand -hex 16` +- ArgoCD deploys OpenBao via cluster-forge Application +- openbao-init-job (sync wave -50) writes secrets to OpenBao +- External Secrets Operator (sync wave -40) syncs to Kubernetes Secrets +- Applications consume via secret references + +### Modular Policy System + +Kyverno policies organized by concern: + +**Base Policies** (kyverno-policies-base): +- Core security policies +- Resource quotas +- Label requirements + +**Storage Policies** (kyverno-policies-storage-local-path): +- Access mode mutation: ReadWriteMany → ReadWriteOnce +- Only enabled for small/medium clusters with local-path storage +- Prevents PVC creation failures on non-distributed storage + +**Custom Policies:** +- AIRM-specific policies included in airm chart +- Custom validations and mutations per application + +### Backup and Restore Utilities + +**Database Export/Import:** +```bash +scripts/utils/export_databases.sh # PostgreSQL dumps from CNPG +scripts/utils/import_databases.sh # Restore PostgreSQL databases +``` + +**Message Queue:** +```bash +scripts/utils/export_rabbitmq.sh # RabbitMQ definitions and messages +scripts/utils/import_rabbitmq.sh # Restore queues and exchanges +``` + +**Object Storage:** +```bash +scripts/utils/mirror_minio.sh # MinIO bucket synchronization +``` + +### Observability Stack + +**Integrated LGTM Platform:** +- **Loki** - Log aggregation with 50Gi storage +- **Grafana** - Visualization dashboards with 10Gi storage +- **Tempo** - Distributed tracing with 50Gi storage +- **Mimir** - Prometheus metrics with 50Gi storage + +**Automatic Collection:** +- Metrics collector deployment: 8Gi RAM, 2 CPU limits +- Logs collector daemonset: 2Gi RAM, 1 CPU per node +- OpenTelemetry contrib collector for 
advanced telemetry +- Node exporter and kube-state-metrics enabled by default + +**Service Endpoints:** +- Grafana UI: Port 3000 +- OTLP gRPC: Port 4317 +- OTLP HTTP: Port 4318 +- Prometheus: Port 9090 +- Loki: Port 3100 + +### AI/ML Workload Support + +**Multi-Framework Job Integration:** + +Kueue manages scheduling for: +- Kubernetes batch/job +- Ray (RayJob, RayCluster) +- Kubeflow (MPIJob, PyTorchJob, TFJob, XGBoostJob, JAXJob, PaddleJob) +- AppWrapper for multi-pod applications +- Pod, Deployment, StatefulSet + +**Resource Management:** +- Kueue ClusterQueues for resource pools +- LocalQueues per namespace +- ResourceFlavors for GPU/CPU quotas +- Cohort sharing across teams + +**Model Serving:** +- KServe Standard deployment mode +- InferenceService CRD for models +- Auto-scaling with KEDA +- S3 model storage via MinIO + +**GPU Support:** +- AMD GPU Operator for device plugin +- Automatic driver installation +- GPU metrics in Prometheus +- Scheduling via Kueue resource flavors + +## Technical Requirements + +### Prerequisites + +**Kubernetes Cluster:** +- Kubernetes 1.33+ (configurable via bootstrap.sh KUBE_VERSION) +- kubectl with cluster-admin access +- Working storage class (local-path for small/medium, distributed for large) +- Sufficient resources per cluster size + +**Networking:** +- Domain name or wildcard DNS (*.example.com or *.{ip}.nip.io) +- Ingress capability (Gateway API + KGateway deployed by ClusterForge) +- External LoadBalancer or MetalLB (deployed by ClusterForge) -- **Small Clusters** (1-5 users): Development/testing with minimal resources -- **Medium Clusters** (5-20 users): Team production workloads -- **Large Clusters** (10s-100s users): Enterprise scale with full features +**TLS Certificates:** +- cluster-tls secret in kgateway-system namespace +- Can be self-signed for development +- Production should use Cert-Manager with ACME -Size-specific configurations follow DRY principles, inheriting from base configuration and only overriding 
differences. See [Cluster Size Configuration](docs/cluster_size_configuration.md) for details. +**Required Tools:** +- yq v4+ (YAML processor) with automatic version checking +- helm 3.0+ with automatic version checking +- kubectl with automatic version checking +- openssl (for password generation) -### Workflow +Bootstrap script provides comprehensive dependency validation with platform-specific installation instructions for missing tools. -Cluster-Forge deploys all necessary components within the cluster using GitOps-controller [ArgoCD](https://argo-cd.readthedocs.io/) -and [app-of-apps pattern](https://argo-cd.readthedocs.io/en/stable/operator-manual/cluster-bootstrapping/#app-of-apps-pattern) where Cluster-Forge itself acts as an app of apps. +### Resource Requirements -### Components +**Small Cluster:** +- single node +- 250Gi+ total storage +- Local-path or hostPath storage class -Cluster-Forge repository file structure has 3 main folders: +**Medium Cluster:** +- single node +- 500Gi+ total storage +- Local-path or distributed storage -- **scripts** - bash scripts to [bootstrap](docs/bootstrap_guide.md) necessary prerequisite components for Cluster-Forge & install it -- **root** - core component, root helm chart for app-of-apps that creates all other ArgoCD applications into k8s cluster -- **sources** - folder that contains third-party, community and in-house helm charts & kubernetes manifests that represent cluster components -- **docs** - comprehensive documentation covering architecture, configuration, and operational guides +**Large Cluster:** +- multinode, HA / 3 node control plane +- 1Ti+ total storage +- Distributed storage required (Storage appliances / cloud / Longhorn, Ceph, etc.) -So using the bootstrap script user deploys ArgoCD GitOps-controller and root application which then deploys other components into the cluster. 
+### Functional Requirements -Here are some key components that are being deployed: +**FR1: AIRM Platform Delivery** +- Deploy AMD Resource Manager (AIRM) 0.3.5 with UI and API +- Support configurable image repositories via `--airm-image-repository` bootstrap parameter +- Provide model serving with KServe v0.16.0 +- Support distributed computing via KubeRay Operator 1.4.2 +- Enable workflow orchestration through Kaiwo v0.2.0-rc11 +- Integrate AMD GPU Operator v1.4.1 for GPU resources -#### Layer 1: GitOps Foundation (Bootstrap) -- **ArgoCD** - GitOps controller for continuous deployment -- **Gitea** - Git repository server for source management -- **OpenBao** - Vault-compatible secret management system +**FR2: GitOps Operations** +- Bootstrap ArgoCD 8.3.5 with single command +- Manage 40+ components as ArgoCD Applications +- Support multi-source Applications for values separation +- Enable local Gitea 12.3.0 for cluster-native GitOps + +**FR3: Size-Aware Deployment** +- Support small/medium/large configurations via --cluster-size flag +- Automatically merge size-specific values with base configuration +- Enable/disable components based on cluster size (e.g., HA modes) +- Apply appropriate policies per size (storage access modes) + +**FR4: Secrets Management** +- Initialize OpenBao 0.18.2 with vault policies +- Configure External Secrets Operator 0.15.1 integration +- Generate and store all component credentials +- Sync secrets from OpenBao to Kubernetes automatically + +**FR5: Dependency Orchestration** +- Deploy components in wave order (-70 to 0) +- Bootstrap foundation before app-of-apps (ArgoCD, OpenBao, Gitea) +- Wait for component health before proceeding +- Use ignoreDifferences for known drift patterns -#### Layer 2: Core Infrastructure -**Networking & Security:** -- **Gateway API + KGateway** - Modern ingress and traffic management -- **Cert-Manager** - Automated TLS certificate management -- **MetalLB** - Load balancer for bare metal environments -- 
**External Secrets Operator** - External secret integration -- **Cilium** - Network security and observability -- **Kyverno** - Policy engine with modular policy system (see [Kyverno Modular Design](docs/kyverno_modular_design.md)) +### Non-Functional Requirements -**Storage & Database:** -- **CNPG Operator** - Cloud-native PostgreSQL management -- **MinIO Operator + Tenant** - S3-compatible object storage -- **Longhorn** - Distributed block storage +**Performance:** +- Complete bootstrap in under 15 minutes (small cluster) +- ArgoCD sync time under 5 minutes for full stack +- Gitea init job completes in under 2 minutes + +**Reliability:** +- OpenBao HA with 3 replicas and Raft (large clusters) +- ArgoCD automated sync with self-heal +- Server-side apply to prevent field manager conflicts + +**Maintainability:** +- Single values file per cluster size +- DRY principle for configuration inheritance +- Versioned sources for reproducible deployments +- SBOM generation for supply chain security + +**Usability:** +- Single-command deployment +- Helpful error messages with validation +- Progress indication during bootstrap +- Access URLs displayed on completion + +## Development and Customization + +### Adding New Components + +1. Add chart to sources/{component}/{version}/ +2. Define app configuration in values.yaml: +```yaml +apps: + my-component: + path: my-component/1.0.0 + namespace: my-namespace + syncWave: -1 + valuesObject: + # component values +``` +3. 
Add to enabledApps list + +### Custom Cluster Values + +Create cluster-values repository with custom values.yaml: +```yaml +# Override any base configuration +global: + domain: custom.example.com + +apps: + argocd: + valuesObject: + server: + replicas: 3 # Custom override +``` + +### Size Configuration + +Modify values_{size}.yaml to adjust resources: +- Change replica counts +- Adjust CPU/memory limits +- Enable/disable HA modes +- Add size-specific enabledApps -#### Layer 3: Observability & Monitoring -- **Prometheus** - Metrics collection and alerting -- **Grafana** - Visualization and dashboarding -- **Prometheus Operator CRDs** - Metrics collection infrastructure -- **OpenTelemetry Operator** - Distributed tracing and telemetry -- **OTEL-LGTM Stack** - Unified observability platform (Loki, Grafana, Tempo, Mimir) +## Documentation -#### Layer 4: AI/ML Compute Stack -**GPU & Compute:** -- **AMD GPU Operator** - GPU device management and drivers -- **KubeRay Operator** - Ray distributed computing framework -- **KServe + CRDs** - Kubernetes-native model serving -- **Kueue** - Advanced job queueing system -- **AppWrapper** - Application scheduling and resource management -- **KEDA** - Event-driven autoscaling +Detailed documentation in `/docs`: -**Workflow & Orchestration:** -- **Kaiwo + CRDs** - Workflow management system -- **RabbitMQ** - Message broker for async processing +- [Bootstrap Guide](docs/bootstrap_guide.md) - Deployment walkthrough +- [Cluster Size Configuration](docs/cluster_size_configuration.md) - Size planning +- [Values Inheritance Pattern](docs/values_inheritance_pattern.md) - GitOps configuration +- [Kyverno Modular Design](docs/kyverno_modular_design.md) - Policy architecture +- [Kyverno Access Mode Policy](docs/kyverno_access_mode_policy.md) - Storage policies +- [Backup and Restore](docs/backup_and_restore.md) - Data protection -#### Layer 5: Identity & Access -- **Keycloak** - Enterprise identity and access management -- **Cluster-Auth** - 
Kubernetes RBAC integration +## Software Bill of Materials (SBOM) -#### Layer 6: AIRM App -- **AIRM API** - The central API layer for AMD Resource Manager, handling authentication, access control, and cluster coordination. -- **AIRM UI** - The frontend interface to interact with resource management features, integrated with the AIRM API and authentication services. -- **AIRM Dispatcher** - The agent responsible for dispatching compute workloads to registered Kubernetes clusters and managing their lifecycle. +ClusterForge includes comprehensive SBOM tooling in `/sbom`: -## Technical Requirements +**SBOM Files:** +- `components.yaml` - Canonical list of all components with versions, licenses, and metadata +- `SBOM-QUICK-GUIDE.md` - Guide for SBOM generation and validation -### Prerequisites & Dependencies +**Validation Scripts:** +- `validate-components-sync.sh` - Ensures components.yaml matches actual sources/ +- `validate-enabled-apps.sh` - Validates enabledApps lists reference defined components +- `validate-metadata.sh` - Checks required metadata fields +- `validate-sync.sh` - Full validation suite -#### External Dependencies -- **Kubernetes cluster** with kubectl access -- **Working storage class** for persistent volumes -- **Domain name configuration** for external access -- **cluster-tls secret** in kgateway-system namespace -- **Network connectivity** for image pulls and external services +**Generation Scripts:** +- `generate-sbom.sh` - Generates SPDX/CycloneDX SBOM documents +- `generate-compare-components.sh` - Compares component versions +- `update_licenses.sh` - Updates license information -#### Required Tools -- **Helm 3.0+** - Package management -- **kubectl** - Kubernetes CLI tool -- **OpenSSL** - Certificate and secret generation +## Version Information -### Functional Requirements +**Current Release:** v1.8.0 -**FR1: AIRM Platform Delivery** -- Deploy complete AI/ML platform with web UI and API -- Provide model serving capabilities with KServe 
integration -- Support distributed computing with Ray operator -- Enable workflow orchestration through Kaiwo -- Integrate GPU resource management +**Key Component Versions:** +- ArgoCD: 8.3.5 +- Gitea: 12.3.0 +- OpenBao: 0.18.2 +- Keycloak: keycloak-old chart +- KServe: v0.16.0 +- Kaiwo: v0.2.0-rc11 +- AIRM: 0.3.5 +- Kueue: 0.13.0 +- AMD GPU Operator: v1.4.1 +- OTEL-LGTM Stack: v1.0.7 -**FR2: GitOps Operations** -- Bootstrap ArgoCD foundation with single script -- Manage all components as ArgoCD Applications -- Support both external GitHub and local Gitea repositories -- Enable continuous deployment and sync capabilities -- Provide developer access to cluster configuration via Git - -**FR3: Size-Aware Deployment** -- Support small, medium, and large cluster configurations -- Implement automatic resource scaling based on cluster size -- Provide appropriate storage and access mode configurations per size -- Enable cluster-specific policy enforcement (e.g., [Kyverno Access Mode Policy](docs/kyverno_access_mode_policy.md)) - -**FR4: Dependency Management** -- Deploy components in correct dependency order -- Validate component health before proceeding -- Handle complex inter-component dependencies automatically -- Support component customization through values files +## Support and Contribution -### Non-Functional Requirements +**Repository:** https://github.com/silogen/cluster-forge -- Single-command bootstrap deployment -- Complete platform deployment in under 30 minutes -- Provide HA-configuration for all critical components -- Support air-gapped deployment scenarios -- Maintain configuration version control through Git -- Enable seamless transition from external to local repository management +**Issue Tracking:** Use GitHub Issues for bug reports and feature requests -## Documentation +**Maintainers:** ClusterForge Team -Comprehensive documentation is available in the `/docs` folder: +## License -- [Bootstrap Guide](docs/bootstrap_guide.md) - Step-by-step 
deployment instructions -- [Cluster Size Configuration](docs/cluster_size_configuration.md) - Small/medium/large cluster setup -- [Values Inheritance Pattern](docs/values_inheritance_pattern.md) - GitOps repository configuration -- [Kyverno Modular Design](docs/kyverno_modular_design.md) - Policy system architecture -- [Kyverno Access Mode Policy](docs/kyverno_access_mode_policy.md) - Storage compatibility policies -- [Secrets Management Architecture](docs/secrets_management_architecture.md) - Security implementation -- [Backup and Restore](docs/backup_and_restore.md) - Data protection procedures \ No newline at end of file +See [LICENSE](LICENSE) and [NOTICE](NOTICE) files for licensing information. \ No newline at end of file diff --git a/README.md b/README.md index 6b9c608e..c62736ed 100644 --- a/README.md +++ b/README.md @@ -1,130 +1,169 @@ # Cluster-Forge -**A helper tool that deploys [AMD Enterprise AI Suite](https://enterprise-ai.docs.amd.com/en/latest/) into Kubernetes cluster.** +**A Kubernetes platform automation tool that deploys [AMD Enterprise AI Suite](https://enterprise-ai.docs.amd.com/en/latest/) with complete GitOps infrastructure.** ## Overview -**Cluster-Forge** is a tool designed to bundle various third-party, community, and in-house components into a single, streamlined stack that can be deployed in Kubernetes clusters. By automating the deployment process, Cluster-Forge simplifies the creation of consistent, ready-to-use clusters. +**Cluster-Forge** bundles third-party, community, and in-house components into a single, GitOps-managed stack deployable in Kubernetes clusters. It automates the deployment of a complete AI/ML compute platform with all essential services pre-configured and integrated. -This tool is ideal for scenarios such as: +Using a bootstrap-first deployment model, Cluster-Forge establishes GitOps infrastructure (ArgoCD, Gitea, OpenBao) before deploying the complete application stack via ArgoCD's app-of-apps pattern. 
-- **Ephemeral test clusters** - Create temporary environments quickly -- **CI/CD pipeline clusters** - Ensure consistent testing environments -- **Multiple production clusters** - Manage a fleet of clusters efficiently -- **Reproducible environments** - Ensure consistency across deployments +**Ideal for:** + +- **AI/ML Engineers** - Unified platform for model training, serving, and orchestration +- **Platform Engineers** - Infrastructure automation with GitOps patterns +- **DevOps Teams** - Consistent deployment across development, staging, and production +- **Research Teams** - Ephemeral test clusters for experimentation ## 🚀 Quick Start -### Basic Deployment +### Single-Command Deployment ```bash -./scripts/bootstrap.sh +./scripts/bootstrap.sh [--cluster-size=small|medium|large] ``` -### Size-Aware Deployment +### Size-Aware Deployment Examples ```bash # Small cluster (1-5 users, development/testing) -./scripts/bootstrap.sh dev.example.com --CLUSTER_SIZE=small +./scripts/bootstrap.sh dev.example.com --cluster-size=small # Medium cluster (5-20 users, team production) [DEFAULT] -./scripts/bootstrap.sh team.example.com --CLUSTER_SIZE=medium +./scripts/bootstrap.sh team.example.com --cluster-size=medium # Large cluster (10s-100s users, enterprise scale) -./scripts/bootstrap.sh prod.example.com --CLUSTER_SIZE=large +./scripts/bootstrap.sh prod.example.com --cluster-size=large + +# Deploy only specific components +./scripts/bootstrap.sh dev.example.com --apps=argocd,gitea,cluster-forge + +# Deploy from specific branch/tag +./scripts/bootstrap.sh prod.example.com --target-revision=v1.8.0 ``` For detailed deployment instructions, see the [Bootstrap Guide](docs/bootstrap_guide.md). 
-## 📋 Workflow +## 📋 Architecture + +### Bootstrap-First Deployment + +Cluster-Forge uses a three-phase bootstrap process: -Cluster-Forge deploys all necessary components within the cluster using GitOps-controller [ArgoCD](https://argo-cd.readthedocs.io/) -and [app-of-apps pattern](https://argo-cd.readthedocs.io/en/stable/operator-manual/cluster-bootstrapping/#app-of-apps-pattern) where Cluster-Forge acts as an app of apps. +**Phase 1: Pre-Cleanup** +- Detects and removes previous installations when applicable +- Ensures clean state for fresh deployments -### GitOps Architecture +**Phase 2: GitOps Foundation Bootstrap** (Manual Helm Templates) +1. **ArgoCD** (v8.3.5) - GitOps controller deployed via helm template +2. **Gitea** (v12.3.0) - Git server with initialization job -Cluster-Forge supports two deployment modes: -- **External Mode**: Traditional GitOps with GitHub dependency -- **Local Mode**: Self-contained GitOps with local Gitea +**Phase 3: App-of-Apps Deployment** (ArgoCD-Managed) +- Creates cluster-forge Application pointing to root/ helm chart +- ArgoCD syncs all remaining applications including OpenBao from enabledApps list +- Applications deployed in wave order (-70 to 0) based on dependencies +- OpenBao (v0.18.2) managed via ArgoCD with openbao-init job -See [Values Inheritance Pattern](docs/values_inheritance_pattern.md) for detailed architecture documentation. +### Dual Repository GitOps Pattern + +**Local Mode (Default)** - Self-contained cluster-native GitOps: +- Uses local Gitea for both cluster-forge and cluster-values repositories +- Zero external dependencies once bootstrapped +- Initialization handled by gitea-init-job + +**External Mode** - Traditional GitHub-based GitOps: +- Points to external GitHub repository +- Supports custom branch selection for testing + +See [Values Inheritance Pattern](docs/values_inheritance_pattern.md) for detailed architecture. 
## 🛠️ Components ### Layer 1: GitOps Foundation -- **ArgoCD** - GitOps controller for continuous deployment -- **Gitea** - Git repository server for source management -- **OpenBao** - Vault-compatible secret management system +- **ArgoCD 8.3.5** - GitOps continuous deployment controller +- **Gitea 12.3.0** - Self-hosted Git server with SQLite backend +- **OpenBao 0.18.2** - Vault-compatible secrets management +- **External Secrets 0.15.1** - Secrets synchronization operator ### Layer 2: Core Infrastructure + **Networking & Security:** -- **Gateway API + KGateway** - Modern ingress and traffic management -- **Cert-Manager** - Automated TLS certificate management -- **MetalLB** - Load balancer for bare metal environments -- **External Secrets Operator** - External secret integration -- **Cilium** - Network security and observability -- **Kyverno** - Policy engine with modular policy system +- **Gateway API v1.3.0** - Kubernetes standard ingress API +- **KGateway v2.1.0-main** - Gateway API implementation with WebSocket support +- **MetalLB v0.15.2** - Bare metal load balancer +- **Cert-Manager v1.18.2** - Automated TLS certificate management +- **Kyverno 3.5.1** - Policy engine with modular policy system **Storage & Database:** -- **Longhorn** - Distributed block storage -- **CNPG Operator** - Cloud-native PostgreSQL management -- **MinIO Operator + Tenant** - S3-compatible object storage - -### Layer 3: Observability & Monitoring -- **Prometheus** - Metrics collection and alerting -- **Grafana** - Visualization and dashboarding -- **OpenTelemetry Operator** - Distributed tracing and telemetry -- **OTEL-LGTM Stack** - Unified observability platform (Loki, Grafana, Tempo, Mimir) - -### Layer 4: AI/ML Compute Stack -**GPU & Compute:** -- **AMD GPU Operator** - GPU device management and drivers -- **KubeRay Operator** - Ray distributed computing framework -- **KServe** - Kubernetes-native model serving -- **Kueue** - Advanced job queueing system -- **AppWrapper** - 
Application scheduling and resource management -- **KEDA** - Event-driven autoscaling - -**Workflow & Orchestration:** -- **Kaiwo** - Workflow management system -- **RabbitMQ** - Message broker for async processing - -### Layer 5: Identity & Access -- **Keycloak** - Enterprise identity and access management -- **Cluster-Auth** - Kubernetes RBAC integration - -### Layer 6: AIRM App -- **AIRM API** - Central API layer for AMD Resource Manager -- **AIRM UI** - Frontend interface for resource management -- **AIRM Dispatcher** - Compute workload dispatching agent - -## 💾 Storage Classes - -Storage classes are provided by default with Longhorn. These can be customized as needed. - -| Purpose | StorageClass | Access Mode | Locality | -|---------|--------------|-------------|----------| -| GPU Job | mlstorage | RWO | LOCAL/remote | -| GPU Job | default | RWO | LOCAL/remote | -| Advanced usage | direct | RWO | LOCAL | -| Multi-container | multinode | RWX | ANYWHERE | - -## 📄 Configuration +- **CNPG Operator 0.26.0** - CloudNativePG PostgreSQL operator +- **MinIO Operator 7.1.1** - S3-compatible object storage operator +- **MinIO Tenant 7.1.1** - Tenant deployment with default-bucket and models buckets -### Cluster Sizing +### Layer 3: Observability +- **Prometheus Operator CRDs 23.0.0** - Metrics infrastructure +- **OpenTelemetry Operator 0.93.1** - Telemetry collection +- **OTEL-LGTM Stack v1.0.7** - Integrated observability (Loki, Grafana, Tempo, Mimir) -Cluster-Forge provides three pre-configured cluster profiles: +### Layer 4: Identity & Access +- **Keycloak** (keycloak-old chart) - Enterprise IAM with AIRM realm +- **Cluster-Auth 0.5.0** - Kubernetes RBAC integration -- **Small**: Minimal resources, local-path storage, RWX→RWO access mode conversion -- **Medium**: Balanced resources, local-path storage, RWX→RWO access mode conversion -- **Large**: Full enterprise features, Longhorn storage, native RWX support +### Layer 5: AI/ML Compute Stack + +**GPU & Scheduling:** 
+- **AMD GPU Operator v1.4.1** - GPU device plugin and drivers
+- **KubeRay Operator 1.4.2** - Ray distributed computing framework
+- **Kueue 0.13.0** - Job queueing with multi-framework support
+- **AppWrapper v1.1.2** - Application-level resource scheduling
+- **KEDA 2.18.1** - Event-driven autoscaling
+
+**ML Serving & Inference:**
+- **KServe v0.16.0** - Model serving platform (Standard deployment mode)
+
+**Workflow & Messaging:**
+- **Kaiwo v0.2.0-rc11** - AI workload orchestration
+- **RabbitMQ v2.15.0** - Message broker for async processing
+
+### Layer 6: AIRM Application
+- **AIRM 0.3.2** - AMD Resource Manager application suite
+- **AIM Cluster Model Source** - Cluster resource models for AIRM
+- **Configurable Image Repositories** - Supports custom container registries via cluster-bloom `AIRM_IMAGE_REPOSITORY` parameter
+
+## 📄 Configuration
+
+### Cluster Sizing
+
+Three cluster profiles with inheritance-based resource optimization:
+
+**Small Clusters** (1-5 users, dev/test):
+- Single replica deployments
+- Reduced resource limits (ArgoCD controller: 2 CPU, 4Gi RAM)
+- Adds kyverno-policies-storage-local-path for RWX→RWO PVC mutation
+- MinIO tenant: 250Gi storage
+- Suitable for: Local workstations, development environments
+
+**Medium Clusters** (5-20 users, team production):
+- Single replica with moderate resource allocation
+- Same storage policies as small (local-path support)
+- ArgoCD controller: 2 CPU, 4Gi RAM
+- Default configuration for balanced performance
+- Suitable for: Small teams, staging environments
+
+**Large Clusters** (10s-100s users, enterprise scale):
+- OpenBao HA: 3 replicas with Raft consensus
+- No local-path policies (assumes distributed storage)
+- MinIO tenant: 500Gi storage
+- Production-grade resource allocation
+- Suitable for: Production deployments, multi-tenant environments
 
 See [Cluster Size Configuration](docs/cluster_size_configuration.md) for detailed specifications.
### Values Files Configuration follows a streamlined inheritance pattern: -- **Base**: 52 common applications with alpha-sorted enabledApps +- **Base**: Common applications with alpha-sorted enabledApps - **Size-specific**: Only override differences from base (DRY principle) -- **Runtime**: Domain and cluster-specific parameters +- **Runtime**: Domain and cluster-specific parameters injected during bootstrap + +The bootstrap script uses YAML merge semantics where size-specific values override base values.yaml settings. ## 📚 Documentation @@ -135,11 +174,13 @@ Comprehensive documentation is available in the `/docs` folder: | **Getting Started** | [Bootstrap Guide](docs/bootstrap_guide.md) | | **Configuration** | [Cluster Size Configuration](docs/cluster_size_configuration.md) | | **Architecture** | [Values Inheritance Pattern](docs/values_inheritance_pattern.md) | -| **Security** | [Kyverno Modular Design](docs/kyverno_modular_design.md) | -| **Policies** | [Kyverno Access Mode Policy](docs/kyverno_access_mode_policy.md) | -| **Secrets** | [Secrets Management Architecture](docs/secrets_management_architecture.md) | +| **Policy System** | [Kyverno Modular Design](docs/kyverno_modular_design.md) | +| **Storage Policies** | [Kyverno Access Mode Policy](docs/kyverno_access_mode_policy.md) | | **Operations** | [Backup and Restore](docs/backup_and_restore.md) | +Additional documentation: +- **SBOM**: See `/sbom` folder for software bill of materials generation and validation + ## 📝 License Cluster-Forge is licensed under the Apache License, Version 2.0. See the [LICENSE](LICENSE) file for details. diff --git a/docs/bootstrap_guide.md b/docs/bootstrap_guide.md index b5d29a88..287c3973 100644 --- a/docs/bootstrap_guide.md +++ b/docs/bootstrap_guide.md @@ -1,78 +1,262 @@ -# Bootstrap Script +# Bootstrap Guide -This script bootstraps a complete GitOps environment with ArgoCD, OpenBao (secret management), and Gitea (Git repository) on a Kubernetes cluster. 
+This guide explains how to bootstrap a complete GitOps environment using Cluster-Forge's five-step deployment model. The bootstrap process establishes ArgoCD, OpenBao, and Gitea as foundation components, then creates the cluster-forge Application which manages all remaining components via ArgoCD.
 
 ## Prerequisites
 
-- Kubernetes cluster (running and accessible via `kubectl`)
+- Kubernetes cluster (1.33+ recommended, running and accessible via `kubectl`)
 - Tools installed:
-  - `kubectl`
-  - `helm`
-  - `openssl`
-  - `yq`
+  - `kubectl` with cluster-admin access
+  - `helm` (3.0+)
+  - `openssl` (for password generation)
+  - `yq` (v4+)
 
 ## Usage
 
 ```bash
-./bootstrap.sh [values_file]
+./scripts/bootstrap.sh <domain> [--cluster-size=small|medium|large]
 ```
 
-**Examples:**
+### Arguments
+
+- **domain** (required): Cluster domain for all services (e.g., `example.com`, `192.168.1.100.nip.io`)
+
+### Options
+
+- **--apps=APP1,APP2**: Deploy only specified components (default: applies to cluster)
+  - Bootstrap apps: `namespaces`, `argocd`, `openbao`, `gitea`, `cluster-forge`
+  - Child apps: Any app from enabledApps list (see values_{size}.yaml for app names)
+  - Use with `--template-only` to render instead of applying
+- **--cluster-size** `[small|medium|large]`: Cluster size configuration (default: `medium`)
+- **--template-only**, **-t**: Output YAML manifests to stdout instead of applying to cluster
+- **--target-revision**, **-r**: cluster-forge git revision for ArgoCD to sync from
+- **--skip-deps**: Skip dependency checking (for advanced users)
+- **--airm-image-repository=REPO**: Custom AIRM container image repository for air-gapped deployments
+- **--help**, **-h**: Show usage information
+
+### Examples
+
 ```bash
-# Using default values_cf.yaml
-./bootstrap.sh plat-dev-1.silogen.ai
+# Basic usage with default medium cluster size
+./scripts/bootstrap.sh 192.168.1.100.nip.io
+
+# Large cluster
+./scripts/bootstrap.sh example.com --cluster-size=large
+
+# Deploy only specific 
components
+./scripts/bootstrap.sh example.com --apps=openbao,openbao-init
 
-# Using custom values file
-./bootstrap.sh plat-dev-1.silogen.ai custom_values.yaml
+# Render templates for debugging (doesn't apply)
+./scripts/bootstrap.sh example.com --apps=gitea --template-only
+
+# Deploy from specific git branch
+./scripts/bootstrap.sh example.com --target-revision=feature-branch
+./scripts/bootstrap.sh example.com --cluster-size=large
+
+# Custom AIRM image repository
+./scripts/bootstrap.sh example.com --airm-image-repository=ghcr.io/mycompany
+
+# Air-gapped deployment with local registry
+./scripts/bootstrap.sh 192.168.1.100.nip.io --cluster-size=small --airm-image-repository=harbor.internal.com/airm
 ```
 
-## What Does It Do?
+## How It Works
 
-The script performs the following steps in sequence:
+The bootstrap script uses a five-step deployment model:
 
-### 1. Domain Configuration
-- Validates that a domain argument is provided
-- Sets the values file to use (defaults to `values_cf.yaml` if not specified)
-- Uses the global domain value to render [root](../root) helm chart
-- This domain is used for all service endpoints (Gitea, ArgoCD, etc.)
+### Phase 1: Pre-Cleanup
+- The pre_cleanup function performs selective cleanup, only affects cf-gitea and cf-openbao namespaces
+- Detects previous installations by checking for completed gitea-init-job
+- Removes Gitea resources to enable fresh deployment
+- Deletes OpenBao initialization jobs and temporary files
+- Ensures clean state for new bootstrap
 
-### 2. Namespace Creation
-Creates three namespaces for core components:
+### Phase 2: GitOps Foundation Bootstrap (Manual Helm Templates)
+
+**1. Configuration Preparation**
+- Validates required domain argument
+- Validates cluster size (small, medium, or large)
+- Merges base `values.yaml` with size-specific overrides `values_{size}.yaml`
+- Sets `global.domain` and `global.clusterSize` in merged configuration
+
+**2. 
Namespace Creation** +Creates three namespaces for bootstrap components: - `argocd` - GitOps controller - `cf-gitea` - Git repository server -- `cf-openbao` - Secret management system - -### 3. ArgoCD Bootstrap -- Deploys ArgoCD -- Waits for all ArgoCD components to be ready - -### 4. OpenBao Bootstrap -- Deploys OpenBao -- Waits for the first pod (`openbao-0`) to be running -- Runs initialization job (`openbao-init-job`) which: - - Initializes & configures OpenBao Raft cluster - - Unseals all pods - - Creates root credentials - -### 5. Gitea Bootstrap -- Creates gitea-admin credentials secret -- Creates ConfigMap with initial cluster forge values -- Deploys & configures Gitea +- `cf-openbao` - Secrets management system + +**3. ArgoCD Bootstrap** +- Extracts ArgoCD values from merged configuration +- Deploys ArgoCD using `helm template` with server-side apply +- Uses `--field-manager=argocd-controller` to match ArgoCD's self-management +- Waits for all ArgoCD components to be ready: + - application-controller StatefulSet + - applicationset-controller Deployment + - redis Deployment + - repo-server Deployment + +**4. Gitea Bootstrap** +- Generates random admin password using `openssl rand -hex 16` +- Creates `initial-cf-values` ConfigMap with merged configuration +- Creates `gitea-admin-credentials` secret +- Extracts Gitea values from merged configuration +- Deploys Gitea using `helm template` - Waits for Gitea deployment to be ready - Runs initialization job (`gitea-init-job`) which: - - Creates cluster-org organization - - Creates cluster-forge as a mirror repo - - Creates cluster-values as a repo with cluster configuration + - Creates admin API token + - Creates `cluster-org` organization + - Clones and pushes cluster-forge repository from initial-cf-values ConfigMap + - Creates cluster-values repository with configuration + +### Phase 3: App-of-Apps Deployment (ArgoCD-Managed) + +**5. 
ClusterForge Application Deployment**
+- Renders root helm chart with merged configuration
+- Creates `cluster-forge` Application resource in ArgoCD
+- ArgoCD syncs all remaining components in wave order:
+  - Wave -70: OpenBao (secrets management)
+  - Wave -60: OpenBao configuration
+  - Wave -50: OpenBao initialization job
+  - Wave -40: External Secrets, Cert-Manager
+  - Wave -30 to 0: All other applications
+
+**Key Improvement**: OpenBao is now managed by ArgoCD rather than bootstrapped separately, simplifying the bootstrap process while maintaining proper dependency ordering through sync waves.
+
+- When `externalValues.enabled: true`, uses multi-source feature:
+  - Source 1: cluster-forge repo (root/ helm chart)
+  - Source 2: cluster-values repo (custom values.yaml)
+- ArgoCD manages the complete application lifecycle
+- Proper dependency ordering ensures OpenBao is ready before applications that depend on secrets
+
+**6. Cleanup**
+- Removes temporary merged values files from /tmp/
+
+## Cluster Configuration
+
+### Values Files Structure
+
+ClusterForge uses a layered configuration approach with YAML merge semantics:
+
+1. **Base values** (`root/values.yaml`):
+   - Contains all app definitions
+   - Defines default configuration for all apps
+   - Specifies `enabledApps` list (alpha-sorted)
+   - Configured with:
+     - `clusterForge.repoUrl` - Points to Gitea service URL (local mode) or GitHub (external mode)
+     - `clusterForge.targetRevision` - Version/branch to deploy
+     - `externalValues.enabled: true` - Enables dual-repository pattern
+     - `externalValues.repoUrl` - Points to cluster-values repo in Gitea
+     - `global.domain` - Set by bootstrap script
+     - `global.clusterSize` - Set by bootstrap script
+
+2. 
**Size-specific values** (`root/values_.yaml`): + - Override base values for specific cluster sizes + - Define resource limits and requests + - Single node (small and medium) RWO local-path storage + - Multinode (large) RWX storage + - Modify replica counts and HA settings + - Add size-specific enabled apps (e.g., `kyverno-policies-storage-local-path` for small/medium) + - Available sizes: `small`, `medium`, `large` + - Uses DRY principle - only contains differences from base + +3. **External values** (`cluster-values/values.yaml` in Gitea): + - Created during bootstrap in the `cluster-values` repository + - Contains cluster-specific overrides + - Can be modified post-bootstrap for customizations + - Structure: + ```yaml + clusterForge: + repoURL: http://gitea-http.cf-gitea.svc:3000/cluster-org/cluster-forge.git + path: root + targetRevision: main + + global: + clusterSize: medium # Set by --cluster-size flag + domain: example.com # Set by domain argument + + # AIRM Image Repository Configuration (optional, only when AIRM_IMAGE_REPOSITORY is set) + airm-api: + airm: + backend: + image: + repository: ghcr.io/mycompany/airm-api + frontend: + image: + repository: ghcr.io/mycompany/airm-ui -### 6. ArgoCD Application Deployment -- Creates root cluster-forge app that manages all other apps + airm-dispatcher: + airm: + dispatcher: + image: + repository: ghcr.io/mycompany/airm-dispatcher + ``` -## Access to main components +### Value Merging Order + +When ArgoCD renders the cluster-forge application, values are merged in this order (later values override earlier): + +1. Base `values.yaml` +2. Size-specific `values_.yaml` +3. 
External `cluster-values/values.yaml` from Gitea + +### Cluster Sizes + +Each cluster size is optimized for different resource constraints: + +- **Small**: Development/testing environments, minimal resources +- **Medium** (default): Production-ready, balanced configuration +- **Large**: High-availability, maximum performance + +Size-specific configurations typically adjust: +- Component replica counts (ArgoCD, PostgreSQL, etc.) +- Resource limits and requests (CPU, memory) +- Storage sizes (PVC, retention periods) +- High-availability features (Redis HA, multiple replicas) + +## ClusterForge App-of-Apps Model + +The bootstrap script creates the root `cluster-forge` Application in ArgoCD, which implements an app-of-apps pattern. + +### Application Structure + +The `cluster-forge` Application is defined in [root/templates/cluster-forge.yaml](../root/templates/cluster-forge.yaml): + +### Child Applications + +The root chart renders individual Application resources for each app listed in `enabledApps` using the template in [root/templates/cluster-apps.yaml](../root/templates/cluster-apps.yaml). + +Each child application includes: +- **Namespace**: Target namespace for the application +- **Path**: Location of helm chart or manifests in `sources/` +- **Values**: Configuration from `apps..valuesObject` or `valuesFile` +- **Sync wave**: Deployment order (lower numbers deploy first) +- **Sync policy**: Automated with prune and self-heal enabled +- **Ignore differences**: Optional resource-specific ignore rules + +Example child application configuration in values: + +```yaml +apps: + argocd: + path: argocd/8.3.5 + namespace: argocd + syncWave: -3 + valuesObject: + # ArgoCD-specific values + helmParameters: + - name: global.domain + value: "argocd.{{ .Values.global.domain }}" +``` + +## Access to Main Components 1. 
**ArgoCD:** ```bash - # Initial admin user password + # Initial admin password kubectl -n argocd get secret argocd-initial-admin-secret -o jsonpath="{.data.password}" | base64 -d + + # Access URL (replace with your domain) + echo "https://argocd.${DOMAIN}" ``` 2. **Gitea:** @@ -82,36 +266,171 @@ Creates three namespaces for core components: # Admin password kubectl -n cf-gitea get secret gitea-admin-credentials -o jsonpath="{.data.password}" | base64 -d + + # API token (created by init job) + kubectl -n cf-gitea get secret gitea-admin-token -o jsonpath="{.data.token}" | base64 -d + + # Access URL (replace with your domain) + echo "https://gitea.${DOMAIN}" ``` 3. **OpenBao:** ```bash # Root token kubectl -n cf-openbao get secret openbao-keys -o jsonpath='{.data.root_token}' | base64 -d + + # Unseal keys (stored in openbao-keys secret) + kubectl -n cf-openbao get secret openbao-keys -o jsonpath='{.data.unseal_keys_b64}' | base64 -d ``` -4. **Devuser secret:** +4. **Keycloak (deployed by ArgoCD):** ```bash - # Devuser secret - kubectl -n keycloak get secret airm-devuser-credentials -o jsonpath="{.data.KEYCLOAK_INITIAL_DEVUSER_PASSWORD}"| base64 -d + # Admin password + kubectl -n keycloak get secret keycloak-credentials -o jsonpath="{.data.KEYCLOAK_INITIAL_ADMIN_PASSWORD}" | base64 -d + + # Dev user password + kubectl -n keycloak get secret airm-devuser-credentials -o jsonpath="{.data.KEYCLOAK_INITIAL_DEVUSER_PASSWORD}" | base64 -d ``` -4. 
**Keycloak admin secret:** - ```bash - # Devuser secret - kubectl -n keycloak get secret keycloak-credentials -o jsonpath="{.data.KEYCLOAK_INITIAL_ADMIN_PASSWORD}"| base64 -d - ``` +## Troubleshooting -## Development +### Bootstrap Fails at Gitea Init + +If the Gitea initialization job fails during repository migration: + +```bash +# Check job logs +kubectl logs -n cf-gitea job/gitea-init-job + +# The job automatically retries migration up to 5 times +# If it continues failing, check Gitea pod logs +kubectl logs -n cf-gitea deploy/gitea -c gitea +``` + +### OpenBao Init Job Fails + +*Production mode only* + +If OpenBao initialization fails: + +```bash +# Check init job logs +kubectl logs -n cf-openbao job/openbao-init-job + +# Verify OpenBao is running +kubectl get pods -n cf-openbao + +# Re-run bootstrap (pre-cleanup will handle the retry) +./bootstrap.sh your-domain.com +``` + +### ArgoCD Applications Not Syncing + +If applications aren't deploying: + +```bash +# Check cluster-forge app status +kubectl get application cluster-forge -n argocd -o yaml + +# Check individual app status +kubectl get applications -n argocd + +# View app details in ArgoCD UI +# https://argocd.your-domain.com +``` + +### Merged Values Inspection + +The bootstrap script creates temporary merged values at `/tmp/merged_values.yaml` for debugging. You can inspect this file during bootstrap to see the final merged configuration. + +## Post-Bootstrap Customization + +### Production Mode (with Gitea) + +After bootstrap completes in production mode, you can customize the cluster by modifying the `cluster-values` repository in Gitea: + +1. **Access Gitea** at `https://gitea.${DOMAIN}` +2. **Navigate to** `cluster-org/cluster-values` repository +3. **Edit** `values.yaml` to add/override configuration +4. **Commit** changes +5. 
**ArgoCD** will automatically detect and apply changes + +Example customizations in `cluster-values/values.yaml`: + +```yaml +# Override app-specific values +apps: + keycloak: + valuesObject: + replicas: 2 + resources: + requests: + memory: "1Gi" + +# Disable specific apps +enabledApps: + - argocd + - gitea + # ... list only apps you want enabled + +# Add custom global values +global: + myCustomValue: "something" +``` + +## Selective Component Deployment + +The `--apps` flag allows you to deploy only specific components instead of the full stack. This is useful for: + +- **Development workflows**: Deploy only the components you're working on +- **Troubleshooting**: Deploy components individually to isolate issues +- **Testing**: Validate specific component configurations +- **Incremental deployment**: Add components to an existing cluster + +### Bootstrap Components + +These are the core infrastructure components deployed manually via helm template: + +- `namespaces` - Creates required namespaces (argocd, cf-gitea, cf-openbao) +- `argocd` - GitOps controller for managing all other components +- `gitea` - Self-hosted Git server for cluster-forge and cluster-values repositories +- `cluster-forge` - ArgoCD parent application that manages all child apps + +### Cluster-Forge Child Apps + +Any application listed in `enabledApps` from values.yaml can be deployed individually: + +```bash +# Deploy only OpenBao components +./scripts/bootstrap.sh example.com --apps=openbao,openbao-init,openbao-config + +# Deploy only monitoring stack +./scripts/bootstrap.sh example.com --apps=prometheus-crds,otel-lgtm-stack,opentelemetry-operator + +# Deploy identity management +./scripts/bootstrap.sh example.com --apps=keycloak,cluster-auth,cluster-auth-config +``` + +### Template-Only Mode + +Combine with `--template-only` to render manifests without applying: + +```bash +# Generate YAML for debugging +./scripts/bootstrap.sh example.com --apps=keycloak --template-only > 
keycloak-manifests.yaml + +# View what would be deployed +./scripts/bootstrap.sh example.com --apps=openbao,openbao-init --template-only | kubectl diff -f - +``` -For development purposes there is a way to sync all apps directly from cluster-forge GitHub repo bypassing gitea. Here is the possible development flow: +## File Cleanup -- Create feature-branch with your changes -- Modify `values_dev.yaml` file with the following parameters: - - `clusterForge.targetRevision` - feature-branch name - - `global.domain` - domain name -- Commit & push changes to your feature-branch -- Run `scripts/bootstrap_dev.sh` -- Wait for cluster apps to be ready -- From this point forward, any changes you push to your feature branch will be automatically synchronized to the cluster by ArgoCD. +The bootstrap script automatically cleans up temporary files at the end: +- `/tmp/merged_values.yaml` +- `/tmp/argocd_values.yaml` +- `/tmp/argocd_size_values.yaml` +- `/tmp/openbao_values.yaml` +- `/tmp/openbao_size_values.yaml` +- `/tmp/gitea_values.yaml` +- `/tmp/gitea_size_values.yaml` diff --git a/docs/cluster_size_configuration.md b/docs/cluster_size_configuration.md index 1d094ebc..11da7231 100644 --- a/docs/cluster_size_configuration.md +++ b/docs/cluster_size_configuration.md @@ -1,25 +1,26 @@ -# ClusterForge Size-Based Configuration +# Cluster-Forge Size-Based Configuration -This document describes the cluster size-based configuration system for ClusterForge applications, enabling optimal resource allocation based on cluster scale. +This document describes the cluster size-based configuration system for Cluster-Forge, enabling optimal resource allocation based on cluster scale. 
## Overview -ClusterForge now supports three cluster sizes, each with optimized resource allocations for the applications deployed on top of ClusterBloom: +Cluster-Forge supports three cluster sizes, each with optimized resource allocations: - **Small**: Developer/single-user setups (1-5 users) - **Medium**: Team clusters (5-20 users) - **Large**: Production/enterprise scale (10s-100s users) +Size configurations use YAML merge semantics where size-specific values override base values.yaml settings. + ## File Structure ``` cluster-forge/ ├── root/ -│ ├── values.yaml # Base configuration (all applications enabled) +│ ├── values.yaml # Base configuration (all applications) │ ├── values_small.yaml # Small cluster overrides │ ├── values_medium.yaml # Medium cluster overrides -│ ├── values_large.yaml # Large cluster overrides -│ └── values_dev.yaml # Development environment overrides +│ └── values_large.yaml # Large cluster overrides └── scripts/ └── bootstrap.sh # Main bootstrap script with size support ``` @@ -30,202 +31,190 @@ cluster-forge/ **Target**: Developer Cluster / Single-User Setup (1-5 users) **Infrastructure**: -- **Nodes**: 1 all-in-one or 2 nodes (1×CP + 1×GPU worker) +- **Nodes**: 1-2 nodes (single all-in-one or 1 control plane + 1 worker) - **CPU**: 8-32 vCPU total - **Memory**: 32-128 GB RAM total -- **GPU**: 1-4 GPUs, no partitioning needed -- **Storage**: 1-4 TB total NVMe, Internal S3: 0.5-2 TB +- **GPU**: 1-4 GPUs (optional) +- **Storage**: 2Ti+ total, local-path StorageClass - **Networking**: 1 GbE acceptable **Application Configuration**: -- **ArgoCD**: Single replica, minimal resources -- **MinIO**: Single server, 500GB storage -- **OpenBao**: Single instance (no HA) -- **Prometheus**: 7d retention, 10GB storage -- **Grafana**: Single replica, 1GB storage +- **ArgoCD**: Single replica, 2 CPU / 2Gi RAM limits +- **MinIO Tenant**: 2Ti storage, single server +- **OpenBao**: Single instance (no HA), 5Gi storage +- **Storage Policies**: Includes 
`kyverno-policies-storage-local-path` for RWX→RWO conversion +- **Storage Classes**: Mix of local-path and direct storage classes +- **Component Replicas**: All single replica deployments -**Use Cases**: Development, testing, proof-of-concept +**Use Cases**: Development, testing, proof-of-concept, local workstations ### Medium Cluster (`values_medium.yaml`) **Target**: Team Cluster (5-20 users) **Infrastructure**: -- **Nodes**: 1-3 nodes (Option A: 1×CP + 1-2 GPU workers, Option B: 3×CP + GPU workers) -- **CPU**: 32-64 vCPU per GPU node -- **Memory**: 128-256 GB RAM per GPU node -- **GPU**: Up to 8 GPUs total, partitioning optional -- **Storage**: 4-16 TB total NVMe, Internal S3: 2-10 TB +- **Nodes**: 3-5 nodes +- **CPU**: 32-64 vCPU per node +- **Memory**: 128-256 GB RAM per node +- **GPU**: Up to 8 GPUs total (optional) +- **Storage**: 500Gi+ total, local-path or distributed storage - **Networking**: 10 GbE recommended **Application Configuration**: -- **ArgoCD**: 2 replicas with HA Redis -- **MinIO**: 3 servers, 6TB total (3×2TB), datasets bucket -- **OpenBao**: 3 replicas with Raft HA -- **Enhanced resources** for team collaboration +- **ArgoCD**: Single replica, 1 CPU / 2Gi RAM limits +- **MinIO Tenant**: 2Ti storage, single server +- **OpenBao**: Single instance (no HA), 5Gi storage +- **Storage Policies**: Includes `kyverno-policies-storage-local-path` for RWX→RWO conversion +- **Storage Classes**: Direct storage class consistently +- **Component Replicas**: Balanced single replica configuration -**Use Cases**: Production workloads, staging environments +**Use Cases**: Team production workloads, staging environments, CI/CD ### Large Cluster (`values_large.yaml`) -**Target**: Production-Path / Scale-Out (10s-100s users) +**Target**: Production-Path / Enterprise Scale (10s-100s users) **Infrastructure**: -- **Nodes**: 3-5 dedicated CP servers + 3-6 GPU nodes (scale to 100s) -- **CPU**: Workers: 32-96 vCPU, CP nodes: 8-16 vCPU -- **Memory**: Workers: 256-1024 
GB, CP nodes: 32-64 GB +- **Nodes**: 10+ nodes (3-5 dedicated control plane + GPU workers) +- **CPU**: Workers: 32-96 vCPU, Control plane: 8-16 vCPU +- **Memory**: Workers: 256-1024 GB, Control plane: 32-64 GB - **GPU**: 8+ GPUs baseline, mixed families, heterogeneous -- **Storage**: 10-100+ TB NVMe, External HA S3 (recommended) -- **Networking**: 25 GbE or more, optional separate storage network +- **Storage**: 1Ti+ total, distributed storage required +- **Networking**: 25 GbE or more recommended **Application Configuration**: -- **ArgoCD**: 3 replicas with enhanced PDB -- **MinIO**: External HA S3 recommended -- **OpenBao**: Full HA with enhanced security -- **Full observability stack** with extended retention +- **ArgoCD**: Single replica, production-ready resources +- **MinIO Tenant**: 500Gi storage, single server (external HA S3 recommended) +- **OpenBao**: 3 replicas with Raft HA consensus +- **Storage Policies**: No local-path policies (assumes distributed storage) +- **OTEL LGTM Stack**: 50Gi storage per component (Tempo, Loki, Mimir), 10Gi Grafana +- **Component Replicas**: Production-grade, HA where applicable -**Use Cases**: Large-scale production, enterprise deployments +**Use Cases**: Large-scale production, enterprise deployments, multi-tenant environments ## Usage ### Using the Bootstrap Script -The bootstrap script automatically selects the appropriate size configuration: +The bootstrap script automatically applies the appropriate size configuration: ```bash -# Basic usage (auto-detects cluster size) +# Default (medium cluster) ./scripts/bootstrap.sh example.com # Explicitly specify cluster size -./scripts/bootstrap.sh example.com --size small -./scripts/bootstrap.sh example.com --size medium -./scripts/bootstrap.sh example.com --size large - -# CI mode (no interactive prompts) -./scripts/bootstrap.sh example.com --size medium --ci +./scripts/bootstrap.sh example.com --cluster-size=small +./scripts/bootstrap.sh example.com --cluster-size=medium 
+./scripts/bootstrap.sh example.com --cluster-size=large ``` -### Size Detection Logic +### Configuration Merge Logic -The bootstrap script uses multiple methods to determine cluster size: +The script combines configurations using YAML merge semantics: +1. **Base**: `values.yaml` (all applications, common defaults) +2. **Size-specific**: `values_[size].yaml` (overrides and size-specific additions) -1. **Explicit `--size` parameter** (highest priority) -2. **CLUSTER_SIZE from bloom-config ConfigMap** (if available) -3. **Auto-detection based on node count** (fallback to small) +Later values override earlier ones, allowing size files to contain only the differences (DRY principle). -### Configuration Merge Logic +## Key Configuration Differences + +### Storage Strategy -The script combines configurations in this order: -1. **Base**: `values.yaml` (all applications enabled) -2. **Size-specific**: `values_[size].yaml` (resource overrides) -3. **Environment-specific**: `values_dev.yaml' (if specified) +| Size | Storage Approach | RWX Support | Kyverno Policy | +|------|-----------------|-------------|----------------| +| Small | local-path | ❌ (mutated to RWO) | `kyverno-policies-storage-local-path` | +| Medium | local-path or distributed | ❌ (mutated to RWO) | `kyverno-policies-storage-local-path` | +| Large | Distributed storage | ✅ Native RWX | No local-path policy | -## Application-Specific Configurations +### High Availability -### ArgoCD Scaling +| Component | Small | Medium | Large | +|-----------|-------|--------|-------| +| OpenBao | Single instance | Single instance | 3 replicas (Raft HA) | +| ArgoCD | Single replica | Single replica | Single replica | +| Redis | Single instance | Single instance | Single instance | +| Gitea | Single replica | Single replica | Single replica | -| Size | Controller Replicas | Repo Server Replicas | Redis HA | Resources | -|------|--------------------|--------------------|----------|-----------| -| Small | 1 | 1 | Disabled | 
Minimal | -| Medium | 2 | 2 | Enabled | Standard | -| Large | 3 | 3 | Enhanced | High + PDB | +### Observability Stack +| Size | Stack | Storage per Component | Notes | +|------|-------|----------------------|-------| +| Small | Basic | Minimal | Resource-constrained | +| Medium | Basic | Moderate | Team-scale monitoring | +| Large | OTEL LGTM | 50Gi (Tempo/Loki/Mimir), 10Gi (Grafana) | Full observability platform | + +| Application | Small | Medium | Large | Notes | +|-------------|-------|--------|-------|-------| +| Gitea | Base config | Base config | SQLite, no PostgreSQL/Valkey | Lightweight for all sizes | +| Keycloak | Base config | Base config | 1 replica, optimized resources | CPU: 250-500m, Mem: 512Mi-2Gi | +| Kueue | 1 replica | 1 replica | 1 replica | Workload queue controller | +| KEDA | Base config | Base config | Base config | Event-driven autoscaling | +| KServe | Base config | Base config | Base config | ML model serving | +| Kyverno | Base policies | Base + storage-local-path | Base policies only | Policy engine | ### MinIO Tenant Scaling -| Size | Servers | Storage per Server | Total Storage | Buckets | -|------|---------|-------------------|---------------|---------| -| Small | 1 | 500Gi | 500GB | Basic (default, models) | -| Medium | 3 | 2Ti | 6TB | + datasets | -| Large | External | - | 10-100+ TB | Full enterprise | +| Size | Servers | Storage | Buckets | Notes | +|------|---------|---------|---------|-------| +| Small | 1 | 2Ti | default-bucket, models | Single server, local-path storage | +| Medium | 1 | 2Ti | default-bucket, models | Single server, direct storage | +| Large | 1 | 500Gi | default-bucket, models | Single server, external HA S3 recommended | ### OpenBao Scaling | Size | Mode | Replicas | Storage | HA Method | |------|------|----------|---------|-----------| -| Small | Standalone | 1 | 1Gi | None | -| Medium | HA | 3 | Standard | Raft | -| Large | HA | 3+ | Enhanced | Raft + external | - -## Advanced Configuration - -### 
Combining Size with Environment - -```bash -# Small development cluster -./scripts/bootstrap.sh dev.example.com --size small - -# Large production cluster with HA -./scripts/bootstrap.sh prod.example.com --size large -``` - -### Custom Overrides - -You can add additional override files: - -```bash -# Custom GPU configuration for large cluster -./scripts/bootstrap.sh gpu.example.com --size large -f custom-gpu-values.yaml -``` - -### Environment Variables - -The script supports environment variables: -- `CLUSTER_SIZE`: Override detected size -- `DOMAIN`: Set domain if not provided as argument -- `CI_MODE`: Enable CI mode (equivalent to `--ci`) - -## Validation - -The bootstrap script validates: -- **Node count** against cluster size requirements -- **Resource availability** for the selected size -- **Application compatibility** with cluster capabilities - -## Migration Between Sizes - -To change cluster size: - -1. **Update the size parameter**: Re-run bootstrap with new `--size` -2. **Resource validation**: Ensure cluster meets new requirements -3. **Application scaling**: ArgoCD will handle application updates -4. **Storage considerations**: May require storage expansion for larger sizes +| Small | Standalone | 1 | 5Gi | None | +| Medium | Standalone | 1 | 5Gi | None | +| Large | HA | 3 | 10Gi (default) | Raft consensus | ## Benefits 1. **Resource Optimization**: Right-sized configurations prevent over/under-provisioning -2. **Cost Efficiency**: Small clusters use minimal resources -3. **Scalability**: Easy to migrate between sizes as needs grow -4. **Consistency**: Standardized configurations across deployments -5. 
**Automation**: Bootstrap script handles complexity - -## Troubleshooting - -### Size Detection Issues -```bash -# Check current size detection -kubectl get configmap bloom-config -n default -o yaml - -# Force size override -./scripts/bootstrap.sh example.com --size medium -``` - -### Resource Constraints -```bash -# Validate node resources -kubectl describe nodes - -# Check for resource contention -kubectl top nodes -kubectl top pods --all-namespaces + - Small: Minimal replicas, basic resources + - Medium: Balanced configuration for team use + - Large: Production-grade with HA features + +2. **Storage Strategy**: Automatic policy application + - Small/Medium: Kyverno RWX→RWO mutation for local-path compatibility + - Large: Native RWX support with distributed storage + +3. **Cost Efficiency**: Progressive resource allocation + - Single replicas for small/medium clusters + - HA only enabled where needed (large clusters) + - DRY configuration principle reduces maintenance + +4. **Scalability**: Easy path from development to production + - Consistent application structure across sizes + - Configuration inheritance reduces duplication + - Clear upgrade path between sizes + +5. 
**Automation**: Bootstrap script handles all complexity + - Automatic value file merging + - Size-appropriate policy application + - Validation of configurations + +## Customization + +### Adding Custom Overrides + +Modify size-specific values files to adjust resources: + +```yaml +# values_large.yaml example +apps: + openbao: + valuesObject: + server: + ha: + enabled: true + replicas: 3 # HA for large clusters ``` -### Application Scaling Issues -```bash -# Check ArgoCD application status -kubectl get applications -n argocd - -# View specific application details -kubectl describe application -n argocd -``` +### Enabling/Disabling Applications ---- +Control which applications are deployed per size: -**This is the way** - A scalable configuration system that adapts ClusterForge applications to cluster capacity, ensuring optimal performance across all deployment sizes! \ No newline at end of file +```yaml +# values_small.yaml +enabledApps: + # Inherits base apps, adds storage policy + - kyverno-policies-storage-local-path +``` \ No newline at end of file diff --git a/docs/kyverno_access_mode_policy.md b/docs/kyverno_access_mode_policy.md index 81379df0..16bece97 100644 --- a/docs/kyverno_access_mode_policy.md +++ b/docs/kyverno_access_mode_policy.md @@ -110,9 +110,9 @@ cluster-forge/ # The policy is deployed or not based on values_*.yaml configuration # Large clusters simply don't include the policy in enabledApps -./scripts/bootstrap.sh example.com --CLUSTER_SIZE=small # Policy deployed -./scripts/bootstrap.sh example.com --CLUSTER_SIZE=medium # Policy deployed -./scripts/bootstrap.sh example.com --CLUSTER_SIZE=large # Policy NOT deployed +./scripts/bootstrap.sh example.com --cluster-size=small # Policy deployed +./scripts/bootstrap.sh example.com --cluster-size=medium # Policy deployed +./scripts/bootstrap.sh example.com --cluster-size=large # Policy NOT deployed ``` ## Usage Examples @@ -121,7 +121,7 @@ cluster-forge/ #### Small/Medium Cluster ```bash 
-./scripts/bootstrap.sh dev.example.com --CLUSTER_SIZE=small +./scripts/bootstrap.sh dev.example.com --cluster-size=small ``` **Result**: - Kyverno policy **deployed** @@ -130,7 +130,7 @@ cluster-forge/ #### Large Cluster ```bash -./scripts/bootstrap.sh prod.example.com --CLUSTER_SIZE=large +./scripts/bootstrap.sh prod.example.com --cluster-size=large ``` **Result**: - Kyverno policy **NOT deployed at all** @@ -217,7 +217,7 @@ kubectl get applications -n argocd -o yaml | grep "local-path-access-mode-polici # If found, the wrong values_*.yaml was used # Redeploy with correct size: -./scripts/bootstrap.sh --CLUSTER_SIZE=large +./scripts/bootstrap.sh --cluster-size=large ``` #### Policy NOT Working on Small/Medium Clusters @@ -262,7 +262,7 @@ kubectl logs -n kyverno -l app.kubernetes.io/name=kyverno ### Upgrading from Small/Medium to Large 1. **Deploy large cluster configuration**: ```bash - ./scripts/bootstrap.sh --CLUSTER_SIZE=large + ./scripts/bootstrap.sh --cluster-size=large ``` 2. **Policy automatically removed** (not in enabledApps) 3.
**Deploy Longhorn** for native RWX support diff --git a/docs/kyverno_modular_design.md b/docs/kyverno_modular_design.md index 92e8ebeb..9925e4d5 100644 --- a/docs/kyverno_modular_design.md +++ b/docs/kyverno_modular_design.md @@ -125,13 +125,13 @@ enabledApps: ```bash # Small cluster - Main + local-path policies -./scripts/bootstrap.sh dev.example.com --CLUSTER_SIZE=small +./scripts/bootstrap.sh dev.example.com --cluster-size=small # Medium cluster - Main + local-path policies -./scripts/bootstrap.sh team.example.com --CLUSTER_SIZE=medium +./scripts/bootstrap.sh team.example.com --cluster-size=medium # Large cluster - Exactly same as main branch -./scripts/bootstrap.sh prod.example.com --CLUSTER_SIZE=large +./scripts/bootstrap.sh prod.example.com --cluster-size=large ``` ### 🔍 **Policy Verification** diff --git a/docs/secret-management-user-guide.md b/docs/secret-management-user-guide.md deleted file mode 100644 index aefadaec..00000000 --- a/docs/secret-management-user-guide.md +++ /dev/null @@ -1,274 +0,0 @@ -# Secret Management User Guide - -This guide provides practical instructions for end-users to manage secrets in the cluster-forge OpenBao system. - -## Overview - -The cluster-forge secret management system uses a **declarative, GitOps-based approach** where secrets are defined in configuration files and automatically created by a CronJob that runs every 5 minutes. - -**How it works:** -- **For existing components**: All application secrets are already defined and automatically managed -- **For new components**: When you add a new application that needs secrets, you define them in the configuration file, commit the changes, and they're automatically created in OpenBao. Your new component can then fetch these secrets via External Secrets Operator using ExternalSecret resources that reference the OpenBao paths. - -**Example workflow for new components:** -1. Add your application deployment files -2. 
Define required secrets in `openbao-secret-definitions.yaml` -3. Create ExternalSecret resources to fetch the secrets from OpenBao -4. Your application pods automatically receive the secrets as Kubernetes Secret mounts - -## Quick Start: Adding a New Secret - -### 1. Edit the Secret Definition File - -Navigate to and edit: -``` -sources/openbao-config/0.1.0/templates/openbao-secret-definitions.yaml -``` - -### 2. Add Your Secret Definition - -Add a new line following this format: -``` -SECRET_PATH|TYPE|VALUE|BYTES -``` - -**Examples:** -```bash -# Random 32-byte password for your application -secrets/my-app-database-password|random||32 - -# Static API key -secrets/my-app-api-key|static|your-fixed-api-key-here|0 - -# Domain-based URL (uses templating) -secrets/my-app-callback-url|static|https://my-app.{{ .Values.domain }}/callback|0 -``` - -### 3. Commit and Push Changes - -```bash -git add sources/openbao-config/0.1.0/templates/openbao-secret-definitions.yaml -git commit -m "feat: add secrets for my-app" -git push origin main -``` - -### 4. Wait for Automatic Creation - -The GitOps pipeline will automatically create your secrets. **Total time: ~20-25 minutes** - -**Pipeline stages:** -- **GitHub → Gitea sync**: ~15 minutes (Gitea syncs every 15 minutes) -- **ArgoCD deployment**: ~3 minutes (ArgoCD detects and deploys changes) -- **Secret creation**: ~0-5 minutes (CronJob runs every 5 minutes) - -**Monitor progress:** -```bash -# Check ArgoCD sync status -kubectl get application openbao-config -n argocd - -# Check recent CronJob executions -kubectl get jobs -n cf-openbao -l job-name=openbao-secret-manager --sort-by=.metadata.creationTimestamp -``` - -**✅ Your secrets are ready when:** The CronJob shows a successful completion and you can verify the secret exists in OpenBao. 
- -## Secret Definition Format Reference - -### Format Specification - -``` -SECRET_PATH|TYPE|VALUE|BYTES -``` - -### Field Descriptions - -| Field | Description | Examples | -|-------|-------------|----------| -| **SECRET_PATH** | Path where secret will be stored in OpenBao | `secrets/my-app-password` | -| **TYPE** | Secret type: `static` or `random` | `random`, `static` | -| **VALUE** | Used only for static secrets (supports templating) | `my-api-key`, `https://api.{{ .Values.domain }}/v1` | -| **BYTES** | Used only for random secrets (length in bytes) | `16`, `32`, `64` | - -### Secret Types - -**Random Secrets:** -```bash -# Format: secrets/path|random||BYTES -secrets/my-app-password|random||16 # 16-byte password -secrets/api-key|random||32 # 32-byte API key -``` - -**Static Secrets:** -```bash -# Format: secrets/path|static|VALUE|0 -secrets/my-api-url|static|https://api.example.com|0 # Fixed value -secrets/my-callback|static|https://app.{{ .Values.domain }}|0 # Domain templating -``` - -## Working with Secrets - -### Viewing Secret Values - -```bash -# Check if secret exists in OpenBao -kubectl exec -n cf-openbao openbao-0 -- bao kv get secrets/my-app-password - -# View secret value (requires access) -kubectl exec -n cf-openbao openbao-0 -- bao kv get -field=value secrets/my-app-password -``` - -**Note**: Secrets are never updated automatically once created to prevent breaking applications. - -## Using Secrets in Applications - -### 1. 
Create an ExternalSecret Resource - -Create a file like `my-app-external-secret.yaml`: - -```yaml -apiVersion: external-secrets.io/v1beta1 -kind: ExternalSecret -metadata: - name: my-app-secrets - namespace: my-namespace -spec: - refreshInterval: 60s - secretStoreRef: - name: openbao-secret-store - kind: ClusterSecretStore - target: - name: my-app-secret - creationPolicy: Owner - data: - - secretKey: password - remoteRef: - key: secrets/my-app-password - property: value - - secretKey: api-key - remoteRef: - key: secrets/my-app-api-key - property: value -``` - -### 2. Use the Secret in Your Pod - -```yaml -apiVersion: v1 -kind: Pod -metadata: - name: my-app - namespace: my-namespace -spec: - containers: - - name: app - image: my-app:latest - env: - - name: DATABASE_PASSWORD - valueFrom: - secretKeyRef: - name: my-app-secret - key: password - - name: API_KEY - valueFrom: - secretKeyRef: - name: my-app-secret - key: api-key -``` - -## Current Secret Inventory - -For a complete and up-to-date list of all secrets in the system, refer to the **source of truth**: - -``` -sources/openbao-config/0.1.0/templates/openbao-secret-definitions.yaml -``` - -This file contains all currently defined secrets organized by category: -- **Cluster-Wide**: Domain configuration -- **AIRM Application**: Database, RabbitMQ, and UI authentication secrets -- **Keycloak**: Admin passwords and database credentials -- **MinIO**: Storage access keys and console credentials -- **Infrastructure**: Client secrets for Kubernetes, Gitea, and ArgoCD -- **AIWB Application**: Database and authentication secrets - -**To view current secrets:** -```bash -# View the complete secret definitions file -cat sources/openbao-config/0.1.0/templates/openbao-secret-definitions.yaml - -# Or check specific secrets in OpenBao -kubectl exec -n cf-openbao openbao-0 -- bao kv list secrets/ -``` - -## Troubleshooting - -### Secret Not Created After 25 Minutes - -1. 
**Check ArgoCD sync status:** - ```bash - kubectl get application openbao-config -n argocd - ``` - -2. **Check CronJob execution:** - ```bash - kubectl get cronjob openbao-secret-manager -n cf-openbao - kubectl get jobs -n cf-openbao -l job-name=openbao-secret-manager - ``` - -3. **Check CronJob logs:** - ```bash - # Get the most recent job - JOB=$(kubectl get jobs -n cf-openbao -l job-name=openbao-secret-manager --sort-by=.metadata.creationTimestamp -o jsonpath='{.items[-1].metadata.name}') - kubectl logs job/$JOB -n cf-openbao - ``` - -### Secret Definition Format Errors - -**Error**: CronJob fails with parsing errors - -**Solution**: Check your secret definition format: -- Ensure exactly 4 fields separated by `|` -- No extra spaces around the `|` separators -- For random secrets: VALUE field should be empty -- For static secrets: BYTES field should be `0` - -**Example of incorrect format:** -```bash -# Wrong - extra spaces -secrets/my-secret | random | | 32 - -# Wrong - missing field -secrets/my-secret|random|32 - -# Correct -secrets/my-secret|random||32 -``` - -### Common Issues - -**ExternalSecret not syncing:** -```bash -kubectl get externalsecret my-app-secrets -n my-namespace -kubectl describe externalsecret my-app-secrets -n my-namespace -``` - -**Secret not found in OpenBao:** -```bash -kubectl exec -n cf-openbao openbao-0 -- bao kv get secrets/my-app-password -``` - -## Best Practices - -**Naming:** Use descriptive, hierarchical names like `secrets/my-app-database-password` - -**Security:** Never commit actual secret values to git. Use random secrets for passwords/tokens. - -**Organization:** Group related secrets with consistent prefixes (e.g., `secrets/airm-*`) - -**Change Management:** Test in development first, existing secrets are never updated automatically. 
- -## Getting Help - -**For issues:** Check troubleshooting section, ArgoCD/CronJob logs, or see [secrets management architecture documentation](secrets_management_architecture.md) - -**For architectural details:** See [secrets management architecture documentation](secrets_management_architecture.md) for comprehensive system overview \ No newline at end of file diff --git a/docs/secrets_management_architecture.md b/docs/secrets_management_architecture.md deleted file mode 100644 index 4531ed76..00000000 --- a/docs/secrets_management_architecture.md +++ /dev/null @@ -1,455 +0,0 @@ -# Secrets Management Architecture - -## Overview - -This document describes the comprehensive secrets management architecture used in cluster-forge. The system is built around OpenBao (open-source Vault fork) as the central secrets vault, with External Secrets Operator enabling seamless integration with Kubernetes workloads. - -## Architecture Diagram - -```mermaid -graph TB - %% Styling - classDef vault fill:#4B8BBE,stroke:#306998,stroke-width:3px,color:#fff - classDef k8s fill:#326CE5,stroke:#00308F,stroke-width:2px,color:#fff - classDef app fill:#00D084,stroke:#00A86B,stroke-width:2px,color:#fff - classDef external fill:#FF6B6B,stroke:#C92A2A,stroke-width:2px,color:#fff - classDef cron fill:#FFA500,stroke:#FF8C00,stroke-width:2px,color:#fff - - %% OpenBao Core - subgraph OpenBao["OpenBao Vault Cluster (cf-openbao namespace)"] - BAO0[OpenBao-0
Leader]:::vault - BAO1[OpenBao-1
Follower]:::vault - BAO2[OpenBao-2
Follower]:::vault - - BAO0 -.Raft Replication.-> BAO1 - BAO0 -.Raft Replication.-> BAO2 - - subgraph Storage["Raft Integrated Storage"] - RAFT[Persistent Volumes
Raft Consensus]:::vault - end - - subgraph Auth["Authentication"] - USERPASS[UserPass Auth
readonly-user]:::vault - end - - subgraph Secrets["Secret Engines"] - KV2[KV v2: secrets/*
Generated Credentials]:::vault - APIKEY[KV v2: apikey-groups/*
API Keys]:::vault - RANDOM[sys/tools/random
Password Generator]:::vault - end - - BAO0 --> RAFT - BAO1 --> RAFT - BAO2 --> RAFT - BAO0 --> Auth - BAO0 --> Secrets - end - - %% Bootstrap Process - subgraph Bootstrap["Bootstrap Process (bootstrap.sh)"] - INIT[1. Init OpenBao
Generate Keys]:::cron - UNSEAL[2. Unseal All Pods
Join Raft Cluster]:::cron - SETUP[3. Setup Auth & Engines
Create read-policy]:::cron - GENSEC[4. Generate Secrets
Random Passwords]:::cron - - INIT --> UNSEAL - UNSEAL --> SETUP - SETUP --> GENSEC - end - - %% Unseal Automation - subgraph UnsealAuto["Automated Unseal (CronJob)"] - CRONJOB[openbao-unseal-job
Runs every 5 minutes]:::cron - UNSEALSCRIPT[Unseal Script
Checks sealed pods]:::cron - - CRONJOB --> UNSEALSCRIPT - end - - %% Kubernetes Secrets - subgraph K8sSecrets["Kubernetes Secrets Storage"] - BAOKEYS[openbao-keys
root_token, unseal_key]:::k8s - BAOUSER[openbao-user
readonly credentials]:::k8s - GITEAADMIN[gitea-admin-credentials
bootstrap admin]:::k8s - end - - %% External Secrets Operator - subgraph ESO["External Secrets Operator (external-secrets namespace)"] - ESOCTRL[ES Controller]:::external - ESOWH[ES Webhook]:::external - ESOCERT[ES Cert Controller]:::external - - ESOCTRL -.Watches.-> ESOWH - ESOCERT -.Manages.-> ESOWH - end - - %% ClusterSecretStores - subgraph CSS["ClusterSecretStores"] - CSS1[openbao-secret-store
UserPass Auth
path: secrets/]:::external - CSS2[k8s-secret-store
K8s SA Auth
backend: cf-es-backend]:::external - CSS3[airm-secret-store
Points to OpenBao]:::external - CSS4[k8srealm-secret-store
For Keycloak]:::external - CSS5[fake-secret-store
Testing/Defaults]:::external - end - - %% Application ExternalSecrets - subgraph AppSecrets["Application ExternalSecrets"] - ES1[keycloak-credentials]:::app - ES2[airm-realm-credentials]:::app - ES3[k8s-realm-credentials]:::app - ES4[minio-tenant secrets]:::app - ES5[cnpg database credentials]:::app - ES6[rabbitmq credentials]:::app - end - - %% Applications - subgraph Apps["Applications"] - KC[Keycloak
Identity Provider]:::app - GITEA[Gitea
Git Server]:::app - MINIO[MinIO
Object Storage]:::app - CNPG[CloudNativePG
Databases]:::app - RABBIT[RabbitMQ
Message Queue]:::app - end - - %% Flow connections - Bootstrap --> BAO0 - Bootstrap --> K8sSecrets - - K8sSecrets --> UnsealAuto - UnsealAuto --> BAO0 - UnsealAuto --> BAO1 - UnsealAuto --> BAO2 - - BAO0 --> CSS1 - BAO0 --> CSS3 - BAO0 --> CSS4 - K8sSecrets --> CSS2 - - CSS1 -.Authenticates via.-> USERPASS - CSS1 -.Reads from.-> KV2 - - ESO --> CSS1 - ESO --> CSS2 - ESO --> CSS3 - ESO --> CSS4 - ESO --> CSS5 - - CSS1 --> AppSecrets - CSS2 --> AppSecrets - CSS3 --> AppSecrets - CSS4 --> AppSecrets - - AppSecrets --> KC - AppSecrets --> GITEA - AppSecrets --> MINIO - AppSecrets --> CNPG - AppSecrets --> RABBIT - - BAOUSER -.Contains credentials for.-> CSS1 - BAOKEYS -.Unseals.-> BAO0 - BAOKEYS -.Unseals.-> BAO1 - BAOKEYS -.Unseals.-> BAO2 - - %% Secret Generation Flow - RANDOM -.Generates.-> KV2 - RANDOM -.Generates.-> APIKEY -``` - -## Key Components - -### 1. OpenBao Vault Cluster - -**Deployment Model:** -- 3-node cluster in High Availability (HA) mode -- Raft integrated storage (no external dependencies) -- Each pod runs in `cf-openbao` namespace -- Auto-unseal via CronJob every 5 minutes - -**Configuration:** -```yaml -Storage: Raft integrated -UI: Enabled -Auth Methods: userpass -Secret Engines: - - secrets/ (KV v2) - Application secrets - - apikey-groups/ (KV v2) - API key management - - sys/tools/random - Password generation -``` - -### 2. Bootstrap Process - -**init-openbao.sh:** -1. Checks if OpenBao is already initialized -2. Initializes with key-shares=1, key-threshold=1 (single key setup) -3. Stores `root_token` and `unseal_key` in K8s secret `openbao-keys` -4. Unseals all 3 pods -5. Forms Raft cluster (pods join via HTTP) - -**setup-openbao.sh:** -1. Enables KV v2 engines at `secrets/` and `apikey-groups/` -2. Enables `userpass` authentication -3. Creates `read-policy` for read-only access -4. Creates `readonly-user` with read-only permissions -5. 
Stores readonly credentials in K8s secret `openbao-user` - -**manage-secrets.sh (NEW - Unified Secret Management):** -Replaces the old hardcoded `generate-secrets.sh` with a declarative, config-driven approach: -1. Reads secret definitions from `openbao-secret-definitions.yaml` ConfigMap -2. Supports two secret types: - - `static`: Fixed values with domain templating support (e.g., `{{ .Values.domain }}`) - - `random`: Generated using OpenBao's random tool with specified byte length -3. Uses format: `SECRET_PATH|TYPE|VALUE|BYTES` (e.g., `secrets/my-app-password|random||32`) -4. Idempotent operation - skips existing secrets, only creates missing ones -5. Handles domain templating with `envsubst` for static values -6. Special handling for `cluster-auth-openbao-token` in init mode -7. Used by both bootstrap process and ongoing CronJob management -8. Comprehensive error handling and progress reporting instead of generate-secrets.sh. - -### 3. Automated Unseal Mechanism - -**CronJob Configuration:** -- Schedule: Every 5 minutes (`*/5 * * * *`) -- Runs in `cf-openbao` namespace -- Service Account: `openbao-unseal-job-sa` -- Permissions: Get pods, exec into pods, read secrets - -**Unseal Logic:** -1. Retrieves `unseal_key` from `openbao-keys` secret -2. Finds all running OpenBao pods that are sealed -3. Executes `bao operator unseal` on each sealed pod -4. Handles pod restarts and cluster member changes - -### 4. Automated Secret Management System - -**Declarative Secret Definition System:** -- **Location**: `sources/openbao-config/0.1.0/templates/openbao-secret-definitions.yaml` -- **Format**: Structured ConfigMap with pipe-delimited entries: `SECRET_PATH|TYPE|VALUE|BYTES` -- **Configuration Management**: Deployed as Helm chart enabling GitOps-based secret management -- **Domain Templating**: Static values support `{{ .Values.domain }}` templating for environment-specific configuration - -**Secret Types Supported:** -1. 
**Static Secrets**: - - Format: `secrets/cluster-domain|static|{{ .Values.domain }}|0` - - Use case: Fixed values, URLs, domain references - - Supports Helm templating for dynamic values -2. **Random Secrets**: - - Format: `secrets/my-app-password|random||32` - - Use case: Generated passwords, tokens, API keys - - Byte length specified in fourth field - -**CronJob-Based Management:** -- **Schedule**: Every 5 minutes (`*/5 * * * *`) -- **Purpose**: Ensures all defined secrets exist in OpenBao without manual intervention -- **Behavior**: Idempotent - only creates missing secrets, skips existing ones -- **Template**: `sources/openbao-config/0.1.0/templates/openbao-secret-manager-cronjob.yaml` -- **Service Account**: `openbao-secret-manager-sa` with minimal required permissions -- **Timeout**: 5-minute active deadline with single retry on failure - -**Configuration Management Features:** -- **Checksum Annotations**: Forces pod recreation when ConfigMap changes -- **Resource Limits**: Memory: 256Mi, CPU: 500m for controlled resource usage -- **Environment Variables**: Domain templating via Helm values injection -- **Volume Mounts**: Scripts from `openbao-secret-manager-scripts`, config from `openbao-secrets-config` - -**Adding New Secrets Workflow:** -1. Edit `sources/openbao-config/0.1.0/templates/openbao-secret-definitions.yaml` -2. Add line following format: `secrets/my-app-password|random||32` -3. Commit and push to main branch -4. ArgoCD syncs the configuration within ~3 minutes -5. CronJob automatically creates the secret within ~5 minutes -6. 
Total time from commit to secret availability: ~8 minutes - -**For detailed user guide:** See [secret management user guide](secret-management-user-guide.md) for step-by-step instructions and examples - -**Examples from Current Configuration:** -``` -# Database credentials -secrets/airm-cnpg-user-password|random||16 - -# Static domain-based URLs -secrets/minio-openid-url|static|https://kc.{{ .Values.domain }}/realms/airm/.well-known/openid-configuration|0 - -# Fixed API keys -secrets/minio-api-access-key|static|api-default-user|0 -``` - -### 5. External Secrets Operator - -**Components:** -- **Controller**: Watches ExternalSecret resources and syncs from backends -- **Webhook**: Validates ExternalSecret/SecretStore resources -- **Cert Controller**: Manages TLS certificates for webhooks - -**ClusterSecretStore Types:** - -1. **openbao-secret-store** - - Provider: OpenBao (vault) - - Auth: UserPass (readonly-user) - - Path: secrets/ - - Used by: Most application secrets - -2. **k8s-secret-store** - - Provider: Kubernetes - - Auth: Service Account (external-secrets-readonly) - - Backend: cf-es-backend namespace - - Used by: Cross-namespace secret sharing - -3. **airm-secret-store** / **k8srealm-secret-store** - - Domain-specific stores for AIRM and K8s realm - - Point to OpenBao with specific paths - -4. **fake-secret-store** - - Provider: Fake (hardcoded values) - - Used for: Testing and default values - -### 6. Secret Flow Architecture - -```mermaid -flowchart TD - %% Styling - classDef bootstrap fill:#FFE066,stroke:#FFB800,stroke-width:3px,color:#000 - classDef vault fill:#4B8BBE,stroke:#306998,stroke-width:3px,color:#fff - classDef config fill:#9B59B6,stroke:#8E44AD,stroke-width:2px,color:#fff - classDef k8s fill:#326CE5,stroke:#00308F,stroke-width:2px,color:#fff - classDef app fill:#00D084,stroke:#00A86B,stroke-width:2px,color:#fff - classDef external fill:#FF6B6B,stroke:#C92A2A,stroke-width:2px,color:#fff - - %% Main flow - A[Bootstrap Script
1. Deploy openbao-config
2. Initialize OpenBao]:::bootstrap - B[OpenBao Vault Cluster
3 replicas
Unseals every 5min]:::vault - C[Automated Management
CronJob every 5min
- Reads config
- Creates missing
- Skips existing]:::config - D["Secret Definition
ConfigMap (Helm-managed)
- Format: PATH|TYPE|...
- Domain templating
- GitOps managed"]:::config - E[KV v2 Engine
secrets/* in OpenBao]:::vault - F[ClusterSecretStore
openbao-secret-store]:::external - G[ExternalSecret
Resources]:::k8s - H[Application Pod
mounts secret]:::app - - %% Flow connections with labels - A -->|1. Deploys| B - A -->|1. Creates| D - D -->|3. Monitors definitions| C - C -->|2. Config-driven secret creation| B - B -->|4. Secrets stored| E - E -->|5. External Secrets reads| F - F -->|6. Sync to K8s| G - G -->|7. Creates K8s Secret| H - - %% Feedback loop - C -.->|Monitors ConfigMap| D -``` - -### 7. Secret Categories - -**Identity & Authentication:** -- Keycloak admin password -- OAuth client secrets (Gitea, ArgoCD, AIRM UI) -- Realm credentials (AIRM, K8s) - -**Database Credentials:** -- PostgreSQL superuser & user credentials (AIRM, Keycloak, Catalog) -- Generated via OpenBao random tool -- Managed by CloudNativePG operator - -**Storage & Messaging:** -- MinIO root password, API keys, console keys -- MinIO OpenID Connect URLs -- RabbitMQ user credentials - -**Cluster Infrastructure:** -- Cluster admin tokens -- OpenBao root token (stored in K8s) -- Domain configuration - -### 8. Security Model - -**Encryption at Rest:** -- OpenBao data encrypted in Raft storage -- Kubernetes secrets encrypted if cluster encryption is enabled - -**Access Control:** -- **Root Token**: Stored in K8s secret, used only during bootstrap -- **Readonly User**: Limited to read operations on secrets path -- **Service Accounts**: Scoped to specific namespaces - -**Network Security:** -- OpenBao accessible only within cluster (ClusterIP) -- TLS disabled for internal communication (cluster-internal) -- External Secrets uses internal service DNS - -**Secret Rotation:** -- OpenBao supports secret versioning (KV v2) -- Applications can reference specific versions -- Old versions retained for rollback - -### 9. Disaster Recovery - -**Backup Strategy:** -- OpenBao unseal key stored in `openbao-keys` K8s secret -- Root token stored in `openbao-keys` K8s secret -- Raft storage on persistent volumes - -**Recovery Process:** -1. Restore persistent volumes with Raft data -2. Deploy OpenBao pods -3. Unseal using stored unseal key -4. 
Verify cluster health via `bao operator raft list-peers` - -**Important Notes:** -- Single unseal key (key-shares=1) - simplified but less secure -- For production, use Shamir's Secret Sharing (key-shares=5, threshold=3) -- Consider auto-unseal with cloud KMS for production - -### 10. Integration Points - -**Gitea Configuration:** -- Admin credentials generated during bootstrap -- OAuth client secret from OpenBao -- Integrated with Keycloak via OIDC - -**Keycloak Realms:** -- Two realms: `airm` and `k8s` -- Client secrets managed in OpenBao -- Realm templates with placeholder substitution - -**CloudNativePG:** -- Superuser and application user credentials -- Secrets created before cluster bootstrap -- Automatic database initialization - -**MinIO Tenant:** -- Console and API credentials separate -- OIDC integration with Keycloak -- Auto-configured with OpenBao secrets - -## Monitoring & Observability - -**Health Checks:** -- OpenBao: `bao status` via exec probe -- External Secrets: Controller logs and metrics -- Secret Sync: ExternalSecret CR status conditions - -**Common Issues:** -- **Sealed Vault**: Check CronJob execution and unseal key -- **Secret Sync Failure**: Verify ClusterSecretStore authentication -- **Missing Secrets**: Check OpenBao path and ExternalSecret remoteRef - -## Best Practices - -1. **Never commit unseal keys or root tokens** to version control -2. **Rotate readonly user credentials** periodically -3. **Monitor ExternalSecret sync errors** for failed secret updates -4. **Use specific secret versions** in production for stability -5. **Test secret rotation** in staging before production -6. **Backup `openbao-keys` secret** to secure external location -7. **Enable audit logging** in OpenBao for compliance -8. 
**Use namespaced SecretStores** for tenant isolation when possible - -## Future Enhancements - -- [ ] Implement auto-unseal with cloud KMS -- [ ] Add secret rotation automation -- [ ] Enable OpenBao audit logging -- [ ] Implement Shamir's Secret Sharing (N-of-M keys) -- [ ] Add monitoring/alerting for unsealed state -- [ ] Integrate with cert-manager for TLS -- [ ] Add RBAC policies for fine-grained access -- [ ] Implement secret versioning strategy diff --git a/docs/values_inheritance_pattern.md b/docs/values_inheritance_pattern.md index 51fb529d..960667d1 100644 --- a/docs/values_inheritance_pattern.md +++ b/docs/values_inheritance_pattern.md @@ -2,111 +2,209 @@ ## Overview -ClusterForge implements a sophisticated GitOps deployment pattern that supports both external GitHub deployment and local cluster-native deployment through dual values files and repository configurations. +Cluster-Forge implements a sophisticated dual-repository GitOps deployment pattern that supports both external GitHub deployment and local cluster-native deployment through separate configuration and application repositories. 
## Two Deployment Modes -### External Mode (`values.yaml`) +### Local Mode (Default) ```yaml clusterForge: - repoUrl: "https://github.com/silogen/cluster-forge.git" - targetRevision: v1.7.1 - valuesFile: values.yaml + repoUrl: "http://gitea-http.cf-gitea.svc:3000/cluster-org/cluster-forge.git" + targetRevision: # filled by bootstrap script --target-revision externalValues: - enabled: false # Uses single external source + enabled: true # Uses multi-source pattern + repoUrl: "http://gitea-http.cf-gitea.svc:3000/cluster-org/cluster-values.git" + targetRevision: main # always main for local cluster overrides ``` -**Purpose**: Traditional GitOps with external GitHub dependency -**Use Cases**: Initial deployment, CI/CD pipelines, production releases -**Network**: Requires external internet access +**Purpose**: Self-contained cluster-native GitOps with local Gitea +**Use Cases**: Air-gapped environments, autonomous operation, production deployments +**Network**: Self-contained within cluster network +**Features**: +- Local Gitea serves both cluster-forge and cluster-values repositories +- Initialization handled by gitea-init-job during bootstrap +- Zero external dependencies once bootstrapped +- Full configuration version control within cluster -### Local Mode (`values_cf.yaml`) +### External Mode ```yaml clusterForge: - repoUrl: "http://gitea-http.cf-gitea.svc:3000/cluster-org/cluster-forge.git" - targetRevision: main + repoUrl: "https://github.com/silogen/cluster-forge.git" + targetRevision: # filled by bootstrap script --target-revision (e.g., v1.8.0, feature-branch) externalValues: - enabled: true # Uses local multi-source - repoUrl: "http://gitea-http.cf-gitea.svc:3000/cluster-org/cluster-values.git" - targetRevision: main - path: values_cf.yaml + enabled: false # Single source from GitHub ``` -**Purpose**: Self-contained GitOps with local Gitea and separate configuration repository -**Use Cases**: Air-gapped environments, developer clusters, autonomous operation 
-**Network**: Self-contained within cluster network +**Purpose**: Traditional GitOps with external GitHub dependency +**Use Cases**: Initial deployment, CI/CD pipelines, feature branch testing +**Network**: Requires external internet access +**Features**: +- Direct GitHub access for application deployment +- Supports custom branch selection for testing ## Size-Specific Inheritance -ClusterForge uses Helm's multi-values file support for cluster size configuration: +Cluster-Forge uses YAML merge semantics for cluster size configuration: ```bash -helm template -f values.yaml -f values_medium.yaml +# Bootstrap merges values using yq eval-all +yq eval-all '. as $item ireduce ({}; . * $item)' \ + values.yaml values_medium.yaml ``` ### Inheritance Hierarchy -1. **Base**: `values.yaml` or `values_cf.yaml` (52 common applications) +1. **Base**: `values.yaml` (common applications and defaults) 2. **Size Override**: `values_small.yaml`, `values_medium.yaml`, or `values_large.yaml` -3. **Runtime**: Domain and cluster-specific parameters +3. **External**: `cluster-values/values.yaml` from Gitea (when externalValues.enabled: true) +4. 
**Runtime**: Domain and cluster-specific parameters injected during bootstrap + +### DRY Principle in Size Files + +Size files only contain differences from base (Don't Repeat Yourself): + +**Base values.yaml**: +- Complete application definitions for all apps +- Alpha-sorted `enabledApps` list +- Common defaults applicable to all sizes + +**Size-specific values**: +- Only resource overrides that differ from base +- Size-specific enabledApps additions (e.g., storage policies) +- HA configurations for large clusters + +**Example**: +```yaml +# values_small.yaml - only differences +enabledApps: + - kyverno-policies-storage-local-path # Added to base list + +apps: + argocd: + valuesObject: + controller: + resources: + limits: + cpu: 2000m # Override from base + memory: 4Gi +``` + +| Cluster Size | Apps from Base | Additional Apps | Configuration Overrides | +|--------------|----------------|-----------------|------------------------| +| **Small** | All base apps | +1 (storage policy) | Minimal resources, single replicas | +| **Medium** | All base apps | +1 (storage policy) | Balanced resources, single replicas | +| **Large** | All base apps | +0 (no additions) | Production resources, OpenBao HA (3 replicas) | + +## Bootstrap and GitOps Workflow -### Size File Structure -- **Base files**: Complete application definitions and 52 enabledApps -- **Size files**: Only contain differences from base (DRY principle) -- **Large clusters**: No size file needed (inherit everything from base) +### Bootstrap Process -| Cluster Size | Apps from Base | Additional Apps | Total Apps | -|--------------|----------------|-----------------|-----------| -| **Small** | 52 (inherited) | +1 (storage policy) | **53 apps** | -| **Medium** | 52 (inherited) | +1 (storage policy) | **53 apps** | -| **Large** | 52 (inherited) | +0 (no additions) | **52 apps** | +The bootstrap script establishes the GitOps foundation: -## Repository Transition Pattern +**Phase 1: Pre-Cleanup** +- Removes previous 
installations when applicable -### Bootstrap Workflow -1. **External Bootstrap**: Deploy from GitHub for initial setup -2. **Local Transition**: Switch to local Gitea for autonomous operation -3. **Developer Access**: Local Git workflows for cluster configuration -4. **Upstream Sync**: Periodic synchronization with main project +**Phase 2: GitOps Foundation Bootstrap** +1. ArgoCD deployment (helm template) +2. OpenBao deployment and initialization +3. Gitea deployment and initialization + - Creates cluster-org organization + - Clones cluster-forge from initial-cf-values ConfigMap + - Creates cluster-values repository -### Multi-Source GitOps -When using `values_cf.yaml`, ArgoCD uses two separate repositories: -- **Application Source**: `cluster-org/cluster-forge` (Helm charts and manifests) -- **Configuration Source**: `cluster-org/cluster-values` (values.yaml customizations) +**Phase 3: App-of-Apps Deployment** +- Creates cluster-forge Application in ArgoCD +- Uses multi-source when externalValues.enabled: true +- ArgoCD manages all remaining applications -This separation enables independent versioning of infrastructure vs. settings. +### Multi-Source GitOps Pattern + +When using local mode (`externalValues.enabled: true`), ArgoCD uses two separate repositories: + +**Source 1: Application Source** (`cluster-forge`) +- Helm charts and manifests in `sources/` directory +- Application definitions in `root/` chart +- Component versions and configurations + +**Source 2: Configuration Source** (`cluster-values`) +- Custom `values.yaml` for environment-specific overrides +- Domain and cluster-specific settings +- Independent versioning from application code + +This separation enables: +- Different update cadences for infrastructure vs. configuration +- Easy configuration rollback without affecting application versions +- Clear ownership separation + +### Value Merge Order + +When ArgoCD renders applications with multi-source: + +1. 
**Base values** from `cluster-forge/root/values.yaml` +2. **Size-specific** from `cluster-forge/root/values_.yaml` +3. **External overrides** from `cluster-values/values.yaml` +4. **Runtime parameters** (domain, targetRevision) injected by bootstrap ## Developer Workflow -### Local Configuration Management +### Local Configuration Management (Local Mode) + ```bash -# Clone local configuration repository +# Clone local configuration repository from Gitea git clone http://gitea.cluster.example.com/cluster-org/cluster-values.git cd cluster-values # Modify cluster configurations -vim values_cf.yaml -git add values_cf.yaml +vim values.yaml +git add values.yaml git commit -m "Update cluster configuration" git push -# ArgoCD automatically deploys the changes +# ArgoCD automatically detects and syncs the changes ``` +**Example: AIRM Image Repository Configuration** + +To configure custom AIRM image repositories post-bootstrap, modify `cluster-values/values.yaml`: + +```yaml +# Custom AIRM image repositories for private registry +airm-api: + airm: + backend: + image: + repository: harbor.mycompany.com/airm/airm-api + frontend: + image: + repository: harbor.mycompany.com/airm/airm-ui + +airm-dispatcher: + airm: + dispatcher: + image: + repository: harbor.mycompany.com/airm/airm-dispatcher +``` + +This allows deployment from private registries, air-gapped environments, or custom built images. + ### Configuration Version Control -- All cluster configuration changes tracked in Git history -- Pull request workflow for configuration reviews -- Automatic deployment through ArgoCD sync -- Rollback capabilities through Git revert + +Benefits of the dual-repository pattern: +- **Full Git history**: Track all cluster configuration changes +- **Pull request workflow**: Review configuration changes before deployment +- **Automatic deployment**: ArgoCD syncs on Git push +- **Rollback capabilities**: Revert via Git history +- **Separation of concerns**: Infrastructure code vs. 
environment configuration ## Benefits -1. **🎯 Deployment Flexibility**: External dependency → local autonomy transition +1. **🎯 Deployment Flexibility**: Support for both external and local GitOps modes 2. **🔄 Version Control**: Full Git history for all cluster configuration changes -3. **🛡️ Air-Gap Ready**: Works in secure, isolated environments -4. **👥 Developer Experience**: Local Git access for cluster configuration -5. **📦 Upstream Sync**: Can receive updates from main project +3. **🛡️ Air-Gap Ready**: Works in secure, isolated environments with local Gitea +4. **👥 Developer Experience**: Local Git access for cluster configuration management +5. **📦 Multi-Source Pattern**: Separate application code from configuration 6. **🔧 Maintainability**: DRY principle eliminates configuration redundancy +7. **🚀 Bootstrap Automation**: Single command establishes complete GitOps infrastructure -This architectural pattern enables clusters to evolve from external dependency to local autonomy while maintaining all benefits of declarative configuration management. \ No newline at end of file +This architectural pattern enables clusters to operate with full GitOps benefits while maintaining flexibility for different deployment scenarios from development to air-gapped production environments. \ No newline at end of file diff --git a/root/templates/_helpers.yaml b/root/templates/_helpers.yaml index adb29d25..096b29d5 100644 --- a/root/templates/_helpers.yaml +++ b/root/templates/_helpers.yaml @@ -1,3 +1,4 @@ +{{/* Renders a value that contains template. 
Usage: {{ include "common.tplvalues.render" ( dict "value" .Values.path.to.the.Value "context" $) }} diff --git a/root/templates/cluster-apps.yaml b/root/templates/cluster-apps.yaml index 54d1a2d9..b0d79d11 100644 --- a/root/templates/cluster-apps.yaml +++ b/root/templates/cluster-apps.yaml @@ -30,7 +30,7 @@ spec: {{- end }} {{- if .valuesObject }} values: | -{{ .valuesObject | toYaml | nindent 8 }} + {{ .valuesObject | toYaml | nindent 8 }} {{- end }} {{- if .helmParameters }} parameters: diff --git a/root/templates/cluster-forge.yaml b/root/templates/cluster-forge.yaml index ce7cee38..4bf42558 100644 --- a/root/templates/cluster-forge.yaml +++ b/root/templates/cluster-forge.yaml @@ -6,15 +6,14 @@ metadata: namespace: argocd spec: project: default - {{- if .Values.externalValues.enabled }} +{{- if .Values.externalValues.enabled }} # helm-chart & values file from 2 different git repos + # Uses the SAME targetRevision for both chart templates AND values sources: - repoURL: {{ .Values.clusterForge.repoUrl }} targetRevision: {{ .Values.clusterForge.targetRevision }} path: root helm: - # here we want the base values.yaml and the custom values file from external repo - # the path to the custom values file is relative to the root of the external values repo valueFiles: - {{ .Values.externalValues.path }} - {{ .Values.global.clusterSize }} @@ -22,7 +21,7 @@ spec: - repoURL: {{ .Values.externalValues.repoUrl }} targetRevision: {{ .Values.externalValues.targetRevision }} ref: cluster-values - {{ else }} +{{- else }} # helm-chart & values file within the same git repo source: repoURL: {{ .Values.clusterForge.repoUrl }} @@ -31,11 +30,12 @@ spec: helm: valueFiles: - {{ .Values.clusterForge.valuesFile }} - {{- end }} + - {{ .Values.global.clusterSize }} +{{- end }} destination: server: https://kubernetes.default.svc namespace: argocd syncPolicy: automated: prune: true - selfHeal: true + selfHeal: true \ No newline at end of file diff --git a/root/values.yaml b/root/values.yaml 
index bb6c2c1f..44630018 100644 --- a/root/values.yaml +++ b/root/values.yaml @@ -1,70 +1,149 @@ clusterForge: repoUrl: "http://gitea-http.cf-gitea.svc:3000/cluster-org/cluster-forge.git" - targetRevision: v1.8.0-rc2 -# source helm values file from separate git repo + targetRevision: # injected via scripts/bootstrap.sh; tag, branch, or commit externalValues: enabled: true + path: values.yaml repoUrl: "http://gitea-http.cf-gitea.svc:3000/cluster-org/cluster-values.git" targetRevision: main - path: values.yaml global: - domain: # to be filled by bootstrap script - clusterSize: # to be filled by bootstrap script (small, medium, large) -enabledApps: - - aim-cluster-model-source - - airm - - amd-gpu-operator - - amd-gpu-operator-config - - appwrapper - - argocd - - argocd-config - - cert-manager - - cluster-auth - - cluster-auth-config - - cnpg-operator - - external-secrets - - external-secrets-config - - gateway-api - - gitea - - gitea-config - - kaiwo - - kaiwo-config - - kaiwo-crds - - keda - - kedify-otel - - keycloak - - kgateway - - kgateway-config - - kgateway-crds - - kserve - - kserve-crds - - kueue - - kueue-config - - kuberay-operator - - kyverno - - kyverno-config - - kyverno-policies-base - - metallb - - minio-operator - - minio-tenant - - minio-tenant-config - - openbao - - openbao-config - - opentelemetry-operator - - otel-lgtm-stack - - prometheus-crds - - rabbitmq + clusterSize: # injected via scripts/bootstrap.sh + domain: # injected via scripts/bootstrap.sh + apps: - # Core apps + aim-cluster-model-source: + namespace: kaiwo-system + path: aim-cluster-model-source + syncWave: -20 + airm: + helmParameters: + - name: airm-api.airm.appDomain + value: "{{ .Values.global.domain }}" + ignoreDifferences: + - group: external-secrets.io + jqPathExpressions: + - ".spec.data[].remoteRef.conversionStrategy" + - ".spec.data[].remoteRef.decodingStrategy" + - ".spec.data[].remoteRef.metadataPolicy" + kind: ExternalSecret + - group: kyverno.io + jqPathExpressions: + 
- ".spec.rules" + kind: ClusterPolicy + namespace: airm + path: airm/0.3.5 + syncWave: 0 + valuesFile: values.yaml + amd-gpu-operator: + namespace: kube-amd-gpu + path: amd-gpu-operator/v1.4.1 + syncWave: -10 + valuesObject: + crds: + defaultCR: + install: false + amd-gpu-operator-config: + namespace: kube-amd-gpu + path: amd-gpu-operator-config + syncWave: 0 + appwrapper: + namespace: appwrapper-system + path: appwrapper/v1.1.2 + syncWave: -10 argocd: - path: argocd/8.3.5 + helmParameters: + - name: global.domain + value: "argocd.{{ .Values.global.domain }}" + - name: configs.cm.oidc\.config + value: | + name: Keycloak + issuer: https://kc.{{ .Values.global.domain }}/realms/airm + clientID: argocd + clientSecret: $$argocd-oidc-creds:client_secret + rootCA: $cluster-tls:cert + requestedScopes: ["openid", "profile", "email", "groups"] namespace: argocd + path: argocd/8.3.5 + syncWave: -30 valuesObject: applicationSet: replicas: 1 configs: cm: create: true + resource.customizations.health.apps_StatefulSet: | + -- Custom health check for OpenBao StatefulSet + -- Uses standard StatefulSet readiness for now - openbao-init job handles actual initialization + hs = {} + if obj.status ~= nil then + if obj.status.readyReplicas ~= nil and obj.status.replicas ~= nil then + if obj.status.readyReplicas == obj.status.replicas then + hs.status = "Healthy" + hs.message = "StatefulSet is ready" + else + hs.status = "Progressing" + hs.message = "Waiting for StatefulSet replicas to be ready" + end + else + hs.status = "Progressing" + hs.message = "Waiting for StatefulSet status" + end + else + hs.status = "Progressing" + hs.message = "Waiting for StatefulSet status" + end + return hs + resource.customizations.health.batch_Job: | + -- Custom health check for Jobs + hs = {} + if obj.status ~= nil then + if obj.status.conditions ~= nil then + for _, condition in ipairs(obj.status.conditions) do + if condition.type == "Complete" and condition.status == "True" then + hs.status = "Healthy" 
+ hs.message = "Job completed successfully" + return hs + elseif condition.type == "Failed" and condition.status == "True" then + hs.status = "Degraded" + hs.message = "Job failed" + return hs + end + end + end + -- Check for active jobs + if obj.status.active and obj.status.active > 0 then + hs.status = "Progressing" + hs.message = "Job is running" + return hs + end + end + hs.status = "Progressing" + hs.message = "Job status unknown" + return hs + resource.customizations.health.keda.sh_ScaledObject: | + hs = {} + if obj.status ~= nil then + if obj.status.conditions ~= nil then + for _, condition in ipairs(obj.status.conditions) do + if condition.type == "Ready" then + if condition.status == "True" then + hs.status = "Healthy" + hs.message = "ScaledObject is ready" + else + hs.status = "Degraded" + hs.message = condition.reason or "ScaledObject not ready" + end + return hs + end + end + end + hs.status = "Progressing" + hs.message = "ScaledObject status unknown" + else + hs.status = "Progressing" + hs.message = "ScaledObject status unknown" + end + return hs resource.customizations.health.opentelemetry.io_OpenTelemetryCollector: | hs = {} hs.status = "Healthy" @@ -78,307 +157,222 @@ apps: g, argocd-users, role:admin controller: replicas: 1 + global: + domain: # to be filled by cluster-forge app redis: enabled: true redis-ha: enabled: false repoServer: - replicas: 1 autoscaling: enabled: false - server: replicas: 1 + server: autoscaling: enabled: false - global: - domain: # to be filled by cluster-forge app - helmParameters: - - name: global.domain - value: "argocd.{{ .Values.global.domain }}" - - name: configs.cm.oidc\.config - value: | - name: Keycloak - issuer: https://kc.{{ .Values.global.domain }}/realms/airm - clientID: argocd - clientSecret: $$argocd-oidc-creds:client_secret - requestedScopes: ["openid", "profile", "email", "groups"] - syncWave: -3 + replicas: 1 argocd-config: - path: argocd-config - namespace: argocd - syncWave: -2 ignoreDifferences: - 
group: external-secrets.io - kind: ExternalSecret jqPathExpressions: - ".spec.data[].remoteRef.conversionStrategy" - ".spec.data[].remoteRef.decodingStrategy" - ".spec.data[].remoteRef.metadataPolicy" + kind: ExternalSecret + namespace: argocd + path: argocd-config + syncWave: 5 cert-manager: namespace: cert-manager path: cert-manager/v1.18.2 - syncWave: -4 + syncWave: -40 valuesObject: installCRDs: true - openbao: - path: openbao/0.18.2 - namespace: cf-openbao - valuesObject: - injector: - enabled: false - server: - affinity: | - podAntiAffinity: - preferredDuringSchedulingIgnoredDuringExecution: - - weight: 100 - podAffinityTerm: - labelSelector: - matchLabels: - app.kubernetes.io/name: openbao - app.kubernetes.io/instance: openbao - component: server - topologyKey: kubernetes.io/hostname - ha: - enabled: false - raft: - enabled: false - replicas: 1 - ui: - enabled: true - syncWave: -4 + cluster-auth: + namespace: cluster-auth + path: cluster-auth/0.5.0 + syncWave: -25 + valuesFile: values.yaml + cluster-auth-config: ignoreDifferences: - - group: "apps" - kind: "Deployment" - jsonPointers: - - /spec/replicas - - group: "apps" - kind: "StatefulSet" - name: "openbao" - jsonPointers: - - /spec/volumeClaimTemplates - openbao-config: - path: openbao-config/0.1.0 - namespace: cf-openbao + - group: external-secrets.io + jqPathExpressions: + - ".spec.data[].remoteRef.conversionStrategy" + - ".spec.data[].remoteRef.decodingStrategy" + - ".spec.data[].remoteRef.metadataPolicy" + kind: ExternalSecret + namespace: cluster-auth + path: cluster-auth-config + syncWave: 5 + cnpg-operator: + namespace: cnpg-system + path: cnpg-operator/0.26.0 + syncWave: -30 valuesFile: values.yaml - helmParameters: - - name: domain - value: "{{ .Values.global.domain }}" - syncWave: -2 external-secrets: - path: external-secrets/0.15.1 namespace: external-secrets + path: external-secrets/0.15.1 + syncWave: -40 valuesFile: values.yaml - syncWave: -4 external-secrets-config: - path: 
external-secrets-config namespace: external-secrets - syncWave: -2 + path: external-secrets-config + syncWave: -10 + gateway-api: + namespace: default + path: gateway-api/v1.3.0 + syncWave: -50 gitea: - path: gitea/12.3.0 + helmParameters: + - name: clusterDomain + value: "{{ .Values.global.domain }}" + - name: gitea.config.server.ROOT_URL + value: "https://gitea.{{ .Values.global.domain }}" namespace: cf-gitea + path: gitea/12.3.0 + syncWave: -30 valuesObject: clusterDomain: # to be filled by cluster-forge app - strategy: - type: "Recreate" gitea: admin: existingSecret: gitea-admin-credentials config: - server: - ROOT_URL: # to be filled by cluster-forge app - database: - DB_TYPE: sqlite3 - session: - PROVIDER: memory cache: ADAPTER: memory + database: + DB_TYPE: sqlite3 queue: TYPE: level - valkey-cluster: - enabled: false - valkey: - enabled: false + server: + ROOT_URL: # to be filled by cluster-forge app + session: + PROVIDER: memory + persistence: + enabled: true postgresql: enabled: false postgresql-ha: enabled: false - persistence: - enabled: true + service: + http: {} + ssh: {} + strategy: + type: "Recreate" test: enabled: false - helmParameters: - - name: clusterDomain - value: "{{ .Values.global.domain }}" - - name: gitea.config.server.ROOT_URL - value: "https://gitea.{{ .Values.global.domain }}" - syncWave: -3 + valkey: + enabled: false + valkey-cluster: + enabled: false gitea-config: - path: gitea-config - namespace: cf-gitea - valuesFile: values.yaml helmParameters: - - name: keycloak.url - value: "https://kc.{{ .Values.global.domain }}" - name: keycloak.realm value: "airm" - syncWave: -2 - # Network apps - gateway-api: - path: gateway-api/v1.3.0 - namespace: default - syncWave: -5 - metallb: - path: metallb/v0.15.2 - namespace: default - syncWave: -4 - kgateway-crds: - path: kgateway-crds/v2.1.0-main - namespace: kgateway-system + - name: keycloak.url + value: "https://kc.{{ .Values.global.domain }}" + namespace: cf-gitea + path: gitea-config + 
syncWave: -20 valuesFile: values.yaml - syncWave: -3 - kgateway: - path: kgateway/v2.1.0-main - namespace: kgateway-system - valuesObject: - controller: - image: - registry: "ghcr.io" - repository: silogen/kgateway-v2.1.0-main-websocket - tag: "0.0.1" - syncWave: -2 - kgateway-config: - path: kgateway-config - namespace: kgateway-system + kaiwo: + namespace: kaiwo-system + path: kaiwo/v0.2.0-rc11 + syncWave: -10 valuesFile: values.yaml - helmParameters: - - name: domain - value: "{{ .Values.global.domain }}" - syncWave: -2 - # Monitoring - prometheus-crds: - path: prometheus-operator-crds/23.0.0 - namespace: prometheus-system + kaiwo-config: + ignoreDifferences: + - group: external-secrets.io + jqPathExpressions: + - ".spec.data[].remoteRef.conversionStrategy" + - ".spec.data[].remoteRef.decodingStrategy" + - ".spec.data[].remoteRef.metadataPolicy" + kind: ExternalSecret + - group: "" + jsonPointers: + - /spec/accessModes + kind: "PersistentVolumeClaim" + namespace: kaiwo-system + path: kaiwo-config + syncWave: 0 + kaiwo-crds: + namespace: kaiwo-system + path: kaiwo-crds/v0.2.0-rc11 + syncWave: -20 + keda: + namespace: keda + path: keda/2.18.1 + syncWave: -10 valuesFile: values.yaml + kedify-otel: + ignoreDifferences: + - group: "" + jqPathExpressions: + - ".status" + kind: "Service" + name: "keda-otel-scaler" + - group: "apps" + jqPathExpressions: + - ".status.availableReplicas" + - ".status.readyReplicas" + kind: "Deployment" + namespace: keda + path: kedify-otel/v0.0.6 syncWave: -5 - opentelemetry-operator: - path: opentelemetry-operator/0.93.1 - namespace: opentelemetry-operator-system - valuesObject: - # Cluster-forge specific values for opentelemetry-operator - # Sets the collector image to use contrib version (required for kaiwo/kedify-otel) - manager: - collectorImage: - repository: ghcr.io/open-telemetry/opentelemetry-collector-releases/opentelemetry-collector-contrib - tag: "0.140.0" - syncWave: -3 - otel-lgtm-stack: - path: otel-lgtm-stack/v1.0.7 - 
namespace: otel-lgtm-stack valuesObject: - # Cluster-forge specific configuration for OpenTelemetry LGTM Stack - # This file overrides values.yaml for cluster-forge deployments - # Cluster identification - will be populated by root/values.yaml helmParameters - cluster: - name: # to be filled by cluster-forge app based on domain - # Component enablement (cluster-forge defaults) - dashboards: - enabled: true - nodeExporter: - enabled: true - kubeStateMetrics: - enabled: true - # Storage configuration optimized for cluster-forge - lgtm: - storage: - # Tempo storage for traces - tempo: 50Gi - # Loki storage for logs - loki: 50Gi - # Grafana storage for dashboards/config - grafana: 10Gi - # Mimir/Prometheus storage for metrics - mimir: 50Gi - # Loki additional storage - extra: 50Gi - # LGTM stack main deployment resources - resources: - limits: - memory: 8Gi - requests: - memory: 2Gi - cpu: "1" - # Resource configuration optimized for cluster-forge - collectors: - resources: - # Metrics collector (deployment mode) - metrics: - limits: - memory: 8Gi - cpu: "2" - requests: - memory: 1Gi - cpu: 500m - # Logs collector (daemonset mode) - logs: - limits: - memory: 2Gi - cpu: "1" - requests: - memory: 400Mi - cpu: 200m - # Service configuration - services: - # Main LGTM stack service ports - lgtm: - grafana: 3000 - otelGrpc: 4317 - otelHttp: 4318 - prometheus: 9090 - loki: 3100 - # Kube state metrics service port - kubeStateMetrics: - http: 8080 - # Node exporter service port - nodeExporter: - metrics: 9100 + validatingAdmissionPolicy: + enabled: false + keycloak: helmParameters: - - name: cluster.name + - name: domain value: "{{ .Values.global.domain }}" - syncWave: -2 - # Databases - cnpg-operator: - path: cnpg-operator/0.26.0 - namespace: cnpg-system - valuesFile: values.yaml - syncWave: -3 - # Access control - cluster-auth: - path: cluster-auth/0.5.0 - namespace: cluster-auth - valuesFile: values.yaml - syncWave: -2 - cluster-auth-config: - path: cluster-auth-config - 
namespace: cluster-auth - syncWave: -2 ignoreDifferences: - group: external-secrets.io - kind: ExternalSecret jqPathExpressions: - ".spec.data[].remoteRef.conversionStrategy" - ".spec.data[].remoteRef.decodingStrategy" - ".spec.data[].remoteRef.metadataPolicy" - keycloak: - path: keycloak-old + kind: ExternalSecret namespace: keycloak + path: keycloak-old + syncWave: -10 valuesObject: - podLabels: - app: keycloak auth: adminUser: admin existingSecret: "keycloak-credentials" passwordSecretKey: "KEYCLOAK_INITIAL_ADMIN_PASSWORD" + extraEnvVars: + - name: JAVA_OPTS_APPEND + value: >- + -XX:MaxRAMPercentage=65.0 + -XX:InitialRAMPercentage=50.0 + -XX:MaxMetaspaceSize=512m + -XX:+ExitOnOutOfMemoryError + -Djava.awt.headless=true extraStartupArgs: "--cache=ispn --features=scripts,admin-fine-grained-authz,token-exchange --import-realm" + extraVolumeMounts: + - mountPath: /opt/keycloak/providers + name: keycloak-package-volume + - mountPath: /opt/keycloak/data/import + name: keycloak-realm-volume + extraVolumes: + - configMap: + items: + - key: keycloak-scripts.json + path: META-INF/keycloak-scripts.json + - key: domain-group-authenticator.js + path: domain-group-authenticator.js + name: keycloak-scripts + name: keycloak-script-volume + - emptyDir: {} + name: keycloak-package-volume + - configMap: + name: keycloak-realm-templates-7kgh2hc6b2 + name: keycloak-airm-realm-template-volume + - emptyDir: {} + name: keycloak-realm-volume + - configMap: + name: keycloak-realm-templates-k8s + name: keycloak-k8s-realm-template-volume initContainers: - command: - /bin/sh @@ -467,125 +461,64 @@ apps: name: keycloak-k8s-realm-template-volume - mountPath: /opt/realms name: keycloak-realm-volume - extraVolumes: - - configMap: - name: keycloak-scripts - items: - - key: keycloak-scripts.json - path: META-INF/keycloak-scripts.json - - key: domain-group-authenticator.js - path: domain-group-authenticator.js - name: keycloak-script-volume - - emptyDir: {} - name: keycloak-package-volume - - 
configMap: - name: keycloak-realm-templates-7kgh2hc6b2 - name: keycloak-airm-realm-template-volume - - emptyDir: {} - name: keycloak-realm-volume - - configMap: - name: keycloak-realm-templates-k8s - name: keycloak-k8s-realm-template-volume - extraVolumeMounts: - - mountPath: /opt/keycloak/providers - name: keycloak-package-volume - - mountPath: /opt/keycloak/data/import - name: keycloak-realm-volume - helmParameters: - - name: domain - value: "{{ .Values.global.domain }}" - syncWave: -1 - ignoreDifferences: - - group: external-secrets.io - kind: ExternalSecret - jqPathExpressions: - - ".spec.data[].remoteRef.conversionStrategy" - - ".spec.data[].remoteRef.decodingStrategy" - - ".spec.data[].remoteRef.metadataPolicy" - kyverno: - path: kyverno/3.5.1 - namespace: kyverno - valuesFile: values.yaml - syncWave: -3 - kyverno-config: - path: kyverno-config - namespace: kyverno - syncWave: -2 - ignoreDifferences: - - group: "kyverno.io" - kind: "ClusterPolicy" - name: "local-path-access-mode-mutation" - jsonPointers: - - /spec/rules/0/skipBackgroundRequests - - group: "kyverno.io" - kind: "ClusterPolicy" - name: "local-path-access-mode-warning" - jsonPointers: - - /spec/rules/0/skipBackgroundRequests - - /spec/rules/0/validate/allowExistingViolations - kyverno-policies-base: - namespace: kyverno - path: kyverno-policies/base - syncWave: -2 - # GPU - amd-gpu-operator: - path: amd-gpu-operator/v1.4.1 - namespace: kube-amd-gpu + podLabels: + app: keycloak + replicaCount: 1 + resources: + limits: + cpu: "500m" + memory: "2Gi" + requests: + cpu: "250m" + memory: "512Mi" + kgateway: + namespace: kgateway-system + path: kgateway/v2.1.0-main + syncWave: -20 valuesObject: - crds: - defaultCR: - install: false - syncWave: -1 - amd-gpu-operator-config: - path: amd-gpu-operator-config - namespace: kube-amd-gpu - syncWave: 0 - kuberay-operator: - path: kuberay-operator/1.4.2 - namespace: default - valuesFile: values.yaml - syncWave: -1 - # Autoscaling - keda: - path: keda/2.18.1 - 
namespace: keda + controller: + image: + registry: "ghcr.io" + repository: silogen/kgateway-v2.1.0-main-websocket + tag: "0.0.1" + kgateway-config: + helmParameters: + - name: domain + value: "{{ .Values.global.domain }}" + namespace: kgateway-system + path: kgateway-config + syncWave: -15 valuesFile: values.yaml - syncWave: -1 - kedify-otel: - path: kedify-otel/v0.0.6 - namespace: keda - valuesObject: - # Cluster-forge specific values for kedify-otel - validatingAdmissionPolicy: - enabled: false - syncWave: 0 - # ML/AI - kserve-crds: - path: kserve-crds/v0.16.0 - namespace: kserve-system + kgateway-crds: + namespace: kgateway-system + path: kgateway-crds/v2.1.0-main + syncWave: -30 valuesFile: values.yaml - syncWave: -3 kserve: - path: kserve/v0.16.0 namespace: kserve-system + path: kserve/v0.16.0 + syncWave: 0 valuesObject: kserve: controller: deploymentMode: "Standard" - syncWave: -1 - # Queues - rabbitmq: - path: rabbitmq/v2.15.0 - namespace: rabbitmq-system - syncWave: -1 + kserve-crds: + namespace: kserve-system + path: kserve-crds/v0.16.0 + syncWave: -30 + valuesFile: values.yaml + kuberay-operator: + namespace: default + path: kuberay-operator/1.4.2 + syncWave: -10 + valuesFile: values.yaml kueue: - path: kueue/0.13.0 namespace: kueue-system + path: kueue/0.13.0 + syncWave: -10 valuesObject: controllerManager: replicas: 1 - mutatingWebhook: - reinvocationPolicy: IfNeeded managerConfig: controllerManagerConfigYaml: |- apiVersion: config.kueue.x-k8s.io/v1beta1 @@ -631,113 +564,216 @@ apps: - "pod" - "deployment" - "statefulset" - syncWave: -1 + mutatingWebhook: + reinvocationPolicy: IfNeeded kueue-config: - path: kueue-config namespace: kueue-system - syncWave: -1 - appwrapper: - path: appwrapper/v1.1.2 - namespace: appwrapper-system - syncWave: -1 - # Storage + path: kueue-config + syncWave: -10 + kyverno: + namespace: kyverno + path: kyverno/3.5.1 + syncWave: -30 + valuesFile: values.yaml + kyverno-config: + ignoreDifferences: + - group: "kyverno.io" + 
jsonPointers: + - /spec/rules/0/skipBackgroundRequests + kind: "ClusterPolicy" + name: "local-path-access-mode-mutation" + - group: "kyverno.io" + jsonPointers: + - /spec/rules/0/skipBackgroundRequests + - /spec/rules/0/validate/allowExistingViolations + kind: "ClusterPolicy" + name: "local-path-access-mode-warning" + namespace: kyverno + path: kyverno-config + syncWave: -20 + kyverno-policies-base: + namespace: kyverno + path: kyverno-policies/base + syncWave: -20 + metallb: + namespace: default + path: metallb/v0.15.2 + syncWave: 10 minio-operator: - path: minio-operator/7.1.1 namespace: minio-operator + path: minio-operator/7.1.1 + syncWave: -10 valuesFile: values.yaml - syncWave: -1 minio-tenant: - path: minio-tenant/7.1.1 namespace: minio-tenant-default + path: minio-tenant/7.1.1 + syncWave: 0 valuesObject: tenant: - name: default-minio-tenant - configSecret: - name: default-minio-tenant-env-configuration - existingSecret: true - pools: - - servers: 1 - name: pool-0 - volumesPerServer: 1 - size: 250Gi # Reduced from 500Gi for workstation - storageClassName: direct buckets: - name: default-bucket objectLock: true - name: models objectLock: true - users: - - name: default-user certificate: - requestAutoCert: false externalCaCertSecret: - name: cluster-tls type: kubernetes.io/secret/v1 + requestAutoCert: false + configSecret: + existingSecret: true + name: default-minio-tenant-env-configuration env: - name: MINIO_PROMETHEUS_AUTH_TYPE value: "public" - syncWave: 0 + name: default-minio-tenant + pools: + - name: pool-0 + servers: 1 + size: 250Gi # Reduced from 500Gi for workstation + storageClassName: direct + volumesPerServer: 1 + users: + - name: default-user minio-tenant-config: - path: minio-tenant-config - namespace: minio-tenant-default - valuesFile: values.yaml helmParameters: - name: domain value: "{{ .Values.global.domain }}" - syncWave: 0 ignoreDifferences: - group: external-secrets.io - kind: ExternalSecret jqPathExpressions: - 
".spec.data[].remoteRef.conversionStrategy" - ".spec.data[].remoteRef.decodingStrategy" - ".spec.data[].remoteRef.metadataPolicy" - # Kaiwo (Kubernetes AI Workload Orchestrator) - aim-cluster-model-source: - path: aim-cluster-model-source - namespace: kaiwo-system - syncWave: -2 - kaiwo-crds: - path: kaiwo-crds/v0.2.0-rc11 - namespace: kaiwo-system - syncWave: -2 - kaiwo: - path: kaiwo/v0.2.0-rc11 - namespace: kaiwo-system - valuesFile: values.yaml - syncWave: -1 - kaiwo-config: - path: kaiwo-config - namespace: kaiwo-system + kind: ExternalSecret + namespace: minio-tenant-default + path: minio-tenant-config syncWave: 0 + valuesFile: values.yaml + openbao: ignoreDifferences: - - group: external-secrets.io - kind: ExternalSecret - jqPathExpressions: - - ".spec.data[].remoteRef.conversionStrategy" - - ".spec.data[].remoteRef.decodingStrategy" - - ".spec.data[].remoteRef.metadataPolicy" - - group: "" - kind: "PersistentVolumeClaim" + - group: "apps" jsonPointers: - - /spec/accessModes - # AMD Resource Manager (AIRM) - airm: - path: airm/0.3.2 - namespace: airm + - /spec/replicas + kind: "Deployment" + - group: "apps" + jsonPointers: + - /spec/volumeClaimTemplates + kind: "StatefulSet" + name: "openbao" + namespace: cf-openbao + path: openbao/0.18.2 + syncWave: -70 + valuesObject: + injector: + enabled: false + server: + affinity: | + podAntiAffinity: + preferredDuringSchedulingIgnoredDuringExecution: + - weight: 100 + podAffinityTerm: + labelSelector: + matchLabels: + app.kubernetes.io/name: openbao + app.kubernetes.io/instance: openbao + component: server + topologyKey: kubernetes.io/hostname + ha: + enabled: false + raft: + enabled: false + replicas: 1 + ui: + enabled: true + openbao-config: + helmParameters: + - name: domain + value: "{{ .Values.global.domain }}" + namespace: cf-openbao + path: openbao-config/0.1.0 + syncWave: -60 valuesFile: values.yaml + openbao-init: helmParameters: - - name: airm-api.airm.appDomain + - name: domain value: "{{ 
.Values.global.domain }}" - syncWave: 0 - ignoreDifferences: - - group: external-secrets.io - kind: ExternalSecret - jqPathExpressions: - - ".spec.data[].remoteRef.conversionStrategy" - - ".spec.data[].remoteRef.decodingStrategy" - - ".spec.data[].remoteRef.metadataPolicy" - - group: kyverno.io - kind: ClusterPolicy - jqPathExpressions: - - ".spec.rules" + namespace: cf-openbao + path: ../scripts/init-openbao-job + syncWave: -50 + valuesObject: + domain: # to be filled by cluster-forge app + opentelemetry-operator: + namespace: opentelemetry-operator-system + path: opentelemetry-operator/0.93.1 + syncWave: -30 + valuesObject: + manager: + collectorImage: + repository: ghcr.io/open-telemetry/opentelemetry-collector-releases/opentelemetry-collector-contrib + tag: "0.140.0" + otel-lgtm-stack: + helmParameters: + - name: cluster.name + value: "{{ .Values.global.domain }}" + namespace: otel-lgtm-stack + path: otel-lgtm-stack/v1.0.7 + syncWave: -20 + valuesObject: + cluster: + name: # to be filled by cluster-forge app based on domain + collectors: + resources: + logs: + limits: + cpu: "1" + memory: 2Gi + requests: + cpu: 200m + memory: 400Mi + metrics: + limits: + cpu: "2" + memory: 8Gi + requests: + cpu: 500m + memory: 1Gi + dashboards: + enabled: true + kubeStateMetrics: + enabled: true + lgtm: + resources: + limits: + memory: 8Gi + requests: + cpu: "1" + memory: 2Gi + storage: + extra: 50Gi + grafana: 10Gi + loki: 50Gi + mimir: 50Gi + tempo: 50Gi + nodeExporter: + enabled: true + services: + kubeStateMetrics: + http: 8080 + lgtm: + grafana: 3000 + loki: 3100 + otelGrpc: 4317 + otelHttp: 4318 + prometheus: 9090 + nodeExporter: + metrics: 9100 + prometheus-crds: + namespace: prometheus-system + path: prometheus-operator-crds/23.0.0 + syncWave: -50 + valuesFile: values.yaml + rabbitmq: + namespace: rabbitmq-system + path: rabbitmq/v2.15.0 + syncWave: -10 diff --git a/root/values_large.yaml b/root/values_large.yaml index 6efb6d52..0ff528d0 100644 --- 
a/root/values_large.yaml +++ b/root/values_large.yaml @@ -1,41 +1,50 @@ -# LARGE CLUSTER CONFIGURATION (Production: app-dev.silogen.ai) -# Actual Hardware: 4 nodes (HA control plane and at least one worker node) -# -# This configuration reflects a production deployment -# -# LARGE CLUSTER: All apps enabled (inherited from base values.yaml) -# Uses Longhorn storage with native RWX support - no access mode mutation needed +enabledApps: + - aim-cluster-model-source + - airm + - amd-gpu-operator + - amd-gpu-operator-config + - appwrapper + - argocd + - argocd-config + - cert-manager + - cluster-auth + - cluster-auth-config + - cnpg-operator + - external-secrets + - external-secrets-config + - gateway-api + - gitea + - gitea-config + - kaiwo + - kaiwo-config + - kaiwo-crds + - keda + - kedify-otel + - keycloak + - kgateway + - kgateway-config + - kgateway-crds + - kserve + - kserve-crds + - kueue + - kueue-config + - kuberay-operator + - kyverno + - kyverno-config + - kyverno-policies-base + - metallb + - minio-operator + - minio-tenant + - minio-tenant-config + - openbao + - openbao-config + - openbao-init + - opentelemetry-operator + - otel-lgtm-stack + - prometheus-crds + - rabbitmq apps: - # Git - Gitea (Production: single replica, sqlite3) - gitea: - valuesObject: - postgresql-ha: - enabled: false - valkey-cluster: - enabled: false - # Core apps - ArgoCD (Production: single replicas, appropriate for 4-node cluster) - argocd: - valuesObject: - # Production config: Single replicas (no HA for 4-node cluster) - applicationSet: - replicas: 1 - controller: - replicas: 1 - redis: - enabled: true - redis-ha: - enabled: false - repoServer: - replicas: 1 - autoscaling: - enabled: false - server: - replicas: 1 - autoscaling: - enabled: false - - # Storage - MinIO Tenant (Production: single server configuration) minio-tenant: valuesObject: tenant: @@ -45,60 +54,11 @@ apps: size: 500Gi storageClassName: direct volumesPerServer: 1 - - # Secrets - OpenBao (Production: 3 replicas, 
10Gi storage) openbao: valuesObject: server: ha: enabled: true - replicas: 3 raft: enabled: true - - # Workload scheduling - Kueue (Production: single replica) - kueue: - valuesObject: - controllerManager: - replicas: 1 - - # Authentication - Keycloak (Production: single replica) - keycloak: - valuesObject: - replicaCount: 1 - resources: - limits: - cpu: "500m" - memory: "2Gi" - requests: - cpu: "250m" - memory: "512Mi" - - # Monitoring - OTEL LGTM Stack (Production configuration) - otel-lgtm-stack: - valuesObject: - # Production: Standard monitoring resources - lgtm: - storage: - # Production storage sizes (default storage class) - extra: 50Gi - grafana: 10Gi - loki: 50Gi - mimir: 50Gi - tempo: 50Gi - collectors: - resources: - logs: - limits: - cpu: '1' - memory: 2Gi - requests: - cpu: 200m - memory: 400Mi - metrics: - limits: - cpu: '2' - memory: 8Gi - requests: - cpu: 500m - memory: 1Gi + replicas: 3 diff --git a/root/values_medium.yaml b/root/values_medium.yaml index 1b0a5d02..aad172c8 100644 --- a/root/values_medium.yaml +++ b/root/values_medium.yaml @@ -1,8 +1,5 @@ -# MEDIUM CLUSTER: All apps enabled (inherited from base values.yaml) -# Add Kyverno policy for local-path access mode mutation - -# Medium & Small clusters add local-path storage policy for RWX→RWO conversion # Medium & Small clusters add local-path storage policy for RWX→RWO conversion + enabledApps: - aim-cluster-model-source - airm @@ -55,9 +52,13 @@ apps: kyverno-policies-storage-local-path: namespace: kyverno path: kyverno-policies/storage-local-path - ignoreDifferences: [] - syncWave: -2 - + syncWave: -20 + ignoreDifferences: + - group: kyverno.io + kind: ClusterPolicy + jsonPointers: + - /spec/rules/*/skipBackgroundRequests + - /spec/rules/*/validate/allowExistingViolations argocd: valuesObject: applicationSet: @@ -66,8 +67,8 @@ apps: replicas: 1 resources: limits: - cpu: "2000m" - memory: "4Gi" + cpu: "1000m" + memory: "2Gi" requests: cpu: "500m" memory: "1Gi" @@ -76,8 +77,8 @@ apps: 
redis: resources: limits: - cpu: "1000m" - memory: "2Gi" + cpu: "500m" + memory: "1Gi" requests: cpu: "250m" memory: "512Mi" @@ -85,8 +86,8 @@ apps: replicas: 1 resources: limits: - cpu: "1000m" - memory: "2Gi" + cpu: "500m" + memory: "1Gi" requests: cpu: "250m" memory: "512Mi" @@ -99,7 +100,27 @@ apps: requests: cpu: "125m" memory: "256Mi" - + grafana: + valuesObject: + persistence: + accessModes: + - ReadWriteOnce + enabled: true + size: 5Gi + storageClassName: direct + replicas: 1 + resources: + limits: + cpu: "1000m" + memory: "2Gi" + requests: + cpu: "250m" + memory: "512Mi" + keycloak: + valuesObject: + # Increase memory resources for Keycloak to prevent OOMKilled during initialization + # Medium preset provides 1536Mi memory limit vs small preset's 768Mi + resourcesPreset: "medium" minio-tenant: valuesObject: tenant: @@ -123,18 +144,17 @@ apps: requests: cpu: "1000m" memory: "2Gi" - openbao: valuesObject: server: + dataStorage: + size: 5Gi + storageClass: direct ha: enabled: false - replicas: 1 raft: enabled: false - dataStorage: - size: 5Gi - storageClass: direct + replicas: 1 resources: limits: cpu: "1000m" @@ -142,13 +162,25 @@ apps: requests: cpu: "250m" memory: "512Mi" - + opentelemetry-operator: + valuesObject: + manager: + resources: + requests: + cpu: "250m" + memory: "512Mi" + otel-lgtm-stack: + valuesObject: + collectors: + resources: + metrics: + limits: + cpu: '1' + memory: 4Gi prometheus: valuesObject: prometheus: prometheusSpec: - retention: 15d - retentionSize: 20GB resources: limits: cpu: "2000m"
accessModes: - - ReadWriteOnce + storageClassName: direct diff --git a/root/values_small.yaml b/root/values_small.yaml index 9af1e1de..2b9ae69b 100644 --- a/root/values_small.yaml +++ b/root/values_small.yaml @@ -1,66 +1,51 @@ -# SMALL CLUSTER: All apps enabled (inherited from base values.yaml) -# Add Kyverno policy for local-path access mode mutation - -# Medium & Small clusters add local-path storage policy for RWX→RWO conversion enabledApps: - - kyverno-policies-storage-local-path # Local-path storage policies (small/medium only) - aim-cluster-model-source + - airm + - amd-gpu-operator + - amd-gpu-operator-config + - appwrapper - argocd - argocd-config - cert-manager - - openbao - - openbao-config + - cluster-auth + - cluster-auth-config + - cnpg-operator - external-secrets - external-secrets-config + - gateway-api - gitea - gitea-config - - gateway-api - - metallb - - kgateway-crds - - kgateway - - kgateway-config - - prometheus-crds - - opentelemetry-operator - - otel-lgtm-stack - - cnpg-operator - - cluster-auth - - cluster-auth-config - - keycloak - - kyverno - - kyverno-config - - amd-gpu-operator - - amd-gpu-operator-config - - kuberay-operator + - kaiwo + - kaiwo-config + - kaiwo-crds - keda - kedify-otel - - kserve-crds + - keycloak + - kgateway + - kgateway-config + - kgateway-crds - kserve - - rabbitmq + - kserve-crds + - kuberay-operator - kueue - kueue-config - - appwrapper + - kyverno + - kyverno-config + - kyverno-policies-base # applicable to all cluster sizes + - kyverno-policies-storage-local-path # small & medium cluster sizes only + - metallb - minio-operator - minio-tenant - minio-tenant-config - - kaiwo-crds - - kaiwo - - kaiwo-config - - airm + - openbao + - openbao-config + - openbao-init + - opentelemetry-operator + - otel-lgtm-stack + - prometheus-crds + - rabbitmq apps: - # Modular Kyverno policy applications (only the storage-local-path addition) - kyverno-policies-storage-local-path: - namespace: kyverno - path: 
kyverno-policies/storage-local-path - source: clusterForge - syncOptions: - - CreateNamespace=true - ignoreDifferences: [] - wave: 26 # Deploy after base policies - syncWave: - - group: kyverno.io - kind: ClusterPolicy - argocd: valuesObject: applicationSet: @@ -102,7 +87,33 @@ apps: requests: cpu: "125m" memory: "256Mi" - + grafana: + valuesObject: + persistence: + accessModes: + - ReadWriteOnce + enabled: true + size: 5Gi + storageClassName: local-path + replicas: 1 + resources: + limits: + cpu: "1000m" + memory: "2Gi" + requests: + cpu: "250m" + memory: "512Mi" + kyverno-policies-storage-local-path: + ignoreDifferences: [] + namespace: kyverno + path: kyverno-policies/storage-local-path + source: clusterForge + syncOptions: + - CreateNamespace=true + # NOTE(review): 'syncWave' previously held ignoreDifferences-style content + # by mistake and the stray 'wave: 26' key was unused; aligned with + # values_medium.yaml, where base policies also run at syncWave -20 + syncWave: -20 minio-tenant: valuesObject: tenant: @@ -126,18 +137,17 @@ apps: requests: cpu: "1000m" memory: "2Gi" - openbao: valuesObject: server: + dataStorage: + size: 5Gi + storageClass: local-path ha: enabled: false - replicas: 1 raft: enabled: false - dataStorage: - size: 5Gi - storageClass: local-path + replicas: 1 resources: limits: cpu: "1000m" @@ -145,13 +155,10 @@ apps: requests: cpu: "250m" memory: "512Mi" - prometheus: valuesObject: prometheus: prometheusSpec: - retention: 15d - retentionSize: 20GB resources: limits: cpu: "2000m" @@ -159,29 +166,14 @@ apps: requests: cpu: "500m" memory: "1Gi" + retention: 15d + retentionSize: 20GB storageSpec: volumeClaimTemplate: spec: accessModes: - ReadWriteOnce - storageClassName: local-path resources: requests: storage: 25Gi - - grafana: - valuesObject: - replicas: 1 - resources: - limits: - cpu: "1000m" - memory: "2Gi" - requests: - cpu: "250m" - memory: "512Mi" - persistence: - enabled: true - size: 5Gi - storageClassName: local-path - accessModes: - - ReadWriteOnce + storageClassName: local-path diff --git a/sbom/SBOM-QUICK-GUIDE.md b/sbom/SBOM-QUICK-GUIDE.md index bc007fd5..322d31bd
100644 --- a/sbom/SBOM-QUICK-GUIDE.md +++ b/sbom/SBOM-QUICK-GUIDE.md @@ -4,10 +4,10 @@ Scripts to manage component metadata for automated Software Bill of Materials (S ## Essential Workflow -When `root/values.yaml` has a new tool, you need to run these commands manually: +When any cluster size configuration (`values_small.yaml`, `values_medium.yaml`, `values_large.yaml`) has new apps, you need to run these commands manually: ```bash -# 1. Generate/sync components from enabledApps +# 1. Generate/sync components from enabledApps across all cluster sizes ./generate-compare-components.sh # 2. Manually fill out sourceUrl and projectUrl in components.yaml @@ -23,20 +23,22 @@ When `root/values.yaml` has a new tool, you need to run these commands manually: ## Quick Start ### Adding a New Component -1. **Update values.yaml**: Add to `enabledApps` list AND add app definition in `apps` section -2. **Run workflow**: Execute the 4 commands above -3. **Commit changes**: All files should be ready for PR +1. **Update cluster size files**: Add to `enabledApps` list in relevant cluster size files (`values_small.yaml`, `values_medium.yaml`, `values_large.yaml`) +2. **Add app definition**: Add app definition in the `apps` section of appropriate values file (typically base `values.yaml` or cluster-specific file) +3. **Run workflow**: Execute the 4 commands above +4. **Commit changes**: All files should be ready for PR ### Removing a Component -1. **Remove from enabledApps**: Remove from `enabledApps` list in `root/values.yaml` +1. **Remove from enabledApps**: Remove from `enabledApps` list in relevant cluster size files 2. **Regenerate**: Run `./generate-compare-components.sh` (automatically removes from components.yaml) 3. 
**Validate**: Run `./validate-sync.sh` to confirm removal ## Scripts ### Generation Scripts -**`generate-compare-components.sh`** - Syncs `components.yaml` with enabled apps from `root/values.yaml` -- Processes only apps listed in `enabledApps` (excludes `-config` apps) +**`generate-compare-components.sh`** - Syncs `components.yaml` with enabled apps from all cluster sizes +- Collects apps from `values.yaml`, `values_small.yaml`, `values_medium.yaml`, `values_large.yaml` +- Processes only apps listed in `enabledApps` across all configurations (excludes `-config` apps) - Includes pre-validation to catch configuration issues early - Preserves existing metadata (sourceUrl, projectUrl, license fields) - Creates timestamped backups when needed @@ -51,8 +53,8 @@ When `root/values.yaml` has a new tool, you need to run these commands manually: - Use this for complete validation before commits **Individual Validators** (for targeted debugging): -- **`validate-enabled-apps.sh`** - Checks enabledApps have corresponding app definitions -- **`validate-components-sync.sh`** - Verifies components.yaml reflects current enabledApps +- **`validate-enabled-apps.sh`** - Checks enabledApps across all cluster sizes have corresponding app definitions +- **`validate-components-sync.sh`** - Verifies components.yaml reflects current enabledApps from all cluster configurations - **`validate-metadata.sh`** - Ensures all required metadata fields are populated ## Validation Workflow @@ -61,13 +63,14 @@ The new modular validation system ensures data consistency: ``` 1. EnabledApps Consistency Check - ├── Validates all enabledApps have app definitions + ├── Validates all enabledApps across cluster sizes have app definitions + ├── Collects from values.yaml, values_small.yaml, values_medium.yaml, values_large.yaml └── Filters out -config apps appropriately 2. 
Components Sync Check - ├── Verifies components.yaml matches enabledApps + ├── Verifies components.yaml matches enabledApps from all cluster configurations ├── Checks for missing/extra components - └── Validates path/valuesFile consistency + └── Validates path/valuesFile consistency across cluster files 3. Metadata Completeness Check ├── Ensures sourceUrl and projectUrl are populated @@ -92,7 +95,8 @@ The GitHub workflow `.github/workflows/pr-component-validation.yaml` now include ## Important Notes -- **EnabledApps is the source of truth**: Components are generated only for apps in the `enabledApps` list +- **EnabledApps across cluster sizes is the source of truth**: Components are generated from apps in `enabledApps` lists across all cluster configurations +- **No base enabledApps**: The base `values.yaml` no longer contains enabledApps to avoid override conflicts - **Manual metadata required**: `sourceUrl` and `projectUrl` must be added manually (requires human knowledge) - **Scripts are idempotent**: Safe to run multiple times - **Validation before commit**: Always run `./validate-sync.sh` before creating PRs @@ -101,7 +105,7 @@ The GitHub workflow `.github/workflows/pr-component-validation.yaml` now include ## Troubleshooting **Error: "Enabled app has no definition"** -→ Add the app definition to the `apps` section in `root/values.yaml` +→ Add the app definition to the `apps` section in `root/values.yaml` or appropriate cluster size file **Error: "Component missing/extra"** → Run `./generate-compare-components.sh` to sync components.yaml diff --git a/sbom/components.yaml b/sbom/components.yaml index 97d608a6..a125043b 100644 --- a/sbom/components.yaml +++ b/sbom/components.yaml @@ -1,5 +1,6 @@ # Generated components metadata for SBOM creation -# This file contains simplified component information for apps in enabledApps +# This file contains simplified component information for apps across all cluster sizes +# Collected from: values.yaml, values_small.yaml, 
values_medium.yaml, values_large.yaml # Apps with "config" suffix are excluded from this SBOM components: @@ -10,7 +11,7 @@ components: license: MIT License licenseUrl: https://github.com/silogen/kaiwo/blob/main/LICENSE airm: - path: airm/0.3.2 + path: airm/0.3.5 valuesFile: values.yaml sourceUrl: https://github.com/silogen/cluster-forge/tree/main/sources/airm projectUrl: https://github.com/silogen/cluster-forge/tree/main/sources/airm @@ -131,12 +132,6 @@ components: projectUrl: https://github.com/kserve/kserve license: Apache License 2.0 licenseUrl: https://github.com/kserve/kserve/blob/master/LICENSE - kueue: - path: kueue/0.13.0 - sourceUrl: oci://registry.k8s.io/kueue/charts/kueue - projectUrl: https://github.com/kubernetes-sigs/kueue - license: Apache License 2.0 - licenseUrl: https://github.com/kubernetes-sigs/kueue/blob/main/LICENSE kuberay-operator: path: kuberay-operator/1.4.2 valuesFile: values.yaml @@ -144,6 +139,12 @@ components: projectUrl: https://github.com/ray-project/kuberay license: Apache License 2.0 licenseUrl: https://github.com/ray-project/kuberay/blob/master/LICENSE + kueue: + path: kueue/0.13.0 + sourceUrl: oci://registry.k8s.io/kueue/charts/kueue + projectUrl: https://github.com/kubernetes-sigs/kueue + license: Apache License 2.0 + licenseUrl: https://github.com/kubernetes-sigs/kueue/blob/main/LICENSE kyverno: path: kyverno/3.5.1 valuesFile: values.yaml @@ -157,6 +158,12 @@ components: projectUrl: https://github.com/kyverno/kyverno license: Apache License 2.0 licenseUrl: https://github.com/kyverno/kyverno/blob/main/LICENSE + kyverno-policies-storage-local-path: + path: kyverno-policies/storage-local-path + sourceUrl: https://github.com/silogen/cluster-forge/tree/main/sources/kyverno-policies/storage-local-path + projectUrl: https://github.com/silogen/cluster-forge/ + license: Apache License 2.0 + licenseUrl: https://github.com/silogen/cluster-forge/blob/main/LICENSE metallb: path: metallb/v0.15.2 sourceUrl: 
https://raw.githubusercontent.com/metallb/metallb/v0.15.2/config/manifests/metallb-native.yaml @@ -182,6 +189,12 @@ components: projectUrl: https://github.com/openbao/openbao license: Mozilla Public License 2.0 licenseUrl: https://github.com/openbao/openbao/blob/main/LICENSE + openbao-init: + path: ../scripts/init-openbao-job + sourceUrl: https://github.com/silogen/cluster-forge/tree/main/scripts/init-openbao-job + projectUrl: https://github.com/silogen/cluster-forge/ + license: Apache License 2.0 + licenseUrl: https://github.com/silogen/cluster-forge/blob/main/LICENSE opentelemetry-operator: path: opentelemetry-operator/0.93.1 sourceUrl: https://open-telemetry.github.io/opentelemetry-helm-charts diff --git a/sbom/generate-compare-components.sh b/sbom/generate-compare-components.sh index 26644d5e..507d4cd9 100755 --- a/sbom/generate-compare-components.sh +++ b/sbom/generate-compare-components.sh @@ -2,16 +2,20 @@ set -euo pipefail -# Script to update components.yaml from enabledApps in values.yaml +# Script to update components.yaml from enabledApps across all cluster sizes +# Collects components from values.yaml, values_small.yaml, values_medium.yaml, values_large.yaml # Only updates if there are new items or changes to existing ones # Preserves existing sourceUrl and projectUrl values # Only includes apps that are in the enabledApps list (excluding -config apps) -VALUES_FILE="../root/values.yaml" +BASE_VALUES_FILE="../root/values.yaml" +SMALL_VALUES_FILE="../root/values_small.yaml" +MEDIUM_VALUES_FILE="../root/values_medium.yaml" +LARGE_VALUES_FILE="../root/values_large.yaml" OUTPUT_FILE="./components.yaml" TEMP_FILE="./components.yaml.tmp" -echo "⚙️ Generating/Updating components.yaml from enabledApps..." +echo "⚙️ Generating/Updating components.yaml from enabledApps across all cluster sizes..." # Self-validation: Check enabledApps consistency before processing (fail-fast) echo "🔍 Pre-validation: Checking enabledApps consistency..." 
@@ -30,14 +34,50 @@ fi echo "" echo "Checking for updates to components.yaml..." -# Check if values.yaml exists -if [[ ! -f "$VALUES_FILE" ]]; then - echo "❌ Error: $VALUES_FILE not found" - exit 1 +# Function to collect enabled apps from a values file +collect_enabled_apps() { + local values_file="$1" + if [[ -f "$values_file" ]]; then + yq eval '.enabledApps[]' "$values_file" 2>/dev/null || echo "" + else + echo "" + fi +} + +# Collect enabled apps from all cluster size configurations +echo "🔍 Collecting enabled apps from all cluster configurations..." +all_enabled_apps="" + +# Collect from base values.yaml (if enabledApps exists) +base_apps=$(collect_enabled_apps "$BASE_VALUES_FILE") +if [[ -n "$base_apps" ]]; then + echo " 📄 Found apps in values.yaml: $(echo "$base_apps" | wc -l) apps" + all_enabled_apps="$all_enabled_apps$base_apps"$'\n' fi -# Get all enabled app names that don't end with -config from values.yaml -enabled_apps=$(yq eval '.enabledApps[]' "$VALUES_FILE" 2>/dev/null || echo "") +# Collect from small cluster values +small_apps=$(collect_enabled_apps "$SMALL_VALUES_FILE") +if [[ -n "$small_apps" ]]; then + echo " 📄 Found apps in values_small.yaml: $(echo "$small_apps" | wc -l) apps" + all_enabled_apps="$all_enabled_apps$small_apps"$'\n' +fi + +# Collect from medium cluster values +medium_apps=$(collect_enabled_apps "$MEDIUM_VALUES_FILE") +if [[ -n "$medium_apps" ]]; then + echo " 📄 Found apps in values_medium.yaml: $(echo "$medium_apps" | wc -l) apps" + all_enabled_apps="$all_enabled_apps$medium_apps"$'\n' +fi + +# Collect from large cluster values +large_apps=$(collect_enabled_apps "$LARGE_VALUES_FILE") +if [[ -n "$large_apps" ]]; then + echo " 📄 Found apps in values_large.yaml: $(echo "$large_apps" | wc -l) apps" + all_enabled_apps="$all_enabled_apps$large_apps"$'\n' +fi + +# Get unique enabled apps (remove duplicates and empty lines) +enabled_apps=$(echo "$all_enabled_apps" | sort -u | grep -v '^$' || echo "") if [[ -z "$enabled_apps" ]]; then 
echo "Warning: No enabled apps found in enabledApps list" @@ -71,9 +111,21 @@ else # Check each enabled app from values.yaml for app in $app_names; do - # Get current values from values.yaml apps section - current_path=$(yq eval ".apps.\"$app\".path" "$VALUES_FILE") - current_values_file=$(yq eval ".apps.\"$app\".valuesFile // \"null\"" "$VALUES_FILE") + # Get current values from apps section (check all cluster files) + current_path="" + current_values_file="null" + + # Try to find the app definition in any of the cluster configuration files + for values_file in "$BASE_VALUES_FILE" "$SMALL_VALUES_FILE" "$MEDIUM_VALUES_FILE" "$LARGE_VALUES_FILE"; do + if [[ -f "$values_file" ]]; then + app_path=$(yq eval ".apps.\"$app\".path // \"null\"" "$values_file" 2>/dev/null || echo "null") + if [[ "$app_path" != "null" ]]; then + current_path="$app_path" + current_values_file=$(yq eval ".apps.\"$app\".valuesFile // \"null\"" "$values_file") + break + fi + fi + done # Check if app exists in components.yaml existing_app=$(yq eval ".components.\"$app\" // \"null\"" "$OUTPUT_FILE") @@ -124,7 +176,8 @@ echo "Updating $OUTPUT_FILE..." 
# Create components.yaml header cat > "$TEMP_FILE" << 'EOF' # Generated components metadata for SBOM creation -# This file contains simplified component information for apps in enabledApps +# This file contains simplified component information for apps across all cluster sizes +# Collected from: values.yaml, values_small.yaml, values_medium.yaml, values_large.yaml # Apps with "config" suffix are excluded from this SBOM components: @@ -134,12 +187,23 @@ EOF for app in $app_names; do echo " $app:" >> "$TEMP_FILE" - # Get path from values.yaml - path=$(yq eval ".apps.\"$app\".path" "$VALUES_FILE") - echo " path: $path" >> "$TEMP_FILE" + # Get path and valuesFile from any cluster configuration file + path="" + values_file="null" + + # Try to find the app definition in any of the cluster configuration files + for config_file in "$BASE_VALUES_FILE" "$SMALL_VALUES_FILE" "$MEDIUM_VALUES_FILE" "$LARGE_VALUES_FILE"; do + if [[ -f "$config_file" ]]; then + app_path=$(yq eval ".apps.\"$app\".path // \"null\"" "$config_file" 2>/dev/null || echo "null") + if [[ "$app_path" != "null" ]]; then + path="$app_path" + values_file=$(yq eval ".apps.\"$app\".valuesFile // \"null\"" "$config_file") + break + fi + fi + done - # Get valuesFile from values.yaml if it exists - values_file=$(yq eval ".apps.\"$app\".valuesFile // \"null\"" "$VALUES_FILE") + echo " path: $path" >> "$TEMP_FILE" if [[ "$values_file" != "null" ]]; then echo " valuesFile: $values_file" >> "$TEMP_FILE" fi @@ -193,7 +257,17 @@ echo "$app_names" | wc -l | xargs echo "Total components:" echo "" echo "Components with valuesFile:" for app in $app_names; do - values_file=$(yq eval ".apps.\"$app\".valuesFile // \"null\"" "$VALUES_FILE") + # Check all cluster configuration files for valuesFile + values_file="null" + for config_file in "$BASE_VALUES_FILE" "$SMALL_VALUES_FILE" "$MEDIUM_VALUES_FILE" "$LARGE_VALUES_FILE"; do + if [[ -f "$config_file" ]]; then + app_path=$(yq eval ".apps.\"$app\".path // \"null\"" "$config_file" 
2>/dev/null || echo "null") + if [[ "$app_path" != "null" ]]; then + values_file=$(yq eval ".apps.\"$app\".valuesFile // \"null\"" "$config_file") + break + fi + fi + done if [[ "$values_file" != "null" ]]; then echo " - $app" fi diff --git a/sbom/validate-components-sync.sh b/sbom/validate-components-sync.sh index 476e1e15..4d39c706 100755 --- a/sbom/validate-components-sync.sh +++ b/sbom/validate-components-sync.sh @@ -2,10 +2,13 @@ set -euo pipefail -# validate-components-sync.sh - Validate components.yaml sync with enabledApps +# validate-components-sync.sh - Validate components.yaml sync with enabledApps across all cluster sizes # Checks that components.yaml reflects current enabledApps and path consistency -VALUES_FILE="../root/values.yaml" +BASE_VALUES_FILE="../root/values.yaml" +SMALL_VALUES_FILE="../root/values_small.yaml" +MEDIUM_VALUES_FILE="../root/values_medium.yaml" +LARGE_VALUES_FILE="../root/values_large.yaml" COMPONENTS_FILE="./components.yaml" echo "🔄 Validating components.yaml reflects enabledApps..." @@ -17,14 +20,37 @@ if [[ ! -f "$COMPONENTS_FILE" ]]; then exit 1 fi -# Check if values.yaml exists -if [[ ! 
-f "$VALUES_FILE" ]]; then - echo "❌ Error: $VALUES_FILE not found" - exit 1 +# Function to collect enabled apps from a values file +collect_enabled_apps() { + local values_file="$1" + if [[ -f "$values_file" ]]; then + yq eval '.enabledApps[]' "$values_file" 2>/dev/null || echo "" + else + echo "" + fi +} + +# Collect enabled apps from all cluster size configurations +all_enabled_apps="" + +# Collect from base values.yaml (if enabledApps exists) +base_apps=$(collect_enabled_apps "$BASE_VALUES_FILE") +if [[ -n "$base_apps" ]]; then + all_enabled_apps="$all_enabled_apps$base_apps"$'\n' fi -# Get enabled apps (filtered, same as generation script) -enabled_apps=$(yq eval '.enabledApps[]' "$VALUES_FILE" 2>/dev/null || echo "") +# Collect from cluster size values +for size_file in "$SMALL_VALUES_FILE" "$MEDIUM_VALUES_FILE" "$LARGE_VALUES_FILE"; do + if [[ -f "$size_file" ]]; then + size_apps=$(collect_enabled_apps "$size_file") + if [[ -n "$size_apps" ]]; then + all_enabled_apps="$all_enabled_apps$size_apps"$'\n' + fi + fi +done + +# Get unique enabled apps (remove duplicates and empty lines) +enabled_apps=$(echo "$all_enabled_apps" | sort -u | grep -v '^$' || echo "") enabled_apps_filtered=$(echo "$enabled_apps" | grep -v -- '-config$' || echo "") # Get components in components.yaml @@ -66,29 +92,49 @@ if [ ${#missing_components[@]} -ne 0 ] || [ ${#extra_components[@]} -ne 0 ]; the exit 1 fi -# Check path consistency between values.yaml and components.yaml +# Check path consistency between cluster configuration files and components.yaml echo "⚙️ Checking path/valuesFile consistency..." 
path_mismatches=() while IFS= read -r app; do [[ -z "$app" ]] && continue - # Get paths from both files - values_path=$(yq eval ".apps.\"$app\".path" "$VALUES_FILE" 2>/dev/null || echo "null") + # Find the app definition in any of the cluster configuration files + values_path="" + for config_file in "$BASE_VALUES_FILE" "$SMALL_VALUES_FILE" "$MEDIUM_VALUES_FILE" "$LARGE_VALUES_FILE"; do + if [[ -f "$config_file" ]]; then + app_path=$(yq eval ".apps.\"$app\".path // \"null\"" "$config_file" 2>/dev/null || echo "null") + if [[ "$app_path" != "null" ]]; then + values_path="$app_path" + break + fi + fi + done + component_path=$(yq eval ".components.\"$app\".path" "$COMPONENTS_FILE" 2>/dev/null || echo "null") if [[ "$values_path" != "$component_path" ]]; then - path_mismatches+=("$app: values.yaml='$values_path' vs components.yaml='$component_path'") - echo "❌ Path mismatch for '$app': values.yaml='$values_path' vs components.yaml='$component_path'" + path_mismatches+=("$app: cluster-configs='$values_path' vs components.yaml='$component_path'") + echo "❌ Path mismatch for '$app': cluster-configs='$values_path' vs components.yaml='$component_path'" fi # Check valuesFile consistency - values_file_values=$(yq eval ".apps.\"$app\".valuesFile // \"null\"" "$VALUES_FILE" 2>/dev/null || echo "null") + values_file_values="null" + for config_file in "$BASE_VALUES_FILE" "$SMALL_VALUES_FILE" "$MEDIUM_VALUES_FILE" "$LARGE_VALUES_FILE"; do + if [[ -f "$config_file" ]]; then + app_path_check=$(yq eval ".apps.\"$app\".path // \"null\"" "$config_file" 2>/dev/null || echo "null") + if [[ "$app_path_check" != "null" ]]; then + values_file_values=$(yq eval ".apps.\"$app\".valuesFile // \"null\"" "$config_file" 2>/dev/null || echo "null") + break + fi + fi + done + values_file_components=$(yq eval ".components.\"$app\".valuesFile // \"null\"" "$COMPONENTS_FILE" 2>/dev/null || echo "null") if [[ "$values_file_values" != "$values_file_components" ]]; then - path_mismatches+=("$app 
valuesFile: values.yaml='$values_file_values' vs components.yaml='$values_file_components'") - echo "❌ ValuesFile mismatch for '$app': values.yaml='$values_file_values' vs components.yaml='$values_file_components'" + path_mismatches+=("$app valuesFile: cluster-configs='$values_file_values' vs components.yaml='$values_file_components'") + echo "❌ ValuesFile mismatch for '$app': cluster-configs='$values_file_values' vs components.yaml='$values_file_components'" fi done <<< "$enabled_apps_filtered" diff --git a/sbom/validate-enabled-apps.sh b/sbom/validate-enabled-apps.sh index 53a78685..c1128ed6 100755 --- a/sbom/validate-enabled-apps.sh +++ b/sbom/validate-enabled-apps.sh @@ -2,21 +2,60 @@ set -euo pipefail -# validate-enabled-apps.sh - Validate enabledApps consistency -# Checks that all apps in enabledApps have corresponding definitions in apps section +# validate-enabled-apps.sh - Validate enabledApps consistency across all cluster sizes +# Checks that all apps in enabledApps from all cluster configurations have corresponding definitions in apps section -VALUES_FILE="../root/values.yaml" +BASE_VALUES_FILE="../root/values.yaml" +SMALL_VALUES_FILE="../root/values_small.yaml" +MEDIUM_VALUES_FILE="../root/values_medium.yaml" +LARGE_VALUES_FILE="../root/values_large.yaml" -echo "📋 Validating enabledApps have app definitions..." +echo "📋 Validating enabledApps have app definitions across all cluster sizes..." -# Check if values.yaml exists -if [[ ! -f "$VALUES_FILE" ]]; then - echo "❌ Error: $VALUES_FILE not found" - exit 1 +# Function to collect enabled apps from a values file +collect_enabled_apps() { + local values_file="$1" + if [[ -f "$values_file" ]]; then + yq eval '.enabledApps[]' "$values_file" 2>/dev/null || echo "" + else + echo "" + fi +} + +# Collect enabled apps from all cluster size configurations +echo "🔍 Collecting enabled apps from all cluster configurations..." 
+all_enabled_apps="" + +# Collect from base values.yaml (if enabledApps exists) +base_apps=$(collect_enabled_apps "$BASE_VALUES_FILE") +if [[ -n "$base_apps" ]]; then + echo " 📄 Found apps in values.yaml: $(echo "$base_apps" | wc -l) apps" + all_enabled_apps="$all_enabled_apps$base_apps"$'\n' fi -# Get all enabled apps -enabled_apps=$(yq eval '.enabledApps[]' "$VALUES_FILE" 2>/dev/null || echo "") +# Collect from small cluster values +small_apps=$(collect_enabled_apps "$SMALL_VALUES_FILE") +if [[ -n "$small_apps" ]]; then + echo " 📄 Found apps in values_small.yaml: $(echo "$small_apps" | wc -l) apps" + all_enabled_apps="$all_enabled_apps$small_apps"$'\n' +fi + +# Collect from medium cluster values +medium_apps=$(collect_enabled_apps "$MEDIUM_VALUES_FILE") +if [[ -n "$medium_apps" ]]; then + echo " 📄 Found apps in values_medium.yaml: $(echo "$medium_apps" | wc -l) apps" + all_enabled_apps="$all_enabled_apps$medium_apps"$'\n' +fi + +# Collect from large cluster values +large_apps=$(collect_enabled_apps "$LARGE_VALUES_FILE") +if [[ -n "$large_apps" ]]; then + echo " 📄 Found apps in values_large.yaml: $(echo "$large_apps" | wc -l) apps" + all_enabled_apps="$all_enabled_apps$large_apps"$'\n' +fi + +# Get unique enabled apps (remove duplicates and empty lines) +enabled_apps=$(echo "$all_enabled_apps" | sort -u | grep -v '^$' || echo "") if [[ -z "$enabled_apps" ]]; then echo "ℹ️ No enabled apps found in enabledApps list" @@ -31,11 +70,44 @@ if [[ -z "$enabled_apps_filtered" ]]; then exit 0 fi -# Get all defined apps in apps section -defined_apps=$(yq eval '.apps | keys | .[]' "$VALUES_FILE" 2>/dev/null || echo "") +# Function to collect app definitions from a values file +collect_app_definitions() { + local values_file="$1" + if [[ -f "$values_file" ]]; then + yq eval '.apps | keys | .[]' "$values_file" 2>/dev/null || echo "" + else + echo "" + fi +} + +# Collect app definitions from all cluster size configurations +echo "🔍 Collecting app definitions from all cluster 
configurations..." +all_defined_apps="" + +# Collect from base values.yaml +base_defined_apps=$(collect_app_definitions "$BASE_VALUES_FILE") +if [[ -n "$base_defined_apps" ]]; then + echo " 📄 Found app definitions in values.yaml: $(echo "$base_defined_apps" | wc -l) apps" + all_defined_apps="$all_defined_apps$base_defined_apps"$'\n' +fi + +# Collect from cluster size values +for size_file in "$SMALL_VALUES_FILE" "$MEDIUM_VALUES_FILE" "$LARGE_VALUES_FILE"; do + if [[ -f "$size_file" ]]; then + size_defined_apps=$(collect_app_definitions "$size_file") + if [[ -n "$size_defined_apps" ]]; then + size_name=$(basename "$size_file") + echo " 📄 Found app definitions in $size_name: $(echo "$size_defined_apps" | wc -l) apps" + all_defined_apps="$all_defined_apps$size_defined_apps"$'\n' + fi + fi +done + +# Get unique defined apps (remove duplicates and empty lines) +defined_apps=$(echo "$all_defined_apps" | sort -u | grep -v '^$' || echo "") if [[ -z "$defined_apps" ]]; then - echo "❌ Error: No app definitions found in apps section" + echo "❌ Error: No app definitions found in any cluster configuration files" exit 1 fi diff --git a/scripts/bootstrap.sh b/scripts/bootstrap.sh index c0a8c461..c2b76d15 100755 --- a/scripts/bootstrap.sh +++ b/scripts/bootstrap.sh @@ -2,350 +2,747 @@ set -euo pipefail -SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +LATEST_RELEASE="v1.8.0" # Initialize variables -DOMAIN="" -VALUES_FILE="values.yaml" +APPS="" CLUSTER_SIZE="medium" # Default to medium +DEFAULT_TIMEOUT="5m" +DOMAIN="" KUBE_VERSION=1.33 +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +SKIP_DEPENDENCY_CHECK=false +TARGET_REVISION="$LATEST_RELEASE" +TEMPLATE_ONLY=false +VALUES_FILE="values.yaml" -DEV_MODE=false -TARGET_REVISION="main" +# Helper function to print messages only when not in template mode +log_info() { + if [ "$TEMPLATE_ONLY" = false ]; then + echo "$@" + fi +} -# Parse arguments -while [[ $# -gt 0 ]]; do - case $1 in - --CLUSTER_SIZE) - if [ -z 
"$2" ]; then - echo "ERROR: --CLUSTER_SIZE requires an argument" - exit 1 - fi - CLUSTER_SIZE="$2" - shift 2 - ;; - --CLUSTER_SIZE=*) - CLUSTER_SIZE="${1#*=}" - shift - ;; - --dev) - DEV_MODE=true - shift - ;; - --help|-h) - echo "Usage: $0 [options] [values_file]" - echo "" - echo "Arguments:" - echo " domain Required. Cluster domain (e.g., example.com)" - echo " values_file Optional. Values file to use (default: values.yaml)" - echo "" - echo "Options:" - echo " --CLUSTER_SIZE Optional. Cluster size [small|medium|large] (default: medium)" - echo " --dev Enable developer mode (sets Gitea repos to feature branch or custom value)" +# Generate a secure random password +generate_password() { + openssl rand -hex 16 +} + +# Check for required dependencies +check_dependencies() { + local silent="${1:-false}" + local missing_deps=() + local all_good=true + + if [ "$silent" != "true" ]; then + echo "=== Checking Dependencies ===" + fi + + # Define required programs with installation instructions + declare -A REQUIRED_PROGRAMS=( + ["kubectl"]="Kubernetes CLI - https://kubernetes.io/docs/tasks/tools/install-kubectl/" + ["helm"]="Helm package manager - https://helm.sh/docs/intro/install/" + ["yq"]="YAML/JSON processor - https://github.com/mikefarah/yq#install" + ["openssl"]="OpenSSL for password generation - Usually pre-installed or via package manager" + ) + + # Define optional programs (used by shell builtins but good to check) + declare -A OPTIONAL_PROGRAMS=( + ["cat"]="cat command - Usually pre-installed" + ["grep"]="grep command - Usually pre-installed" + ["tr"]="tr command - Usually pre-installed" + ["head"]="head command - Usually pre-installed" + ) + + # Check required programs with version info + for program in "${!REQUIRED_PROGRAMS[@]}"; do + if command -v "$program" >/dev/null 2>&1; then + case "$program" in + "kubectl") + version=$(kubectl version --client 2>/dev/null | head -n1 | cut -d' ' -f3 2>/dev/null || echo "unknown") + [ "$silent" != "true" ] && printf " ✓ 
%-12s %s (%s)\n" "$program" "$(command -v "$program")" "$version" + ;; + "helm") + version=$(helm version --short --client 2>/dev/null | cut -d'+' -f1 2>/dev/null || echo "unknown") + [ "$silent" != "true" ] && printf " ✓ %-12s %s (%s)\n" "$program" "$(command -v "$program")" "$version" + ;; + "yq") + version=$(yq --version 2>/dev/null | head -n1 | cut -d' ' -f4 2>/dev/null || echo "unknown") + [ "$silent" != "true" ] && printf " ✓ %-12s %s (%s)\n" "$program" "$(command -v "$program")" "$version" + ;; + *) + [ "$silent" != "true" ] && printf " ✓ %-12s %s\n" "$program" "$(command -v "$program")" + ;; + esac + else + [ "$silent" != "true" ] && printf " ✗ %-12s MISSING\n" "$program" + missing_deps+=("$program") + all_good=false + fi + done + + # Check optional programs (warn but don't fail) + for program in "${!OPTIONAL_PROGRAMS[@]}"; do + if command -v "$program" >/dev/null 2>&1; then + [ "$silent" != "true" ] && printf " ✓ %-12s %s\n" "$program" "$(command -v "$program")" + else + [ "$silent" != "true" ] && printf " ! %-12s MISSING (usually pre-installed)\n" "$program" + fi + done + + # If any required dependencies are missing, show installation instructions + if [ "$all_good" = false ]; then + echo "" + echo "ERROR: Missing required dependencies!" 
+ echo "" + echo "Please install the following programs:" + echo "" + + for dep in "${missing_deps[@]}"; do + echo " $dep: ${REQUIRED_PROGRAMS[$dep]}" echo "" + + # Provide platform-specific installation hints + case "$dep" in + "kubectl") + echo " # Linux:" + echo " curl -LO \"https://dl.k8s.io/release/\$(curl -L -s https://dl.k8s.io/release/stable.txt)/bin/linux/amd64/kubectl\"" + echo " sudo install -o root -g root -m 0755 kubectl /usr/local/bin/kubectl" + echo "" + echo " # macOS:" + echo " brew install kubectl" + echo "" + echo " # Or download from: https://kubernetes.io/docs/tasks/tools/install-kubectl/" + ;; + "helm") + echo " # Linux/macOS:" + echo " curl https://raw.githubusercontent.com/helm/helm/main/scripts/get-helm-3 | bash" + echo "" + echo " # Or via package manager:" + echo " # Linux: snap install helm --classic" + echo " # macOS: brew install helm" + ;; + "yq") + echo " # Linux:" + echo " sudo wget -qO /usr/local/bin/yq https://github.com/mikefarah/yq/releases/latest/download/yq_linux_amd64" + echo " sudo chmod +x /usr/local/bin/yq" + echo "" + echo " # macOS:" + echo " brew install yq" + ;; + "openssl") + echo " # Linux:" + echo " # Ubuntu/Debian: sudo apt-get install openssl" + echo " # RHEL/CentOS: sudo yum install openssl" + echo "" + echo " # macOS: Usually pre-installed, or: brew install openssl" + ;; + esac echo "" - echo "Examples:" - echo " $0 myIP.nip.io" - echo " $0 example.com values_custom.yaml --CLUSTER_SIZE=large" - echo " $0 --dev dev.example.com --CLUSTER_SIZE=small" - exit 0 - ;; - --*) - echo "ERROR: Unknown option: $1" - echo "Use --help for usage information" + done + + echo "After installing the missing dependencies, please run this script again." + exit 1 + fi + + if [ "$silent" != "true" ]; then + echo " ✓ All required dependencies are available!" 
+ echo "" + fi +} + +parse_args() { + # Parse arguments + while [[ $# -gt 0 ]]; do + case $1 in + --CLUSTER-SIZE|--cluster-size|-s) + if [ -z "$2" ]; then + echo "ERROR: --cluster-size requires an argument" + exit 1 + fi + CLUSTER_SIZE="$2" + shift 2 + ;; + --CLUSTER-SIZE=*) + CLUSTER_SIZE="${1#*=}" + shift + ;; + --cluster-size=*) + CLUSTER_SIZE="${1#*=}" + shift + ;; + -s=*) + CLUSTER_SIZE="${1#*=}" + shift + ;; + --TARGET-REVISION|--target-revision|-r) + if [ -z "$2" ]; then + echo "WARNING: defaulting to --target-revision=$LATEST_RELEASE (no value specified)" + TARGET_REVISION="$LATEST_RELEASE" + shift + else + TARGET_REVISION="$2" + shift 2 + fi + ;; + --TARGET-REVISION=*) + TARGET_REVISION="${1#*=}" + shift + ;; + --target-revision=*) + TARGET_REVISION="${1#*=}" + shift + ;; + -r=*) + TARGET_REVISION="${1#*=}" + shift + ;; + --template-only|-t) + TEMPLATE_ONLY=true + shift + ;; + --skip-deps) + SKIP_DEPENDENCY_CHECK=true + shift + ;; + --apps=*) + APPS="${1#*=}" + shift + ;; + --airm-image-repository) + if [ -z "$2" ]; then + echo "ERROR: --airm-image-repository requires an argument" + exit 1 + fi + AIRM_IMAGE_REPOSITORY="$2" + shift 2 + ;; + --airm-image-repository=*) + AIRM_IMAGE_REPOSITORY="${1#*=}" + shift + ;; + --help|-h) + cat <<HELP_OUTPUT + Usage: $0 <domain> [values_file] + + Arguments: + domain REQUIRED. Cluster domain (e.g., myIp.nip.io) + values_file Optional. Values .yaml file to use, default: root/values.yaml + + Options: + --airm-image-repository=url Custom AIRM image repository for gitea-init job (e.g., ghcr.io/silogen, requires regcreds) + --apps=app1[,app2,...] 
Deploy (kubectl apply) specified components only + options: namespaces, argocd, openbao, gitea, cluster-forge, or any cluster-forge child app (see values.yaml for app names) + + --cluster-size=[size], -s [size] can be one of small|medium|large, default: medium + --help, -h Show this help message and exit + --skip-deps Skip dependency checking (not recommended) + --target-revision, -r Git revision for ArgoCD to sync from, [tag|commit_hash|branch_name], default: $LATEST_RELEASE + --template-only, -t Output YAML manifests to stdout instead of applying to cluster + + + Examples: + $0 compute.amd.com values_custom.yaml --cluster-size=large + $0 112.100.97.17.nip.io + $0 dev.example.com --cluster-size=small --target-revision=v1.8.0 + $0 dev.example.com -s=small -r=feature-branch + $0 example.com --apps=openbao + $0 example.com --apps=keycloak -t + + Bootstrap Behavior: + • deploys ArgoCD + OpenBao + Gitea directly (essential infrastructure) + • apply the cluster-forge application manifest (parent app only) + • ArgoCD syncs remaining apps from specified target revision, respecting syncWaves and dependencies +HELP_OUTPUT + exit 0 + ;; + --*) + echo "ERROR: Unknown option: $1" + echo "Use --help for usage information" + exit 1 + ;; + *) + # Positional arguments + if [ -z "$DOMAIN" ]; then + DOMAIN="$1" + elif [ "$VALUES_FILE" = "values.yaml" ]; then + VALUES_FILE="$1" + else + echo "ERROR: Too many arguments: $1" + echo "Usage: $0 [--CLUSTER_SIZE=small|medium|large] [--dev] [values_file]" + exit 1 + fi + shift + ;; + esac + done +} + +validate_args() { + # Validate required arguments + if [ -z "$DOMAIN" ]; then + echo "ERROR: Domain argument is required" + echo "Usage: $0 [values_file] [--CLUSTER_SIZE=small|medium|large]" + echo "Use --help for more details" exit 1 + fi + + # Validate cluster size + case "$CLUSTER_SIZE" in + small|medium|large) ;; *)
VALUES_FILE="$1" - else - echo "ERROR: Too many arguments: $1" - echo "Usage: $0 [--CLUSTER_SIZE=small|medium|large] [--dev] [values_file]" - exit 1 - fi - shift + echo "ERROR: Invalid cluster size '$CLUSTER_SIZE'" + echo "Valid sizes: small, medium, large" + exit 1 ;; esac -done - -# Validate required arguments -if [ -z "$DOMAIN" ]; then - echo "ERROR: Domain argument is required" - echo "Usage: $0 [values_file] [--CLUSTER_SIZE=small|medium|large]" - echo "Use --help for more details" - exit 1 -fi - -# Validate cluster size -case "$CLUSTER_SIZE" in - small|medium|large) - ;; - *) - echo "ERROR: Invalid cluster size '$CLUSTER_SIZE'" - echo "Valid sizes: small, medium, large" - exit 1 - ;; -esac + + # Validate values file exists + if [ ! -f "${SCRIPT_DIR}/../root/${VALUES_FILE}" ]; then + echo "ERROR: Values file not found: ${SCRIPT_DIR}/../root/${VALUES_FILE}" + exit 1 + fi + + SOURCE_ROOT="${SCRIPT_DIR}/.." + setup_values_files +} -# Validate values file exists -if [ ! -f "${SCRIPT_DIR}/../root/${VALUES_FILE}" ]; then - echo "ERROR: Values file not found: ${SCRIPT_DIR}/../root/${VALUES_FILE}" - exit 1 -fi - -# Check if size-specific values file exists (optional overlay) -SIZE_VALUES_FILE="values_${CLUSTER_SIZE}.yaml" -if [ ! -f "${SCRIPT_DIR}/../root/${SIZE_VALUES_FILE}" ]; then - echo "WARNING: Size-specific values file not found: ${SCRIPT_DIR}/../root/${SIZE_VALUES_FILE}" - echo "Proceeding with base values file only: ${VALUES_FILE}" - SIZE_VALUES_FILE="" -fi - -get_target_revision() { - if [ "$DEV_MODE" = true ]; then - CURRENT_BRANCH=$(git rev-parse --abbrev-ref HEAD 2>/dev/null || echo "main") - echo "" - echo "Development mode enabled - ArgoCD will point to live GitHub repository" - echo "Current git branch: $CURRENT_BRANCH" - echo "" - read -p "Use current branch '$CURRENT_BRANCH' for targetRevision? [Y/n/custom_branch]: " choice - - case "$choice" in - n|N|no|No|NO) - echo "Exiting. Please checkout the branch you want to use and run again." 
- exit 0 - ;; - [Cc]ustom*|custom*) - read -p "Enter custom branch name: " custom_branch - if [ -n "$custom_branch" ]; then - TARGET_REVISION="$custom_branch" - else - echo "ERROR: Custom branch name cannot be empty" - exit 1 - fi - ;; - y|Y|yes|Yes|YES|"") - TARGET_REVISION="$CURRENT_BRANCH" - ;; - *) - # Treat any other input as a custom branch name - TARGET_REVISION="$choice" - ;; - esac - echo "Using targetRevision: $TARGET_REVISION" +# Check if size-specific values file exists - matching main approach +setup_values_files() { + SIZE_VALUES_FILE="values_${CLUSTER_SIZE}.yaml" + + if [ ! -f "${SOURCE_ROOT}/root/${SIZE_VALUES_FILE}" ]; then + log_info "WARNING: Size-specific values file not found: ${SOURCE_ROOT}/root/${SIZE_VALUES_FILE}" + log_info "Proceeding with base values file only: ${VALUES_FILE}" + SIZE_VALUES_FILE="" + else + log_info "Using size-specific values file: ${SIZE_VALUES_FILE}" fi } -pre_cleanup() { - echo "=== Pre-cleanup: Checking for previous runs ===" - - # Check if gitea-init-job exists and completed successfully - if kubectl get job gitea-init-job -n cf-gitea >/dev/null 2>&1; then - if kubectl get job gitea-init-job -n cf-gitea -o jsonpath='{.status.conditions[?(@.type=="Complete")].status}' 2>/dev/null | grep -q "True"; then - echo "Found completed gitea-init-job - removing Gitea to start fresh" - - # Delete all Gitea resources - kubectl delete job gitea-init-job -n cf-gitea --ignore-not-found=true - kubectl delete deployment gitea -n cf-gitea --ignore-not-found=true - kubectl delete statefulset gitea -n cf-gitea --ignore-not-found=true - kubectl delete service gitea -n cf-gitea --ignore-not-found=true - kubectl delete service gitea-http -n cf-gitea --ignore-not-found=true - kubectl delete service gitea-ssh -n cf-gitea --ignore-not-found=true - kubectl delete pvc -n cf-gitea -l app.kubernetes.io/name=gitea --ignore-not-found=true - kubectl delete configmap initial-cf-values -n cf-gitea --ignore-not-found=true - kubectl delete secret 
gitea-admin-credentials -n cf-gitea --ignore-not-found=true - kubectl delete ingress -n cf-gitea -l app.kubernetes.io/name=gitea --ignore-not-found=true - - echo "Gitea resources deleted" - fi - fi +print_summary() { + # Don't print summary if just outputting templates + # if [ "$TEMPLATE_ONLY" = true ]; then + # return + # fi - # Always delete openbao-init-job to allow re-initialization - kubectl delete job openbao-init-job -n cf-openbao --ignore-not-found=true + cat </dev/null 2>&1; then - YQ_CMD="yq" -elif [ -f "$HOME/yq" ]; then - YQ_CMD="$HOME/yq" -else - echo "ERROR: yq command not found. Please install yq or place it in $HOME/yq" - exit 1 -fi - -# Update the global.clusterSize in the base values file with full filename -$YQ_CMD -i ".global.clusterSize = \"values_${CLUSTER_SIZE}.yaml\"" "${SCRIPT_DIR}/../root/${VALUES_FILE}" - -# Function to merge values files early for use throughout the script -merge_values_files() { - echo "Merging values files..." - if [ -n "$SIZE_VALUES_FILE" ]; then - # Merge base values with size-specific overrides - VALUES=$($YQ_CMD eval-all '. as $item ireduce ({}; . 
* $item)' \ - ${SCRIPT_DIR}/../root/${VALUES_FILE} \ - ${SCRIPT_DIR}/../root/${SIZE_VALUES_FILE} | \ - $YQ_CMD eval ".global.domain = \"${DOMAIN}\"") + +# Helper function to either apply directly or output YAML for templating +apply_or_template() { + if [ "$TEMPLATE_ONLY" = true ]; then + cat + else + kubectl apply "$@" + fi +} + +# Create namespaces +create_namespaces() { + for ns in argocd cf-gitea cf-openbao; do + kubectl create ns "$ns" --dry-run=client -o yaml | apply_or_template -f - + done +} + +# Extract ArgoCD values using yq +extract_argocd_values() { + # Create temporary values file for ArgoCD bootstrap + cat > /tmp/argocd_bootstrap_values.yaml << EOF +global: + domain: argocd.${DOMAIN} +EOF + + # Extract and merge ArgoCD values from the apps structure + yq eval '.apps.argocd.valuesObject' "${SOURCE_ROOT}/root/${VALUES_FILE}" >> /tmp/argocd_bootstrap_values.yaml + if [ -n "${SIZE_VALUES_FILE}" ] && [ -f "${SOURCE_ROOT}/root/${SIZE_VALUES_FILE}" ]; then + if yq eval '.apps.argocd.valuesObject // ""' "${SOURCE_ROOT}/root/${SIZE_VALUES_FILE}" | grep -q .; then + yq eval '.apps.argocd.valuesObject' "${SOURCE_ROOT}/root/${SIZE_VALUES_FILE}" | yq eval-all 'select(fileIndex == 0) * select(fileIndex == 1)' /tmp/argocd_bootstrap_values.yaml - > /tmp/argocd_bootstrap_values_merged.yaml + mv /tmp/argocd_bootstrap_values_merged.yaml /tmp/argocd_bootstrap_values.yaml + fi + fi +} + +# ArgoCD bootstrap +bootstrap_argocd() { + log_info "=== ArgoCD Bootstrap ===" + extract_argocd_values + helm template --release-name argocd ${SOURCE_ROOT}/sources/argocd/8.3.5 --namespace argocd \ + --values /tmp/argocd_bootstrap_values.yaml \ + --kube-version=${KUBE_VERSION} | apply_or_template --server-side --field-manager=argocd-controller --force-conflicts -f - + if [ "$TEMPLATE_ONLY" = false ]; then + kubectl rollout status statefulset/argocd-application-controller -n argocd --timeout="${DEFAULT_TIMEOUT}" + kubectl rollout status deploy/argocd-applicationset-controller -n argocd 
--timeout="${DEFAULT_TIMEOUT}" + kubectl rollout status deploy/argocd-redis -n argocd --timeout="${DEFAULT_TIMEOUT}" + kubectl rollout status deploy/argocd-repo-server -n argocd --timeout="${DEFAULT_TIMEOUT}" + fi +} + + + +bootstrap_openbao() { + log_info "=== OpenBao Bootstrap ===" + + log_info "Debug: SOURCE_ROOT='${SOURCE_ROOT}'" + log_info "Debug: VALUES_FILE='${VALUES_FILE}'" + log_info "Debug: SIZE_VALUES_FILE='${SIZE_VALUES_FILE}'" + log_info "Debug: CLUSTER_SIZE='${CLUSTER_SIZE}'" + + # Get OpenBao version from app path - using same method as main + OPENBAO_VERSION=$(yq eval '.apps.openbao.path' "${SOURCE_ROOT}/root/${VALUES_FILE}" | cut -d'/' -f2) + log_info "OpenBao version: $OPENBAO_VERSION" + + # Create a temporary directory for processing OpenBao values + TEMP_DIR=$(mktemp -d -t cf-bootstrap.XXXXXX) || { log_info "ERROR: Cannot create temp directory"; exit 1; } + log_info "Using temp directory: $TEMP_DIR" + + # Extract OpenBao values from base configuration + log_info "Extracting OpenBao values..." + yq eval '.apps.openbao.valuesObject' "${SOURCE_ROOT}/root/${VALUES_FILE}" > "${TEMP_DIR}/openbao_values.yaml" || { echo "ERROR: Failed to extract OpenBao values from ${VALUES_FILE}"; exit 1; } + + if [ -n "${SIZE_VALUES_FILE}" ] && [ -f "${SOURCE_ROOT}/root/${SIZE_VALUES_FILE}" ]; then + log_info "Extracting OpenBao size-specific values from ${SIZE_VALUES_FILE}..." + log_info "Checking if openbao section exists in size values file..." + if yq eval 'has("apps") and .apps.openbao' "${SOURCE_ROOT}/root/${SIZE_VALUES_FILE}" | grep -q true; then + log_info "OpenBao section found, extracting values..." + if ! 
yq eval '.apps.openbao.valuesObject' "${SOURCE_ROOT}/root/${SIZE_VALUES_FILE}" > "${TEMP_DIR}/openbao_size_values.yaml"; then + log_info "WARNING: Failed to extract OpenBao valuesObject from ${SIZE_VALUES_FILE}, using empty values" + printf "# OpenBao valuesObject not found in size file\n" > "${TEMP_DIR}/openbao_size_values.yaml" + fi else - # Use base values only - VALUES=$(cat ${SCRIPT_DIR}/../root/${VALUES_FILE} | $YQ_CMD ".global.domain = \"${DOMAIN}\"") + log_info "No OpenBao section in size values file, creating empty placeholder..." + printf "# No OpenBao section in size-specific values\n" > "${TEMP_DIR}/openbao_size_values.yaml" fi + else + log_info "No size-specific values file, creating empty placeholder..." + printf "# No size-specific values\n" > "${TEMP_DIR}/openbao_size_values.yaml" + fi + + # Use server-side apply to match ArgoCD's field management strategy + helm template --release-name openbao ${SOURCE_ROOT}/sources/openbao/${OPENBAO_VERSION} --namespace cf-openbao \ + -f "${TEMP_DIR}/openbao_values.yaml" \ + -f "${TEMP_DIR}/openbao_size_values.yaml" \ + --set ui.enabled=true \ + --kube-version=${KUBE_VERSION} | apply_or_template --server-side --field-manager=argocd-controller --force-conflicts -f - + + if [ "$TEMPLATE_ONLY" = false ]; then + kubectl wait --for=jsonpath='{.status.phase}'=Running pod/openbao-0 -n cf-openbao --timeout=100s + + # Create initial secrets config for init job (separate from ArgoCD-managed version) + log_info "Creating initial OpenBao secrets configuration..." 
+ cat ${SOURCE_ROOT}/sources/openbao-config/0.1.0/templates/openbao-secret-manager-cm.yaml | \ + sed "s|name: openbao-secret-manager-scripts|name: openbao-secret-manager-scripts-init|g" | kubectl apply -f - + + # Create initial secret definitions for init job (separate from ArgoCD-managed version) + cat ${SOURCE_ROOT}/sources/openbao-config/0.1.0/templates/openbao-secret-definitions.yaml | \ + sed "s|{{ .Values.domain }}|${DOMAIN}|g" | \ + sed "s|name: openbao-secrets-config|name: openbao-secrets-init-config|g" | kubectl apply -f - - # Write merged values to temp file for use throughout script - echo "$VALUES" > /tmp/merged_values.yaml - echo "Merged values written to /tmp/merged_values.yaml" + # Pass OpenBao configuration to init script + helm template --release-name openbao-init ${SOURCE_ROOT}/scripts/init-openbao-job \ + -f "${TEMP_DIR}/openbao_values.yaml" \ + --set domain="${DOMAIN}" \ + --kube-version=${KUBE_VERSION} | kubectl apply -f - + kubectl wait --for=condition=complete --timeout=300s job/openbao-init-job -n cf-openbao + fi + + # Cleanup temp directory + rm -rf "${TEMP_DIR}" } -# Helper functions to extract values from merged configuration -get_argocd_value() { - local path="$1" - $YQ_CMD eval ".apps.argocd.valuesObject.${path}" /tmp/merged_values.yaml + + +bootstrap_gitea() { + log_info "=== Gitea Bootstrap ===" + + # Print debug information + log_info "Debug: SOURCE_ROOT='${SOURCE_ROOT}'" + log_info "Debug: VALUES_FILE='${VALUES_FILE}'" + log_info "Debug: SIZE_VALUES_FILE='${SIZE_VALUES_FILE}'" + log_info "Debug: CLUSTER_SIZE='${CLUSTER_SIZE}'" + + # Get Gitea version from app path - matching main approach + GITEA_VERSION=$(yq eval '.apps.gitea.path' "${SOURCE_ROOT}/root/${VALUES_FILE}" | cut -d'/' -f2) + log_info "Gitea version: $GITEA_VERSION" + + # Create a temporary directory for processing Gitea values + TEMP_DIR=$(mktemp -d -t cf-gitea-bootstrap.XXXXXX) || { log_info "ERROR: Cannot create temp directory"; exit 1; } + log_info "Using temp directory: 
$TEMP_DIR" + + # Create initial-cf-values configmap (complete values for gitea-init-job) + # Use the complete root values.yaml with filled placeholders instead of simplified version + cp "${SOURCE_ROOT}/root/${VALUES_FILE}" "${TEMP_DIR}/complete_values.yaml" + + # Fill in placeholder values using yq (these are used by gitea-init job) + yq eval ".global.domain = \"${DOMAIN}\"" -i "${TEMP_DIR}/complete_values.yaml" + if [ -n "${SIZE_VALUES_FILE}" ]; then + yq eval ".global.clusterSize = \"${SIZE_VALUES_FILE}\"" -i "${TEMP_DIR}/complete_values.yaml" + else + yq eval ".global.clusterSize = \"values_${CLUSTER_SIZE}.yaml\"" -i "${TEMP_DIR}/complete_values.yaml" + fi + yq eval ".clusterForge.targetRevision = \"${TARGET_REVISION}\"" -i "${TEMP_DIR}/complete_values.yaml" + + # Merge with size-specific values if they exist + if [ -n "${SIZE_VALUES_FILE}" ] && [ -f "${SOURCE_ROOT}/root/${SIZE_VALUES_FILE}" ]; then + yq eval-all 'select(fileIndex == 0) * select(fileIndex == 1)' "${TEMP_DIR}/complete_values.yaml" "${SOURCE_ROOT}/root/${SIZE_VALUES_FILE}" > "${TEMP_DIR}/complete_values_merged.yaml" + mv "${TEMP_DIR}/complete_values_merged.yaml" "${TEMP_DIR}/complete_values.yaml" + fi + + kubectl create configmap initial-cf-values --from-literal=initial-cf-values="$(cat "${TEMP_DIR}/complete_values.yaml")" --dry-run=client -o yaml | apply_or_template -n cf-gitea -f - + + kubectl create secret generic gitea-admin-credentials \ + --namespace=cf-gitea \ + --from-literal=username=silogen-admin \ + --from-literal=password=$(generate_password) \ + --dry-run=client -o yaml | apply_or_template -f - + + # Extract Gitea values like main does + log_info "Extracting Gitea values..." 
+ yq eval '.apps.gitea.valuesObject' "${SOURCE_ROOT}/root/${VALUES_FILE}" > "${TEMP_DIR}/gitea_values.yaml" || { log_info "ERROR: Failed to extract Gitea values from ${VALUES_FILE}"; exit 1; } + + if [ -n "${SIZE_VALUES_FILE}" ] && [ -f "${SOURCE_ROOT}/root/${SIZE_VALUES_FILE}" ]; then + log_info "Extracting Gitea size-specific values from ${SIZE_VALUES_FILE}..." + log_info "Checking if gitea section exists in size values file..." + if yq eval '.apps.gitea' "${SOURCE_ROOT}/root/${SIZE_VALUES_FILE}" >/dev/null 2>&1 && [ "$(yq eval '.apps.gitea' "${SOURCE_ROOT}/root/${SIZE_VALUES_FILE}")" != "null" ]; then + log_info "Gitea section found, extracting values..." + yq eval '.apps.gitea.valuesObject' "${SOURCE_ROOT}/root/${SIZE_VALUES_FILE}" > "${TEMP_DIR}/gitea_size_values.yaml" || { + log_info "WARNING: Failed to extract Gitea valuesObject from ${SIZE_VALUES_FILE}, using empty values" + printf "# Gitea valuesObject not found in size file\n" > "${TEMP_DIR}/gitea_size_values.yaml" + } + else + log_info "No Gitea section in size values file, creating empty placeholder..." + printf "# No Gitea section in size-specific values\n" > "${TEMP_DIR}/gitea_size_values.yaml" + fi + else + log_info "No size-specific values file, creating empty placeholder..." 
+ printf "# No size-specific values\n" > "${TEMP_DIR}/gitea_size_values.yaml" + fi + + # Bootstrap Gitea - matching main approach + helm template --release-name gitea ${SOURCE_ROOT}/sources/gitea/${GITEA_VERSION} --namespace cf-gitea \ + -f "${TEMP_DIR}/gitea_values.yaml" \ + -f "${TEMP_DIR}/gitea_size_values.yaml" \ + --set gitea.config.server.ROOT_URL="https://gitea.${DOMAIN}/" \ + --kube-version=${KUBE_VERSION} | apply_or_template -f - + + if [ "$TEMPLATE_ONLY" = false ]; then + kubectl rollout status deploy/gitea -n cf-gitea --timeout="${DEFAULT_TIMEOUT}" + fi + + # Gitea Init Job - preserve AIRM repository functionality + HELM_ARGS="--release-name gitea-init ${SOURCE_ROOT}/scripts/init-gitea-job \ + --set clusterSize=${SIZE_VALUES_FILE:-values_${CLUSTER_SIZE}.yaml} \ + --set domain=${DOMAIN} \ + --set targetRevision=${TARGET_REVISION} \ + --kube-version=${KUBE_VERSION}" + + # Only add airmImageRepository if AIRM_IMAGE_REPOSITORY is set and non-empty + if [ -n "${AIRM_IMAGE_REPOSITORY:-}" ]; then + HELM_ARGS="${HELM_ARGS} --set airmImageRepository=${AIRM_IMAGE_REPOSITORY}" + fi + + helm template ${HELM_ARGS} | apply_or_template -f - + + if [ "$TEMPLATE_ONLY" = false ]; then + kubectl wait --for=condition=complete --timeout="${DEFAULT_TIMEOUT}" job/gitea-init-job -n cf-gitea + fi + + # Cleanup temp directory + rm -rf "${TEMP_DIR}" } -get_openbao_value() { - local path="$1" - $YQ_CMD eval ".apps.openbao.valuesObject.${path}" /tmp/merged_values.yaml +# Render specific cluster-forge child apps (for --apps filtering) +render_cluster_forge_child_apps() { + + # Create a temporary values file with only the requested apps enabled + local temp_values="/tmp/filtered_values.yaml" + cat > "$temp_values" << EOF +global: + domain: ${DOMAIN} +enabledApps: [] +apps: {} +EOF + + # Copy specific app configurations from the main values + local IFS=',' + for app in $APPS; do + # Add to enabledApps list + yq eval ".enabledApps += [\"$app\"]" -i "$temp_values" + + # Copy app 
configuration if it exists in values.yaml + if yq eval ".apps | has(\"$app\")" "${SOURCE_ROOT}/root/${VALUES_FILE}" 2>/dev/null | grep -q "true"; then + yq eval ".apps[\"$app\"] = load(\"${SOURCE_ROOT}/root/${VALUES_FILE}\").apps[\"$app\"]" -i "$temp_values" + fi + + # Merge size-specific configuration if it exists + if [ -f "${SOURCE_ROOT}/root/${SIZE_VALUES_FILE}" ]; then + if yq eval ".apps | has(\"$app\")" "${SOURCE_ROOT}/root/${SIZE_VALUES_FILE}" 2>/dev/null | grep -q "true"; then + yq eval ".apps[\"$app\"] = (.apps[\"$app\"] // {}) * load(\"${SOURCE_ROOT}/root/${SIZE_VALUES_FILE}\").apps[\"$app\"]" -i "$temp_values" + fi + fi + done + + # Render only the cluster-apps template with filtered values + helm template cluster-forge "${SOURCE_ROOT}/root" \ + --show-only templates/cluster-apps.yaml \ + --values "$temp_values" \ + --set clusterForge.targetRevision="${TARGET_REVISION}" \ + --set externalValues.repoUrl="http://gitea-http.cf-gitea.svc:3000/cluster-org/cluster-values.git" \ + --set clusterForge.repoUrl="http://gitea-http.cf-gitea.svc:3000/cluster-org/cluster-forge.git" \ + --namespace argocd \ + --kube-version "${KUBE_VERSION}" | apply_or_template -f - + + # Clean up + rm -f "$temp_values" } -# Merge values files early so all subsequent operations can use the merged config -merge_values_files +apply_cluster_forge_parent_app() { + # Create cluster-forge parent app only (not all apps) + log_info "=== Creating ClusterForge Parent App ===" + log_info "Target revision: $TARGET_REVISION" + + + + helm template cluster-forge "${SOURCE_ROOT}/root" \ + --show-only templates/cluster-forge.yaml \ + --values "${SOURCE_ROOT}/root/${VALUES_FILE}" \ + --values "${SOURCE_ROOT}/root/${SIZE_VALUES_FILE}" \ + --set global.clusterSize="${SIZE_VALUES_FILE}" \ + --set global.domain="${DOMAIN}" \ + --set clusterForge.targetRevision="${TARGET_REVISION}" \ + --namespace argocd \ + --kube-version "${KUBE_VERSION}" | apply_or_template -f - +} -# Create namespaces -kubectl create ns 
argocd --dry-run=client -o yaml | kubectl apply -f - -kubectl create ns cf-gitea --dry-run=client -o yaml | kubectl apply -f - -kubectl create ns cf-openbao --dry-run=client -o yaml | kubectl apply -f - +# Check if requested apps are cluster-forge child apps +is_cluster_forge_child_app() { + local app="$1" + # Check if the app is defined in the values.yaml apps section + local app_config=$(yq eval ".apps[\"$app\"]" "${SOURCE_ROOT}/root/${VALUES_FILE}" 2>/dev/null) + [ "$app_config" != "null" ] && return 0 + return 1 +} -# ArgoCD bootstrap -echo "Bootstrapping ArgoCD..." -# Extract ArgoCD values from merged config and write to temp values file -$YQ_CMD eval '.apps.argocd.valuesObject' ${SCRIPT_DIR}/../root/${VALUES_FILE} > /tmp/argocd_values.yaml -$YQ_CMD eval '.apps.argocd.valuesObject' ${SCRIPT_DIR}/../root/${SIZE_VALUES_FILE} > /tmp/argocd_size_values.yaml -# Use server-side apply to match ArgoCD's self-management strategy -helm template --release-name argocd ${SCRIPT_DIR}/../sources/argocd/8.3.5 --namespace argocd \ - -f /tmp/argocd_values.yaml \ - -f /tmp/argocd_size_values.yaml \ - --set global.domain="argocd.${DOMAIN}" \ - --kube-version=${KUBE_VERSION} | kubectl apply --server-side --field-manager=argocd-controller --force-conflicts -f - -kubectl rollout status statefulset/argocd-application-controller -n argocd -kubectl rollout status deploy/argocd-applicationset-controller -n argocd -kubectl rollout status deploy/argocd-redis -n argocd -kubectl rollout status deploy/argocd-repo-server -n argocd - -# OpenBao bootstrap -echo "Bootstrapping OpenBao..." 
-# Extract OpenBao values from merged config -$YQ_CMD eval '.apps.openbao.valuesObject' ${SCRIPT_DIR}/../root/${VALUES_FILE} > /tmp/openbao_values.yaml -$YQ_CMD eval '.apps.openbao.valuesObject' ${SCRIPT_DIR}/../root/${SIZE_VALUES_FILE} > /tmp/openbao_size_values.yaml -# Use server-side apply to match ArgoCD's field management strategy -helm template --release-name openbao ${SCRIPT_DIR}/../sources/openbao/0.18.2 --namespace cf-openbao \ - -f /tmp/openbao_values.yaml \ - -f /tmp/openbao_size_values.yaml \ - --set ui.enabled=true \ - --kube-version=${KUBE_VERSION} | kubectl apply --server-side --field-manager=argocd-controller --force-conflicts -f - -kubectl wait --for=jsonpath='{.status.phase}'=Running pod/openbao-0 -n cf-openbao --timeout=100s - -# Create initial secrets config for init job (separate from ArgoCD-managed version) -echo "Creating initial OpenBao secrets configuration..." -cat ${SCRIPT_DIR}/../sources/openbao-config/0.1.0/templates/openbao-secret-manager-cm.yaml | \ - sed "s|name: openbao-secret-manager-scripts|name: openbao-secret-manager-scripts-init|g" | kubectl apply -f - - -# Create initial secrets config for init job (separate from ArgoCD-managed version) -echo "Creating initial OpenBao secrets configuration..." -cat ${SCRIPT_DIR}/../sources/openbao-config/0.1.0/templates/openbao-secret-definitions.yaml | \ - sed "s|{{ .Values.domain }}|${DOMAIN}|g" | \ - sed "s|name: openbao-secrets-config|name: openbao-secrets-init-config|g" | kubectl apply -f - - -# Pass OpenBao configuration to init script -helm template --release-name openbao-init ${SCRIPT_DIR}/init-openbao-job \ - -f /tmp/openbao_values.yaml \ - --set domain="${DOMAIN}" \ - --kube-version=${KUBE_VERSION} | kubectl apply -f - -kubectl wait --for=condition=complete --timeout=300s job/openbao-init-job -n cf-openbao - -# Gitea bootstrap -echo "Bootstrapping Gitea..." 
-generate_password() { - openssl rand -hex 16 | tr 'a-f' 'A-F' | head -c 32 +main() { + parse_args "$@" + # Use silent dependency check when using --apps or template mode for cleaner output + if [ -z "$APPS" ] && [ "$TEMPLATE_ONLY" = false ]; then + if [ "$SKIP_DEPENDENCY_CHECK" = false ]; then + check_dependencies + fi + validate_args + print_summary + else + # For --apps mode or template mode, check deps silently and skip verbose output + if [ "$SKIP_DEPENDENCY_CHECK" = false ]; then + check_dependencies true + fi + validate_args + fi + + # If specific apps are requested, check if they're cluster-forge child apps + if [ -n "$APPS" ]; then + local has_bootstrap_apps=false + local has_child_apps=false + local child_apps="" + + IFS=',' read -ra APP_ARRAY <<< "$APPS" + for app in "${APP_ARRAY[@]}"; do + case "$app" in + namespaces|argocd|openbao|gitea|cluster-forge) + has_bootstrap_apps=true + ;; + *) + if is_cluster_forge_child_app "$app"; then + has_child_apps=true + if [ -z "$child_apps" ]; then + child_apps="$app" + else + child_apps="$child_apps,$app" + fi + else + echo "WARNING: Unknown app '$app'. Available bootstrap apps: namespaces, argocd, openbao, gitea, cluster-forge" + echo "Or specify any cluster-forge child app from values.yaml" + fi + ;; + esac + done + + # Handle bootstrap apps + if [ "$has_bootstrap_apps" = true ]; then + should_run namespaces && create_namespaces + should_run argocd && bootstrap_argocd + should_run openbao && bootstrap_openbao + should_run gitea && bootstrap_gitea + should_run cluster-forge && apply_cluster_forge_parent_app + fi + + # Handle cluster-forge child apps + if [ "$has_child_apps" = true ]; then + # Temporarily set APPS to only child apps for the render function + local original_apps="$APPS" + APPS="$child_apps" + render_cluster_forge_child_apps + APPS="$original_apps" + fi + else + # Default behavior - run all bootstrap components + log_info "🚀 Running full bootstrap sequence..." 
+ log_info "📋 Bootstrap order: namespaces → argocd → openbao → gitea → cluster-forge" + + if should_run namespaces; then + log_info "📦 Step 1/5: Creating namespaces" + create_namespaces + else + log_info "⏭️ Step 1/5: Skipping namespaces" + fi + + if should_run argocd; then + log_info "📦 Step 2/5: Bootstrapping ArgoCD" + bootstrap_argocd + else + log_info "⏭️ Step 2/5: Skipping ArgoCD" + fi + + if should_run openbao; then + log_info "📦 Step 3/5: Bootstrapping OpenBao" + bootstrap_openbao + else + log_info "⏭️ Step 3/5: Skipping OpenBao" + fi + + if should_run gitea; then + log_info "📦 Step 4/5: Bootstrapping Gitea" + bootstrap_gitea + else + log_info "⏭️ Step 4/5: Skipping Gitea" + fi + + if should_run cluster-forge; then + log_info "📦 Step 5/5: Creating ClusterForge parent app" + apply_cluster_forge_parent_app + else + log_info "⏭️ Step 5/5: Skipping ClusterForge" + fi + + log_info "✅ Bootstrap sequence completed" + fi } -# Create initial-cf-values configmap with merged values -echo "Creating initial-cf-values configmap from merged configuration..." 
-kubectl create configmap initial-cf-values --from-literal=initial-cf-values="$(cat /tmp/merged_values.yaml)" --dry-run=client -o yaml | kubectl apply -n cf-gitea -f - - -kubectl create secret generic gitea-admin-credentials \ - --namespace=cf-gitea \ - --from-literal=username=silogen-admin \ - --from-literal=password=$(generate_password) \ - --dry-run=client -o yaml | kubectl apply -f - - -$YQ_CMD eval '.apps.gitea.valuesObject' ${SCRIPT_DIR}/../root/${VALUES_FILE} > /tmp/gitea_values.yaml -$YQ_CMD eval '.apps.gitea.valuesObject' ${SCRIPT_DIR}/../root/${SIZE_VALUES_FILE} > /tmp/gitea_size_values.yaml - -# Bootstrap Gitea -helm template --release-name gitea ${SCRIPT_DIR}/../sources/gitea/12.3.0 --namespace cf-gitea \ - -f /tmp/gitea_values.yaml \ - -f /tmp/gitea_size_values.yaml \ - --set gitea.config.server.ROOT_URL="https://gitea.${DOMAIN}/" \ - --kube-version=${KUBE_VERSION} | kubectl apply -f - -kubectl rollout status deploy/gitea -n cf-gitea - -# Gitea Init Job -helm template --release-name gitea-init ${SCRIPT_DIR}/init-gitea-job \ - --set domain="${DOMAIN}" \ - --set clusterSize="values_${CLUSTER_SIZE}.yaml" \ - --kube-version=${KUBE_VERSION} \ - | kubectl apply -f - - -kubectl wait --for=condition=complete --timeout=300s job/gitea-init-job -n cf-gitea - -# Create cluster-forge app-of-apps with merged configuration -echo "Creating ClusterForge app-of-apps (size: $CLUSTER_SIZE)..." 
-helm template ${SCRIPT_DIR}/../root \ - -f /tmp/merged_values.yaml \ - --kube-version=${KUBE_VERSION} | kubectl apply -f - - -echo "" -echo "=== ClusterForge Bootstrap Complete ===" -echo "Domain: $DOMAIN" -echo "Cluster size: $CLUSTER_SIZE" -echo "Access ArgoCD at: https://argocd.${DOMAIN}" -echo "Access Gitea at: https://gitea.${DOMAIN}" -echo "" -if [ "$DEV_MODE" = true ]; then - echo "Mode: Development using non-main targetRevision" -fi -echo "Target revision: $TARGET_REVISION" -echo "Access ArgoCD at: https://argocd.${DOMAIN}" -echo "Access Gitea at: https://gitea.${DOMAIN}" -echo "" -echo "This is the way!" - -# Cleanup temporary files -echo "Cleaning up temporary files..." -rm -f /tmp/merged_values.yaml /tmp/argocd_values.yaml /tmp/openbao_values.yaml \ No newline at end of file +main "$@" \ No newline at end of file diff --git a/scripts/init-gitea-job/templates/cf-init-gitea-cm.yaml b/scripts/init-gitea-job/templates/cf-init-gitea-cm.yaml index 36286875..991c81cb 100644 --- a/scripts/init-gitea-job/templates/cf-init-gitea-cm.yaml +++ b/scripts/init-gitea-job/templates/cf-init-gitea-cm.yaml @@ -12,6 +12,20 @@ data: GITEA_URL="${GITEA_URL:-http://gitea-http.cf-gitea.svc:3000}" GITEA_ADMIN_USER="${GITEA_ADMIN_USER:-silogen-admin}" + echo "Waiting for Gitea service to be ready..." + for i in {1..30}; do + if curl -s --max-time 5 "${GITEA_URL}" >/dev/null 2>&1; then + echo "Gitea service is ready and responding" + break + fi + if [ $i -eq 30 ]; then + echo "ERROR: Gitea service not ready after 30 attempts" + exit 1 + fi + echo "Waiting for Gitea to be ready... 
attempt $i/30" + sleep 2 + done + echo "Step 0: Create admin access token" GITEA_TOKEN=$(kubectl -n cf-gitea exec deploy/gitea -c gitea -- \ gitea admin user generate-access-token --raw --username "$GITEA_ADMIN_USER" --token-name "api-token-$(date +%s)" --scopes all @@ -134,7 +148,34 @@ data: global: clusterSize: {{ .Values.clusterSize }} domain: DOMAIN_PLACEHOLDER + {{- if .Values.airmImageRepository }} + # AIRM Image Repository Configuration + apps: + airm: + helmParameters: + # APP-DOMAIN (defined in base, but would be overridden here if omitted) + - name: airm-api.airm.appDomain + value: "{{ .Values.domain }}" + # BACKEND (airm-api) + - name: airm-api.airm.backend.image.repository + value: "{{ .Values.airmImageRepository }}/airm-api" + - name: airm-api.airm.backend.imagePullSecrets[0].name + value: regcred + # FRONTEND (airm-ui) + - name: airm-api.airm.frontend.image.repository + value: "{{ .Values.airmImageRepository }}/airm-ui" + - name: airm-api.airm.frontend.imagePullSecrets[0].name + value: regcred + # DISPATCHER (airm-dispatcher) + - name: airm-dispatcher.airm.dispatcher.image.repository + value: "{{ .Values.airmImageRepository }}/airm-dispatcher" + - name: airm-dispatcher.airm.dispatcher.imagePullSecrets[0].name + value: regcred + # AGENT (airm-agent) + - name: agent.airm.imagePullSecrets[0].name + value: regcred + {{- end }} EOF sed -i "s/DOMAIN_PLACEHOLDER/${DOMAIN}/g" /tmp/values.yaml diff --git a/scripts/init-gitea-job/values.yaml b/scripts/init-gitea-job/values.yaml index 2790dc92..25886155 100644 --- a/scripts/init-gitea-job/values.yaml +++ b/scripts/init-gitea-job/values.yaml @@ -1,3 +1,10 @@ -domain: # to be filled by bootstrap script -clusterSize: values_medium.yaml # to be filled by bootstrap script -targetRevision: v1.8.0-rc2 +# small|medium|large (injected by bootstrap script) +clusterSize: null +# Domain for the cluster (injected by bootstrap script) +# Example: "compute.amd.com" +domain: null +# Git revision to deploy (injected by bootstrap 
script) +targetRevision: v1.8.0-rc4 +# Base image repository for AIRM components (injected by bootstrap script when specified) +# Example: "ghcr.io/silogen" +airmImageRepository: null diff --git a/scripts/validate-cluster-sizes.sh b/scripts/validate-cluster-sizes.sh deleted file mode 100755 index 31a54649..00000000 --- a/scripts/validate-cluster-sizes.sh +++ /dev/null @@ -1,170 +0,0 @@ -#!/bin/bash -# ClusterForge Size Configuration Validation Script -# ============================================================================= -# This script validates the YAML structure and shows how size configurations work -# for ClusterForge applications without requiring Helm to be installed. - -set -euo pipefail - -SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" -PROJECT_ROOT="$SCRIPT_DIR/.." - -# Colors for output -GREEN='\033[0;32m' -YELLOW='\033[1;33m' -BLUE='\033[0;34m' -NC='\033[0m' - -log_info() { - echo -e "${BLUE}[INFO]${NC} $1" -} - -log_success() { - echo -e "${GREEN}[SUCCESS]${NC} $1" -} - -log_section() { - echo -e "${YELLOW}[SECTION]${NC} $1" -} - -# Check YAML syntax using available tools -check_yaml() { - local file="$1" - local filename=$(basename "$file") - - # Check if file exists - if [ ! -f "$file" ]; then - echo "❌ $filename: File not found" - return 1 - fi - - # Check if file is readable - if [ ! -r "$file" ]; then - echo "❌ $filename: File not readable" - return 1 - fi - - # Try different validation methods - local validation_method="" - local temp_output="" - - # Method 1: Try yq v4+ syntax - if command -v yq &> /dev/null; then - if temp_output=$(yq eval '.' "$file" 2>&1); then - validation_method="yq v4" - elif temp_output=$(yq . 
"$file" 2>&1); then - validation_method="yq v3" - elif temp_output=$(yq r "$file" 2>&1); then - validation_method="yq v2" - fi - fi - - # Method 2: Try python if yq failed - if [ -z "$validation_method" ] && command -v python3 &> /dev/null; then - if temp_output=$(python3 -c "import yaml; yaml.safe_load(open('$file', 'r'))" 2>&1); then - validation_method="python3" - fi - fi - - # Method 3: Try python2 if python3 failed - if [ -z "$validation_method" ] && command -v python &> /dev/null; then - if temp_output=$(python -c "import yaml; yaml.safe_load(open('$file', 'r'))" 2>&1); then - validation_method="python2" - fi - fi - - # If validation succeeded with any method - if [ -n "$validation_method" ]; then - log_success "$filename: Valid YAML syntax (validated with $validation_method)" - return 0 - fi - - # All validation methods failed - fall back to basic checks - log_info "$filename: Cannot validate YAML syntax (no working validator found)" - - # Check for common YAML issues - if grep -q $'\t' "$file"; then - echo "❌ $filename: Contains tabs (YAML requires spaces)" - return 1 - fi - - # Check for basic structure (allow comments at start) - if grep -m 1 "^[a-zA-Z]" "$file" >/dev/null 2>&1; then - log_success "$filename: Basic structure OK (install yq/python for full validation)" - return 0 - else - echo "❌ $filename: No valid YAML content found" - return 1 - fi -} - -# Show key differences between configurations -show_config_differences() { - local size="$1" - - log_section "Key differences for $size cluster:" - - case "$size" in - small) - echo " - ArgoCD: Single replica, no HA Redis" - echo " - MinIO: 1 server, 500GB storage" - echo " - OpenBao: Single instance (no HA)" - echo " - Prometheus: 7d retention, minimal resources" - echo " - Target: 1-5 users, development/testing" - ;; - medium) - echo " - ArgoCD: 2 replicas with HA Redis" - echo " - MinIO: 3 servers, 6TB total storage" - echo " - OpenBao: 3 replicas with Raft HA" - echo " - Enhanced resources for team 
collaboration" - echo " - Target: 5-20 users, production workloads" - ;; - large) - echo " - ArgoCD: 3 replicas with enhanced PDB" - echo " - MinIO: External HA S3 recommended" - echo " - OpenBao: Full HA with enhanced security" - echo " - Full observability stack with extended retention" - echo " - Target: 10s-100s users, enterprise scale" - ;; - esac -} - -main() { - log_info "Validating ClusterForge configuration files..." - echo - - # Validate base configuration - log_section "Base Configuration" - check_yaml "$PROJECT_ROOT/root/values.yaml" - echo - - # Validate size-specific configurations - for size in small medium large; do - log_section "$size Cluster Configuration" - check_yaml "$PROJECT_ROOT/root/values_$size.yaml" - show_config_differences "$size" - echo - done - - log_section "Configuration Summary" - echo "✅ Base values.yaml: All ClusterForge applications enabled" - echo "✅ values_small.yaml: Minimal resources for 1-5 users (dev/test)" - echo "✅ values_medium.yaml: Balanced setup for 5-20 users (teams)" - echo "✅ values_large.yaml: Enterprise features for 10s-100s users" - echo - - log_section "Usage Examples" - echo " # Small cluster (development/testing):" - echo " ./scripts/bootstrap.sh dev.example.com --CLUSTER_SIZE=small" - echo - echo " # Medium cluster (team production - default):" - echo " ./scripts/bootstrap.sh team.example.com" - echo - echo " # Large cluster (enterprise scale):" - echo " ./scripts/bootstrap.sh prod.example.com --CLUSTER_SIZE=large" - echo - - log_success "All ClusterForge size configurations are valid! This is the way." -} - -main "$@" \ No newline at end of file diff --git a/sources/airm/0.3.3/.helmignore b/sources/airm/0.3.3/.helmignore new file mode 100644 index 00000000..64c2af83 --- /dev/null +++ b/sources/airm/0.3.3/.helmignore @@ -0,0 +1,27 @@ +# Copyright © Advanced Micro Devices, Inc., or its affiliates. +# +# SPDX-License-Identifier: MIT + +# Patterns to ignore when building packages. 
+# This supports shell glob matching, relative path matching, and +# negation (prefixed with !). Only one pattern per line. +.DS_Store +# Common VCS dirs +.git/ +.gitignore +.bzr/ +.bzrignore +.hg/ +.hgignore +.svn/ +# Common backup files +*.swp +*.bak +*.tmp +*.orig +*~ +# Various IDEs +.project +.idea/ +*.tmproj +.vscode/ diff --git a/sources/airm/0.3.3/Chart.yaml b/sources/airm/0.3.3/Chart.yaml new file mode 100644 index 00000000..ba5cf2b4 --- /dev/null +++ b/sources/airm/0.3.3/Chart.yaml @@ -0,0 +1,35 @@ +# Copyright © Advanced Micro Devices, Inc., or its affiliates. +# +# SPDX-License-Identifier: MIT + +--- +apiVersion: v2 +name: airm +description: A Helm chart for AIRM full stack, including API, UI and dispatcher + +# A chart can be either an 'application' or a 'library' chart. +# +# Application charts are a collection of templates that can be packaged into versioned archives +# to be deployed. +# +# Library charts provide useful utilities or functions for the chart developer. They're included as +# a dependency of application charts to inject those utilities and functions into the rendering +# pipeline. Library charts do not define any templates and therefore cannot be deployed. +type: application + +# This is the chart version. This version number should be incremented each time you make changes +# to the chart and its templates, including the app version. +# Versions are expected to follow Semantic Versioning (https://semver.org/) +version: 0.3.3 + +# This is the version number of the application being deployed. This version number should be +# incremented each time you make changes to the application. Versions are not expected to +# follow Semantic Versioning. They should reflect the version the application is using. +# It is recommended to use it with quotes. 
+appVersion: "1.0.0" + +dependencies: + - name: airm-api + version: 0.3.3 + - name: airm-dispatcher + version: 0.3.3 diff --git a/sources/airm/0.3.3/charts/airm-api/.helmignore b/sources/airm/0.3.3/charts/airm-api/.helmignore new file mode 100644 index 00000000..64c2af83 --- /dev/null +++ b/sources/airm/0.3.3/charts/airm-api/.helmignore @@ -0,0 +1,27 @@ +# Copyright © Advanced Micro Devices, Inc., or its affiliates. +# +# SPDX-License-Identifier: MIT + +# Patterns to ignore when building packages. +# This supports shell glob matching, relative path matching, and +# negation (prefixed with !). Only one pattern per line. +.DS_Store +# Common VCS dirs +.git/ +.gitignore +.bzr/ +.bzrignore +.hg/ +.hgignore +.svn/ +# Common backup files +*.swp +*.bak +*.tmp +*.orig +*~ +# Various IDEs +.project +.idea/ +*.tmproj +.vscode/ diff --git a/sources/airm/0.3.3/charts/airm-api/Chart.yaml b/sources/airm/0.3.3/charts/airm-api/Chart.yaml new file mode 100644 index 00000000..53989d09 --- /dev/null +++ b/sources/airm/0.3.3/charts/airm-api/Chart.yaml @@ -0,0 +1,29 @@ +# Copyright © Advanced Micro Devices, Inc., or its affiliates. +# +# SPDX-License-Identifier: MIT + +--- +apiVersion: v2 +name: airm-api +description: A Helm chart for AIRM API and UI + +# A chart can be either an 'application' or a 'library' chart. +# +# Application charts are a collection of templates that can be packaged into versioned archives +# to be deployed. +# +# Library charts provide useful utilities or functions for the chart developer. They're included as +# a dependency of application charts to inject those utilities and functions into the rendering +# pipeline. Library charts do not define any templates and therefore cannot be deployed. +type: application + +# This is the chart version. This version number should be incremented each time you make changes +# to the chart and its templates, including the app version. 
+# Versions are expected to follow Semantic Versioning (https://semver.org/) +version: 0.3.3 + +# This is the version number of the application being deployed. This version number should be +# incremented each time you make changes to the application. Versions are not expected to +# follow Semantic Versioning. They should reflect the version the application is using. +# It is recommended to use it with quotes. +appVersion: "1.0.0" diff --git a/sources/airm/0.3.3/charts/airm-api/README.md b/sources/airm/0.3.3/charts/airm-api/README.md new file mode 100644 index 00000000..a16ec9da --- /dev/null +++ b/sources/airm/0.3.3/charts/airm-api/README.md @@ -0,0 +1,124 @@ + + +# HELM CHARTS + +Simple instructions to deploy AIRM UI and API applications using helm chart + +### 1. Requirements + +The following external components must be available in the Kubernetes cluster before the helm chart can be installed. + +- API Gateway implementation (e.g. KGateway) +- Keycloak with the expected `airm` realm installed +- Valid S3 compatible file storage service (e.g. MinIO) +- RabbitMQ operator +- Cert Manager operator +- External Secret operator +- CNPG operator +- OTEL LGTM stack installed on the cluster + +### 2. Install + +``` +cd helm/airm/charts + +# 1. Create output template just to validate (the public domain could be app-dev.silogen.ai, staging.silogen.ai, etc.) +helm template airm-api ./airm-api -n airm --create-namespace --set airm.appDomain=<public-domain> > airm-api-helm-generated.yaml + +# 2. Run chart install +helm install airm-api ./airm-api -n airm --create-namespace --set airm.appDomain=<public-domain> + +# 3. Delete chart if needed +helm delete airm-api -n airm + +# 4. Upgrade when bumping versions +helm upgrade -n airm --set airm.appDomain=<public-domain> airm-api ./airm-api +``` + +--- + +### 3. 
Helm Settings + +| Field Path | Description | Type | Example / Default | +|-------------------------------------------------------------------------------|-----------------------------------------------------------------| ------ |---------------------------------------------------------------------------------------------------| +| secretgenerator.image.repository | Docker image repository for secret generator | string | `ghcr.io/silogen/kubectl` | +| secretgenerator.image.tag | Docker image tag | string | `latest` | +| secretgenerator.image.pullPolicy | Image pull policy | string | `IfNotPresent` | +| kgateway.namespace | Namespace for kgateway resources | string | `kgateway-system` | +| kgateway.gatewayName | Gateway name | string | `https` | +| kgateway.airmapi.servicePort | Service port for airmapi | int | `80` | +| kgateway.airmapi.prefixValue | URL prefix for airmapi service | string | `airmapi` | +| kgateway.airmui.servicePort | Service port for airmui | int | `80` | +| kgateway.airmui.prefixValue | URL prefix for airmui service | string | `airmui` | +| aims.otelCollector.exporters.otlphttp.endpoint | Open Telemetry collector endpoint url for inference metrics | string | `http://lgtm-stack.otel-lgtm-stack.svc:4318` | +| aims.otelCollector.image | Base image for Open Telemetry Collector | string | `ghcr.io/open-telemetry/opentelemetry-collector-releases/opentelemetry-collector-contrib:0.113.0` | +| aims.otelCollector.receivers.prometheus.config.scrape_configs.scrape_interval | Inference metrics scraping interval | string | `20s` | +| airm.includeDemoSetup | Include the demo organization and project setup when installing | bool | `true` | +| airm.appDomain | Public IP or domain for airm | string | `PUBLIC-IP` | +| airm.externalSecretStore.airm.name | Secret store name for airm | string | `airm-secret-store` | +| airm.externalSecretStore.minio.name | Secret store name for minio | string | `k8s-secret-store` | +| airm.externalSecretStore.keycloak.name | Secret 
store name for keycloak | string | `keycloak-secret-store` | +| airm.keycloak.publicUrl | Public URL to access keycloak | string | `https://kc.{{ .Values.airm.appDomain }}` | +| airm.keycloak.internalUrl | Internal URL to access keycloak | string | `http://keycloak.keycloak.svc.cluster.local:8080` | +| airm.keycloak.clientId | Client ID to access keycloak | string | `354a0fa1-35ac-4a6d-9c4d-d661129c2cd0` | +| airm.keycloak.realm | Keycloak realm for authentication | string | `airm` | +| airm.postgresql.cnpg.image | PostgreSQL container image | string | `ghcr.io/cloudnative-pg/postgresql:17` | +| airm.postgresql.cnpg.instance | Number of PostgreSQL instances | int | `1` | +| airm.postgresql.cnpg.resources.limits.cpu | CPU limit for PostgreSQL container | string | `"2"` | +| airm.postgresql.cnpg.resources.limits.memory | Memory limit for PostgreSQL container | string | `1Gi` | +| airm.postgresql.cnpg.resources.requests.cpu | CPU request for PostgreSQL container | string | `"1"` | +| airm.postgresql.cnpg.resources.requests.memory | Memory request for PostgreSQL container | string | `512Mi` | +| airm.postgresql.cnpg.storage.size | Storage size for PostgreSQL | string | `50Gi` | +| airm.postgresql.cnpg.storage.storageClass | Storage class for PostgreSQL | string | `default` | +| airm.postgresql.cnpg.walStorage.size | WAL storage size for PostgreSQL | string | `50Gi` | +| airm.postgresql.cnpg.walStorage.storageClass | WAL storage class for PostgreSQL | string | `default` | +| airm.rabbitmq.replicas | Number of replicas for the RabbitMQ cluster | int | `1` | +| airm.rabbitmq.resources.limits.cpu | CPU limit for for the RabbitMQ cluster | string | `1` | +| airm.rabbitmq.resources.limits.memory | Memory limit for for the RabbitMQ cluster | string | `1Gi` | +| airm.rabbitmq.resources.requests.cpu | CPU request for the RabbitMQ cluster | string | `500m` | +| airm.rabbitmq.resources.requests.memory | Memory request for the RabbitMQ cluster | string | `1Gi` | +| 
airm.rabbitmq.persistence.storage | Persistent storage size for the RabbitMQ cluster | string | `20Gi` | +| airm.rabbitmq.persistence.storageClassName | Storage class name for the RabbitMQ cluster | string | `default` | +| airm.rabbitmq.backup.enabled | Enable RabbitMQ backup | bool | `false` | +| airm.rabbitmq.backup.image | RabbitMQ backup container image | string | `amdenterpriseai/rabbitmq-backup:0.1` | +| airm.rabbitmq.backup.resources.limits.memory | Memory limit for cron job of RabbitMQ backup | string | `512Mi` | +| airm.rabbitmq.backup.resources.requests.cpu | CPU request for cron job of RabbitMQ backup | string | `250m` | +| airm.rabbitmq.backup.resources.requests.memory | Memory request for cron job of RabbitMQ backup | string | `256Mi` | +| airm.frontend.image.repository | Frontend image repository | string | `amdenterpriseai/airm-ui` | +| airm.frontend.image.tag | Frontend image tag | string | `v2025.08-rc.21` | +| airm.frontend.image.pullPolicy | Frontend image pull policy | string | `IfNotPresent` | +| airm.frontend.servicePort | Frontend service port | int | `80` | +| airm.frontend.resources.limits.memory | Memory limit for frontend | string | `4Gi` | +| airm.frontend.resources.requests.cpu | CPU request for frontend | string | `500m` | +| airm.frontend.resources.requests.memory | Memory request for frontend | string | `4Gi` | +| airm.backend.image.repository | Backend API image repository | string | `amdenterpriseai/airm-api` | +| airm.backend.image.tag | Backend API image tag | string | `v2025.08-rc.21` | +| airm.backend.image.pullPolicy | Backend API image pull policy | string | `IfNotPresent` | +| airm.backend.servicePort | Backend API service port | int | `80` | +| airm.backend.servicePortMetrics | Backend API metrics service port | int | `9009` | +| airm.backend.env.dbPort | Database port | int | `5432` | +| airm.backend.env.rabbitmqPort | RabbitMQ port | int | `5672` | +| airm.backend.env.minioUrl | Minio service URL | string | 
`http://minio.minio-tenant-default.svc.cluster.local:80` | +| airm.backend.env.minioBucket | Minio bucket name | string | `default-bucket` | +| airm.backend.env.prometheusUrl | Prometheus service URL | string | `http://lgtm-stack.otel-lgtm-stack.svc.cluster.local:9090` | +| airm.backend.env.clusterAuthUrl | Cluster auth service URL | string | `http://cluster-auth.cluster-auth.svc.cluster.local:8081` | +| airm.backend.resources.limits.memory | Memory limit for backend API container | string | `1Gi` | +| airm.backend.resources.requests.cpu | CPU request for backend API container | string | `500m` | +| airm.backend.resources.requests.memory | Memory request for backend API container | string | `1Gi` | +| airm.backend.securityContext.allowPrivilegeEscalation | Security context: allow privilege escalation | bool | `false` | +| airm.backend.securityContext.runAsNonRoot | Security context: run container as non-root | bool | `true` | +| airm.backend.securityContext.runAsUser | Security context: user ID to run container as | int | `1000` | +| airm.backend.securityContext.seccompProfile.type | Security context: seccomp profile type | string | `RuntimeDefault` | +| airm.utilities.netcat.image.repository | Netcat image repository | string | `busybox` | +| airm.utilities.netcat.image.tag | Netcat image tag | string | `1.37.0` | +| airm.utilities.netcat.image.pullPolicy | Netcat image pull policy | string | `IfNotPresent` | +| airm.utilities.curl.image.repository | Curl image repository | string | `curlimages/curl` | +| airm.utilities.curl.image.tag | Curl image tag | string | `8.16.0` | +| airm.utilities.curl.image.pullPolicy | Curl image pull policy | string | `IfNotPresent` | +| airm.utilities.liquibase.image.repository | Liquibase image repository | string | `docker.io/liquibase/liquibase` | +| airm.utilities.liquibase.image.tag | Liquibase image tag | string | `4.31` | +| airm.utilities.liquibase.image.pullPolicy | Liquibase image pull policy | string | `IfNotPresent` | 
diff --git a/sources/airm/0.3.3/charts/airm-api/files/configure.sh b/sources/airm/0.3.3/charts/airm-api/files/configure.sh new file mode 100644 index 00000000..69a3f59d --- /dev/null +++ b/sources/airm/0.3.3/charts/airm-api/files/configure.sh @@ -0,0 +1,374 @@ +#!/bin/bash + +# Copyright © Advanced Micro Devices, Inc., or its affiliates. +# +# SPDX-License-Identifier: MIT + +##################################################################################### +echo "" +echo "Run configure script block..." +echo "" + +# --- Configuration Variables --- +# Get values from bloom configmap mounted as env + +# NOTE: ORG_NAME is hardcoded to demo because gpu operator metrics has same org name hardcoded there +# Otherwise the following line can be uncommented to consider the real org name from domain config +# ORG_NAME=$(echo $NEW_DOMAIN_NAME | awk -F '.' '{ print $2 }') +ORG_NAME="demo" +ORG_DOMAINS="[\"${NEW_DOMAIN_NAME}\"]" +CLUSTER_WORKLOADS_BASE_URL="https://workspaces.${NEW_DOMAIN_NAME}/" +CLUSTER_KUBE_API_URL="https://k8s.${NEW_DOMAIN_NAME}" +USER_EMAIL="devuser@${NEW_DOMAIN_NAME}" +PROJECT_NAME="demo" +PROJECT_DESCRIPTION="demo" +CLUSTER_NAME="demo-cluster" +TIMEOUT=300 +SLEEP_INTERVAL=5 + +# --- Input Validation --- +echo "Validating environment variables..." +echo "KEYCLOAK_CLIENT_ID: ${KEYCLOAK_CLIENT_ID}" +echo "NEW_DOMAIN_NAME: ${NEW_DOMAIN_NAME}" +echo "AIRM_API_URL: ${AIRM_API_URL}" + +function check_env_variable() { + if [ -z "${!1}" ]; then + echo "ERROR: $1 environment variable is not set." 
+ exit 1 + fi +} + +function check_success() { + if [ "$1" -ne 0 ]; then + echo "ERROR: $2" + exit 1 + fi +} + +check_env_variable "AIRM_API_URL" +check_env_variable "KEYCLOAK_URL" +check_env_variable "KEYCLOAK_REALM" +check_env_variable "KEYCLOAK_CLIENT_SECRET" +check_env_variable "KEYCLOAK_CLIENT_ID" +check_env_variable "KEYCLOAK_ADMIN_CLIENT_ID" +check_env_variable "KEYCLOAK_ADMIN_CLIENT_SECRET" +check_env_variable "USER_PASSWORD" + +function refresh_token() { + TOKEN=$(curl -s -d "client_id=${KEYCLOAK_CLIENT_ID}" -d "username=${USER_EMAIL}" -d "password=${USER_PASSWORD}" -d 'grant_type=password' -d "client_secret=${KEYCLOAK_CLIENT_SECRET}" "${KEYCLOAK_URL}/realms/${KEYCLOAK_REALM}/protocol/openid-connect/token" | jq -r '.access_token') + if [ -z "$TOKEN" ] || [ "$TOKEN" == "null" ]; then + echo "ERROR: Failed to obtain access token from Keycloak." + exit 1 + fi +} + +function create_org() { + # Try to get ORG_ID by name + ORG_ID=$(curl -s -X GET "${AIRM_API_URL}/v1/organizations" \ + -H "Authorization: Bearer ${TOKEN}" \ + -H 'Content-Type: application/json' | jq -r --arg name "$ORG_NAME" '.organizations[] | select(.name==$name) | .id') + + # If not found, create the org and fetch the ID again + if [ -z "$ORG_ID" ] || [ "$ORG_ID" == "null" ]; then + ORG_RESP=$(curl -s -o /dev/null -X POST -w "%{http_code}" "${AIRM_API_URL}/v1/organizations" \ + -H "Authorization: Bearer ${TOKEN}" \ + -H 'Content-Type: application/json' \ + -d "{ \"name\": \"$ORG_NAME\", \"domains\": $ORG_DOMAINS }") + echo "$ORG_RESP" + check_success "$([[ "$ORG_RESP" == "200" || "$ORG_RESP" == "201" ]] && echo 0 || echo 1)" "Failed to create organization" + + ORG_ID=$(curl -s -X GET "${AIRM_API_URL}/v1/organizations" \ + -H "Authorization: Bearer ${TOKEN}" \ + -H 'Content-Type: application/json' | jq -r --arg name "$ORG_NAME" '.organizations[] | select(.name==$name) | .id') + fi + + if [ -z "$ORG_ID" ] || [ "$ORG_ID" == "null" ]; then + echo "ERROR: Failed to create or retrieve organization 
ID." + exit 1 + else + echo "ORG_ID=${ORG_ID}" + fi +} + +function add_user_to_org() { + # Check if user exists in org + USER_EXISTS=$(curl -s -X GET "${AIRM_API_URL}/v1/users" -H 'accept: application/json' -H "Authorization: Bearer ${TOKEN}" -H 'Content-Type: application/json' | jq -r --arg email "$USER_EMAIL" '.data? // [] | .[] | select(.email==$email) | .email') + # Add user to org if they don't exist + if [ -z "$USER_EXISTS" ] || [ "$USER_EXISTS" == "null" ]; then + echo "$USER_EXISTS" + echo "User '$USER_EMAIL' not found in organization. Adding..." + ADD_USER_RESP=$(curl -w "%{http_code}" -X 'POST' "${AIRM_API_URL}/v1/organizations/${ORG_ID}/users" -H 'accept: application/json' -H "Authorization: Bearer ${TOKEN}" -H 'Content-Type: application/json' -d '{ "email": "'"$USER_EMAIL"'", "roles": ["Platform Administrator"]}') + echo "$ADD_USER_RESP" + check_success "$([[ "$ADD_USER_RESP" == "200" || "$ADD_USER_RESP" == "201" || "$ADD_USER_RESP" == "null201" ]] && echo 0 || echo 1)" "Failed to add user to organization" + else + echo "User '$USER_EMAIL' already exists in organization." + fi +} + +function create_project() { + PROJECT_ID=$(curl -s -X GET "${AIRM_API_URL}/v1/projects" -H 'accept: application/json' -H "Authorization: Bearer ${TOKEN}" | jq -r '.projects[] | select(.name=="'$PROJECT_NAME'") | .id') + + for (( i=0; i<=TIMEOUT; i+=SLEEP_INTERVAL )); do + CLUSTER_STATUS=$(curl -s -X GET "${AIRM_API_URL}/v1/clusters/$CLUSTER_ID" \ + -H "Authorization: Bearer ${TOKEN}" \ + -H 'Content-Type: application/json' | jq -r '.status') + + if [ "$CLUSTER_STATUS" == "healthy" ]; then + echo "Cluster is healthy!" + break # Exit the loop if the cluster is healthy + fi + echo "Cluster status: $CLUSTER_STATUS. Waiting $SLEEP_INTERVAL seconds... ($i/$TIMEOUT seconds elapsed)" + sleep $SLEEP_INTERVAL + done + + if [ "$CLUSTER_STATUS" != "healthy" ]; then + echo "ERROR: Cluster did not become healthy within $TIMEOUT seconds." 
+ exit 1 + fi + + if [ -z "$PROJECT_ID" ] || [ "$PROJECT_ID" == "null" ]; then + echo "Projects '$PROJECT_NAME' not found. Creating..." + PROJECT_ID=$(curl -X 'POST' \ + "${AIRM_API_URL}/v1/projects" \ + -H 'accept: application/json' \ + -H "Authorization: Bearer ${TOKEN}" \ + -H 'Content-Type: application/json' \ + -d '{ + "name": "'"$PROJECT_NAME"'", + "description": "'"$PROJECT_DESCRIPTION"'", + "cluster_id": "'"$CLUSTER_ID"'", + "quota": { + "cpu_milli_cores": 0, + "memory_bytes": 0, + "ephemeral_storage_bytes": 0, + "gpu_count": 0 + } + }' | jq -r '.id') + echo "$PROJECT_ID" + check_success "$([[ "$PROJECT_ID" != "null" ]] && echo 0 || echo 1)" "Failed to create project" + else + echo "Project '$PROJECT_NAME' already exists with ID: $PROJECT_ID" + fi +} + +function add_minio_secret_and_storage_to_project() { + for (( i=0; i<=TIMEOUT; i+=SLEEP_INTERVAL )); do + PROJECT_STATUS=$(curl -s -X GET "${AIRM_API_URL}/v1/projects/$PROJECT_ID" \ + -H "Authorization: Bearer ${TOKEN}" \ + -H 'Content-Type: application/json' | jq -r '.status') + + if [ "$PROJECT_STATUS" == "Ready" ]; then + echo "Project is ready!" + break # Exit the loop if the project is ready + fi + echo "Project status: $PROJECT_STATUS. Waiting $SLEEP_INTERVAL seconds... ($i/$TIMEOUT seconds elapsed)" + sleep $SLEEP_INTERVAL + done + + SECRET_NAME="minio-credentials-fetcher" + STORAGE_NAME="minio-storage" + + SECRET_IN_PROJECT=$(curl -X 'GET' \ + "${AIRM_API_URL}/v1/projects/${PROJECT_ID}/secrets" \ + -H 'accept: application/json' \ + -H "Authorization: Bearer ${TOKEN}" | jq -r '.project_secrets[] | select(.secret.name=="'"$SECRET_NAME"'") | .id') + EXTERNAL_SECRET_API_VERSION="v1beta1" + EXTERNAL_SECRET_MANIFEST=$(cat < /dev/null 2>&1; then + echo "AIRM API is ready!" + break + else + echo "Waiting for AIRM API..." + sleep 10 + fi + done + + echo "All dependencies are ready!" 
+ securityContext: + allowPrivilegeEscalation: false + runAsNonRoot: false + runAsUser: 0 + capabilities: + drop: + - ALL + add: + - SETUID + - SETGID + - CHOWN + - DAC_OVERRIDE + - FOWNER + - FSETID + seccompProfile: + type: RuntimeDefault + containers: + - name: configure + image: "{{ .Values.airm.utilities.clusterTool.image.repository }}:{{ .Values.airm.utilities.clusterTool.image.tag }}" + imagePullPolicy: "{{ .Values.airm.utilities.clusterTool.image.pullPolicy }}" + command: ["/bin/bash"] + args: ["/scripts/configure.sh"] + env: + - name: DEBIAN_FRONTEND + value: "noninteractive" + - name: ORG_NAME + value: "demo" + - name: NEW_DOMAIN_NAME + value: "{{ .Values.airm.appDomain }}" + - name: KEYCLOAK_CLIENT_ID + value: "{{ .Values.airm.keycloak.clientId }}" + - name: KEYCLOAK_CLIENT_SECRET + valueFrom: + secretKeyRef: + key: KEYCLOAK_SECRET + name: "{{ .Release.Name }}-keycloak-ui-creds" + - name: USER_EMAIL + value: "devuser@{{ .Values.airm.appDomain }}" + - name: KEYCLOAK_URL + value: "{{ .Values.airm.keycloak.internalUrl }}" + - name: KEYCLOAK_REALM + value: "{{ .Values.airm.keycloak.realm }}" + - name: KEYCLOAK_ADMIN_CLIENT_ID + valueFrom: + secretKeyRef: + key: client-id + name: "{{ .Release.Name }}-keycloak-admin-client" + - name: KEYCLOAK_ADMIN_CLIENT_SECRET + valueFrom: + secretKeyRef: + key: client-secret + name: "{{ .Release.Name }}-keycloak-admin-client" + - name: AIRM_API_URL + value: "http://{{ .Release.Name }}-api.{{ .Release.Namespace }}.svc.cluster.local" + - name: USER_PASSWORD + valueFrom: + secretKeyRef: + key: USER_PASSWORD + name: "{{ .Release.Name }}-user-credentials" + volumeMounts: + - name: configure-script + mountPath: /scripts + securityContext: + allowPrivilegeEscalation: false + runAsNonRoot: false + runAsUser: 0 + capabilities: + drop: + - ALL + add: + - SETUID + - SETGID + - CHOWN + - DAC_OVERRIDE + - FOWNER + - FSETID + seccompProfile: + type: RuntimeDefault + volumes: + - name: configure-script + configMap: + name: "{{ 
.Release.Name }}-configure-script" + defaultMode: 0755 + +{{- end }} diff --git a/sources/airm/0.3.3/charts/airm-api/templates/airm-es.yaml b/sources/airm/0.3.3/charts/airm-api/templates/airm-es.yaml new file mode 100644 index 00000000..4dd18aeb --- /dev/null +++ b/sources/airm/0.3.3/charts/airm-api/templates/airm-es.yaml @@ -0,0 +1,215 @@ +# Copyright © Advanced Micro Devices, Inc., or its affiliates. +# +# SPDX-License-Identifier: MIT + +--- +apiVersion: external-secrets.io/v1beta1 +kind: ExternalSecret +metadata: + name: "{{ .Release.Name }}-cnpg-superuser" + namespace: "{{ .Release.Namespace }}" + annotations: + helm.sh/hook: pre-install +spec: + data: + - remoteRef: + key: airm-cnpg-superuser-username + property: value + secretKey: username + - remoteRef: + key: airm-cnpg-superuser-password + property: value + secretKey: password + secretStoreRef: + kind: ClusterSecretStore + name: {{ .Values.airm.externalSecretStore.airm.name }} + target: + name: "{{ .Release.Name }}-cnpg-superuser" + template: + type: Opaque +--- +apiVersion: external-secrets.io/v1beta1 +kind: ExternalSecret +metadata: + name: "{{ .Release.Name }}-cnpg-user" + namespace: "{{ .Release.Namespace }}" + annotations: + helm.sh/hook: pre-install +spec: + data: + - remoteRef: + key: airm-cnpg-user-username + property: value + secretKey: username + - remoteRef: + key: airm-cnpg-user-password + property: value + secretKey: password + secretStoreRef: + kind: ClusterSecretStore + name: {{ .Values.airm.externalSecretStore.airm.name }} + target: + name: "{{ .Release.Name }}-cnpg-user" + template: + type: Opaque +--- +apiVersion: external-secrets.io/v1beta1 +kind: ExternalSecret +metadata: + name: "{{ .Release.Name }}-keycloak-admin-client" + namespace: "{{ .Release.Namespace }}" + annotations: + helm.sh/hook: pre-install +spec: + data: + - remoteRef: + key: airm-keycloak-admin-client-id + property: value + secretKey: client-id + - remoteRef: + key: airm-keycloak-admin-client-secret + property: value + 
secretKey: client-secret + secretStoreRef: + kind: ClusterSecretStore + name: {{ .Values.airm.externalSecretStore.keycloak.name }} + target: + name: "{{ .Release.Name }}-keycloak-admin-client" + template: + type: Opaque +--- +apiVersion: external-secrets.io/v1beta1 +kind: ExternalSecret +metadata: + name: "{{ .Release.Name }}-rabbitmq-admin" + namespace: "{{ .Release.Namespace }}" + annotations: + helm.sh/hook: pre-install +spec: + data: + - remoteRef: + key: airm-rabbitmq-user-username + property: value + secretKey: username + - remoteRef: + key: airm-rabbitmq-user-password + property: value + secretKey: password + secretStoreRef: + kind: ClusterSecretStore + name: {{ .Values.airm.externalSecretStore.airm.name }} + target: + name: "{{ .Release.Name }}-rabbitmq-admin" + template: + data: + default_user.conf: | + default_user = {{ "{{ .username }}" }} + default_pass = {{ "{{ .password }}" }} + password: '{{ "{{ .password }}" }}' + username: '{{ "{{ .username }}" }}' + type: Opaque +--- +apiVersion: external-secrets.io/v1beta1 +kind: ExternalSecret +metadata: + name: "{{ .Release.Name }}-api-minio-credentials" + namespace: "{{ .Release.Namespace }}" + annotations: + helm.sh/hook: pre-install +spec: + data: + - remoteRef: + key: minio-api-access-key + property: value + secretKey: minio-access-key + - remoteRef: + key: minio-api-secret-key + property: value + secretKey: minio-secret-key + secretStoreRef: + kind: ClusterSecretStore + name: {{ .Values.airm.externalSecretStore.minio.name }} + target: + name: "{{ .Release.Name }}-api-minio-credentials" +--- +apiVersion: external-secrets.io/v1beta1 +kind: ExternalSecret +metadata: + name: "{{ .Release.Name }}-secrets-airm" + namespace: "{{ .Release.Namespace }}" + annotations: + helm.sh/hook: pre-install +spec: + data: + - remoteRef: + key: airm-ui-auth-nextauth-secret + property: value + secretKey: NEXTAUTH_SECRET + refreshInterval: 15s + secretStoreRef: + kind: ClusterSecretStore + name: {{ 
.Values.airm.externalSecretStore.airm.name }} + target: + name: "{{ .Release.Name }}-secrets-airm" +--- +apiVersion: external-secrets.io/v1beta1 +kind: ExternalSecret +metadata: + name: "{{ .Release.Name }}-keycloak-ui-client" + namespace: "{{ .Release.Namespace }}" + annotations: + helm.sh/hook: pre-install +spec: + data: + - remoteRef: + key: airm-ui-keycloak-secret + property: value + secretKey: KEYCLOAK_SECRET + secretStoreRef: + kind: ClusterSecretStore + name: {{ .Values.airm.externalSecretStore.keycloak.name }} + target: + name: "{{ .Release.Name }}-keycloak-ui-creds" + template: + type: Opaque +--- +apiVersion: external-secrets.io/v1beta1 +kind: ExternalSecret +metadata: + name: "{{ .Release.Name }}-cluster-auth-secrets" + namespace: "{{ .Release.Namespace }}" + annotations: + helm.sh/hook: pre-install +spec: + secretStoreRef: + kind: ClusterSecretStore + name: {{ .Values.airm.externalSecretStore.airm.name }} + refreshInterval: 5m + target: + name: "{{ .Release.Name }}-cluster-auth-admin" + data: + - secretKey: admin-token + remoteRef: + key: cluster-auth-admin-token + property: value +--- +apiVersion: external-secrets.io/v1beta1 +kind: ExternalSecret +metadata: + name: "{{ .Release.Name }}-user-credentials" + namespace: "{{ .Release.Namespace }}" + annotations: + helm.sh/hook: pre-install +spec: + data: + - remoteRef: + key: keycloak-initial-devuser-password + property: value + secretKey: USER_PASSWORD + secretStoreRef: + kind: ClusterSecretStore + name: {{ .Values.airm.externalSecretStore.airm.name }} + target: + name: "{{ .Release.Name }}-user-credentials" + template: + type: Opaque diff --git a/sources/airm/0.3.3/charts/airm-api/templates/airm-httproute.yaml b/sources/airm/0.3.3/charts/airm-api/templates/airm-httproute.yaml new file mode 100644 index 00000000..3393d6a5 --- /dev/null +++ b/sources/airm/0.3.3/charts/airm-api/templates/airm-httproute.yaml @@ -0,0 +1,81 @@ +# Copyright © Advanced Micro Devices, Inc., or its affiliates. 
+# +# SPDX-License-Identifier: MIT + +--- +apiVersion: gateway.networking.k8s.io/v1 +kind: HTTPRoute +metadata: + name: '{{ .Release.Name }}api-route' + namespace: '{{ .Release.Namespace }}' +spec: + parentRefs: + - group: gateway.networking.k8s.io + kind: Gateway + name: {{ .Values.kgateway.gatewayName }} + namespace: {{ .Values.kgateway.namespace }} + rules: + - backendRefs: + - group: "" + kind: Service + name: '{{ .Release.Name }}-api' + port: {{ .Values.kgateway.airmapi.servicePort }} + weight: 1 + matches: + - headers: + - name: Host + type: RegularExpression + value: {{ .Values.kgateway.airmapi.prefixValue }}\..* + path: + type: RegularExpression + value: .*/stream.* + timeouts: + backendRequest: {{ .Values.kgateway.airmapi.timeouts.stream.backendRequest }} + request: {{ .Values.kgateway.airmapi.timeouts.stream.request }} + - backendRefs: + - group: "" + kind: Service + name: '{{ .Release.Name }}-api' + port: {{ .Values.kgateway.airmapi.servicePort }} + weight: 1 + matches: + - headers: + - name: Host + type: RegularExpression + value: {{ .Values.kgateway.airmapi.prefixValue }}\..* + path: + type: PathPrefix + value: / + timeouts: + backendRequest: {{ .Values.kgateway.airmapi.timeouts.nonStream.backendRequest }} + request: {{ .Values.kgateway.airmapi.timeouts.nonStream.request }} +--- +apiVersion: gateway.networking.k8s.io/v1 +kind: HTTPRoute +metadata: + name: '{{ .Release.Name }}ui-route' + namespace: '{{ .Release.Namespace }}' +spec: + parentRefs: + - group: gateway.networking.k8s.io + kind: Gateway + name: {{ .Values.kgateway.gatewayName }} + namespace: {{ .Values.kgateway.namespace }} + rules: + - backendRefs: + - group: "" + kind: Service + name: '{{ .Release.Name }}-ui' + port: {{ .Values.kgateway.airmui.servicePort }} + weight: 1 + matches: + - headers: + - name: Host + type: RegularExpression + value: {{ .Values.kgateway.airmui.prefixValue }}\..* + path: + type: PathPrefix + value: / + timeouts: + backendRequest: {{ 
.Values.kgateway.airmui.timeouts.backendRequest }} + request: {{ .Values.kgateway.airmui.timeouts.request }} diff --git a/sources/airm/0.3.3/charts/airm-api/templates/airm-rabbitmq-cluster.yaml b/sources/airm/0.3.3/charts/airm-api/templates/airm-rabbitmq-cluster.yaml new file mode 100644 index 00000000..3db2ff07 --- /dev/null +++ b/sources/airm/0.3.3/charts/airm-api/templates/airm-rabbitmq-cluster.yaml @@ -0,0 +1,69 @@ +# Copyright © Advanced Micro Devices, Inc., or its affiliates. +# +# SPDX-License-Identifier: MIT + +--- +apiVersion: rabbitmq.com/v1beta1 +kind: RabbitmqCluster +metadata: + name: '{{ .Release.Name }}-rabbitmq' + namespace: '{{ .Release.Namespace }}' +spec: + persistence: + {{- toYaml .Values.airm.rabbitmq.persistence | nindent 4 }} + replicas: {{ .Values.airm.rabbitmq.replicas }} + resources: + {{- toYaml .Values.airm.rabbitmq.resources | nindent 4 }} + secretBackend: + externalSecret: + name: '{{ .Release.Name }}-rabbitmq-admin' + tls: + secretName: '{{ .Release.Name }}-tls-secret' +--- +{{- if .Values.airm.rabbitmq.backup.enabled -}} + +apiVersion: batch/v1 +kind: CronJob +metadata: + name: '{{ .Release.Name }}-rabbitmq-backup-cron' + namespace: '{{ .Release.Namespace }}' +spec: + concurrencyPolicy: Forbid + jobTemplate: + spec: + template: + spec: + containers: + - env: + - name: RABBITMQ_URL + value: 'http://{{ .Release.Name }}-rabbitmq.{{ .Release.Namespace }}.svc.cluster.local:15672' + - name: RABBITMQ_USER + valueFrom: + secretKeyRef: + key: username + name: '{{ .Release.Name }}-rabbitmq-admin' + - name: RABBITMQ_PASSWORD + valueFrom: + secretKeyRef: + key: password + name: '{{ .Release.Name }}-rabbitmq-admin' + - name: S3_HOST + value: "{{ .Values.airm.backend.env.minioUrl }}" + - name: S3_ACCESS_KEY + valueFrom: + secretKeyRef: + key: minio-access-key + name: '{{ .Release.Name }}-api-minio-credentials' + - name: S3_SECRET_KEY + valueFrom: + secretKeyRef: + key: minio-secret-key + name: '{{ .Release.Name }}-api-minio-credentials' + image: 
'{{ .Values.airm.rabbitmq.backup.image }}' + name: rabbitmq-backup-cron + resources: + {{- toYaml .Values.airm.rabbitmq.backup.resources | nindent 16 }} + restartPolicy: OnFailure + schedule: 0 * * * * + +{{- end }} diff --git a/sources/airm/0.3.3/charts/airm-api/templates/airm-vllm-collector.yaml b/sources/airm/0.3.3/charts/airm-api/templates/airm-vllm-collector.yaml new file mode 100644 index 00000000..f12aa532 --- /dev/null +++ b/sources/airm/0.3.3/charts/airm-api/templates/airm-vllm-collector.yaml @@ -0,0 +1,93 @@ +# Copyright © Advanced Micro Devices, Inc., or its affiliates. +# +# SPDX-License-Identifier: MIT + +--- +apiVersion: opentelemetry.io/v1beta1 +kind: OpenTelemetryCollector +metadata: + name: "{{ .Release.Name }}-{{ .Values.aims.otelCollector.name }}" + namespace: "{{ .Release.Namespace }}" +spec: + mode: daemonset + image: "{{ .Values.aims.otelCollector.image }}" + config: + receivers: + prometheus: + config: + scrape_configs: + - job_name: "vllm" + metrics_path: /metrics + scrape_interval: "{{ .Values.aims.otelCollector.receivers.prometheus.config.scrape_configs.scrape_interval }}" + kubernetes_sd_configs: + - role: pod + relabel_configs: + # Only scrape pods with the workload-id label + - source_labels: + [__meta_kubernetes_pod_label_airm_silogen_ai_workload_id] + action: keep + regex: .+ + # Only scrape pods with app label starting with isvc. 
+ - source_labels: [__meta_kubernetes_pod_label_app] + action: keep + regex: isvc\..* + # Set the workload_id from the label + - source_labels: + [__meta_kubernetes_pod_label_airm_silogen_ai_workload_id] + target_label: workload_id + # Set service name from app label + - source_labels: [__meta_kubernetes_pod_label_app] + target_label: service + # Set service instance id from pod name + - source_labels: [__meta_kubernetes_pod_name] + target_label: service_instance_id + # Set the scrape target to port 8000 + - source_labels: [__meta_kubernetes_pod_ip] + target_label: __address__ + replacement: $1:8000 + otlp: + protocols: + grpc: {} + http: {} + + processors: + resource: + attributes: + - key: airm.silogen.ai/workload-id + from_attribute: workload_id + action: upsert + - key: service.instance.id + from_attribute: service_instance_id + action: upsert + - key: service.name + from_attribute: service + action: upsert + + transform: + metric_statements: + - context: datapoint + statements: + - set(attributes["workload_id"], resource.attributes["airm.silogen.ai/workload-id"]) where attributes["workload_id"] == nil + - set(attributes["service_instance_id"], resource.attributes["service.instance.id"]) where attributes["service_instance_id"] == nil + - set(attributes["service"], resource.attributes["service.name"]) where attributes["service"] == nil + + exporters: + otlphttp: + endpoint: "{{ .Values.aims.otelCollector.exporters.otlphttp.endpoint }}" + + service: + pipelines: + metrics: + receivers: [prometheus] + processors: [resource, transform] + exporters: [otlphttp] + + traces: + receivers: [otlp] + processors: [resource] + exporters: [otlphttp] + + logs: + receivers: [otlp] + processors: [resource] + exporters: [otlphttp] diff --git a/sources/airm/0.3.3/charts/airm-api/values.yaml b/sources/airm/0.3.3/charts/airm-api/values.yaml new file mode 100644 index 00000000..78a9791f --- /dev/null +++ b/sources/airm/0.3.3/charts/airm-api/values.yaml @@ -0,0 +1,166 @@ +# Copyright 
© Advanced Micro Devices, Inc., or its affiliates. +# +# SPDX-License-Identifier: MIT + +secretgenerator: + image: + repository: amdenterpriseai/cluster-tool + tag: latest + pullPolicy: IfNotPresent + +kgateway: + namespace: kgateway-system + gatewayName: https + airmapi: + servicePort: 80 + prefixValue: airmapi + timeouts: + stream: + backendRequest: 30m + request: 30m + nonStream: + backendRequest: 10m + request: 10m + airmui: + servicePort: 80 + prefixValue: airmui + timeouts: + backendRequest: 1m + request: 1m + keycloak: + prefixValue: kc + +aims: + otelCollector: + image: ghcr.io/open-telemetry/opentelemetry-collector-releases/opentelemetry-collector-contrib:0.113.0 + receivers: + prometheus: + config: + scrape_configs: + scrape_interval: 20s + exporters: + otlphttp: + endpoint: "http://lgtm-stack.otel-lgtm-stack.svc:4318" + name: "vllm-collector" + +airm: + appDomain: PUBLIC-IP + includeDemoSetup: true + + externalSecretStore: + airm: + name: openbao-secret-store + minio: + name: openbao-secret-store + keycloak: + name: openbao-secret-store + + postgresql: + enabled: true + cnpg: + image: ghcr.io/cloudnative-pg/postgresql:17 + instance: 1 + resources: + limits: + cpu: "2" + memory: 1Gi + requests: + cpu: "1" + memory: 512Mi + storage: + size: 50Gi + storageClass: default + walStorage: + size: 50Gi + storageClass: default + + rabbitmq: + replicas: 1 + resources: + requests: + cpu: 500m + memory: 1Gi + limits: + cpu: '1' + memory: 1Gi + persistence: + storage: 20Gi + storageClassName: default + backup: + enabled: false + image: amdenterpriseai/rabbitmq-backup:0.1 + resources: + limits: + memory: 512Mi + requests: + cpu: 250m + memory: 256Mi + + keycloak: + internalUrl: http://keycloak.keycloak.svc.cluster.local:8080 + clientId: "354a0fa1-35ac-4a6d-9c4d-d661129c2cd0" + realm: airm + + frontend: + image: + repository: amdenterpriseai/airm-ui + tag: 0.3.3 + pullPolicy: IfNotPresent + servicePort: 80 + resources: + limits: + memory: 4Gi + requests: + cpu: 500m + 
memory: 4Gi + + backend: + image: + repository: amdenterpriseai/airm-api + tag: 0.3.3 + pullPolicy: IfNotPresent + + servicePort: 80 + servicePortMetrics: 9009 + env: + dbPort: 5432 + rabbitmqPort: 5672 + minioUrl: http://minio.minio-tenant-default.svc.cluster.local:80 + minioBucket: default-bucket + prometheusUrl: http://lgtm-stack.otel-lgtm-stack.svc.cluster.local:9090 + clusterAuthUrl: http://cluster-auth.cluster-auth.svc.cluster.local:8081 + + resources: + limits: + memory: 1Gi + requests: + cpu: 500m + memory: 1Gi + securityContext: + allowPrivilegeEscalation: false + runAsNonRoot: true + runAsUser: 1000 + seccompProfile: + type: RuntimeDefault + + utilities: + netcat: + image: + repository: busybox + tag: 1.37.0 + pullPolicy: IfNotPresent + curl: + image: + repository: curlimages/curl + tag: 8.16.0 + pullPolicy: IfNotPresent + liquibase: + image: + repository: docker.io/liquibase/liquibase + tag: 4.31 + pullPolicy: IfNotPresent + clusterTool: + image: + repository: amdenterpriseai/cluster-tool + tag: latest + pullPolicy: IfNotPresent diff --git a/sources/airm/0.3.3/charts/airm-dispatcher/.helmignore b/sources/airm/0.3.3/charts/airm-dispatcher/.helmignore new file mode 100644 index 00000000..64c2af83 --- /dev/null +++ b/sources/airm/0.3.3/charts/airm-dispatcher/.helmignore @@ -0,0 +1,27 @@ +# Copyright © Advanced Micro Devices, Inc., or its affiliates. +# +# SPDX-License-Identifier: MIT + +# Patterns to ignore when building packages. +# This supports shell glob matching, relative path matching, and +# negation (prefixed with !). Only one pattern per line. 
+.DS_Store +# Common VCS dirs +.git/ +.gitignore +.bzr/ +.bzrignore +.hg/ +.hgignore +.svn/ +# Common backup files +*.swp +*.bak +*.tmp +*.orig +*~ +# Various IDEs +.project +.idea/ +*.tmproj +.vscode/ diff --git a/sources/airm/0.3.3/charts/airm-dispatcher/Chart.yaml b/sources/airm/0.3.3/charts/airm-dispatcher/Chart.yaml new file mode 100644 index 00000000..4fbdee97 --- /dev/null +++ b/sources/airm/0.3.3/charts/airm-dispatcher/Chart.yaml @@ -0,0 +1,29 @@ +# Copyright © Advanced Micro Devices, Inc., or its affiliates. +# +# SPDX-License-Identifier: MIT + +--- +apiVersion: v2 +name: airm-dispatcher +description: A Helm chart for AIRM Dispatcher + +# A chart can be either an 'application' or a 'library' chart. +# +# Application charts are a collection of templates that can be packaged into versioned archives +# to be deployed. +# +# Library charts provide useful utilities or functions for the chart developer. They're included as +# a dependency of application charts to inject those utilities and functions into the rendering +# pipeline. Library charts do not define any templates and therefore cannot be deployed. +type: application + +# This is the chart version. This version number should be incremented each time you make changes +# to the chart and its templates, including the app version. +# Versions are expected to follow Semantic Versioning (https://semver.org/) +version: 0.3.3 + +# This is the version number of the application being deployed. This version number should be +# incremented each time you make changes to the application. Versions are not expected to +# follow Semantic Versioning. They should reflect the version the application is using. +# It is recommended to use it with quotes. 
+appVersion: "1.0.0" diff --git a/sources/airm/0.3.3/charts/airm-dispatcher/README.md b/sources/airm/0.3.3/charts/airm-dispatcher/README.md new file mode 100644 index 00000000..0b85c706 --- /dev/null +++ b/sources/airm/0.3.3/charts/airm-dispatcher/README.md @@ -0,0 +1,54 @@ + + +# HELM CHARTS + +Simple instructions to deploy AIRM dispatcher application using helm chart. +The dispatcher can be run on a compute cluster, which may or may not be the same as the one hosting the AIRM API and UI. + +### 1. Requirements + +The following external components must be available in the Kubernetes cluster before the helm chart can be installed. + +- Accessible RabbitMQ cluster (must be the same cluster used by AIRM API). +- Kaiwo installed on the cluster (along with all its dependencies) + +### 2. Install + +``` +cd helm/airm/charts + +# 1. Create output template just to validate (the public domain could be app-dev.silogen.ai, staging.silogen.ai, etc.) +helm template airm-dispatcher ./airm-dispatcher -n airm --create-namespace > airm-dispatcher-helm-generated.yaml + +# 2. Run chart install +helm install airm-dispatcher ./airm-dispatcher -n airm --create-namespace + +# 3. Delete chart if needed +helm delete airm-dispatcher -n airm + +# 4. Upgrade when bumping versions +helm upgrade -n airm --set airm-dispatcher ./airm-dispatcher +``` + +--- + +### 3. 
Helm Settings + +| Field Path | Description | Type | Example / Default | +|---------------------------------------------|--------------------------------------------------------------|---------|-----------------------------------| +| airm.dispatcher.image.repository | Dispatcher image repository | string | `amdenterpriseai/airm-dispatcher` | +| airm.dispatcher.image.tag | Dispatcher image tag | string | `v2025.08-rc.21` | +| airm.dispatcher.image.pullPolicy | Dispatcher image pull policy | string | `IfNotPresent` | +| airm.dispatcher.servicePort | Dispatcher service port | int | `80` | +| airm.utilities.netcat.image.repository | Netcat image repository | string | `busybox` | +| airm.utilities.netcat.image.tag | Netcat image tag | string | `1.37.0` | +| airm.utilities.netcat.image.pullPolicy | Netcat image pull policy | string | `IfNotPresent` | +| airm.utilities.curl.image.repository | Curl image repository | string | `curlimages/curl` | +| airm.utilities.curl.image.tag | Curl image tag | string | `8.16.0` | +| airm.utilities.curl.image.pullPolicy | Curl image pull policy | string | `IfNotPresent` | +| airm.additionalClusterRoles.platformAdmin | Additional cluster roles for the Platform Administrator role | array | `[]` | +| airm.additionalClusterRoles.projectMember | Additional cluster roles for the Project Member role | array | `[]` | diff --git a/sources/airm/0.3.3/charts/airm-dispatcher/templates/airm-cluster-policies.yaml b/sources/airm/0.3.3/charts/airm-dispatcher/templates/airm-cluster-policies.yaml new file mode 100644 index 00000000..caf92aa6 --- /dev/null +++ b/sources/airm/0.3.3/charts/airm-dispatcher/templates/airm-cluster-policies.yaml @@ -0,0 +1,352 @@ +# Copyright © Advanced Micro Devices, Inc., or its affiliates. 
+# +# SPDX-License-Identifier: MIT + +apiVersion: kyverno.io/v1 +kind: ClusterPolicy +metadata: + name: airm-project-namespace-rolebinding +spec: + background: false + rules: + - name: generate-project-namespace-rolebinding + match: + any: + - resources: + kinds: + - Namespace + operations: + - CREATE + preconditions: + any: + - key: '{{`{{request.object.metadata.labels."airm.silogen.ai/project-id" || '''' }}`}}' + operator: NotEquals + value: "" + skipBackgroundRequests: true + generate: + apiVersion: rbac.authorization.k8s.io/v1 + kind: RoleBinding + name: "project-member-role-binding" + namespace: "{{`{{request.object.metadata.name}}`}}" + synchronize: true + data: + roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: airm-project-member + subjects: + - kind: Group + # Add this in for backwards compatibility + name: "oidc{{`{{request.object.metadata.name}}`}}" + apiGroup: rbac.authorization.k8s.io + - kind: Group + # The kubernetes cluster apply an OIDC prefix of 'oidc:', so we adjust the groups to expect that + name: "oidc:{{`{{request.object.metadata.name}}`}}" + apiGroup: rbac.authorization.k8s.io +--- +# Kyverno policy that enforces that workloads submitted to a namespace managed by AIRMan have the +# correct kueue lables and field set, so that they are bound by the quota of the namespace +apiVersion: kyverno.io/v1 +kind: ClusterPolicy +metadata: + name: airm-quota-enforcement-for-workloads +spec: + background: false + rules: + - name: set-queue-name-from-namespace-default + match: + resources: + kinds: + - Deployment + - StatefulSet + - Pod + namespaceSelector: + matchExpressions: + - key: airm.silogen.ai/project-id + operator: Exists + preconditions: + all: + - key: '{{`{{request.object.metadata.labels."kueue.x-k8s.io/queue-name" || '''' }}`}}' + operator: NotEquals + value: "{{`{{request.namespace }}`}}" + skipBackgroundRequests: true + mutate: + patchStrategicMerge: + metadata: + labels: + kueue.x-k8s.io/queue-name: 
"{{`{{request.namespace }}`}}" + + - name: set-queue-name-from-namespace-jobs + match: + resources: + kinds: + - Job # https://kueue.sigs.k8s.io/docs/tasks/run/jobs/ + namespaceSelector: + matchExpressions: + - key: airm.silogen.ai/project-id + operator: Exists + preconditions: + all: + - key: '{{`{{request.object.metadata.labels."kueue.x-k8s.io/queue-name" || '''' }}`}}' + operator: NotEquals + value: "{{`{{request.namespace }}`}}" + skipBackgroundRequests: true + mutate: + patchStrategicMerge: + metadata: + labels: + kueue.x-k8s.io/queue-name: "{{`{{request.namespace }}`}}" + spec: + suspend: true + + - name: set-queue-name-from-namespace-cronjobs + match: + resources: + kinds: + - CronJob # https://kueue.sigs.k8s.io/docs/tasks/run/run_cronjobs/ + namespaceSelector: + matchExpressions: + - key: airm.silogen.ai/project-id + operator: Exists + preconditions: + all: + - key: '{{`{{request.object.spec.jobTemplate.metadata.labels."kueue.x-k8s.io/queue-name" || '''' }}`}}' + operator: NotEquals + value: "{{`{{request.namespace }}`}}" + skipBackgroundRequests: true + mutate: + patchStrategicMerge: + spec: + jobTemplate: + metadata: + labels: + kueue.x-k8s.io/queue-name: "{{`{{request.namespace }}`}}" + spec: + suspend: true + + - name: set-queue-name-from-namespace-kaiwo + match: + resources: + kinds: + - KaiwoJob + - KaiwoService + - AIMService + namespaceSelector: + matchExpressions: + - key: airm.silogen.ai/project-id + operator: Exists + preconditions: + all: + - key: "{{`{{request.object.spec.clusterQueue || '' }}`}}" + operator: NotEquals + value: "{{`{{request.namespace }}`}}" + skipBackgroundRequests: true + mutate: + patchStrategicMerge: + spec: + clusterQueue: "{{`{{request.namespace }}`}}" +--- +apiVersion: kyverno.io/v1 +kind: ClusterPolicy +metadata: + name: airm-workload-tracking-policy +spec: + background: false + rules: + # For all supported types, if airm.silogen.ai/workload-id and airm.silogen.ai/component-id are not set, we assume + # it has been 
submitted from outside of AIRMan. In that case, we set airm.silogen.ai/auto-discovered: true, so it can + # be tracked upstream. We also set airm.silogen.ai/discovered-component-type so that we can identify the type of component + # that was originally tracked, and ignore children created by it. See remove-auto-discovered-annotations-inherited-from-parent + # We also try to capture the user who submitted the workload, and consume it in the application + + # Please note that ReplicaSet is not supported because by default it is filtered away by Kyverno by default: https://github.com/kyverno/kyverno/blob/main/charts/kyverno/values.yaml#L270 + - name: add-discovery-annotations-for-supported-types + match: + resources: + kinds: + - Job + - Deployment + - StatefulSet + - DaemonSet + - CronJob + - KaiwoJob + - KaiwoService + - AIMService + - Pod + namespaceSelector: + matchExpressions: + - key: airm.silogen.ai/project-id + operator: Exists + preconditions: + all: + - key: '{{`{{request.object.metadata.labels."airm.silogen.ai/workload-id" || '''' }}`}}' + operator: Equals + value: "" + - key: '{{`{{request.object.metadata.labels."airm.silogen.ai/component-id" || '''' }}`}}' + operator: Equals + value: "" + - key: '{{`{{request.object.metadata.annotations."airm.silogen.ai/auto-discovered" || '''' }}`}}' + operator: Equals + value: "" + skipBackgroundRequests: true + mutate: + patchStrategicMerge: + metadata: + annotations: + airm.silogen.ai/submitter: "{{`{{request.userInfo.username }}`}}" + airm.silogen.ai/auto-discovered: "true" + airm.silogen.ai/discovered-component-type: "{{`{{request.object.kind }}`}}" + # For all supported types, if airm.silogen.ai/auto-discovered is set and the airm.silogen.ai/discovered-component-type + # doesnt match the kind of the current component, we assume this type has been created by a parent which is also + # supported by AIRMan and we dont need to track this type upstream, so we unset the airm.silogen.ai/auto-discovered annotation. 
+ # This is mostly to account for KaiwoJob, KaiwoService, AIMService which propagate annotations to pods. + - name: remove-auto-discovered-annotations-inherited-from-parent + match: + resources: + kinds: + - Job + - Deployment + - StatefulSet + - DaemonSet + - CronJob + - KaiwoJob + - KaiwoService + - AIMService + - Pod + namespaceSelector: + matchExpressions: + - key: airm.silogen.ai/project-id + operator: Exists + preconditions: + all: + - key: '{{`{{request.object.metadata.annotations."airm.silogen.ai/auto-discovered" || '''' }}`}}' + operator: Equals + value: "true" + - key: '{{`{{request.object.metadata.annotations."airm.silogen.ai/discovered-component-type" || '''' }}`}}' + operator: NotEquals + value: "{{`{{request.object.kind }}`}}" + skipBackgroundRequests: true + mutate: + patchStrategicMerge: + metadata: + annotations: + airm.silogen.ai/auto-discovered: "false" + # For all supported types, if airm.silogen.ai/project-id does not match that of the namespace label, overwrite it + # with the expected value, to avoid metrics getting mixed up between projects. 
+ - name: set-project-id-from-namespace-label + match: + resources: + kinds: + - Job + - Deployment + - StatefulSet + - DaemonSet + - CronJob + - KaiwoJob + - KaiwoService + - AIMService + - Pod + namespaceSelector: + matchExpressions: + - key: airm.silogen.ai/project-id + operator: Exists + context: + - name: ns_labels + apiCall: + urlPath: "/api/v1/namespaces/{{`{{request.namespace }}`}}" + method: GET + jmesPath: "metadata.labels" + preconditions: + all: + - key: '{{`{{request.object.metadata.labels."airm.silogen.ai/project-id" || '''' }}`}}' + operator: NotEquals + value: '{{`{{ns_labels."airm.silogen.ai/project-id" }}`}}' + skipBackgroundRequests: true + mutate: + patchStrategicMerge: + metadata: + labels: + airm.silogen.ai/project-id: '{{`{{ns_labels."airm.silogen.ai/project-id" }}`}}' + # Add the airm.silogen.ai/workload-id and airm.silogen.ai/component-id labels to types that expect them at + # metadata.labels. The expectation is that these are propagated to the subsequent pods that are created. + + # If the resource is spawned off by a CRD, it will not know about the labels on the previous version of the object, + # so we also check request.oldObject for the labels to try and preserve them if they were already set. 
+ - name: add-workload-and-component-id-default + match: + resources: + kinds: + - Pod + - KaiwoJob + - KaiwoService + - AIMService + namespaceSelector: + matchExpressions: + - key: airm.silogen.ai/project-id + operator: Exists + skipBackgroundRequests: true + mutate: + patchStrategicMerge: + metadata: + labels: + airm.silogen.ai/workload-id: '{{`{{request.object.metadata.labels."airm.silogen.ai/workload-id" || request.oldObject.metadata.labels."airm.silogen.ai/workload-id" || request.uid }}`}}' + airm.silogen.ai/component-id: '{{`{{request.object.metadata.labels."airm.silogen.ai/component-id" || request.oldObject.metadata.labels."airm.silogen.ai/component-id" || request.uid }}`}}' + # Add the airm.silogen.ai/workload-id and airm.silogen.ai/component-id labels to objects with templates and + # also add it to spec.template.metadata.labels to ensure that the pods created by them contain the labels as well + - name: add-workload-and-component-id-to-objects-with-template + match: + resources: + kinds: + - Job + - Deployment + - StatefulSet + - DaemonSet + namespaceSelector: + matchExpressions: + - key: airm.silogen.ai/project-id + operator: Exists + skipBackgroundRequests: true + mutate: + patchStrategicMerge: + metadata: + labels: + airm.silogen.ai/workload-id: '{{`{{request.object.metadata.labels."airm.silogen.ai/workload-id" || request.oldObject.metadata.labels."airm.silogen.ai/workload-id" || request.uid }}`}}' + airm.silogen.ai/component-id: '{{`{{request.object.metadata.labels."airm.silogen.ai/component-id" || request.oldObject.metadata.labels."airm.silogen.ai/component-id" || request.uid }}`}}' + spec: + template: + metadata: + labels: + airm.silogen.ai/workload-id: '{{`{{request.object.metadata.labels."airm.silogen.ai/workload-id" || request.oldObject.metadata.labels."airm.silogen.ai/workload-id" || request.uid }}`}}' + airm.silogen.ai/component-id: '{{`{{request.object.metadata.labels."airm.silogen.ai/component-id" || 
request.oldObject.metadata.labels."airm.silogen.ai/component-id" || request.uid }}`}}' + + # Add the airm.silogen.ai/workload-id and airm.silogen.ai/component-id labels to cronjob and + # also add it to spec.jobTemplate.metadata.labels to ensure that the pods created by the cronjob + # contain it as well + - name: add-workload-and-component-id-cronjobs + match: + resources: + kinds: + - CronJob + namespaceSelector: + matchExpressions: + - key: airm.silogen.ai/project-id + operator: Exists + skipBackgroundRequests: true + mutate: + patchStrategicMerge: + metadata: + labels: + airm.silogen.ai/workload-id: '{{`{{request.object.metadata.labels."airm.silogen.ai/workload-id" || request.oldObject.metadata.labels."airm.silogen.ai/workload-id" || request.uid }}`}}' + airm.silogen.ai/component-id: '{{`{{request.object.metadata.labels."airm.silogen.ai/component-id" || request.oldObject.metadata.labels."airm.silogen.ai/component-id" || request.uid }}`}}' + spec: + jobTemplate: + metadata: + labels: + airm.silogen.ai/workload-id: '{{`{{request.object.metadata.labels."airm.silogen.ai/workload-id" || request.oldObject.metadata.labels."airm.silogen.ai/workload-id" || request.uid }}`}}' + airm.silogen.ai/component-id: '{{`{{request.object.metadata.labels."airm.silogen.ai/component-id" || request.oldObject.metadata.labels."airm.silogen.ai/component-id" || request.uid }}`}}' + spec: + template: + metadata: + labels: + airm.silogen.ai/workload-id: '{{`{{request.object.metadata.labels."airm.silogen.ai/workload-id" || request.oldObject.metadata.labels."airm.silogen.ai/workload-id" || request.uid }}`}}' + airm.silogen.ai/component-id: '{{`{{request.object.metadata.labels."airm.silogen.ai/component-id" || request.oldObject.metadata.labels."airm.silogen.ai/component-id" || request.uid }}`}}' diff --git a/sources/airm/0.3.3/charts/airm-dispatcher/templates/airm-cluster-roles.yaml b/sources/airm/0.3.3/charts/airm-dispatcher/templates/airm-cluster-roles.yaml new file mode 100644 index 
00000000..2461e894 --- /dev/null +++ b/sources/airm/0.3.3/charts/airm-dispatcher/templates/airm-cluster-roles.yaml @@ -0,0 +1,164 @@ +# Copyright © Advanced Micro Devices, Inc., or its affiliates. +# +# SPDX-License-Identifier: MIT + +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: airm-platform-admin +rules: + - apiGroups: [""] + resources: + [ + "pods", + "events", + "services", + "configmaps", + "persistentvolumes", + "persistentvolumeclaims", + "namespaces", + "serviceaccounts", + ] + verbs: ["get", "list", "watch", "delete"] + - apiGroups: [""] + resources: ["pods/log", "pods/exec", "pods/attach", "pods/portforward"] + verbs: ["*"] + - apiGroups: ["apps"] + resources: ["deployments", "deployments/scale", "replicasets", "statefulsets", "daemonsets"] + verbs: ["get", "list", "watch", "delete"] + - apiGroups: ["batch"] + resources: ["jobs", "cronjobs"] + verbs: ["get", "list", "watch", "delete"] + - apiGroups: ["networking.k8s.io"] + resources: ["ingresses", "networkpolicies"] + verbs: ["get", "list", "watch", "delete"] + - apiGroups: ["kaiwo.silogen.ai"] + resources: ["kaiwojobs", "kaiwoservices"] + verbs: ["get", "list", "watch", "delete"] + - apiGroups: ["aim.silogen.ai"] + resources: ["aimservices", "aimmodels", "aimmodelcaches", "aimservicetemplates", "aimruntimeconfigs", "aimtemplatecaches"] + verbs: ["get", "list", "watch", "delete"] + - apiGroups: ["gateway.networking.k8s.io"] + resources: ["httproutes", "gateways"] + verbs: ["get", "list", "watch", "delete"] + - apiGroups: ["config.kaiwo.silogen.ai"] + resources: ["kaiwoconfigs"] + verbs: ["*"] + - apiGroups: ["kaiwo.silogen.ai"] + resources: ["kaiwoqueueconfigs"] + verbs: ["*"] + - apiGroups: ["aim.silogen.ai"] + resources: [ "aimclustermodels", "aimclusterservicetemplates", "aimclusterruntimeconfigs", "aimclustermodelsources" ] + verbs: ["*"] + - apiGroups: [""] + resources: ["secrets"] + verbs: ["get", "list", "watch", "create", "delete"] + - apiGroups: [""] + resources: 
["nodes"] + verbs: ["get", "list", "watch"] + - apiGroups: ["rbac.authorization.k8s.io"] + resources: ["roles", "clusterroles", "rolebindings", "clusterrolebindings"] + verbs: ["get", "list", "watch"] + - apiGroups: ["external-secrets.io"] + resources: ["externalsecrets"] + verbs: ["get", "list", "watch", "create", "delete"] + - apiGroups: ["external-secrets.io"] + resources: ["clustersecretstores"] + verbs: ["get", "list", "watch"] + - apiGroups: ["storage.k8s.io"] + resources: ["storageclasses"] + verbs: ["get", "list", "watch"] + - apiGroups: ["kueue.x-k8s.io"] + resources: ["clusterqueues", "resourceflavors", "localqueues", "workloadpriorityclasses"] + verbs: ["get", "list", "watch"] + - apiGroups: ["apiextensions.k8s.io"] + resources: ["customresourcedefinitions"] + verbs: ["get", "list"] +{{- range .Values.airm.additionalClusterRoles.platformAdmin }} + - apiGroups: + {{ .apiGroups | toYaml | nindent 6 }} + resources: + {{ .resources | toYaml | nindent 6 }} + verbs: + {{ .verbs | toYaml | nindent 6 }} +{{- end }} +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: airm-platform-admin-role-binding +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: airm-platform-admin +subjects: + - kind: Group + # Add this in for backwards compatibility + name: "oidcairm-role:Platform Administrator" + apiGroup: rbac.authorization.k8s.io + - kind: Group + # The kubernetes cluster apply an OIDC prefix of 'oidc':, so we adjust the group to expect that + name: "oidc:airm-role:Platform Administrator" + apiGroup: rbac.authorization.k8s.io +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: airm-project-member +rules: + - apiGroups: [""] + resources: + [ + "pods", + "pods/log", + "pods/exec", + "pods/attach", + "pods/portforward", + "events", + "services", + "configmaps", + "persistentvolumes", + "persistentvolumeclaims", + ] + verbs: ["*"] + - apiGroups: ["apps"] + resources: 
["deployments", "replicasets", "statefulsets", "daemonsets"] + verbs: ["*"] + - apiGroups: ["batch"] + resources: ["jobs", "cronjobs"] + verbs: ["*"] + - apiGroups: ["networking.k8s.io"] + resources: ["ingresses", "networkpolicies"] + verbs: ["*"] + - apiGroups: ["kaiwo.silogen.ai"] + resources: ["kaiwojobs", "kaiwoservices"] + verbs: ["*"] + - apiGroups: ["aim.silogen.ai"] + resources: ["aimservices", "aimmodels", "aimmodelcaches", "aimservicetemplates", "aimruntimeconfigs", "aimtemplatecaches"] + verbs: ["*"] + - apiGroups: ["gateway.networking.k8s.io"] + resources: ["httproutes", "gateways"] + verbs: ["*"] + - apiGroups: [""] + resources: ["secrets"] + verbs: ["list", "watch", "create"] + - apiGroups: [""] + resources: ["nodes"] + verbs: ["get", "list", "watch"] + - apiGroups: ["rbac.authorization.k8s.io"] + resources: ["roles", "clusterroles", "rolebindings", "clusterrolebindings"] + verbs: ["get", "list", "watch"] + - apiGroups: ["external-secrets.io"] + resources: ["clustersecretstores", "externalsecrets"] + verbs: ["get", "list", "watch"] + - apiGroups: ["storage.k8s.io"] + resources: ["storageclasses"] + verbs: ["get", "list", "watch"] +{{- range .Values.airm.additionalClusterRoles.projectMember }} + - apiGroups: + {{ .apiGroups | toYaml | nindent 6 }} + resources: + {{ .resources | toYaml | nindent 6 }} + verbs: + {{ .verbs | toYaml | nindent 6 }} +{{- end }} diff --git a/sources/airm/0.3.3/charts/airm-dispatcher/templates/airm-dispatcher-app.yaml b/sources/airm/0.3.3/charts/airm-dispatcher/templates/airm-dispatcher-app.yaml new file mode 100644 index 00000000..8a3489ef --- /dev/null +++ b/sources/airm/0.3.3/charts/airm-dispatcher/templates/airm-dispatcher-app.yaml @@ -0,0 +1,343 @@ +# Copyright © Advanced Micro Devices, Inc., or its affiliates. 
+# +# SPDX-License-Identifier: MIT + +--- +apiVersion: batch/v1 +kind: CronJob +metadata: + name: "{{ .Release.Name }}-cluster-nodes-dispatcher" + namespace: "{{ .Release.Namespace }}" +spec: + concurrencyPolicy: Replace + jobTemplate: + spec: + template: + spec: + containers: + - command: + - curl + - -X + - POST + - http://{{ .Release.Name }}-dispatcher:{{ .Values.airm.dispatcher.servicePort }}/v1/clusters/nodes + image: "{{ .Values.airm.utilities.curl.image.repository }}:{{ .Values.airm.utilities.curl.image.tag }}" + imagePullPolicy: "{{ .Values.airm.utilities.curl.image.pullPolicy }}" + name: airm-cluster-nodes-cron + resources: + limits: + memory: 100Mi + requests: + cpu: 50m + memory: 100Mi + restartPolicy: OnFailure + schedule: 0 * * * * +--- +apiVersion: batch/v1 +kind: CronJob +metadata: + name: "{{ .Release.Name }}-aim-models-dispatcher" + namespace: "{{ .Release.Namespace }}" +spec: + concurrencyPolicy: Forbid + schedule: "*/5 * * * *" + suspend: false + jobTemplate: + spec: + template: + metadata: + annotations: + sidecar.istio.io/inject: "false" + spec: + restartPolicy: OnFailure + initContainers: + - name: check-dispatcher-is-ready + command: + [ + "/bin/sh", + "-c", + "while ! 
nc -z \"$ENDPOINT_URL_TO_CHECK\" \"$ENDPOINT_PORT_TO_CHECK\"; do echo \"Waiting for Airm Dispatcher at ${ENDPOINT_URL_TO_CHECK}:${ENDPOINT_PORT_TO_CHECK}...\"; sleep 3; done; echo \"Airm Dispatcher is accepting connections at ${ENDPOINT_URL_TO_CHECK}:${ENDPOINT_PORT_TO_CHECK}.\"; sleep 3; exit 0" + ] + image: "{{ .Values.airm.utilities.netcat.image.repository }}:{{ .Values.airm.utilities.netcat.image.tag }}" + imagePullPolicy: "{{ .Values.airm.utilities.netcat.image.pullPolicy }}" + env: + - name: ENDPOINT_URL_TO_CHECK + value: "{{ .Release.Name }}-dispatcher.{{ .Release.Namespace }}.svc.cluster.local" + - name: ENDPOINT_PORT_TO_CHECK + value: "{{ .Values.airm.dispatcher.servicePort }}" + securityContext: + allowPrivilegeEscalation: false + runAsNonRoot: false + runAsUser: 0 + capabilities: + drop: + - ALL + add: + - SETUID + - SETGID + - CHOWN + - DAC_OVERRIDE + - FOWNER + - FSETID + seccompProfile: + type: RuntimeDefault + containers: + - name: airm-aim-models-cron + command: + - curl + - -X + - POST + - http://{{ .Release.Name }}-dispatcher:{{ .Values.airm.dispatcher.servicePort }}/v1/aims/cluster-models + image: "{{ .Values.airm.utilities.curl.image.repository }}:{{ .Values.airm.utilities.curl.image.tag }}" + imagePullPolicy: "{{ .Values.airm.utilities.curl.image.pullPolicy }}" + resources: + requests: + memory: "100Mi" + cpu: "50m" + limits: + memory: "100Mi" +--- +apiVersion: batch/v1 +kind: CronJob +metadata: + name: "{{ .Release.Name }}-heartbeat-dispatcher" + namespace: "{{ .Release.Namespace }}" +spec: + concurrencyPolicy: Replace + schedule: "*/1 * * * *" + suspend: false + jobTemplate: + spec: + template: + metadata: + annotations: + sidecar.istio.io/inject: "false" + spec: + restartPolicy: OnFailure + initContainers: + - name: check-dispatcher-is-ready + command: + [ + "/bin/sh", + "-c", + "while ! 
nc -z \"$ENDPOINT_URL_TO_CHECK\" \"$ENDPOINT_PORT_TO_CHECK\"; do echo \"Waiting for Airm Dispatcher at ${ENDPOINT_URL_TO_CHECK}:${ENDPOINT_PORT_TO_CHECK}...\"; sleep 3; done; echo \"Airm Dispatcher is accepting connections at ${ENDPOINT_URL_TO_CHECK}:${ENDPOINT_PORT_TO_CHECK}.\"; sleep 3; exit 0" + ] + image: "{{ .Values.airm.utilities.netcat.image.repository }}:{{ .Values.airm.utilities.netcat.image.tag }}" + imagePullPolicy: "{{ .Values.airm.utilities.netcat.image.pullPolicy }}" + env: + - name: ENDPOINT_URL_TO_CHECK + value: "{{ .Release.Name }}-dispatcher.{{ .Release.Namespace }}.svc.cluster.local" + - name: ENDPOINT_PORT_TO_CHECK + value: "{{ .Values.airm.dispatcher.servicePort }}" + securityContext: + allowPrivilegeEscalation: false + runAsNonRoot: false + runAsUser: 0 + capabilities: + drop: + - ALL + add: + - SETUID + - SETGID + - CHOWN + - DAC_OVERRIDE + - FOWNER + - FSETID + seccompProfile: + type: RuntimeDefault + containers: + - name: airm-heartbeat-cron + command: + - curl + - -X + - POST + - http://{{ .Release.Name }}-dispatcher:{{ .Values.airm.dispatcher.servicePort }}/v1/heartbeats + image: "{{ .Values.airm.utilities.curl.image.repository }}:{{ .Values.airm.utilities.curl.image.tag }}" + imagePullPolicy: "{{ .Values.airm.utilities.curl.image.pullPolicy }}" + resources: + requests: + memory: "100Mi" + cpu: "50m" + limits: + memory: "100Mi" +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: "{{ .Release.Name }}-dispatcher" + namespace: "{{ .Release.Namespace }}" +spec: + replicas: 1 + selector: + matchLabels: + app: "{{ .Release.Name }}-dispatcher" + template: + metadata: + labels: + app: "{{ .Release.Name }}-dispatcher" + spec: + serviceAccountName: "{{ .Release.Name }}-dispatcher-sa" + {{- with .Values.airm.dispatcher.imagePullSecrets }} + imagePullSecrets: + {{- toYaml . | nindent 8 }} + {{- end }} + initContainers: + - name: check-rabbitmq-is-ready + command: + [ + "/bin/sh", + "-c", + "while ! 
nc -z \"$ENDPOINT_URL_TO_CHECK\" \"$ENDPOINT_PORT_TO_CHECK\"; do echo \"Waiting for Airm RabbitMQ at ${ENDPOINT_URL_TO_CHECK}:${ENDPOINT_PORT_TO_CHECK}...\"; sleep 3; done; echo \"Airm RabbitMQ is accepting connections at ${ENDPOINT_URL_TO_CHECK}:${ENDPOINT_PORT_TO_CHECK}.\"; sleep 3; exit 0" + ] + image: "{{ .Values.airm.utilities.netcat.image.repository }}:{{ .Values.airm.utilities.netcat.image.tag }}" + imagePullPolicy: "{{ .Values.airm.utilities.netcat.image.pullPolicy }}" + env: + - name: ENDPOINT_URL_TO_CHECK + value: "{{ .Release.Name }}-rabbitmq.{{ .Release.Namespace }}.svc.cluster.local" + - name: ENDPOINT_PORT_TO_CHECK + value: "{{ .Values.airm.dispatcher.env.rabbitmqPort }}" + securityContext: + allowPrivilegeEscalation: false + runAsNonRoot: false + runAsUser: 0 + capabilities: + drop: + - ALL + add: + - SETUID + - SETGID + - CHOWN + - DAC_OVERRIDE + - FOWNER + - FSETID + seccompProfile: + type: RuntimeDefault + containers: + - name: airm-dispatcher + image: "{{ .Values.airm.dispatcher.image.repository }}:{{ .Values.airm.dispatcher.image.tag }}" + imagePullPolicy: "{{ .Values.airm.dispatcher.image.pullPolicy }}" + ports: + - containerPort: 8080 + env: + - name: KUBE_CLUSTER_NAME + value: demo-cluster + - name: ORG_NAME + value: demo + - name: RABBITMQ_HOST + value: "{{ .Release.Name }}-rabbitmq.{{ .Release.Namespace }}.svc.cluster.local" + - name: RABBITMQ_PORT + value: "{{ .Values.airm.dispatcher.env.rabbitmqPort }}" + - name: RABBITMQ_AIRM_COMMON_VHOST + value: "vh_airm_common" + - name: RABBITMQ_AIRM_COMMON_QUEUE + value: "airm_common" + - name: RABBITMQ_USER + valueFrom: + secretKeyRef: + name: "{{ .Release.Name }}-rabbitmq-common-vhost-user" + key: username + - name: RABBITMQ_PASSWORD + valueFrom: + secretKeyRef: + name: "{{ .Release.Name }}-rabbitmq-common-vhost-user" + key: password + livenessProbe: + httpGet: + path: /v1/health + port: 8080 + initialDelaySeconds: 10 + periodSeconds: 5 + timeoutSeconds: 2 + failureThreshold: 3 + successThreshold: 
1 + readinessProbe: + httpGet: + path: /v1/health + port: 8080 + initialDelaySeconds: 10 + periodSeconds: 5 + timeoutSeconds: 2 + failureThreshold: 3 + successThreshold: 1 + resources: + requests: + memory: "1Gi" + cpu: "500m" + limits: + memory: "1Gi" + securityContext: + runAsUser: 0 +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: "{{ .Release.Name }}-dispatcher-cluster-access-binding" +subjects: + - kind: ServiceAccount + name: "{{ .Release.Name }}-dispatcher-sa" + namespace: "{{ .Release.Namespace }}" +roleRef: + kind: ClusterRole + name: "{{ .Release.Name }}-dispatcher-cluster-access-role" + apiGroup: rbac.authorization.k8s.io +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: "{{ .Release.Name }}-dispatcher-cluster-access-role" +rules: + - apiGroups: [""] + resources: ["services", "namespaces", "configmaps", "pods"] + verbs: ["get", "list", "watch", "create", "delete", "update", "patch"] + - apiGroups: [""] + resources: ["secrets"] + verbs: ["get", "list", "watch", "create", "delete", "update", "patch"] + - apiGroups: [""] + resources: ["nodes"] + verbs: ["get", "list", "watch"] + - apiGroups: ["apps"] + resources: ["deployments", "statefulsets", "daemonsets"] + verbs: ["get", "list", "watch", "create", "delete", "update", "patch"] + - apiGroups: ["batch"] + resources: ["jobs", "cronjobs"] + verbs: ["get", "list", "watch", "create", "delete", "update", "patch"] + - apiGroups: ["kaiwo.silogen.ai"] + resources: ["kaiwojobs", "kaiwoservices", "kaiwoqueueconfigs"] + verbs: ["get", "list", "watch", "create", "delete", "update", "patch"] + - apiGroups: ["gateway.networking.k8s.io"] + resources: ["httproutes"] + verbs: ["get", "list", "watch", "create", "delete", "update", "patch"] + - apiGroups: ["networking.k8s.io"] + resources: ["ingresses"] + verbs: ["get", "list", "watch", "create", "delete", "update", "patch"] + - apiGroups: ["external-secrets.io"] + resources: ["externalsecrets"] + 
verbs: ["get", "list", "watch", "create", "delete", "update", "patch"] + - apiGroups: ["aim.silogen.ai"] + resources: ["aimservices", "aimclustermodels"] + verbs: ["get", "list", "watch", "create", "delete", "update", "patch"] + - apiGroups: ["apiextensions.k8s.io"] + resources: ["customresourcedefinitions"] + verbs: ["get", "list"] +--- +apiVersion: v1 +kind: ServiceAccount +metadata: + name: "{{ .Release.Name }}-dispatcher-sa" + namespace: "{{ .Release.Namespace }}" + +--- +apiVersion: v1 +kind: Service +metadata: + name: "{{ .Release.Name }}-dispatcher" + namespace: "{{ .Release.Namespace }}" + labels: + app: "{{ .Release.Name }}-dispatcher" +spec: + ports: + - name: web + port: {{ .Values.airm.dispatcher.servicePort }} + targetPort: 8080 + type: ClusterIP + selector: + app: "{{ .Release.Name }}-dispatcher" diff --git a/sources/airm/0.3.3/charts/airm-dispatcher/templates/kyverno-cluster-role.yaml b/sources/airm/0.3.3/charts/airm-dispatcher/templates/kyverno-cluster-role.yaml new file mode 100644 index 00000000..e930efd0 --- /dev/null +++ b/sources/airm/0.3.3/charts/airm-dispatcher/templates/kyverno-cluster-role.yaml @@ -0,0 +1,35 @@ +# Copyright © Advanced Micro Devices, Inc., or its affiliates. +# +# SPDX-License-Identifier: MIT + +# These are additional cluster roles needed by kyverno background controller to be able to +# create rolebindings in namespaces. 
+apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: kyverno:airm-policy-roles + labels: + rbac.kyverno.io/aggregate-to-background-controller: "true" +rules: + - apiGroups: ["rbac.authorization.k8s.io"] + resources: ["clusterroles", "rolebindings"] + verbs: ["get", "list", "watch"] + - apiGroups: ["rbac.authorization.k8s.io"] # allow kyverno to bind clusterroles via rolebindings + resources: ["clusterroles"] + verbs: ["bind"] +--- +# These are additional cluster roles needed by kyverno reports controller to be able to +# manage custom resources for reporting +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: kyverno:airm-reports-policy-roles + labels: + rbac.kyverno.io/aggregate-to-reports-controller: "true" +rules: + - apiGroups: ["kaiwo.silogen.ai"] + resources: ["kaiwojobs", "kaiwoservices"] + verbs: ["get", "list", "watch"] + - apiGroups: ["aim.silogen.ai"] + resources: ["aimservices"] + verbs: ["get", "list", "watch"] diff --git a/sources/airm/0.3.3/charts/airm-dispatcher/values.yaml b/sources/airm/0.3.3/charts/airm-dispatcher/values.yaml new file mode 100644 index 00000000..670cf399 --- /dev/null +++ b/sources/airm/0.3.3/charts/airm-dispatcher/values.yaml @@ -0,0 +1,27 @@ +# Copyright © Advanced Micro Devices, Inc., or its affiliates. 
+# +# SPDX-License-Identifier: MIT + +airm: + dispatcher: + image: + repository: amdenterpriseai/airm-dispatcher + tag: 0.3.3 + pullPolicy: IfNotPresent + servicePort: 80 + env: + rabbitmqPort: 5672 + utilities: + netcat: + image: + repository: busybox + tag: 1.37.0 + pullPolicy: IfNotPresent + curl: + image: + repository: curlimages/curl + tag: 8.16.0 + pullPolicy: IfNotPresent + additionalClusterRoles: + platformAdmin: [] + projectMember: [] diff --git a/sources/airm/0.3.3/values.yaml b/sources/airm/0.3.3/values.yaml new file mode 100644 index 00000000..69346880 --- /dev/null +++ b/sources/airm/0.3.3/values.yaml @@ -0,0 +1,3 @@ +# Copyright © Advanced Micro Devices, Inc., or its affiliates. +# +# SPDX-License-Identifier: MIT diff --git a/sources/airm/0.3.4/.helmignore b/sources/airm/0.3.4/.helmignore new file mode 100644 index 00000000..64c2af83 --- /dev/null +++ b/sources/airm/0.3.4/.helmignore @@ -0,0 +1,27 @@ +# Copyright © Advanced Micro Devices, Inc., or its affiliates. +# +# SPDX-License-Identifier: MIT + +# Patterns to ignore when building packages. +# This supports shell glob matching, relative path matching, and +# negation (prefixed with !). Only one pattern per line. +.DS_Store +# Common VCS dirs +.git/ +.gitignore +.bzr/ +.bzrignore +.hg/ +.hgignore +.svn/ +# Common backup files +*.swp +*.bak +*.tmp +*.orig +*~ +# Various IDEs +.project +.idea/ +*.tmproj +.vscode/ diff --git a/sources/airm/0.3.4/Chart.yaml b/sources/airm/0.3.4/Chart.yaml new file mode 100644 index 00000000..4879c7c6 --- /dev/null +++ b/sources/airm/0.3.4/Chart.yaml @@ -0,0 +1,35 @@ +# Copyright © Advanced Micro Devices, Inc., or its affiliates. +# +# SPDX-License-Identifier: MIT + +--- +apiVersion: v2 +name: airm +description: A Helm chart for AIRM full stack, including API, UI and dispatcher + +# A chart can be either an 'application' or a 'library' chart. +# +# Application charts are a collection of templates that can be packaged into versioned archives +# to be deployed. 
+# +# Library charts provide useful utilities or functions for the chart developer. They're included as +# a dependency of application charts to inject those utilities and functions into the rendering +# pipeline. Library charts do not define any templates and therefore cannot be deployed. +type: application + +# This is the chart version. This version number should be incremented each time you make changes +# to the chart and its templates, including the app version. +# Versions are expected to follow Semantic Versioning (https://semver.org/) +version: 0.3.4 + +# This is the version number of the application being deployed. This version number should be +# incremented each time you make changes to the application. Versions are not expected to +# follow Semantic Versioning. They should reflect the version the application is using. +# It is recommended to use it with quotes. +appVersion: "1.0.0" + +dependencies: + - name: airm-api + version: 0.3.4 + - name: airm-dispatcher + version: 0.3.4 diff --git a/sources/airm/0.3.4/charts/airm-api/.helmignore b/sources/airm/0.3.4/charts/airm-api/.helmignore new file mode 100644 index 00000000..64c2af83 --- /dev/null +++ b/sources/airm/0.3.4/charts/airm-api/.helmignore @@ -0,0 +1,27 @@ +# Copyright © Advanced Micro Devices, Inc., or its affiliates. +# +# SPDX-License-Identifier: MIT + +# Patterns to ignore when building packages. +# This supports shell glob matching, relative path matching, and +# negation (prefixed with !). Only one pattern per line. 
+.DS_Store +# Common VCS dirs +.git/ +.gitignore +.bzr/ +.bzrignore +.hg/ +.hgignore +.svn/ +# Common backup files +*.swp +*.bak +*.tmp +*.orig +*~ +# Various IDEs +.project +.idea/ +*.tmproj +.vscode/ diff --git a/sources/airm/0.3.4/charts/airm-api/Chart.yaml b/sources/airm/0.3.4/charts/airm-api/Chart.yaml new file mode 100644 index 00000000..4bddec9c --- /dev/null +++ b/sources/airm/0.3.4/charts/airm-api/Chart.yaml @@ -0,0 +1,29 @@ +# Copyright © Advanced Micro Devices, Inc., or its affiliates. +# +# SPDX-License-Identifier: MIT + +--- +apiVersion: v2 +name: airm-api +description: A Helm chart for AIRM API and UI + +# A chart can be either an 'application' or a 'library' chart. +# +# Application charts are a collection of templates that can be packaged into versioned archives +# to be deployed. +# +# Library charts provide useful utilities or functions for the chart developer. They're included as +# a dependency of application charts to inject those utilities and functions into the rendering +# pipeline. Library charts do not define any templates and therefore cannot be deployed. +type: application + +# This is the chart version. This version number should be incremented each time you make changes +# to the chart and its templates, including the app version. +# Versions are expected to follow Semantic Versioning (https://semver.org/) +version: 0.3.4 + +# This is the version number of the application being deployed. This version number should be +# incremented each time you make changes to the application. Versions are not expected to +# follow Semantic Versioning. They should reflect the version the application is using. +# It is recommended to use it with quotes. 
+appVersion: "1.0.0" diff --git a/sources/airm/0.3.4/charts/airm-api/README.md b/sources/airm/0.3.4/charts/airm-api/README.md new file mode 100644 index 00000000..a16ec9da --- /dev/null +++ b/sources/airm/0.3.4/charts/airm-api/README.md @@ -0,0 +1,124 @@ + + +# HELM CHARTS + +Simple instructions to deploy AIRM UI and API applications using helm chart + +### 1. Requirements + +The following external components must be available in the Kubernetes cluster before the helm chart can be installed. + +- API Gateway implementation (e.g. KGateway) +- Keycloak with the expected `airm` realm installed +- Valid S3 compatible file storage service (e.g. MinIO) +- RabbitMQ operator +- Cert Manager operator +- External Secret operator +- CNPG operator +- OTEL LGTM stack installed on the cluster + +### 2. Install + +``` +cd helm/airm/charts + +# 1. Create output template just to validate (the public domain could be app-dev.silogen.ai, staging.silogen.ai, etc.) +helm template airm-api ./airm-api -n airm --create-namespace --set airm.appDomain= > airm-api-helm-generated.yaml + +# 2. Run chart install +helm install airm-api ./airm-api -n airm --create-namespace --set airm.appDomain= + +# 3. Delete chart if needed +helm delete airm-api -n airm + +# 4. Upgrade when bumping versions +helm upgrade -n airm --set airm.appDomain= airm-api ./airm-api +``` + +--- + +### 3. 
Helm Settings + +| Field Path | Description | Type | Example / Default | +|-------------------------------------------------------------------------------|-----------------------------------------------------------------| ------ |---------------------------------------------------------------------------------------------------| +| secretgenerator.image.repository | Docker image repository for secret generator | string | `ghcr.io/silogen/kubectl` | +| secretgenerator.image.tag | Docker image tag | string | `latest` | +| secretgenerator.image.pullPolicy | Image pull policy | string | `IfNotPresent` | +| kgateway.namespace | Namespace for kgateway resources | string | `kgateway-system` | +| kgateway.gatewayName | Gateway name | string | `https` | +| kgateway.airmapi.servicePort | Service port for airmapi | int | `80` | +| kgateway.airmapi.prefixValue | URL prefix for airmapi service | string | `airmapi` | +| kgateway.airmui.servicePort | Service port for airmui | int | `80` | +| kgateway.airmui.prefixValue | URL prefix for airmui service | string | `airmui` | +| aims.otelCollector.exporters.otlphttp.endpoint | Open Telemetry collector endpoint url for inference metrics | string | `http://lgtm-stack.otel-lgtm-stack.svc:4318` | +| aims.otelCollector.image | Base image for Open Telemetry Collector | string | `ghcr.io/open-telemetry/opentelemetry-collector-releases/opentelemetry-collector-contrib:0.113.0` | +| aims.otelCollector.receivers.prometheus.config.scrape_configs.scrape_interval | Inference metrics scraping interval | string | `20s` | +| airm.includeDemoSetup | Include the demo organization and project setup when installing | bool | `true` | +| airm.appDomain | Public IP or domain for airm | string | `PUBLIC-IP` | +| airm.externalSecretStore.airm.name | Secret store name for airm | string | `airm-secret-store` | +| airm.externalSecretStore.minio.name | Secret store name for minio | string | `k8s-secret-store` | +| airm.externalSecretStore.keycloak.name | Secret 
store name for keycloak | string | `keycloak-secret-store` | +| airm.keycloak.publicUrl | Public URL to access keycloak | string | `https://kc.{{ .Values.airm.appDomain }}` | +| airm.keycloak.internalUrl | Internal URL to access keycloak | string | `http://keycloak.keycloak.svc.cluster.local:8080` | +| airm.keycloak.clientId | Client ID to access keycloak | string | `354a0fa1-35ac-4a6d-9c4d-d661129c2cd0` | +| airm.keycloak.realm | Keycloak realm for authentication | string | `airm` | +| airm.postgresql.cnpg.image | PostgreSQL container image | string | `ghcr.io/cloudnative-pg/postgresql:17` | +| airm.postgresql.cnpg.instance | Number of PostgreSQL instances | int | `1` | +| airm.postgresql.cnpg.resources.limits.cpu | CPU limit for PostgreSQL container | string | `"2"` | +| airm.postgresql.cnpg.resources.limits.memory | Memory limit for PostgreSQL container | string | `1Gi` | +| airm.postgresql.cnpg.resources.requests.cpu | CPU request for PostgreSQL container | string | `"1"` | +| airm.postgresql.cnpg.resources.requests.memory | Memory request for PostgreSQL container | string | `512Mi` | +| airm.postgresql.cnpg.storage.size | Storage size for PostgreSQL | string | `50Gi` | +| airm.postgresql.cnpg.storage.storageClass | Storage class for PostgreSQL | string | `default` | +| airm.postgresql.cnpg.walStorage.size | WAL storage size for PostgreSQL | string | `50Gi` | +| airm.postgresql.cnpg.walStorage.storageClass | WAL storage class for PostgreSQL | string | `default` | +| airm.rabbitmq.replicas | Number of replicas for the RabbitMQ cluster | int | `1` | +| airm.rabbitmq.resources.limits.cpu | CPU limit for for the RabbitMQ cluster | string | `1` | +| airm.rabbitmq.resources.limits.memory | Memory limit for for the RabbitMQ cluster | string | `1Gi` | +| airm.rabbitmq.resources.requests.cpu | CPU request for the RabbitMQ cluster | string | `500m` | +| airm.rabbitmq.resources.requests.memory | Memory request for the RabbitMQ cluster | string | `1Gi` | +| 
airm.rabbitmq.persistence.storage | Persistent storage size for the RabbitMQ cluster | string | `20Gi` | +| airm.rabbitmq.persistence.storageClassName | Storage class name for the RabbitMQ cluster | string | `default` | +| airm.rabbitmq.backup.enabled | Enable RabbitMQ backup | bool | `false` | +| airm.rabbitmq.backup.image | RabbitMQ backup container image | string | `amdenterpriseai/rabbitmq-backup:0.1` | +| airm.rabbitmq.backup.resources.limits.memory | Memory limit for cron job of RabbitMQ backup | string | `512Mi` | +| airm.rabbitmq.backup.resources.requests.cpu | CPU request for cron job of RabbitMQ backup | string | `250m` | +| airm.rabbitmq.backup.resources.requests.memory | Memory request for cron job of RabbitMQ backup | string | `256Mi` | +| airm.frontend.image.repository | Frontend image repository | string | `amdenterpriseai/airm-ui` | +| airm.frontend.image.tag | Frontend image tag | string | `v2025.08-rc.21` | +| airm.frontend.image.pullPolicy | Frontend image pull policy | string | `IfNotPresent` | +| airm.frontend.servicePort | Frontend service port | int | `80` | +| airm.frontend.resources.limits.memory | Memory limit for frontend | string | `4Gi` | +| airm.frontend.resources.requests.cpu | CPU request for frontend | string | `500m` | +| airm.frontend.resources.requests.memory | Memory request for frontend | string | `4Gi` | +| airm.backend.image.repository | Backend API image repository | string | `amdenterpriseai/airm-api` | +| airm.backend.image.tag | Backend API image tag | string | `v2025.08-rc.21` | +| airm.backend.image.pullPolicy | Backend API image pull policy | string | `IfNotPresent` | +| airm.backend.servicePort | Backend API service port | int | `80` | +| airm.backend.servicePortMetrics | Backend API metrics service port | int | `9009` | +| airm.backend.env.dbPort | Database port | int | `5432` | +| airm.backend.env.rabbitmqPort | RabbitMQ port | int | `5672` | +| airm.backend.env.minioUrl | Minio service URL | string | 
`http://minio.minio-tenant-default.svc.cluster.local:80` | +| airm.backend.env.minioBucket | Minio bucket name | string | `default-bucket` | +| airm.backend.env.prometheusUrl | Prometheus service URL | string | `http://lgtm-stack.otel-lgtm-stack.svc.cluster.local:9090` | +| airm.backend.env.clusterAuthUrl | Cluster auth service URL | string | `http://cluster-auth.cluster-auth.svc.cluster.local:8081` | +| airm.backend.resources.limits.memory | Memory limit for backend API container | string | `1Gi` | +| airm.backend.resources.requests.cpu | CPU request for backend API container | string | `500m` | +| airm.backend.resources.requests.memory | Memory request for backend API container | string | `1Gi` | +| airm.backend.securityContext.allowPrivilegeEscalation | Security context: allow privilege escalation | bool | `false` | +| airm.backend.securityContext.runAsNonRoot | Security context: run container as non-root | bool | `true` | +| airm.backend.securityContext.runAsUser | Security context: user ID to run container as | int | `1000` | +| airm.backend.securityContext.seccompProfile.type | Security context: seccomp profile type | string | `RuntimeDefault` | +| airm.utilities.netcat.image.repository | Netcat image repository | string | `busybox` | +| airm.utilities.netcat.image.tag | Netcat image tag | string | `1.37.0` | +| airm.utilities.netcat.image.pullPolicy | Netcat image pull policy | string | `IfNotPresent` | +| airm.utilities.curl.image.repository | Curl image repository | string | `curlimages/curl` | +| airm.utilities.curl.image.tag | Curl image tag | string | `8.16.0` | +| airm.utilities.curl.image.pullPolicy | Curl image pull policy | string | `IfNotPresent` | +| airm.utilities.liquibase.image.repository | Liquibase image repository | string | `docker.io/liquibase/liquibase` | +| airm.utilities.liquibase.image.tag | Liquibase image tag | string | `4.31` | +| airm.utilities.liquibase.image.pullPolicy | Liquibase image pull policy | string | `IfNotPresent` | 
diff --git a/sources/airm/0.3.4/charts/airm-api/files/configure.sh b/sources/airm/0.3.4/charts/airm-api/files/configure.sh new file mode 100644 index 00000000..69a3f59d --- /dev/null +++ b/sources/airm/0.3.4/charts/airm-api/files/configure.sh @@ -0,0 +1,374 @@ +#!/bin/bash + +# Copyright © Advanced Micro Devices, Inc., or its affiliates. +# +# SPDX-License-Identifier: MIT + +##################################################################################### +echo "" +echo "Run configure script block..." +echo "" + +# --- Configuration Variables --- +# Get values from bloom configmap mounted as env + +# NOTE: ORG_NAME is hardcoded to demo because gpu operator metrics has same org name hardcoded there +# Otherwise the following line can be uncommented to consider the real org name from domain config +# ORG_NAME=$(echo $NEW_DOMAIN_NAME | awk -F '.' '{ print $2 }') +ORG_NAME="demo" +ORG_DOMAINS="[\"${NEW_DOMAIN_NAME}\"]" +CLUSTER_WORKLOADS_BASE_URL="https://workspaces.${NEW_DOMAIN_NAME}/" +CLUSTER_KUBE_API_URL="https://k8s.${NEW_DOMAIN_NAME}" +USER_EMAIL="devuser@${NEW_DOMAIN_NAME}" +PROJECT_NAME="demo" +PROJECT_DESCRIPTION="demo" +CLUSTER_NAME="demo-cluster" +TIMEOUT=300 +SLEEP_INTERVAL=5 + +# --- Input Validation --- +echo "Validating environment variables..." +echo "KEYCLOAK_CLIENT_ID: ${KEYCLOAK_CLIENT_ID}" +echo "NEW_DOMAIN_NAME: ${NEW_DOMAIN_NAME}" +echo "AIRM_API_URL: ${AIRM_API_URL}" + +function check_env_variable() { + if [ -z "${!1}" ]; then + echo "ERROR: $1 environment variable is not set." 
+ exit 1 + fi +} + +function check_success() { + if [ "$1" -ne 0 ]; then + echo "ERROR: $2" + exit 1 + fi +} + +check_env_variable "AIRM_API_URL" +check_env_variable "KEYCLOAK_URL" +check_env_variable "KEYCLOAK_REALM" +check_env_variable "KEYCLOAK_CLIENT_SECRET" +check_env_variable "KEYCLOAK_CLIENT_ID" +check_env_variable "KEYCLOAK_ADMIN_CLIENT_ID" +check_env_variable "KEYCLOAK_ADMIN_CLIENT_SECRET" +check_env_variable "USER_PASSWORD" + +function refresh_token() { + TOKEN=$(curl -s -d "client_id=${KEYCLOAK_CLIENT_ID}" -d "username=${USER_EMAIL}" -d "password=${USER_PASSWORD}" -d 'grant_type=password' -d "client_secret=${KEYCLOAK_CLIENT_SECRET}" "${KEYCLOAK_URL}/realms/${KEYCLOAK_REALM}/protocol/openid-connect/token" | jq -r '.access_token') + if [ -z "$TOKEN" ] || [ "$TOKEN" == "null" ]; then + echo "ERROR: Failed to obtain access token from Keycloak." + exit 1 + fi +} + +function create_org() { + # Try to get ORG_ID by name + ORG_ID=$(curl -s -X GET "${AIRM_API_URL}/v1/organizations" \ + -H "Authorization: Bearer ${TOKEN}" \ + -H 'Content-Type: application/json' | jq -r --arg name "$ORG_NAME" '.organizations[] | select(.name==$name) | .id') + + # If not found, create the org and fetch the ID again + if [ -z "$ORG_ID" ] || [ "$ORG_ID" == "null" ]; then + ORG_RESP=$(curl -s -o /dev/null -X POST -w "%{http_code}" "${AIRM_API_URL}/v1/organizations" \ + -H "Authorization: Bearer ${TOKEN}" \ + -H 'Content-Type: application/json' \ + -d "{ \"name\": \"$ORG_NAME\", \"domains\": $ORG_DOMAINS }") + echo "$ORG_RESP" + check_success "$([[ "$ORG_RESP" == "200" || "$ORG_RESP" == "201" ]] && echo 0 || echo 1)" "Failed to create organization" + + ORG_ID=$(curl -s -X GET "${AIRM_API_URL}/v1/organizations" \ + -H "Authorization: Bearer ${TOKEN}" \ + -H 'Content-Type: application/json' | jq -r --arg name "$ORG_NAME" '.organizations[] | select(.name==$name) | .id') + fi + + if [ -z "$ORG_ID" ] || [ "$ORG_ID" == "null" ]; then + echo "ERROR: Failed to create or retrieve organization 
ID." + exit 1 + else + echo "ORG_ID=${ORG_ID}" + fi +} + +function add_user_to_org() { + # Check if user exists in org + USER_EXISTS=$(curl -s -X GET "${AIRM_API_URL}/v1/users" -H 'accept: application/json' -H "Authorization: Bearer ${TOKEN}" -H 'Content-Type: application/json' | jq -r --arg email "$USER_EMAIL" '.data? // [] | .[] | select(.email==$email) | .email') + # Add user to org if they don't exist + if [ -z "$USER_EXISTS" ] || [ "$USER_EXISTS" == "null" ]; then + echo "$USER_EXISTS" + echo "User '$USER_EMAIL' not found in organization. Adding..." + ADD_USER_RESP=$(curl -w "%{http_code}" -X 'POST' "${AIRM_API_URL}/v1/organizations/${ORG_ID}/users" -H 'accept: application/json' -H "Authorization: Bearer ${TOKEN}" -H 'Content-Type: application/json' -d '{ "email": "'"$USER_EMAIL"'", "roles": ["Platform Administrator"]}') + echo "$ADD_USER_RESP" + check_success "$([[ "$ADD_USER_RESP" == "200" || "$ADD_USER_RESP" == "201" || "$ADD_USER_RESP" == "null201" ]] && echo 0 || echo 1)" "Failed to add user to organization" + else + echo "User '$USER_EMAIL' already exists in organization." + fi +} + +function create_project() { + PROJECT_ID=$(curl -s -X GET "${AIRM_API_URL}/v1/projects" -H 'accept: application/json' -H "Authorization: Bearer ${TOKEN}" | jq -r '.projects[] | select(.name=="'$PROJECT_NAME'") | .id') + + for (( i=0; i<=TIMEOUT; i+=SLEEP_INTERVAL )); do + CLUSTER_STATUS=$(curl -s -X GET "${AIRM_API_URL}/v1/clusters/$CLUSTER_ID" \ + -H "Authorization: Bearer ${TOKEN}" \ + -H 'Content-Type: application/json' | jq -r '.status') + + if [ "$CLUSTER_STATUS" == "healthy" ]; then + echo "Cluster is healthy!" + break # Exit the loop if the cluster is healthy + fi + echo "Cluster status: $CLUSTER_STATUS. Waiting $SLEEP_INTERVAL seconds... ($i/$TIMEOUT seconds elapsed)" + sleep $SLEEP_INTERVAL + done + + if [ "$CLUSTER_STATUS" != "healthy" ]; then + echo "ERROR: Cluster did not become healthy within $TIMEOUT seconds." 
+ exit 1 + fi + + if [ -z "$PROJECT_ID" ] || [ "$PROJECT_ID" == "null" ]; then + echo "Projects '$PROJECT_NAME' not found. Creating..." + PROJECT_ID=$(curl -X 'POST' \ + "${AIRM_API_URL}/v1/projects" \ + -H 'accept: application/json' \ + -H "Authorization: Bearer ${TOKEN}" \ + -H 'Content-Type: application/json' \ + -d '{ + "name": "'"$PROJECT_NAME"'", + "description": "'"$PROJECT_DESCRIPTION"'", + "cluster_id": "'"$CLUSTER_ID"'", + "quota": { + "cpu_milli_cores": 0, + "memory_bytes": 0, + "ephemeral_storage_bytes": 0, + "gpu_count": 0 + } + }' | jq -r '.id') + echo "$PROJECT_ID" + check_success "$([[ "$PROJECT_ID" != "null" ]] && echo 0 || echo 1)" "Failed to create project" + else + echo "Project '$PROJECT_NAME' already exists with ID: $PROJECT_ID" + fi +} + +function add_minio_secret_and_storage_to_project() { + for (( i=0; i<=TIMEOUT; i+=SLEEP_INTERVAL )); do + PROJECT_STATUS=$(curl -s -X GET "${AIRM_API_URL}/v1/projects/$PROJECT_ID" \ + -H "Authorization: Bearer ${TOKEN}" \ + -H 'Content-Type: application/json' | jq -r '.status') + + if [ "$PROJECT_STATUS" == "Ready" ]; then + echo "Project is ready!" + break # Exit the loop if the project is ready + fi + echo "Project status: $PROJECT_STATUS. Waiting $SLEEP_INTERVAL seconds... ($i/$TIMEOUT seconds elapsed)" + sleep $SLEEP_INTERVAL + done + + SECRET_NAME="minio-credentials-fetcher" + STORAGE_NAME="minio-storage" + + SECRET_IN_PROJECT=$(curl -X 'GET' \ + "${AIRM_API_URL}/v1/projects/${PROJECT_ID}/secrets" \ + -H 'accept: application/json' \ + -H "Authorization: Bearer ${TOKEN}" | jq -r '.project_secrets[] | select(.secret.name=="'"$SECRET_NAME"'") | .id') + EXTERNAL_SECRET_API_VERSION="v1beta1" + EXTERNAL_SECRET_MANIFEST=$(cat < /dev/null 2>&1; then + echo "AIRM API is ready!" + break + else + echo "Waiting for AIRM API..." + sleep 10 + fi + done + + echo "All dependencies are ready!" 
+ securityContext: + allowPrivilegeEscalation: false + runAsNonRoot: false + runAsUser: 0 + capabilities: + drop: + - ALL + add: + - SETUID + - SETGID + - CHOWN + - DAC_OVERRIDE + - FOWNER + - FSETID + seccompProfile: + type: RuntimeDefault + containers: + - name: configure + image: "{{ .Values.airm.utilities.clusterTool.image.repository }}:{{ .Values.airm.utilities.clusterTool.image.tag }}" + imagePullPolicy: "{{ .Values.airm.utilities.clusterTool.image.pullPolicy }}" + command: ["/bin/bash"] + args: ["/scripts/configure.sh"] + env: + - name: DEBIAN_FRONTEND + value: "noninteractive" + - name: ORG_NAME + value: "demo" + - name: NEW_DOMAIN_NAME + value: "{{ .Values.airm.appDomain }}" + - name: KEYCLOAK_CLIENT_ID + value: "{{ .Values.airm.keycloak.clientId }}" + - name: KEYCLOAK_CLIENT_SECRET + valueFrom: + secretKeyRef: + key: KEYCLOAK_SECRET + name: "{{ .Release.Name }}-keycloak-ui-creds" + - name: USER_EMAIL + value: "devuser@{{ .Values.airm.appDomain }}" + - name: KEYCLOAK_URL + value: "{{ .Values.airm.keycloak.internalUrl }}" + - name: KEYCLOAK_REALM + value: "{{ .Values.airm.keycloak.realm }}" + - name: KEYCLOAK_ADMIN_CLIENT_ID + valueFrom: + secretKeyRef: + key: client-id + name: "{{ .Release.Name }}-keycloak-admin-client" + - name: KEYCLOAK_ADMIN_CLIENT_SECRET + valueFrom: + secretKeyRef: + key: client-secret + name: "{{ .Release.Name }}-keycloak-admin-client" + - name: AIRM_API_URL + value: "http://{{ .Release.Name }}-api.{{ .Release.Namespace }}.svc.cluster.local" + - name: USER_PASSWORD + valueFrom: + secretKeyRef: + key: USER_PASSWORD + name: "{{ .Release.Name }}-user-credentials" + volumeMounts: + - name: configure-script + mountPath: /scripts + securityContext: + allowPrivilegeEscalation: false + runAsNonRoot: false + runAsUser: 0 + capabilities: + drop: + - ALL + add: + - SETUID + - SETGID + - CHOWN + - DAC_OVERRIDE + - FOWNER + - FSETID + seccompProfile: + type: RuntimeDefault + volumes: + - name: configure-script + configMap: + name: "{{ 
.Release.Name }}-configure-script" + defaultMode: 0755 + +{{- end }} diff --git a/sources/airm/0.3.4/charts/airm-api/templates/airm-es.yaml b/sources/airm/0.3.4/charts/airm-api/templates/airm-es.yaml new file mode 100644 index 00000000..4dd18aeb --- /dev/null +++ b/sources/airm/0.3.4/charts/airm-api/templates/airm-es.yaml @@ -0,0 +1,215 @@ +# Copyright © Advanced Micro Devices, Inc., or its affiliates. +# +# SPDX-License-Identifier: MIT + +--- +apiVersion: external-secrets.io/v1beta1 +kind: ExternalSecret +metadata: + name: "{{ .Release.Name }}-cnpg-superuser" + namespace: "{{ .Release.Namespace }}" + annotations: + helm.sh/hook: pre-install +spec: + data: + - remoteRef: + key: airm-cnpg-superuser-username + property: value + secretKey: username + - remoteRef: + key: airm-cnpg-superuser-password + property: value + secretKey: password + secretStoreRef: + kind: ClusterSecretStore + name: {{ .Values.airm.externalSecretStore.airm.name }} + target: + name: "{{ .Release.Name }}-cnpg-superuser" + template: + type: Opaque +--- +apiVersion: external-secrets.io/v1beta1 +kind: ExternalSecret +metadata: + name: "{{ .Release.Name }}-cnpg-user" + namespace: "{{ .Release.Namespace }}" + annotations: + helm.sh/hook: pre-install +spec: + data: + - remoteRef: + key: airm-cnpg-user-username + property: value + secretKey: username + - remoteRef: + key: airm-cnpg-user-password + property: value + secretKey: password + secretStoreRef: + kind: ClusterSecretStore + name: {{ .Values.airm.externalSecretStore.airm.name }} + target: + name: "{{ .Release.Name }}-cnpg-user" + template: + type: Opaque +--- +apiVersion: external-secrets.io/v1beta1 +kind: ExternalSecret +metadata: + name: "{{ .Release.Name }}-keycloak-admin-client" + namespace: "{{ .Release.Namespace }}" + annotations: + helm.sh/hook: pre-install +spec: + data: + - remoteRef: + key: airm-keycloak-admin-client-id + property: value + secretKey: client-id + - remoteRef: + key: airm-keycloak-admin-client-secret + property: value + 
secretKey: client-secret + secretStoreRef: + kind: ClusterSecretStore + name: {{ .Values.airm.externalSecretStore.keycloak.name }} + target: + name: "{{ .Release.Name }}-keycloak-admin-client" + template: + type: Opaque +--- +apiVersion: external-secrets.io/v1beta1 +kind: ExternalSecret +metadata: + name: "{{ .Release.Name }}-rabbitmq-admin" + namespace: "{{ .Release.Namespace }}" + annotations: + helm.sh/hook: pre-install +spec: + data: + - remoteRef: + key: airm-rabbitmq-user-username + property: value + secretKey: username + - remoteRef: + key: airm-rabbitmq-user-password + property: value + secretKey: password + secretStoreRef: + kind: ClusterSecretStore + name: {{ .Values.airm.externalSecretStore.airm.name }} + target: + name: "{{ .Release.Name }}-rabbitmq-admin" + template: + data: + default_user.conf: | + default_user = {{ "{{ .username }}" }} + default_pass = {{ "{{ .password }}" }} + password: '{{ "{{ .password }}" }}' + username: '{{ "{{ .username }}" }}' + type: Opaque +--- +apiVersion: external-secrets.io/v1beta1 +kind: ExternalSecret +metadata: + name: "{{ .Release.Name }}-api-minio-credentials" + namespace: "{{ .Release.Namespace }}" + annotations: + helm.sh/hook: pre-install +spec: + data: + - remoteRef: + key: minio-api-access-key + property: value + secretKey: minio-access-key + - remoteRef: + key: minio-api-secret-key + property: value + secretKey: minio-secret-key + secretStoreRef: + kind: ClusterSecretStore + name: {{ .Values.airm.externalSecretStore.minio.name }} + target: + name: "{{ .Release.Name }}-api-minio-credentials" +--- +apiVersion: external-secrets.io/v1beta1 +kind: ExternalSecret +metadata: + name: "{{ .Release.Name }}-secrets-airm" + namespace: "{{ .Release.Namespace }}" + annotations: + helm.sh/hook: pre-install +spec: + data: + - remoteRef: + key: airm-ui-auth-nextauth-secret + property: value + secretKey: NEXTAUTH_SECRET + refreshInterval: 15s + secretStoreRef: + kind: ClusterSecretStore + name: {{ 
.Values.airm.externalSecretStore.airm.name }} + target: + name: "{{ .Release.Name }}-secrets-airm" +--- +apiVersion: external-secrets.io/v1beta1 +kind: ExternalSecret +metadata: + name: "{{ .Release.Name }}-keycloak-ui-client" + namespace: "{{ .Release.Namespace }}" + annotations: + helm.sh/hook: pre-install +spec: + data: + - remoteRef: + key: airm-ui-keycloak-secret + property: value + secretKey: KEYCLOAK_SECRET + secretStoreRef: + kind: ClusterSecretStore + name: {{ .Values.airm.externalSecretStore.keycloak.name }} + target: + name: "{{ .Release.Name }}-keycloak-ui-creds" + template: + type: Opaque +--- +apiVersion: external-secrets.io/v1beta1 +kind: ExternalSecret +metadata: + name: "{{ .Release.Name }}-cluster-auth-secrets" + namespace: "{{ .Release.Namespace }}" + annotations: + helm.sh/hook: pre-install +spec: + secretStoreRef: + kind: ClusterSecretStore + name: {{ .Values.airm.externalSecretStore.airm.name }} + refreshInterval: 5m + target: + name: "{{ .Release.Name }}-cluster-auth-admin" + data: + - secretKey: admin-token + remoteRef: + key: cluster-auth-admin-token + property: value +--- +apiVersion: external-secrets.io/v1beta1 +kind: ExternalSecret +metadata: + name: "{{ .Release.Name }}-user-credentials" + namespace: "{{ .Release.Namespace }}" + annotations: + helm.sh/hook: pre-install +spec: + data: + - remoteRef: + key: keycloak-initial-devuser-password + property: value + secretKey: USER_PASSWORD + secretStoreRef: + kind: ClusterSecretStore + name: {{ .Values.airm.externalSecretStore.airm.name }} + target: + name: "{{ .Release.Name }}-user-credentials" + template: + type: Opaque diff --git a/sources/airm/0.3.4/charts/airm-api/templates/airm-httproute.yaml b/sources/airm/0.3.4/charts/airm-api/templates/airm-httproute.yaml new file mode 100644 index 00000000..3393d6a5 --- /dev/null +++ b/sources/airm/0.3.4/charts/airm-api/templates/airm-httproute.yaml @@ -0,0 +1,81 @@ +# Copyright © Advanced Micro Devices, Inc., or its affiliates. 
+# +# SPDX-License-Identifier: MIT + +--- +apiVersion: gateway.networking.k8s.io/v1 +kind: HTTPRoute +metadata: + name: '{{ .Release.Name }}api-route' + namespace: '{{ .Release.Namespace }}' +spec: + parentRefs: + - group: gateway.networking.k8s.io + kind: Gateway + name: {{ .Values.kgateway.gatewayName }} + namespace: {{ .Values.kgateway.namespace }} + rules: + - backendRefs: + - group: "" + kind: Service + name: '{{ .Release.Name }}-api' + port: {{ .Values.kgateway.airmapi.servicePort }} + weight: 1 + matches: + - headers: + - name: Host + type: RegularExpression + value: {{ .Values.kgateway.airmapi.prefixValue }}\..* + path: + type: RegularExpression + value: .*/stream.* + timeouts: + backendRequest: {{ .Values.kgateway.airmapi.timeouts.stream.backendRequest }} + request: {{ .Values.kgateway.airmapi.timeouts.stream.request }} + - backendRefs: + - group: "" + kind: Service + name: '{{ .Release.Name }}-api' + port: {{ .Values.kgateway.airmapi.servicePort }} + weight: 1 + matches: + - headers: + - name: Host + type: RegularExpression + value: {{ .Values.kgateway.airmapi.prefixValue }}\..* + path: + type: PathPrefix + value: / + timeouts: + backendRequest: {{ .Values.kgateway.airmapi.timeouts.nonStream.backendRequest }} + request: {{ .Values.kgateway.airmapi.timeouts.nonStream.request }} +--- +apiVersion: gateway.networking.k8s.io/v1 +kind: HTTPRoute +metadata: + name: '{{ .Release.Name }}ui-route' + namespace: '{{ .Release.Namespace }}' +spec: + parentRefs: + - group: gateway.networking.k8s.io + kind: Gateway + name: {{ .Values.kgateway.gatewayName }} + namespace: {{ .Values.kgateway.namespace }} + rules: + - backendRefs: + - group: "" + kind: Service + name: '{{ .Release.Name }}-ui' + port: {{ .Values.kgateway.airmui.servicePort }} + weight: 1 + matches: + - headers: + - name: Host + type: RegularExpression + value: {{ .Values.kgateway.airmui.prefixValue }}\..* + path: + type: PathPrefix + value: / + timeouts: + backendRequest: {{ 
.Values.kgateway.airmui.timeouts.backendRequest }} + request: {{ .Values.kgateway.airmui.timeouts.request }} diff --git a/sources/airm/0.3.4/charts/airm-api/templates/airm-rabbitmq-cluster.yaml b/sources/airm/0.3.4/charts/airm-api/templates/airm-rabbitmq-cluster.yaml new file mode 100644 index 00000000..3db2ff07 --- /dev/null +++ b/sources/airm/0.3.4/charts/airm-api/templates/airm-rabbitmq-cluster.yaml @@ -0,0 +1,69 @@ +# Copyright © Advanced Micro Devices, Inc., or its affiliates. +# +# SPDX-License-Identifier: MIT + +--- +apiVersion: rabbitmq.com/v1beta1 +kind: RabbitmqCluster +metadata: + name: '{{ .Release.Name }}-rabbitmq' + namespace: '{{ .Release.Namespace }}' +spec: + persistence: + {{- toYaml .Values.airm.rabbitmq.persistence | nindent 4 }} + replicas: {{ .Values.airm.rabbitmq.replicas }} + resources: + {{- toYaml .Values.airm.rabbitmq.resources | nindent 4 }} + secretBackend: + externalSecret: + name: '{{ .Release.Name }}-rabbitmq-admin' + tls: + secretName: '{{ .Release.Name }}-tls-secret' +--- +{{- if .Values.airm.rabbitmq.backup.enabled -}} + +apiVersion: batch/v1 +kind: CronJob +metadata: + name: '{{ .Release.Name }}-rabbitmq-backup-cron' + namespace: '{{ .Release.Namespace }}' +spec: + concurrencyPolicy: Forbid + jobTemplate: + spec: + template: + spec: + containers: + - env: + - name: RABBITMQ_URL + value: 'http://{{ .Release.Name }}-rabbitmq.{{ .Release.Namespace }}.svc.cluster.local:15672' + - name: RABBITMQ_USER + valueFrom: + secretKeyRef: + key: username + name: '{{ .Release.Name }}-rabbitmq-admin' + - name: RABBITMQ_PASSWORD + valueFrom: + secretKeyRef: + key: password + name: '{{ .Release.Name }}-rabbitmq-admin' + - name: S3_HOST + value: "{{ .Values.airm.backend.env.minioUrl }}" + - name: S3_ACCESS_KEY + valueFrom: + secretKeyRef: + key: minio-access-key + name: '{{ .Release.Name }}-api-minio-credentials' + - name: S3_SECRET_KEY + valueFrom: + secretKeyRef: + key: minio-secret-key + name: '{{ .Release.Name }}-api-minio-credentials' + image: 
'{{ .Values.airm.rabbitmq.backup.image }}' + name: rabbitmq-backup-cron + resources: + {{- toYaml .Values.airm.rabbitmq.backup.resources | nindent 16 }} + restartPolicy: OnFailure + schedule: 0 * * * * + +{{- end }} diff --git a/sources/airm/0.3.4/charts/airm-api/templates/airm-vllm-collector.yaml b/sources/airm/0.3.4/charts/airm-api/templates/airm-vllm-collector.yaml new file mode 100644 index 00000000..f12aa532 --- /dev/null +++ b/sources/airm/0.3.4/charts/airm-api/templates/airm-vllm-collector.yaml @@ -0,0 +1,93 @@ +# Copyright © Advanced Micro Devices, Inc., or its affiliates. +# +# SPDX-License-Identifier: MIT + +--- +apiVersion: opentelemetry.io/v1beta1 +kind: OpenTelemetryCollector +metadata: + name: "{{ .Release.Name }}-{{ .Values.aims.otelCollector.name }}" + namespace: "{{ .Release.Namespace }}" +spec: + mode: daemonset + image: "{{ .Values.aims.otelCollector.image }}" + config: + receivers: + prometheus: + config: + scrape_configs: + - job_name: "vllm" + metrics_path: /metrics + scrape_interval: "{{ .Values.aims.otelCollector.receivers.prometheus.config.scrape_configs.scrape_interval }}" + kubernetes_sd_configs: + - role: pod + relabel_configs: + # Only scrape pods with the workload-id label + - source_labels: + [__meta_kubernetes_pod_label_airm_silogen_ai_workload_id] + action: keep + regex: .+ + # Only scrape pods with app label starting with isvc. 
+ - source_labels: [__meta_kubernetes_pod_label_app] + action: keep + regex: isvc\..* + # Set the workload_id from the label + - source_labels: + [__meta_kubernetes_pod_label_airm_silogen_ai_workload_id] + target_label: workload_id + # Set service name from app label + - source_labels: [__meta_kubernetes_pod_label_app] + target_label: service + # Set service instance id from pod name + - source_labels: [__meta_kubernetes_pod_name] + target_label: service_instance_id + # Set the scrape target to port 8000 + - source_labels: [__meta_kubernetes_pod_ip] + target_label: __address__ + replacement: $1:8000 + otlp: + protocols: + grpc: {} + http: {} + + processors: + resource: + attributes: + - key: airm.silogen.ai/workload-id + from_attribute: workload_id + action: upsert + - key: service.instance.id + from_attribute: service_instance_id + action: upsert + - key: service.name + from_attribute: service + action: upsert + + transform: + metric_statements: + - context: datapoint + statements: + - set(attributes["workload_id"], resource.attributes["airm.silogen.ai/workload-id"]) where attributes["workload_id"] == nil + - set(attributes["service_instance_id"], resource.attributes["service.instance.id"]) where attributes["service_instance_id"] == nil + - set(attributes["service"], resource.attributes["service.name"]) where attributes["service"] == nil + + exporters: + otlphttp: + endpoint: "{{ .Values.aims.otelCollector.exporters.otlphttp.endpoint }}" + + service: + pipelines: + metrics: + receivers: [prometheus] + processors: [resource, transform] + exporters: [otlphttp] + + traces: + receivers: [otlp] + processors: [resource] + exporters: [otlphttp] + + logs: + receivers: [otlp] + processors: [resource] + exporters: [otlphttp] diff --git a/sources/airm/0.3.4/charts/airm-api/values.yaml b/sources/airm/0.3.4/charts/airm-api/values.yaml new file mode 100644 index 00000000..9ee63f06 --- /dev/null +++ b/sources/airm/0.3.4/charts/airm-api/values.yaml @@ -0,0 +1,166 @@ +# Copyright 
© Advanced Micro Devices, Inc., or its affiliates. +# +# SPDX-License-Identifier: MIT + +secretgenerator: + image: + repository: amdenterpriseai/cluster-tool + tag: latest + pullPolicy: IfNotPresent + +kgateway: + namespace: kgateway-system + gatewayName: https + airmapi: + servicePort: 80 + prefixValue: airmapi + timeouts: + stream: + backendRequest: 30m + request: 30m + nonStream: + backendRequest: 10m + request: 10m + airmui: + servicePort: 80 + prefixValue: airmui + timeouts: + backendRequest: 1m + request: 1m + keycloak: + prefixValue: kc + +aims: + otelCollector: + image: ghcr.io/open-telemetry/opentelemetry-collector-releases/opentelemetry-collector-contrib:0.113.0 + receivers: + prometheus: + config: + scrape_configs: + scrape_interval: 20s + exporters: + otlphttp: + endpoint: "http://lgtm-stack.otel-lgtm-stack.svc:4318" + name: "vllm-collector" + +airm: + appDomain: PUBLIC-IP + includeDemoSetup: true + + externalSecretStore: + airm: + name: openbao-secret-store + minio: + name: openbao-secret-store + keycloak: + name: openbao-secret-store + + postgresql: + enabled: true + cnpg: + image: ghcr.io/cloudnative-pg/postgresql:17 + instance: 1 + resources: + limits: + cpu: "2" + memory: 1Gi + requests: + cpu: "1" + memory: 512Mi + storage: + size: 50Gi + storageClass: default + walStorage: + size: 50Gi + storageClass: default + + rabbitmq: + replicas: 1 + resources: + requests: + cpu: 500m + memory: 1Gi + limits: + cpu: '1' + memory: 1Gi + persistence: + storage: 20Gi + storageClassName: default + backup: + enabled: false + image: amdenterpriseai/rabbitmq-backup:0.1 + resources: + limits: + memory: 512Mi + requests: + cpu: 250m + memory: 256Mi + + keycloak: + internalUrl: http://keycloak.keycloak.svc.cluster.local:8080 + clientId: "354a0fa1-35ac-4a6d-9c4d-d661129c2cd0" + realm: airm + + frontend: + image: + repository: amdenterpriseai/airm-ui + tag: 0.3.4 + pullPolicy: IfNotPresent + servicePort: 80 + resources: + limits: + memory: 4Gi + requests: + cpu: 500m + 
memory: 4Gi + + backend: + image: + repository: amdenterpriseai/airm-api + tag: 0.3.4 + pullPolicy: IfNotPresent + + servicePort: 80 + servicePortMetrics: 9009 + env: + dbPort: 5432 + rabbitmqPort: 5672 + minioUrl: http://minio.minio-tenant-default.svc.cluster.local:80 + minioBucket: default-bucket + prometheusUrl: http://lgtm-stack.otel-lgtm-stack.svc.cluster.local:9090 + clusterAuthUrl: http://cluster-auth.cluster-auth.svc.cluster.local:8081 + + resources: + limits: + memory: 1Gi + requests: + cpu: 500m + memory: 1Gi + securityContext: + allowPrivilegeEscalation: false + runAsNonRoot: true + runAsUser: 1000 + seccompProfile: + type: RuntimeDefault + + utilities: + netcat: + image: + repository: busybox + tag: 1.37.0 + pullPolicy: IfNotPresent + curl: + image: + repository: curlimages/curl + tag: 8.16.0 + pullPolicy: IfNotPresent + liquibase: + image: + repository: docker.io/liquibase/liquibase + tag: 4.31 + pullPolicy: IfNotPresent + clusterTool: + image: + repository: amdenterpriseai/cluster-tool + tag: latest + pullPolicy: IfNotPresent diff --git a/sources/airm/0.3.4/charts/airm-dispatcher/.helmignore b/sources/airm/0.3.4/charts/airm-dispatcher/.helmignore new file mode 100644 index 00000000..64c2af83 --- /dev/null +++ b/sources/airm/0.3.4/charts/airm-dispatcher/.helmignore @@ -0,0 +1,27 @@ +# Copyright © Advanced Micro Devices, Inc., or its affiliates. +# +# SPDX-License-Identifier: MIT + +# Patterns to ignore when building packages. +# This supports shell glob matching, relative path matching, and +# negation (prefixed with !). Only one pattern per line. 
+.DS_Store +# Common VCS dirs +.git/ +.gitignore +.bzr/ +.bzrignore +.hg/ +.hgignore +.svn/ +# Common backup files +*.swp +*.bak +*.tmp +*.orig +*~ +# Various IDEs +.project +.idea/ +*.tmproj +.vscode/ diff --git a/sources/airm/0.3.4/charts/airm-dispatcher/Chart.yaml b/sources/airm/0.3.4/charts/airm-dispatcher/Chart.yaml new file mode 100644 index 00000000..16fb1b13 --- /dev/null +++ b/sources/airm/0.3.4/charts/airm-dispatcher/Chart.yaml @@ -0,0 +1,29 @@ +# Copyright © Advanced Micro Devices, Inc., or its affiliates. +# +# SPDX-License-Identifier: MIT + +--- +apiVersion: v2 +name: airm-dispatcher +description: A Helm chart for AIRM Dispatcher + +# A chart can be either an 'application' or a 'library' chart. +# +# Application charts are a collection of templates that can be packaged into versioned archives +# to be deployed. +# +# Library charts provide useful utilities or functions for the chart developer. They're included as +# a dependency of application charts to inject those utilities and functions into the rendering +# pipeline. Library charts do not define any templates and therefore cannot be deployed. +type: application + +# This is the chart version. This version number should be incremented each time you make changes +# to the chart and its templates, including the app version. +# Versions are expected to follow Semantic Versioning (https://semver.org/) +version: 0.3.4 + +# This is the version number of the application being deployed. This version number should be +# incremented each time you make changes to the application. Versions are not expected to +# follow Semantic Versioning. They should reflect the version the application is using. +# It is recommended to use it with quotes. 
+appVersion: "1.0.0" diff --git a/sources/airm/0.3.4/charts/airm-dispatcher/README.md b/sources/airm/0.3.4/charts/airm-dispatcher/README.md new file mode 100644 index 00000000..0b85c706 --- /dev/null +++ b/sources/airm/0.3.4/charts/airm-dispatcher/README.md @@ -0,0 +1,54 @@ + + +# HELM CHARTS + +Simple instructions to deploy AIRM dispatcher application using helm chart. +The dispatcher can be run on a compute cluster, which may or may not be the same as the one hosting the AIRM API and UI. + +### 1. Requirements + +The following external components must be available in the Kubernetes cluster before the helm chart can be installed. + +- Accessible RabbitMQ cluster (must be the same cluster used by AIRM API). +- Kaiwo installed on the cluster (along with all its dependencies) + +### 2. Install + +``` +cd helm/airm/charts + +# 1. Create output template just to validate (the public domain could be app-dev.silogen.ai, staging.silogen.ai, etc.) +helm template airm-dispatcher ./airm-dispatcher -n airm --create-namespace > airm-dispatcher-helm-generated.yaml + +# 2. Run chart install +helm install airm-dispatcher ./airm-dispatcher -n airm --create-namespace + +# 3. Delete chart if needed +helm delete airm-dispatcher -n airm + +# 4. Upgrade when bumping versions +helm upgrade airm-dispatcher ./airm-dispatcher -n airm +``` + +--- + +### 3. 
Helm Settings + +| Field Path | Description | Type | Example / Default | +|---------------------------------------------|--------------------------------------------------------------|---------|-----------------------------------| +| airm.dispatcher.image.repository | Dispatcher image repository | string | `amdenterpriseai/airm-dispatcher` | +| airm.dispatcher.image.tag | Dispatcher image tag | string | `v2025.08-rc.21` | +| airm.dispatcher.image.pullPolicy | Dispatcher image pull policy | string | `IfNotPresent` | +| airm.dispatcher.servicePort | Dispatcher service port | int | `80` | +| airm.utilities.netcat.image.repository | Netcat image repository | string | `busybox` | +| airm.utilities.netcat.image.tag | Netcat image tag | string | `1.37.0` | +| airm.utilities.netcat.image.pullPolicy | Netcat image pull policy | string | `IfNotPresent` | +| airm.utilities.curl.image.repository | Curl image repository | string | `curlimages/curl` | +| airm.utilities.curl.image.tag | Curl image tag | string | `8.16.0` | +| airm.utilities.curl.image.pullPolicy | Curl image pull policy | string | `IfNotPresent` | +| airm.additionalClusterRoles.platformAdmin | Additional cluster roles for the Platform Administrator role | array | `[]` | +| airm.additionalClusterRoles.projectMember | Additional cluster roles for the Project Member role | array | `[]` | diff --git a/sources/airm/0.3.4/charts/airm-dispatcher/templates/airm-cluster-policies.yaml b/sources/airm/0.3.4/charts/airm-dispatcher/templates/airm-cluster-policies.yaml new file mode 100644 index 00000000..caf92aa6 --- /dev/null +++ b/sources/airm/0.3.4/charts/airm-dispatcher/templates/airm-cluster-policies.yaml @@ -0,0 +1,352 @@ +# Copyright © Advanced Micro Devices, Inc., or its affiliates. 
+# +# SPDX-License-Identifier: MIT + +apiVersion: kyverno.io/v1 +kind: ClusterPolicy +metadata: + name: airm-project-namespace-rolebinding +spec: + background: false + rules: + - name: generate-project-namespace-rolebinding + match: + any: + - resources: + kinds: + - Namespace + operations: + - CREATE + preconditions: + any: + - key: '{{`{{request.object.metadata.labels."airm.silogen.ai/project-id" || '''' }}`}}' + operator: NotEquals + value: "" + skipBackgroundRequests: true + generate: + apiVersion: rbac.authorization.k8s.io/v1 + kind: RoleBinding + name: "project-member-role-binding" + namespace: "{{`{{request.object.metadata.name}}`}}" + synchronize: true + data: + roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: airm-project-member + subjects: + - kind: Group + # Add this in for backwards compatibility + name: "oidc{{`{{request.object.metadata.name}}`}}" + apiGroup: rbac.authorization.k8s.io + - kind: Group + # The kubernetes cluster apply an OIDC prefix of 'oidc:', so we adjust the groups to expect that + name: "oidc:{{`{{request.object.metadata.name}}`}}" + apiGroup: rbac.authorization.k8s.io +--- +# Kyverno policy that enforces that workloads submitted to a namespace managed by AIRMan have the +# correct kueue lables and field set, so that they are bound by the quota of the namespace +apiVersion: kyverno.io/v1 +kind: ClusterPolicy +metadata: + name: airm-quota-enforcement-for-workloads +spec: + background: false + rules: + - name: set-queue-name-from-namespace-default + match: + resources: + kinds: + - Deployment + - StatefulSet + - Pod + namespaceSelector: + matchExpressions: + - key: airm.silogen.ai/project-id + operator: Exists + preconditions: + all: + - key: '{{`{{request.object.metadata.labels."kueue.x-k8s.io/queue-name" || '''' }}`}}' + operator: NotEquals + value: "{{`{{request.namespace }}`}}" + skipBackgroundRequests: true + mutate: + patchStrategicMerge: + metadata: + labels: + kueue.x-k8s.io/queue-name: 
"{{`{{request.namespace }}`}}" + + - name: set-queue-name-from-namespace-jobs + match: + resources: + kinds: + - Job # https://kueue.sigs.k8s.io/docs/tasks/run/jobs/ + namespaceSelector: + matchExpressions: + - key: airm.silogen.ai/project-id + operator: Exists + preconditions: + all: + - key: '{{`{{request.object.metadata.labels."kueue.x-k8s.io/queue-name" || '''' }}`}}' + operator: NotEquals + value: "{{`{{request.namespace }}`}}" + skipBackgroundRequests: true + mutate: + patchStrategicMerge: + metadata: + labels: + kueue.x-k8s.io/queue-name: "{{`{{request.namespace }}`}}" + spec: + suspend: true + + - name: set-queue-name-from-namespace-cronjobs + match: + resources: + kinds: + - CronJob # https://kueue.sigs.k8s.io/docs/tasks/run/run_cronjobs/ + namespaceSelector: + matchExpressions: + - key: airm.silogen.ai/project-id + operator: Exists + preconditions: + all: + - key: '{{`{{request.object.spec.jobTemplate.metadata.labels."kueue.x-k8s.io/queue-name" || '''' }}`}}' + operator: NotEquals + value: "{{`{{request.namespace }}`}}" + skipBackgroundRequests: true + mutate: + patchStrategicMerge: + spec: + jobTemplate: + metadata: + labels: + kueue.x-k8s.io/queue-name: "{{`{{request.namespace }}`}}" + spec: + suspend: true + + - name: set-queue-name-from-namespace-kaiwo + match: + resources: + kinds: + - KaiwoJob + - KaiwoService + - AIMService + namespaceSelector: + matchExpressions: + - key: airm.silogen.ai/project-id + operator: Exists + preconditions: + all: + - key: "{{`{{request.object.spec.clusterQueue || '' }}`}}" + operator: NotEquals + value: "{{`{{request.namespace }}`}}" + skipBackgroundRequests: true + mutate: + patchStrategicMerge: + spec: + clusterQueue: "{{`{{request.namespace }}`}}" +--- +apiVersion: kyverno.io/v1 +kind: ClusterPolicy +metadata: + name: airm-workload-tracking-policy +spec: + background: false + rules: + # For all supported types, if airm.silogen.ai/workload-id and airm.silogen.ai/component-id are not set, we assume + # it has been 
submitted from outside of AIRMan. In that case, we set airm.silogen.ai/auto-discovered: true, so it can + # be tracked upstream. We also set airm.silogen.ai/discovered-component-type so that we can identify the type of component + # that was originally tracked, and ignore children created by it. See remove-auto-discovered-annotations-inherited-from-parent + # We also try to capture the user who submitted the workload, and consume it in the application + + # Please note that ReplicaSet is not supported because by default it is filtered away by Kyverno by default: https://github.com/kyverno/kyverno/blob/main/charts/kyverno/values.yaml#L270 + - name: add-discovery-annotations-for-supported-types + match: + resources: + kinds: + - Job + - Deployment + - StatefulSet + - DaemonSet + - CronJob + - KaiwoJob + - KaiwoService + - AIMService + - Pod + namespaceSelector: + matchExpressions: + - key: airm.silogen.ai/project-id + operator: Exists + preconditions: + all: + - key: '{{`{{request.object.metadata.labels."airm.silogen.ai/workload-id" || '''' }}`}}' + operator: Equals + value: "" + - key: '{{`{{request.object.metadata.labels."airm.silogen.ai/component-id" || '''' }}`}}' + operator: Equals + value: "" + - key: '{{`{{request.object.metadata.annotations."airm.silogen.ai/auto-discovered" || '''' }}`}}' + operator: Equals + value: "" + skipBackgroundRequests: true + mutate: + patchStrategicMerge: + metadata: + annotations: + airm.silogen.ai/submitter: "{{`{{request.userInfo.username }}`}}" + airm.silogen.ai/auto-discovered: "true" + airm.silogen.ai/discovered-component-type: "{{`{{request.object.kind }}`}}" + # For all supported types, if airm.silogen.ai/auto-discovered is set and the airm.silogen.ai/discovered-component-type + # doesnt match the kind of the current component, we assume this type has been created by a parent which is also + # supported by AIRMan and we dont need to track this type upstream, so we unset the airm.silogen.ai/auto-discovered annotation. 
+ # This is mostly to account for KaiwoJob, KaiwoService, AIMService which propagate annotations to pods. + - name: remove-auto-discovered-annotations-inherited-from-parent + match: + resources: + kinds: + - Job + - Deployment + - StatefulSet + - DaemonSet + - CronJob + - KaiwoJob + - KaiwoService + - AIMService + - Pod + namespaceSelector: + matchExpressions: + - key: airm.silogen.ai/project-id + operator: Exists + preconditions: + all: + - key: '{{`{{request.object.metadata.annotations."airm.silogen.ai/auto-discovered" || '''' }}`}}' + operator: Equals + value: "true" + - key: '{{`{{request.object.metadata.annotations."airm.silogen.ai/discovered-component-type" || '''' }}`}}' + operator: NotEquals + value: "{{`{{request.object.kind }}`}}" + skipBackgroundRequests: true + mutate: + patchStrategicMerge: + metadata: + annotations: + airm.silogen.ai/auto-discovered: "false" + # For all supported types, if airm.silogen.ai/project-id does not match that of the namespace label, overwrite it + # with the expected value, to avoid metrics getting mixed up between projects. 
+ - name: set-project-id-from-namespace-label + match: + resources: + kinds: + - Job + - Deployment + - StatefulSet + - DaemonSet + - CronJob + - KaiwoJob + - KaiwoService + - AIMService + - Pod + namespaceSelector: + matchExpressions: + - key: airm.silogen.ai/project-id + operator: Exists + context: + - name: ns_labels + apiCall: + urlPath: "/api/v1/namespaces/{{`{{request.namespace }}`}}" + method: GET + jmesPath: "metadata.labels" + preconditions: + all: + - key: '{{`{{request.object.metadata.labels."airm.silogen.ai/project-id" || '''' }}`}}' + operator: NotEquals + value: '{{`{{ns_labels."airm.silogen.ai/project-id" }}`}}' + skipBackgroundRequests: true + mutate: + patchStrategicMerge: + metadata: + labels: + airm.silogen.ai/project-id: '{{`{{ns_labels."airm.silogen.ai/project-id" }}`}}' + # Add the airm.silogen.ai/workload-id and airm.silogen.ai/component-id labels to types that expect them at + # metadata.labels. The expectation is that these are propagated to the subsequent pods that are created. + + # If the resource is spawned off by a CRD, it will not know about the labels on the previous version of the object, + # so we also check request.oldObject for the labels to try and preserve them if they were already set. 
+ - name: add-workload-and-component-id-default + match: + resources: + kinds: + - Pod + - KaiwoJob + - KaiwoService + - AIMService + namespaceSelector: + matchExpressions: + - key: airm.silogen.ai/project-id + operator: Exists + skipBackgroundRequests: true + mutate: + patchStrategicMerge: + metadata: + labels: + airm.silogen.ai/workload-id: '{{`{{request.object.metadata.labels."airm.silogen.ai/workload-id" || request.oldObject.metadata.labels."airm.silogen.ai/workload-id" || request.uid }}`}}' + airm.silogen.ai/component-id: '{{`{{request.object.metadata.labels."airm.silogen.ai/component-id" || request.oldObject.metadata.labels."airm.silogen.ai/component-id" || request.uid }}`}}' + # Add the airm.silogen.ai/workload-id and airm.silogen.ai/component-id labels to objects with templates and + # also add it to spec.template.metadata.labels to ensure that the pods created by them contain the labels as well + - name: add-workload-and-component-id-to-objects-with-template + match: + resources: + kinds: + - Job + - Deployment + - StatefulSet + - DaemonSet + namespaceSelector: + matchExpressions: + - key: airm.silogen.ai/project-id + operator: Exists + skipBackgroundRequests: true + mutate: + patchStrategicMerge: + metadata: + labels: + airm.silogen.ai/workload-id: '{{`{{request.object.metadata.labels."airm.silogen.ai/workload-id" || request.oldObject.metadata.labels."airm.silogen.ai/workload-id" || request.uid }}`}}' + airm.silogen.ai/component-id: '{{`{{request.object.metadata.labels."airm.silogen.ai/component-id" || request.oldObject.metadata.labels."airm.silogen.ai/component-id" || request.uid }}`}}' + spec: + template: + metadata: + labels: + airm.silogen.ai/workload-id: '{{`{{request.object.metadata.labels."airm.silogen.ai/workload-id" || request.oldObject.metadata.labels."airm.silogen.ai/workload-id" || request.uid }}`}}' + airm.silogen.ai/component-id: '{{`{{request.object.metadata.labels."airm.silogen.ai/component-id" || 
request.oldObject.metadata.labels."airm.silogen.ai/component-id" || request.uid }}`}}' + + # Add the airm.silogen.ai/workload-id and airm.silogen.ai/component-id labels to cronjob and + # also add it to spec.jobTemplate.metadata.labels to ensure that the pods created by the cronjob + # contain it as well + - name: add-workload-and-component-id-cronjobs + match: + resources: + kinds: + - CronJob + namespaceSelector: + matchExpressions: + - key: airm.silogen.ai/project-id + operator: Exists + skipBackgroundRequests: true + mutate: + patchStrategicMerge: + metadata: + labels: + airm.silogen.ai/workload-id: '{{`{{request.object.metadata.labels."airm.silogen.ai/workload-id" || request.oldObject.metadata.labels."airm.silogen.ai/workload-id" || request.uid }}`}}' + airm.silogen.ai/component-id: '{{`{{request.object.metadata.labels."airm.silogen.ai/component-id" || request.oldObject.metadata.labels."airm.silogen.ai/component-id" || request.uid }}`}}' + spec: + jobTemplate: + metadata: + labels: + airm.silogen.ai/workload-id: '{{`{{request.object.metadata.labels."airm.silogen.ai/workload-id" || request.oldObject.metadata.labels."airm.silogen.ai/workload-id" || request.uid }}`}}' + airm.silogen.ai/component-id: '{{`{{request.object.metadata.labels."airm.silogen.ai/component-id" || request.oldObject.metadata.labels."airm.silogen.ai/component-id" || request.uid }}`}}' + spec: + template: + metadata: + labels: + airm.silogen.ai/workload-id: '{{`{{request.object.metadata.labels."airm.silogen.ai/workload-id" || request.oldObject.metadata.labels."airm.silogen.ai/workload-id" || request.uid }}`}}' + airm.silogen.ai/component-id: '{{`{{request.object.metadata.labels."airm.silogen.ai/component-id" || request.oldObject.metadata.labels."airm.silogen.ai/component-id" || request.uid }}`}}' diff --git a/sources/airm/0.3.4/charts/airm-dispatcher/templates/airm-cluster-roles.yaml b/sources/airm/0.3.4/charts/airm-dispatcher/templates/airm-cluster-roles.yaml new file mode 100644 index 
00000000..2461e894 --- /dev/null +++ b/sources/airm/0.3.4/charts/airm-dispatcher/templates/airm-cluster-roles.yaml @@ -0,0 +1,164 @@ +# Copyright © Advanced Micro Devices, Inc., or its affiliates. +# +# SPDX-License-Identifier: MIT + +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: airm-platform-admin +rules: + - apiGroups: [""] + resources: + [ + "pods", + "events", + "services", + "configmaps", + "persistentvolumes", + "persistentvolumeclaims", + "namespaces", + "serviceaccounts", + ] + verbs: ["get", "list", "watch", "delete"] + - apiGroups: [""] + resources: ["pods/log", "pods/exec", "pods/attach", "pods/portforward"] + verbs: ["*"] + - apiGroups: ["apps"] + resources: ["deployments", "deployments/scale", "replicasets", "statefulsets", "daemonsets"] + verbs: ["get", "list", "watch", "delete"] + - apiGroups: ["batch"] + resources: ["jobs", "cronjobs"] + verbs: ["get", "list", "watch", "delete"] + - apiGroups: ["networking.k8s.io"] + resources: ["ingresses", "networkpolicies"] + verbs: ["get", "list", "watch", "delete"] + - apiGroups: ["kaiwo.silogen.ai"] + resources: ["kaiwojobs", "kaiwoservices"] + verbs: ["get", "list", "watch", "delete"] + - apiGroups: ["aim.silogen.ai"] + resources: ["aimservices", "aimmodels", "aimmodelcaches", "aimservicetemplates", "aimruntimeconfigs", "aimtemplatecaches"] + verbs: ["get", "list", "watch", "delete"] + - apiGroups: ["gateway.networking.k8s.io"] + resources: ["httproutes", "gateways"] + verbs: ["get", "list", "watch", "delete"] + - apiGroups: ["config.kaiwo.silogen.ai"] + resources: ["kaiwoconfigs"] + verbs: ["*"] + - apiGroups: ["kaiwo.silogen.ai"] + resources: ["kaiwoqueueconfigs"] + verbs: ["*"] + - apiGroups: ["aim.silogen.ai"] + resources: [ "aimclustermodels", "aimclusterservicetemplates", "aimclusterruntimeconfigs", "aimclustermodelsources" ] + verbs: ["*"] + - apiGroups: [""] + resources: ["secrets"] + verbs: ["get", "list", "watch", "create", "delete"] + - apiGroups: [""] + resources: 
["nodes"] + verbs: ["get", "list", "watch"] + - apiGroups: ["rbac.authorization.k8s.io"] + resources: ["roles", "clusterroles", "rolebindings", "clusterrolebindings"] + verbs: ["get", "list", "watch"] + - apiGroups: ["external-secrets.io"] + resources: ["externalsecrets"] + verbs: ["get", "list", "watch", "create", "delete"] + - apiGroups: ["external-secrets.io"] + resources: ["clustersecretstores"] + verbs: ["get", "list", "watch"] + - apiGroups: ["storage.k8s.io"] + resources: ["storageclasses"] + verbs: ["get", "list", "watch"] + - apiGroups: ["kueue.x-k8s.io"] + resources: ["clusterqueues", "resourceflavors", "localqueues", "workloadpriorityclasses"] + verbs: ["get", "list", "watch"] + - apiGroups: ["apiextensions.k8s.io"] + resources: ["customresourcedefinitions"] + verbs: ["get", "list"] +{{- range .Values.airm.additionalClusterRoles.platformAdmin }} + - apiGroups: + {{ .apiGroups | toYaml | nindent 6 }} + resources: + {{ .resources | toYaml | nindent 6 }} + verbs: + {{ .verbs | toYaml | nindent 6 }} +{{- end }} +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: airm-platform-admin-role-binding +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: airm-platform-admin +subjects: + - kind: Group + # Add this in for backwards compatibility + name: "oidcairm-role:Platform Administrator" + apiGroup: rbac.authorization.k8s.io + - kind: Group + # The kubernetes cluster applies an OIDC prefix of 'oidc:', so we adjust the group to expect that + name: "oidc:airm-role:Platform Administrator" + apiGroup: rbac.authorization.k8s.io +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: airm-project-member +rules: + - apiGroups: [""] + resources: + [ + "pods", + "pods/log", + "pods/exec", + "pods/attach", + "pods/portforward", + "events", + "services", + "configmaps", + "persistentvolumes", + "persistentvolumeclaims", + ] + verbs: ["*"] + - apiGroups: ["apps"] + resources: 
["deployments", "replicasets", "statefulsets", "daemonsets"] + verbs: ["*"] + - apiGroups: ["batch"] + resources: ["jobs", "cronjobs"] + verbs: ["*"] + - apiGroups: ["networking.k8s.io"] + resources: ["ingresses", "networkpolicies"] + verbs: ["*"] + - apiGroups: ["kaiwo.silogen.ai"] + resources: ["kaiwojobs", "kaiwoservices"] + verbs: ["*"] + - apiGroups: ["aim.silogen.ai"] + resources: ["aimservices", "aimmodels", "aimmodelcaches", "aimservicetemplates", "aimruntimeconfigs", "aimtemplatecaches"] + verbs: ["*"] + - apiGroups: ["gateway.networking.k8s.io"] + resources: ["httproutes", "gateways"] + verbs: ["*"] + - apiGroups: [""] + resources: ["secrets"] + verbs: ["list", "watch", "create"] + - apiGroups: [""] + resources: ["nodes"] + verbs: ["get", "list", "watch"] + - apiGroups: ["rbac.authorization.k8s.io"] + resources: ["roles", "clusterroles", "rolebindings", "clusterrolebindings"] + verbs: ["get", "list", "watch"] + - apiGroups: ["external-secrets.io"] + resources: ["clustersecretstores", "externalsecrets"] + verbs: ["get", "list", "watch"] + - apiGroups: ["storage.k8s.io"] + resources: ["storageclasses"] + verbs: ["get", "list", "watch"] +{{- range .Values.airm.additionalClusterRoles.projectMember }} + - apiGroups: + {{ .apiGroups | toYaml | nindent 6 }} + resources: + {{ .resources | toYaml | nindent 6 }} + verbs: + {{ .verbs | toYaml | nindent 6 }} +{{- end }} diff --git a/sources/airm/0.3.4/charts/airm-dispatcher/templates/airm-dispatcher-app.yaml b/sources/airm/0.3.4/charts/airm-dispatcher/templates/airm-dispatcher-app.yaml new file mode 100644 index 00000000..8a3489ef --- /dev/null +++ b/sources/airm/0.3.4/charts/airm-dispatcher/templates/airm-dispatcher-app.yaml @@ -0,0 +1,343 @@ +# Copyright © Advanced Micro Devices, Inc., or its affiliates. 
+# +# SPDX-License-Identifier: MIT + +--- +apiVersion: batch/v1 +kind: CronJob +metadata: + name: "{{ .Release.Name }}-cluster-nodes-dispatcher" + namespace: "{{ .Release.Namespace }}" +spec: + concurrencyPolicy: Replace + jobTemplate: + spec: + template: + spec: + containers: + - command: + - curl + - -X + - POST + - http://{{ .Release.Name }}-dispatcher:{{ .Values.airm.dispatcher.servicePort }}/v1/clusters/nodes + image: "{{ .Values.airm.utilities.curl.image.repository }}:{{ .Values.airm.utilities.curl.image.tag }}" + imagePullPolicy: "{{ .Values.airm.utilities.curl.image.pullPolicy }}" + name: airm-cluster-nodes-cron + resources: + limits: + memory: 100Mi + requests: + cpu: 50m + memory: 100Mi + restartPolicy: OnFailure + schedule: 0 * * * * +--- +apiVersion: batch/v1 +kind: CronJob +metadata: + name: "{{ .Release.Name }}-aim-models-dispatcher" + namespace: "{{ .Release.Namespace }}" +spec: + concurrencyPolicy: Forbid + schedule: "*/5 * * * *" + suspend: false + jobTemplate: + spec: + template: + metadata: + annotations: + sidecar.istio.io/inject: "false" + spec: + restartPolicy: OnFailure + initContainers: + - name: check-dispatcher-is-ready + command: + [ + "/bin/sh", + "-c", + "while ! 
nc -z \"$ENDPOINT_URL_TO_CHECK\" \"$ENDPOINT_PORT_TO_CHECK\"; do echo \"Waiting for Airm Dispatcher at ${ENDPOINT_URL_TO_CHECK}:${ENDPOINT_PORT_TO_CHECK}...\"; sleep 3; done; echo \"Airm Dispatcher is accepting connections at ${ENDPOINT_URL_TO_CHECK}:${ENDPOINT_PORT_TO_CHECK}.\"; sleep 3; exit 0" + ] + image: "{{ .Values.airm.utilities.netcat.image.repository }}:{{ .Values.airm.utilities.netcat.image.tag }}" + imagePullPolicy: "{{ .Values.airm.utilities.netcat.image.pullPolicy }}" + env: + - name: ENDPOINT_URL_TO_CHECK + value: "{{ .Release.Name }}-dispatcher.{{ .Release.Namespace }}.svc.cluster.local" + - name: ENDPOINT_PORT_TO_CHECK + value: "{{ .Values.airm.dispatcher.servicePort }}" + securityContext: + allowPrivilegeEscalation: false + runAsNonRoot: false + runAsUser: 0 + capabilities: + drop: + - ALL + add: + - SETUID + - SETGID + - CHOWN + - DAC_OVERRIDE + - FOWNER + - FSETID + seccompProfile: + type: RuntimeDefault + containers: + - name: airm-aim-models-cron + command: + - curl + - -X + - POST + - http://{{ .Release.Name }}-dispatcher:{{ .Values.airm.dispatcher.servicePort }}/v1/aims/cluster-models + image: "{{ .Values.airm.utilities.curl.image.repository }}:{{ .Values.airm.utilities.curl.image.tag }}" + imagePullPolicy: "{{ .Values.airm.utilities.curl.image.pullPolicy }}" + resources: + requests: + memory: "100Mi" + cpu: "50m" + limits: + memory: "100Mi" +--- +apiVersion: batch/v1 +kind: CronJob +metadata: + name: "{{ .Release.Name }}-heartbeat-dispatcher" + namespace: "{{ .Release.Namespace }}" +spec: + concurrencyPolicy: Replace + schedule: "*/1 * * * *" + suspend: false + jobTemplate: + spec: + template: + metadata: + annotations: + sidecar.istio.io/inject: "false" + spec: + restartPolicy: OnFailure + initContainers: + - name: check-dispatcher-is-ready + command: + [ + "/bin/sh", + "-c", + "while ! 
nc -z \"$ENDPOINT_URL_TO_CHECK\" \"$ENDPOINT_PORT_TO_CHECK\"; do echo \"Waiting for Airm Dispatcher at ${ENDPOINT_URL_TO_CHECK}:${ENDPOINT_PORT_TO_CHECK}...\"; sleep 3; done; echo \"Airm Dispatcher is accepting connections at ${ENDPOINT_URL_TO_CHECK}:${ENDPOINT_PORT_TO_CHECK}.\"; sleep 3; exit 0" + ] + image: "{{ .Values.airm.utilities.netcat.image.repository }}:{{ .Values.airm.utilities.netcat.image.tag }}" + imagePullPolicy: "{{ .Values.airm.utilities.netcat.image.pullPolicy }}" + env: + - name: ENDPOINT_URL_TO_CHECK + value: "{{ .Release.Name }}-dispatcher.{{ .Release.Namespace }}.svc.cluster.local" + - name: ENDPOINT_PORT_TO_CHECK + value: "{{ .Values.airm.dispatcher.servicePort }}" + securityContext: + allowPrivilegeEscalation: false + runAsNonRoot: false + runAsUser: 0 + capabilities: + drop: + - ALL + add: + - SETUID + - SETGID + - CHOWN + - DAC_OVERRIDE + - FOWNER + - FSETID + seccompProfile: + type: RuntimeDefault + containers: + - name: airm-heartbeat-cron + command: + - curl + - -X + - POST + - http://{{ .Release.Name }}-dispatcher:{{ .Values.airm.dispatcher.servicePort }}/v1/heartbeats + image: "{{ .Values.airm.utilities.curl.image.repository }}:{{ .Values.airm.utilities.curl.image.tag }}" + imagePullPolicy: "{{ .Values.airm.utilities.curl.image.pullPolicy }}" + resources: + requests: + memory: "100Mi" + cpu: "50m" + limits: + memory: "100Mi" +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: "{{ .Release.Name }}-dispatcher" + namespace: "{{ .Release.Namespace }}" +spec: + replicas: 1 + selector: + matchLabels: + app: "{{ .Release.Name }}-dispatcher" + template: + metadata: + labels: + app: "{{ .Release.Name }}-dispatcher" + spec: + serviceAccountName: "{{ .Release.Name }}-dispatcher-sa" + {{- with .Values.airm.dispatcher.imagePullSecrets }} + imagePullSecrets: + {{- toYaml . | nindent 8 }} + {{- end }} + initContainers: + - name: check-rabbitmq-is-ready + command: + [ + "/bin/sh", + "-c", + "while ! 
nc -z \"$ENDPOINT_URL_TO_CHECK\" \"$ENDPOINT_PORT_TO_CHECK\"; do echo \"Waiting for Airm RabbitMQ at ${ENDPOINT_URL_TO_CHECK}:${ENDPOINT_PORT_TO_CHECK}...\"; sleep 3; done; echo \"Airm RabbitMQ is accepting connections at ${ENDPOINT_URL_TO_CHECK}:${ENDPOINT_PORT_TO_CHECK}.\"; sleep 3; exit 0" + ] + image: "{{ .Values.airm.utilities.netcat.image.repository }}:{{ .Values.airm.utilities.netcat.image.tag }}" + imagePullPolicy: "{{ .Values.airm.utilities.netcat.image.pullPolicy }}" + env: + - name: ENDPOINT_URL_TO_CHECK + value: "{{ .Release.Name }}-rabbitmq.{{ .Release.Namespace }}.svc.cluster.local" + - name: ENDPOINT_PORT_TO_CHECK + value: "{{ .Values.airm.dispatcher.env.rabbitmqPort }}" + securityContext: + allowPrivilegeEscalation: false + runAsNonRoot: false + runAsUser: 0 + capabilities: + drop: + - ALL + add: + - SETUID + - SETGID + - CHOWN + - DAC_OVERRIDE + - FOWNER + - FSETID + seccompProfile: + type: RuntimeDefault + containers: + - name: airm-dispatcher + image: "{{ .Values.airm.dispatcher.image.repository }}:{{ .Values.airm.dispatcher.image.tag }}" + imagePullPolicy: "{{ .Values.airm.dispatcher.image.pullPolicy }}" + ports: + - containerPort: 8080 + env: + - name: KUBE_CLUSTER_NAME + value: demo-cluster + - name: ORG_NAME + value: demo + - name: RABBITMQ_HOST + value: "{{ .Release.Name }}-rabbitmq.{{ .Release.Namespace }}.svc.cluster.local" + - name: RABBITMQ_PORT + value: "{{ .Values.airm.dispatcher.env.rabbitmqPort }}" + - name: RABBITMQ_AIRM_COMMON_VHOST + value: "vh_airm_common" + - name: RABBITMQ_AIRM_COMMON_QUEUE + value: "airm_common" + - name: RABBITMQ_USER + valueFrom: + secretKeyRef: + name: "{{ .Release.Name }}-rabbitmq-common-vhost-user" + key: username + - name: RABBITMQ_PASSWORD + valueFrom: + secretKeyRef: + name: "{{ .Release.Name }}-rabbitmq-common-vhost-user" + key: password + livenessProbe: + httpGet: + path: /v1/health + port: 8080 + initialDelaySeconds: 10 + periodSeconds: 5 + timeoutSeconds: 2 + failureThreshold: 3 + successThreshold: 
1 + readinessProbe: + httpGet: + path: /v1/health + port: 8080 + initialDelaySeconds: 10 + periodSeconds: 5 + timeoutSeconds: 2 + failureThreshold: 3 + successThreshold: 1 + resources: + requests: + memory: "1Gi" + cpu: "500m" + limits: + memory: "1Gi" + securityContext: + runAsUser: 0 +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: "{{ .Release.Name }}-dispatcher-cluster-access-binding" +subjects: + - kind: ServiceAccount + name: "{{ .Release.Name }}-dispatcher-sa" + namespace: "{{ .Release.Namespace }}" +roleRef: + kind: ClusterRole + name: "{{ .Release.Name }}-dispatcher-cluster-access-role" + apiGroup: rbac.authorization.k8s.io +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: "{{ .Release.Name }}-dispatcher-cluster-access-role" +rules: + - apiGroups: [""] + resources: ["services", "namespaces", "configmaps", "pods"] + verbs: ["get", "list", "watch", "create", "delete", "update", "patch"] + - apiGroups: [""] + resources: ["secrets"] + verbs: ["get", "list", "watch", "create", "delete", "update", "patch"] + - apiGroups: [""] + resources: ["nodes"] + verbs: ["get", "list", "watch"] + - apiGroups: ["apps"] + resources: ["deployments", "statefulsets", "daemonsets"] + verbs: ["get", "list", "watch", "create", "delete", "update", "patch"] + - apiGroups: ["batch"] + resources: ["jobs", "cronjobs"] + verbs: ["get", "list", "watch", "create", "delete", "update", "patch"] + - apiGroups: ["kaiwo.silogen.ai"] + resources: ["kaiwojobs", "kaiwoservices", "kaiwoqueueconfigs"] + verbs: ["get", "list", "watch", "create", "delete", "update", "patch"] + - apiGroups: ["gateway.networking.k8s.io"] + resources: ["httproutes"] + verbs: ["get", "list", "watch", "create", "delete", "update", "patch"] + - apiGroups: ["networking.k8s.io"] + resources: ["ingresses"] + verbs: ["get", "list", "watch", "create", "delete", "update", "patch"] + - apiGroups: ["external-secrets.io"] + resources: ["externalsecrets"] + 
verbs: ["get", "list", "watch", "create", "delete", "update", "patch"] + - apiGroups: ["aim.silogen.ai"] + resources: ["aimservices", "aimclustermodels"] + verbs: ["get", "list", "watch", "create", "delete", "update", "patch"] + - apiGroups: ["apiextensions.k8s.io"] + resources: ["customresourcedefinitions"] + verbs: ["get", "list"] +--- +apiVersion: v1 +kind: ServiceAccount +metadata: + name: "{{ .Release.Name }}-dispatcher-sa" + namespace: "{{ .Release.Namespace }}" + +--- +apiVersion: v1 +kind: Service +metadata: + name: "{{ .Release.Name }}-dispatcher" + namespace: "{{ .Release.Namespace }}" + labels: + app: "{{ .Release.Name }}-dispatcher" +spec: + ports: + - name: web + port: {{ .Values.airm.dispatcher.servicePort }} + targetPort: 8080 + type: ClusterIP + selector: + app: "{{ .Release.Name }}-dispatcher" diff --git a/sources/airm/0.3.4/charts/airm-dispatcher/templates/kyverno-cluster-role.yaml b/sources/airm/0.3.4/charts/airm-dispatcher/templates/kyverno-cluster-role.yaml new file mode 100644 index 00000000..e930efd0 --- /dev/null +++ b/sources/airm/0.3.4/charts/airm-dispatcher/templates/kyverno-cluster-role.yaml @@ -0,0 +1,35 @@ +# Copyright © Advanced Micro Devices, Inc., or its affiliates. +# +# SPDX-License-Identifier: MIT + +# These are additional cluster roles needed by kyverno background controller to be able to +# create rolebindings in namespaces. 
+apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: kyverno:airm-policy-roles + labels: + rbac.kyverno.io/aggregate-to-background-controller: "true" +rules: + - apiGroups: ["rbac.authorization.k8s.io"] + resources: ["clusterroles", "rolebindings"] + verbs: ["get", "list", "watch"] + - apiGroups: ["rbac.authorization.k8s.io"] # allow kyverno to bind clusterroles via rolebindings + resources: ["clusterroles"] + verbs: ["bind"] +--- +# These are additional cluster roles needed by kyverno reports controller to be able to +# manage custom resources for reporting +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: kyverno:airm-reports-policy-roles + labels: + rbac.kyverno.io/aggregate-to-reports-controller: "true" +rules: + - apiGroups: ["kaiwo.silogen.ai"] + resources: ["kaiwojobs", "kaiwoservices"] + verbs: ["get", "list", "watch"] + - apiGroups: ["aim.silogen.ai"] + resources: ["aimservices"] + verbs: ["get", "list", "watch"] diff --git a/sources/airm/0.3.4/charts/airm-dispatcher/values.yaml b/sources/airm/0.3.4/charts/airm-dispatcher/values.yaml new file mode 100644 index 00000000..acb0eb0f --- /dev/null +++ b/sources/airm/0.3.4/charts/airm-dispatcher/values.yaml @@ -0,0 +1,27 @@ +# Copyright © Advanced Micro Devices, Inc., or its affiliates. 
+# +# SPDX-License-Identifier: MIT + +airm: + dispatcher: + image: + repository: amdenterpriseai/airm-dispatcher + tag: 0.3.4 + pullPolicy: IfNotPresent + servicePort: 80 + env: + rabbitmqPort: 5672 + utilities: + netcat: + image: + repository: busybox + tag: 1.37.0 + pullPolicy: IfNotPresent + curl: + image: + repository: curlimages/curl + tag: 8.16.0 + pullPolicy: IfNotPresent + additionalClusterRoles: + platformAdmin: [] + projectMember: [] diff --git a/sources/airm/0.3.4/values.yaml b/sources/airm/0.3.4/values.yaml new file mode 100644 index 00000000..69346880 --- /dev/null +++ b/sources/airm/0.3.4/values.yaml @@ -0,0 +1,3 @@ +# Copyright © Advanced Micro Devices, Inc., or its affiliates. +# +# SPDX-License-Identifier: MIT diff --git a/sources/airm/0.3.5/.helmignore b/sources/airm/0.3.5/.helmignore new file mode 100644 index 00000000..64c2af83 --- /dev/null +++ b/sources/airm/0.3.5/.helmignore @@ -0,0 +1,27 @@ +# Copyright © Advanced Micro Devices, Inc., or its affiliates. +# +# SPDX-License-Identifier: MIT + +# Patterns to ignore when building packages. +# This supports shell glob matching, relative path matching, and +# negation (prefixed with !). Only one pattern per line. +.DS_Store +# Common VCS dirs +.git/ +.gitignore +.bzr/ +.bzrignore +.hg/ +.hgignore +.svn/ +# Common backup files +*.swp +*.bak +*.tmp +*.orig +*~ +# Various IDEs +.project +.idea/ +*.tmproj +.vscode/ diff --git a/sources/airm/0.3.5/Chart.yaml b/sources/airm/0.3.5/Chart.yaml new file mode 100644 index 00000000..ea8fa28e --- /dev/null +++ b/sources/airm/0.3.5/Chart.yaml @@ -0,0 +1,35 @@ +# Copyright © Advanced Micro Devices, Inc., or its affiliates. +# +# SPDX-License-Identifier: MIT + +--- +apiVersion: v2 +name: airm +description: A Helm chart for AIRM full stack, including API, UI and dispatcher + +# A chart can be either an 'application' or a 'library' chart. +# +# Application charts are a collection of templates that can be packaged into versioned archives +# to be deployed. 
+# +# Library charts provide useful utilities or functions for the chart developer. They're included as +# a dependency of application charts to inject those utilities and functions into the rendering +# pipeline. Library charts do not define any templates and therefore cannot be deployed. +type: application + +# This is the chart version. This version number should be incremented each time you make changes +# to the chart and its templates, including the app version. +# Versions are expected to follow Semantic Versioning (https://semver.org/) +version: 0.3.5 + +# This is the version number of the application being deployed. This version number should be +# incremented each time you make changes to the application. Versions are not expected to +# follow Semantic Versioning. They should reflect the version the application is using. +# It is recommended to use it with quotes. +appVersion: "1.0.0" + +dependencies: + - name: airm-api + version: 0.3.5 + - name: airm-dispatcher + version: 0.3.5 diff --git a/sources/airm/0.3.5/charts/airm-api/.helmignore b/sources/airm/0.3.5/charts/airm-api/.helmignore new file mode 100644 index 00000000..64c2af83 --- /dev/null +++ b/sources/airm/0.3.5/charts/airm-api/.helmignore @@ -0,0 +1,27 @@ +# Copyright © Advanced Micro Devices, Inc., or its affiliates. +# +# SPDX-License-Identifier: MIT + +# Patterns to ignore when building packages. +# This supports shell glob matching, relative path matching, and +# negation (prefixed with !). Only one pattern per line. 
+.DS_Store +# Common VCS dirs +.git/ +.gitignore +.bzr/ +.bzrignore +.hg/ +.hgignore +.svn/ +# Common backup files +*.swp +*.bak +*.tmp +*.orig +*~ +# Various IDEs +.project +.idea/ +*.tmproj +.vscode/ diff --git a/sources/airm/0.3.5/charts/airm-api/Chart.yaml b/sources/airm/0.3.5/charts/airm-api/Chart.yaml new file mode 100644 index 00000000..d8cf0c72 --- /dev/null +++ b/sources/airm/0.3.5/charts/airm-api/Chart.yaml @@ -0,0 +1,29 @@ +# Copyright © Advanced Micro Devices, Inc., or its affiliates. +# +# SPDX-License-Identifier: MIT + +--- +apiVersion: v2 +name: airm-api +description: A Helm chart for AIRM API and UI + +# A chart can be either an 'application' or a 'library' chart. +# +# Application charts are a collection of templates that can be packaged into versioned archives +# to be deployed. +# +# Library charts provide useful utilities or functions for the chart developer. They're included as +# a dependency of application charts to inject those utilities and functions into the rendering +# pipeline. Library charts do not define any templates and therefore cannot be deployed. +type: application + +# This is the chart version. This version number should be incremented each time you make changes +# to the chart and its templates, including the app version. +# Versions are expected to follow Semantic Versioning (https://semver.org/) +version: 0.3.5 + +# This is the version number of the application being deployed. This version number should be +# incremented each time you make changes to the application. Versions are not expected to +# follow Semantic Versioning. They should reflect the version the application is using. +# It is recommended to use it with quotes. 
+appVersion: "1.0.0" diff --git a/sources/airm/0.3.5/charts/airm-api/README.md b/sources/airm/0.3.5/charts/airm-api/README.md new file mode 100644 index 00000000..a16ec9da --- /dev/null +++ b/sources/airm/0.3.5/charts/airm-api/README.md @@ -0,0 +1,124 @@ + + +# HELM CHARTS + +Simple instructions to deploy AIRM UI and API applications using helm chart + +### 1. Requirements + +The following external components must be available in the Kubernetes cluster before the helm chart can be installed. + +- API Gateway implementation (e.g. KGateway) +- Keycloak with the expected `airm` realm installed +- Valid S3 compatible file storage service (e.g. MinIO) +- RabbitMQ operator +- Cert Manager operator +- External Secret operator +- CNPG operator +- OTEL LGTM stack installed on the cluster + +### 2. Install + +``` +cd helm/airm/charts + +# 1. Create output template just to validate (the public domain could be app-dev.silogen.ai, staging.silogen.ai, etc.) +helm template airm-api ./airm-api -n airm --create-namespace --set airm.appDomain= > airm-api-helm-generated.yaml + +# 2. Run chart install +helm install airm-api ./airm-api -n airm --create-namespace --set airm.appDomain= + +# 3. Delete chart if needed +helm delete airm-api -n airm + +# 4. Upgrade when bumping versions +helm upgrade -n airm --set airm.appDomain= airm-api ./airm-api +``` + +--- + +### 3. 
Helm Settings + +| Field Path | Description | Type | Example / Default | +|-------------------------------------------------------------------------------|-----------------------------------------------------------------| ------ |---------------------------------------------------------------------------------------------------| +| secretgenerator.image.repository | Docker image repository for secret generator | string | `ghcr.io/silogen/kubectl` | +| secretgenerator.image.tag | Docker image tag | string | `latest` | +| secretgenerator.image.pullPolicy | Image pull policy | string | `IfNotPresent` | +| kgateway.namespace | Namespace for kgateway resources | string | `kgateway-system` | +| kgateway.gatewayName | Gateway name | string | `https` | +| kgateway.airmapi.servicePort | Service port for airmapi | int | `80` | +| kgateway.airmapi.prefixValue | URL prefix for airmapi service | string | `airmapi` | +| kgateway.airmui.servicePort | Service port for airmui | int | `80` | +| kgateway.airmui.prefixValue | URL prefix for airmui service | string | `airmui` | +| aims.otelCollector.exporters.otlphttp.endpoint | Open Telemetry collector endpoint url for inference metrics | string | `http://lgtm-stack.otel-lgtm-stack.svc:4318` | +| aims.otelCollector.image | Base image for Open Telemetry Collector | string | `ghcr.io/open-telemetry/opentelemetry-collector-releases/opentelemetry-collector-contrib:0.113.0` | +| aims.otelCollector.receivers.prometheus.config.scrape_configs.scrape_interval | Inference metrics scraping interval | string | `20s` | +| airm.includeDemoSetup | Include the demo organization and project setup when installing | bool | `true` | +| airm.appDomain | Public IP or domain for airm | string | `PUBLIC-IP` | +| airm.externalSecretStore.airm.name | Secret store name for airm | string | `airm-secret-store` | +| airm.externalSecretStore.minio.name | Secret store name for minio | string | `k8s-secret-store` | +| airm.externalSecretStore.keycloak.name | Secret 
store name for keycloak | string | `keycloak-secret-store` | +| airm.keycloak.publicUrl | Public URL to access keycloak | string | `https://kc.{{ .Values.airm.appDomain }}` | +| airm.keycloak.internalUrl | Internal URL to access keycloak | string | `http://keycloak.keycloak.svc.cluster.local:8080` | +| airm.keycloak.clientId | Client ID to access keycloak | string | `354a0fa1-35ac-4a6d-9c4d-d661129c2cd0` | +| airm.keycloak.realm | Keycloak realm for authentication | string | `airm` | +| airm.postgresql.cnpg.image | PostgreSQL container image | string | `ghcr.io/cloudnative-pg/postgresql:17` | +| airm.postgresql.cnpg.instance | Number of PostgreSQL instances | int | `1` | +| airm.postgresql.cnpg.resources.limits.cpu | CPU limit for PostgreSQL container | string | `"2"` | +| airm.postgresql.cnpg.resources.limits.memory | Memory limit for PostgreSQL container | string | `1Gi` | +| airm.postgresql.cnpg.resources.requests.cpu | CPU request for PostgreSQL container | string | `"1"` | +| airm.postgresql.cnpg.resources.requests.memory | Memory request for PostgreSQL container | string | `512Mi` | +| airm.postgresql.cnpg.storage.size | Storage size for PostgreSQL | string | `50Gi` | +| airm.postgresql.cnpg.storage.storageClass | Storage class for PostgreSQL | string | `default` | +| airm.postgresql.cnpg.walStorage.size | WAL storage size for PostgreSQL | string | `50Gi` | +| airm.postgresql.cnpg.walStorage.storageClass | WAL storage class for PostgreSQL | string | `default` | +| airm.rabbitmq.replicas | Number of replicas for the RabbitMQ cluster | int | `1` | +| airm.rabbitmq.resources.limits.cpu | CPU limit for for the RabbitMQ cluster | string | `1` | +| airm.rabbitmq.resources.limits.memory | Memory limit for for the RabbitMQ cluster | string | `1Gi` | +| airm.rabbitmq.resources.requests.cpu | CPU request for the RabbitMQ cluster | string | `500m` | +| airm.rabbitmq.resources.requests.memory | Memory request for the RabbitMQ cluster | string | `1Gi` | +| 
airm.rabbitmq.persistence.storage | Persistent storage size for the RabbitMQ cluster | string | `20Gi` | +| airm.rabbitmq.persistence.storageClassName | Storage class name for the RabbitMQ cluster | string | `default` | +| airm.rabbitmq.backup.enabled | Enable RabbitMQ backup | bool | `false` | +| airm.rabbitmq.backup.image | RabbitMQ backup container image | string | `amdenterpriseai/rabbitmq-backup:0.1` | +| airm.rabbitmq.backup.resources.limits.memory | Memory limit for cron job of RabbitMQ backup | string | `512Mi` | +| airm.rabbitmq.backup.resources.requests.cpu | CPU request for cron job of RabbitMQ backup | string | `250m` | +| airm.rabbitmq.backup.resources.requests.memory | Memory request for cron job of RabbitMQ backup | string | `256Mi` | +| airm.frontend.image.repository | Frontend image repository | string | `amdenterpriseai/airm-ui` | +| airm.frontend.image.tag | Frontend image tag | string | `v2025.08-rc.21` | +| airm.frontend.image.pullPolicy | Frontend image pull policy | string | `IfNotPresent` | +| airm.frontend.servicePort | Frontend service port | int | `80` | +| airm.frontend.resources.limits.memory | Memory limit for frontend | string | `4Gi` | +| airm.frontend.resources.requests.cpu | CPU request for frontend | string | `500m` | +| airm.frontend.resources.requests.memory | Memory request for frontend | string | `4Gi` | +| airm.backend.image.repository | Backend API image repository | string | `amdenterpriseai/airm-api` | +| airm.backend.image.tag | Backend API image tag | string | `v2025.08-rc.21` | +| airm.backend.image.pullPolicy | Backend API image pull policy | string | `IfNotPresent` | +| airm.backend.servicePort | Backend API service port | int | `80` | +| airm.backend.servicePortMetrics | Backend API metrics service port | int | `9009` | +| airm.backend.env.dbPort | Database port | int | `5432` | +| airm.backend.env.rabbitmqPort | RabbitMQ port | int | `5672` | +| airm.backend.env.minioUrl | Minio service URL | string | 
`http://minio.minio-tenant-default.svc.cluster.local:80` | +| airm.backend.env.minioBucket | Minio bucket name | string | `default-bucket` | +| airm.backend.env.prometheusUrl | Prometheus service URL | string | `http://lgtm-stack.otel-lgtm-stack.svc.cluster.local:9090` | +| airm.backend.env.clusterAuthUrl | Cluster auth service URL | string | `http://cluster-auth.cluster-auth.svc.cluster.local:8081` | +| airm.backend.resources.limits.memory | Memory limit for backend API container | string | `1Gi` | +| airm.backend.resources.requests.cpu | CPU request for backend API container | string | `500m` | +| airm.backend.resources.requests.memory | Memory request for backend API container | string | `1Gi` | +| airm.backend.securityContext.allowPrivilegeEscalation | Security context: allow privilege escalation | bool | `false` | +| airm.backend.securityContext.runAsNonRoot | Security context: run container as non-root | bool | `true` | +| airm.backend.securityContext.runAsUser | Security context: user ID to run container as | int | `1000` | +| airm.backend.securityContext.seccompProfile.type | Security context: seccomp profile type | string | `RuntimeDefault` | +| airm.utilities.netcat.image.repository | Netcat image repository | string | `busybox` | +| airm.utilities.netcat.image.tag | Netcat image tag | string | `1.37.0` | +| airm.utilities.netcat.image.pullPolicy | Netcat image pull policy | string | `IfNotPresent` | +| airm.utilities.curl.image.repository | Curl image repository | string | `curlimages/curl` | +| airm.utilities.curl.image.tag | Curl image tag | string | `8.16.0` | +| airm.utilities.curl.image.pullPolicy | Curl image pull policy | string | `IfNotPresent` | +| airm.utilities.liquibase.image.repository | Liquibase image repository | string | `docker.io/liquibase/liquibase` | +| airm.utilities.liquibase.image.tag | Liquibase image tag | string | `4.31` | +| airm.utilities.liquibase.image.pullPolicy | Liquibase image pull policy | string | `IfNotPresent` | 
diff --git a/sources/airm/0.3.5/charts/airm-api/files/configure.sh b/sources/airm/0.3.5/charts/airm-api/files/configure.sh new file mode 100644 index 00000000..69a3f59d --- /dev/null +++ b/sources/airm/0.3.5/charts/airm-api/files/configure.sh @@ -0,0 +1,374 @@ +#!/bin/bash + +# Copyright © Advanced Micro Devices, Inc., or its affiliates. +# +# SPDX-License-Identifier: MIT + +##################################################################################### +echo "" +echo "Run configure script block..." +echo "" + +# --- Configuration Variables --- +# Get values from bloom configmap mounted as env + +# NOTE: ORG_NAME is hardcoded to demo because gpu operator metrics has same org name hardcoded there +# Otherwise the following line can be uncommented to consider the real org name from domain config +# ORG_NAME=$(echo $NEW_DOMAIN_NAME | awk -F '.' '{ print $2 }') +ORG_NAME="demo" +ORG_DOMAINS="[\"${NEW_DOMAIN_NAME}\"]" +CLUSTER_WORKLOADS_BASE_URL="https://workspaces.${NEW_DOMAIN_NAME}/" +CLUSTER_KUBE_API_URL="https://k8s.${NEW_DOMAIN_NAME}" +USER_EMAIL="devuser@${NEW_DOMAIN_NAME}" +PROJECT_NAME="demo" +PROJECT_DESCRIPTION="demo" +CLUSTER_NAME="demo-cluster" +TIMEOUT=300 +SLEEP_INTERVAL=5 + +# --- Input Validation --- +echo "Validating environment variables..." +echo "KEYCLOAK_CLIENT_ID: ${KEYCLOAK_CLIENT_ID}" +echo "NEW_DOMAIN_NAME: ${NEW_DOMAIN_NAME}" +echo "AIRM_API_URL: ${AIRM_API_URL}" + +function check_env_variable() { + if [ -z "${!1}" ]; then + echo "ERROR: $1 environment variable is not set." 
# NOTE(review): this region of the diff was recovered from a whitespace-mangled
# paste; the formatting below is reconstructed. check_env_variable() is defined
# earlier in this script, above this chunk.

function check_success() {
  # Abort the script when the caller-supplied status code ($1) is non-zero.
  # $2 is the human-readable failure description.
  if [ "$1" -ne 0 ]; then
    echo "ERROR: $2"
    exit 1
  fi
}

# Fail fast if any required configuration is missing from the environment.
check_env_variable "AIRM_API_URL"
check_env_variable "KEYCLOAK_URL"
check_env_variable "KEYCLOAK_REALM"
check_env_variable "KEYCLOAK_CLIENT_SECRET"
check_env_variable "KEYCLOAK_CLIENT_ID"
check_env_variable "KEYCLOAK_ADMIN_CLIENT_ID"
check_env_variable "KEYCLOAK_ADMIN_CLIENT_SECRET"
check_env_variable "USER_PASSWORD"

function refresh_token() {
  # Obtain a fresh OIDC access token for $USER_EMAIL via the Keycloak password
  # grant and store it in the global TOKEN variable.
  # NOTE(review): USER_EMAIL is not covered by the check_env_variable calls
  # above — confirm it is always set before this function runs.
  TOKEN=$(curl -s \
    -d "client_id=${KEYCLOAK_CLIENT_ID}" \
    -d "username=${USER_EMAIL}" \
    -d "password=${USER_PASSWORD}" \
    -d 'grant_type=password' \
    -d "client_secret=${KEYCLOAK_CLIENT_SECRET}" \
    "${KEYCLOAK_URL}/realms/${KEYCLOAK_REALM}/protocol/openid-connect/token" | jq -r '.access_token')
  if [ -z "$TOKEN" ] || [ "$TOKEN" == "null" ]; then
    echo "ERROR: Failed to obtain access token from Keycloak."
    exit 1
  fi
}

function create_org() {
  # Look up the organization named $ORG_NAME; create it (with $ORG_DOMAINS)
  # when absent. Sets the global ORG_ID on success, exits otherwise.
  ORG_ID=$(curl -s -X GET "${AIRM_API_URL}/v1/organizations" \
    -H "Authorization: Bearer ${TOKEN}" \
    -H 'Content-Type: application/json' | jq -r --arg name "$ORG_NAME" '.organizations[] | select(.name==$name) | .id')

  # If not found, create the org and fetch the ID again.
  if [ -z "$ORG_ID" ] || [ "$ORG_ID" == "null" ]; then
    ORG_RESP=$(curl -s -o /dev/null -X POST -w "%{http_code}" "${AIRM_API_URL}/v1/organizations" \
      -H "Authorization: Bearer ${TOKEN}" \
      -H 'Content-Type: application/json' \
      -d "{ \"name\": \"$ORG_NAME\", \"domains\": $ORG_DOMAINS }")
    echo "$ORG_RESP"
    check_success "$([[ "$ORG_RESP" == "200" || "$ORG_RESP" == "201" ]] && echo 0 || echo 1)" "Failed to create organization"

    ORG_ID=$(curl -s -X GET "${AIRM_API_URL}/v1/organizations" \
      -H "Authorization: Bearer ${TOKEN}" \
      -H 'Content-Type: application/json' | jq -r --arg name "$ORG_NAME" '.organizations[] | select(.name==$name) | .id')
  fi

  if [ -z "$ORG_ID" ] || [ "$ORG_ID" == "null" ]; then
    echo "ERROR: Failed to create or retrieve organization ID."
    exit 1
  else
    echo "ORG_ID=${ORG_ID}"
  fi
}

function add_user_to_org() {
  # Idempotently add $USER_EMAIL to organization $ORG_ID with the
  # "Platform Administrator" role.
  USER_EXISTS=$(curl -s -X GET "${AIRM_API_URL}/v1/users" \
    -H 'accept: application/json' \
    -H "Authorization: Bearer ${TOKEN}" \
    -H 'Content-Type: application/json' | jq -r --arg email "$USER_EMAIL" '.data? // [] | .[] | select(.email==$email) | .email')
  if [ -z "$USER_EXISTS" ] || [ "$USER_EXISTS" == "null" ]; then
    echo "User '$USER_EMAIL' not found in organization. Adding..."
    # FIX: discard the response body (-s -o /dev/null) so that only the HTTP
    # status code is captured; the original concatenated body + code, which
    # forced an ad-hoc "null201" match in the success check below.
    ADD_USER_RESP=$(curl -s -o /dev/null -w "%{http_code}" -X 'POST' "${AIRM_API_URL}/v1/organizations/${ORG_ID}/users" \
      -H 'accept: application/json' \
      -H "Authorization: Bearer ${TOKEN}" \
      -H 'Content-Type: application/json' \
      -d '{ "email": "'"$USER_EMAIL"'", "roles": ["Platform Administrator"]}')
    echo "$ADD_USER_RESP"
    check_success "$([[ "$ADD_USER_RESP" == "200" || "$ADD_USER_RESP" == "201" ]] && echo 0 || echo 1)" "Failed to add user to organization"
  else
    echo "User '$USER_EMAIL' already exists in organization."
  fi
}

function create_project() {
  # Wait for cluster $CLUSTER_ID to become healthy, then idempotently create
  # project $PROJECT_NAME on it. Sets the global PROJECT_ID.
  # NOTE(review): TIMEOUT, SLEEP_INTERVAL, CLUSTER_ID, PROJECT_NAME and
  # PROJECT_DESCRIPTION are expected to be defined elsewhere in the script —
  # confirm against the full file.
  # FIX: pass the project name via jq --arg instead of splicing the unquoted
  # shell variable into the jq program (breaks on names with quotes/spaces).
  PROJECT_ID=$(curl -s -X GET "${AIRM_API_URL}/v1/projects" \
    -H 'accept: application/json' \
    -H "Authorization: Bearer ${TOKEN}" | jq -r --arg name "$PROJECT_NAME" '.projects[] | select(.name==$name) | .id')

  for (( i=0; i<=TIMEOUT; i+=SLEEP_INTERVAL )); do
    CLUSTER_STATUS=$(curl -s -X GET "${AIRM_API_URL}/v1/clusters/$CLUSTER_ID" \
      -H "Authorization: Bearer ${TOKEN}" \
      -H 'Content-Type: application/json' | jq -r '.status')

    if [ "$CLUSTER_STATUS" == "healthy" ]; then
      echo "Cluster is healthy!"
      break # Exit the loop if the cluster is healthy
    fi
    echo "Cluster status: $CLUSTER_STATUS. Waiting $SLEEP_INTERVAL seconds... ($i/$TIMEOUT seconds elapsed)"
    sleep $SLEEP_INTERVAL
  done

  if [ "$CLUSTER_STATUS" != "healthy" ]; then
    echo "ERROR: Cluster did not become healthy within $TIMEOUT seconds."
    exit 1
  fi

  if [ -z "$PROJECT_ID" ] || [ "$PROJECT_ID" == "null" ]; then
    # FIX: "Projects" -> "Project" (typo in the original message).
    echo "Project '$PROJECT_NAME' not found. Creating..."
    PROJECT_ID=$(curl -s -X 'POST' \
      "${AIRM_API_URL}/v1/projects" \
      -H 'accept: application/json' \
      -H "Authorization: Bearer ${TOKEN}" \
      -H 'Content-Type: application/json' \
      -d '{
        "name": "'"$PROJECT_NAME"'",
        "description": "'"$PROJECT_DESCRIPTION"'",
        "cluster_id": "'"$CLUSTER_ID"'",
        "quota": {
          "cpu_milli_cores": 0,
          "memory_bytes": 0,
          "ephemeral_storage_bytes": 0,
          "gpu_count": 0
        }
      }' | jq -r '.id')
    echo "$PROJECT_ID"
    # FIX: also reject an empty id, not only the literal string "null".
    check_success "$([[ -n "$PROJECT_ID" && "$PROJECT_ID" != "null" ]] && echo 0 || echo 1)" "Failed to create project"
  else
    echo "Project '$PROJECT_NAME' already exists with ID: $PROJECT_ID"
  fi
}

# NOTE(review): the remainder of this region of the recovered source
# (add_minio_secret_and_storage_to_project and a wait-for-AIRM-API loop) is
# truncated — the EXTERNAL_SECRET_MANIFEST heredoc body is missing from the
# paste — and cannot be reliably reconstructed here. Recover it from the
# original file before use.
+ securityContext: + allowPrivilegeEscalation: false + runAsNonRoot: false + runAsUser: 0 + capabilities: + drop: + - ALL + add: + - SETUID + - SETGID + - CHOWN + - DAC_OVERRIDE + - FOWNER + - FSETID + seccompProfile: + type: RuntimeDefault + containers: + - name: configure + image: "{{ .Values.airm.utilities.clusterTool.image.repository }}:{{ .Values.airm.utilities.clusterTool.image.tag }}" + imagePullPolicy: "{{ .Values.airm.utilities.clusterTool.image.pullPolicy }}" + command: ["/bin/bash"] + args: ["/scripts/configure.sh"] + env: + - name: DEBIAN_FRONTEND + value: "noninteractive" + - name: ORG_NAME + value: "demo" + - name: NEW_DOMAIN_NAME + value: "{{ .Values.airm.appDomain }}" + - name: KEYCLOAK_CLIENT_ID + value: "{{ .Values.airm.keycloak.clientId }}" + - name: KEYCLOAK_CLIENT_SECRET + valueFrom: + secretKeyRef: + key: KEYCLOAK_SECRET + name: "{{ .Release.Name }}-keycloak-ui-creds" + - name: USER_EMAIL + value: "devuser@{{ .Values.airm.appDomain }}" + - name: KEYCLOAK_URL + value: "{{ .Values.airm.keycloak.internalUrl }}" + - name: KEYCLOAK_REALM + value: "{{ .Values.airm.keycloak.realm }}" + - name: KEYCLOAK_ADMIN_CLIENT_ID + valueFrom: + secretKeyRef: + key: client-id + name: "{{ .Release.Name }}-keycloak-admin-client" + - name: KEYCLOAK_ADMIN_CLIENT_SECRET + valueFrom: + secretKeyRef: + key: client-secret + name: "{{ .Release.Name }}-keycloak-admin-client" + - name: AIRM_API_URL + value: "http://{{ .Release.Name }}-api.{{ .Release.Namespace }}.svc.cluster.local" + - name: USER_PASSWORD + valueFrom: + secretKeyRef: + key: USER_PASSWORD + name: "{{ .Release.Name }}-user-credentials" + volumeMounts: + - name: configure-script + mountPath: /scripts + securityContext: + allowPrivilegeEscalation: false + runAsNonRoot: false + runAsUser: 0 + capabilities: + drop: + - ALL + add: + - SETUID + - SETGID + - CHOWN + - DAC_OVERRIDE + - FOWNER + - FSETID + seccompProfile: + type: RuntimeDefault + volumes: + - name: configure-script + configMap: + name: "{{ 
.Release.Name }}-configure-script" + defaultMode: 0755 + +{{- end }} diff --git a/sources/airm/0.3.5/charts/airm-api/templates/airm-es.yaml b/sources/airm/0.3.5/charts/airm-api/templates/airm-es.yaml new file mode 100644 index 00000000..4dd18aeb --- /dev/null +++ b/sources/airm/0.3.5/charts/airm-api/templates/airm-es.yaml @@ -0,0 +1,215 @@ +# Copyright © Advanced Micro Devices, Inc., or its affiliates. +# +# SPDX-License-Identifier: MIT + +--- +apiVersion: external-secrets.io/v1beta1 +kind: ExternalSecret +metadata: + name: "{{ .Release.Name }}-cnpg-superuser" + namespace: "{{ .Release.Namespace }}" + annotations: + helm.sh/hook: pre-install +spec: + data: + - remoteRef: + key: airm-cnpg-superuser-username + property: value + secretKey: username + - remoteRef: + key: airm-cnpg-superuser-password + property: value + secretKey: password + secretStoreRef: + kind: ClusterSecretStore + name: {{ .Values.airm.externalSecretStore.airm.name }} + target: + name: "{{ .Release.Name }}-cnpg-superuser" + template: + type: Opaque +--- +apiVersion: external-secrets.io/v1beta1 +kind: ExternalSecret +metadata: + name: "{{ .Release.Name }}-cnpg-user" + namespace: "{{ .Release.Namespace }}" + annotations: + helm.sh/hook: pre-install +spec: + data: + - remoteRef: + key: airm-cnpg-user-username + property: value + secretKey: username + - remoteRef: + key: airm-cnpg-user-password + property: value + secretKey: password + secretStoreRef: + kind: ClusterSecretStore + name: {{ .Values.airm.externalSecretStore.airm.name }} + target: + name: "{{ .Release.Name }}-cnpg-user" + template: + type: Opaque +--- +apiVersion: external-secrets.io/v1beta1 +kind: ExternalSecret +metadata: + name: "{{ .Release.Name }}-keycloak-admin-client" + namespace: "{{ .Release.Namespace }}" + annotations: + helm.sh/hook: pre-install +spec: + data: + - remoteRef: + key: airm-keycloak-admin-client-id + property: value + secretKey: client-id + - remoteRef: + key: airm-keycloak-admin-client-secret + property: value + 
secretKey: client-secret + secretStoreRef: + kind: ClusterSecretStore + name: {{ .Values.airm.externalSecretStore.keycloak.name }} + target: + name: "{{ .Release.Name }}-keycloak-admin-client" + template: + type: Opaque +--- +apiVersion: external-secrets.io/v1beta1 +kind: ExternalSecret +metadata: + name: "{{ .Release.Name }}-rabbitmq-admin" + namespace: "{{ .Release.Namespace }}" + annotations: + helm.sh/hook: pre-install +spec: + data: + - remoteRef: + key: airm-rabbitmq-user-username + property: value + secretKey: username + - remoteRef: + key: airm-rabbitmq-user-password + property: value + secretKey: password + secretStoreRef: + kind: ClusterSecretStore + name: {{ .Values.airm.externalSecretStore.airm.name }} + target: + name: "{{ .Release.Name }}-rabbitmq-admin" + template: + data: + default_user.conf: | + default_user = {{ "{{ .username }}" }} + default_pass = {{ "{{ .password }}" }} + password: '{{ "{{ .password }}" }}' + username: '{{ "{{ .username }}" }}' + type: Opaque +--- +apiVersion: external-secrets.io/v1beta1 +kind: ExternalSecret +metadata: + name: "{{ .Release.Name }}-api-minio-credentials" + namespace: "{{ .Release.Namespace }}" + annotations: + helm.sh/hook: pre-install +spec: + data: + - remoteRef: + key: minio-api-access-key + property: value + secretKey: minio-access-key + - remoteRef: + key: minio-api-secret-key + property: value + secretKey: minio-secret-key + secretStoreRef: + kind: ClusterSecretStore + name: {{ .Values.airm.externalSecretStore.minio.name }} + target: + name: "{{ .Release.Name }}-api-minio-credentials" +--- +apiVersion: external-secrets.io/v1beta1 +kind: ExternalSecret +metadata: + name: "{{ .Release.Name }}-secrets-airm" + namespace: "{{ .Release.Namespace }}" + annotations: + helm.sh/hook: pre-install +spec: + data: + - remoteRef: + key: airm-ui-auth-nextauth-secret + property: value + secretKey: NEXTAUTH_SECRET + refreshInterval: 15s + secretStoreRef: + kind: ClusterSecretStore + name: {{ 
.Values.airm.externalSecretStore.airm.name }} + target: + name: "{{ .Release.Name }}-secrets-airm" +--- +apiVersion: external-secrets.io/v1beta1 +kind: ExternalSecret +metadata: + name: "{{ .Release.Name }}-keycloak-ui-client" + namespace: "{{ .Release.Namespace }}" + annotations: + helm.sh/hook: pre-install +spec: + data: + - remoteRef: + key: airm-ui-keycloak-secret + property: value + secretKey: KEYCLOAK_SECRET + secretStoreRef: + kind: ClusterSecretStore + name: {{ .Values.airm.externalSecretStore.keycloak.name }} + target: + name: "{{ .Release.Name }}-keycloak-ui-creds" + template: + type: Opaque +--- +apiVersion: external-secrets.io/v1beta1 +kind: ExternalSecret +metadata: + name: "{{ .Release.Name }}-cluster-auth-secrets" + namespace: "{{ .Release.Namespace }}" + annotations: + helm.sh/hook: pre-install +spec: + secretStoreRef: + kind: ClusterSecretStore + name: {{ .Values.airm.externalSecretStore.airm.name }} + refreshInterval: 5m + target: + name: "{{ .Release.Name }}-cluster-auth-admin" + data: + - secretKey: admin-token + remoteRef: + key: cluster-auth-admin-token + property: value +--- +apiVersion: external-secrets.io/v1beta1 +kind: ExternalSecret +metadata: + name: "{{ .Release.Name }}-user-credentials" + namespace: "{{ .Release.Namespace }}" + annotations: + helm.sh/hook: pre-install +spec: + data: + - remoteRef: + key: keycloak-initial-devuser-password + property: value + secretKey: USER_PASSWORD + secretStoreRef: + kind: ClusterSecretStore + name: {{ .Values.airm.externalSecretStore.airm.name }} + target: + name: "{{ .Release.Name }}-user-credentials" + template: + type: Opaque diff --git a/sources/airm/0.3.5/charts/airm-api/templates/airm-httproute.yaml b/sources/airm/0.3.5/charts/airm-api/templates/airm-httproute.yaml new file mode 100644 index 00000000..3393d6a5 --- /dev/null +++ b/sources/airm/0.3.5/charts/airm-api/templates/airm-httproute.yaml @@ -0,0 +1,81 @@ +# Copyright © Advanced Micro Devices, Inc., or its affiliates. 
+# +# SPDX-License-Identifier: MIT + +--- +apiVersion: gateway.networking.k8s.io/v1 +kind: HTTPRoute +metadata: + name: '{{ .Release.Name }}api-route' + namespace: '{{ .Release.Namespace }}' +spec: + parentRefs: + - group: gateway.networking.k8s.io + kind: Gateway + name: {{ .Values.kgateway.gatewayName }} + namespace: {{ .Values.kgateway.namespace }} + rules: + - backendRefs: + - group: "" + kind: Service + name: '{{ .Release.Name }}-api' + port: {{ .Values.kgateway.airmapi.servicePort }} + weight: 1 + matches: + - headers: + - name: Host + type: RegularExpression + value: {{ .Values.kgateway.airmapi.prefixValue }}\..* + path: + type: RegularExpression + value: .*/stream.* + timeouts: + backendRequest: {{ .Values.kgateway.airmapi.timeouts.stream.backendRequest }} + request: {{ .Values.kgateway.airmapi.timeouts.stream.request }} + - backendRefs: + - group: "" + kind: Service + name: '{{ .Release.Name }}-api' + port: {{ .Values.kgateway.airmapi.servicePort }} + weight: 1 + matches: + - headers: + - name: Host + type: RegularExpression + value: {{ .Values.kgateway.airmapi.prefixValue }}\..* + path: + type: PathPrefix + value: / + timeouts: + backendRequest: {{ .Values.kgateway.airmapi.timeouts.nonStream.backendRequest }} + request: {{ .Values.kgateway.airmapi.timeouts.nonStream.request }} +--- +apiVersion: gateway.networking.k8s.io/v1 +kind: HTTPRoute +metadata: + name: '{{ .Release.Name }}ui-route' + namespace: '{{ .Release.Namespace }}' +spec: + parentRefs: + - group: gateway.networking.k8s.io + kind: Gateway + name: {{ .Values.kgateway.gatewayName }} + namespace: {{ .Values.kgateway.namespace }} + rules: + - backendRefs: + - group: "" + kind: Service + name: '{{ .Release.Name }}-ui' + port: {{ .Values.kgateway.airmui.servicePort }} + weight: 1 + matches: + - headers: + - name: Host + type: RegularExpression + value: {{ .Values.kgateway.airmui.prefixValue }}\..* + path: + type: PathPrefix + value: / + timeouts: + backendRequest: {{ 
.Values.kgateway.airmui.timeouts.backendRequest }} + request: {{ .Values.kgateway.airmui.timeouts.request }} diff --git a/sources/airm/0.3.5/charts/airm-api/templates/airm-rabbitmq-cluster.yaml b/sources/airm/0.3.5/charts/airm-api/templates/airm-rabbitmq-cluster.yaml new file mode 100644 index 00000000..3db2ff07 --- /dev/null +++ b/sources/airm/0.3.5/charts/airm-api/templates/airm-rabbitmq-cluster.yaml @@ -0,0 +1,69 @@ +# Copyright © Advanced Micro Devices, Inc., or its affiliates. +# +# SPDX-License-Identifier: MIT + +--- +apiVersion: rabbitmq.com/v1beta1 +kind: RabbitmqCluster +metadata: + name: '{{ .Release.Name }}-rabbitmq' + namespace: '{{ .Release.Namespace }}' +spec: + persistence: + {{- toYaml .Values.airm.rabbitmq.persistence | nindent 4 }} + replicas: {{ .Values.airm.rabbitmq.replicas }} + resources: + {{- toYaml .Values.airm.rabbitmq.resources | nindent 4 }} + secretBackend: + externalSecret: + name: '{{ .Release.Name }}-rabbitmq-admin' + tls: + secretName: '{{ .Release.Name }}-tls-secret' +--- +{{- if .Values.airm.rabbitmq.backup.enabled -}} + +apiVersion: batch/v1 +kind: CronJob +metadata: + name: '{{ .Release.Name }}-rabbitmq-backup-cron' + namespace: '{{ .Release.Namespace }}' +spec: + concurrencyPolicy: Forbid + jobTemplate: + spec: + template: + spec: + containers: + - env: + - name: RABBITMQ_URL + value: 'http://{{ .Release.Name }}-rabbitmq.{{ .Release.Namespace }}.svc.cluster.local:15672' + - name: RABBITMQ_USER + valueFrom: + secretKeyRef: + key: username + name: '{{ .Release.Name }}-rabbitmq-admin' + - name: RABBITMQ_PASSWORD + valueFrom: + secretKeyRef: + key: password + name: '{{ .Release.Name }}-rabbitmq-admin' + - name: S3_HOST + value: "{{ .Values.airm.backend.env.minioUrl }}" + - name: S3_ACCESS_KEY + valueFrom: + secretKeyRef: + key: minio-access-key + name: '{{ .Release.Name }}-api-minio-credentials' + - name: S3_SECRET_KEY + valueFrom: + secretKeyRef: + key: minio-secret-key + name: '{{ .Release.Name }}-api-minio-credentials' + image: 
'{{ .Values.airm.rabbitmq.backup.image }}' + name: rabbitmq-backup-cron + resources: + {{- toYaml .Values.airm.rabbitmq.backup.resources | nindent 16 }} + restartPolicy: OnFailure + schedule: 0 * * * * + +{{- end }} diff --git a/sources/airm/0.3.5/charts/airm-api/templates/airm-vllm-collector.yaml b/sources/airm/0.3.5/charts/airm-api/templates/airm-vllm-collector.yaml new file mode 100644 index 00000000..f12aa532 --- /dev/null +++ b/sources/airm/0.3.5/charts/airm-api/templates/airm-vllm-collector.yaml @@ -0,0 +1,93 @@ +# Copyright © Advanced Micro Devices, Inc., or its affiliates. +# +# SPDX-License-Identifier: MIT + +--- +apiVersion: opentelemetry.io/v1beta1 +kind: OpenTelemetryCollector +metadata: + name: "{{ .Release.Name }}-{{ .Values.aims.otelCollector.name }}" + namespace: "{{ .Release.Namespace }}" +spec: + mode: daemonset + image: "{{ .Values.aims.otelCollector.image }}" + config: + receivers: + prometheus: + config: + scrape_configs: + - job_name: "vllm" + metrics_path: /metrics + scrape_interval: "{{ .Values.aims.otelCollector.receivers.prometheus.config.scrape_configs.scrape_interval }}" + kubernetes_sd_configs: + - role: pod + relabel_configs: + # Only scrape pods with the workload-id label + - source_labels: + [__meta_kubernetes_pod_label_airm_silogen_ai_workload_id] + action: keep + regex: .+ + # Only scrape pods with app label starting with isvc. 
+ - source_labels: [__meta_kubernetes_pod_label_app] + action: keep + regex: isvc\..* + # Set the workload_id from the label + - source_labels: + [__meta_kubernetes_pod_label_airm_silogen_ai_workload_id] + target_label: workload_id + # Set service name from app label + - source_labels: [__meta_kubernetes_pod_label_app] + target_label: service + # Set service instance id from pod name + - source_labels: [__meta_kubernetes_pod_name] + target_label: service_instance_id + # Set the scrape target to port 8000 + - source_labels: [__meta_kubernetes_pod_ip] + target_label: __address__ + replacement: $1:8000 + otlp: + protocols: + grpc: {} + http: {} + + processors: + resource: + attributes: + - key: airm.silogen.ai/workload-id + from_attribute: workload_id + action: upsert + - key: service.instance.id + from_attribute: service_instance_id + action: upsert + - key: service.name + from_attribute: service + action: upsert + + transform: + metric_statements: + - context: datapoint + statements: + - set(attributes["workload_id"], resource.attributes["airm.silogen.ai/workload-id"]) where attributes["workload_id"] == nil + - set(attributes["service_instance_id"], resource.attributes["service.instance.id"]) where attributes["service_instance_id"] == nil + - set(attributes["service"], resource.attributes["service.name"]) where attributes["service"] == nil + + exporters: + otlphttp: + endpoint: "{{ .Values.aims.otelCollector.exporters.otlphttp.endpoint }}" + + service: + pipelines: + metrics: + receivers: [prometheus] + processors: [resource, transform] + exporters: [otlphttp] + + traces: + receivers: [otlp] + processors: [resource] + exporters: [otlphttp] + + logs: + receivers: [otlp] + processors: [resource] + exporters: [otlphttp] diff --git a/sources/airm/0.3.5/charts/airm-api/values.yaml b/sources/airm/0.3.5/charts/airm-api/values.yaml new file mode 100644 index 00000000..4213af3d --- /dev/null +++ b/sources/airm/0.3.5/charts/airm-api/values.yaml @@ -0,0 +1,166 @@ +# Copyright 
© Advanced Micro Devices, Inc., or its affiliates. +# +# SPDX-License-Identifier: MIT + +secretgenerator: + image: + repository: amdenterpriseai/cluster-tool + tag: latest + pullPolicy: IfNotPresent + +kgateway: + namespace: kgateway-system + gatewayName: https + airmapi: + servicePort: 80 + prefixValue: airmapi + timeouts: + stream: + backendRequest: 30m + request: 30m + nonStream: + backendRequest: 10m + request: 10m + airmui: + servicePort: 80 + prefixValue: airmui + timeouts: + backendRequest: 1m + request: 1m + keycloak: + prefixValue: kc + +aims: + otelCollector: + image: ghcr.io/open-telemetry/opentelemetry-collector-releases/opentelemetry-collector-contrib:0.113.0 + receivers: + prometheus: + config: + scrape_configs: + scrape_interval: 20s + exporters: + otlphttp: + endpoint: "http://lgtm-stack.otel-lgtm-stack.svc:4318" + name: "vllm-collector" + +airm: + appDomain: PUBLIC-IP + includeDemoSetup: true + + externalSecretStore: + airm: + name: openbao-secret-store + minio: + name: openbao-secret-store + keycloak: + name: openbao-secret-store + + postgresql: + enabled: true + cnpg: + image: ghcr.io/cloudnative-pg/postgresql:17 + instance: 1 + resources: + limits: + cpu: "2" + memory: 1Gi + requests: + cpu: "1" + memory: 512Mi + storage: + size: 50Gi + storageClass: default + walStorage: + size: 50Gi + storageClass: default + + rabbitmq: + replicas: 1 + resources: + requests: + cpu: 500m + memory: 1Gi + limits: + cpu: '1' + memory: 1Gi + persistence: + storage: 20Gi + storageClassName: default + backup: + enabled: false + image: amdenterpriseai/rabbitmq-backup:0.1 + resources: + limits: + memory: 512Mi + requests: + cpu: 250m + memory: 256Mi + + keycloak: + internalUrl: http://keycloak.keycloak.svc.cluster.local:8080 + clientId: "354a0fa1-35ac-4a6d-9c4d-d661129c2cd0" + realm: airm + + frontend: + image: + repository: amdenterpriseai/airm-ui + tag: 0.3.5 + pullPolicy: IfNotPresent + servicePort: 80 + resources: + limits: + memory: 4Gi + requests: + cpu: 500m + 
memory: 4Gi + + backend: + image: + repository: amdenterpriseai/airm-api + tag: 0.3.5 + pullPolicy: IfNotPresent + + servicePort: 80 + servicePortMetrics: 9009 + env: + dbPort: 5432 + rabbitmqPort: 5672 + minioUrl: http://minio.minio-tenant-default.svc.cluster.local:80 + minioBucket: default-bucket + prometheusUrl: http://lgtm-stack.otel-lgtm-stack.svc.cluster.local:9090 + clusterAuthUrl: http://cluster-auth.cluster-auth.svc.cluster.local:8081 + + resources: + limits: + memory: 1Gi + requests: + cpu: 500m + memory: 1Gi + securityContext: + allowPrivilegeEscalation: false + runAsNonRoot: true + runAsUser: 1000 + seccompProfile: + type: RuntimeDefault + + utilities: + netcat: + image: + repository: busybox + tag: 1.37.0 + pullPolicy: IfNotPresent + curl: + image: + repository: curlimages/curl + tag: 8.16.0 + pullPolicy: IfNotPresent + liquibase: + image: + repository: docker.io/liquibase/liquibase + tag: 4.31 + pullPolicy: IfNotPresent + clusterTool: + image: + repository: amdenterpriseai/cluster-tool + tag: latest + pullPolicy: IfNotPresent diff --git a/sources/airm/0.3.5/charts/airm-dispatcher/.helmignore b/sources/airm/0.3.5/charts/airm-dispatcher/.helmignore new file mode 100644 index 00000000..64c2af83 --- /dev/null +++ b/sources/airm/0.3.5/charts/airm-dispatcher/.helmignore @@ -0,0 +1,27 @@ +# Copyright © Advanced Micro Devices, Inc., or its affiliates. +# +# SPDX-License-Identifier: MIT + +# Patterns to ignore when building packages. +# This supports shell glob matching, relative path matching, and +# negation (prefixed with !). Only one pattern per line. 
+.DS_Store +# Common VCS dirs +.git/ +.gitignore +.bzr/ +.bzrignore +.hg/ +.hgignore +.svn/ +# Common backup files +*.swp +*.bak +*.tmp +*.orig +*~ +# Various IDEs +.project +.idea/ +*.tmproj +.vscode/ diff --git a/sources/airm/0.3.5/charts/airm-dispatcher/Chart.yaml b/sources/airm/0.3.5/charts/airm-dispatcher/Chart.yaml new file mode 100644 index 00000000..96b660fd --- /dev/null +++ b/sources/airm/0.3.5/charts/airm-dispatcher/Chart.yaml @@ -0,0 +1,29 @@ +# Copyright © Advanced Micro Devices, Inc., or its affiliates. +# +# SPDX-License-Identifier: MIT + +--- +apiVersion: v2 +name: airm-dispatcher +description: A Helm chart for AIRM Dispatcher + +# A chart can be either an 'application' or a 'library' chart. +# +# Application charts are a collection of templates that can be packaged into versioned archives +# to be deployed. +# +# Library charts provide useful utilities or functions for the chart developer. They're included as +# a dependency of application charts to inject those utilities and functions into the rendering +# pipeline. Library charts do not define any templates and therefore cannot be deployed. +type: application + +# This is the chart version. This version number should be incremented each time you make changes +# to the chart and its templates, including the app version. +# Versions are expected to follow Semantic Versioning (https://semver.org/) +version: 0.3.5 + +# This is the version number of the application being deployed. This version number should be +# incremented each time you make changes to the application. Versions are not expected to +# follow Semantic Versioning. They should reflect the version the application is using. +# It is recommended to use it with quotes. 
+appVersion: "1.0.0" diff --git a/sources/airm/0.3.5/charts/airm-dispatcher/README.md b/sources/airm/0.3.5/charts/airm-dispatcher/README.md new file mode 100644 index 00000000..0b85c706 --- /dev/null +++ b/sources/airm/0.3.5/charts/airm-dispatcher/README.md @@ -0,0 +1,54 @@ + + +# HELM CHARTS + +Simple instructions to deploy AIRM dispatcher application using helm chart. +The dispatcher can be run on a compute cluster, which may or may not be the same as the one hosting the AIRM API and UI. + +### 1. Requirements + +The following external components must be available in the Kubernetes cluster before the helm chart can be installed. + +- Accessible RabbitMQ cluster (must be the same cluster used by AIRM API). +- Kaiwo installed on the cluster (along with all its dependencies) + +### 2. Install + +``` +cd helm/airm/charts + +# 1. Create output template just to validate (the public domain could be app-dev.silogen.ai, staging.silogen.ai, etc.) +helm template airm-dispatcher ./airm-dispatcher -n airm --create-namespace > airm-dispatcher-helm-generated.yaml + +# 2. Run chart install +helm install airm-dispatcher ./airm-dispatcher -n airm --create-namespace + +# 3. Delete chart if needed +helm delete airm-dispatcher -n airm + +# 4. Upgrade when bumping versions +helm upgrade -n airm --set airm-dispatcher ./airm-dispatcher +``` + +--- + +### 3. 
Helm Settings + +| Field Path | Description | Type | Example / Default | +|---------------------------------------------|--------------------------------------------------------------|---------|-----------------------------------| +| airm.dispatcher.image.repository | Dispatcher image repository | string | `amdenterpriseai/airm-dispatcher` | +| airm.dispatcher.image.tag | Dispatcher image tag | string | `v2025.08-rc.21` | +| airm.dispatcher.image.pullPolicy | Dispatcher image pull policy | string | `IfNotPresent` | +| airm.dispatcher.servicePort | Dispatcher service port | int | `80` | +| airm.utilities.netcat.image.repository | Netcat image repository | string | `busybox` | +| airm.utilities.netcat.image.tag | Netcat image tag | string | `1.37.0` | +| airm.utilities.netcat.image.pullPolicy | Netcat image pull policy | string | `IfNotPresent` | +| airm.utilities.curl.image.repository | Curl image repository | string | `curlimages/curl` | +| airm.utilities.curl.image.tag | Curl image tag | string | `8.16.0` | +| airm.utilities.curl.image.pullPolicy | Curl image pull policy | string | `IfNotPresent` | +| airm.additionalClusterRoles.platformAdmin | Additional cluster roles for the Platform Administrator role | array | `[]` | +| airm.additionalClusterRoles.projectMember | Additional cluster roles for the Project Member role | array | `[]` | diff --git a/sources/airm/0.3.5/charts/airm-dispatcher/templates/airm-cluster-policies.yaml b/sources/airm/0.3.5/charts/airm-dispatcher/templates/airm-cluster-policies.yaml new file mode 100644 index 00000000..caf92aa6 --- /dev/null +++ b/sources/airm/0.3.5/charts/airm-dispatcher/templates/airm-cluster-policies.yaml @@ -0,0 +1,352 @@ +# Copyright © Advanced Micro Devices, Inc., or its affiliates. 
+# +# SPDX-License-Identifier: MIT + +apiVersion: kyverno.io/v1 +kind: ClusterPolicy +metadata: + name: airm-project-namespace-rolebinding +spec: + background: false + rules: + - name: generate-project-namespace-rolebinding + match: + any: + - resources: + kinds: + - Namespace + operations: + - CREATE + preconditions: + any: + - key: '{{`{{request.object.metadata.labels."airm.silogen.ai/project-id" || '''' }}`}}' + operator: NotEquals + value: "" + skipBackgroundRequests: true + generate: + apiVersion: rbac.authorization.k8s.io/v1 + kind: RoleBinding + name: "project-member-role-binding" + namespace: "{{`{{request.object.metadata.name}}`}}" + synchronize: true + data: + roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: airm-project-member + subjects: + - kind: Group + # Add this in for backwards compatibility + name: "oidc{{`{{request.object.metadata.name}}`}}" + apiGroup: rbac.authorization.k8s.io + - kind: Group + # The kubernetes cluster apply an OIDC prefix of 'oidc:', so we adjust the groups to expect that + name: "oidc:{{`{{request.object.metadata.name}}`}}" + apiGroup: rbac.authorization.k8s.io +--- +# Kyverno policy that enforces that workloads submitted to a namespace managed by AIRMan have the +# correct kueue lables and field set, so that they are bound by the quota of the namespace +apiVersion: kyverno.io/v1 +kind: ClusterPolicy +metadata: + name: airm-quota-enforcement-for-workloads +spec: + background: false + rules: + - name: set-queue-name-from-namespace-default + match: + resources: + kinds: + - Deployment + - StatefulSet + - Pod + namespaceSelector: + matchExpressions: + - key: airm.silogen.ai/project-id + operator: Exists + preconditions: + all: + - key: '{{`{{request.object.metadata.labels."kueue.x-k8s.io/queue-name" || '''' }}`}}' + operator: NotEquals + value: "{{`{{request.namespace }}`}}" + skipBackgroundRequests: true + mutate: + patchStrategicMerge: + metadata: + labels: + kueue.x-k8s.io/queue-name: 
"{{`{{request.namespace }}`}}" + + - name: set-queue-name-from-namespace-jobs + match: + resources: + kinds: + - Job # https://kueue.sigs.k8s.io/docs/tasks/run/jobs/ + namespaceSelector: + matchExpressions: + - key: airm.silogen.ai/project-id + operator: Exists + preconditions: + all: + - key: '{{`{{request.object.metadata.labels."kueue.x-k8s.io/queue-name" || '''' }}`}}' + operator: NotEquals + value: "{{`{{request.namespace }}`}}" + skipBackgroundRequests: true + mutate: + patchStrategicMerge: + metadata: + labels: + kueue.x-k8s.io/queue-name: "{{`{{request.namespace }}`}}" + spec: + suspend: true + + - name: set-queue-name-from-namespace-cronjobs + match: + resources: + kinds: + - CronJob # https://kueue.sigs.k8s.io/docs/tasks/run/run_cronjobs/ + namespaceSelector: + matchExpressions: + - key: airm.silogen.ai/project-id + operator: Exists + preconditions: + all: + - key: '{{`{{request.object.spec.jobTemplate.metadata.labels."kueue.x-k8s.io/queue-name" || '''' }}`}}' + operator: NotEquals + value: "{{`{{request.namespace }}`}}" + skipBackgroundRequests: true + mutate: + patchStrategicMerge: + spec: + jobTemplate: + metadata: + labels: + kueue.x-k8s.io/queue-name: "{{`{{request.namespace }}`}}" + spec: + suspend: true + + - name: set-queue-name-from-namespace-kaiwo + match: + resources: + kinds: + - KaiwoJob + - KaiwoService + - AIMService + namespaceSelector: + matchExpressions: + - key: airm.silogen.ai/project-id + operator: Exists + preconditions: + all: + - key: "{{`{{request.object.spec.clusterQueue || '' }}`}}" + operator: NotEquals + value: "{{`{{request.namespace }}`}}" + skipBackgroundRequests: true + mutate: + patchStrategicMerge: + spec: + clusterQueue: "{{`{{request.namespace }}`}}" +--- +apiVersion: kyverno.io/v1 +kind: ClusterPolicy +metadata: + name: airm-workload-tracking-policy +spec: + background: false + rules: + # For all supported types, if airm.silogen.ai/workload-id and airm.silogen.ai/component-id are not set, we assume + # it has been 
submitted from outside of AIRMan. In that case, we set airm.silogen.ai/auto-discovered: true, so it can + # be tracked upstream. We also set airm.silogen.ai/discovered-component-type so that we can identify the type of component + # that was originally tracked, and ignore children created by it. See remove-auto-discovered-annotations-inherited-from-parent + # We also try to capture the user who submitted the workload, and consume it in the application + + # Please note that ReplicaSet is not supported because by default it is filtered away by Kyverno by default: https://github.com/kyverno/kyverno/blob/main/charts/kyverno/values.yaml#L270 + - name: add-discovery-annotations-for-supported-types + match: + resources: + kinds: + - Job + - Deployment + - StatefulSet + - DaemonSet + - CronJob + - KaiwoJob + - KaiwoService + - AIMService + - Pod + namespaceSelector: + matchExpressions: + - key: airm.silogen.ai/project-id + operator: Exists + preconditions: + all: + - key: '{{`{{request.object.metadata.labels."airm.silogen.ai/workload-id" || '''' }}`}}' + operator: Equals + value: "" + - key: '{{`{{request.object.metadata.labels."airm.silogen.ai/component-id" || '''' }}`}}' + operator: Equals + value: "" + - key: '{{`{{request.object.metadata.annotations."airm.silogen.ai/auto-discovered" || '''' }}`}}' + operator: Equals + value: "" + skipBackgroundRequests: true + mutate: + patchStrategicMerge: + metadata: + annotations: + airm.silogen.ai/submitter: "{{`{{request.userInfo.username }}`}}" + airm.silogen.ai/auto-discovered: "true" + airm.silogen.ai/discovered-component-type: "{{`{{request.object.kind }}`}}" + # For all supported types, if airm.silogen.ai/auto-discovered is set and the airm.silogen.ai/discovered-component-type + # doesnt match the kind of the current component, we assume this type has been created by a parent which is also + # supported by AIRMan and we dont need to track this type upstream, so we unset the airm.silogen.ai/auto-discovered annotation. 
+ # This is mostly to account for KaiwoJob, KaiwoService, AIMService which propagate annotations to pods. + - name: remove-auto-discovered-annotations-inherited-from-parent + match: + resources: + kinds: + - Job + - Deployment + - StatefulSet + - DaemonSet + - CronJob + - KaiwoJob + - KaiwoService + - AIMService + - Pod + namespaceSelector: + matchExpressions: + - key: airm.silogen.ai/project-id + operator: Exists + preconditions: + all: + - key: '{{`{{request.object.metadata.annotations."airm.silogen.ai/auto-discovered" || '''' }}`}}' + operator: Equals + value: "true" + - key: '{{`{{request.object.metadata.annotations."airm.silogen.ai/discovered-component-type" || '''' }}`}}' + operator: NotEquals + value: "{{`{{request.object.kind }}`}}" + skipBackgroundRequests: true + mutate: + patchStrategicMerge: + metadata: + annotations: + airm.silogen.ai/auto-discovered: "false" + # For all supported types, if airm.silogen.ai/project-id does not match that of the namespace label, overwrite it + # with the expected value, to avoid metrics getting mixed up between projects. 
+ - name: set-project-id-from-namespace-label + match: + resources: + kinds: + - Job + - Deployment + - StatefulSet + - DaemonSet + - CronJob + - KaiwoJob + - KaiwoService + - AIMService + - Pod + namespaceSelector: + matchExpressions: + - key: airm.silogen.ai/project-id + operator: Exists + context: + - name: ns_labels + apiCall: + urlPath: "/api/v1/namespaces/{{`{{request.namespace }}`}}" + method: GET + jmesPath: "metadata.labels" + preconditions: + all: + - key: '{{`{{request.object.metadata.labels."airm.silogen.ai/project-id" || '''' }}`}}' + operator: NotEquals + value: '{{`{{ns_labels."airm.silogen.ai/project-id" }}`}}' + skipBackgroundRequests: true + mutate: + patchStrategicMerge: + metadata: + labels: + airm.silogen.ai/project-id: '{{`{{ns_labels."airm.silogen.ai/project-id" }}`}}' + # Add the airm.silogen.ai/workload-id and airm.silogen.ai/component-id labels to types that expect them at + # metadata.labels. The expectation is that these are propagated to the subsequent pods that are created. + + # If the resource is spawned off by a CRD, it will not know about the labels on the previous version of the object, + # so we also check request.oldObject for the labels to try and preserve them if they were already set. 
+ - name: add-workload-and-component-id-default + match: + resources: + kinds: + - Pod + - KaiwoJob + - KaiwoService + - AIMService + namespaceSelector: + matchExpressions: + - key: airm.silogen.ai/project-id + operator: Exists + skipBackgroundRequests: true + mutate: + patchStrategicMerge: + metadata: + labels: + airm.silogen.ai/workload-id: '{{`{{request.object.metadata.labels."airm.silogen.ai/workload-id" || request.oldObject.metadata.labels."airm.silogen.ai/workload-id" || request.uid }}`}}' + airm.silogen.ai/component-id: '{{`{{request.object.metadata.labels."airm.silogen.ai/component-id" || request.oldObject.metadata.labels."airm.silogen.ai/component-id" || request.uid }}`}}' + # Add the airm.silogen.ai/workload-id and airm.silogen.ai/component-id labels to objects with templates and + # also add it to spec.template.metadata.labels to ensure that the pods created by them contain the labels as well + - name: add-workload-and-component-id-to-objects-with-template + match: + resources: + kinds: + - Job + - Deployment + - StatefulSet + - DaemonSet + namespaceSelector: + matchExpressions: + - key: airm.silogen.ai/project-id + operator: Exists + skipBackgroundRequests: true + mutate: + patchStrategicMerge: + metadata: + labels: + airm.silogen.ai/workload-id: '{{`{{request.object.metadata.labels."airm.silogen.ai/workload-id" || request.oldObject.metadata.labels."airm.silogen.ai/workload-id" || request.uid }}`}}' + airm.silogen.ai/component-id: '{{`{{request.object.metadata.labels."airm.silogen.ai/component-id" || request.oldObject.metadata.labels."airm.silogen.ai/component-id" || request.uid }}`}}' + spec: + template: + metadata: + labels: + airm.silogen.ai/workload-id: '{{`{{request.object.metadata.labels."airm.silogen.ai/workload-id" || request.oldObject.metadata.labels."airm.silogen.ai/workload-id" || request.uid }}`}}' + airm.silogen.ai/component-id: '{{`{{request.object.metadata.labels."airm.silogen.ai/component-id" || 
request.oldObject.metadata.labels."airm.silogen.ai/component-id" || request.uid }}`}}' + + # Add the airm.silogen.ai/workload-id and airm.silogen.ai/component-id labels to cronjob and + # also add it to spec.jobTemplate.metadata.labels to ensure that the pods created by the cronjob + # contain it as well + - name: add-workload-and-component-id-cronjobs + match: + resources: + kinds: + - CronJob + namespaceSelector: + matchExpressions: + - key: airm.silogen.ai/project-id + operator: Exists + skipBackgroundRequests: true + mutate: + patchStrategicMerge: + metadata: + labels: + airm.silogen.ai/workload-id: '{{`{{request.object.metadata.labels."airm.silogen.ai/workload-id" || request.oldObject.metadata.labels."airm.silogen.ai/workload-id" || request.uid }}`}}' + airm.silogen.ai/component-id: '{{`{{request.object.metadata.labels."airm.silogen.ai/component-id" || request.oldObject.metadata.labels."airm.silogen.ai/component-id" || request.uid }}`}}' + spec: + jobTemplate: + metadata: + labels: + airm.silogen.ai/workload-id: '{{`{{request.object.metadata.labels."airm.silogen.ai/workload-id" || request.oldObject.metadata.labels."airm.silogen.ai/workload-id" || request.uid }}`}}' + airm.silogen.ai/component-id: '{{`{{request.object.metadata.labels."airm.silogen.ai/component-id" || request.oldObject.metadata.labels."airm.silogen.ai/component-id" || request.uid }}`}}' + spec: + template: + metadata: + labels: + airm.silogen.ai/workload-id: '{{`{{request.object.metadata.labels."airm.silogen.ai/workload-id" || request.oldObject.metadata.labels."airm.silogen.ai/workload-id" || request.uid }}`}}' + airm.silogen.ai/component-id: '{{`{{request.object.metadata.labels."airm.silogen.ai/component-id" || request.oldObject.metadata.labels."airm.silogen.ai/component-id" || request.uid }}`}}' diff --git a/sources/airm/0.3.5/charts/airm-dispatcher/templates/airm-cluster-roles.yaml b/sources/airm/0.3.5/charts/airm-dispatcher/templates/airm-cluster-roles.yaml new file mode 100644 index 
00000000..2461e894 --- /dev/null +++ b/sources/airm/0.3.5/charts/airm-dispatcher/templates/airm-cluster-roles.yaml @@ -0,0 +1,164 @@ +# Copyright © Advanced Micro Devices, Inc., or its affiliates. +# +# SPDX-License-Identifier: MIT + +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: airm-platform-admin +rules: + - apiGroups: [""] + resources: + [ + "pods", + "events", + "services", + "configmaps", + "persistentvolumes", + "persistentvolumeclaims", + "namespaces", + "serviceaccounts", + ] + verbs: ["get", "list", "watch", "delete"] + - apiGroups: [""] + resources: ["pods/log", "pods/exec", "pods/attach", "pods/portforward"] + verbs: ["*"] + - apiGroups: ["apps"] + resources: ["deployments", "deployments/scale", "replicasets", "statefulsets", "daemonsets"] + verbs: ["get", "list", "watch", "delete"] + - apiGroups: ["batch"] + resources: ["jobs", "cronjobs"] + verbs: ["get", "list", "watch", "delete"] + - apiGroups: ["networking.k8s.io"] + resources: ["ingresses", "networkpolicies"] + verbs: ["get", "list", "watch", "delete"] + - apiGroups: ["kaiwo.silogen.ai"] + resources: ["kaiwojobs", "kaiwoservices"] + verbs: ["get", "list", "watch", "delete"] + - apiGroups: ["aim.silogen.ai"] + resources: ["aimservices", "aimmodels", "aimmodelcaches", "aimservicetemplates", "aimruntimeconfigs", "aimtemplatecaches"] + verbs: ["get", "list", "watch", "delete"] + - apiGroups: ["gateway.networking.k8s.io"] + resources: ["httproutes", "gateways"] + verbs: ["get", "list", "watch", "delete"] + - apiGroups: ["config.kaiwo.silogen.ai"] + resources: ["kaiwoconfigs"] + verbs: ["*"] + - apiGroups: ["kaiwo.silogen.ai"] + resources: ["kaiwoqueueconfigs"] + verbs: ["*"] + - apiGroups: ["aim.silogen.ai"] + resources: [ "aimclustermodels", "aimclusterservicetemplates", "aimclusterruntimeconfigs", "aimclustermodelsources" ] + verbs: ["*"] + - apiGroups: [""] + resources: ["secrets"] + verbs: ["get", "list", "watch", "create", "delete"] + - apiGroups: [""] + resources: 
["nodes"] + verbs: ["get", "list", "watch"] + - apiGroups: ["rbac.authorization.k8s.io"] + resources: ["roles", "clusterroles", "rolebindings", "clusterrolebindings"] + verbs: ["get", "list", "watch"] + - apiGroups: ["external-secrets.io"] + resources: ["externalsecrets"] + verbs: ["get", "list", "watch", "create", "delete"] + - apiGroups: ["external-secrets.io"] + resources: ["clustersecretstores"] + verbs: ["get", "list", "watch"] + - apiGroups: ["storage.k8s.io"] + resources: ["storageclasses"] + verbs: ["get", "list", "watch"] + - apiGroups: ["kueue.x-k8s.io"] + resources: ["clusterqueues", "resourceflavors", "localqueues", "workloadpriorityclasses"] + verbs: ["get", "list", "watch"] + - apiGroups: ["apiextensions.k8s.io"] + resources: ["customresourcedefinitions"] + verbs: ["get", "list"] +{{- range .Values.airm.additionalClusterRoles.platformAdmin }} + - apiGroups: + {{ .apiGroups | toYaml | nindent 6 }} + resources: + {{ .resources | toYaml | nindent 6 }} + verbs: + {{ .verbs | toYaml | nindent 6 }} +{{- end }} +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: airm-platform-admin-role-binding +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: airm-platform-admin +subjects: + - kind: Group + # Add this in for backwards compatibility + name: "oidcairm-role:Platform Administrator" + apiGroup: rbac.authorization.k8s.io + - kind: Group + # The kubernetes cluster apply an OIDC prefix of 'oidc':, so we adjust the group to expect that + name: "oidc:airm-role:Platform Administrator" + apiGroup: rbac.authorization.k8s.io +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: airm-project-member +rules: + - apiGroups: [""] + resources: + [ + "pods", + "pods/log", + "pods/exec", + "pods/attach", + "pods/portforward", + "events", + "services", + "configmaps", + "persistentvolumes", + "persistentvolumeclaims", + ] + verbs: ["*"] + - apiGroups: ["apps"] + resources: 
["deployments", "replicasets", "statefulsets", "daemonsets"] + verbs: ["*"] + - apiGroups: ["batch"] + resources: ["jobs", "cronjobs"] + verbs: ["*"] + - apiGroups: ["networking.k8s.io"] + resources: ["ingresses", "networkpolicies"] + verbs: ["*"] + - apiGroups: ["kaiwo.silogen.ai"] + resources: ["kaiwojobs", "kaiwoservices"] + verbs: ["*"] + - apiGroups: ["aim.silogen.ai"] + resources: ["aimservices", "aimmodels", "aimmodelcaches", "aimservicetemplates", "aimruntimeconfigs", "aimtemplatecaches"] + verbs: ["*"] + - apiGroups: ["gateway.networking.k8s.io"] + resources: ["httproutes", "gateways"] + verbs: ["*"] + - apiGroups: [""] + resources: ["secrets"] + verbs: ["list", "watch", "create"] + - apiGroups: [""] + resources: ["nodes"] + verbs: ["get", "list", "watch"] + - apiGroups: ["rbac.authorization.k8s.io"] + resources: ["roles", "clusterroles", "rolebindings", "clusterrolebindings"] + verbs: ["get", "list", "watch"] + - apiGroups: ["external-secrets.io"] + resources: ["clustersecretstores", "externalsecrets"] + verbs: ["get", "list", "watch"] + - apiGroups: ["storage.k8s.io"] + resources: ["storageclasses"] + verbs: ["get", "list", "watch"] +{{- range .Values.airm.additionalClusterRoles.projectMember }} + - apiGroups: + {{ .apiGroups | toYaml | nindent 6 }} + resources: + {{ .resources | toYaml | nindent 6 }} + verbs: + {{ .verbs | toYaml | nindent 6 }} +{{- end }} diff --git a/sources/airm/0.3.5/charts/airm-dispatcher/templates/airm-dispatcher-app.yaml b/sources/airm/0.3.5/charts/airm-dispatcher/templates/airm-dispatcher-app.yaml new file mode 100644 index 00000000..8a3489ef --- /dev/null +++ b/sources/airm/0.3.5/charts/airm-dispatcher/templates/airm-dispatcher-app.yaml @@ -0,0 +1,343 @@ +# Copyright © Advanced Micro Devices, Inc., or its affiliates. 
+# +# SPDX-License-Identifier: MIT + +--- +apiVersion: batch/v1 +kind: CronJob +metadata: + name: "{{ .Release.Name }}-cluster-nodes-dispatcher" + namespace: "{{ .Release.Namespace }}" +spec: + concurrencyPolicy: Replace + jobTemplate: + spec: + template: + spec: + containers: + - command: + - curl + - -X + - POST + - http://{{ .Release.Name }}-dispatcher:{{ .Values.airm.dispatcher.servicePort }}/v1/clusters/nodes + image: "{{ .Values.airm.utilities.curl.image.repository }}:{{ .Values.airm.utilities.curl.image.tag }}" + imagePullPolicy: "{{ .Values.airm.utilities.curl.image.pullPolicy }}" + name: airm-cluster-nodes-cron + resources: + limits: + memory: 100Mi + requests: + cpu: 50m + memory: 100Mi + restartPolicy: OnFailure + schedule: 0 * * * * +--- +apiVersion: batch/v1 +kind: CronJob +metadata: + name: "{{ .Release.Name }}-aim-models-dispatcher" + namespace: "{{ .Release.Namespace }}" +spec: + concurrencyPolicy: Forbid + schedule: "*/5 * * * *" + suspend: false + jobTemplate: + spec: + template: + metadata: + annotations: + sidecar.istio.io/inject: "false" + spec: + restartPolicy: OnFailure + initContainers: + - name: check-dispatcher-is-ready + command: + [ + "/bin/sh", + "-c", + "while ! 
nc -z \"$ENDPOINT_URL_TO_CHECK\" \"$ENDPOINT_PORT_TO_CHECK\"; do echo \"Waiting for Airm Dispatcher at ${ENDPOINT_URL_TO_CHECK}:${ENDPOINT_PORT_TO_CHECK}...\"; sleep 3; done; echo \"Airm Dispatcher is accepting connections at ${ENDPOINT_URL_TO_CHECK}:${ENDPOINT_PORT_TO_CHECK}.\"; sleep 3; exit 0" + ] + image: "{{ .Values.airm.utilities.netcat.image.repository }}:{{ .Values.airm.utilities.netcat.image.tag }}" + imagePullPolicy: "{{ .Values.airm.utilities.netcat.image.pullPolicy }}" + env: + - name: ENDPOINT_URL_TO_CHECK + value: "{{ .Release.Name }}-dispatcher.{{ .Release.Namespace }}.svc.cluster.local" + - name: ENDPOINT_PORT_TO_CHECK + value: "{{ .Values.airm.dispatcher.servicePort }}" + securityContext: + allowPrivilegeEscalation: false + runAsNonRoot: false + runAsUser: 0 + capabilities: + drop: + - ALL + add: + - SETUID + - SETGID + - CHOWN + - DAC_OVERRIDE + - FOWNER + - FSETID + seccompProfile: + type: RuntimeDefault + containers: + - name: airm-aim-models-cron + command: + - curl + - -X + - POST + - http://{{ .Release.Name }}-dispatcher:{{ .Values.airm.dispatcher.servicePort }}/v1/aims/cluster-models + image: "{{ .Values.airm.utilities.curl.image.repository }}:{{ .Values.airm.utilities.curl.image.tag }}" + imagePullPolicy: "{{ .Values.airm.utilities.curl.image.pullPolicy }}" + resources: + requests: + memory: "100Mi" + cpu: "50m" + limits: + memory: "100Mi" +--- +apiVersion: batch/v1 +kind: CronJob +metadata: + name: "{{ .Release.Name }}-heartbeat-dispatcher" + namespace: "{{ .Release.Namespace }}" +spec: + concurrencyPolicy: Replace + schedule: "*/1 * * * *" + suspend: false + jobTemplate: + spec: + template: + metadata: + annotations: + sidecar.istio.io/inject: "false" + spec: + restartPolicy: OnFailure + initContainers: + - name: check-dispatcher-is-ready + command: + [ + "/bin/sh", + "-c", + "while ! 
nc -z \"$ENDPOINT_URL_TO_CHECK\" \"$ENDPOINT_PORT_TO_CHECK\"; do echo \"Waiting for Airm Dispatcher at ${ENDPOINT_URL_TO_CHECK}:${ENDPOINT_PORT_TO_CHECK}...\"; sleep 3; done; echo \"Airm Dispatcher is accepting connections at ${ENDPOINT_URL_TO_CHECK}:${ENDPOINT_PORT_TO_CHECK}.\"; sleep 3; exit 0" + ] + image: "{{ .Values.airm.utilities.netcat.image.repository }}:{{ .Values.airm.utilities.netcat.image.tag }}" + imagePullPolicy: "{{ .Values.airm.utilities.netcat.image.pullPolicy }}" + env: + - name: ENDPOINT_URL_TO_CHECK + value: "{{ .Release.Name }}-dispatcher.{{ .Release.Namespace }}.svc.cluster.local" + - name: ENDPOINT_PORT_TO_CHECK + value: "{{ .Values.airm.dispatcher.servicePort }}" + securityContext: + allowPrivilegeEscalation: false + runAsNonRoot: false + runAsUser: 0 + capabilities: + drop: + - ALL + add: + - SETUID + - SETGID + - CHOWN + - DAC_OVERRIDE + - FOWNER + - FSETID + seccompProfile: + type: RuntimeDefault + containers: + - name: airm-heartbeat-cron + command: + - curl + - -X + - POST + - http://{{ .Release.Name }}-dispatcher:{{ .Values.airm.dispatcher.servicePort }}/v1/heartbeats + image: "{{ .Values.airm.utilities.curl.image.repository }}:{{ .Values.airm.utilities.curl.image.tag }}" + imagePullPolicy: "{{ .Values.airm.utilities.curl.image.pullPolicy }}" + resources: + requests: + memory: "100Mi" + cpu: "50m" + limits: + memory: "100Mi" +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: "{{ .Release.Name }}-dispatcher" + namespace: "{{ .Release.Namespace }}" +spec: + replicas: 1 + selector: + matchLabels: + app: "{{ .Release.Name }}-dispatcher" + template: + metadata: + labels: + app: "{{ .Release.Name }}-dispatcher" + spec: + serviceAccountName: "{{ .Release.Name }}-dispatcher-sa" + {{- with .Values.airm.dispatcher.imagePullSecrets }} + imagePullSecrets: + {{- toYaml . | nindent 8 }} + {{- end }} + initContainers: + - name: check-rabbitmq-is-ready + command: + [ + "/bin/sh", + "-c", + "while ! 
nc -z \"$ENDPOINT_URL_TO_CHECK\" \"$ENDPOINT_PORT_TO_CHECK\"; do echo \"Waiting for Airm RabbitMQ at ${ENDPOINT_URL_TO_CHECK}:${ENDPOINT_PORT_TO_CHECK}...\"; sleep 3; done; echo \"Airm RabbitMQ is accepting connections at ${ENDPOINT_URL_TO_CHECK}:${ENDPOINT_PORT_TO_CHECK}.\"; sleep 3; exit 0" + ] + image: "{{ .Values.airm.utilities.netcat.image.repository }}:{{ .Values.airm.utilities.netcat.image.tag }}" + imagePullPolicy: "{{ .Values.airm.utilities.netcat.image.pullPolicy }}" + env: + - name: ENDPOINT_URL_TO_CHECK + value: "{{ .Release.Name }}-rabbitmq.{{ .Release.Namespace }}.svc.cluster.local" + - name: ENDPOINT_PORT_TO_CHECK + value: "{{ .Values.airm.dispatcher.env.rabbitmqPort }}" + securityContext: + allowPrivilegeEscalation: false + runAsNonRoot: false + runAsUser: 0 + capabilities: + drop: + - ALL + add: + - SETUID + - SETGID + - CHOWN + - DAC_OVERRIDE + - FOWNER + - FSETID + seccompProfile: + type: RuntimeDefault + containers: + - name: airm-dispatcher + image: "{{ .Values.airm.dispatcher.image.repository }}:{{ .Values.airm.dispatcher.image.tag }}" + imagePullPolicy: "{{ .Values.airm.dispatcher.image.pullPolicy }}" + ports: + - containerPort: 8080 + env: + - name: KUBE_CLUSTER_NAME + value: demo-cluster + - name: ORG_NAME + value: demo + - name: RABBITMQ_HOST + value: "{{ .Release.Name }}-rabbitmq.{{ .Release.Namespace }}.svc.cluster.local" + - name: RABBITMQ_PORT + value: "{{ .Values.airm.dispatcher.env.rabbitmqPort }}" + - name: RABBITMQ_AIRM_COMMON_VHOST + value: "vh_airm_common" + - name: RABBITMQ_AIRM_COMMON_QUEUE + value: "airm_common" + - name: RABBITMQ_USER + valueFrom: + secretKeyRef: + name: "{{ .Release.Name }}-rabbitmq-common-vhost-user" + key: username + - name: RABBITMQ_PASSWORD + valueFrom: + secretKeyRef: + name: "{{ .Release.Name }}-rabbitmq-common-vhost-user" + key: password + livenessProbe: + httpGet: + path: /v1/health + port: 8080 + initialDelaySeconds: 10 + periodSeconds: 5 + timeoutSeconds: 2 + failureThreshold: 3 + successThreshold: 
1 + readinessProbe: + httpGet: + path: /v1/health + port: 8080 + initialDelaySeconds: 10 + periodSeconds: 5 + timeoutSeconds: 2 + failureThreshold: 3 + successThreshold: 1 + resources: + requests: + memory: "1Gi" + cpu: "500m" + limits: + memory: "1Gi" + securityContext: + runAsUser: 0 +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: "{{ .Release.Name }}-dispatcher-cluster-access-binding" +subjects: + - kind: ServiceAccount + name: "{{ .Release.Name }}-dispatcher-sa" + namespace: "{{ .Release.Namespace }}" +roleRef: + kind: ClusterRole + name: "{{ .Release.Name }}-dispatcher-cluster-access-role" + apiGroup: rbac.authorization.k8s.io +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: "{{ .Release.Name }}-dispatcher-cluster-access-role" +rules: + - apiGroups: [""] + resources: ["services", "namespaces", "configmaps", "pods"] + verbs: ["get", "list", "watch", "create", "delete", "update", "patch"] + - apiGroups: [""] + resources: ["secrets"] + verbs: ["get", "list", "watch", "create", "delete", "update", "patch"] + - apiGroups: [""] + resources: ["nodes"] + verbs: ["get", "list", "watch"] + - apiGroups: ["apps"] + resources: ["deployments", "statefulsets", "daemonsets"] + verbs: ["get", "list", "watch", "create", "delete", "update", "patch"] + - apiGroups: ["batch"] + resources: ["jobs", "cronjobs"] + verbs: ["get", "list", "watch", "create", "delete", "update", "patch"] + - apiGroups: ["kaiwo.silogen.ai"] + resources: ["kaiwojobs", "kaiwoservices", "kaiwoqueueconfigs"] + verbs: ["get", "list", "watch", "create", "delete", "update", "patch"] + - apiGroups: ["gateway.networking.k8s.io"] + resources: ["httproutes"] + verbs: ["get", "list", "watch", "create", "delete", "update", "patch"] + - apiGroups: ["networking.k8s.io"] + resources: ["ingresses"] + verbs: ["get", "list", "watch", "create", "delete", "update", "patch"] + - apiGroups: ["external-secrets.io"] + resources: ["externalsecrets"] + 
verbs: ["get", "list", "watch", "create", "delete", "update", "patch"] + - apiGroups: ["aim.silogen.ai"] + resources: ["aimservices", "aimclustermodels"] + verbs: ["get", "list", "watch", "create", "delete", "update", "patch"] + - apiGroups: ["apiextensions.k8s.io"] + resources: ["customresourcedefinitions"] + verbs: ["get", "list"] +--- +apiVersion: v1 +kind: ServiceAccount +metadata: + name: "{{ .Release.Name }}-dispatcher-sa" + namespace: "{{ .Release.Namespace }}" + +--- +apiVersion: v1 +kind: Service +metadata: + name: "{{ .Release.Name }}-dispatcher" + namespace: "{{ .Release.Namespace }}" + labels: + app: "{{ .Release.Name }}-dispatcher" +spec: + ports: + - name: web + port: {{ .Values.airm.dispatcher.servicePort }} + targetPort: 8080 + type: ClusterIP + selector: + app: "{{ .Release.Name }}-dispatcher" diff --git a/sources/airm/0.3.5/charts/airm-dispatcher/templates/kyverno-cluster-role.yaml b/sources/airm/0.3.5/charts/airm-dispatcher/templates/kyverno-cluster-role.yaml new file mode 100644 index 00000000..e930efd0 --- /dev/null +++ b/sources/airm/0.3.5/charts/airm-dispatcher/templates/kyverno-cluster-role.yaml @@ -0,0 +1,35 @@ +# Copyright © Advanced Micro Devices, Inc., or its affiliates. +# +# SPDX-License-Identifier: MIT + +# These are additional cluster roles needed by kyverno background controller to be able to +# create rolebindings in namespaces. 
+apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: kyverno:airm-policy-roles + labels: + rbac.kyverno.io/aggregate-to-background-controller: "true" +rules: + - apiGroups: ["rbac.authorization.k8s.io"] + resources: ["clusterroles", "rolebindings"] + verbs: ["get", "list", "watch"] + - apiGroups: ["rbac.authorization.k8s.io"] # allow kyverno to bind clusterroles via rolebindings + resources: ["clusterroles"] + verbs: ["bind"] +--- +# These are additional cluster roles needed by kyverno reports controller to be able to +# manage custom resources for reporting +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: kyverno:airm-reports-policy-roles + labels: + rbac.kyverno.io/aggregate-to-reports-controller: "true" +rules: + - apiGroups: ["kaiwo.silogen.ai"] + resources: ["kaiwojobs", "kaiwoservices"] + verbs: ["get", "list", "watch"] + - apiGroups: ["aim.silogen.ai"] + resources: ["aimservices"] + verbs: ["get", "list", "watch"] diff --git a/sources/airm/0.3.5/charts/airm-dispatcher/values.yaml b/sources/airm/0.3.5/charts/airm-dispatcher/values.yaml new file mode 100644 index 00000000..6babd81c --- /dev/null +++ b/sources/airm/0.3.5/charts/airm-dispatcher/values.yaml @@ -0,0 +1,27 @@ +# Copyright © Advanced Micro Devices, Inc., or its affiliates. 
+# +# SPDX-License-Identifier: MIT + +airm: + dispatcher: + image: + repository: amdenterpriseai/airm-dispatcher + tag: 0.3.5 + pullPolicy: IfNotPresent + servicePort: 80 + env: + rabbitmqPort: 5672 + utilities: + netcat: + image: + repository: busybox + tag: 1.37.0 + pullPolicy: IfNotPresent + curl: + image: + repository: curlimages/curl + tag: 8.16.0 + pullPolicy: IfNotPresent + additionalClusterRoles: + platformAdmin: [] + projectMember: [] diff --git a/sources/airm/0.3.5/values.yaml b/sources/airm/0.3.5/values.yaml new file mode 100644 index 00000000..69346880 --- /dev/null +++ b/sources/airm/0.3.5/values.yaml @@ -0,0 +1,3 @@ +# Copyright © Advanced Micro Devices, Inc., or its affiliates. +# +# SPDX-License-Identifier: MIT diff --git a/sources/argocd-config/es-tls-secret.yaml b/sources/argocd-config/es-tls-secret.yaml new file mode 100644 index 00000000..f70de573 --- /dev/null +++ b/sources/argocd-config/es-tls-secret.yaml @@ -0,0 +1,73 @@ +apiVersion: v1 +kind: ServiceAccount +metadata: + name: argocd-tls-secret-svc-account + namespace: argocd +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: argocd-external-secrets-tls-role +rules: +- apiGroups: [""] + resources: ["secrets"] + verbs: ["get", "list", "watch", "create", "update", "patch", "delete"] +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: argocd-external-secrets-tls-role-binding +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: argocd-external-secrets-tls-role +subjects: +- kind: ServiceAccount + name: argocd-tls-secret-svc-account + namespace: argocd +--- +apiVersion: external-secrets.io/v1beta1 +kind: ClusterSecretStore +metadata: + name: argocd-tls-secret-store +spec: + provider: + kubernetes: + remoteNamespace: kgateway-system + server: + caProvider: + type: ConfigMap + name: kube-root-ca.crt + key: ca.crt + namespace: kgateway-system + auth: + serviceAccount: + name: argocd-tls-secret-svc-account 
+--- +apiVersion: external-secrets.io/v1beta1 +kind: ExternalSecret +metadata: + name: cluster-tls + namespace: argocd +spec: + refreshInterval: 1h + secretStoreRef: + name: argocd-tls-secret-store + kind: ClusterSecretStore + target: + name: cluster-tls + template: + metadata: + labels: + app.kubernetes.io/part-of: argocd + data: + - secretKey: cert + remoteRef: + key: cluster-tls + property: tls.crt + - secretKey: key + remoteRef: + key: cluster-tls + property: tls.key +--- + diff --git a/sources/kyverno-config/local-path-access-mode-mutation.yaml b/sources/kyverno-config/local-path-access-mode-mutation.yaml deleted file mode 100644 index 1e1e9369..00000000 --- a/sources/kyverno-config/local-path-access-mode-mutation.yaml +++ /dev/null @@ -1,95 +0,0 @@ ---- -# Kyverno ClusterPolicy to mutate PVC access modes for local-path compatibility -# This policy is ONLY deployed to small and medium clusters via enabledApps configuration -# Large clusters use Longhorn and do NOT deploy this policy at all -apiVersion: kyverno.io/v1 -kind: ClusterPolicy -metadata: - name: local-path-access-mode-mutation - annotations: - policies.kyverno.io/title: "Local-Path Access Mode Mutation" - policies.kyverno.io/category: "Storage" - policies.kyverno.io/severity: "medium" - policies.kyverno.io/subject: "PersistentVolumeClaim" - policies.kyverno.io/minversion: "1.6.0" - policies.kyverno.io/description: >- - This policy automatically converts ReadWriteMany (RWX) and ReadOnlyMany (ROX) - access modes to ReadWriteOnce (RWO) for clusters using local-path provisioner. - This prevents PVC creation failures since local-path only supports RWO and RWOP. - NOTE: This policy is only deployed to small/medium clusters, never to large clusters. 
-spec: - admission: true - background: false - validationFailureAction: Enforce - rules: - - name: convert-rwx-rox-to-rwo - match: - resources: - kinds: - - PersistentVolumeClaim - preconditions: - any: - # Apply if PVC requests unsupported access modes - - key: "ReadWriteMany" - operator: AnyIn - value: "{{ request.object.spec.accessModes || [] }}" - - key: "ReadOnlyMany" - operator: AnyIn - value: "{{ request.object.spec.accessModes || [] }}" - mutate: - patchStrategicMerge: - spec: - # Replace access modes with RWO (only supported mode for local-path) - accessModes: - - ReadWriteOnce - metadata: - annotations: - +(kyverno.io/mutation-applied): "local-path-rwx-to-rwo" - +(kyverno.io/policy-reason): "local-path provisioner only supports ReadWriteOnce and ReadWriteOncePod" - ---- -# Validation policy to warn about access mode changes -apiVersion: kyverno.io/v1 -kind: ClusterPolicy -metadata: - name: local-path-access-mode-warning - annotations: - policies.kyverno.io/title: "Local-Path Access Mode Warning" - policies.kyverno.io/category: "Storage" - policies.kyverno.io/severity: "low" - policies.kyverno.io/subject: "PersistentVolumeClaim" - policies.kyverno.io/description: >- - This policy generates warnings when PVCs request RWX/ROX access modes - that will be converted to RWO due to local-path provisioner limitations. - NOTE: This policy is only deployed to small/medium clusters, never to large clusters. 
-spec: - admission: true - background: false - validationFailureAction: Audit # Warning only, don't block - rules: - - name: warn-access-mode-conversion - match: - resources: - kinds: - - PersistentVolumeClaim - preconditions: - any: - # Warn for unsupported access modes that will be converted - - key: "ReadWriteMany" - operator: AnyIn - value: "{{ request.object.spec.accessModes || [] }}" - - key: "ReadOnlyMany" - operator: AnyIn - value: "{{ request.object.spec.accessModes || [] }}" - validate: - message: >- - WARNING: The requested access mode(s) {{ request.object.spec.accessModes | join(',') }} - are not supported by the local-path provisioner used in small/medium clusters. - The access mode has been automatically converted to ReadWriteOnce (RWO). - For ReadWriteMany support, consider using a large cluster with Longhorn storage. - deny: - conditions: - # This condition is always false, so it only generates a warning - - key: "false" - operator: Equals - value: "true" \ No newline at end of file diff --git a/sources/kyverno-policies/base/templates/dynamic-pvc-creation.yaml b/sources/kyverno-policies/base/templates/dynamic-pvc-creation.yaml new file mode 100644 index 00000000..600a95b1 --- /dev/null +++ b/sources/kyverno-policies/base/templates/dynamic-pvc-creation.yaml @@ -0,0 +1,73 @@ +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: kyverno-pvc-generator +rules: + - apiGroups: + - "" + resources: + - persistentvolumeclaims + verbs: + - create +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: kyverno-pvc-generator-binding +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: kyverno-pvc-generator +subjects: + - kind: ServiceAccount + name: kyverno-background-controller + namespace: kyverno +--- +apiVersion: kyverno.io/v1 +kind: ClusterPolicy +metadata: + name: dynamic-pvc-creation +spec: + admission: true + background: false + emitWarning: false + rules: + - 
generate: + apiVersion: v1 + data: + spec: + accessModes: + - ReadWriteMany + resources: + requests: + storage: '{{ request.object.metadata.annotations."pvc.silogen.ai/user-pvc-size" + }}' + storageClassName: '{{ request.object.metadata.annotations."pvc.silogen.ai/user-pvc-storage-class-name" + }}' + kind: PersistentVolumeClaim + name: pvc-{{ request.object.metadata.annotations."pvc.silogen.ai/user-pvc-uid" }} + namespace: '{{ request.object.metadata.namespace }}' + synchronize: false + match: + resources: + kinds: + - Deployment + - Pod + name: create-pvc-if-annotated + preconditions: + all: + - key: '{{ request.object.metadata.annotations."pvc.silogen.ai/user-pvc-auto-create" + }}' + operator: Equals + value: "true" + - key: '{{ request.object.metadata.annotations."pvc.silogen.ai/user-pvc-size" + }}' + operator: NotEquals + value: "" + - key: '{{ request.object.metadata.annotations."pvc.silogen.ai/user-pvc-storage-class-name" + }}' + operator: NotEquals + value: "" + skipBackgroundRequests: true + validationFailureAction: Enforce \ No newline at end of file diff --git a/sources/kyverno-policies/base/templates/modelcache-resource-constraints.yaml b/sources/kyverno-policies/base/templates/modelcache-resource-constraints.yaml new file mode 100644 index 00000000..03c60fb3 --- /dev/null +++ b/sources/kyverno-policies/base/templates/modelcache-resource-constraints.yaml @@ -0,0 +1,35 @@ +apiVersion: kyverno.io/v1 +kind: ClusterPolicy +metadata: + name: modelcache-download-resource-limits +spec: + rules: + - name: add-resource-constraints-to-download-jobs + match: + any: + - resources: + kinds: + - Job + selector: + matchLabels: + app.kubernetes.io/managed-by: modelcache-controller + mutate: + patchStrategicMerge: + spec: + template: + spec: + containers: + - (name): "model-download" + resources: + limits: + memory: "32Gi" + requests: + memory: "4Gi" + cpu: "2" + env: + - name: HF_XET_HIGH_PERFORMANCE + value: "0" + - name: HF_XET_NUM_CONCURRENT_RANGE_GETS + value: "8" + 
- name: HF_XET_RECONSTRUCT_WRITE_SEQUENTIALLY + value: "1" diff --git a/sources/kyverno-policies/storage-local-path/access-mode-mutation.yaml b/sources/kyverno-policies/storage-local-path/templates/access-mode-mutation.yaml similarity index 86% rename from sources/kyverno-policies/storage-local-path/access-mode-mutation.yaml rename to sources/kyverno-policies/storage-local-path/templates/access-mode-mutation.yaml index c01a9bcc..24cc6f09 100644 --- a/sources/kyverno-policies/storage-local-path/access-mode-mutation.yaml +++ b/sources/kyverno-policies/storage-local-path/templates/access-mode-mutation.yaml @@ -32,10 +32,10 @@ spec: # Apply if PVC requests unsupported access modes - key: "ReadWriteMany" operator: AnyIn - value: "{{ request.object.spec.accessModes || [] }}" + value: "{{ "{{" }} request.object.spec.accessModes || [] {{ "}}" }}" - key: "ReadOnlyMany" operator: AnyIn - value: "{{ request.object.spec.accessModes || [] }}" + value: "{{ "{{" }} request.object.spec.accessModes || [] {{ "}}" }}" mutate: patchStrategicMerge: spec: @@ -44,7 +44,7 @@ spec: - ReadWriteOnce metadata: annotations: - +(kyverno.io/original-access-modes): "{{ request.object.spec.accessModes && request.object.spec.accessModes | join(',') || 'undefined' }}" + +(kyverno.io/original-access-modes): "{{ "{{" }} join(',', request.object.spec.accessModes) {{ "}}" }}" +(kyverno.io/mutation-applied): "local-path-rwx-to-rwo" +(kyverno.io/policy-reason): "local-path provisioner only supports ReadWriteOnce and ReadWriteOncePod" @@ -78,13 +78,13 @@ spec: # Warn for unsupported access modes that will be converted - key: "ReadWriteMany" operator: AnyIn - value: "{{ request.object.spec.accessModes || [] }}" + value: "{{ "{{" }} request.object.spec.accessModes || [] {{ "}}" }}" - key: "ReadOnlyMany" operator: AnyIn - value: "{{ request.object.spec.accessModes || [] }}" + value: "{{ "{{" }} request.object.spec.accessModes || [] {{ "}}" }}" validate: message: >- - WARNING: The requested access mode(s) {{ 
request.object.spec.accessModes && request.object.spec.accessModes | join(',') || 'undefined' }} + WARNING: The requested access mode(s) {{ "{{" }} join(',', request.object.spec.accessModes) {{ "}}" }} are not supported by the local-path provisioner used in small/medium clusters. The access mode has been automatically converted to ReadWriteOnce (RWO). For ReadWriteMany support, consider using a large cluster with Longhorn storage. diff --git a/sources/minio-tenant-config/templates/minio-create-user-cronjob.yaml b/sources/minio-tenant-config/templates/minio-create-user-cronjob.yaml index 952023b8..75bb5b1c 100644 --- a/sources/minio-tenant-config/templates/minio-create-user-cronjob.yaml +++ b/sources/minio-tenant-config/templates/minio-create-user-cronjob.yaml @@ -18,10 +18,10 @@ spec: name: mc resources: requests: - memory: "32Mi" + memory: "64Mi" cpu: "100m" limits: - memory: "128Mi" + memory: "256Mi" cpu: "500m" volumeMounts: - mountPath: /tmp/minio-config diff --git a/sources/openbao-config/0.1.0/templates/openbao-unseal-cronjob.yaml b/sources/openbao-config/0.1.0/templates/openbao-unseal-cronjob.yaml index 9c1447e3..5f8d8887 100644 --- a/sources/openbao-config/0.1.0/templates/openbao-unseal-cronjob.yaml +++ b/sources/openbao-config/0.1.0/templates/openbao-unseal-cronjob.yaml @@ -17,6 +17,19 @@ spec: spec: restartPolicy: Never serviceAccountName: openbao-unseal-job-sa + initContainers: + - name: wait-for-keys + image: ghcr.io/silogen/cluster-tool:latest + command: ["/bin/bash"] + args: + - -c + - | + echo "Waiting for openbao-keys secret to exist..." + while ! kubectl get secret openbao-keys -n cf-openbao >/dev/null 2>&1; do + echo "Secret openbao-keys not found, waiting 10 seconds..." 
+ sleep 10 + done + echo "Secret openbao-keys found, proceeding with unseal job" containers: - name: openbao-init image: ghcr.io/silogen/cluster-tool:latest diff --git a/sources/otel-lgtm-stack/v1.0.7/templates/lgtm-stack.yaml b/sources/otel-lgtm-stack/v1.0.7/templates/lgtm-stack.yaml index f42e8c26..cb76e752 100644 --- a/sources/otel-lgtm-stack/v1.0.7/templates/lgtm-stack.yaml +++ b/sources/otel-lgtm-stack/v1.0.7/templates/lgtm-stack.yaml @@ -78,6 +78,83 @@ metadata: name: grafana-sidecar namespace: {{ .Release.Namespace }} --- +# ConfigMap for liveness/readiness check script +apiVersion: v1 +kind: ConfigMap +metadata: + name: lgtm-check-scripts + namespace: {{ .Release.Namespace }} +data: + check-ps.sh: | + #!/bin/sh + # Get processes using /proc filesystem (runs inside container) + PROCESS_LIST="" + for pid in /proc/[0-9]*; do + cmdline=$(cat "$pid/cmdline" 2>/dev/null | tr "\0" " ") + if [ -n "$cmdline" ]; then + PROCESS_LIST="${PROCESS_LIST}${cmdline}"$'\n' + fi + done + + RED='\033[0;31m' + GREEN='\033[0;32m' + YELLOW='\033[1;33m' + NC='\033[0m' + + if [[ -z "$PROCESS_LIST" ]]; then + echo -e "${RED}[FAILED] Could not retrieve process list from container${NC}" + exit 1 + fi + + # Define required processes + REQUIRED_PROCESSES=( + "/bin/bash /otel-lgtm/run-all.sh" + "/bin/bash ./run-loki.sh" + "/bin/bash ./run-grafana.sh" + "./bin/grafana server" + "/bin/bash ./run-otelcol.sh" + "/bin/bash ./run-prometheus.sh" + "./otelcol-contrib/otelcol-contrib --feature-gates service.profilesSupport --config=file:./otelcol-config.yaml" + "./loki/loki --config.file=./loki-config.yaml" + "/bin/bash ./run-tempo.sh" + "./prometheus/prometheus --web.enable-remote-write-receiver --web.enable-otlp-receiver --enable-feature=exemplar-storage --enable-feature=native-histograms --storage.tsdb.path=/data/prometheus --config.file=./prometheus.yaml" + "/data/grafana/plugins/grafana-llm-app/gpx_llm_linux_amd64" + ) + + # Check each process and build list of missing ones + MISSING_COUNT=0 
+ echo "Expected processes (${#REQUIRED_PROCESSES[@]} total):" + echo "---" + + for REQUIRED in "${REQUIRED_PROCESSES[@]}"; do + if echo "$PROCESS_LIST" | grep -qF "$REQUIRED"; then + echo -e "${GREEN} [RUNNING] $REQUIRED${NC}" + else + echo -e "${RED} [MISSING] $REQUIRED${NC}" + MISSING_COUNT=$((MISSING_COUNT + 1)) + fi + done + + echo "" + + if [[ $MISSING_COUNT -eq 0 ]]; then + echo -e "${GREEN} [OK] All required processes are running${NC}" + else + echo -e "${RED} [WARNING] $MISSING_COUNT of ${#REQUIRED_PROCESSES[@]} processes are missing${NC}" + echo "" + echo -e "${YELLOW} This may indicate the LGTM stack is not fully operational.${NC}" + echo -e "${YELLOW} Consider restarting the pod.${NC}" + fi + + echo "" + + if [[ $MISSING_COUNT -gt 0 ]]; then + echo -e "${RED}[FAILED] Some required processes are not running${NC}" + exit 1 + fi + exit 0 + +--- # Source: grafana/templates/configmap-dashboard-provider.yaml ###### apiVersion: v1 kind: ConfigMap @@ -235,7 +312,8 @@ spec: - | curl -f http://localhost:9090/-/ready && curl -f http://localhost:3100/ready && - curl -f http://localhost:3000/api/health + curl -f http://localhost:3000/api/health && + /scripts/check-ps.sh initialDelaySeconds: 120 periodSeconds: 30 timeoutSeconds: 15 @@ -266,6 +344,9 @@ spec: - name: sc-dashboard-provider ## mountPath: "/otel-lgtm/grafana/conf/provisioning/dashboards/sc-dashboardproviders.yaml" subPath: provider.yaml + - name: check-scripts + mountPath: /scripts + readOnly: true volumes: - name: tempo-data persistentVolumeClaim: @@ -288,3 +369,7 @@ spec: - name: sc-dashboard-provider ## configMap: name: grafana-config-dashboards + - name: check-scripts + configMap: + name: lgtm-check-scripts + defaultMode: 493