diff --git a/.agents/skills/debug-openshell-cluster/SKILL.md b/.agents/skills/debug-openshell-cluster/SKILL.md index 177c52f31..78ead8d48 100644 --- a/.agents/skills/debug-openshell-cluster/SKILL.md +++ b/.agents/skills/debug-openshell-cluster/SKILL.md @@ -116,9 +116,16 @@ Check required Helm deployment secrets: kubectl -n openshell get secret \ openshell-server-tls \ openshell-server-client-ca \ - openshell-client-tls + openshell-client-tls \ + openshell-jwt-keys ``` +If the gateway exits with `failed to read sandbox JWT signing key from +/etc/openshell-jwt/signing.pem`, verify that `openshell-jwt-keys` contains +`signing.pem`, `public.pem`, and `kid`, and that the StatefulSet mounts the +`sandbox-jwt` secret at `/etc/openshell-jwt`. The sandbox JWT mount is required +even when local Helm values disable TLS. + Check the image references currently used by the gateway deployment: ```bash @@ -177,6 +184,18 @@ helm -n openshell get values openshell | grep sandboxNamespace Then inspect sandbox resources in that namespace. +Check the configured sandbox service account when TokenReview bootstrap or +sandbox registration fails. Helm creates a dedicated sandbox service account by +default and writes it to `[openshell.drivers.kubernetes].service_account_name`; +the gateway rejects projected tokens from other service accounts. + +```bash +helm -n openshell get values openshell | grep -A3 sandboxServiceAccount +kubectl -n <sandbox-namespace> get serviceaccount openshell-sandbox +kubectl -n openshell get configmap openshell-config -o jsonpath='{.data.gateway\.toml}' +kubectl -n <sandbox-namespace> get sandbox <sandbox-name> -o jsonpath='{.spec.template.spec.serviceAccountName}{"\n"}' +``` + ### Step 6: Check VM-Backed Gateways Use the VM driver logs and host diagnostics available in the user's environment. 
Verify: diff --git a/.agents/skills/helm-dev-environment/SKILL.md b/.agents/skills/helm-dev-environment/SKILL.md index 623efb2e6..a97395fb1 100644 --- a/.agents/skills/helm-dev-environment/SKILL.md +++ b/.agents/skills/helm-dev-environment/SKILL.md @@ -26,8 +26,9 @@ mise run helm:k3s:create ``` Creates a k3d cluster and merges its kubeconfig into the worktree-local `kubeconfig` file. -Also applies base manifests (`deploy/kube/manifests/agent-sandbox.yaml`). Traefik is -disabled at cluster creation time. +Also applies base manifests (`deploy/kube/manifests/agent-sandbox.yaml`) and preloads the +default community sandbox image into k3d so the first sandbox create does not wait on a +large registry pull. Traefik is disabled at cluster creation time. **Multi-worktree support:** the cluster name is derived from the last component of the current git branch (e.g. branch `kube-support/local-dev/tmutch` → cluster @@ -43,6 +44,8 @@ Port mappings created at cluster time (cannot be changed without recreating): Override with env vars before running `helm:k3s:create`: - `HELM_K3S_LB_HOST_PORT` (default: `8080`) +- `HELM_K3S_PRELOAD_SANDBOX_IMAGE` (default: + `ghcr.io/nvidia/openshell-community/sandboxes/base:latest`; set to an empty value to skip) ### 2. 
Deploy OpenShell diff --git a/.markdownlint-cli2.jsonc b/.markdownlint-cli2.jsonc index 30cf48849..125df0f81 100644 --- a/.markdownlint-cli2.jsonc +++ b/.markdownlint-cli2.jsonc @@ -16,6 +16,7 @@ ".claude/**", ".opencode/**", ".github/**", + "architecture/plans/**", "**/node_modules/**", "target/**", ".pytest_cache/**", diff --git a/Cargo.lock b/Cargo.lock index 9e73bce83..b54e2a54f 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3395,6 +3395,7 @@ dependencies = [ "rcgen", "serde", "serde_json", + "sha2 0.10.9", "tar", "tempfile", "tokio", @@ -3684,6 +3685,7 @@ name = "openshell-server" version = "0.0.0" dependencies = [ "anyhow", + "async-trait", "axum 0.8.9", "bytes", "clap", @@ -3740,6 +3742,7 @@ dependencies = [ "tower-http 0.6.8", "tracing", "tracing-subscriber", + "url", "uuid", "wiremock", "x509-parser", diff --git a/architecture/gateway.md b/architecture/gateway.md index 04e64a73f..75cb624ca 100644 --- a/architecture/gateway.md +++ b/architecture/gateway.md @@ -46,6 +46,25 @@ Sandbox supervisor RPCs authenticate with either mTLS material or a sandbox secret depending on the runtime and deployment mode. User-facing mutations are authorized by role policy when OIDC or edge identity is enabled. +Sandbox secrets are gateway-signed JWTs bound to a single sandbox ID. Docker, +Podman, and VM drivers deliver the initial token through supervisor-only +runtime material; Kubernetes supervisors exchange a projected ServiceAccount +token through `IssueSandboxToken`. The gateway validates that projected token +with Kubernetes `TokenReview`, requires the configured sandbox service account, +checks the returned pod binding against the live pod UID, and reads the pod's +sandbox annotation before minting the gateway JWT. Supervisors renew gateway +JWTs in memory before expiry only while the sandbox record still exists. Older +tokens are not server-revoked; deployments bound replay exposure with short +`gateway_jwt.ttl_secs` lifetimes. + +Sandbox JWTs are not user credentials. 
The gRPC router accepts +`Principal::Sandbox` only on the supervisor-to-gateway RPC allowlist +(`ConnectSupervisor`, `RelayStream`, token renewal, config sync, policy status, +log push, and policy-analysis callbacks). Handlers then compare the +authenticated sandbox ID with any sandbox ID or name resolved from the request. +Supervisor control and relay streams require a matching sandbox principal before +the gateway registers the session or bridges relay bytes. + ## API Surface The gateway API is organized around platform objects and operational streams: diff --git a/crates/openshell-bootstrap/Cargo.toml b/crates/openshell-bootstrap/Cargo.toml index c0fb7e9f4..578d59e65 100644 --- a/crates/openshell-bootstrap/Cargo.toml +++ b/crates/openshell-bootstrap/Cargo.toml @@ -16,6 +16,7 @@ bytes = { workspace = true } futures = { workspace = true } miette = { workspace = true } rcgen = { workspace = true } +sha2 = { workspace = true } serde = { workspace = true } serde_json = { workspace = true } tar = "0.4" diff --git a/crates/openshell-bootstrap/src/jwt.rs b/crates/openshell-bootstrap/src/jwt.rs new file mode 100644 index 000000000..cf8ab0dc1 --- /dev/null +++ b/crates/openshell-bootstrap/src/jwt.rs @@ -0,0 +1,112 @@ +// SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +//! Gateway-minted JWT signing-key generation. +//! +//! The gateway mints per-sandbox identity tokens (see PR 2 of the +//! per-sandbox identity series, issue #1354) signed with an Ed25519 +//! keypair generated once at gateway init and persisted alongside the +//! existing PKI bundle. The signing key never leaves the gateway; the +//! public key plus a stable `kid` are consumed by the gateway's own +//! validator and any future external verifiers. 
+ +use miette::{IntoDiagnostic, Result, WrapErr}; +use rcgen::{KeyPair, PKCS_ED25519}; +use sha2::{Digest, Sha256}; + +/// All PEM-encoded material needed to mint and validate sandbox JWTs. +/// +/// The signing key stays in the gateway process. The public key is shared +/// across gateway replicas (so any replica can validate a JWT minted by +/// any other replica). The `kid` is published in every minted JWT's +/// header so the validator can pick the right key after a future rotation. +pub struct JwtKeyMaterial { + /// PKCS#8 PEM-encoded Ed25519 private key. + pub signing_key_pem: String, + /// `SubjectPublicKeyInfo` PEM-encoded Ed25519 public key. + pub public_key_pem: String, + /// Stable identifier derived from the public key (SHA-256 hex prefix). + /// Embedded in every minted JWT's `kid` header so future rotation can + /// be performed in-place by adding a second key without breaking + /// in-flight tokens. + pub kid: String, +} + +/// Generate a fresh Ed25519 JWT signing key. +/// +/// Output PEM is in the formats `jsonwebtoken` consumes via +/// `EncodingKey::from_ed_pem` (signing) and `DecodingKey::from_ed_pem` +/// (validation), so the gateway can round-trip its own tokens with no +/// further conversion. +pub fn generate_jwt_key() -> Result<JwtKeyMaterial> { + let keypair = KeyPair::generate_for(&PKCS_ED25519) + .into_diagnostic() + .wrap_err("failed to generate Ed25519 JWT signing key")?; + let signing_key_pem = keypair.serialize_pem(); + let public_key_pem = keypair.public_key_pem(); + let kid = kid_from_public_key_der(&keypair.public_key_der()); + Ok(JwtKeyMaterial { + signing_key_pem, + public_key_pem, + kid, + }) +} + +/// Stable `kid` derived from the SHA-256 of the public-key DER. +/// +/// First 16 bytes hex-encoded — collision-resistant for the small N of +/// signing keys a single deployment ever has, while staying short enough +/// to keep JWT headers compact. 
+fn kid_from_public_key_der(public_key_der: &[u8]) -> String { + let digest = Sha256::digest(public_key_der); + hex_encode_prefix(&digest, 16) +} + +fn hex_encode_prefix(bytes: &[u8], n: usize) -> String { + use std::fmt::Write as _; + let mut out = String::with_capacity(n * 2); + for byte in bytes.iter().take(n) { + let _ = write!(out, "{byte:02x}"); + } + out +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn generate_jwt_key_produces_parseable_pem() { + let material = generate_jwt_key().expect("generate_jwt_key"); + assert!(material.signing_key_pem.contains("BEGIN PRIVATE KEY")); + assert!(material.public_key_pem.contains("BEGIN PUBLIC KEY")); + assert_eq!(material.kid.len(), 32, "kid is 16 bytes hex-encoded"); + assert!(material.kid.chars().all(|c| c.is_ascii_hexdigit())); + } + + #[test] + fn kid_is_stable_for_identical_public_keys() { + // Same input -> same kid. Hash of a fixed byte string. + let kid_a = kid_from_public_key_der(b"abc"); + let kid_b = kid_from_public_key_der(b"abc"); + assert_eq!(kid_a, kid_b); + } + + #[test] + fn kid_differs_for_different_public_keys() { + let kid_a = kid_from_public_key_der(b"first"); + let kid_b = kid_from_public_key_der(b"second"); + assert_ne!(kid_a, kid_b); + } + + #[test] + fn generated_keys_are_unique() { + let a = generate_jwt_key().expect("generate_jwt_key"); + let b = generate_jwt_key().expect("generate_jwt_key"); + assert_ne!( + a.kid, b.kid, + "fresh keypairs must produce distinct public keys" + ); + assert_ne!(a.signing_key_pem, b.signing_key_pem); + } +} diff --git a/crates/openshell-bootstrap/src/lib.rs b/crates/openshell-bootstrap/src/lib.rs index 0988c4b6b..8845f0392 100644 --- a/crates/openshell-bootstrap/src/lib.rs +++ b/crates/openshell-bootstrap/src/lib.rs @@ -3,6 +3,7 @@ pub mod build; pub mod edge_token; +pub mod jwt; pub mod oidc_token; mod metadata; diff --git a/crates/openshell-bootstrap/src/pki.rs b/crates/openshell-bootstrap/src/pki.rs index b6747260b..388507840 100644 --- 
a/crates/openshell-bootstrap/src/pki.rs +++ b/crates/openshell-bootstrap/src/pki.rs @@ -1,6 +1,7 @@ // SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. // SPDX-License-Identifier: Apache-2.0 +use crate::jwt::{JwtKeyMaterial, generate_jwt_key}; use miette::{IntoDiagnostic, Result, WrapErr}; use rcgen::{BasicConstraints, CertificateParams, DnType, Ia5String, IsCa, KeyPair, SanType}; use std::net::IpAddr; @@ -15,6 +16,12 @@ pub struct PkiBundle { pub server_key_pem: String, pub client_cert_pem: String, pub client_key_pem: String, + /// PKCS#8 PEM Ed25519 private key for minting per-sandbox JWTs. + pub jwt_signing_key_pem: String, + /// SPKI PEM Ed25519 public key, paired with `jwt_signing_key_pem`. + pub jwt_public_key_pem: String, + /// Stable identifier embedded in the `kid` header of every minted JWT. + pub jwt_key_id: String, } /// Default SANs always included on the server certificate. Covers the host @@ -99,6 +106,13 @@ pub fn generate_pki(extra_sans: &[String]) -> Result<PkiBundle> { .into_diagnostic() .wrap_err("failed to sign client certificate")?; + // --- JWT signing key (Ed25519, used to mint per-sandbox identity tokens) --- + let JwtKeyMaterial { + signing_key_pem: jwt_signing_key_pem, + public_key_pem: jwt_public_key_pem, + kid: jwt_key_id, + } = generate_jwt_key().wrap_err("failed to generate JWT signing key")?; + Ok(PkiBundle { ca_cert_pem: ca_cert.pem(), ca_key_pem: ca_key.serialize_pem(), @@ -106,6 +120,9 @@ pub fn generate_pki(extra_sans: &[String]) -> Result<PkiBundle> { server_key_pem: server_key.serialize_pem(), client_cert_pem: client_cert.pem(), client_key_pem: client_key.serialize_pem(), + jwt_signing_key_pem, + jwt_public_key_pem, + jwt_key_id, }) } @@ -148,6 +165,9 @@ mod tests { assert!(bundle.server_key_pem.contains("BEGIN PRIVATE KEY")); assert!(bundle.client_cert_pem.contains("BEGIN CERTIFICATE")); assert!(bundle.client_key_pem.contains("BEGIN PRIVATE KEY")); + 
assert!(bundle.jwt_signing_key_pem.contains("BEGIN PRIVATE KEY")); + assert!(bundle.jwt_public_key_pem.contains("BEGIN PUBLIC KEY")); + assert_eq!(bundle.jwt_key_id.len(), 32, "kid is 16 bytes hex-encoded"); } #[test] diff --git a/crates/openshell-cli/src/run.rs b/crates/openshell-cli/src/run.rs index 2e3cb0531..9bd4adf5d 100644 --- a/crates/openshell-cli/src/run.rs +++ b/crates/openshell-cli/src/run.rs @@ -743,6 +743,11 @@ fn import_local_package_mtls_bundle(name: &str) -> Result> { client_key_pem: std::fs::read_to_string(&key) .into_diagnostic() .wrap_err_with(|| format!("failed to read {}", key.display()))?, + // CLI never holds the gateway's JWT signing material — only the + // gateway needs it. Fill the JWT fields with placeholders. + jwt_signing_key_pem: String::new(), + jwt_public_key_pem: String::new(), + jwt_key_id: String::new(), }; openshell_bootstrap::mtls::store_pki_bundle(name, &bundle) .wrap_err_with(|| format!("failed to store mTLS bundle for gateway '{name}'"))?; diff --git a/crates/openshell-cli/tests/ensure_providers_integration.rs b/crates/openshell-cli/tests/ensure_providers_integration.rs index fa2605ac2..ea2d5a465 100644 --- a/crates/openshell-cli/tests/ensure_providers_integration.rs +++ b/crates/openshell-cli/tests/ensure_providers_integration.rs @@ -535,6 +535,20 @@ impl OpenShell for TestOpenShell { Err(Status::unimplemented("not implemented in test")) } + async fn issue_sandbox_token( + &self, + _request: tonic::Request, + ) -> Result, Status> { + Err(Status::unimplemented("not implemented in test")) + } + + async fn refresh_sandbox_token( + &self, + _request: tonic::Request, + ) -> Result, Status> { + Err(Status::unimplemented("not implemented in test")) + } + async fn connect_supervisor( &self, _request: tonic::Request>, diff --git a/crates/openshell-cli/tests/mtls_integration.rs b/crates/openshell-cli/tests/mtls_integration.rs index fd7a18b28..8f83599b1 100644 --- a/crates/openshell-cli/tests/mtls_integration.rs +++ 
b/crates/openshell-cli/tests/mtls_integration.rs @@ -424,6 +424,20 @@ impl OpenShell for TestOpenShell { Err(Status::unimplemented("not implemented in test")) } + async fn issue_sandbox_token( + &self, + _request: tonic::Request, + ) -> Result, Status> { + Err(Status::unimplemented("not implemented in test")) + } + + async fn refresh_sandbox_token( + &self, + _request: tonic::Request, + ) -> Result, Status> { + Err(Status::unimplemented("not implemented in test")) + } + async fn connect_supervisor( &self, _request: tonic::Request>, diff --git a/crates/openshell-cli/tests/provider_commands_integration.rs b/crates/openshell-cli/tests/provider_commands_integration.rs index cb2b3cb18..b0e3b99a1 100644 --- a/crates/openshell-cli/tests/provider_commands_integration.rs +++ b/crates/openshell-cli/tests/provider_commands_integration.rs @@ -800,6 +800,20 @@ impl OpenShell for TestOpenShell { Err(Status::unimplemented("not implemented in test")) } + async fn issue_sandbox_token( + &self, + _request: tonic::Request, + ) -> Result, Status> { + Err(Status::unimplemented("not implemented in test")) + } + + async fn refresh_sandbox_token( + &self, + _request: tonic::Request, + ) -> Result, Status> { + Err(Status::unimplemented("not implemented in test")) + } + async fn connect_supervisor( &self, _request: tonic::Request>, diff --git a/crates/openshell-cli/tests/sandbox_create_lifecycle_integration.rs b/crates/openshell-cli/tests/sandbox_create_lifecycle_integration.rs index 8e606beea..2ce409413 100644 --- a/crates/openshell-cli/tests/sandbox_create_lifecycle_integration.rs +++ b/crates/openshell-cli/tests/sandbox_create_lifecycle_integration.rs @@ -604,6 +604,20 @@ impl OpenShell for TestOpenShell { Err(Status::unimplemented("not implemented in test")) } + async fn issue_sandbox_token( + &self, + _request: tonic::Request, + ) -> Result, Status> { + Err(Status::unimplemented("not implemented in test")) + } + + async fn refresh_sandbox_token( + &self, + _request: tonic::Request, + ) 
-> Result, Status> { + Err(Status::unimplemented("not implemented in test")) + } + async fn connect_supervisor( &self, _request: tonic::Request>, diff --git a/crates/openshell-cli/tests/sandbox_name_fallback_integration.rs b/crates/openshell-cli/tests/sandbox_name_fallback_integration.rs index 44393fb2f..88358391c 100644 --- a/crates/openshell-cli/tests/sandbox_name_fallback_integration.rs +++ b/crates/openshell-cli/tests/sandbox_name_fallback_integration.rs @@ -437,6 +437,20 @@ impl OpenShell for TestOpenShell { Err(Status::unimplemented("not implemented in test")) } + async fn issue_sandbox_token( + &self, + _request: tonic::Request, + ) -> Result, Status> { + Err(Status::unimplemented("not implemented in test")) + } + + async fn refresh_sandbox_token( + &self, + _request: tonic::Request, + ) -> Result, Status> { + Err(Status::unimplemented("not implemented in test")) + } + async fn connect_supervisor( &self, _request: tonic::Request>, diff --git a/crates/openshell-core/src/config.rs b/crates/openshell-core/src/config.rs index dbd8dfb8a..287bd72eb 100644 --- a/crates/openshell-core/src/config.rs +++ b/crates/openshell-core/src/config.rs @@ -205,6 +205,13 @@ pub struct Config { #[serde(default)] pub oidc: Option, + /// Gateway-minted sandbox JWT configuration. When `Some`, the gateway + /// loads the signing key from disk and accepts gateway-issued sandbox + /// JWTs as `Principal::Sandbox`. Required for the per-sandbox identity + /// flow (issue #1354). + #[serde(default)] + pub gateway_jwt: Option, + /// Database URL for persistence. pub database_url: String, @@ -317,6 +324,37 @@ const fn default_jwks_ttl_secs() -> u64 { 3600 } +/// Gateway-minted sandbox JWT configuration. +/// +/// Points the gateway at the Ed25519 signing key (produced by `certgen`) +/// and identifies the issuer string embedded in every minted token. The +/// signing key never leaves the gateway process; the public key is loaded +/// by the same gateway so it can validate its own tokens. 
+#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct GatewayJwtConfig { + /// Path to the Ed25519 signing key (PKCS#8 PEM). + pub signing_key_path: PathBuf, + /// Path to the matching public key (SPKI PEM). + pub public_key_path: PathBuf, + /// Path to the `kid` value (plain text, one line). + pub kid_path: PathBuf, + /// Stable gateway identity embedded in `iss`/`aud`. Defaults to the + /// hostname-or-`openshell` placeholder if unset. + #[serde(default = "default_gateway_id")] + pub gateway_id: String, + /// Token lifetime in seconds. Defaults to 1 hour. + #[serde(default = "default_sandbox_token_ttl_secs")] + pub ttl_secs: u64, +} + +fn default_gateway_id() -> String { + "openshell".to_string() +} + +const fn default_sandbox_token_ttl_secs() -> u64 { + 3_600 +} + fn default_roles_claim() -> String { "realm_access.roles".to_string() } @@ -340,6 +378,7 @@ impl Config { log_level: default_log_level(), tls, oidc: None, + gateway_jwt: None, database_url: String::new(), compute_drivers: vec![], ssh_session_ttl_secs: default_ssh_session_ttl_secs(), diff --git a/crates/openshell-core/src/lib.rs b/crates/openshell-core/src/lib.rs index d0225c471..cc82ab53f 100644 --- a/crates/openshell-core/src/lib.rs +++ b/crates/openshell-core/src/lib.rs @@ -26,7 +26,7 @@ pub mod sandbox_env; pub mod settings; pub mod time; -pub use config::{ComputeDriverKind, Config, OidcConfig, TlsConfig}; +pub use config::{ComputeDriverKind, Config, GatewayJwtConfig, OidcConfig, TlsConfig}; pub use error::{ComputeDriverError, Error, Result}; pub use metadata::{GetResourceVersion, ObjectId, ObjectLabels, ObjectName, SetResourceVersion}; diff --git a/crates/openshell-core/src/sandbox_env.rs b/crates/openshell-core/src/sandbox_env.rs index d345762ca..b367e450c 100644 --- a/crates/openshell-core/src/sandbox_env.rs +++ b/crates/openshell-core/src/sandbox_env.rs @@ -34,3 +34,22 @@ pub const TLS_CERT: &str = "OPENSHELL_TLS_CERT"; /// Path to the private key for mTLS communication with the gateway. 
pub const TLS_KEY: &str = "OPENSHELL_TLS_KEY"; + +/// Raw gateway-minted JWT identifying this sandbox. Mutually exclusive with +/// [`SANDBOX_TOKEN_FILE`] / [`K8S_SA_TOKEN_FILE`]; used only by test harnesses +/// that bypass the file-mount path. +pub const SANDBOX_TOKEN: &str = "OPENSHELL_SANDBOX_TOKEN"; + +/// Path to the file holding a gateway-minted sandbox JWT. +/// +/// Set by the Docker, Podman, and VM drivers, which write the token to a +/// bundle file at sandbox-create time. Read once at supervisor startup; +/// the token is held in process memory thereafter. +pub const SANDBOX_TOKEN_FILE: &str = "OPENSHELL_SANDBOX_TOKEN_FILE"; + +/// Path to the projected `ServiceAccount` JWT (Kubernetes driver). +/// +/// Used to bootstrap a gateway-minted JWT via `IssueSandboxToken`. Kubelet +/// writes and rotates this file; the supervisor exchanges its contents +/// for a gateway JWT at startup and on refresh. +pub const K8S_SA_TOKEN_FILE: &str = "OPENSHELL_K8S_SA_TOKEN_FILE"; diff --git a/crates/openshell-driver-docker/src/lib.rs b/crates/openshell-driver-docker/src/lib.rs index 3a0772217..03e584843 100644 --- a/crates/openshell-driver-docker/src/lib.rs +++ b/crates/openshell-driver-docker/src/lib.rs @@ -992,6 +992,19 @@ fn build_environment(sandbox: &DriverSandbox, config: &DockerDriverRuntimeConfig ); } + // Gateway-minted sandbox JWT (PR 3 of the per-sandbox identity series). + // Passed via env var since Docker has no native secret mount that is + // simpler than the existing bind-mount pattern; the trust boundary + // (`docker inspect` access) is already equivalent to the TLS key mount. 
+ if let Some(spec) = sandbox.spec.as_ref() + && !spec.sandbox_token.is_empty() + { + environment.insert( + openshell_core::sandbox_env::SANDBOX_TOKEN.to_string(), + spec.sandbox_token.clone(), + ); + } + let mut pairs = environment.into_iter().collect::>(); pairs.sort_by(|left, right| left.0.cmp(&right.0)); pairs diff --git a/crates/openshell-driver-docker/src/tests.rs b/crates/openshell-driver-docker/src/tests.rs index 2ac2da1ee..575fb6677 100644 --- a/crates/openshell-driver-docker/src/tests.rs +++ b/crates/openshell-driver-docker/src/tests.rs @@ -33,6 +33,7 @@ fn test_sandbox() -> DriverSandbox { }), gpu: false, gpu_device: String::new(), + sandbox_token: String::new(), }), status: None, } diff --git a/crates/openshell-driver-kubernetes/README.md b/crates/openshell-driver-kubernetes/README.md index 4a8a8f76b..1d45a1d83 100644 --- a/crates/openshell-driver-kubernetes/README.md +++ b/crates/openshell-driver-kubernetes/README.md @@ -38,6 +38,12 @@ The driver injects gateway callback configuration, sandbox identity, TLS client material, and the supervisor SSH socket path into the workload. Driver-owned values must override image-provided environment variables. +Sandbox pods run as `service_account_name` and keep +`automountServiceAccountToken: false`. The only Kubernetes token exposed to the +supervisor is an explicit, audience-bound projected token mounted at +`/var/run/secrets/openshell/token` for the one-shot `IssueSandboxToken` +bootstrap exchange. + The gateway uses the supervisor relay for connect, exec, and file sync. Sandbox pods do not need direct external ingress for SSH. diff --git a/crates/openshell-driver-kubernetes/src/config.rs b/crates/openshell-driver-kubernetes/src/config.rs index 3f7888af6..3c1b28738 100644 --- a/crates/openshell-driver-kubernetes/src/config.rs +++ b/crates/openshell-driver-kubernetes/src/config.rs @@ -7,6 +7,9 @@ use serde::{Deserialize, Serialize}; /// Default Kubernetes namespace for sandbox resources. 
pub const DEFAULT_K8S_NAMESPACE: &str = "openshell"; +/// Default Kubernetes `ServiceAccount` assigned to sandbox pods. +pub const DEFAULT_SANDBOX_SERVICE_ACCOUNT_NAME: &str = "default"; + /// Default storage size for the workspace PVC. pub const DEFAULT_WORKSPACE_STORAGE_SIZE: &str = "2Gi"; @@ -51,6 +54,9 @@ impl std::str::FromStr for SupervisorSideloadMethod { #[serde(default, deny_unknown_fields)] pub struct KubernetesComputeConfig { pub namespace: String, + /// Kubernetes `ServiceAccount` assigned to sandbox pods and accepted by + /// the gateway's `TokenReview` bootstrap authenticator. + pub service_account_name: String, pub default_image: String, pub image_pull_policy: String, /// Image that provides the `openshell-sandbox` supervisor binary. @@ -68,12 +74,30 @@ pub struct KubernetesComputeConfig { pub host_gateway_ip: String, pub enable_user_namespaces: bool, pub workspace_default_storage_size: String, + /// Lifetime (seconds) of the projected `ServiceAccount` token kubelet + /// writes into each sandbox pod. Used only for the one-shot + /// `IssueSandboxToken` bootstrap exchange — the gateway-minted JWT + /// that follows has its own TTL set via `gateway_jwt.ttl_secs`. + /// + /// Kubelet enforces a minimum of 600 seconds; the supervisor uses + /// this token within a few seconds of pod start, so any value at + /// the floor is sufficient. Default 3600. + pub sa_token_ttl_secs: i64, } +/// Lower bound enforced by kubelet for projected SA tokens. +pub const MIN_SA_TOKEN_TTL_SECS: i64 = 600; + +/// Cap at 24h — operators who want longer-lived bootstrap tokens are +/// almost certainly misconfigured (the token is consumed seconds after +/// pod start). 
+pub const MAX_SA_TOKEN_TTL_SECS: i64 = 86_400; + impl Default for KubernetesComputeConfig { fn default() -> Self { Self { namespace: DEFAULT_K8S_NAMESPACE.to_string(), + service_account_name: DEFAULT_SANDBOX_SERVICE_ACCOUNT_NAME.to_string(), default_image: default_sandbox_image(), // Default empty so the gateway omits `imagePullPolicy` from pod // specs and Kubernetes applies its own default (Always for `latest`, @@ -89,6 +113,22 @@ impl Default for KubernetesComputeConfig { host_gateway_ip: String::new(), enable_user_namespaces: false, workspace_default_storage_size: DEFAULT_WORKSPACE_STORAGE_SIZE.to_string(), + sa_token_ttl_secs: 3600, + } + } +} + +impl KubernetesComputeConfig { + /// Clamp `sa_token_ttl_secs` into the `[MIN_SA_TOKEN_TTL_SECS, + /// MAX_SA_TOKEN_TTL_SECS]` range used by the projected-volume spec. + /// Invalid (≤0) values fall back to the default 3600. + #[must_use] + pub fn effective_sa_token_ttl_secs(&self) -> i64 { + if self.sa_token_ttl_secs <= 0 { + 3600 + } else { + self.sa_token_ttl_secs + .clamp(MIN_SA_TOKEN_TTL_SECS, MAX_SA_TOKEN_TTL_SECS) } } } @@ -113,6 +153,15 @@ mod tests { ); } + #[test] + fn default_service_account_name_is_default() { + let cfg = KubernetesComputeConfig::default(); + assert_eq!( + cfg.service_account_name, + DEFAULT_SANDBOX_SERVICE_ACCOUNT_NAME + ); + } + #[test] fn serde_override_workspace_storage_size() { let json = serde_json::json!({ @@ -121,4 +170,13 @@ mod tests { let cfg: KubernetesComputeConfig = serde_json::from_value(json).unwrap(); assert_eq!(cfg.workspace_default_storage_size, "10Gi"); } + + #[test] + fn serde_override_service_account_name() { + let json = serde_json::json!({ + "service_account_name": "openshell-sandbox" + }); + let cfg: KubernetesComputeConfig = serde_json::from_value(json).unwrap(); + assert_eq!(cfg.service_account_name, "openshell-sandbox"); + } } diff --git a/crates/openshell-driver-kubernetes/src/driver.rs b/crates/openshell-driver-kubernetes/src/driver.rs index 
a624f787e..01b183b2b 100644 --- a/crates/openshell-driver-kubernetes/src/driver.rs +++ b/crates/openshell-driver-kubernetes/src/driver.rs @@ -4,7 +4,8 @@ //! Kubernetes compute driver. use crate::config::{ - DEFAULT_WORKSPACE_STORAGE_SIZE, KubernetesComputeConfig, SupervisorSideloadMethod, + DEFAULT_SANDBOX_SERVICE_ACCOUNT_NAME, DEFAULT_WORKSPACE_STORAGE_SIZE, KubernetesComputeConfig, + SupervisorSideloadMethod, }; use futures::{Stream, StreamExt, TryStreamExt}; use k8s_openapi::api::core::v1::{Event as KubeEventObj, Node}; @@ -319,6 +320,7 @@ impl KubernetesComputeDriver { supervisor_image: &self.config.supervisor_image, supervisor_image_pull_policy: &self.config.supervisor_image_pull_policy, supervisor_sideload_method: self.config.supervisor_sideload_method, + service_account_name: &self.config.service_account_name, sandbox_id: &sandbox.id, sandbox_name: &sandbox.name, grpc_endpoint: &self.config.grpc_endpoint, @@ -327,6 +329,7 @@ impl KubernetesComputeDriver { host_gateway_ip: &self.config.host_gateway_ip, enable_user_namespaces: self.config.enable_user_namespaces, workspace_default_storage_size: &self.config.workspace_default_storage_size, + sa_token_ttl_secs: self.config.effective_sa_token_ttl_secs(), }; obj.data = sandbox_to_k8s_spec(sandbox.spec.as_ref(), &params); let api = self.api(); @@ -1047,13 +1050,13 @@ fn default_workspace_volume_claim_templates(storage_size: &str) -> serde_json::V } /// Parameters shared by `sandbox_to_k8s_spec` and `sandbox_template_to_k8s`. 
-#[derive(Default)] struct SandboxPodParams<'a> { default_image: &'a str, image_pull_policy: &'a str, supervisor_image: &'a str, supervisor_image_pull_policy: &'a str, supervisor_sideload_method: SupervisorSideloadMethod, + service_account_name: &'a str, sandbox_id: &'a str, sandbox_name: &'a str, grpc_endpoint: &'a str, @@ -1062,6 +1065,31 @@ struct SandboxPodParams<'a> { host_gateway_ip: &'a str, enable_user_namespaces: bool, workspace_default_storage_size: &'a str, + /// Lifetime (seconds) of the projected `ServiceAccount` token used + /// for the bootstrap `IssueSandboxToken` exchange. + sa_token_ttl_secs: i64, +} + +impl Default for SandboxPodParams<'_> { + fn default() -> Self { + Self { + default_image: "", + image_pull_policy: "", + supervisor_image: "", + supervisor_image_pull_policy: "", + supervisor_sideload_method: SupervisorSideloadMethod::default(), + service_account_name: DEFAULT_SANDBOX_SERVICE_ACCOUNT_NAME, + sandbox_id: "", + sandbox_name: "", + grpc_endpoint: "", + ssh_socket_path: "", + client_tls_secret_name: "", + host_gateway_ip: "", + enable_user_namespaces: false, + workspace_default_storage_size: DEFAULT_WORKSPACE_STORAGE_SIZE, + sa_token_ttl_secs: 3600, + } + } } fn spec_pod_env(spec: Option<&SandboxSpec>) -> std::collections::HashMap { @@ -1153,8 +1181,28 @@ fn sandbox_template_to_k8s( if !template.labels.is_empty() { metadata.insert("labels".to_string(), serde_json::json!(template.labels)); } - if let Some(annotations) = platform_config_struct(template, "annotations") { - metadata.insert("annotations".to_string(), annotations); + // Carry the sandbox UUID as a pod annotation so the gateway can resolve + // a projected SA token claim (pod name + uid) back to a sandbox identity + // when the supervisor calls `IssueSandboxToken` at startup. The gateway's + // K8s Role does NOT grant `patch pods`, so this annotation is + // effectively immutable post-create (see plan §11.8). 
+ let mut pod_annotations = platform_config_struct(template, "annotations") + .and_then(|v| match v { + serde_json::Value::Object(map) => Some(map), + _ => None, + }) + .unwrap_or_default(); + if !params.sandbox_id.is_empty() { + pod_annotations.insert( + "openshell.io/sandbox-id".to_string(), + serde_json::Value::String(params.sandbox_id.to_string()), + ); + } + if !pod_annotations.is_empty() { + metadata.insert( + "annotations".to_string(), + serde_json::Value::Object(pod_annotations), + ); } let mut spec = serde_json::Map::new(); @@ -1185,6 +1233,13 @@ fn sandbox_template_to_k8s( } } + if !params.service_account_name.is_empty() { + spec.insert( + "serviceAccountName".to_string(), + serde_json::json!(params.service_account_name), + ); + } + // Disable service account token auto-mounting for security hardening. // Sandbox pods should not have access to the Kubernetes API by default. spec.insert( @@ -1241,17 +1296,26 @@ fn sandbox_template_to_k8s( }), ); - // Mount client TLS secret for mTLS to the server. + // Mount client TLS secret for mTLS to the server, plus the projected + // ServiceAccount token used to bootstrap the sandbox's gateway JWT + // via `IssueSandboxToken`. 
+ let mut volume_mounts: Vec = Vec::new(); if !params.client_tls_secret_name.is_empty() { - container.insert( - "volumeMounts".to_string(), - serde_json::json!([{ - "name": "openshell-client-tls", - "mountPath": "/etc/openshell-tls/client", - "readOnly": true - }]), - ); - } + volume_mounts.push(serde_json::json!({ + "name": "openshell-client-tls", + "mountPath": "/etc/openshell-tls/client", + "readOnly": true + })); + } + volume_mounts.push(serde_json::json!({ + "name": "openshell-sa-token", + "mountPath": "/var/run/secrets/openshell", + "readOnly": true, + })); + container.insert( + "volumeMounts".to_string(), + serde_json::Value::Array(volume_mounts), + ); if let Some(resources) = container_resources(template, gpu) { container.insert("resources".to_string(), resources); @@ -1263,15 +1327,31 @@ fn sandbox_template_to_k8s( // Add TLS secret volume. Mode 0400 (owner-read) prevents the // unprivileged sandbox user from reading the mTLS private key. + let mut volumes: Vec = Vec::new(); if !params.client_tls_secret_name.is_empty() { - spec.insert( - "volumes".to_string(), - serde_json::json!([{ - "name": "openshell-client-tls", - "secret": { "secretName": params.client_tls_secret_name, "defaultMode": 256 } - }]), - ); - } + volumes.push(serde_json::json!({ + "name": "openshell-client-tls", + "secret": { "secretName": params.client_tls_secret_name, "defaultMode": 256 } + })); + } + // Projected ServiceAccountToken volume — kubelet writes a short-lived + // audience-bound JWT into /var/run/secrets/openshell/token and rotates + // it automatically. The supervisor exchanges this for a gateway-minted + // JWT via `IssueSandboxToken` once at startup. 
+ volumes.push(serde_json::json!({ + "name": "openshell-sa-token", + "projected": { + "sources": [{ + "serviceAccountToken": { + "audience": "openshell-gateway", + "expirationSeconds": params.sa_token_ttl_secs, + "path": "token" + } + }], + "defaultMode": 256 + } + })); + spec.insert("volumes".to_string(), serde_json::Value::Array(volumes)); // Add hostAliases so sandbox pods can reach the Docker host. if !params.host_gateway_ip.is_empty() { @@ -1450,6 +1530,14 @@ fn apply_required_env( "/etc/openshell-tls/client/tls.key", ); } + // Projected ServiceAccount token written by kubelet (see the volume + // definition in `sandbox_template_to_k8s`). The supervisor reads this + // and exchanges it for a gateway-minted JWT via `IssueSandboxToken`. + upsert_env( + env, + openshell_core::sandbox_env::K8S_SA_TOKEN_FILE, + "/var/run/secrets/openshell/token", + ); } fn upsert_env(env: &mut Vec, name: &str, value: &str) { @@ -2421,6 +2509,32 @@ mod tests { ); } + #[test] + fn sandbox_template_sets_configured_service_account_name() { + let params = SandboxPodParams { + service_account_name: "openshell-sandbox", + ..Default::default() + }; + let pod_template = sandbox_template_to_k8s( + &SandboxTemplate::default(), + false, + &std::collections::HashMap::new(), + true, + &params, + ); + + assert_eq!( + pod_template["spec"]["serviceAccountName"], + serde_json::json!("openshell-sandbox"), + "sandbox pods must run under the configured service account" + ); + assert_eq!( + pod_template["spec"]["automountServiceAccountToken"], + serde_json::json!(false), + "explicit service account selection must not re-enable default token automounting" + ); + } + #[test] fn platform_config_bool_extracts_value() { let template = SandboxTemplate { diff --git a/crates/openshell-driver-kubernetes/src/lib.rs b/crates/openshell-driver-kubernetes/src/lib.rs index 433d62353..b0a5ca957 100644 --- a/crates/openshell-driver-kubernetes/src/lib.rs +++ b/crates/openshell-driver-kubernetes/src/lib.rs @@ -6,7 +6,8 @@ pub 
mod driver; pub mod grpc; pub use config::{ - DEFAULT_WORKSPACE_STORAGE_SIZE, KubernetesComputeConfig, SupervisorSideloadMethod, + DEFAULT_SANDBOX_SERVICE_ACCOUNT_NAME, DEFAULT_WORKSPACE_STORAGE_SIZE, KubernetesComputeConfig, + SupervisorSideloadMethod, }; pub use driver::{KubernetesComputeDriver, KubernetesDriverError}; pub use grpc::ComputeDriverService; diff --git a/crates/openshell-driver-kubernetes/src/main.rs b/crates/openshell-driver-kubernetes/src/main.rs index 37f8c08f8..703659af3 100644 --- a/crates/openshell-driver-kubernetes/src/main.rs +++ b/crates/openshell-driver-kubernetes/src/main.rs @@ -10,8 +10,8 @@ use tracing_subscriber::EnvFilter; use openshell_core::VERSION; use openshell_core::proto::compute::v1::compute_driver_server::ComputeDriverServer; use openshell_driver_kubernetes::{ - ComputeDriverService, KubernetesComputeConfig, KubernetesComputeDriver, - SupervisorSideloadMethod, + ComputeDriverService, DEFAULT_SANDBOX_SERVICE_ACCOUNT_NAME, KubernetesComputeConfig, + KubernetesComputeDriver, SupervisorSideloadMethod, }; #[derive(Parser, Debug)] @@ -31,6 +31,13 @@ struct Args { #[arg(long, env = "OPENSHELL_SANDBOX_NAMESPACE", default_value = "default")] sandbox_namespace: String, + #[arg( + long, + env = "OPENSHELL_K8S_SANDBOX_SERVICE_ACCOUNT", + default_value = DEFAULT_SANDBOX_SERVICE_ACCOUNT_NAME + )] + sandbox_service_account: String, + #[arg(long, env = "OPENSHELL_SANDBOX_IMAGE")] sandbox_image: Option, @@ -68,6 +75,13 @@ struct Args { #[arg(long, env = "OPENSHELL_ENABLE_USER_NAMESPACES")] enable_user_namespaces: bool, + + /// Lifetime (seconds) of the projected `ServiceAccount` token + /// kubelet writes into each sandbox pod for the `IssueSandboxToken` + /// bootstrap exchange. Kubelet enforces a minimum of 600s; the + /// gateway clamps values outside `[600, 86400]`. Default 3600. 
+ #[arg(long, env = "OPENSHELL_K8S_SA_TOKEN_TTL_SECS", default_value_t = 3600)] + sa_token_ttl_secs: i64, } #[tokio::main] @@ -81,6 +95,7 @@ async fn main() -> Result<()> { let driver = KubernetesComputeDriver::new(KubernetesComputeConfig { namespace: args.sandbox_namespace, + service_account_name: args.sandbox_service_account, default_image: args.sandbox_image.unwrap_or_default(), image_pull_policy: args.sandbox_image_pull_policy.unwrap_or_default(), supervisor_image: args @@ -99,6 +114,7 @@ async fn main() -> Result<()> { .unwrap_or_else(|_| { openshell_driver_kubernetes::DEFAULT_WORKSPACE_STORAGE_SIZE.to_string() }), + sa_token_ttl_secs: args.sa_token_ttl_secs, }) .await .into_diagnostic()?; diff --git a/crates/openshell-driver-podman/src/container.rs b/crates/openshell-driver-podman/src/container.rs index c3f2c3282..e79ff2769 100644 --- a/crates/openshell-driver-podman/src/container.rs +++ b/crates/openshell-driver-podman/src/container.rs @@ -299,6 +299,17 @@ fn build_env( ); } + // 4. Gateway-minted sandbox JWT (PR 3 of the per-sandbox identity + // series). Passed via env var; the supervisor reads it directly. + if let Some(s) = spec + && !s.sandbox_token.is_empty() + { + env.insert( + openshell_core::sandbox_env::SANDBOX_TOKEN.into(), + s.sandbox_token.clone(), + ); + } + env } diff --git a/crates/openshell-sandbox/src/debug_rpc.rs b/crates/openshell-sandbox/src/debug_rpc.rs new file mode 100644 index 000000000..b8cc43e2d --- /dev/null +++ b/crates/openshell-sandbox/src/debug_rpc.rs @@ -0,0 +1,236 @@ +// SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +//! One-shot debug RPCs exposed via `openshell-sandbox debug-rpc`. +//! +//! Designed for end-to-end verification of the per-sandbox identity +//! flow (issue #1354). A `docker exec` (or `kubectl exec`) into a +//! running sandbox can issue raw sandbox-class gRPC calls without +//! 
standing up a custom binary inside the sandbox image — useful for +//! confirming the cross-sandbox IDOR guard and renewal semantics. +//! +//! Subcommands: +//! - `get-sandbox-config --sandbox-id ` — call `GetSandboxConfig` +//! - `refresh` — call `RefreshSandboxToken` +//! - `show-token` — print the raw gateway JWT bytes +//! - `show-principal` — pretty-print the decoded JWT claims +//! (no signature verification — the supervisor already trusts the +//! token's origin) + +use base64::Engine as _; +use miette::{IntoDiagnostic, Result, WrapErr}; +use openshell_core::proto::{ + GetSandboxConfigRequest, RefreshSandboxTokenRequest, open_shell_client::OpenShellClient, +}; + +use crate::grpc_client::{AuthedChannel, connect_channel_pub}; + +/// Entry point for the `debug-rpc` subcommand. Returns the process exit +/// code; `main` propagates it. +pub async fn run(args: &[String]) -> Result { + let cmd = args + .first() + .map(String::as_str) + .ok_or_else(|| miette::miette!("{}", USAGE))?; + + match cmd { + "get-sandbox-config" => run_get_sandbox_config(&args[1..]).await, + "refresh" => run_refresh().await, + "show-token" => run_show_token(), + "show-principal" => run_show_principal(), + "--help" | "-h" => { + println!("{USAGE}"); + Ok(0) + } + other => Err(miette::miette!( + "unknown debug-rpc command '{other}'\n\n{USAGE}" + )), + } +} + +const USAGE: &str = "\ +usage: openshell-sandbox debug-rpc [options] + +commands: + get-sandbox-config --sandbox-id call GetSandboxConfig + refresh renew the gateway JWT + show-token print raw gateway JWT + show-principal print decoded JWT claims + +requires: OPENSHELL_ENDPOINT in env, plus one of OPENSHELL_SANDBOX_TOKEN, +OPENSHELL_SANDBOX_TOKEN_FILE, or OPENSHELL_K8S_SA_TOKEN_FILE so the +supervisor's normal token-acquisition path can resolve a JWT."; + +async fn open_client() -> Result> { + let endpoint = std::env::var(openshell_core::sandbox_env::ENDPOINT) + .into_diagnostic() + .wrap_err("OPENSHELL_ENDPOINT must be set")?; + let 
channel = connect_channel_pub(&endpoint).await?; + Ok(OpenShellClient::new(channel)) +} + +async fn run_get_sandbox_config(args: &[String]) -> Result { + let sandbox_id = parse_flag(args, "--sandbox-id") + .ok_or_else(|| miette::miette!("get-sandbox-config: --sandbox-id is required"))?; + let mut client = open_client().await?; + let resp = client + .get_sandbox_config(GetSandboxConfigRequest { + sandbox_id: sandbox_id.to_string(), + }) + .await; + match resp { + Ok(r) => { + let inner = r.into_inner(); + println!( + "version={} policy_hash={} config_revision={}", + inner.version, inner.policy_hash, inner.config_revision + ); + Ok(0) + } + Err(status) => { + eprintln!("{}: {}", code_name(status.code()), status.message()); + // Map gRPC status to a non-zero exit so callers can branch + // (e.g. expect-permission-denied in a shell test). + Ok(match status.code() { + tonic::Code::PermissionDenied => 7, + tonic::Code::Unauthenticated => 16, + tonic::Code::NotFound => 5, + _ => 1, + }) + } + } +} + +async fn run_refresh() -> Result { + let mut client = open_client().await?; + let resp = client + .refresh_sandbox_token(RefreshSandboxTokenRequest {}) + .await; + match resp { + Ok(r) => { + let inner = r.into_inner(); + println!( + "token={}\nexpires_at_ms={}", + inner.token, inner.expires_at_ms + ); + Ok(0) + } + Err(status) => { + eprintln!("{}: {}", code_name(status.code()), status.message()); + Ok(1) + } + } +} + +fn run_show_token() -> Result { + let token = read_local_token()?; + println!("{token}"); + Ok(0) +} + +fn run_show_principal() -> Result { + let token = read_local_token()?; + let payload_b64 = token + .split('.') + .nth(1) + .ok_or_else(|| miette::miette!("token has no payload segment"))?; + let payload = base64::engine::general_purpose::URL_SAFE_NO_PAD + .decode(payload_b64) + .into_diagnostic() + .wrap_err("failed to base64-decode token payload")?; + let claims: serde_json::Value = serde_json::from_slice(&payload) + .into_diagnostic() + .wrap_err("failed 
to parse token payload as JSON")?; + println!( + "{}", + serde_json::to_string_pretty(&claims).into_diagnostic()? + ); + Ok(0) +} + +/// Read the token from the env/file/SA-bootstrap chain, but only the +/// "already a gateway JWT" paths — show-token / show-principal don't +/// want to actually exchange an SA token. +fn read_local_token() -> Result { + if let Ok(t) = std::env::var(openshell_core::sandbox_env::SANDBOX_TOKEN) + && !t.is_empty() + { + return Ok(t); + } + if let Ok(path) = std::env::var(openshell_core::sandbox_env::SANDBOX_TOKEN_FILE) + && !path.is_empty() + { + return Ok(std::fs::read_to_string(&path) + .into_diagnostic() + .wrap_err_with(|| format!("failed to read sandbox token from {path}"))? + .trim() + .to_string()); + } + Err(miette::miette!( + "no in-process gateway JWT available — set OPENSHELL_SANDBOX_TOKEN or \ + OPENSHELL_SANDBOX_TOKEN_FILE. The K8s SA-bootstrap path is intentionally \ + excluded from `show-token` / `show-principal` to avoid issuing a fresh \ + token just for inspection." 
+ )) +} + +fn parse_flag<'a>(args: &'a [String], name: &str) -> Option<&'a str> { + let mut iter = args.iter(); + while let Some(a) = iter.next() { + if a == name { + return iter.next().map(String::as_str); + } + if let Some(rest) = a.strip_prefix(&format!("{name}=")) { + return Some(rest); + } + } + None +} + +fn code_name(c: tonic::Code) -> &'static str { + match c { + tonic::Code::Ok => "OK", + tonic::Code::Cancelled => "Cancelled", + tonic::Code::Unknown => "Unknown", + tonic::Code::InvalidArgument => "InvalidArgument", + tonic::Code::DeadlineExceeded => "DeadlineExceeded", + tonic::Code::NotFound => "NotFound", + tonic::Code::AlreadyExists => "AlreadyExists", + tonic::Code::PermissionDenied => "PermissionDenied", + tonic::Code::ResourceExhausted => "ResourceExhausted", + tonic::Code::FailedPrecondition => "FailedPrecondition", + tonic::Code::Aborted => "Aborted", + tonic::Code::OutOfRange => "OutOfRange", + tonic::Code::Unimplemented => "Unimplemented", + tonic::Code::Internal => "Internal", + tonic::Code::Unavailable => "Unavailable", + tonic::Code::DataLoss => "DataLoss", + tonic::Code::Unauthenticated => "Unauthenticated", + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn parse_flag_handles_space_separated() { + let args: Vec = ["--sandbox-id", "abc-123"] + .iter() + .map(ToString::to_string) + .collect(); + assert_eq!(parse_flag(&args, "--sandbox-id"), Some("abc-123")); + } + + #[test] + fn parse_flag_handles_equals_separated() { + let args: Vec = ["--sandbox-id=abc-123".to_string()].to_vec(); + assert_eq!(parse_flag(&args, "--sandbox-id"), Some("abc-123")); + } + + #[test] + fn parse_flag_returns_none_when_missing() { + let args: Vec = ["--other".to_string(), "x".to_string()].to_vec(); + assert!(parse_flag(&args, "--sandbox-id").is_none()); + } +} diff --git a/crates/openshell-sandbox/src/grpc_client.rs b/crates/openshell-sandbox/src/grpc_client.rs index 3fccb680f..148408d5a 100644 --- a/crates/openshell-sandbox/src/grpc_client.rs +++ 
b/crates/openshell-sandbox/src/grpc_client.rs @@ -3,22 +3,99 @@ //! gRPC client for fetching sandbox policy, provider environment, and inference //! route bundles from `OpenShell` server. +//! +//! Every request carries a gateway-minted JWT in the `Authorization` header. +//! The token is resolved at startup from one of three sources: +//! +//! 1. `OPENSHELL_SANDBOX_TOKEN` — raw JWT in the env (test harness path). +//! 2. `OPENSHELL_SANDBOX_TOKEN_FILE` — file containing the JWT (Docker / +//! Podman / VM drivers write this to a bundle file at sandbox-create +//! time). +//! 3. `OPENSHELL_K8S_SA_TOKEN_FILE` — projected `ServiceAccount` JWT; the +//! supervisor exchanges it for a gateway JWT via `IssueSandboxToken` +//! once at startup. +//! +//! The resolved gateway JWT is held in process memory thereafter and +//! injected on every outbound call by [`AuthInterceptor`]. use std::collections::HashMap; -use std::time::Duration; +use std::sync::{Arc, OnceLock, RwLock}; +use std::time::{Duration, SystemTime, UNIX_EPOCH}; use miette::{IntoDiagnostic, Result, WrapErr}; use openshell_core::proto::{ DenialSummary, GetDraftPolicyRequest, GetInferenceBundleRequest, GetInferenceBundleResponse, - GetSandboxConfigRequest, GetSandboxProviderEnvironmentRequest, PolicyChunk, PolicySource, - PolicyStatus, ReportPolicyStatusRequest, SandboxPolicy as ProtoSandboxPolicy, - SubmitPolicyAnalysisRequest, SubmitPolicyAnalysisResponse, UpdateConfigRequest, - inference_client::InferenceClient, open_shell_client::OpenShellClient, + GetSandboxConfigRequest, GetSandboxProviderEnvironmentRequest, IssueSandboxTokenRequest, + PolicyChunk, PolicySource, PolicyStatus, RefreshSandboxTokenRequest, ReportPolicyStatusRequest, + SandboxPolicy as ProtoSandboxPolicy, SubmitPolicyAnalysisRequest, SubmitPolicyAnalysisResponse, + UpdateConfigRequest, inference_client::InferenceClient, open_shell_client::OpenShellClient, }; +use openshell_core::sandbox_env; +use tonic::Status; +use 
tonic::metadata::AsciiMetadataValue; +use tonic::service::interceptor::InterceptedService; use tonic::transport::{Certificate, Channel, ClientTlsConfig, Endpoint, Identity}; -use tracing::debug; +use tracing::{debug, info, warn}; -/// Create a channel to the `OpenShell` server. +/// Channel type after the [`AuthInterceptor`] is applied. Aliased so the +/// generated client type signatures stay readable. +pub type AuthedChannel = InterceptedService; + +/// Shared, refreshable Bearer header. All [`AuthInterceptor`] clones read +/// the same slot, so the renewal task can replace the token in place without +/// rebuilding the channel. +type TokenSlot = Arc>; + +/// Process-wide token slot. Initialized by the first [`connect_channel`] +/// call and shared with every subsequent client and the renewal loop. +static TOKEN_SLOT: OnceLock = OnceLock::new(); + +/// One-shot guard so the renewal loop spawns at most once per process. +static REFRESH_SPAWNED: OnceLock<()> = OnceLock::new(); + +fn install_token_slot(token: &str) -> Result { + let bearer = AsciiMetadataValue::try_from(format!("Bearer {token}")) + .into_diagnostic() + .wrap_err("sandbox JWT contained characters not valid for a header value")?; + if let Some(existing) = TOKEN_SLOT.get() { + *existing.write().expect("token slot poisoned") = bearer; + return Ok(existing.clone()); + } + let slot: TokenSlot = Arc::new(RwLock::new(bearer)); + let _ = TOKEN_SLOT.set(slot.clone()); + Ok(TOKEN_SLOT.get().cloned().unwrap_or(slot)) +} + +/// gRPC interceptor that injects `authorization: Bearer ` on every +/// outbound request. The token lives in a shared [`TokenSlot`] so the renewal +/// task can replace it without rebuilding clients. 
+#[derive(Clone)] +pub struct AuthInterceptor { + bearer: TokenSlot, +} + +impl AuthInterceptor { + fn new(bearer: TokenSlot) -> Self { + Self { bearer } + } +} + +impl tonic::service::Interceptor for AuthInterceptor { + fn call( + &mut self, + mut req: tonic::Request<()>, + ) -> std::result::Result, Status> { + let bearer = self + .bearer + .read() + .expect("auth interceptor token slot poisoned") + .clone(); + req.metadata_mut().insert("authorization", bearer); + Ok(req) + } +} + +/// Build the plain (un-intercepted) gRPC channel. /// /// When the endpoint uses `https://`, mTLS is configured using these env vars: /// - `OPENSHELL_TLS_CA` -- path to the CA certificate @@ -27,7 +104,7 @@ use tracing::debug; /// /// When the endpoint uses `http://`, a plaintext connection is used (for /// deployments where TLS is disabled, e.g. behind a Cloudflare Tunnel). -async fn connect_channel(endpoint: &str) -> Result { +async fn build_plain_channel(endpoint: &str) -> Result { let mut ep = Endpoint::from_shared(endpoint.to_string()) .into_diagnostic() .wrap_err("invalid gRPC endpoint")? 
@@ -43,13 +120,13 @@ async fn connect_channel(endpoint: &str) -> Result { let tls_enabled = endpoint.starts_with("https://"); if tls_enabled { - let ca_path = std::env::var(openshell_core::sandbox_env::TLS_CA) + let ca_path = std::env::var(sandbox_env::TLS_CA) .into_diagnostic() .wrap_err("OPENSHELL_TLS_CA is required")?; - let cert_path = std::env::var(openshell_core::sandbox_env::TLS_CERT) + let cert_path = std::env::var(sandbox_env::TLS_CERT) .into_diagnostic() .wrap_err("OPENSHELL_TLS_CERT is required")?; - let key_path = std::env::var(openshell_core::sandbox_env::TLS_KEY) + let key_path = std::env::var(sandbox_env::TLS_KEY) .into_diagnostic() .wrap_err("OPENSHELL_TLS_KEY is required")?; @@ -79,24 +156,246 @@ async fn connect_channel(endpoint: &str) -> Result { .wrap_err("failed to connect to OpenShell server") } -/// Create a channel to the `OpenShell` server (public for use by `supervisor_session`). -pub async fn connect_channel_pub(endpoint: &str) -> Result { +/// Build a Bearer-authenticated channel to the gateway. +/// +/// First call per process resolves the sandbox JWT via the three-step +/// lookup (env → file → K8s SA bootstrap exchange) and installs it into +/// the process-wide [`TOKEN_SLOT`]. Subsequent calls reuse the cached +/// slot — the renewal loop keeps the value fresh, so re-running the +/// bootstrap is both unnecessary and (on the K8s SA path) expensive +/// (one apiserver round-trip per call). The renewal loop itself is +/// spawned once per process via [`REFRESH_SPAWNED`]. +async fn connect_channel(endpoint: &str) -> Result { + let channel = build_plain_channel(endpoint).await?; + let slot = if let Some(existing) = TOKEN_SLOT.get() { + existing.clone() + } else { + let token = acquire_sandbox_token(endpoint, &channel).await?; + install_token_slot(&token)? 
+ }; + let intercepted = InterceptedService::new(channel, AuthInterceptor::new(slot.clone())); + if REFRESH_SPAWNED.set(()).is_ok() { + let refresh_channel = intercepted.clone(); + tokio::spawn(async move { + refresh_token_loop(refresh_channel, slot).await; + }); + } + Ok(intercepted) +} + +/// Resolve the sandbox JWT used to authenticate every outbound RPC. +/// +/// `endpoint` is logged on errors but never used for transport here; the +/// actual network call lives inside this function only on the K8s +/// bootstrap path, which uses `plain_channel` to call `IssueSandboxToken` +/// once before the steady-state Bearer-authenticated channel is built. +async fn acquire_sandbox_token(endpoint: &str, plain_channel: &Channel) -> Result { + if let Ok(t) = std::env::var(sandbox_env::SANDBOX_TOKEN) + && !t.is_empty() + { + debug!(source = "env", "loaded sandbox token"); + return Ok(t); + } + + if let Ok(path) = std::env::var(sandbox_env::SANDBOX_TOKEN_FILE) + && !path.is_empty() + { + let contents = std::fs::read_to_string(&path) + .into_diagnostic() + .wrap_err_with(|| format!("failed to read sandbox token from {path}"))?; + debug!(source = "file", path = %path, "loaded sandbox token"); + return Ok(contents.trim().to_string()); + } + + if let Ok(sa_path) = std::env::var(sandbox_env::K8S_SA_TOKEN_FILE) + && !sa_path.is_empty() + { + let sa_token = std::fs::read_to_string(&sa_path) + .into_diagnostic() + .wrap_err_with(|| format!("failed to read K8s SA token from {sa_path}"))? + .trim() + .to_string(); + info!(endpoint = %endpoint, "exchanging K8s ServiceAccount token for sandbox JWT"); + // The bootstrap exchange uses a one-off interceptor pinned to the + // SA token; the resulting gateway JWT becomes the value in the + // shared `TOKEN_SLOT` once `connect_channel` returns. 
+ let bootstrap_slot: TokenSlot = Arc::new(RwLock::new( + AsciiMetadataValue::try_from(format!("Bearer {sa_token}")) + .into_diagnostic() + .wrap_err("SA token contained characters not valid for a header value")?, + )); + let interceptor = AuthInterceptor::new(bootstrap_slot); + let bootstrap = InterceptedService::new(plain_channel.clone(), interceptor); + let mut client = OpenShellClient::new(bootstrap); + let resp = client + .issue_sandbox_token(IssueSandboxTokenRequest {}) + .await + .into_diagnostic() + .wrap_err("IssueSandboxToken bootstrap exchange failed")?; + return Ok(resp.into_inner().token); + } + + Err(miette::miette!( + "no sandbox token source available — set one of {}, {}, or {}", + sandbox_env::SANDBOX_TOKEN, + sandbox_env::SANDBOX_TOKEN_FILE, + sandbox_env::K8S_SA_TOKEN_FILE, + )) +} + +/// Build an authenticated channel for direct external use (e.g. the +/// long-lived `supervisor_session` control stream). +pub async fn connect_channel_pub(endpoint: &str) -> Result { connect_channel(endpoint).await } +/// Background task that renews the sandbox JWT at ~80% of its remaining +/// lifetime. The new token replaces the value in [`TOKEN_SLOT`], so all +/// in-flight and future clients pick it up on their next request. The +/// loop never panics: every failure is logged and re-attempted after a +/// bounded backoff. 
+async fn refresh_token_loop(channel: AuthedChannel, slot: TokenSlot) { + let mut client = OpenShellClient::new(channel); + loop { + let sleep = compute_refresh_delay(&slot); + tokio::time::sleep(sleep).await; + match client + .refresh_sandbox_token(RefreshSandboxTokenRequest {}) + .await + { + Ok(resp) => { + let new_token = resp.into_inner().token; + match AsciiMetadataValue::try_from(format!("Bearer {new_token}")) { + Ok(value) => { + if let Ok(mut guard) = slot.write() { + *guard = value; + info!("renewed gateway sandbox JWT in-place"); + } + } + Err(e) => warn!(error = %e, "refreshed JWT contained invalid header bytes"), + } + } + Err(status) => { + warn!(error = %status, "RefreshSandboxToken failed; will retry"); + // Backoff so we don't spin against a sustained failure. + tokio::time::sleep(Duration::from_secs(60)).await; + } + } + } +} + +/// Compute the next refresh delay: 80 % of the time remaining until the +/// current token's `exp`, plus up to 10 % jitter, floored at 60 s and +/// capped at 12 h. If the token can't be parsed (legacy/non-JWT bearer) +/// default to 6 h. +fn compute_refresh_delay(slot: &TokenSlot) -> Duration { + let token = slot + .read() + .ok() + .and_then(|v| v.to_str().ok().map(str::to_string)) + .unwrap_or_default(); + let bearer = token.strip_prefix("Bearer ").unwrap_or(&token); + let now_ms = i64::try_from( + SystemTime::now() + .duration_since(UNIX_EPOCH) + .map_or(0, |d| d.as_millis()), + ) + .unwrap_or(i64::MAX); + let remaining_ms = parse_jwt_exp_ms(bearer).map_or(21_600_000, |exp| exp - now_ms); // 6 h fallback + let mut delay_ms = (remaining_ms.max(0) * 8 / 10).clamp(60_000, 43_200_000); + // Up to 10 % jitter, derived deterministically from token bytes so + // unit tests are reproducible without injecting an RNG. 
+ let jitter_pct = (token.len() % 10) as u64; + let jitter_ms = (u64::try_from(delay_ms).unwrap_or(0) * jitter_pct) / 100; + delay_ms = delay_ms.saturating_add(i64::try_from(jitter_ms).unwrap_or(0)); + Duration::from_millis(u64::try_from(delay_ms).unwrap_or(0)) +} + +/// Decode the `exp` claim from a JWT without verifying its signature. +/// Returns the expiry in milliseconds since the Unix epoch, or `None` if +/// the token is not a parseable JWT. +fn parse_jwt_exp_ms(jwt: &str) -> Option { + use base64::Engine; + let mut parts = jwt.splitn(3, '.'); + let _header = parts.next()?; + let payload_b64 = parts.next()?; + let decoded = base64::engine::general_purpose::URL_SAFE_NO_PAD + .decode(payload_b64) + .ok()?; + let value: serde_json::Value = serde_json::from_slice(&decoded).ok()?; + let exp_secs = value.get("exp")?.as_i64()?; + exp_secs.checked_mul(1000) +} + +#[cfg(test)] +mod auth_tests { + use super::*; + + #[test] + fn parse_jwt_exp_reads_unsigned_payload() { + use base64::Engine as _; + let payload = base64::engine::general_purpose::URL_SAFE_NO_PAD + .encode(br#"{"exp":1234567890,"sandbox_id":"sb-1"}"#); + let token = format!("h.{payload}.sig"); + assert_eq!(parse_jwt_exp_ms(&token), Some(1_234_567_890_000)); + } + + #[test] + fn parse_jwt_exp_returns_none_for_malformed_token() { + assert!(parse_jwt_exp_ms("not-a-jwt").is_none()); + assert!(parse_jwt_exp_ms("only.two").is_none()); + assert!(parse_jwt_exp_ms("a.!!!.c").is_none()); + } + + #[test] + fn compute_refresh_delay_uses_80_percent_when_token_present() { + // Build a JWT whose exp is 1000 seconds in the future. With 0-jitter + // the delay should be roughly 800 seconds. 
+ use base64::Engine as _; + let now_s = SystemTime::now() + .duration_since(UNIX_EPOCH) + .unwrap() + .as_secs(); + let exp = now_s + 1000; + let payload_json = format!(r#"{{"exp":{exp}}}"#); + let payload = base64::engine::general_purpose::URL_SAFE_NO_PAD.encode(payload_json); + let token = format!("h.{payload}.s"); + let bearer = AsciiMetadataValue::try_from(format!("Bearer {token}")).unwrap(); + let slot: TokenSlot = Arc::new(RwLock::new(bearer)); + let delay = compute_refresh_delay(&slot); + // 800 s baseline + up to 10 % jitter → 800..=880 s, with some slack + // for the 1-second resolution of the exp claim. + let secs = delay.as_secs(); + assert!( + (700..=900).contains(&secs), + "expected 80%-of-1000s delay, got {secs}s" + ); + } + + #[test] + fn compute_refresh_delay_floors_at_60_seconds() { + // Already-expired token still produces a 60 s floor so the loop + // doesn't busy-spin. + use base64::Engine as _; + let exp = 1; // past + let payload = + base64::engine::general_purpose::URL_SAFE_NO_PAD.encode(format!(r#"{{"exp":{exp}}}"#)); + let token = format!("h.{payload}.s"); + let bearer = AsciiMetadataValue::try_from(format!("Bearer {token}")).unwrap(); + let slot: TokenSlot = Arc::new(RwLock::new(bearer)); + let delay = compute_refresh_delay(&slot); + assert!(delay.as_secs() >= 60); + } +} + /// Connect to the `OpenShell` server. -/// -/// Sandboxes authenticate to the gateway via the mTLS client certificate -/// configured by `connect_channel`. They do not present an OIDC Bearer -/// token; the gateway recognises sandbox-class callers by absence of a -/// Bearer header on the request. -async fn connect(endpoint: &str) -> Result> { +async fn connect(endpoint: &str) -> Result> { let channel = connect_channel(endpoint).await?; Ok(OpenShellClient::new(channel)) } /// Connect to the inference service. 
-async fn connect_inference(endpoint: &str) -> Result> { +async fn connect_inference(endpoint: &str) -> Result> { let channel = connect_channel(endpoint).await?; Ok(InferenceClient::new(channel)) } @@ -118,7 +417,7 @@ pub async fn fetch_policy(endpoint: &str, sandbox_id: &str) -> Result, + client: &mut OpenShellClient, sandbox_id: &str, ) -> Result> { let response = client @@ -142,7 +441,7 @@ async fn fetch_policy_with_client( /// Sync a locally-discovered policy using an existing client connection. async fn sync_policy_with_client( - client: &mut OpenShellClient, + client: &mut OpenShellClient, sandbox: &str, policy: &ProtoSandboxPolicy, ) -> Result<()> { @@ -238,7 +537,7 @@ pub async fn fetch_provider_environment( /// and status reporting, avoiding per-request TLS handshake overhead. #[derive(Clone)] pub struct CachedOpenShellClient { - client: OpenShellClient, + client: OpenShellClient, } /// Settings poll result returned by [`CachedOpenShellClient::poll_settings`]. @@ -269,7 +568,7 @@ impl CachedOpenShellClient { } /// Get a clone of the underlying tonic client for direct RPC calls. - pub fn raw_client(&self) -> OpenShellClient { + pub fn raw_client(&self) -> OpenShellClient { self.client.clone() } diff --git a/crates/openshell-sandbox/src/lib.rs b/crates/openshell-sandbox/src/lib.rs index ded56ce9e..b83125f12 100644 --- a/crates/openshell-sandbox/src/lib.rs +++ b/crates/openshell-sandbox/src/lib.rs @@ -7,6 +7,7 @@ pub mod bypass_monitor; mod child_env; +pub mod debug_rpc; pub mod denial_aggregator; mod grpc_client; mod identity; diff --git a/crates/openshell-sandbox/src/main.rs b/crates/openshell-sandbox/src/main.rs index 4a6cb1955..3c9e21578 100644 --- a/crates/openshell-sandbox/src/main.rs +++ b/crates/openshell-sandbox/src/main.rs @@ -24,6 +24,15 @@ use openshell_sandbox::run_sandbox; /// performs the copy in pure Rust. const COPY_SELF_SUBCOMMAND: &str = "copy-self"; +/// Subcommand for one-shot debug RPCs from inside a sandbox container. 
+/// +/// Reads the same token sources as the supervisor (env, file, K8s SA +/// bootstrap) and issues a single gRPC call against the gateway. Useful +/// for end-to-end verification: e.g. `docker exec` into a sandbox, then +/// run `openshell-sandbox debug-rpc get-sandbox-config --sandbox-id ` +/// to confirm the cross-sandbox IDOR guard fires. +const DEBUG_RPC_SUBCOMMAND: &str = "debug-rpc"; + /// `OpenShell` Sandbox - process isolation and monitoring. #[derive(Parser, Debug)] #[command(name = "openshell-sandbox")] @@ -150,6 +159,20 @@ fn main() -> Result<()> { return copy_self(dest); } + // Handle `debug-rpc [args]` before clap. Uses a small + // dedicated runtime so we don't pay the supervisor's full startup cost. + if raw_args.get(1).map(String::as_str) == Some(DEBUG_RPC_SUBCOMMAND) { + let runtime = tokio::runtime::Builder::new_current_thread() + .enable_all() + .build() + .into_diagnostic()?; + return runtime.block_on(async move { + let _ = rustls::crypto::ring::default_provider().install_default(); + let exit = openshell_sandbox::debug_rpc::run(&raw_args[2..]).await?; + std::process::exit(exit); + }); + } + let args = Args::parse(); // Try to open a rolling log file; fall back to stderr-only logging if it fails diff --git a/crates/openshell-sandbox/src/process.rs b/crates/openshell-sandbox/src/process.rs index 3d2f6d576..8c6eb77f3 100644 --- a/crates/openshell-sandbox/src/process.rs +++ b/crates/openshell-sandbox/src/process.rs @@ -155,6 +155,15 @@ impl ProcessHandle { .kill_on_drop(true) .env(openshell_core::sandbox_env::SANDBOX, "1"); + // Strip supervisor-only credentials from the entrypoint's inherited + // environment. The entrypoint drops to the sandbox user before + // `exec`; without this strip, anything running as the sandbox user + // (e.g. an SSH-spawned shell) could read /proc//environ + // and recover the gateway-minted JWT. Issue #1354. 
+ cmd.env_remove(openshell_core::sandbox_env::SANDBOX_TOKEN) + .env_remove(openshell_core::sandbox_env::SANDBOX_TOKEN_FILE) + .env_remove(openshell_core::sandbox_env::K8S_SA_TOKEN_FILE); + inject_provider_env(&mut cmd, provider_env); if let Some(dir) = workdir { @@ -281,6 +290,15 @@ impl ProcessHandle { .kill_on_drop(true) .env(openshell_core::sandbox_env::SANDBOX, "1"); + // Strip supervisor-only credentials from the entrypoint's inherited + // environment. The entrypoint drops to the sandbox user before + // `exec`; without this strip, anything running as the sandbox user + // (e.g. an SSH-spawned shell) could read /proc/<pid>/environ + // and recover the gateway-minted JWT. Issue #1354. + cmd.env_remove(openshell_core::sandbox_env::SANDBOX_TOKEN) + .env_remove(openshell_core::sandbox_env::SANDBOX_TOKEN_FILE) + .env_remove(openshell_core::sandbox_env::K8S_SA_TOKEN_FILE); + inject_provider_env(&mut cmd, provider_env); if let Some(dir) = workdir { diff --git a/crates/openshell-sandbox/src/supervisor_session.rs b/crates/openshell-sandbox/src/supervisor_session.rs index 6485dddf0..4d7392ee3 100644 --- a/crates/openshell-sandbox/src/supervisor_session.rs +++ b/crates/openshell-sandbox/src/supervisor_session.rs @@ -28,7 +28,6 @@ use openshell_ocsf::{ use tokio::io::{AsyncRead, AsyncReadExt, AsyncWrite, AsyncWriteExt}; use tokio::sync::mpsc; use tokio_stream::StreamExt; -use tonic::transport::Channel; use tracing::{debug, warn}; use crate::grpc_client; @@ -371,7 +370,7 @@ fn handle_gateway_message( sandbox_id: &str, ssh_socket_path: &std::path::Path, netns_fd: Option, - channel: &Channel, + channel: &grpc_client::AuthedChannel, tx: &mpsc::Sender, ) { match &msg.payload { @@ -436,7 +435,7 @@ async fn handle_relay_open( relay_open: RelayOpen, ssh_socket_path: &std::path::Path, netns_fd: Option, - channel: Channel, + channel: grpc_client::AuthedChannel, tx: mpsc::Sender, ) -> Result<(), Box> { let channel_id = relay_open.channel_id.clone(); diff --git
a/crates/openshell-server/Cargo.toml b/crates/openshell-server/Cargo.toml index 4bbfe24fc..fa19ab526 100644 --- a/crates/openshell-server/Cargo.toml +++ b/crates/openshell-server/Cargo.toml @@ -82,6 +82,8 @@ uuid = { workspace = true } hmac = "0.12" sha2 = { workspace = true } jsonwebtoken = { workspace = true } +async-trait = "0.1" +url = { workspace = true } hex = "0.4" russh = "0.57" rand = { workspace = true } diff --git a/crates/openshell-server/src/auth/authenticator.rs b/crates/openshell-server/src/auth/authenticator.rs new file mode 100644 index 000000000..066b55a13 --- /dev/null +++ b/crates/openshell-server/src/auth/authenticator.rs @@ -0,0 +1,277 @@ +// SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +//! Pluggable authentication trait + chain dispatch. +//! +//! The gateway runs every authenticated request through an +//! [`AuthenticatorChain`] of [`Authenticator`] implementations. The chain +//! evaluates authenticators in order; the first one that recognizes the +//! caller produces the [`Principal`]. An authenticator that does not apply +//! (e.g. an OIDC authenticator seeing no Bearer header) returns `Ok(None)` +//! so the chain falls through to the next. An authenticator that *does* +//! apply but rejects the caller returns `Err(Status)`, which terminates +//! the chain — fail-closed. +//! +//! Live authenticators slotting into the chain: +//! - [`super::sandbox_jwt::SandboxJwtAuthenticator`] — gateway-minted JWTs +//! - [`super::k8s_sa::K8sServiceAccountAuthenticator`] — K8s projected SA +//! tokens (path-scoped to `IssueSandboxToken`) +//! - [`super::oidc::OidcAuthenticator`] — user OIDC Bearer tokens +//! - [`PermissiveUserAuthenticator`] — final-fallback dev-mode catch-all +//! that produces a synthetic user principal when no OIDC is +//! configured. Preserves the "no OIDC = open" dev posture for +//! singleplayer / helm-dev deployments. 
+ +use super::identity::{Identity, IdentityProvider}; +use super::principal::{Principal, UserPrincipal}; +use async_trait::async_trait; +use std::sync::Arc; +use tonic::Status; + +/// Pluggable authentication step. +/// +/// Implementations are expected to be cheap to clone (they live behind +/// `Arc` inside an [`AuthenticatorChain`]). +#[async_trait] +pub trait Authenticator: Send + Sync + 'static { + /// Inspect an inbound request and return the authenticated principal. + /// + /// - `Ok(Some(principal))` — this authenticator recognized the caller. + /// The chain stops and the principal is inserted into request + /// extensions. + /// - `Ok(None)` — this authenticator does not apply (e.g. no Bearer + /// token for an OIDC authenticator). The chain falls through to + /// the next authenticator. + /// - `Err(status)` — this authenticator applies but rejected the + /// caller. The chain terminates and the status is returned to the + /// client. Fail-closed. + async fn authenticate( + &self, + headers: &http::HeaderMap, + path: &str, + ) -> Result, Status>; +} + +/// First-match-wins authenticator chain. +/// +/// The chain owns its authenticators behind `Arc` so the entire chain is +/// cheap to clone — required because `tower::Service::call` clones the +/// router on every request. +#[derive(Clone)] +pub struct AuthenticatorChain { + authenticators: Arc<[Arc]>, +} + +impl AuthenticatorChain { + /// Build a chain from an ordered list of authenticators. Earlier + /// entries are evaluated first. + pub fn new(authenticators: Vec>) -> Self { + Self { + authenticators: Arc::from(authenticators), + } + } + + /// Run the chain. Returns the first principal produced. If every + /// authenticator returns `Ok(None)`, the result is `Ok(None)` — the + /// router translates that to `unauthenticated`. 
+ pub async fn authenticate( + &self, + headers: &http::HeaderMap, + path: &str, + ) -> Result, Status> { + for authenticator in self.authenticators.iter() { + if let Some(principal) = authenticator.authenticate(headers, path).await? { + return Ok(Some(principal)); + } + } + Ok(None) + } +} + +impl std::fmt::Debug for AuthenticatorChain { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + f.debug_struct("AuthenticatorChain") + .field("len", &self.authenticators.len()) + .finish() + } +} + +/// Final-fallback authenticator that produces a synthetic user principal +/// for any request the earlier authenticators didn't claim. Used only +/// when no user-side authentication is configured (no OIDC, no fronting +/// proxy contract). This preserves the dev-mode open posture in a +/// principal-aware way so handlers always see *some* principal in +/// extensions. +/// +/// Producing a User principal (rather than Anonymous) means dev-mode +/// requests pass the per-handler IDOR guard via the User-bypass +/// branch — equivalent to "RBAC was the user's gate" with the dev +/// default of "every caller is a user." +pub struct PermissiveUserAuthenticator { + subject: String, +} + +impl PermissiveUserAuthenticator { + pub fn new(subject: impl Into) -> Self { + Self { + subject: subject.into(), + } + } +} + +#[async_trait] +impl Authenticator for PermissiveUserAuthenticator { + async fn authenticate( + &self, + _headers: &http::HeaderMap, + _path: &str, + ) -> Result, Status> { + Ok(Some(Principal::User(UserPrincipal { + identity: Identity { + subject: self.subject.clone(), + display_name: None, + roles: vec![], + scopes: vec![], + provider: IdentityProvider::Internal, + }, + }))) + } +} + +#[cfg(test)] +pub mod test_support { + use super::*; + use std::sync::Mutex; + + /// Authenticator that always returns the configured outcome. Used by + /// tests to inject a known principal (or rejection) without running real + /// crypto. 
Each call records the path it was invoked with so tests can + /// assert chain ordering. + pub struct MockAuthenticator { + pub outcome: Result, Status>, + pub calls: Mutex>, + } + + impl MockAuthenticator { + pub fn returning(outcome: Result, Status>) -> Self { + Self { + outcome, + calls: Mutex::new(Vec::new()), + } + } + + pub fn call_count(&self) -> usize { + self.calls.lock().unwrap().len() + } + } + + #[async_trait] + impl Authenticator for MockAuthenticator { + async fn authenticate( + &self, + _headers: &http::HeaderMap, + path: &str, + ) -> Result, Status> { + self.calls.lock().unwrap().push(path.to_string()); + self.outcome.clone() + } + } +} + +#[cfg(test)] +mod tests { + use super::test_support::MockAuthenticator; + use super::*; + use crate::auth::identity::{Identity, IdentityProvider}; + use crate::auth::principal::UserPrincipal; + + fn user_principal(subject: &str) -> Principal { + Principal::User(UserPrincipal { + identity: Identity { + subject: subject.to_string(), + display_name: None, + roles: vec![], + scopes: vec![], + provider: IdentityProvider::Oidc, + }, + }) + } + + #[tokio::test] + async fn chain_returns_first_match() { + let first = Arc::new(MockAuthenticator::returning(Ok(Some(user_principal( + "alice", + ))))); + let second = Arc::new(MockAuthenticator::returning(Ok(Some(user_principal( + "bob", + ))))); + let chain = AuthenticatorChain::new(vec![first.clone(), second.clone()]); + let result = chain + .authenticate(&http::HeaderMap::new(), "/some/path") + .await + .unwrap() + .expect("expected a principal"); + match result { + Principal::User(u) => assert_eq!(u.identity.subject, "alice"), + _ => panic!("expected user principal"), + } + assert_eq!(first.call_count(), 1); + assert_eq!( + second.call_count(), + 0, + "second authenticator must be skipped after first matches" + ); + } + + #[tokio::test] + async fn chain_falls_through_on_none() { + let first = Arc::new(MockAuthenticator::returning(Ok(None))); + let second = 
Arc::new(MockAuthenticator::returning(Ok(Some(user_principal( + "bob", + ))))); + let chain = AuthenticatorChain::new(vec![first.clone(), second.clone()]); + let result = chain + .authenticate(&http::HeaderMap::new(), "/some/path") + .await + .unwrap() + .expect("expected a principal"); + match result { + Principal::User(u) => assert_eq!(u.identity.subject, "bob"), + _ => panic!("expected user principal"), + } + assert_eq!(first.call_count(), 1); + assert_eq!(second.call_count(), 1); + } + + #[tokio::test] + async fn chain_fails_closed_on_first_error() { + let first = Arc::new(MockAuthenticator::returning(Err(Status::unauthenticated( + "bad token", + )))); + let second = Arc::new(MockAuthenticator::returning(Ok(Some(user_principal( + "bob", + ))))); + let chain = AuthenticatorChain::new(vec![first.clone(), second.clone()]); + let err = chain + .authenticate(&http::HeaderMap::new(), "/some/path") + .await + .expect_err("must short-circuit on error"); + assert_eq!(err.code(), tonic::Code::Unauthenticated); + assert_eq!(first.call_count(), 1); + assert_eq!( + second.call_count(), + 0, + "must not consult later authenticators after an error" + ); + } + + #[tokio::test] + async fn empty_chain_returns_none() { + let chain = AuthenticatorChain::new(vec![]); + let result = chain + .authenticate(&http::HeaderMap::new(), "/some/path") + .await + .unwrap(); + assert!(result.is_none()); + } +} diff --git a/crates/openshell-server/src/auth/guard.rs b/crates/openshell-server/src/auth/guard.rs new file mode 100644 index 000000000..edcd6bc01 --- /dev/null +++ b/crates/openshell-server/src/auth/guard.rs @@ -0,0 +1,177 @@ +// SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +//! Per-handler sandbox-scope guards. +//! +//! Closes the IDOR half of issue #1354: a sandbox principal may only +//! reference its own sandbox, identified by its [`Principal::Sandbox`]'s +//! `sandbox_id`. 
User principals retain the broad scope the RBAC layer +//! already evaluated. + +use super::principal::Principal; +use super::principal::SandboxPrincipal; +use tonic::Status; +use tracing::info; + +/// Reject a sandbox-class request whose body references a sandbox other +/// than the one the calling principal was authenticated against. +/// +/// - [`Principal::User`] passes through (RBAC has already evaluated user +/// scope at the router level). +/// - [`Principal::Sandbox`] must reference the same canonical UUID it +/// was authenticated with. +/// - [`Principal::Anonymous`] is rejected — sandbox-class methods are +/// never anonymously callable. +/// +/// `claimed_sandbox_id` is the canonical UUID the request is operating +/// on. Name-keyed handlers must resolve the name to a UUID via the +/// store before calling this guard. +#[allow(clippy::result_large_err)] +pub fn ensure_sandbox_scope(principal: &Principal, claimed_sandbox_id: &str) -> Result<(), Status> { + match principal { + Principal::User(_) => Ok(()), + Principal::Sandbox(p) => { + if p.sandbox_id == claimed_sandbox_id { + Ok(()) + } else { + info!( + principal_sandbox_id = %p.sandbox_id, + requested_sandbox_id = %claimed_sandbox_id, + "cross-sandbox access denied" + ); + Err(Status::permission_denied( + "cross-sandbox access denied: principal does not own this sandbox", + )) + } + } + Principal::Anonymous => Err(Status::unauthenticated( + "sandbox-scoped methods require an authenticated caller", + )), + } +} + +/// Convenience: read the `Principal` out of a request and apply +/// [`ensure_sandbox_scope`]. Returns the principal so callers can read it +/// further (e.g. for audit logging). 
+#[allow(clippy::result_large_err)] +pub fn enforce_sandbox_scope( + request: &tonic::Request, + claimed_sandbox_id: &str, +) -> Result { + let principal = request + .extensions() + .get::() + .cloned() + .ok_or_else(|| Status::unauthenticated("missing principal"))?; + ensure_sandbox_scope(&principal, claimed_sandbox_id)?; + Ok(principal) +} + +/// Require a sandbox principal and reject users or anonymous callers. +/// +/// Supervisor-only control/data plane RPCs (`ConnectSupervisor`, +/// `RelayStream`) must be presented by the sandbox supervisor itself. +/// User principals intentionally pass [`ensure_sandbox_scope`] for normal +/// CLI/TUI APIs because RBAC is their gate, but they are not valid +/// supervisor identities. +#[allow(clippy::result_large_err)] +pub fn ensure_sandbox_principal_scope( + principal: &Principal, + claimed_sandbox_id: &str, +) -> Result { + match principal { + Principal::Sandbox(p) => { + ensure_sandbox_scope(principal, claimed_sandbox_id)?; + Ok(p.clone()) + } + Principal::User(_) => Err(Status::permission_denied( + "supervisor RPCs require a sandbox principal", + )), + Principal::Anonymous => Err(Status::unauthenticated( + "supervisor RPCs require an authenticated sandbox principal", + )), + } +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::auth::identity::{Identity, IdentityProvider}; + use crate::auth::principal::{SandboxIdentitySource, SandboxPrincipal, UserPrincipal}; + + fn user(subject: &str) -> Principal { + Principal::User(UserPrincipal { + identity: Identity { + subject: subject.to_string(), + display_name: None, + roles: vec![], + scopes: vec![], + provider: IdentityProvider::Oidc, + }, + }) + } + + fn sandbox(id: &str) -> Principal { + Principal::Sandbox(SandboxPrincipal { + sandbox_id: id.to_string(), + source: SandboxIdentitySource::BootstrapJwt { + issuer: "openshell-gateway:test".to_string(), + }, + trust_domain: Some("openshell".to_string()), + }) + } + + #[test] + fn 
user_principal_bypasses_equality_check() { + // RBAC was the user's gate at the router layer. + assert!(ensure_sandbox_scope(&user("alice"), "any-sandbox").is_ok()); + } + + #[test] + fn sandbox_principal_matching_id_is_allowed() { + assert!(ensure_sandbox_scope(&sandbox("sbx-1"), "sbx-1").is_ok()); + } + + #[test] + fn sandbox_principal_mismatched_id_is_denied() { + let err = + ensure_sandbox_scope(&sandbox("sbx-1"), "sbx-2").expect_err("must deny cross-sandbox"); + assert_eq!(err.code(), tonic::Code::PermissionDenied); + } + + #[test] + fn anonymous_principal_is_rejected() { + let err = + ensure_sandbox_scope(&Principal::Anonymous, "sbx-1").expect_err("must reject anon"); + assert_eq!(err.code(), tonic::Code::Unauthenticated); + } + + #[test] + fn sandbox_principal_scope_returns_matching_sandbox() { + let principal = sandbox("sbx-1"); + let scoped = ensure_sandbox_principal_scope(&principal, "sbx-1").expect("scope OK"); + assert_eq!(scoped.sandbox_id, "sbx-1"); + } + + #[test] + fn sandbox_principal_scope_rejects_users() { + let err = ensure_sandbox_principal_scope(&user("alice"), "sbx-1") + .expect_err("users are not supervisor identities"); + assert_eq!(err.code(), tonic::Code::PermissionDenied); + } + + #[test] + fn enforce_reads_from_request_extensions() { + let mut req = tonic::Request::new(()); + req.extensions_mut().insert(sandbox("sbx-1")); + let result = enforce_sandbox_scope(&req, "sbx-1").expect("scope OK"); + assert!(matches!(result, Principal::Sandbox(_))); + } + + #[test] + fn enforce_rejects_request_without_principal() { + let req = tonic::Request::new(()); + let err = enforce_sandbox_scope(&req, "sbx-1").expect_err("must require principal"); + assert_eq!(err.code(), tonic::Code::Unauthenticated); + } +} diff --git a/crates/openshell-server/src/auth/k8s_sa.rs b/crates/openshell-server/src/auth/k8s_sa.rs new file mode 100644 index 000000000..e0649e734 --- /dev/null +++ b/crates/openshell-server/src/auth/k8s_sa.rs @@ -0,0 +1,591 @@ +// 
SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +//! Kubernetes `ServiceAccount` bootstrap authenticator. +//! +//! Path-scoped to `IssueSandboxToken`. Validates a projected SA token +//! presented by a sandbox pod, reads the pod's `openshell.io/sandbox-id` +//! annotation, and returns a [`Principal::Sandbox`] with +//! [`SandboxIdentitySource::K8sServiceAccount`]. The `IssueSandboxToken` +//! handler then mints a gateway-signed JWT for that sandbox id; subsequent +//! gRPC calls from the supervisor use the gateway-minted JWT validated by +//! [`super::sandbox_jwt::SandboxJwtAuthenticator`]. +//! +//! This is the only authenticator that talks to the K8s apiserver. It is +//! optional — the gateway boots without it in singleplayer deployments. + +use super::authenticator::Authenticator; +use super::principal::{Principal, SandboxIdentitySource, SandboxPrincipal}; +use async_trait::async_trait; +use k8s_openapi::api::{ + authentication::v1::{TokenReview, TokenReviewSpec, TokenReviewStatus, UserInfo}, + core::v1::Pod, +}; +use k8s_openapi::apimachinery::pkg::apis::meta::v1::ObjectMeta; +use kube::api::{Api, PostParams}; +use std::sync::Arc; +use tonic::Status; +use tracing::{debug, info, warn}; + +/// gRPC method path that this authenticator accepts. All other paths fall +/// through (return `Ok(None)`) so a gateway-minted JWT is required there. +pub const ISSUE_SANDBOX_TOKEN_PATH: &str = "/openshell.v1.OpenShell/IssueSandboxToken"; + +/// Pod annotation that binds a sandbox pod to its UUID. Set by the +/// Kubernetes compute driver at pod-create time. The gateway treats this +/// annotation as authoritative; the K8s `Role` granted to the gateway must +/// not include `patch pods` (see plan §11.8). 
+pub const SANDBOX_ID_ANNOTATION: &str = "openshell.io/sandbox-id"; +const POD_NAME_EXTRA: &str = "authentication.kubernetes.io/pod-name"; +const POD_UID_EXTRA: &str = "authentication.kubernetes.io/pod-uid"; + +/// Resolved identity extracted from a validated SA token + pod lookup. +#[derive(Debug, Clone)] +pub struct ResolvedK8sIdentity { + pub sandbox_id: String, + pub pod_name: String, + pub pod_uid: String, +} + +/// Apiserver-facing operations the authenticator depends on. Split out so +/// tests can fake the apiserver without standing up a kube cluster. +#[async_trait] +pub trait K8sIdentityResolver: Send + Sync + 'static { + /// Validate `token` via `TokenReview` (`aud == openshell-gateway`), + /// extract the pod name/uid, then `GET` the pod and read + /// `openshell.io/sandbox-id`. Returns `Ok(None)` when the token is + /// well-formed but does not authenticate (e.g. wrong audience); returns + /// `Err` for transport/server errors. + async fn resolve(&self, token: &str) -> Result, Status>; +} + +/// Authenticator wrapper around a [`K8sIdentityResolver`]. +pub struct K8sServiceAccountAuthenticator { + resolver: Arc, +} + +impl std::fmt::Debug for K8sServiceAccountAuthenticator { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + f.debug_struct("K8sServiceAccountAuthenticator") + .finish_non_exhaustive() + } +} + +impl K8sServiceAccountAuthenticator { + pub fn new(resolver: Arc) -> Self { + Self { resolver } + } +} + +#[async_trait] +impl Authenticator for K8sServiceAccountAuthenticator { + async fn authenticate( + &self, + headers: &http::HeaderMap, + path: &str, + ) -> Result, Status> { + // Scope: only the bootstrap RPC. Other paths fall through so the + // SandboxJwtAuthenticator (or OIDC) handles them. 
+ if path != ISSUE_SANDBOX_TOKEN_PATH { + return Ok(None); + } + + let Some(token) = headers + .get("authorization") + .and_then(|v| v.to_str().ok()) + .and_then(|v| v.strip_prefix("Bearer ")) + else { + return Ok(None); + }; + + let Some(resolved) = self.resolver.resolve(token).await? else { + debug!("K8s SA token did not authenticate; falling through"); + return Ok(None); + }; + + if resolved.sandbox_id.is_empty() { + warn!( + pod = %resolved.pod_name, + "pod missing openshell.io/sandbox-id annotation; rejecting" + ); + return Err(Status::permission_denied( + "pod is not bound to a sandbox identity", + )); + } + + Ok(Some(Principal::Sandbox(SandboxPrincipal { + sandbox_id: resolved.sandbox_id, + source: SandboxIdentitySource::K8sServiceAccount { + pod_name: resolved.pod_name, + pod_uid: resolved.pod_uid, + }, + trust_domain: Some("openshell".to_string()), + }))) + } +} + +#[derive(Debug)] +struct TokenReviewIdentity { + pod_name: String, + pod_uid: String, +} + +/// Resolver backed by the apiserver's `TokenReview` API and `kube::Client` +/// for the per-pod annotation lookup. 
+pub struct LiveK8sResolver { + token_reviews_api: Api, + pods_api: Api, + expected_audience: String, + sandbox_namespace: String, + expected_service_account: String, +} + +impl LiveK8sResolver { + pub fn new( + client: kube::Client, + namespace: &str, + expected_audience: String, + expected_service_account: String, + ) -> Self { + let token_reviews_api: Api = Api::all(client.clone()); + let pods_api: Api = Api::namespaced(client, namespace); + Self { + token_reviews_api, + pods_api, + expected_audience, + sandbox_namespace: namespace.to_string(), + expected_service_account, + } + } +} + +#[async_trait] +impl K8sIdentityResolver for LiveK8sResolver { + async fn resolve(&self, token: &str) -> Result, Status> { + let review = TokenReview { + metadata: ObjectMeta::default(), + spec: TokenReviewSpec { + audiences: Some(vec![self.expected_audience.clone()]), + token: Some(token.to_string()), + }, + status: None, + }; + + let review = self + .token_reviews_api + .create(&PostParams::default(), &review) + .await + .map_err(|e| { + warn!(error = %e, "K8s TokenReview failed"); + Status::internal(format!("tokenreview failed: {e}")) + })?; + let status = review + .status + .ok_or_else(|| Status::internal("TokenReview response missing status"))?; + let Some(identity) = token_review_identity( + &status, + &self.expected_audience, + &self.sandbox_namespace, + &self.expected_service_account, + )? + else { + return Ok(None); + }; + + info!( + pod_name = %identity.pod_name, + pod_uid = %identity.pod_uid, + service_account = %self.expected_service_account, + "validated K8s SA token via TokenReview" + ); + + // Look up the pod and read its sandbox-id annotation. 
+ let pod = self + .pods_api + .get_opt(&identity.pod_name) + .await + .map_err(|e| { + warn!( + pod = %identity.pod_name, + error = %e, + "failed to fetch sandbox pod for annotation lookup" + ); + Status::internal(format!("pod GET failed: {e}")) + })?; + let Some(pod) = pod else { + warn!( + pod = %identity.pod_name, + "sandbox pod referenced by SA token not found in this namespace" + ); + return Err(Status::not_found("sandbox pod not found")); + }; + + // Defense-in-depth: confirm the pod UID matches the SA token's + // `kubernetes.io.pod.uid`. Prevents a replayed token from a + // recreated pod with the same name. + let actual_uid = pod.metadata.uid.as_deref().unwrap_or_default(); + if actual_uid != identity.pod_uid { + warn!( + pod = %identity.pod_name, + claimed_uid = %identity.pod_uid, + actual_uid = %actual_uid, + "SA token pod UID does not match live pod; rejecting" + ); + return Err(Status::permission_denied("SA token pod UID mismatch")); + } + + let sandbox_id = pod + .metadata + .annotations + .as_ref() + .and_then(|a| a.get(SANDBOX_ID_ANNOTATION)) + .cloned() + .unwrap_or_default(); + + Ok(Some(ResolvedK8sIdentity { + sandbox_id, + pod_name: identity.pod_name, + pod_uid: identity.pod_uid, + })) + } +} + +#[allow(clippy::result_large_err)] +fn token_review_identity( + status: &TokenReviewStatus, + expected_audience: &str, + sandbox_namespace: &str, + expected_service_account: &str, +) -> Result, Status> { + if status.authenticated != Some(true) { + debug!( + error = status.error.as_deref().unwrap_or_default(), + "K8s TokenReview did not authenticate token" + ); + return Ok(None); + } + + let audiences = status.audiences.as_deref().unwrap_or_default(); + if !audiences.iter().any(|aud| aud == expected_audience) { + warn!( + expected_audience = %expected_audience, + audiences = ?audiences, + "K8s TokenReview authenticated token without expected audience" + ); + return Err(Status::unauthenticated("SA token audience not accepted")); + } + + let user = status 
+ .user + .as_ref() + .ok_or_else(|| Status::permission_denied("TokenReview response missing user info"))?; + let username = user + .username + .as_deref() + .ok_or_else(|| Status::permission_denied("TokenReview response missing username"))?; + let expected_username = + format!("system:serviceaccount:{sandbox_namespace}:{expected_service_account}"); + if username != expected_username { + warn!( + username = %username, + sandbox_namespace = %sandbox_namespace, + service_account = %expected_service_account, + "K8s TokenReview principal is not the configured sandbox service account" + ); + return Err(Status::permission_denied( + "SA token is not from the configured sandbox service account", + )); + } + + let pod_name = user_extra_one(user, POD_NAME_EXTRA)?; + let pod_uid = user_extra_one(user, POD_UID_EXTRA)?; + Ok(Some(TokenReviewIdentity { pod_name, pod_uid })) +} + +#[allow(clippy::result_large_err)] +fn user_extra_one(user: &UserInfo, key: &str) -> Result { + let Some(values) = user.extra.as_ref().and_then(|extra| extra.get(key)) else { + return Err(Status::permission_denied("SA token is not pod-bound")); + }; + if values.len() != 1 || values[0].is_empty() { + return Err(Status::permission_denied( + "SA token has invalid pod binding", + )); + } + Ok(values[0].clone()) +} + +#[cfg(test)] +pub mod test_support { + use super::*; + use std::sync::Mutex; + + /// Fake resolver for unit tests. Returns the configured outcome on + /// every call and records the tokens it observed. 
+ pub struct FakeResolver { + pub outcome: Result, Status>, + pub seen_tokens: Mutex>, + } + + impl FakeResolver { + pub fn returning(outcome: Result, Status>) -> Self { + Self { + outcome, + seen_tokens: Mutex::new(Vec::new()), + } + } + } + + #[async_trait] + impl K8sIdentityResolver for FakeResolver { + async fn resolve(&self, token: &str) -> Result, Status> { + self.seen_tokens.lock().unwrap().push(token.to_string()); + match &self.outcome { + Ok(opt) => Ok(opt.clone()), + Err(s) => Err(Status::new(s.code(), s.message())), + } + } + } +} + +#[cfg(test)] +mod tests { + use super::test_support::FakeResolver; + use super::*; + use std::collections::BTreeMap; + + fn bearer_headers(token: &str) -> http::HeaderMap { + let mut h = http::HeaderMap::new(); + h.insert( + "authorization", + http::HeaderValue::from_str(&format!("Bearer {token}")).unwrap(), + ); + h + } + + fn token_review_status( + authenticated: bool, + audiences: Vec<&str>, + username: &str, + extra: Vec<(&str, &str)>, + ) -> TokenReviewStatus { + TokenReviewStatus { + authenticated: Some(authenticated), + audiences: Some(audiences.into_iter().map(str::to_string).collect()), + error: None, + user: Some(UserInfo { + username: Some(username.to_string()), + uid: Some("sa-uid".to_string()), + groups: Some(vec![ + "system:serviceaccounts".to_string(), + "system:serviceaccounts:openshell".to_string(), + "system:authenticated".to_string(), + ]), + extra: Some( + extra + .into_iter() + .map(|(k, v)| (k.to_string(), vec![v.to_string()])) + .collect::>(), + ), + }), + } + } + + #[test] + fn token_review_identity_extracts_pod_binding() { + let status = token_review_status( + true, + vec!["openshell-gateway"], + "system:serviceaccount:openshell:default", + vec![ + (POD_NAME_EXTRA, "openshell-sandbox-a"), + (POD_UID_EXTRA, "uid-a"), + ], + ); + + let identity = token_review_identity(&status, "openshell-gateway", "openshell", "default") + .unwrap() + .expect("authenticated token should resolve"); + + 
assert_eq!(identity.pod_name, "openshell-sandbox-a"); + assert_eq!(identity.pod_uid, "uid-a"); + } + + #[test] + fn token_review_identity_returns_none_when_not_authenticated() { + let status = TokenReviewStatus { + authenticated: Some(false), + error: Some("invalid audience".to_string()), + ..Default::default() + }; + + assert!( + token_review_identity(&status, "openshell-gateway", "openshell", "default") + .unwrap() + .is_none() + ); + } + + #[test] + fn token_review_identity_requires_expected_audience() { + let status = token_review_status( + true, + vec!["kubernetes.default.svc"], + "system:serviceaccount:openshell:default", + vec![ + (POD_NAME_EXTRA, "openshell-sandbox-a"), + (POD_UID_EXTRA, "uid-a"), + ], + ); + + let err = token_review_identity(&status, "openshell-gateway", "openshell", "default") + .expect_err("wrong audience must fail closed"); + assert_eq!(err.code(), tonic::Code::Unauthenticated); + } + + #[test] + fn token_review_identity_requires_sandbox_namespace() { + let status = token_review_status( + true, + vec!["openshell-gateway"], + "system:serviceaccount:other:default", + vec![ + (POD_NAME_EXTRA, "openshell-sandbox-a"), + (POD_UID_EXTRA, "uid-a"), + ], + ); + + let err = token_review_identity(&status, "openshell-gateway", "openshell", "default") + .expect_err("other namespace must be rejected"); + assert_eq!(err.code(), tonic::Code::PermissionDenied); + } + + #[test] + fn token_review_identity_requires_configured_service_account() { + let status = token_review_status( + true, + vec!["openshell-gateway"], + "system:serviceaccount:openshell:other", + vec![ + (POD_NAME_EXTRA, "openshell-sandbox-a"), + (POD_UID_EXTRA, "uid-a"), + ], + ); + + let err = token_review_identity(&status, "openshell-gateway", "openshell", "default") + .expect_err("other service account must be rejected"); + assert_eq!(err.code(), tonic::Code::PermissionDenied); + } + + #[test] + fn token_review_identity_requires_pod_bound_extras() { + let status = token_review_status( + 
true, + vec!["openshell-gateway"], + "system:serviceaccount:openshell:default", + vec![], + ); + + let err = token_review_identity(&status, "openshell-gateway", "openshell", "default") + .expect_err("non pod-bound tokens must be rejected"); + assert_eq!(err.code(), tonic::Code::PermissionDenied); + } + + #[tokio::test] + async fn authenticates_on_issue_path_only() { + let resolved = ResolvedK8sIdentity { + sandbox_id: "sandbox-a".to_string(), + pod_name: "openshell-sandbox-a".to_string(), + pod_uid: "uid-a".to_string(), + }; + let fake = Arc::new(FakeResolver::returning(Ok(Some(resolved)))); + let auth = K8sServiceAccountAuthenticator::new(fake.clone()); + + let on_issue = auth + .authenticate(&bearer_headers("sa-jwt"), ISSUE_SANDBOX_TOKEN_PATH) + .await + .unwrap() + .expect("expected principal"); + match on_issue { + Principal::Sandbox(p) => { + assert_eq!(p.sandbox_id, "sandbox-a"); + assert!(matches!( + p.source, + SandboxIdentitySource::K8sServiceAccount { .. } + )); + } + _ => panic!("expected sandbox principal"), + } + + let off_issue = auth + .authenticate( + &bearer_headers("sa-jwt"), + "/openshell.v1.OpenShell/GetSandboxConfig", + ) + .await + .unwrap(); + assert!( + off_issue.is_none(), + "K8s SA authenticator must be scoped to IssueSandboxToken" + ); + assert_eq!( + fake.seen_tokens.lock().unwrap().len(), + 1, + "off-path call must not consult the apiserver" + ); + } + + #[tokio::test] + async fn missing_bearer_yields_none() { + let fake = Arc::new(FakeResolver::returning(Ok(None))); + let auth = K8sServiceAccountAuthenticator::new(fake); + let result = auth + .authenticate(&http::HeaderMap::new(), ISSUE_SANDBOX_TOKEN_PATH) + .await + .unwrap(); + assert!(result.is_none()); + } + + #[tokio::test] + async fn resolver_returning_none_falls_through() { + let fake = Arc::new(FakeResolver::returning(Ok(None))); + let auth = K8sServiceAccountAuthenticator::new(fake); + let result = auth + .authenticate( + &bearer_headers("not-a-real-sa-token"), + 
ISSUE_SANDBOX_TOKEN_PATH, + ) + .await + .unwrap(); + assert!(result.is_none(), "non-authenticating tokens fall through"); + } + + #[tokio::test] + async fn pod_without_annotation_is_rejected() { + let resolved = ResolvedK8sIdentity { + sandbox_id: String::new(), + pod_name: "stray-pod".to_string(), + pod_uid: "uid".to_string(), + }; + let fake = Arc::new(FakeResolver::returning(Ok(Some(resolved)))); + let auth = K8sServiceAccountAuthenticator::new(fake); + let err = auth + .authenticate(&bearer_headers("sa-jwt"), ISSUE_SANDBOX_TOKEN_PATH) + .await + .expect_err("unbound pod must be rejected"); + assert_eq!(err.code(), tonic::Code::PermissionDenied); + } + + #[tokio::test] + async fn resolver_error_propagates() { + let fake = Arc::new(FakeResolver::returning(Err(Status::unavailable( + "apiserver down", + )))); + let auth = K8sServiceAccountAuthenticator::new(fake); + let err = auth + .authenticate(&bearer_headers("sa-jwt"), ISSUE_SANDBOX_TOKEN_PATH) + .await + .expect_err("resolver error must propagate"); + assert_eq!(err.code(), tonic::Code::Unavailable); + } +} diff --git a/crates/openshell-server/src/auth/mod.rs b/crates/openshell-server/src/auth/mod.rs index 8e4f332d8..ca032a006 100644 --- a/crates/openshell-server/src/auth/mod.rs +++ b/crates/openshell-server/src/auth/mod.rs @@ -8,9 +8,15 @@ //! - `identity`: Provider-agnostic identity representation //! - `http`: HTTP endpoints for auth discovery and token exchange +pub mod authenticator; pub mod authz; +pub mod guard; mod http; pub mod identity; +pub mod k8s_sa; pub mod oidc; +pub mod principal; +pub mod sandbox_jwt; +pub mod sandbox_methods; pub use http::router; diff --git a/crates/openshell-server/src/auth/oidc.rs b/crates/openshell-server/src/auth/oidc.rs index 92298579e..5e5a23500 100644 --- a/crates/openshell-server/src/auth/oidc.rs +++ b/crates/openshell-server/src/auth/oidc.rs @@ -10,7 +10,10 @@ //! This module owns authentication (verifying who the caller is). //! 
Authorization (deciding what the caller can do) is in `authz.rs`. +use super::authenticator::Authenticator; use super::identity::{Identity, IdentityProvider}; +use super::principal::{Principal, UserPrincipal}; +use async_trait::async_trait; use jsonwebtoken::{Algorithm, DecodingKey, Validation, decode, decode_header}; use openshell_core::OidcConfig; use reqwest::Client; @@ -22,15 +25,6 @@ use tokio::sync::RwLock; use tonic::Status; use tracing::{debug, info, warn}; -/// Internal metadata header set by the auth middleware to mark a request as -/// originating from a sandbox. This is stripped from all incoming requests -/// first so external callers cannot spoof it. -pub const INTERNAL_AUTH_SOURCE_HEADER: &str = "x-openshell-auth-source"; -/// Internal auth-source marker for requests originating from a sandbox -/// (no OIDC Bearer; trust derives from the mTLS channel or operator's -/// fronting proxy). -pub const AUTH_SOURCE_SANDBOX: &str = "sandbox"; - /// Truly unauthenticated methods — health probes and infrastructure. const UNAUTHENTICATED_METHODS: &[&str] = &[ "/openshell.v1.OpenShell/Health", @@ -40,40 +34,6 @@ const UNAUTHENTICATED_METHODS: &[&str] = &[ /// Path prefixes that bypass OIDC validation (gRPC reflection, health probes). const UNAUTHENTICATED_PREFIXES: &[&str] = &["/grpc.reflection.", "/grpc.health."]; -/// Sandbox-to-server RPCs that are called by sandboxes instead of CLI -/// users. These do not require an OIDC Bearer token; the gRPC channel's -/// mTLS handshake (or the operator's fronting proxy when -/// `--disable-gateway-auth` is set) is the trust boundary. 
-const SANDBOX_METHODS: &[&str] = &[ - "/openshell.v1.OpenShell/ReportPolicyStatus", - "/openshell.v1.OpenShell/PushSandboxLogs", - "/openshell.v1.OpenShell/GetSandboxProviderEnvironment", - "/openshell.v1.OpenShell/SubmitPolicyAnalysis", - "/openshell.sandbox.v1.SandboxService/GetSandboxConfig", - "/openshell.inference.v1.Inference/GetInferenceBundle", -]; - -/// Methods that accept either an OIDC Bearer token (CLI users, full scope) -/// or no Bearer (sandbox supervisor, sandbox-restricted scope). -/// `UpdateConfig` is called by both CLI (policy/settings mutations) and the -/// sandbox supervisor (policy sync on startup). -/// `OpenShell/GetSandboxConfig` serves CLI settings reads while remaining -/// compatible with sandbox callers. -/// `GetDraftPolicy` serves CLI reviewer surfaces (`openshell rule get`, -/// TUI inbox) AND the sandbox-side `policy.local /wait` long-poll that -/// blocks on the agent's proposal until the developer decides. -const DUAL_AUTH_METHODS: &[&str] = &[ - "/openshell.v1.OpenShell/UpdateConfig", - "/openshell.v1.OpenShell/GetSandboxConfig", - "/openshell.v1.OpenShell/GetDraftPolicy", -]; - -/// Returns `true` if the method accepts either an OIDC Bearer token or a -/// sandbox-class caller (no Bearer). -pub fn is_dual_auth_method(path: &str) -> bool { - DUAL_AUTH_METHODS.contains(&path) -} - /// Returns `true` if the method needs no authentication at all. pub fn is_unauthenticated_method(path: &str) -> bool { UNAUTHENTICATED_METHODS.contains(&path) @@ -82,34 +42,6 @@ pub fn is_unauthenticated_method(path: &str) -> bool { .any(|prefix| path.starts_with(prefix)) } -/// Returns `true` if the method is an exclusively sandbox-class call (does -/// not accept OIDC Bearer). -pub fn is_sandbox_method(path: &str) -> bool { - SANDBOX_METHODS.contains(&path) -} - -/// Remove internal auth-source markers from the request before any auth -/// decision is made so external callers cannot spoof them. 
-pub fn clear_internal_auth_markers(headers: &mut http::HeaderMap) { - headers.remove(INTERNAL_AUTH_SOURCE_HEADER); -} - -/// Mark the request as originating from a sandbox caller. -pub fn mark_sandbox_caller(headers: &mut http::HeaderMap) { - headers.insert( - INTERNAL_AUTH_SOURCE_HEADER, - http::HeaderValue::from_static(AUTH_SOURCE_SANDBOX), - ); -} - -/// Returns `true` if the request metadata indicates a sandbox caller. -pub fn is_sandbox_caller(metadata: &tonic::metadata::MetadataMap) -> bool { - metadata - .get(INTERNAL_AUTH_SOURCE_HEADER) - .and_then(|v| v.to_str().ok()) - == Some(AUTH_SOURCE_SANDBOX) -} - /// Cached JWKS key set fetched from the OIDC issuer. /// /// A `refresh_mutex` ensures that only one refresh runs at a time, @@ -419,6 +351,42 @@ impl JwksCache { } } +/// Authenticator that validates `Authorization: Bearer <token>` headers against +/// the configured OIDC issuer. +/// +/// Returns `Ok(None)` when no Bearer header is present, so the chain can fall +/// through to other authenticators (e.g. the gateway-minted sandbox JWT +/// authenticator). 
+pub struct OidcAuthenticator { + cache: Arc, +} + +impl OidcAuthenticator { + pub fn new(cache: Arc) -> Self { + Self { cache } + } +} + +#[async_trait] +impl Authenticator for OidcAuthenticator { + async fn authenticate( + &self, + headers: &http::HeaderMap, + _path: &str, + ) -> Result, Status> { + let Some(token) = headers + .get("authorization") + .and_then(|v| v.to_str().ok()) + .and_then(|v| v.strip_prefix("Bearer ")) + else { + return Ok(None); + }; + + let identity = self.cache.validate_token(token).await?; + Ok(Some(Principal::User(UserPrincipal { identity }))) + } +} + #[cfg(test)] mod tests { use super::*; @@ -433,7 +401,6 @@ mod tests { assert!(!is_unauthenticated_method( "/openshell.v1.OpenShell/CreateSandbox" )); - assert!(!is_sandbox_method("/openshell.v1.OpenShell/CreateSandbox")); } #[test] @@ -451,74 +418,6 @@ mod tests { assert!(is_unauthenticated_method("/grpc.health.v1.Health/Check")); } - #[test] - fn sandbox_rpcs_are_sandbox_methods() { - assert!(is_sandbox_method( - "/openshell.sandbox.v1.SandboxService/GetSandboxConfig" - )); - assert!(is_sandbox_method( - "/openshell.v1.OpenShell/GetSandboxProviderEnvironment" - )); - assert!(is_sandbox_method( - "/openshell.v1.OpenShell/ReportPolicyStatus" - )); - assert!(is_sandbox_method("/openshell.v1.OpenShell/PushSandboxLogs")); - assert!(is_sandbox_method( - "/openshell.v1.OpenShell/SubmitPolicyAnalysis" - )); - assert!(is_sandbox_method( - "/openshell.inference.v1.Inference/GetInferenceBundle" - )); - } - - #[test] - fn openshell_get_sandbox_config_is_dual_auth() { - assert!(!is_sandbox_method( - "/openshell.v1.OpenShell/GetSandboxConfig" - )); - assert!(is_dual_auth_method( - "/openshell.v1.OpenShell/GetSandboxConfig" - )); - } - - #[test] - fn openshell_get_draft_policy_is_dual_auth() { - // policy.local calls GetDraftPolicy from inside the sandbox - // supervisor (no Bearer, authenticated via mTLS), and the CLI/TUI - // reviewer surfaces call it with an OIDC Bearer. 
Sandbox-only - // would lock CLI out; Bearer-only would 401 the /wait long-poll - // in OIDC-enabled deployments. - assert!(!is_sandbox_method("/openshell.v1.OpenShell/GetDraftPolicy")); - assert!(is_dual_auth_method( - "/openshell.v1.OpenShell/GetDraftPolicy" - )); - } - - #[test] - fn sandbox_caller_marker_round_trips_through_metadata() { - let mut headers = http::HeaderMap::new(); - mark_sandbox_caller(&mut headers); - let metadata = tonic::metadata::MetadataMap::from_headers(headers); - assert!(is_sandbox_caller(&metadata)); - } - - #[test] - fn unmarked_request_is_not_sandbox_caller() { - let metadata = tonic::metadata::MetadataMap::new(); - assert!(!is_sandbox_caller(&metadata)); - } - - #[test] - fn clear_internal_markers_strips_spoofed_header() { - let mut headers = http::HeaderMap::new(); - headers.insert( - INTERNAL_AUTH_SOURCE_HEADER, - http::HeaderValue::from_static(AUTH_SOURCE_SANDBOX), - ); - clear_internal_auth_markers(&mut headers); - assert!(headers.get(INTERNAL_AUTH_SOURCE_HEADER).is_none()); - } - #[test] fn extract_roles_keycloak_path() { let json = serde_json::json!({ diff --git a/crates/openshell-server/src/auth/principal.rs b/crates/openshell-server/src/auth/principal.rs new file mode 100644 index 000000000..a95eb831b --- /dev/null +++ b/crates/openshell-server/src/auth/principal.rs @@ -0,0 +1,78 @@ +// SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +//! Authenticated caller principals. +//! +//! A `Principal` is the result of running the [`super::authenticator::Authenticator`] +//! chain on an inbound request. It generalizes over the kinds of callers the +//! gateway recognizes — human users (OIDC), sandbox supervisors (gateway-minted +//! JWT, future SPIFFE), and anonymous callers (truly unauthenticated methods +//! like health probes). +//! +//! Handlers read the principal from the gRPC `Request` extensions and gate +//! access accordingly. 
Sandbox-class handlers MUST compare +//! `Principal::Sandbox.sandbox_id` against the request body's `sandbox_id` +//! to prevent cross-sandbox access (see issue #1354). + +use super::identity::Identity; + +/// Who is calling. +/// +/// Inserted into `tonic::Request::extensions` by the auth router. Handlers +/// retrieve it via `req.extensions().get::<Principal>()`. +#[derive(Debug, Clone)] +pub enum Principal { + /// Human caller authenticated via OIDC (Keycloak, Entra ID, Okta, etc.). + User(UserPrincipal), + /// Sandbox supervisor authenticated by an identity bound to a specific + /// sandbox UUID. The wrapped `sandbox_id` MUST match any sandbox referenced + /// in the request body for sandbox-class methods. + Sandbox(#[allow(dead_code)] SandboxPrincipal), + /// Truly unauthenticated caller (health probes, reflection). Sandbox-class + /// and user-class methods reject this variant. + #[allow(dead_code)] + Anonymous, +} + +/// User caller — wraps the existing provider-agnostic [`Identity`]. +#[derive(Debug, Clone)] +pub struct UserPrincipal { + /// The verified identity from the authentication provider. + pub identity: Identity, +} + +/// Sandbox caller — bound to one specific sandbox UUID. +/// +/// `sandbox_id` and `source` are consumed by the router and handler guards. +#[derive(Debug, Clone)] +#[allow(dead_code)] +pub struct SandboxPrincipal { + /// Canonical sandbox UUID populated from a verified sandbox credential. + pub sandbox_id: String, + /// How this principal was verified — used for audit logs and method-specific + /// authorization checks. + pub source: SandboxIdentitySource, + /// SPIFFE trust domain. Populated when the credential is SPIFFE-shaped; + /// reserved for future per-sandbox cert / SPIRE authenticators. + pub trust_domain: Option, +} + +/// How a [`SandboxPrincipal`] was authenticated. +/// +/// Variant fields are populated by the producing authenticator and consumed +/// by audit logging and method-specific authorization checks. 
+#[derive(Debug, Clone)] +#[allow(dead_code)] +pub enum SandboxIdentitySource { + /// Gateway-minted JWT validated against the gateway's signing key. + /// Produced by [`super::sandbox_jwt::SandboxJwtAuthenticator`]. + BootstrapJwt { issuer: String }, + /// Per-sandbox client certificate. Reserved for channel-bound sandbox + /// identity. + BootstrapCert { fingerprint: String }, + /// SPIRE-issued SVID. Reserved for SPIFFE/SPIRE sandbox identity. + SpiffeSvid { spiffe_id: String }, + /// K8s `ServiceAccount` token used to bootstrap a gateway-minted JWT + /// via `IssueSandboxToken`. Populated only on that one RPC path. + K8sServiceAccount { pod_name: String, pod_uid: String }, +} diff --git a/crates/openshell-server/src/auth/sandbox_jwt.rs b/crates/openshell-server/src/auth/sandbox_jwt.rs new file mode 100644 index 000000000..2ec890249 --- /dev/null +++ b/crates/openshell-server/src/auth/sandbox_jwt.rs @@ -0,0 +1,347 @@ +// SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +//! Gateway-minted per-sandbox JWTs. +//! +//! The gateway signs an Ed25519 JWT for each sandbox at create time and +//! the sandbox supervisor presents it as `Authorization: Bearer <token>` on +//! supervisor-to-gateway gRPC calls. This module implements both sides of the +//! gateway-controlled token: +//! - [`SandboxJwtIssuer`] mints fresh tokens (called from +//! `handle_create_sandbox` and the `IssueSandboxToken` RPC). +//! - [`SandboxJwtAuthenticator`] validates tokens on inbound requests and +//! produces a [`Principal::Sandbox`] with [`SandboxIdentitySource::BootstrapJwt`]. +//! +//! Algorithm: `EdDSA` (Ed25519). Pinned via `Validation::algorithms` to +//! prevent algorithm-confusion attacks. 
+ +use super::authenticator::Authenticator; +use super::principal::{Principal, SandboxIdentitySource, SandboxPrincipal}; +use async_trait::async_trait; +use jsonwebtoken::{ + Algorithm, DecodingKey, EncodingKey, Header, Validation, decode, decode_header, encode, +}; +use serde::{Deserialize, Serialize}; +use std::time::{Duration, SystemTime, UNIX_EPOCH}; +use tonic::Status; +use tracing::{debug, warn}; + +/// SPIFFE-shaped subject prefix. Embedded in the `sub` claim of every +/// minted token so a future migration to per-sandbox certs or SPIRE can +/// reuse the same subject namespace without breaking handler equality +/// checks. +const SPIFFE_SUBJECT_PREFIX: &str = "spiffe://openshell/sandbox/"; + +/// JWT claim set serialized in every gateway-minted sandbox token. +#[derive(Debug, Serialize, Deserialize)] +pub struct SandboxJwtClaims { + /// `spiffe://openshell/sandbox/<sandbox_id>`. SPIFFE-shaped for forward + /// compatibility with channel-bound identity (per-sandbox cert / SPIRE). + pub sub: String, + /// Gateway identity (`openshell-gateway:<gateway_id>`). Both `iss` and + /// `aud` use the same value so any future replicas of the same + /// deployment validate each other's tokens without configuration. + pub iss: String, + pub aud: String, + pub iat: i64, + pub exp: i64, + /// Canonical sandbox UUID, denormalized from `sub` for cheap parsing + /// without a SPIFFE library. + pub sandbox_id: String, +} + +/// Mints fresh sandbox JWTs. +pub struct SandboxJwtIssuer { + encoding_key: EncodingKey, + kid: String, + issuer: String, + audience: String, + ttl: Duration, +} + +impl std::fmt::Debug for SandboxJwtIssuer { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + f.debug_struct("SandboxJwtIssuer") + .field("kid", &self.kid) + .field("issuer", &self.issuer) + .field("audience", &self.audience) + .field("ttl", &self.ttl) + .finish_non_exhaustive() + } +} + +/// Outcome of a successful mint. 
+#[derive(Debug, Clone)] +pub struct MintedToken { + pub token: String, + pub expires_at_ms: i64, +} + +impl SandboxJwtIssuer { + pub fn from_pem( + signing_key_pem: &[u8], + kid: String, + gateway_id: &str, + ttl: Duration, + ) -> Result { + let encoding_key = EncodingKey::from_ed_pem(signing_key_pem) + .map_err(|e| format!("failed to parse Ed25519 signing key PEM: {e}"))?; + let identity = format!("openshell-gateway:{gateway_id}"); + Ok(Self { + encoding_key, + kid, + issuer: identity.clone(), + audience: identity, + ttl, + }) + } + + /// Mint a fresh token for `sandbox_id`. + #[allow(clippy::result_large_err)] // `tonic::Status` is the natural error here + pub fn mint(&self, sandbox_id: &str) -> Result { + let now = now_secs(); + let exp = now + i64::try_from(self.ttl.as_secs()).unwrap_or(3_600); + let claims = SandboxJwtClaims { + sub: format!("{SPIFFE_SUBJECT_PREFIX}{sandbox_id}"), + iss: self.issuer.clone(), + aud: self.audience.clone(), + iat: now, + exp, + sandbox_id: sandbox_id.to_string(), + }; + let mut header = Header::new(Algorithm::EdDSA); + header.kid = Some(self.kid.clone()); + let token = encode(&header, &claims, &self.encoding_key).map_err(|e| { + warn!(error = %e, "failed to mint sandbox JWT"); + Status::internal("failed to mint sandbox token") + })?; + Ok(MintedToken { + token, + expires_at_ms: exp.saturating_mul(1000), + }) + } + + pub fn ttl(&self) -> Duration { + self.ttl + } +} + +/// Authenticator that validates gateway-minted sandbox JWTs. 
+pub struct SandboxJwtAuthenticator { + decoding_key: DecodingKey, + kid: String, + issuer: String, + audience: String, +} + +impl std::fmt::Debug for SandboxJwtAuthenticator { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + f.debug_struct("SandboxJwtAuthenticator") + .field("kid", &self.kid) + .field("issuer", &self.issuer) + .field("audience", &self.audience) + .finish_non_exhaustive() + } +} + +impl SandboxJwtAuthenticator { + pub fn from_pem(public_key_pem: &[u8], kid: String, gateway_id: &str) -> Result { + let decoding_key = DecodingKey::from_ed_pem(public_key_pem) + .map_err(|e| format!("failed to parse Ed25519 public key PEM: {e}"))?; + let identity = format!("openshell-gateway:{gateway_id}"); + Ok(Self { + decoding_key, + kid, + issuer: identity.clone(), + audience: identity, + }) + } + + #[allow(clippy::result_large_err)] + fn validate_bearer(&self, token: &str) -> Result, Status> { + let header = decode_header(token).map_err(|e| { + debug!(error = %e, "sandbox JWT header decode failed"); + Status::unauthenticated("invalid token") + })?; + + // Fall through to other authenticators when the kid does not match — + // OIDC issuers may share the Bearer slot. 
+ if header.kid.as_deref() != Some(self.kid.as_str()) { + return Ok(None); + } + if !matches!(header.alg, Algorithm::EdDSA) { + return Ok(None); + } + + let mut validation = Validation::new(Algorithm::EdDSA); + validation.algorithms = vec![Algorithm::EdDSA]; + validation.set_issuer(&[&self.issuer]); + validation.set_audience(&[&self.audience]); + validation.set_required_spec_claims(&["iss", "aud", "exp", "sub"]); + + let data = + decode::(token, &self.decoding_key, &validation).map_err(|e| { + debug!(error = %e, "sandbox JWT validation failed"); + Status::unauthenticated(format!("invalid token: {e}")) + })?; + + let claims = data.claims; + Ok(Some(Principal::Sandbox(SandboxPrincipal { + sandbox_id: claims.sandbox_id, + source: SandboxIdentitySource::BootstrapJwt { issuer: claims.iss }, + trust_domain: Some("openshell".to_string()), + }))) + } +} + +#[async_trait] +impl Authenticator for SandboxJwtAuthenticator { + async fn authenticate( + &self, + headers: &http::HeaderMap, + _path: &str, + ) -> Result, Status> { + let Some(token) = headers + .get("authorization") + .and_then(|v| v.to_str().ok()) + .and_then(|v| v.strip_prefix("Bearer ")) + else { + return Ok(None); + }; + self.validate_bearer(token) + } +} + +fn now_secs() -> i64 { + i64::try_from( + SystemTime::now() + .duration_since(UNIX_EPOCH) + .map_or(0, |d| d.as_secs()), + ) + .unwrap_or(i64::MAX) +} + +#[cfg(test)] +mod tests { + use super::*; + use openshell_bootstrap::jwt::generate_jwt_key; + + fn header_map_with_bearer(token: &str) -> http::HeaderMap { + let mut h = http::HeaderMap::new(); + h.insert( + "authorization", + http::HeaderValue::from_str(&format!("Bearer {token}")).unwrap(), + ); + h + } + + fn pair() -> (SandboxJwtIssuer, SandboxJwtAuthenticator) { + let mat = generate_jwt_key().expect("jwt key"); + let issuer = SandboxJwtIssuer::from_pem( + mat.signing_key_pem.as_bytes(), + mat.kid.clone(), + "test-gateway", + Duration::from_secs(3600), + ) + .unwrap(); + let auth = 
SandboxJwtAuthenticator::from_pem( + mat.public_key_pem.as_bytes(), + mat.kid, + "test-gateway", + ) + .unwrap(); + (issuer, auth) + } + + #[tokio::test] + async fn mint_and_validate_round_trip() { + let (issuer, auth) = pair(); + let minted = issuer.mint("sandbox-a").unwrap(); + let principal = auth + .authenticate(&header_map_with_bearer(&minted.token), "/anything") + .await + .unwrap() + .expect("expected principal"); + match principal { + Principal::Sandbox(p) => { + assert_eq!(p.sandbox_id, "sandbox-a"); + match p.source { + SandboxIdentitySource::BootstrapJwt { issuer: iss } => { + assert_eq!(iss, "openshell-gateway:test-gateway"); + } + other => panic!("unexpected source: {other:?}"), + } + } + _ => panic!("expected Sandbox principal"), + } + } + + #[tokio::test] + async fn token_signed_by_other_key_is_rejected() { + let (_, auth_a) = pair(); + let (issuer_b, _) = pair(); // different keypair + let minted = issuer_b.mint("sandbox-b").unwrap(); + // The token has a different `kid` than auth_a expects, so the + // authenticator yields None (lets the chain fall through). That is + // the documented behavior for cross-issuer Bearer headers. + let result = auth_a + .authenticate(&header_map_with_bearer(&minted.token), "/anything") + .await + .unwrap(); + assert!(result.is_none(), "different kid must fall through"); + } + + #[tokio::test] + async fn missing_bearer_yields_none() { + let (_, auth) = pair(); + let result = auth + .authenticate(&http::HeaderMap::new(), "/anything") + .await + .unwrap(); + assert!(result.is_none()); + } + + #[tokio::test] + async fn malformed_token_is_rejected() { + let (_, auth) = pair(); + let err = auth + .authenticate(&header_map_with_bearer("not.a.jwt"), "/anything") + .await + .expect_err("malformed must reject"); + assert_eq!(err.code(), tonic::Code::Unauthenticated); + } + + #[tokio::test] + async fn expired_token_is_rejected() { + // Mint a token whose iat is far in the past so its TTL window is + // already closed by `now`. 
We sign the JWT directly with the same + // signing key to bypass the issuer's TTL-vs-now coupling. + let mat = generate_jwt_key().unwrap(); + let issuer = SandboxJwtIssuer::from_pem( + mat.signing_key_pem.as_bytes(), + mat.kid.clone(), + "g", + Duration::from_secs(3600), + ) + .unwrap(); + let auth = + SandboxJwtAuthenticator::from_pem(mat.public_key_pem.as_bytes(), mat.kid.clone(), "g") + .unwrap(); + let claims = SandboxJwtClaims { + sub: format!("{SPIFFE_SUBJECT_PREFIX}sandbox-c"), + iss: "openshell-gateway:g".to_string(), + aud: "openshell-gateway:g".to_string(), + iat: now_secs() - 7200, + exp: now_secs() - 3600, + sandbox_id: "sandbox-c".to_string(), + }; + let mut header = Header::new(Algorithm::EdDSA); + header.kid = Some(mat.kid); + let token = encode(&header, &claims, &issuer.encoding_key).unwrap(); + let err = auth + .authenticate(&header_map_with_bearer(&token), "/anything") + .await + .expect_err("expired token must reject"); + assert_eq!(err.code(), tonic::Code::Unauthenticated); + } +} diff --git a/crates/openshell-server/src/auth/sandbox_methods.rs b/crates/openshell-server/src/auth/sandbox_methods.rs new file mode 100644 index 000000000..e03b8eeb6 --- /dev/null +++ b/crates/openshell-server/src/auth/sandbox_methods.rs @@ -0,0 +1,70 @@ +// SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +//! Method-level allowlist for sandbox principals. +//! +//! Gateway-minted sandbox JWTs identify a single sandbox supervisor. They +//! must not authorize user-facing or admin APIs. The router rejects sandbox +//! principals for every method outside this supervisor-to-gateway allowlist; +//! handlers still perform same-sandbox checks on request bodies. + +/// Methods a `Principal::Sandbox` may invoke. 
+const ALLOWED_SANDBOX_METHODS: &[&str] = &[ + "/openshell.v1.OpenShell/IssueSandboxToken", + "/openshell.v1.OpenShell/RefreshSandboxToken", + "/openshell.v1.OpenShell/ConnectSupervisor", + "/openshell.v1.OpenShell/RelayStream", + "/openshell.v1.OpenShell/GetSandboxConfig", + "/openshell.v1.OpenShell/GetSandboxProviderEnvironment", + "/openshell.v1.OpenShell/UpdateConfig", + "/openshell.v1.OpenShell/ReportPolicyStatus", + "/openshell.v1.OpenShell/PushSandboxLogs", + "/openshell.v1.OpenShell/SubmitPolicyAnalysis", + "/openshell.v1.OpenShell/GetDraftPolicy", + "/openshell.inference.v1.Inference/GetInferenceBundle", +]; + +pub fn is_sandbox_callable(path: &str) -> bool { + ALLOWED_SANDBOX_METHODS.contains(&path) +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn supervisor_callbacks_are_allowed() { + assert!(is_sandbox_callable( + "/openshell.v1.OpenShell/ConnectSupervisor" + )); + assert!(is_sandbox_callable("/openshell.v1.OpenShell/RelayStream")); + assert!(is_sandbox_callable( + "/openshell.v1.OpenShell/GetSandboxConfig" + )); + assert!(is_sandbox_callable( + "/openshell.inference.v1.Inference/GetInferenceBundle" + )); + } + + #[test] + fn user_and_admin_methods_are_not_allowed() { + assert!(!is_sandbox_callable( + "/openshell.v1.OpenShell/ListSandboxes" + )); + assert!(!is_sandbox_callable( + "/openshell.v1.OpenShell/DeleteSandbox" + )); + assert!(!is_sandbox_callable( + "/openshell.v1.OpenShell/CreateProvider" + )); + assert!(!is_sandbox_callable( + "/openshell.v1.OpenShell/ApproveDraftChunk" + )); + assert!(!is_sandbox_callable( + "/openshell.inference.v1.Inference/GetClusterInference" + )); + assert!(!is_sandbox_callable( + "/openshell.inference.v1.Inference/SetClusterInference" + )); + } +} diff --git a/crates/openshell-server/src/certgen.rs b/crates/openshell-server/src/certgen.rs index 683170aad..f188968fb 100644 --- a/crates/openshell-server/src/certgen.rs +++ b/crates/openshell-server/src/certgen.rs @@ -52,6 +52,12 @@ pub struct CertgenArgs { 
#[arg(long, required_unless_present = "output_dir")] client_secret_name: Option, + /// Name of the sandbox-JWT signing-key Secret (`Opaque`) to create. + /// Holds `signing.pem`, `public.pem`, and `kid` keys. Mounted on the + /// gateway pod (only) so it can mint and validate per-sandbox JWTs. + #[arg(long, required_unless_present = "output_dir")] + jwt_secret_name: Option, + /// Extra Subject Alternative Name for the server certificate. Repeatable. /// Auto-detected as an IP address or DNS name. #[arg(long = "server-san", value_name = "SAN")] @@ -93,10 +99,10 @@ enum K8sAction { Create, } -fn decide_k8s(server_exists: bool, client_exists: bool) -> K8sAction { - match (server_exists, client_exists) { - (true, true) => K8sAction::SkipExists, - (false, false) => K8sAction::Create, +fn decide_k8s(server_exists: bool, client_exists: bool, jwt_exists: bool) -> K8sAction { + match (server_exists, client_exists, jwt_exists) { + (true, true, true) => K8sAction::SkipExists, + (false, false, false) => K8sAction::Create, _ => K8sAction::PartialState, } } @@ -114,6 +120,10 @@ async fn run_kubernetes(args: &CertgenArgs, bundle: &PkiBundle) -> Result<()> { .client_secret_name .as_deref() .ok_or_else(|| miette::miette!("--client-secret-name is required"))?; + let jwt_name = args + .jwt_secret_name + .as_deref() + .ok_or_else(|| miette::miette!("--jwt-secret-name is required"))?; let client = Client::try_default() .await @@ -133,22 +143,29 @@ async fn run_kubernetes(args: &CertgenArgs, bundle: &PkiBundle) -> Result<()> { .into_diagnostic() .wrap_err_with(|| format!("failed to read secret {client_name}"))? .is_some(); + let jwt_exists = api + .get_opt(jwt_name) + .await + .into_diagnostic() + .wrap_err_with(|| format!("failed to read secret {jwt_name}"))? 
+ .is_some(); - match decide_k8s(server_exists, client_exists) { + match decide_k8s(server_exists, client_exists, jwt_exists) { K8sAction::SkipExists => { info!( namespace = %namespace, server = %server_name, client = %client_name, + jwt = %jwt_name, "PKI secrets already exist, skipping." ); return Ok(()); } K8sAction::PartialState => { return Err(miette::miette!( - "partial PKI state in namespace {namespace}: exactly one of \ - {server_name} / {client_name} exists. Recover with: \ - kubectl delete secret -n {namespace} {server_name} {client_name}", + "partial PKI state in namespace {namespace}: only some of \ + {server_name} / {client_name} / {jwt_name} exist. Recover with: \ + kubectl delete secret -n {namespace} {server_name} {client_name} {jwt_name}", )); } K8sAction::Create => {} @@ -166,6 +183,12 @@ async fn run_kubernetes(args: &CertgenArgs, bundle: &PkiBundle) -> Result<()> { &bundle.client_key_pem, &bundle.ca_cert_pem, ); + let jwt_secret = jwt_signing_secret( + jwt_name, + &bundle.jwt_signing_key_pem, + &bundle.jwt_public_key_pem, + &bundle.jwt_key_id, + ); api.create(&PostParams::default(), &server_secret) .await @@ -175,11 +198,16 @@ async fn run_kubernetes(args: &CertgenArgs, bundle: &PkiBundle) -> Result<()> { .await .into_diagnostic() .wrap_err_with(|| format!("failed to create secret {client_name}"))?; + api.create(&PostParams::default(), &jwt_secret) + .await + .into_diagnostic() + .wrap_err_with(|| format!("failed to create secret {jwt_name}"))?; info!( namespace = %namespace, server = %server_name, client = %client_name, + jwt = %jwt_name, "PKI secrets created." ); Ok(()) @@ -207,6 +235,31 @@ fn tls_secret(name: &str, crt_pem: &str, key_pem: &str, ca_pem: &str) -> Secret } } +/// Build an `Opaque` Secret carrying the gateway-minted sandbox JWT +/// signing material. Mounted only on the gateway pod — sandbox pods +/// receive a per-pod gateway-signed token, never the signing key itself. 
+fn jwt_signing_secret(name: &str, signing_pem: &str, public_pem: &str, kid: &str) -> Secret { + let mut data = BTreeMap::new(); + data.insert( + "signing.pem".to_string(), + ByteString(signing_pem.as_bytes().to_vec()), + ); + data.insert( + "public.pem".to_string(), + ByteString(public_pem.as_bytes().to_vec()), + ); + data.insert("kid".to_string(), ByteString(kid.as_bytes().to_vec())); + Secret { + metadata: ObjectMeta { + name: Some(name.to_string()), + ..Default::default() + }, + type_: Some("Opaque".to_string()), + data: Some(data), + ..Default::default() + } +} + // ─────────────────────────────── Local mode ─────────────────────────────── #[derive(Debug, PartialEq, Eq)] @@ -235,12 +288,17 @@ struct LocalPaths { client_dir: PathBuf, client_crt: PathBuf, client_key: PathBuf, + jwt_dir: PathBuf, + jwt_signing: PathBuf, + jwt_public: PathBuf, + jwt_kid: PathBuf, } impl LocalPaths { fn resolve(dir: &Path) -> Self { let server_dir = dir.join("server"); let client_dir = dir.join("client"); + let jwt_dir = dir.join("jwt"); Self { ca_crt: dir.join("ca.crt"), ca_key: dir.join("ca.key"), @@ -250,10 +308,14 @@ impl LocalPaths { client_crt: client_dir.join("tls.crt"), client_key: client_dir.join("tls.key"), client_dir, + jwt_signing: jwt_dir.join("signing.pem"), + jwt_public: jwt_dir.join("public.pem"), + jwt_kid: jwt_dir.join("kid"), + jwt_dir, } } - fn all_files(&self) -> [&Path; 6] { + fn all_files(&self) -> [&Path; 9] { [ &self.ca_crt, &self.ca_key, @@ -261,6 +323,9 @@ impl LocalPaths { &self.server_key, &self.client_crt, &self.client_key, + &self.jwt_signing, + &self.jwt_public, + &self.jwt_kid, ] } @@ -271,7 +336,7 @@ impl LocalPaths { fn decide_local(present: usize) -> LocalAction { match present { - 6 => LocalAction::Skip, + 9 => LocalAction::Skip, 0 => LocalAction::Create, _ => LocalAction::PartialState, } @@ -318,6 +383,9 @@ fn read_local_bundle(paths: &LocalPaths) -> Result { server_key_pem: read_pem(&paths.server_key)?, client_cert_pem: 
read_pem(&paths.client_crt)?, client_key_pem: read_pem(&paths.client_key)?, + jwt_signing_key_pem: read_pem(&paths.jwt_signing)?, + jwt_public_key_pem: read_pem(&paths.jwt_public)?, + jwt_key_id: read_pem(&paths.jwt_kid)?.trim().to_string(), }) } @@ -339,9 +407,11 @@ fn write_local_bundle(dir: &Path, bundle: &PkiBundle, paths: &LocalPaths) -> Res let temp_server = temp.join("server"); let temp_client = temp.join("client"); + let temp_jwt = temp.join("jwt"); create_dir_restricted(&temp)?; create_dir_restricted(&temp_server)?; create_dir_restricted(&temp_client)?; + create_dir_restricted(&temp_jwt)?; write_pem(&temp.join("ca.crt"), &bundle.ca_cert_pem, false)?; write_pem(&temp.join("ca.key"), &bundle.ca_key_pem, true)?; @@ -349,19 +419,34 @@ fn write_local_bundle(dir: &Path, bundle: &PkiBundle, paths: &LocalPaths) -> Res write_pem(&temp_server.join("tls.key"), &bundle.server_key_pem, true)?; write_pem(&temp_client.join("tls.crt"), &bundle.client_cert_pem, false)?; write_pem(&temp_client.join("tls.key"), &bundle.client_key_pem, true)?; + write_pem( + &temp_jwt.join("signing.pem"), + &bundle.jwt_signing_key_pem, + true, + )?; + write_pem( + &temp_jwt.join("public.pem"), + &bundle.jwt_public_key_pem, + false, + )?; + write_pem(&temp_jwt.join("kid"), &bundle.jwt_key_id, false)?; // Final destination (might not exist yet on first run). 
create_dir_restricted(dir)?; create_dir_restricted(&paths.server_dir)?; create_dir_restricted(&paths.client_dir)?; + create_dir_restricted(&paths.jwt_dir)?; - let renames: [(PathBuf, &Path); 6] = [ + let renames: [(PathBuf, &Path); 9] = [ (temp.join("ca.crt"), paths.ca_crt.as_path()), (temp.join("ca.key"), paths.ca_key.as_path()), (temp_server.join("tls.crt"), paths.server_crt.as_path()), (temp_server.join("tls.key"), paths.server_key.as_path()), (temp_client.join("tls.crt"), paths.client_crt.as_path()), (temp_client.join("tls.key"), paths.client_key.as_path()), + (temp_jwt.join("signing.pem"), paths.jwt_signing.as_path()), + (temp_jwt.join("public.pem"), paths.jwt_public.as_path()), + (temp_jwt.join("kid"), paths.jwt_kid.as_path()), ]; for (from, to) in &renames { std::fs::rename(from, to) @@ -406,8 +491,8 @@ fn print_bundle(bundle: &PkiBundle) { #[cfg(test)] mod tests { use super::{ - K8sAction, LocalAction, LocalPaths, decide_k8s, decide_local, read_local_bundle, - sibling_temp_dir, tls_secret, write_local_bundle, + K8sAction, LocalAction, LocalPaths, decide_k8s, decide_local, jwt_signing_secret, + read_local_bundle, sibling_temp_dir, tls_secret, write_local_bundle, }; use openshell_bootstrap::pki::generate_pki; use std::path::Path; @@ -415,23 +500,32 @@ mod tests { // ── Kubernetes-mode decision ── #[test] - fn decide_k8s_skip_when_both_exist() { - assert_eq!(decide_k8s(true, true), K8sAction::SkipExists); + fn decide_k8s_skip_when_all_three_exist() { + assert_eq!(decide_k8s(true, true, true), K8sAction::SkipExists); } #[test] - fn decide_k8s_create_when_neither_exists() { - assert_eq!(decide_k8s(false, false), K8sAction::Create); + fn decide_k8s_create_when_none_exist() { + assert_eq!(decide_k8s(false, false, false), K8sAction::Create); } #[test] - fn decide_k8s_partial_when_only_server_exists() { - assert_eq!(decide_k8s(true, false), K8sAction::PartialState); - } - - #[test] - fn decide_k8s_partial_when_only_client_exists() { - assert_eq!(decide_k8s(false, 
true), K8sAction::PartialState); + fn decide_k8s_partial_for_any_mixed_state() { + let mixes = [ + (true, false, false), + (false, true, false), + (false, false, true), + (true, true, false), + (true, false, true), + (false, true, true), + ]; + for (s, c, j) in mixes { + assert_eq!( + decide_k8s(s, c, j), + K8sAction::PartialState, + "({s},{c},{j})" + ); + } } #[test] @@ -446,11 +540,23 @@ mod tests { assert_eq!(data["ca.crt"].0, b"CA-PEM"); } + #[test] + fn jwt_signing_secret_has_opaque_type_and_three_keys() { + let s = jwt_signing_secret("jwt", "SIGN", "PUB", "kid-1"); + assert_eq!(s.metadata.name.as_deref(), Some("jwt")); + assert_eq!(s.type_.as_deref(), Some("Opaque")); + let data = s.data.expect("data set"); + assert_eq!(data.len(), 3); + assert_eq!(data["signing.pem"].0, b"SIGN"); + assert_eq!(data["public.pem"].0, b"PUB"); + assert_eq!(data["kid"].0, b"kid-1"); + } + // ── Local-mode decision ── #[test] - fn decide_local_skip_when_all_six_present() { - assert_eq!(decide_local(6), LocalAction::Skip); + fn decide_local_skip_when_all_nine_present() { + assert_eq!(decide_local(9), LocalAction::Skip); } #[test] @@ -460,7 +566,7 @@ mod tests { #[test] fn decide_local_partial_for_any_count_in_between() { - for n in 1..=5 { + for n in 1..=8 { assert_eq!(decide_local(n), LocalAction::PartialState, "n = {n}"); } } diff --git a/crates/openshell-server/src/cli.rs b/crates/openshell-server/src/cli.rs index 8d4e094c4..a7bbf7652 100644 --- a/crates/openshell-server/src/cli.rs +++ b/crates/openshell-server/src/cli.rs @@ -336,6 +336,17 @@ async fn run_from_args(mut args: RunArgs, matches: ArgMatches) -> Result<()> { }); } + // `gateway_jwt` is configured through TOML. Standard deployments + // (helm chart + RPM init script) drop the keypair to a known path and + // pass that path through the file. A CLI shortcut can be added if a + // singleplayer operator needs to override it. 
+ if let Some(jwt) = file + .as_ref() + .and_then(|f| f.openshell.gateway.gateway_jwt.clone()) + { + config.gateway_jwt = Some(jwt); + } + let vm_config = build_vm_config( file.as_ref(), local_tls.as_ref(), @@ -832,6 +843,8 @@ mod tests { "openshell-server-tls", "--client-secret-name", "openshell-client-tls", + "--jwt-secret-name", + "openshell-jwt-keys", "--server-san", "openshell.example.com", "--server-san", diff --git a/crates/openshell-server/src/compute/mod.rs b/crates/openshell-server/src/compute/mod.rs index a69231dea..98dc3fd63 100644 --- a/crates/openshell-server/src/compute/mod.rs +++ b/crates/openshell-server/src/compute/mod.rs @@ -421,7 +421,11 @@ impl ComputeRuntime { .map(|_| ()) } - pub async fn create_sandbox(&self, sandbox: Sandbox) -> Result { + pub async fn create_sandbox( + &self, + sandbox: Sandbox, + sandbox_token: Option, + ) -> Result { let sandbox_id = sandbox.object_id().to_string(); // Create with MustCreate condition to prevent duplicate creation race @@ -452,7 +456,12 @@ impl ComputeRuntime { } })?; - let driver_sandbox = driver_sandbox_from_public(&sandbox); + let mut driver_sandbox = driver_sandbox_from_public(&sandbox); + if let Some(token) = sandbox_token + && let Some(spec) = driver_sandbox.spec.as_mut() + { + spec.sandbox_token = token; + } match self .driver .create_sandbox(Request::new(CreateSandboxRequest { @@ -1229,6 +1238,7 @@ fn driver_sandbox_spec_from_public(spec: &SandboxSpec) -> DriverSandboxSpec { .map(driver_sandbox_template_from_public), gpu: spec.gpu, gpu_device: spec.gpu_device.clone(), + sandbox_token: String::new(), } } @@ -1614,6 +1624,7 @@ fn is_terminal_failure_reason(reason: &str) -> bool { "dependenciesnotready", "starting", "containerstarting", + "containercreated", "healthcheckstarting", "inspectfailed", ]; @@ -1996,6 +2007,10 @@ mod tests { ), ("dependenciesnotready", "lowercase also works"), ("Starting", "VM is starting"), + ( + "ContainerCreated", + "Podman created the container before starting it", + 
), ]; for (reason, message) in transient_cases { @@ -2033,6 +2048,10 @@ mod tests { "Pod exists with phase: Pending; Service Exists", ), ("Starting", "VM is starting"), + ( + "ContainerCreated", + "Container exists but has not started yet", + ), ]; for (reason, message) in transient_conditions { @@ -2862,7 +2881,7 @@ mod tests { resource_version: 0, }); - let created = runtime.create_sandbox(sandbox).await.unwrap(); + let created = runtime.create_sandbox(sandbox, None).await.unwrap(); assert_eq!( created.metadata.as_ref().unwrap().resource_version, @@ -2898,11 +2917,11 @@ mod tests { // Spawn two concurrent creation attempts for the same sandbox let runtime1 = runtime.clone(); let sandbox1 = sandbox.clone(); - let handle1 = tokio::spawn(async move { runtime1.create_sandbox(sandbox1).await }); + let handle1 = tokio::spawn(async move { runtime1.create_sandbox(sandbox1, None).await }); let runtime2 = runtime.clone(); let sandbox2 = sandbox.clone(); - let handle2 = tokio::spawn(async move { runtime2.create_sandbox(sandbox2).await }); + let handle2 = tokio::spawn(async move { runtime2.create_sandbox(sandbox2, None).await }); // Wait for both to complete let result1 = handle1.await.unwrap(); diff --git a/crates/openshell-server/src/config_file.rs b/crates/openshell-server/src/config_file.rs index db0dcd684..e0bd15123 100644 --- a/crates/openshell-server/src/config_file.rs +++ b/crates/openshell-server/src/config_file.rs @@ -25,7 +25,7 @@ use std::net::SocketAddr; use std::path::{Path, PathBuf}; use openshell_core::config::ComputeDriverKind; -use openshell_core::{OidcConfig, TlsConfig}; +use openshell_core::{GatewayJwtConfig, OidcConfig, TlsConfig}; use serde::{Deserialize, Serialize}; /// Latest schema version this build understands. 
@@ -112,9 +112,16 @@ pub struct GatewayFileSection { #[serde(default)] pub client_tls_secret_name: Option, #[serde(default)] + pub service_account_name: Option, + #[serde(default)] pub host_gateway_ip: Option, #[serde(default)] pub enable_user_namespaces: Option, + /// Lifetime (seconds) of the projected `ServiceAccount` token kubelet + /// writes for the `IssueSandboxToken` bootstrap exchange. Driver + /// clamps to `[600, 86400]`. + #[serde(default)] + pub sa_token_ttl_secs: Option, #[serde(default)] pub guest_tls_ca: Option, #[serde(default)] @@ -133,6 +140,8 @@ pub struct GatewayFileSection { pub tls: Option, #[serde(default)] pub oidc: Option, + #[serde(default)] + pub gateway_jwt: Option, // ── Disallowed-in-file fields ──────────────────────────────────────── // @@ -245,8 +254,10 @@ fn inheritable_keys(driver: ComputeDriverKind) -> &'static [&'static str] { "default_image", "supervisor_image", "client_tls_secret_name", + "service_account_name", "host_gateway_ip", "enable_user_namespaces", + "sa_token_ttl_secs", ], ComputeDriverKind::Docker => &[ "sandbox_namespace", @@ -279,8 +290,10 @@ fn gateway_inherited_value(g: &GatewayFileSection, key: &str) -> Option g.default_image.as_deref().map(string_value), "supervisor_image" => g.supervisor_image.as_deref().map(string_value), "client_tls_secret_name" => g.client_tls_secret_name.as_deref().map(string_value), + "service_account_name" => g.service_account_name.as_deref().map(string_value), "host_gateway_ip" => g.host_gateway_ip.as_deref().map(string_value), "enable_user_namespaces" => g.enable_user_namespaces.map(toml::Value::Boolean), + "sa_token_ttl_secs" => g.sa_token_ttl_secs.map(toml::Value::Integer), "guest_tls_ca" => g.guest_tls_ca.as_deref().map(path_value), "guest_tls_cert" => g.guest_tls_cert.as_deref().map(path_value), "guest_tls_key" => g.guest_tls_key.as_deref().map(path_value), @@ -334,6 +347,7 @@ sandbox_namespace = "agents" default_image = "ghcr.io/nvidia/openshell/sandbox:latest" supervisor_image = 
"ghcr.io/nvidia/openshell/supervisor:latest" client_tls_secret_name = "openshell-sandbox-tls" +service_account_name = "openshell-sandbox" [openshell.gateway.tls] cert_path = "/etc/openshell/certs/gateway.pem" diff --git a/crates/openshell-server/src/grpc/auth_rpc.rs b/crates/openshell-server/src/grpc/auth_rpc.rs new file mode 100644 index 000000000..8e98b1824 --- /dev/null +++ b/crates/openshell-server/src/grpc/auth_rpc.rs @@ -0,0 +1,365 @@ +// SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +//! Authentication-related RPC handlers. +//! +//! Hosts the two sandbox-identity RPCs: +//! - `IssueSandboxToken` — bootstrap exchange (K8s SA token → gateway JWT) +//! - `RefreshSandboxToken` — renew a still-valid gateway JWT +//! +//! Both end in a fresh gateway-signed JWT minted by +//! [`crate::auth::sandbox_jwt::SandboxJwtIssuer`]. Older tokens remain valid +//! until their own `exp` and are bounded by the configured short TTL. + +use crate::ServerState; +use crate::auth::principal::{Principal, SandboxIdentitySource}; +use openshell_core::proto::{ + IssueSandboxTokenRequest, IssueSandboxTokenResponse, RefreshSandboxTokenRequest, + RefreshSandboxTokenResponse, Sandbox, +}; +use std::sync::Arc; +use tonic::{Request, Response, Status}; +use tracing::{debug, info, warn}; + +#[allow(clippy::result_large_err, clippy::unused_async)] +pub async fn handle_issue_sandbox_token( + state: &Arc, + request: Request, +) -> Result, Status> { + let principal = request + .extensions() + .get::() + .cloned() + .ok_or_else(|| Status::unauthenticated("missing principal"))?; + + let Principal::Sandbox(sandbox) = principal else { + return Err(Status::permission_denied( + "IssueSandboxToken requires a sandbox principal", + )); + }; + + // Only the bootstrap K8s ServiceAccount path can mint a fresh gateway JWT + // via this RPC. 
Sandboxes already holding a gateway JWT use + // `RefreshSandboxToken` instead. + if !matches!( + sandbox.source, + SandboxIdentitySource::K8sServiceAccount { .. } + ) { + debug!( + sandbox_id = %sandbox.sandbox_id, + "IssueSandboxToken rejected: non-bootstrap principal source" + ); + return Err(Status::permission_denied( + "this principal cannot mint a sandbox token; use RefreshSandboxToken", + )); + } + + let issuer = state.sandbox_jwt_issuer.as_ref().ok_or_else(|| { + warn!( + sandbox_id = %sandbox.sandbox_id, + "IssueSandboxToken called but sandbox JWT issuer is not configured" + ); + Status::unavailable("sandbox JWT minting is not configured on this gateway") + })?; + + ensure_sandbox_exists(state, &sandbox.sandbox_id).await?; + + let minted = issuer.mint(&sandbox.sandbox_id)?; + info!( + sandbox_id = %sandbox.sandbox_id, + "issued gateway sandbox JWT" + ); + Ok(Response::new(IssueSandboxTokenResponse { + token: minted.token, + expires_at_ms: minted.expires_at_ms, + })) +} + +#[allow(clippy::result_large_err, clippy::unused_async)] +pub async fn handle_refresh_sandbox_token( + state: &Arc, + request: Request, +) -> Result, Status> { + let principal = request + .extensions() + .get::() + .cloned() + .ok_or_else(|| Status::unauthenticated("missing principal"))?; + + let Principal::Sandbox(sandbox) = principal else { + return Err(Status::permission_denied( + "RefreshSandboxToken requires a sandbox principal", + )); + }; + + // Only callers already holding a gateway-minted JWT may refresh; the + // K8s bootstrap path must use `IssueSandboxToken`. + let SandboxIdentitySource::BootstrapJwt { .. 
} = &sandbox.source else { + debug!( + sandbox_id = %sandbox.sandbox_id, + "RefreshSandboxToken rejected: non-gateway-JWT principal source" + ); + return Err(Status::permission_denied( + "this principal cannot refresh; use IssueSandboxToken for bootstrap", + )); + }; + + let issuer = state.sandbox_jwt_issuer.as_ref().ok_or_else(|| { + warn!( + sandbox_id = %sandbox.sandbox_id, + "RefreshSandboxToken called but sandbox JWT issuer is not configured" + ); + Status::unavailable("sandbox JWT minting is not configured on this gateway") + })?; + + ensure_sandbox_exists(state, &sandbox.sandbox_id).await?; + + let minted = issuer.mint(&sandbox.sandbox_id)?; + info!( + sandbox_id = %sandbox.sandbox_id, + "renewed gateway sandbox JWT" + ); + + Ok(Response::new(RefreshSandboxTokenResponse { + token: minted.token, + expires_at_ms: minted.expires_at_ms, + })) +} + +async fn ensure_sandbox_exists(state: &Arc, sandbox_id: &str) -> Result<(), Status> { + if sandbox_id.is_empty() { + return Err(Status::invalid_argument("sandbox_id is required")); + } + + state + .store + .get_message::(sandbox_id) + .await + .map_err(|e| Status::internal(format!("fetch sandbox failed: {e}")))? 
+ .ok_or_else(|| Status::not_found("sandbox not found"))?; + + Ok(()) +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::ServerState; + use crate::auth::principal::{Principal, SandboxPrincipal, UserPrincipal}; + use crate::auth::sandbox_jwt::SandboxJwtIssuer; + use crate::compute::new_test_runtime; + use crate::persistence::Store; + use crate::sandbox_index::SandboxIndex; + use crate::sandbox_watch::SandboxWatchBus; + use crate::supervisor_session::SupervisorSessionRegistry; + use crate::tracing_bus::TracingLogBus; + use openshell_bootstrap::jwt::generate_jwt_key; + use openshell_core::Config; + use openshell_core::proto::datamodel::v1::ObjectMeta; + use openshell_core::proto::{Sandbox, SandboxPhase, SandboxSpec}; + use std::collections::HashMap; + use std::time::Duration; + + async fn state_with_issuer() -> Arc { + let mat = generate_jwt_key().expect("jwt key"); + let store = Arc::new( + Store::connect("sqlite::memory:?cache=shared") + .await + .unwrap(), + ); + let compute = new_test_runtime(store.clone()).await; + let mut state = ServerState::new( + Config::new(None).with_database_url("sqlite::memory:?cache=shared"), + store, + compute, + SandboxIndex::new(), + SandboxWatchBus::new(), + TracingLogBus::new(), + Arc::new(SupervisorSessionRegistry::new()), + None, + ); + // We don't need the authenticator for these tests; only the issuer. 
+ let issuer = SandboxJwtIssuer::from_pem( + mat.signing_key_pem.as_bytes(), + mat.kid, + "test-gateway", + Duration::from_secs(3600), + ) + .unwrap(); + state.sandbox_jwt_issuer = Some(Arc::new(issuer)); + let state = Arc::new(state); + insert_sandbox(&state, "sandbox-a").await; + state + } + + async fn insert_sandbox(state: &Arc, sandbox_id: &str) { + let sandbox = Sandbox { + metadata: Some(ObjectMeta { + id: sandbox_id.to_string(), + name: sandbox_id.to_string(), + created_at_ms: 1_000_000, + labels: HashMap::default(), + resource_version: 0, + }), + spec: Some(SandboxSpec { + policy: None, + ..Default::default() + }), + phase: SandboxPhase::Ready as i32, + ..Default::default() + }; + state.store.put_message(&sandbox).await.unwrap(); + } + + fn sandbox_principal(sandbox_id: &str) -> Principal { + use crate::auth::principal::SandboxIdentitySource; + Principal::Sandbox(SandboxPrincipal { + sandbox_id: sandbox_id.to_string(), + source: SandboxIdentitySource::BootstrapJwt { + issuer: "openshell-gateway:test-gateway".to_string(), + }, + trust_domain: Some("openshell".to_string()), + }) + } + + #[tokio::test] + async fn refresh_returns_new_token() { + let state = state_with_issuer().await; + let mut req = Request::new(RefreshSandboxTokenRequest {}); + req.extensions_mut().insert(sandbox_principal("sandbox-a")); + let resp = handle_refresh_sandbox_token(&state, req) + .await + .expect("refresh OK") + .into_inner(); + assert!(!resp.token.is_empty()); + assert!(resp.expires_at_ms > 0); + } + + #[tokio::test] + async fn refresh_rejects_missing_sandbox() { + let state = state_with_issuer().await; + let mut req = Request::new(RefreshSandboxTokenRequest {}); + req.extensions_mut() + .insert(sandbox_principal("sandbox-deleted")); + let err = handle_refresh_sandbox_token(&state, req) + .await + .expect_err("missing sandbox must not refresh"); + assert_eq!(err.code(), tonic::Code::NotFound); + } + + #[tokio::test] + async fn issue_returns_token_for_existing_sandbox() { + use 
crate::auth::principal::SandboxIdentitySource; + + let state = state_with_issuer().await; + let mut req = Request::new(IssueSandboxTokenRequest {}); + req.extensions_mut() + .insert(Principal::Sandbox(SandboxPrincipal { + sandbox_id: "sandbox-a".to_string(), + source: SandboxIdentitySource::K8sServiceAccount { + pod_name: "pod-a".to_string(), + pod_uid: "uid-a".to_string(), + }, + trust_domain: Some("openshell".to_string()), + })); + let resp = handle_issue_sandbox_token(&state, req) + .await + .expect("issue OK") + .into_inner(); + assert!(!resp.token.is_empty()); + assert!(resp.expires_at_ms > 0); + } + + #[tokio::test] + async fn issue_rejects_missing_sandbox() { + use crate::auth::principal::SandboxIdentitySource; + + let state = state_with_issuer().await; + let mut req = Request::new(IssueSandboxTokenRequest {}); + req.extensions_mut() + .insert(Principal::Sandbox(SandboxPrincipal { + sandbox_id: "sandbox-deleted".to_string(), + source: SandboxIdentitySource::K8sServiceAccount { + pod_name: "pod-a".to_string(), + pod_uid: "uid-a".to_string(), + }, + trust_domain: Some("openshell".to_string()), + })); + let err = handle_issue_sandbox_token(&state, req) + .await + .expect_err("missing sandbox must not receive a token"); + assert_eq!(err.code(), tonic::Code::NotFound); + } + + #[tokio::test] + async fn refresh_rejects_user_principal() { + use crate::auth::identity::{Identity, IdentityProvider}; + let state = state_with_issuer().await; + let mut req = Request::new(RefreshSandboxTokenRequest {}); + req.extensions_mut().insert(Principal::User(UserPrincipal { + identity: Identity { + subject: "alice".to_string(), + display_name: None, + roles: vec![], + scopes: vec![], + provider: IdentityProvider::Oidc, + }, + })); + let err = handle_refresh_sandbox_token(&state, req) + .await + .expect_err("user must not refresh"); + assert_eq!(err.code(), tonic::Code::PermissionDenied); + } + + #[tokio::test] + async fn refresh_rejects_k8s_sa_principal() { + // K8s SA-bootstrap 
principals must use IssueSandboxToken, not + // RefreshSandboxToken — the refresh path assumes a still-valid + // gateway-minted JWT exists. + use crate::auth::principal::SandboxIdentitySource; + let state = state_with_issuer().await; + let mut req = Request::new(RefreshSandboxTokenRequest {}); + req.extensions_mut() + .insert(Principal::Sandbox(SandboxPrincipal { + sandbox_id: "sandbox-a".to_string(), + source: SandboxIdentitySource::K8sServiceAccount { + pod_name: "pod-a".to_string(), + pod_uid: "uid-a".to_string(), + }, + trust_domain: Some("openshell".to_string()), + })); + let err = handle_refresh_sandbox_token(&state, req) + .await + .expect_err("K8s SA principal must not refresh"); + assert_eq!(err.code(), tonic::Code::PermissionDenied); + } + + #[tokio::test] + async fn refresh_fails_when_issuer_not_configured() { + // Build a ServerState without the issuer to confirm the handler + // returns Unavailable. + let store = Arc::new( + Store::connect("sqlite::memory:?cache=shared") + .await + .unwrap(), + ); + let compute = new_test_runtime(store.clone()).await; + let state = Arc::new(ServerState::new( + Config::new(None).with_database_url("sqlite::memory:?cache=shared"), + store, + compute, + SandboxIndex::new(), + SandboxWatchBus::new(), + TracingLogBus::new(), + Arc::new(SupervisorSessionRegistry::new()), + None, + )); + insert_sandbox(&state, "sandbox-a").await; + let mut req = Request::new(RefreshSandboxTokenRequest {}); + req.extensions_mut().insert(sandbox_principal("sandbox-a")); + let err = handle_refresh_sandbox_token(&state, req) + .await + .expect_err("missing issuer must yield unavailable"); + assert_eq!(err.code(), tonic::Code::Unavailable); + } +} diff --git a/crates/openshell-server/src/grpc/mod.rs b/crates/openshell-server/src/grpc/mod.rs index 8f70c20bb..8538c8658 100644 --- a/crates/openshell-server/src/grpc/mod.rs +++ b/crates/openshell-server/src/grpc/mod.rs @@ -3,6 +3,7 @@ //! gRPC service implementation. 
+mod auth_rpc; pub mod policy; pub mod provider; mod sandbox; @@ -27,18 +28,20 @@ use openshell_core::proto::{ GetSandboxLogsResponse, GetSandboxPolicyStatusRequest, GetSandboxPolicyStatusResponse, GetSandboxProviderEnvironmentRequest, GetSandboxProviderEnvironmentResponse, GetSandboxRequest, GetServiceRequest, HealthRequest, HealthResponse, ImportProviderProfilesRequest, - ImportProviderProfilesResponse, LintProviderProfilesRequest, LintProviderProfilesResponse, - ListProviderProfilesRequest, ListProviderProfilesResponse, ListProvidersRequest, - ListProvidersResponse, ListSandboxPoliciesRequest, ListSandboxPoliciesResponse, - ListSandboxProvidersRequest, ListSandboxProvidersResponse, ListSandboxesRequest, - ListSandboxesResponse, ListServicesRequest, ListServicesResponse, ProviderProfileResponse, - ProviderResponse, PushSandboxLogsRequest, PushSandboxLogsResponse, RejectDraftChunkRequest, - RejectDraftChunkResponse, RelayFrame, ReportPolicyStatusRequest, ReportPolicyStatusResponse, - RevokeSshSessionRequest, RevokeSshSessionResponse, RotateProviderCredentialRequest, - RotateProviderCredentialResponse, SandboxResponse, SandboxStreamEvent, ServiceEndpointResponse, - ServiceStatus, SubmitPolicyAnalysisRequest, SubmitPolicyAnalysisResponse, SupervisorMessage, - TcpForwardFrame, UndoDraftChunkRequest, UndoDraftChunkResponse, UpdateConfigRequest, - UpdateConfigResponse, UpdateProviderRequest, WatchSandboxRequest, open_shell_server::OpenShell, + ImportProviderProfilesResponse, IssueSandboxTokenRequest, IssueSandboxTokenResponse, + LintProviderProfilesRequest, LintProviderProfilesResponse, ListProviderProfilesRequest, + ListProviderProfilesResponse, ListProvidersRequest, ListProvidersResponse, + ListSandboxPoliciesRequest, ListSandboxPoliciesResponse, ListSandboxProvidersRequest, + ListSandboxProvidersResponse, ListSandboxesRequest, ListSandboxesResponse, ListServicesRequest, + ListServicesResponse, ProviderProfileResponse, ProviderResponse, PushSandboxLogsRequest, + 
PushSandboxLogsResponse, RefreshSandboxTokenRequest, RefreshSandboxTokenResponse, + RejectDraftChunkRequest, RejectDraftChunkResponse, RelayFrame, ReportPolicyStatusRequest, + ReportPolicyStatusResponse, RevokeSshSessionRequest, RevokeSshSessionResponse, + RotateProviderCredentialRequest, RotateProviderCredentialResponse, SandboxResponse, + SandboxStreamEvent, ServiceEndpointResponse, ServiceStatus, SubmitPolicyAnalysisRequest, + SubmitPolicyAnalysisResponse, SupervisorMessage, TcpForwardFrame, UndoDraftChunkRequest, + UndoDraftChunkResponse, UpdateConfigRequest, UpdateConfigResponse, UpdateProviderRequest, + WatchSandboxRequest, open_shell_server::OpenShell, }; use serde::{Deserialize, Serialize}; use std::collections::BTreeMap; @@ -567,6 +570,22 @@ impl OpenShell for OpenShellService { policy::handle_get_draft_history(&self.state, request).await } + // --- Sandbox identity --- + + async fn issue_sandbox_token( + &self, + request: Request, + ) -> Result, Status> { + auth_rpc::handle_issue_sandbox_token(&self.state, request).await + } + + async fn refresh_sandbox_token( + &self, + request: Request, + ) -> Result, Status> { + auth_rpc::handle_refresh_sandbox_token(&self.state, request).await + } + // --- Supervisor session --- type ConnectSupervisorStream = diff --git a/crates/openshell-server/src/grpc/policy.rs b/crates/openshell-server/src/grpc/policy.rs index 412febb96..886d66a0e 100644 --- a/crates/openshell-server/src/grpc/policy.rs +++ b/crates/openshell-server/src/grpc/policy.rs @@ -10,9 +10,10 @@ #![allow(clippy::cast_precision_loss)] // f64->f32 for confidence scores #![allow(clippy::items_after_statements)] // DB_PORTS const inside function +use crate::ServerState; +use crate::auth::principal::Principal; use crate::persistence::{DraftChunkRecord, ObjectId, ObjectName, ObjectType, PolicyRecord, Store}; use crate::policy_store::PolicyStoreExt; -use crate::{ServerState, auth::oidc}; use openshell_core::proto::policy_merge_operation; use 
openshell_core::proto::setting_value; use openshell_core::proto::{ @@ -314,8 +315,12 @@ fn truncate_for_log(input: &str, max_chars: usize) -> String { } } +#[cfg(test)] fn is_sandbox_caller(request: &Request) -> bool { - oidc::is_sandbox_caller(request.metadata()) + matches!( + request.extensions().get::(), + Some(Principal::Sandbox(_)) + ) } /// Sandbox-class callers may only perform sandbox-scoped policy sync. They @@ -352,7 +357,9 @@ pub(super) async fn handle_get_sandbox_config( state: &Arc, request: Request, ) -> Result, Status> { - let sandbox_id = request.into_inner().sandbox_id; + let sandbox_id = request.get_ref().sandbox_id.clone(); + crate::auth::guard::enforce_sandbox_scope(&request, &sandbox_id)?; + drop(request); let sandbox = state .store @@ -615,7 +622,9 @@ pub(super) async fn handle_get_sandbox_provider_environment( state: &Arc, request: Request, ) -> Result, Status> { - let sandbox_id = request.into_inner().sandbox_id; + let sandbox_id = request.get_ref().sandbox_id.clone(); + crate::auth::guard::enforce_sandbox_scope(&request, &sandbox_id)?; + drop(request); let sandbox = state .store @@ -658,10 +667,26 @@ pub(super) async fn handle_update_config( state: &Arc, request: Request, ) -> Result, Status> { - let sandbox_caller = is_sandbox_caller(&request); + let principal = request.extensions().get::().cloned(); + let sandbox_caller = matches!(principal, Some(Principal::Sandbox(_))); let req = request.into_inner(); if sandbox_caller { validate_sandbox_caller_update(&req)?; + // Resolve req.name to a sandbox UUID and verify the calling + // sandbox principal owns it. User callers (CLI / TUI) bypass + // this check because RBAC was their gate. + let sandbox = state + .store + .get_message_by_name::(&req.name) + .await + .map_err(|e| Status::internal(format!("fetch sandbox failed: {e}")))? 
+ .ok_or_else(|| Status::not_found("sandbox not found"))?; + crate::auth::guard::ensure_sandbox_scope( + principal + .as_ref() + .expect("sandbox_caller implies principal"), + sandbox.object_id(), + )?; } let key = req.setting_key.trim(); let has_policy = req.policy.is_some(); @@ -1184,6 +1209,8 @@ pub(super) async fn handle_report_policy_status( state: &Arc, request: Request, ) -> Result, Status> { + let sandbox_id = request.get_ref().sandbox_id.clone(); + crate::auth::guard::enforce_sandbox_scope(&request, &sandbox_id)?; let req = request.into_inner(); if req.sandbox_id.is_empty() { return Err(Status::invalid_argument("sandbox_id is required")); @@ -1306,8 +1333,13 @@ pub(super) async fn handle_push_sandbox_logs( state: &Arc, request: Request>, ) -> Result, Status> { + let principal = request + .extensions() + .get::() + .cloned() + .ok_or_else(|| Status::unauthenticated("missing principal"))?; let mut stream = request.into_inner(); - let mut validated = false; + let mut validated_sandbox_id = None; while let Some(batch) = stream .message() @@ -1318,15 +1350,13 @@ pub(super) async fn handle_push_sandbox_logs( continue; } - if !validated { - state - .store - .get_message::(&batch.sandbox_id) - .await - .map_err(|e| Status::internal(format!("fetch sandbox failed: {e}")))? 
- .ok_or_else(|| Status::not_found("sandbox not found"))?; - validated = true; - } + ensure_log_stream_sandbox_scope( + state, + &principal, + &batch.sandbox_id, + &mut validated_sandbox_id, + ) + .await?; for log in batch.logs.into_iter().take(100) { let mut log = log; @@ -1339,6 +1369,32 @@ pub(super) async fn handle_push_sandbox_logs( Ok(Response::new(PushSandboxLogsResponse {})) } +async fn ensure_log_stream_sandbox_scope( + state: &Arc, + principal: &Principal, + sandbox_id: &str, + validated_sandbox_id: &mut Option, +) -> Result<(), Status> { + if let Some(validated) = validated_sandbox_id.as_deref() { + if sandbox_id != validated { + return Err(Status::permission_denied( + "log stream sandbox_id changed after validation", + )); + } + return Ok(()); + } + + crate::auth::guard::ensure_sandbox_scope(principal, sandbox_id)?; + state + .store + .get_message::(sandbox_id) + .await + .map_err(|e| Status::internal(format!("fetch sandbox failed: {e}")))? + .ok_or_else(|| Status::not_found("sandbox not found"))?; + *validated_sandbox_id = Some(sandbox_id.to_string()); + Ok(()) +} + // --------------------------------------------------------------------------- // Draft policy recommendation handlers // --------------------------------------------------------------------------- @@ -1347,6 +1403,11 @@ pub(super) async fn handle_submit_policy_analysis( state: &Arc, request: Request, ) -> Result, Status> { + let principal = request + .extensions() + .get::() + .cloned() + .ok_or_else(|| Status::unauthenticated("missing principal"))?; let req = request.into_inner(); if req.name.is_empty() { return Err(Status::invalid_argument("name is required")); @@ -1359,6 +1420,9 @@ pub(super) async fn handle_submit_policy_analysis( .map_err(|e| Status::internal(format!("fetch sandbox failed: {e}")))? 
.ok_or_else(|| Status::not_found("sandbox not found"))?; let sandbox_id = sandbox.object_id().to_string(); + // Name → id resolved; now enforce that a sandbox principal only acts + // on its own sandbox. User principals are unaffected. + crate::auth::guard::ensure_sandbox_scope(&principal, &sandbox_id)?; let current_version = state .store @@ -1475,6 +1539,11 @@ pub(super) async fn handle_get_draft_policy( state: &Arc, request: Request, ) -> Result, Status> { + let principal = request + .extensions() + .get::() + .cloned() + .ok_or_else(|| Status::unauthenticated("missing principal"))?; let req = request.into_inner(); if req.name.is_empty() { return Err(Status::invalid_argument("name is required")); @@ -1487,6 +1556,7 @@ pub(super) async fn handle_get_draft_policy( .map_err(|e| Status::internal(format!("fetch sandbox failed: {e}")))? .ok_or_else(|| Status::not_found("sandbox not found"))?; let sandbox_id = sandbox.object_id().to_string(); + crate::auth::guard::ensure_sandbox_scope(&principal, &sandbox_id)?; let status_filter = if req.status_filter.is_empty() { None @@ -2828,11 +2898,49 @@ fn materialize_global_settings( #[cfg(test)] mod tests { use super::*; + use crate::auth::identity::{Identity, IdentityProvider}; + use crate::auth::principal::{ + Principal, SandboxIdentitySource, SandboxPrincipal, UserPrincipal, + }; use crate::grpc::test_support::test_server_state; use std::collections::HashMap; use std::sync::Arc; use tonic::Code; + /// Wrap a request with a user `Principal` so handler scope guards treat + /// the test caller as a CLI user. Most handler tests exercise + /// user-facing behavior and should not trip sandbox equality checks. 
+ fn with_user(mut request: Request) -> Request { + request + .extensions_mut() + .insert(Principal::User(UserPrincipal { + identity: Identity { + subject: "test-user".to_string(), + display_name: None, + roles: vec![], + scopes: vec![], + provider: IdentityProvider::Oidc, + }, + })); + request + } + + /// Wrap a request with a sandbox `Principal` bound to `sandbox_id`. + /// Use for tests that exercise sandbox-caller code paths. + #[allow(dead_code)] + fn with_sandbox(mut request: Request, sandbox_id: &str) -> Request { + request + .extensions_mut() + .insert(Principal::Sandbox(SandboxPrincipal { + sandbox_id: sandbox_id.to_string(), + source: SandboxIdentitySource::BootstrapJwt { + issuer: "openshell-gateway:test".to_string(), + }, + trust_domain: Some("openshell".to_string()), + })); + request + } + #[test] fn sandbox_caller_update_validation_allows_sandbox_policy_sync() { let req = UpdateConfigRequest { @@ -2867,15 +2975,240 @@ mod tests { } #[test] - fn sandbox_caller_marker_detected_from_metadata() { + fn sandbox_caller_detected_from_principal_extension() { + use crate::auth::principal::{Principal, SandboxIdentitySource, SandboxPrincipal}; let mut req = Request::new(()); - req.metadata_mut().insert( - oidc::INTERNAL_AUTH_SOURCE_HEADER, - oidc::AUTH_SOURCE_SANDBOX.parse().unwrap(), - ); + req.extensions_mut() + .insert(Principal::Sandbox(SandboxPrincipal { + sandbox_id: "test-sandbox".to_string(), + source: SandboxIdentitySource::BootstrapJwt { + issuer: "openshell-gateway:test".to_string(), + }, + trust_domain: None, + })); assert!(is_sandbox_caller(&req)); } + #[test] + fn user_principal_not_treated_as_sandbox_caller() { + use crate::auth::identity::{Identity, IdentityProvider}; + use crate::auth::principal::{Principal, UserPrincipal}; + let mut req = Request::new(()); + req.extensions_mut().insert(Principal::User(UserPrincipal { + identity: Identity { + subject: "alice".to_string(), + display_name: None, + roles: vec![], + scopes: vec![], + provider: 
IdentityProvider::Oidc, + }, + })); + assert!(!is_sandbox_caller(&req)); + } + + // ---- Sandbox IDOR guard (issue #1354) ---- + + #[tokio::test] + async fn cross_sandbox_get_sandbox_config_denied() { + use openshell_core::proto::{SandboxPhase, SandboxSpec}; + let state = test_server_state().await; + // Two sandboxes; the caller is principal of A, the request body + // references B. + for (id, name) in [("sb-a", "sandbox-a"), ("sb-b", "sandbox-b")] { + let sandbox = Sandbox { + metadata: Some(openshell_core::proto::datamodel::v1::ObjectMeta { + id: id.to_string(), + name: name.to_string(), + created_at_ms: 1_000_000, + labels: HashMap::new(), + resource_version: 0, + }), + spec: Some(SandboxSpec { + policy: None, + ..Default::default() + }), + phase: SandboxPhase::Provisioning as i32, + ..Default::default() + }; + state.store.put_message(&sandbox).await.unwrap(); + } + let req = with_sandbox( + Request::new(GetSandboxConfigRequest { + sandbox_id: "sb-b".to_string(), + }), + "sb-a", + ); + let err = handle_get_sandbox_config(&state, req) + .await + .expect_err("cross-sandbox call must be denied"); + assert_eq!(err.code(), Code::PermissionDenied); + } + + #[tokio::test] + async fn same_sandbox_get_sandbox_config_allowed() { + use openshell_core::proto::{SandboxPhase, SandboxSpec}; + let state = test_server_state().await; + let sandbox = Sandbox { + metadata: Some(openshell_core::proto::datamodel::v1::ObjectMeta { + id: "sb-self".to_string(), + name: "self".to_string(), + created_at_ms: 1_000_000, + labels: HashMap::new(), + resource_version: 0, + }), + spec: Some(SandboxSpec { + policy: None, + ..Default::default() + }), + phase: SandboxPhase::Provisioning as i32, + ..Default::default() + }; + state.store.put_message(&sandbox).await.unwrap(); + let req = with_sandbox( + Request::new(GetSandboxConfigRequest { + sandbox_id: "sb-self".to_string(), + }), + "sb-self", + ); + handle_get_sandbox_config(&state, req) + .await + .expect("matching principal must be allowed"); + 
} + + #[tokio::test] + async fn cross_sandbox_submit_policy_analysis_denied() { + use openshell_core::proto::{SandboxPhase, SandboxSpec}; + let state = test_server_state().await; + for (id, name) in [("sb-a", "sandbox-a"), ("sb-b", "sandbox-b")] { + let sandbox = Sandbox { + metadata: Some(openshell_core::proto::datamodel::v1::ObjectMeta { + id: id.to_string(), + name: name.to_string(), + created_at_ms: 1_000_000, + labels: HashMap::new(), + resource_version: 0, + }), + spec: Some(SandboxSpec { + policy: None, + ..Default::default() + }), + phase: SandboxPhase::Provisioning as i32, + ..Default::default() + }; + state.store.put_message(&sandbox).await.unwrap(); + } + let req = with_sandbox( + Request::new(SubmitPolicyAnalysisRequest { + name: "sandbox-b".to_string(), + ..Default::default() + }), + "sb-a", + ); + let err = handle_submit_policy_analysis(&state, req) + .await + .expect_err("cross-sandbox submit must be denied"); + assert_eq!(err.code(), Code::PermissionDenied); + } + + #[tokio::test] + async fn cross_sandbox_get_draft_policy_denied() { + use openshell_core::proto::{SandboxPhase, SandboxSpec}; + let state = test_server_state().await; + for (id, name) in [("sb-a", "sandbox-a"), ("sb-b", "sandbox-b")] { + let sandbox = Sandbox { + metadata: Some(openshell_core::proto::datamodel::v1::ObjectMeta { + id: id.to_string(), + name: name.to_string(), + created_at_ms: 1_000_000, + labels: HashMap::new(), + resource_version: 0, + }), + spec: Some(SandboxSpec { + policy: None, + ..Default::default() + }), + phase: SandboxPhase::Provisioning as i32, + ..Default::default() + }; + state.store.put_message(&sandbox).await.unwrap(); + } + let req = with_sandbox( + Request::new(GetDraftPolicyRequest { + name: "sandbox-b".to_string(), + status_filter: String::new(), + }), + "sb-a", + ); + let err = handle_get_draft_policy(&state, req) + .await + .expect_err("cross-sandbox draft read must be denied"); + assert_eq!(err.code(), Code::PermissionDenied); + } + + #[tokio::test] + 
async fn user_principal_can_read_any_sandbox_config() { + // RBAC was the user gate; the IDOR guard must NOT trip for users. + use openshell_core::proto::{SandboxPhase, SandboxSpec}; + let state = test_server_state().await; + let sandbox = Sandbox { + metadata: Some(openshell_core::proto::datamodel::v1::ObjectMeta { + id: "sb-x".to_string(), + name: "x".to_string(), + created_at_ms: 1_000_000, + labels: HashMap::new(), + resource_version: 0, + }), + spec: Some(SandboxSpec { + policy: None, + ..Default::default() + }), + phase: SandboxPhase::Provisioning as i32, + ..Default::default() + }; + state.store.put_message(&sandbox).await.unwrap(); + let req = with_user(Request::new(GetSandboxConfigRequest { + sandbox_id: "sb-x".to_string(), + })); + handle_get_sandbox_config(&state, req) + .await + .expect("user principal must succeed"); + } + + #[tokio::test] + async fn log_stream_scope_rejects_sandbox_id_change_after_validation() { + let state = test_server_state().await; + for id in ["sb-a", "sb-b"] { + let sandbox = test_sandbox(id, id, ProtoSandboxPolicy::default(), vec![]); + state.store.put_message(&sandbox).await.unwrap(); + } + let req = with_sandbox(Request::new(()), "sb-a"); + let principal = req.extensions().get::().unwrap().clone(); + let mut validated = None; + + ensure_log_stream_sandbox_scope(&state, &principal, "sb-a", &mut validated) + .await + .expect("first frame should validate"); + let err = ensure_log_stream_sandbox_scope(&state, &principal, "sb-b", &mut validated) + .await + .expect_err("later frame must not switch sandbox ids"); + + assert_eq!(err.code(), Code::PermissionDenied); + } + + #[tokio::test] + async fn log_stream_scope_rejects_missing_sandbox() { + let state = test_server_state().await; + let req = with_sandbox(Request::new(()), "sb-a"); + let principal = req.extensions().get::().unwrap().clone(); + let mut validated = None; + + let err = ensure_log_stream_sandbox_scope(&state, &principal, "sb-a", &mut validated) + .await + 
.expect_err("missing sandbox must not validate"); + + assert_eq!(err.code(), Code::NotFound); + } + // ---- Sandbox without policy ---- #[tokio::test] @@ -2989,9 +3322,9 @@ mod tests { async fn get_sandbox_policy(state: &Arc, sandbox_id: &str) -> ProtoSandboxPolicy { handle_get_sandbox_config( state, - Request::new(GetSandboxConfigRequest { + with_user(Request::new(GetSandboxConfigRequest { sandbox_id: sandbox_id.to_string(), - }), + })), ) .await .unwrap() @@ -3432,9 +3765,9 @@ mod tests { let legacy_env = handle_get_sandbox_provider_environment( &state, - Request::new(GetSandboxProviderEnvironmentRequest { + with_user(Request::new(GetSandboxProviderEnvironmentRequest { sandbox_id: "sb-provider-env".to_string(), - }), + })), ) .await .unwrap() @@ -3444,9 +3777,9 @@ mod tests { enable_providers_v2(&state).await; let v2_env = handle_get_sandbox_provider_environment( &state, - Request::new(GetSandboxProviderEnvironmentRequest { + with_user(Request::new(GetSandboxProviderEnvironmentRequest { sandbox_id: "sb-provider-env".to_string(), - }), + })), ) .await .unwrap() @@ -3478,9 +3811,9 @@ mod tests { let first = handle_get_sandbox_provider_environment( &state, - Request::new(GetSandboxProviderEnvironmentRequest { + with_user(Request::new(GetSandboxProviderEnvironmentRequest { sandbox_id: "sb-provider-revision".to_string(), - }), + })), ) .await .unwrap() @@ -3494,9 +3827,9 @@ mod tests { let second = handle_get_sandbox_provider_environment( &state, - Request::new(GetSandboxProviderEnvironmentRequest { + with_user(Request::new(GetSandboxProviderEnvironmentRequest { sandbox_id: "sb-provider-revision".to_string(), - }), + })), ) .await .unwrap() @@ -3548,9 +3881,9 @@ mod tests { ); let baseline_env = handle_get_sandbox_provider_environment( &state, - Request::new(GetSandboxProviderEnvironmentRequest { + with_user(Request::new(GetSandboxProviderEnvironmentRequest { sandbox_id: "sb-attach-lifecycle".to_string(), - }), + })), ) .await .unwrap() @@ -3558,11 +3891,11 @@ mod 
tests { handle_attach_sandbox_provider( &state, - Request::new(AttachSandboxProviderRequest { + with_user(Request::new(AttachSandboxProviderRequest { sandbox_name: "attach-lifecycle".to_string(), provider_name: "work-github".to_string(), expected_resource_version: 0, - }), + })), ) .await .unwrap(); @@ -3576,9 +3909,9 @@ mod tests { let attached_env = handle_get_sandbox_provider_environment( &state, - Request::new(GetSandboxProviderEnvironmentRequest { + with_user(Request::new(GetSandboxProviderEnvironmentRequest { sandbox_id: "sb-attach-lifecycle".to_string(), - }), + })), ) .await .unwrap() @@ -3612,9 +3945,9 @@ mod tests { let detached_env = handle_get_sandbox_provider_environment( &state, - Request::new(GetSandboxProviderEnvironmentRequest { + with_user(Request::new(GetSandboxProviderEnvironmentRequest { sandbox_id: "sb-attach-lifecycle".to_string(), - }), + })), ) .await .unwrap() @@ -3708,9 +4041,9 @@ mod tests { ); let baseline_env = handle_get_sandbox_provider_environment( &state, - Request::new(GetSandboxProviderEnvironmentRequest { + with_user(Request::new(GetSandboxProviderEnvironmentRequest { sandbox_id: "sb-custom-attach-lifecycle".to_string(), - }), + })), ) .await .unwrap() @@ -3718,11 +4051,11 @@ mod tests { handle_attach_sandbox_provider( &state, - Request::new(AttachSandboxProviderRequest { + with_user(Request::new(AttachSandboxProviderRequest { sandbox_name: "custom-attach-lifecycle".to_string(), provider_name: "work-custom".to_string(), expected_resource_version: 0, - }), + })), ) .await .unwrap(); @@ -3739,9 +4072,9 @@ mod tests { let attached_env = handle_get_sandbox_provider_environment( &state, - Request::new(GetSandboxProviderEnvironmentRequest { + with_user(Request::new(GetSandboxProviderEnvironmentRequest { sandbox_id: "sb-custom-attach-lifecycle".to_string(), - }), + })), ) .await .unwrap() @@ -3774,9 +4107,9 @@ mod tests { ); let detached_env = handle_get_sandbox_provider_environment( &state, - 
Request::new(GetSandboxProviderEnvironmentRequest { + with_user(Request::new(GetSandboxProviderEnvironmentRequest { sandbox_id: "sb-custom-attach-lifecycle".to_string(), - }), + })), ) .await .unwrap() @@ -3874,9 +4207,9 @@ mod tests { let response = handle_get_sandbox_config( &state, - Request::new(GetSandboxConfigRequest { + with_user(Request::new(GetSandboxConfigRequest { sandbox_id: "sb-global-profile".to_string(), - }), + })), ) .await .unwrap() @@ -4002,7 +4335,7 @@ mod tests { let submit = handle_submit_policy_analysis( &state, - Request::new(SubmitPolicyAnalysisRequest { + with_user(Request::new(SubmitPolicyAnalysisRequest { name: sandbox_name.clone(), proposed_chunks: vec![PolicyChunk { rule_name: "allow_example".to_string(), @@ -4016,7 +4349,7 @@ mod tests { ..Default::default() }], ..Default::default() - }), + })), ) .await .unwrap() @@ -4028,10 +4361,10 @@ mod tests { let draft_policy = handle_get_draft_policy( &state, - Request::new(GetDraftPolicyRequest { + with_user(Request::new(GetDraftPolicyRequest { name: sandbox_name.clone(), status_filter: String::new(), - }), + })), ) .await .unwrap() @@ -4098,10 +4431,10 @@ mod tests { let draft_policy_after_undo = handle_get_draft_policy( &state, - Request::new(GetDraftPolicyRequest { + with_user(Request::new(GetDraftPolicyRequest { name: sandbox_name.clone(), status_filter: String::new(), - }), + })), ) .await .unwrap() @@ -4150,10 +4483,10 @@ mod tests { let draft_policy_after_clear = handle_get_draft_policy( &state, - Request::new(GetDraftPolicyRequest { + with_user(Request::new(GetDraftPolicyRequest { name: sandbox_name.clone(), status_filter: String::new(), - }), + })), ) .await .unwrap() @@ -4212,7 +4545,7 @@ mod tests { let submit = handle_submit_policy_analysis( &state, - Request::new(SubmitPolicyAnalysisRequest { + with_user(Request::new(SubmitPolicyAnalysisRequest { name: sandbox_name.clone(), proposed_chunks: vec![PolicyChunk { rule_name: "allow_example".to_string(), @@ -4221,7 +4554,7 @@ mod tests 
{ ..Default::default() }], ..Default::default() - }), + })), ) .await .unwrap() @@ -4242,10 +4575,10 @@ mod tests { let draft = handle_get_draft_policy( &state, - Request::new(GetDraftPolicyRequest { + with_user(Request::new(GetDraftPolicyRequest { name: sandbox_name, status_filter: String::new(), - }), + })), ) .await .unwrap() @@ -4321,7 +4654,7 @@ mod tests { async move { handle_submit_policy_analysis( &state, - Request::new(SubmitPolicyAnalysisRequest { + with_user(Request::new(SubmitPolicyAnalysisRequest { name: sandbox_name, analysis_mode: "agent_authored".to_string(), proposed_chunks: vec![PolicyChunk { @@ -4330,7 +4663,7 @@ mod tests { ..Default::default() }], ..Default::default() - }), + })), ) .await .unwrap() @@ -4350,10 +4683,10 @@ mod tests { let draft = handle_get_draft_policy( &state, - Request::new(GetDraftPolicyRequest { + with_user(Request::new(GetDraftPolicyRequest { name: sandbox_name.clone(), status_filter: String::new(), - }), + })), ) .await .unwrap() @@ -4428,7 +4761,7 @@ mod tests { async move { handle_submit_policy_analysis( &state, - Request::new(SubmitPolicyAnalysisRequest { + with_user(Request::new(SubmitPolicyAnalysisRequest { name: sandbox_name, analysis_mode: "mechanistic".to_string(), proposed_chunks: vec![PolicyChunk { @@ -4437,7 +4770,7 @@ mod tests { ..Default::default() }], ..Default::default() - }), + })), ) .await .unwrap() @@ -4451,10 +4784,10 @@ mod tests { let draft = handle_get_draft_policy( &state, - Request::new(GetDraftPolicyRequest { + with_user(Request::new(GetDraftPolicyRequest { name: sandbox_name, status_filter: String::new(), - }), + })), ) .await .unwrap() @@ -4524,7 +4857,7 @@ mod tests { let submit = handle_submit_policy_analysis( &state, - Request::new(SubmitPolicyAnalysisRequest { + with_user(Request::new(SubmitPolicyAnalysisRequest { name: sandbox_name.clone(), proposed_chunks: vec![PolicyChunk { rule_name: "allow_example".to_string(), @@ -4532,7 +4865,7 @@ mod tests { ..Default::default() }], 
..Default::default() - }), + })), ) .await .unwrap() @@ -4572,10 +4905,10 @@ mod tests { let draft = handle_get_draft_policy( &state, - Request::new(GetDraftPolicyRequest { + with_user(Request::new(GetDraftPolicyRequest { name: sandbox_name, status_filter: String::new(), - }), + })), ) .await .unwrap() @@ -4646,7 +4979,7 @@ mod tests { handle_submit_policy_analysis( &state, - Request::new(SubmitPolicyAnalysisRequest { + with_user(Request::new(SubmitPolicyAnalysisRequest { name: sandbox_a.object_name().to_string(), proposed_chunks: vec![PolicyChunk { rule_name: "allow_example".to_string(), @@ -4660,17 +4993,17 @@ mod tests { ..Default::default() }], ..Default::default() - }), + })), ) .await .unwrap(); let draft_policy = handle_get_draft_policy( &state, - Request::new(GetDraftPolicyRequest { + with_user(Request::new(GetDraftPolicyRequest { name: sandbox_a.object_name().to_string(), status_filter: String::new(), - }), + })), ) .await .unwrap() diff --git a/crates/openshell-server/src/grpc/sandbox.rs b/crates/openshell-server/src/grpc/sandbox.rs index 4978687ed..c98f64717 100644 --- a/crates/openshell-server/src/grpc/sandbox.rs +++ b/crates/openshell-server/src/grpc/sandbox.rs @@ -133,7 +133,27 @@ pub(super) async fn handle_create_sandbox( status })?; - let sandbox = state.compute.create_sandbox(sandbox).await?; + // Mint the gateway JWT for singleplayer drivers. K8s sandboxes skip + // this mint and bootstrap via `IssueSandboxToken` at supervisor + // startup; identifying "is this K8s?" lives in the compute layer, so + // we mint unconditionally here when the issuer is configured and let + // the K8s driver simply ignore the field. 
+ let sandbox_token = state.sandbox_jwt_issuer.as_ref().map(|issuer| { + issuer.mint(&id).map(|minted| { + tracing::info!( + sandbox_id = %id, + "minted sandbox JWT" + ); + minted.token + }) + }); + let sandbox_token = match sandbox_token { + Some(Ok(token)) => Some(token), + Some(Err(status)) => return Err(status), + None => None, + }; + + let sandbox = state.compute.create_sandbox(sandbox, sandbox_token).await?; info!( sandbox_id = %id, diff --git a/crates/openshell-server/src/inference.rs b/crates/openshell-server/src/inference.rs index 53d6265b7..2fb89b0ac 100644 --- a/crates/openshell-server/src/inference.rs +++ b/crates/openshell-server/src/inference.rs @@ -59,8 +59,25 @@ impl ObjectType for InferenceRoute { impl Inference for InferenceService { async fn get_inference_bundle( &self, - _request: Request, + request: Request, ) -> Result, Status> { + // GetInferenceBundle is gateway-wide (no per-sandbox routes yet), + // so it has no `sandbox_id` to compare against. Just reject + // anonymous callers; both user and sandbox principals are allowed. + match request + .extensions() + .get::() + { + Some( + crate::auth::principal::Principal::User(_) + | crate::auth::principal::Principal::Sandbox(_), + ) => {} + Some(crate::auth::principal::Principal::Anonymous) | None => { + return Err(Status::unauthenticated( + "GetInferenceBundle requires an authenticated caller", + )); + } + } resolve_inference_bundle(self.state.store.as_ref()) .await .map(Response::new) diff --git a/crates/openshell-server/src/lib.rs b/crates/openshell-server/src/lib.rs index 220e45026..f447e4030 100644 --- a/crates/openshell-server/src/lib.rs +++ b/crates/openshell-server/src/lib.rs @@ -105,6 +105,22 @@ pub struct ServerState { /// OIDC JWKS cache for JWT validation. `None` when OIDC is not configured. pub oidc_cache: Option>, + + /// Gateway-minted sandbox JWT issuer. `None` when `config.gateway_jwt` + /// is not configured; in that mode `IssueSandboxToken` returns + /// `Status::unavailable`. 
Populated at startup from the on-disk key + /// material that `certgen` writes. + pub sandbox_jwt_issuer: Option>, + + /// Authenticator that validates gateway-minted sandbox JWTs on every + /// inbound request. Always set when `sandbox_jwt_issuer` is, so callers + /// presenting a freshly minted token are recognized. + pub sandbox_jwt_authenticator: Option>, + + /// Optional K8s `ServiceAccount` authenticator that backs the + /// `IssueSandboxToken` bootstrap path. Only present when the gateway + /// runs in-cluster. + pub k8s_sa_authenticator: Option>, } fn is_benign_tls_handshake_failure(error: &std::io::Error) -> bool { @@ -149,6 +165,9 @@ impl ServerState { settings_mutex: tokio::sync::Mutex::new(()), supervisor_sessions, oidc_cache, + sandbox_jwt_issuer: None, + sandbox_jwt_authenticator: None, + k8s_sa_authenticator: None, } } } @@ -206,7 +225,7 @@ pub async fn run_server( supervisor_sessions.clone(), ) .await?; - let state = Arc::new(ServerState::new( + let mut state = ServerState::new( config.clone(), store.clone(), compute, @@ -215,7 +234,103 @@ pub async fn run_server( tracing_log_bus, supervisor_sessions, oidc_cache, - )); + ); + + // Load the gateway-minted sandbox JWT signing key when configured. + // Optional so single-driver dev deployments without certgen continue + // to start. The helm-deployed gateway and the RPM init script populate + // `gateway_jwt` once `certgen` has produced the on-disk material. 
+ if let Some(ref jwt) = config.gateway_jwt { + let signing_pem = std::fs::read(&jwt.signing_key_path).map_err(|e| { + Error::config(format!( + "failed to read sandbox JWT signing key from {}: {e}", + jwt.signing_key_path.display() + )) + })?; + let public_pem = std::fs::read(&jwt.public_key_path).map_err(|e| { + Error::config(format!( + "failed to read sandbox JWT public key from {}: {e}", + jwt.public_key_path.display() + )) + })?; + let kid = std::fs::read_to_string(&jwt.kid_path) + .map_err(|e| { + Error::config(format!( + "failed to read sandbox JWT kid from {}: {e}", + jwt.kid_path.display() + )) + })? + .trim() + .to_string(); + if kid.is_empty() { + return Err(Error::config(format!( + "sandbox JWT kid file {} is empty", + jwt.kid_path.display() + ))); + } + let issuer = auth::sandbox_jwt::SandboxJwtIssuer::from_pem( + &signing_pem, + kid.clone(), + &jwt.gateway_id, + Duration::from_secs(jwt.ttl_secs), + ) + .map_err(Error::config)?; + let authenticator = + auth::sandbox_jwt::SandboxJwtAuthenticator::from_pem(&public_pem, kid, &jwt.gateway_id) + .map_err(Error::config)?; + info!( + gateway_id = %jwt.gateway_id, + ttl_secs = jwt.ttl_secs, + "gateway-minted sandbox JWT enabled" + ); + state.sandbox_jwt_issuer = Some(Arc::new(issuer)); + state.sandbox_jwt_authenticator = Some(Arc::new(authenticator)); + } + + // K8s ServiceAccount bootstrap authenticator. Only constructed when + // the gateway is running in-cluster (kubelet provides the API host + // env var) and has a sandbox JWT issuer to mint replacements against; + // outside the cluster we can't call the apiserver's TokenReview API, + // and without the issuer there's nothing to exchange the SA token for. + if state.sandbox_jwt_issuer.is_some() && std::env::var_os("KUBERNETES_SERVICE_HOST").is_some() { + // Pod lookups and TokenReview identity checks must match the sandbox + // namespace and service account used by the Kubernetes driver. 
Fall + // back to the historical "default" namespace only if driver config + // cannot be parsed while bootstrapping the authenticator. + let kubernetes_config = + kubernetes_config_from_file(config_file.as_ref()).unwrap_or_else(|_| { + KubernetesComputeConfig { + namespace: "default".to_string(), + ..Default::default() + } + }); + let sandbox_namespace = kubernetes_config.namespace; + let sandbox_service_account = kubernetes_config.service_account_name; + match kube::Client::try_default().await { + Ok(client) => { + let resolver = Arc::new(auth::k8s_sa::LiveK8sResolver::new( + client, + &sandbox_namespace, + "openshell-gateway".to_string(), + sandbox_service_account.clone(), + )); + let authenticator = auth::k8s_sa::K8sServiceAccountAuthenticator::new(resolver); + state.k8s_sa_authenticator = Some(Arc::new(authenticator)); + info!( + namespace = %sandbox_namespace, + service_account = %sandbox_service_account, + "K8s ServiceAccount bootstrap authenticator enabled" + ); + } + Err(e) => warn!( + error = %e, + "in-cluster K8s client construction failed; \ + K8s ServiceAccount bootstrap is disabled" + ), + } + } + + let state = Arc::new(state); // Resume sandboxes that were stopped during the previous gateway // shutdown so the running compute state matches the persisted store. 
diff --git a/crates/openshell-server/src/multiplex.rs b/crates/openshell-server/src/multiplex.rs index deac9ee78..ed57c39b5 100644 --- a/crates/openshell-server/src/multiplex.rs +++ b/crates/openshell-server/src/multiplex.rs @@ -31,8 +31,15 @@ use tower_http::request_id::{MakeRequestId, RequestId}; use tracing::Span; use crate::{ - OpenShellService, ServerState, auth::authz::AuthzPolicy, auth::identity::Identity, auth::oidc, - http_router, inference::InferenceService, service_http_router, + OpenShellService, ServerState, + auth::authenticator::{AuthenticatorChain, PermissiveUserAuthenticator}, + auth::authz::AuthzPolicy, + auth::identity::Identity, + auth::oidc::{self, OidcAuthenticator}, + auth::principal::{Principal, UserPrincipal}, + http_router, + inference::InferenceService, + service_http_router, }; /// Request-ID generator that produces a UUID v4 for each inbound request. @@ -153,17 +160,11 @@ impl MultiplexService { user_role: oidc.user_role.clone(), scopes_enabled: !oidc.scopes_claim.is_empty(), }); - let has_client_ca = self - .state - .config - .tls - .as_ref() - .is_some_and(|tls| tls.client_ca_path.is_some()); - let grpc_service = AuthGrpcRouter::new( + let authenticator_chain = build_authenticator_chain(&self.state); + let grpc_service = AuthGrpcRouter::with_peer_identity( GrpcRouter::new(openshell, inference), - self.state.oidc_cache.clone(), + authenticator_chain, authz_policy, - has_client_ca, peer_identity, ); let http_service = http_router(self.state.clone()); @@ -256,50 +257,103 @@ where } } -/// gRPC router wrapper that authenticates and authorizes requests. +/// Assemble the authenticator chain for the gateway. /// -/// When `oidc_cache` is `Some`, extracts the `authorization: Bearer ` -/// header, validates the JWT (authentication), then checks RBAC roles -/// (authorization) before forwarding to the inner gRPC router. +/// Chain order (first-match-wins): +/// 1. 
`K8sServiceAccountAuthenticator` (path-scoped to `IssueSandboxToken`) +/// — exchanges a projected SA token for a `Principal::Sandbox` so the +/// `IssueSandboxToken` handler can mint a gateway JWT. No-op on every +/// other path; only present when the gateway runs in-cluster. +/// 2. `SandboxJwtAuthenticator` — validates gateway-minted JWTs. Recognized +/// via a distinctive `kid` so non-matching Bearer tokens fall through. +/// 3. `OidcAuthenticator` — validates user Bearer tokens against the +/// configured OIDC issuer. Returns `Unauthenticated` for missing +/// Bearer headers so non-OIDC clients can't sneak through. +/// 4. `PermissiveUserAuthenticator` — installed only when no OIDC is +/// configured (singleplayer / helm-dev). Catches anything the +/// sandbox authenticators didn't claim and produces a synthetic +/// user principal, preserving the existing "no OIDC = open" dev posture. /// -/// Authentication is provider-specific (currently OIDC via `oidc.rs`). -/// Authorization is provider-agnostic (via `authz.rs`). This separation -/// aligns with RFC 0001's control-plane identity design. +/// When neither OIDC nor gateway-minted JWTs are configured (a barebones +/// dev gateway), the chain is left as `None` so the router short-circuits +/// to pass-through. +fn build_authenticator_chain(state: &ServerState) -> Option { + let mut authenticators: Vec> = Vec::new(); + if let Some(k8s) = state.k8s_sa_authenticator.clone() { + authenticators.push(k8s); + } + if let Some(jwt) = state.sandbox_jwt_authenticator.clone() { + authenticators.push(jwt); + } + if let Some(cache) = state.oidc_cache.clone() { + authenticators.push(Arc::new(OidcAuthenticator::new(cache))); + } else if !authenticators.is_empty() { + // No OIDC, but sandbox-side authentication IS configured — + // user CLI calls must still pass through, so install a + // permissive final fallback. Production deployments configure + // OIDC and this branch is unused. 
+ authenticators.push(Arc::new(PermissiveUserAuthenticator::new("dev-anonymous"))); + } + if authenticators.is_empty() { + return None; + } + Some(AuthenticatorChain::new(authenticators)) +} + +/// gRPC router wrapper that runs the [`AuthenticatorChain`] and inserts the +/// resulting [`Principal`] into the request's extensions. /// -/// Sandbox-class methods (`oidc::is_sandbox_method`) accept callers without -/// a Bearer token: the gRPC channel's mTLS handshake is the trust -/// boundary. The router marks such requests with the -/// `INTERNAL_AUTH_SOURCE_HEADER` so handlers (`policy.rs`) can apply -/// sandbox-restricted scope. +/// Behavior: +/// - Strip any external `x-openshell-auth-source` marker first (so callers +/// cannot spoof a sandbox identity). +/// - Health probes / reflection bypass the chain entirely. +/// - When no chain is configured (OIDC not configured), forward without +/// authentication — preserves today's pass-through behavior. +/// - Otherwise, run the chain. The first match produces a `Principal`. +/// `Principal::User` is gated by the RBAC `AuthzPolicy`. +/// `Principal::Sandbox` is gated by a supervisor-method allowlist, then +/// handlers enforce same-sandbox scope on request bodies. #[derive(Clone)] pub struct AuthGrpcRouter { inner: S, - oidc_cache: Option>, + authenticator_chain: Option, authz_policy: Option, - /// Whether a client CA is configured (mTLS is a valid auth mechanism). - has_client_ca: bool, /// mTLS peer identity extracted from the TLS handshake. 
peer_identity: Option, } impl AuthGrpcRouter { + #[cfg(test)] fn new( inner: S, - oidc_cache: Option>, + authenticator_chain: Option, + authz_policy: Option, + ) -> Self { + Self::with_peer_identity(inner, authenticator_chain, authz_policy, None) + } + + fn with_peer_identity( + inner: S, + authenticator_chain: Option, authz_policy: Option, - has_client_ca: bool, peer_identity: Option, ) -> Self { Self { inner, - oidc_cache, + authenticator_chain, authz_policy, - has_client_ca, peer_identity, } } } +fn status_response(status: tonic::Status) -> Response { + let response = status.into_http(); + let (parts, body) = response.into_parts(); + let body = tonic::body::BoxBody::new(body); + Response::from_parts(parts, body) +} + impl tower::Service> for AuthGrpcRouter where S: tower::Service, Response = Response> @@ -319,28 +373,21 @@ where } fn call(&mut self, req: Request) -> Self::Future { - let oidc_cache = self.oidc_cache.clone(); + let chain = self.authenticator_chain.clone(); let authz_policy = self.authz_policy.clone(); - let has_client_ca = self.has_client_ca; let peer_identity = self.peer_identity.clone(); let mut inner = self.inner.clone(); Box::pin(async move { let mut req = req; - oidc::clear_internal_auth_markers(req.headers_mut()); - - // No auth configured — pass through. - if oidc_cache.is_none() && !has_client_ca { - return inner.ready().await?.call(req).await; - } - // mTLS-only (no OIDC) — TLS layer already enforced client certs, - // so if we got here the peer is authenticated. - if oidc_cache.is_none() && has_client_ca { + // No chain configured — pass through. Preserves today's + // "auth not configured means open" behavior for dev / + // fronting-proxy deployments. + let Some(chain) = chain else { return inner.ready().await?.call(req).await; - } + }; - let cache = oidc_cache.expect("checked above"); let path = req.uri().path().to_string(); // Health probes and reflection — truly unauthenticated. 
@@ -348,72 +395,43 @@ where return inner.ready().await?.call(req).await; } - // Sandbox-class RPCs — no Bearer expected. The gRPC channel's - // mTLS handshake (or the operator's fronting proxy when - // `--disable-gateway-auth` is set) is the trust boundary. - if oidc::is_sandbox_method(&path) { - oidc::mark_sandbox_caller(req.headers_mut()); - return inner.ready().await?.call(req).await; - } - - // Dual-auth methods (e.g. UpdateConfig) — Bearer present grants - // full scope (CLI users); Bearer absent marks the caller as - // sandbox-class for restricted scope downstream. - if oidc::is_dual_auth_method(&path) && !has_bearer_token(req.headers()) { - oidc::mark_sandbox_caller(req.headers_mut()); - return inner.ready().await?.call(req).await; - } + let principal = match chain.authenticate(req.headers(), &path).await { + Ok(Some(p)) => p, + Ok(None) => { + if let Some(identity) = peer_identity { + Principal::User(UserPrincipal { identity }) + } else { + return Ok(status_response(tonic::Status::unauthenticated( + "missing authorization header", + ))); + } + } + Err(status) => return Ok(status_response(status)), + }; - // Extract Bearer token from the authorization header. - let token = req - .headers() - .get("authorization") - .and_then(|v| v.to_str().ok()) - .and_then(|v| v.strip_prefix("Bearer ")); - - let Some(token) = token else { - // No bearer token — fall back to mTLS if a client cert was - // presented (only possible when both OIDC and client CA are - // configured and require_client_auth is false). 
- if let Some(ref identity) = peer_identity { + match principal { + Principal::User(ref user) => { if let Some(ref policy) = authz_policy - && let Err(status) = policy.check(identity, &path) + && let Err(status) = policy.check(&user.identity, &path) { - let response = status.into_http(); - let (parts, body) = response.into_parts(); - let body = tonic::body::BoxBody::new(body); - return Ok(Response::from_parts(parts, body)); + return Ok(status_response(status)); } - return inner.ready().await?.call(req).await; } - let status = tonic::Status::unauthenticated("missing authorization header"); - let response = status.into_http(); - let (parts, body) = response.into_parts(); - let body = tonic::body::BoxBody::new(body); - return Ok(Response::from_parts(parts, body)); - }; - - // Authenticate: validate the JWT and produce an Identity. - let identity = match cache.validate_token(token).await { - Ok(id) => id, - Err(status) => { - let response = status.into_http(); - let (parts, body) = response.into_parts(); - let body = tonic::body::BoxBody::new(body); - return Ok(Response::from_parts(parts, body)); + Principal::Sandbox(_) => { + if !crate::auth::sandbox_methods::is_sandbox_callable(&path) { + return Ok(status_response(tonic::Status::permission_denied( + "sandbox principals may not call this method", + ))); + } + } + Principal::Anonymous => { + return Ok(status_response(tonic::Status::unauthenticated( + "anonymous callers may not call authenticated methods", + ))); } - }; - - // Authorize: check RBAC roles against the method. 
- if let Some(ref policy) = authz_policy - && let Err(status) = policy.check(&identity, &path) - { - let response = status.into_http(); - let (parts, body) = response.into_parts(); - let body = tonic::body::BoxBody::new(body); - return Ok(Response::from_parts(parts, body)); } + req.extensions_mut().insert(principal); inner.ready().await?.call(req).await }) } @@ -513,13 +531,6 @@ where } } -fn has_bearer_token(headers: &http::HeaderMap) -> bool { - headers - .get("authorization") - .and_then(|v| v.to_str().ok()) - .is_some_and(|v| v.starts_with("Bearer ")) -} - fn grpc_method_from_path(path: &str) -> String { path.rsplit('/').next().unwrap_or(path).to_string() } @@ -860,4 +871,244 @@ mod tests { fn normalize_root_path() { assert_eq!(normalize_http_path("/"), "unknown"); } + + mod auth_router { + use super::*; + use crate::auth::authenticator::test_support::MockAuthenticator; + use crate::auth::identity::{Identity, IdentityProvider}; + use crate::auth::principal::{ + Principal, SandboxIdentitySource, SandboxPrincipal, UserPrincipal, + }; + use http_body_util::Full; + use std::sync::Arc; + use std::sync::Mutex; + use tower::Service; + + type RecordedPrincipal = Arc>>; + + /// Service that snapshots the `Principal` from request extensions + /// and returns 200 OK. Used by router-level tests to assert the + /// chain's effect on the downstream service. 
+ #[derive(Clone)] + struct PrincipalRecorder { + recorded: RecordedPrincipal, + } + + impl PrincipalRecorder { + fn new() -> (Self, RecordedPrincipal) { + let recorded = Arc::new(Mutex::new(None)); + ( + Self { + recorded: recorded.clone(), + }, + recorded, + ) + } + } + + impl Service> for PrincipalRecorder { + type Response = Response; + type Error = std::convert::Infallible; + type Future = Pin> + Send>>; + + fn poll_ready(&mut self, _cx: &mut Context<'_>) -> Poll> { + Poll::Ready(Ok(())) + } + + fn call(&mut self, req: Request) -> Self::Future { + let principal = req.extensions().get::().cloned(); + *self.recorded.lock().unwrap() = principal; + Box::pin(async move { + let body = tonic::body::BoxBody::new( + Full::new(Bytes::new()) + .map_err(|never| match never {}) + .boxed_unsync(), + ); + Ok(Response::new(body)) + }) + } + } + + fn empty_request(path: &str) -> Request> { + Request::builder() + .uri(path) + .body(Full::new(Bytes::new())) + .unwrap() + } + + fn grpc_status(res: &Response) -> Option { + res.headers() + .get("grpc-status") + .map(|v| v.to_str().unwrap().to_string()) + } + + fn user_principal(subject: &str) -> Principal { + Principal::User(UserPrincipal { + identity: Identity { + subject: subject.to_string(), + display_name: None, + roles: vec![], + scopes: vec![], + provider: IdentityProvider::Oidc, + }, + }) + } + + fn sandbox_principal() -> Principal { + Principal::Sandbox(SandboxPrincipal { + sandbox_id: "sandbox-a".to_string(), + source: SandboxIdentitySource::BootstrapJwt { + issuer: "openshell-gateway:test".to_string(), + }, + trust_domain: Some("openshell".to_string()), + }) + } + + #[tokio::test] + async fn user_principal_lands_in_request_extensions() { + let mock = Arc::new(MockAuthenticator::returning(Ok(Some(user_principal( + "alice", + ))))); + let chain = AuthenticatorChain::new(vec![mock]); + let (recorder, seen) = PrincipalRecorder::new(); + let mut router = AuthGrpcRouter::new(recorder, Some(chain), None); + let _ = router + 
.call(empty_request("/openshell.v1.OpenShell/ListSandboxes")) + .await + .unwrap(); + let principal = seen.lock().unwrap().clone().expect("principal"); + match principal { + Principal::User(u) => assert_eq!(u.identity.subject, "alice"), + _ => panic!("expected user principal"), + } + } + + #[tokio::test] + async fn sandbox_principal_lands_in_request_extensions() { + let mock = Arc::new(MockAuthenticator::returning(Ok(Some(sandbox_principal())))); + let chain = AuthenticatorChain::new(vec![mock]); + let (recorder, seen) = PrincipalRecorder::new(); + let mut router = AuthGrpcRouter::new(recorder, Some(chain), None); + let _ = router + .call(empty_request("/openshell.v1.OpenShell/ReportPolicyStatus")) + .await + .unwrap(); + let captured = seen.lock().unwrap().clone(); + match captured { + Some(Principal::Sandbox(p)) => assert_eq!(p.sandbox_id, "sandbox-a"), + other => panic!("expected sandbox principal, got {other:?}"), + } + } + + #[tokio::test] + async fn sandbox_principal_can_call_allowlisted_method() { + let mock = Arc::new(MockAuthenticator::returning(Ok(Some(sandbox_principal())))); + let chain = AuthenticatorChain::new(vec![mock]); + let (recorder, seen) = PrincipalRecorder::new(); + let mut router = AuthGrpcRouter::new(recorder, Some(chain), None); + + let res = router + .call(empty_request("/openshell.v1.OpenShell/GetSandboxConfig")) + .await + .unwrap(); + + assert_eq!(res.status(), 200); + assert!(matches!( + seen.lock().unwrap().as_ref(), + Some(Principal::Sandbox(_)) + )); + } + + #[tokio::test] + async fn sandbox_principal_can_fetch_inference_bundle() { + let mock = Arc::new(MockAuthenticator::returning(Ok(Some(sandbox_principal())))); + let chain = AuthenticatorChain::new(vec![mock]); + let (recorder, seen) = PrincipalRecorder::new(); + let mut router = AuthGrpcRouter::new(recorder, Some(chain), None); + + let res = router + .call(empty_request( + "/openshell.inference.v1.Inference/GetInferenceBundle", + )) + .await + .unwrap(); + + 
assert_eq!(res.status(), 200); + assert!(matches!( + seen.lock().unwrap().as_ref(), + Some(Principal::Sandbox(_)) + )); + } + + #[tokio::test] + async fn sandbox_principal_is_denied_on_user_and_admin_methods() { + for path in [ + "/openshell.v1.OpenShell/ListSandboxes", + "/openshell.v1.OpenShell/DeleteSandbox", + "/openshell.v1.OpenShell/CreateProvider", + "/openshell.v1.OpenShell/ApproveDraftChunk", + "/openshell.inference.v1.Inference/GetClusterInference", + "/openshell.inference.v1.Inference/SetClusterInference", + ] { + let mock = Arc::new(MockAuthenticator::returning(Ok(Some(sandbox_principal())))); + let chain = AuthenticatorChain::new(vec![mock]); + let (recorder, seen) = PrincipalRecorder::new(); + let mut router = AuthGrpcRouter::new(recorder, Some(chain), None); + + let res = router.call(empty_request(path)).await.unwrap(); + + assert!(seen.lock().unwrap().is_none(), "{path} reached handler"); + assert_eq!(grpc_status(&res).as_deref(), Some("7"), "{path}"); + } + } + + #[tokio::test] + async fn missing_principal_returns_unauthenticated() { + let mock = Arc::new(MockAuthenticator::returning(Ok(None))); + let chain = AuthenticatorChain::new(vec![mock]); + let (recorder, seen) = PrincipalRecorder::new(); + let mut router = AuthGrpcRouter::new(recorder, Some(chain), None); + let res = router + .call(empty_request("/openshell.v1.OpenShell/ListSandboxes")) + .await + .unwrap(); + assert!(seen.lock().unwrap().is_none()); + // tonic sets grpc-status=16 (UNAUTHENTICATED) in trailers. 
+ assert_eq!(grpc_status(&res).as_deref(), Some("16")); + } + + #[tokio::test] + async fn authenticator_error_short_circuits() { + let mock = Arc::new(MockAuthenticator::returning(Err( + tonic::Status::unauthenticated("forged"), + ))); + let chain = AuthenticatorChain::new(vec![mock]); + let (recorder, seen) = PrincipalRecorder::new(); + let mut router = AuthGrpcRouter::new(recorder, Some(chain), None); + let res = router + .call(empty_request("/openshell.v1.OpenShell/ListSandboxes")) + .await + .unwrap(); + assert!(seen.lock().unwrap().is_none()); + assert_eq!(grpc_status(&res).as_deref(), Some("16")); + } + + #[tokio::test] + async fn health_methods_bypass_chain() { + // Authenticator is wired to fail-closed; the request still gets + // through because the path is exempt. + let mock = Arc::new(MockAuthenticator::returning(Err( + tonic::Status::unauthenticated("would reject"), + ))); + let chain = AuthenticatorChain::new(vec![mock.clone()]); + let (recorder, _) = PrincipalRecorder::new(); + let mut router = AuthGrpcRouter::new(recorder, Some(chain), None); + let res = router + .call(empty_request("/openshell.v1.OpenShell/Health")) + .await + .unwrap(); + assert_eq!(res.status(), 200); + assert_eq!(mock.call_count(), 0, "health must not consult the chain"); + } + } } diff --git a/crates/openshell-server/src/supervisor_session.rs b/crates/openshell-server/src/supervisor_session.rs index 91f40c289..d2dc8630d 100644 --- a/crates/openshell-server/src/supervisor_session.rs +++ b/crates/openshell-server/src/supervisor_session.rs @@ -18,6 +18,7 @@ use openshell_core::proto::{ }; use crate::ServerState; +use crate::auth::principal::Principal; const HEARTBEAT_INTERVAL_SECS: u32 = 15; const RELAY_PENDING_TIMEOUT: Duration = Duration::from_secs(10); @@ -337,17 +338,40 @@ impl SupervisorSessionRegistry { /// Returns the `DuplexStream` half that the supervisor side should read/write. // `tonic::Status` is large but is the API surface of gRPC handlers. 
#[allow(clippy::result_large_err)] - pub fn claim_relay(&self, channel_id: &str) -> Result { + pub fn claim_relay( + &self, + channel_id: &str, + principal: Option<&Principal>, + ) -> Result { let pending = { let mut map = self.pending_relays.lock().unwrap(); + let pending = map + .get(channel_id) + .ok_or_else(|| Status::not_found("unknown or expired relay channel"))?; + + if let Some(principal) = principal + && let Err(status) = crate::auth::guard::ensure_sandbox_principal_scope( + principal, + &pending.sandbox_id, + ) + { + info!( + channel_id = %channel_id, + sandbox_id = %pending.sandbox_id, + "relay stream: rejecting cross-sandbox claim" + ); + return Err(status); + } + + if pending.created_at.elapsed() > RELAY_PENDING_TIMEOUT { + map.remove(channel_id); + return Err(Status::deadline_exceeded("relay channel timed out")); + } + map.remove(channel_id) - .ok_or_else(|| Status::not_found("unknown or expired relay channel"))? + .expect("pending relay existed before removal") }; - if pending.created_at.elapsed() > RELAY_PENDING_TIMEOUT { - return Err(Status::deadline_exceeded("relay channel timed out")); - } - // Create a duplex stream pair: one end for the gateway bridge, one for // the supervisor HTTP CONNECT handler. let (gateway_stream, supervisor_stream) = tokio::io::duplex(64 * 1024); @@ -449,6 +473,7 @@ pub async fn handle_relay_stream( >, Status, > { + let principal = request.extensions().get::().cloned(); let mut inbound = request.into_inner(); // First frame must identify the channel. @@ -470,7 +495,7 @@ pub async fn handle_relay_stream( }; // Claim the pending relay. Consumes the entry — it cannot be reused. 
- let supervisor_side = registry.claim_relay(&channel_id)?; + let supervisor_side = registry.claim_relay(&channel_id, principal.as_ref())?; info!(channel_id = %channel_id, "relay stream: claimed pending relay, bridging"); let (mut read_half, mut write_half) = tokio::io::split(supervisor_side); @@ -554,6 +579,7 @@ pub async fn handle_connect_supervisor( >, Status, > { + let principal = request.extensions().get::().cloned(); let mut inbound = request.into_inner(); // Step 1: Wait for SupervisorHello. @@ -569,6 +595,9 @@ pub async fn handle_connect_supervisor( if sandbox_id.is_empty() { return Err(Status::invalid_argument("sandbox_id is required")); } + if let Some(principal) = principal.as_ref() { + crate::auth::guard::ensure_sandbox_principal_scope(principal, &sandbox_id)?; + } require_persisted_sandbox(&state.store, &sandbox_id).await?; let session_id = Uuid::new_v4().to_string(); @@ -783,6 +812,8 @@ fn handle_supervisor_message( #[cfg(test)] mod tests { use super::*; + use crate::auth::identity::{Identity, IdentityProvider}; + use crate::auth::principal::{SandboxIdentitySource, SandboxPrincipal, UserPrincipal}; use crate::persistence::Store; use tokio::io::{AsyncReadExt, AsyncWriteExt}; @@ -823,6 +854,28 @@ mod tests { } } + fn sandbox_principal(sandbox_id: &str) -> Principal { + Principal::Sandbox(SandboxPrincipal { + sandbox_id: sandbox_id.to_string(), + source: SandboxIdentitySource::BootstrapJwt { + issuer: "openshell-gateway:test".to_string(), + }, + trust_domain: Some("openshell".to_string()), + }) + } + + fn user_principal(subject: &str) -> Principal { + Principal::User(UserPrincipal { + identity: Identity { + subject: subject.to_string(), + display_name: None, + roles: vec![], + scopes: vec![], + provider: IdentityProvider::Oidc, + }, + }) + } + // ---- registry: register / remove ---- #[test] @@ -1235,7 +1288,10 @@ mod tests { #[test] fn claim_relay_unknown_channel() { let registry = SupervisorSessionRegistry::new(); - let err = 
registry.claim_relay("nonexistent").expect_err("should err"); + let principal = sandbox_principal("sbx-test"); + let err = registry + .claim_relay("nonexistent", Some(&principal)) + .expect_err("should err"); assert_eq!(err.code(), tonic::Code::NotFound); } @@ -1248,11 +1304,51 @@ mod tests { pending_relay("sbx-test", relay_tx, Instant::now()), ); - let result = registry.claim_relay("ch-1"); + let principal = sandbox_principal("sbx-test"); + let result = registry.claim_relay("ch-1", Some(&principal)); assert!(result.is_ok()); assert!(!registry.pending_relays.lock().unwrap().contains_key("ch-1")); } + #[test] + fn claim_relay_rejects_cross_sandbox_principal_without_consuming_channel() { + let registry = SupervisorSessionRegistry::new(); + let (relay_tx, _relay_rx) = oneshot::channel(); + registry.pending_relays.lock().unwrap().insert( + "ch-cross".to_string(), + pending_relay("sbx-owner", relay_tx, Instant::now()), + ); + + let attacker = sandbox_principal("sbx-attacker"); + let err = registry + .claim_relay("ch-cross", Some(&attacker)) + .expect_err("cross-sandbox relay claim must fail"); + assert_eq!(err.code(), tonic::Code::PermissionDenied); + assert!( + registry + .pending_relays + .lock() + .unwrap() + .contains_key("ch-cross"), + "failed cross-sandbox claim must not consume the channel" + ); + } + + #[test] + fn claim_relay_rejects_user_principal() { + let registry = SupervisorSessionRegistry::new(); + let (relay_tx, _relay_rx) = oneshot::channel(); + registry.pending_relays.lock().unwrap().insert( + "ch-user".to_string(), + pending_relay("sbx-owner", relay_tx, Instant::now()), + ); + + let err = registry + .claim_relay("ch-user", Some(&user_principal("alice"))) + .expect_err("users are not supervisor identities"); + assert_eq!(err.code(), tonic::Code::PermissionDenied); + } + #[tokio::test] async fn relay_open_failure_completes_pending_waiter() { let registry = SupervisorSessionRegistry::new(); @@ -1293,7 +1389,7 @@ mod tests { ); let err = registry - 
.claim_relay("ch-old") + .claim_relay("ch-old", Some(&sandbox_principal("sbx-test"))) .expect_err("expired entry must fail"); assert_eq!(err.code(), tonic::Code::DeadlineExceeded); // Entry must have been consumed regardless. @@ -1317,7 +1413,7 @@ mod tests { ); let err = registry - .claim_relay("ch-1") + .claim_relay("ch-1", Some(&sandbox_principal("sbx-test"))) .expect_err("should err when receiver is gone"); assert_eq!(err.code(), tonic::Code::Internal); } @@ -1331,7 +1427,9 @@ mod tests { pending_relay("sbx-test", relay_tx, Instant::now()), ); - let mut supervisor_side = registry.claim_relay("ch-io").expect("claim should succeed"); + let mut supervisor_side = registry + .claim_relay("ch-io", Some(&sandbox_principal("sbx-test"))) + .expect("claim should succeed"); let mut gateway_side = relay_rx .await .expect("gateway side should receive result") diff --git a/crates/openshell-server/tests/common/mod.rs b/crates/openshell-server/tests/common/mod.rs index 3a8ecb5b3..1f0268131 100644 --- a/crates/openshell-server/tests/common/mod.rs +++ b/crates/openshell-server/tests/common/mod.rs @@ -20,10 +20,11 @@ use openshell_core::proto::{ GetGatewayConfigRequest, GetGatewayConfigResponse, GetProviderRequest, GetSandboxConfigRequest, GetSandboxConfigResponse, GetSandboxProviderEnvironmentRequest, GetSandboxProviderEnvironmentResponse, GetSandboxRequest, HealthRequest, HealthResponse, - ListProvidersRequest, ListProvidersResponse, ListSandboxesRequest, ListSandboxesResponse, - ProviderResponse, RelayFrame, RevokeSshSessionRequest, RevokeSshSessionResponse, - SandboxResponse, SandboxStreamEvent, ServiceStatus, SupervisorMessage, TcpForwardFrame, - UpdateProviderRequest, WatchSandboxRequest, + IssueSandboxTokenRequest, IssueSandboxTokenResponse, ListProvidersRequest, + ListProvidersResponse, ListSandboxesRequest, ListSandboxesResponse, ProviderResponse, + RefreshSandboxTokenRequest, RefreshSandboxTokenResponse, RelayFrame, RevokeSshSessionRequest, + RevokeSshSessionResponse, 
SandboxResponse, SandboxStreamEvent, ServiceStatus, + SupervisorMessage, TcpForwardFrame, UpdateProviderRequest, WatchSandboxRequest, open_shell_server::{OpenShell, OpenShellServer}, }; use openshell_server::{MultiplexedService, TlsAcceptor, health_router}; @@ -420,6 +421,20 @@ impl OpenShell for TestOpenShell { Err(Status::unimplemented("not implemented in test")) } + async fn issue_sandbox_token( + &self, + _request: tonic::Request, + ) -> Result, Status> { + Err(Status::unimplemented("not implemented in test")) + } + + async fn refresh_sandbox_token( + &self, + _request: tonic::Request, + ) -> Result, Status> { + Err(Status::unimplemented("not implemented in test")) + } + async fn connect_supervisor( &self, _request: tonic::Request>, diff --git a/crates/openshell-server/tests/supervisor_relay_integration.rs b/crates/openshell-server/tests/supervisor_relay_integration.rs index aae6d8cf1..7f6bab7e9 100644 --- a/crates/openshell-server/tests/supervisor_relay_integration.rs +++ b/crates/openshell-server/tests/supervisor_relay_integration.rs @@ -394,6 +394,18 @@ impl OpenShell for RelayGateway { ) -> Result, Status> { Err(Status::unimplemented("unused")) } + async fn issue_sandbox_token( + &self, + _: tonic::Request, + ) -> Result, Status> { + Err(Status::unimplemented("unused")) + } + async fn refresh_sandbox_token( + &self, + _: tonic::Request, + ) -> Result, Status> { + Err(Status::unimplemented("unused")) + } } // --------------------------------------------------------------------------- diff --git a/deploy/helm/openshell/README.md b/deploy/helm/openshell/README.md index 390571062..2ad52cf97 100644 --- a/deploy/helm/openshell/README.md +++ b/deploy/helm/openshell/README.md @@ -124,6 +124,9 @@ cert-manager alternative. | probes.startup.timeoutSeconds | int | `1` | Startup probe timeout, in seconds. | | replicaCount | int | `1` | Number of OpenShell gateway replicas. | | resources | object | `{}` | Gateway pod resource requests and limits. 
| +| sandboxServiceAccount.annotations | object | `{}` | Annotations to add to the generated sandbox service account. | +| sandboxServiceAccount.create | bool | `true` | Create a service account for sandbox pods. | +| sandboxServiceAccount.name | string | `""` | Existing service account name for sandbox pods when sandboxServiceAccount.create is false. | | securityContext.allowPrivilegeEscalation | bool | `false` | Whether the gateway container can gain additional privileges. | | securityContext.capabilities.drop | list | `["ALL"]` | Linux capabilities dropped from the gateway container. | | securityContext.runAsNonRoot | bool | `true` | Require the gateway container to run as a non-root user. | @@ -145,6 +148,10 @@ cert-manager alternative. | server.oidc.userRole | string | `""` | Role name for standard user access. | | server.sandboxImage | string | `"ghcr.io/nvidia/openshell-community/sandboxes/base:latest"` | Default sandbox image used when requests do not specify one. | | server.sandboxImagePullPolicy | string | `""` | Kubernetes imagePullPolicy for sandbox pods. Empty = Kubernetes default (Always for :latest, IfNotPresent otherwise). Set to "Always" for dev clusters so new images are picked up without manual eviction. | +| server.sandboxJwt.gatewayId | string | `""` | | +| server.sandboxJwt.k8sSaTokenTtlSecs | int | `3600` | | +| server.sandboxJwt.signingSecretName | string | `""` | | +| server.sandboxJwt.ttlSecs | int | `3600` | | | server.sandboxNamespace | string | `""` | Namespace where sandbox pods are created. Defaults to the Helm release namespace (.Release.Namespace) when left empty. | | server.tls.certSecretName | string | `"openshell-server-tls"` | K8s secret (type kubernetes.io/tls) with tls.crt and tls.key for the server. | | server.tls.clientCaSecretName | string | `"openshell-server-client-ca"` | K8s secret with ca.crt for client certificate verification (mTLS). Set to "" to disable mTLS and run HTTPS-only (use OIDC for auth instead). 
| diff --git a/deploy/helm/openshell/templates/_helpers.tpl b/deploy/helm/openshell/templates/_helpers.tpl index 00925d2d3..3e375a54a 100644 --- a/deploy/helm/openshell/templates/_helpers.tpl +++ b/deploy/helm/openshell/templates/_helpers.tpl @@ -59,6 +59,17 @@ Create the name of the service account to use {{- end }} {{- end }} +{{/* +Create the name of the service account assigned to sandbox pods +*/}} +{{- define "openshell.sandboxServiceAccountName" -}} +{{- if .Values.sandboxServiceAccount.create }} +{{- default (printf "%s-sandbox" (include "openshell.fullname" .) | trunc 63 | trimSuffix "-") .Values.sandboxServiceAccount.name }} +{{- else }} +{{- default "default" .Values.sandboxServiceAccount.name }} +{{- end }} +{{- end }} + {{/* Gateway image reference. Uses image.tag when set; falls back to .Chart.AppVersion so a released chart automatically pulls the matching image without extra overrides. diff --git a/deploy/helm/openshell/templates/certgen.yaml b/deploy/helm/openshell/templates/certgen.yaml index ef4500db6..61203760b 100644 --- a/deploy/helm/openshell/templates/certgen.yaml +++ b/deploy/helm/openshell/templates/certgen.yaml @@ -100,6 +100,7 @@ spec: - generate-certs - --server-secret-name={{ .Values.server.tls.certSecretName }} - --client-secret-name={{ .Values.server.tls.clientTlsSecretName }} + - --jwt-secret-name={{ .Values.server.sandboxJwt.signingSecretName | default (printf "%s-jwt-keys" (include "openshell.fullname" .)) }} {{- range .Values.pkiInitJob.serverDnsNames }} - --server-san={{ . }} {{- end }} diff --git a/deploy/helm/openshell/templates/clusterrole.yaml b/deploy/helm/openshell/templates/clusterrole.yaml index a660aee75..30a192fc3 100644 --- a/deploy/helm/openshell/templates/clusterrole.yaml +++ b/deploy/helm/openshell/templates/clusterrole.yaml @@ -8,6 +8,14 @@ metadata: labels: {{- include "openshell.labels" . 
| nindent 4 }} rules: + # Validate projected sandbox ServiceAccount tokens during the + # IssueSandboxToken bootstrap exchange. + - apiGroups: + - authentication.k8s.io + resources: + - tokenreviews + verbs: + - create - apiGroups: - "" resources: diff --git a/deploy/helm/openshell/templates/gateway-config.yaml b/deploy/helm/openshell/templates/gateway-config.yaml index a3d7b3411..f49476fd8 100644 --- a/deploy/helm/openshell/templates/gateway-config.yaml +++ b/deploy/helm/openshell/templates/gateway-config.yaml @@ -64,6 +64,13 @@ data: {{- end }} {{- end }} + [openshell.gateway.gateway_jwt] + signing_key_path = "/etc/openshell-jwt/signing.pem" + public_key_path = "/etc/openshell-jwt/public.pem" + kid_path = "/etc/openshell-jwt/kid" + gateway_id = {{ .Values.server.sandboxJwt.gatewayId | default (include "openshell.fullname" .) | quote }} + ttl_secs = {{ .Values.server.sandboxJwt.ttlSecs | default 3600 }} + {{- if .Values.server.oidc.issuer }} [openshell.gateway.oidc] @@ -86,7 +93,9 @@ data: [openshell.drivers.kubernetes] grpc_endpoint = {{ include "openshell.grpcEndpoint" . | quote }} + service_account_name = {{ include "openshell.sandboxServiceAccountName" . | quote }} supervisor_sideload_method = {{ include "openshell.supervisorSideloadMethod" . | quote }} + sa_token_ttl_secs = {{ .Values.server.sandboxJwt.k8sSaTokenTtlSecs | default 3600 }} {{- if .Values.server.sandboxImagePullPolicy }} image_pull_policy = {{ .Values.server.sandboxImagePullPolicy | quote }} {{- end }} diff --git a/deploy/helm/openshell/templates/role.yaml b/deploy/helm/openshell/templates/role.yaml index 1d756117c..5ecc4428a 100644 --- a/deploy/helm/openshell/templates/role.yaml +++ b/deploy/helm/openshell/templates/role.yaml @@ -5,6 +5,7 @@ apiVersion: rbac.authorization.k8s.io/v1 kind: Role metadata: name: {{ include "openshell.fullname" . }}-sandbox + namespace: {{ include "openshell.sandboxNamespace" . }} labels: {{- include "openshell.labels" . 
| nindent 4 }} rules: @@ -29,3 +30,15 @@ rules: - get - list - watch + # Per-sandbox identity: TokenReview authenticates the projected token from + # the configured sandbox service account, then the gateway resolves the + # returned pod name and UID to the pod's `openshell.io/sandbox-id` + # annotation. patch is intentionally NOT granted — the annotation is set + # once at pod create and must remain immutable for the lifetime of the + # sandbox. + - apiGroups: + - "" + resources: + - pods + verbs: + - get diff --git a/deploy/helm/openshell/templates/rolebinding.yaml b/deploy/helm/openshell/templates/rolebinding.yaml index 2bb3c7d08..e5233f753 100644 --- a/deploy/helm/openshell/templates/rolebinding.yaml +++ b/deploy/helm/openshell/templates/rolebinding.yaml @@ -5,6 +5,7 @@ apiVersion: rbac.authorization.k8s.io/v1 kind: RoleBinding metadata: name: {{ include "openshell.fullname" . }}-sandbox + namespace: {{ include "openshell.sandboxNamespace" . }} labels: {{- include "openshell.labels" . | nindent 4 }} roleRef: diff --git a/deploy/helm/openshell/templates/serviceaccount.yaml b/deploy/helm/openshell/templates/serviceaccount.yaml index 1f03f8e94..a98ad5363 100644 --- a/deploy/helm/openshell/templates/serviceaccount.yaml +++ b/deploy/helm/openshell/templates/serviceaccount.yaml @@ -13,3 +13,19 @@ metadata: {{- toYaml . | nindent 4 }} {{- end }} {{- end }} +{{- if and .Values.serviceAccount.create .Values.sandboxServiceAccount.create }} +--- +{{- end }} +{{- if .Values.sandboxServiceAccount.create }} +apiVersion: v1 +kind: ServiceAccount +metadata: + name: {{ include "openshell.sandboxServiceAccountName" . }} + namespace: {{ include "openshell.sandboxNamespace" . }} + labels: + {{- include "openshell.labels" . | nindent 4 }} + {{- with .Values.sandboxServiceAccount.annotations }} + annotations: + {{- toYaml . 
| nindent 4 }} + {{- end }} +{{- end }} diff --git a/deploy/helm/openshell/templates/statefulset.yaml b/deploy/helm/openshell/templates/statefulset.yaml index c6ff21491..5dd4f1caf 100644 --- a/deploy/helm/openshell/templates/statefulset.yaml +++ b/deploy/helm/openshell/templates/statefulset.yaml @@ -75,6 +75,9 @@ spec: - name: gateway-config mountPath: /etc/openshell readOnly: true + - name: sandbox-jwt + mountPath: /etc/openshell-jwt + readOnly: true {{- if not .Values.server.disableTls }} - name: tls-cert mountPath: /etc/openshell-tls/server @@ -84,12 +87,12 @@ spec: mountPath: /etc/openshell-tls/client-ca readOnly: true {{- end }} + {{- end }} {{- if and .Values.server.oidc.issuer .Values.server.oidc.caConfigMapName }} - name: oidc-ca mountPath: /etc/openshell-tls/oidc-ca readOnly: true {{- end }} - {{- end }} ports: - name: grpc containerPort: {{ .Values.service.port }} @@ -131,6 +134,10 @@ spec: - name: gateway-config configMap: name: {{ include "openshell.fullname" . }}-config + - name: sandbox-jwt + secret: + secretName: {{ .Values.server.sandboxJwt.signingSecretName | default (printf "%s-jwt-keys" (include "openshell.fullname" .)) }} + defaultMode: 0400 {{- if not .Values.server.disableTls }} - name: tls-cert secret: @@ -147,12 +154,12 @@ spec: secretName: {{ .Values.server.tls.clientCaSecretName }} {{- end }} {{- end }} + {{- end }} {{- if and .Values.server.oidc.issuer .Values.server.oidc.caConfigMapName }} - name: oidc-ca configMap: name: {{ .Values.server.oidc.caConfigMapName }} {{- end }} - {{- end }} {{- with .Values.nodeSelector }} nodeSelector: {{- toYaml . 
| nindent 8 }} diff --git a/deploy/helm/openshell/tests/gateway_config_test.yaml b/deploy/helm/openshell/tests/gateway_config_test.yaml index 2d464b8e6..2bb51dcf1 100644 --- a/deploy/helm/openshell/tests/gateway_config_test.yaml +++ b/deploy/helm/openshell/tests/gateway_config_test.yaml @@ -33,6 +33,23 @@ tests: path: data["gateway.toml"] pattern: '(?ms)\[openshell\.gateway\][^\[]*?grpc_endpoint' + - it: renders the sandbox service account name under [openshell.drivers.kubernetes] + template: templates/gateway-config.yaml + asserts: + - matchRegex: + path: data["gateway.toml"] + pattern: '(?ms)\[openshell\.drivers\.kubernetes\].*?service_account_name\s*=\s*"openshell-sandbox"' + + - it: uses the configured existing sandbox service account name + template: templates/gateway-config.yaml + set: + sandboxServiceAccount.create: false + sandboxServiceAccount.name: precreated-sandbox + asserts: + - matchRegex: + path: data["gateway.toml"] + pattern: '(?ms)\[openshell\.drivers\.kubernetes\].*?service_account_name\s*=\s*"precreated-sandbox"' + - it: omits server_sans when no DNS SANs are configured template: templates/gateway-config.yaml asserts: diff --git a/deploy/helm/openshell/tests/sandbox_namespace_test.yaml b/deploy/helm/openshell/tests/sandbox_namespace_test.yaml index 2d3461c6f..ee89fce53 100644 --- a/deploy/helm/openshell/tests/sandbox_namespace_test.yaml +++ b/deploy/helm/openshell/tests/sandbox_namespace_test.yaml @@ -5,6 +5,9 @@ suite: sandboxNamespace defaulting templates: - templates/gateway-config.yaml - templates/networkpolicy.yaml + - templates/role.yaml + - templates/rolebinding.yaml + - templates/serviceaccount.yaml release: name: openshell namespace: my-namespace @@ -44,3 +47,31 @@ tests: - equal: path: metadata.namespace value: other-ns + + - it: uses explicit sandboxNamespace for sandbox RBAC + template: templates/role.yaml + set: + server.sandboxNamespace: other-ns + asserts: + - equal: + path: metadata.namespace + value: other-ns + + - it: uses 
explicit sandboxNamespace for sandbox RoleBinding + template: templates/rolebinding.yaml + set: + server.sandboxNamespace: other-ns + asserts: + - equal: + path: metadata.namespace + value: other-ns + + - it: uses explicit sandboxNamespace for sandbox ServiceAccount + template: templates/serviceaccount.yaml + set: + server.sandboxNamespace: other-ns + asserts: + - equal: + path: metadata.namespace + value: other-ns + documentIndex: 1 diff --git a/deploy/helm/openshell/tests/sandbox_service_account_test.yaml b/deploy/helm/openshell/tests/sandbox_service_account_test.yaml new file mode 100644 index 000000000..c42641582 --- /dev/null +++ b/deploy/helm/openshell/tests/sandbox_service_account_test.yaml @@ -0,0 +1,31 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 + +suite: sandbox service account +templates: + - templates/serviceaccount.yaml +release: + name: openshell + namespace: my-namespace + +tests: + - it: creates gateway and sandbox service accounts by default + asserts: + - hasDocuments: + count: 2 + - equal: + path: metadata.name + value: openshell + documentIndex: 0 + - equal: + path: metadata.name + value: openshell-sandbox + documentIndex: 1 + + - it: uses the configured existing sandbox service account name + set: + sandboxServiceAccount.create: false + sandboxServiceAccount.name: precreated-sandbox + asserts: + - hasDocuments: + count: 1 diff --git a/deploy/helm/openshell/values.yaml b/deploy/helm/openshell/values.yaml index 26ba1b5b5..0295efcd0 100644 --- a/deploy/helm/openshell/values.yaml +++ b/deploy/helm/openshell/values.yaml @@ -48,6 +48,14 @@ serviceAccount: # -- Existing service account name to use when serviceAccount.create is false. name: "" +sandboxServiceAccount: + # -- Create a service account for sandbox pods. + create: true + # -- Annotations to add to the generated sandbox service account. 
+ annotations: {} + # -- Existing service account name for sandbox pods when sandboxServiceAccount.create is false. + name: "" + # -- Extra annotations to add to the gateway pod. podAnnotations: {} # -- Extra labels to add to the gateway pod. @@ -175,6 +183,24 @@ server: clientCaSecretName: openshell-server-client-ca # -- K8s secret mounted into sandbox pods for mTLS to the server. clientTlsSecretName: openshell-client-tls + # Gateway-minted sandbox JWT signing keys. The pre-install certgen hook + # generates an Ed25519 keypair and writes it to a secret containing + # signing.pem (PKCS#8), public.pem (SPKI), and kid (plain text). + sandboxJwt: + # Name of the Opaque Secret holding the signing key material. Empty + # falls back to "<fullname>-jwt-keys". + signingSecretName: "" + # Stable gateway identity embedded in iss/aud of every minted token. + # Defaults to the release name so HA replicas share identity. + gatewayId: "" + # Token TTL in seconds. Defaults to 3600 (1h). + ttlSecs: 3600 + # Lifetime (seconds) of the projected ServiceAccount token kubelet + # writes into each sandbox pod for the IssueSandboxToken bootstrap + # exchange. Kubelet enforces a minimum of 600s; the driver clamps + # values outside [600, 86400]. Default 3600 — generous, since the + # supervisor consumes the token within seconds of pod start. + k8sSaTokenTtlSecs: 3600 # OIDC (OpenID Connect) configuration for JWT-based authentication. # When issuer is set, the server validates Bearer tokens on gRPC requests. 
oidc: diff --git a/docs/reference/gateway-config.mdx b/docs/reference/gateway-config.mdx index 218982405..b0bff3a6b 100644 --- a/docs/reference/gateway-config.mdx +++ b/docs/reference/gateway-config.mdx @@ -94,6 +94,13 @@ key_path = "/etc/openshell/certs/gateway-key.pem" client_ca_path = "/etc/openshell/certs/client-ca.pem" allow_unauthenticated = false +[openshell.gateway.gateway_jwt] +signing_key_path = "/etc/openshell/jwt/signing.pem" +public_key_path = "/etc/openshell/jwt/public.pem" +kid_path = "/etc/openshell/jwt/kid" +gateway_id = "openshell" +ttl_secs = 3600 + [openshell.gateway.oidc] issuer = "https://idp.example.com/realms/openshell" audience = "openshell-cli" @@ -134,6 +141,7 @@ client_ca_path = "/etc/openshell-tls/client-ca/ca.crt" [openshell.drivers.kubernetes] namespace = "agents" grpc_endpoint = "https://openshell-gateway.agents.svc:8080" +service_account_name = "openshell-sandbox" image_pull_policy = "IfNotPresent" # Use the image volume on K8s >= 1.35 (GA in 1.36); switch to "init-container" # on older clusters or where the ImageVolume feature gate is off. diff --git a/docs/reference/sandbox-compute-drivers.mdx b/docs/reference/sandbox-compute-drivers.mdx index 33168986f..1ab5c9b2f 100644 --- a/docs/reference/sandbox-compute-drivers.mdx +++ b/docs/reference/sandbox-compute-drivers.mdx @@ -117,6 +117,7 @@ For maintainer-level implementation details, refer to the [Kubernetes driver REA |---|---|---| | `compute_drivers = ["kubernetes"]` | Not applicable | Select the Kubernetes compute driver. | | `[openshell.drivers.kubernetes].namespace` | `server.sandboxNamespace` | Set the namespace for sandbox resources. The Helm chart defaults to the release namespace when left empty. | +| `service_account_name` | `sandboxServiceAccount.name` | Set the Kubernetes service account assigned to sandbox pods and accepted by the gateway TokenReview bootstrap path. The Helm chart creates a dedicated sandbox service account by default. 
| | `default_image` | `server.sandboxImage` | Set the default sandbox image. | | `image_pull_policy` | `server.sandboxImagePullPolicy` | Set the Kubernetes image pull policy for sandbox pods. | | `grpc_endpoint` | `server.grpcEndpoint` | Set the gateway callback endpoint reachable from sandbox pods. | diff --git a/e2e/support/gateway-common.sh b/e2e/support/gateway-common.sh index 2f8a2c141..09d96ecd8 100644 --- a/e2e/support/gateway-common.sh +++ b/e2e/support/gateway-common.sh @@ -50,6 +50,23 @@ e2e_generate_pki() { "${gateway_bin}" generate-certs --output-dir "${pki_dir}" "${san_args[@]}" } +e2e_preserve_mise_dirs() { + if ! command -v mise >/dev/null 2>&1; then + return 0 + fi + + if [ -z "${MISE_DATA_DIR:-}" ]; then + export MISE_DATA_DIR="${XDG_DATA_HOME:-${HOME}/.local/share}/mise" + fi + + if [ -z "${MISE_CACHE_DIR:-}" ]; then + case "$(uname -s)" in + Darwin) export MISE_CACHE_DIR="${HOME}/Library/Caches/mise" ;; + *) export MISE_CACHE_DIR="${XDG_CACHE_HOME:-${HOME}/.cache}/mise" ;; + esac + fi +} + e2e_register_plaintext_gateway() { local config_home=$1 local name=$2 @@ -93,6 +110,37 @@ EOF printf '%s' "${name}" >"${config_home}/openshell/active_gateway" } +e2e_toml_string() { + local value="$1" + value="${value//\\/\\\\}" + value="${value//\"/\\\"}" + printf '"%s"' "${value}" +} + +e2e_generate_gateway_jwt() { + local jwt_dir=$1 + + mkdir -p "${jwt_dir}" + ( + umask 077 + openssl genpkey -algorithm Ed25519 -out "${jwt_dir}/signing.pem" >/dev/null 2>&1 + ) + openssl pkey -in "${jwt_dir}/signing.pem" -pubout -out "${jwt_dir}/public.pem" >/dev/null 2>&1 + openssl rand -hex 16 >"${jwt_dir}/kid" +} + +e2e_write_gateway_jwt_config() { + local jwt_dir=$1 + local gateway_id=$2 + + printf '[openshell.gateway.gateway_jwt]\n' + printf 'signing_key_path = %s\n' "$(e2e_toml_string "${jwt_dir}/signing.pem")" + printf 'public_key_path = %s\n' "$(e2e_toml_string "${jwt_dir}/public.pem")" + printf 'kid_path = %s\n' "$(e2e_toml_string "${jwt_dir}/kid")" + printf 
'gateway_id = %s\n' "$(e2e_toml_string "${gateway_id}")" + printf 'ttl_secs = 3600\n\n' +} + e2e_build_gateway_binaries() { local root=$1 local target_var=$2 @@ -176,4 +224,3 @@ e2e_print_gateway_log_on_failure() { echo "=== end gateway log ===" fi } - diff --git a/e2e/with-docker-gateway.sh b/e2e/with-docker-gateway.sh index ed920f3e4..82faad9be 100755 --- a/e2e/with-docker-gateway.sh +++ b/e2e/with-docker-gateway.sh @@ -25,6 +25,8 @@ ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)" # shellcheck source=e2e/support/gateway-common.sh source "${ROOT}/e2e/support/gateway-common.sh" +e2e_preserve_mise_dirs + github_actions_host_docker_tmpdir() { if [ "${GITHUB_ACTIONS:-}" != "true" ] \ || [ ! -S /var/run/docker.sock ] \ @@ -391,6 +393,7 @@ e2e_generate_pki "${GATEWAY_BIN}" "${PKI_DIR}" HOST_PORT=$(e2e_pick_port) STATE_DIR="${WORKDIR}/state" mkdir -p "${STATE_DIR}" +JWT_DIR="${STATE_DIR}/jwt" GATEWAY_ENDPOINT="https://host.openshell.internal:${HOST_PORT}" E2E_NAMESPACE="e2e-docker-$$-${HOST_PORT}" @@ -410,6 +413,7 @@ else fi echo "Starting openshell-gateway on port ${HOST_PORT} (namespace: ${E2E_NAMESPACE})..." +e2e_generate_gateway_jwt "${JWT_DIR}" # Driver-specific options moved from CLI flags into a TOML config table # (commit 560550d2). Synthesize a minimal config here and pass --config. 
@@ -428,6 +432,7 @@ GATEWAY_CONFIG="${STATE_DIR}/gateway.toml" { printf '[openshell]\nversion = 1\n\n' printf '[openshell.gateway]\nlog_level = "info"\n\n' + e2e_write_gateway_jwt_config "${JWT_DIR}" "openshell-e2e-docker-${HOST_PORT}" printf '[openshell.drivers.docker]\n' printf 'sandbox_namespace = %s\n' "$(toml_string "${E2E_NAMESPACE}")" printf 'network_name = %s\n' "$(toml_string "${DOCKER_NETWORK_NAME}")" diff --git a/e2e/with-kube-gateway.sh b/e2e/with-kube-gateway.sh index 34a081516..440323944 100755 --- a/e2e/with-kube-gateway.sh +++ b/e2e/with-kube-gateway.sh @@ -13,12 +13,16 @@ # Create a local k3d cluster via tasks/scripts/helm-k3s-local.sh, install # the chart, port-forward, and tear the cluster down on exit. # -# Helm e2e currently uses plaintext gateway traffic (ci/values-tls-disabled.yaml). +# Helm e2e currently uses plaintext gateway traffic (ci/values-skaffold.yaml). +# The certgen hook still runs so the gateway has sandbox JWT signing keys. # -# Image source: helm install pulls from ${OPENSHELL_REGISTRY}/{gateway,supervisor}:${IMAGE_TAG} -# (defaults: ghcr.io/nvidia/openshell, latest). CI sets IMAGE_TAG to the commit SHA; -# local devs should set it to a tag pulled from a registry the cluster can reach, -# or build and import images via a separate bootstrap step before running this script. +# Image source: +# - Ephemeral k3d mode builds local `openshell/{gateway,supervisor}:${IMAGE_TAG}` +# images by default, imports them into k3d, then installs the chart. This +# mirrors the Skaffold local-dev path. +# - Existing-context mode pulls from ${OPENSHELL_REGISTRY}/{gateway,supervisor}:${IMAGE_TAG} +# (defaults: ghcr.io/nvidia/openshell, latest). CI sets IMAGE_TAG to the +# commit SHA and preloads or publishes the images before running this script. set -euo pipefail @@ -31,6 +35,8 @@ ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." 
&& pwd)" # shellcheck source=e2e/support/gateway-common.sh source "${ROOT}/e2e/support/gateway-common.sh" +e2e_preserve_mise_dirs + WORKDIR_PARENT="${TMPDIR:-/tmp}" WORKDIR_PARENT="${WORKDIR_PARENT%/}" WORKDIR="$(mktemp -d "${WORKDIR_PARENT}/openshell-e2e-kube.XXXXXX")" @@ -147,8 +153,21 @@ else KUBE_CONTEXT="k3d-${CLUSTER_NAME}" fi -IMAGE_TAG_VALUE="${IMAGE_TAG:-latest}" -REGISTRY_VALUE="${OPENSHELL_REGISTRY:-ghcr.io/nvidia/openshell}" +if [ -z "${OPENSHELL_E2E_KUBE_BUILD_IMAGES+x}" ]; then + if [ "${CLUSTER_CREATED_BY_US}" = "1" ]; then + OPENSHELL_E2E_KUBE_BUILD_IMAGES=1 + else + OPENSHELL_E2E_KUBE_BUILD_IMAGES=0 + fi +fi + +if [ "${OPENSHELL_E2E_KUBE_BUILD_IMAGES}" = "1" ]; then + REGISTRY_VALUE="${OPENSHELL_REGISTRY:-openshell}" + IMAGE_TAG_VALUE="${IMAGE_TAG:-e2e-${CLUSTER_NAME:-local}}" +else + REGISTRY_VALUE="${OPENSHELL_REGISTRY:-ghcr.io/nvidia/openshell}" + IMAGE_TAG_VALUE="${IMAGE_TAG:-latest}" +fi REGISTRY_VALUE="${REGISTRY_VALUE%/}" # Resolve a host-gateway IP that sandbox pods can dial to reach test fixtures @@ -160,7 +179,9 @@ REGISTRY_VALUE="${REGISTRY_VALUE%/}" # Preference order: # 1. OPENSHELL_E2E_HOST_GATEWAY_IP — operator override (remote clusters where # auto-detection has no signal). -# 2. Gateway of the cluster's Docker network (k3d-<cluster-name> for ephemeral +# 2. k3d's CoreDNS host.k3d.internal entry. On Docker Desktop this is a +# host-routable address; the Docker network gateway is not. +# 3. Gateway of the cluster's Docker network (k3d-<cluster-name> for ephemeral # clusters, `kind` for kind clusters used in CI). Pods SNAT through their # node to this IP, which lands on the host's bridge interface and reaches # any 0.0.0.0-bound listener / published container port. @@ -171,18 +192,29 @@ HOST_GATEWAY_IP="${OPENSHELL_E2E_HOST_GATEWAY_IP:-}" # the docker bridge gateway on Linux). That mapping handles Docker Desktop # correctly; the docker network gateway alone does not. 
if [ -z "${HOST_GATEWAY_IP}" ] && command -v kubectl >/dev/null 2>&1; then - detected="$(kctl -n kube-system get configmap coredns -o jsonpath='{.data.NodeHosts}' 2>/dev/null \ - | awk '$2 == "host.k3d.internal" { print $1; exit }')" - if [ -n "${detected}" ]; then - HOST_GATEWAY_IP="${detected}" - echo "Detected host gateway IP ${HOST_GATEWAY_IP} from CoreDNS host.k3d.internal entry." - fi + for _ in {1..15}; do + detected="$(kctl -n kube-system get configmap coredns -o jsonpath='{.data.NodeHosts}' 2>/dev/null \ + | awk '$2 == "host.k3d.internal" { print $1; exit }' || true)" + if [ -n "${detected}" ]; then + HOST_GATEWAY_IP="${detected}" + echo "Detected host gateway IP ${HOST_GATEWAY_IP} from CoreDNS host.k3d.internal entry." + break + fi + sleep 1 + done fi # Fallback for non-k3d clusters (kind in CI, etc.): use the docker network # gateway IP. Works on Linux where the bridge is reachable from pods; on macOS # Docker Desktop without k3d, this will likely not route to the host. -if [ -z "${HOST_GATEWAY_IP}" ] && command -v docker >/dev/null 2>&1; then +use_docker_network_gateway=1 +if [ "$(uname -s)" = "Darwin" ] \ + && { [ "${CLUSTER_CREATED_BY_US}" = "1" ] || [[ "${KUBE_CONTEXT}" == k3d-* ]]; }; then + use_docker_network_gateway=0 +fi +if [ -z "${HOST_GATEWAY_IP}" ] \ + && [ "${use_docker_network_gateway}" = "1" ] \ + && command -v docker >/dev/null 2>&1; then candidate_networks=() if [ "${CLUSTER_CREATED_BY_US}" = "1" ]; then candidate_networks+=("k3d-${CLUSTER_NAME}") @@ -227,6 +259,15 @@ elif [[ "${KUBE_CONTEXT}" == k3d-* ]] && command -v k3d >/dev/null 2>&1; then import_cluster_name="${candidate}" fi fi +if [ "${OPENSHELL_E2E_KUBE_BUILD_IMAGES}" = "1" ]; then + require_cmd docker + echo "Building local Kubernetes e2e images (${REGISTRY_VALUE}/{gateway,supervisor}:${IMAGE_TAG_VALUE})..." 
+ CONTAINER_ENGINE=docker IMAGE_REGISTRY="${REGISTRY_VALUE}" IMAGE_TAG="${IMAGE_TAG_VALUE}" \ + bash "${ROOT}/tasks/scripts/docker-build-image.sh" gateway + CONTAINER_ENGINE=docker IMAGE_REGISTRY="${REGISTRY_VALUE}" IMAGE_TAG="${IMAGE_TAG_VALUE}" \ + bash "${ROOT}/tasks/scripts/docker-build-image.sh" supervisor +fi + if [ -n "${import_cluster_name}" ]; then for image in \ "${REGISTRY_VALUE}/gateway:${IMAGE_TAG_VALUE}" \ @@ -255,7 +296,7 @@ fi echo "Installing Helm chart (release=${RELEASE_NAME}, namespace=${NAMESPACE}, tag=${IMAGE_TAG_VALUE})..." helmctl install "${RELEASE_NAME}" "${ROOT}/deploy/helm/openshell" \ --namespace "${NAMESPACE}" --create-namespace \ - --values "${ROOT}/deploy/helm/openshell/ci/values-tls-disabled.yaml" \ + --values "${ROOT}/deploy/helm/openshell/ci/values-skaffold.yaml" \ --set "fullnameOverride=openshell" \ --set "image.repository=${REGISTRY_VALUE}/gateway" \ --set "image.tag=${IMAGE_TAG_VALUE}" \ diff --git a/e2e/with-podman-gateway.sh b/e2e/with-podman-gateway.sh index 875ebee4b..c9f9cb5fc 100755 --- a/e2e/with-podman-gateway.sh +++ b/e2e/with-podman-gateway.sh @@ -335,6 +335,7 @@ HOST_PORT=$(e2e_pick_port) HEALTH_PORT=$(e2e_pick_port) STATE_DIR="${WORKDIR}/state" mkdir -p "${STATE_DIR}" +JWT_DIR="${STATE_DIR}/jwt" E2E_NAMESPACE="e2e-podman-$$-${HOST_PORT}" PODMAN_NETWORK_NAME="${E2E_NAMESPACE}" @@ -346,6 +347,7 @@ export OPENSHELL_E2E_NETWORK_NAME="${PODMAN_NETWORK_NAME}" export OPENSHELL_E2E_SANDBOX_NAMESPACE="${E2E_NAMESPACE}" echo "Starting openshell-gateway on port ${HOST_PORT} (namespace: ${E2E_NAMESPACE})..." +e2e_generate_gateway_jwt "${JWT_DIR}" # Driver-specific options moved from CLI flags into a TOML config table # (commit 560550d2). Synthesize a minimal config here and pass --config. @@ -370,6 +372,7 @@ GATEWAY_CONFIG="${STATE_DIR}/gateway.toml" # (CLI > TOML in the merge precedence) so the test can use an ephemeral port. 
cp "${ROOT}/deploy/rpm/gateway.toml.default" "${GATEWAY_CONFIG}" { + e2e_write_gateway_jwt_config "${JWT_DIR}" "openshell-e2e-podman-${HOST_PORT}" printf '\n[openshell.drivers.podman]\n' # The Podman driver scopes isolation by network rather than namespace. printf 'network_name = %s\n' "$(toml_string "${PODMAN_NETWORK_NAME}")" diff --git a/proto/compute_driver.proto b/proto/compute_driver.proto index 3c4308f3f..6de13f3e5 100644 --- a/proto/compute_driver.proto +++ b/proto/compute_driver.proto @@ -90,6 +90,13 @@ message DriverSandboxSpec { // (e.g. "0", "1"). When empty with gpu=true, the driver assigns the // first available GPU. string gpu_device = 10; + // Gateway-minted JWT identifying this sandbox to the gateway. Set by + // the gateway on create; the driver materialises it via its native + // secret mechanism (Docker/Podman/VM bind-mount a per-sandbox file; + // the Kubernetes driver ignores this field and relies on its projected + // ServiceAccount token bootstrap instead). Never echoed to the public + // Sandbox proto. + string sandbox_token = 11; } // Driver-owned runtime template consumed by the compute platform. diff --git a/proto/openshell.proto b/proto/openshell.proto index ca62646e3..10c69f414 100644 --- a/proto/openshell.proto +++ b/proto/openshell.proto @@ -224,6 +224,51 @@ service OpenShell { // Get decision history for a sandbox's draft policy. rpc GetDraftHistory(GetDraftHistoryRequest) returns (GetDraftHistoryResponse); + + // Exchange a sandbox-bootstrap credential (e.g. a Kubernetes projected + // ServiceAccount token) for a gateway-minted JWT bound to the calling + // sandbox's UUID. Used by the Kubernetes driver path; singleplayer + // drivers receive the gateway JWT directly from the create-sandbox flow + // and never call this RPC. + rpc IssueSandboxToken(IssueSandboxTokenRequest) returns (IssueSandboxTokenResponse); + + // Renew the calling sandbox's gateway JWT. 
Older tokens remain valid + // until their own expiry; deployments should keep token TTLs short to + // bound replay exposure. The supervisor calls this from a background + // task at ~80% of the token's lifetime; the new token is cached in + // memory only — the on-disk bootstrap file is intentionally not + // rewritten. + rpc RefreshSandboxToken(RefreshSandboxTokenRequest) + returns (RefreshSandboxTokenResponse); +} + +// IssueSandboxToken request. Empty body; identity is established by the +// authentication credentials carried in the request headers (a projected +// Kubernetes ServiceAccount JWT in the K8s driver path). +message IssueSandboxTokenRequest {} + +// IssueSandboxToken response. The supervisor caches the returned token in +// memory and presents it as `Authorization: Bearer` on every subsequent +// gateway RPC. +message IssueSandboxTokenResponse { + // Gateway-minted JWT bound to the calling sandbox's UUID. + string token = 1; + // Absolute expiry of the issued token, milliseconds since the epoch. + int64 expires_at_ms = 2; +} + +// RefreshSandboxToken request. Empty body; the calling principal must +// already be a sandbox principal (i.e. the request carries a still-valid +// gateway-minted JWT in its Authorization header). +message RefreshSandboxTokenRequest {} + +// RefreshSandboxToken response. The new token replaces the supervisor's +// in-memory bearer credential. +message RefreshSandboxTokenResponse { + // Fresh gateway-minted JWT bound to the same sandbox UUID. + string token = 1; + // Absolute expiry of the new token, milliseconds since the epoch. + int64 expires_at_ms = 2; } // Health check request. diff --git a/tasks/scripts/helm-k3s-local.sh b/tasks/scripts/helm-k3s-local.sh index fd73d38c8..8d4919e85 100755 --- a/tasks/scripts/helm-k3s-local.sh +++ b/tasks/scripts/helm-k3s-local.sh @@ -27,6 +27,10 @@ K3D_CLUSTER_NAME_MAX=32 # Host port forwarded to port 80 via the k3d load balancer. 
# Used by Envoy Gateway's LoadBalancer service (values-gateway.yaml). HOST_LB_PORT="${HELM_K3S_LB_HOST_PORT:-8080}" +# Preload the default community sandbox image so the first sandbox create does +# not pay the full registry pull cost inside the cluster. +DEFAULT_SANDBOX_PRELOAD_IMAGE="ghcr.io/nvidia/openshell-community/sandboxes/base:latest" +PRELOAD_SANDBOX_IMAGE="${HELM_K3S_PRELOAD_SANDBOX_IMAGE-${DEFAULT_SANDBOX_PRELOAD_IMAGE}}" default_kubeconfig="${ROOT}/kubeconfig" if [[ -n "${HELM_K3S_KUBECONFIG:-}" ]]; then @@ -52,6 +56,9 @@ Environment: Override to share a single cluster across worktrees. HELM_K3S_KUBECONFIG kubeconfig file to write/merge (default: repo kubeconfig or \$KUBECONFIG) HELM_K3S_LB_HOST_PORT Host port mapped to load balancer port 80 (default: 8080) + HELM_K3S_PRELOAD_SANDBOX_IMAGE + Sandbox image to docker pull and import into k3d + (default: ${DEFAULT_SANDBOX_PRELOAD_IMAGE}; set empty to skip) macOS uses k3d from mise (Docker required). Linux can use this flow only when k3d is installed explicitly; otherwise use kind or an existing cluster context. @@ -144,7 +151,7 @@ configure_ghcr_credentials() { local -a nodes mapfile -t nodes < <(docker ps --format '{{.Names}}' \ - --filter "name=k3d-${CLUSTER_NAME}-server" 2>/dev/null || true) + --filter "name=k3d-${CLUSTER_NAME}-server-" 2>/dev/null || true) if [[ ${#nodes[@]} -eq 0 ]]; then echo "warning: no server nodes found for cluster '${CLUSTER_NAME}', skipping ghcr.io credential setup." 
>&2 @@ -159,6 +166,79 @@ configure_ghcr_credentials() { done } +cluster_has_image() { + local image="$1" + local -a nodes + mapfile -t nodes < <(docker ps --format '{{.Names}}' \ + --filter "name=k3d-${CLUSTER_NAME}-server-" 2>/dev/null || true) + + for node in "${nodes[@]}"; do + if docker exec "${node}" sh -c 'ctr -n k8s.io images list -q | grep -Fxq "$1"' sh "${image}"; then + return 0 + fi + done + + return 1 +} + +cluster_image_platform() { + local -a nodes + mapfile -t nodes < <(docker ps --format '{{.Names}}' \ + --filter "name=k3d-${CLUSTER_NAME}-server-" 2>/dev/null || true) + + if [[ ${#nodes[@]} -gt 0 ]]; then + local platform + platform="$(docker inspect \ + --format '{{.ImageManifestDescriptor.platform.os}}/{{.ImageManifestDescriptor.platform.architecture}}' \ + "${nodes[0]}" 2>/dev/null || true)" + if [[ "${platform}" != "/" && -n "${platform}" ]]; then + echo "${platform}" + return 0 + fi + fi + + case "$(uname -m)" in + arm64 | aarch64) echo "linux/arm64" ;; + x86_64 | amd64) echo "linux/amd64" ;; + *) echo "linux/$(uname -m)" ;; + esac +} + +preload_sandbox_image() { + if [[ -z "${PRELOAD_SANDBOX_IMAGE}" ]]; then + echo "Skipping sandbox image preload." + return 0 + fi + + if cluster_has_image "${PRELOAD_SANDBOX_IMAGE}"; then + echo "Sandbox image already present in cluster: ${PRELOAD_SANDBOX_IMAGE}" + return 0 + fi + + local platform tmp + platform="$(cluster_image_platform)" + echo "Preloading sandbox image into k3d cluster: ${PRELOAD_SANDBOX_IMAGE}" + echo "Sandbox image platform: ${platform}" + if ! docker image inspect "${PRELOAD_SANDBOX_IMAGE}" >/dev/null 2>&1; then + echo "Pulling sandbox image..." + docker pull --platform "${platform}" "${PRELOAD_SANDBOX_IMAGE}" + fi + + tmp="$(mktemp "${TMPDIR:-/tmp}/openshell-sandbox-image.XXXXXX")" + if ! docker image save --platform "${platform}" -o "${tmp}" "${PRELOAD_SANDBOX_IMAGE}"; then + echo "Pulling sandbox image for ${platform}..." 
+ docker pull --platform "${platform}" "${PRELOAD_SANDBOX_IMAGE}" + docker image save --platform "${platform}" -o "${tmp}" "${PRELOAD_SANDBOX_IMAGE}" + fi + + if ! k3d image import "${tmp}" --cluster "${CLUSTER_NAME}"; then + rm -f "${tmp}" + return 1 + fi + + rm -f "${tmp}" +} + cmd_create() { require_supported_os require_docker @@ -189,6 +269,7 @@ EOF merge_kubeconfig apply_base_manifests configure_ghcr_credentials + preload_sandbox_image echo "Active context: $(k3d_context_name)" echo "Kubeconfig: ${KUBECONFIG_TARGET}" echo "Envoy Gateway LoadBalancer (port 80): http://127.0.0.1:${HOST_LB_PORT}"