From 754afb5eb6f3e0f7f7bc32b8b6eeaed7b4d75137 Mon Sep 17 00:00:00 2001 From: Taylor Mutch Date: Thu, 14 May 2026 18:42:13 -0700 Subject: [PATCH 01/18] refactor(server): introduce Authenticator trait and Principal enum Replaces the hard-coded sandbox-method / dual-auth / Bearer branches in AuthGrpcRouter with a pluggable Authenticator chain that produces a Principal::{User, Sandbox, Anonymous}. The principal is inserted into request extensions for handler consumption. PR-1 keeps the legacy metadata marker for sandbox principals so existing handlers that read x-openshell-auth-source continue to work; the marker is removed in the PR-3 wire break. The OidcAuthenticator wraps the existing JwksCache::validate_token for User principals, and the LegacySandboxMarkerAuthenticator preserves the pre-refactor path-based behavior pending the gateway-minted JWT flow in PR 2/3. Part of the per-sandbox identity series that closes #1354. Signed-off-by: Taylor Mutch --- Cargo.lock | 1 + crates/openshell-server/Cargo.toml | 1 + .../src/auth/authenticator.rs | 334 ++++++++++++++ crates/openshell-server/src/auth/mod.rs | 2 + crates/openshell-server/src/auth/oidc.rs | 39 ++ crates/openshell-server/src/auth/principal.rs | 85 ++++ crates/openshell-server/src/multiplex.rs | 424 +++++++++++++----- 7 files changed, 778 insertions(+), 108 deletions(-) create mode 100644 crates/openshell-server/src/auth/authenticator.rs create mode 100644 crates/openshell-server/src/auth/principal.rs diff --git a/Cargo.lock b/Cargo.lock index 9e73bce83..df5d27504 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3684,6 +3684,7 @@ name = "openshell-server" version = "0.0.0" dependencies = [ "anyhow", + "async-trait", "axum 0.8.9", "bytes", "clap", diff --git a/crates/openshell-server/Cargo.toml b/crates/openshell-server/Cargo.toml index 4bbfe24fc..34768c4b9 100644 --- a/crates/openshell-server/Cargo.toml +++ b/crates/openshell-server/Cargo.toml @@ -82,6 +82,7 @@ uuid = { workspace = true } hmac = "0.12" sha2 = { 
workspace = true } jsonwebtoken = { workspace = true } +async-trait = "0.1" hex = "0.4" russh = "0.57" rand = { workspace = true } diff --git a/crates/openshell-server/src/auth/authenticator.rs b/crates/openshell-server/src/auth/authenticator.rs new file mode 100644 index 000000000..40fa1e4ec --- /dev/null +++ b/crates/openshell-server/src/auth/authenticator.rs @@ -0,0 +1,334 @@ +// SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +//! Pluggable authentication trait + chain dispatch. +//! +//! The gateway runs every authenticated request through an +//! [`AuthenticatorChain`] of [`Authenticator`] implementations. The chain +//! evaluates authenticators in order; the first one that recognizes the +//! caller produces the [`Principal`]. An authenticator that does not apply +//! (e.g. an OIDC authenticator seeing no Bearer header) returns `Ok(None)` +//! so the chain falls through to the next. An authenticator that *does* +//! apply but rejects the caller returns `Err(Status)`, which terminates +//! the chain — fail-closed. +//! +//! This module is the abstraction PR (PR 1). Subsequent PRs slot in: +//! - PR 2: `SandboxJwtAuthenticator` + `K8sServiceAccountAuthenticator` +//! - PR 3: removal of the PR-1 legacy marker authenticator + +use super::principal::{Principal, SandboxIdentitySource, SandboxPrincipal}; +use async_trait::async_trait; +use std::sync::Arc; +use tonic::Status; + +/// Pluggable authentication step. +/// +/// Implementations are expected to be cheap to clone (they live behind +/// `Arc` inside an [`AuthenticatorChain`]). +#[async_trait] +pub trait Authenticator: Send + Sync + 'static { + /// Inspect an inbound request and return the authenticated principal. + /// + /// - `Ok(Some(principal))` — this authenticator recognized the caller. + /// The chain stops and the principal is inserted into request + /// extensions. 
+ /// - `Ok(None)` — this authenticator does not apply (e.g. no Bearer + /// token for an OIDC authenticator). The chain falls through to + /// the next authenticator. + /// - `Err(status)` — this authenticator applies but rejected the + /// caller. The chain terminates and the status is returned to the + /// client. Fail-closed. + async fn authenticate( + &self, + headers: &http::HeaderMap, + path: &str, + ) -> Result, Status>; +} + +/// First-match-wins authenticator chain. +/// +/// The chain owns its authenticators behind `Arc` so the entire chain is +/// cheap to clone — required because `tower::Service::call` clones the +/// router on every request. +#[derive(Clone)] +pub struct AuthenticatorChain { + authenticators: Arc<[Arc]>, +} + +impl AuthenticatorChain { + /// Build a chain from an ordered list of authenticators. Earlier + /// entries are evaluated first. + pub fn new(authenticators: Vec>) -> Self { + Self { + authenticators: Arc::from(authenticators), + } + } + + /// Run the chain. Returns the first principal produced. If every + /// authenticator returns `Ok(None)`, the result is `Ok(None)` — the + /// router translates that to `unauthenticated`. + pub async fn authenticate( + &self, + headers: &http::HeaderMap, + path: &str, + ) -> Result, Status> { + for authenticator in self.authenticators.iter() { + if let Some(principal) = authenticator.authenticate(headers, path).await? { + return Ok(Some(principal)); + } + } + Ok(None) + } +} + +impl std::fmt::Debug for AuthenticatorChain { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + f.debug_struct("AuthenticatorChain") + .field("len", &self.authenticators.len()) + .finish() + } +} + +/// Authenticator that preserves the pre-refactor behavior for sandbox-class +/// and dual-auth-no-Bearer paths. +/// +/// Returns `Some(Principal::Sandbox)` with [`SandboxIdentitySource::LegacyMarker`] +/// — the `sandbox_id` is left empty because no credential was verified. 
This +/// matches the pre-PR-1 router which trusted the path list, not the caller. +/// +/// PR 3 deletes this type once every sandbox call carries a gateway-minted +/// JWT and the path-based branches are gone. +pub struct LegacySandboxMarkerAuthenticator; + +#[async_trait] +impl Authenticator for LegacySandboxMarkerAuthenticator { + async fn authenticate( + &self, + headers: &http::HeaderMap, + path: &str, + ) -> Result, Status> { + let is_sandbox_path = super::oidc::is_sandbox_method(path); + let is_dual_no_bearer = + super::oidc::is_dual_auth_method(path) && !has_bearer_token(headers); + if is_sandbox_path || is_dual_no_bearer { + return Ok(Some(Principal::Sandbox(SandboxPrincipal { + sandbox_id: String::new(), + source: SandboxIdentitySource::LegacyMarker, + trust_domain: None, + }))); + } + Ok(None) + } +} + +fn has_bearer_token(headers: &http::HeaderMap) -> bool { + headers + .get("authorization") + .and_then(|v| v.to_str().ok()) + .is_some_and(|v| v.starts_with("Bearer ")) +} + +#[cfg(test)] +pub mod test_support { + use super::*; + use std::sync::Mutex; + + /// Authenticator that always returns the configured outcome. Used by + /// tests to inject a known principal (or rejection) without running real + /// crypto. Each call records the path it was invoked with so tests can + /// assert chain ordering. 
+ pub struct MockAuthenticator { + pub outcome: Result, Status>, + pub calls: Mutex>, + } + + impl MockAuthenticator { + pub fn returning(outcome: Result, Status>) -> Self { + Self { + outcome, + calls: Mutex::new(Vec::new()), + } + } + + pub fn call_count(&self) -> usize { + self.calls.lock().unwrap().len() + } + } + + #[async_trait] + impl Authenticator for MockAuthenticator { + async fn authenticate( + &self, + _headers: &http::HeaderMap, + path: &str, + ) -> Result, Status> { + self.calls.lock().unwrap().push(path.to_string()); + self.outcome.clone() + } + } +} + +#[cfg(test)] +mod tests { + use super::test_support::MockAuthenticator; + use super::*; + use crate::auth::identity::{Identity, IdentityProvider}; + use crate::auth::principal::UserPrincipal; + + fn user_principal(subject: &str) -> Principal { + Principal::User(UserPrincipal { + identity: Identity { + subject: subject.to_string(), + display_name: None, + roles: vec![], + scopes: vec![], + provider: IdentityProvider::Oidc, + }, + }) + } + + #[tokio::test] + async fn chain_returns_first_match() { + let first = Arc::new(MockAuthenticator::returning(Ok(Some(user_principal( + "alice", + ))))); + let second = Arc::new(MockAuthenticator::returning(Ok(Some(user_principal( + "bob", + ))))); + let chain = AuthenticatorChain::new(vec![first.clone(), second.clone()]); + let result = chain + .authenticate(&http::HeaderMap::new(), "/some/path") + .await + .unwrap() + .expect("expected a principal"); + match result { + Principal::User(u) => assert_eq!(u.identity.subject, "alice"), + _ => panic!("expected user principal"), + } + assert_eq!(first.call_count(), 1); + assert_eq!( + second.call_count(), + 0, + "second authenticator must be skipped after first matches" + ); + } + + #[tokio::test] + async fn chain_falls_through_on_none() { + let first = Arc::new(MockAuthenticator::returning(Ok(None))); + let second = Arc::new(MockAuthenticator::returning(Ok(Some(user_principal( + "bob", + ))))); + let chain = 
AuthenticatorChain::new(vec![first.clone(), second.clone()]); + let result = chain + .authenticate(&http::HeaderMap::new(), "/some/path") + .await + .unwrap() + .expect("expected a principal"); + match result { + Principal::User(u) => assert_eq!(u.identity.subject, "bob"), + _ => panic!("expected user principal"), + } + assert_eq!(first.call_count(), 1); + assert_eq!(second.call_count(), 1); + } + + #[tokio::test] + async fn chain_fails_closed_on_first_error() { + let first = Arc::new(MockAuthenticator::returning(Err(Status::unauthenticated( + "bad token", + )))); + let second = Arc::new(MockAuthenticator::returning(Ok(Some(user_principal( + "bob", + ))))); + let chain = AuthenticatorChain::new(vec![first.clone(), second.clone()]); + let err = chain + .authenticate(&http::HeaderMap::new(), "/some/path") + .await + .expect_err("must short-circuit on error"); + assert_eq!(err.code(), tonic::Code::Unauthenticated); + assert_eq!(first.call_count(), 1); + assert_eq!( + second.call_count(), + 0, + "must not consult later authenticators after an error" + ); + } + + #[tokio::test] + async fn empty_chain_returns_none() { + let chain = AuthenticatorChain::new(vec![]); + let result = chain + .authenticate(&http::HeaderMap::new(), "/some/path") + .await + .unwrap(); + assert!(result.is_none()); + } + + #[tokio::test] + async fn legacy_marker_recognizes_sandbox_method() { + let auth = LegacySandboxMarkerAuthenticator; + let result = auth + .authenticate( + &http::HeaderMap::new(), + "/openshell.v1.OpenShell/ReportPolicyStatus", + ) + .await + .unwrap() + .expect("sandbox path must produce a principal"); + match result { + Principal::Sandbox(p) => { + assert!(p.sandbox_id.is_empty(), "legacy marker has no verified id"); + assert!(matches!(p.source, SandboxIdentitySource::LegacyMarker)); + } + _ => panic!("expected sandbox principal"), + } + } + + #[tokio::test] + async fn legacy_marker_recognizes_dual_auth_without_bearer() { + let auth = LegacySandboxMarkerAuthenticator; + let 
result = auth + .authenticate( + &http::HeaderMap::new(), + "/openshell.v1.OpenShell/UpdateConfig", + ) + .await + .unwrap(); + assert!( + result.is_some(), + "dual-auth without Bearer must mark sandbox" + ); + } + + #[tokio::test] + async fn legacy_marker_yields_to_dual_auth_with_bearer() { + let auth = LegacySandboxMarkerAuthenticator; + let mut headers = http::HeaderMap::new(); + headers.insert( + "authorization", + http::HeaderValue::from_static("Bearer xyz"), + ); + let result = auth + .authenticate(&headers, "/openshell.v1.OpenShell/UpdateConfig") + .await + .unwrap(); + assert!( + result.is_none(), + "dual-auth WITH Bearer must fall through to the OIDC authenticator" + ); + } + + #[tokio::test] + async fn legacy_marker_skips_unrelated_paths() { + let auth = LegacySandboxMarkerAuthenticator; + let result = auth + .authenticate( + &http::HeaderMap::new(), + "/openshell.v1.OpenShell/ListSandboxes", + ) + .await + .unwrap(); + assert!(result.is_none()); + } +} diff --git a/crates/openshell-server/src/auth/mod.rs b/crates/openshell-server/src/auth/mod.rs index 8e4f332d8..db09e26b7 100644 --- a/crates/openshell-server/src/auth/mod.rs +++ b/crates/openshell-server/src/auth/mod.rs @@ -8,9 +8,11 @@ //! - `identity`: Provider-agnostic identity representation //! - `http`: HTTP endpoints for auth discovery and token exchange +pub mod authenticator; pub mod authz; mod http; pub mod identity; pub mod oidc; +pub mod principal; pub use http::router; diff --git a/crates/openshell-server/src/auth/oidc.rs b/crates/openshell-server/src/auth/oidc.rs index 92298579e..c2eb58b2d 100644 --- a/crates/openshell-server/src/auth/oidc.rs +++ b/crates/openshell-server/src/auth/oidc.rs @@ -10,7 +10,10 @@ //! This module owns authentication (verifying who the caller is). //! Authorization (deciding what the caller can do) is in `authz.rs`. 
+use super::authenticator::Authenticator; use super::identity::{Identity, IdentityProvider}; +use super::principal::{Principal, UserPrincipal}; +use async_trait::async_trait; use jsonwebtoken::{Algorithm, DecodingKey, Validation, decode, decode_header}; use openshell_core::OidcConfig; use reqwest::Client; @@ -419,6 +422,42 @@ impl JwksCache { } } +/// Authenticator that validates `Authorization: Bearer ` headers against +/// the configured OIDC issuer. +/// +/// Returns `Ok(None)` when no Bearer header is present, so the chain can fall +/// through to other authenticators (e.g. the gateway-minted sandbox JWT +/// authenticator added in PR 2). +pub struct OidcAuthenticator { + cache: Arc, +} + +impl OidcAuthenticator { + pub fn new(cache: Arc) -> Self { + Self { cache } + } +} + +#[async_trait] +impl Authenticator for OidcAuthenticator { + async fn authenticate( + &self, + headers: &http::HeaderMap, + _path: &str, + ) -> Result, Status> { + let Some(token) = headers + .get("authorization") + .and_then(|v| v.to_str().ok()) + .and_then(|v| v.strip_prefix("Bearer ")) + else { + return Ok(None); + }; + + let identity = self.cache.validate_token(token).await?; + Ok(Some(Principal::User(UserPrincipal { identity }))) + } +} + #[cfg(test)] mod tests { use super::*; diff --git a/crates/openshell-server/src/auth/principal.rs b/crates/openshell-server/src/auth/principal.rs new file mode 100644 index 000000000..25bb57109 --- /dev/null +++ b/crates/openshell-server/src/auth/principal.rs @@ -0,0 +1,85 @@ +// SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +//! Authenticated caller principals. +//! +//! A `Principal` is the result of running the [`super::authenticator::Authenticator`] +//! chain on an inbound request. It generalizes over the kinds of callers the +//! gateway recognizes — human users (OIDC), sandbox supervisors (gateway-minted +//! 
JWT, future SPIFFE), and anonymous callers (truly unauthenticated methods +//! like health probes). +//! +//! Handlers read the principal from the gRPC `Request` extensions and gate +//! access accordingly. Sandbox-class handlers MUST compare +//! `Principal::Sandbox.sandbox_id` against the request body's `sandbox_id` +//! to prevent cross-sandbox access (see issue #1354). + +use super::identity::Identity; + +/// Who is calling. +/// +/// Inserted into `tonic::Request::extensions` by the auth router. Handlers +/// retrieve it via `req.extensions().get::<Principal>()`. +#[derive(Debug, Clone)] +pub enum Principal { + /// Human caller authenticated via OIDC (Keycloak, Entra ID, Okta, etc.). + User(UserPrincipal), + /// Sandbox supervisor authenticated by an identity bound to a specific + /// sandbox UUID. The wrapped `sandbox_id` MUST match any sandbox referenced + /// in the request body for sandbox-class methods (PR-4 guard). + Sandbox(#[allow(dead_code)] SandboxPrincipal), + /// Truly unauthenticated caller (health probes, reflection). Sandbox-class + /// and user-class methods reject this variant. + #[allow(dead_code)] + Anonymous, +} + +/// User caller — wraps the existing provider-agnostic [`Identity`]. +#[derive(Debug, Clone)] +pub struct UserPrincipal { + /// The verified identity from the authentication provider. + pub identity: Identity, +} + +/// Sandbox caller — bound to one specific sandbox UUID. +/// +/// `sandbox_id` and `source` are consumed by the PR-4 handler guard; until +/// then they only exist in the type so the trait shape is stable across the +/// PR series. +#[derive(Debug, Clone)] +#[allow(dead_code)] +pub struct SandboxPrincipal { + /// Canonical sandbox UUID. Empty string only for the PR-1 legacy marker; + /// PR 2 onwards always populates this from a verified credential. + pub sandbox_id: String, + /// How this principal was verified — used for audit logs and to gate the + /// PR-4 IDOR check against unverified sources.
+ pub source: SandboxIdentitySource, + /// SPIFFE trust domain. Populated when the credential is SPIFFE-shaped; + /// reserved for future per-sandbox cert / SPIRE authenticators. + pub trust_domain: Option, +} + +/// How a [`SandboxPrincipal`] was authenticated. +#[derive(Debug, Clone)] +pub enum SandboxIdentitySource { + /// PR-1 placeholder: the request matched a sandbox-class path or a + /// dual-auth path without a Bearer token. No credential was verified. + /// Removed in PR 3 once every sandbox call carries a gateway-minted JWT. + LegacyMarker, + /// Gateway-minted JWT validated against the gateway's signing key. + /// Populated by PR 2's `SandboxJwtAuthenticator`. + #[allow(dead_code)] + BootstrapJwt { issuer: String, jti: String }, + /// Per-sandbox client certificate. Reserved for the v2 channel-bound + /// identity follow-up. + #[allow(dead_code)] + BootstrapCert { fingerprint: String }, + /// SPIRE-issued SVID. Reserved for the SPIFFE/SPIRE follow-up. + #[allow(dead_code)] + SpiffeSvid { spiffe_id: String }, + /// K8s `ServiceAccount` token used to bootstrap a gateway-minted JWT + /// via `IssueSandboxToken`. Populated only on that one RPC path. 
+ #[allow(dead_code)] + K8sServiceAccount { pod_name: String, pod_uid: String }, +} diff --git a/crates/openshell-server/src/multiplex.rs b/crates/openshell-server/src/multiplex.rs index deac9ee78..e8aa0dfb4 100644 --- a/crates/openshell-server/src/multiplex.rs +++ b/crates/openshell-server/src/multiplex.rs @@ -31,8 +31,15 @@ use tower_http::request_id::{MakeRequestId, RequestId}; use tracing::Span; use crate::{ - OpenShellService, ServerState, auth::authz::AuthzPolicy, auth::identity::Identity, auth::oidc, - http_router, inference::InferenceService, service_http_router, + OpenShellService, ServerState, + auth::authenticator::{AuthenticatorChain, LegacySandboxMarkerAuthenticator}, + auth::authz::AuthzPolicy, + auth::identity::Identity, + auth::oidc::{self, OidcAuthenticator}, + auth::principal::{Principal, UserPrincipal}, + http_router, + inference::InferenceService, + service_http_router, }; /// Request-ID generator that produces a UUID v4 for each inbound request. @@ -153,17 +160,11 @@ impl MultiplexService { user_role: oidc.user_role.clone(), scopes_enabled: !oidc.scopes_claim.is_empty(), }); - let has_client_ca = self - .state - .config - .tls - .as_ref() - .is_some_and(|tls| tls.client_ca_path.is_some()); - let grpc_service = AuthGrpcRouter::new( + let authenticator_chain = build_authenticator_chain(self.state.oidc_cache.clone()); + let grpc_service = AuthGrpcRouter::with_peer_identity( GrpcRouter::new(openshell, inference), - self.state.oidc_cache.clone(), + authenticator_chain, authz_policy, - has_client_ca, peer_identity, ); let http_service = http_router(self.state.clone()); @@ -256,28 +257,54 @@ where } } -/// gRPC router wrapper that authenticates and authorizes requests. +/// Assemble the authenticator chain for the gateway. /// -/// When `oidc_cache` is `Some`, extracts the `authorization: Bearer ` -/// header, validates the JWT (authentication), then checks RBAC roles -/// (authorization) before forwarding to the inner gRPC router. 
+/// PR-1 composition: +/// 1. [`LegacySandboxMarkerAuthenticator`] — preserves the path-based +/// sandbox/dual-auth-no-Bearer behavior so handlers that still read the +/// metadata marker keep working. Removed in PR 3. +/// 2. [`OidcAuthenticator`] — validates Bearer tokens against the configured +/// OIDC issuer. Only added when OIDC is configured. /// -/// Authentication is provider-specific (currently OIDC via `oidc.rs`). -/// Authorization is provider-agnostic (via `authz.rs`). This separation -/// aligns with RFC 0001's control-plane identity design. +/// When OIDC is not configured (singleplayer dev mode, fronting-proxy +/// deployments before PR 3), this function returns `None` instead of a chain +/// and the router forwards every request without running any authenticator: +/// no `Principal` is inserted and no sandbox marker is set. This preserves +/// today's "OIDC None == pass through" behavior (see `AuthGrpcRouter::call`). +fn build_authenticator_chain( + oidc_cache: Option>, +) -> Option { + let mut authenticators: Vec> = Vec::new(); + authenticators.push(Arc::new(LegacySandboxMarkerAuthenticator)); + if let Some(cache) = oidc_cache { + authenticators.push(Arc::new(OidcAuthenticator::new(cache))); + } else { + // No OIDC configured — the router treats a missing chain as + // pass-through for every method (sandbox-class included) by skipping + // authentication entirely. See AuthGrpcRouter::call. + return None; + } + Some(AuthenticatorChain::new(authenticators)) +} + +/// gRPC router wrapper that runs the [`AuthenticatorChain`] and inserts the +/// resulting [`Principal`] into the request's extensions. /// -/// Sandbox-class methods (`oidc::is_sandbox_method`) accept callers without -/// a Bearer token: the gRPC channel's mTLS handshake is the trust -/// boundary. The router marks such requests with the -/// `INTERNAL_AUTH_SOURCE_HEADER` so handlers (`policy.rs`) can apply -/// sandbox-restricted scope.
+/// Behavior: +/// - Strip any external `x-openshell-auth-source` marker first (so callers +/// cannot spoof a sandbox identity). +/// - Health probes / reflection bypass the chain entirely. +/// - When no chain is configured (OIDC not configured), forward without +/// authentication — preserves today's pass-through behavior. +/// - Otherwise, run the chain. The first match produces a `Principal`. +/// `Principal::User` is gated by the RBAC `AuthzPolicy`. The legacy +/// sandbox marker also inserts the metadata marker for backwards-compat +/// with handlers that still consume it (PR-1 only; removed in PR 3). #[derive(Clone)] pub struct AuthGrpcRouter { inner: S, - oidc_cache: Option>, + authenticator_chain: Option, authz_policy: Option, - /// Whether a client CA is configured (mTLS is a valid auth mechanism). - has_client_ca: bool, /// mTLS peer identity extracted from the TLS handshake. peer_identity: Option, } @@ -285,21 +312,34 @@ pub struct AuthGrpcRouter { impl AuthGrpcRouter { fn new( inner: S, - oidc_cache: Option>, + authenticator_chain: Option, + authz_policy: Option, + ) -> Self { + Self::with_peer_identity(inner, authenticator_chain, authz_policy, None) + } + + fn with_peer_identity( + inner: S, + authenticator_chain: Option, authz_policy: Option, - has_client_ca: bool, peer_identity: Option, ) -> Self { Self { inner, - oidc_cache, + authenticator_chain, authz_policy, - has_client_ca, peer_identity, } } } +fn status_response(status: tonic::Status) -> Response { + let response = status.into_http(); + let (parts, body) = response.into_parts(); + let body = tonic::body::BoxBody::new(body); + Response::from_parts(parts, body) +} + impl tower::Service> for AuthGrpcRouter where S: tower::Service, Response = Response> @@ -319,9 +359,8 @@ where } fn call(&mut self, req: Request) -> Self::Future { - let oidc_cache = self.oidc_cache.clone(); + let chain = self.authenticator_chain.clone(); let authz_policy = self.authz_policy.clone(); - let has_client_ca = 
self.has_client_ca; let peer_identity = self.peer_identity.clone(); let mut inner = self.inner.clone(); @@ -329,18 +368,12 @@ where let mut req = req; oidc::clear_internal_auth_markers(req.headers_mut()); - // No auth configured — pass through. - if oidc_cache.is_none() && !has_client_ca { + // No chain configured — pass through. Preserves today's + // "OIDC None means open" behavior for dev/fronting-proxy modes. + let Some(chain) = chain else { return inner.ready().await?.call(req).await; - } - - // mTLS-only (no OIDC) — TLS layer already enforced client certs, - // so if we got here the peer is authenticated. - if oidc_cache.is_none() && has_client_ca { - return inner.ready().await?.call(req).await; - } + }; - let cache = oidc_cache.expect("checked above"); let path = req.uri().path().to_string(); // Health probes and reflection — truly unauthenticated. @@ -348,72 +381,38 @@ where return inner.ready().await?.call(req).await; } - // Sandbox-class RPCs — no Bearer expected. The gRPC channel's - // mTLS handshake (or the operator's fronting proxy when - // `--disable-gateway-auth` is set) is the trust boundary. - if oidc::is_sandbox_method(&path) { - oidc::mark_sandbox_caller(req.headers_mut()); - return inner.ready().await?.call(req).await; - } - - // Dual-auth methods (e.g. UpdateConfig) — Bearer present grants - // full scope (CLI users); Bearer absent marks the caller as - // sandbox-class for restricted scope downstream. - if oidc::is_dual_auth_method(&path) && !has_bearer_token(req.headers()) { - oidc::mark_sandbox_caller(req.headers_mut()); - return inner.ready().await?.call(req).await; - } - - // Extract Bearer token from the authorization header. 
- let token = req - .headers() - .get("authorization") - .and_then(|v| v.to_str().ok()) - .and_then(|v| v.strip_prefix("Bearer ")); - - let Some(token) = token else { - // No bearer token — fall back to mTLS if a client cert was - // presented (only possible when both OIDC and client CA are - // configured and require_client_auth is false). - if let Some(ref identity) = peer_identity { - if let Some(ref policy) = authz_policy - && let Err(status) = policy.check(identity, &path) - { - let response = status.into_http(); - let (parts, body) = response.into_parts(); - let body = tonic::body::BoxBody::new(body); - return Ok(Response::from_parts(parts, body)); + let principal = match chain.authenticate(req.headers(), &path).await { + Ok(Some(p)) => p, + Ok(None) => { + if let Some(identity) = peer_identity { + Principal::User(UserPrincipal { identity }) + } else { + return Ok(status_response(tonic::Status::unauthenticated( + "missing authorization header", + ))); } - return inner.ready().await?.call(req).await; } - let status = tonic::Status::unauthenticated("missing authorization header"); - let response = status.into_http(); - let (parts, body) = response.into_parts(); - let body = tonic::body::BoxBody::new(body); - return Ok(Response::from_parts(parts, body)); + Err(status) => return Ok(status_response(status)), }; - // Authenticate: validate the JWT and produce an Identity. - let identity = match cache.validate_token(token).await { - Ok(id) => id, - Err(status) => { - let response = status.into_http(); - let (parts, body) = response.into_parts(); - let body = tonic::body::BoxBody::new(body); - return Ok(Response::from_parts(parts, body)); - } - }; - - // Authorize: check RBAC roles against the method. - if let Some(ref policy) = authz_policy - && let Err(status) = policy.check(&identity, &path) + // Authorize user principals via RBAC. 
Sandbox principals get + // their PR-4 equality check at the handler level; legacy markers + // (PR-1) bypass RBAC, matching pre-refactor behavior. + if let Principal::User(ref user) = principal + && let Some(ref policy) = authz_policy + && let Err(status) = policy.check(&user.identity, &path) { - let response = status.into_http(); - let (parts, body) = response.into_parts(); - let body = tonic::body::BoxBody::new(body); - return Ok(Response::from_parts(parts, body)); + return Ok(status_response(status)); } + // PR-1 backwards-compat: handlers still consume the metadata + // marker today. Insert it for sandbox principals so existing + // policy-handler logic continues to work. PR 3 removes this. + if matches!(principal, Principal::Sandbox(_)) { + oidc::mark_sandbox_caller(req.headers_mut()); + } + + req.extensions_mut().insert(principal); inner.ready().await?.call(req).await }) } @@ -513,13 +512,6 @@ where } } -fn has_bearer_token(headers: &http::HeaderMap) -> bool { - headers - .get("authorization") - .and_then(|v| v.to_str().ok()) - .is_some_and(|v| v.starts_with("Bearer ")) -} - fn grpc_method_from_path(path: &str) -> String { path.rsplit('/').next().unwrap_or(path).to_string() } @@ -860,4 +852,220 @@ mod tests { fn normalize_root_path() { assert_eq!(normalize_http_path("/"), "unknown"); } + + mod auth_router { + use super::*; + use crate::auth::authenticator::test_support::MockAuthenticator; + use crate::auth::identity::{Identity, IdentityProvider}; + use crate::auth::principal::{ + Principal, SandboxIdentitySource, SandboxPrincipal, UserPrincipal, + }; + use http_body_util::Full; + use std::sync::Arc; + use std::sync::Mutex; + use tower::Service; + + type RecordedPrincipal = Arc>>; + + /// Service that snapshots the `Principal` from request extensions + /// and the `x-openshell-auth-source` header, then returns 200 OK. 
+ #[derive(Clone)] + struct PrincipalRecorder { + recorded: RecordedPrincipal, + sandbox_marker: Arc>, + } + + impl PrincipalRecorder { + fn new() -> (Self, RecordedPrincipal, Arc>) { + let recorded = Arc::new(Mutex::new(None)); + let marker = Arc::new(Mutex::new(false)); + ( + Self { + recorded: recorded.clone(), + sandbox_marker: marker.clone(), + }, + recorded, + marker, + ) + } + } + + impl Service> for PrincipalRecorder { + type Response = Response; + type Error = std::convert::Infallible; + type Future = Pin> + Send>>; + + fn poll_ready(&mut self, _cx: &mut Context<'_>) -> Poll> { + Poll::Ready(Ok(())) + } + + fn call(&mut self, req: Request) -> Self::Future { + let principal = req.extensions().get::().cloned(); + let has_marker = req + .headers() + .get(oidc::INTERNAL_AUTH_SOURCE_HEADER) + .is_some_and(|v| v.as_bytes() == oidc::AUTH_SOURCE_SANDBOX.as_bytes()); + *self.recorded.lock().unwrap() = principal; + *self.sandbox_marker.lock().unwrap() = has_marker; + Box::pin(async move { + let body = tonic::body::BoxBody::new( + Full::new(Bytes::new()) + .map_err(|never| match never {}) + .boxed_unsync(), + ); + Ok(Response::new(body)) + }) + } + } + + fn empty_request(path: &str) -> Request> { + Request::builder() + .uri(path) + .body(Full::new(Bytes::new())) + .unwrap() + } + + fn user_principal(subject: &str) -> Principal { + Principal::User(UserPrincipal { + identity: Identity { + subject: subject.to_string(), + display_name: None, + roles: vec![], + scopes: vec![], + provider: IdentityProvider::Oidc, + }, + }) + } + + fn sandbox_principal() -> Principal { + Principal::Sandbox(SandboxPrincipal { + sandbox_id: String::new(), + source: SandboxIdentitySource::LegacyMarker, + trust_domain: None, + }) + } + + #[tokio::test] + async fn user_principal_lands_in_request_extensions() { + let mock = Arc::new(MockAuthenticator::returning(Ok(Some(user_principal( + "alice", + ))))); + let chain = AuthenticatorChain::new(vec![mock]); + let (recorder, seen, _) = 
PrincipalRecorder::new(); + let mut router = AuthGrpcRouter::new(recorder, Some(chain), None); + let _ = router + .call(empty_request("/openshell.v1.OpenShell/ListSandboxes")) + .await + .unwrap(); + let principal = seen.lock().unwrap().clone().expect("principal"); + match principal { + Principal::User(u) => assert_eq!(u.identity.subject, "alice"), + _ => panic!("expected user principal"), + } + } + + #[tokio::test] + async fn sandbox_principal_inserts_metadata_marker_for_backcompat() { + // PR-1 keeps the metadata marker so handlers that still read it + // (until PR 3/4 swap them over to extensions) keep working. + let mock = Arc::new(MockAuthenticator::returning(Ok(Some(sandbox_principal())))); + let chain = AuthenticatorChain::new(vec![mock]); + let (recorder, seen, marker_seen) = PrincipalRecorder::new(); + let mut router = AuthGrpcRouter::new(recorder, Some(chain), None); + let _ = router + .call(empty_request("/openshell.v1.OpenShell/ReportPolicyStatus")) + .await + .unwrap(); + assert!( + matches!( + seen.lock().unwrap().clone(), + Some(Principal::Sandbox(_)) + ), + "principal must reach extensions" + ); + assert!( + *marker_seen.lock().unwrap(), + "sandbox principals must also set the legacy metadata marker" + ); + } + + #[tokio::test] + async fn missing_principal_returns_unauthenticated() { + let mock = Arc::new(MockAuthenticator::returning(Ok(None))); + let chain = AuthenticatorChain::new(vec![mock]); + let (recorder, seen, _) = PrincipalRecorder::new(); + let mut router = AuthGrpcRouter::new(recorder, Some(chain), None); + let res = router + .call(empty_request("/openshell.v1.OpenShell/ListSandboxes")) + .await + .unwrap(); + assert!(seen.lock().unwrap().is_none()); + // tonic sets grpc-status=16 (UNAUTHENTICATED) in trailers. 
+ let grpc_status = res + .headers() + .get("grpc-status") + .map(|v| v.to_str().unwrap().to_string()); + assert_eq!(grpc_status.as_deref(), Some("16")); + } + + #[tokio::test] + async fn authenticator_error_short_circuits() { + let mock = Arc::new(MockAuthenticator::returning(Err( + tonic::Status::unauthenticated("forged"), + ))); + let chain = AuthenticatorChain::new(vec![mock]); + let (recorder, seen, _) = PrincipalRecorder::new(); + let mut router = AuthGrpcRouter::new(recorder, Some(chain), None); + let res = router + .call(empty_request("/openshell.v1.OpenShell/ListSandboxes")) + .await + .unwrap(); + assert!(seen.lock().unwrap().is_none()); + assert_eq!( + res.headers() + .get("grpc-status") + .map(|v| v.to_str().unwrap().to_string()) + .as_deref(), + Some("16") + ); + } + + #[tokio::test] + async fn health_methods_bypass_chain() { + // Authenticator is wired to fail-closed; the request still gets + // through because the path is exempt. + let mock = Arc::new(MockAuthenticator::returning(Err( + tonic::Status::unauthenticated("would reject"), + ))); + let chain = AuthenticatorChain::new(vec![mock.clone()]); + let (recorder, _, _) = PrincipalRecorder::new(); + let mut router = AuthGrpcRouter::new(recorder, Some(chain), None); + let res = router + .call(empty_request("/openshell.v1.OpenShell/Health")) + .await + .unwrap(); + assert_eq!(res.status(), 200); + assert_eq!(mock.call_count(), 0, "health must not consult the chain"); + } + + #[tokio::test] + async fn external_auth_source_marker_is_stripped() { + let mock = Arc::new(MockAuthenticator::returning(Ok(Some(user_principal( + "alice", + ))))); + let chain = AuthenticatorChain::new(vec![mock]); + let (recorder, _, marker_seen) = PrincipalRecorder::new(); + let mut router = AuthGrpcRouter::new(recorder, Some(chain), None); + let mut req = empty_request("/openshell.v1.OpenShell/ListSandboxes"); + req.headers_mut().insert( + oidc::INTERNAL_AUTH_SOURCE_HEADER, + 
HeaderValue::from_static(oidc::AUTH_SOURCE_SANDBOX), + ); + let _ = router.call(req).await.unwrap(); + assert!( + !*marker_seen.lock().unwrap(), + "external sandbox marker must be stripped before auth" + ); + } + } } From 4a9a30a42c9b70abd157f682f832b6e410230a1d Mon Sep 17 00:00:00 2001 From: Taylor Mutch Date: Thu, 14 May 2026 20:06:12 -0700 Subject: [PATCH 02/18] feat(server): gateway-minted sandbox JWTs and IssueSandboxToken RPC Adds the gateway-side infrastructure for per-sandbox identity tokens (the PR-2 step of the series resolving #1354): - New Ed25519 keypair generated by `certgen` alongside the existing PKI. Local mode writes `/jwt/{signing.pem,public.pem,kid}`; K8s mode creates an Opaque `-jwt-keys` Secret. - `SandboxJwtIssuer` mints tokens with EdDSA-signed claims (SPIFFE-shaped `sub`, denormalised `sandbox_id`, 24h default TTL, `jti` for revocation). - `SandboxJwtAuthenticator` validates tokens through the Authenticator chain and yields `Principal::Sandbox(BootstrapJwt {..})`. Tokens with a different `kid` fall through so non-matching Bearer headers reach the OIDC authenticator unchanged. - `K8sServiceAccountAuthenticator` is path-scoped to `IssueSandboxToken`; consumes a projected SA token and produces a `K8sServiceAccount` sandbox principal that the new `IssueSandboxToken` handler exchanges for a fresh gateway JWT. - In-memory `RevocationSet` with TTL pruning, ready for the PR-3 delete-side hook and PR-5 refresh. - Helm chart mounts the JWT secret on the gateway pod and wires `[openshell.gateway.gateway_jwt]` into the rendered TOML. PR 2 is additive: no driver yet writes a sandbox token, no supervisor yet presents a Bearer JWT. PR 3 wires the consumer ends and removes the legacy path-based sandbox marker. 
Signed-off-by: Taylor Mutch --- Cargo.lock | 1 + crates/openshell-bootstrap/Cargo.toml | 1 + crates/openshell-bootstrap/src/jwt.rs | 112 +++++ crates/openshell-bootstrap/src/lib.rs | 1 + crates/openshell-bootstrap/src/pki.rs | 20 + crates/openshell-cli/src/run.rs | 5 + .../tests/ensure_providers_integration.rs | 7 + .../openshell-cli/tests/mtls_integration.rs | 7 + .../tests/provider_commands_integration.rs | 7 + .../sandbox_create_lifecycle_integration.rs | 7 + .../sandbox_name_fallback_integration.rs | 7 + crates/openshell-core/src/config.rs | 39 ++ crates/openshell-core/src/lib.rs | 2 +- crates/openshell-core/src/sandbox_env.rs | 19 + crates/openshell-server/src/auth/k8s_sa.rs | 299 +++++++++++++ crates/openshell-server/src/auth/mod.rs | 3 + .../openshell-server/src/auth/revocation.rs | 100 +++++ .../openshell-server/src/auth/sandbox_jwt.rs | 397 ++++++++++++++++++ crates/openshell-server/src/certgen.rs | 160 +++++-- crates/openshell-server/src/cli.rs | 14 + crates/openshell-server/src/config_file.rs | 4 +- crates/openshell-server/src/grpc/auth_rpc.rs | 68 +++ crates/openshell-server/src/grpc/mod.rs | 29 +- crates/openshell-server/src/grpc/sandbox.rs | 6 + crates/openshell-server/src/lib.rs | 86 +++- crates/openshell-server/src/multiplex.rs | 55 +-- .../tests/auth_endpoint_integration.rs | 8 + .../tests/edge_tunnel_auth.rs | 7 + .../tests/multiplex_integration.rs | 7 + .../tests/multiplex_tls_integration.rs | 7 + .../tests/supervisor_relay_integration.rs | 6 + .../tests/ws_tunnel_integration.rs | 7 + deploy/helm/openshell/templates/certgen.yaml | 1 + .../openshell/templates/gateway-config.yaml | 7 + .../helm/openshell/templates/statefulset.yaml | 7 + deploy/helm/openshell/values.yaml | 12 + proto/openshell.proto | 22 + 37 files changed, 1483 insertions(+), 64 deletions(-) create mode 100644 crates/openshell-bootstrap/src/jwt.rs create mode 100644 crates/openshell-server/src/auth/k8s_sa.rs create mode 100644 crates/openshell-server/src/auth/revocation.rs create 
mode 100644 crates/openshell-server/src/auth/sandbox_jwt.rs create mode 100644 crates/openshell-server/src/grpc/auth_rpc.rs diff --git a/Cargo.lock b/Cargo.lock index df5d27504..6f55814f1 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3395,6 +3395,7 @@ dependencies = [ "rcgen", "serde", "serde_json", + "sha2 0.10.9", "tar", "tempfile", "tokio", diff --git a/crates/openshell-bootstrap/Cargo.toml b/crates/openshell-bootstrap/Cargo.toml index c0fb7e9f4..578d59e65 100644 --- a/crates/openshell-bootstrap/Cargo.toml +++ b/crates/openshell-bootstrap/Cargo.toml @@ -16,6 +16,7 @@ bytes = { workspace = true } futures = { workspace = true } miette = { workspace = true } rcgen = { workspace = true } +sha2 = { workspace = true } serde = { workspace = true } serde_json = { workspace = true } tar = "0.4" diff --git a/crates/openshell-bootstrap/src/jwt.rs b/crates/openshell-bootstrap/src/jwt.rs new file mode 100644 index 000000000..cf8ab0dc1 --- /dev/null +++ b/crates/openshell-bootstrap/src/jwt.rs @@ -0,0 +1,112 @@ +// SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +//! Gateway-minted JWT signing-key generation. +//! +//! The gateway mints per-sandbox identity tokens (see PR 2 of the +//! per-sandbox identity series, issue #1354) signed with an Ed25519 +//! keypair generated once at gateway init and persisted alongside the +//! existing PKI bundle. The signing key never leaves the gateway; the +//! public key plus a stable `kid` are consumed by the gateway's own +//! validator and any future external verifiers. + +use miette::{IntoDiagnostic, Result, WrapErr}; +use rcgen::{KeyPair, PKCS_ED25519}; +use sha2::{Digest, Sha256}; + +/// All PEM-encoded material needed to mint and validate sandbox JWTs. +/// +/// The signing key stays in the gateway process. The public key is shared +/// across gateway replicas (so any replica can validate a JWT minted by +/// any other replica). 
The `kid` is published in every minted JWT's +/// header so the validator can pick the right key after a future rotation. +pub struct JwtKeyMaterial { + /// PKCS#8 PEM-encoded Ed25519 private key. + pub signing_key_pem: String, + /// `SubjectPublicKeyInfo` PEM-encoded Ed25519 public key. + pub public_key_pem: String, + /// Stable identifier derived from the public key (SHA-256 hex prefix). + /// Embedded in every minted JWT's `kid` header so future rotation can + /// be performed in-place by adding a second key without breaking + /// in-flight tokens. + pub kid: String, +} + +/// Generate a fresh Ed25519 JWT signing key. +/// +/// Output PEM is in the formats `jsonwebtoken` consumes via +/// `EncodingKey::from_ed_pem` (signing) and `DecodingKey::from_ed_pem` +/// (validation), so the gateway can round-trip its own tokens with no +/// further conversion. +pub fn generate_jwt_key() -> Result { + let keypair = KeyPair::generate_for(&PKCS_ED25519) + .into_diagnostic() + .wrap_err("failed to generate Ed25519 JWT signing key")?; + let signing_key_pem = keypair.serialize_pem(); + let public_key_pem = keypair.public_key_pem(); + let kid = kid_from_public_key_der(&keypair.public_key_der()); + Ok(JwtKeyMaterial { + signing_key_pem, + public_key_pem, + kid, + }) +} + +/// Stable `kid` derived from the SHA-256 of the public-key DER. +/// +/// First 16 bytes hex-encoded — collision-resistant for the small N of +/// signing keys a single deployment ever has, while staying short enough +/// to keep JWT headers compact. 
+fn kid_from_public_key_der(public_key_der: &[u8]) -> String { + let digest = Sha256::digest(public_key_der); + hex_encode_prefix(&digest, 16) +} + +fn hex_encode_prefix(bytes: &[u8], n: usize) -> String { + use std::fmt::Write as _; + let mut out = String::with_capacity(n * 2); + for byte in bytes.iter().take(n) { + let _ = write!(out, "{byte:02x}"); + } + out +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn generate_jwt_key_produces_parseable_pem() { + let material = generate_jwt_key().expect("generate_jwt_key"); + assert!(material.signing_key_pem.contains("BEGIN PRIVATE KEY")); + assert!(material.public_key_pem.contains("BEGIN PUBLIC KEY")); + assert_eq!(material.kid.len(), 32, "kid is 16 bytes hex-encoded"); + assert!(material.kid.chars().all(|c| c.is_ascii_hexdigit())); + } + + #[test] + fn kid_is_stable_for_identical_public_keys() { + // Same input -> same kid. Hash of a fixed byte string. + let kid_a = kid_from_public_key_der(b"abc"); + let kid_b = kid_from_public_key_der(b"abc"); + assert_eq!(kid_a, kid_b); + } + + #[test] + fn kid_differs_for_different_public_keys() { + let kid_a = kid_from_public_key_der(b"first"); + let kid_b = kid_from_public_key_der(b"second"); + assert_ne!(kid_a, kid_b); + } + + #[test] + fn generated_keys_are_unique() { + let a = generate_jwt_key().expect("generate_jwt_key"); + let b = generate_jwt_key().expect("generate_jwt_key"); + assert_ne!( + a.kid, b.kid, + "fresh keypairs must produce distinct public keys" + ); + assert_ne!(a.signing_key_pem, b.signing_key_pem); + } +} diff --git a/crates/openshell-bootstrap/src/lib.rs b/crates/openshell-bootstrap/src/lib.rs index 0988c4b6b..8845f0392 100644 --- a/crates/openshell-bootstrap/src/lib.rs +++ b/crates/openshell-bootstrap/src/lib.rs @@ -3,6 +3,7 @@ pub mod build; pub mod edge_token; +pub mod jwt; pub mod oidc_token; mod metadata; diff --git a/crates/openshell-bootstrap/src/pki.rs b/crates/openshell-bootstrap/src/pki.rs index b6747260b..388507840 100644 --- 
a/crates/openshell-bootstrap/src/pki.rs +++ b/crates/openshell-bootstrap/src/pki.rs @@ -1,6 +1,7 @@ // SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. // SPDX-License-Identifier: Apache-2.0 +use crate::jwt::{JwtKeyMaterial, generate_jwt_key}; use miette::{IntoDiagnostic, Result, WrapErr}; use rcgen::{BasicConstraints, CertificateParams, DnType, Ia5String, IsCa, KeyPair, SanType}; use std::net::IpAddr; @@ -15,6 +16,12 @@ pub struct PkiBundle { pub server_key_pem: String, pub client_cert_pem: String, pub client_key_pem: String, + /// PKCS#8 PEM Ed25519 private key for minting per-sandbox JWTs. + pub jwt_signing_key_pem: String, + /// SPKI PEM Ed25519 public key, paired with `jwt_signing_key_pem`. + pub jwt_public_key_pem: String, + /// Stable identifier embedded in the `kid` header of every minted JWT. + pub jwt_key_id: String, } /// Default SANs always included on the server certificate. Covers the host @@ -99,6 +106,13 @@ pub fn generate_pki(extra_sans: &[String]) -> Result { .into_diagnostic() .wrap_err("failed to sign client certificate")?; + // --- JWT signing key (Ed25519, used to mint per-sandbox identity tokens) --- + let JwtKeyMaterial { + signing_key_pem: jwt_signing_key_pem, + public_key_pem: jwt_public_key_pem, + kid: jwt_key_id, + } = generate_jwt_key().wrap_err("failed to generate JWT signing key")?; + Ok(PkiBundle { ca_cert_pem: ca_cert.pem(), ca_key_pem: ca_key.serialize_pem(), @@ -106,6 +120,9 @@ pub fn generate_pki(extra_sans: &[String]) -> Result { server_key_pem: server_key.serialize_pem(), client_cert_pem: client_cert.pem(), client_key_pem: client_key.serialize_pem(), + jwt_signing_key_pem, + jwt_public_key_pem, + jwt_key_id, }) } @@ -148,6 +165,9 @@ mod tests { assert!(bundle.server_key_pem.contains("BEGIN PRIVATE KEY")); assert!(bundle.client_cert_pem.contains("BEGIN CERTIFICATE")); assert!(bundle.client_key_pem.contains("BEGIN PRIVATE KEY")); + 
assert!(bundle.jwt_signing_key_pem.contains("BEGIN PRIVATE KEY")); + assert!(bundle.jwt_public_key_pem.contains("BEGIN PUBLIC KEY")); + assert_eq!(bundle.jwt_key_id.len(), 32, "kid is 16 bytes hex-encoded"); } #[test] diff --git a/crates/openshell-cli/src/run.rs b/crates/openshell-cli/src/run.rs index 2e3cb0531..9bd4adf5d 100644 --- a/crates/openshell-cli/src/run.rs +++ b/crates/openshell-cli/src/run.rs @@ -743,6 +743,11 @@ fn import_local_package_mtls_bundle(name: &str) -> Result> { client_key_pem: std::fs::read_to_string(&key) .into_diagnostic() .wrap_err_with(|| format!("failed to read {}", key.display()))?, + // CLI never holds the gateway's JWT signing material — only the + // gateway needs it. Fill the JWT fields with placeholders. + jwt_signing_key_pem: String::new(), + jwt_public_key_pem: String::new(), + jwt_key_id: String::new(), }; openshell_bootstrap::mtls::store_pki_bundle(name, &bundle) .wrap_err_with(|| format!("failed to store mTLS bundle for gateway '{name}'"))?; diff --git a/crates/openshell-cli/tests/ensure_providers_integration.rs b/crates/openshell-cli/tests/ensure_providers_integration.rs index fa2605ac2..a18080243 100644 --- a/crates/openshell-cli/tests/ensure_providers_integration.rs +++ b/crates/openshell-cli/tests/ensure_providers_integration.rs @@ -535,6 +535,13 @@ impl OpenShell for TestOpenShell { Err(Status::unimplemented("not implemented in test")) } + async fn issue_sandbox_token( + &self, + _request: tonic::Request, + ) -> Result, Status> { + Err(Status::unimplemented("not implemented in test")) + } + async fn connect_supervisor( &self, _request: tonic::Request>, diff --git a/crates/openshell-cli/tests/mtls_integration.rs b/crates/openshell-cli/tests/mtls_integration.rs index fd7a18b28..bb491db85 100644 --- a/crates/openshell-cli/tests/mtls_integration.rs +++ b/crates/openshell-cli/tests/mtls_integration.rs @@ -424,6 +424,13 @@ impl OpenShell for TestOpenShell { Err(Status::unimplemented("not implemented in test")) } + async fn 
issue_sandbox_token( + &self, + _request: tonic::Request, + ) -> Result, Status> { + Err(Status::unimplemented("not implemented in test")) + } + async fn connect_supervisor( &self, _request: tonic::Request>, diff --git a/crates/openshell-cli/tests/provider_commands_integration.rs b/crates/openshell-cli/tests/provider_commands_integration.rs index cb2b3cb18..a52955f56 100644 --- a/crates/openshell-cli/tests/provider_commands_integration.rs +++ b/crates/openshell-cli/tests/provider_commands_integration.rs @@ -800,6 +800,13 @@ impl OpenShell for TestOpenShell { Err(Status::unimplemented("not implemented in test")) } + async fn issue_sandbox_token( + &self, + _request: tonic::Request, + ) -> Result, Status> { + Err(Status::unimplemented("not implemented in test")) + } + async fn connect_supervisor( &self, _request: tonic::Request>, diff --git a/crates/openshell-cli/tests/sandbox_create_lifecycle_integration.rs b/crates/openshell-cli/tests/sandbox_create_lifecycle_integration.rs index 8e606beea..9101dbd26 100644 --- a/crates/openshell-cli/tests/sandbox_create_lifecycle_integration.rs +++ b/crates/openshell-cli/tests/sandbox_create_lifecycle_integration.rs @@ -604,6 +604,13 @@ impl OpenShell for TestOpenShell { Err(Status::unimplemented("not implemented in test")) } + async fn issue_sandbox_token( + &self, + _request: tonic::Request, + ) -> Result, Status> { + Err(Status::unimplemented("not implemented in test")) + } + async fn connect_supervisor( &self, _request: tonic::Request>, diff --git a/crates/openshell-cli/tests/sandbox_name_fallback_integration.rs b/crates/openshell-cli/tests/sandbox_name_fallback_integration.rs index 44393fb2f..3cc39b3bc 100644 --- a/crates/openshell-cli/tests/sandbox_name_fallback_integration.rs +++ b/crates/openshell-cli/tests/sandbox_name_fallback_integration.rs @@ -437,6 +437,13 @@ impl OpenShell for TestOpenShell { Err(Status::unimplemented("not implemented in test")) } + async fn issue_sandbox_token( + &self, + _request: tonic::Request, + 
) -> Result, Status> { + Err(Status::unimplemented("not implemented in test")) + } + async fn connect_supervisor( &self, _request: tonic::Request>, diff --git a/crates/openshell-core/src/config.rs b/crates/openshell-core/src/config.rs index dbd8dfb8a..ea284f657 100644 --- a/crates/openshell-core/src/config.rs +++ b/crates/openshell-core/src/config.rs @@ -205,6 +205,13 @@ pub struct Config { #[serde(default)] pub oidc: Option, + /// Gateway-minted sandbox JWT configuration. When `Some`, the gateway + /// loads the signing key from disk and accepts gateway-issued sandbox + /// JWTs as `Principal::Sandbox`. Required for the per-sandbox identity + /// flow (issue #1354). + #[serde(default)] + pub gateway_jwt: Option, + /// Database URL for persistence. pub database_url: String, @@ -317,6 +324,37 @@ const fn default_jwks_ttl_secs() -> u64 { 3600 } +/// Gateway-minted sandbox JWT configuration. +/// +/// Points the gateway at the Ed25519 signing key (produced by `certgen`) +/// and identifies the issuer string embedded in every minted token. The +/// signing key never leaves the gateway process; the public key is loaded +/// by the same gateway so it can validate its own tokens. +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct GatewayJwtConfig { + /// Path to the Ed25519 signing key (PKCS#8 PEM). + pub signing_key_path: PathBuf, + /// Path to the matching public key (SPKI PEM). + pub public_key_path: PathBuf, + /// Path to the `kid` value (plain text, one line). + pub kid_path: PathBuf, + /// Stable gateway identity embedded in `iss`/`aud`. Defaults to the + /// hostname-or-`openshell` placeholder if unset. + #[serde(default = "default_gateway_id")] + pub gateway_id: String, + /// Token lifetime in seconds. Defaults to 24 hours. 
+ #[serde(default = "default_sandbox_token_ttl_secs")] + pub ttl_secs: u64, +} + +fn default_gateway_id() -> String { + "openshell".to_string() +} + +const fn default_sandbox_token_ttl_secs() -> u64 { + 86_400 +} + fn default_roles_claim() -> String { "realm_access.roles".to_string() } @@ -340,6 +378,7 @@ impl Config { log_level: default_log_level(), tls, oidc: None, + gateway_jwt: None, database_url: String::new(), compute_drivers: vec![], ssh_session_ttl_secs: default_ssh_session_ttl_secs(), diff --git a/crates/openshell-core/src/lib.rs b/crates/openshell-core/src/lib.rs index d0225c471..cc82ab53f 100644 --- a/crates/openshell-core/src/lib.rs +++ b/crates/openshell-core/src/lib.rs @@ -26,7 +26,7 @@ pub mod sandbox_env; pub mod settings; pub mod time; -pub use config::{ComputeDriverKind, Config, OidcConfig, TlsConfig}; +pub use config::{ComputeDriverKind, Config, GatewayJwtConfig, OidcConfig, TlsConfig}; pub use error::{ComputeDriverError, Error, Result}; pub use metadata::{GetResourceVersion, ObjectId, ObjectLabels, ObjectName, SetResourceVersion}; diff --git a/crates/openshell-core/src/sandbox_env.rs b/crates/openshell-core/src/sandbox_env.rs index d345762ca..b367e450c 100644 --- a/crates/openshell-core/src/sandbox_env.rs +++ b/crates/openshell-core/src/sandbox_env.rs @@ -34,3 +34,22 @@ pub const TLS_CERT: &str = "OPENSHELL_TLS_CERT"; /// Path to the private key for mTLS communication with the gateway. pub const TLS_KEY: &str = "OPENSHELL_TLS_KEY"; + +/// Raw gateway-minted JWT identifying this sandbox. Mutually exclusive with +/// [`SANDBOX_TOKEN_FILE`] / [`K8S_SA_TOKEN_FILE`]; used only by test harnesses +/// that bypass the file-mount path. +pub const SANDBOX_TOKEN: &str = "OPENSHELL_SANDBOX_TOKEN"; + +/// Path to the file holding a gateway-minted sandbox JWT. +/// +/// Set by the Docker, Podman, and VM drivers, which write the token to a +/// bundle file at sandbox-create time. 
Read once at supervisor startup; +/// the token is held in process memory thereafter. +pub const SANDBOX_TOKEN_FILE: &str = "OPENSHELL_SANDBOX_TOKEN_FILE"; + +/// Path to the projected `ServiceAccount` JWT (Kubernetes driver). +/// +/// Used to bootstrap a gateway-minted JWT via `IssueSandboxToken`. Kubelet +/// writes and rotates this file; the supervisor exchanges its contents +/// for a gateway JWT at startup and on refresh. +pub const K8S_SA_TOKEN_FILE: &str = "OPENSHELL_K8S_SA_TOKEN_FILE"; diff --git a/crates/openshell-server/src/auth/k8s_sa.rs b/crates/openshell-server/src/auth/k8s_sa.rs new file mode 100644 index 000000000..a9f189e42 --- /dev/null +++ b/crates/openshell-server/src/auth/k8s_sa.rs @@ -0,0 +1,299 @@ +// SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +//! Kubernetes `ServiceAccount` bootstrap authenticator. +//! +//! Path-scoped to `IssueSandboxToken`. Validates a projected SA token +//! presented by a sandbox pod, reads the pod's `openshell.io/sandbox-id` +//! annotation, and returns a [`Principal::Sandbox`] with +//! [`SandboxIdentitySource::K8sServiceAccount`]. The `IssueSandboxToken` +//! handler then mints a gateway-signed JWT for that sandbox id; subsequent +//! gRPC calls from the supervisor use the gateway-minted JWT validated by +//! [`super::sandbox_jwt::SandboxJwtAuthenticator`]. +//! +//! This is the only authenticator that talks to the K8s apiserver. It is +//! optional — the gateway boots without it in singleplayer deployments. + +use super::authenticator::Authenticator; +use super::principal::{Principal, SandboxIdentitySource, SandboxPrincipal}; +use async_trait::async_trait; +use std::sync::Arc; +use tonic::Status; +use tracing::{debug, warn}; + +/// gRPC method path that this authenticator accepts. All other paths fall +/// through (return `Ok(None)`) so a gateway-minted JWT is required there. 
+pub const ISSUE_SANDBOX_TOKEN_PATH: &str = "/openshell.v1.OpenShell/IssueSandboxToken"; + +/// Pod annotation that binds a sandbox pod to its UUID. Set by the +/// Kubernetes compute driver at pod-create time. The gateway treats this +/// annotation as authoritative; the K8s `Role` granted to the gateway must +/// not include `patch pods` (see plan §11.8). +#[allow(dead_code)] +pub const SANDBOX_ID_ANNOTATION: &str = "openshell.io/sandbox-id"; + +/// Resolved identity extracted from a validated SA token + pod lookup. +#[derive(Debug, Clone)] +pub struct ResolvedK8sIdentity { + pub sandbox_id: String, + pub pod_name: String, + pub pod_uid: String, +} + +/// Apiserver-facing operations the authenticator depends on. Split out so +/// tests can fake the apiserver without standing up a kube cluster. +#[async_trait] +pub trait K8sIdentityResolver: Send + Sync + 'static { + /// Validate `token` via `TokenReview` (`aud == openshell-gateway`), + /// extract the pod name/uid, then `GET` the pod and read + /// `openshell.io/sandbox-id`. Returns `Ok(None)` when the token is + /// well-formed but does not authenticate (e.g. wrong audience); returns + /// `Err` for transport/server errors. + async fn resolve(&self, token: &str) -> Result, Status>; +} + +/// Authenticator wrapper around a [`K8sIdentityResolver`]. +pub struct K8sServiceAccountAuthenticator { + resolver: Arc, +} + +impl std::fmt::Debug for K8sServiceAccountAuthenticator { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + f.debug_struct("K8sServiceAccountAuthenticator") + .finish_non_exhaustive() + } +} + +impl K8sServiceAccountAuthenticator { + pub fn new(resolver: Arc) -> Self { + Self { resolver } + } +} + +#[async_trait] +impl Authenticator for K8sServiceAccountAuthenticator { + async fn authenticate( + &self, + headers: &http::HeaderMap, + path: &str, + ) -> Result, Status> { + // Scope: only the bootstrap RPC. 
Other paths fall through so the + // SandboxJwtAuthenticator (or OIDC) handles them. + if path != ISSUE_SANDBOX_TOKEN_PATH { + return Ok(None); + } + + let Some(token) = headers + .get("authorization") + .and_then(|v| v.to_str().ok()) + .and_then(|v| v.strip_prefix("Bearer ")) + else { + return Ok(None); + }; + + let Some(resolved) = self.resolver.resolve(token).await? else { + debug!("K8s SA token did not authenticate; falling through"); + return Ok(None); + }; + + if resolved.sandbox_id.is_empty() { + warn!( + pod = %resolved.pod_name, + "pod missing openshell.io/sandbox-id annotation; rejecting" + ); + return Err(Status::permission_denied( + "pod is not bound to a sandbox identity", + )); + } + + Ok(Some(Principal::Sandbox(SandboxPrincipal { + sandbox_id: resolved.sandbox_id, + source: SandboxIdentitySource::K8sServiceAccount { + pod_name: resolved.pod_name, + pod_uid: resolved.pod_uid, + }, + trust_domain: Some("openshell".to_string()), + }))) + } +} + +/// Live resolver backed by a `kube::Client`. PR 2 ships this with a +/// `not_implemented` stub so the authenticator type and trait are in place +/// for PR 3's K8s driver wiring. The `TokenReview` + pod-`GET` +/// implementation lands when the K8s driver actually creates the +/// projected SA volume (PR 3). +#[allow(dead_code)] +pub struct LiveK8sResolver { + client: kube::Client, + audience: String, + namespace: String, +} + +impl LiveK8sResolver { + #[allow(dead_code)] + pub fn new(client: kube::Client, audience: String, namespace: String) -> Self { + Self { + client, + audience, + namespace, + } + } +} + +#[async_trait] +impl K8sIdentityResolver for LiveK8sResolver { + async fn resolve(&self, _token: &str) -> Result, Status> { + // Implementation lands in PR 3 with the K8s driver wiring. + // Until then `IssueSandboxToken` is wired but only exercised via + // the test harness (see fake resolver below). 
+ Err(Status::unimplemented( + "K8s ServiceAccount bootstrap not yet enabled", + )) + } +} + +#[cfg(test)] +pub mod test_support { + use super::*; + use std::sync::Mutex; + + /// Fake resolver for unit tests. Returns the configured outcome on + /// every call and records the tokens it observed. + pub struct FakeResolver { + pub outcome: Result, Status>, + pub seen_tokens: Mutex>, + } + + impl FakeResolver { + pub fn returning(outcome: Result, Status>) -> Self { + Self { + outcome, + seen_tokens: Mutex::new(Vec::new()), + } + } + } + + #[async_trait] + impl K8sIdentityResolver for FakeResolver { + async fn resolve(&self, token: &str) -> Result, Status> { + self.seen_tokens.lock().unwrap().push(token.to_string()); + match &self.outcome { + Ok(opt) => Ok(opt.clone()), + Err(s) => Err(Status::new(s.code(), s.message())), + } + } + } +} + +#[cfg(test)] +mod tests { + use super::test_support::FakeResolver; + use super::*; + + fn bearer_headers(token: &str) -> http::HeaderMap { + let mut h = http::HeaderMap::new(); + h.insert( + "authorization", + http::HeaderValue::from_str(&format!("Bearer {token}")).unwrap(), + ); + h + } + + #[tokio::test] + async fn authenticates_on_issue_path_only() { + let resolved = ResolvedK8sIdentity { + sandbox_id: "sandbox-a".to_string(), + pod_name: "openshell-sandbox-a".to_string(), + pod_uid: "uid-a".to_string(), + }; + let fake = Arc::new(FakeResolver::returning(Ok(Some(resolved)))); + let auth = K8sServiceAccountAuthenticator::new(fake.clone()); + + let on_issue = auth + .authenticate(&bearer_headers("sa-jwt"), ISSUE_SANDBOX_TOKEN_PATH) + .await + .unwrap() + .expect("expected principal"); + match on_issue { + Principal::Sandbox(p) => { + assert_eq!(p.sandbox_id, "sandbox-a"); + assert!(matches!( + p.source, + SandboxIdentitySource::K8sServiceAccount { .. 
} + )); + } + _ => panic!("expected sandbox principal"), + } + + let off_issue = auth + .authenticate( + &bearer_headers("sa-jwt"), + "/openshell.v1.OpenShell/GetSandboxConfig", + ) + .await + .unwrap(); + assert!( + off_issue.is_none(), + "K8s SA authenticator must be scoped to IssueSandboxToken" + ); + assert_eq!( + fake.seen_tokens.lock().unwrap().len(), + 1, + "off-path call must not consult the apiserver" + ); + } + + #[tokio::test] + async fn missing_bearer_yields_none() { + let fake = Arc::new(FakeResolver::returning(Ok(None))); + let auth = K8sServiceAccountAuthenticator::new(fake); + let result = auth + .authenticate(&http::HeaderMap::new(), ISSUE_SANDBOX_TOKEN_PATH) + .await + .unwrap(); + assert!(result.is_none()); + } + + #[tokio::test] + async fn resolver_returning_none_falls_through() { + let fake = Arc::new(FakeResolver::returning(Ok(None))); + let auth = K8sServiceAccountAuthenticator::new(fake); + let result = auth + .authenticate( + &bearer_headers("not-a-real-sa-token"), + ISSUE_SANDBOX_TOKEN_PATH, + ) + .await + .unwrap(); + assert!(result.is_none(), "non-authenticating tokens fall through"); + } + + #[tokio::test] + async fn pod_without_annotation_is_rejected() { + let resolved = ResolvedK8sIdentity { + sandbox_id: String::new(), + pod_name: "stray-pod".to_string(), + pod_uid: "uid".to_string(), + }; + let fake = Arc::new(FakeResolver::returning(Ok(Some(resolved)))); + let auth = K8sServiceAccountAuthenticator::new(fake); + let err = auth + .authenticate(&bearer_headers("sa-jwt"), ISSUE_SANDBOX_TOKEN_PATH) + .await + .expect_err("unbound pod must be rejected"); + assert_eq!(err.code(), tonic::Code::PermissionDenied); + } + + #[tokio::test] + async fn resolver_error_propagates() { + let fake = Arc::new(FakeResolver::returning(Err(Status::unavailable( + "apiserver down", + )))); + let auth = K8sServiceAccountAuthenticator::new(fake); + let err = auth + .authenticate(&bearer_headers("sa-jwt"), ISSUE_SANDBOX_TOKEN_PATH) + .await + 
use std::collections::HashMap;
use std::sync::RwLock;
use std::time::{SystemTime, UNIX_EPOCH};

/// In-memory `jti` deny-list with TTL-based pruning.
///
/// Each entry maps a revoked `jti` to the Unix timestamp (milliseconds) at
/// which the underlying token expires. Nothing is persisted: the map lives
/// only inside this value.
#[derive(Debug, Default)]
pub struct RevocationSet {
    /// `jti` -> expiry in milliseconds since the Unix epoch.
    entries: RwLock<HashMap<String, i64>>,
}

impl RevocationSet {
    /// Create an empty revocation set.
    pub fn new() -> Self {
        Self::default()
    }

    /// Mark `jti` as revoked until `expires_at_ms`.
    ///
    /// Re-revoking an already tracked `jti` keeps the *later* of the two
    /// expiries, so a second call with an earlier timestamp can never make
    /// [`prune_expired`](Self::prune_expired) drop the entry sooner than the
    /// first call requested.
    ///
    /// # Panics
    ///
    /// Panics if the internal lock is poisoned.
    pub fn revoke(&self, jti: &str, expires_at_ms: i64) {
        let mut entries = self.entries.write().expect("revocation lock poisoned");
        entries
            .entry(jti.to_string())
            .and_modify(|exp| *exp = (*exp).max(expires_at_ms))
            .or_insert(expires_at_ms);
    }

    /// Returns true if `jti` is currently tracked as revoked.
    ///
    /// Entries stay visible until [`prune_expired`](Self::prune_expired)
    /// removes them, even if their expiry has already passed.
    ///
    /// # Panics
    ///
    /// Panics if the internal lock is poisoned.
    pub fn is_revoked(&self, jti: &str) -> bool {
        let entries = self.entries.read().expect("revocation lock poisoned");
        entries.contains_key(jti)
    }

    /// Drop entries whose expiry is not in the future and return how many
    /// were removed. Intended to be called periodically (or on demand from
    /// tests) to bound memory growth.
    ///
    /// # Panics
    ///
    /// Panics if the internal lock is poisoned.
    pub fn prune_expired(&self) -> usize {
        let now = now_ms();
        let mut entries = self.entries.write().expect("revocation lock poisoned");
        let before = entries.len();
        entries.retain(|_, exp| *exp > now);
        before - entries.len()
    }

    /// Number of currently tracked revocations. Test/diagnostic only.
    #[cfg(test)]
    pub fn len(&self) -> usize {
        self.entries.read().unwrap().len()
    }
}

/// Current wall-clock time in milliseconds since the Unix epoch.
///
/// Yields 0 if the clock reads earlier than the epoch and `i64::MAX` if the
/// millisecond count does not fit in an `i64`.
fn now_ms() -> i64 {
    i64::try_from(
        SystemTime::now()
            .duration_since(UNIX_EPOCH)
            .map_or(0, |d| d.as_millis()),
    )
    .unwrap_or(i64::MAX)
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn revoked_jti_is_detected() {
        let set = RevocationSet::new();
        let future = now_ms() + 60_000;
        set.revoke("abc", future);
        assert!(set.is_revoked("abc"));
        assert!(!set.is_revoked("xyz"));
    }

    #[test]
    fn prune_drops_expired_entries() {
        let set = RevocationSet::new();
        set.revoke("expired", now_ms() - 1_000);
        set.revoke("future", now_ms() + 60_000);
        let dropped = set.prune_expired();
        assert_eq!(dropped, 1);
        assert!(!set.is_revoked("expired"));
        assert!(set.is_revoked("future"));
    }

    #[test]
    fn re_revoking_keeps_single_entry_with_latest_expiry() {
        let set = RevocationSet::new();
        set.revoke("dup", now_ms() + 99_000);
        // An earlier expiry must not shorten the deny window.
        set.revoke("dup", now_ms() - 1_000);
        assert_eq!(set.len(), 1);
        assert_eq!(set.prune_expired(), 0);
        assert!(set.is_revoked("dup"));
    }
}
--- /dev/null +++ b/crates/openshell-server/src/auth/sandbox_jwt.rs @@ -0,0 +1,397 @@ +// SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +//! Gateway-minted per-sandbox JWTs. +//! +//! The gateway signs an Ed25519 JWT for each sandbox at create time and +//! the sandbox supervisor presents it as `Authorization: Bearer ` on +//! every gRPC call (PR 3). This module implements both sides of the +//! gateway-controlled token: +//! - [`SandboxJwtIssuer`] mints fresh tokens (called from +//! `handle_create_sandbox` and the `IssueSandboxToken` RPC). +//! - [`SandboxJwtAuthenticator`] validates tokens on inbound requests and +//! produces a [`Principal::Sandbox`] with [`SandboxIdentitySource::BootstrapJwt`]. +//! +//! Algorithm: `EdDSA` (Ed25519). Pinned via `Validation::algorithms` to +//! prevent algorithm-confusion attacks. + +use super::authenticator::Authenticator; +use super::principal::{Principal, SandboxIdentitySource, SandboxPrincipal}; +use super::revocation::RevocationSet; +use async_trait::async_trait; +use jsonwebtoken::{ + Algorithm, DecodingKey, EncodingKey, Header, Validation, decode, decode_header, encode, +}; +use serde::{Deserialize, Serialize}; +use std::sync::Arc; +use std::time::{Duration, SystemTime, UNIX_EPOCH}; +use tonic::Status; +use tracing::{debug, warn}; +use uuid::Uuid; + +/// SPIFFE-shaped subject prefix. Embedded in the `sub` claim of every +/// minted token so a future migration to per-sandbox certs or SPIRE can +/// reuse the same subject namespace without breaking handler equality +/// checks. +const SPIFFE_SUBJECT_PREFIX: &str = "spiffe://openshell/sandbox/"; + +/// JWT claim set serialized in every gateway-minted sandbox token. +#[derive(Debug, Serialize, Deserialize)] +pub struct SandboxJwtClaims { + /// `spiffe://openshell/sandbox/`. SPIFFE-shaped for forward + /// compatibility with channel-bound identity (per-sandbox cert / SPIRE). 
+ pub sub: String, + /// Gateway identity (`openshell-gateway:`). Both `iss` and + /// `aud` use the same value so any future replicas of the same + /// deployment validate each others' tokens without configuration. + pub iss: String, + pub aud: String, + pub iat: i64, + pub exp: i64, + pub jti: String, + /// Canonical sandbox UUID, denormalized from `sub` for cheap parsing + /// without a SPIFFE library. + pub sandbox_id: String, +} + +/// Mints fresh sandbox JWTs. +pub struct SandboxJwtIssuer { + encoding_key: EncodingKey, + kid: String, + issuer: String, + audience: String, + ttl: Duration, +} + +impl std::fmt::Debug for SandboxJwtIssuer { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + f.debug_struct("SandboxJwtIssuer") + .field("kid", &self.kid) + .field("issuer", &self.issuer) + .field("audience", &self.audience) + .field("ttl", &self.ttl) + .finish_non_exhaustive() + } +} + +/// Outcome of a successful mint — caller persists the `jti` so the same +/// token can be revoked on `DeleteSandbox` / refresh. +#[derive(Debug, Clone)] +pub struct MintedToken { + pub token: String, + pub jti: String, + pub expires_at_ms: i64, +} + +impl SandboxJwtIssuer { + pub fn from_pem( + signing_key_pem: &[u8], + kid: String, + gateway_id: &str, + ttl: Duration, + ) -> Result { + let encoding_key = EncodingKey::from_ed_pem(signing_key_pem) + .map_err(|e| format!("failed to parse Ed25519 signing key PEM: {e}"))?; + let identity = format!("openshell-gateway:{gateway_id}"); + Ok(Self { + encoding_key, + kid, + issuer: identity.clone(), + audience: identity, + ttl, + }) + } + + /// Mint a fresh token for `sandbox_id`. The caller MUST track the + /// returned `jti` (in the `RevocationSet`'s mint-time index if we ever + /// need to revoke the most-recent token for a given sandbox). 
+ #[allow(clippy::result_large_err)] // `tonic::Status` is the natural error here + pub fn mint(&self, sandbox_id: &str) -> Result { + let now = now_secs(); + let exp = now + i64::try_from(self.ttl.as_secs()).unwrap_or(86_400); + let jti = Uuid::new_v4().to_string(); + let claims = SandboxJwtClaims { + sub: format!("{SPIFFE_SUBJECT_PREFIX}{sandbox_id}"), + iss: self.issuer.clone(), + aud: self.audience.clone(), + iat: now, + exp, + jti: jti.clone(), + sandbox_id: sandbox_id.to_string(), + }; + let mut header = Header::new(Algorithm::EdDSA); + header.kid = Some(self.kid.clone()); + let token = encode(&header, &claims, &self.encoding_key).map_err(|e| { + warn!(error = %e, "failed to mint sandbox JWT"); + Status::internal("failed to mint sandbox token") + })?; + Ok(MintedToken { + token, + jti, + expires_at_ms: exp.saturating_mul(1000), + }) + } + + pub fn ttl(&self) -> Duration { + self.ttl + } +} + +/// Authenticator that validates gateway-minted sandbox JWTs. +pub struct SandboxJwtAuthenticator { + decoding_key: DecodingKey, + kid: String, + issuer: String, + audience: String, + revocation: Arc, +} + +impl std::fmt::Debug for SandboxJwtAuthenticator { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + f.debug_struct("SandboxJwtAuthenticator") + .field("kid", &self.kid) + .field("issuer", &self.issuer) + .field("audience", &self.audience) + .finish_non_exhaustive() + } +} + +impl SandboxJwtAuthenticator { + pub fn from_pem( + public_key_pem: &[u8], + kid: String, + gateway_id: &str, + revocation: Arc, + ) -> Result { + let decoding_key = DecodingKey::from_ed_pem(public_key_pem) + .map_err(|e| format!("failed to parse Ed25519 public key PEM: {e}"))?; + let identity = format!("openshell-gateway:{gateway_id}"); + Ok(Self { + decoding_key, + kid, + issuer: identity.clone(), + audience: identity, + revocation, + }) + } + + #[allow(clippy::result_large_err)] + fn validate_bearer(&self, token: &str) -> Result, Status> { + let header = 
decode_header(token).map_err(|e| { + debug!(error = %e, "sandbox JWT header decode failed"); + Status::unauthenticated("invalid token") + })?; + + // Fall through to other authenticators when the kid does not match — + // OIDC issuers may share the Bearer slot. + if header.kid.as_deref() != Some(self.kid.as_str()) { + return Ok(None); + } + if !matches!(header.alg, Algorithm::EdDSA) { + return Ok(None); + } + + let mut validation = Validation::new(Algorithm::EdDSA); + validation.algorithms = vec![Algorithm::EdDSA]; + validation.set_issuer(&[&self.issuer]); + validation.set_audience(&[&self.audience]); + validation.set_required_spec_claims(&["iss", "aud", "exp", "sub"]); + + let data = + decode::(token, &self.decoding_key, &validation).map_err(|e| { + debug!(error = %e, "sandbox JWT validation failed"); + Status::unauthenticated(format!("invalid token: {e}")) + })?; + + let claims = data.claims; + if self.revocation.is_revoked(&claims.jti) { + debug!(jti = %claims.jti, "sandbox JWT rejected: jti revoked"); + return Err(Status::unauthenticated("revoked token")); + } + + Ok(Some(Principal::Sandbox(SandboxPrincipal { + sandbox_id: claims.sandbox_id, + source: SandboxIdentitySource::BootstrapJwt { + issuer: claims.iss, + jti: claims.jti, + }, + trust_domain: Some("openshell".to_string()), + }))) + } +} + +#[async_trait] +impl Authenticator for SandboxJwtAuthenticator { + async fn authenticate( + &self, + headers: &http::HeaderMap, + _path: &str, + ) -> Result, Status> { + let Some(token) = headers + .get("authorization") + .and_then(|v| v.to_str().ok()) + .and_then(|v| v.strip_prefix("Bearer ")) + else { + return Ok(None); + }; + self.validate_bearer(token) + } +} + +fn now_secs() -> i64 { + i64::try_from( + SystemTime::now() + .duration_since(UNIX_EPOCH) + .map_or(0, |d| d.as_secs()), + ) + .unwrap_or(i64::MAX) +} + +#[cfg(test)] +mod tests { + use super::*; + use openshell_bootstrap::jwt::generate_jwt_key; + + fn header_map_with_bearer(token: &str) -> http::HeaderMap 
{ + let mut h = http::HeaderMap::new(); + h.insert( + "authorization", + http::HeaderValue::from_str(&format!("Bearer {token}")).unwrap(), + ); + h + } + + fn pair() -> ( + SandboxJwtIssuer, + SandboxJwtAuthenticator, + Arc, + ) { + let mat = generate_jwt_key().expect("jwt key"); + let revocation = Arc::new(RevocationSet::new()); + let issuer = SandboxJwtIssuer::from_pem( + mat.signing_key_pem.as_bytes(), + mat.kid.clone(), + "test-gateway", + Duration::from_secs(3600), + ) + .unwrap(); + let auth = SandboxJwtAuthenticator::from_pem( + mat.public_key_pem.as_bytes(), + mat.kid, + "test-gateway", + revocation.clone(), + ) + .unwrap(); + (issuer, auth, revocation) + } + + #[tokio::test] + async fn mint_and_validate_round_trip() { + let (issuer, auth, _) = pair(); + let minted = issuer.mint("sandbox-a").unwrap(); + let principal = auth + .authenticate(&header_map_with_bearer(&minted.token), "/anything") + .await + .unwrap() + .expect("expected principal"); + match principal { + Principal::Sandbox(p) => { + assert_eq!(p.sandbox_id, "sandbox-a"); + match p.source { + SandboxIdentitySource::BootstrapJwt { issuer: iss, jti } => { + assert_eq!(iss, "openshell-gateway:test-gateway"); + assert_eq!(jti, minted.jti); + } + other => panic!("unexpected source: {other:?}"), + } + } + _ => panic!("expected Sandbox principal"), + } + } + + #[tokio::test] + async fn revoked_jti_is_rejected() { + let (issuer, auth, revocation) = pair(); + let minted = issuer.mint("sandbox-a").unwrap(); + revocation.revoke(&minted.jti, minted.expires_at_ms); + let err = auth + .authenticate(&header_map_with_bearer(&minted.token), "/anything") + .await + .expect_err("revoked must reject"); + assert_eq!(err.code(), tonic::Code::Unauthenticated); + } + + #[tokio::test] + async fn token_signed_by_other_key_is_rejected() { + let (_, auth_a, _) = pair(); + let (issuer_b, _, _) = pair(); // different keypair + let minted = issuer_b.mint("sandbox-b").unwrap(); + // The token has a different `kid` than auth_a 
expects, so the + // authenticator yields None (lets the chain fall through). That is + // the documented behavior for cross-issuer Bearer headers. + let result = auth_a + .authenticate(&header_map_with_bearer(&minted.token), "/anything") + .await + .unwrap(); + assert!(result.is_none(), "different kid must fall through"); + } + + #[tokio::test] + async fn missing_bearer_yields_none() { + let (_, auth, _) = pair(); + let result = auth + .authenticate(&http::HeaderMap::new(), "/anything") + .await + .unwrap(); + assert!(result.is_none()); + } + + #[tokio::test] + async fn malformed_token_is_rejected() { + let (_, auth, _) = pair(); + let err = auth + .authenticate(&header_map_with_bearer("not.a.jwt"), "/anything") + .await + .expect_err("malformed must reject"); + assert_eq!(err.code(), tonic::Code::Unauthenticated); + } + + #[tokio::test] + async fn expired_token_is_rejected() { + // Mint a token whose iat is far in the past so its TTL window is + // already closed by `now`. We sign the JWT directly with the same + // signing key to bypass the issuer's TTL-vs-now coupling. 
+ let mat = generate_jwt_key().unwrap(); + let issuer = SandboxJwtIssuer::from_pem( + mat.signing_key_pem.as_bytes(), + mat.kid.clone(), + "g", + Duration::from_secs(3600), + ) + .unwrap(); + let auth = SandboxJwtAuthenticator::from_pem( + mat.public_key_pem.as_bytes(), + mat.kid.clone(), + "g", + Arc::new(RevocationSet::new()), + ) + .unwrap(); + let claims = SandboxJwtClaims { + sub: format!("{SPIFFE_SUBJECT_PREFIX}sandbox-c"), + iss: "openshell-gateway:g".to_string(), + aud: "openshell-gateway:g".to_string(), + iat: now_secs() - 7200, + exp: now_secs() - 3600, + jti: Uuid::new_v4().to_string(), + sandbox_id: "sandbox-c".to_string(), + }; + let mut header = Header::new(Algorithm::EdDSA); + header.kid = Some(mat.kid); + let token = encode(&header, &claims, &issuer.encoding_key).unwrap(); + let err = auth + .authenticate(&header_map_with_bearer(&token), "/anything") + .await + .expect_err("expired token must reject"); + assert_eq!(err.code(), tonic::Code::Unauthenticated); + } +} diff --git a/crates/openshell-server/src/certgen.rs b/crates/openshell-server/src/certgen.rs index 683170aad..f188968fb 100644 --- a/crates/openshell-server/src/certgen.rs +++ b/crates/openshell-server/src/certgen.rs @@ -52,6 +52,12 @@ pub struct CertgenArgs { #[arg(long, required_unless_present = "output_dir")] client_secret_name: Option, + /// Name of the sandbox-JWT signing-key Secret (`Opaque`) to create. + /// Holds `signing.pem`, `public.pem`, and `kid` keys. Mounted on the + /// gateway pod (only) so it can mint and validate per-sandbox JWTs. + #[arg(long, required_unless_present = "output_dir")] + jwt_secret_name: Option, + /// Extra Subject Alternative Name for the server certificate. Repeatable. /// Auto-detected as an IP address or DNS name. 
#[arg(long = "server-san", value_name = "SAN")] @@ -93,10 +99,10 @@ enum K8sAction { Create, } -fn decide_k8s(server_exists: bool, client_exists: bool) -> K8sAction { - match (server_exists, client_exists) { - (true, true) => K8sAction::SkipExists, - (false, false) => K8sAction::Create, +fn decide_k8s(server_exists: bool, client_exists: bool, jwt_exists: bool) -> K8sAction { + match (server_exists, client_exists, jwt_exists) { + (true, true, true) => K8sAction::SkipExists, + (false, false, false) => K8sAction::Create, _ => K8sAction::PartialState, } } @@ -114,6 +120,10 @@ async fn run_kubernetes(args: &CertgenArgs, bundle: &PkiBundle) -> Result<()> { .client_secret_name .as_deref() .ok_or_else(|| miette::miette!("--client-secret-name is required"))?; + let jwt_name = args + .jwt_secret_name + .as_deref() + .ok_or_else(|| miette::miette!("--jwt-secret-name is required"))?; let client = Client::try_default() .await @@ -133,22 +143,29 @@ async fn run_kubernetes(args: &CertgenArgs, bundle: &PkiBundle) -> Result<()> { .into_diagnostic() .wrap_err_with(|| format!("failed to read secret {client_name}"))? .is_some(); + let jwt_exists = api + .get_opt(jwt_name) + .await + .into_diagnostic() + .wrap_err_with(|| format!("failed to read secret {jwt_name}"))? + .is_some(); - match decide_k8s(server_exists, client_exists) { + match decide_k8s(server_exists, client_exists, jwt_exists) { K8sAction::SkipExists => { info!( namespace = %namespace, server = %server_name, client = %client_name, + jwt = %jwt_name, "PKI secrets already exist, skipping." ); return Ok(()); } K8sAction::PartialState => { return Err(miette::miette!( - "partial PKI state in namespace {namespace}: exactly one of \ - {server_name} / {client_name} exists. Recover with: \ - kubectl delete secret -n {namespace} {server_name} {client_name}", + "partial PKI state in namespace {namespace}: only some of \ + {server_name} / {client_name} / {jwt_name} exist. 
Recover with: \ + kubectl delete secret -n {namespace} {server_name} {client_name} {jwt_name}", )); } K8sAction::Create => {} @@ -166,6 +183,12 @@ async fn run_kubernetes(args: &CertgenArgs, bundle: &PkiBundle) -> Result<()> { &bundle.client_key_pem, &bundle.ca_cert_pem, ); + let jwt_secret = jwt_signing_secret( + jwt_name, + &bundle.jwt_signing_key_pem, + &bundle.jwt_public_key_pem, + &bundle.jwt_key_id, + ); api.create(&PostParams::default(), &server_secret) .await @@ -175,11 +198,16 @@ async fn run_kubernetes(args: &CertgenArgs, bundle: &PkiBundle) -> Result<()> { .await .into_diagnostic() .wrap_err_with(|| format!("failed to create secret {client_name}"))?; + api.create(&PostParams::default(), &jwt_secret) + .await + .into_diagnostic() + .wrap_err_with(|| format!("failed to create secret {jwt_name}"))?; info!( namespace = %namespace, server = %server_name, client = %client_name, + jwt = %jwt_name, "PKI secrets created." ); Ok(()) @@ -207,6 +235,31 @@ fn tls_secret(name: &str, crt_pem: &str, key_pem: &str, ca_pem: &str) -> Secret } } +/// Build an `Opaque` Secret carrying the gateway-minted sandbox JWT +/// signing material. Mounted only on the gateway pod — sandbox pods +/// receive a per-pod gateway-signed token, never the signing key itself. 
+fn jwt_signing_secret(name: &str, signing_pem: &str, public_pem: &str, kid: &str) -> Secret { + let mut data = BTreeMap::new(); + data.insert( + "signing.pem".to_string(), + ByteString(signing_pem.as_bytes().to_vec()), + ); + data.insert( + "public.pem".to_string(), + ByteString(public_pem.as_bytes().to_vec()), + ); + data.insert("kid".to_string(), ByteString(kid.as_bytes().to_vec())); + Secret { + metadata: ObjectMeta { + name: Some(name.to_string()), + ..Default::default() + }, + type_: Some("Opaque".to_string()), + data: Some(data), + ..Default::default() + } +} + // ─────────────────────────────── Local mode ─────────────────────────────── #[derive(Debug, PartialEq, Eq)] @@ -235,12 +288,17 @@ struct LocalPaths { client_dir: PathBuf, client_crt: PathBuf, client_key: PathBuf, + jwt_dir: PathBuf, + jwt_signing: PathBuf, + jwt_public: PathBuf, + jwt_kid: PathBuf, } impl LocalPaths { fn resolve(dir: &Path) -> Self { let server_dir = dir.join("server"); let client_dir = dir.join("client"); + let jwt_dir = dir.join("jwt"); Self { ca_crt: dir.join("ca.crt"), ca_key: dir.join("ca.key"), @@ -250,10 +308,14 @@ impl LocalPaths { client_crt: client_dir.join("tls.crt"), client_key: client_dir.join("tls.key"), client_dir, + jwt_signing: jwt_dir.join("signing.pem"), + jwt_public: jwt_dir.join("public.pem"), + jwt_kid: jwt_dir.join("kid"), + jwt_dir, } } - fn all_files(&self) -> [&Path; 6] { + fn all_files(&self) -> [&Path; 9] { [ &self.ca_crt, &self.ca_key, @@ -261,6 +323,9 @@ impl LocalPaths { &self.server_key, &self.client_crt, &self.client_key, + &self.jwt_signing, + &self.jwt_public, + &self.jwt_kid, ] } @@ -271,7 +336,7 @@ impl LocalPaths { fn decide_local(present: usize) -> LocalAction { match present { - 6 => LocalAction::Skip, + 9 => LocalAction::Skip, 0 => LocalAction::Create, _ => LocalAction::PartialState, } @@ -318,6 +383,9 @@ fn read_local_bundle(paths: &LocalPaths) -> Result { server_key_pem: read_pem(&paths.server_key)?, client_cert_pem: 
read_pem(&paths.client_crt)?, client_key_pem: read_pem(&paths.client_key)?, + jwt_signing_key_pem: read_pem(&paths.jwt_signing)?, + jwt_public_key_pem: read_pem(&paths.jwt_public)?, + jwt_key_id: read_pem(&paths.jwt_kid)?.trim().to_string(), }) } @@ -339,9 +407,11 @@ fn write_local_bundle(dir: &Path, bundle: &PkiBundle, paths: &LocalPaths) -> Res let temp_server = temp.join("server"); let temp_client = temp.join("client"); + let temp_jwt = temp.join("jwt"); create_dir_restricted(&temp)?; create_dir_restricted(&temp_server)?; create_dir_restricted(&temp_client)?; + create_dir_restricted(&temp_jwt)?; write_pem(&temp.join("ca.crt"), &bundle.ca_cert_pem, false)?; write_pem(&temp.join("ca.key"), &bundle.ca_key_pem, true)?; @@ -349,19 +419,34 @@ fn write_local_bundle(dir: &Path, bundle: &PkiBundle, paths: &LocalPaths) -> Res write_pem(&temp_server.join("tls.key"), &bundle.server_key_pem, true)?; write_pem(&temp_client.join("tls.crt"), &bundle.client_cert_pem, false)?; write_pem(&temp_client.join("tls.key"), &bundle.client_key_pem, true)?; + write_pem( + &temp_jwt.join("signing.pem"), + &bundle.jwt_signing_key_pem, + true, + )?; + write_pem( + &temp_jwt.join("public.pem"), + &bundle.jwt_public_key_pem, + false, + )?; + write_pem(&temp_jwt.join("kid"), &bundle.jwt_key_id, false)?; // Final destination (might not exist yet on first run). 
// ── Kubernetes-mode decision ──

/// All three secrets present: nothing to do.
#[test]
fn decide_k8s_skip_when_all_three_exist() {
    let action = decide_k8s(true, true, true);
    assert_eq!(action, K8sAction::SkipExists);
}

/// No secret present: create everything.
#[test]
fn decide_k8s_create_when_none_exist() {
    let action = decide_k8s(false, false, false);
    assert_eq!(action, K8sAction::Create);
}

/// Every combination other than all-or-nothing is partial state.
#[test]
fn decide_k8s_partial_for_any_mixed_state() {
    // Bit patterns 0b001..=0b110 enumerate exactly the six mixed states.
    for bits in 1u8..7 {
        let s = bits & 0b100 != 0;
        let c = bits & 0b010 != 0;
        let j = bits & 0b001 != 0;
        assert_eq!(
            decide_k8s(s, c, j),
            K8sAction::PartialState,
            "({s},{c},{j})"
        );
    }
}

/// The JWT secret is `Opaque` and carries exactly the three expected keys.
#[test]
fn jwt_signing_secret_has_opaque_type_and_three_keys() {
    let secret = jwt_signing_secret("jwt", "SIGN", "PUB", "kid-1");
    assert_eq!(secret.metadata.name.as_deref(), Some("jwt"));
    assert_eq!(secret.type_.as_deref(), Some("Opaque"));

    let entries = secret.data.expect("data set");
    assert_eq!(entries.len(), 3);
    assert_eq!(entries["signing.pem"].0, b"SIGN");
    assert_eq!(entries["public.pem"].0, b"PUB");
    assert_eq!(entries["kid"].0, b"kid-1");
}

// ── Local-mode decision ──

/// All nine files present: skip.
#[test]
fn decide_local_skip_when_all_nine_present() {
    let action = decide_local(9);
    assert_eq!(action, LocalAction::Skip);
}

/// Any count strictly between zero and nine is partial state.
#[test]
fn decide_local_partial_for_any_count_in_between() {
    (1..=8).for_each(|n| {
        assert_eq!(decide_local(n), LocalAction::PartialState, "n = {n}");
    });
}
+ if let Some(jwt) = file + .as_ref() + .and_then(|f| f.openshell.gateway.gateway_jwt.clone()) + { + config.gateway_jwt = Some(jwt); + } + let vm_config = build_vm_config( file.as_ref(), local_tls.as_ref(), @@ -832,6 +844,8 @@ mod tests { "openshell-server-tls", "--client-secret-name", "openshell-client-tls", + "--jwt-secret-name", + "openshell-jwt-keys", "--server-san", "openshell.example.com", "--server-san", diff --git a/crates/openshell-server/src/config_file.rs b/crates/openshell-server/src/config_file.rs index db0dcd684..96a7af90d 100644 --- a/crates/openshell-server/src/config_file.rs +++ b/crates/openshell-server/src/config_file.rs @@ -25,7 +25,7 @@ use std::net::SocketAddr; use std::path::{Path, PathBuf}; use openshell_core::config::ComputeDriverKind; -use openshell_core::{OidcConfig, TlsConfig}; +use openshell_core::{GatewayJwtConfig, OidcConfig, TlsConfig}; use serde::{Deserialize, Serialize}; /// Latest schema version this build understands. @@ -133,6 +133,8 @@ pub struct GatewayFileSection { pub tls: Option, #[serde(default)] pub oidc: Option, + #[serde(default)] + pub gateway_jwt: Option, // ── Disallowed-in-file fields ──────────────────────────────────────── // diff --git a/crates/openshell-server/src/grpc/auth_rpc.rs b/crates/openshell-server/src/grpc/auth_rpc.rs new file mode 100644 index 000000000..eb233a215 --- /dev/null +++ b/crates/openshell-server/src/grpc/auth_rpc.rs @@ -0,0 +1,68 @@ +// SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +//! Authentication-related RPC handlers. +//! +//! Currently hosts the `IssueSandboxToken` exchange used by the Kubernetes +//! driver to convert a projected `ServiceAccount` token into a +//! gateway-minted JWT bound to a specific sandbox. 
+ +use crate::ServerState; +use crate::auth::principal::{Principal, SandboxIdentitySource}; +use openshell_core::proto::{IssueSandboxTokenRequest, IssueSandboxTokenResponse}; +use std::sync::Arc; +use tonic::{Request, Response, Status}; +use tracing::{debug, info, warn}; + +#[allow(clippy::result_large_err, clippy::unused_async)] +pub async fn handle_issue_sandbox_token( + state: &Arc, + request: Request, +) -> Result, Status> { + let principal = request + .extensions() + .get::() + .cloned() + .ok_or_else(|| Status::unauthenticated("missing principal"))?; + + let Principal::Sandbox(sandbox) = principal else { + return Err(Status::permission_denied( + "IssueSandboxToken requires a sandbox principal", + )); + }; + + // Only the bootstrap K8s ServiceAccount path can mint a fresh + // gateway JWT — gateway-issued JWTs already exist and refreshing them + // is a future capability (PR 5). Reject re-exchange attempts. + if !matches!( + sandbox.source, + SandboxIdentitySource::K8sServiceAccount { .. 
} + ) { + debug!( + sandbox_id = %sandbox.sandbox_id, + "IssueSandboxToken rejected: non-bootstrap principal source" + ); + return Err(Status::permission_denied( + "this principal cannot mint a sandbox token", + )); + } + + let issuer = state.sandbox_jwt_issuer.as_ref().ok_or_else(|| { + warn!( + sandbox_id = %sandbox.sandbox_id, + "IssueSandboxToken called but sandbox JWT issuer is not configured" + ); + Status::unavailable("sandbox JWT minting is not configured on this gateway") + })?; + + let minted = issuer.mint(&sandbox.sandbox_id)?; + info!( + sandbox_id = %sandbox.sandbox_id, + jti = %minted.jti, + "issued gateway sandbox JWT" + ); + Ok(Response::new(IssueSandboxTokenResponse { + token: minted.token, + expires_at_ms: minted.expires_at_ms, + })) +} diff --git a/crates/openshell-server/src/grpc/mod.rs b/crates/openshell-server/src/grpc/mod.rs index 53233a46d..ff3fc55f3 100644 --- a/crates/openshell-server/src/grpc/mod.rs +++ b/crates/openshell-server/src/grpc/mod.rs @@ -3,6 +3,7 @@ //! gRPC service implementation. 
+mod auth_rpc; pub mod policy; pub mod provider; mod sandbox; @@ -27,15 +28,16 @@ use openshell_core::proto::{ GetSandboxLogsResponse, GetSandboxPolicyStatusRequest, GetSandboxPolicyStatusResponse, GetSandboxProviderEnvironmentRequest, GetSandboxProviderEnvironmentResponse, GetSandboxRequest, GetServiceRequest, HealthRequest, HealthResponse, ImportProviderProfilesRequest, - ImportProviderProfilesResponse, LintProviderProfilesRequest, LintProviderProfilesResponse, - ListProviderProfilesRequest, ListProviderProfilesResponse, ListProvidersRequest, - ListProvidersResponse, ListSandboxPoliciesRequest, ListSandboxPoliciesResponse, - ListSandboxProvidersRequest, ListSandboxProvidersResponse, ListSandboxesRequest, - ListSandboxesResponse, ListServicesRequest, ListServicesResponse, ProviderProfileResponse, - ProviderResponse, PushSandboxLogsRequest, PushSandboxLogsResponse, RejectDraftChunkRequest, - RejectDraftChunkResponse, RelayFrame, ReportPolicyStatusRequest, ReportPolicyStatusResponse, - RevokeSshSessionRequest, RevokeSshSessionResponse, RotateProviderCredentialRequest, - RotateProviderCredentialResponse, SandboxResponse, SandboxStreamEvent, ServiceEndpointResponse, + ImportProviderProfilesResponse, IssueSandboxTokenRequest, IssueSandboxTokenResponse, + LintProviderProfilesRequest, LintProviderProfilesResponse, ListProviderProfilesRequest, + ListProviderProfilesResponse, ListProvidersRequest, ListProvidersResponse, + ListSandboxPoliciesRequest, ListSandboxPoliciesResponse, ListSandboxProvidersRequest, + ListSandboxProvidersResponse, ListSandboxesRequest, ListSandboxesResponse, ListServicesRequest, + ListServicesResponse, ProviderProfileResponse, ProviderResponse, PushSandboxLogsRequest, + PushSandboxLogsResponse, RejectDraftChunkRequest, RejectDraftChunkResponse, RelayFrame, + ReportPolicyStatusRequest, ReportPolicyStatusResponse, RevokeSshSessionRequest, + RevokeSshSessionResponse, RotateProviderCredentialRequest, RotateProviderCredentialResponse, + SandboxResponse, 
SandboxStreamEvent, ServiceEndpointResponse, ServiceStatus, SubmitPolicyAnalysisRequest, SubmitPolicyAnalysisResponse, SupervisorMessage, TcpForwardFrame, UndoDraftChunkRequest, UndoDraftChunkResponse, UpdateConfigRequest, UpdateConfigResponse, UpdateProviderRequest, WatchSandboxRequest, open_shell_server::OpenShell, @@ -567,6 +569,15 @@ impl OpenShell for OpenShellService { policy::handle_get_draft_history(&self.state, request).await } + // --- Sandbox identity --- + + async fn issue_sandbox_token( + &self, + request: Request, + ) -> Result, Status> { + auth_rpc::handle_issue_sandbox_token(&self.state, request).await + } + // --- Supervisor session --- type ConnectSupervisorStream = diff --git a/crates/openshell-server/src/grpc/sandbox.rs b/crates/openshell-server/src/grpc/sandbox.rs index 530f2332a..035f68471 100644 --- a/crates/openshell-server/src/grpc/sandbox.rs +++ b/crates/openshell-server/src/grpc/sandbox.rs @@ -99,6 +99,12 @@ pub(super) async fn handle_create_sandbox( } let id = uuid::Uuid::new_v4().to_string(); + // PR 3 wires `state.sandbox_jwt_issuer.mint(&id)` here for singleplayer + // drivers (Docker / Podman / VM), passing the minted token through the + // driver call so it lands in the sandbox bundle. K8s sandboxes skip + // this mint and exchange a projected ServiceAccount token via + // `IssueSandboxToken` at supervisor startup. + let name = if request.name.is_empty() { petname::petname(2, "-").unwrap_or_else(generate_name) } else { diff --git a/crates/openshell-server/src/lib.rs b/crates/openshell-server/src/lib.rs index de7cd4b6c..e62ebc141 100644 --- a/crates/openshell-server/src/lib.rs +++ b/crates/openshell-server/src/lib.rs @@ -105,6 +105,26 @@ pub struct ServerState { /// OIDC JWKS cache for JWT validation. `None` when OIDC is not configured. pub oidc_cache: Option>, + + /// Gateway-minted sandbox JWT issuer. `None` when `config.gateway_jwt` + /// is not configured; in that mode `IssueSandboxToken` returns + /// `Status::unavailable`. 
Populated at startup from the on-disk key + /// material that `certgen` writes. + pub sandbox_jwt_issuer: Option>, + + /// Authenticator that validates gateway-minted sandbox JWTs on every + /// inbound request. Always set when `sandbox_jwt_issuer` is, so callers + /// presenting a freshly minted token are recognized. + pub sandbox_jwt_authenticator: Option>, + + /// Optional K8s `ServiceAccount` authenticator that backs the + /// `IssueSandboxToken` bootstrap path. Only present when the gateway + /// runs in-cluster. + pub k8s_sa_authenticator: Option>, + + /// In-memory revocation set for gateway-minted sandbox JWTs. + /// Populated by `DeleteSandbox` and (in PR 5) `RefreshSandboxToken`. + pub sandbox_jwt_revocation: Arc, } fn is_benign_tls_handshake_failure(error: &std::io::Error) -> bool { @@ -149,6 +169,10 @@ impl ServerState { settings_mutex: tokio::sync::Mutex::new(()), supervisor_sessions, oidc_cache, + sandbox_jwt_issuer: None, + sandbox_jwt_authenticator: None, + k8s_sa_authenticator: None, + sandbox_jwt_revocation: Arc::new(auth::revocation::RevocationSet::new()), } } } @@ -206,7 +230,7 @@ pub async fn run_server( supervisor_sessions.clone(), ) .await?; - let state = Arc::new(ServerState::new( + let mut state = ServerState::new( config.clone(), store.clone(), compute, @@ -215,7 +239,65 @@ pub async fn run_server( tracing_log_bus, supervisor_sessions, oidc_cache, - )); + ); + + // Load the gateway-minted sandbox JWT signing key when configured. + // Optional in PR 2 so single-driver dev deployments without certgen + // continue to start. The helm-deployed gateway and the RPM init script + // populate `gateway_jwt` once `certgen` has produced the on-disk + // material. 
+ if let Some(ref jwt) = config.gateway_jwt { + let signing_pem = std::fs::read(&jwt.signing_key_path).map_err(|e| { + Error::config(format!( + "failed to read sandbox JWT signing key from {}: {e}", + jwt.signing_key_path.display() + )) + })?; + let public_pem = std::fs::read(&jwt.public_key_path).map_err(|e| { + Error::config(format!( + "failed to read sandbox JWT public key from {}: {e}", + jwt.public_key_path.display() + )) + })?; + let kid = std::fs::read_to_string(&jwt.kid_path) + .map_err(|e| { + Error::config(format!( + "failed to read sandbox JWT kid from {}: {e}", + jwt.kid_path.display() + )) + })? + .trim() + .to_string(); + if kid.is_empty() { + return Err(Error::config(format!( + "sandbox JWT kid file {} is empty", + jwt.kid_path.display() + ))); + } + let issuer = auth::sandbox_jwt::SandboxJwtIssuer::from_pem( + &signing_pem, + kid.clone(), + &jwt.gateway_id, + Duration::from_secs(jwt.ttl_secs), + ) + .map_err(Error::config)?; + let authenticator = auth::sandbox_jwt::SandboxJwtAuthenticator::from_pem( + &public_pem, + kid, + &jwt.gateway_id, + state.sandbox_jwt_revocation.clone(), + ) + .map_err(Error::config)?; + info!( + gateway_id = %jwt.gateway_id, + ttl_secs = jwt.ttl_secs, + "gateway-minted sandbox JWT enabled" + ); + state.sandbox_jwt_issuer = Some(Arc::new(issuer)); + state.sandbox_jwt_authenticator = Some(Arc::new(authenticator)); + } + + let state = Arc::new(state); // Resume sandboxes that were stopped during the previous gateway // shutdown so the running compute state matches the persisted store. 
diff --git a/crates/openshell-server/src/multiplex.rs b/crates/openshell-server/src/multiplex.rs index e8aa0dfb4..19346ba75 100644 --- a/crates/openshell-server/src/multiplex.rs +++ b/crates/openshell-server/src/multiplex.rs @@ -160,7 +160,7 @@ impl MultiplexService { user_role: oidc.user_role.clone(), scopes_enabled: !oidc.scopes_claim.is_empty(), }); - let authenticator_chain = build_authenticator_chain(self.state.oidc_cache.clone()); + let authenticator_chain = build_authenticator_chain(&self.state); let grpc_service = AuthGrpcRouter::with_peer_identity( GrpcRouter::new(openshell, inference), authenticator_chain, @@ -259,29 +259,39 @@ where /// Assemble the authenticator chain for the gateway. /// -/// PR-1 composition: -/// 1. [`LegacySandboxMarkerAuthenticator`] — preserves the path-based -/// sandbox/dual-auth-no-Bearer behavior so handlers that still read the -/// metadata marker keep working. Removed in PR 3. -/// 2. [`OidcAuthenticator`] — validates Bearer tokens against the configured -/// OIDC issuer. Only added when OIDC is configured. +/// Chain order (first-match-wins): +/// 1. [`K8sServiceAccountAuthenticator`] (path-scoped to `IssueSandboxToken`) +/// — exchanges a projected SA token for a `Principal::Sandbox` so the +/// `IssueSandboxToken` handler can mint a gateway JWT. No-op on every +/// other path; only present when the gateway runs in-cluster. +/// 2. [`SandboxJwtAuthenticator`] — validates gateway-minted JWTs. Recognized +/// via a distinctive `kid` so non-matching Bearer tokens fall through. +/// 3. [`LegacySandboxMarkerAuthenticator`] — PR-1 holdover that produces a +/// `Principal::Sandbox` for sandbox/dual-auth paths without a Bearer. +/// Removed in PR 3. +/// 4. [`OidcAuthenticator`] — validates user Bearer tokens against the +/// configured OIDC issuer. Only added when OIDC is configured. 
/// -/// When OIDC is not configured (singleplayer dev mode, fronting-proxy -/// deployments before PR 3), the chain still has the legacy marker so -/// sandbox-class methods produce a `Principal::Sandbox` and non-sandbox -/// methods produce `None` — preserving today's "OIDC None == pass through" -/// behavior via the router's `chain_empty_means_passthrough` short-circuit. -fn build_authenticator_chain( - oidc_cache: Option>, -) -> Option { +/// When OIDC *and* sandbox-JWT signing are both unconfigured (a barebones +/// dev gateway), the chain is left as `None` so the router short-circuits to +/// pass-through. The legacy marker can satisfy sandbox-class routes only +/// when paired with an OIDC authenticator that gates user routes, so we +/// require at least one of OIDC or sandbox JWT to build a chain. +fn build_authenticator_chain(state: &ServerState) -> Option { let mut authenticators: Vec> = Vec::new(); + if let Some(k8s) = state.k8s_sa_authenticator.clone() { + authenticators.push(k8s); + } + if let Some(jwt) = state.sandbox_jwt_authenticator.clone() { + authenticators.push(jwt); + } authenticators.push(Arc::new(LegacySandboxMarkerAuthenticator)); - if let Some(cache) = oidc_cache { + if let Some(cache) = state.oidc_cache.clone() { authenticators.push(Arc::new(OidcAuthenticator::new(cache))); - } else { - // No OIDC configured — the router treats a missing OIDC cache as - // "pass-through for non-sandbox methods" by skipping the chain - // entirely. See AuthGrpcRouter::call. + } else if state.sandbox_jwt_authenticator.is_none() { + // Neither OIDC nor gateway-minted JWTs are configured — preserve + // the pre-PR-1 "open by default" dev behavior by returning no chain + // so the router short-circuits to pass-through.
return None; } Some(AuthenticatorChain::new(authenticators)) @@ -977,10 +987,7 @@ mod tests { .await .unwrap(); assert!( - matches!( - seen.lock().unwrap().clone(), - Some(Principal::Sandbox(_)) - ), + matches!(seen.lock().unwrap().clone(), Some(Principal::Sandbox(_))), "principal must reach extensions" ); assert!( diff --git a/crates/openshell-server/tests/auth_endpoint_integration.rs b/crates/openshell-server/tests/auth_endpoint_integration.rs index 8e321b2e5..3d74d66a5 100644 --- a/crates/openshell-server/tests/auth_endpoint_integration.rs +++ b/crates/openshell-server/tests/auth_endpoint_integration.rs @@ -779,6 +779,14 @@ impl openshell_core::proto::open_shell_server::OpenShell for TestOpenShell { Err(Status::unimplemented("not implemented in test")) } + async fn issue_sandbox_token( + &self, + _request: tonic::Request, + ) -> Result, tonic::Status> + { + Err(tonic::Status::unimplemented("not implemented in test")) + } + async fn connect_supervisor( &self, _request: tonic::Request>, diff --git a/crates/openshell-server/tests/edge_tunnel_auth.rs b/crates/openshell-server/tests/edge_tunnel_auth.rs index 29f6e5c22..5a4364d40 100644 --- a/crates/openshell-server/tests/edge_tunnel_auth.rs +++ b/crates/openshell-server/tests/edge_tunnel_auth.rs @@ -436,6 +436,13 @@ impl OpenShell for TestOpenShell { Err(Status::unimplemented("not implemented in test")) } + async fn issue_sandbox_token( + &self, + _request: tonic::Request, + ) -> Result, Status> { + Err(Status::unimplemented("not implemented in test")) + } + async fn connect_supervisor( &self, _request: tonic::Request>, diff --git a/crates/openshell-server/tests/multiplex_integration.rs b/crates/openshell-server/tests/multiplex_integration.rs index cd8bd43a1..ed920a22f 100644 --- a/crates/openshell-server/tests/multiplex_integration.rs +++ b/crates/openshell-server/tests/multiplex_integration.rs @@ -405,6 +405,13 @@ impl OpenShell for TestOpenShell { Err(Status::unimplemented("not implemented in test")) } + async fn 
issue_sandbox_token( + &self, + _request: tonic::Request, + ) -> Result, Status> { + Err(Status::unimplemented("not implemented in test")) + } + async fn connect_supervisor( &self, _request: tonic::Request>, diff --git a/crates/openshell-server/tests/multiplex_tls_integration.rs b/crates/openshell-server/tests/multiplex_tls_integration.rs index bf238cc56..3617c30c6 100644 --- a/crates/openshell-server/tests/multiplex_tls_integration.rs +++ b/crates/openshell-server/tests/multiplex_tls_integration.rs @@ -418,6 +418,13 @@ impl OpenShell for TestOpenShell { Err(Status::unimplemented("not implemented in test")) } + async fn issue_sandbox_token( + &self, + _request: tonic::Request, + ) -> Result, Status> { + Err(Status::unimplemented("not implemented in test")) + } + async fn connect_supervisor( &self, _request: tonic::Request>, diff --git a/crates/openshell-server/tests/supervisor_relay_integration.rs b/crates/openshell-server/tests/supervisor_relay_integration.rs index aae6d8cf1..533e53a1b 100644 --- a/crates/openshell-server/tests/supervisor_relay_integration.rs +++ b/crates/openshell-server/tests/supervisor_relay_integration.rs @@ -394,6 +394,12 @@ impl OpenShell for RelayGateway { ) -> Result, Status> { Err(Status::unimplemented("unused")) } + async fn issue_sandbox_token( + &self, + _: tonic::Request, + ) -> Result, Status> { + Err(Status::unimplemented("unused")) + } } // --------------------------------------------------------------------------- diff --git a/crates/openshell-server/tests/ws_tunnel_integration.rs b/crates/openshell-server/tests/ws_tunnel_integration.rs index 68e6c2ee3..193a2ad1d 100644 --- a/crates/openshell-server/tests/ws_tunnel_integration.rs +++ b/crates/openshell-server/tests/ws_tunnel_integration.rs @@ -431,6 +431,13 @@ impl OpenShell for TestOpenShell { Err(Status::unimplemented("not implemented in test")) } + async fn issue_sandbox_token( + &self, + _request: tonic::Request, + ) -> Result, Status> { + Err(Status::unimplemented("not 
implemented in test")) + } + async fn connect_supervisor( &self, _request: tonic::Request>, diff --git a/deploy/helm/openshell/templates/certgen.yaml b/deploy/helm/openshell/templates/certgen.yaml index ef4500db6..61203760b 100644 --- a/deploy/helm/openshell/templates/certgen.yaml +++ b/deploy/helm/openshell/templates/certgen.yaml @@ -100,6 +100,7 @@ spec: - generate-certs - --server-secret-name={{ .Values.server.tls.certSecretName }} - --client-secret-name={{ .Values.server.tls.clientTlsSecretName }} + - --jwt-secret-name={{ .Values.server.sandboxJwt.signingSecretName | default (printf "%s-jwt-keys" (include "openshell.fullname" .)) }} {{- range .Values.pkiInitJob.serverDnsNames }} - --server-san={{ . }} {{- end }} diff --git a/deploy/helm/openshell/templates/gateway-config.yaml b/deploy/helm/openshell/templates/gateway-config.yaml index 9d95e45c1..9793c2a5e 100644 --- a/deploy/helm/openshell/templates/gateway-config.yaml +++ b/deploy/helm/openshell/templates/gateway-config.yaml @@ -64,6 +64,13 @@ data: {{- end }} {{- end }} + [openshell.gateway.gateway_jwt] + signing_key_path = "/etc/openshell-jwt/signing.pem" + public_key_path = "/etc/openshell-jwt/public.pem" + kid_path = "/etc/openshell-jwt/kid" + gateway_id = {{ .Values.server.sandboxJwt.gatewayId | default (include "openshell.fullname" .) 
| quote }} + ttl_secs = {{ .Values.server.sandboxJwt.ttlSecs | default 86400 }} + {{- if .Values.server.oidc.issuer }} [openshell.gateway.oidc] diff --git a/deploy/helm/openshell/templates/statefulset.yaml b/deploy/helm/openshell/templates/statefulset.yaml index c6ff21491..2a5ce420c 100644 --- a/deploy/helm/openshell/templates/statefulset.yaml +++ b/deploy/helm/openshell/templates/statefulset.yaml @@ -84,6 +84,9 @@ spec: mountPath: /etc/openshell-tls/client-ca readOnly: true {{- end }} + - name: sandbox-jwt + mountPath: /etc/openshell-jwt + readOnly: true {{- if and .Values.server.oidc.issuer .Values.server.oidc.caConfigMapName }} - name: oidc-ca mountPath: /etc/openshell-tls/oidc-ca @@ -147,6 +150,10 @@ spec: secretName: {{ .Values.server.tls.clientCaSecretName }} {{- end }} {{- end }} + - name: sandbox-jwt + secret: + secretName: {{ .Values.server.sandboxJwt.signingSecretName | default (printf "%s-jwt-keys" (include "openshell.fullname" .)) }} + defaultMode: 0400 {{- if and .Values.server.oidc.issuer .Values.server.oidc.caConfigMapName }} - name: oidc-ca configMap: diff --git a/deploy/helm/openshell/values.yaml b/deploy/helm/openshell/values.yaml index fd689c8bf..a694dc59d 100644 --- a/deploy/helm/openshell/values.yaml +++ b/deploy/helm/openshell/values.yaml @@ -171,6 +171,18 @@ server: clientCaSecretName: openshell-server-client-ca # -- K8s secret mounted into sandbox pods for mTLS to the server. clientTlsSecretName: openshell-client-tls + # Gateway-minted sandbox JWT signing keys. The pre-install certgen hook + # generates an Ed25519 keypair and writes it to a secret containing + # signing.pem (PKCS#8), public.pem (SPKI), and kid (plain text). + sandboxJwt: + # Name of the Opaque Secret holding the signing key material. Empty + # falls back to "<fullname>-jwt-keys". + signingSecretName: "" + # Stable gateway identity embedded in iss/aud of every minted token. + # Defaults to the release name so HA replicas share identity. + gatewayId: "" + # Token TTL in seconds.
Defaults to 86400 (24h). + ttlSecs: 86400 # OIDC (OpenID Connect) configuration for JWT-based authentication. # When issuer is set, the server validates Bearer tokens on gRPC requests. oidc: diff --git a/proto/openshell.proto b/proto/openshell.proto index ca62646e3..15f78b381 100644 --- a/proto/openshell.proto +++ b/proto/openshell.proto @@ -224,6 +224,28 @@ service OpenShell { // Get decision history for a sandbox's draft policy. rpc GetDraftHistory(GetDraftHistoryRequest) returns (GetDraftHistoryResponse); + + // Exchange a sandbox-bootstrap credential (e.g. a Kubernetes projected + // ServiceAccount token) for a gateway-minted JWT bound to the calling + // sandbox's UUID. Used by the Kubernetes driver path; singleplayer + // drivers receive the gateway JWT directly from the create-sandbox flow + // and never call this RPC. + rpc IssueSandboxToken(IssueSandboxTokenRequest) returns (IssueSandboxTokenResponse); +} + +// IssueSandboxToken request. Empty body; identity is established by the +// authentication credentials carried in the request headers (a projected +// Kubernetes ServiceAccount JWT in the K8s driver path). +message IssueSandboxTokenRequest {} + +// IssueSandboxToken response. The supervisor caches the returned token in +// memory and presents it as `Authorization: Bearer` on every subsequent +// gateway RPC. +message IssueSandboxTokenResponse { + // Gateway-minted JWT bound to the calling sandbox's UUID. + string token = 1; + // Absolute expiry of the issued token, milliseconds since the epoch. + int64 expires_at_ms = 2; } // Health check request. 
From cc7bc796b3b4db0a5d5cbf2ac2fc49a7e9efa4df Mon Sep 17 00:00:00 2001 From: Taylor Mutch Date: Thu, 14 May 2026 21:14:52 -0700 Subject: [PATCH 03/18] feat(server)!: per-sandbox JWT identity over Bearer (wire break) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Switches every sandbox-to-gateway gRPC call from "path-based mTLS-only trust" to "Authorization: Bearer <gateway JWT>" presented by the sandbox supervisor. Closes the trust-boundary half of issue #1354; the per-handler sandbox_id equality check follows in PR 4. Sandbox side: - crates/openshell-sandbox/src/grpc_client.rs gains an AuthInterceptor that injects the Bearer header on every outbound RPC. The token is resolved at startup from one of three sources, in order: 1. OPENSHELL_SANDBOX_TOKEN (env, test harnesses) 2. OPENSHELL_SANDBOX_TOKEN_FILE (Docker/Podman/VM drivers) 3. OPENSHELL_K8S_SA_TOKEN_FILE (K8s driver — projected SA token exchanged for a gateway JWT via IssueSandboxToken) Gateway side: - handle_create_sandbox mints a gateway JWT and passes it through the compute layer to DriverSandboxSpec.sandbox_token. K8s sandboxes ignore the field; Docker and Podman drivers inject it as OPENSHELL_SANDBOX_TOKEN in the container env. - Removes the path-based SANDBOX_METHODS / DUAL_AUTH_METHODS branches and the x-openshell-auth-source metadata marker. The AuthGrpcRouter chain is now uniform: K8s SA -> SandboxJwt -> OIDC, all extension-based. - Removes LegacySandboxMarkerAuthenticator and the SandboxIdentitySource:: LegacyMarker variant. Handlers read Principal::Sandbox directly from request extensions. Kubernetes driver: - Sandbox pods gain a projected ServiceAccount token volume mounted at /var/run/secrets/openshell/token (audience openshell-gateway, 1h TTL, kubelet auto-rotates). - Each pod is annotated with openshell.io/sandbox-id; the gateway resolves the SA token claim's pod uid back to a sandbox id via this annotation.
- Helm Role grants the gateway pods:get in the sandbox namespace. No ClusterRoleBinding to system:auth-delegator — the gateway validates SA tokens against the apiserver's anonymous JWKS endpoint instead of via TokenReview, so no cluster-scoped privilege is required. The full JWKS verifier + pod-annotation lookup lands in the follow-up that brings the K8s helm-dev demo end-to-end; PR 3 exercises the wire break with Docker/Podman as the working drivers. Signed-off-by: Taylor Mutch --- crates/openshell-driver-docker/src/lib.rs | 13 ++ crates/openshell-driver-docker/src/tests.rs | 1 + .../openshell-driver-kubernetes/src/driver.rs | 93 ++++++++--- .../openshell-driver-podman/src/container.rs | 11 ++ crates/openshell-sandbox/src/grpc_client.rs | 156 +++++++++++++++--- .../src/supervisor_session.rs | 5 +- .../src/auth/authenticator.rs | 116 +------------ crates/openshell-server/src/auth/k8s_sa.rs | 26 ++- crates/openshell-server/src/auth/oidc.rs | 140 ---------------- crates/openshell-server/src/auth/principal.rs | 15 +- crates/openshell-server/src/compute/mod.rs | 14 +- crates/openshell-server/src/grpc/policy.rs | 42 ++++- crates/openshell-server/src/grpc/sandbox.rs | 29 +++- crates/openshell-server/src/multiplex.rs | 114 ++++--------- deploy/helm/openshell/templates/role.yaml | 11 ++ proto/compute_driver.proto | 7 + 16 files changed, 384 insertions(+), 409 deletions(-) diff --git a/crates/openshell-driver-docker/src/lib.rs b/crates/openshell-driver-docker/src/lib.rs index 3a0772217..03e584843 100644 --- a/crates/openshell-driver-docker/src/lib.rs +++ b/crates/openshell-driver-docker/src/lib.rs @@ -992,6 +992,19 @@ fn build_environment(sandbox: &DriverSandbox, config: &DockerDriverRuntimeConfig ); } + // Gateway-minted sandbox JWT (PR 3 of the per-sandbox identity series). 
+ // Passed via env var since Docker has no native secret mount that is + // simpler than the existing bind-mount pattern; the trust boundary + // (`docker inspect` access) is already equivalent to the TLS key mount. + if let Some(spec) = sandbox.spec.as_ref() + && !spec.sandbox_token.is_empty() + { + environment.insert( + openshell_core::sandbox_env::SANDBOX_TOKEN.to_string(), + spec.sandbox_token.clone(), + ); + } + let mut pairs = environment.into_iter().collect::>(); pairs.sort_by(|left, right| left.0.cmp(&right.0)); pairs diff --git a/crates/openshell-driver-docker/src/tests.rs b/crates/openshell-driver-docker/src/tests.rs index 2ac2da1ee..575fb6677 100644 --- a/crates/openshell-driver-docker/src/tests.rs +++ b/crates/openshell-driver-docker/src/tests.rs @@ -33,6 +33,7 @@ fn test_sandbox() -> DriverSandbox { }), gpu: false, gpu_device: String::new(), + sandbox_token: String::new(), }), status: None, } diff --git a/crates/openshell-driver-kubernetes/src/driver.rs b/crates/openshell-driver-kubernetes/src/driver.rs index 21ec7f5bf..07f205dee 100644 --- a/crates/openshell-driver-kubernetes/src/driver.rs +++ b/crates/openshell-driver-kubernetes/src/driver.rs @@ -1147,8 +1147,28 @@ fn sandbox_template_to_k8s( if !template.labels.is_empty() { metadata.insert("labels".to_string(), serde_json::json!(template.labels)); } - if let Some(annotations) = platform_config_struct(template, "annotations") { - metadata.insert("annotations".to_string(), annotations); + // Carry the sandbox UUID as a pod annotation so the gateway can resolve + // a projected SA token claim (pod name + uid) back to a sandbox identity + // when the supervisor calls `IssueSandboxToken` at startup. The gateway's + // K8s Role does NOT grant `patch pods`, so this annotation is + // effectively immutable post-create (see plan §11.8). 
+ let mut pod_annotations = platform_config_struct(template, "annotations") + .and_then(|v| match v { + serde_json::Value::Object(map) => Some(map), + _ => None, + }) + .unwrap_or_default(); + if !params.sandbox_id.is_empty() { + pod_annotations.insert( + "openshell.io/sandbox-id".to_string(), + serde_json::Value::String(params.sandbox_id.to_string()), + ); + } + if !pod_annotations.is_empty() { + metadata.insert( + "annotations".to_string(), + serde_json::Value::Object(pod_annotations), + ); } let mut spec = serde_json::Map::new(); @@ -1235,17 +1255,26 @@ fn sandbox_template_to_k8s( }), ); - // Mount client TLS secret for mTLS to the server. + // Mount client TLS secret for mTLS to the server, plus the projected + // ServiceAccount token used to bootstrap the sandbox's gateway JWT + // via `IssueSandboxToken`. + let mut volume_mounts: Vec = Vec::new(); if !params.client_tls_secret_name.is_empty() { - container.insert( - "volumeMounts".to_string(), - serde_json::json!([{ - "name": "openshell-client-tls", - "mountPath": "/etc/openshell-tls/client", - "readOnly": true - }]), - ); - } + volume_mounts.push(serde_json::json!({ + "name": "openshell-client-tls", + "mountPath": "/etc/openshell-tls/client", + "readOnly": true + })); + } + volume_mounts.push(serde_json::json!({ + "name": "openshell-sa-token", + "mountPath": "/var/run/secrets/openshell", + "readOnly": true, + })); + container.insert( + "volumeMounts".to_string(), + serde_json::Value::Array(volume_mounts), + ); if let Some(resources) = container_resources(template, gpu) { container.insert("resources".to_string(), resources); @@ -1257,15 +1286,31 @@ fn sandbox_template_to_k8s( // Add TLS secret volume. Mode 0400 (owner-read) prevents the // unprivileged sandbox user from reading the mTLS private key. 
+ let mut volumes: Vec = Vec::new(); if !params.client_tls_secret_name.is_empty() { - spec.insert( - "volumes".to_string(), - serde_json::json!([{ - "name": "openshell-client-tls", - "secret": { "secretName": params.client_tls_secret_name, "defaultMode": 256 } - }]), - ); - } + volumes.push(serde_json::json!({ + "name": "openshell-client-tls", + "secret": { "secretName": params.client_tls_secret_name, "defaultMode": 256 } + })); + } + // Projected ServiceAccountToken volume — kubelet writes a short-lived + // audience-bound JWT into /var/run/secrets/openshell/token and rotates + // it automatically. The supervisor exchanges this for a gateway-minted + // JWT via `IssueSandboxToken` once at startup. + volumes.push(serde_json::json!({ + "name": "openshell-sa-token", + "projected": { + "sources": [{ + "serviceAccountToken": { + "audience": "openshell-gateway", + "expirationSeconds": 3600_i64, + "path": "token" + } + }], + "defaultMode": 256 + } + })); + spec.insert("volumes".to_string(), serde_json::Value::Array(volumes)); // Add hostAliases so sandbox pods can reach the Docker host. if !params.host_gateway_ip.is_empty() { @@ -1444,6 +1489,14 @@ fn apply_required_env( "/etc/openshell-tls/client/tls.key", ); } + // Projected ServiceAccount token written by kubelet (see the volume + // definition in `sandbox_template_to_k8s`). The supervisor reads this + // and exchanges it for a gateway-minted JWT via `IssueSandboxToken`. + upsert_env( + env, + openshell_core::sandbox_env::K8S_SA_TOKEN_FILE, + "/var/run/secrets/openshell/token", + ); } fn upsert_env(env: &mut Vec, name: &str, value: &str) { diff --git a/crates/openshell-driver-podman/src/container.rs b/crates/openshell-driver-podman/src/container.rs index c3f2c3282..e79ff2769 100644 --- a/crates/openshell-driver-podman/src/container.rs +++ b/crates/openshell-driver-podman/src/container.rs @@ -299,6 +299,17 @@ fn build_env( ); } + // 4. Gateway-minted sandbox JWT (PR 3 of the per-sandbox identity + // series). 
Passed via env var; the supervisor reads it directly. + if let Some(s) = spec + && !s.sandbox_token.is_empty() + { + env.insert( + openshell_core::sandbox_env::SANDBOX_TOKEN.into(), + s.sandbox_token.clone(), + ); + } + env } diff --git a/crates/openshell-sandbox/src/grpc_client.rs b/crates/openshell-sandbox/src/grpc_client.rs index 3fccb680f..eeac41a44 100644 --- a/crates/openshell-sandbox/src/grpc_client.rs +++ b/crates/openshell-sandbox/src/grpc_client.rs @@ -3,6 +3,21 @@ //! gRPC client for fetching sandbox policy, provider environment, and inference //! route bundles from `OpenShell` server. +//! +//! Every request carries a gateway-minted JWT in the `Authorization` header +//! (PR 3 of the per-sandbox identity series; see issue #1354). The token is +//! resolved at startup from one of three sources: +//! +//! 1. `OPENSHELL_SANDBOX_TOKEN` — raw JWT in the env (test harness path). +//! 2. `OPENSHELL_SANDBOX_TOKEN_FILE` — file containing the JWT (Docker / +//! Podman / VM drivers write this to a bundle file at sandbox-create +//! time). +//! 3. `OPENSHELL_K8S_SA_TOKEN_FILE` — projected `ServiceAccount` JWT; the +//! supervisor exchanges it for a gateway JWT via `IssueSandboxToken` +//! once at startup. +//! +//! The resolved gateway JWT is held in process memory thereafter and +//! injected on every outbound call by [`AuthInterceptor`]. 
use std::collections::HashMap; use std::time::Duration; @@ -10,15 +25,50 @@ use std::time::Duration; use miette::{IntoDiagnostic, Result, WrapErr}; use openshell_core::proto::{ DenialSummary, GetDraftPolicyRequest, GetInferenceBundleRequest, GetInferenceBundleResponse, - GetSandboxConfigRequest, GetSandboxProviderEnvironmentRequest, PolicyChunk, PolicySource, - PolicyStatus, ReportPolicyStatusRequest, SandboxPolicy as ProtoSandboxPolicy, - SubmitPolicyAnalysisRequest, SubmitPolicyAnalysisResponse, UpdateConfigRequest, - inference_client::InferenceClient, open_shell_client::OpenShellClient, + GetSandboxConfigRequest, GetSandboxProviderEnvironmentRequest, IssueSandboxTokenRequest, + PolicyChunk, PolicySource, PolicyStatus, ReportPolicyStatusRequest, + SandboxPolicy as ProtoSandboxPolicy, SubmitPolicyAnalysisRequest, SubmitPolicyAnalysisResponse, + UpdateConfigRequest, inference_client::InferenceClient, open_shell_client::OpenShellClient, }; +use openshell_core::sandbox_env; +use tonic::Status; +use tonic::metadata::AsciiMetadataValue; +use tonic::service::interceptor::InterceptedService; use tonic::transport::{Certificate, Channel, ClientTlsConfig, Endpoint, Identity}; -use tracing::debug; +use tracing::{debug, info}; -/// Create a channel to the `OpenShell` server. +/// Channel type after the [`AuthInterceptor`] is applied. Aliased so the +/// generated client type signatures stay readable. +pub type AuthedChannel = InterceptedService; + +/// gRPC interceptor that injects `authorization: Bearer ` on every +/// outbound request. 
+#[derive(Clone)] +pub struct AuthInterceptor { + bearer: AsciiMetadataValue, +} + +impl AuthInterceptor { + fn new(token: &str) -> Result { + let bearer = AsciiMetadataValue::try_from(format!("Bearer {token}")) + .into_diagnostic() + .wrap_err("sandbox JWT contained characters not valid for a header value")?; + Ok(Self { bearer }) + } +} + +impl tonic::service::Interceptor for AuthInterceptor { + fn call( + &mut self, + mut req: tonic::Request<()>, + ) -> std::result::Result, Status> { + req.metadata_mut() + .insert("authorization", self.bearer.clone()); + Ok(req) + } +} + +/// Build the plain (un-intercepted) gRPC channel. /// /// When the endpoint uses `https://`, mTLS is configured using these env vars: /// - `OPENSHELL_TLS_CA` -- path to the CA certificate @@ -27,7 +77,7 @@ use tracing::debug; /// /// When the endpoint uses `http://`, a plaintext connection is used (for /// deployments where TLS is disabled, e.g. behind a Cloudflare Tunnel). -async fn connect_channel(endpoint: &str) -> Result { +async fn build_plain_channel(endpoint: &str) -> Result { let mut ep = Endpoint::from_shared(endpoint.to_string()) .into_diagnostic() .wrap_err("invalid gRPC endpoint")? 
@@ -43,13 +93,13 @@ async fn connect_channel(endpoint: &str) -> Result { let tls_enabled = endpoint.starts_with("https://"); if tls_enabled { - let ca_path = std::env::var(openshell_core::sandbox_env::TLS_CA) + let ca_path = std::env::var(sandbox_env::TLS_CA) .into_diagnostic() .wrap_err("OPENSHELL_TLS_CA is required")?; - let cert_path = std::env::var(openshell_core::sandbox_env::TLS_CERT) + let cert_path = std::env::var(sandbox_env::TLS_CERT) .into_diagnostic() .wrap_err("OPENSHELL_TLS_CERT is required")?; - let key_path = std::env::var(openshell_core::sandbox_env::TLS_KEY) + let key_path = std::env::var(sandbox_env::TLS_KEY) .into_diagnostic() .wrap_err("OPENSHELL_TLS_KEY is required")?; @@ -79,24 +129,84 @@ async fn connect_channel(endpoint: &str) -> Result { .wrap_err("failed to connect to OpenShell server") } -/// Create a channel to the `OpenShell` server (public for use by `supervisor_session`). -pub async fn connect_channel_pub(endpoint: &str) -> Result { +/// Build a Bearer-authenticated channel to the gateway. +/// +/// Resolves the sandbox JWT via the three-step lookup described at the +/// module level (env → file → K8s SA bootstrap exchange) and wraps the +/// resulting channel in [`AuthInterceptor`]. +async fn connect_channel(endpoint: &str) -> Result { + let channel = build_plain_channel(endpoint).await?; + let token = acquire_sandbox_token(endpoint, &channel).await?; + let interceptor = AuthInterceptor::new(&token)?; + Ok(InterceptedService::new(channel, interceptor)) +} + +/// Resolve the sandbox JWT used to authenticate every outbound RPC. +/// +/// `endpoint` is logged on errors but never used for transport here; the +/// actual network call lives inside this function only on the K8s +/// bootstrap path, which uses `plain_channel` to call `IssueSandboxToken` +/// once before the steady-state Bearer-authenticated channel is built. 
+async fn acquire_sandbox_token(endpoint: &str, plain_channel: &Channel) -> Result { + if let Ok(t) = std::env::var(sandbox_env::SANDBOX_TOKEN) + && !t.is_empty() + { + debug!(source = "env", "loaded sandbox token"); + return Ok(t); + } + + if let Ok(path) = std::env::var(sandbox_env::SANDBOX_TOKEN_FILE) + && !path.is_empty() + { + let contents = std::fs::read_to_string(&path) + .into_diagnostic() + .wrap_err_with(|| format!("failed to read sandbox token from {path}"))?; + debug!(source = "file", path = %path, "loaded sandbox token"); + return Ok(contents.trim().to_string()); + } + + if let Ok(sa_path) = std::env::var(sandbox_env::K8S_SA_TOKEN_FILE) + && !sa_path.is_empty() + { + let sa_token = std::fs::read_to_string(&sa_path) + .into_diagnostic() + .wrap_err_with(|| format!("failed to read K8s SA token from {sa_path}"))? + .trim() + .to_string(); + info!(endpoint = %endpoint, "exchanging K8s ServiceAccount token for sandbox JWT"); + let interceptor = AuthInterceptor::new(&sa_token)?; + let bootstrap = InterceptedService::new(plain_channel.clone(), interceptor); + let mut client = OpenShellClient::new(bootstrap); + let resp = client + .issue_sandbox_token(IssueSandboxTokenRequest {}) + .await + .into_diagnostic() + .wrap_err("IssueSandboxToken bootstrap exchange failed")?; + return Ok(resp.into_inner().token); + } + + Err(miette::miette!( + "no sandbox token source available — set one of {}, {}, or {}", + sandbox_env::SANDBOX_TOKEN, + sandbox_env::SANDBOX_TOKEN_FILE, + sandbox_env::K8S_SA_TOKEN_FILE, + )) +} + +/// Build an authenticated channel for direct external use (e.g. the +/// long-lived `supervisor_session` control stream). +pub async fn connect_channel_pub(endpoint: &str) -> Result { connect_channel(endpoint).await } /// Connect to the `OpenShell` server. -/// -/// Sandboxes authenticate to the gateway via the mTLS client certificate -/// configured by `connect_channel`. 
They do not present an OIDC Bearer -/// token; the gateway recognises sandbox-class callers by absence of a -/// Bearer header on the request. -async fn connect(endpoint: &str) -> Result> { +async fn connect(endpoint: &str) -> Result> { let channel = connect_channel(endpoint).await?; Ok(OpenShellClient::new(channel)) } /// Connect to the inference service. -async fn connect_inference(endpoint: &str) -> Result> { +async fn connect_inference(endpoint: &str) -> Result> { let channel = connect_channel(endpoint).await?; Ok(InferenceClient::new(channel)) } @@ -118,7 +228,7 @@ pub async fn fetch_policy(endpoint: &str, sandbox_id: &str) -> Result, + client: &mut OpenShellClient, sandbox_id: &str, ) -> Result> { let response = client @@ -142,7 +252,7 @@ async fn fetch_policy_with_client( /// Sync a locally-discovered policy using an existing client connection. async fn sync_policy_with_client( - client: &mut OpenShellClient, + client: &mut OpenShellClient, sandbox: &str, policy: &ProtoSandboxPolicy, ) -> Result<()> { @@ -238,7 +348,7 @@ pub async fn fetch_provider_environment( /// and status reporting, avoiding per-request TLS handshake overhead. #[derive(Clone)] pub struct CachedOpenShellClient { - client: OpenShellClient, + client: OpenShellClient, } /// Settings poll result returned by [`CachedOpenShellClient::poll_settings`]. @@ -269,7 +379,7 @@ impl CachedOpenShellClient { } /// Get a clone of the underlying tonic client for direct RPC calls. 
- pub fn raw_client(&self) -> OpenShellClient { + pub fn raw_client(&self) -> OpenShellClient { self.client.clone() } diff --git a/crates/openshell-sandbox/src/supervisor_session.rs b/crates/openshell-sandbox/src/supervisor_session.rs index 6485dddf0..4d7392ee3 100644 --- a/crates/openshell-sandbox/src/supervisor_session.rs +++ b/crates/openshell-sandbox/src/supervisor_session.rs @@ -28,7 +28,6 @@ use openshell_ocsf::{ use tokio::io::{AsyncRead, AsyncReadExt, AsyncWrite, AsyncWriteExt}; use tokio::sync::mpsc; use tokio_stream::StreamExt; -use tonic::transport::Channel; use tracing::{debug, warn}; use crate::grpc_client; @@ -371,7 +370,7 @@ fn handle_gateway_message( sandbox_id: &str, ssh_socket_path: &std::path::Path, netns_fd: Option, - channel: &Channel, + channel: &grpc_client::AuthedChannel, tx: &mpsc::Sender, ) { match &msg.payload { @@ -436,7 +435,7 @@ async fn handle_relay_open( relay_open: RelayOpen, ssh_socket_path: &std::path::Path, netns_fd: Option, - channel: Channel, + channel: grpc_client::AuthedChannel, tx: mpsc::Sender, ) -> Result<(), Box> { let channel_id = relay_open.channel_id.clone(); diff --git a/crates/openshell-server/src/auth/authenticator.rs b/crates/openshell-server/src/auth/authenticator.rs index 40fa1e4ec..827c3b8c1 100644 --- a/crates/openshell-server/src/auth/authenticator.rs +++ b/crates/openshell-server/src/auth/authenticator.rs @@ -12,11 +12,13 @@ //! apply but rejects the caller returns `Err(Status)`, which terminates //! the chain — fail-closed. //! -//! This module is the abstraction PR (PR 1). Subsequent PRs slot in: -//! - PR 2: `SandboxJwtAuthenticator` + `K8sServiceAccountAuthenticator` -//! - PR 3: removal of the PR-1 legacy marker authenticator +//! Live authenticators slotting into the chain: +//! - [`super::sandbox_jwt::SandboxJwtAuthenticator`] — gateway-minted JWTs +//! - [`super::k8s_sa::K8sServiceAccountAuthenticator`] — K8s projected SA +//! tokens (path-scoped to `IssueSandboxToken`) +//! 
- [`super::oidc::OidcAuthenticator`] — user OIDC Bearer tokens -use super::principal::{Principal, SandboxIdentitySource, SandboxPrincipal}; +use super::principal::Principal; use async_trait::async_trait; use std::sync::Arc; use tonic::Status; @@ -89,45 +91,6 @@ impl std::fmt::Debug for AuthenticatorChain { } } -/// Authenticator that preserves the pre-refactor behavior for sandbox-class -/// and dual-auth-no-Bearer paths. -/// -/// Returns `Some(Principal::Sandbox)` with [`SandboxIdentitySource::LegacyMarker`] -/// — the `sandbox_id` is left empty because no credential was verified. This -/// matches the pre-PR-1 router which trusted the path list, not the caller. -/// -/// PR 3 deletes this type once every sandbox call carries a gateway-minted -/// JWT and the path-based branches are gone. -pub struct LegacySandboxMarkerAuthenticator; - -#[async_trait] -impl Authenticator for LegacySandboxMarkerAuthenticator { - async fn authenticate( - &self, - headers: &http::HeaderMap, - path: &str, - ) -> Result, Status> { - let is_sandbox_path = super::oidc::is_sandbox_method(path); - let is_dual_no_bearer = - super::oidc::is_dual_auth_method(path) && !has_bearer_token(headers); - if is_sandbox_path || is_dual_no_bearer { - return Ok(Some(Principal::Sandbox(SandboxPrincipal { - sandbox_id: String::new(), - source: SandboxIdentitySource::LegacyMarker, - trust_domain: None, - }))); - } - Ok(None) - } -} - -fn has_bearer_token(headers: &http::HeaderMap) -> bool { - headers - .get("authorization") - .and_then(|v| v.to_str().ok()) - .is_some_and(|v| v.starts_with("Bearer ")) -} - #[cfg(test)] pub mod test_support { use super::*; @@ -264,71 +227,4 @@ mod tests { .unwrap(); assert!(result.is_none()); } - - #[tokio::test] - async fn legacy_marker_recognizes_sandbox_method() { - let auth = LegacySandboxMarkerAuthenticator; - let result = auth - .authenticate( - &http::HeaderMap::new(), - "/openshell.v1.OpenShell/ReportPolicyStatus", - ) - .await - .unwrap() - .expect("sandbox path 
must produce a principal"); - match result { - Principal::Sandbox(p) => { - assert!(p.sandbox_id.is_empty(), "legacy marker has no verified id"); - assert!(matches!(p.source, SandboxIdentitySource::LegacyMarker)); - } - _ => panic!("expected sandbox principal"), - } - } - - #[tokio::test] - async fn legacy_marker_recognizes_dual_auth_without_bearer() { - let auth = LegacySandboxMarkerAuthenticator; - let result = auth - .authenticate( - &http::HeaderMap::new(), - "/openshell.v1.OpenShell/UpdateConfig", - ) - .await - .unwrap(); - assert!( - result.is_some(), - "dual-auth without Bearer must mark sandbox" - ); - } - - #[tokio::test] - async fn legacy_marker_yields_to_dual_auth_with_bearer() { - let auth = LegacySandboxMarkerAuthenticator; - let mut headers = http::HeaderMap::new(); - headers.insert( - "authorization", - http::HeaderValue::from_static("Bearer xyz"), - ); - let result = auth - .authenticate(&headers, "/openshell.v1.OpenShell/UpdateConfig") - .await - .unwrap(); - assert!( - result.is_none(), - "dual-auth WITH Bearer must fall through to the OIDC authenticator" - ); - } - - #[tokio::test] - async fn legacy_marker_skips_unrelated_paths() { - let auth = LegacySandboxMarkerAuthenticator; - let result = auth - .authenticate( - &http::HeaderMap::new(), - "/openshell.v1.OpenShell/ListSandboxes", - ) - .await - .unwrap(); - assert!(result.is_none()); - } } diff --git a/crates/openshell-server/src/auth/k8s_sa.rs b/crates/openshell-server/src/auth/k8s_sa.rs index a9f189e42..f8a9a9fae 100644 --- a/crates/openshell-server/src/auth/k8s_sa.rs +++ b/crates/openshell-server/src/auth/k8s_sa.rs @@ -117,11 +117,18 @@ impl Authenticator for K8sServiceAccountAuthenticator { } } -/// Live resolver backed by a `kube::Client`. PR 2 ships this with a -/// `not_implemented` stub so the authenticator type and trait are in place -/// for PR 3's K8s driver wiring. 
The `TokenReview` + pod-`GET` -/// implementation lands when the K8s driver actually creates the -/// projected SA volume (PR 3). +/// Live resolver backed by a `kube::Client`. +/// +/// Validates the projected `ServiceAccount` token locally against the +/// apiserver's JWKS endpoint — no `TokenReview` API call required (so the +/// gateway needs no `system:auth-delegator` cluster binding). The +/// namespace-scoped `Role` granted in the Helm chart provides the only +/// permission this resolver needs: `pods: get`, used to read the +/// `openshell.io/sandbox-id` annotation. +/// +/// The JWKS fetch + signature verification implementation lands in the +/// follow-up that brings the K8s helm-dev demo end-to-end; PR 3 ships +/// the wire break with Docker/Podman as the only exercised drivers. #[allow(dead_code)] pub struct LiveK8sResolver { client: kube::Client, @@ -143,11 +150,12 @@ impl LiveK8sResolver { #[async_trait] impl K8sIdentityResolver for LiveK8sResolver { async fn resolve(&self, _token: &str) -> Result, Status> { - // Implementation lands in PR 3 with the K8s driver wiring. - // Until then `IssueSandboxToken` is wired but only exercised via - // the test harness (see fake resolver below). + // Full JWKS verification + pod annotation lookup lands in the + // K8s-demo follow-up. Returning `Unimplemented` keeps the + // K8s-side `IssueSandboxToken` call from silently succeeding + // before the validator is in place. 
Err(Status::unimplemented( - "K8s ServiceAccount bootstrap not yet enabled", + "K8s ServiceAccount bootstrap pending JWKS implementation", )) } } diff --git a/crates/openshell-server/src/auth/oidc.rs b/crates/openshell-server/src/auth/oidc.rs index c2eb58b2d..6c1339e4f 100644 --- a/crates/openshell-server/src/auth/oidc.rs +++ b/crates/openshell-server/src/auth/oidc.rs @@ -25,15 +25,6 @@ use tokio::sync::RwLock; use tonic::Status; use tracing::{debug, info, warn}; -/// Internal metadata header set by the auth middleware to mark a request as -/// originating from a sandbox. This is stripped from all incoming requests -/// first so external callers cannot spoof it. -pub const INTERNAL_AUTH_SOURCE_HEADER: &str = "x-openshell-auth-source"; -/// Internal auth-source marker for requests originating from a sandbox -/// (no OIDC Bearer; trust derives from the mTLS channel or operator's -/// fronting proxy). -pub const AUTH_SOURCE_SANDBOX: &str = "sandbox"; - /// Truly unauthenticated methods — health probes and infrastructure. const UNAUTHENTICATED_METHODS: &[&str] = &[ "/openshell.v1.OpenShell/Health", @@ -43,40 +34,6 @@ const UNAUTHENTICATED_METHODS: &[&str] = &[ /// Path prefixes that bypass OIDC validation (gRPC reflection, health probes). const UNAUTHENTICATED_PREFIXES: &[&str] = &["/grpc.reflection.", "/grpc.health."]; -/// Sandbox-to-server RPCs that are called by sandboxes instead of CLI -/// users. These do not require an OIDC Bearer token; the gRPC channel's -/// mTLS handshake (or the operator's fronting proxy when -/// `--disable-gateway-auth` is set) is the trust boundary. 
-const SANDBOX_METHODS: &[&str] = &[ - "/openshell.v1.OpenShell/ReportPolicyStatus", - "/openshell.v1.OpenShell/PushSandboxLogs", - "/openshell.v1.OpenShell/GetSandboxProviderEnvironment", - "/openshell.v1.OpenShell/SubmitPolicyAnalysis", - "/openshell.sandbox.v1.SandboxService/GetSandboxConfig", - "/openshell.inference.v1.Inference/GetInferenceBundle", -]; - -/// Methods that accept either an OIDC Bearer token (CLI users, full scope) -/// or no Bearer (sandbox supervisor, sandbox-restricted scope). -/// `UpdateConfig` is called by both CLI (policy/settings mutations) and the -/// sandbox supervisor (policy sync on startup). -/// `OpenShell/GetSandboxConfig` serves CLI settings reads while remaining -/// compatible with sandbox callers. -/// `GetDraftPolicy` serves CLI reviewer surfaces (`openshell rule get`, -/// TUI inbox) AND the sandbox-side `policy.local /wait` long-poll that -/// blocks on the agent's proposal until the developer decides. -const DUAL_AUTH_METHODS: &[&str] = &[ - "/openshell.v1.OpenShell/UpdateConfig", - "/openshell.v1.OpenShell/GetSandboxConfig", - "/openshell.v1.OpenShell/GetDraftPolicy", -]; - -/// Returns `true` if the method accepts either an OIDC Bearer token or a -/// sandbox-class caller (no Bearer). -pub fn is_dual_auth_method(path: &str) -> bool { - DUAL_AUTH_METHODS.contains(&path) -} - /// Returns `true` if the method needs no authentication at all. pub fn is_unauthenticated_method(path: &str) -> bool { UNAUTHENTICATED_METHODS.contains(&path) @@ -85,34 +42,6 @@ pub fn is_unauthenticated_method(path: &str) -> bool { .any(|prefix| path.starts_with(prefix)) } -/// Returns `true` if the method is an exclusively sandbox-class call (does -/// not accept OIDC Bearer). -pub fn is_sandbox_method(path: &str) -> bool { - SANDBOX_METHODS.contains(&path) -} - -/// Remove internal auth-source markers from the request before any auth -/// decision is made so external callers cannot spoof them. 
-pub fn clear_internal_auth_markers(headers: &mut http::HeaderMap) { - headers.remove(INTERNAL_AUTH_SOURCE_HEADER); -} - -/// Mark the request as originating from a sandbox caller. -pub fn mark_sandbox_caller(headers: &mut http::HeaderMap) { - headers.insert( - INTERNAL_AUTH_SOURCE_HEADER, - http::HeaderValue::from_static(AUTH_SOURCE_SANDBOX), - ); -} - -/// Returns `true` if the request metadata indicates a sandbox caller. -pub fn is_sandbox_caller(metadata: &tonic::metadata::MetadataMap) -> bool { - metadata - .get(INTERNAL_AUTH_SOURCE_HEADER) - .and_then(|v| v.to_str().ok()) - == Some(AUTH_SOURCE_SANDBOX) -} - /// Cached JWKS key set fetched from the OIDC issuer. /// /// A `refresh_mutex` ensures that only one refresh runs at a time, @@ -472,7 +401,6 @@ mod tests { assert!(!is_unauthenticated_method( "/openshell.v1.OpenShell/CreateSandbox" )); - assert!(!is_sandbox_method("/openshell.v1.OpenShell/CreateSandbox")); } #[test] @@ -490,74 +418,6 @@ mod tests { assert!(is_unauthenticated_method("/grpc.health.v1.Health/Check")); } - #[test] - fn sandbox_rpcs_are_sandbox_methods() { - assert!(is_sandbox_method( - "/openshell.sandbox.v1.SandboxService/GetSandboxConfig" - )); - assert!(is_sandbox_method( - "/openshell.v1.OpenShell/GetSandboxProviderEnvironment" - )); - assert!(is_sandbox_method( - "/openshell.v1.OpenShell/ReportPolicyStatus" - )); - assert!(is_sandbox_method("/openshell.v1.OpenShell/PushSandboxLogs")); - assert!(is_sandbox_method( - "/openshell.v1.OpenShell/SubmitPolicyAnalysis" - )); - assert!(is_sandbox_method( - "/openshell.inference.v1.Inference/GetInferenceBundle" - )); - } - - #[test] - fn openshell_get_sandbox_config_is_dual_auth() { - assert!(!is_sandbox_method( - "/openshell.v1.OpenShell/GetSandboxConfig" - )); - assert!(is_dual_auth_method( - "/openshell.v1.OpenShell/GetSandboxConfig" - )); - } - - #[test] - fn openshell_get_draft_policy_is_dual_auth() { - // policy.local calls GetDraftPolicy from inside the sandbox - // supervisor (no Bearer, 
authenticated via mTLS), and the CLI/TUI - // reviewer surfaces call it with an OIDC Bearer. Sandbox-only - // would lock CLI out; Bearer-only would 401 the /wait long-poll - // in OIDC-enabled deployments. - assert!(!is_sandbox_method("/openshell.v1.OpenShell/GetDraftPolicy")); - assert!(is_dual_auth_method( - "/openshell.v1.OpenShell/GetDraftPolicy" - )); - } - - #[test] - fn sandbox_caller_marker_round_trips_through_metadata() { - let mut headers = http::HeaderMap::new(); - mark_sandbox_caller(&mut headers); - let metadata = tonic::metadata::MetadataMap::from_headers(headers); - assert!(is_sandbox_caller(&metadata)); - } - - #[test] - fn unmarked_request_is_not_sandbox_caller() { - let metadata = tonic::metadata::MetadataMap::new(); - assert!(!is_sandbox_caller(&metadata)); - } - - #[test] - fn clear_internal_markers_strips_spoofed_header() { - let mut headers = http::HeaderMap::new(); - headers.insert( - INTERNAL_AUTH_SOURCE_HEADER, - http::HeaderValue::from_static(AUTH_SOURCE_SANDBOX), - ); - clear_internal_auth_markers(&mut headers); - assert!(headers.get(INTERNAL_AUTH_SOURCE_HEADER).is_none()); - } - #[test] fn extract_roles_keycloak_path() { let json = serde_json::json!({ diff --git a/crates/openshell-server/src/auth/principal.rs b/crates/openshell-server/src/auth/principal.rs index 25bb57109..fac3f6099 100644 --- a/crates/openshell-server/src/auth/principal.rs +++ b/crates/openshell-server/src/auth/principal.rs @@ -61,25 +61,22 @@ pub struct SandboxPrincipal { } /// How a [`SandboxPrincipal`] was authenticated. +/// +/// Variant fields are populated by the producing authenticator and consumed +/// by audit logging + the PR-4 IDOR guard. Until PR 4 lands those readers +/// they look unused to the dead-code lint. #[derive(Debug, Clone)] +#[allow(dead_code)] pub enum SandboxIdentitySource { - /// PR-1 placeholder: the request matched a sandbox-class path or a - /// dual-auth path without a Bearer token. No credential was verified. 
- /// Removed in PR 3 once every sandbox call carries a gateway-minted JWT. - LegacyMarker, /// Gateway-minted JWT validated against the gateway's signing key. - /// Populated by PR 2's `SandboxJwtAuthenticator`. - #[allow(dead_code)] + /// Produced by [`super::sandbox_jwt::SandboxJwtAuthenticator`]. BootstrapJwt { issuer: String, jti: String }, /// Per-sandbox client certificate. Reserved for the v2 channel-bound /// identity follow-up. - #[allow(dead_code)] BootstrapCert { fingerprint: String }, /// SPIRE-issued SVID. Reserved for the SPIFFE/SPIRE follow-up. - #[allow(dead_code)] SpiffeSvid { spiffe_id: String }, /// K8s `ServiceAccount` token used to bootstrap a gateway-minted JWT /// via `IssueSandboxToken`. Populated only on that one RPC path. - #[allow(dead_code)] K8sServiceAccount { pod_name: String, pod_uid: String }, } diff --git a/crates/openshell-server/src/compute/mod.rs b/crates/openshell-server/src/compute/mod.rs index a69231dea..c79ffcf2d 100644 --- a/crates/openshell-server/src/compute/mod.rs +++ b/crates/openshell-server/src/compute/mod.rs @@ -421,7 +421,11 @@ impl ComputeRuntime { .map(|_| ()) } - pub async fn create_sandbox(&self, sandbox: Sandbox) -> Result { + pub async fn create_sandbox( + &self, + sandbox: Sandbox, + sandbox_token: Option, + ) -> Result { let sandbox_id = sandbox.object_id().to_string(); // Create with MustCreate condition to prevent duplicate creation race @@ -452,7 +456,12 @@ impl ComputeRuntime { } })?; - let driver_sandbox = driver_sandbox_from_public(&sandbox); + let mut driver_sandbox = driver_sandbox_from_public(&sandbox); + if let Some(token) = sandbox_token + && let Some(spec) = driver_sandbox.spec.as_mut() + { + spec.sandbox_token = token; + } match self .driver .create_sandbox(Request::new(CreateSandboxRequest { @@ -1229,6 +1238,7 @@ fn driver_sandbox_spec_from_public(spec: &SandboxSpec) -> DriverSandboxSpec { .map(driver_sandbox_template_from_public), gpu: spec.gpu, gpu_device: spec.gpu_device.clone(), + 
sandbox_token: String::new(), } } diff --git a/crates/openshell-server/src/grpc/policy.rs b/crates/openshell-server/src/grpc/policy.rs index 219ab89be..62d837851 100644 --- a/crates/openshell-server/src/grpc/policy.rs +++ b/crates/openshell-server/src/grpc/policy.rs @@ -10,9 +10,9 @@ #![allow(clippy::cast_precision_loss)] // f64->f32 for confidence scores #![allow(clippy::items_after_statements)] // DB_PORTS const inside function +use crate::ServerState; use crate::persistence::{DraftChunkRecord, ObjectId, ObjectName, ObjectType, PolicyRecord, Store}; use crate::policy_store::PolicyStoreExt; -use crate::{ServerState, auth::oidc}; use openshell_core::proto::policy_merge_operation; use openshell_core::proto::setting_value; use openshell_core::proto::{ @@ -315,7 +315,12 @@ fn truncate_for_log(input: &str, max_chars: usize) -> String { } fn is_sandbox_caller(request: &Request) -> bool { - oidc::is_sandbox_caller(request.metadata()) + matches!( + request + .extensions() + .get::(), + Some(crate::auth::principal::Principal::Sandbox(_)) + ) } /// Sandbox-class callers may only perform sandbox-scoped policy sync. 
They @@ -2874,15 +2879,38 @@ mod tests { } #[test] - fn sandbox_caller_marker_detected_from_metadata() { + fn sandbox_caller_detected_from_principal_extension() { + use crate::auth::principal::{Principal, SandboxIdentitySource, SandboxPrincipal}; let mut req = Request::new(()); - req.metadata_mut().insert( - oidc::INTERNAL_AUTH_SOURCE_HEADER, - oidc::AUTH_SOURCE_SANDBOX.parse().unwrap(), - ); + req.extensions_mut() + .insert(Principal::Sandbox(SandboxPrincipal { + sandbox_id: "test-sandbox".to_string(), + source: SandboxIdentitySource::BootstrapJwt { + issuer: "openshell-gateway:test".to_string(), + jti: "j-1".to_string(), + }, + trust_domain: None, + })); assert!(is_sandbox_caller(&req)); } + #[test] + fn user_principal_not_treated_as_sandbox_caller() { + use crate::auth::identity::{Identity, IdentityProvider}; + use crate::auth::principal::{Principal, UserPrincipal}; + let mut req = Request::new(()); + req.extensions_mut().insert(Principal::User(UserPrincipal { + identity: Identity { + subject: "alice".to_string(), + display_name: None, + roles: vec![], + scopes: vec![], + provider: IdentityProvider::Oidc, + }, + })); + assert!(!is_sandbox_caller(&req)); + } + // ---- Sandbox without policy ---- #[tokio::test] diff --git a/crates/openshell-server/src/grpc/sandbox.rs b/crates/openshell-server/src/grpc/sandbox.rs index 035f68471..d5c87063d 100644 --- a/crates/openshell-server/src/grpc/sandbox.rs +++ b/crates/openshell-server/src/grpc/sandbox.rs @@ -99,12 +99,6 @@ pub(super) async fn handle_create_sandbox( } let id = uuid::Uuid::new_v4().to_string(); - // PR 3 wires `state.sandbox_jwt_issuer.mint(&id)` here for singleplayer - // drivers (Docker / Podman / VM), passing the minted token through the - // driver call so it lands in the sandbox bundle. K8s sandboxes skip - // this mint and exchange a projected ServiceAccount token via - // `IssueSandboxToken` at supervisor startup. 
- let name = if request.name.is_empty() { petname::petname(2, "-").unwrap_or_else(generate_name) } else { @@ -139,7 +133,28 @@ pub(super) async fn handle_create_sandbox( status })?; - let sandbox = state.compute.create_sandbox(sandbox).await?; + // Mint the gateway JWT for singleplayer drivers. K8s sandboxes skip + // this mint and bootstrap via `IssueSandboxToken` at supervisor + // startup; identifying "is this K8s?" lives in the compute layer, so + // we mint unconditionally here when the issuer is configured and let + // the K8s driver simply ignore the field. + let sandbox_token = state.sandbox_jwt_issuer.as_ref().map(|issuer| { + issuer.mint(&id).map(|minted| { + tracing::info!( + sandbox_id = %id, + jti = %minted.jti, + "minted sandbox JWT" + ); + minted.token + }) + }); + let sandbox_token = match sandbox_token { + Some(Ok(token)) => Some(token), + Some(Err(status)) => return Err(status), + None => None, + }; + + let sandbox = state.compute.create_sandbox(sandbox, sandbox_token).await?; info!( sandbox_id = %id, diff --git a/crates/openshell-server/src/multiplex.rs b/crates/openshell-server/src/multiplex.rs index 19346ba75..5c652e679 100644 --- a/crates/openshell-server/src/multiplex.rs +++ b/crates/openshell-server/src/multiplex.rs @@ -32,7 +32,7 @@ use tracing::Span; use crate::{ OpenShellService, ServerState, - auth::authenticator::{AuthenticatorChain, LegacySandboxMarkerAuthenticator}, + auth::authenticator::AuthenticatorChain, auth::authz::AuthzPolicy, auth::identity::Identity, auth::oidc::{self, OidcAuthenticator}, @@ -260,23 +260,18 @@ where /// Assemble the authenticator chain for the gateway. /// /// Chain order (first-match-wins): -/// 1. [`K8sServiceAccountAuthenticator`] (path-scoped to `IssueSandboxToken`) +/// 1. `K8sServiceAccountAuthenticator` (path-scoped to `IssueSandboxToken`) /// — exchanges a projected SA token for a `Principal::Sandbox` so the /// `IssueSandboxToken` handler can mint a gateway JWT. 
No-op on every /// other path; only present when the gateway runs in-cluster. -/// 2. [`SandboxJwtAuthenticator`] — validates gateway-minted JWTs. Recognized +/// 2. `SandboxJwtAuthenticator` — validates gateway-minted JWTs. Recognized /// via a distinctive `kid` so non-matching Bearer tokens fall through. -/// 3. [`LegacySandboxMarkerAuthenticator`] — PR-1 holdover that produces a -/// `Principal::Sandbox` for sandbox/dual-auth paths without a Bearer. -/// Removed in PR 3. -/// 4. [`OidcAuthenticator`] — validates user Bearer tokens against the -/// configured OIDC issuer. Only added when OIDC is configured. +/// 3. `OidcAuthenticator` — validates user Bearer tokens against the +/// configured OIDC issuer. /// -/// When OIDC is *and* sandbox-JWT signing are both unconfigured (a barebones -/// dev gateway), the chain is left as `None` so the router short-circuits to -/// pass-through. The legacy marker can satisfy sandbox-class routes only -/// when paired with an OIDC authenticator that gates user routes, so we -/// require at least one of OIDC or sandbox JWT to build a chain. +/// When neither OIDC nor gateway-minted JWTs are configured (a barebones +/// dev gateway), the chain is left as `None` so the router short-circuits +/// to pass-through. 
fn build_authenticator_chain(state: &ServerState) -> Option { let mut authenticators: Vec> = Vec::new(); if let Some(k8s) = state.k8s_sa_authenticator.clone() { @@ -285,13 +280,10 @@ fn build_authenticator_chain(state: &ServerState) -> Option if let Some(jwt) = state.sandbox_jwt_authenticator.clone() { authenticators.push(jwt); } - authenticators.push(Arc::new(LegacySandboxMarkerAuthenticator)); if let Some(cache) = state.oidc_cache.clone() { authenticators.push(Arc::new(OidcAuthenticator::new(cache))); - } else if state.sandbox_jwt_authenticator.is_none() { - // Neither OIDC nor gateway-minted JWTs are configured — preserve - // the pre-PR-1 "open by default" dev behavior by returning no chain - // so the router short-circuits to pass-through. + } + if authenticators.is_empty() { return None; } Some(AuthenticatorChain::new(authenticators)) @@ -376,10 +368,10 @@ where Box::pin(async move { let mut req = req; - oidc::clear_internal_auth_markers(req.headers_mut()); // No chain configured — pass through. Preserves today's - // "OIDC None means open" behavior for dev/fronting-proxy modes. + // "auth not configured means open" behavior for dev / + // fronting-proxy deployments. let Some(chain) = chain else { return inner.ready().await?.call(req).await; }; @@ -406,8 +398,9 @@ where }; // Authorize user principals via RBAC. Sandbox principals get - // their PR-4 equality check at the handler level; legacy markers - // (PR-1) bypass RBAC, matching pre-refactor behavior. + // a per-handler `sandbox_id` equality check in PR 4; right now + // they bypass RBAC because the public sandbox-class methods + // they call were path-bypassed before this refactor too. if let Principal::User(ref user) = principal && let Some(ref policy) = authz_policy && let Err(status) = policy.check(&user.identity, &path) @@ -415,13 +408,6 @@ where return Ok(status_response(status)); } - // PR-1 backwards-compat: handlers still consume the metadata - // marker today. 
Insert it for sandbox principals so existing - // policy-handler logic continues to work. PR 3 removes this. - if matches!(principal, Principal::Sandbox(_)) { - oidc::mark_sandbox_caller(req.headers_mut()); - } - req.extensions_mut().insert(principal); inner.ready().await?.call(req).await }) @@ -878,24 +864,21 @@ mod tests { type RecordedPrincipal = Arc>>; /// Service that snapshots the `Principal` from request extensions - /// and the `x-openshell-auth-source` header, then returns 200 OK. + /// and returns 200 OK. Used by router-level tests to assert the + /// chain's effect on the downstream service. #[derive(Clone)] struct PrincipalRecorder { recorded: RecordedPrincipal, - sandbox_marker: Arc>, } impl PrincipalRecorder { - fn new() -> (Self, RecordedPrincipal, Arc>) { + fn new() -> (Self, RecordedPrincipal) { let recorded = Arc::new(Mutex::new(None)); - let marker = Arc::new(Mutex::new(false)); ( Self { recorded: recorded.clone(), - sandbox_marker: marker.clone(), }, recorded, - marker, ) } } @@ -911,12 +894,7 @@ mod tests { fn call(&mut self, req: Request) -> Self::Future { let principal = req.extensions().get::().cloned(); - let has_marker = req - .headers() - .get(oidc::INTERNAL_AUTH_SOURCE_HEADER) - .is_some_and(|v| v.as_bytes() == oidc::AUTH_SOURCE_SANDBOX.as_bytes()); *self.recorded.lock().unwrap() = principal; - *self.sandbox_marker.lock().unwrap() = has_marker; Box::pin(async move { let body = tonic::body::BoxBody::new( Full::new(Bytes::new()) @@ -949,9 +927,12 @@ mod tests { fn sandbox_principal() -> Principal { Principal::Sandbox(SandboxPrincipal { - sandbox_id: String::new(), - source: SandboxIdentitySource::LegacyMarker, - trust_domain: None, + sandbox_id: "sandbox-a".to_string(), + source: SandboxIdentitySource::BootstrapJwt { + issuer: "openshell-gateway:test".to_string(), + jti: "j-1".to_string(), + }, + trust_domain: Some("openshell".to_string()), }) } @@ -961,7 +942,7 @@ mod tests { "alice", ))))); let chain = 
AuthenticatorChain::new(vec![mock]); - let (recorder, seen, _) = PrincipalRecorder::new(); + let (recorder, seen) = PrincipalRecorder::new(); let mut router = AuthGrpcRouter::new(recorder, Some(chain), None); let _ = router .call(empty_request("/openshell.v1.OpenShell/ListSandboxes")) @@ -975,32 +956,27 @@ mod tests { } #[tokio::test] - async fn sandbox_principal_inserts_metadata_marker_for_backcompat() { - // PR-1 keeps the metadata marker so handlers that still read it - // (until PR 3/4 swap them over to extensions) keep working. + async fn sandbox_principal_lands_in_request_extensions() { let mock = Arc::new(MockAuthenticator::returning(Ok(Some(sandbox_principal())))); let chain = AuthenticatorChain::new(vec![mock]); - let (recorder, seen, marker_seen) = PrincipalRecorder::new(); + let (recorder, seen) = PrincipalRecorder::new(); let mut router = AuthGrpcRouter::new(recorder, Some(chain), None); let _ = router .call(empty_request("/openshell.v1.OpenShell/ReportPolicyStatus")) .await .unwrap(); - assert!( - matches!(seen.lock().unwrap().clone(), Some(Principal::Sandbox(_))), - "principal must reach extensions" - ); - assert!( - *marker_seen.lock().unwrap(), - "sandbox principals must also set the legacy metadata marker" - ); + let captured = seen.lock().unwrap().clone(); + match captured { + Some(Principal::Sandbox(p)) => assert_eq!(p.sandbox_id, "sandbox-a"), + other => panic!("expected sandbox principal, got {other:?}"), + } } #[tokio::test] async fn missing_principal_returns_unauthenticated() { let mock = Arc::new(MockAuthenticator::returning(Ok(None))); let chain = AuthenticatorChain::new(vec![mock]); - let (recorder, seen, _) = PrincipalRecorder::new(); + let (recorder, seen) = PrincipalRecorder::new(); let mut router = AuthGrpcRouter::new(recorder, Some(chain), None); let res = router .call(empty_request("/openshell.v1.OpenShell/ListSandboxes")) @@ -1021,7 +997,7 @@ mod tests { tonic::Status::unauthenticated("forged"), ))); let chain = 
AuthenticatorChain::new(vec![mock]); - let (recorder, seen, _) = PrincipalRecorder::new(); + let (recorder, seen) = PrincipalRecorder::new(); let mut router = AuthGrpcRouter::new(recorder, Some(chain), None); let res = router .call(empty_request("/openshell.v1.OpenShell/ListSandboxes")) @@ -1045,7 +1021,7 @@ mod tests { tonic::Status::unauthenticated("would reject"), ))); let chain = AuthenticatorChain::new(vec![mock.clone()]); - let (recorder, _, _) = PrincipalRecorder::new(); + let (recorder, _) = PrincipalRecorder::new(); let mut router = AuthGrpcRouter::new(recorder, Some(chain), None); let res = router .call(empty_request("/openshell.v1.OpenShell/Health")) @@ -1054,25 +1030,5 @@ mod tests { assert_eq!(res.status(), 200); assert_eq!(mock.call_count(), 0, "health must not consult the chain"); } - - #[tokio::test] - async fn external_auth_source_marker_is_stripped() { - let mock = Arc::new(MockAuthenticator::returning(Ok(Some(user_principal( - "alice", - ))))); - let chain = AuthenticatorChain::new(vec![mock]); - let (recorder, _, marker_seen) = PrincipalRecorder::new(); - let mut router = AuthGrpcRouter::new(recorder, Some(chain), None); - let mut req = empty_request("/openshell.v1.OpenShell/ListSandboxes"); - req.headers_mut().insert( - oidc::INTERNAL_AUTH_SOURCE_HEADER, - HeaderValue::from_static(oidc::AUTH_SOURCE_SANDBOX), - ); - let _ = router.call(req).await.unwrap(); - assert!( - !*marker_seen.lock().unwrap(), - "external sandbox marker must be stripped before auth" - ); - } } } diff --git a/deploy/helm/openshell/templates/role.yaml b/deploy/helm/openshell/templates/role.yaml index 1d756117c..4d26451bf 100644 --- a/deploy/helm/openshell/templates/role.yaml +++ b/deploy/helm/openshell/templates/role.yaml @@ -29,3 +29,14 @@ rules: - get - list - watch + # Per-sandbox identity (issue #1354): the gateway resolves a sandbox + # pod's projected SA token to its `openshell.io/sandbox-id` annotation + # via a pod GET when the supervisor calls IssueSandboxToken. 
patch is + # intentionally NOT granted — the annotation is set once at pod create + # and must remain immutable for the lifetime of the sandbox. + - apiGroups: + - "" + resources: + - pods + verbs: + - get diff --git a/proto/compute_driver.proto b/proto/compute_driver.proto index 3c4308f3f..6de13f3e5 100644 --- a/proto/compute_driver.proto +++ b/proto/compute_driver.proto @@ -90,6 +90,13 @@ message DriverSandboxSpec { // (e.g. "0", "1"). When empty with gpu=true, the driver assigns the // first available GPU. string gpu_device = 10; + // Gateway-minted JWT identifying this sandbox to the gateway. Set by + // the gateway on create; the driver materialises it via its native + // secret mechanism (Docker/Podman/VM bind-mount a per-sandbox file; + // the Kubernetes driver ignores this field and relies on its projected + // ServiceAccount token bootstrap instead). Never echoed to the public + // Sandbox proto. + string sandbox_token = 11; } // Driver-owned runtime template consumed by the compute platform. From 8e4c64ab0b2f8aef49c522e20d1d07e18c58a2bd Mon Sep 17 00:00:00 2001 From: Taylor Mutch Date: Thu, 14 May 2026 21:25:55 -0700 Subject: [PATCH 04/18] fix(sandbox): strip supervisor-only credentials from entrypoint env ProcessHandle::spawn_impl previously inherited the supervisor's full environment when starting the sandbox entrypoint, then drop_privileges() demoted the child to the sandbox user. The combination meant a later process running as the sandbox user (e.g. an SSH-spawned shell) could read /proc/<pid>/environ and recover the gateway-minted JWT. Explicitly env_remove the three sandbox-token env vars before exec so the entrypoint child carries none of the supervisor's identity material. SSH session shells already use env_clear() in apply_child_env, so this plugs the only remaining inheritance path. Related to #1354 (per-sandbox identity series, PR 3 follow-up). 
Signed-off-by: Taylor Mutch --- crates/openshell-sandbox/src/process.rs | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/crates/openshell-sandbox/src/process.rs b/crates/openshell-sandbox/src/process.rs index 3d2f6d576..8c6eb77f3 100644 --- a/crates/openshell-sandbox/src/process.rs +++ b/crates/openshell-sandbox/src/process.rs @@ -155,6 +155,15 @@ impl ProcessHandle { .kill_on_drop(true) .env(openshell_core::sandbox_env::SANDBOX, "1"); + // Strip supervisor-only credentials from the entrypoint's inherited + // environment. The entrypoint drops to the sandbox user before + // `exec`; without this strip, anything running as the sandbox user + // (e.g. an SSH-spawned shell) could read /proc/<pid>/environ + // and recover the gateway-minted JWT. Issue #1354. + cmd.env_remove(openshell_core::sandbox_env::SANDBOX_TOKEN) + .env_remove(openshell_core::sandbox_env::SANDBOX_TOKEN_FILE) + .env_remove(openshell_core::sandbox_env::K8S_SA_TOKEN_FILE); + inject_provider_env(&mut cmd, provider_env); if let Some(dir) = workdir { @@ -281,6 +290,15 @@ impl ProcessHandle { .kill_on_drop(true) .env(openshell_core::sandbox_env::SANDBOX, "1"); + // Strip supervisor-only credentials from the entrypoint's inherited + // environment. The entrypoint drops to the sandbox user before + // `exec`; without this strip, anything running as the sandbox user + // (e.g. an SSH-spawned shell) could read /proc/<pid>/environ + // and recover the gateway-minted JWT. Issue #1354. 
+ cmd.env_remove(openshell_core::sandbox_env::SANDBOX_TOKEN) + .env_remove(openshell_core::sandbox_env::SANDBOX_TOKEN_FILE) + .env_remove(openshell_core::sandbox_env::K8S_SA_TOKEN_FILE); + inject_provider_env(&mut cmd, provider_env); if let Some(dir) = workdir { From 393fb6dd630f49c3731c8b188f39f31349aece13 Mon Sep 17 00:00:00 2001 From: Taylor Mutch Date: Thu, 14 May 2026 21:59:51 -0700 Subject: [PATCH 05/18] fix(server): per-handler sandbox_id equality check (closes #1354) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adds the IDOR guard that closes the second half of the per-sandbox identity series. Every sandbox-class handler now verifies that the calling Principal::Sandbox.sandbox_id matches the canonical UUID the request body operates on. User principals bypass the check because RBAC was their gate at the router layer; anonymous callers are rejected outright. New module crates/openshell-server/src/auth/guard.rs exposes ensure_sandbox_scope / enforce_sandbox_scope. Applied at the top of: - handle_get_sandbox_config (id-keyed) - handle_get_sandbox_provider_environment (id-keyed) - handle_report_policy_status (id-keyed) - handle_push_sandbox_logs (id-keyed, first frame only — principal is stable across the stream) - handle_submit_policy_analysis (name-keyed: resolve to id, then check) - handle_get_draft_policy (name-keyed) - handle_update_config (dual-auth: enforce only when Principal::Sandbox; CLI / TUI user paths are unaffected) - handle_get_inference_bundle (no sandbox_id in body; accept any authenticated principal, reject anonymous) Existing policy.rs tests are updated to wrap their requests with a test-helper user principal so the new guard treats them as CLI calls; six new tests cover the cross-sandbox-denied / same-sandbox-allowed / user-bypasses-guard matrix. 
Signed-off-by: Taylor Mutch --- crates/openshell-server/src/auth/guard.rs | 137 ++++++++ crates/openshell-server/src/auth/mod.rs | 1 + crates/openshell-server/src/grpc/policy.rs | 372 +++++++++++++++++---- crates/openshell-server/src/inference.rs | 19 +- 4 files changed, 469 insertions(+), 60 deletions(-) create mode 100644 crates/openshell-server/src/auth/guard.rs diff --git a/crates/openshell-server/src/auth/guard.rs b/crates/openshell-server/src/auth/guard.rs new file mode 100644 index 000000000..f5cdb8131 --- /dev/null +++ b/crates/openshell-server/src/auth/guard.rs @@ -0,0 +1,137 @@ +// SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +//! Per-handler sandbox-scope guards. +//! +//! Closes the IDOR half of issue #1354: a sandbox principal may only +//! reference its own sandbox, identified by its [`Principal::Sandbox`]'s +//! `sandbox_id`. User principals retain the broad scope the RBAC layer +//! already evaluated. + +use super::principal::Principal; +use tonic::Status; +use tracing::info; + +/// Reject a sandbox-class request whose body references a sandbox other +/// than the one the calling principal was authenticated against. +/// +/// - [`Principal::User`] passes through (RBAC has already evaluated user +/// scope at the router level). +/// - [`Principal::Sandbox`] must reference the same canonical UUID it +/// was authenticated with. +/// - [`Principal::Anonymous`] is rejected — sandbox-class methods are +/// never anonymously callable. +/// +/// `claimed_sandbox_id` is the canonical UUID the request is operating +/// on. Name-keyed handlers must resolve the name to a UUID via the +/// store before calling this guard. 
+#[allow(clippy::result_large_err)] +pub fn ensure_sandbox_scope(principal: &Principal, claimed_sandbox_id: &str) -> Result<(), Status> { + match principal { + Principal::User(_) => Ok(()), + Principal::Sandbox(p) => { + if p.sandbox_id == claimed_sandbox_id { + Ok(()) + } else { + info!( + principal_sandbox_id = %p.sandbox_id, + requested_sandbox_id = %claimed_sandbox_id, + "cross-sandbox access denied" + ); + Err(Status::permission_denied( + "cross-sandbox access denied: principal does not own this sandbox", + )) + } + } + Principal::Anonymous => Err(Status::unauthenticated( + "sandbox-scoped methods require an authenticated caller", + )), + } +} + +/// Convenience: read the `Principal` out of a request and apply +/// [`ensure_sandbox_scope`]. Returns the principal so callers can read it +/// further (e.g. for audit logging). +#[allow(clippy::result_large_err)] +pub fn enforce_sandbox_scope( + request: &tonic::Request, + claimed_sandbox_id: &str, +) -> Result { + let principal = request + .extensions() + .get::() + .cloned() + .ok_or_else(|| Status::unauthenticated("missing principal"))?; + ensure_sandbox_scope(&principal, claimed_sandbox_id)?; + Ok(principal) +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::auth::identity::{Identity, IdentityProvider}; + use crate::auth::principal::{SandboxIdentitySource, SandboxPrincipal, UserPrincipal}; + + fn user(subject: &str) -> Principal { + Principal::User(UserPrincipal { + identity: Identity { + subject: subject.to_string(), + display_name: None, + roles: vec![], + scopes: vec![], + provider: IdentityProvider::Oidc, + }, + }) + } + + fn sandbox(id: &str) -> Principal { + Principal::Sandbox(SandboxPrincipal { + sandbox_id: id.to_string(), + source: SandboxIdentitySource::BootstrapJwt { + issuer: "openshell-gateway:test".to_string(), + jti: "j-1".to_string(), + }, + trust_domain: Some("openshell".to_string()), + }) + } + + #[test] + fn user_principal_bypasses_equality_check() { + // RBAC was the user's gate at 
the router layer. + assert!(ensure_sandbox_scope(&user("alice"), "any-sandbox").is_ok()); + } + + #[test] + fn sandbox_principal_matching_id_is_allowed() { + assert!(ensure_sandbox_scope(&sandbox("sbx-1"), "sbx-1").is_ok()); + } + + #[test] + fn sandbox_principal_mismatched_id_is_denied() { + let err = + ensure_sandbox_scope(&sandbox("sbx-1"), "sbx-2").expect_err("must deny cross-sandbox"); + assert_eq!(err.code(), tonic::Code::PermissionDenied); + } + + #[test] + fn anonymous_principal_is_rejected() { + let err = + ensure_sandbox_scope(&Principal::Anonymous, "sbx-1").expect_err("must reject anon"); + assert_eq!(err.code(), tonic::Code::Unauthenticated); + } + + #[test] + fn enforce_reads_from_request_extensions() { + let mut req = tonic::Request::new(()); + req.extensions_mut().insert(sandbox("sbx-1")); + let result = enforce_sandbox_scope(&req, "sbx-1").expect("scope OK"); + assert!(matches!(result, Principal::Sandbox(_))); + } + + #[test] + fn enforce_rejects_request_without_principal() { + let req = tonic::Request::new(()); + let err = enforce_sandbox_scope(&req, "sbx-1").expect_err("must require principal"); + assert_eq!(err.code(), tonic::Code::Unauthenticated); + } +} diff --git a/crates/openshell-server/src/auth/mod.rs b/crates/openshell-server/src/auth/mod.rs index 3d8152de1..d4c6978af 100644 --- a/crates/openshell-server/src/auth/mod.rs +++ b/crates/openshell-server/src/auth/mod.rs @@ -10,6 +10,7 @@ pub mod authenticator; pub mod authz; +pub mod guard; mod http; pub mod identity; pub mod k8s_sa; diff --git a/crates/openshell-server/src/grpc/policy.rs b/crates/openshell-server/src/grpc/policy.rs index 62d837851..4e9b82700 100644 --- a/crates/openshell-server/src/grpc/policy.rs +++ b/crates/openshell-server/src/grpc/policy.rs @@ -314,6 +314,7 @@ fn truncate_for_log(input: &str, max_chars: usize) -> String { } } +#[cfg(test)] fn is_sandbox_caller(request: &Request) -> bool { matches!( request @@ -357,7 +358,9 @@ pub(super) async fn handle_get_sandbox_config( 
state: &Arc, request: Request, ) -> Result, Status> { - let sandbox_id = request.into_inner().sandbox_id; + let sandbox_id = request.get_ref().sandbox_id.clone(); + crate::auth::guard::enforce_sandbox_scope(&request, &sandbox_id)?; + drop(request); let sandbox = state .store @@ -620,7 +623,9 @@ pub(super) async fn handle_get_sandbox_provider_environment( state: &Arc, request: Request, ) -> Result, Status> { - let sandbox_id = request.into_inner().sandbox_id; + let sandbox_id = request.get_ref().sandbox_id.clone(); + crate::auth::guard::enforce_sandbox_scope(&request, &sandbox_id)?; + drop(request); let sandbox = state .store @@ -663,10 +668,32 @@ pub(super) async fn handle_update_config( state: &Arc, request: Request, ) -> Result, Status> { - let sandbox_caller = is_sandbox_caller(&request); + let principal = request + .extensions() + .get::() + .cloned(); + let sandbox_caller = matches!( + principal, + Some(crate::auth::principal::Principal::Sandbox(_)) + ); let req = request.into_inner(); if sandbox_caller { validate_sandbox_caller_update(&req)?; + // Resolve req.name to a sandbox UUID and verify the calling + // sandbox principal owns it. User callers (CLI / TUI) bypass + // this check because RBAC was their gate. + let sandbox = state + .store + .get_message_by_name::(&req.name) + .await + .map_err(|e| Status::internal(format!("fetch sandbox failed: {e}")))? 
+ .ok_or_else(|| Status::not_found("sandbox not found"))?; + crate::auth::guard::ensure_sandbox_scope( + principal + .as_ref() + .expect("sandbox_caller implies principal"), + sandbox.object_id(), + )?; } let key = req.setting_key.trim(); let has_policy = req.policy.is_some(); @@ -1189,6 +1216,8 @@ pub(super) async fn handle_report_policy_status( state: &Arc, request: Request, ) -> Result, Status> { + let sandbox_id = request.get_ref().sandbox_id.clone(); + crate::auth::guard::enforce_sandbox_scope(&request, &sandbox_id)?; let req = request.into_inner(); if req.sandbox_id.is_empty() { return Err(Status::invalid_argument("sandbox_id is required")); @@ -1311,6 +1340,11 @@ pub(super) async fn handle_push_sandbox_logs( state: &Arc, request: Request>, ) -> Result, Status> { + let principal = request + .extensions() + .get::() + .cloned() + .ok_or_else(|| Status::unauthenticated("missing principal"))?; let mut stream = request.into_inner(); let mut validated = false; @@ -1324,6 +1358,10 @@ pub(super) async fn handle_push_sandbox_logs( } if !validated { + // The streaming RPC carries the sandbox_id in every frame, but + // the equality check only needs to run once on the first frame + // — the principal is stable across the stream. + crate::auth::guard::ensure_sandbox_scope(&principal, &batch.sandbox_id)?; state .store .get_message::(&batch.sandbox_id) @@ -1352,6 +1390,11 @@ pub(super) async fn handle_submit_policy_analysis( state: &Arc, request: Request, ) -> Result, Status> { + let principal = request + .extensions() + .get::() + .cloned() + .ok_or_else(|| Status::unauthenticated("missing principal"))?; let req = request.into_inner(); if req.name.is_empty() { return Err(Status::invalid_argument("name is required")); @@ -1364,6 +1407,9 @@ pub(super) async fn handle_submit_policy_analysis( .map_err(|e| Status::internal(format!("fetch sandbox failed: {e}")))? 
.ok_or_else(|| Status::not_found("sandbox not found"))?; let sandbox_id = sandbox.object_id().to_string(); + // Name → id resolved; now enforce that a sandbox principal only acts + // on its own sandbox. User principals are unaffected. + crate::auth::guard::ensure_sandbox_scope(&principal, &sandbox_id)?; let current_version = state .store @@ -1480,6 +1526,11 @@ pub(super) async fn handle_get_draft_policy( state: &Arc, request: Request, ) -> Result, Status> { + let principal = request + .extensions() + .get::() + .cloned() + .ok_or_else(|| Status::unauthenticated("missing principal"))?; let req = request.into_inner(); if req.name.is_empty() { return Err(Status::invalid_argument("name is required")); @@ -1492,6 +1543,7 @@ pub(super) async fn handle_get_draft_policy( .map_err(|e| Status::internal(format!("fetch sandbox failed: {e}")))? .ok_or_else(|| Status::not_found("sandbox not found"))?; let sandbox_id = sandbox.object_id().to_string(); + crate::auth::guard::ensure_sandbox_scope(&principal, &sandbox_id)?; let status_filter = if req.status_filter.is_empty() { None @@ -2834,6 +2886,10 @@ fn materialize_global_settings( mod tests { use super::*; use crate::ServerState; + use crate::auth::identity::{Identity, IdentityProvider}; + use crate::auth::principal::{ + Principal, SandboxIdentitySource, SandboxPrincipal, UserPrincipal, + }; use crate::compute::new_test_runtime; use crate::persistence::Store; use crate::sandbox_index::SandboxIndex; @@ -2845,6 +2901,41 @@ mod tests { use std::sync::Arc; use tonic::Code; + /// Wrap a request with a user `Principal` so handlers' scope guards + /// (introduced in PR 4) treat the test caller as a CLI user — equivalent + /// to the pre-PR-4 behavior where all tests effectively ran as user. 
+ fn with_user(mut request: Request) -> Request { + request + .extensions_mut() + .insert(Principal::User(UserPrincipal { + identity: Identity { + subject: "test-user".to_string(), + display_name: None, + roles: vec![], + scopes: vec![], + provider: IdentityProvider::Oidc, + }, + })); + request + } + + /// Wrap a request with a sandbox `Principal` bound to `sandbox_id`. + /// Use for tests that exercise sandbox-caller code paths. + #[allow(dead_code)] + fn with_sandbox(mut request: Request, sandbox_id: &str) -> Request { + request + .extensions_mut() + .insert(Principal::Sandbox(SandboxPrincipal { + sandbox_id: sandbox_id.to_string(), + source: SandboxIdentitySource::BootstrapJwt { + issuer: "openshell-gateway:test".to_string(), + jti: "j-test".to_string(), + }, + trust_domain: Some("openshell".to_string()), + })); + request + } + #[test] fn sandbox_caller_update_validation_allows_sandbox_policy_sync() { let req = UpdateConfigRequest { @@ -2911,6 +3002,169 @@ mod tests { assert!(!is_sandbox_caller(&req)); } + // ---- PR-4 IDOR guard (issue #1354) ---- + + #[tokio::test] + async fn cross_sandbox_get_sandbox_config_denied() { + use openshell_core::proto::{SandboxPhase, SandboxSpec}; + let state = test_server_state().await; + // Two sandboxes; the caller is principal of A, the request body + // references B. 
+ for (id, name) in [("sb-a", "sandbox-a"), ("sb-b", "sandbox-b")] { + let sandbox = Sandbox { + metadata: Some(openshell_core::proto::datamodel::v1::ObjectMeta { + id: id.to_string(), + name: name.to_string(), + created_at_ms: 1_000_000, + labels: HashMap::new(), + }), + spec: Some(SandboxSpec { + policy: None, + ..Default::default() + }), + phase: SandboxPhase::Provisioning as i32, + ..Default::default() + }; + state.store.put_message(&sandbox).await.unwrap(); + } + let req = with_sandbox( + Request::new(GetSandboxConfigRequest { + sandbox_id: "sb-b".to_string(), + }), + "sb-a", + ); + let err = handle_get_sandbox_config(&state, req) + .await + .expect_err("cross-sandbox call must be denied"); + assert_eq!(err.code(), Code::PermissionDenied); + } + + #[tokio::test] + async fn same_sandbox_get_sandbox_config_allowed() { + use openshell_core::proto::{SandboxPhase, SandboxSpec}; + let state = test_server_state().await; + let sandbox = Sandbox { + metadata: Some(openshell_core::proto::datamodel::v1::ObjectMeta { + id: "sb-self".to_string(), + name: "self".to_string(), + created_at_ms: 1_000_000, + labels: HashMap::new(), + }), + spec: Some(SandboxSpec { + policy: None, + ..Default::default() + }), + phase: SandboxPhase::Provisioning as i32, + ..Default::default() + }; + state.store.put_message(&sandbox).await.unwrap(); + let req = with_sandbox( + Request::new(GetSandboxConfigRequest { + sandbox_id: "sb-self".to_string(), + }), + "sb-self", + ); + handle_get_sandbox_config(&state, req) + .await + .expect("matching principal must be allowed"); + } + + #[tokio::test] + async fn cross_sandbox_submit_policy_analysis_denied() { + use openshell_core::proto::{SandboxPhase, SandboxSpec}; + let state = test_server_state().await; + for (id, name) in [("sb-a", "sandbox-a"), ("sb-b", "sandbox-b")] { + let sandbox = Sandbox { + metadata: Some(openshell_core::proto::datamodel::v1::ObjectMeta { + id: id.to_string(), + name: name.to_string(), + created_at_ms: 1_000_000, + labels: 
HashMap::new(), + }), + spec: Some(SandboxSpec { + policy: None, + ..Default::default() + }), + phase: SandboxPhase::Provisioning as i32, + ..Default::default() + }; + state.store.put_message(&sandbox).await.unwrap(); + } + let req = with_sandbox( + Request::new(SubmitPolicyAnalysisRequest { + name: "sandbox-b".to_string(), + ..Default::default() + }), + "sb-a", + ); + let err = handle_submit_policy_analysis(&state, req) + .await + .expect_err("cross-sandbox submit must be denied"); + assert_eq!(err.code(), Code::PermissionDenied); + } + + #[tokio::test] + async fn cross_sandbox_get_draft_policy_denied() { + use openshell_core::proto::{SandboxPhase, SandboxSpec}; + let state = test_server_state().await; + for (id, name) in [("sb-a", "sandbox-a"), ("sb-b", "sandbox-b")] { + let sandbox = Sandbox { + metadata: Some(openshell_core::proto::datamodel::v1::ObjectMeta { + id: id.to_string(), + name: name.to_string(), + created_at_ms: 1_000_000, + labels: HashMap::new(), + }), + spec: Some(SandboxSpec { + policy: None, + ..Default::default() + }), + phase: SandboxPhase::Provisioning as i32, + ..Default::default() + }; + state.store.put_message(&sandbox).await.unwrap(); + } + let req = with_sandbox( + Request::new(GetDraftPolicyRequest { + name: "sandbox-b".to_string(), + status_filter: String::new(), + }), + "sb-a", + ); + let err = handle_get_draft_policy(&state, req) + .await + .expect_err("cross-sandbox draft read must be denied"); + assert_eq!(err.code(), Code::PermissionDenied); + } + + #[tokio::test] + async fn user_principal_can_read_any_sandbox_config() { + // RBAC was the user gate; the IDOR guard must NOT trip for users. 
+ use openshell_core::proto::{SandboxPhase, SandboxSpec}; + let state = test_server_state().await; + let sandbox = Sandbox { + metadata: Some(openshell_core::proto::datamodel::v1::ObjectMeta { + id: "sb-x".to_string(), + name: "x".to_string(), + created_at_ms: 1_000_000, + labels: HashMap::new(), + }), + spec: Some(SandboxSpec { + policy: None, + ..Default::default() + }), + phase: SandboxPhase::Provisioning as i32, + ..Default::default() + }; + state.store.put_message(&sandbox).await.unwrap(); + let req = with_user(Request::new(GetSandboxConfigRequest { + sandbox_id: "sb-x".to_string(), + })); + handle_get_sandbox_config(&state, req) + .await + .expect("user principal must succeed"); + } + // ---- Sandbox without policy ---- #[tokio::test] @@ -3024,9 +3278,9 @@ mod tests { async fn get_sandbox_policy(state: &Arc, sandbox_id: &str) -> ProtoSandboxPolicy { handle_get_sandbox_config( state, - Request::new(GetSandboxConfigRequest { + with_user(Request::new(GetSandboxConfigRequest { sandbox_id: sandbox_id.to_string(), - }), + })), ) .await .unwrap() @@ -3467,9 +3721,9 @@ mod tests { let legacy_env = handle_get_sandbox_provider_environment( &state, - Request::new(GetSandboxProviderEnvironmentRequest { + with_user(Request::new(GetSandboxProviderEnvironmentRequest { sandbox_id: "sb-provider-env".to_string(), - }), + })), ) .await .unwrap() @@ -3479,9 +3733,9 @@ mod tests { enable_providers_v2(&state).await; let v2_env = handle_get_sandbox_provider_environment( &state, - Request::new(GetSandboxProviderEnvironmentRequest { + with_user(Request::new(GetSandboxProviderEnvironmentRequest { sandbox_id: "sb-provider-env".to_string(), - }), + })), ) .await .unwrap() @@ -3513,9 +3767,9 @@ mod tests { let first = handle_get_sandbox_provider_environment( &state, - Request::new(GetSandboxProviderEnvironmentRequest { + with_user(Request::new(GetSandboxProviderEnvironmentRequest { sandbox_id: "sb-provider-revision".to_string(), - }), + })), ) .await .unwrap() @@ -3529,9 +3783,9 @@ mod 
tests { let second = handle_get_sandbox_provider_environment( &state, - Request::new(GetSandboxProviderEnvironmentRequest { + with_user(Request::new(GetSandboxProviderEnvironmentRequest { sandbox_id: "sb-provider-revision".to_string(), - }), + })), ) .await .unwrap() @@ -3583,9 +3837,9 @@ mod tests { ); let baseline_env = handle_get_sandbox_provider_environment( &state, - Request::new(GetSandboxProviderEnvironmentRequest { + with_user(Request::new(GetSandboxProviderEnvironmentRequest { sandbox_id: "sb-attach-lifecycle".to_string(), - }), + })), ) .await .unwrap() @@ -3593,11 +3847,11 @@ mod tests { handle_attach_sandbox_provider( &state, - Request::new(AttachSandboxProviderRequest { + with_user(Request::new(AttachSandboxProviderRequest { sandbox_name: "attach-lifecycle".to_string(), provider_name: "work-github".to_string(), expected_resource_version: 0, - }), + })), ) .await .unwrap(); @@ -3611,9 +3865,9 @@ mod tests { let attached_env = handle_get_sandbox_provider_environment( &state, - Request::new(GetSandboxProviderEnvironmentRequest { + with_user(Request::new(GetSandboxProviderEnvironmentRequest { sandbox_id: "sb-attach-lifecycle".to_string(), - }), + })), ) .await .unwrap() @@ -3647,9 +3901,9 @@ mod tests { let detached_env = handle_get_sandbox_provider_environment( &state, - Request::new(GetSandboxProviderEnvironmentRequest { + with_user(Request::new(GetSandboxProviderEnvironmentRequest { sandbox_id: "sb-attach-lifecycle".to_string(), - }), + })), ) .await .unwrap() @@ -3743,9 +3997,9 @@ mod tests { ); let baseline_env = handle_get_sandbox_provider_environment( &state, - Request::new(GetSandboxProviderEnvironmentRequest { + with_user(Request::new(GetSandboxProviderEnvironmentRequest { sandbox_id: "sb-custom-attach-lifecycle".to_string(), - }), + })), ) .await .unwrap() @@ -3753,11 +4007,11 @@ mod tests { handle_attach_sandbox_provider( &state, - Request::new(AttachSandboxProviderRequest { + with_user(Request::new(AttachSandboxProviderRequest { sandbox_name: 
"custom-attach-lifecycle".to_string(), provider_name: "work-custom".to_string(), expected_resource_version: 0, - }), + })), ) .await .unwrap(); @@ -3774,9 +4028,9 @@ mod tests { let attached_env = handle_get_sandbox_provider_environment( &state, - Request::new(GetSandboxProviderEnvironmentRequest { + with_user(Request::new(GetSandboxProviderEnvironmentRequest { sandbox_id: "sb-custom-attach-lifecycle".to_string(), - }), + })), ) .await .unwrap() @@ -3809,9 +4063,9 @@ mod tests { ); let detached_env = handle_get_sandbox_provider_environment( &state, - Request::new(GetSandboxProviderEnvironmentRequest { + with_user(Request::new(GetSandboxProviderEnvironmentRequest { sandbox_id: "sb-custom-attach-lifecycle".to_string(), - }), + })), ) .await .unwrap() @@ -3909,9 +4163,9 @@ mod tests { let response = handle_get_sandbox_config( &state, - Request::new(GetSandboxConfigRequest { + with_user(Request::new(GetSandboxConfigRequest { sandbox_id: "sb-global-profile".to_string(), - }), + })), ) .await .unwrap() @@ -4056,7 +4310,7 @@ mod tests { let submit = handle_submit_policy_analysis( &state, - Request::new(SubmitPolicyAnalysisRequest { + with_user(Request::new(SubmitPolicyAnalysisRequest { name: sandbox_name.clone(), proposed_chunks: vec![PolicyChunk { rule_name: "allow_example".to_string(), @@ -4070,7 +4324,7 @@ mod tests { ..Default::default() }], ..Default::default() - }), + })), ) .await .unwrap() @@ -4082,10 +4336,10 @@ mod tests { let draft_policy = handle_get_draft_policy( &state, - Request::new(GetDraftPolicyRequest { + with_user(Request::new(GetDraftPolicyRequest { name: sandbox_name.clone(), status_filter: String::new(), - }), + })), ) .await .unwrap() @@ -4152,10 +4406,10 @@ mod tests { let draft_policy_after_undo = handle_get_draft_policy( &state, - Request::new(GetDraftPolicyRequest { + with_user(Request::new(GetDraftPolicyRequest { name: sandbox_name.clone(), status_filter: String::new(), - }), + })), ) .await .unwrap() @@ -4204,10 +4458,10 @@ mod tests { let 
draft_policy_after_clear = handle_get_draft_policy( &state, - Request::new(GetDraftPolicyRequest { + with_user(Request::new(GetDraftPolicyRequest { name: sandbox_name.clone(), status_filter: String::new(), - }), + })), ) .await .unwrap() @@ -4266,7 +4520,7 @@ mod tests { let submit = handle_submit_policy_analysis( &state, - Request::new(SubmitPolicyAnalysisRequest { + with_user(Request::new(SubmitPolicyAnalysisRequest { name: sandbox_name.clone(), proposed_chunks: vec![PolicyChunk { rule_name: "allow_example".to_string(), @@ -4275,7 +4529,7 @@ mod tests { ..Default::default() }], ..Default::default() - }), + })), ) .await .unwrap() @@ -4296,10 +4550,10 @@ mod tests { let draft = handle_get_draft_policy( &state, - Request::new(GetDraftPolicyRequest { + with_user(Request::new(GetDraftPolicyRequest { name: sandbox_name, status_filter: String::new(), - }), + })), ) .await .unwrap() @@ -4375,7 +4629,7 @@ mod tests { async move { handle_submit_policy_analysis( &state, - Request::new(SubmitPolicyAnalysisRequest { + with_user(Request::new(SubmitPolicyAnalysisRequest { name: sandbox_name, analysis_mode: "agent_authored".to_string(), proposed_chunks: vec![PolicyChunk { @@ -4384,7 +4638,7 @@ mod tests { ..Default::default() }], ..Default::default() - }), + })), ) .await .unwrap() @@ -4404,10 +4658,10 @@ mod tests { let draft = handle_get_draft_policy( &state, - Request::new(GetDraftPolicyRequest { + with_user(Request::new(GetDraftPolicyRequest { name: sandbox_name.clone(), status_filter: String::new(), - }), + })), ) .await .unwrap() @@ -4482,7 +4736,7 @@ mod tests { async move { handle_submit_policy_analysis( &state, - Request::new(SubmitPolicyAnalysisRequest { + with_user(Request::new(SubmitPolicyAnalysisRequest { name: sandbox_name, analysis_mode: "mechanistic".to_string(), proposed_chunks: vec![PolicyChunk { @@ -4491,7 +4745,7 @@ mod tests { ..Default::default() }], ..Default::default() - }), + })), ) .await .unwrap() @@ -4505,10 +4759,10 @@ mod tests { let draft = 
handle_get_draft_policy( &state, - Request::new(GetDraftPolicyRequest { + with_user(Request::new(GetDraftPolicyRequest { name: sandbox_name, status_filter: String::new(), - }), + })), ) .await .unwrap() @@ -4578,7 +4832,7 @@ mod tests { let submit = handle_submit_policy_analysis( &state, - Request::new(SubmitPolicyAnalysisRequest { + with_user(Request::new(SubmitPolicyAnalysisRequest { name: sandbox_name.clone(), proposed_chunks: vec![PolicyChunk { rule_name: "allow_example".to_string(), @@ -4586,7 +4840,7 @@ mod tests { ..Default::default() }], ..Default::default() - }), + })), ) .await .unwrap() @@ -4626,10 +4880,10 @@ mod tests { let draft = handle_get_draft_policy( &state, - Request::new(GetDraftPolicyRequest { + with_user(Request::new(GetDraftPolicyRequest { name: sandbox_name, status_filter: String::new(), - }), + })), ) .await .unwrap() @@ -4700,7 +4954,7 @@ mod tests { handle_submit_policy_analysis( &state, - Request::new(SubmitPolicyAnalysisRequest { + with_user(Request::new(SubmitPolicyAnalysisRequest { name: sandbox_a.object_name().to_string(), proposed_chunks: vec![PolicyChunk { rule_name: "allow_example".to_string(), @@ -4714,17 +4968,17 @@ mod tests { ..Default::default() }], ..Default::default() - }), + })), ) .await .unwrap(); let draft_policy = handle_get_draft_policy( &state, - Request::new(GetDraftPolicyRequest { + with_user(Request::new(GetDraftPolicyRequest { name: sandbox_a.object_name().to_string(), status_filter: String::new(), - }), + })), ) .await .unwrap() diff --git a/crates/openshell-server/src/inference.rs b/crates/openshell-server/src/inference.rs index 53d6265b7..2fb89b0ac 100644 --- a/crates/openshell-server/src/inference.rs +++ b/crates/openshell-server/src/inference.rs @@ -59,8 +59,25 @@ impl ObjectType for InferenceRoute { impl Inference for InferenceService { async fn get_inference_bundle( &self, - _request: Request, + request: Request, ) -> Result, Status> { + // GetInferenceBundle is gateway-wide (no per-sandbox routes yet), + 
// so it has no `sandbox_id` to compare against. Just reject + // anonymous callers; both user and sandbox principals are allowed. + match request + .extensions() + .get::() + { + Some( + crate::auth::principal::Principal::User(_) + | crate::auth::principal::Principal::Sandbox(_), + ) => {} + Some(crate::auth::principal::Principal::Anonymous) | None => { + return Err(Status::unauthenticated( + "GetInferenceBundle requires an authenticated caller", + )); + } + } resolve_inference_bundle(self.state.store.as_ref()) .await .map(Response::new) From 7223a150d722161979d28caef9ede1adb21891c4 Mon Sep 17 00:00:00 2001 From: Taylor Mutch Date: Thu, 14 May 2026 22:24:09 -0700 Subject: [PATCH 06/18] feat(server): RefreshSandboxToken RPC + sandbox refresh loop MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adds the rotation half of the per-sandbox identity series. Sandboxes holding a valid gateway-minted JWT can swap it for a fresh one without disruption; the old jti is revoked server-side before the new token is handed back, so a leaked token is unusable as soon as the rotation completes. Server side: - proto/openshell.proto gains RefreshSandboxToken plus empty request / token+expires_at_ms response messages. - handle_refresh_sandbox_token requires Principal::Sandbox with a BootstrapJwt source (K8s-SA principals are routed to IssueSandboxToken for bootstrap; user principals are rejected). The handler mints the replacement token first, then adds the old jti to the in-memory RevocationSet — so a failed mint never strands the sandbox. Sandbox side: - AuthInterceptor now reads its Bearer header from a process-wide Arc> slot, so a single in-place token rotation is visible to every cached client (CachedOpenShellClient, the supervisor session channel, log push, etc.). 
- connect_channel spawns a background refresh loop once per process that sleeps for ~80% of the token's remaining lifetime (clamped to 60s-12h, plus small deterministic jitter) and calls RefreshSandboxToken, updating the token slot on success. - New parse_jwt_exp_ms helper decodes the JWT payload without signature verification — the token's origin is already trusted via the acquisition flow. Tests: - 4 server-side handler tests (round-trip, user-principal rejected, K8s-SA-principal rejected, missing-issuer returns Unavailable) - 3 sandbox-side helper tests (parse-exp, 80%-of-TTL delay, 60s floor) All existing OpenShell test impls gain a refresh_sandbox_token stub. Signed-off-by: Taylor Mutch --- .../tests/ensure_providers_integration.rs | 7 + .../openshell-cli/tests/mtls_integration.rs | 7 + .../tests/provider_commands_integration.rs | 7 + .../sandbox_create_lifecycle_integration.rs | 7 + .../sandbox_name_fallback_integration.rs | 7 + crates/openshell-sandbox/src/grpc_client.rs | 222 +++++++++++++-- crates/openshell-server/src/grpc/auth_rpc.rs | 253 +++++++++++++++++- crates/openshell-server/src/grpc/mod.rs | 22 +- .../tests/auth_endpoint_integration.rs | 8 + .../tests/edge_tunnel_auth.rs | 7 + .../tests/multiplex_integration.rs | 7 + .../tests/multiplex_tls_integration.rs | 7 + .../tests/supervisor_relay_integration.rs | 6 + .../tests/ws_tunnel_integration.rs | 7 + proto/openshell.proto | 23 ++ 15 files changed, 566 insertions(+), 31 deletions(-) diff --git a/crates/openshell-cli/tests/ensure_providers_integration.rs b/crates/openshell-cli/tests/ensure_providers_integration.rs index a18080243..ea2d5a465 100644 --- a/crates/openshell-cli/tests/ensure_providers_integration.rs +++ b/crates/openshell-cli/tests/ensure_providers_integration.rs @@ -542,6 +542,13 @@ impl OpenShell for TestOpenShell { Err(Status::unimplemented("not implemented in test")) } + async fn refresh_sandbox_token( + &self, + _request: tonic::Request, + ) -> Result, Status> { + 
Err(Status::unimplemented("not implemented in test")) + } + async fn connect_supervisor( &self, _request: tonic::Request>, diff --git a/crates/openshell-cli/tests/mtls_integration.rs b/crates/openshell-cli/tests/mtls_integration.rs index bb491db85..8f83599b1 100644 --- a/crates/openshell-cli/tests/mtls_integration.rs +++ b/crates/openshell-cli/tests/mtls_integration.rs @@ -431,6 +431,13 @@ impl OpenShell for TestOpenShell { Err(Status::unimplemented("not implemented in test")) } + async fn refresh_sandbox_token( + &self, + _request: tonic::Request, + ) -> Result, Status> { + Err(Status::unimplemented("not implemented in test")) + } + async fn connect_supervisor( &self, _request: tonic::Request>, diff --git a/crates/openshell-cli/tests/provider_commands_integration.rs b/crates/openshell-cli/tests/provider_commands_integration.rs index a52955f56..b0e3b99a1 100644 --- a/crates/openshell-cli/tests/provider_commands_integration.rs +++ b/crates/openshell-cli/tests/provider_commands_integration.rs @@ -807,6 +807,13 @@ impl OpenShell for TestOpenShell { Err(Status::unimplemented("not implemented in test")) } + async fn refresh_sandbox_token( + &self, + _request: tonic::Request, + ) -> Result, Status> { + Err(Status::unimplemented("not implemented in test")) + } + async fn connect_supervisor( &self, _request: tonic::Request>, diff --git a/crates/openshell-cli/tests/sandbox_create_lifecycle_integration.rs b/crates/openshell-cli/tests/sandbox_create_lifecycle_integration.rs index 9101dbd26..2ce409413 100644 --- a/crates/openshell-cli/tests/sandbox_create_lifecycle_integration.rs +++ b/crates/openshell-cli/tests/sandbox_create_lifecycle_integration.rs @@ -611,6 +611,13 @@ impl OpenShell for TestOpenShell { Err(Status::unimplemented("not implemented in test")) } + async fn refresh_sandbox_token( + &self, + _request: tonic::Request, + ) -> Result, Status> { + Err(Status::unimplemented("not implemented in test")) + } + async fn connect_supervisor( &self, _request: 
tonic::Request>, diff --git a/crates/openshell-cli/tests/sandbox_name_fallback_integration.rs b/crates/openshell-cli/tests/sandbox_name_fallback_integration.rs index 3cc39b3bc..88358391c 100644 --- a/crates/openshell-cli/tests/sandbox_name_fallback_integration.rs +++ b/crates/openshell-cli/tests/sandbox_name_fallback_integration.rs @@ -444,6 +444,13 @@ impl OpenShell for TestOpenShell { Err(Status::unimplemented("not implemented in test")) } + async fn refresh_sandbox_token( + &self, + _request: tonic::Request, + ) -> Result, Status> { + Err(Status::unimplemented("not implemented in test")) + } + async fn connect_supervisor( &self, _request: tonic::Request>, diff --git a/crates/openshell-sandbox/src/grpc_client.rs b/crates/openshell-sandbox/src/grpc_client.rs index eeac41a44..fae66d5c5 100644 --- a/crates/openshell-sandbox/src/grpc_client.rs +++ b/crates/openshell-sandbox/src/grpc_client.rs @@ -20,13 +20,14 @@ //! injected on every outbound call by [`AuthInterceptor`]. use std::collections::HashMap; -use std::time::Duration; +use std::sync::{Arc, OnceLock, RwLock}; +use std::time::{Duration, SystemTime, UNIX_EPOCH}; use miette::{IntoDiagnostic, Result, WrapErr}; use openshell_core::proto::{ DenialSummary, GetDraftPolicyRequest, GetInferenceBundleRequest, GetInferenceBundleResponse, GetSandboxConfigRequest, GetSandboxProviderEnvironmentRequest, IssueSandboxTokenRequest, - PolicyChunk, PolicySource, PolicyStatus, ReportPolicyStatusRequest, + PolicyChunk, PolicySource, PolicyStatus, RefreshSandboxTokenRequest, ReportPolicyStatusRequest, SandboxPolicy as ProtoSandboxPolicy, SubmitPolicyAnalysisRequest, SubmitPolicyAnalysisResponse, UpdateConfigRequest, inference_client::InferenceClient, open_shell_client::OpenShellClient, }; @@ -35,25 +36,48 @@ use tonic::Status; use tonic::metadata::AsciiMetadataValue; use tonic::service::interceptor::InterceptedService; use tonic::transport::{Certificate, Channel, ClientTlsConfig, Endpoint, Identity}; -use tracing::{debug, info}; 
+use tracing::{debug, info, warn}; /// Channel type after the [`AuthInterceptor`] is applied. Aliased so the /// generated client type signatures stay readable. pub type AuthedChannel = InterceptedService; +/// Shared, refreshable Bearer header. All [`AuthInterceptor`] clones read +/// the same slot, so the PR-5 refresh task can rotate the token in place +/// without rebuilding the channel. +type TokenSlot = Arc>; + +/// Process-wide token slot. Initialized by the first [`connect_channel`] +/// call and shared with every subsequent client + the refresh loop. +static TOKEN_SLOT: OnceLock = OnceLock::new(); + +/// One-shot guard so the refresh loop spawns at most once per process. +static REFRESH_SPAWNED: OnceLock<()> = OnceLock::new(); + +fn install_token_slot(token: &str) -> Result { + let bearer = AsciiMetadataValue::try_from(format!("Bearer {token}")) + .into_diagnostic() + .wrap_err("sandbox JWT contained characters not valid for a header value")?; + if let Some(existing) = TOKEN_SLOT.get() { + *existing.write().expect("token slot poisoned") = bearer; + return Ok(existing.clone()); + } + let slot: TokenSlot = Arc::new(RwLock::new(bearer)); + let _ = TOKEN_SLOT.set(slot.clone()); + Ok(TOKEN_SLOT.get().cloned().unwrap_or(slot)) +} + /// gRPC interceptor that injects `authorization: Bearer ` on every -/// outbound request. +/// outbound request. The token lives in a shared [`TokenSlot`] so the +/// PR-5 refresh task can replace it without rebuilding clients. 
#[derive(Clone)] pub struct AuthInterceptor { - bearer: AsciiMetadataValue, + bearer: TokenSlot, } impl AuthInterceptor { - fn new(token: &str) -> Result { - let bearer = AsciiMetadataValue::try_from(format!("Bearer {token}")) - .into_diagnostic() - .wrap_err("sandbox JWT contained characters not valid for a header value")?; - Ok(Self { bearer }) + fn new(bearer: TokenSlot) -> Self { + Self { bearer } } } @@ -62,8 +86,12 @@ impl tonic::service::Interceptor for AuthInterceptor { &mut self, mut req: tonic::Request<()>, ) -> std::result::Result, Status> { - req.metadata_mut() - .insert("authorization", self.bearer.clone()); + let bearer = self + .bearer + .read() + .expect("auth interceptor token slot poisoned") + .clone(); + req.metadata_mut().insert("authorization", bearer); Ok(req) } } @@ -132,13 +160,25 @@ async fn build_plain_channel(endpoint: &str) -> Result { /// Build a Bearer-authenticated channel to the gateway. /// /// Resolves the sandbox JWT via the three-step lookup described at the -/// module level (env → file → K8s SA bootstrap exchange) and wraps the -/// resulting channel in [`AuthInterceptor`]. +/// module level (env → file → K8s SA bootstrap exchange), installs the +/// token into the process-wide [`TOKEN_SLOT`], wraps the channel in an +/// [`AuthInterceptor`] that reads from that slot, and spawns the refresh +/// loop the first time the channel is built. async fn connect_channel(endpoint: &str) -> Result { let channel = build_plain_channel(endpoint).await?; let token = acquire_sandbox_token(endpoint, &channel).await?; - let interceptor = AuthInterceptor::new(&token)?; - Ok(InterceptedService::new(channel, interceptor)) + let slot = install_token_slot(&token)?; + let intercepted = InterceptedService::new(channel, AuthInterceptor::new(slot.clone())); + // Spawn the refresh loop once per process. It uses the same authed + // channel, so its outbound calls always carry the current token. 
+ if REFRESH_SPAWNED.set(()).is_ok() { + let refresh_channel = intercepted.clone(); + let refresh_slot = slot; + tokio::spawn(async move { + refresh_token_loop(refresh_channel, refresh_slot).await; + }); + } + Ok(intercepted) } /// Resolve the sandbox JWT used to authenticate every outbound RPC. @@ -174,7 +214,15 @@ async fn acquire_sandbox_token(endpoint: &str, plain_channel: &Channel) -> Resul .trim() .to_string(); info!(endpoint = %endpoint, "exchanging K8s ServiceAccount token for sandbox JWT"); - let interceptor = AuthInterceptor::new(&sa_token)?; + // The bootstrap exchange uses a one-off interceptor pinned to the + // SA token; the resulting gateway JWT becomes the value in the + // shared `TOKEN_SLOT` once `connect_channel` returns. + let bootstrap_slot: TokenSlot = Arc::new(RwLock::new( + AsciiMetadataValue::try_from(format!("Bearer {sa_token}")) + .into_diagnostic() + .wrap_err("SA token contained characters not valid for a header value")?, + )); + let interceptor = AuthInterceptor::new(bootstrap_slot); let bootstrap = InterceptedService::new(plain_channel.clone(), interceptor); let mut client = OpenShellClient::new(bootstrap); let resp = client @@ -199,6 +247,146 @@ pub async fn connect_channel_pub(endpoint: &str) -> Result { connect_channel(endpoint).await } +/// Background task that rotates the sandbox JWT at ~80% of its remaining +/// lifetime. The new token replaces the value in [`TOKEN_SLOT`], so all +/// in-flight and future clients pick it up on their next request. The +/// loop never panics: every failure is logged and re-attempted after a +/// bounded backoff. 
+async fn refresh_token_loop(channel: AuthedChannel, slot: TokenSlot) { + let mut client = OpenShellClient::new(channel); + loop { + let sleep = compute_refresh_delay(&slot); + tokio::time::sleep(sleep).await; + match client + .refresh_sandbox_token(RefreshSandboxTokenRequest {}) + .await + { + Ok(resp) => { + let new_token = resp.into_inner().token; + match AsciiMetadataValue::try_from(format!("Bearer {new_token}")) { + Ok(value) => { + if let Ok(mut guard) = slot.write() { + *guard = value; + info!("rotated gateway sandbox JWT in-place"); + } + } + Err(e) => warn!(error = %e, "refreshed JWT contained invalid header bytes"), + } + } + Err(status) => { + warn!(error = %status, "RefreshSandboxToken failed; will retry"); + // Backoff so we don't spin against a sustained failure. + tokio::time::sleep(Duration::from_secs(60)).await; + } + } + } +} + +/// Compute the next refresh delay: 80 % of the time remaining until the +/// current token's `exp`, plus up to 10 % jitter, floored at 60 s and +/// capped at 12 h. If the token can't be parsed (legacy/non-JWT bearer) +/// default to 6 h. +fn compute_refresh_delay(slot: &TokenSlot) -> Duration { + let token = slot + .read() + .ok() + .and_then(|v| v.to_str().ok().map(str::to_string)) + .unwrap_or_default(); + let bearer = token.strip_prefix("Bearer ").unwrap_or(&token); + let now_ms = i64::try_from( + SystemTime::now() + .duration_since(UNIX_EPOCH) + .map_or(0, |d| d.as_millis()), + ) + .unwrap_or(i64::MAX); + let remaining_ms = + parse_jwt_exp_ms(bearer).map_or(21_600_000, |exp| exp - now_ms); // 6 h fallback + let mut delay_ms = (remaining_ms.max(0) * 8 / 10).clamp(60_000, 43_200_000); + // Up to 10 % jitter, derived deterministically from token bytes so + // unit tests are reproducible without injecting an RNG. 
+ let jitter_pct = (token.len() % 10) as u64; + let jitter_ms = (u64::try_from(delay_ms).unwrap_or(0) * jitter_pct) / 100; + delay_ms = delay_ms.saturating_add(i64::try_from(jitter_ms).unwrap_or(0)); + Duration::from_millis(u64::try_from(delay_ms).unwrap_or(0)) +} + +/// Decode the `exp` claim from a JWT without verifying its signature. +/// Returns the expiry in milliseconds since the Unix epoch, or `None` if +/// the token is not a parseable JWT. +fn parse_jwt_exp_ms(jwt: &str) -> Option { + use base64::Engine; + let mut parts = jwt.splitn(3, '.'); + let _header = parts.next()?; + let payload_b64 = parts.next()?; + let decoded = base64::engine::general_purpose::URL_SAFE_NO_PAD + .decode(payload_b64) + .ok()?; + let value: serde_json::Value = serde_json::from_slice(&decoded).ok()?; + let exp_secs = value.get("exp")?.as_i64()?; + exp_secs.checked_mul(1000) +} + +#[cfg(test)] +mod auth_tests { + use super::*; + + #[test] + fn parse_jwt_exp_reads_unsigned_payload() { + use base64::Engine as _; + let payload = base64::engine::general_purpose::URL_SAFE_NO_PAD + .encode(br#"{"exp":1234567890,"sandbox_id":"sb-1"}"#); + let token = format!("h.{payload}.sig"); + assert_eq!(parse_jwt_exp_ms(&token), Some(1_234_567_890_000)); + } + + #[test] + fn parse_jwt_exp_returns_none_for_malformed_token() { + assert!(parse_jwt_exp_ms("not-a-jwt").is_none()); + assert!(parse_jwt_exp_ms("only.two").is_none()); + assert!(parse_jwt_exp_ms("a.!!!.c").is_none()); + } + + #[test] + fn compute_refresh_delay_uses_80_percent_when_token_present() { + // Build a JWT whose exp is 1000 seconds in the future. With 0-jitter + // the delay should be roughly 800 seconds. 
+ use base64::Engine as _; + let now_s = SystemTime::now() + .duration_since(UNIX_EPOCH) + .unwrap() + .as_secs(); + let exp = now_s + 1000; + let payload_json = format!(r#"{{"exp":{exp}}}"#); + let payload = base64::engine::general_purpose::URL_SAFE_NO_PAD.encode(payload_json); + let token = format!("h.{payload}.s"); + let bearer = AsciiMetadataValue::try_from(format!("Bearer {token}")).unwrap(); + let slot: TokenSlot = Arc::new(RwLock::new(bearer)); + let delay = compute_refresh_delay(&slot); + // 800 s baseline + up to 10 % jitter → 800..=880 s, with some slack + // for the 1-second resolution of the exp claim. + let secs = delay.as_secs(); + assert!( + (700..=900).contains(&secs), + "expected 80%-of-1000s delay, got {secs}s" + ); + } + + #[test] + fn compute_refresh_delay_floors_at_60_seconds() { + // Already-expired token still produces a 60 s floor so the loop + // doesn't busy-spin. + use base64::Engine as _; + let exp = 1; // past + let payload = + base64::engine::general_purpose::URL_SAFE_NO_PAD.encode(format!(r#"{{"exp":{exp}}}"#)); + let token = format!("h.{payload}.s"); + let bearer = AsciiMetadataValue::try_from(format!("Bearer {token}")).unwrap(); + let slot: TokenSlot = Arc::new(RwLock::new(bearer)); + let delay = compute_refresh_delay(&slot); + assert!(delay.as_secs() >= 60); + } +} + /// Connect to the `OpenShell` server. async fn connect(endpoint: &str) -> Result> { let channel = connect_channel(endpoint).await?; diff --git a/crates/openshell-server/src/grpc/auth_rpc.rs b/crates/openshell-server/src/grpc/auth_rpc.rs index eb233a215..2519035be 100644 --- a/crates/openshell-server/src/grpc/auth_rpc.rs +++ b/crates/openshell-server/src/grpc/auth_rpc.rs @@ -3,14 +3,23 @@ //! Authentication-related RPC handlers. //! -//! Currently hosts the `IssueSandboxToken` exchange used by the Kubernetes -//! driver to convert a projected `ServiceAccount` token into a -//! gateway-minted JWT bound to a specific sandbox. +//! 
Hosts the two sandbox-identity RPCs: +//! - `IssueSandboxToken` — bootstrap exchange (K8s SA token → gateway JWT) +//! - `RefreshSandboxToken` — rotate a still-valid gateway JWT +//! +//! Both end in a fresh gateway-signed JWT minted by +//! [`crate::auth::sandbox_jwt::SandboxJwtIssuer`]. `RefreshSandboxToken` +//! additionally revokes the previous JWT's `jti` so the old token +//! becomes unusable as soon as the new one is handed back. use crate::ServerState; use crate::auth::principal::{Principal, SandboxIdentitySource}; -use openshell_core::proto::{IssueSandboxTokenRequest, IssueSandboxTokenResponse}; +use openshell_core::proto::{ + IssueSandboxTokenRequest, IssueSandboxTokenResponse, RefreshSandboxTokenRequest, + RefreshSandboxTokenResponse, +}; use std::sync::Arc; +use std::time::SystemTime; use tonic::{Request, Response, Status}; use tracing::{debug, info, warn}; @@ -32,8 +41,8 @@ pub async fn handle_issue_sandbox_token( }; // Only the bootstrap K8s ServiceAccount path can mint a fresh - // gateway JWT — gateway-issued JWTs already exist and refreshing them - // is a future capability (PR 5). Reject re-exchange attempts. + // gateway JWT via this RPC. Sandboxes already holding a gateway JWT + // use `RefreshSandboxToken` instead, which also revokes the old jti. if !matches!( sandbox.source, SandboxIdentitySource::K8sServiceAccount { .. 
} @@ -43,7 +52,7 @@ pub async fn handle_issue_sandbox_token( "IssueSandboxToken rejected: non-bootstrap principal source" ); return Err(Status::permission_denied( - "this principal cannot mint a sandbox token", + "this principal cannot mint a sandbox token; use RefreshSandboxToken", )); } @@ -66,3 +75,233 @@ pub async fn handle_issue_sandbox_token( expires_at_ms: minted.expires_at_ms, })) } + +#[allow(clippy::result_large_err, clippy::unused_async)] +pub async fn handle_refresh_sandbox_token( + state: &Arc, + request: Request, +) -> Result, Status> { + let principal = request + .extensions() + .get::() + .cloned() + .ok_or_else(|| Status::unauthenticated("missing principal"))?; + + let Principal::Sandbox(sandbox) = principal else { + return Err(Status::permission_denied( + "RefreshSandboxToken requires a sandbox principal", + )); + }; + + // Only callers already holding a gateway-minted JWT may refresh; the + // K8s bootstrap path must use `IssueSandboxToken`. + let SandboxIdentitySource::BootstrapJwt { jti: old_jti, .. } = &sandbox.source else { + debug!( + sandbox_id = %sandbox.sandbox_id, + "RefreshSandboxToken rejected: non-gateway-JWT principal source" + ); + return Err(Status::permission_denied( + "this principal cannot refresh; use IssueSandboxToken for bootstrap", + )); + }; + + let issuer = state.sandbox_jwt_issuer.as_ref().ok_or_else(|| { + warn!( + sandbox_id = %sandbox.sandbox_id, + "RefreshSandboxToken called but sandbox JWT issuer is not configured" + ); + Status::unavailable("sandbox JWT minting is not configured on this gateway") + })?; + + // Mint the new token first; only revoke the old jti after we have a + // replacement so a failure here doesn't leave the sandbox stranded. + let minted = issuer.mint(&sandbox.sandbox_id)?; + + // Best-effort revocation of the old token. The plan calls for the + // jti deny-list to live in memory in PR 2; PR 5 only needs to drop + // the old jti into it. 
We use the new token's expiry as a safe upper + // bound for the revocation entry — the old jti can't outlive its own + // `exp`, and on TTL pruning the entry drops out cleanly. + state + .sandbox_jwt_revocation + .revoke(old_jti, minted.expires_at_ms.max(now_ms())); + info!( + sandbox_id = %sandbox.sandbox_id, + revoked_jti = %old_jti, + new_jti = %minted.jti, + "refreshed gateway sandbox JWT" + ); + + Ok(Response::new(RefreshSandboxTokenResponse { + token: minted.token, + expires_at_ms: minted.expires_at_ms, + })) +} + +fn now_ms() -> i64 { + i64::try_from( + SystemTime::now() + .duration_since(SystemTime::UNIX_EPOCH) + .map_or(0, |d| d.as_millis()), + ) + .unwrap_or(i64::MAX) +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::ServerState; + use crate::auth::principal::{Principal, SandboxPrincipal, UserPrincipal}; + use crate::auth::revocation::RevocationSet; + use crate::auth::sandbox_jwt::SandboxJwtIssuer; + use crate::compute::new_test_runtime; + use crate::persistence::Store; + use crate::sandbox_index::SandboxIndex; + use crate::sandbox_watch::SandboxWatchBus; + use crate::supervisor_session::SupervisorSessionRegistry; + use crate::tracing_bus::TracingLogBus; + use openshell_bootstrap::jwt::generate_jwt_key; + use openshell_core::Config; + use std::time::Duration; + + async fn state_with_issuer() -> (Arc, SandboxJwtIssuer, Arc) { + let mat = generate_jwt_key().expect("jwt key"); + let revocation = Arc::new(RevocationSet::new()); + let issuer = SandboxJwtIssuer::from_pem( + mat.signing_key_pem.as_bytes(), + mat.kid, + "test-gateway", + Duration::from_secs(3600), + ) + .expect("issuer"); + let store = Arc::new( + Store::connect("sqlite::memory:?cache=shared") + .await + .unwrap(), + ); + let compute = new_test_runtime(store.clone()).await; + let mut state = ServerState::new( + Config::new(None).with_database_url("sqlite::memory:?cache=shared"), + store, + compute, + SandboxIndex::new(), + SandboxWatchBus::new(), + TracingLogBus::new(), + 
Arc::new(SupervisorSessionRegistry::new()), + None, + ); + state.sandbox_jwt_revocation = revocation.clone(); + // We don't need the authenticator for these tests; only the issuer. + // The handler tests only exercise the mint+revoke path; they + // don't need the issuer to be the same instance that produced + // `issuer` above. A fresh keypair is fine. + let issuer_clone = SandboxJwtIssuer::from_pem( + generate_jwt_key().unwrap().signing_key_pem.as_bytes(), + "kid".to_string(), + "test-gateway", + Duration::from_secs(3600), + ) + .unwrap(); + state.sandbox_jwt_issuer = Some(Arc::new(issuer_clone)); + (Arc::new(state), issuer, revocation) + } + + fn sandbox_principal(sandbox_id: &str, jti: &str) -> Principal { + use crate::auth::principal::SandboxIdentitySource; + Principal::Sandbox(SandboxPrincipal { + sandbox_id: sandbox_id.to_string(), + source: SandboxIdentitySource::BootstrapJwt { + issuer: "openshell-gateway:test-gateway".to_string(), + jti: jti.to_string(), + }, + trust_domain: Some("openshell".to_string()), + }) + } + + #[tokio::test] + async fn refresh_revokes_old_jti_and_returns_new_token() { + let (state, _issuer, revocation) = state_with_issuer().await; + let old_jti = "j-original"; + let mut req = Request::new(RefreshSandboxTokenRequest {}); + req.extensions_mut() + .insert(sandbox_principal("sandbox-a", old_jti)); + let resp = handle_refresh_sandbox_token(&state, req) + .await + .expect("refresh OK") + .into_inner(); + assert!(!resp.token.is_empty()); + assert!(revocation.is_revoked(old_jti), "old jti must be revoked"); + } + + #[tokio::test] + async fn refresh_rejects_user_principal() { + use crate::auth::identity::{Identity, IdentityProvider}; + let (state, _, _) = state_with_issuer().await; + let mut req = Request::new(RefreshSandboxTokenRequest {}); + req.extensions_mut().insert(Principal::User(UserPrincipal { + identity: Identity { + subject: "alice".to_string(), + display_name: None, + roles: vec![], + scopes: vec![], + provider: 
IdentityProvider::Oidc, + }, + })); + let err = handle_refresh_sandbox_token(&state, req) + .await + .expect_err("user must not refresh"); + assert_eq!(err.code(), tonic::Code::PermissionDenied); + } + + #[tokio::test] + async fn refresh_rejects_k8s_sa_principal() { + // K8s SA-bootstrap principals must use IssueSandboxToken, not + // RefreshSandboxToken — the refresh path assumes a still-valid + // gateway-minted JWT exists. + use crate::auth::principal::SandboxIdentitySource; + let (state, _, _) = state_with_issuer().await; + let mut req = Request::new(RefreshSandboxTokenRequest {}); + req.extensions_mut() + .insert(Principal::Sandbox(SandboxPrincipal { + sandbox_id: "sandbox-a".to_string(), + source: SandboxIdentitySource::K8sServiceAccount { + pod_name: "pod-a".to_string(), + pod_uid: "uid-a".to_string(), + }, + trust_domain: Some("openshell".to_string()), + })); + let err = handle_refresh_sandbox_token(&state, req) + .await + .expect_err("K8s SA principal must not refresh"); + assert_eq!(err.code(), tonic::Code::PermissionDenied); + } + + #[tokio::test] + async fn refresh_fails_when_issuer_not_configured() { + // Build a ServerState without the issuer to confirm the handler + // returns Unavailable. 
+ let store = Arc::new( + Store::connect("sqlite::memory:?cache=shared") + .await + .unwrap(), + ); + let compute = new_test_runtime(store.clone()).await; + let state = Arc::new(ServerState::new( + Config::new(None).with_database_url("sqlite::memory:?cache=shared"), + store, + compute, + SandboxIndex::new(), + SandboxWatchBus::new(), + TracingLogBus::new(), + Arc::new(SupervisorSessionRegistry::new()), + None, + )); + let mut req = Request::new(RefreshSandboxTokenRequest {}); + req.extensions_mut() + .insert(sandbox_principal("sandbox-a", "j-1")); + let err = handle_refresh_sandbox_token(&state, req) + .await + .expect_err("missing issuer must yield unavailable"); + assert_eq!(err.code(), tonic::Code::Unavailable); + } +} diff --git a/crates/openshell-server/src/grpc/mod.rs b/crates/openshell-server/src/grpc/mod.rs index ff3fc55f3..bacd2cde7 100644 --- a/crates/openshell-server/src/grpc/mod.rs +++ b/crates/openshell-server/src/grpc/mod.rs @@ -34,13 +34,14 @@ use openshell_core::proto::{ ListSandboxPoliciesRequest, ListSandboxPoliciesResponse, ListSandboxProvidersRequest, ListSandboxProvidersResponse, ListSandboxesRequest, ListSandboxesResponse, ListServicesRequest, ListServicesResponse, ProviderProfileResponse, ProviderResponse, PushSandboxLogsRequest, - PushSandboxLogsResponse, RejectDraftChunkRequest, RejectDraftChunkResponse, RelayFrame, - ReportPolicyStatusRequest, ReportPolicyStatusResponse, RevokeSshSessionRequest, - RevokeSshSessionResponse, RotateProviderCredentialRequest, RotateProviderCredentialResponse, - SandboxResponse, SandboxStreamEvent, ServiceEndpointResponse, - ServiceStatus, SubmitPolicyAnalysisRequest, SubmitPolicyAnalysisResponse, SupervisorMessage, - TcpForwardFrame, UndoDraftChunkRequest, UndoDraftChunkResponse, UpdateConfigRequest, - UpdateConfigResponse, UpdateProviderRequest, WatchSandboxRequest, open_shell_server::OpenShell, + PushSandboxLogsResponse, RefreshSandboxTokenRequest, RefreshSandboxTokenResponse, + RejectDraftChunkRequest, 
RejectDraftChunkResponse, RelayFrame, ReportPolicyStatusRequest, + ReportPolicyStatusResponse, RevokeSshSessionRequest, RevokeSshSessionResponse, + RotateProviderCredentialRequest, RotateProviderCredentialResponse, SandboxResponse, + SandboxStreamEvent, ServiceEndpointResponse, ServiceStatus, SubmitPolicyAnalysisRequest, + SubmitPolicyAnalysisResponse, SupervisorMessage, TcpForwardFrame, UndoDraftChunkRequest, + UndoDraftChunkResponse, UpdateConfigRequest, UpdateConfigResponse, UpdateProviderRequest, + WatchSandboxRequest, open_shell_server::OpenShell, }; use serde::{Deserialize, Serialize}; use std::collections::BTreeMap; @@ -578,6 +579,13 @@ impl OpenShell for OpenShellService { auth_rpc::handle_issue_sandbox_token(&self.state, request).await } + async fn refresh_sandbox_token( + &self, + request: Request, + ) -> Result, Status> { + auth_rpc::handle_refresh_sandbox_token(&self.state, request).await + } + // --- Supervisor session --- type ConnectSupervisorStream = diff --git a/crates/openshell-server/tests/auth_endpoint_integration.rs b/crates/openshell-server/tests/auth_endpoint_integration.rs index 3d74d66a5..21e136572 100644 --- a/crates/openshell-server/tests/auth_endpoint_integration.rs +++ b/crates/openshell-server/tests/auth_endpoint_integration.rs @@ -787,6 +787,14 @@ impl openshell_core::proto::open_shell_server::OpenShell for TestOpenShell { Err(tonic::Status::unimplemented("not implemented in test")) } + async fn refresh_sandbox_token( + &self, + _request: tonic::Request, + ) -> Result, tonic::Status> + { + Err(tonic::Status::unimplemented("not implemented in test")) + } + async fn connect_supervisor( &self, _request: tonic::Request>, diff --git a/crates/openshell-server/tests/edge_tunnel_auth.rs b/crates/openshell-server/tests/edge_tunnel_auth.rs index 5a4364d40..c025f79e9 100644 --- a/crates/openshell-server/tests/edge_tunnel_auth.rs +++ b/crates/openshell-server/tests/edge_tunnel_auth.rs @@ -443,6 +443,13 @@ impl OpenShell for TestOpenShell { 
Err(Status::unimplemented("not implemented in test")) } + async fn refresh_sandbox_token( + &self, + _request: tonic::Request, + ) -> Result, Status> { + Err(Status::unimplemented("not implemented in test")) + } + async fn connect_supervisor( &self, _request: tonic::Request>, diff --git a/crates/openshell-server/tests/multiplex_integration.rs b/crates/openshell-server/tests/multiplex_integration.rs index ed920a22f..d5e8069cf 100644 --- a/crates/openshell-server/tests/multiplex_integration.rs +++ b/crates/openshell-server/tests/multiplex_integration.rs @@ -412,6 +412,13 @@ impl OpenShell for TestOpenShell { Err(Status::unimplemented("not implemented in test")) } + async fn refresh_sandbox_token( + &self, + _request: tonic::Request, + ) -> Result, Status> { + Err(Status::unimplemented("not implemented in test")) + } + async fn connect_supervisor( &self, _request: tonic::Request>, diff --git a/crates/openshell-server/tests/multiplex_tls_integration.rs b/crates/openshell-server/tests/multiplex_tls_integration.rs index 3617c30c6..352392dfa 100644 --- a/crates/openshell-server/tests/multiplex_tls_integration.rs +++ b/crates/openshell-server/tests/multiplex_tls_integration.rs @@ -425,6 +425,13 @@ impl OpenShell for TestOpenShell { Err(Status::unimplemented("not implemented in test")) } + async fn refresh_sandbox_token( + &self, + _request: tonic::Request, + ) -> Result, Status> { + Err(Status::unimplemented("not implemented in test")) + } + async fn connect_supervisor( &self, _request: tonic::Request>, diff --git a/crates/openshell-server/tests/supervisor_relay_integration.rs b/crates/openshell-server/tests/supervisor_relay_integration.rs index 533e53a1b..7f6bab7e9 100644 --- a/crates/openshell-server/tests/supervisor_relay_integration.rs +++ b/crates/openshell-server/tests/supervisor_relay_integration.rs @@ -400,6 +400,12 @@ impl OpenShell for RelayGateway { ) -> Result, Status> { Err(Status::unimplemented("unused")) } + async fn refresh_sandbox_token( + &self, + _: 
tonic::Request, + ) -> Result, Status> { + Err(Status::unimplemented("unused")) + } } // --------------------------------------------------------------------------- diff --git a/crates/openshell-server/tests/ws_tunnel_integration.rs b/crates/openshell-server/tests/ws_tunnel_integration.rs index 193a2ad1d..7d746bc96 100644 --- a/crates/openshell-server/tests/ws_tunnel_integration.rs +++ b/crates/openshell-server/tests/ws_tunnel_integration.rs @@ -438,6 +438,13 @@ impl OpenShell for TestOpenShell { Err(Status::unimplemented("not implemented in test")) } + async fn refresh_sandbox_token( + &self, + _request: tonic::Request, + ) -> Result, Status> { + Err(Status::unimplemented("not implemented in test")) + } + async fn connect_supervisor( &self, _request: tonic::Request>, diff --git a/proto/openshell.proto b/proto/openshell.proto index 15f78b381..4b97ad4cf 100644 --- a/proto/openshell.proto +++ b/proto/openshell.proto @@ -231,6 +231,15 @@ service OpenShell { // drivers receive the gateway JWT directly from the create-sandbox flow // and never call this RPC. rpc IssueSandboxToken(IssueSandboxTokenRequest) returns (IssueSandboxTokenResponse); + + // Rotate the calling sandbox's gateway JWT. The previously-issued + // token is revoked (its jti added to the gateway's deny list) and a + // fresh token bound to the same sandbox UUID is returned. The + // supervisor calls this from a background task at ~80% of the token's + // lifetime; the new token is cached in memory only — the on-disk + // bootstrap file is intentionally not rewritten. + rpc RefreshSandboxToken(RefreshSandboxTokenRequest) + returns (RefreshSandboxTokenResponse); } // IssueSandboxToken request. Empty body; identity is established by the @@ -248,6 +257,20 @@ message IssueSandboxTokenResponse { int64 expires_at_ms = 2; } +// RefreshSandboxToken request. Empty body; the calling principal must +// already be a sandbox principal (i.e. 
the request carries a still-valid +// gateway-minted JWT in its Authorization header). +message RefreshSandboxTokenRequest {} + +// RefreshSandboxToken response. The previous token is revoked server-side +// before this response is sent. +message RefreshSandboxTokenResponse { + // Fresh gateway-minted JWT bound to the same sandbox UUID. + string token = 1; + // Absolute expiry of the new token, milliseconds since the epoch. + int64 expires_at_ms = 2; +} + // Health check request. message HealthRequest {} From 901eabb36a2daad168a3f0a05dd494aa1065a2a4 Mon Sep 17 00:00:00 2001 From: Taylor Mutch Date: Thu, 14 May 2026 22:34:53 -0700 Subject: [PATCH 07/18] feat(server): make K8s ServiceAccount bootstrap token TTL configurable The projected SA token kubelet writes to each sandbox pod was previously a hardcoded 3600s literal in the driver. Operators in tighter audit regimes want to dial it lower; very large clusters may want it slightly higher to absorb token-refresh churn. Wires `sa_token_ttl_secs` through four layers: - KubernetesComputeConfig gains the field (default 3600). The driver clamps to [600, 86400] via `effective_sa_token_ttl_secs()`: 600s is kubelet's enforced minimum, 24h is the cap (the token is consumed within seconds of pod start, so longer is almost always a misconfiguration). - The openshell-driver-kubernetes binary exposes `--sa-token-ttl-secs` / `OPENSHELL_K8S_SA_TOKEN_TTL_SECS`. - `[openshell.gateway].sa_token_ttl_secs` in the gateway TOML inherits into `[openshell.drivers.kubernetes]`, mirroring the `enable_user_namespaces` plumbing. - Helm: `server.sandboxJwt.k8sSaTokenTtlSecs` (default 3600) renders into the K8s driver block of the gateway config.
Signed-off-by: Taylor Mutch --- .../openshell-driver-kubernetes/src/config.rs | 33 +++++++++++++++++++ .../openshell-driver-kubernetes/src/driver.rs | 27 +++++++++++++-- .../openshell-driver-kubernetes/src/main.rs | 8 +++++ crates/openshell-sandbox/src/grpc_client.rs | 3 +- crates/openshell-server/src/config_file.rs | 7 ++++ .../openshell/templates/gateway-config.yaml | 1 + deploy/helm/openshell/values.yaml | 6 ++++ 7 files changed, 81 insertions(+), 4 deletions(-) diff --git a/crates/openshell-driver-kubernetes/src/config.rs b/crates/openshell-driver-kubernetes/src/config.rs index 28c04deb3..8e4275ab7 100644 --- a/crates/openshell-driver-kubernetes/src/config.rs +++ b/crates/openshell-driver-kubernetes/src/config.rs @@ -64,8 +64,25 @@ pub struct KubernetesComputeConfig { pub client_tls_secret_name: String, pub host_gateway_ip: String, pub enable_user_namespaces: bool, + /// Lifetime (seconds) of the projected `ServiceAccount` token kubelet + /// writes into each sandbox pod. Used only for the one-shot + /// `IssueSandboxToken` bootstrap exchange — the gateway-minted JWT + /// that follows has its own TTL set via `gateway_jwt.ttl_secs`. + /// + /// Kubelet enforces a minimum of 600 seconds; the supervisor uses + /// this token within a few seconds of pod start, so any value at + /// the floor is sufficient. Default 3600. + pub sa_token_ttl_secs: i64, } +/// Lower bound enforced by kubelet for projected SA tokens. +pub const MIN_SA_TOKEN_TTL_SECS: i64 = 600; + +/// Cap at 24h — operators who want longer-lived bootstrap tokens are +/// almost certainly misconfigured (the token is consumed seconds after +/// pod start). 
+pub const MAX_SA_TOKEN_TTL_SECS: i64 = 86_400; + impl Default for KubernetesComputeConfig { fn default() -> Self { Self { @@ -84,6 +101,22 @@ impl Default for KubernetesComputeConfig { client_tls_secret_name: String::new(), host_gateway_ip: String::new(), enable_user_namespaces: false, + sa_token_ttl_secs: 3600, + } + } +} + +impl KubernetesComputeConfig { + /// Clamp `sa_token_ttl_secs` into the `[MIN_SA_TOKEN_TTL_SECS, + /// MAX_SA_TOKEN_TTL_SECS]` range used by the projected-volume spec. + /// Invalid (≤0) values fall back to the default 3600. + #[must_use] + pub fn effective_sa_token_ttl_secs(&self) -> i64 { + if self.sa_token_ttl_secs <= 0 { + 3600 + } else { + self.sa_token_ttl_secs + .clamp(MIN_SA_TOKEN_TTL_SECS, MAX_SA_TOKEN_TTL_SECS) } } } diff --git a/crates/openshell-driver-kubernetes/src/driver.rs b/crates/openshell-driver-kubernetes/src/driver.rs index 07f205dee..da9ade3eb 100644 --- a/crates/openshell-driver-kubernetes/src/driver.rs +++ b/crates/openshell-driver-kubernetes/src/driver.rs @@ -327,6 +327,7 @@ impl KubernetesComputeDriver { client_tls_secret_name: &self.config.client_tls_secret_name, host_gateway_ip: &self.config.host_gateway_ip, enable_user_namespaces: self.config.enable_user_namespaces, + sa_token_ttl_secs: self.config.effective_sa_token_ttl_secs(), }; obj.data = sandbox_to_k8s_spec(sandbox.spec.as_ref(), ¶ms); let api = self.api(); @@ -1042,7 +1043,6 @@ fn default_workspace_volume_claim_templates() -> serde_json::Value { } /// Parameters shared by `sandbox_to_k8s_spec` and `sandbox_template_to_k8s`. -#[derive(Default)] struct SandboxPodParams<'a> { default_image: &'a str, image_pull_policy: &'a str, @@ -1056,6 +1056,29 @@ struct SandboxPodParams<'a> { client_tls_secret_name: &'a str, host_gateway_ip: &'a str, enable_user_namespaces: bool, + /// Lifetime (seconds) of the projected `ServiceAccount` token used + /// for the bootstrap `IssueSandboxToken` exchange. 
+ sa_token_ttl_secs: i64, +} + +impl Default for SandboxPodParams<'_> { + fn default() -> Self { + Self { + default_image: "", + image_pull_policy: "", + supervisor_image: "", + supervisor_image_pull_policy: "", + supervisor_sideload_method: SupervisorSideloadMethod::default(), + sandbox_id: "", + sandbox_name: "", + grpc_endpoint: "", + ssh_socket_path: "", + client_tls_secret_name: "", + host_gateway_ip: "", + enable_user_namespaces: false, + sa_token_ttl_secs: 3600, + } + } } fn spec_pod_env(spec: Option<&SandboxSpec>) -> std::collections::HashMap { @@ -1303,7 +1326,7 @@ fn sandbox_template_to_k8s( "sources": [{ "serviceAccountToken": { "audience": "openshell-gateway", - "expirationSeconds": 3600_i64, + "expirationSeconds": params.sa_token_ttl_secs, "path": "token" } }], diff --git a/crates/openshell-driver-kubernetes/src/main.rs b/crates/openshell-driver-kubernetes/src/main.rs index a170b5785..ac500e650 100644 --- a/crates/openshell-driver-kubernetes/src/main.rs +++ b/crates/openshell-driver-kubernetes/src/main.rs @@ -68,6 +68,13 @@ struct Args { #[arg(long, env = "OPENSHELL_ENABLE_USER_NAMESPACES")] enable_user_namespaces: bool, + + /// Lifetime (seconds) of the projected `ServiceAccount` token + /// kubelet writes into each sandbox pod for the `IssueSandboxToken` + /// bootstrap exchange. Kubelet enforces a minimum of 600s; the + /// gateway clamps values outside `[600, 86400]`. Default 3600. 
+ #[arg(long, env = "OPENSHELL_K8S_SA_TOKEN_TTL_SECS", default_value_t = 3600)] + sa_token_ttl_secs: i64, } #[tokio::main] @@ -93,6 +100,7 @@ async fn main() -> Result<()> { client_tls_secret_name: args.client_tls_secret_name.unwrap_or_default(), host_gateway_ip: args.host_gateway_ip.unwrap_or_default(), enable_user_namespaces: args.enable_user_namespaces, + sa_token_ttl_secs: args.sa_token_ttl_secs, }) .await .into_diagnostic()?; diff --git a/crates/openshell-sandbox/src/grpc_client.rs b/crates/openshell-sandbox/src/grpc_client.rs index fae66d5c5..e0b20957e 100644 --- a/crates/openshell-sandbox/src/grpc_client.rs +++ b/crates/openshell-sandbox/src/grpc_client.rs @@ -299,8 +299,7 @@ fn compute_refresh_delay(slot: &TokenSlot) -> Duration { .map_or(0, |d| d.as_millis()), ) .unwrap_or(i64::MAX); - let remaining_ms = - parse_jwt_exp_ms(bearer).map_or(21_600_000, |exp| exp - now_ms); // 6 h fallback + let remaining_ms = parse_jwt_exp_ms(bearer).map_or(21_600_000, |exp| exp - now_ms); // 6 h fallback let mut delay_ms = (remaining_ms.max(0) * 8 / 10).clamp(60_000, 43_200_000); // Up to 10 % jitter, derived deterministically from token bytes so // unit tests are reproducible without injecting an RNG. diff --git a/crates/openshell-server/src/config_file.rs b/crates/openshell-server/src/config_file.rs index 96a7af90d..5a20dff9f 100644 --- a/crates/openshell-server/src/config_file.rs +++ b/crates/openshell-server/src/config_file.rs @@ -115,6 +115,11 @@ pub struct GatewayFileSection { pub host_gateway_ip: Option, #[serde(default)] pub enable_user_namespaces: Option, + /// Lifetime (seconds) of the projected `ServiceAccount` token kubelet + /// writes for the `IssueSandboxToken` bootstrap exchange. Driver + /// clamps to `[600, 86400]`. 
+ #[serde(default)] + pub sa_token_ttl_secs: Option, #[serde(default)] pub guest_tls_ca: Option, #[serde(default)] @@ -249,6 +254,7 @@ fn inheritable_keys(driver: ComputeDriverKind) -> &'static [&'static str] { "client_tls_secret_name", "host_gateway_ip", "enable_user_namespaces", + "sa_token_ttl_secs", ], ComputeDriverKind::Docker => &[ "sandbox_namespace", @@ -283,6 +289,7 @@ fn gateway_inherited_value(g: &GatewayFileSection, key: &str) -> Option g.client_tls_secret_name.as_deref().map(string_value), "host_gateway_ip" => g.host_gateway_ip.as_deref().map(string_value), "enable_user_namespaces" => g.enable_user_namespaces.map(toml::Value::Boolean), + "sa_token_ttl_secs" => g.sa_token_ttl_secs.map(toml::Value::Integer), "guest_tls_ca" => g.guest_tls_ca.as_deref().map(path_value), "guest_tls_cert" => g.guest_tls_cert.as_deref().map(path_value), "guest_tls_key" => g.guest_tls_key.as_deref().map(path_value), diff --git a/deploy/helm/openshell/templates/gateway-config.yaml b/deploy/helm/openshell/templates/gateway-config.yaml index 9793c2a5e..302a5806f 100644 --- a/deploy/helm/openshell/templates/gateway-config.yaml +++ b/deploy/helm/openshell/templates/gateway-config.yaml @@ -94,6 +94,7 @@ data: [openshell.drivers.kubernetes] grpc_endpoint = {{ include "openshell.grpcEndpoint" . | quote }} supervisor_sideload_method = {{ include "openshell.supervisorSideloadMethod" . | quote }} + sa_token_ttl_secs = {{ .Values.server.sandboxJwt.k8sSaTokenTtlSecs | default 3600 }} {{- if .Values.server.sandboxImagePullPolicy }} image_pull_policy = {{ .Values.server.sandboxImagePullPolicy | quote }} {{- end }} diff --git a/deploy/helm/openshell/values.yaml b/deploy/helm/openshell/values.yaml index a694dc59d..c2f7362c3 100644 --- a/deploy/helm/openshell/values.yaml +++ b/deploy/helm/openshell/values.yaml @@ -183,6 +183,12 @@ server: gatewayId: "" # Token TTL in seconds. Defaults to 86400 (24h). 
ttlSecs: 86400 + # Lifetime (seconds) of the projected ServiceAccount token kubelet + # writes into each sandbox pod for the IssueSandboxToken bootstrap + # exchange. Kubelet enforces a minimum of 600s; the driver clamps + # values outside [600, 86400]. Default 3600 — generous, since the + # supervisor consumes the token within seconds of pod start. + k8sSaTokenTtlSecs: 3600 # OIDC (OpenID Connect) configuration for JWT-based authentication. # When issuer is set, the server validates Bearer tokens on gRPC requests. oidc: From c4779922b06c07f4b294aad226734f4ed60679b3 Mon Sep 17 00:00:00 2001 From: Taylor Mutch Date: Fri, 15 May 2026 07:29:33 -0700 Subject: [PATCH 08/18] feat(server): JWKS-based K8s ServiceAccount token validation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Replaces the LiveK8sResolver stub with a working validator. Sandbox pods present their projected ServiceAccount token via Authorization: Bearer on IssueSandboxToken; the gateway: 1. Decodes the JWT header and looks up the signing key. 2. On miss, fetches the apiserver's /.well-known/openid-configuration discovery doc + /openid/v1/jwks via kube::Client and caches the keys. 3. Validates the token's signature (RS256), issuer, audience (openshell-gateway), and expiry. 4. Reads `kubernetes.io.pod.{name,uid}` from the claims and GETs the pod in the gateway's sandbox namespace. 5. Verifies the live pod's UID matches the token's UID (defense against replayed tokens from recreated pods with the same name) and reads the openshell.io/sandbox-id annotation to derive the sandbox UUID. The gateway needs no system:auth-delegator ClusterRoleBinding — JWKS validation is local, so the only K8s permission it consumes is the namespace Role's `pods: get` grant. Discovery + JWKS reads ride the gateway's existing kube::Client auth (system:service-account-issuer- discovery is bound to system:authenticated in every supported K8s distro). 
ServerState gains an in-cluster detection path in run_server: when KUBERNETES_SERVICE_HOST is set AND a sandbox JWT issuer is configured, construct the resolver and wire it as state.k8s_sa_authenticator. The existing K8sServiceAccountAuthenticator (path-scoped to IssueSandboxToken) becomes functional. Tests: JWKS path parsing covers absolute URL, relative path, query string, and garbage rejection. End-to-end validation against a real apiserver is exercised in the helm-dev demo. Signed-off-by: Taylor Mutch --- Cargo.lock | 1 + crates/openshell-server/Cargo.toml | 1 + crates/openshell-server/src/auth/k8s_sa.rs | 349 +++++++++++++++++++-- crates/openshell-server/src/lib.rs | 33 ++ 4 files changed, 355 insertions(+), 29 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 6f55814f1..b54e2a54f 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3742,6 +3742,7 @@ dependencies = [ "tower-http 0.6.8", "tracing", "tracing-subscriber", + "url", "uuid", "wiremock", "x509-parser", diff --git a/crates/openshell-server/Cargo.toml b/crates/openshell-server/Cargo.toml index 34768c4b9..fa19ab526 100644 --- a/crates/openshell-server/Cargo.toml +++ b/crates/openshell-server/Cargo.toml @@ -83,6 +83,7 @@ hmac = "0.12" sha2 = { workspace = true } jsonwebtoken = { workspace = true } async-trait = "0.1" +url = { workspace = true } hex = "0.4" russh = "0.57" rand = { workspace = true } diff --git a/crates/openshell-server/src/auth/k8s_sa.rs b/crates/openshell-server/src/auth/k8s_sa.rs index f8a9a9fae..be625c703 100644 --- a/crates/openshell-server/src/auth/k8s_sa.rs +++ b/crates/openshell-server/src/auth/k8s_sa.rs @@ -17,9 +17,15 @@ use super::authenticator::Authenticator; use super::principal::{Principal, SandboxIdentitySource, SandboxPrincipal}; use async_trait::async_trait; +use jsonwebtoken::{Algorithm, DecodingKey, Validation, decode, decode_header}; +use k8s_openapi::api::core::v1::Pod; +use kube::api::Api; +use serde::Deserialize; +use std::collections::HashMap; use std::sync::Arc; +use 
tokio::sync::{Mutex, RwLock}; use tonic::Status; -use tracing::{debug, warn}; +use tracing::{debug, info, warn}; /// gRPC method path that this authenticator accepts. All other paths fall /// through (return `Ok(None)`) so a gateway-minted JWT is required there. @@ -29,7 +35,6 @@ pub const ISSUE_SANDBOX_TOKEN_PATH: &str = "/openshell.v1.OpenShell/IssueSandbox /// Kubernetes compute driver at pod-create time. The gateway treats this /// annotation as authoritative; the K8s `Role` granted to the gateway must /// not include `patch pods` (see plan §11.8). -#[allow(dead_code)] pub const SANDBOX_ID_ANNOTATION: &str = "openshell.io/sandbox-id"; /// Resolved identity extracted from a validated SA token + pod lookup. @@ -117,46 +122,306 @@ impl Authenticator for K8sServiceAccountAuthenticator { } } -/// Live resolver backed by a `kube::Client`. -/// -/// Validates the projected `ServiceAccount` token locally against the -/// apiserver's JWKS endpoint — no `TokenReview` API call required (so the -/// gateway needs no `system:auth-delegator` cluster binding). The -/// namespace-scoped `Role` granted in the Helm chart provides the only -/// permission this resolver needs: `pods: get`, used to read the -/// `openshell.io/sandbox-id` annotation. -/// -/// The JWKS fetch + signature verification implementation lands in the -/// follow-up that brings the K8s helm-dev demo end-to-end; PR 3 ships -/// the wire break with Docker/Podman as the only exercised drivers. +/// K8s apiserver discovery document (subset of fields used). +#[derive(Deserialize)] +struct ApiserverDiscovery { + issuer: String, + jwks_uri: String, +} + +/// JWKS key set returned by the apiserver's `/openid/v1/jwks` endpoint. +#[derive(Deserialize)] +struct JwkSet { + keys: Vec, +} + +#[derive(Deserialize)] +struct JwkKey { + kid: Option, + kty: String, + #[serde(default)] + n: String, + #[serde(default)] + e: String, + alg: Option, +} + +/// Claims subset extracted from a validated projected SA token. 
`exp`, +/// `aud`, and `serviceaccount` are validated by `jsonwebtoken` but we +/// don't read them post-decode — dead-code-allowed so the structural +/// match against the token shape stays explicit. +#[derive(Debug, Deserialize)] #[allow(dead_code)] -pub struct LiveK8sResolver { - client: kube::Client, - audience: String, +struct K8sSaClaims { + /// `system:serviceaccount::` + sub: String, + iss: String, + /// The audience claim is always an array for projected SA tokens. + #[serde(default)] + aud: Vec, + exp: i64, + #[serde(rename = "kubernetes.io")] + kubernetes: K8sClaim, +} + +#[derive(Debug, Deserialize)] +#[allow(dead_code)] +struct K8sClaim { namespace: String, + pod: K8sPodClaim, + #[serde(default)] + serviceaccount: Option, } -impl LiveK8sResolver { +#[derive(Debug, Deserialize)] +struct K8sPodClaim { + name: String, + uid: String, +} + +#[derive(Debug, Deserialize)] +struct K8sSaClaim { #[allow(dead_code)] - pub fn new(client: kube::Client, audience: String, namespace: String) -> Self { + name: String, + #[allow(dead_code)] + uid: String, +} + +/// JWKS cache for the K8s apiserver's projected `ServiceAccount` token +/// issuer. Discovery + key fetch lazily on first validate; subsequent +/// validations are in-process signature checks. Refreshes on `kid` miss +/// so apiserver key rotation propagates without a restart. +pub struct K8sApiserverJwks { + client: kube::Client, + expected_audience: String, + state: RwLock, + refresh: Mutex<()>, +} + +#[derive(Default)] +struct JwksState { + issuer: Option, + jwks_path: Option, + keys: HashMap, +} + +impl K8sApiserverJwks { + pub fn new(client: kube::Client, expected_audience: String) -> Self { Self { client, - audience, - namespace, + expected_audience, + state: RwLock::new(JwksState::default()), + refresh: Mutex::new(()), } } + + /// Validate `token`, returning the parsed claims on success. 
+ #[allow(clippy::result_large_err)] + async fn validate(&self, token: &str) -> Result { + // Decode the header to find the kid first; we lazily load on demand. + let header = decode_header(token).map_err(|e| { + debug!(error = %e, "K8s SA JWT header decode failed"); + Status::unauthenticated("invalid token") + })?; + let kid = header + .kid + .ok_or_else(|| Status::unauthenticated("invalid token: missing kid"))?; + + let (issuer, key) = if let Some(pair) = self.cached_key(&kid).await { + pair + } else { + self.refresh_keys().await?; + self.cached_key(&kid).await.ok_or_else(|| { + debug!(kid = %kid, "K8s SA JWT kid not found in apiserver JWKS"); + Status::unauthenticated("invalid token: unknown signing key") + })? + }; + + let mut validation = Validation::new(Algorithm::RS256); + validation.algorithms = vec![Algorithm::RS256]; + validation.set_issuer(&[&issuer]); + validation.set_audience(&[&self.expected_audience]); + validation.set_required_spec_claims(&["iss", "aud", "exp", "sub"]); + + let data = decode::(token, &key, &validation).map_err(|e| { + debug!(error = %e, "K8s SA JWT validation failed"); + Status::unauthenticated(format!("invalid SA token: {e}")) + })?; + Ok(data.claims) + } + + async fn cached_key(&self, kid: &str) -> Option<(String, DecodingKey)> { + let state = self.state.read().await; + let issuer = state.issuer.clone()?; + let key = state.keys.get(kid).cloned()?; + Some((issuer, key)) + } + + /// Fetch the discovery document + JWKS and replace the cached state. + /// Serializes concurrent refreshes; each waiting caller still re-fetches.
+ #[allow(clippy::result_large_err)] + async fn refresh_keys(&self) -> Result<(), Status> { + let _guard = self.refresh.lock().await; + info!("refreshing K8s apiserver JWKS"); + let discovery: ApiserverDiscovery = self + .request_apiserver("/.well-known/openid-configuration") + .await?; + let jwks_path = jwks_path_from_uri(&discovery.jwks_uri).ok_or_else(|| { + Status::internal(format!( + "apiserver returned unusable jwks_uri '{}'", + discovery.jwks_uri + )) + })?; + let jwks: JwkSet = self.request_apiserver(&jwks_path).await?; + let mut keys = HashMap::new(); + for key in &jwks.keys { + if key.kty != "RSA" { + continue; + } + let Some(ref kid) = key.kid else { + continue; + }; + if let Some(alg) = key.alg.as_deref() + && alg != "RS256" + { + continue; + } + match DecodingKey::from_rsa_components(&key.n, &key.e) { + Ok(dk) => { + keys.insert(kid.clone(), dk); + } + Err(e) => warn!(kid = %kid, error = %e, "skipped malformed apiserver JWK"), + } + } + info!( + count = keys.len(), + issuer = %discovery.issuer, + "loaded apiserver JWKS" + ); + let mut state = self.state.write().await; + state.issuer = Some(discovery.issuer); + state.jwks_path = Some(jwks_path); + state.keys = keys; + Ok(()) + } + + #[allow(clippy::result_large_err)] + async fn request_apiserver( + &self, + path: &str, + ) -> Result { + let req = http::Request::builder() + .uri(path) + .body(Vec::new()) + .map_err(|e| Status::internal(format!("apiserver request build: {e}")))?; + self.client + .request::(req) + .await + .map_err(|e| Status::internal(format!("apiserver request failed: {e}"))) + } +} + +/// Pull a path-only URI out of the `jwks_uri` field. The apiserver's +/// discovery doc returns an absolute URL (e.g. +/// `https://kubernetes.default.svc.cluster.local/openid/v1/jwks`); we +/// strip to the path so `kube::Client::request` can be reused. 
+fn jwks_path_from_uri(uri: &str) -> Option { + if uri.starts_with('/') { + return Some(uri.to_string()); + } + let parsed = url::Url::parse(uri).ok()?; + let mut out = parsed.path().to_string(); + if let Some(q) = parsed.query() { + out.push('?'); + out.push_str(q); + } + Some(out) +} + +/// Resolver backed by the apiserver's JWKS endpoint (for SA-token +/// signature verification) and `kube::Client` (for the per-pod +/// annotation lookup). +pub struct LiveK8sResolver { + jwks: Arc, + pods_api: Api, +} + +impl LiveK8sResolver { + pub fn new(client: kube::Client, namespace: &str, expected_audience: String) -> Self { + let pods_api: Api = Api::namespaced(client.clone(), namespace); + let jwks = Arc::new(K8sApiserverJwks::new(client, expected_audience)); + Self { jwks, pods_api } + } } #[async_trait] impl K8sIdentityResolver for LiveK8sResolver { - async fn resolve(&self, _token: &str) -> Result, Status> { - // Full JWKS verification + pod annotation lookup lands in the - // K8s-demo follow-up. Returning `Unimplemented` keeps the - // K8s-side `IssueSandboxToken` call from silently succeeding - // before the validator is in place. - Err(Status::unimplemented( - "K8s ServiceAccount bootstrap pending JWKS implementation", - )) + async fn resolve(&self, token: &str) -> Result, Status> { + let claims = match self.jwks.validate(token).await { + Ok(c) => c, + Err(status) if status.code() == tonic::Code::Unauthenticated => { + // Returning Ok(None) lets the chain fall through; the + // outer router then returns Unauthenticated to the client. + return Ok(None); + } + Err(other) => return Err(other), + }; + + debug!( + sub = %claims.sub, + iss = %claims.iss, + pod_name = %claims.kubernetes.pod.name, + "validated K8s SA token" + ); + + // Look up the pod and read its sandbox-id annotation. 
+ let pod = self + .pods_api + .get_opt(&claims.kubernetes.pod.name) + .await + .map_err(|e| { + warn!( + pod = %claims.kubernetes.pod.name, + error = %e, + "failed to fetch sandbox pod for annotation lookup" + ); + Status::internal(format!("pod GET failed: {e}")) + })?; + let Some(pod) = pod else { + warn!( + pod = %claims.kubernetes.pod.name, + "sandbox pod referenced by SA token not found in this namespace" + ); + return Err(Status::not_found("sandbox pod not found")); + }; + + // Defense-in-depth: confirm the pod UID matches the SA token's + // `kubernetes.io.pod.uid`. Prevents a replayed token from a + // recreated pod with the same name. + let actual_uid = pod.metadata.uid.as_deref().unwrap_or_default(); + if actual_uid != claims.kubernetes.pod.uid { + warn!( + pod = %claims.kubernetes.pod.name, + claimed_uid = %claims.kubernetes.pod.uid, + actual_uid = %actual_uid, + "SA token pod UID does not match live pod; rejecting" + ); + return Err(Status::permission_denied("SA token pod UID mismatch")); + } + + let sandbox_id = pod + .metadata + .annotations + .as_ref() + .and_then(|a| a.get(SANDBOX_ID_ANNOTATION)) + .cloned() + .unwrap_or_default(); + + Ok(Some(ResolvedK8sIdentity { + sandbox_id, + pod_name: claims.kubernetes.pod.name, + pod_uid: claims.kubernetes.pod.uid, + })) } } @@ -207,6 +472,32 @@ mod tests { h } + #[test] + fn jwks_path_extracts_absolute_url() { + let path = + jwks_path_from_uri("https://kubernetes.default.svc.cluster.local/openid/v1/jwks") + .expect("apiserver-style URL must parse"); + assert_eq!(path, "/openid/v1/jwks"); + } + + #[test] + fn jwks_path_preserves_relative_path() { + let path = jwks_path_from_uri("/openid/v1/jwks").expect("relative path must round-trip"); + assert_eq!(path, "/openid/v1/jwks"); + } + + #[test] + fn jwks_path_preserves_query_string() { + let path = jwks_path_from_uri("https://apiserver/openid/v1/jwks?version=v1") + .expect("query strings must be preserved"); + assert_eq!(path, "/openid/v1/jwks?version=v1"); + } + 
+ #[test] + fn jwks_path_rejects_garbage() { + assert!(jwks_path_from_uri("not a url").is_none()); + } + #[tokio::test] async fn authenticates_on_issue_path_only() { let resolved = ResolvedK8sIdentity { diff --git a/crates/openshell-server/src/lib.rs b/crates/openshell-server/src/lib.rs index e62ebc141..e64a72c4b 100644 --- a/crates/openshell-server/src/lib.rs +++ b/crates/openshell-server/src/lib.rs @@ -297,6 +297,39 @@ pub async fn run_server( state.sandbox_jwt_authenticator = Some(Arc::new(authenticator)); } + // K8s ServiceAccount bootstrap authenticator. Only constructed when + // the gateway is running in-cluster (kubelet provides the API host + // env var) and has a sandbox JWT issuer to mint replacements against; + // outside the cluster we can't talk to the apiserver's JWKS endpoint, + // and without the issuer there's nothing to exchange the SA token + // for. + if state.sandbox_jwt_issuer.is_some() && std::env::var_os("KUBERNETES_SERVICE_HOST").is_some() { + match kube::Client::try_default().await { + Ok(client) => { + let namespace = std::env::var("POD_NAMESPACE") + .ok() + .filter(|s| !s.is_empty()) + .unwrap_or_else(|| "default".to_string()); + let resolver = Arc::new(auth::k8s_sa::LiveK8sResolver::new( + client, + &namespace, + "openshell-gateway".to_string(), + )); + let authenticator = auth::k8s_sa::K8sServiceAccountAuthenticator::new(resolver); + state.k8s_sa_authenticator = Some(Arc::new(authenticator)); + info!( + namespace = %namespace, + "K8s ServiceAccount bootstrap authenticator enabled" + ); + } + Err(e) => warn!( + error = %e, + "in-cluster K8s client construction failed; \ + K8s ServiceAccount bootstrap is disabled" + ), + } + } + let state = Arc::new(state); // Resume sandboxes that were stopped during the previous gateway From 99c4c8103412a9bcf5271d1664c207541d8d64fb Mon Sep 17 00:00:00 2001 From: Taylor Mutch Date: Fri, 15 May 2026 08:14:31 -0700 Subject: [PATCH 09/18] fix(server): three sandbox-identity issues found during helm 
exercise MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Three regressions / inefficiencies surfaced while bringing the per-sandbox identity series up end-to-end in the local helm cluster: 1. CLI returned Unauthenticated against a no-OIDC dev gateway. PR 3 removed the pre-refactor "no OIDC = pass through" behavior; with only sandbox-side authenticators in the chain, plain user CLI calls hit Unauthenticated. Add a PermissiveUserAuthenticator that installs as a final fallback when no OIDC is configured but sandbox JWT signing IS — produces a synthetic dev-anonymous user principal so the rest of the handler chain treats CLI calls as User and bypasses the IDOR guard. Production OIDC deployments are unaffected: when OIDC is configured the fallback is not installed and missing-Bearer still 401s. 2. Sandbox supervisor re-ran the K8s SA bootstrap exchange on every connect_channel() call. With multiple subsystems each building their own channels, IssueSandboxToken was firing every few seconds even though TOKEN_SLOT already had a fresh token. Change connect_channel to reuse TOKEN_SLOT when populated; only run acquire_sandbox_token on the first call per process. The refresh loop keeps the slot fresh thereafter. 3. K8s SA authenticator looked up sandbox pods in the gateway's own namespace (POD_NAMESPACE) instead of the K8s driver's configured sandbox namespace. Source from kubernetes_config_from_file() so the resolver targets the same namespace the driver creates pods in. Verified end-to-end against the helm-dev cluster: - Two sandboxes get distinct gateway JWTs with their own sandbox UUIDs. - Cross-sandbox GetSandboxConfig is rejected with PermissionDenied and the auth::guard audit log fires with both principal and requested IDs. - RefreshSandboxToken mints a new JWT and revokes the old jti; the old token is then rejected with Unauthenticated: revoked token. 
Signed-off-by: Taylor Mutch --- crates/openshell-sandbox/src/grpc_client.rs | 25 +++++----- .../src/auth/authenticator.rs | 49 ++++++++++++++++++- crates/openshell-server/src/lib.rs | 15 +++--- crates/openshell-server/src/multiplex.rs | 15 +++++- 4 files changed, 84 insertions(+), 20 deletions(-) diff --git a/crates/openshell-sandbox/src/grpc_client.rs b/crates/openshell-sandbox/src/grpc_client.rs index e0b20957e..86166cb62 100644 --- a/crates/openshell-sandbox/src/grpc_client.rs +++ b/crates/openshell-sandbox/src/grpc_client.rs @@ -159,23 +159,26 @@ async fn build_plain_channel(endpoint: &str) -> Result { /// Build a Bearer-authenticated channel to the gateway. /// -/// Resolves the sandbox JWT via the three-step lookup described at the -/// module level (env → file → K8s SA bootstrap exchange), installs the -/// token into the process-wide [`TOKEN_SLOT`], wraps the channel in an -/// [`AuthInterceptor`] that reads from that slot, and spawns the refresh -/// loop the first time the channel is built. +/// First call per process resolves the sandbox JWT via the three-step +/// lookup (env → file → K8s SA bootstrap exchange) and installs it into +/// the process-wide [`TOKEN_SLOT`]. Subsequent calls reuse the cached +/// slot — the refresh loop keeps the value fresh, so re-running the +/// bootstrap is both unnecessary and (on the K8s SA path) expensive +/// (one apiserver round-trip per call). The refresh loop itself is +/// spawned once per process via [`REFRESH_SPAWNED`]. async fn connect_channel(endpoint: &str) -> Result { let channel = build_plain_channel(endpoint).await?; - let token = acquire_sandbox_token(endpoint, &channel).await?; - let slot = install_token_slot(&token)?; + let slot = if let Some(existing) = TOKEN_SLOT.get() { + existing.clone() + } else { + let token = acquire_sandbox_token(endpoint, &channel).await?; + install_token_slot(&token)? 
+ }; let intercepted = InterceptedService::new(channel, AuthInterceptor::new(slot.clone())); - // Spawn the refresh loop once per process. It uses the same authed - // channel, so its outbound calls always carry the current token. if REFRESH_SPAWNED.set(()).is_ok() { let refresh_channel = intercepted.clone(); - let refresh_slot = slot; tokio::spawn(async move { - refresh_token_loop(refresh_channel, refresh_slot).await; + refresh_token_loop(refresh_channel, slot).await; }); } Ok(intercepted) diff --git a/crates/openshell-server/src/auth/authenticator.rs b/crates/openshell-server/src/auth/authenticator.rs index 827c3b8c1..ee11f8f35 100644 --- a/crates/openshell-server/src/auth/authenticator.rs +++ b/crates/openshell-server/src/auth/authenticator.rs @@ -17,8 +17,13 @@ //! - [`super::k8s_sa::K8sServiceAccountAuthenticator`] — K8s projected SA //! tokens (path-scoped to `IssueSandboxToken`) //! - [`super::oidc::OidcAuthenticator`] — user OIDC Bearer tokens +//! - [`PermissiveUserAuthenticator`] — final-fallback dev-mode catch-all +//! that produces a synthetic user principal when no OIDC is +//! configured. Preserves the pre-PR-1 "no OIDC = open" posture for +//! singleplayer / helm-dev deployments. -use super::principal::Principal; +use super::identity::{Identity, IdentityProvider}; +use super::principal::{Principal, UserPrincipal}; use async_trait::async_trait; use std::sync::Arc; use tonic::Status; @@ -91,6 +96,48 @@ impl std::fmt::Debug for AuthenticatorChain { } } +/// Final-fallback authenticator that produces a synthetic user principal +/// for any request the earlier authenticators didn't claim. Used only +/// when no user-side authentication is configured (no OIDC, no fronting +/// proxy contract) — the pre-PR-1 gateway accepted such requests with +/// no auth at all; this preserves that posture in a principal-aware +/// way so handlers always see *some* principal in extensions. 
+/// +/// Producing a User principal (rather than Anonymous) means dev-mode +/// requests pass the per-handler IDOR guard via the User-bypass +/// branch — equivalent to "RBAC was the user's gate" with the dev +/// default of "every caller is a user." +pub struct PermissiveUserAuthenticator { + subject: String, +} + +impl PermissiveUserAuthenticator { + pub fn new(subject: impl Into<String>) -> Self { + Self { + subject: subject.into(), + } + } +} + +#[async_trait] +impl Authenticator for PermissiveUserAuthenticator { + async fn authenticate( + &self, + _headers: &http::HeaderMap, + _path: &str, + ) -> Result<Option<Principal>, Status> { + Ok(Some(Principal::User(UserPrincipal { + identity: Identity { + subject: self.subject.clone(), + display_name: None, + roles: vec![], + scopes: vec![], + provider: IdentityProvider::Internal, + }, + }))) + } +} + #[cfg(test)] pub mod test_support { use super::*; diff --git a/crates/openshell-server/src/lib.rs b/crates/openshell-server/src/lib.rs index e64a72c4b..90faaeec8 100644 --- a/crates/openshell-server/src/lib.rs +++ b/crates/openshell-server/src/lib.rs @@ -304,21 +304,24 @@ pub async fn run_server( // and without the issuer there's nothing to exchange the SA token // for. if state.sandbox_jwt_issuer.is_some() && std::env::var_os("KUBERNETES_SERVICE_HOST").is_some() { + // Pod lookups must target the sandbox namespace (where the K8s + // driver places sandbox pods), not the gateway's own pod + // namespace. Sourced from the merged + // `[openshell.drivers.kubernetes].namespace` config, falling + // back to "default" only if the driver config can't be parsed.
+ let sandbox_namespace = kubernetes_config_from_file(config_file.as_ref()) + .map_or_else(|_| "default".to_string(), |cfg| cfg.namespace); match kube::Client::try_default().await { Ok(client) => { - let namespace = std::env::var("POD_NAMESPACE") - .ok() - .filter(|s| !s.is_empty()) - .unwrap_or_else(|| "default".to_string()); let resolver = Arc::new(auth::k8s_sa::LiveK8sResolver::new( client, - &namespace, + &sandbox_namespace, "openshell-gateway".to_string(), )); let authenticator = auth::k8s_sa::K8sServiceAccountAuthenticator::new(resolver); state.k8s_sa_authenticator = Some(Arc::new(authenticator)); info!( - namespace = %namespace, + namespace = %sandbox_namespace, "K8s ServiceAccount bootstrap authenticator enabled" ); } diff --git a/crates/openshell-server/src/multiplex.rs b/crates/openshell-server/src/multiplex.rs index 5c652e679..c10839cc7 100644 --- a/crates/openshell-server/src/multiplex.rs +++ b/crates/openshell-server/src/multiplex.rs @@ -32,7 +32,7 @@ use tracing::Span; use crate::{ OpenShellService, ServerState, - auth::authenticator::AuthenticatorChain, + auth::authenticator::{AuthenticatorChain, PermissiveUserAuthenticator}, auth::authz::AuthzPolicy, auth::identity::Identity, auth::oidc::{self, OidcAuthenticator}, @@ -267,7 +267,12 @@ where /// 2. `SandboxJwtAuthenticator` — validates gateway-minted JWTs. Recognized /// via a distinctive `kid` so non-matching Bearer tokens fall through. /// 3. `OidcAuthenticator` — validates user Bearer tokens against the -/// configured OIDC issuer. +/// configured OIDC issuer. Returns `Unauthenticated` for missing +/// Bearer headers so non-OIDC clients can't sneak through. +/// 4. `PermissiveUserAuthenticator` — installed only when no OIDC is +/// configured (singleplayer / helm-dev). Catches anything the +/// sandbox authenticators didn't claim and produces a synthetic +/// user principal, preserving the pre-PR-1 "no OIDC = open" posture. 
/// /// When neither OIDC nor gateway-minted JWTs are configured (a barebones /// dev gateway), the chain is left as `None` so the router short-circuits @@ -282,6 +287,12 @@ fn build_authenticator_chain(state: &ServerState) -> Option } if let Some(cache) = state.oidc_cache.clone() { authenticators.push(Arc::new(OidcAuthenticator::new(cache))); + } else if !authenticators.is_empty() { + // No OIDC, but sandbox-side authentication IS configured — + // user CLI calls must still pass through, so install a + // permissive final fallback. Production deployments configure + // OIDC and this branch is unused. + authenticators.push(Arc::new(PermissiveUserAuthenticator::new("dev-anonymous"))); } if authenticators.is_empty() { return None; From 06e0d1ab287025fa41e433bbfb72126b27325bd6 Mon Sep 17 00:00:00 2001 From: Taylor Mutch Date: Fri, 15 May 2026 08:32:15 -0700 Subject: [PATCH 10/18] feat(sandbox): openshell-sandbox debug-rpc subcommand for end-to-end testing MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adds a small subcommand to the supervisor binary that issues one-shot sandbox-class RPCs against the gateway using the supervisor's existing token-acquisition pipeline. Designed to be invoked via docker exec or kubectl exec into a running sandbox to verify the per-sandbox identity flow end-to-end without writing a custom test binary inside the sandbox image. Subcommands: - get-sandbox-config --sandbox-id — call GetSandboxConfig - refresh — call RefreshSandboxToken - show-token — print raw gateway JWT bytes - show-principal — pretty-print decoded JWT claims Verification flow this enables (Docker path): docker exec sandbox-a openshell-sandbox debug-rpc show-principal docker exec sandbox-a openshell-sandbox debug-rpc \ get-sandbox-config --sandbox-id # → exit code 7 + "PermissionDenied: cross-sandbox access denied" K8s path: same RPCs, kubectl exec instead. 
show-token and show-principal intentionally don't trigger the K8s SA bootstrap exchange — they only read an already-cached token, so inspection doesn't burn a fresh JWT mint per call. Signed-off-by: Taylor Mutch --- crates/openshell-sandbox/src/debug_rpc.rs | 236 ++++++++++++++++++++++ crates/openshell-sandbox/src/lib.rs | 1 + crates/openshell-sandbox/src/main.rs | 23 +++ 3 files changed, 260 insertions(+) create mode 100644 crates/openshell-sandbox/src/debug_rpc.rs diff --git a/crates/openshell-sandbox/src/debug_rpc.rs b/crates/openshell-sandbox/src/debug_rpc.rs new file mode 100644 index 000000000..013099198 --- /dev/null +++ b/crates/openshell-sandbox/src/debug_rpc.rs @@ -0,0 +1,236 @@ +// SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +//! One-shot debug RPCs exposed via `openshell-sandbox debug-rpc`. +//! +//! Designed for end-to-end verification of the per-sandbox identity +//! flow (issue #1354). A `docker exec` (or `kubectl exec`) into a +//! running sandbox can issue raw sandbox-class gRPC calls without +//! standing up a custom binary inside the sandbox image — useful for +//! confirming the cross-sandbox IDOR guard and refresh semantics. +//! +//! Subcommands: +//! - `get-sandbox-config --sandbox-id ` — call `GetSandboxConfig` +//! - `refresh` — call `RefreshSandboxToken` +//! - `show-token` — print the raw gateway JWT bytes +//! - `show-principal` — pretty-print the decoded JWT claims +//! (no signature verification — the supervisor already trusts the +//! token's origin) + +use base64::Engine as _; +use miette::{IntoDiagnostic, Result, WrapErr}; +use openshell_core::proto::{ + GetSandboxConfigRequest, RefreshSandboxTokenRequest, open_shell_client::OpenShellClient, +}; + +use crate::grpc_client::{AuthedChannel, connect_channel_pub}; + +/// Entry point for the `debug-rpc` subcommand. Returns the process exit +/// code; `main` propagates it. 
+pub async fn run(args: &[String]) -> Result<i32> { + let cmd = args + .first() + .map(String::as_str) + .ok_or_else(|| miette::miette!("{}", USAGE))?; + + match cmd { + "get-sandbox-config" => run_get_sandbox_config(&args[1..]).await, + "refresh" => run_refresh().await, + "show-token" => run_show_token(), + "show-principal" => run_show_principal(), + "--help" | "-h" => { + println!("{USAGE}"); + Ok(0) + } + other => Err(miette::miette!( + "unknown debug-rpc command '{other}'\n\n{USAGE}" + )), + } +} + +const USAGE: &str = "\ +usage: openshell-sandbox debug-rpc <command> [options] + +commands: + get-sandbox-config --sandbox-id <id> call GetSandboxConfig + refresh call RefreshSandboxToken + show-token print raw gateway JWT + show-principal print decoded JWT claims + +requires: OPENSHELL_ENDPOINT in env, plus one of OPENSHELL_SANDBOX_TOKEN, +OPENSHELL_SANDBOX_TOKEN_FILE, or OPENSHELL_K8S_SA_TOKEN_FILE so the +supervisor's normal token-acquisition path can resolve a JWT."; + +async fn open_client() -> Result<OpenShellClient<AuthedChannel>> { + let endpoint = std::env::var(openshell_core::sandbox_env::ENDPOINT) + .into_diagnostic() + .wrap_err("OPENSHELL_ENDPOINT must be set")?; + let channel = connect_channel_pub(&endpoint).await?; + Ok(OpenShellClient::new(channel)) +} + +async fn run_get_sandbox_config(args: &[String]) -> Result<i32> { + let sandbox_id = parse_flag(args, "--sandbox-id") + .ok_or_else(|| miette::miette!("get-sandbox-config: --sandbox-id is required"))?; + let mut client = open_client().await?; + let resp = client + .get_sandbox_config(GetSandboxConfigRequest { + sandbox_id: sandbox_id.to_string(), + }) + .await; + match resp { + Ok(r) => { + let inner = r.into_inner(); + println!( + "version={} policy_hash={} config_revision={}", + inner.version, inner.policy_hash, inner.config_revision + ); + Ok(0) + } + Err(status) => { + eprintln!("{}: {}", code_name(status.code()), status.message()); + // Map gRPC status to a non-zero exit so callers can branch + // (e.g. expect-permission-denied in a shell test).
+ Ok(match status.code() { + tonic::Code::PermissionDenied => 7, + tonic::Code::Unauthenticated => 16, + tonic::Code::NotFound => 5, + _ => 1, + }) + } + } +} + +async fn run_refresh() -> Result<i32> { + let mut client = open_client().await?; + let resp = client + .refresh_sandbox_token(RefreshSandboxTokenRequest {}) + .await; + match resp { + Ok(r) => { + let inner = r.into_inner(); + println!( + "token={}\nexpires_at_ms={}", + inner.token, inner.expires_at_ms + ); + Ok(0) + } + Err(status) => { + eprintln!("{}: {}", code_name(status.code()), status.message()); + Ok(1) + } + } +} + +fn run_show_token() -> Result<i32> { + let token = read_local_token()?; + println!("{token}"); + Ok(0) +} + +fn run_show_principal() -> Result<i32> { + let token = read_local_token()?; + let payload_b64 = token + .split('.') + .nth(1) + .ok_or_else(|| miette::miette!("token has no payload segment"))?; + let payload = base64::engine::general_purpose::URL_SAFE_NO_PAD + .decode(payload_b64) + .into_diagnostic() + .wrap_err("failed to base64-decode token payload")?; + let claims: serde_json::Value = serde_json::from_slice(&payload) + .into_diagnostic() + .wrap_err("failed to parse token payload as JSON")?; + println!( + "{}", + serde_json::to_string_pretty(&claims).into_diagnostic()? + ); + Ok(0) +} + +/// Read the token from the env/file/SA-bootstrap chain, but only the +/// "already a gateway JWT" paths — show-token / show-principal don't +/// want to actually exchange an SA token. +fn read_local_token() -> Result<String> { + if let Ok(t) = std::env::var(openshell_core::sandbox_env::SANDBOX_TOKEN) + && !t.is_empty() + { + return Ok(t); + } + if let Ok(path) = std::env::var(openshell_core::sandbox_env::SANDBOX_TOKEN_FILE) + && !path.is_empty() + { + return Ok(std::fs::read_to_string(&path) + .into_diagnostic() + .wrap_err_with(|| format!("failed to read sandbox token from {path}"))?
+ .trim() + .to_string()); + } + Err(miette::miette!( + "no in-process gateway JWT available — set OPENSHELL_SANDBOX_TOKEN or \ + OPENSHELL_SANDBOX_TOKEN_FILE. The K8s SA-bootstrap path is intentionally \ + excluded from `show-token` / `show-principal` to avoid issuing a fresh \ + token just for inspection." + )) +} + +fn parse_flag<'a>(args: &'a [String], name: &str) -> Option<&'a str> { + let mut iter = args.iter(); + while let Some(a) = iter.next() { + if a == name { + return iter.next().map(String::as_str); + } + if let Some(rest) = a.strip_prefix(&format!("{name}=")) { + return Some(rest); + } + } + None +} + +fn code_name(c: tonic::Code) -> &'static str { + match c { + tonic::Code::Ok => "OK", + tonic::Code::Cancelled => "Cancelled", + tonic::Code::Unknown => "Unknown", + tonic::Code::InvalidArgument => "InvalidArgument", + tonic::Code::DeadlineExceeded => "DeadlineExceeded", + tonic::Code::NotFound => "NotFound", + tonic::Code::AlreadyExists => "AlreadyExists", + tonic::Code::PermissionDenied => "PermissionDenied", + tonic::Code::ResourceExhausted => "ResourceExhausted", + tonic::Code::FailedPrecondition => "FailedPrecondition", + tonic::Code::Aborted => "Aborted", + tonic::Code::OutOfRange => "OutOfRange", + tonic::Code::Unimplemented => "Unimplemented", + tonic::Code::Internal => "Internal", + tonic::Code::Unavailable => "Unavailable", + tonic::Code::DataLoss => "DataLoss", + tonic::Code::Unauthenticated => "Unauthenticated", + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn parse_flag_handles_space_separated() { + let args: Vec<String> = ["--sandbox-id", "abc-123"] + .iter() + .map(ToString::to_string) + .collect(); + assert_eq!(parse_flag(&args, "--sandbox-id"), Some("abc-123")); + } + + #[test] + fn parse_flag_handles_equals_separated() { + let args: Vec<String> = ["--sandbox-id=abc-123".to_string()].to_vec(); + assert_eq!(parse_flag(&args, "--sandbox-id"), Some("abc-123")); + } + + #[test] + fn parse_flag_returns_none_when_missing() { + let args:
Vec<String> = ["--other".to_string(), "x".to_string()].to_vec(); + assert!(parse_flag(&args, "--sandbox-id").is_none()); + } +} diff --git a/crates/openshell-sandbox/src/lib.rs b/crates/openshell-sandbox/src/lib.rs index ded56ce9e..b83125f12 100644 --- a/crates/openshell-sandbox/src/lib.rs +++ b/crates/openshell-sandbox/src/lib.rs @@ -7,6 +7,7 @@ pub mod bypass_monitor; mod child_env; +pub mod debug_rpc; pub mod denial_aggregator; mod grpc_client; mod identity; diff --git a/crates/openshell-sandbox/src/main.rs b/crates/openshell-sandbox/src/main.rs index 4a6cb1955..3c9e21578 100644 --- a/crates/openshell-sandbox/src/main.rs +++ b/crates/openshell-sandbox/src/main.rs @@ -24,6 +24,15 @@ use openshell_sandbox::run_sandbox; /// performs the copy in pure Rust. const COPY_SELF_SUBCOMMAND: &str = "copy-self"; +/// Subcommand for one-shot debug RPCs from inside a sandbox container. +/// +/// Reads the same token sources as the supervisor (env, file, K8s SA +/// bootstrap) and issues a single gRPC call against the gateway. Useful +/// for end-to-end verification: e.g. `docker exec` into a sandbox, then +/// run `openshell-sandbox debug-rpc get-sandbox-config --sandbox-id <id>` +/// to confirm the cross-sandbox IDOR guard fires. +const DEBUG_RPC_SUBCOMMAND: &str = "debug-rpc"; + /// `OpenShell` Sandbox - process isolation and monitoring. #[derive(Parser, Debug)] #[command(name = "openshell-sandbox")] @@ -150,6 +159,20 @@ fn main() -> Result<()> { return copy_self(dest); } + // Handle `debug-rpc [args]` before clap. Uses a small + // dedicated runtime so we don't pay the supervisor's full startup cost.
+ if raw_args.get(1).map(String::as_str) == Some(DEBUG_RPC_SUBCOMMAND) { + let runtime = tokio::runtime::Builder::new_current_thread() + .enable_all() + .build() + .into_diagnostic()?; + return runtime.block_on(async move { + let _ = rustls::crypto::ring::default_provider().install_default(); + let exit = openshell_sandbox::debug_rpc::run(&raw_args[2..]).await?; + std::process::exit(exit); + }); + } + let args = Args::parse(); // Try to open a rolling log file; fall back to stderr-only logging if it fails From 42289cf419d35a14089fe4d4beac5babbdc46700 Mon Sep 17 00:00:00 2001 From: Taylor Mutch Date: Fri, 15 May 2026 13:39:06 -0700 Subject: [PATCH 11/18] fix(helm): mount sandbox JWT keys without TLS Signed-off-by: Taylor Mutch --- .../skills/debug-openshell-cluster/SKILL.md | 9 ++++++++- crates/openshell-server/src/multiplex.rs | 1 + .../helm/openshell/templates/statefulset.yaml | 18 +++++++++--------- 3 files changed, 18 insertions(+), 10 deletions(-) diff --git a/.agents/skills/debug-openshell-cluster/SKILL.md b/.agents/skills/debug-openshell-cluster/SKILL.md index 177c52f31..6b7f8e6b2 100644 --- a/.agents/skills/debug-openshell-cluster/SKILL.md +++ b/.agents/skills/debug-openshell-cluster/SKILL.md @@ -116,9 +116,16 @@ Check required Helm deployment secrets: kubectl -n openshell get secret \ openshell-server-tls \ openshell-server-client-ca \ - openshell-client-tls + openshell-client-tls \ + openshell-jwt-keys ``` +If the gateway exits with `failed to read sandbox JWT signing key from +/etc/openshell-jwt/signing.pem`, verify that `openshell-jwt-keys` contains +`signing.pem`, `public.pem`, and `kid`, and that the StatefulSet mounts the +`sandbox-jwt` secret at `/etc/openshell-jwt`. The sandbox JWT mount is required +even when local Helm values disable TLS. 
+ Check the image references currently used by the gateway deployment: ```bash diff --git a/crates/openshell-server/src/multiplex.rs b/crates/openshell-server/src/multiplex.rs index c10839cc7..567df2272 100644 --- a/crates/openshell-server/src/multiplex.rs +++ b/crates/openshell-server/src/multiplex.rs @@ -323,6 +323,7 @@ pub struct AuthGrpcRouter { } impl AuthGrpcRouter { + #[cfg(test)] fn new( inner: S, authenticator_chain: Option, diff --git a/deploy/helm/openshell/templates/statefulset.yaml b/deploy/helm/openshell/templates/statefulset.yaml index 2a5ce420c..5dd4f1caf 100644 --- a/deploy/helm/openshell/templates/statefulset.yaml +++ b/deploy/helm/openshell/templates/statefulset.yaml @@ -75,6 +75,9 @@ spec: - name: gateway-config mountPath: /etc/openshell readOnly: true + - name: sandbox-jwt + mountPath: /etc/openshell-jwt + readOnly: true {{- if not .Values.server.disableTls }} - name: tls-cert mountPath: /etc/openshell-tls/server @@ -84,15 +87,12 @@ spec: mountPath: /etc/openshell-tls/client-ca readOnly: true {{- end }} - - name: sandbox-jwt - mountPath: /etc/openshell-jwt - readOnly: true + {{- end }} {{- if and .Values.server.oidc.issuer .Values.server.oidc.caConfigMapName }} - name: oidc-ca mountPath: /etc/openshell-tls/oidc-ca readOnly: true {{- end }} - {{- end }} ports: - name: grpc containerPort: {{ .Values.service.port }} @@ -134,6 +134,10 @@ spec: - name: gateway-config configMap: name: {{ include "openshell.fullname" . 
}}-config + - name: sandbox-jwt + secret: + secretName: {{ .Values.server.sandboxJwt.signingSecretName | default (printf "%s-jwt-keys" (include "openshell.fullname" .)) }} + defaultMode: 0400 {{- if not .Values.server.disableTls }} - name: tls-cert secret: @@ -150,16 +154,12 @@ spec: secretName: {{ .Values.server.tls.clientCaSecretName }} {{- end }} {{- end }} - - name: sandbox-jwt - secret: - secretName: {{ .Values.server.sandboxJwt.signingSecretName | default (printf "%s-jwt-keys" (include "openshell.fullname" .)) }} - defaultMode: 0400 + {{- end }} {{- if and .Values.server.oidc.issuer .Values.server.oidc.caConfigMapName }} - name: oidc-ca configMap: name: {{ .Values.server.oidc.caConfigMapName }} {{- end }} - {{- end }} {{- with .Values.nodeSelector }} nodeSelector: {{- toYaml . | nindent 8 }} From 1dca07bbec5add310a6928aba32b71e107304c0a Mon Sep 17 00:00:00 2001 From: Taylor Mutch Date: Fri, 15 May 2026 16:23:40 -0700 Subject: [PATCH 12/18] test(e2e): configure sandbox JWT keys in harnesses Signed-off-by: Taylor Mutch --- e2e/support/gateway-common.sh | 32 +++++++++++++++++++++++++++++++- e2e/with-docker-gateway.sh | 3 +++ e2e/with-podman-gateway.sh | 3 +++ 3 files changed, 37 insertions(+), 1 deletion(-) diff --git a/e2e/support/gateway-common.sh b/e2e/support/gateway-common.sh index 2f8a2c141..25ffebb9a 100644 --- a/e2e/support/gateway-common.sh +++ b/e2e/support/gateway-common.sh @@ -93,6 +93,37 @@ EOF printf '%s' "${name}" >"${config_home}/openshell/active_gateway" } +e2e_toml_string() { + local value="$1" + value="${value//\\/\\\\}" + value="${value//\"/\\\"}" + printf '"%s"' "${value}" +} + +e2e_generate_gateway_jwt() { + local jwt_dir=$1 + + mkdir -p "${jwt_dir}" + ( + umask 077 + openssl genpkey -algorithm Ed25519 -out "${jwt_dir}/signing.pem" >/dev/null 2>&1 + ) + openssl pkey -in "${jwt_dir}/signing.pem" -pubout -out "${jwt_dir}/public.pem" >/dev/null 2>&1 + openssl rand -hex 16 >"${jwt_dir}/kid" +} + +e2e_write_gateway_jwt_config() { + local 
jwt_dir=$1 + local gateway_id=$2 + + printf '[openshell.gateway.gateway_jwt]\n' + printf 'signing_key_path = %s\n' "$(e2e_toml_string "${jwt_dir}/signing.pem")" + printf 'public_key_path = %s\n' "$(e2e_toml_string "${jwt_dir}/public.pem")" + printf 'kid_path = %s\n' "$(e2e_toml_string "${jwt_dir}/kid")" + printf 'gateway_id = %s\n' "$(e2e_toml_string "${gateway_id}")" + printf 'ttl_secs = 86400\n\n' +} + e2e_build_gateway_binaries() { local root=$1 local target_var=$2 @@ -176,4 +207,3 @@ e2e_print_gateway_log_on_failure() { echo "=== end gateway log ===" fi } - diff --git a/e2e/with-docker-gateway.sh b/e2e/with-docker-gateway.sh index ed920f3e4..bca9d80c6 100755 --- a/e2e/with-docker-gateway.sh +++ b/e2e/with-docker-gateway.sh @@ -391,6 +391,7 @@ e2e_generate_pki "${GATEWAY_BIN}" "${PKI_DIR}" HOST_PORT=$(e2e_pick_port) STATE_DIR="${WORKDIR}/state" mkdir -p "${STATE_DIR}" +JWT_DIR="${STATE_DIR}/jwt" GATEWAY_ENDPOINT="https://host.openshell.internal:${HOST_PORT}" E2E_NAMESPACE="e2e-docker-$$-${HOST_PORT}" @@ -410,6 +411,7 @@ else fi echo "Starting openshell-gateway on port ${HOST_PORT} (namespace: ${E2E_NAMESPACE})..." +e2e_generate_gateway_jwt "${JWT_DIR}" # Driver-specific options moved from CLI flags into a TOML config table # (commit 560550d2). Synthesize a minimal config here and pass --config. 
@@ -428,6 +430,7 @@ GATEWAY_CONFIG="${STATE_DIR}/gateway.toml" { printf '[openshell]\nversion = 1\n\n' printf '[openshell.gateway]\nlog_level = "info"\n\n' + e2e_write_gateway_jwt_config "${JWT_DIR}" "openshell-e2e-docker-${HOST_PORT}" printf '[openshell.drivers.docker]\n' printf 'sandbox_namespace = %s\n' "$(toml_string "${E2E_NAMESPACE}")" printf 'network_name = %s\n' "$(toml_string "${DOCKER_NETWORK_NAME}")" diff --git a/e2e/with-podman-gateway.sh b/e2e/with-podman-gateway.sh index 875ebee4b..c9f9cb5fc 100755 --- a/e2e/with-podman-gateway.sh +++ b/e2e/with-podman-gateway.sh @@ -335,6 +335,7 @@ HOST_PORT=$(e2e_pick_port) HEALTH_PORT=$(e2e_pick_port) STATE_DIR="${WORKDIR}/state" mkdir -p "${STATE_DIR}" +JWT_DIR="${STATE_DIR}/jwt" E2E_NAMESPACE="e2e-podman-$$-${HOST_PORT}" PODMAN_NETWORK_NAME="${E2E_NAMESPACE}" @@ -346,6 +347,7 @@ export OPENSHELL_E2E_NETWORK_NAME="${PODMAN_NETWORK_NAME}" export OPENSHELL_E2E_SANDBOX_NAMESPACE="${E2E_NAMESPACE}" echo "Starting openshell-gateway on port ${HOST_PORT} (namespace: ${E2E_NAMESPACE})..." +e2e_generate_gateway_jwt "${JWT_DIR}" # Driver-specific options moved from CLI flags into a TOML config table # (commit 560550d2). Synthesize a minimal config here and pass --config. @@ -370,6 +372,7 @@ GATEWAY_CONFIG="${STATE_DIR}/gateway.toml" # (CLI > TOML in the merge precedence) so the test can use an ephemeral port. cp "${ROOT}/deploy/rpm/gateway.toml.default" "${GATEWAY_CONFIG}" { + e2e_write_gateway_jwt_config "${JWT_DIR}" "openshell-e2e-podman-${HOST_PORT}" printf '\n[openshell.drivers.podman]\n' # The Podman driver scopes isolation by network rather than namespace. 
printf 'network_name = %s\n' "$(toml_string "${PODMAN_NETWORK_NAME}")" From 93a6900754405e136f370109579ab5b5efe9d6bb Mon Sep 17 00:00:00 2001 From: Taylor Mutch Date: Mon, 18 May 2026 11:52:36 -0700 Subject: [PATCH 13/18] refactor(auth): remove sandbox token revocation Signed-off-by: Taylor Mutch --- architecture/gateway.md | 7 ++ crates/openshell-core/src/config.rs | 4 +- crates/openshell-sandbox/src/debug_rpc.rs | 4 +- crates/openshell-sandbox/src/grpc_client.rs | 20 ++-- crates/openshell-server/src/auth/guard.rs | 1 - crates/openshell-server/src/auth/mod.rs | 1 - crates/openshell-server/src/auth/principal.rs | 2 +- .../openshell-server/src/auth/revocation.rs | 100 ------------------ .../openshell-server/src/auth/sandbox_jwt.rs | 82 +++----------- crates/openshell-server/src/grpc/auth_rpc.rs | 86 ++++----------- crates/openshell-server/src/grpc/policy.rs | 2 - crates/openshell-server/src/grpc/sandbox.rs | 1 - crates/openshell-server/src/lib.rs | 15 +-- crates/openshell-server/src/multiplex.rs | 1 - .../openshell/templates/gateway-config.yaml | 2 +- deploy/helm/openshell/values.yaml | 4 +- docs/reference/gateway-config.mdx | 7 ++ e2e/support/gateway-common.sh | 19 +++- e2e/with-docker-gateway.sh | 2 + e2e/with-kube-gateway.sh | 73 ++++++++++--- proto/openshell.proto | 16 +-- 21 files changed, 158 insertions(+), 291 deletions(-) delete mode 100644 crates/openshell-server/src/auth/revocation.rs diff --git a/architecture/gateway.md b/architecture/gateway.md index 04e64a73f..2b032f0fd 100644 --- a/architecture/gateway.md +++ b/architecture/gateway.md @@ -46,6 +46,13 @@ Sandbox supervisor RPCs authenticate with either mTLS material or a sandbox secret depending on the runtime and deployment mode. User-facing mutations are authorized by role policy when OIDC or edge identity is enabled. +Sandbox secrets are gateway-signed JWTs bound to a single sandbox ID. 
Docker, +Podman, and VM drivers deliver the initial token through supervisor-only +runtime material; Kubernetes supervisors exchange a projected ServiceAccount +token through `IssueSandboxToken`. Supervisors renew gateway JWTs in memory +before expiry. Older tokens are not server-revoked; deployments bound replay +exposure with short `gateway_jwt.ttl_secs` lifetimes. + ## API Surface The gateway API is organized around platform objects and operational streams: diff --git a/crates/openshell-core/src/config.rs b/crates/openshell-core/src/config.rs index ea284f657..287bd72eb 100644 --- a/crates/openshell-core/src/config.rs +++ b/crates/openshell-core/src/config.rs @@ -342,7 +342,7 @@ pub struct GatewayJwtConfig { /// hostname-or-`openshell` placeholder if unset. #[serde(default = "default_gateway_id")] pub gateway_id: String, - /// Token lifetime in seconds. Defaults to 24 hours. + /// Token lifetime in seconds. Defaults to 1 hour. #[serde(default = "default_sandbox_token_ttl_secs")] pub ttl_secs: u64, } @@ -352,7 +352,7 @@ fn default_gateway_id() -> String { } const fn default_sandbox_token_ttl_secs() -> u64 { - 86_400 + 3_600 } fn default_roles_claim() -> String { diff --git a/crates/openshell-sandbox/src/debug_rpc.rs b/crates/openshell-sandbox/src/debug_rpc.rs index 013099198..b8cc43e2d 100644 --- a/crates/openshell-sandbox/src/debug_rpc.rs +++ b/crates/openshell-sandbox/src/debug_rpc.rs @@ -7,7 +7,7 @@ //! flow (issue #1354). A `docker exec` (or `kubectl exec`) into a //! running sandbox can issue raw sandbox-class gRPC calls without //! standing up a custom binary inside the sandbox image — useful for -//! confirming the cross-sandbox IDOR guard and refresh semantics. +//! confirming the cross-sandbox IDOR guard and renewal semantics. //! //! Subcommands: //! 
- `get-sandbox-config --sandbox-id ` — call `GetSandboxConfig` @@ -53,7 +53,7 @@ usage: openshell-sandbox debug-rpc [options] commands: get-sandbox-config --sandbox-id call GetSandboxConfig - refresh call RefreshSandboxToken + refresh renew the gateway JWT show-token print raw gateway JWT show-principal print decoded JWT claims diff --git a/crates/openshell-sandbox/src/grpc_client.rs b/crates/openshell-sandbox/src/grpc_client.rs index 86166cb62..c8270ad6a 100644 --- a/crates/openshell-sandbox/src/grpc_client.rs +++ b/crates/openshell-sandbox/src/grpc_client.rs @@ -43,15 +43,15 @@ use tracing::{debug, info, warn}; pub type AuthedChannel = InterceptedService; /// Shared, refreshable Bearer header. All [`AuthInterceptor`] clones read -/// the same slot, so the PR-5 refresh task can rotate the token in place -/// without rebuilding the channel. +/// the same slot, so the renewal task can replace the token in place without +/// rebuilding the channel. type TokenSlot = Arc>; /// Process-wide token slot. Initialized by the first [`connect_channel`] -/// call and shared with every subsequent client + the refresh loop. +/// call and shared with every subsequent client and the renewal loop. static TOKEN_SLOT: OnceLock = OnceLock::new(); -/// One-shot guard so the refresh loop spawns at most once per process. +/// One-shot guard so the renewal loop spawns at most once per process. static REFRESH_SPAWNED: OnceLock<()> = OnceLock::new(); fn install_token_slot(token: &str) -> Result { @@ -68,8 +68,8 @@ fn install_token_slot(token: &str) -> Result { } /// gRPC interceptor that injects `authorization: Bearer ` on every -/// outbound request. The token lives in a shared [`TokenSlot`] so the -/// PR-5 refresh task can replace it without rebuilding clients. +/// outbound request. The token lives in a shared [`TokenSlot`] so the renewal +/// task can replace it without rebuilding clients. 
#[derive(Clone)] pub struct AuthInterceptor { bearer: TokenSlot, @@ -162,9 +162,9 @@ async fn build_plain_channel(endpoint: &str) -> Result { /// First call per process resolves the sandbox JWT via the three-step /// lookup (env → file → K8s SA bootstrap exchange) and installs it into /// the process-wide [`TOKEN_SLOT`]. Subsequent calls reuse the cached -/// slot — the refresh loop keeps the value fresh, so re-running the +/// slot — the renewal loop keeps the value fresh, so re-running the /// bootstrap is both unnecessary and (on the K8s SA path) expensive -/// (one apiserver round-trip per call). The refresh loop itself is +/// (one apiserver round-trip per call). The renewal loop itself is /// spawned once per process via [`REFRESH_SPAWNED`]. async fn connect_channel(endpoint: &str) -> Result { let channel = build_plain_channel(endpoint).await?; @@ -250,7 +250,7 @@ pub async fn connect_channel_pub(endpoint: &str) -> Result { connect_channel(endpoint).await } -/// Background task that rotates the sandbox JWT at ~80% of its remaining +/// Background task that renews the sandbox JWT at ~80% of its remaining /// lifetime. The new token replaces the value in [`TOKEN_SLOT`], so all /// in-flight and future clients pick it up on their next request. 
The /// loop never panics: every failure is logged and re-attempted after a @@ -270,7 +270,7 @@ async fn refresh_token_loop(channel: AuthedChannel, slot: TokenSlot) { Ok(value) => { if let Ok(mut guard) = slot.write() { *guard = value; - info!("rotated gateway sandbox JWT in-place"); + info!("renewed gateway sandbox JWT in-place"); } } Err(e) => warn!(error = %e, "refreshed JWT contained invalid header bytes"), diff --git a/crates/openshell-server/src/auth/guard.rs b/crates/openshell-server/src/auth/guard.rs index f5cdb8131..aac768017 100644 --- a/crates/openshell-server/src/auth/guard.rs +++ b/crates/openshell-server/src/auth/guard.rs @@ -89,7 +89,6 @@ mod tests { sandbox_id: id.to_string(), source: SandboxIdentitySource::BootstrapJwt { issuer: "openshell-gateway:test".to_string(), - jti: "j-1".to_string(), }, trust_domain: Some("openshell".to_string()), }) diff --git a/crates/openshell-server/src/auth/mod.rs b/crates/openshell-server/src/auth/mod.rs index d4c6978af..880b02d38 100644 --- a/crates/openshell-server/src/auth/mod.rs +++ b/crates/openshell-server/src/auth/mod.rs @@ -16,7 +16,6 @@ pub mod identity; pub mod k8s_sa; pub mod oidc; pub mod principal; -pub mod revocation; pub mod sandbox_jwt; pub use http::router; diff --git a/crates/openshell-server/src/auth/principal.rs b/crates/openshell-server/src/auth/principal.rs index fac3f6099..7000ae342 100644 --- a/crates/openshell-server/src/auth/principal.rs +++ b/crates/openshell-server/src/auth/principal.rs @@ -70,7 +70,7 @@ pub struct SandboxPrincipal { pub enum SandboxIdentitySource { /// Gateway-minted JWT validated against the gateway's signing key. /// Produced by [`super::sandbox_jwt::SandboxJwtAuthenticator`]. - BootstrapJwt { issuer: String, jti: String }, + BootstrapJwt { issuer: String }, /// Per-sandbox client certificate. Reserved for the v2 channel-bound /// identity follow-up. 
BootstrapCert { fingerprint: String }, diff --git a/crates/openshell-server/src/auth/revocation.rs b/crates/openshell-server/src/auth/revocation.rs deleted file mode 100644 index 3cca82211..000000000 --- a/crates/openshell-server/src/auth/revocation.rs +++ /dev/null @@ -1,100 +0,0 @@ -// SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -// SPDX-License-Identifier: Apache-2.0 - -//! Sandbox-JWT revocation set. -//! -//! Tracks `jti` claims that have been explicitly revoked (sandbox deleted -//! or token refreshed). The validator consults this set on every sandbox -//! JWT validation and rejects matches as `Unauthenticated`. -//! -//! PR-2 implementation is in-memory only; a gateway restart clears the -//! set. The token TTL (24 h default) bounds the exposure window. PR 5 -//! (refresh RPC) introduces persistence to `Store` so revocations survive -//! restarts. - -use std::collections::HashMap; -use std::sync::RwLock; -use std::time::{SystemTime, UNIX_EPOCH}; - -/// In-memory `jti` deny-list with TTL-based pruning. -#[derive(Debug, Default)] -pub struct RevocationSet { - entries: RwLock>, -} - -impl RevocationSet { - pub fn new() -> Self { - Self::default() - } - - /// Mark `jti` as revoked until `expires_at_ms` (after which it would - /// naturally fail signature validation due to `exp`, so we can drop it). - pub fn revoke(&self, jti: &str, expires_at_ms: i64) { - let mut entries = self.entries.write().expect("revocation lock poisoned"); - entries.insert(jti.to_string(), expires_at_ms); - } - - /// Returns true if `jti` is currently revoked. - pub fn is_revoked(&self, jti: &str) -> bool { - let entries = self.entries.read().expect("revocation lock poisoned"); - entries.contains_key(jti) - } - - /// Drop entries whose `exp` is in the past. Called periodically (or on - /// demand from tests) to bound memory growth. 
- pub fn prune_expired(&self) -> usize { - let now = now_ms(); - let mut entries = self.entries.write().expect("revocation lock poisoned"); - let before = entries.len(); - entries.retain(|_, exp| *exp > now); - before - entries.len() - } - - /// Number of currently tracked revocations. Test/diagnostic only. - #[cfg(test)] - pub fn len(&self) -> usize { - self.entries.read().unwrap().len() - } -} - -fn now_ms() -> i64 { - i64::try_from( - SystemTime::now() - .duration_since(UNIX_EPOCH) - .map_or(0, |d| d.as_millis()), - ) - .unwrap_or(i64::MAX) -} - -#[cfg(test)] -mod tests { - use super::*; - - #[test] - fn revoked_jti_is_detected() { - let set = RevocationSet::new(); - let future = now_ms() + 60_000; - set.revoke("abc", future); - assert!(set.is_revoked("abc")); - assert!(!set.is_revoked("xyz")); - } - - #[test] - fn prune_drops_expired_entries() { - let set = RevocationSet::new(); - set.revoke("expired", now_ms() - 1_000); - set.revoke("future", now_ms() + 60_000); - let dropped = set.prune_expired(); - assert_eq!(dropped, 1); - assert!(!set.is_revoked("expired")); - assert!(set.is_revoked("future")); - } - - #[test] - fn re_revoking_overwrites_expiry() { - let set = RevocationSet::new(); - set.revoke("dup", now_ms() + 1_000); - set.revoke("dup", now_ms() + 99_000); - assert_eq!(set.len(), 1); - } -} diff --git a/crates/openshell-server/src/auth/sandbox_jwt.rs b/crates/openshell-server/src/auth/sandbox_jwt.rs index 6b1736dbe..69ce18d23 100644 --- a/crates/openshell-server/src/auth/sandbox_jwt.rs +++ b/crates/openshell-server/src/auth/sandbox_jwt.rs @@ -17,17 +17,14 @@ use super::authenticator::Authenticator; use super::principal::{Principal, SandboxIdentitySource, SandboxPrincipal}; -use super::revocation::RevocationSet; use async_trait::async_trait; use jsonwebtoken::{ Algorithm, DecodingKey, EncodingKey, Header, Validation, decode, decode_header, encode, }; use serde::{Deserialize, Serialize}; -use std::sync::Arc; use std::time::{Duration, SystemTime, 
UNIX_EPOCH}; use tonic::Status; use tracing::{debug, warn}; -use uuid::Uuid; /// SPIFFE-shaped subject prefix. Embedded in the `sub` claim of every /// minted token so a future migration to per-sandbox certs or SPIRE can @@ -48,7 +45,6 @@ pub struct SandboxJwtClaims { pub aud: String, pub iat: i64, pub exp: i64, - pub jti: String, /// Canonical sandbox UUID, denormalized from `sub` for cheap parsing /// without a SPIFFE library. pub sandbox_id: String, @@ -74,12 +70,10 @@ impl std::fmt::Debug for SandboxJwtIssuer { } } -/// Outcome of a successful mint — caller persists the `jti` so the same -/// token can be revoked on `DeleteSandbox` / refresh. +/// Outcome of a successful mint. #[derive(Debug, Clone)] pub struct MintedToken { pub token: String, - pub jti: String, pub expires_at_ms: i64, } @@ -102,21 +96,17 @@ impl SandboxJwtIssuer { }) } - /// Mint a fresh token for `sandbox_id`. The caller MUST track the - /// returned `jti` (in the `RevocationSet`'s mint-time index if we ever - /// need to revoke the most-recent token for a given sandbox). + /// Mint a fresh token for `sandbox_id`. 
#[allow(clippy::result_large_err)] // `tonic::Status` is the natural error here pub fn mint(&self, sandbox_id: &str) -> Result { let now = now_secs(); - let exp = now + i64::try_from(self.ttl.as_secs()).unwrap_or(86_400); - let jti = Uuid::new_v4().to_string(); + let exp = now + i64::try_from(self.ttl.as_secs()).unwrap_or(3_600); let claims = SandboxJwtClaims { sub: format!("{SPIFFE_SUBJECT_PREFIX}{sandbox_id}"), iss: self.issuer.clone(), aud: self.audience.clone(), iat: now, exp, - jti: jti.clone(), sandbox_id: sandbox_id.to_string(), }; let mut header = Header::new(Algorithm::EdDSA); @@ -127,7 +117,6 @@ impl SandboxJwtIssuer { })?; Ok(MintedToken { token, - jti, expires_at_ms: exp.saturating_mul(1000), }) } @@ -143,7 +132,6 @@ pub struct SandboxJwtAuthenticator { kid: String, issuer: String, audience: String, - revocation: Arc, } impl std::fmt::Debug for SandboxJwtAuthenticator { @@ -157,12 +145,7 @@ impl std::fmt::Debug for SandboxJwtAuthenticator { } impl SandboxJwtAuthenticator { - pub fn from_pem( - public_key_pem: &[u8], - kid: String, - gateway_id: &str, - revocation: Arc, - ) -> Result { + pub fn from_pem(public_key_pem: &[u8], kid: String, gateway_id: &str) -> Result { let decoding_key = DecodingKey::from_ed_pem(public_key_pem) .map_err(|e| format!("failed to parse Ed25519 public key PEM: {e}"))?; let identity = format!("openshell-gateway:{gateway_id}"); @@ -171,7 +154,6 @@ impl SandboxJwtAuthenticator { kid, issuer: identity.clone(), audience: identity, - revocation, }) } @@ -204,17 +186,9 @@ impl SandboxJwtAuthenticator { })?; let claims = data.claims; - if self.revocation.is_revoked(&claims.jti) { - debug!(jti = %claims.jti, "sandbox JWT rejected: jti revoked"); - return Err(Status::unauthenticated("revoked token")); - } - Ok(Some(Principal::Sandbox(SandboxPrincipal { sandbox_id: claims.sandbox_id, - source: SandboxIdentitySource::BootstrapJwt { - issuer: claims.iss, - jti: claims.jti, - }, + source: SandboxIdentitySource::BootstrapJwt { issuer: 
claims.iss }, trust_domain: Some("openshell".to_string()), }))) } @@ -261,13 +235,8 @@ mod tests { h } - fn pair() -> ( - SandboxJwtIssuer, - SandboxJwtAuthenticator, - Arc, - ) { + fn pair() -> (SandboxJwtIssuer, SandboxJwtAuthenticator) { let mat = generate_jwt_key().expect("jwt key"); - let revocation = Arc::new(RevocationSet::new()); let issuer = SandboxJwtIssuer::from_pem( mat.signing_key_pem.as_bytes(), mat.kid.clone(), @@ -279,15 +248,14 @@ mod tests { mat.public_key_pem.as_bytes(), mat.kid, "test-gateway", - revocation.clone(), ) .unwrap(); - (issuer, auth, revocation) + (issuer, auth) } #[tokio::test] async fn mint_and_validate_round_trip() { - let (issuer, auth, _) = pair(); + let (issuer, auth) = pair(); let minted = issuer.mint("sandbox-a").unwrap(); let principal = auth .authenticate(&header_map_with_bearer(&minted.token), "/anything") @@ -298,9 +266,8 @@ mod tests { Principal::Sandbox(p) => { assert_eq!(p.sandbox_id, "sandbox-a"); match p.source { - SandboxIdentitySource::BootstrapJwt { issuer: iss, jti } => { + SandboxIdentitySource::BootstrapJwt { issuer: iss } => { assert_eq!(iss, "openshell-gateway:test-gateway"); - assert_eq!(jti, minted.jti); } other => panic!("unexpected source: {other:?}"), } @@ -309,22 +276,10 @@ mod tests { } } - #[tokio::test] - async fn revoked_jti_is_rejected() { - let (issuer, auth, revocation) = pair(); - let minted = issuer.mint("sandbox-a").unwrap(); - revocation.revoke(&minted.jti, minted.expires_at_ms); - let err = auth - .authenticate(&header_map_with_bearer(&minted.token), "/anything") - .await - .expect_err("revoked must reject"); - assert_eq!(err.code(), tonic::Code::Unauthenticated); - } - #[tokio::test] async fn token_signed_by_other_key_is_rejected() { - let (_, auth_a, _) = pair(); - let (issuer_b, _, _) = pair(); // different keypair + let (_, auth_a) = pair(); + let (issuer_b, _) = pair(); // different keypair let minted = issuer_b.mint("sandbox-b").unwrap(); // The token has a different `kid` than auth_a 
expects, so the // authenticator yields None (lets the chain fall through). That is @@ -338,7 +293,7 @@ mod tests { #[tokio::test] async fn missing_bearer_yields_none() { - let (_, auth, _) = pair(); + let (_, auth) = pair(); let result = auth .authenticate(&http::HeaderMap::new(), "/anything") .await @@ -348,7 +303,7 @@ mod tests { #[tokio::test] async fn malformed_token_is_rejected() { - let (_, auth, _) = pair(); + let (_, auth) = pair(); let err = auth .authenticate(&header_map_with_bearer("not.a.jwt"), "/anything") .await @@ -369,20 +324,15 @@ mod tests { Duration::from_secs(3600), ) .unwrap(); - let auth = SandboxJwtAuthenticator::from_pem( - mat.public_key_pem.as_bytes(), - mat.kid.clone(), - "g", - Arc::new(RevocationSet::new()), - ) - .unwrap(); + let auth = + SandboxJwtAuthenticator::from_pem(mat.public_key_pem.as_bytes(), mat.kid.clone(), "g") + .unwrap(); let claims = SandboxJwtClaims { sub: format!("{SPIFFE_SUBJECT_PREFIX}sandbox-c"), iss: "openshell-gateway:g".to_string(), aud: "openshell-gateway:g".to_string(), iat: now_secs() - 7200, exp: now_secs() - 3600, - jti: Uuid::new_v4().to_string(), sandbox_id: "sandbox-c".to_string(), }; let mut header = Header::new(Algorithm::EdDSA); diff --git a/crates/openshell-server/src/grpc/auth_rpc.rs b/crates/openshell-server/src/grpc/auth_rpc.rs index 2519035be..b903d8f2b 100644 --- a/crates/openshell-server/src/grpc/auth_rpc.rs +++ b/crates/openshell-server/src/grpc/auth_rpc.rs @@ -5,12 +5,11 @@ //! //! Hosts the two sandbox-identity RPCs: //! - `IssueSandboxToken` — bootstrap exchange (K8s SA token → gateway JWT) -//! - `RefreshSandboxToken` — rotate a still-valid gateway JWT +//! - `RefreshSandboxToken` — renew a still-valid gateway JWT //! //! Both end in a fresh gateway-signed JWT minted by -//! [`crate::auth::sandbox_jwt::SandboxJwtIssuer`]. `RefreshSandboxToken` -//! additionally revokes the previous JWT's `jti` so the old token -//! becomes unusable as soon as the new one is handed back. +//! 
[`crate::auth::sandbox_jwt::SandboxJwtIssuer`]. Older tokens remain valid +//! until their own `exp` and are bounded by the configured short TTL. use crate::ServerState; use crate::auth::principal::{Principal, SandboxIdentitySource}; @@ -19,7 +18,6 @@ use openshell_core::proto::{ RefreshSandboxTokenResponse, }; use std::sync::Arc; -use std::time::SystemTime; use tonic::{Request, Response, Status}; use tracing::{debug, info, warn}; @@ -40,9 +38,9 @@ pub async fn handle_issue_sandbox_token( )); }; - // Only the bootstrap K8s ServiceAccount path can mint a fresh - // gateway JWT via this RPC. Sandboxes already holding a gateway JWT - // use `RefreshSandboxToken` instead, which also revokes the old jti. + // Only the bootstrap K8s ServiceAccount path can mint a fresh gateway JWT + // via this RPC. Sandboxes already holding a gateway JWT use + // `RefreshSandboxToken` instead. if !matches!( sandbox.source, SandboxIdentitySource::K8sServiceAccount { .. } @@ -67,7 +65,6 @@ pub async fn handle_issue_sandbox_token( let minted = issuer.mint(&sandbox.sandbox_id)?; info!( sandbox_id = %sandbox.sandbox_id, - jti = %minted.jti, "issued gateway sandbox JWT" ); Ok(Response::new(IssueSandboxTokenResponse { @@ -95,7 +92,7 @@ pub async fn handle_refresh_sandbox_token( // Only callers already holding a gateway-minted JWT may refresh; the // K8s bootstrap path must use `IssueSandboxToken`. - let SandboxIdentitySource::BootstrapJwt { jti: old_jti, .. } = &sandbox.source else { + let SandboxIdentitySource::BootstrapJwt { .. } = &sandbox.source else { debug!( sandbox_id = %sandbox.sandbox_id, "RefreshSandboxToken rejected: non-gateway-JWT principal source" @@ -113,23 +110,10 @@ pub async fn handle_refresh_sandbox_token( Status::unavailable("sandbox JWT minting is not configured on this gateway") })?; - // Mint the new token first; only revoke the old jti after we have a - // replacement so a failure here doesn't leave the sandbox stranded. 
let minted = issuer.mint(&sandbox.sandbox_id)?; - - // Best-effort revocation of the old token. The plan calls for the - // jti deny-list to live in memory in PR 2; PR 5 only needs to drop - // the old jti into it. We use the new token's expiry as a safe upper - // bound for the revocation entry — the old jti can't outlive its own - // `exp`, and on TTL pruning the entry drops out cleanly. - state - .sandbox_jwt_revocation - .revoke(old_jti, minted.expires_at_ms.max(now_ms())); info!( sandbox_id = %sandbox.sandbox_id, - revoked_jti = %old_jti, - new_jti = %minted.jti, - "refreshed gateway sandbox JWT" + "renewed gateway sandbox JWT" ); Ok(Response::new(RefreshSandboxTokenResponse { @@ -138,21 +122,11 @@ pub async fn handle_refresh_sandbox_token( })) } -fn now_ms() -> i64 { - i64::try_from( - SystemTime::now() - .duration_since(SystemTime::UNIX_EPOCH) - .map_or(0, |d| d.as_millis()), - ) - .unwrap_or(i64::MAX) -} - #[cfg(test)] mod tests { use super::*; use crate::ServerState; use crate::auth::principal::{Principal, SandboxPrincipal, UserPrincipal}; - use crate::auth::revocation::RevocationSet; use crate::auth::sandbox_jwt::SandboxJwtIssuer; use crate::compute::new_test_runtime; use crate::persistence::Store; @@ -164,16 +138,8 @@ mod tests { use openshell_core::Config; use std::time::Duration; - async fn state_with_issuer() -> (Arc, SandboxJwtIssuer, Arc) { + async fn state_with_issuer() -> Arc { let mat = generate_jwt_key().expect("jwt key"); - let revocation = Arc::new(RevocationSet::new()); - let issuer = SandboxJwtIssuer::from_pem( - mat.signing_key_pem.as_bytes(), - mat.kid, - "test-gateway", - Duration::from_secs(3600), - ) - .expect("issuer"); let store = Arc::new( Store::connect("sqlite::memory:?cache=shared") .await @@ -190,53 +156,46 @@ mod tests { Arc::new(SupervisorSessionRegistry::new()), None, ); - state.sandbox_jwt_revocation = revocation.clone(); // We don't need the authenticator for these tests; only the issuer. 
- // The handler tests only exercise the mint+revoke path; they - // don't need the issuer to be the same instance that produced - // `issuer` above. A fresh keypair is fine. - let issuer_clone = SandboxJwtIssuer::from_pem( - generate_jwt_key().unwrap().signing_key_pem.as_bytes(), - "kid".to_string(), + let issuer = SandboxJwtIssuer::from_pem( + mat.signing_key_pem.as_bytes(), + mat.kid, "test-gateway", Duration::from_secs(3600), ) .unwrap(); - state.sandbox_jwt_issuer = Some(Arc::new(issuer_clone)); - (Arc::new(state), issuer, revocation) + state.sandbox_jwt_issuer = Some(Arc::new(issuer)); + Arc::new(state) } - fn sandbox_principal(sandbox_id: &str, jti: &str) -> Principal { + fn sandbox_principal(sandbox_id: &str) -> Principal { use crate::auth::principal::SandboxIdentitySource; Principal::Sandbox(SandboxPrincipal { sandbox_id: sandbox_id.to_string(), source: SandboxIdentitySource::BootstrapJwt { issuer: "openshell-gateway:test-gateway".to_string(), - jti: jti.to_string(), }, trust_domain: Some("openshell".to_string()), }) } #[tokio::test] - async fn refresh_revokes_old_jti_and_returns_new_token() { - let (state, _issuer, revocation) = state_with_issuer().await; - let old_jti = "j-original"; + async fn refresh_returns_new_token() { + let state = state_with_issuer().await; let mut req = Request::new(RefreshSandboxTokenRequest {}); - req.extensions_mut() - .insert(sandbox_principal("sandbox-a", old_jti)); + req.extensions_mut().insert(sandbox_principal("sandbox-a")); let resp = handle_refresh_sandbox_token(&state, req) .await .expect("refresh OK") .into_inner(); assert!(!resp.token.is_empty()); - assert!(revocation.is_revoked(old_jti), "old jti must be revoked"); + assert!(resp.expires_at_ms > 0); } #[tokio::test] async fn refresh_rejects_user_principal() { use crate::auth::identity::{Identity, IdentityProvider}; - let (state, _, _) = state_with_issuer().await; + let state = state_with_issuer().await; let mut req = Request::new(RefreshSandboxTokenRequest {}); 
req.extensions_mut().insert(Principal::User(UserPrincipal { identity: Identity { @@ -259,7 +218,7 @@ mod tests { // RefreshSandboxToken — the refresh path assumes a still-valid // gateway-minted JWT exists. use crate::auth::principal::SandboxIdentitySource; - let (state, _, _) = state_with_issuer().await; + let state = state_with_issuer().await; let mut req = Request::new(RefreshSandboxTokenRequest {}); req.extensions_mut() .insert(Principal::Sandbox(SandboxPrincipal { @@ -297,8 +256,7 @@ mod tests { None, )); let mut req = Request::new(RefreshSandboxTokenRequest {}); - req.extensions_mut() - .insert(sandbox_principal("sandbox-a", "j-1")); + req.extensions_mut().insert(sandbox_principal("sandbox-a")); let err = handle_refresh_sandbox_token(&state, req) .await .expect_err("missing issuer must yield unavailable"); diff --git a/crates/openshell-server/src/grpc/policy.rs b/crates/openshell-server/src/grpc/policy.rs index 4e9b82700..0ee1ddbb2 100644 --- a/crates/openshell-server/src/grpc/policy.rs +++ b/crates/openshell-server/src/grpc/policy.rs @@ -2929,7 +2929,6 @@ mod tests { sandbox_id: sandbox_id.to_string(), source: SandboxIdentitySource::BootstrapJwt { issuer: "openshell-gateway:test".to_string(), - jti: "j-test".to_string(), }, trust_domain: Some("openshell".to_string()), })); @@ -2978,7 +2977,6 @@ mod tests { sandbox_id: "test-sandbox".to_string(), source: SandboxIdentitySource::BootstrapJwt { issuer: "openshell-gateway:test".to_string(), - jti: "j-1".to_string(), }, trust_domain: None, })); diff --git a/crates/openshell-server/src/grpc/sandbox.rs b/crates/openshell-server/src/grpc/sandbox.rs index d5c87063d..889144237 100644 --- a/crates/openshell-server/src/grpc/sandbox.rs +++ b/crates/openshell-server/src/grpc/sandbox.rs @@ -142,7 +142,6 @@ pub(super) async fn handle_create_sandbox( issuer.mint(&id).map(|minted| { tracing::info!( sandbox_id = %id, - jti = %minted.jti, "minted sandbox JWT" ); minted.token diff --git a/crates/openshell-server/src/lib.rs 
b/crates/openshell-server/src/lib.rs index 90faaeec8..7af52e0bd 100644 --- a/crates/openshell-server/src/lib.rs +++ b/crates/openshell-server/src/lib.rs @@ -121,10 +121,6 @@ pub struct ServerState { /// `IssueSandboxToken` bootstrap path. Only present when the gateway /// runs in-cluster. pub k8s_sa_authenticator: Option>, - - /// In-memory revocation set for gateway-minted sandbox JWTs. - /// Populated by `DeleteSandbox` and (in PR 5) `RefreshSandboxToken`. - pub sandbox_jwt_revocation: Arc, } fn is_benign_tls_handshake_failure(error: &std::io::Error) -> bool { @@ -172,7 +168,6 @@ impl ServerState { sandbox_jwt_issuer: None, sandbox_jwt_authenticator: None, k8s_sa_authenticator: None, - sandbox_jwt_revocation: Arc::new(auth::revocation::RevocationSet::new()), } } } @@ -281,13 +276,9 @@ pub async fn run_server( Duration::from_secs(jwt.ttl_secs), ) .map_err(Error::config)?; - let authenticator = auth::sandbox_jwt::SandboxJwtAuthenticator::from_pem( - &public_pem, - kid, - &jwt.gateway_id, - state.sandbox_jwt_revocation.clone(), - ) - .map_err(Error::config)?; + let authenticator = + auth::sandbox_jwt::SandboxJwtAuthenticator::from_pem(&public_pem, kid, &jwt.gateway_id) + .map_err(Error::config)?; info!( gateway_id = %jwt.gateway_id, ttl_secs = jwt.ttl_secs, diff --git a/crates/openshell-server/src/multiplex.rs b/crates/openshell-server/src/multiplex.rs index 567df2272..b4c1bff76 100644 --- a/crates/openshell-server/src/multiplex.rs +++ b/crates/openshell-server/src/multiplex.rs @@ -942,7 +942,6 @@ mod tests { sandbox_id: "sandbox-a".to_string(), source: SandboxIdentitySource::BootstrapJwt { issuer: "openshell-gateway:test".to_string(), - jti: "j-1".to_string(), }, trust_domain: Some("openshell".to_string()), }) diff --git a/deploy/helm/openshell/templates/gateway-config.yaml b/deploy/helm/openshell/templates/gateway-config.yaml index 302a5806f..b81e73daa 100644 --- a/deploy/helm/openshell/templates/gateway-config.yaml +++ 
b/deploy/helm/openshell/templates/gateway-config.yaml @@ -69,7 +69,7 @@ data: public_key_path = "/etc/openshell-jwt/public.pem" kid_path = "/etc/openshell-jwt/kid" gateway_id = {{ .Values.server.sandboxJwt.gatewayId | default (include "openshell.fullname" .) | quote }} - ttl_secs = {{ .Values.server.sandboxJwt.ttlSecs | default 86400 }} + ttl_secs = {{ .Values.server.sandboxJwt.ttlSecs | default 3600 }} {{- if .Values.server.oidc.issuer }} diff --git a/deploy/helm/openshell/values.yaml b/deploy/helm/openshell/values.yaml index c2f7362c3..fab8cb01a 100644 --- a/deploy/helm/openshell/values.yaml +++ b/deploy/helm/openshell/values.yaml @@ -181,8 +181,8 @@ server: # Stable gateway identity embedded in iss/aud of every minted token. # Defaults to the release name so HA replicas share identity. gatewayId: "" - # Token TTL in seconds. Defaults to 86400 (24h). - ttlSecs: 86400 + # Token TTL in seconds. Defaults to 3600 (1h). + ttlSecs: 3600 # Lifetime (seconds) of the projected ServiceAccount token kubelet # writes into each sandbox pod for the IssueSandboxToken bootstrap # exchange. 
Kubelet enforces a minimum of 600s; the driver clamps diff --git a/docs/reference/gateway-config.mdx b/docs/reference/gateway-config.mdx index 218982405..62a22d527 100644 --- a/docs/reference/gateway-config.mdx +++ b/docs/reference/gateway-config.mdx @@ -94,6 +94,13 @@ key_path = "/etc/openshell/certs/gateway-key.pem" client_ca_path = "/etc/openshell/certs/client-ca.pem" allow_unauthenticated = false +[openshell.gateway.gateway_jwt] +signing_key_path = "/etc/openshell/jwt/signing.pem" +public_key_path = "/etc/openshell/jwt/public.pem" +kid_path = "/etc/openshell/jwt/kid" +gateway_id = "openshell" +ttl_secs = 3600 + [openshell.gateway.oidc] issuer = "https://idp.example.com/realms/openshell" audience = "openshell-cli" diff --git a/e2e/support/gateway-common.sh b/e2e/support/gateway-common.sh index 25ffebb9a..09d96ecd8 100644 --- a/e2e/support/gateway-common.sh +++ b/e2e/support/gateway-common.sh @@ -50,6 +50,23 @@ e2e_generate_pki() { "${gateway_bin}" generate-certs --output-dir "${pki_dir}" "${san_args[@]}" } +e2e_preserve_mise_dirs() { + if ! 
command -v mise >/dev/null 2>&1; then + return 0 + fi + + if [ -z "${MISE_DATA_DIR:-}" ]; then + export MISE_DATA_DIR="${XDG_DATA_HOME:-${HOME}/.local/share}/mise" + fi + + if [ -z "${MISE_CACHE_DIR:-}" ]; then + case "$(uname -s)" in + Darwin) export MISE_CACHE_DIR="${HOME}/Library/Caches/mise" ;; + *) export MISE_CACHE_DIR="${XDG_CACHE_HOME:-${HOME}/.cache}/mise" ;; + esac + fi +} + e2e_register_plaintext_gateway() { local config_home=$1 local name=$2 @@ -121,7 +138,7 @@ e2e_write_gateway_jwt_config() { printf 'public_key_path = %s\n' "$(e2e_toml_string "${jwt_dir}/public.pem")" printf 'kid_path = %s\n' "$(e2e_toml_string "${jwt_dir}/kid")" printf 'gateway_id = %s\n' "$(e2e_toml_string "${gateway_id}")" - printf 'ttl_secs = 86400\n\n' + printf 'ttl_secs = 3600\n\n' } e2e_build_gateway_binaries() { diff --git a/e2e/with-docker-gateway.sh b/e2e/with-docker-gateway.sh index bca9d80c6..82faad9be 100755 --- a/e2e/with-docker-gateway.sh +++ b/e2e/with-docker-gateway.sh @@ -25,6 +25,8 @@ ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)" # shellcheck source=e2e/support/gateway-common.sh source "${ROOT}/e2e/support/gateway-common.sh" +e2e_preserve_mise_dirs + github_actions_host_docker_tmpdir() { if [ "${GITHUB_ACTIONS:-}" != "true" ] \ || [ ! -S /var/run/docker.sock ] \ diff --git a/e2e/with-kube-gateway.sh b/e2e/with-kube-gateway.sh index 34a081516..440323944 100755 --- a/e2e/with-kube-gateway.sh +++ b/e2e/with-kube-gateway.sh @@ -13,12 +13,16 @@ # Create a local k3d cluster via tasks/scripts/helm-k3s-local.sh, install # the chart, port-forward, and tear the cluster down on exit. # -# Helm e2e currently uses plaintext gateway traffic (ci/values-tls-disabled.yaml). +# Helm e2e currently uses plaintext gateway traffic (ci/values-skaffold.yaml). +# The certgen hook still runs so the gateway has sandbox JWT signing keys. # -# Image source: helm install pulls from ${OPENSHELL_REGISTRY}/{gateway,supervisor}:${IMAGE_TAG} -# (defaults: ghcr.io/nvidia/openshell, latest). 
CI sets IMAGE_TAG to the commit SHA; -# local devs should set it to a tag pulled from a registry the cluster can reach, -# or build and import images via a separate bootstrap step before running this script. +# Image source: +# - Ephemeral k3d mode builds local `openshell/{gateway,supervisor}:${IMAGE_TAG}` +# images by default, imports them into k3d, then installs the chart. This +# mirrors the Skaffold local-dev path. +# - Existing-context mode pulls from ${OPENSHELL_REGISTRY}/{gateway,supervisor}:${IMAGE_TAG} +# (defaults: ghcr.io/nvidia/openshell, latest). CI sets IMAGE_TAG to the +# commit SHA and preloads or publishes the images before running this script. set -euo pipefail @@ -31,6 +35,8 @@ ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)" # shellcheck source=e2e/support/gateway-common.sh source "${ROOT}/e2e/support/gateway-common.sh" +e2e_preserve_mise_dirs + WORKDIR_PARENT="${TMPDIR:-/tmp}" WORKDIR_PARENT="${WORKDIR_PARENT%/}" WORKDIR="$(mktemp -d "${WORKDIR_PARENT}/openshell-e2e-kube.XXXXXX")" @@ -147,8 +153,21 @@ else KUBE_CONTEXT="k3d-${CLUSTER_NAME}" fi -IMAGE_TAG_VALUE="${IMAGE_TAG:-latest}" -REGISTRY_VALUE="${OPENSHELL_REGISTRY:-ghcr.io/nvidia/openshell}" +if [ -z "${OPENSHELL_E2E_KUBE_BUILD_IMAGES+x}" ]; then + if [ "${CLUSTER_CREATED_BY_US}" = "1" ]; then + OPENSHELL_E2E_KUBE_BUILD_IMAGES=1 + else + OPENSHELL_E2E_KUBE_BUILD_IMAGES=0 + fi +fi + +if [ "${OPENSHELL_E2E_KUBE_BUILD_IMAGES}" = "1" ]; then + REGISTRY_VALUE="${OPENSHELL_REGISTRY:-openshell}" + IMAGE_TAG_VALUE="${IMAGE_TAG:-e2e-${CLUSTER_NAME:-local}}" +else + REGISTRY_VALUE="${OPENSHELL_REGISTRY:-ghcr.io/nvidia/openshell}" + IMAGE_TAG_VALUE="${IMAGE_TAG:-latest}" +fi REGISTRY_VALUE="${REGISTRY_VALUE%/}" # Resolve a host-gateway IP that sandbox pods can dial to reach test fixtures @@ -160,7 +179,9 @@ REGISTRY_VALUE="${REGISTRY_VALUE%/}" # Preference order: # 1. OPENSHELL_E2E_HOST_GATEWAY_IP — operator override (remote clusters where # auto-detection has no signal). -# 2. 
Gateway of the cluster's Docker network (k3d- for ephemeral +# 2. k3d's CoreDNS host.k3d.internal entry. On Docker Desktop this is a +# host-routable address; the Docker network gateway is not. +# 3. Gateway of the cluster's Docker network (k3d- for ephemeral # clusters, `kind` for kind clusters used in CI). Pods SNAT through their # node to this IP, which lands on the host's bridge interface and reaches # any 0.0.0.0-bound listener / published container port. @@ -171,18 +192,29 @@ HOST_GATEWAY_IP="${OPENSHELL_E2E_HOST_GATEWAY_IP:-}" # the docker bridge gateway on Linux). That mapping handles Docker Desktop # correctly; the docker network gateway alone does not. if [ -z "${HOST_GATEWAY_IP}" ] && command -v kubectl >/dev/null 2>&1; then - detected="$(kctl -n kube-system get configmap coredns -o jsonpath='{.data.NodeHosts}' 2>/dev/null \ - | awk '$2 == "host.k3d.internal" { print $1; exit }')" - if [ -n "${detected}" ]; then - HOST_GATEWAY_IP="${detected}" - echo "Detected host gateway IP ${HOST_GATEWAY_IP} from CoreDNS host.k3d.internal entry." - fi + for _ in {1..15}; do + detected="$(kctl -n kube-system get configmap coredns -o jsonpath='{.data.NodeHosts}' 2>/dev/null \ + | awk '$2 == "host.k3d.internal" { print $1; exit }' || true)" + if [ -n "${detected}" ]; then + HOST_GATEWAY_IP="${detected}" + echo "Detected host gateway IP ${HOST_GATEWAY_IP} from CoreDNS host.k3d.internal entry." + break + fi + sleep 1 + done fi # Fallback for non-k3d clusters (kind in CI, etc.): use the docker network # gateway IP. Works on Linux where the bridge is reachable from pods; on macOS # Docker Desktop without k3d, this will likely not route to the host. 
-if [ -z "${HOST_GATEWAY_IP}" ] && command -v docker >/dev/null 2>&1; then +use_docker_network_gateway=1 +if [ "$(uname -s)" = "Darwin" ] \ + && { [ "${CLUSTER_CREATED_BY_US}" = "1" ] || [[ "${KUBE_CONTEXT}" == k3d-* ]]; }; then + use_docker_network_gateway=0 +fi +if [ -z "${HOST_GATEWAY_IP}" ] \ + && [ "${use_docker_network_gateway}" = "1" ] \ + && command -v docker >/dev/null 2>&1; then candidate_networks=() if [ "${CLUSTER_CREATED_BY_US}" = "1" ]; then candidate_networks+=("k3d-${CLUSTER_NAME}") @@ -227,6 +259,15 @@ elif [[ "${KUBE_CONTEXT}" == k3d-* ]] && command -v k3d >/dev/null 2>&1; then import_cluster_name="${candidate}" fi fi +if [ "${OPENSHELL_E2E_KUBE_BUILD_IMAGES}" = "1" ]; then + require_cmd docker + echo "Building local Kubernetes e2e images (${REGISTRY_VALUE}/{gateway,supervisor}:${IMAGE_TAG_VALUE})..." + CONTAINER_ENGINE=docker IMAGE_REGISTRY="${REGISTRY_VALUE}" IMAGE_TAG="${IMAGE_TAG_VALUE}" \ + bash "${ROOT}/tasks/scripts/docker-build-image.sh" gateway + CONTAINER_ENGINE=docker IMAGE_REGISTRY="${REGISTRY_VALUE}" IMAGE_TAG="${IMAGE_TAG_VALUE}" \ + bash "${ROOT}/tasks/scripts/docker-build-image.sh" supervisor +fi + if [ -n "${import_cluster_name}" ]; then for image in \ "${REGISTRY_VALUE}/gateway:${IMAGE_TAG_VALUE}" \ @@ -255,7 +296,7 @@ fi echo "Installing Helm chart (release=${RELEASE_NAME}, namespace=${NAMESPACE}, tag=${IMAGE_TAG_VALUE})..." helmctl install "${RELEASE_NAME}" "${ROOT}/deploy/helm/openshell" \ --namespace "${NAMESPACE}" --create-namespace \ - --values "${ROOT}/deploy/helm/openshell/ci/values-tls-disabled.yaml" \ + --values "${ROOT}/deploy/helm/openshell/ci/values-skaffold.yaml" \ --set "fullnameOverride=openshell" \ --set "image.repository=${REGISTRY_VALUE}/gateway" \ --set "image.tag=${IMAGE_TAG_VALUE}" \ diff --git a/proto/openshell.proto b/proto/openshell.proto index 4b97ad4cf..10c69f414 100644 --- a/proto/openshell.proto +++ b/proto/openshell.proto @@ -232,12 +232,12 @@ service OpenShell { // and never call this RPC. 
rpc IssueSandboxToken(IssueSandboxTokenRequest) returns (IssueSandboxTokenResponse); - // Rotate the calling sandbox's gateway JWT. The previously-issued - // token is revoked (its jti added to the gateway's deny list) and a - // fresh token bound to the same sandbox UUID is returned. The - // supervisor calls this from a background task at ~80% of the token's - // lifetime; the new token is cached in memory only — the on-disk - // bootstrap file is intentionally not rewritten. + // Renew the calling sandbox's gateway JWT. Older tokens remain valid + // until their own expiry; deployments should keep token TTLs short to + // bound replay exposure. The supervisor calls this from a background + // task at ~80% of the token's lifetime; the new token is cached in + // memory only — the on-disk bootstrap file is intentionally not + // rewritten. rpc RefreshSandboxToken(RefreshSandboxTokenRequest) returns (RefreshSandboxTokenResponse); } @@ -262,8 +262,8 @@ message IssueSandboxTokenResponse { // gateway-minted JWT in its Authorization header). message RefreshSandboxTokenRequest {} -// RefreshSandboxToken response. The previous token is revoked server-side -// before this response is sent. +// RefreshSandboxToken response. The new token replaces the supervisor's +// in-memory bearer credential. message RefreshSandboxTokenResponse { // Fresh gateway-minted JWT bound to the same sandbox UUID. 
string token = 1; From 8bc98fc2126ae5ee4dbada1885db6f480bb7e788 Mon Sep 17 00:00:00 2001 From: Taylor Mutch Date: Mon, 18 May 2026 15:00:55 -0700 Subject: [PATCH 14/18] test(server): fix rebased test fixtures Signed-off-by: Taylor Mutch --- crates/openshell-server/src/compute/mod.rs | 6 +++--- crates/openshell-server/src/grpc/policy.rs | 5 +++++ 2 files changed, 8 insertions(+), 3 deletions(-) diff --git a/crates/openshell-server/src/compute/mod.rs b/crates/openshell-server/src/compute/mod.rs index c79ffcf2d..3a6bf5c42 100644 --- a/crates/openshell-server/src/compute/mod.rs +++ b/crates/openshell-server/src/compute/mod.rs @@ -2872,7 +2872,7 @@ mod tests { resource_version: 0, }); - let created = runtime.create_sandbox(sandbox).await.unwrap(); + let created = runtime.create_sandbox(sandbox, None).await.unwrap(); assert_eq!( created.metadata.as_ref().unwrap().resource_version, @@ -2908,11 +2908,11 @@ mod tests { // Spawn two concurrent creation attempts for the same sandbox let runtime1 = runtime.clone(); let sandbox1 = sandbox.clone(); - let handle1 = tokio::spawn(async move { runtime1.create_sandbox(sandbox1).await }); + let handle1 = tokio::spawn(async move { runtime1.create_sandbox(sandbox1, None).await }); let runtime2 = runtime.clone(); let sandbox2 = sandbox.clone(); - let handle2 = tokio::spawn(async move { runtime2.create_sandbox(sandbox2).await }); + let handle2 = tokio::spawn(async move { runtime2.create_sandbox(sandbox2, None).await }); // Wait for both to complete let result1 = handle1.await.unwrap(); diff --git a/crates/openshell-server/src/grpc/policy.rs b/crates/openshell-server/src/grpc/policy.rs index 0ee1ddbb2..03d0c2800 100644 --- a/crates/openshell-server/src/grpc/policy.rs +++ b/crates/openshell-server/src/grpc/policy.rs @@ -3015,6 +3015,7 @@ mod tests { name: name.to_string(), created_at_ms: 1_000_000, labels: HashMap::new(), + resource_version: 0, }), spec: Some(SandboxSpec { policy: None, @@ -3047,6 +3048,7 @@ mod tests { name: 
"self".to_string(), created_at_ms: 1_000_000, labels: HashMap::new(), + resource_version: 0, }), spec: Some(SandboxSpec { policy: None, @@ -3078,6 +3080,7 @@ mod tests { name: name.to_string(), created_at_ms: 1_000_000, labels: HashMap::new(), + resource_version: 0, }), spec: Some(SandboxSpec { policy: None, @@ -3112,6 +3115,7 @@ mod tests { name: name.to_string(), created_at_ms: 1_000_000, labels: HashMap::new(), + resource_version: 0, }), spec: Some(SandboxSpec { policy: None, @@ -3146,6 +3150,7 @@ mod tests { name: "x".to_string(), created_at_ms: 1_000_000, labels: HashMap::new(), + resource_version: 0, }), spec: Some(SandboxSpec { policy: None, From 39aa8dbd3261d9fb011bf2818dc015799124426a Mon Sep 17 00:00:00 2001 From: Taylor Mutch Date: Mon, 18 May 2026 16:51:25 -0700 Subject: [PATCH 15/18] docs(helm): update chart values reference Signed-off-by: Taylor Mutch --- deploy/helm/openshell/README.md | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/deploy/helm/openshell/README.md b/deploy/helm/openshell/README.md index c5484684b..b564dcb0f 100644 --- a/deploy/helm/openshell/README.md +++ b/deploy/helm/openshell/README.md @@ -145,6 +145,10 @@ cert-manager alternative. | server.oidc.userRole | string | `""` | Role name for standard user access. | | server.sandboxImage | string | `"ghcr.io/nvidia/openshell-community/sandboxes/base:latest"` | Default sandbox image used when requests do not specify one. | | server.sandboxImagePullPolicy | string | `""` | Kubernetes imagePullPolicy for sandbox pods. Empty = Kubernetes default (Always for :latest, IfNotPresent otherwise). Set to "Always" for dev clusters so new images are picked up without manual eviction. | +| server.sandboxJwt.gatewayId | string | `""` | | +| server.sandboxJwt.k8sSaTokenTtlSecs | int | `3600` | | +| server.sandboxJwt.signingSecretName | string | `""` | | +| server.sandboxJwt.ttlSecs | int | `3600` | | | server.sandboxNamespace | string | `""` | Namespace where sandbox pods are created. 
Defaults to the Helm release namespace (.Release.Namespace) when left empty. | | server.tls.certSecretName | string | `"openshell-server-tls"` | K8s secret (type kubernetes.io/tls) with tls.crt and tls.key for the server. | | server.tls.clientCaSecretName | string | `"openshell-server-client-ca"` | K8s secret with ca.crt for client certificate verification (mTLS). Set to "" to disable mTLS and run HTTPS-only (use OIDC for auth instead). | From 3e6d765347cf5d89da4272ec1ef752ccef395c86 Mon Sep 17 00:00:00 2001 From: Taylor Mutch Date: Tue, 19 May 2026 11:45:52 -0700 Subject: [PATCH 16/18] chore(markdown): ignore local architecture plans --- .markdownlint-cli2.jsonc | 1 + 1 file changed, 1 insertion(+) diff --git a/.markdownlint-cli2.jsonc b/.markdownlint-cli2.jsonc index 30cf48849..125df0f81 100644 --- a/.markdownlint-cli2.jsonc +++ b/.markdownlint-cli2.jsonc @@ -16,6 +16,7 @@ ".claude/**", ".opencode/**", ".github/**", + "architecture/plans/**", "**/node_modules/**", "target/**", ".pytest_cache/**", From f81e26f528bb498bb24dbc25bb14c3c67d6cf605 Mon Sep 17 00:00:00 2001 From: Taylor Mutch Date: Tue, 19 May 2026 11:46:15 -0700 Subject: [PATCH 17/18] test(server): fix auth endpoint RPC stub lint --- .../tests/auth_endpoint_integration.rs | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/crates/openshell-server/tests/auth_endpoint_integration.rs b/crates/openshell-server/tests/auth_endpoint_integration.rs index 21e136572..c299eedc5 100644 --- a/crates/openshell-server/tests/auth_endpoint_integration.rs +++ b/crates/openshell-server/tests/auth_endpoint_integration.rs @@ -782,17 +782,15 @@ impl openshell_core::proto::open_shell_server::OpenShell for TestOpenShell { async fn issue_sandbox_token( &self, _request: tonic::Request, - ) -> Result, tonic::Status> - { - Err(tonic::Status::unimplemented("not implemented in test")) + ) -> Result, Status> { + Err(Status::unimplemented("not implemented in test")) } async fn refresh_sandbox_token( &self, 
_request: tonic::Request, - ) -> Result, tonic::Status> - { - Err(tonic::Status::unimplemented("not implemented in test")) + ) -> Result, Status> { + Err(Status::unimplemented("not implemented in test")) } async fn connect_supervisor( From 19b87370775b73c20a42b2b541e9258cd0f63f54 Mon Sep 17 00:00:00 2001 From: Taylor Mutch Date: Tue, 19 May 2026 12:07:31 -0700 Subject: [PATCH 18/18] fix(server): restrict sandbox principal RPC access --- architecture/gateway.md | 8 ++ .../src/auth/authenticator.rs | 8 +- crates/openshell-server/src/auth/guard.rs | 41 ++++++ crates/openshell-server/src/auth/mod.rs | 1 + crates/openshell-server/src/auth/oidc.rs | 2 +- crates/openshell-server/src/auth/principal.rs | 22 ++-- .../openshell-server/src/auth/sandbox_jwt.rs | 2 +- .../src/auth/sandbox_methods.rs | 60 +++++++++ crates/openshell-server/src/cli.rs | 9 +- crates/openshell-server/src/grpc/policy.rs | 8 +- crates/openshell-server/src/lib.rs | 7 +- crates/openshell-server/src/multiplex.rs | 96 ++++++++++---- .../src/supervisor_session.rs | 122 ++++++++++++++++-- 13 files changed, 317 insertions(+), 69 deletions(-) create mode 100644 crates/openshell-server/src/auth/sandbox_methods.rs diff --git a/architecture/gateway.md b/architecture/gateway.md index 2b032f0fd..3d7ae4b6b 100644 --- a/architecture/gateway.md +++ b/architecture/gateway.md @@ -53,6 +53,14 @@ token through `IssueSandboxToken`. Supervisors renew gateway JWTs in memory before expiry. Older tokens are not server-revoked; deployments bound replay exposure with short `gateway_jwt.ttl_secs` lifetimes. +Sandbox JWTs are not user credentials. The gRPC router accepts +`Principal::Sandbox` only on the supervisor-to-gateway RPC allowlist +(`ConnectSupervisor`, `RelayStream`, token renewal, config sync, policy status, +log push, and policy-analysis callbacks). Handlers then compare the +authenticated sandbox ID with any sandbox ID or name resolved from the request. 
+Supervisor control and relay streams require a matching sandbox principal before +the gateway registers the session or bridges relay bytes. + ## API Surface The gateway API is organized around platform objects and operational streams: diff --git a/crates/openshell-server/src/auth/authenticator.rs b/crates/openshell-server/src/auth/authenticator.rs index ee11f8f35..066b55a13 100644 --- a/crates/openshell-server/src/auth/authenticator.rs +++ b/crates/openshell-server/src/auth/authenticator.rs @@ -19,7 +19,7 @@ //! - [`super::oidc::OidcAuthenticator`] — user OIDC Bearer tokens //! - [`PermissiveUserAuthenticator`] — final-fallback dev-mode catch-all //! that produces a synthetic user principal when no OIDC is -//! configured. Preserves the pre-PR-1 "no OIDC = open" posture for +//! configured. Preserves the "no OIDC = open" dev posture for //! singleplayer / helm-dev deployments. use super::identity::{Identity, IdentityProvider}; @@ -99,9 +99,9 @@ impl std::fmt::Debug for AuthenticatorChain { /// Final-fallback authenticator that produces a synthetic user principal /// for any request the earlier authenticators didn't claim. Used only /// when no user-side authentication is configured (no OIDC, no fronting -/// proxy contract) — the pre-PR-1 gateway accepted such requests with -/// no auth at all; this preserves that posture in a principal-aware -/// way so handlers always see *some* principal in extensions. +/// proxy contract). This preserves the dev-mode open posture in a +/// principal-aware way so handlers always see *some* principal in +/// extensions. /// /// Producing a User principal (rather than Anonymous) means dev-mode /// requests pass the per-handler IDOR guard via the User-bypass diff --git a/crates/openshell-server/src/auth/guard.rs b/crates/openshell-server/src/auth/guard.rs index aac768017..edcd6bc01 100644 --- a/crates/openshell-server/src/auth/guard.rs +++ b/crates/openshell-server/src/auth/guard.rs @@ -9,6 +9,7 @@ //! already evaluated. 
use super::principal::Principal; +use super::principal::SandboxPrincipal; use tonic::Status; use tracing::info; @@ -66,6 +67,32 @@ pub fn enforce_sandbox_scope( Ok(principal) } +/// Require a sandbox principal and reject users or anonymous callers. +/// +/// Supervisor-only control/data plane RPCs (`ConnectSupervisor`, +/// `RelayStream`) must be presented by the sandbox supervisor itself. +/// User principals intentionally pass [`ensure_sandbox_scope`] for normal +/// CLI/TUI APIs because RBAC is their gate, but they are not valid +/// supervisor identities. +#[allow(clippy::result_large_err)] +pub fn ensure_sandbox_principal_scope( + principal: &Principal, + claimed_sandbox_id: &str, +) -> Result { + match principal { + Principal::Sandbox(p) => { + ensure_sandbox_scope(principal, claimed_sandbox_id)?; + Ok(p.clone()) + } + Principal::User(_) => Err(Status::permission_denied( + "supervisor RPCs require a sandbox principal", + )), + Principal::Anonymous => Err(Status::unauthenticated( + "supervisor RPCs require an authenticated sandbox principal", + )), + } +} + #[cfg(test)] mod tests { use super::*; @@ -119,6 +146,20 @@ mod tests { assert_eq!(err.code(), tonic::Code::Unauthenticated); } + #[test] + fn sandbox_principal_scope_returns_matching_sandbox() { + let principal = sandbox("sbx-1"); + let scoped = ensure_sandbox_principal_scope(&principal, "sbx-1").expect("scope OK"); + assert_eq!(scoped.sandbox_id, "sbx-1"); + } + + #[test] + fn sandbox_principal_scope_rejects_users() { + let err = ensure_sandbox_principal_scope(&user("alice"), "sbx-1") + .expect_err("users are not supervisor identities"); + assert_eq!(err.code(), tonic::Code::PermissionDenied); + } + #[test] fn enforce_reads_from_request_extensions() { let mut req = tonic::Request::new(()); diff --git a/crates/openshell-server/src/auth/mod.rs b/crates/openshell-server/src/auth/mod.rs index 880b02d38..ca032a006 100644 --- a/crates/openshell-server/src/auth/mod.rs +++ 
b/crates/openshell-server/src/auth/mod.rs @@ -17,5 +17,6 @@ pub mod k8s_sa; pub mod oidc; pub mod principal; pub mod sandbox_jwt; +pub mod sandbox_methods; pub use http::router; diff --git a/crates/openshell-server/src/auth/oidc.rs b/crates/openshell-server/src/auth/oidc.rs index 6c1339e4f..5e5a23500 100644 --- a/crates/openshell-server/src/auth/oidc.rs +++ b/crates/openshell-server/src/auth/oidc.rs @@ -356,7 +356,7 @@ impl JwksCache { /// /// Returns `Ok(None)` when no Bearer header is present, so the chain can fall /// through to other authenticators (e.g. the gateway-minted sandbox JWT -/// authenticator added in PR 2). +/// authenticator). pub struct OidcAuthenticator { cache: Arc, } diff --git a/crates/openshell-server/src/auth/principal.rs b/crates/openshell-server/src/auth/principal.rs index 7000ae342..a95eb831b 100644 --- a/crates/openshell-server/src/auth/principal.rs +++ b/crates/openshell-server/src/auth/principal.rs @@ -26,7 +26,7 @@ pub enum Principal { User(UserPrincipal), /// Sandbox supervisor authenticated by an identity bound to a specific /// sandbox UUID. The wrapped `sandbox_id` MUST match any sandbox referenced - /// in the request body for sandbox-class methods (PR-4 guard). + /// in the request body for sandbox-class methods. Sandbox(#[allow(dead_code)] SandboxPrincipal), /// Truly unauthenticated caller (health probes, reflection). Sandbox-class /// and user-class methods reject this variant. @@ -43,17 +43,14 @@ pub struct UserPrincipal { /// Sandbox caller — bound to one specific sandbox UUID. /// -/// `sandbox_id` and `source` are consumed by the PR-4 handler guard; until -/// then they only exist in the type so the trait shape is stable across the -/// PR series. +/// `sandbox_id` and `source` are consumed by the router and handler guards. #[derive(Debug, Clone)] #[allow(dead_code)] pub struct SandboxPrincipal { - /// Canonical sandbox UUID. 
Empty string only for the PR-1 legacy marker; - /// PR 2 onwards always populates this from a verified credential. + /// Canonical sandbox UUID populated from a verified sandbox credential. pub sandbox_id: String, - /// How this principal was verified — used for audit logs and to gate the - /// PR-4 IDOR check against unverified sources. + /// How this principal was verified — used for audit logs and method-specific + /// authorization checks. pub source: SandboxIdentitySource, /// SPIFFE trust domain. Populated when the credential is SPIFFE-shaped; /// reserved for future per-sandbox cert / SPIRE authenticators. @@ -63,18 +60,17 @@ pub struct SandboxPrincipal { /// How a [`SandboxPrincipal`] was authenticated. /// /// Variant fields are populated by the producing authenticator and consumed -/// by audit logging + the PR-4 IDOR guard. Until PR 4 lands those readers -/// they look unused to the dead-code lint. +/// by audit logging and method-specific authorization checks. #[derive(Debug, Clone)] #[allow(dead_code)] pub enum SandboxIdentitySource { /// Gateway-minted JWT validated against the gateway's signing key. /// Produced by [`super::sandbox_jwt::SandboxJwtAuthenticator`]. BootstrapJwt { issuer: String }, - /// Per-sandbox client certificate. Reserved for the v2 channel-bound - /// identity follow-up. + /// Per-sandbox client certificate. Reserved for channel-bound sandbox + /// identity. BootstrapCert { fingerprint: String }, - /// SPIRE-issued SVID. Reserved for the SPIFFE/SPIRE follow-up. + /// SPIRE-issued SVID. Reserved for SPIFFE/SPIRE sandbox identity. SpiffeSvid { spiffe_id: String }, /// K8s `ServiceAccount` token used to bootstrap a gateway-minted JWT /// via `IssueSandboxToken`. Populated only on that one RPC path. 
diff --git a/crates/openshell-server/src/auth/sandbox_jwt.rs b/crates/openshell-server/src/auth/sandbox_jwt.rs index 69ce18d23..2ec890249 100644 --- a/crates/openshell-server/src/auth/sandbox_jwt.rs +++ b/crates/openshell-server/src/auth/sandbox_jwt.rs @@ -5,7 +5,7 @@ //! //! The gateway signs an Ed25519 JWT for each sandbox at create time and //! the sandbox supervisor presents it as `Authorization: Bearer ` on -//! every gRPC call (PR 3). This module implements both sides of the +//! supervisor-to-gateway gRPC calls. This module implements both sides of the //! gateway-controlled token: //! - [`SandboxJwtIssuer`] mints fresh tokens (called from //! `handle_create_sandbox` and the `IssueSandboxToken` RPC). diff --git a/crates/openshell-server/src/auth/sandbox_methods.rs b/crates/openshell-server/src/auth/sandbox_methods.rs new file mode 100644 index 000000000..f93705704 --- /dev/null +++ b/crates/openshell-server/src/auth/sandbox_methods.rs @@ -0,0 +1,60 @@ +// SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +//! Method-level allowlist for sandbox principals. +//! +//! Gateway-minted sandbox JWTs identify a single sandbox supervisor. They +//! must not authorize user-facing or admin APIs. The router rejects sandbox +//! principals for every method outside this supervisor-to-gateway allowlist; +//! handlers still perform same-sandbox checks on request bodies. + +/// Methods a `Principal::Sandbox` may invoke. 
+const ALLOWED_SANDBOX_METHODS: &[&str] = &[ + "/openshell.v1.OpenShell/IssueSandboxToken", + "/openshell.v1.OpenShell/RefreshSandboxToken", + "/openshell.v1.OpenShell/ConnectSupervisor", + "/openshell.v1.OpenShell/RelayStream", + "/openshell.v1.OpenShell/GetSandboxConfig", + "/openshell.v1.OpenShell/GetSandboxProviderEnvironment", + "/openshell.v1.OpenShell/UpdateConfig", + "/openshell.v1.OpenShell/ReportPolicyStatus", + "/openshell.v1.OpenShell/PushSandboxLogs", + "/openshell.v1.OpenShell/SubmitPolicyAnalysis", + "/openshell.v1.OpenShell/GetDraftPolicy", +]; + +pub fn is_sandbox_callable(path: &str) -> bool { + ALLOWED_SANDBOX_METHODS.contains(&path) +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn supervisor_callbacks_are_allowed() { + assert!(is_sandbox_callable( + "/openshell.v1.OpenShell/ConnectSupervisor" + )); + assert!(is_sandbox_callable("/openshell.v1.OpenShell/RelayStream")); + assert!(is_sandbox_callable( + "/openshell.v1.OpenShell/GetSandboxConfig" + )); + } + + #[test] + fn user_and_admin_methods_are_not_allowed() { + assert!(!is_sandbox_callable( + "/openshell.v1.OpenShell/ListSandboxes" + )); + assert!(!is_sandbox_callable( + "/openshell.v1.OpenShell/DeleteSandbox" + )); + assert!(!is_sandbox_callable( + "/openshell.v1.OpenShell/CreateProvider" + )); + assert!(!is_sandbox_callable( + "/openshell.v1.OpenShell/ApproveDraftChunk" + )); + } +} diff --git a/crates/openshell-server/src/cli.rs b/crates/openshell-server/src/cli.rs index 9e7f4d8e7..a7bbf7652 100644 --- a/crates/openshell-server/src/cli.rs +++ b/crates/openshell-server/src/cli.rs @@ -336,11 +336,10 @@ async fn run_from_args(mut args: RunArgs, matches: ArgMatches) -> Result<()> { }); } - // PR-2 wires gateway_jwt via the config file only — there's no CLI - // flag yet because the standard deployments (helm chart + RPM init - // script) drop the keypair to a known path and pass that path through - // the TOML. 
A CLI shortcut can be added if a singleplayer operator - // needs to override. + // `gateway_jwt` is configured through TOML. Standard deployments + // (helm chart + RPM init script) drop the keypair to a known path and + // pass that path through the file. A CLI shortcut can be added if a + // singleplayer operator needs to override it. if let Some(jwt) = file .as_ref() .and_then(|f| f.openshell.gateway.gateway_jwt.clone()) diff --git a/crates/openshell-server/src/grpc/policy.rs b/crates/openshell-server/src/grpc/policy.rs index 03d0c2800..0dab40704 100644 --- a/crates/openshell-server/src/grpc/policy.rs +++ b/crates/openshell-server/src/grpc/policy.rs @@ -2901,9 +2901,9 @@ mod tests { use std::sync::Arc; use tonic::Code; - /// Wrap a request with a user `Principal` so handlers' scope guards - /// (introduced in PR 4) treat the test caller as a CLI user — equivalent - /// to the pre-PR-4 behavior where all tests effectively ran as user. + /// Wrap a request with a user `Principal` so handler scope guards treat + /// the test caller as a CLI user. Most handler tests exercise + /// user-facing behavior and should not trip sandbox equality checks. fn with_user(mut request: Request) -> Request { request .extensions_mut() @@ -3000,7 +3000,7 @@ mod tests { assert!(!is_sandbox_caller(&req)); } - // ---- PR-4 IDOR guard (issue #1354) ---- + // ---- Sandbox IDOR guard (issue #1354) ---- #[tokio::test] async fn cross_sandbox_get_sandbox_config_denied() { diff --git a/crates/openshell-server/src/lib.rs b/crates/openshell-server/src/lib.rs index 7af52e0bd..d25cf7f43 100644 --- a/crates/openshell-server/src/lib.rs +++ b/crates/openshell-server/src/lib.rs @@ -237,10 +237,9 @@ pub async fn run_server( ); // Load the gateway-minted sandbox JWT signing key when configured. - // Optional in PR 2 so single-driver dev deployments without certgen - // continue to start. 
The helm-deployed gateway and the RPM init script - // populate `gateway_jwt` once `certgen` has produced the on-disk - // material. + // Optional so single-driver dev deployments without certgen continue + // to start. The helm-deployed gateway and the RPM init script populate + // `gateway_jwt` once `certgen` has produced the on-disk material. if let Some(ref jwt) = config.gateway_jwt { let signing_pem = std::fs::read(&jwt.signing_key_path).map_err(|e| { Error::config(format!( diff --git a/crates/openshell-server/src/multiplex.rs b/crates/openshell-server/src/multiplex.rs index b4c1bff76..6244aad4d 100644 --- a/crates/openshell-server/src/multiplex.rs +++ b/crates/openshell-server/src/multiplex.rs @@ -272,7 +272,7 @@ where /// 4. `PermissiveUserAuthenticator` — installed only when no OIDC is /// configured (singleplayer / helm-dev). Catches anything the /// sandbox authenticators didn't claim and produces a synthetic -/// user principal, preserving the pre-PR-1 "no OIDC = open" posture. +/// user principal, preserving the existing "no OIDC = open" dev posture. /// /// When neither OIDC nor gateway-minted JWTs are configured (a barebones /// dev gateway), the chain is left as `None` so the router short-circuits @@ -310,9 +310,9 @@ fn build_authenticator_chain(state: &ServerState) -> Option /// - When no chain is configured (OIDC not configured), forward without /// authentication — preserves today's pass-through behavior. /// - Otherwise, run the chain. The first match produces a `Principal`. -/// `Principal::User` is gated by the RBAC `AuthzPolicy`. The legacy -/// sandbox marker also inserts the metadata marker for backwards-compat -/// with handlers that still consume it (PR-1 only; removed in PR 3). +/// `Principal::User` is gated by the RBAC `AuthzPolicy`. +/// `Principal::Sandbox` is gated by a supervisor-method allowlist, then +/// handlers enforce same-sandbox scope on request bodies. 
#[derive(Clone)] pub struct AuthGrpcRouter { inner: S, @@ -409,15 +409,26 @@ where Err(status) => return Ok(status_response(status)), }; - // Authorize user principals via RBAC. Sandbox principals get - // a per-handler `sandbox_id` equality check in PR 4; right now - // they bypass RBAC because the public sandbox-class methods - // they call were path-bypassed before this refactor too. - if let Principal::User(ref user) = principal - && let Some(ref policy) = authz_policy - && let Err(status) = policy.check(&user.identity, &path) - { - return Ok(status_response(status)); + match principal { + Principal::User(ref user) => { + if let Some(ref policy) = authz_policy + && let Err(status) = policy.check(&user.identity, &path) + { + return Ok(status_response(status)); + } + } + Principal::Sandbox(_) => { + if !crate::auth::sandbox_methods::is_sandbox_callable(&path) { + return Ok(status_response(tonic::Status::permission_denied( + "sandbox principals may not call this method", + ))); + } + } + Principal::Anonymous => { + return Ok(status_response(tonic::Status::unauthenticated( + "anonymous callers may not call authenticated methods", + ))); + } } req.extensions_mut().insert(principal); @@ -925,6 +936,12 @@ mod tests { .unwrap() } + fn grpc_status(res: &Response) -> Option { + res.headers() + .get("grpc-status") + .map(|v| v.to_str().unwrap().to_string()) + } + fn user_principal(subject: &str) -> Principal { Principal::User(UserPrincipal { identity: Identity { @@ -983,6 +1000,45 @@ mod tests { } } + #[tokio::test] + async fn sandbox_principal_can_call_allowlisted_method() { + let mock = Arc::new(MockAuthenticator::returning(Ok(Some(sandbox_principal())))); + let chain = AuthenticatorChain::new(vec![mock]); + let (recorder, seen) = PrincipalRecorder::new(); + let mut router = AuthGrpcRouter::new(recorder, Some(chain), None); + + let res = router + .call(empty_request("/openshell.v1.OpenShell/GetSandboxConfig")) + .await + .unwrap(); + + assert_eq!(res.status(), 200); + 
assert!(matches!( + seen.lock().unwrap().as_ref(), + Some(Principal::Sandbox(_)) + )); + } + + #[tokio::test] + async fn sandbox_principal_is_denied_on_user_and_admin_methods() { + for path in [ + "/openshell.v1.OpenShell/ListSandboxes", + "/openshell.v1.OpenShell/DeleteSandbox", + "/openshell.v1.OpenShell/CreateProvider", + "/openshell.v1.OpenShell/ApproveDraftChunk", + ] { + let mock = Arc::new(MockAuthenticator::returning(Ok(Some(sandbox_principal())))); + let chain = AuthenticatorChain::new(vec![mock]); + let (recorder, seen) = PrincipalRecorder::new(); + let mut router = AuthGrpcRouter::new(recorder, Some(chain), None); + + let res = router.call(empty_request(path)).await.unwrap(); + + assert!(seen.lock().unwrap().is_none(), "{path} reached handler"); + assert_eq!(grpc_status(&res).as_deref(), Some("7"), "{path}"); + } + } + #[tokio::test] async fn missing_principal_returns_unauthenticated() { let mock = Arc::new(MockAuthenticator::returning(Ok(None))); @@ -995,11 +1051,7 @@ mod tests { .unwrap(); assert!(seen.lock().unwrap().is_none()); // tonic sets grpc-status=16 (UNAUTHENTICATED) in trailers. 
- let grpc_status = res - .headers() - .get("grpc-status") - .map(|v| v.to_str().unwrap().to_string()); - assert_eq!(grpc_status.as_deref(), Some("16")); + assert_eq!(grpc_status(&res).as_deref(), Some("16")); } #[tokio::test] @@ -1015,13 +1067,7 @@ mod tests { .await .unwrap(); assert!(seen.lock().unwrap().is_none()); - assert_eq!( - res.headers() - .get("grpc-status") - .map(|v| v.to_str().unwrap().to_string()) - .as_deref(), - Some("16") - ); + assert_eq!(grpc_status(&res).as_deref(), Some("16")); } #[tokio::test] diff --git a/crates/openshell-server/src/supervisor_session.rs b/crates/openshell-server/src/supervisor_session.rs index 91f40c289..d2dc8630d 100644 --- a/crates/openshell-server/src/supervisor_session.rs +++ b/crates/openshell-server/src/supervisor_session.rs @@ -18,6 +18,7 @@ use openshell_core::proto::{ }; use crate::ServerState; +use crate::auth::principal::Principal; const HEARTBEAT_INTERVAL_SECS: u32 = 15; const RELAY_PENDING_TIMEOUT: Duration = Duration::from_secs(10); @@ -337,17 +338,40 @@ impl SupervisorSessionRegistry { /// Returns the `DuplexStream` half that the supervisor side should read/write. // `tonic::Status` is large but is the API surface of gRPC handlers. 
#[allow(clippy::result_large_err)] - pub fn claim_relay(&self, channel_id: &str) -> Result { + pub fn claim_relay( + &self, + channel_id: &str, + principal: Option<&Principal>, + ) -> Result { let pending = { let mut map = self.pending_relays.lock().unwrap(); + let pending = map + .get(channel_id) + .ok_or_else(|| Status::not_found("unknown or expired relay channel"))?; + + if let Some(principal) = principal + && let Err(status) = crate::auth::guard::ensure_sandbox_principal_scope( + principal, + &pending.sandbox_id, + ) + { + info!( + channel_id = %channel_id, + sandbox_id = %pending.sandbox_id, + "relay stream: rejecting cross-sandbox claim" + ); + return Err(status); + } + + if pending.created_at.elapsed() > RELAY_PENDING_TIMEOUT { + map.remove(channel_id); + return Err(Status::deadline_exceeded("relay channel timed out")); + } + map.remove(channel_id) - .ok_or_else(|| Status::not_found("unknown or expired relay channel"))? + .expect("pending relay existed before removal") }; - if pending.created_at.elapsed() > RELAY_PENDING_TIMEOUT { - return Err(Status::deadline_exceeded("relay channel timed out")); - } - // Create a duplex stream pair: one end for the gateway bridge, one for // the supervisor HTTP CONNECT handler. let (gateway_stream, supervisor_stream) = tokio::io::duplex(64 * 1024); @@ -449,6 +473,7 @@ pub async fn handle_relay_stream( >, Status, > { + let principal = request.extensions().get::().cloned(); let mut inbound = request.into_inner(); // First frame must identify the channel. @@ -470,7 +495,7 @@ pub async fn handle_relay_stream( }; // Claim the pending relay. Consumes the entry — it cannot be reused. 
- let supervisor_side = registry.claim_relay(&channel_id)?; + let supervisor_side = registry.claim_relay(&channel_id, principal.as_ref())?; info!(channel_id = %channel_id, "relay stream: claimed pending relay, bridging"); let (mut read_half, mut write_half) = tokio::io::split(supervisor_side); @@ -554,6 +579,7 @@ pub async fn handle_connect_supervisor( >, Status, > { + let principal = request.extensions().get::().cloned(); let mut inbound = request.into_inner(); // Step 1: Wait for SupervisorHello. @@ -569,6 +595,9 @@ pub async fn handle_connect_supervisor( if sandbox_id.is_empty() { return Err(Status::invalid_argument("sandbox_id is required")); } + if let Some(principal) = principal.as_ref() { + crate::auth::guard::ensure_sandbox_principal_scope(principal, &sandbox_id)?; + } require_persisted_sandbox(&state.store, &sandbox_id).await?; let session_id = Uuid::new_v4().to_string(); @@ -783,6 +812,8 @@ fn handle_supervisor_message( #[cfg(test)] mod tests { use super::*; + use crate::auth::identity::{Identity, IdentityProvider}; + use crate::auth::principal::{SandboxIdentitySource, SandboxPrincipal, UserPrincipal}; use crate::persistence::Store; use tokio::io::{AsyncReadExt, AsyncWriteExt}; @@ -823,6 +854,28 @@ mod tests { } } + fn sandbox_principal(sandbox_id: &str) -> Principal { + Principal::Sandbox(SandboxPrincipal { + sandbox_id: sandbox_id.to_string(), + source: SandboxIdentitySource::BootstrapJwt { + issuer: "openshell-gateway:test".to_string(), + }, + trust_domain: Some("openshell".to_string()), + }) + } + + fn user_principal(subject: &str) -> Principal { + Principal::User(UserPrincipal { + identity: Identity { + subject: subject.to_string(), + display_name: None, + roles: vec![], + scopes: vec![], + provider: IdentityProvider::Oidc, + }, + }) + } + // ---- registry: register / remove ---- #[test] @@ -1235,7 +1288,10 @@ mod tests { #[test] fn claim_relay_unknown_channel() { let registry = SupervisorSessionRegistry::new(); - let err = 
registry.claim_relay("nonexistent").expect_err("should err"); + let principal = sandbox_principal("sbx-test"); + let err = registry + .claim_relay("nonexistent", Some(&principal)) + .expect_err("should err"); assert_eq!(err.code(), tonic::Code::NotFound); } @@ -1248,11 +1304,51 @@ mod tests { pending_relay("sbx-test", relay_tx, Instant::now()), ); - let result = registry.claim_relay("ch-1"); + let principal = sandbox_principal("sbx-test"); + let result = registry.claim_relay("ch-1", Some(&principal)); assert!(result.is_ok()); assert!(!registry.pending_relays.lock().unwrap().contains_key("ch-1")); } + #[test] + fn claim_relay_rejects_cross_sandbox_principal_without_consuming_channel() { + let registry = SupervisorSessionRegistry::new(); + let (relay_tx, _relay_rx) = oneshot::channel(); + registry.pending_relays.lock().unwrap().insert( + "ch-cross".to_string(), + pending_relay("sbx-owner", relay_tx, Instant::now()), + ); + + let attacker = sandbox_principal("sbx-attacker"); + let err = registry + .claim_relay("ch-cross", Some(&attacker)) + .expect_err("cross-sandbox relay claim must fail"); + assert_eq!(err.code(), tonic::Code::PermissionDenied); + assert!( + registry + .pending_relays + .lock() + .unwrap() + .contains_key("ch-cross"), + "failed cross-sandbox claim must not consume the channel" + ); + } + + #[test] + fn claim_relay_rejects_user_principal() { + let registry = SupervisorSessionRegistry::new(); + let (relay_tx, _relay_rx) = oneshot::channel(); + registry.pending_relays.lock().unwrap().insert( + "ch-user".to_string(), + pending_relay("sbx-owner", relay_tx, Instant::now()), + ); + + let err = registry + .claim_relay("ch-user", Some(&user_principal("alice"))) + .expect_err("users are not supervisor identities"); + assert_eq!(err.code(), tonic::Code::PermissionDenied); + } + #[tokio::test] async fn relay_open_failure_completes_pending_waiter() { let registry = SupervisorSessionRegistry::new(); @@ -1293,7 +1389,7 @@ mod tests { ); let err = registry - 
.claim_relay("ch-old") + .claim_relay("ch-old", Some(&sandbox_principal("sbx-test"))) .expect_err("expired entry must fail"); assert_eq!(err.code(), tonic::Code::DeadlineExceeded); // Entry must have been consumed regardless. @@ -1317,7 +1413,7 @@ mod tests { ); let err = registry - .claim_relay("ch-1") + .claim_relay("ch-1", Some(&sandbox_principal("sbx-test"))) .expect_err("should err when receiver is gone"); assert_eq!(err.code(), tonic::Code::Internal); } @@ -1331,7 +1427,9 @@ mod tests { pending_relay("sbx-test", relay_tx, Instant::now()), ); - let mut supervisor_side = registry.claim_relay("ch-io").expect("claim should succeed"); + let mut supervisor_side = registry + .claim_relay("ch-io", Some(&sandbox_principal("sbx-test"))) + .expect("claim should succeed"); let mut gateway_side = relay_rx .await .expect("gateway side should receive result")