diff --git a/crates/openshell-bootstrap/src/errors.rs b/crates/openshell-bootstrap/src/errors.rs index b487c94a..9e385c68 100644 --- a/crates/openshell-bootstrap/src/errors.rs +++ b/crates/openshell-bootstrap/src/errors.rs @@ -169,6 +169,21 @@ const FAILURE_PATTERNS: &[FailurePattern] = &[ match_mode: MatchMode::Any, diagnose: diagnose_docker_not_running, }, + // CDI specs missing — Docker daemon has CDI configured but no spec files exist + // or the requested device ID (nvidia.com/gpu=all) is not in any spec. + // Matches errors from Docker 25+ and containerd CDI injection paths. + FailurePattern { + matchers: &[ + "CDI device not found", + "unknown CDI device", + "failed to inject CDI devices", + "no CDI devices found", + "CDI device injection failed", + "unresolvable CDI devices", + ], + match_mode: MatchMode::Any, + diagnose: diagnose_cdi_specs_missing, + }, ]; fn diagnose_corrupted_state(gateway_name: &str) -> GatewayFailureDiagnosis { @@ -396,6 +411,29 @@ fn diagnose_certificate_issue(gateway_name: &str) -> GatewayFailureDiagnosis { } } +fn diagnose_cdi_specs_missing(_gateway_name: &str) -> GatewayFailureDiagnosis { + GatewayFailureDiagnosis { + summary: "CDI specs not found on host".to_string(), + explanation: "GPU passthrough via CDI was selected (your Docker daemon has CDI spec \ + directories configured) but no CDI device specs were found on the host. \ + Specs must be pre-generated before OpenShell can inject the GPU into the \ + cluster container." + .to_string(), + recovery_steps: vec![ + RecoveryStep::with_command( + "Generate CDI specs on the host (nvidia-ctk creates /etc/cdi/ if it does not exist)", + "sudo nvidia-ctk cdi generate --output=/etc/cdi/nvidia.yaml", + ), + RecoveryStep::with_command( + "Verify the specs were generated and include nvidia.com/gpu entries", + "nvidia-ctk cdi list", + ), + RecoveryStep::new("Then retry: openshell gateway start --gpu"), + ], + retryable: false, + } +} + fn diagnose_docker_not_running(_gateway_name: &str) -> GatewayFailureDiagnosis { GatewayFailureDiagnosis { summary: "Docker is not running".to_string(), @@ -925,4 +963,76 @@ mod tests { "commands should include gateway name, got: {all_commands}" ); } + + #[test] + fn test_diagnose_cdi_device_not_found() { + let diagnosis = diagnose_failure( + "test", + "could not run container: CDI device not found: nvidia.com/gpu=all", + None, + ); + assert!(diagnosis.is_some()); + let d = diagnosis.unwrap(); + assert!( + d.summary.contains("CDI"), + "expected CDI diagnosis, got: {}", + d.summary + ); + assert!(!d.retryable); + } + + #[test] + fn test_diagnose_cdi_injection_failed_unresolvable() { + // Exact error observed from Docker 500 response + let diagnosis = diagnose_failure( + "test", + "Docker responded with status code 500: CDI device injection failed: unresolvable CDI devices nvidia.com/gpu=all", + None, + ); + assert!(diagnosis.is_some()); + let d = diagnosis.unwrap(); + assert!( + d.summary.contains("CDI"), + "expected CDI diagnosis, got: {}", + d.summary + ); + assert!(!d.retryable); + } + + #[test] + fn test_diagnose_unknown_cdi_device() { + // containerd error path + let diagnosis = diagnose_failure( + "test", + "unknown CDI device requested: nvidia.com/gpu=all", + None, + ); + assert!(diagnosis.is_some()); + let d = diagnosis.unwrap(); + assert!( + d.summary.contains("CDI"), + "expected CDI diagnosis, got: {}", + d.summary + ); + } + + #[test] + fn test_diagnose_cdi_recovery_mentions_nvidia_ctk() { + let d = diagnose_failure("test", "CDI device not found", None) + .expect("should match CDI pattern"); + let all_steps: String = d + .recovery_steps + .iter() + .map(|s| format!("{} {}", s.description, s.command.as_deref().unwrap_or(""))) + .collect::>() + .join("\n"); + assert!( + all_steps.contains("nvidia-ctk cdi generate"), + "recovery steps should mention nvidia-ctk cdi generate, got: {all_steps}" + ); + assert!( + all_steps.contains("/etc/cdi/"), + "recovery steps should mention /etc/cdi/, got: {all_steps}" + ); + } } diff --git a/crates/openshell-sandbox/src/lib.rs b/crates/openshell-sandbox/src/lib.rs index 14963244..2384a217 100644 --- a/crates/openshell-sandbox/src/lib.rs +++ b/crates/openshell-sandbox/src/lib.rs @@ -877,13 +877,106 @@ pub(crate) fn spawn_route_refresh( /// Minimum read-only paths required for a proxy-mode sandbox child process to /// function: dynamic linker, shared libraries, DNS resolution, CA certs, -/// Python venv, and openshell logs. -const PROXY_BASELINE_READ_ONLY: &[&str] = &["/usr", "/lib", "/etc", "/app", "/var/log"]; +/// Python venv, openshell logs, process info, and random bytes. +/// +/// `/proc` and `/dev/urandom` are included here for the same reasons they +/// appear in `restrictive_default_policy()`: virtually every process needs +/// them. Before the Landlock per-path fix (#677) these were effectively free +/// because a missing path silently disabled the entire ruleset; now they must +/// be explicit. +const PROXY_BASELINE_READ_ONLY: &[&str] = &[ + "/usr", + "/lib", + "/etc", + "/app", + "/var/log", + "/proc", + "/dev/urandom", +]; /// Minimum read-write paths required for a proxy-mode sandbox child process: /// user working directory and temporary files. const PROXY_BASELINE_READ_WRITE: &[&str] = &["/sandbox", "/tmp"]; +/// GPU read-only paths. +/// +/// `/run/nvidia-persistenced`: NVML tries to connect to the persistenced +/// socket at init time. If the directory exists but Landlock denies traversal +/// (EACCES vs ECONNREFUSED), NVML returns `NVML_ERROR_INSUFFICIENT_PERMISSIONS` +/// even though the daemon is optional. Only read/traversal access is needed. +const GPU_BASELINE_READ_ONLY: &[&str] = &["/run/nvidia-persistenced"]; + +/// GPU read-write paths (static). +/// +/// `/dev/nvidiactl`, `/dev/nvidia-uvm`, `/dev/nvidia-uvm-tools`, +/// `/dev/nvidia-modeset`: control and UVM devices injected by CDI. +/// Landlock restricts `open(2)` on device files even when DAC allows it; +/// these need read-write because NVML/CUDA opens them with `O_RDWR`. +/// +/// `/proc`: CUDA writes to `/proc//task//comm` during `cuInit()` +/// to set thread names. Without write access, `cuInit()` returns error 304. +/// Must use `/proc` (not `/proc/self/task`) because Landlock rules bind to +/// inodes and child processes have different procfs inodes than the parent. +/// +/// Per-GPU device files (`/dev/nvidia0`, …) are enumerated at runtime by +/// `enumerate_gpu_device_nodes()` since the count varies. +const GPU_BASELINE_READ_WRITE: &[&str] = &[ + "/dev/nvidiactl", + "/dev/nvidia-uvm", + "/dev/nvidia-uvm-tools", + "/dev/nvidia-modeset", + "/proc", +]; + +/// Returns true if GPU devices are present in the container. +fn has_gpu_devices() -> bool { + std::path::Path::new("/dev/nvidiactl").exists() +} + +/// Enumerate per-GPU device nodes (`/dev/nvidia0`, `/dev/nvidia1`, …). +fn enumerate_gpu_device_nodes() -> Vec { + let mut paths = Vec::new(); + if let Ok(entries) = std::fs::read_dir("/dev") { + for entry in entries.flatten() { + let name = entry.file_name(); + let name = name.to_string_lossy(); + if let Some(suffix) = name.strip_prefix("nvidia") { + if suffix.is_empty() || !suffix.chars().all(|c| c.is_ascii_digit()) { + continue; + } + paths.push(entry.path().to_string_lossy().into_owned()); + } + } + } + paths +} + +/// Collect all baseline paths for enrichment: proxy defaults + GPU (if present). +/// Returns `(read_only, read_write)` as owned `String` vecs. +fn baseline_enrichment_paths() -> (Vec, Vec) { + let mut ro: Vec = PROXY_BASELINE_READ_ONLY + .iter() + .map(|&s| s.to_string()) + .collect(); + let mut rw: Vec = PROXY_BASELINE_READ_WRITE + .iter() + .map(|&s| s.to_string()) + .collect(); + + if has_gpu_devices() { + ro.extend(GPU_BASELINE_READ_ONLY.iter().map(|&s| s.to_string())); + rw.extend(GPU_BASELINE_READ_WRITE.iter().map(|&s| s.to_string())); + rw.extend(enumerate_gpu_device_nodes()); + } + + // A path promoted to read_write (e.g. /proc for GPU) should not also + // appear in read_only — Landlock handles the overlap correctly but the + // duplicate is confusing when inspecting the effective policy. + ro.retain(|p| !rw.contains(p)); + + (ro, rw) +} + /// Ensure a proto `SandboxPolicy` includes the baseline filesystem paths /// required for proxy-mode sandboxes. Paths are only added if missing; /// user-specified paths are never removed. @@ -902,14 +995,16 @@ fn enrich_proto_baseline_paths(proto: &mut openshell_core::proto::SandboxPolicy) ..Default::default() }); + let (ro, rw) = baseline_enrichment_paths(); + + // Baseline paths are system-injected, not user-specified. Skip paths + // that do not exist in this container image to avoid noisy warnings from + // Landlock and, more critically, to prevent a single missing baseline + // path from abandoning the entire Landlock ruleset under best-effort + // mode (see issue #664). let mut modified = false; - for &path in PROXY_BASELINE_READ_ONLY { - if !fs.read_only.iter().any(|p| p.as_str() == path) { - // Baseline paths are system-injected, not user-specified. Skip - // paths that do not exist in this container image to avoid noisy - // warnings from Landlock and, more critically, to prevent a single - // missing baseline path from abandoning the entire Landlock - // ruleset under best-effort mode (see issue #664). + for path in &ro { + if !fs.read_only.iter().any(|p| p == path) && !fs.read_write.iter().any(|p| p == path) { if !std::path::Path::new(path).exists() { debug!( path, @@ -917,12 +1012,12 @@ fn enrich_proto_baseline_paths(proto: &mut openshell_core::proto::SandboxPolicy) ); continue; } - fs.read_only.push(path.to_string()); + fs.read_only.push(path.clone()); modified = true; } } - for &path in PROXY_BASELINE_READ_WRITE { - if !fs.read_write.iter().any(|p| p.as_str() == path) { + for path in &rw { + if !fs.read_write.iter().any(|p| p == path) { if !std::path::Path::new(path).exists() { debug!( path, @@ -930,7 +1025,7 @@ fn enrich_proto_baseline_paths(proto: &mut openshell_core::proto::SandboxPolicy) ); continue; } - fs.read_write.push(path.to_string()); + fs.read_write.push(path.clone()); modified = true; } } @@ -950,12 +1045,12 @@ fn enrich_sandbox_baseline_paths(policy: &mut SandboxPolicy) { return; } + let (ro, rw) = baseline_enrichment_paths(); + let mut modified = false; - for &path in PROXY_BASELINE_READ_ONLY { + for path in &ro { let p = std::path::PathBuf::from(path); - if !policy.filesystem.read_only.contains(&p) { - // Baseline paths are system-injected — skip non-existent paths to - // avoid Landlock ruleset abandonment (issue #664). + if !policy.filesystem.read_only.contains(&p) && !policy.filesystem.read_write.contains(&p) { if !p.exists() { debug!( path, @@ -967,7 +1062,7 @@ fn enrich_sandbox_baseline_paths(policy: &mut SandboxPolicy) { modified = true; } } - for &path in PROXY_BASELINE_READ_WRITE { + for path in &rw { let p = std::path::PathBuf::from(path); if !policy.filesystem.read_write.contains(&p) { if !p.exists() { @@ -987,6 +1082,75 @@ fn enrich_sandbox_baseline_paths(policy: &mut SandboxPolicy) { } } +#[cfg(test)] +mod baseline_tests { + use super::*; + + #[test] + fn proc_not_in_both_read_only_and_read_write_when_gpu_present() { + // When GPU devices are present, /proc is promoted to read_write + // (CUDA needs to write /proc//task//comm). It should + // NOT also appear in read_only. + if !has_gpu_devices() { + // Can't test GPU dedup without GPU devices; skip silently. + return; + } + let (ro, rw) = baseline_enrichment_paths(); + assert!( + rw.contains(&"/proc".to_string()), + "/proc should be in read_write when GPU is present" + ); + assert!( + !ro.contains(&"/proc".to_string()), + "/proc should NOT be in read_only when it is already in read_write" + ); + } + + #[test] + fn proc_in_read_only_without_gpu() { + if has_gpu_devices() { + // On a GPU host we can't test the non-GPU path; skip silently. + return; + } + let (ro, _rw) = baseline_enrichment_paths(); + assert!( + ro.contains(&"/proc".to_string()), + "/proc should be in read_only when GPU is not present" + ); + } + + #[test] + fn baseline_read_write_always_includes_sandbox_and_tmp() { + let (_ro, rw) = baseline_enrichment_paths(); + assert!(rw.contains(&"/sandbox".to_string())); + assert!(rw.contains(&"/tmp".to_string())); + } + + #[test] + fn enumerate_gpu_device_nodes_skips_bare_nvidia() { + // "nvidia" (without a trailing digit) is a valid /dev entry on some + // systems but is not a per-GPU device node. The enumerator must + // not match it. + let nodes = enumerate_gpu_device_nodes(); + assert!( + !nodes.contains(&"/dev/nvidia".to_string()), + "bare /dev/nvidia should not be enumerated: {nodes:?}" + ); + } + + #[test] + fn no_duplicate_paths_in_baseline() { + let (ro, rw) = baseline_enrichment_paths(); + // No path should appear in both lists. + for path in &ro { + assert!( + !rw.contains(path), + "path {path} appears in both read_only and read_write" + ); + } + } +} + /// Load sandbox policy from local files or gRPC. /// /// Priority: diff --git a/e2e/python/test_inference_routing.py b/e2e/python/test_inference_routing.py index bda0d8cb..c83049c1 100644 --- a/e2e/python/test_inference_routing.py +++ b/e2e/python/test_inference_routing.py @@ -32,7 +32,7 @@ _BASE_FILESYSTEM = sandbox_pb2.FilesystemPolicy( include_workdir=True, - read_only=["/usr", "/lib", "/etc", "/app", "/var/log"], + read_only=["/usr", "/lib", "/etc", "/app", "/var/log", "/proc", "/dev/urandom"], read_write=["/sandbox", "/tmp"], ) _BASE_LANDLOCK = sandbox_pb2.LandlockPolicy(compatibility="best_effort") diff --git a/e2e/python/test_sandbox_policy.py b/e2e/python/test_sandbox_policy.py index 6a4bf5ed..625fe8da 100644 --- a/e2e/python/test_sandbox_policy.py +++ b/e2e/python/test_sandbox_policy.py @@ -22,7 +22,7 @@ _BASE_FILESYSTEM = sandbox_pb2.FilesystemPolicy( include_workdir=True, - read_only=["/usr", "/lib", "/etc", "/app", "/var/log"], + read_only=["/usr", "/lib", "/etc", "/app", "/var/log", "/proc", "/dev/urandom"], read_write=["/sandbox", "/tmp"], ) _BASE_LANDLOCK = sandbox_pb2.LandlockPolicy(compatibility="best_effort")