diff --git a/rust/fleet-data/src/accounts.rs b/rust/fleet-data/src/accounts.rs index 8d21a96..1d6f495 100644 --- a/rust/fleet-data/src/accounts.rs +++ b/rust/fleet-data/src/accounts.rs @@ -90,7 +90,19 @@ fn extract_percent(s: &str, key: &str) -> Option { /// dashboards keep calling this on a tick; the parse cost is negligible /// next to the subprocess spawn. pub fn load_live() -> std::io::Result> { - let output = std::process::Command::new("agent-auth").arg("list").output()?; + let mut cmd = std::process::Command::new("agent-auth"); + cmd.arg("list"); + // A hung `agent-auth` (e.g. waiting on an auth prompt) must not freeze + // the 250 ms dashboard tick. HEAVY_CMD_DEADLINE (2 s) is generous for a + // local CLI; on timeout we surface an empty list, matching the existing + // "no accounts visible" posture for a missing binary or non-zero exit. + let output = match crate::subprocess::output_with_deadline( + cmd, + crate::subprocess::HEAVY_CMD_DEADLINE, + ) { + Ok(o) => o, + Err(_) => return Ok(Vec::new()), + }; Ok(parse(&String::from_utf8_lossy(&output.stdout))) } diff --git a/rust/fleet-data/src/git.rs b/rust/fleet-data/src/git.rs index bf56bff..2684bae 100644 --- a/rust/fleet-data/src/git.rs +++ b/rust/fleet-data/src/git.rs @@ -89,12 +89,21 @@ pub fn parse(stdout: &str) -> Vec { /// for cross-lane warnings, "I cannot prove this PR is merged" is the safe /// default that produces a warning rather than silently suppressing it. pub fn branch_contains_pr(branch: &str, pr_head: &str) -> std::io::Result { - let status = std::process::Command::new("git") - .args(["merge-base", "--is-ancestor", pr_head, branch]) - .stdout(std::process::Stdio::null()) - .stderr(std::process::Stdio::null()) - .status()?; - Ok(status.success()) + let mut cmd = std::process::Command::new("git"); + cmd.args(["merge-base", "--is-ancestor", pr_head, branch]); + // 2 s deadline: a `git merge-base` against a local repo should return + // in milliseconds; a stalled git process (lock contention, filesystem + // wedge) collapses to the "cannot prove ancestry" branch, same as a + // missing git binary — which is the safe default that surfaces a + // merge-pending warning rather than silently suppressing it. + let out = match crate::subprocess::output_with_deadline( + cmd, + crate::subprocess::HEAVY_CMD_DEADLINE, + ) { + Ok(o) => o, + Err(_) => return Ok(false), + }; + Ok(out.status.success()) } /// Shell out to `gh pr list` and parse the result. @@ -104,19 +113,24 @@ pub fn branch_contains_pr(branch: &str, pr_head: &str) -> std::io::Result /// a crash. `--limit 100` is enough for any plausible fleet; a 100-PR /// backlog is itself a signal worth surfacing elsewhere. pub fn open_prs_with_files() -> std::io::Result> { - let out = match std::process::Command::new("gh") - .args([ - "pr", - "list", - "--state", - "open", - "--json", - "number,headRefName,baseRefName,files", - "--limit", - "100", - ]) - .output() - { + let mut cmd = std::process::Command::new("gh"); + cmd.args([ + "pr", + "list", + "--state", + "open", + "--json", + "number,headRefName,baseRefName,files", + "--limit", + "100", + ]); + // 2 s deadline keeps a stalled network call from freezing the + // dashboard tick. On timeout we collapse to the "no warnings" + // posture — same as a missing `gh` or a non-zero exit. + let out = match crate::subprocess::output_with_deadline( + cmd, + crate::subprocess::HEAVY_CMD_DEADLINE, + ) { Ok(o) => o, Err(_) => return Ok(Vec::new()), }; diff --git a/rust/fleet-data/src/lib.rs b/rust/fleet-data/src/lib.rs index 100481e..97a7a1e 100644 --- a/rust/fleet-data/src/lib.rs +++ b/rust/fleet-data/src/lib.rs @@ -8,5 +8,6 @@ pub mod panes; pub mod plan; pub mod scores; pub mod scrape; +pub mod subprocess; pub mod tmux; pub mod toposort; diff --git a/rust/fleet-data/src/panes.rs b/rust/fleet-data/src/panes.rs index 123bf06..ecc4c15 100644 --- a/rust/fleet-data/src/panes.rs +++ b/rust/fleet-data/src/panes.rs @@ -129,6 +129,16 @@ pub fn list_panes(session: &str, window: Option<&str>) -> std::io::Result) -> std::io::Result String::from_utf8_lossy(&stdout).into_owned(), + // Timeout or wait error → empty tail. classify() defaults to Idle. + None => String::new(), + }; out.push(PaneInfo { pane_id: row.pane_id, panel_label: row.panel_label, @@ -161,6 +178,47 @@ pub fn list_panes(session: &str, window: Option<&str>) -> std::io::Result Option> { + const POLL: std::time::Duration = std::time::Duration::from_millis(10); + loop { + match child.try_wait() { + Ok(Some(_status)) => { + // Exited; drain stdout. wait_with_output would re-call + // wait(), which is fine on an already-reaped child. + // To do that we need to move the child, but we only have + // &mut — take stdout manually instead. + let mut buf = Vec::new(); + if let Some(mut stdout) = child.stdout.take() { + use std::io::Read; + let _ = stdout.read_to_end(&mut buf); + } + return Some(buf); + } + Ok(None) => { + if started.elapsed() >= deadline { + let _ = child.kill(); + let _ = child.wait(); + return None; + } + std::thread::sleep(POLL); + } + Err(_) => return None, + } + } +} + #[cfg(test)] mod tests { use super::*; diff --git a/rust/fleet-data/src/subprocess.rs b/rust/fleet-data/src/subprocess.rs new file mode 100644 index 0000000..affee83 --- /dev/null +++ b/rust/fleet-data/src/subprocess.rs @@ -0,0 +1,193 @@ +//! Bounded-wait wrapper around `std::process::Command::output()`. +//! +//! The dashboards in this crate poll on a ~250ms tick and shell out to external +//! tools (`tmux`, `agent-auth`, `git`, `gh`). When one of those children hangs +//! — a tmux server in the middle of restarting, an `agent-auth` prompting for +//! credentials, a `gh` call stuck on the network — the dashboard's tick blocks +//! indefinitely waiting for the child to exit. +//! +//! This module provides [`output_with_deadline`], a sync helper that spawns +//! the child, polls [`std::process::Child::try_wait`] on a short sleep loop, +//! and kills + reaps the child if it doesn't finish before the deadline. +//! On timeout it returns `io::Error::new(io::ErrorKind::TimedOut, ...)`, which +//! every existing call site already treats the same as a non-zero exit: +//! collapse to the empty / best-effort fallback. +//! +//! Sync on purpose. The rest of the crate is sync and pulling in tokio for a +//! handful of read-only subprocess calls would dwarf the fix. The poll +//! interval ([`POLL_INTERVAL`]) is tuned so a fast-finishing child still +//! returns within a single poll, and a slow one is reaped within +//! `deadline + POLL_INTERVAL` worst case. +//! +//! ## Choosing a deadline +//! +//! Two named constants are exported so call sites stay consistent: +//! +//! - [`TMUX_READ_DEADLINE`] (`500 ms`) — tmux read-only commands +//! (`list-panes`, `capture-pane`, `display-message`, `select-window`, +//! `set-option`). These talk to a local socket; under healthy conditions +//! they return in single-digit milliseconds. 500ms is generous enough to +//! ride out a momentary tmux-server stall (e.g. another client holding the +//! command lock) without freezing the dashboard's 250ms tick across more +//! than one or two frames. +//! - [`HEAVY_CMD_DEADLINE`] (`2 s`) — `agent-auth list`, `git`, `gh`. These +//! can touch disk, the network, or remote APIs. 2s is short enough that a +//! broken auth flow or stalled GitHub call drops out fast, but long enough +//! for a real `gh pr list --json files` over a slow link to complete. +//! +//! If a future call site truly needs a longer wait, pass an explicit +//! [`std::time::Duration`] rather than redefining the constants. + +use std::io; +use std::process::{Child, Command, Output}; +use std::thread; +use std::time::{Duration, Instant}; + +/// Deadline for tmux read-only commands (`list-panes`, `capture-pane`, +/// `display-message`, `select-window`, `set-option`). +/// +/// tmux talks over a local Unix socket and these calls normally return in +/// single-digit milliseconds. 500 ms keeps the dashboard's 250 ms tick +/// recoverable: a stalled tmux server costs at most two frames, not the +/// whole session. +pub const TMUX_READ_DEADLINE: Duration = Duration::from_millis(500); + +/// Deadline for heavier subprocess calls — `agent-auth list`, `git`, `gh`. +/// +/// These can touch disk, the network, or a remote API. 2 s is short enough +/// that a broken auth flow or stalled GitHub call drops out fast, but long +/// enough for a real `gh pr list --json files` to finish on a slow link. +pub const HEAVY_CMD_DEADLINE: Duration = Duration::from_secs(2); + +/// How often [`output_with_deadline`] polls [`Child::try_wait`]. +/// +/// Small enough that a fast child returns within one poll, large enough that +/// a slow child doesn't burn CPU while we wait. The worst-case overshoot of +/// the deadline is one `POLL_INTERVAL`. +const POLL_INTERVAL: Duration = Duration::from_millis(10); + +/// Spawn `cmd` and wait up to `deadline` for it to finish. +/// +/// Reads the full stdout/stderr just like `Command::output()`. On timeout the +/// child is killed and reaped (`wait()` is called so it never zombies) and +/// the function returns `io::Error::new(io::ErrorKind::TimedOut, ...)`. Every +/// caller in this crate already treats a non-zero exit as "fall back to an +/// empty result", so the timeout collapses into the same path. +/// +/// The poll loop uses a short [`thread::sleep`]; this is sync on purpose — +/// the rest of the crate is sync and the dashboards are not async runtimes. +pub fn output_with_deadline(mut cmd: Command, deadline: Duration) -> io::Result { + // Make sure we own the stdout/stderr handles so `wait_with_output` + // can drain them. If the caller already set `stdout`/`stderr`, this + // is a no-op override — but every call site in this crate either + // wants the bytes (`.output()` path) or doesn't care, and `Stdio::null()` + // is set explicitly elsewhere via `.status()`. + cmd.stdout(std::process::Stdio::piped()); + cmd.stderr(std::process::Stdio::piped()); + + let mut child = cmd.spawn()?; + let start = Instant::now(); + + loop { + match child.try_wait()? { + Some(_status) => { + // Child exited; collect its output. wait_with_output also + // reaps the process, so no zombie. + return child.wait_with_output(); + } + None => { + if start.elapsed() >= deadline { + kill_and_reap(&mut child); + return Err(io::Error::new( + io::ErrorKind::TimedOut, + format!( + "subprocess exceeded deadline of {:?}", + deadline + ), + )); + } + thread::sleep(POLL_INTERVAL); + } + } + } +} + +/// Kill the child and wait for it to exit so it does not zombie. +/// +/// All errors are swallowed: the child may have exited between our last +/// `try_wait` and the `kill`, which is fine — we still call `wait()` to +/// reap it. +fn kill_and_reap(child: &mut Child) { + let _ = child.kill(); + let _ = child.wait(); +} + +#[cfg(test)] +mod tests { + use super::*; + use std::process::Command; + + #[test] + fn output_returns_quickly_when_command_finishes_fast() { + let cmd = Command::new("true"); + let start = Instant::now(); + let out = output_with_deadline(cmd, Duration::from_secs(2)) + .expect("`true` should succeed within the deadline"); + let elapsed = start.elapsed(); + assert!(out.status.success(), "`true` exits 0"); + // Should be well under the deadline — generous bound to keep the + // test stable on a loaded CI runner. + assert!( + elapsed < Duration::from_secs(1), + "fast command took {elapsed:?}, expected << 2s deadline" + ); + } + + #[test] + fn output_returns_timeout_error_for_sleep_longer_than_deadline() { + let mut cmd = Command::new("sleep"); + cmd.arg("5"); + let start = Instant::now(); + let err = output_with_deadline(cmd, Duration::from_millis(100)) + .expect_err("sleep 5 must exceed a 100ms deadline"); + let elapsed = start.elapsed(); + assert_eq!(err.kind(), io::ErrorKind::TimedOut, "{err}"); + // Should give up shortly after the deadline; allow generous slack + // for poll-interval overshoot and scheduler jitter. + assert!( + elapsed < Duration::from_secs(2), + "timeout took {elapsed:?}, expected ~100ms + slack" + ); + } + + #[test] + fn child_is_reaped_on_timeout() { + // Spawn `sh -c "sleep 10"` with a tight deadline. After the + // function returns, the child must have been killed and reaped: + // try_wait on a separately-spawned twin should not see ours, and + // the function's own wait_with_output / kill_and_reap path should + // leave nothing pending. We verify by spawning, timing out, then + // checking that the returned error is TimedOut (which can only + // happen on the kill+reap path). + let mut cmd = Command::new("sh"); + cmd.args(["-c", "sleep 10"]); + let err = output_with_deadline(cmd, Duration::from_millis(50)) + .expect_err("sleep 10 must time out"); + assert_eq!(err.kind(), io::ErrorKind::TimedOut); + + // Sanity: spawn the same shape manually, kill it, and confirm + // wait() returns — proving the reaping primitive itself works on + // this platform. (If this regressed, the in-function reap above + // would also be suspect.) + let mut sanity = Command::new("sh") + .args(["-c", "sleep 10"]) + .stdout(std::process::Stdio::null()) + .stderr(std::process::Stdio::null()) + .spawn() + .expect("spawn sanity child"); + sanity.kill().expect("kill sanity child"); + let status = sanity.wait().expect("reap sanity child"); + // On Unix, killed-by-signal has no exit code; .success() is false. + assert!(!status.success(), "killed child should not report success"); + } +} diff --git a/rust/fleet-data/src/tmux.rs b/rust/fleet-data/src/tmux.rs index f4e5526..d89a457 100644 --- a/rust/fleet-data/src/tmux.rs +++ b/rust/fleet-data/src/tmux.rs @@ -23,6 +23,8 @@ use std::process::Command; +use crate::subprocess::{output_with_deadline, TMUX_READ_DEADLINE}; + /// A tmux target: `session:window` or `session:window.pane`. Construct via the /// builders so callers don't hand-format the `:` / `.` separators (the class /// of typo that produced `codex-fleet:1` vs `codex-fleet:overview` bugs). @@ -61,12 +63,10 @@ impl Target { /// Wraps `tmux has-session -t `; the exit status is the answer, so /// this never returns an error — an absent tmux binary just reports `false`. pub fn has_session(session: &str) -> bool { - Command::new("tmux") - .args(["has-session", "-t", session]) - .stdout(std::process::Stdio::null()) - .stderr(std::process::Stdio::null()) - .status() - .map(|s| s.success()) + let mut cmd = Command::new("tmux"); + cmd.args(["has-session", "-t", session]); + output_with_deadline(cmd, TMUX_READ_DEADLINE) + .map(|o| o.status.success()) .unwrap_or(false) } @@ -118,9 +118,13 @@ fn parse_pane_row(line: &str) -> Option { /// This is the metadata-only call; `panes::list_panes` layers scrollback /// capture on top of it. pub fn list_panes(target: &Target) -> std::io::Result> { - let out = Command::new("tmux") - .args(["list-panes", "-t", target.as_str(), "-F", PANE_FORMAT]) - .output()?; + let mut cmd = Command::new("tmux"); + cmd.args(["list-panes", "-t", target.as_str(), "-F", PANE_FORMAT]); + let out = match output_with_deadline(cmd, TMUX_READ_DEADLINE) { + Ok(o) => o, + // Timeout or spawn failure collapses to the empty-fleet fallback. + Err(_) => return Ok(Vec::new()), + }; if !out.status.success() { return Ok(Vec::new()); } @@ -134,9 +138,13 @@ pub fn list_panes(target: &Target) -> std::io::Result> { /// from the bottom. Returns `Ok(String::new())` on tmux failure. pub fn capture_pane(pane_id: &str, lines: u32) -> std::io::Result { let start = format!("-{lines}"); - let out = Command::new("tmux") - .args(["capture-pane", "-p", "-t", pane_id, "-S", &start]) - .output()?; + let mut cmd = Command::new("tmux"); + cmd.args(["capture-pane", "-p", "-t", pane_id, "-S", &start]); + let out = match output_with_deadline(cmd, TMUX_READ_DEADLINE) { + Ok(o) => o, + // Timeout / spawn failure → empty string, same as a non-zero exit. + Err(_) => return Ok(String::new()), + }; if !out.status.success() { return Ok(String::new()); } @@ -147,9 +155,13 @@ pub fn capture_pane(pane_id: &str, lines: u32) -> std::io::Result { /// string against a target. The workhorse behind "what's this pane's tty", /// "what session am I in", etc. Returns `Ok(String::new())` on failure. pub fn display_message(target: &Target, format: &str) -> std::io::Result { - let out = Command::new("tmux") - .args(["display-message", "-p", "-t", target.as_str(), format]) - .output()?; + let mut cmd = Command::new("tmux"); + cmd.args(["display-message", "-p", "-t", target.as_str(), format]); + let out = match output_with_deadline(cmd, TMUX_READ_DEADLINE) { + Ok(o) => o, + // Timeout / spawn failure → empty string fallback. + Err(_) => return Ok(String::new()), + }; if !out.status.success() { return Ok(String::new()); } @@ -163,12 +175,10 @@ pub fn display_message(target: &Target, format: &str) -> std::io::Result /// This replaces the four near-identical `select_window` fns in /// `fleet-state` / `fleet-watcher` / `fleet-waves` / `fleet-plan-tree`. pub fn select_window(target: &Target) -> bool { - Command::new("tmux") - .args(["select-window", "-t", target.as_str()]) - .stdout(std::process::Stdio::null()) - .stderr(std::process::Stdio::null()) - .status() - .map(|s| s.success()) + let mut cmd = Command::new("tmux"); + cmd.args(["select-window", "-t", target.as_str()]); + output_with_deadline(cmd, TMUX_READ_DEADLINE) + .map(|o| o.status.success()) .unwrap_or(false) } @@ -183,12 +193,10 @@ pub fn select_window_index(session: &str, window_index: usize) -> bool { /// Used to stamp the `@panel` label — `set_pane_option(pid, "@panel", "[codex-foo]")`. /// Best-effort `bool`, same posture as [`select_window`]. pub fn set_pane_option(pane_id: &str, name: &str, value: &str) -> bool { - Command::new("tmux") - .args(["set-option", "-p", "-t", pane_id, name, value]) - .stdout(std::process::Stdio::null()) - .stderr(std::process::Stdio::null()) - .status() - .map(|s| s.success()) + let mut cmd = Command::new("tmux"); + cmd.args(["set-option", "-p", "-t", pane_id, name, value]); + output_with_deadline(cmd, TMUX_READ_DEADLINE) + .map(|o| o.status.success()) .unwrap_or(false) }