diff --git a/src/checks/execution.rs b/src/checks/execution.rs index 0fc424c..253168e 100644 --- a/src/checks/execution.rs +++ b/src/checks/execution.rs @@ -115,8 +115,15 @@ impl ExecutionActor { // Permit acquired ⇒ the agent is about to run: mark the check Running. let _ = presenter.tell(UiEvent::CheckStarted { id }).await; - let mut result = - run_one(executor, sandbox, job.id, job.check.clone(), &working_dir).await; + let mut result = run_one( + executor, + sandbox, + job.id, + job.check.clone(), + &working_dir, + attempt, + ) + .await; // Harvest this attempt's trace *before* signalling completion, so it // is collected even for retried attempts (whose outcome never reaches @@ -249,6 +256,7 @@ async fn run_one( id: CheckId, check: Check, working_dir: &Path, + attempt: usize, ) -> Result { let handle = sandbox.create(working_dir).await?; @@ -256,6 +264,7 @@ async fn run_one( check_id: id, check, working_dir: handle.path().to_path_buf(), + attempt: u32::try_from(attempt).unwrap_or(u32::MAX), }; let outcome = executor.run_check(request).await; @@ -422,7 +431,9 @@ mod tests { .await .unwrap(); assert!(out[0].satisfied); - // Ran twice: one silent attempt, then one reporting attempt. - assert_eq!(executor.seen().len(), 2); + // Ran twice: one silent attempt, then one reporting attempt — and the + // executor was told which attempt each was (retries must be able to + // vary temperature/instructions rather than replaying attempt 1). + assert_eq!(executor.seen_attempts(), vec![(0, 1), (0, 2)]); } } diff --git a/src/checks/executor/cersei.rs b/src/checks/executor/cersei.rs index 5672858..23ac1ee 100644 --- a/src/checks/executor/cersei.rs +++ b/src/checks/executor/cersei.rs @@ -15,6 +15,7 @@ use cersei_types::CerseiError; use miette::{Result, miette}; use tokio_util::sync::CancellationToken; +use super::jail::Jailed; use super::judge::{JudgeTool, VerdictSink}; use super::trace::{TraceHeader, TraceRecorder, serialize_trace}; use super::{ @@ -70,11 +71,27 @@ impl CerseiExecutor { /// The read-only tool set a verification agent gets by default: observe, do not /// mutate. Execution-requiring checks (which would need Bash/Write) are gated /// separately and are future work — the default is least privilege. +/// +/// Each tool is [`Jailed`] to the agent's working directory: "read-only" alone +/// still allowed reading anywhere the user can, which let lost agents launch +/// unbounded globs over the host filesystem (timeouts) and grade the live +/// repository instead of the sandbox (postmortem C5). The jail turns an +/// out-of-sandbox path into an immediate tool error that steers the agent back. fn read_only_tools() -> Vec> { vec![ - Box::new(cersei_tools::file_read::FileReadTool), - Box::new(cersei_tools::grep_tool::GrepTool), - Box::new(cersei_tools::glob_tool::GlobTool), + Box::new(Jailed::path_keys( + cersei_tools::file_read::FileReadTool, + &["file_path"], + )), + Box::new(Jailed::path_keys( + cersei_tools::grep_tool::GrepTool, + &["path"], + )), + Box::new(Jailed::glob( + cersei_tools::glob_tool::GlobTool, + &["path"], + "pattern", + )), ] } @@ -111,6 +128,16 @@ fn effort_temperature(effort: Effort) -> f32 { } } +/// The sampling temperature for a given attempt: the effort base, raised by +/// 0.5 per retry and capped at 1.0. At effort=low the base is 0.0, and a +/// temperature-0 retry is a replay: the 2026-07-01 postmortem caught one check +/// reproducing its fatal trajectory near-verbatim on all three attempts. A +/// retry has to sample differently to be worth its wall-clock. +fn attempt_temperature(effort: Effort, attempt: u32) -> f32 { + let base = effort_temperature(effort); + (base + 0.5 * attempt.saturating_sub(1) as f32).min(1.0) +} + #[async_trait] impl CheckExecutor for CerseiExecutor { async fn run_check(&self, req: AgentRunRequest) -> Result { @@ -136,7 +163,12 @@ impl CheckExecutor for CerseiExecutor { let cancel = CancellationToken::new(); let judge = JudgeTool::new(sink.clone(), cancel.clone()); - let instructions = assemble_instructions(&req.check, &judge_tool_directive()); + let instructions = assemble_instructions( + &req.check, + &judge_tool_directive(), + &req.working_dir, + req.attempt, + ); let mut agent_builder = Agent::builder() .provider_boxed(provider) @@ -149,7 +181,7 @@ impl CheckExecutor for CerseiExecutor { .tools(read_only_tools()) .tool(judge) // Thinking is intentionally left disabled (see `effort_temperature`). - .temperature(effort_temperature(self.effort)) + .temperature(attempt_temperature(self.effort, req.attempt)) .max_turns(MAX_TURNS) .cancel_token(cancel.clone()); @@ -237,3 +269,18 @@ impl CheckExecutor for CerseiExecutor { Ok(outcome) } } + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn retries_raise_the_temperature_up_to_the_cap() { + assert_eq!(attempt_temperature(Effort::Low, 1), 0.0); + assert_eq!(attempt_temperature(Effort::Low, 2), 0.5); + assert_eq!(attempt_temperature(Effort::Low, 3), 1.0); + assert_eq!(attempt_temperature(Effort::Medium, 2), 1.0); + // Already at the cap: retries must not push past valid API range. + assert_eq!(attempt_temperature(Effort::High, 3), 1.0); + } +} diff --git a/src/checks/executor/claude.rs b/src/checks/executor/claude.rs index c32c042..7503ae9 100644 --- a/src/checks/executor/claude.rs +++ b/src/checks/executor/claude.rs @@ -97,7 +97,12 @@ impl CheckExecutor for ClaudeExecutor { "dispatching claude -p check (fallback)", ); - let instructions = assemble_instructions(&req.check, &file_report_directive()); + let instructions = assemble_instructions( + &req.check, + &file_report_directive(), + &req.working_dir, + req.attempt, + ); let mut cmd = Command::new(&self.program); cmd.arg("-p") diff --git a/src/checks/executor/fake.rs b/src/checks/executor/fake.rs index 570dec8..522d3bf 100644 --- a/src/checks/executor/fake.rs +++ b/src/checks/executor/fake.rs @@ -22,7 +22,8 @@ pub struct FakeExecutor { /// id to `(report_on_attempt, report)`. Exercises the retry path: the same /// `CheckId` is re-run, so the fake counts attempts per id. silent_until: HashMap, - seen: Mutex>, + /// Every `(check_id, attempt)` the fake was asked to run, in call order. + seen: Mutex>, } impl FakeExecutor { @@ -72,6 +73,18 @@ impl FakeExecutor { /// The check ids the fake was asked to run, in call order. pub fn seen(&self) -> Vec { + self.seen + .lock() + .unwrap() + .iter() + .map(|(id, _)| *id) + .collect() + } + + /// The `(check_id, attempt)` pairs the fake was asked to run, in call + /// order — lets tests assert the retry plumbing threads attempt numbers + /// through to the executor. + pub fn seen_attempts(&self) -> Vec<(CheckId, u32)> { self.seen.lock().unwrap().clone() } } @@ -81,8 +94,8 @@ impl CheckExecutor for FakeExecutor { async fn run_check(&self, req: AgentRunRequest) -> Result { let attempt = { let mut seen = self.seen.lock().unwrap(); - seen.push(req.check_id); - seen.iter().filter(|id| **id == req.check_id).count() + seen.push((req.check_id, req.attempt)); + seen.iter().filter(|(id, _)| *id == req.check_id).count() }; if self.silent.contains(&req.check_id) { diff --git a/src/checks/executor/jail.rs b/src/checks/executor/jail.rs new file mode 100644 index 0000000..2625e5e --- /dev/null +++ b/src/checks/executor/jail.rs @@ -0,0 +1,348 @@ +//! Confining a check agent's read-only tools to its sandbox (F4 of the +//! 2026-07-01 timeout postmortem). +//! +//! The CoW sandbox isolates *writes* by construction, but cersei's read-only +//! tools accept absolute paths and will read or walk anywhere the user can. +//! That broke two ways in practice: lost agents launched unbounded recursive +//! globs over the host filesystem (`**/*.rs` over `~/workspace`, even `/`) and +//! timed out inside them, and two agents graded the *live* repository instead +//! of the sandbox copy. [`Jailed`] wraps a tool and rejects any path-bearing +//! input that resolves outside the agent's working directory, returning a tool +//! error that redirects the agent back into the sandbox instead of letting the +//! call run away. + +use std::path::{Component, Path, PathBuf}; + +use async_trait::async_trait; +use cersei_tools::{PermissionLevel, Tool, ToolCategory, ToolContext, ToolResult}; +use serde_json::Value; + +/// A [`Tool`] decorator that confines path-bearing inputs to the agent's +/// working directory (read from the [`ToolContext`] at call time, so one +/// wrapper is correct for every concurrently running agent). +pub struct Jailed { + inner: T, + /// Input keys that carry a filesystem path (absolute or relative). + path_keys: &'static [&'static str], + /// Input key that carries a *glob pattern*, which needs its own rule: the + /// glob primitive joins the pattern onto its base directory, so an + /// absolute pattern replaces the base entirely and a `..` component climbs + /// out of it. + glob_key: Option<&'static str>, +} + +impl Jailed { + /// Confine the string inputs named by `path_keys` to the working directory. + pub fn path_keys(inner: T, path_keys: &'static [&'static str]) -> Self { + Self { + inner, + path_keys, + glob_key: None, + } + } + + /// Like [`Jailed::path_keys`], additionally confining the glob pattern + /// under `glob_key`. + pub fn glob(inner: T, path_keys: &'static [&'static str], glob_key: &'static str) -> Self { + Self { + inner, + path_keys, + glob_key: Some(glob_key), + } + } +} + +#[async_trait] +impl Tool for Jailed { + fn name(&self) -> &str { + self.inner.name() + } + fn description(&self) -> &str { + self.inner.description() + } + fn input_schema(&self) -> Value { + self.inner.input_schema() + } + fn permission_level(&self) -> PermissionLevel { + self.inner.permission_level() + } + fn category(&self) -> ToolCategory { + self.inner.category() + } + + async fn execute(&self, input: Value, ctx: &ToolContext) -> ToolResult { + if let Err(denied) = enforce(&input, &ctx.working_dir, self.path_keys, self.glob_key) { + return ToolResult::error(denied); + } + self.inner.execute(input, ctx).await + } +} + +/// Check every confined input key; `Err` carries the agent-facing denial. +/// Non-object input is left for the inner tool to reject as it sees fit. +fn enforce( + input: &Value, + root: &Path, + path_keys: &[&str], + glob_key: Option<&str>, +) -> Result<(), String> { + let Some(obj) = input.as_object() else { + return Ok(()); + }; + + for key in path_keys { + if let Some(raw) = obj.get(*key).and_then(Value::as_str) + && !within_root(Path::new(raw), root) + { + return Err(deny(Path::new(raw), root)); + } + } + + if let Some(key) = glob_key + && let Some(pattern) = obj.get(key).and_then(Value::as_str) + { + if Path::new(pattern) + .components() + .any(|c| c == Component::ParentDir) + { + return Err(format!( + "glob pattern `{pattern}` contains `..`, which would escape the sandbox \ +working directory `{}`. Use a pattern relative to that directory.", + root.display(), + )); + } + if Path::new(pattern).is_absolute() && !within_root(&literal_prefix(pattern), root) { + return Err(deny(Path::new(pattern), root)); + } + } + + Ok(()) +} + +fn deny(offending: &Path, root: &Path) -> String { + format!( + "`{}` is outside the sandbox working directory `{}`. Every project file for this \ +check lives under that directory; use a path inside it (or omit `path` to default to it).", + offending.display(), + root.display(), + ) +} + +/// Whether `candidate` (absolute, or relative to `root`) stays inside `root`. +/// +/// The candidate is judged by its *resolved* location, so a symlink inside the +/// sandbox cannot point a tool outside it. Both spellings of the root — as +/// configured and fully resolved — are accepted: macOS temp sandboxes are +/// reached through the `/var` → `/private/var` symlink, so the agent may +/// legitimately hold either form. +fn within_root(candidate: &Path, root: &Path) -> bool { + let absolute = if candidate.is_absolute() { + candidate.to_path_buf() + } else { + root.join(candidate) + }; + let resolved = resolve_lenient(&lexical_normalize(&absolute)); + + let mut roots = vec![lexical_normalize(root)]; + if let Ok(resolved_root) = root.canonicalize() + && !roots.contains(&resolved_root) + { + roots.push(resolved_root); + } + + roots.iter().any(|r| resolved.starts_with(r)) +} + +/// Canonicalize the deepest existing ancestor of `path` and re-append the +/// remaining (nonexistent) components. Plain `canonicalize` fails on paths +/// that don't exist yet, but their symlink-resolved location is exactly what +/// the jail must judge — e.g. `/escape-link/nope.txt` must be denied, +/// and `/var//nope.rs` must match a `/private/var/…` root. The input +/// is already lexically normalized, so re-appending cannot reintroduce `..`. +fn resolve_lenient(path: &Path) -> PathBuf { + let mut existing = path.to_path_buf(); + let mut tail: Vec = Vec::new(); + loop { + match existing.canonicalize() { + Ok(mut resolved) => { + for part in tail.iter().rev() { + resolved.push(part); + } + return resolved; + } + Err(_) => match (existing.file_name(), existing.parent()) { + (Some(name), Some(parent)) => { + tail.push(name.to_os_string()); + existing = parent.to_path_buf(); + } + // Ran out of components without finding an existing ancestor + // (or the path ends in `..`/root): judge it as-is. + _ => return path.to_path_buf(), + }, + } + } +} + +/// Resolve `.` and `..` components without touching the filesystem. +fn lexical_normalize(path: &Path) -> PathBuf { + let mut out = PathBuf::new(); + for component in path.components() { + match component { + Component::CurDir => {} + Component::ParentDir => { + out.pop(); + } + other => out.push(other.as_os_str()), + } + } + out +} + +/// The leading components of a glob pattern before the first one containing a +/// metacharacter — the directory the walk actually starts from. +fn literal_prefix(pattern: &str) -> PathBuf { + let mut out = PathBuf::new(); + for component in Path::new(pattern).components() { + if component + .as_os_str() + .to_string_lossy() + .contains(['*', '?', '[', '{']) + { + break; + } + out.push(component.as_os_str()); + } + out +} + +#[cfg(test)] +mod tests { + use super::*; + use cersei_tools::permissions::AllowReadOnly; + use cersei_tools::{CostTracker, Extensions, glob_tool::GlobTool}; + use std::sync::Arc; + + fn ctx_in(dir: &Path) -> ToolContext { + ToolContext { + working_dir: dir.to_path_buf(), + session_id: "jail-test".into(), + permissions: Arc::new(AllowReadOnly), + cost_tracker: Arc::new(CostTracker::new()), + mcp_manager: None, + extensions: Extensions::default(), + } + } + + #[test] + fn relative_paths_stay_inside() { + let tmp = tempfile::tempdir().unwrap(); + assert!(within_root(Path::new("src/main.rs"), tmp.path())); + assert!(within_root(Path::new("."), tmp.path())); + } + + #[test] + fn absolute_paths_inside_are_allowed_and_outside_denied() { + let tmp = tempfile::tempdir().unwrap(); + assert!(within_root(&tmp.path().join("src"), tmp.path())); + assert!(!within_root(Path::new("/etc"), tmp.path())); + assert!(!within_root(Path::new("/"), tmp.path())); + } + + #[test] + fn dotdot_cannot_climb_out() { + let tmp = tempfile::tempdir().unwrap(); + assert!(!within_root(Path::new(".."), tmp.path())); + assert!(!within_root(&tmp.path().join("../sibling"), tmp.path())); + // Climbing out and back in is fine — only the resolved location matters. + let back_in = tmp.path().join("..").join( + tmp.path() + .file_name() + .expect("tempdir has a terminal component"), + ); + assert!(within_root(&back_in, tmp.path())); + } + + #[test] + fn symlink_aliased_roots_are_equivalent() { + // macOS tempdirs live under `/var/folders`, a symlink into + // `/private/var/folders`: the raw and canonical spellings must both be + // accepted, in both directions. + let tmp = tempfile::tempdir().unwrap(); + let raw = tmp.path().to_path_buf(); + let canonical = raw.canonicalize().unwrap(); + assert!(within_root(&canonical.join("file.rs"), &raw)); + assert!(within_root(&raw.join("file.rs"), &canonical)); + } + + #[cfg(unix)] + #[test] + fn symlink_pointing_out_of_the_sandbox_is_denied() { + let tmp = tempfile::tempdir().unwrap(); + let outside = tempfile::tempdir().unwrap(); + std::fs::write(outside.path().join("secret.txt"), "s").unwrap(); + std::os::unix::fs::symlink(outside.path(), tmp.path().join("escape")).unwrap(); + assert!(!within_root( + &tmp.path().join("escape/secret.txt"), + tmp.path() + )); + // Even a nonexistent path routed through the link resolves outside. + assert!(!within_root( + &tmp.path().join("escape/nope.txt"), + tmp.path() + )); + } + + #[test] + fn glob_patterns_are_confined() { + let tmp = tempfile::tempdir().unwrap(); + let root = tmp.path(); + let ok = |v: Value| enforce(&v, root, &["path"], Some("pattern")).is_ok(); + + assert!(ok(serde_json::json!({ "pattern": "**/*.rs" }))); + assert!(ok(serde_json::json!({ "pattern": "src/**" }))); + // Absolute pattern inside the sandbox names the same walk — allowed. + assert!(ok(serde_json::json!({ + "pattern": format!("{}/**/*.rs", root.display()) + }))); + // An absolute pattern replaces the base directory: outside is denied. + assert!(!ok(serde_json::json!({ "pattern": "/Users/**/*.rs" }))); + // `..` climbs out of the base directory before the walk starts. + assert!(!ok(serde_json::json!({ "pattern": "../**/*.rs" }))); + // The `path` key is confined like any other path input. + assert!(!ok(serde_json::json!({ "pattern": "*", "path": "/" }))); + } + + #[tokio::test] + async fn jailed_glob_denies_escapes_with_a_redirecting_error() { + let tmp = tempfile::tempdir().unwrap(); + let tool = Jailed::glob(GlobTool, &["path"], "pattern"); + let result = tool + .execute( + serde_json::json!({ "pattern": "**/*.rs", "path": "/Users" }), + &ctx_in(tmp.path()), + ) + .await; + assert!(result.is_error); + assert!( + result + .content + .contains("outside the sandbox working directory") + ); + assert!(result.content.contains(&tmp.path().display().to_string())); + } + + #[tokio::test] + async fn jailed_glob_delegates_in_sandbox_calls() { + let tmp = tempfile::tempdir().unwrap(); + std::fs::write(tmp.path().join("lib.rs"), "").unwrap(); + let tool = Jailed::glob(GlobTool, &["path"], "pattern"); + let result = tool + .execute( + serde_json::json!({ "pattern": "*.rs" }), + &ctx_in(tmp.path()), + ) + .await; + assert!(!result.is_error); + assert!(result.content.contains("lib.rs")); + } +} diff --git a/src/checks/executor/mod.rs b/src/checks/executor/mod.rs index b14918c..8142922 100644 --- a/src/checks/executor/mod.rs +++ b/src/checks/executor/mod.rs @@ -13,10 +13,11 @@ pub mod cersei; pub mod claude; #[cfg(test)] mod fake; +mod jail; pub mod judge; mod trace; -use std::path::PathBuf; +use std::path::{Path, PathBuf}; use async_trait::async_trait; use miette::Result; @@ -38,6 +39,12 @@ pub struct AgentRunRequest { pub check: Check, /// The sandbox directory to run the agent in (its working directory). pub working_dir: PathBuf, + /// Which attempt this is, 1-based. Retries must not replay the failed + /// attempt verbatim: executors use this to raise the sampling temperature + /// and to tell the agent a previous attempt went unreported (the 2026-07-01 + /// timeout postmortem showed temperature-0 retries reproducing the same + /// fatal trajectory three times in a row). + pub attempt: u32, } /// The result of running one check's agent in-process. @@ -102,16 +109,41 @@ the check is treated as a FAILURE.", /// Assemble the instruction text handed to an agent: standing operating /// instructions, the executor-supplied `reporting` directive, then the check /// prompt verbatim. (MULTI-1350, parametrized in MULTI-1367.) -pub fn assemble_instructions(check: &Check, reporting: &str) -> String { +/// +/// The sandbox path is stated explicitly. Agents that weren't told it (pre +/// 2026-07-01 postmortem) went hunting for the repository across the host +/// filesystem — guessing paths out of the loaded CLAUDE.md — and timed out +/// inside unbounded directory walks. On a retry (`attempt > 1`) the agent is +/// also told a previous attempt went unreported, so the new trajectory has a +/// reason to differ from the failed one. +pub fn assemble_instructions( + check: &Check, + reporting: &str, + working_dir: &Path, + attempt: u32, +) -> String { + let retry_note = if attempt > 1 { + format!( + "\nNOTE: this is attempt {attempt} for this check; a previous attempt finished \ +without reporting a verdict (it may have timed out while exploring). Stay inside the \ +working directory and report as soon as the evidence supports a conclusion.\n" + ) + } else { + String::new() + }; format!( "You are validating a single requirement for the MultiTool Checks tool.\n\ -Your current working directory is a sandboxed, throwaway copy of the user's repository; \ -you may read it and run commands against it freely.\n\ +Your working directory is `{working_dir}` — a sandboxed, throwaway copy of the user's \ +repository that you may inspect freely. Every file relevant to this check lives under \ +that path: do not read or search outside it. Tool calls that take an optional `path` \ +default to it when omitted.\n\ +{retry_note}\ \n\ {reporting}\n\ \n\ --- CHECK: {title} ---\n\ {prompt}\n", + working_dir = working_dir.display(), title = check.title, prompt = check.prompt, ) @@ -124,16 +156,50 @@ mod tests { assert_obj_safe!(CheckExecutor); - #[test] - fn instructions_embed_prompt_and_demand_single_report() { - let check = Check { + fn check() -> Check { + Check { title: "No yellow".into(), prompt: "scan for yellow text".into(), - }; - let text = assemble_instructions(&check, &judge_tool_directive()); + } + } + + #[test] + fn instructions_embed_prompt_and_demand_single_report() { + let text = assemble_instructions( + &check(), + &judge_tool_directive(), + Path::new("/tmp/sandbox-copy"), + 1, + ); assert!(text.contains("scan for yellow text")); assert!(text.contains(JUDGE_TOOL)); assert!(text.contains("EXACTLY ONCE")); assert!(text.contains("No yellow")); } + + #[test] + fn instructions_state_the_working_directory_path() { + let text = assemble_instructions( + &check(), + &judge_tool_directive(), + Path::new("/tmp/sandbox-copy"), + 1, + ); + assert!(text.contains("`/tmp/sandbox-copy`")); + assert!(text.contains("do not read or search outside it")); + // First attempts carry no retry note. + assert!(!text.contains("previous attempt")); + } + + #[test] + fn retries_carry_a_note_about_the_unreported_attempt() { + let text = assemble_instructions( + &check(), + &judge_tool_directive(), + Path::new("/tmp/sandbox-copy"), + 2, + ); + assert!(text.contains("attempt 2")); + assert!(text.contains("previous attempt finished without reporting")); + } } diff --git a/src/checks/executor/trace.rs b/src/checks/executor/trace.rs index 2590a2f..4224c01 100644 --- a/src/checks/executor/trace.rs +++ b/src/checks/executor/trace.rs @@ -241,12 +241,8 @@ impl TraceRecorder { reason: reason.clone(), }, ), - AgentEvent::Status(s) => { - c.push(elapsed_ms, TraceRecord::Status { message: s.clone() }) - } - AgentEvent::Error(s) => { - c.push(elapsed_ms, TraceRecord::Error { message: s.clone() }) - } + AgentEvent::Status(s) => c.push(elapsed_ms, TraceRecord::Status { message: s.clone() }), + AgentEvent::Error(s) => c.push(elapsed_ms, TraceRecord::Error { message: s.clone() }), AgentEvent::Complete(output) => c.push( elapsed_ms, TraceRecord::Complete { @@ -318,8 +314,9 @@ impl StampedRecord { /// record's own fields. Built via `to_value` + insert rather than /// `#[serde(flatten)]` to sidestep flatten's internally-tagged-enum quirks. fn to_json_line(&self) -> String { - let mut value = serde_json::to_value(&self.record) - .unwrap_or_else(|e| json!({ "event": "trace_serialize_error", "detail": e.to_string() })); + let mut value = serde_json::to_value(&self.record).unwrap_or_else( + |e| json!({ "event": "trace_serialize_error", "detail": e.to_string() }), + ); if let Value::Object(map) = &mut value { map.insert("elapsed_ms".to_string(), Value::from(self.elapsed_ms)); } @@ -334,9 +331,17 @@ impl StampedRecord { #[derive(Serialize)] #[serde(tag = "event", rename_all = "snake_case")] enum TraceRecord { - Text { text: String }, - Thinking { text: String }, - ToolStart { name: String, id: String, input: Value }, + Text { + text: String, + }, + Thinking { + text: String, + }, + ToolStart { + name: String, + id: String, + input: Value, + }, ToolEnd { name: String, id: String, @@ -344,30 +349,83 @@ enum TraceRecord { duration_ms: u64, result: String, }, - ToolPermissionCheck { name: String, id: String, level: String }, - PermissionRequired { detail: String }, - TurnStart { turn: u32 }, - TurnComplete { turn: u32, stop_reason: StopReason, usage: Usage }, - ModelRequestStart { turn: u32, message_count: usize, token_estimate: u64 }, - ModelResponseStart { turn: u32, model: String }, - TokenWarning { pct_used: f64, state: String }, - CompactStart { reason: String, messages_before: usize }, - CompactEnd { messages_after: usize, tokens_freed: u64 }, - SessionLoaded { session_id: String, message_count: usize }, - SessionSaved { session_id: String }, + ToolPermissionCheck { + name: String, + id: String, + level: String, + }, + PermissionRequired { + detail: String, + }, + TurnStart { + turn: u32, + }, + TurnComplete { + turn: u32, + stop_reason: StopReason, + usage: Usage, + }, + ModelRequestStart { + turn: u32, + message_count: usize, + token_estimate: u64, + }, + ModelResponseStart { + turn: u32, + model: String, + }, + TokenWarning { + pct_used: f64, + state: String, + }, + CompactStart { + reason: String, + messages_before: usize, + }, + CompactEnd { + messages_after: usize, + tokens_freed: u64, + }, + SessionLoaded { + session_id: String, + message_count: usize, + }, + SessionSaved { + session_id: String, + }, CostUpdate { turn_cost: f64, cumulative_cost: f64, input_tokens: u64, output_tokens: u64, }, - SubAgentSpawned { agent_id: String, prompt: String }, - SubAgentComplete { agent_id: String, turns: u32 }, - HookFired { hook_event: String, hook_name: String }, - HookBlocked { hook_event: String, hook_name: String, reason: String }, - Status { message: String }, - Error { message: String }, - Complete { turns: u32, stop_reason: StopReason }, + SubAgentSpawned { + agent_id: String, + prompt: String, + }, + SubAgentComplete { + agent_id: String, + turns: u32, + }, + HookFired { + hook_event: String, + hook_name: String, + }, + HookBlocked { + hook_event: String, + hook_name: String, + reason: String, + }, + Status { + message: String, + }, + Error { + message: String, + }, + Complete { + turns: u32, + stop_reason: StopReason, + }, } /// Truncate a tool result to [`MAX_TOOL_RESULT_BYTES`] on a char boundary, @@ -427,9 +485,10 @@ pub(super) fn serialize_trace( out.push('\n'); } - let verdict = outcome.verdict.as_ref().map(|v| { - json!({ "success": v.success, "evidence": v.evidence }) - }); + let verdict = outcome + .verdict + .as_ref() + .map(|v| json!({ "success": v.success, "evidence": v.evidence })); let footer_line = json!({ "event": "outcome", "verdict": verdict, diff --git a/src/checks/mod.rs b/src/checks/mod.rs index a4ab366..554729e 100644 --- a/src/checks/mod.rs +++ b/src/checks/mod.rs @@ -295,8 +295,14 @@ async fn run_pipeline( backend: Box, trace_collector: Option>, ) -> Result> { - let (execution, reporting, presenter, rx) = - spawn_core(cfg, executor, sandbox, working_dir, backend, trace_collector); + let (execution, reporting, presenter, rx) = spawn_core( + cfg, + executor, + sandbox, + working_dir, + backend, + trace_collector, + ); let discovery = DiscoveryActor::spawn(DiscoveryActor::new( working_dir.to_path_buf(), execution.clone(), diff --git a/src/checks/trace_archive.rs b/src/checks/trace_archive.rs index 446ac88..47cb0d8 100644 --- a/src/checks/trace_archive.rs +++ b/src/checks/trace_archive.rs @@ -170,7 +170,13 @@ pub(crate) fn write_archive(collector: &TraceCollector, path: &std::path::Path) mod tests { use super::*; - fn entry(req_index: usize, req_title: &str, check_id: CheckId, check_title: &str, attempt: usize) -> TraceEntry { + fn entry( + req_index: usize, + req_title: &str, + check_id: CheckId, + check_title: &str, + attempt: usize, + ) -> TraceEntry { TraceEntry { req_index, req_title: req_title.to_string(), @@ -192,8 +198,8 @@ mod tests { #[test] fn archive_round_trips_expected_paths() { - use std::io::Read; use flate2::read::GzDecoder; + use std::io::Read; let collector = TraceCollector::new(); // Requirement 0 has one check run twice (attempt 1 then 2).