From 15d4266ed2d903b0c0bfbef23bfb175434f0fa5e Mon Sep 17 00:00:00 2001 From: Robbie McKinstry Date: Wed, 1 Jul 2026 18:07:04 -0400 Subject: [PATCH] feat(check): opt-in session-trace archive for `multi check` MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add `--trace-archive ` (off by default; also `MULTI_CHECKS_TRACE_ARCHIVE` / `[checks].trace_archive`) to capture every check agent session and bundle them into a single in-memory `.tar.gz` for debugging failed runs. Captures all executions, not just failures. Capture is via `Agent::on_event`, not cersei's `.memory()` session persistence: the executor cancels its agent the instant a verdict lands and wraps the run in a drop-on-timeout, and cersei only flushes its transcript on the agentic loop's normal exit — so a memory-backed transcript would miss exactly the cancelled/timed-out/errored runs this feature exists to debug. `emit()` fires the event handler for every event before those early returns, so an event-sourced trace survives them. - executor: a coalescing `TraceRecorder` (text/thinking deltas joined into per-turn blocks, tool results bounded) serializes each execution into a self-contained NDJSON doc — header, records, outcome footer — returned in `AgentOutcome::trace_jsonl`. - execution actor: pushes each attempt (retries included) into a shared `TraceCollector` before signalling completion downstream, so retry traces are captured and collection is race-free with run finalization. - run: builds the `.tar.gz` with pure-Rust `tar` + `flate2` (no temp dir, no subprocess), laid out as `{NN}-{requirement}/{check}.attempt-{k}.jsonl` — one directory per requirement, one file per check execution. Trace archiving is best-effort: a failure logs but never fails an otherwise successful check run. Co-Authored-By: Claude Opus 4.8 (1M context) --- BUGS.md | 22 ++ Cargo.lock | 23 ++ Cargo.toml | 5 + src/checks/config/mod.rs | 13 +- src/checks/config/schema.rs | 12 + src/checks/e2e.rs | 2 + src/checks/execution.rs | 30 +- src/checks/executor/cersei.rs | 45 ++- src/checks/executor/claude.rs | 2 + src/checks/executor/fake.rs | 3 + src/checks/executor/mod.rs | 7 + src/checks/executor/trace.rs | 625 ++++++++++++++++++++++++++++++++++ src/checks/mod.rs | 72 +++- src/checks/trace_archive.rs | 238 +++++++++++++ src/config/check/mod.rs | 8 + 15 files changed, 1099 insertions(+), 8 deletions(-) create mode 100644 src/checks/executor/trace.rs create mode 100644 src/checks/trace_archive.rs diff --git a/BUGS.md b/BUGS.md index 423bd33..67fad6b 100644 --- a/BUGS.md +++ b/BUGS.md @@ -2,6 +2,15 @@ These are bugs (or missing features) I've observed while working with `multi checks`. +- [ ] Output is now hanging. I suspect this is recent (within the last few commits) and it started +happening after implement the changes to the `Presenter` actor to fix writing text off-screen without wrapping. + +- [ ] No limit on max turns. + +- [ ] Remove the `Claude -p` executor. + +- [ ] Logs no longer report the id of the check that failed (or the number of attempted retries) + - [ ] No use of Cersei workflows to chain multiple prompts together. - [ ] No support for Fireworks AI. @@ -22,6 +31,19 @@ These are bugs (or missing features) I've observed while working with `multi che - CERSEI: `append_system_prompt()` function is dead unless routed through the separate build_system_prompt() composer. +- [ ] `Ctrl-C` (shutdown signals) needs to be handled gracefully and cross-platform. +The `multi check` presenter installs a raw `libc::signal(SIGINT, …)` handler +(`install_terminal_guards`, src/checks/presenter/inline.rs:224) that is Unix-only +(won't build/run on Windows) and hard-`_exit(130)`s: it restores the cursor but +skips graceful teardown, so in-flight agent sessions, the MCP result server, and +spawned Cersei subprocesses are killed without cleanup and no partial +results/traces get flushed. Contrast `multi run`, which already does this +correctly and cross-platform via `tokio_graceful_shutdown::Toplevel::catch_signals()` ++ `handle_shutdown_requests()` (src/cmd/run/canary_mode.rs:63). `check` should adopt +the same graceful-shutdown path (or an equivalent `tokio::signal::ctrl_c` + coordinated +cancellation covering SIGINT/SIGTERM and Windows Ctrl-C/Ctrl-Break) while still +guaranteeing the terminal is restored on the way out. + ## Fixes - [x] No loading of CLAUDE.md files diff --git a/Cargo.lock b/Cargo.lock index 67c8cc0..feacf67 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3468,6 +3468,7 @@ dependencies = [ "directories", "failsafe", "figment", + "flate2", "futures-core", "futures-util", "ignore", @@ -3489,6 +3490,7 @@ dependencies = [ "serde_json5", "serde_with", "static_assertions", + "tar", "tempfile", "textwrap", "thiserror 2.0.18", @@ -5868,6 +5870,17 @@ dependencies = [ "serde", ] +[[package]] +name = "tar" +version = "0.4.46" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3f6221d9a6003c78398e3b239969f352578258df48c8eb051caadae0015bc840" +dependencies = [ + "filetime", + "libc", + "xattr", +] + [[package]] name = "tempfile" version = "3.27.0" @@ -7165,6 +7178,16 @@ version = "0.6.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1ffae5123b2d3fc086436f8834ae3ab053a283cfac8fe0a0b8eaae044768a4c4" +[[package]] +name = "xattr" +version = "1.6.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "32e45ad4206f6d2479085147f02bc2ef834ac85886624a23575ae137c8aa8156" +dependencies = [ + "libc", + "rustix 1.1.4", +] + [[package]] name = "xmlparser" version = "0.13.6" diff --git a/Cargo.toml b/Cargo.toml index f6abe84..9cc1ea0 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -38,6 +38,9 @@ derive-getters = "0.5.0" dialoguer = "0.11.0" directories = "6.0" figment = { version = "0.10", default-features = false, features = ["env"] } +# Pure-Rust gzip (miniz_oxide backend, the default). Used with `tar` to build the +# opt-in `multi check --trace-archive` bundle entirely in memory. +flate2 = "1.0" failsafe = "1.3.0" futures-core = "0.3.31" futures-util = "0.3.31" @@ -60,6 +63,8 @@ serde = { version = "1.0", features = ["derive"] } serde_json = "1.0" serde_json5 = "0.2.1" serde_with = { version = "3.12", features = ["chrono"] } +# Pure-Rust tar writer for the in-memory `multi check --trace-archive` bundle. +tar = "0.4" thiserror.workspace = true tokio = { workspace = true, features = ["full"] } tokio-graceful-shutdown = "0.16.0" diff --git a/src/checks/config/mod.rs b/src/checks/config/mod.rs index cea1930..9b9aded 100644 --- a/src/checks/config/mod.rs +++ b/src/checks/config/mod.rs @@ -18,6 +18,7 @@ mod providers; mod schema; use std::num::NonZeroUsize; +use std::path::PathBuf; use std::time::Duration; use figment::{ @@ -74,6 +75,9 @@ pub struct Config { /// calling the judge tool; a fresh attempt usually succeeds. A check only /// resolves as errored after all attempts are exhausted. pub max_attempts: usize, + /// Where to bundle the opt-in session-trace archive, or `None` (default) to + /// disable trace capture. See [`crate::checks::trace_archive`]. + pub trace_archive: Option, } impl Config { @@ -135,6 +139,9 @@ impl Resolved { cfg.model.clone(), cfg.effort, cfg.agent_timeout, + // The archive path lives at the orchestration layer; the executor + // only needs to know whether to capture a per-execution trace. + cfg.trace_archive.is_some(), )), ExecutorKind::Claude => cfg.build_claude_executor(), }; @@ -198,6 +205,7 @@ pub fn load(overrides: CliOverrides) -> Result { concurrency, agent_timeout: DEFAULT_AGENT_TIMEOUT, max_attempts: DEFAULT_MAX_ATTEMPTS, + trace_archive: checks.trace_archive, }; Ok(Resolved { @@ -224,6 +232,7 @@ pub fn configuration() -> Config { concurrency: default_concurrency(), agent_timeout: DEFAULT_AGENT_TIMEOUT, max_attempts: DEFAULT_MAX_ATTEMPTS, + trace_archive: None, } } @@ -245,6 +254,7 @@ mod tests { effort: Some(Effort::Low), executor: None, concurrency: None, + trace_archive: None, providers: ProvidersSection::default(), }, } @@ -273,6 +283,7 @@ mod tests { None, None, None, + None, ); let checks = resolve_layers(file, overrides).unwrap(); assert_eq!(checks.provider, Some(ProviderKind::OpenAi)); @@ -307,7 +318,7 @@ mod tests { // ...and a flag outranks env. let overrides = - CliOverrides::new(None, Some("claude-opus-4-8".into()), None, None, None); + CliOverrides::new(None, Some("claude-opus-4-8".into()), None, None, None, None); let checks = resolve_layers(file, overrides).unwrap(); assert_eq!(checks.model.as_deref(), Some("claude-opus-4-8")); Ok(()) diff --git a/src/checks/config/schema.rs b/src/checks/config/schema.rs index 73acafc..7a67bfb 100644 --- a/src/checks/config/schema.rs +++ b/src/checks/config/schema.rs @@ -5,6 +5,8 @@ //! overrides, so figment can merge them by key path with `flag > env > file` //! precedence. See [`super::load`]. +use std::path::PathBuf; + use clap::ValueEnum; use serde::{Deserialize, Serialize}; @@ -81,6 +83,12 @@ pub struct ChecksSection { /// available CPU cores). #[serde(default, skip_serializing_if = "Option::is_none")] pub concurrency: Option, + /// Where to write the opt-in session-trace archive. When set, every check + /// execution's agent session is captured and bundled into this `.tar.gz` + /// (see [`crate::checks::trace_archive`]); unset (the default) disables + /// capture entirely. + #[serde(default, skip_serializing_if = "Option::is_none")] + pub trace_archive: Option, /// Optional, non-secret per-provider base-URL overrides. #[serde(default)] pub providers: ProvidersSection, @@ -140,6 +148,8 @@ pub struct CliChecksOverrides { pub executor: Option, #[serde(skip_serializing_if = "Option::is_none")] pub concurrency: Option, + #[serde(skip_serializing_if = "Option::is_none")] + pub trace_archive: Option, } impl CliOverrides { @@ -150,6 +160,7 @@ impl CliOverrides { effort: Option, executor: Option, concurrency: Option, + trace_archive: Option, ) -> Self { Self { checks: CliChecksOverrides { @@ -158,6 +169,7 @@ impl CliOverrides { effort, executor, concurrency, + trace_archive, }, } } diff --git a/src/checks/e2e.rs b/src/checks/e2e.rs index 7d66aff..acd3b2b 100644 --- a/src/checks/e2e.rs +++ b/src/checks/e2e.rs @@ -201,6 +201,7 @@ impl CheckExecutor for InterleavingExecutor { stop_reason: Some("interleaving probe".into()), turns: 1, error: None, + trace_jsonl: None, }) } } @@ -267,6 +268,7 @@ async fn invalid_suite_aborts_run_without_spawning_agents() { Arc::new(NoopSandbox), dir.path(), null_backend(), + None, ) .await; diff --git a/src/checks/execution.rs b/src/checks/execution.rs index af84c91..0fc424c 100644 --- a/src/checks/execution.rs +++ b/src/checks/execution.rs @@ -31,6 +31,7 @@ use crate::checks::model::{Check, CheckId, CheckOutcome, Verdict}; use crate::checks::presenter::{PresenterActor, UiEvent}; use crate::checks::reporting::ReportingActor; use crate::checks::sandbox::Sandbox; +use crate::checks::trace_archive::{TraceCollector, TraceEntry}; /// The execution actor: turns a stream of discovered checks into a stream of /// completed checks, fanning agent runs out onto bounded background tasks. @@ -46,6 +47,11 @@ pub(crate) struct ExecutionActor { reporting: ActorRef, /// The display-only presenter, told of each check's lifecycle milestones. presenter: ActorRef, + /// Opt-in sink for per-execution session traces (`multi check + /// --trace-archive`). `None` disables capture. Shared across the spawned + /// per-attempt tasks; every attempt (including retries) pushes its trace + /// here before signalling completion downstream. + trace_collector: Option>, } impl Actor for ExecutionActor { @@ -62,6 +68,7 @@ impl Actor for ExecutionActor { impl ExecutionActor { /// Build the actor. `concurrency` and `max_attempts` are clamped to ≥1. + #[allow(clippy::too_many_arguments)] pub(crate) fn new( executor: Arc, sandbox: Arc, @@ -70,6 +77,7 @@ impl ExecutionActor { max_attempts: usize, reporting: ActorRef, presenter: ActorRef, + trace_collector: Option>, ) -> Self { Self { executor, @@ -79,6 +87,7 @@ impl ExecutionActor { max_attempts: max_attempts.max(1), reporting, presenter, + trace_collector, } } @@ -92,6 +101,7 @@ impl ExecutionActor { let working_dir = self.working_dir.clone(); let reporting = self.reporting.clone(); let presenter = self.presenter.clone(); + let trace_collector = self.trace_collector.clone(); let me = ctx.actor_ref().clone(); let id = job.id; @@ -105,7 +115,25 @@ impl ExecutionActor { // Permit acquired ⇒ the agent is about to run: mark the check Running. let _ = presenter.tell(UiEvent::CheckStarted { id }).await; - let result = run_one(executor, sandbox, job.id, job.check.clone(), &working_dir).await; + let mut result = + run_one(executor, sandbox, job.id, job.check.clone(), &working_dir).await; + + // Harvest this attempt's trace *before* signalling completion, so it + // is collected even for retried attempts (whose outcome never reaches + // reporting) and is race-free with run finalization (the collector is + // populated before the `tell` that could trigger it). + if let Some(collector) = &trace_collector + && let Some(bytes) = result.as_mut().ok().and_then(|o| o.trace_jsonl.take()) + { + collector.push(TraceEntry { + req_index: job.req_index, + req_title: job.req_title.clone(), + check_id: job.id, + check_title: job.check.title.clone(), + attempt, + bytes, + }); + } if has_verdict(Some(&result)) { // The agent reported: reconcile, surface the verdict to the diff --git a/src/checks/executor/cersei.rs b/src/checks/executor/cersei.rs index 8e9a4db..5672858 100644 --- a/src/checks/executor/cersei.rs +++ b/src/checks/executor/cersei.rs @@ -3,6 +3,7 @@ //! per-check judge tool — no `claude -p` subprocess, no MCP endpoints. use std::path::Path; +use std::sync::Arc; use std::time::Duration; use async_trait::async_trait; @@ -15,6 +16,7 @@ use miette::{Result, miette}; use tokio_util::sync::CancellationToken; use super::judge::{JudgeTool, VerdictSink}; +use super::trace::{TraceHeader, TraceRecorder, serialize_trace}; use super::{ AgentOutcome, AgentRunRequest, CheckExecutor, assemble_instructions, judge_tool_directive, }; @@ -42,15 +44,25 @@ pub struct CerseiExecutor { /// Per-agent wall-clock timeout; on expiry the run is dropped (which stops /// the in-process agent) and the check resolves as errored. timeout: Duration, + /// Whether to capture each execution's agent session as a trace (returned in + /// [`AgentOutcome::trace_jsonl`]). Driven by `multi check --trace-archive`. + capture_traces: bool, } impl CerseiExecutor { - pub fn new(factory: ProviderFactory, model: String, effort: Effort, timeout: Duration) -> Self { + pub fn new( + factory: ProviderFactory, + model: String, + effort: Effort, + timeout: Duration, + capture_traces: bool, + ) -> Self { Self { factory, model, effort, timeout, + capture_traces, } } } @@ -151,6 +163,17 @@ impl CheckExecutor for CerseiExecutor { agent_builder = agent_builder.system_prompt(project_prompt); } + // When trace capture is on, tee every agent event into a recorder. + // `emit` invokes this synchronously for each event *before* the loop's + // early returns, so the trace survives post-verdict cancellation and the + // drop-on-timeout below (which cersei's own session persistence would + // miss). The executor owns a clone, so a dropped agent doesn't lose it. + let recorder = self.capture_traces.then(|| Arc::new(TraceRecorder::new())); + if let Some(recorder) = &recorder { + let recorder = Arc::clone(recorder); + agent_builder = agent_builder.on_event(move |event| recorder.record(event)); + } + let agent = agent_builder .build() .map_err(|e| miette!("building check agent: {e}"))?; @@ -166,12 +189,13 @@ impl CheckExecutor for CerseiExecutor { // which surfaces as `CerseiError::Cancelled`). let verdict = sink.verdict(); - let outcome = match result { + let mut outcome = match result { Ok(Ok(output)) => AgentOutcome { verdict, stop_reason: Some(format!("{:?}", output.stop_reason)), turns: output.turns, error: None, + trace_jsonl: None, }, Ok(Err(err)) => { let reported = verdict.is_some(); @@ -182,6 +206,7 @@ impl CheckExecutor for CerseiExecutor { .then(|| "cancelled".to_string()), turns: 0, error: (!reported).then(|| err.to_string()), + trace_jsonl: None, } } Err(_elapsed) => AgentOutcome { @@ -190,9 +215,25 @@ impl CheckExecutor for CerseiExecutor { stop_reason: None, turns: 0, error: Some(format!("agent timed out after {:?}", self.timeout)), + trace_jsonl: None, }, }; + // Render the captured session (if any) now that the outcome is known, so + // the trace's footer carries the authoritative verdict/stop/error. + if let Some(recorder) = &recorder { + let header = TraceHeader { + check_id: req.check_id, + check_title: &req.check.title, + model: &self.model, + effort: self.effort, + working_dir: &req.working_dir, + session_id: &session_id, + }; + let bytes = serialize_trace(recorder, &header, &outcome); + outcome.trace_jsonl = Some(bytes); + } + Ok(outcome) } } diff --git a/src/checks/executor/claude.rs b/src/checks/executor/claude.rs index 39fdbfa..c32c042 100644 --- a/src/checks/executor/claude.rs +++ b/src/checks/executor/claude.rs @@ -138,6 +138,7 @@ impl CheckExecutor for ClaudeExecutor { stop_reason: Some(format!("exit {:?}", output.status.code())), turns: 0, error, + trace_jsonl: None, }) } Err(_elapsed) => { @@ -148,6 +149,7 @@ impl CheckExecutor for ClaudeExecutor { stop_reason: None, turns: 0, error: Some(format!("agent timed out after {:?}", self.timeout)), + trace_jsonl: None, }) } } diff --git a/src/checks/executor/fake.rs b/src/checks/executor/fake.rs index 7eaa68f..570dec8 100644 --- a/src/checks/executor/fake.rs +++ b/src/checks/executor/fake.rs @@ -91,6 +91,7 @@ impl CheckExecutor for FakeExecutor { stop_reason: Some("fake: finished without reporting".into()), turns: 1, error: None, + trace_jsonl: None, }); } @@ -102,6 +103,7 @@ impl CheckExecutor for FakeExecutor { stop_reason: Some("fake: silent-until".into()), turns: 1, error: None, + trace_jsonl: None, }); } @@ -110,6 +112,7 @@ impl CheckExecutor for FakeExecutor { stop_reason: Some("fake: reported".into()), turns: 1, error: None, + trace_jsonl: None, }) } } diff --git a/src/checks/executor/mod.rs b/src/checks/executor/mod.rs index 8468caf..b14918c 100644 --- a/src/checks/executor/mod.rs +++ b/src/checks/executor/mod.rs @@ -14,6 +14,7 @@ pub mod claude; #[cfg(test)] mod fake; pub mod judge; +mod trace; use std::path::PathBuf; @@ -58,6 +59,12 @@ pub struct AgentOutcome { /// An execution-level error distinct from a check merely *failing* (stream /// error, agent-build error, or timeout). `None` on a clean finish. pub error: Option, + /// The self-contained NDJSON session trace for this one execution, when + /// trace capture is enabled (`multi check --trace-archive`). `None` when + /// capture is off and for executors that don't produce traces (the `claude` + /// fallback and the test fake). The execution layer moves these into the + /// per-run [`crate::checks::trace_archive`] bundle. + pub trace_jsonl: Option>, } impl AgentOutcome { diff --git a/src/checks/executor/trace.rs b/src/checks/executor/trace.rs new file mode 100644 index 0000000..4224c01 --- /dev/null +++ b/src/checks/executor/trace.rs @@ -0,0 +1,625 @@ +//! Opt-in session-trace capture for the in-process cersei executor. +//! +//! When `multi check --trace-archive` is set, each check execution attaches a +//! [`TraceRecorder`] as the agent's `on_event` sink. The recorder coalesces the +//! streamed [`AgentEvent`]s into a compact ordered list of [`TraceRecord`]s +//! (token-level text/thinking deltas are joined into per-turn blocks), and once +//! the run resolves [`serialize_trace`] renders them as a self-contained NDJSON +//! document: a `header` line, the records, then an `outcome` footer. Those bytes +//! travel back in [`AgentOutcome::trace_jsonl`] and are bundled by +//! [`crate::checks::trace_archive`]. +//! +//! ## Why `on_event`, not cersei's `.memory()` session persistence +//! +//! cersei can persist a Claude-Code-compatible transcript via a `Memory` +//! backend, but it flushes `memory.store()` only on the agentic loop's *normal* +//! exit. This executor cancels its agent the instant a verdict lands and wraps +//! the run in a drop-on-timeout, so the store call is skipped for exactly the +//! cancelled / timed-out / errored runs this feature exists to debug. `emit()` +//! fires the `on_event` handler for every event *before* those early returns, so +//! an event-sourced trace survives them. (Two events — `ModelRequestStart` / +//! `ModelResponseStart` — are emitted only on the streaming channel, which the +//! blocking `run()` path drops, so they never reach us; everything else does.) + +use std::path::Path; +use std::sync::Mutex; +use std::time::Instant; + +use cersei_agent::events::AgentEvent; +use cersei_types::{StopReason, Usage}; +use chrono::{DateTime, Utc}; +use serde::Serialize; +use serde_json::{Value, json}; + +use super::AgentOutcome; +use crate::checks::config::Effort; + +/// Cap on a single captured tool result. File reads and greps can be enormous; +/// past this we truncate (with a marker) to keep the archive bounded. +const MAX_TOOL_RESULT_BYTES: usize = 16 * 1024; + +/// The on-disk trace format version, stamped in the header so a reader can adapt. +const TRACE_FORMAT_VERSION: u32 = 1; + +/// A recorder attached to one agent run via `Agent::on_event`. Interior-mutable +/// so it can live behind an `Arc` shared with the (synchronous) event callback. +pub(super) struct TraceRecorder { + inner: Mutex, + /// Monotonic start, for the relative `elapsed_ms` stamp on each record. + started: Instant, + /// Wall-clock start, recorded in the trace header. + started_at: DateTime, +} + +impl TraceRecorder { + pub(super) fn new() -> Self { + Self { + inner: Mutex::new(Coalescer::default()), + started: Instant::now(), + started_at: Utc::now(), + } + } + + /// Record one event. Called synchronously from `Agent::emit` for every event + /// on the run, including on the cancel/timeout/error paths. + pub(super) fn record(&self, event: &AgentEvent) { + let elapsed_ms = self.started.elapsed().as_millis() as u64; + // A poisoned lock only means a prior panic while recording; recover the + // guard and keep capturing rather than panicking the agent's event path. + let mut c = self.inner.lock().unwrap_or_else(|e| e.into_inner()); + match event { + AgentEvent::TextDelta(t) => { + c.flush_thinking(elapsed_ms); + c.pending_text.push_str(t); + } + AgentEvent::ThinkingDelta(t) => { + c.flush_text(elapsed_ms); + c.pending_thinking.push_str(t); + } + AgentEvent::ToolStart { name, id, input } => c.push( + elapsed_ms, + TraceRecord::ToolStart { + name: name.clone(), + id: id.clone(), + input: input.clone(), + }, + ), + AgentEvent::ToolEnd { + name, + id, + result, + is_error, + duration, + .. + } => c.push( + elapsed_ms, + TraceRecord::ToolEnd { + name: name.clone(), + id: id.clone(), + is_error: *is_error, + duration_ms: duration.as_millis() as u64, + result: truncate_result(result), + }, + ), + AgentEvent::ToolPermissionCheck { name, id, level } => c.push( + elapsed_ms, + TraceRecord::ToolPermissionCheck { + name: name.clone(), + id: id.clone(), + level: format!("{level:?}"), + }, + ), + AgentEvent::PermissionRequired(req) => c.push( + elapsed_ms, + TraceRecord::PermissionRequired { + detail: format!("{req:?}"), + }, + ), + AgentEvent::TurnStart { turn } => { + c.push(elapsed_ms, TraceRecord::TurnStart { turn: *turn }) + } + AgentEvent::TurnComplete { + turn, + stop_reason, + usage, + } => c.push( + elapsed_ms, + TraceRecord::TurnComplete { + turn: *turn, + stop_reason: stop_reason.clone(), + usage: usage.clone(), + }, + ), + // Emitted only on the streaming channel, which the blocking `run()` + // path drops — recorded here for completeness should that change. + AgentEvent::ModelRequestStart { + turn, + message_count, + token_estimate, + } => c.push( + elapsed_ms, + TraceRecord::ModelRequestStart { + turn: *turn, + message_count: *message_count, + token_estimate: *token_estimate, + }, + ), + AgentEvent::ModelResponseStart { turn, model } => c.push( + elapsed_ms, + TraceRecord::ModelResponseStart { + turn: *turn, + model: model.clone(), + }, + ), + AgentEvent::TokenWarning { pct_used, state } => c.push( + elapsed_ms, + TraceRecord::TokenWarning { + pct_used: *pct_used, + state: format!("{state:?}"), + }, + ), + AgentEvent::CompactStart { + reason, + messages_before, + } => c.push( + elapsed_ms, + TraceRecord::CompactStart { + reason: format!("{reason:?}"), + messages_before: *messages_before, + }, + ), + AgentEvent::CompactEnd { + messages_after, + tokens_freed, + } => c.push( + elapsed_ms, + TraceRecord::CompactEnd { + messages_after: *messages_after, + tokens_freed: *tokens_freed, + }, + ), + AgentEvent::SessionLoaded { + session_id, + message_count, + } => c.push( + elapsed_ms, + TraceRecord::SessionLoaded { + session_id: session_id.clone(), + message_count: *message_count, + }, + ), + AgentEvent::SessionSaved { session_id } => c.push( + elapsed_ms, + TraceRecord::SessionSaved { + session_id: session_id.clone(), + }, + ), + AgentEvent::CostUpdate { + turn_cost, + cumulative_cost, + input_tokens, + output_tokens, + } => c.push( + elapsed_ms, + TraceRecord::CostUpdate { + turn_cost: *turn_cost, + cumulative_cost: *cumulative_cost, + input_tokens: *input_tokens, + output_tokens: *output_tokens, + }, + ), + AgentEvent::SubAgentSpawned { agent_id, prompt } => c.push( + elapsed_ms, + TraceRecord::SubAgentSpawned { + agent_id: agent_id.clone(), + prompt: prompt.clone(), + }, + ), + AgentEvent::SubAgentComplete { agent_id, result } => c.push( + elapsed_ms, + TraceRecord::SubAgentComplete { + agent_id: agent_id.clone(), + turns: result.turns, + }, + ), + AgentEvent::HookFired { event, hook_name } => c.push( + elapsed_ms, + TraceRecord::HookFired { + hook_event: format!("{event:?}"), + hook_name: hook_name.clone(), + }, + ), + AgentEvent::HookBlocked { + event, + hook_name, + reason, + } => c.push( + elapsed_ms, + TraceRecord::HookBlocked { + hook_event: format!("{event:?}"), + hook_name: hook_name.clone(), + reason: reason.clone(), + }, + ), + AgentEvent::Status(s) => c.push(elapsed_ms, TraceRecord::Status { message: s.clone() }), + AgentEvent::Error(s) => c.push(elapsed_ms, TraceRecord::Error { message: s.clone() }), + AgentEvent::Complete(output) => c.push( + elapsed_ms, + TraceRecord::Complete { + turns: output.turns, + stop_reason: output.stop_reason.clone(), + }, + ), + } + } + + /// Drain the coalesced records, flushing any pending delta buffers first. + fn drain(&self) -> Vec { + let elapsed_ms = self.started.elapsed().as_millis() as u64; + let mut c = self.inner.lock().unwrap_or_else(|e| e.into_inner()); + c.flush_deltas(elapsed_ms); + std::mem::take(&mut c.records) + } +} + +/// Accumulates records, coalescing contiguous runs of text/thinking deltas. +#[derive(Default)] +struct Coalescer { + records: Vec, + pending_text: String, + pending_thinking: String, +} + +impl Coalescer { + fn flush_thinking(&mut self, elapsed_ms: u64) { + if !self.pending_thinking.is_empty() { + let text = std::mem::take(&mut self.pending_thinking); + self.records.push(StampedRecord { + elapsed_ms, + record: TraceRecord::Thinking { text }, + }); + } + } + + fn flush_text(&mut self, elapsed_ms: u64) { + if !self.pending_text.is_empty() { + let text = std::mem::take(&mut self.pending_text); + self.records.push(StampedRecord { + elapsed_ms, + record: TraceRecord::Text { text }, + }); + } + } + + fn flush_deltas(&mut self, elapsed_ms: u64) { + self.flush_thinking(elapsed_ms); + self.flush_text(elapsed_ms); + } + + /// Flush any pending deltas (to preserve ordering), then push `record`. + fn push(&mut self, elapsed_ms: u64, record: TraceRecord) { + self.flush_deltas(elapsed_ms); + self.records.push(StampedRecord { elapsed_ms, record }); + } +} + +/// A record plus the milliseconds since the run started. +struct StampedRecord { + elapsed_ms: u64, + record: TraceRecord, +} + +impl StampedRecord { + /// Render as a single NDJSON object, splicing `elapsed_ms` alongside the + /// record's own fields. Built via `to_value` + insert rather than + /// `#[serde(flatten)]` to sidestep flatten's internally-tagged-enum quirks. + fn to_json_line(&self) -> String { + let mut value = serde_json::to_value(&self.record).unwrap_or_else( + |e| json!({ "event": "trace_serialize_error", "detail": e.to_string() }), + ); + if let Value::Object(map) = &mut value { + map.insert("elapsed_ms".to_string(), Value::from(self.elapsed_ms)); + } + value.to_string() + } +} + +/// One coalesced moment in the agent session. Serialized as an internally-tagged +/// object (`{"event": "...", ...}`); non-serializable cersei payloads (permission +/// requests, hook events, warning/compaction enums) are captured as `Debug` +/// strings, which is plenty for post-hoc debugging. +#[derive(Serialize)] +#[serde(tag = "event", rename_all = "snake_case")] +enum TraceRecord { + Text { + text: String, + }, + Thinking { + text: String, + }, + ToolStart { + name: String, + id: String, + input: Value, + }, + ToolEnd { + name: String, + id: String, + is_error: bool, + duration_ms: u64, + result: String, + }, + ToolPermissionCheck { + name: String, + id: String, + level: String, + }, + PermissionRequired { + detail: String, + }, + TurnStart { + turn: u32, + }, + TurnComplete { + turn: u32, + stop_reason: StopReason, + usage: Usage, + }, + ModelRequestStart { + turn: u32, + message_count: usize, + token_estimate: u64, + }, + ModelResponseStart { + turn: u32, + model: String, + }, + TokenWarning { + pct_used: f64, + state: String, + }, + CompactStart { + reason: String, + messages_before: usize, + }, + CompactEnd { + messages_after: usize, + tokens_freed: u64, + }, + SessionLoaded { + session_id: String, + message_count: usize, + }, + SessionSaved { + session_id: String, + }, + CostUpdate { + turn_cost: f64, + cumulative_cost: f64, + input_tokens: u64, + output_tokens: u64, + }, + SubAgentSpawned { + agent_id: String, + prompt: String, + }, + SubAgentComplete { + agent_id: String, + turns: u32, + }, + HookFired { + hook_event: String, + hook_name: String, + }, + HookBlocked { + hook_event: String, + hook_name: String, + reason: String, + }, + Status { + message: String, + }, + Error { + message: String, + }, + Complete { + turns: u32, + stop_reason: StopReason, + }, +} + +/// Truncate a tool result to [`MAX_TOOL_RESULT_BYTES`] on a char boundary, +/// appending a marker noting how many bytes were dropped. +fn truncate_result(s: &str) -> String { + if s.len() <= MAX_TOOL_RESULT_BYTES { + return s.to_string(); + } + let mut end = MAX_TOOL_RESULT_BYTES; + while end > 0 && !s.is_char_boundary(end) { + end -= 1; + } + format!("{}…[truncated {} bytes]", &s[..end], s.len() - end) +} + +/// Fields the executor knows about the execution, stamped into the trace header. +/// (Requirement grouping and the retry/attempt number are added by the archive +/// layer as the file's path, since the executor doesn't know them.) +pub(super) struct TraceHeader<'a> { + pub check_id: usize, + pub check_title: &'a str, + pub model: &'a str, + pub effort: Effort, + pub working_dir: &'a Path, + pub session_id: &'a str, +} + +/// Render the recorded session as a self-contained NDJSON document: a `header` +/// line, one line per coalesced record, then an `outcome` footer summarizing the +/// authoritative [`AgentOutcome`] (which is meaningful even when no terminal +/// `Complete`/`Error` event arrived, e.g. on a drop-on-timeout). +pub(super) fn serialize_trace( + recorder: &TraceRecorder, + header: &TraceHeader<'_>, + outcome: &AgentOutcome, +) -> Vec { + let records = recorder.drain(); + + let mut out = String::new(); + + let header_line = json!({ + "event": "header", + "trace_format": TRACE_FORMAT_VERSION, + "check_id": header.check_id, + "check_title": header.check_title, + "model": header.model, + "effort": format!("{:?}", header.effort).to_lowercase(), + "working_dir": header.working_dir.display().to_string(), + "session_id": header.session_id, + "started_at": recorder.started_at.to_rfc3339(), + }); + out.push_str(&header_line.to_string()); + out.push('\n'); + + for rec in &records { + out.push_str(&rec.to_json_line()); + out.push('\n'); + } + + let verdict = outcome + .verdict + .as_ref() + .map(|v| json!({ "success": v.success, "evidence": v.evidence })); + let footer_line = json!({ + "event": "outcome", + "verdict": verdict, + "stop_reason": outcome.stop_reason, + "turns": outcome.turns, + "error": outcome.error, + "finished_at": Utc::now().to_rfc3339(), + }); + out.push_str(&footer_line.to_string()); + out.push('\n'); + + out.into_bytes() +} + +#[cfg(test)] +mod tests { + use super::*; + use std::time::Duration; + + use crate::checks::executor::CheckReport; + + fn parse_lines(bytes: &[u8]) -> Vec { + String::from_utf8(bytes.to_vec()) + .unwrap() + .lines() + .map(|l| serde_json::from_str::(l).unwrap()) + .collect() + } + + #[test] + fn coalesces_deltas_and_frames_header_and_outcome() { + let rec = TraceRecorder::new(); + // Two contiguous text deltas must coalesce into one record. + rec.record(&AgentEvent::TextDelta("Hello ".into())); + rec.record(&AgentEvent::TextDelta("world".into())); + rec.record(&AgentEvent::ToolStart { + name: "grep_tool".into(), + id: "t1".into(), + input: json!({ "pattern": "yellow" }), + }); + rec.record(&AgentEvent::ToolEnd { + name: "grep_tool".into(), + id: "t1".into(), + result: "no matches".into(), + is_error: false, + duration: Duration::from_millis(5), + compression: None, + }); + rec.record(&AgentEvent::TurnComplete { + turn: 1, + stop_reason: StopReason::EndTurn, + usage: Usage::default(), + }); + + let outcome = AgentOutcome { + verdict: Some(CheckReport { + success: true, + evidence: Some("ok".into()), + }), + stop_reason: Some("EndTurn".into()), + turns: 1, + error: None, + trace_jsonl: None, + }; + let header = TraceHeader { + check_id: 3, + check_title: "No yellow", + model: "claude-sonnet-4-6", + effort: Effort::Low, + working_dir: Path::new("/tmp/sandbox"), + session_id: "multi-check-3", + }; + + let events = parse_lines(&serialize_trace(&rec, &header, &outcome)); + + // First line is the header, last is the authoritative outcome. + assert_eq!(events.first().unwrap()["event"], "header"); + assert_eq!(events.first().unwrap()["check_id"], 3); + assert_eq!(events.first().unwrap()["effort"], "low"); + let last = events.last().unwrap(); + assert_eq!(last["event"], "outcome"); + assert_eq!(last["verdict"]["success"], true); + assert_eq!(last["turns"], 1); + + // The two text deltas became a single coalesced record. + let texts: Vec<_> = events.iter().filter(|e| e["event"] == "text").collect(); + assert_eq!(texts.len(), 1); + assert_eq!(texts[0]["text"], "Hello world"); + + assert!(events.iter().any(|e| e["event"] == "tool_start")); + assert!( + events + .iter() + .any(|e| e["event"] == "tool_end" && e["result"] == "no matches") + ); + // Every record line (between the header and outcome frames) carries a + // relative timestamp; the frames themselves don't. + assert!( + events + .iter() + .filter(|e| e["event"] != "header" && e["event"] != "outcome") + .all(|e| e.get("elapsed_ms").is_some()) + ); + } + + #[test] + fn truncates_oversized_tool_results() { + let rec = TraceRecorder::new(); + rec.record(&AgentEvent::ToolEnd { + name: "file_read".into(), + id: "t1".into(), + result: "x".repeat(MAX_TOOL_RESULT_BYTES + 100), + is_error: false, + duration: Duration::from_millis(1), + compression: None, + }); + let outcome = AgentOutcome::default(); + let header = TraceHeader { + check_id: 0, + check_title: "t", + model: "m", + effort: Effort::Low, + working_dir: Path::new("."), + session_id: "s", + }; + + let events = parse_lines(&serialize_trace(&rec, &header, &outcome)); + let tool_end = events.iter().find(|e| e["event"] == "tool_end").unwrap(); + let result = tool_end["result"].as_str().unwrap(); + assert!(result.contains("truncated"), "expected truncation marker"); + // Bounded to the cap plus the short marker. + assert!(result.len() <= MAX_TOOL_RESULT_BYTES + 40); + } +} diff --git a/src/checks/mod.rs b/src/checks/mod.rs index b0f8a61..bfbec47 100644 --- a/src/checks/mod.rs +++ b/src/checks/mod.rs @@ -33,10 +33,12 @@ pub mod model; mod presenter; mod reporting; pub mod sandbox; +mod trace_archive; #[cfg(test)] mod e2e; +use std::io::Write; use std::path::Path; use std::sync::Arc; @@ -44,6 +46,8 @@ use kameo::actor::{ActorRef, Spawn}; use miette::{IntoDiagnostic, Result, miette}; use tokio::sync::oneshot; +use crate::checks::trace_archive::TraceCollector; + use crate::Terminal; use crate::checks::config::{CliOverrides, Config}; use crate::checks::discovery::DiscoveryActor; @@ -88,9 +92,35 @@ pub async fn run(terminal: &Terminal, working_dir: &Path, overrides: CliOverride owns_record, } = select_backend(terminal.stdout_allows_color()); + // If trace capture is enabled, create the shared collector now and hand a + // clone to the pipeline: the executor fills each execution's trace and the + // execution actor routes every attempt (retries included) here. + let trace_collector = resolved + .config + .trace_archive + .as_ref() + .map(|_| Arc::new(TraceCollector::new())); + // Phases 2–5: drive the actor pipeline (with the presenter) to its terminal // result, then render the record. - let outcomes = run_pipeline(&resolved.config, executor, sandbox, working_dir, backend).await?; + let outcomes = run_pipeline( + &resolved.config, + executor, + sandbox, + working_dir, + backend, + trace_collector.clone(), + ) + .await?; + + // Bundle the captured traces. Best-effort: a trace-archiving failure must not + // fail an otherwise-successful check run. `run_pipeline` has already torn the + // presenter down, so stderr is free for the notice. + if let (Some(collector), Some(path)) = + (&trace_collector, resolved.config.trace_archive.as_deref()) + { + write_trace_archive(collector, path); + } if owns_record { // The inline TUI was the sole terminal writer and has already flushed the @@ -117,6 +147,7 @@ fn spawn_core( sandbox: Arc, working_dir: &Path, backend: Box, + trace_collector: Option>, ) -> ( ActorRef, ActorRef, @@ -134,6 +165,7 @@ fn spawn_core( cfg.max_attempts, reporting.clone(), presenter.clone(), + trace_collector, )); (execution, reporting, presenter, rx) } @@ -202,6 +234,31 @@ async fn stream_requirements( Ok(()) } +/// Build and write the opt-in session-trace archive. Best-effort: on failure it +/// logs and returns rather than failing an otherwise-successful check run. Only +/// called when `--trace-archive` is set. +fn write_trace_archive(collector: &TraceCollector, path: &Path) { + if collector.is_empty() { + tracing::info!("trace capture enabled but no check executions ran; no archive written"); + return; + } + match trace_archive::write_archive(collector, path) { + Ok(count) => { + tracing::info!(count, path = %path.display(), "wrote check session-trace archive"); + let _ = writeln!( + std::io::stderr(), + "Wrote {count} check session trace(s) to {}", + path.display() + ); + } + Err(e) => tracing::error!( + error = %e, + path = %path.display(), + "failed to write check session-trace archive", + ), + } +} + /// Await the pipeline's terminal result, mapping a dropped channel (a dead /// reporting actor) to a diagnostic so a crashed actor fails the run rather than /// hanging it. @@ -223,9 +280,16 @@ async fn run_pipeline( sandbox: Arc, working_dir: &Path, backend: Box, + trace_collector: Option>, ) -> Result> { - let (execution, reporting, presenter, rx) = - spawn_core(cfg, executor, sandbox, working_dir, backend); + let (execution, reporting, presenter, rx) = spawn_core( + cfg, + executor, + sandbox, + working_dir, + backend, + trace_collector, + ); let discovery = DiscoveryActor::spawn(DiscoveryActor::new( working_dir.to_path_buf(), execution.clone(), @@ -262,7 +326,7 @@ async fn run_to_outcomes( backend: Box, ) -> Result> { let (execution, reporting, presenter, rx) = - spawn_core(cfg, executor, sandbox, working_dir, backend); + spawn_core(cfg, executor, sandbox, working_dir, backend, None); stream_requirements(&execution, &presenter, requirements).await?; let outcomes = await_result(rx).await; shutdown_presenter(&presenter).await; diff --git a/src/checks/trace_archive.rs b/src/checks/trace_archive.rs new file mode 100644 index 0000000..47cb0d8 --- /dev/null +++ b/src/checks/trace_archive.rs @@ -0,0 +1,238 @@ +//! In-memory assembly of the opt-in session-trace archive +//! (`multi check --trace-archive `). +//! +//! Each check *execution* — including every retry — contributes one NDJSON trace +//! produced by the executor (see [`crate::checks::executor::trace`]) and carried +//! back in [`AgentOutcome::trace_jsonl`]. The execution actor pushes each one +//! into a shared [`TraceCollector`], tagged with its requirement, check title, +//! and 1-based attempt number. When the run finishes, [`TraceCollector::into_tar_gz`] +//! lays the traces out as +//! +//! ```text +//! {NN}-{requirement-slug}/{check-slug}.attempt-{k}.jsonl +//! ``` +//! +//! — one directory per requirement, one file per (check, attempt) — and streams +//! them through a `tar` builder wrapped in a `flate2` gzip encoder, entirely in +//! memory: pure Rust, no temp directory, no `tar`/`gzip` subprocess. +//! +//! [`AgentOutcome::trace_jsonl`]: crate::checks::executor::AgentOutcome::trace_jsonl + +use std::collections::HashSet; +use std::io::Write; +use std::sync::Mutex; + +use flate2::Compression; +use flate2::write::GzEncoder; +use miette::{IntoDiagnostic, Result}; +use tar::{Builder, Header}; + +use crate::checks::model::CheckId; + +/// One captured execution trace plus the metadata that places it in the archive. +pub(crate) struct TraceEntry { + /// The requirement's declaration-order index (its archive directory). + pub req_index: usize, + /// The requirement title (slugged into the directory name). + pub req_title: String, + /// The check's run-unique id, used only to disambiguate a same-title clash. + pub check_id: CheckId, + /// The check title (slugged into the file name). + pub check_title: String, + /// 1-based attempt number; retries increment it. + pub attempt: usize, + /// The self-contained NDJSON trace document for this one execution. + pub bytes: Vec, +} + +/// Thread-safe sink the execution tasks push into as each attempt completes. +/// Shared behind an `Arc` across the concurrent per-check tasks. +#[derive(Default)] +pub(crate) struct TraceCollector { + entries: Mutex>, +} + +impl TraceCollector { + pub(crate) fn new() -> Self { + Self::default() + } + + /// Record one execution's trace. Never blocks meaningfully — a brief lock on + /// a `Vec` push. + pub(crate) fn push(&self, entry: TraceEntry) { + self.lock().push(entry); + } + + /// How many execution traces have been collected. + pub(crate) fn len(&self) -> usize { + self.lock().len() + } + + /// Whether no execution traces have been collected. + pub(crate) fn is_empty(&self) -> bool { + self.lock().is_empty() + } + + /// Consume the collected traces into a gzip-compressed tar archive, built + /// entirely in memory. + pub(crate) fn into_tar_gz(&self) -> Result> { + let mut entries = std::mem::take(&mut *self.lock()); + // Deterministic layout: group by requirement, then check, then attempt. + entries.sort_by(|a, b| { + a.req_index + .cmp(&b.req_index) + .then(a.check_id.cmp(&b.check_id)) + .then(a.attempt.cmp(&b.attempt)) + }); + + let mut used_paths: HashSet = HashSet::new(); + let mut tar = Builder::new(GzEncoder::new(Vec::new(), Compression::default())); + + for entry in &entries { + let dir = format!("{:02}-{}", entry.req_index + 1, slug(&entry.req_title)); + let base = slug(&entry.check_title); + let mut path = format!("{dir}/{base}.attempt-{}.jsonl", entry.attempt); + // Two distinct checks under one requirement can share a title (e.g. + // both inherited it from the requirement), colliding on the same + // (title, attempt) path. Disambiguate the loser with its check id. + if !used_paths.insert(path.clone()) { + path = format!( + "{dir}/{base}.check-{}.attempt-{}.jsonl", + entry.check_id, entry.attempt + ); + used_paths.insert(path.clone()); + } + + let mut header = Header::new_gnu(); + header.set_size(entry.bytes.len() as u64); + header.set_mode(0o644); + // Fixed mtime keeps the archive byte-reproducible for a given input. + header.set_mtime(0); + header.set_cksum(); + tar.append_data(&mut header, &path, entry.bytes.as_slice()) + .into_diagnostic()?; + } + + // `into_inner` finishes the tar (writing its trailer), then `finish` + // flushes the gzip trailer and hands back the compressed bytes. + let gz = tar.into_inner().into_diagnostic()?; + gz.finish().into_diagnostic() + } + + fn lock(&self) -> std::sync::MutexGuard<'_, Vec> { + // A poisoned lock only means a task panicked mid-push; recover the guard + // rather than propagating the panic into the archive step. + self.entries.lock().unwrap_or_else(|e| e.into_inner()) + } +} + +/// Slugify a title into one filesystem-safe path segment: lowercase alphanumerics +/// kept, every other run collapsed to a single `-`, ends trimmed. Empty input (or +/// all-punctuation) becomes `untitled`. +fn slug(title: &str) -> String { + let mut out = String::with_capacity(title.len()); + let mut prev_dash = false; + for ch in title.chars() { + if ch.is_ascii_alphanumeric() { + out.push(ch.to_ascii_lowercase()); + prev_dash = false; + } else if !prev_dash { + out.push('-'); + prev_dash = true; + } + } + let trimmed = out.trim_matches('-'); + if trimmed.is_empty() { + "untitled".to_string() + } else { + trimmed.to_string() + } +} + +/// Build the archive from `collector` and write it to `path`, creating parent +/// directories as needed. A single self-contained step so the orchestrator can +/// treat trace archiving as best-effort: failures are surfaced as `Err` for the +/// caller to log without failing the check run. +pub(crate) fn write_archive(collector: &TraceCollector, path: &std::path::Path) -> Result { + let count = collector.len(); + let bytes = collector.into_tar_gz()?; + if let Some(parent) = path.parent() + && !parent.as_os_str().is_empty() + { + std::fs::create_dir_all(parent).into_diagnostic()?; + } + let mut file = std::fs::File::create(path).into_diagnostic()?; + file.write_all(&bytes).into_diagnostic()?; + Ok(count) +} + +#[cfg(test)] +mod tests { + use super::*; + + fn entry( + req_index: usize, + req_title: &str, + check_id: CheckId, + check_title: &str, + attempt: usize, + ) -> TraceEntry { + TraceEntry { + req_index, + req_title: req_title.to_string(), + check_id, + check_title: check_title.to_string(), + attempt, + bytes: format!("{{\"event\":\"header\",\"check_id\":{check_id}}}\n").into_bytes(), + } + } + + #[test] + fn slug_is_filesystem_safe() { + assert_eq!(slug("No YELLOW text!"), "no-yellow-text"); + assert_eq!(slug(" spaces "), "spaces"); + assert_eq!(slug("a/b\\c"), "a-b-c"); + assert_eq!(slug("***"), "untitled"); + assert_eq!(slug(""), "untitled"); + } + + #[test] + fn archive_round_trips_expected_paths() { + use flate2::read::GzDecoder; + use std::io::Read; + + let collector = TraceCollector::new(); + // Requirement 0 has one check run twice (attempt 1 then 2). + collector.push(entry(0, "First req", 0, "Check A", 1)); + collector.push(entry(0, "First req", 0, "Check A", 2)); + // Requirement 1 has two checks that share a title (collision path). + collector.push(entry(1, "Second req", 1, "Same", 1)); + collector.push(entry(1, "Second req", 2, "Same", 1)); + + assert_eq!(collector.len(), 4); + let gz = collector.into_tar_gz().unwrap(); + // Draining left the collector empty. + assert_eq!(collector.len(), 0); + + let mut tar_bytes = Vec::new(); + GzDecoder::new(&gz[..]).read_to_end(&mut tar_bytes).unwrap(); + let mut archive = tar::Archive::new(&tar_bytes[..]); + let mut paths: Vec = archive + .entries() + .unwrap() + .map(|e| e.unwrap().path().unwrap().to_string_lossy().into_owned()) + .collect(); + paths.sort(); + + assert_eq!( + paths, + vec![ + "01-first-req/check-a.attempt-1.jsonl".to_string(), + "01-first-req/check-a.attempt-2.jsonl".to_string(), + // The second check with the same title is disambiguated by id. + "02-second-req/same.attempt-1.jsonl".to_string(), + "02-second-req/same.check-2.attempt-1.jsonl".to_string(), + ] + ); + } +} diff --git a/src/config/check/mod.rs b/src/config/check/mod.rs index f6f1ff3..169b3fd 100644 --- a/src/config/check/mod.rs +++ b/src/config/check/mod.rs @@ -39,6 +39,13 @@ pub struct CheckSubcommand { /// available CPU cores. #[arg(long)] concurrency: Option, + + /// Capture every agent check session and bundle the traces into this + /// `.tar.gz` archive (off by default). One directory per requirement, one + /// file per check execution named by check title and numbered by retry. + /// Overrides `checks.trace_archive` from env/file. + #[arg(long, value_name = "PATH")] + trace_archive: Option, } impl CheckSubcommand { @@ -56,6 +63,7 @@ impl CheckSubcommand { self.effort, self.executor, self.concurrency.map(NonZeroUsize::get), + self.trace_archive.clone(), ) } }