Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
22 changes: 22 additions & 0 deletions BUGS.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,15 @@

These are bugs (or missing features) I've observed while working with `multi check`.

- [ ] Output is now hanging. I suspect this is recent (within the last few commits) and that it started
happening after implementing the changes to the `Presenter` actor to fix writing text off-screen without wrapping.

- [ ] No limit on max turns.

- [ ] Remove the `Claude -p` executor.

- [ ] Logs no longer report the ID of the check that failed (or the number of attempted retries).

- [ ] No use of Cersei workflows to chain multiple prompts together.

- [ ] No support for Fireworks AI.
Expand All @@ -22,6 +31,19 @@ These are bugs (or missing features) I've observed while working with `multi che

- CERSEI: the `append_system_prompt()` function is dead unless routed through the separate `build_system_prompt()` composer.

- [ ] `Ctrl-C` (shutdown signals) needs to be handled gracefully and cross-platform.
The `multi check` presenter installs a raw `libc::signal(SIGINT, …)` handler
(`install_terminal_guards`, src/checks/presenter/inline.rs:224) that is Unix-only
(won't build/run on Windows) and hard-`_exit(130)`s: it restores the cursor but
skips graceful teardown, so in-flight agent sessions, the MCP result server, and
spawned Cersei subprocesses are killed without cleanup and no partial
results/traces get flushed. Contrast `multi run`, which already does this
correctly and cross-platform via `tokio_graceful_shutdown::Toplevel::catch_signals()`
+ `handle_shutdown_requests()` (src/cmd/run/canary_mode.rs:63). `check` should adopt
the same graceful-shutdown path (or an equivalent `tokio::signal::ctrl_c` + coordinated
cancellation covering SIGINT/SIGTERM and Windows Ctrl-C/Ctrl-Break) while still
guaranteeing the terminal is restored on the way out.

## Fixes

- [x] No loading of CLAUDE.md files
Expand Down
23 changes: 23 additions & 0 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

5 changes: 5 additions & 0 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,9 @@ derive-getters = "0.5.0"
dialoguer = "0.11.0"
directories = "6.0"
figment = { version = "0.10", default-features = false, features = ["env"] }
# Pure-Rust gzip (miniz_oxide backend, the default). Used with `tar` to build the
# opt-in `multi check --trace-archive` bundle entirely in memory.
flate2 = "1.0"
failsafe = "1.3.0"
futures-core = "0.3.31"
futures-util = "0.3.31"
Expand All @@ -60,6 +63,8 @@ serde = { version = "1.0", features = ["derive"] }
serde_json = "1.0"
serde_json5 = "0.2.1"
serde_with = { version = "3.12", features = ["chrono"] }
# Pure-Rust tar writer for the in-memory `multi check --trace-archive` bundle.
tar = "0.4"
thiserror.workspace = true
tokio = { workspace = true, features = ["full"] }
tokio-graceful-shutdown = "0.16.0"
Expand Down
13 changes: 12 additions & 1 deletion src/checks/config/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ mod providers;
mod schema;

use std::num::NonZeroUsize;
use std::path::PathBuf;
use std::time::Duration;

use figment::{
Expand Down Expand Up @@ -74,6 +75,9 @@ pub struct Config {
/// calling the judge tool; a fresh attempt usually succeeds. A check only
/// resolves as errored after all attempts are exhausted.
pub max_attempts: usize,
/// Where to bundle the opt-in session-trace archive, or `None` (default) to
/// disable trace capture. See [`crate::checks::trace_archive`].
pub trace_archive: Option<PathBuf>,
}

impl Config {
Expand Down Expand Up @@ -135,6 +139,9 @@ impl Resolved {
cfg.model.clone(),
cfg.effort,
cfg.agent_timeout,
// The archive path lives at the orchestration layer; the executor
// only needs to know whether to capture a per-execution trace.
cfg.trace_archive.is_some(),
)),
ExecutorKind::Claude => cfg.build_claude_executor(),
};
Expand Down Expand Up @@ -198,6 +205,7 @@ pub fn load(overrides: CliOverrides) -> Result<Resolved> {
concurrency,
agent_timeout: DEFAULT_AGENT_TIMEOUT,
max_attempts: DEFAULT_MAX_ATTEMPTS,
trace_archive: checks.trace_archive,
};

Ok(Resolved {
Expand All @@ -224,6 +232,7 @@ pub fn configuration() -> Config {
concurrency: default_concurrency(),
agent_timeout: DEFAULT_AGENT_TIMEOUT,
max_attempts: DEFAULT_MAX_ATTEMPTS,
trace_archive: None,
}
}

Expand All @@ -245,6 +254,7 @@ mod tests {
effort: Some(Effort::Low),
executor: None,
concurrency: None,
trace_archive: None,
providers: ProvidersSection::default(),
},
}
Expand Down Expand Up @@ -273,6 +283,7 @@ mod tests {
None,
None,
None,
None,
);
let checks = resolve_layers(file, overrides).unwrap();
assert_eq!(checks.provider, Some(ProviderKind::OpenAi));
Expand Down Expand Up @@ -307,7 +318,7 @@ mod tests {

// ...and a flag outranks env.
let overrides =
CliOverrides::new(None, Some("claude-opus-4-8".into()), None, None, None);
CliOverrides::new(None, Some("claude-opus-4-8".into()), None, None, None, None);
let checks = resolve_layers(file, overrides).unwrap();
assert_eq!(checks.model.as_deref(), Some("claude-opus-4-8"));
Ok(())
Expand Down
12 changes: 12 additions & 0 deletions src/checks/config/schema.rs
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,8 @@
//! overrides, so figment can merge them by key path with `flag > env > file`
//! precedence. See [`super::load`].

use std::path::PathBuf;

use clap::ValueEnum;
use serde::{Deserialize, Serialize};

Expand Down Expand Up @@ -81,6 +83,12 @@ pub struct ChecksSection {
/// available CPU cores).
#[serde(default, skip_serializing_if = "Option::is_none")]
pub concurrency: Option<usize>,
/// Where to write the opt-in session-trace archive. When set, every check
/// execution's agent session is captured and bundled into this `.tar.gz`
/// (see [`crate::checks::trace_archive`]); unset (the default) disables
/// capture entirely.
#[serde(default, skip_serializing_if = "Option::is_none")]
pub trace_archive: Option<PathBuf>,
/// Optional, non-secret per-provider base-URL overrides.
#[serde(default)]
pub providers: ProvidersSection,
Expand Down Expand Up @@ -140,6 +148,8 @@ pub struct CliChecksOverrides {
pub executor: Option<ExecutorKind>,
#[serde(skip_serializing_if = "Option::is_none")]
pub concurrency: Option<usize>,
#[serde(skip_serializing_if = "Option::is_none")]
pub trace_archive: Option<PathBuf>,
}

impl CliOverrides {
Expand All @@ -150,6 +160,7 @@ impl CliOverrides {
effort: Option<Effort>,
executor: Option<ExecutorKind>,
concurrency: Option<usize>,
trace_archive: Option<PathBuf>,
) -> Self {
Self {
checks: CliChecksOverrides {
Expand All @@ -158,6 +169,7 @@ impl CliOverrides {
effort,
executor,
concurrency,
trace_archive,
},
}
}
Expand Down
2 changes: 2 additions & 0 deletions src/checks/e2e.rs
Original file line number Diff line number Diff line change
Expand Up @@ -201,6 +201,7 @@ impl CheckExecutor for InterleavingExecutor {
stop_reason: Some("interleaving probe".into()),
turns: 1,
error: None,
trace_jsonl: None,
})
}
}
Expand Down Expand Up @@ -267,6 +268,7 @@ async fn invalid_suite_aborts_run_without_spawning_agents() {
Arc::new(NoopSandbox),
dir.path(),
null_backend(),
None,
)
.await;

Expand Down
30 changes: 29 additions & 1 deletion src/checks/execution.rs
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@ use crate::checks::model::{Check, CheckId, CheckOutcome, Verdict};
use crate::checks::presenter::{PresenterActor, UiEvent};
use crate::checks::reporting::ReportingActor;
use crate::checks::sandbox::Sandbox;
use crate::checks::trace_archive::{TraceCollector, TraceEntry};

/// The execution actor: turns a stream of discovered checks into a stream of
/// completed checks, fanning agent runs out onto bounded background tasks.
Expand All @@ -46,6 +47,11 @@ pub(crate) struct ExecutionActor {
reporting: ActorRef<ReportingActor>,
/// The display-only presenter, told of each check's lifecycle milestones.
presenter: ActorRef<PresenterActor>,
/// Opt-in sink for per-execution session traces (`multi check
/// --trace-archive`). `None` disables capture. Shared across the spawned
/// per-attempt tasks; every attempt (including retries) pushes its trace
/// here before signalling completion downstream.
trace_collector: Option<Arc<TraceCollector>>,
}

impl Actor for ExecutionActor {
Expand All @@ -62,6 +68,7 @@ impl Actor for ExecutionActor {

impl ExecutionActor {
/// Build the actor. `concurrency` and `max_attempts` are clamped to ≥1.
#[allow(clippy::too_many_arguments)]
pub(crate) fn new(
executor: Arc<dyn CheckExecutor + Send + Sync>,
sandbox: Arc<dyn Sandbox + Send + Sync>,
Expand All @@ -70,6 +77,7 @@ impl ExecutionActor {
max_attempts: usize,
reporting: ActorRef<ReportingActor>,
presenter: ActorRef<PresenterActor>,
trace_collector: Option<Arc<TraceCollector>>,
) -> Self {
Self {
executor,
Expand All @@ -79,6 +87,7 @@ impl ExecutionActor {
max_attempts: max_attempts.max(1),
reporting,
presenter,
trace_collector,
}
}

Expand All @@ -92,6 +101,7 @@ impl ExecutionActor {
let working_dir = self.working_dir.clone();
let reporting = self.reporting.clone();
let presenter = self.presenter.clone();
let trace_collector = self.trace_collector.clone();
let me = ctx.actor_ref().clone();
let id = job.id;

Expand All @@ -105,7 +115,25 @@ impl ExecutionActor {
// Permit acquired ⇒ the agent is about to run: mark the check Running.
let _ = presenter.tell(UiEvent::CheckStarted { id }).await;

let result = run_one(executor, sandbox, job.id, job.check.clone(), &working_dir).await;
let mut result =
run_one(executor, sandbox, job.id, job.check.clone(), &working_dir).await;

// Harvest this attempt's trace *before* signalling completion, so it
// is collected even for retried attempts (whose outcome never reaches
// reporting) and is race-free with run finalization (the collector is
// populated before the `tell` that could trigger it).
if let Some(collector) = &trace_collector
&& let Some(bytes) = result.as_mut().ok().and_then(|o| o.trace_jsonl.take())
{
collector.push(TraceEntry {
req_index: job.req_index,
req_title: job.req_title.clone(),
check_id: job.id,
check_title: job.check.title.clone(),
attempt,
bytes,
});
}

if has_verdict(Some(&result)) {
// The agent reported: reconcile, surface the verdict to the
Expand Down
Loading