diff --git a/BUGS.md b/BUGS.md index f4e0fef..766e5aa 100644 --- a/BUGS.md +++ b/BUGS.md @@ -2,20 +2,21 @@ These are bugs (or missing features) I've observed while working with `multi checks`. -- [ ] Output is now hanging. I suspect this is recent (within the last few commits) and it started -happening after implement the changes to the `Presenter` actor to fix writing text off-screen without wrapping. - - [ ] Remove the `Claude -p` executor. -- [ ] Running 16 agents seems to nearly freeze the computer. Use an OTel profile to determine if this is true. +- [ ] No use of Cersei workflows to chain multiple prompts together. -- [ ] Temperature not configured. +- [ ] Logs no longer report the id of the check that failed (or the number of attempted retries) -- [ ] No limit on max turns. +- [ ] Assemble_instructions is hard-coded: src/checks/executor/mod.rs:98 (definition), called from src/checks/executor/cersei.rs:110 -- [ ] Logs no longer report the id of the check that failed (or the number of attempted retries) +- [ ] No system prompt provided. -- [ ] No use of Cersei workflows to chain multiple prompts together. +- [ ] Not sure if prompt caching is enabled at all. + +- [ ] Running 16 agents seems to nearly freeze the computer. Use an OTel profile to determine if this is true. + +- [ ] No limit on max turns. - [ ] No support for Fireworks AI. @@ -25,14 +26,6 @@ happening after implement the changes to the `Presenter` actor to fix writing te - [ ] No loading of RULES.md files from the .claude directory. -- [ ] Assemble_instructions is hard-coded: src/checks/executor/mod.rs:98 (definition), called from src/checks/executor/cersei.rs:110 - -- [ ] No system prompt provided. - -- [ ] Not sure if prompt caching is enabled at all. - -- [ ] No trace capture. We need a way to record all session traces so that we can analyze why they failed. - - CERSEI: `append_system_prompt()` function is dead unless routed through the separate build_system_prompt() composer. 
- [ ] `Ctrl-C` (shutdown signals) needs to be handled gracefully and cross-platform. @@ -50,6 +43,13 @@ guaranteeing the terminal is restored on the way out. ## Fixes +- [x] No trace capture. We need a way to record all session traces so that we can analyze why they failed. + +- [x] Output is now hanging. I suspect this is recent (within the last few commits) and it started +happening after implementing the changes to the `Presenter` actor to fix writing text off-screen without wrapping. + +- [x] Temperature not configured. + - [x] No loading of CLAUDE.md files - [x] Concurrency still not respected. diff --git a/Cargo.lock b/Cargo.lock index feacf67..25039e8 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -17,6 +17,17 @@ version = "2.0.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "320119579fcad9c21884f5c4861d16174d0e06250625266f50fe6898340abefa" +[[package]] +name = "ahash" +version = "0.7.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "891477e0c6a8957309ee5c45a6368af3ae14bb510732d2684ffa19af310920f9" +dependencies = [ + "getrandom 0.2.17", + "once_cell", + "version_check", +] + [[package]] name = "ahash" version = "0.8.12" @@ -994,7 +1005,7 @@ checksum = "4f4c707c6a209cbe82d10abd08e1ea8995e9ea937d2550646e02798948992be0" [[package]] name = "cersei-agent" version = "0.2.6" -source = "git+https://github.com/wack/cersei.git?branch=trunk#d485d50e31c055c8a2bc7ba5d9ad03b384088c0c" +source = "git+https://github.com/wack/cersei.git?branch=trunk#521ea38bd1c178967e6d5fe564f8fcc911780bcb" dependencies = [ "anyhow", "async-trait", @@ -1021,7 +1032,7 @@ dependencies = [ [[package]] name = "cersei-compression" version = "0.2.6" -source = "git+https://github.com/wack/cersei.git?branch=trunk#d485d50e31c055c8a2bc7ba5d9ad03b384088c0c" +source = "git+https://github.com/wack/cersei.git?branch=trunk#521ea38bd1c178967e6d5fe564f8fcc911780bcb" dependencies = [ "anyhow", "once_cell", @@ -1035,7 +1046,7 @@ dependencies = [ 
[[package]] name = "cersei-embeddings" version = "0.2.6" -source = "git+https://github.com/wack/cersei.git?branch=trunk#d485d50e31c055c8a2bc7ba5d9ad03b384088c0c" +source = "git+https://github.com/wack/cersei.git?branch=trunk#521ea38bd1c178967e6d5fe564f8fcc911780bcb" dependencies = [ "async-trait", "futures", @@ -1051,7 +1062,7 @@ dependencies = [ [[package]] name = "cersei-hooks" version = "0.2.6" -source = "git+https://github.com/wack/cersei.git?branch=trunk#d485d50e31c055c8a2bc7ba5d9ad03b384088c0c" +source = "git+https://github.com/wack/cersei.git?branch=trunk#521ea38bd1c178967e6d5fe564f8fcc911780bcb" dependencies = [ "async-trait", "cersei-types", @@ -1063,7 +1074,7 @@ dependencies = [ [[package]] name = "cersei-lsp" version = "0.2.6" -source = "git+https://github.com/wack/cersei.git?branch=trunk#d485d50e31c055c8a2bc7ba5d9ad03b384088c0c" +source = "git+https://github.com/wack/cersei.git?branch=trunk#521ea38bd1c178967e6d5fe564f8fcc911780bcb" dependencies = [ "dashmap", "serde", @@ -1077,7 +1088,7 @@ dependencies = [ [[package]] name = "cersei-mcp" version = "0.2.6" -source = "git+https://github.com/wack/cersei.git?branch=trunk#d485d50e31c055c8a2bc7ba5d9ad03b384088c0c" +source = "git+https://github.com/wack/cersei.git?branch=trunk#521ea38bd1c178967e6d5fe564f8fcc911780bcb" dependencies = [ "async-trait", "cersei-types", @@ -1091,7 +1102,7 @@ dependencies = [ [[package]] name = "cersei-memory" version = "0.2.6" -source = "git+https://github.com/wack/cersei.git?branch=trunk#d485d50e31c055c8a2bc7ba5d9ad03b384088c0c" +source = "git+https://github.com/wack/cersei.git?branch=trunk#521ea38bd1c178967e6d5fe564f8fcc911780bcb" dependencies = [ "async-trait", "cersei-types", @@ -1109,7 +1120,7 @@ dependencies = [ [[package]] name = "cersei-provider" version = "0.2.6" -source = "git+https://github.com/wack/cersei.git?branch=trunk#d485d50e31c055c8a2bc7ba5d9ad03b384088c0c" +source = "git+https://github.com/wack/cersei.git?branch=trunk#521ea38bd1c178967e6d5fe564f8fcc911780bcb" 
dependencies = [ "async-trait", "base64 0.22.1", @@ -1129,7 +1140,7 @@ dependencies = [ [[package]] name = "cersei-tools" version = "0.2.6" -source = "git+https://github.com/wack/cersei.git?branch=trunk#d485d50e31c055c8a2bc7ba5d9ad03b384088c0c" +source = "git+https://github.com/wack/cersei.git?branch=trunk#521ea38bd1c178967e6d5fe564f8fcc911780bcb" dependencies = [ "async-trait", "base64 0.22.1", @@ -1141,6 +1152,7 @@ dependencies = [ "dashmap", "dirs", "glob", + "globset", "grep", "html2text", "ignore", @@ -1172,7 +1184,7 @@ dependencies = [ [[package]] name = "cersei-types" version = "0.2.6" -source = "git+https://github.com/wack/cersei.git?branch=trunk#d485d50e31c055c8a2bc7ba5d9ad03b384088c0c" +source = "git+https://github.com/wack/cersei.git?branch=trunk#521ea38bd1c178967e6d5fe564f8fcc911780bcb" dependencies = [ "anyhow", "base64 0.22.1", @@ -2390,6 +2402,9 @@ name = "hashbrown" version = "0.12.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8a9ee70c43aaf417c914396645a0fa852624801b24ebb7ae78fe8272889ac888" +dependencies = [ + "ahash 0.7.8", +] [[package]] name = "hashbrown" @@ -2737,7 +2752,7 @@ dependencies = [ "libc", "percent-encoding", "pin-project-lite", - "socket2 0.6.4", + "socket2 0.5.10", "system-configuration 0.7.0", "tokio", "tower-service", @@ -4027,7 +4042,7 @@ version = "0.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "dbe55bddb694583a9db101e5ae5b31f570f2ccce312ac7d64c2e4a430510c4b3" dependencies = [ - "ahash", + "ahash 0.8.12", "async-trait", "blake2", "bytes", @@ -4061,7 +4076,7 @@ version = "0.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "51dbd9509e3bb25a699bee76ba1befbffb4e733694d7e682d4bfe35a1a48cbb4" dependencies = [ - "ahash", + "ahash 0.8.12", "async-trait", "brotli", "bytes", @@ -4095,7 +4110,7 @@ dependencies = [ "serde", "serde_yaml", "sfv", - "socket2 0.6.4", + "socket2 0.5.10", "strum", "strum_macros", "thread_local", @@ -4174,7 
+4189,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9bb8f0df84b4b9afd63742c78e6c4b39413554f857e7d41502825e4ff9798e3e" dependencies = [ "arrayvec", - "hashbrown 0.17.1", + "hashbrown 0.12.3", "parking_lot", "rand 0.10.1", ] @@ -4425,7 +4440,7 @@ dependencies = [ "quinn-udp", "rustc-hash 2.1.2", "rustls 0.23.41", - "socket2 0.6.4", + "socket2 0.5.10", "thiserror 2.0.18", "tokio", "tracing", @@ -4462,7 +4477,7 @@ dependencies = [ "cfg_aliases", "libc", "once_cell", - "socket2 0.6.4", + "socket2 0.5.10", "tracing", "windows-sys 0.60.2", ] diff --git a/guides/checks.md b/guides/checks.md index e785b24..acc578b 100644 --- a/guides/checks.md +++ b/guides/checks.md @@ -187,9 +187,10 @@ base_url = "https://..." An unset flag contributes nothing — it never overrides a value from the environment or file. The `model` is validated against a hardcoded allowlist of known IDs for the selected provider; an unknown ID is a clear error. `effort` -currently maps to the in-process agent's sampling temperature (`low` → most -deterministic, `high` → most exploratory); mapping it to an extended-thinking -budget is pending an upstream provider fix. +maps to the in-process agent's extended-thinking budget: `medium` and `high` +enable extended thinking (4096- and 8192-token budgets respectively), while +`low` — the default — keeps thinking off for speed and cost, sampling at +temperature 0 on the first attempt and 0.5 higher per retry, capped at 1.0. The **`executor`** selects the execution engine. 
The default `cersei` runs each check as an in-process agent (native multi-provider model swapping, no external diff --git a/src/checks/executor/cersei.rs b/src/checks/executor/cersei.rs index 23ac1ee..3c4450b 100644 --- a/src/checks/executor/cersei.rs +++ b/src/checks/executor/cersei.rs @@ -4,10 +4,12 @@ use std::path::Path; use std::sync::Arc; +use std::sync::atomic::{AtomicU32, Ordering}; use std::time::Duration; use async_trait::async_trait; use cersei_agent::Agent; +use cersei_agent::events::AgentEvent; use cersei_memory::claudemd; use cersei_tools::permissions::AllowReadOnly; use cersei_tools::{Tool, clear_session_shell_state}; @@ -39,8 +41,8 @@ pub struct CerseiExecutor { factory: ProviderFactory, /// The concrete model ID to run (e.g. `claude-sonnet-4-6`). model: String, - /// The effort level, mapped to a sampling temperature (see - /// [`effort_temperature`]). + /// The effort level, mapped to an extended-thinking budget (medium/high) + /// or a sampling temperature (low) — see [`thinking_budget`]. effort: Effort, /// Per-agent wall-clock timeout; on expiry the run is dropped (which stops /// the in-process agent) and the check resolves as errored. @@ -110,32 +112,36 @@ fn project_instructions(project_root: &Path) -> Option { (!prompt.trim().is_empty()).then_some(prompt) } -/// Map our coarse [`Effort`] onto a sampling temperature. +/// Map our coarse [`Effort`] onto an extended-thinking budget. /// -/// Extended thinking would be the natural effort vehicle, but cersei-provider -/// 0.1.9 cannot round-trip Anthropic *thinking-block signatures*: its SSE parser -/// drops `signature_delta`, so the thinking block it sends back on the second -/// turn carries an empty signature and the API rejects it -/// (`Invalid signature in thinking block`). 
Until that is fixed upstream -/// (https://github.com/pacifio/cersei/issues/21) we leave thinking disabled and -/// apply effort as temperature instead — lower effort is more deterministic, -/// higher effort more exploratory. -fn effort_temperature(effort: Effort) -> f32 { +/// Medium and high effort buy real extended thinking: `wack/cersei` carries +/// the provider fixes for round-tripping thinking blocks (`signature_delta` +/// accumulation in 94f18b2, `redacted_thinking` preservation in 5bd06db), so +/// the temperature-as-effort workaround that previously lived here is retired. +/// Low effort — the default — keeps thinking off to stay fast and cheap, and +/// steers with temperature instead (see [`attempt_temperature`]). +/// +/// Budgets follow cersei's own `EffortLevel` scale (medium 4096, high 8192) +/// and sit comfortably under the agent's default 16k `max_tokens` (the API +/// requires `budget_tokens < max_tokens`). +fn thinking_budget(effort: Effort) -> Option { match effort { - Effort::Low => 0.0, - Effort::Medium => 0.5, - Effort::High => 1.0, + Effort::Low => None, + Effort::Medium => Some(4_096), + Effort::High => Some(8_192), } } -/// The sampling temperature for a given attempt: the effort base, raised by -/// 0.5 per retry and capped at 1.0. At effort=low the base is 0.0, and a -/// temperature-0 retry is a replay: the 2026-07-01 postmortem caught one check -/// reproducing its fatal trajectory near-verbatim on all three attempts. A -/// retry has to sample differently to be worth its wall-clock. -fn attempt_temperature(effort: Effort, attempt: u32) -> f32 { - let base = effort_temperature(effort); - (base + 0.5 * attempt.saturating_sub(1) as f32).min(1.0) +/// The sampling temperature for a thinking-free (low-effort) attempt: +/// deterministic (0.0) on the first attempt, raised by 0.5 per retry and +/// capped at 1.0. 
A temperature-0 retry is a replay: the 2026-07-01 postmortem +/// caught one check reproducing its fatal trajectory near-verbatim on all +/// three attempts — a retry has to sample differently to be worth its +/// wall-clock. Thinking runs take no temperature at all (the API requires it +/// unset when thinking is enabled, and thinking samples at 1.0), which gives +/// their retries natural diversity. +fn attempt_temperature(attempt: u32) -> f32 { + (0.5 * attempt.saturating_sub(1) as f32).min(1.0) } #[async_trait] @@ -180,11 +186,16 @@ impl CheckExecutor for CerseiExecutor { .permission_policy(AllowReadOnly) .tools(read_only_tools()) .tool(judge) - // Thinking is intentionally left disabled (see `effort_temperature`). - .temperature(attempt_temperature(self.effort, req.attempt)) .max_turns(MAX_TURNS) .cancel_token(cancel.clone()); + // Exactly one reasoning control applies: the API rejects a temperature + // when extended thinking is enabled (see [`thinking_budget`]). + agent_builder = match thinking_budget(self.effort) { + Some(budget) => agent_builder.thinking_budget(budget), + None => agent_builder.temperature(attempt_temperature(req.attempt)), + }; + // `.system_prompt()`, not `.append_system_prompt()`: cersei's agent // runner only ever reads `Agent.system_prompt` when building each // completion request (`append_system_prompt` is exclusively consumed @@ -195,15 +206,29 @@ impl CheckExecutor for CerseiExecutor { agent_builder = agent_builder.system_prompt(project_prompt); } - // When trace capture is on, tee every agent event into a recorder. - // `emit` invokes this synchronously for each event *before* the loop's - // early returns, so the trace survives post-verdict cancellation and the - // drop-on-timeout below (which cersei's own session persistence would - // miss). The executor owns a clone, so a dropped agent doesn't lose it. + // Observe agent events for two purposes sharing the builder's single + // `on_event` slot. 
The turn counter runs unconditionally: the success + // path cancels the agent the instant it reports, which makes `run` + // return `Err(Cancelled)` and discards cersei's own turn count — so + // without it every successful check would report 0 turns. The trace + // recorder is opt-in; `emit` invokes this synchronously for each event + // *before* the loop's early returns, so the trace survives post-verdict + // cancellation and the drop-on-timeout below (which cersei's own + // session persistence would miss). The executor owns clones, so a + // dropped agent loses neither. let recorder = self.capture_traces.then(|| Arc::new(TraceRecorder::new())); - if let Some(recorder) = &recorder { - let recorder = Arc::clone(recorder); - agent_builder = agent_builder.on_event(move |event| recorder.record(event)); + let turns_seen = Arc::new(AtomicU32::new(0)); + { + let recorder = recorder.clone(); + let turns_seen = Arc::clone(&turns_seen); + agent_builder = agent_builder.on_event(move |event| { + if let AgentEvent::TurnStart { turn } = event { + turns_seen.fetch_max(*turn, Ordering::Relaxed); + } + if let Some(recorder) = &recorder { + recorder.record(event); + } + }); } let agent = agent_builder @@ -236,7 +261,7 @@ impl CheckExecutor for CerseiExecutor { // Our own post-report cancellation is not an error. stop_reason: matches!(err, CerseiError::Cancelled) .then(|| "cancelled".to_string()), - turns: 0, + turns: turns_seen.load(Ordering::Relaxed), error: (!reported).then(|| err.to_string()), trace_jsonl: None, } @@ -245,7 +270,7 @@ impl CheckExecutor for CerseiExecutor { // A verdict may have landed in the instant before the timeout. 
verdict, stop_reason: None, - turns: 0, + turns: turns_seen.load(Ordering::Relaxed), error: Some(format!("agent timed out after {:?}", self.timeout)), trace_jsonl: None, }, @@ -276,11 +301,22 @@ mod tests { #[test] fn retries_raise_the_temperature_up_to_the_cap() { - assert_eq!(attempt_temperature(Effort::Low, 1), 0.0); - assert_eq!(attempt_temperature(Effort::Low, 2), 0.5); - assert_eq!(attempt_temperature(Effort::Low, 3), 1.0); - assert_eq!(attempt_temperature(Effort::Medium, 2), 1.0); + assert_eq!(attempt_temperature(1), 0.0); + assert_eq!(attempt_temperature(2), 0.5); + assert_eq!(attempt_temperature(3), 1.0); // Already at the cap: retries must not push past valid API range. - assert_eq!(attempt_temperature(Effort::High, 3), 1.0); + assert_eq!(attempt_temperature(4), 1.0); + } + + #[test] + fn only_low_effort_runs_without_thinking() { + assert_eq!(thinking_budget(Effort::Low), None); + let medium = thinking_budget(Effort::Medium).unwrap(); + let high = thinking_budget(Effort::High).unwrap(); + assert!(medium < high); + // The Anthropic minimum thinking budget is 1024; the agent's default + // max_tokens is 16384 and budgets must stay strictly below it. + assert!(medium >= 1024); + assert!(high < 16384); } } diff --git a/src/checks/executor/mod.rs b/src/checks/executor/mod.rs index 8142922..3633b7b 100644 --- a/src/checks/executor/mod.rs +++ b/src/checks/executor/mod.rs @@ -40,10 +40,11 @@ pub struct AgentRunRequest { /// The sandbox directory to run the agent in (its working directory). pub working_dir: PathBuf, /// Which attempt this is, 1-based. Retries must not replay the failed - /// attempt verbatim: executors use this to raise the sampling temperature - /// and to tell the agent a previous attempt went unreported (the 2026-07-01 - /// timeout postmortem showed temperature-0 retries reproducing the same - /// fatal trajectory three times in a row). 
+ /// attempt verbatim: executors use this to tell the agent a previous + /// attempt went unreported and — on thinking-free runs — to raise the + /// sampling temperature (the 2026-07-01 timeout postmortem showed + /// temperature-0 retries reproducing the same fatal trajectory three + /// times in a row). pub attempt: u32, }