Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
19 changes: 15 additions & 4 deletions src/checks/execution.rs
Original file line number Diff line number Diff line change
Expand Up @@ -115,8 +115,15 @@ impl ExecutionActor {
// Permit acquired ⇒ the agent is about to run: mark the check Running.
let _ = presenter.tell(UiEvent::CheckStarted { id }).await;

let mut result =
run_one(executor, sandbox, job.id, job.check.clone(), &working_dir).await;
let mut result = run_one(
executor,
sandbox,
job.id,
job.check.clone(),
&working_dir,
attempt,
)
.await;

// Harvest this attempt's trace *before* signalling completion, so it
// is collected even for retried attempts (whose outcome never reaches
Expand Down Expand Up @@ -249,13 +256,15 @@ async fn run_one(
id: CheckId,
check: Check,
working_dir: &Path,
attempt: usize,
) -> Result<AgentOutcome> {
let handle = sandbox.create(working_dir).await?;

let request = crate::checks::executor::AgentRunRequest {
check_id: id,
check,
working_dir: handle.path().to_path_buf(),
attempt: u32::try_from(attempt).unwrap_or(u32::MAX),
};

let outcome = executor.run_check(request).await;
Expand Down Expand Up @@ -422,7 +431,9 @@ mod tests {
.await
.unwrap();
assert!(out[0].satisfied);
// Ran twice: one silent attempt, then one reporting attempt.
assert_eq!(executor.seen().len(), 2);
// Ran twice: one silent attempt, then one reporting attempt — and the
// executor was told which attempt each was (retries must be able to
// vary temperature/instructions rather than replaying attempt 1).
assert_eq!(executor.seen_attempts(), vec![(0, 1), (0, 2)]);
}
}
57 changes: 52 additions & 5 deletions src/checks/executor/cersei.rs
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ use cersei_types::CerseiError;
use miette::{Result, miette};
use tokio_util::sync::CancellationToken;

use super::jail::Jailed;
use super::judge::{JudgeTool, VerdictSink};
use super::trace::{TraceHeader, TraceRecorder, serialize_trace};
use super::{
Expand Down Expand Up @@ -70,11 +71,27 @@ impl CerseiExecutor {
/// The read-only tool set a verification agent gets by default: observe, do not
/// mutate. Execution-requiring checks (which would need Bash/Write) are gated
/// separately and are future work — the default is least privilege.
///
/// Each tool is [`Jailed`] to the agent's working directory: "read-only" alone
/// still allowed reading anywhere the user can, which let lost agents launch
/// unbounded globs over the host filesystem (timeouts) and grade the live
/// repository instead of the sandbox (postmortem C5). The jail turns an
/// out-of-sandbox path into an immediate tool error that steers the agent back.
///
/// Returns the tools boxed as `dyn Tool`, in the order listed below.
fn read_only_tools() -> Vec<Box<dyn Tool>> {
vec![
// NOTE(review): these three bare entries sit alongside `Jailed` wrappers of
// the very same tools further down. Presumably they are the removed side of
// the diff — confirm that only the jailed entries remain in the final file,
// since an unjailed copy would bypass the restriction described above.
Box::new(cersei_tools::file_read::FileReadTool),
Box::new(cersei_tools::grep_tool::GrepTool),
Box::new(cersei_tools::glob_tool::GlobTool),
// File reads: `"file_path"` is handed to the jail — presumably the name of
// the tool argument that carries the path; verify against `Jailed::path_keys`.
Box::new(Jailed::path_keys(
cersei_tools::file_read::FileReadTool,
&["file_path"],
)),
// Grep: same wrapper, keyed on `"path"`.
Box::new(Jailed::path_keys(
cersei_tools::grep_tool::GrepTool,
&["path"],
)),
// Glob: uses the dedicated `Jailed::glob` constructor, which takes
// `"pattern"` in addition to the `"path"` key — presumably so the pattern
// itself is checked too; confirm against `Jailed::glob`.
Box::new(Jailed::glob(
cersei_tools::glob_tool::GlobTool,
&["path"],
"pattern",
)),
]
}

Expand Down Expand Up @@ -111,6 +128,16 @@ fn effort_temperature(effort: Effort) -> f32 {
}
}

/// Picks the sampling temperature for one attempt of a check.
///
/// The first attempt runs at the effort's base temperature. Every retry after
/// it adds 0.5, and the result never exceeds 1.0. The bump exists because at
/// effort=low the base is 0.0, and retrying at temperature 0 merely replays
/// the previous run: the 2026-07-01 postmortem caught one check reproducing
/// its fatal trajectory near-verbatim on all three attempts. A retry only
/// earns its wall-clock if it samples differently.
///
/// `attempt` is 1-based; an `attempt` of 0 is treated like the first attempt.
fn attempt_temperature(effort: Effort, attempt: u32) -> f32 {
    /// Temperature added for each retry beyond the first attempt.
    const RETRY_STEP: f32 = 0.5;
    /// Upper bound on the temperature handed to the agent.
    const CEILING: f32 = 1.0;

    // Retries already spent before this attempt (0 for the first attempt).
    let retries = attempt.saturating_sub(1) as f32;
    let raised = effort_temperature(effort) + RETRY_STEP * retries;
    raised.min(CEILING)
}

#[async_trait]
impl CheckExecutor for CerseiExecutor {
async fn run_check(&self, req: AgentRunRequest) -> Result<AgentOutcome> {
Expand All @@ -136,7 +163,12 @@ impl CheckExecutor for CerseiExecutor {
let cancel = CancellationToken::new();
let judge = JudgeTool::new(sink.clone(), cancel.clone());

let instructions = assemble_instructions(&req.check, &judge_tool_directive());
let instructions = assemble_instructions(
&req.check,
&judge_tool_directive(),
&req.working_dir,
req.attempt,
);

let mut agent_builder = Agent::builder()
.provider_boxed(provider)
Expand All @@ -149,7 +181,7 @@ impl CheckExecutor for CerseiExecutor {
.tools(read_only_tools())
.tool(judge)
// Thinking is intentionally left disabled (see `effort_temperature`).
.temperature(effort_temperature(self.effort))
.temperature(attempt_temperature(self.effort, req.attempt))
.max_turns(MAX_TURNS)
.cancel_token(cancel.clone());

Expand Down Expand Up @@ -237,3 +269,18 @@ impl CheckExecutor for CerseiExecutor {
Ok(outcome)
}
}

#[cfg(test)]
mod tests {
    use super::*;

    /// Each retry lifts the temperature by 0.5 over the effort's base, and the
    /// result is pinned at 1.0 once it gets there.
    #[test]
    fn retries_raise_the_temperature_up_to_the_cap() {
        // (effort, attempt, expected temperature)
        let cases: [(Effort, u32, f32); 5] = [
            (Effort::Low, 1, 0.0),
            (Effort::Low, 2, 0.5),
            (Effort::Low, 3, 1.0),
            (Effort::Medium, 2, 1.0),
            // Already at the cap: retries must not push past valid API range.
            (Effort::High, 3, 1.0),
        ];

        for (effort, attempt, expected) in cases {
            assert_eq!(attempt_temperature(effort, attempt), expected);
        }
    }
}
7 changes: 6 additions & 1 deletion src/checks/executor/claude.rs
Original file line number Diff line number Diff line change
Expand Up @@ -97,7 +97,12 @@ impl CheckExecutor for ClaudeExecutor {
"dispatching claude -p check (fallback)",
);

let instructions = assemble_instructions(&req.check, &file_report_directive());
let instructions = assemble_instructions(
&req.check,
&file_report_directive(),
&req.working_dir,
req.attempt,
);

let mut cmd = Command::new(&self.program);
cmd.arg("-p")
Expand Down
19 changes: 16 additions & 3 deletions src/checks/executor/fake.rs
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,8 @@ pub struct FakeExecutor {
/// id to `(report_on_attempt, report)`. Exercises the retry path: the same
/// `CheckId` is re-run, so the fake counts attempts per id.
silent_until: HashMap<CheckId, (usize, CheckReport)>,
seen: Mutex<Vec<CheckId>>,
/// Every `(check_id, attempt)` the fake was asked to run, in call order.
seen: Mutex<Vec<(CheckId, u32)>>,
}

impl FakeExecutor {
Expand Down Expand Up @@ -72,6 +73,18 @@ impl FakeExecutor {

/// The id of every check the fake was asked to run, in call order. The
/// attempt number recorded with each call is dropped.
pub fn seen(&self) -> Vec<CheckId> {
    let calls = self.seen.lock().unwrap();
    calls.iter().map(|&(id, _)| id).collect()
}

/// A snapshot of every `(check_id, attempt)` pair the fake was asked to run,
/// in call order. Tests use it to assert that the retry plumbing threads the
/// attempt number all the way through to the executor.
pub fn seen_attempts(&self) -> Vec<(CheckId, u32)> {
    let calls = self.seen.lock().unwrap();
    calls.to_vec()
}
}
Expand All @@ -81,8 +94,8 @@ impl CheckExecutor for FakeExecutor {
async fn run_check(&self, req: AgentRunRequest) -> Result<AgentOutcome> {
let attempt = {
let mut seen = self.seen.lock().unwrap();
seen.push(req.check_id);
seen.iter().filter(|id| **id == req.check_id).count()
seen.push((req.check_id, req.attempt));
seen.iter().filter(|(id, _)| *id == req.check_id).count()
};

if self.silent.contains(&req.check_id) {
Expand Down
Loading