diff --git a/BUGS.md b/BUGS.md index 67fad6b..f4e0fef 100644 --- a/BUGS.md +++ b/BUGS.md @@ -5,10 +5,14 @@ These are bugs (or missing features) I've observed while working with `multi che - [ ] Output is now hanging. I suspect this is recent (within the last few commits) and it started happening after implement the changes to the `Presenter` actor to fix writing text off-screen without wrapping. -- [ ] No limit on max turns. - - [ ] Remove the `Claude -p` executor. +- [ ] Running 16 agents seems to nearly freeze the computer. Use an OTel profile to determine if this is true. + +- [ ] Temperature not configured. + +- [ ] No limit on max turns. + - [ ] Logs no longer report the id of the check that failed (or the number of attempted retries) - [ ] No use of Cersei workflows to chain multiple prompts together. diff --git a/src/checks/mod.rs b/src/checks/mod.rs index f4d4738..a4ab366 100644 --- a/src/checks/mod.rs +++ b/src/checks/mod.rs @@ -42,6 +42,7 @@ use std::io::Write; use std::path::Path; use std::sync::Arc; +use kameo::Actor; use kameo::actor::{ActorRef, Spawn}; use miette::{IntoDiagnostic, Result, miette}; use tokio::sync::oneshot; @@ -170,12 +171,24 @@ fn spawn_core( (execution, reporting, presenter, rx) } -/// Stop the presenter and wait for its teardown to finish. `stop_gracefully` -/// drains any `UiEvent`s still in its mailbox first, so the terminal restore / -/// final scrollback flush sees a complete state. -async fn shutdown_presenter(presenter: &ActorRef) { - let _ = presenter.stop_gracefully().await; - presenter.wait_for_shutdown().await; +/// Stop `actor` and wait for its teardown to finish. `stop_gracefully` drains +/// any messages still in its mailbox first, so an actor's `on_stop` (the +/// presenter's terminal restore / final scrollback flush, in particular) sees a +/// complete state. +/// +/// Every pipeline actor is stopped this way rather than merely having its +/// `ActorRef` dropped: an actor's mailbox only closes once *every* clone of its +/// `ActorRef` is gone, and [`ExecutionActor`]'s per-check background tasks each +/// hold clones of `execution`/`reporting`/`presenter` for their own lifetime +/// (see `execution::dispatch`). Relying on that implicit ref-counting to close +/// the mailbox — instead of sending an explicit `Signal::Stop`, which the actor +/// loop honors regardless of how many `ActorRef` clones are still outstanding — +/// makes teardown a race against those tasks actually finishing, rather than a +/// deterministic signal. An explicit stop is what let the presenter's shutdown +/// stay reliable; the other three actors need the same treatment. +async fn shutdown_actor(actor: &ActorRef) { + let _ = actor.stop_gracefully().await; + actor.wait_for_shutdown().await; } /// Stream a (validated) requirement set into the execution actor: one @@ -297,12 +310,15 @@ async fn run_pipeline( .map_err(|e| miette!("failed to start discovery: {e}"))?; let outcomes = await_result(rx).await; - // The run is over: drain the presenter's mailbox and run its teardown - // (terminal restore / final scrollback flush) before returning. - shutdown_presenter(&presenter).await; - // Keep refs alive across the await; dropping them earlier would stop the - // actors mid-run. - drop((discovery, execution, reporting, presenter)); + // The run is over: stop every actor explicitly rather than merely dropping + // its `ActorRef` (see `shutdown_actor`). Presenter goes last so it's + // guaranteed to have drained every `UiEvent` the other actors' own + // shutdown might still emit (e.g. a final log line) before its terminal + // restore / scrollback flush runs. + shutdown_actor(&discovery).await; + shutdown_actor(&execution).await; + shutdown_actor(&reporting).await; + shutdown_actor(&presenter).await; outcomes } @@ -323,7 +339,8 @@ async fn run_to_outcomes( spawn_core(cfg, executor, sandbox, working_dir, backend, None); stream_requirements(&execution, &presenter, requirements).await?; let outcomes = await_result(rx).await; - shutdown_presenter(&presenter).await; - drop((execution, reporting, presenter)); + shutdown_actor(&execution).await; + shutdown_actor(&reporting).await; + shutdown_actor(&presenter).await; outcomes } diff --git a/src/cmd/check.rs b/src/cmd/check.rs index af8de28..1fdc63d 100644 --- a/src/cmd/check.rs +++ b/src/cmd/check.rs @@ -29,9 +29,18 @@ impl Check { self.args.directory(), self.args.overrides(), ))?; - if code != 0 { - std::process::exit(code); - } - Ok(()) + // Exit directly rather than returning and letting `rt` drop: dropping a + // `Runtime` blocks the calling thread until every task it ever spawned + // has fully unwound (`Runtime::drop`'s docs: "The thread initiating the + // shutdown blocks until all spawned work has been stopped. This can + // take an indefinite amount of time."). By this point the check pipeline + // has already produced its result and, for a TTY run, the presenter has + // already flushed the terminal record — there is nothing further for + // this process to do, so it should not be held hostage by a stray task + // (ours or a dependency's) that is slow, or fails, to wind down. This + // used to only apply on the `code != 0` path, which papered over the + // asymmetry: an all-checks-pass run had no hard exit and could hang + // instead of returning control to the shell. + std::process::exit(code); } }