From a3a23767beb9d3b40a49d2e624bd46acecc29014 Mon Sep 17 00:00:00 2001 From: NagyVikt Date: Sat, 16 May 2026 00:46:10 +0200 Subject: [PATCH] fix(add-workers): pass discovered emails to cap-probe + retry discover without --exclude-tmux MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Two cascading bugs in `scripts/codex-fleet/add-workers.sh` made the discovery wrapper report `only 0 healthy unallocated accounts available` and then fail with `no healthy accounts available`, even when ≥1 account in the canonical `agent-auth list` pool passed the 5h<100% / weekly<90% / not-already-active filter. Root cause #1 — cap-probe invoked with no email arguments `pick_accounts()` called `cap-probe.sh "$need"` (only the count). cap-probe's usage is `<need> email1 email2 ...`: after `shift`, it iterates `for email in "$@"` over an empty list, probes nothing, and exits with 0 healthy rows. The wrapper then took the empty result as "0 healthy" and moved on. Root cause #2 — discover-accounts fails closed when target tmux session is absent `bash discover-accounts.sh --exclude-tmux <session>` runs `tmux list-panes -s -t <session> | sed | sort | tr | sed` under `set -eo pipefail`. On a host where `<session>` doesn't exist on the default tmux server (e.g. running `add-workers.sh` outside the fleet session, or with `CODEX_FLEET_TMUX_SOCKET` unset so the wrapper degrades to the operator's default tmux), tmux exits 1, pipefail kicks in, the helper exits before reaching its Python emitter, and the wrapper sees an empty tempfile. The wrapper then treated empty as "all candidates allocated" instead of "tmux filter unusable, retry without it". Fix (surgical, in-file only) 1. After the first discover-accounts call, if the tempfile is empty, retry without `--exclude-tmux`. We still keep `--exclude-active` so accounts already in `fleet-active-accounts.txt` are skipped. 2. 
Before invoking cap-probe, extract the email column from the discovered TSV and pass each email as a positional arg so cap-probe has something to probe. Empty discovery skips cap-probe entirely. The helper-side bug (discover-accounts.sh exiting 1 when the tmux session is missing instead of treating an empty tmux query as "no live panes to exclude") is left untouched per file-scope contract; the wrapper now compensates for it. Verified on host 2026-05-16: bash -n scripts/codex-fleet/add-workers.sh # exit 0 docker run koalaman/shellcheck:stable …add-workers.sh # only pre-existing findings bash scripts/codex-fleet/add-workers.sh 1 --dry-run # picks admin-mite (1 healthy) bash scripts/codex-fleet/add-workers.sh 2 --dry-run # picks 2 healthy --- scripts/codex-fleet/add-workers.sh | 24 ++++++++++++++++++++++-- 1 file changed, 22 insertions(+), 2 deletions(-) diff --git a/scripts/codex-fleet/add-workers.sh b/scripts/codex-fleet/add-workers.sh index f484be9..aa44bf9 100755 --- a/scripts/codex-fleet/add-workers.sh +++ b/scripts/codex-fleet/add-workers.sh @@ -143,15 +143,35 @@ pick_accounts() { local discovered_tmp discovered_tmp="$(mktemp)" local discover_session="${CODEX_FLEET_SESSION:-codex-fleet${FLEET_ID:+-$FLEET_ID}}" + # Try with --exclude-tmux first (filters accounts already wired to a + # live pane). discover-accounts.sh runs `tmux list-panes -s -t ` + # under `set -eo pipefail`; on a host where that session doesn't exist + # the pipe fails and the helper exits with 0 rows. Retry without + # --exclude-tmux when the first pass returns empty so we don't + # fail-closed and report "0 healthy" while accounts are on disk. ACTIVE_FILE="$ACTIVE_FILE" bash "$SCRIPT_DIR/lib/discover-accounts.sh" \ --exclude-active --exclude-tmux "$discover_session" \ > "$discovered_tmp" 2>/dev/null || true + if [ ! 
-s "$discovered_tmp" ]; then + ACTIVE_FILE="$ACTIVE_FILE" bash "$SCRIPT_DIR/lib/discover-accounts.sh" \ + --exclude-active \ + > "$discovered_tmp" 2>/dev/null || true + fi if [ -s "$discovered_tmp" ]; then # Optionally filter through cap-probe to drop capped accounts. If # cap-probe isn't available, take everything discovered as-is. if [ -x "$SCRIPT_DIR/cap-probe.sh" ]; then - local healthy_emails - healthy_emails="$(bash "$SCRIPT_DIR/cap-probe.sh" "$need" 2>/dev/null || true)" + # cap-probe.sh requires ` email1 email2 ...` — without + # the email list it has nothing to probe and exits with 0 healthy + # rows. Feed it every discovered email so the cap filter can + # actually run. + local discovered_emails + discovered_emails="$(awk -F'\t' 'NF>=2 && $2!="" {print $2}' "$discovered_tmp")" + local healthy_emails="" + if [ -n "$discovered_emails" ]; then + # shellcheck disable=SC2086 + healthy_emails="$(bash "$SCRIPT_DIR/cap-probe.sh" "$need" $discovered_emails 2>/dev/null || true)" + fi if [ -n "$healthy_emails" ]; then # Intersection: discovered ∩ healthy. while IFS=$'\t' read -r aid email; do