diff --git a/.console/log.md b/.console/log.md index 949239fb..5765b175 100644 --- a/.console/log.md +++ b/.console/log.md @@ -1,3 +1,14 @@ +## 2026-06-12 — fix(reviewer): require CI *settled* before declaring green (root cause of #269 merging red) + +The merge gate declared CI green whenever get_failed_checks returned [] — but that only means +"nothing has failed yet"; a check still queued/in_progress has conclusion=None and is invisible to +get_failed_checks. So the reviewer could self-review (~1min) and merge on LGTM before the ~2-3min +test jobs finished, turning main red. This is how #269 merged with 4 red checks and held main red ~5h. +Fix: new GitHubPRClient.get_incomplete_checks (status != "completed"); all three CI-evaluation sites +(primary self-review gate, WO-3 retraction, WO-3 no-progress direct-merge) now require zero failed +AND zero pending before proceeding. New "ci_never_settled" escalation if checks never settle within +the existing wait bound. +tests (adapter + gate defers-on-pending) + mock defaults updated. + ## 2026-06-12 — #270 rescoped to the query layer (clean on reverted main) After reverting #269 (b82b944d), #270 is rebuilt as green-main + the genuinely-new flaky-test @@ -1978,197 +1989,5 @@ corrected the stale "five watcher lanes" wording to the actual set pipeline lanes; `tools/loop/controller.py` (loop-*) = the separate external dev-loop controller. They start/stop independently; full pause needs both. -## 2026-06-04 — Reconcile `.console/` (reconcile/console branch) - -Ran the `.console/` reconciliation pass (PlatformManifest console-reconciliation-spec). -Authored `.console/reconcile.yaml` (untracked) classifying every backlog item as -done/partial/incomplete with an owner; cross-repo rows route to CxRP / SwitchBoard / -Warehouse / PlatformManifest / a private downstream repo / Custodian. Filled doc -homes for every owned done item so `cl reconcile check` is GREEN with zero DOC GAPs. -Scrubbed the remaining scrub-target names from tracked `docs/` (genericized to a -private downstream repo; numbered detector IDs left intact). Ran -`cl reconcile prune --apply`: completed log+backlog history moved to the private -archive, source trimmed to active sections + recent-N + an archive pointer -(log 3144→132, backlog 622→368 lines). A second `--apply` is a no-op. Flipped -`audit.reconcile_enforce: true` in `.custodian/config.yaml`. Tracked `.console/` + -`docs/` are now scrub-target clean (R2 / boundary I2). - -## 2026-06-03 — Reapply OC-venv ruff fallback lost in PR #236 merge - -Root cause: PR #236 (coverage 95.75% → 90% gate) overwrote commit 554b55bd which -added the three-tier ruff lookup (target venv → system PATH → OC root .venv/bin/ruff). -Without it, _phase0_ci_fix falls back to bare "ruff" causing FileNotFoundError for -repos without their own ruff binary (e.g. PlatformManifest). Re-applied on -oc-watchdog/20260603-0647-reapply-ruff-fallback. - -Also this cycle: resolved PR #235 merge conflict + custodian T4/T8 violations -(goal/ba5d9a46) to unblock OPEN_PR_GATE holding task #192. - -## 2026-06-02 — Reviewer: CI-green is a precondition, not an auto-merge (operator-directed) - -**Status**: ✅ Implemented on `feat/ci-green-requires-lgtm`. Closes the bypass left -by the verdict-gate work (#224): every managed repo has -`auto_merge_on_ci_green: true`, which merged autonomy PRs the instant CI was -green — *before* the new verdict gate ran. Green CI ≠ complete (missing docs etc. -pass CI), so PRs could still ship half-finished. - -**Change** (`pr_review_watcher/main.py _phase1` fast path): CI-green is now a -PRECONDITION. While CI is red the PR defers (no expensive self-review). Once CI -is green it falls through to the verdict-gated self-review — LGTM is still the -only merge path. Stale `operations_center.example.yaml` reviewer docs updated -(removed human-review phase, surfaced `max_fix_attempts`, documented the -precondition). Tests: ci-green-requires-LGTM + ci-red-defers-without-review. -108 passed; ruff clean. - ---- - -## 2026-06-02 — Probe-and-clear for stale worker-backend cooldowns - -Worker-backend cooldowns carry an *estimated* `reset_at` and were never retracted -on their own — only expiring when `reset_at` passed. When a limit lifted early -(e.g. sonnet recovered before its guessed weekly reset), the cooldown lingered: -status surfaces showed the model cooling, and when every model looked cooling the -board_unblock gate deferred dispatch for no reason. - -Added a probe-and-clear path: -- `UsageStore.clear_worker_backend_cooldown(worker_backend, model, ..., include_account_wide)` - retracts a model's active `model_weekly` cooldown (and, on request, account-wide - cooldowns — one model running disproves an all-models block); appends a - `worker_backend_cooldown_cleared` audit event. -- `backends/worker_backend_probe.py` — `probe_model` runs a cheap `claude -p`/`codex - exec` against a model (mirrors the controller's invocation); `ok` only on exit 0 - with no limit signal. `refresh_cooldowns` probes each *cooling* model and clears - the ones proven runnable. Probes never record cooldowns — a flaky probe can only - fail to clear, never falsely block. -- New entrypoint `operations-center-worker-backend-probe` + `worker-backend-probe` - subcommand (safe to run on a schedule / cron). -- Wired as a self-heal into `board_unblock._dispatch_cooldown_reason`: when every - allowed backend looks cooling, probe + re-read before deferring — turning a - would-be stale-cooldown deadlock into a self-heal. Injected for offline tests. - -Plus three hardening fixes: -- Periodic self-heal: the watchdog hourly loop now runs `worker-backend-probe` - (--timeout 30) so stale cooldowns clear even when the board is idle (no-op when - nothing is cooling). -- `record_worker_backend_cooldown` coalesces duplicates — drops any still-active - cooldown for the same (worker_backend, limit_kind, model) before appending, so - re-recording the same limit each cycle no longer piles up identical events - (observed: 12 identical sonnet rows). -- The board_unblock gate bounds its probe to `_GATE_PROBE_TIMEOUT_SECONDS` (20s) - so a hung probe can't stall a board cycle; the standalone CLI/cron keeps the - 90s default. - -Tests: clear primitive (per-model / account-wide / no-op), dedup-on-record, -probe module (fake runner: ok/limit-signal/nonzero/timeout; refresh clears only -runnable models; account-wide cleared on first success; no-op when nothing -cooling), CLI smoke, and the board_unblock self-heal. Verified end-to-end against -the live claude CLI. - -## 2026-05-30 — controller: make opus fallback reachable - -_backend_available checked _command_available(backend) with the raw name, so _command_available("opus") always failed (opus has no binary; it uses the claude CLI). The sonnet→opus→codex fallback was therefore dead code — opus could never be selected. Resolve the cli ("claude" for opus) so opus is reachable. Also repaired 3 parse_rate_limit_reset tests left broken by the earlier (reset, log_text) tuple-return change and added opus/priority/global-limit selection tests. 15 passed. - ---- - -## 2026-05-28 — P6 follow-up: fixed 10 pre-existing ty errors exposed by ty==0.0.40 pin - -## 2026-05-28 — Operator: work order 0009 — execution hygiene - -6 execution quality problems documented and assigned. See ADR 0009. -P1/P5: stop polluting .console/ truth files; P2: delete STAGE_*.md; P3: open-PR gate; -P4: squash stage commits; P6: pin tool versions. - ---- - -## 2026-05-28 — Operator: re-rebase PR #180 onto new main (post #181 merge) - -Resolved conftest.py conflict: took PR #180 tmp_path refactor, ruff auto-fixed unused import. -All 3609 tests pass. - ---- - -## 2026-05-28 — Loop controller: robustly resolve `cl` (CL_HOME fallback) - -The loop controller resolved `claude`/`codex` robustly via `_resolve_command` -(PATH + `~/.local/bin` fallbacks) but invoked `cl` as a bare `["cl", ...]`, -relying solely on PATH. That works when the loop is launched `nohup` from an -interactive shell (whose `~/.bashrc` puts `$CL_HOME/bin` on PATH) but fails -silently under cron/systemd/clean shells — `cl` not found → no anchor → loop -runs unanchored → ContextGuard blocks claude. Mirrors the OperatorConsole pane -bug just fixed. - -Added a `cl` branch to `_fallback_command_candidates` (uses `CL_HOME`) and -routed all four `cl` calls (session start/end, hydrate, capture) through -`_resolve_command`. Verified: with `cl` off PATH but `CL_HOME` set, the -controller resolves it and anchors at PlatformManifest. - -## 2026-05-25 - -- Fixed the pre-existing repo-wide pytest collection blocker by renaming the duplicate hardening module to `tests/observer/test_collectors_hardening/test_execution_health_hardening.py`, avoiding the `test_execution_health` import collision. -- Restored observer test consistency around dependency drift and execution health artifacts: - - `ExecutionOutcomeValidator` now accepts the retained artifact statuses `no_op` and `error` in addition to `executed`, `failed`, `timeout`, and `unknown`. - - `DependencyDriftCollector` now returns `not_available` consistently so `ObservationCoverageDeriver` can detect persistent missing coverage correctly. -- Fixed malformed-payload alert handling to normalize naive timestamps to UTC before lookback comparisons in `observer/security_logging.py`. -- Added OC→CxRP backend normalization in `contracts/cxrp_mapper.py` so OC executor backends like `team_executor`, `dag_executor`, and `critique_executor` serialize onto the current CxRP backend enum without failing mapper tests. -- Validation: - - `python -m pytest` → `3536 passed, 7 skipped` - - `python -m pytest -m integration` → `3 passed` - -## 2026-05-25 - -- Added executor worker-backend observability end to end: the `team_executor`, `dag_executor`, and `critique_executor` adapters now expose `execute_and_capture()` with `observed_runtime` showing preferred backend, selected backend, fallback usage, and backend cooldown snapshot. -- Added a live operator status surface for worker-backend cooldowns via `operations-center-worker-backend-status` and `./scripts/operations-center.sh worker-backend-status`, backed by a new `UsageStore.current_worker_backend_cooldowns()` summary API. -- Extended retained trace visibility so `operations-center-run-show ` prints the `Observed runtime` block, making actual `claude_code` vs `codex_cli` selection visible per run without re-reading raw record metadata. -- Validation: focused pytest slices passed (`68 passed`) and targeted Ruff checks passed. Repo-wide `python -m pytest` and `python -m pytest -m integration` are still blocked by the pre-existing duplicate-module import mismatch between `tests/test_execution_health.py` and `tests/observer/test_collectors_hardening/test_execution_health.py`. - -## Archived - -_Archived completed history → `/home/dev/Documents/GitHub/PrivateManifest/archive/console/OperationsCenter/log-2026-06-04.md`_ - - -## 2026-06-08 — Review goal-text: explicit read-only constraint - -Added "TASK TYPE: Read-only code review / SINGLE REQUIRED ACTION: Write verdict.json" -header to review goal_text. Root cause: budget team coordinator (Haiku effort=low) was -decomposing the review task into implementation sub-stages that tried to modify source -files rather than just writing verdict.json. PR #253 had 7 consecutive no_verdict failures. -New phrasing prevents the coordinator from creating non-verdict-writing stages. -Also cleared PR #253 escalation for one more retry cycle. - -## 2026-06-08 — fix(tests): loosen snapshot performance timing bounds - -Flaky CI failure: 0.1s limit failed with 0.177s on shared runners. -Raised to 1.0s — still catches catastrophic regression (10x+). - -## 2026-06-08 — WO-1 close-with-receipt invariant hardened - -## 2026-06-08 — fix(controller): persisted Claude cooldowns fall through to Codex - -Loop controller now seeds Sonnet/Opus/Codex cooldowns from the persisted usage -ledger on restart and reselects after chained backend limits, so exhausted -Claude weekly quotas fall through to Codex instead of sleeping until reset. - -## 2026-06-08 — fix(controller): Claude weekly cooldown is account-wide - -Bare Claude Code weekly-limit messages now classify as `global_weekly` and cool -both Claude controller lanes so status surfaces do not leave Haiku looking runnable. - -Controller startup also normalizes matching persisted Sonnet+Opus weekly resets -to account-wide metadata so `loop_controller_state.json` reports the same scope. - -## 2026-06-10 — fix(reviewer): make no-progress detection reliable + preserve external escalation - -Root cause: no-progress check required AI concern summaries to match exactly (text comparison), -but LLM output varies. Also: TOCTOU race where reviewer overwrote watchdog's escalation after -fix pass. Fixed both; 88 reviewer tests pass. - -## 2026-06-10 — fix(tests): use dynamic dates in flaky storage cleanup tests - -Hardcoded 2026-06-07 "recent" date fell behind the 3-day retention window causing CI failures. - -## 2026-06-12 — fix(observer): restore ty: ignore suppression for boto3/requests -Commit 5f763c99 updated mypy error codes on TYPE_CHECKING-guarded imports in -snapshot_repository.py but dropped the ty-specific `# ty: ignore[unresolved-import]` -comments. The ty CI check then failed with unresolved-import on lines 24–25. -Restored both suppression annotations so mypy and ty both pass. + diff --git a/src/operations_center/adapters/github_pr.py b/src/operations_center/adapters/github_pr.py index 1addd515..766c11a8 100644 --- a/src/operations_center/adapters/github_pr.py +++ b/src/operations_center/adapters/github_pr.py @@ -255,6 +255,52 @@ def get_failed_checks( failed.append(f"{name}: {summary}") return failed + def get_incomplete_checks( + self, + owner: str, + repo: str, + pr_number: int, + *, + pr_data: dict | None = None, + ignored_checks: list[str] | None = None, + ) -> list[str]: + """Return names of checks not yet in a terminal state for the PR head. + + A check is "incomplete" when its ``status`` is anything other than + ``completed`` (e.g. ``queued`` / ``in_progress``) — such a run has no + ``conclusion`` yet, so :meth:`get_failed_checks` cannot see it. + + Callers gating a merge on green CI MUST treat a non-empty result as + "not green yet": an empty failure list means only "nothing has failed + *so far*", which is true while CI is still running. Declaring green in + that window is how a PR can be merged before its tests finish and then + turn the base branch red. + """ + if pr_data is None: + pr_data = self.get_pr(owner, repo, pr_number) + head_sha = (pr_data.get("head") or {}).get("sha", "") + if not head_sha: + return [] + try: + check_runs = self.get_check_runs(owner, repo, head_sha) + except Exception: + return [] + ignored = [s.lower() for s in (ignored_checks or [])] + # Dedupe by name (keep the newest run) — same rationale as get_failed_checks. + latest: dict[str, dict] = {} + for cr in check_runs: + name = cr.get("name", "unknown") + if cr.get("id", 0) > latest.get(name, {}).get("id", 0): + latest[name] = cr + pending = [] + for cr in latest.values(): + if cr.get("status") != "completed": + name = cr.get("name", "unknown") + if ignored and any(pat in name.lower() for pat in ignored): + continue + pending.append(name) + return pending + def list_open_prs(self, owner: str, repo: str) -> list[dict]: resp = self._request( "GET", diff --git a/src/operations_center/entrypoints/pr_review_watcher/main.py b/src/operations_center/entrypoints/pr_review_watcher/main.py index 16955500..cb96a8ab 100644 --- a/src/operations_center/entrypoints/pr_review_watcher/main.py +++ b/src/operations_center/entrypoints/pr_review_watcher/main.py @@ -1526,7 +1526,15 @@ def _phase1( pr_data=pr_data, ignored_checks=_rignored, ) - if not _rfailed: + # Settled-and-green only: don't retract while checks run. + _rpending = gh_client.get_incomplete_checks( + owner, + repo, + pr_number, + pr_data=pr_data, + ignored_checks=_rignored, + ) + if not _rfailed and not _rpending: _retract_flag( state, gh_client, @@ -1634,7 +1642,57 @@ def _phase1( ) _save_state(state_path, state) return - state["ci_wait_cycles"] = 0 # CI green — reset the wait counter + # No check has FAILED — but an empty failure list only means + # "nothing has failed yet". While any check is still queued/running + # its conclusion is None, invisible to get_failed_checks. Declaring + # green here would let a self-review LGTM merge the PR before CI + # finishes, turning the base branch red (this is how #269 merged red). + # Require CI to have SETTLED (no pending checks) before green. + pending = gh_client.get_incomplete_checks( + owner, + repo, + pr_number, + pr_data=pr_data, + ignored_checks=ignored, + ) + if pending: + state["ci_wait_cycles"] = state.get("ci_wait_cycles", 0) + 1 + if state["ci_wait_cycles"] >= _MAX_CI_WAIT_CYCLES: + detail = ( + f"CI has not settled after {state['ci_wait_cycles']} checks " + f"({len(pending)} still running: {', '.join(pending[:5])}). " + f"Not merged (CI incomplete) and not closed (work preserved) " + f"— needs a human to investigate stuck CI." + ) + record_escalation( + pr_number=pr_number, + repo_key=repo_key, + reason="ci_never_settled", + detail=detail, + ) + _escalate_needs_human( + state, + state_path, + gh_client, + owner, + repo, + settings, + reason="ci_never_settled", + detail=detail, + current_head_sha=current_head_sha, + ) + return + logger.info( + "pr_review_watcher: PR #%d CI still running (%d pending, wait %d/%d) " + "— deferring self-review until checks settle", + pr_number, + len(pending), + state["ci_wait_cycles"], + _MAX_CI_WAIT_CYCLES, + ) + _save_state(state_path, state) + return + state["ci_wait_cycles"] = 0 # CI settled and green — reset the wait counter logger.info( "pr_review_watcher: PR #%d CI green — proceeding to verdict-gated " "self-review (LGTM required to merge)", @@ -1912,7 +1970,12 @@ def _phase1( _failed_np = gh_client.get_failed_checks( owner, repo, pr_number, pr_data=pr_data, ignored_checks=_ign_np ) - if not _failed_np: + # Only merge on CI that has SETTLED — never while checks are still + # running (no failure yet != green). + _pending_np = gh_client.get_incomplete_checks( + owner, repo, pr_number, pr_data=pr_data, ignored_checks=_ign_np + ) + if not _failed_np and not _pending_np: logger.info( "pr_review_watcher: PR #%d repeated no-progress after " "CI-green retraction budget exhausted; CI still green — " diff --git a/tests/integration/reviewer/test_ci_green_gate.py b/tests/integration/reviewer/test_ci_green_gate.py index 711dca3c..22f70d8e 100644 --- a/tests/integration/reviewer/test_ci_green_gate.py +++ b/tests/integration/reviewer/test_ci_green_gate.py @@ -85,6 +85,52 @@ def test_ci_with_failing_checks_blocks_review( state_after = load_pr_state(state_path) assert state_after["ci_wait_cycles"] == 1 + def test_ci_still_running_defers_review_does_not_merge( + self, + tmp_path: Path, + audit_verdict_builder: AuditVerdictBuilder, + ): + """Test: no failed checks but some still in-progress → defer, do NOT merge. + + Regression for the premature-green bug (how #269 merged red): an empty + failed-checks list only means "nothing has failed *yet*". If checks are + still running, the gate must defer rather than declare green and let a + self-review LGTM merge the PR before CI finishes. + """ + settings = mock_settings() + gh = mock_github_client() + + state = create_pr_state( + repo_key="TestRepo", + pr_number=42, + phase="self_review", + self_review_loops=0, + ) + state_path = save_pr_state(tmp_path, state) + + # No check has FAILED, but the test job is still running. + gh.get_failed_checks.return_value = [] + gh.get_incomplete_checks.return_value = ["Test (pytest)"] + + with patch.object(watcher, "_run_pipeline") as mock_pipeline: + watcher._phase1( + state, + state_path, + {"number": 42, "title": "Test PR", "draft": False, "head": {"ref": "goal/42"}}, + gh, + "owner", + "TestRepo", + tmp_path, + tmp_path / "cfg.yaml", + settings, + ) + + # Gate must defer: no self-review, no merge, wait counter advanced. + mock_pipeline.assert_not_called() + gh.merge_pr.assert_not_called() + state_after = load_pr_state(state_path) + assert state_after["ci_wait_cycles"] == 1 + def test_ci_red_then_green_allows_merge_after_fix( self, tmp_path: Path, diff --git a/tests/test_pr_review_watcher.py b/tests/test_pr_review_watcher.py index e5344aba..87ec5a78 100644 --- a/tests/test_pr_review_watcher.py +++ b/tests/test_pr_review_watcher.py @@ -68,6 +68,9 @@ def _make_gh(*, comment_id: int = 0) -> MagicMock: gh.post_comment.return_value = {"id": comment_id} if comment_id else {} gh.merge_pr.return_value = {} gh.update_comment.return_value = {} + # Default: CI has settled (no checks still running). The premature-green gate + # treats a non-empty result as "not green yet"; tests override as needed. + gh.get_incomplete_checks.return_value = [] return gh diff --git a/tests/unit/adapters/test_github_pr_cov.py b/tests/unit/adapters/test_github_pr_cov.py index e9732817..8ea6ed8d 100644 --- a/tests/unit/adapters/test_github_pr_cov.py +++ b/tests/unit/adapters/test_github_pr_cov.py @@ -342,6 +342,55 @@ def test_get_failed_checks_check_runs_exception(client): assert out == [] +# --------------------------------------------------------------------------- +# get_incomplete_checks +# --------------------------------------------------------------------------- +def test_get_incomplete_checks_flags_running_runs(client): + runs = [ + {"id": 1, "name": "lint", "status": "completed", "conclusion": "success"}, + {"id": 2, "name": "Test (pytest)", "status": "in_progress", "conclusion": None}, + {"id": 3, "name": "Snapshot", "status": "queued", "conclusion": None}, + ] + with mock.patch.object(client, "get_check_runs", return_value=runs): + out = client.get_incomplete_checks("o", "r", 1, pr_data={"head": {"sha": "x"}}) + assert sorted(out) == ["Snapshot", "Test (pytest)"] + + +def test_get_incomplete_checks_empty_when_all_completed(client): + runs = [ + {"id": 1, "name": "lint", "status": "completed", "conclusion": "success"}, + {"id": 2, "name": "test", "status": "completed", "conclusion": "failure"}, + ] + with mock.patch.object(client, "get_check_runs", return_value=runs): + out = client.get_incomplete_checks("o", "r", 1, pr_data={"head": {"sha": "x"}}) + assert out == [] + + +def test_get_incomplete_checks_honors_ignored_and_latest_run(client): + # A stale in_progress run superseded by a newer completed run for the same + # name must not count as pending; ignored names are excluded. + runs = [ + {"id": 1, "name": "flaky", "status": "in_progress", "conclusion": None}, + {"id": 5, "name": "flaky", "status": "completed", "conclusion": "success"}, + {"id": 6, "name": "Snapshot validation", "status": "queued", "conclusion": None}, + ] + with mock.patch.object(client, "get_check_runs", return_value=runs): + out = client.get_incomplete_checks( + "o", "r", 1, pr_data={"head": {"sha": "x"}}, ignored_checks=["snapshot"] + ) + assert out == [] + + +def test_get_incomplete_checks_no_head_sha(client): + assert client.get_incomplete_checks("o", "r", 1, pr_data={"head": {}}) == [] + + +def test_get_incomplete_checks_check_runs_exception(client): + with mock.patch.object(client, "get_check_runs", side_effect=RuntimeError("boom")): + out = client.get_incomplete_checks("o", "r", 1, pr_data={"head": {"sha": "x"}}) + assert out == [] + + def test_get_failed_checks_uses_output_title(client): runs = [ { diff --git a/tests/verdicts/conftest.py b/tests/verdicts/conftest.py index 48919089..3a34199d 100644 --- a/tests/verdicts/conftest.py +++ b/tests/verdicts/conftest.py @@ -272,6 +272,9 @@ def mock_github_client() -> MagicMock: gh.delete_branch.return_value = {} gh.get_mergeable.return_value = True gh.get_failed_checks.return_value = [] + # Default: CI has settled (no checks still running). Tests exercising the + # "CI still in progress" path override this with a non-empty list. + gh.get_incomplete_checks.return_value = [] return gh