diff --git a/.github/actions/setup-bocpy-test/action.yml b/.github/actions/setup-bocpy-test/action.yml
index 5b96bb5..eb71dab 100644
--- a/.github/actions/setup-bocpy-test/action.yml
+++ b/.github/actions/setup-bocpy-test/action.yml
@@ -26,6 +26,17 @@ inputs:
       windows-x86 sets this to ``x86``).
     required: false
     default: ""
+  check-latest:
+    description: >
+      Forwarded to ``actions/setup-python``. When ``"true"`` the
+      action resolves the interpreter against the downloadable
+      versions-manifest instead of relying on a local toolcache hit.
+      Required for ``x86`` on the Windows 2025 image, where 32-bit
+      CPython is no longer pre-installed in the toolcache (only the
+      manifest ships ``win32`` builds). Defaults to ``"false"`` so
+      the cached fast path is used everywhere else.
+    required: false
+    default: "false"
   upgrade-pip:
     description: >
       When ``"true"`` (the default) run ``python -m pip install
@@ -51,6 +62,7 @@ runs:
       with:
         python-version: ${{ inputs.python-version }}
         architecture: ${{ inputs.architecture }}
+        check-latest: ${{ inputs.check-latest }}
 
     - name: Upgrade pip
       if: ${{ inputs.upgrade-pip == 'true' }}
diff --git a/.github/agents/editor-lens.agent.md b/.github/agents/editor-lens.agent.md
index 84144d1..5463824 100644
--- a/.github/agents/editor-lens.agent.md
+++ b/.github/agents/editor-lens.agent.md
@@ -25,6 +25,38 @@ target is a codebase where every remaining comment is one a maintainer
 would write today, from scratch, knowing nothing about the PR that
 introduced it.
 
+### The inline-comment single-line rule (repo norm)
+
+**Every inline comment defaults to a single line of at most 120
+characters, or it is deleted.** An inline comment is a `#` block in
+Python or a `//` block in C that sits inside a function body or above a
+statement. This is the repo standard, not a per-PR cleanup: verbose
+multi-line inline comments rot as the code beneath them changes, drift
+out of sync, and bury the few comments that earn their keep. A
+multi-line inline comment is a smell — collapse it to one line or cut
+it.
+
+A multi-line inline comment survives **only** with an explicit
+per-case justification, and only these justifications qualify:
+
+- a non-obvious concurrency invariant the code cannot express (2PL
+  lock ordering, MCS handoff, memory-ordering rationale);
+- the rationale block above a non-trivial version-gate `#if`/`#elif`
+  ladder;
+- an X-macro / `clang-format off` table boundary that is itself
+  structural;
+- a reference anchor that needs a line of context to be followed.
+
+If a surviving multi-line inline comment does not fall into one of
+those buckets, collapse it. When in doubt, collapse.
+
+**Docstrings and doc-blocks are exempt from the single-line rule.**
+Python docstrings, C `///` / `/** */` Doxygen headers, and Sphinx
+`:param:` / `:returns:` stubs in `.pyi` files are *documentation*, not
+inline commentary. They may — and should — carry in-depth, useful
+prose across multiple lines. Trim genuine wordiness, but do not force a
+docstring onto one line; a docstring's job is to document thoroughly.
+
 A second, broader mandate: catch **cryptic references to internal
 review artifacts** wherever they appear in the diff, including
 user-facing files (`README.md`, `sphinx/source/**`, `CHANGELOG.md`,
@@ -144,7 +176,9 @@ finalize.
   express: 2PL lock ordering by cown ID, MCS handoff invariants,
   memory-ordering rationale (`// acquire pairs with the release in
   ...`), why a particular `_Py_atomic_*` was chosen, why a
-  sub-interpreter API ladder is structured the way it is.
+  sub-interpreter API ladder is structured the way it is. These are
+  the canonical justifications for a multi-line inline comment — but
+  prefer one tight line even here when the invariant fits.
 - **Version-gate rationale** — the prose above a non-trivial
   `#if PY_VERSION_HEX >= ...` ladder explaining what changed
   upstream. Trim if wordy; do not delete.
@@ -160,6 +194,14 @@ finalize.
 
 ### Rewrite (collapse, don't delete)
 
+- **Any multi-line inline comment without a qualifying justification.**
+  Per the inline-comment single-line rule above, a `#` or `//`
+  comment spanning more than one line is collapsed to a single
+  ≤120-char line unless it is a concurrency invariant, version-gate
+  rationale, X-macro / `clang-format off` table boundary, or a reference
+  anchor needing context. Default to collapsing; keep multi-line only
+  when one of those buckets applies. (Docstrings and Doxygen / Sphinx
+  doc-blocks are exempt — see the rule above.)
 - **Wordy explanations of correct behavior.** Three sentences
   paraphrasing what the next ten lines obviously do → one line, a
   reference anchor, or nothing.
@@ -249,8 +291,10 @@ When reviewing, produce findings in these sections:
    archaeology, or paraphrase. List file + line range + the comment
    text. These can be deleted without further review. *Full prose
    edit scope only.*
-3. **Rewrites** — wordy or stale comments that should be collapsed.
-   For each, give the original and the proposed replacement. *Full
+3. **Rewrites** — wordy or stale comments that should be collapsed,
+   including every multi-line inline comment collapsed to a single
+   ≤120-char line under the inline-comment single-line rule. For
+   each, give the original and the proposed replacement. *Full
    prose edit scope only.*
 4. **Keep with edit** — load-bearing comments that need a small fix
    (stale file path, wrong PEP number, dated phrasing). *Full prose
@@ -262,7 +306,9 @@ When reviewing, produce findings in these sections:
    comment and what's ambiguous. Always include any `TODO` / `FIXME`
    without an issue or sketch link.
 6. **Summary** — counts (cuts / rewrites / edits / kept / asked),
-   and an estimated LOC reduction.
+   the number of multi-line inline comments collapsed to one line
+   and the number of multi-line inline comments kept (each with its
+   qualifying justification), and an estimated LOC reduction.
 
 When invoked via `review-loop`, expect to iterate: apply approved
 cuts and rewrites, then re-scan the same target until no new
diff --git a/.github/dependabot.yml b/.github/dependabot.yml
index 339701d..2488b42 100644
--- a/.github/dependabot.yml
+++ b/.github/dependabot.yml
@@ -36,6 +36,24 @@ updates:
         applies-to: version-updates
         patterns:
           - "*"
+    # Deferred docs-stack updates. These releases require Python >=3.11
+    # and therefore cannot enter ``ci/constraints-docs.txt`` while the
+    # project floor in ``pyproject.toml`` is still ``>=3.10``. Keep this
+    # list in sync with the comment above ``requires-python`` in
+    # ``pyproject.toml``; remove the block (and regenerate the
+    # constraints) once the floor is bumped to >=3.11 after Python 3.10
+    # reaches EOL (October 2026).
+    #
+    # ``ignore`` filters out matching packages before the group is
+    # assembled, so non-docs bumps (cyclonedx-python-lib, idna, etc.)
+    # still land in a smaller weekly PR. Security advisories for the
+    # ignored packages bypass this list and open immediately.
+    ignore:
+      - dependency-name: "docutils"
+      - dependency-name: "ruamel-yaml"
+      - dependency-name: "sphinx-tabs"
+      - dependency-name: "sphinx-toolbox"
+      - dependency-name: "standard-imghdr"
     labels:
       - "dependencies"
       - "ci"
diff --git a/.github/skills/commenting-c-and-python/SKILL.md b/.github/skills/commenting-c-and-python/SKILL.md
index 5ad003c..975f056 100644
--- a/.github/skills/commenting-c-and-python/SKILL.md
+++ b/.github/skills/commenting-c-and-python/SKILL.md
@@ -60,19 +60,29 @@ typedef struct boc_message {
 
 ### Inline Comments — `//` Style
 
-Use `//` for short explanatory comments inside function bodies. Place them on
-the line **above** the code they describe, at the current indentation level:
+**An inline comment defaults to a single line of at most 120
+characters, or it is deleted.** Verbose multi-line inline comments rot
+as the code beneath them changes and make the code harder to read, not
+easier. Place the comment on the line **above** the code it describes,
+at the current indentation level:
 
 ```c
-  // two possibilities:
-  // 1. queue is empty
-  // 2. queue is inconsistent
-
-  // step 1: swap the new node in as the new head
+  // swap the new node in as the new head
   node->next = head;
 ```
 
-End-of-line `//` comments are reserved for preprocessor version annotations:
+A multi-line `//` inline comment is permitted **only** when it records
+something the code cannot express and that does not fit on one line:
+a non-obvious concurrency invariant (2PL lock ordering, MCS handoff,
+memory-ordering rationale), the rationale above a version-gate
+`#if`/`#elif` ladder, an X-macro / `clang-format off` table boundary,
+or a reference anchor that needs a line of context. When in doubt,
+collapse to one line. (Doxygen `///` / `/** */` doc-blocks are
+documentation, not inline comments, and are exempt from this rule —
+they may carry in-depth prose.)
+
+End-of-line `//` comments are reserved for preprocessor version
+annotations:
 
 ```c
 #if PY_VERSION_HEX >= 0x030E0000 // 3.14
@@ -249,19 +259,26 @@ def send(tag: str, contents: Any):
 
 ### Inline `#` Comments
 
-Use `#` comments for short notes inside function bodies. Place them on the line
-above the code, at the current indentation:
+**An inline comment defaults to a single line of at most 120
+characters, or it is deleted.** Verbose multi-line inline comments rot
+as the surrounding code changes and reduce readability. Place the
+comment on the line above the code, at the current indentation:
 
 ```python
         orphan_cowns = _core.cowns()
         if len(orphan_cowns) != 0:
             logger.debug("acquiring orphan cowns")
-            # at this stage all behaviors have exited, but it may be the case
-            # that some cowns are released but associated with this interpreter.
-            # by acquiring them, we ensure that the XIData objects have been
-            # freed _before_ this interpreter is destroyed.
+            # acquire orphaned cowns so their XIData is freed before teardown
 ```
 
+A multi-line `#` inline comment is permitted **only** when it records
+something the code cannot express and that does not fit on one line:
+a non-obvious concurrency invariant, a behavior-changing transpiler
+rule, the rationale above a version gate, or a reference anchor that
+needs context. When in doubt, collapse to one line. Docstrings are
+*not* inline comments and are exempt — they should carry in-depth,
+useful documentation across as many lines as the reader needs.
+
 Same-line `#` comments are acceptable for very short annotations:
 
 ```python
diff --git a/.github/skills/testing-with-boc/SKILL.md b/.github/skills/testing-with-boc/SKILL.md
index b1284e8..40db00a 100644
--- a/.github/skills/testing-with-boc/SKILL.md
+++ b/.github/skills/testing-with-boc/SKILL.md
@@ -1,6 +1,6 @@
 ---
 name: testing-with-boc
-description: "Write tests for bocpy Behavior-Oriented Concurrency code. Use when: writing pytest tests for @when behaviors, Cown scheduling, send/receive messaging, cown grouping, chained behaviors, exception propagation. Covers parameter-count rules, module-level class requirements, and the send/receive assertion pattern."
+description: "Write tests for bocpy Behavior-Oriented Concurrency code. Use when: writing pytest tests for @when behaviors, Cown scheduling, send/receive messaging, cown grouping, chained behaviors, exception propagation. Covers parameter-count rules, module-level class requirements, and the quiesce()+unwrap() result-reading pattern."
 ---
 
 # Testing with Behavior-Oriented Concurrency (BOC)
@@ -24,10 +24,12 @@ execute asynchronously on worker interpreters.
 |---------|-------------|
 | `Cown(value)` | A concurrently-owned wrapper. Behaviors receive exclusive temporal access to the cown's `.value`. |
 | `@when(*cowns)` | Decorator that schedules the function as a behavior. The decorator replaces the function with a `Cown` holding the return value. The first N parameters bind to the N cowns; any trailing parameters are auto-captured from the caller's frame (see below). |
+| `quiesce(timeout=None)` | Blocks until all in-flight behaviors complete **without** tearing down the workers. The preferred test-thread barrier before reading results. |
+| `Cown.unwrap()` | After quiescence, returns the behavior's result value, or **re-raises** the exception the behavior captured (verbatim, on the test thread). Raises `RuntimeError` if called while behaviors are still in flight. |
 | `send(tag, contents)` | Sends a cross-interpreter message with the given tag. |
 | `receive(tags, timeout)` | Blocks until a message with a matching tag arrives (or times out). Returns `(TIMEOUT, None)` on timeout. |
 | `TIMEOUT` | Sentinel string returned as the tag by `receive` when a timeout elapses. |
-| `wait(timeout)` | Blocks until all scheduled behaviors have completed. |
+| `wait(timeout)` | Blocks until all scheduled behaviors have completed, then tears the runtime down. |
 
 ### Cown count, parameter count, and auto-captured extras
 
@@ -152,53 +154,36 @@ def test_accumulator_bad(self):
 - Install test dependencies: `pip install -e .[test]`
 - Run: `pytest -vv`
 
-## Pattern 1 — Assert Inside a Behavior via `send`/`receive`
+## Pattern 1 — `quiesce()` + `unwrap()` (default)
 
-Because behaviors run asynchronously, you **cannot** assert directly in the test
-body after scheduling a behavior. Instead, use `send` to ship the result out of
-the behavior and `receive` in the test to collect and verify it.
+This is the preferred pattern for **result-shipping** assertions: the behavior
+computes a value (or raises), and the test thread verifies it. A behavior
+returns its result, so `@when` hands back a `Cown` holding that result. The
+test blocks once on `quiesce()` (which lets every in-flight behavior finish
+without tearing down the workers), then reads the result with
+`Cown.unwrap()`.
+
+`unwrap()` returns the stored value on success, and **re-raises** any exception
+the behavior captured — verbatim, on the test thread — so a failing assertion
+inside a behavior surfaces as a real `AssertionError` in the test. It also
+guards against misuse: calling it while behaviors are still in flight raises
+`RuntimeError`, so always `quiesce()` first.
+
+Always pass a **timeout** to `quiesce()`. If the barrier is not reached in time
+(e.g. a behavior never fires because of a `@when` arg-count mismatch) it raises
+`TimeoutError`, so the test fails fast instead of hanging forever. Use a
+module-level constant such as `QUIESCE_TIMEOUT = 5`.
 
 ```python
-from bocpy import Cown, when, send, receive, drain, TIMEOUT, wait
+from bocpy import Cown, when, quiesce, wait
+
+QUIESCE_TIMEOUT = 5  # seconds; quiesce() raises TimeoutError if exceeded
 
-RECEIVE_TIMEOUT = 10
 
 class TestExample:
     @classmethod
     def teardown_class(cls):
-        wait()  # drain the scheduler after all tests
-
-    def receive_asserts(self, count=1):
-        """Helper: collect `count` assertion messages and fail on mismatch.
-
-        Uses a timeout so that if a behavior never fires (e.g. due to a
-        parameter-count mismatch in @when) the test fails quickly instead
-        of hanging forever. The "assert" queue is always drained before
-        returning so leftover messages from a failing test do not leak
-        into subsequent tests in CI.
-        """
-        failed = None
-        timed_out = False
-        try:
-            for _ in range(count):
-                result = receive("assert", RECEIVE_TIMEOUT)
-                if result[0] == TIMEOUT:
-                    timed_out = True
-                    break
-                _, (actual, expected) = result
-                if failed is None and actual != expected:
-                    failed = (actual, expected)
-        finally:
-            drain("assert")
-
-        assert not timed_out, (
-            "Timed out waiting for an 'assert' message from a behavior. "
-            "Check that every @when arg count matches the decorated "
-            "function's parameter count."
-        )
-        if failed is not None:
-            actual, expected = failed
-            assert actual == expected, f"expected {expected!r}, got {actual!r}"
+        wait()  # tear the runtime down after all tests
 
     def test_double(self):
         x = Cown(3)
@@ -207,24 +192,63 @@ class TestExample:
         def result(x):
             return x.value * 2
 
-        @when(result)
-        def _(r):
-            send("assert", (r.value, 6))
+        quiesce(QUIESCE_TIMEOUT)
+        assert result.unwrap() == 6
+```
+
+Tuple results read just as naturally:
+
+```python
+    def test_pair(self):
+        v = Cown(Matrix(1, 2, [3.0, 4.0]))
+
+        @when(v)
+        def result(v):
+            n = v.value.normalize()
+            return (n[0, 0], n[0, 1])
+
+        quiesce(QUIESCE_TIMEOUT)
+        n0, n1 = result.unwrap()
+        assert n0 == pytest.approx(0.6)
+        assert n1 == pytest.approx(0.8)
+```
+
+### Asserting a behavior raised
+
+Because `unwrap()` re-raises the captured exception, use
+`pytest.raises` to assert that a behavior failed:
+
+```python
+    def test_raises(self):
+        x = Cown(1)
+
+        @when(x)
+        def boom(x):
+            raise ValueError("bad input")
 
-        self.receive_asserts()
+        quiesce(QUIESCE_TIMEOUT)
+        with pytest.raises(ValueError, match="bad input"):
+            boom.unwrap()
 ```
 
+Once `unwrap()` consumes an exception it clears the cown's exception flag, so
+the runtime will not later report it as unhandled, and a second `unwrap()`
+returns the (now `None`) value rather than re-raising.
+
 ### Why this pattern?
 
-`@when` returns immediately — the behavior hasn't executed yet. The test thread
-must block on `receive("assert")` to synchronize with the behavior's completion.
-Calling `wait()` in `teardown_class` ensures any remaining work finishes before
-the next test class starts.
+`@when` returns immediately — the behavior has not executed yet. `quiesce()` is
+the test-thread barrier that lets it (and any behaviors it chains) finish.
+Unlike `wait()`, it leaves the workers alive, so the result cowns remain
+readable and further behaviors can still be scheduled. Reading results
+directly avoids the message-queue round-trip and per-assert timeout polling
+that a `send`/`receive`-based assertion would require.
 
 ## Pattern 2 — Testing Nested / Chained Behaviors
 
-Behaviors can schedule further behaviors. Use multiple `send` calls and
-`receive_asserts(count)` to verify each step in the chain.
+Behaviors can schedule further behaviors. A behavior that chains more work can
+return the inner result cown; after `quiesce()` the whole chain has run, so you
+unwrap the final result on the test thread.
 
 ```python
 def test_nested(self):
@@ -237,18 +261,13 @@ def test_nested(self):
         @when(x)
         def step2(x):
             x.value *= 3      # x is now 6
+            return x.value
 
         return step2
 
-    @when(x, step1)
-    def check(x, s):
-        send("assert", (x.value, 2))
-
-        @when(x, s.value)     # s.value is the inner Cown
-        def deep_check(x, _):
-            send("assert", (x.value, 6))
-
-    self.receive_asserts(2)
+    quiesce(QUIESCE_TIMEOUT)
+    # step1 returns the inner step2 cown; unwrap it to read the final value.
+    assert step1.unwrap().unwrap() == 6
 ```
 
 ## Pattern 3 — Multi-Cown Coordination
@@ -267,14 +286,16 @@ def test_transfer(self):
         x.value -= 50
 
     @when(x)
-    def _(x):
-        send("assert", (x.value, 50))
+    def read_x(x):
+        return x.value
 
     @when(y)
-    def _(y):
-        send("assert", (y.value, 50))
+    def read_y(y):
+        return y.value
 
-    self.receive_asserts(2)
+    quiesce(QUIESCE_TIMEOUT)
+    assert read_x.unwrap() == 50
+    assert read_y.unwrap() == 50
 ```
 
 ## Pattern 4 — Cown Grouping
@@ -298,7 +319,7 @@ to `@when` becomes its own parameter in the decorated function:
 ### Full group example
 
 ```python
-from bocpy import Cown, when, send, receive
+from bocpy import Cown, when, quiesce
 
 cowns = [Cown(i) for i in range(10)]  # values 0..9, sum = 45
 
@@ -325,20 +346,17 @@ def group_single_group(g0: list[Cown[int]], single: Cown[int], g1: list[Cown[int
 
 ### Testing grouped results
 
-The results are all cowns, so use the same `send`/`receive` pattern. You can
-itself pass a list of result cowns as a group to `@when`:
+A list of result cowns can itself be passed as a group to a follow-up `@when`,
+but on the test thread simply `quiesce()` and then `unwrap()` each result:
 
 ```python
 def test_cown_grouping(self):
     expected = 45
     results = [group_sum, group_then_single, single_then_group, group_single_group]
 
-    @when(results)
-    def check(results: list[Cown]):
-        for r in results:
-            send("assert", (r.value, expected))
-
-    self.receive_asserts(len(results))
+    quiesce(QUIESCE_TIMEOUT)
+    for r in results:
+        assert r.unwrap() == expected
 ```
 
 ### Key rules for grouping
@@ -353,9 +371,9 @@ def test_cown_grouping(self):
 ## Pattern 5 — Exception Propagation
 
 If a behavior raises, the exception is captured in the returned cown's `.value`
-**and** the cown's `.exception` flag is set to `True`. This lets downstream
-behaviors distinguish a thrown exception from a value that just happens to be
-an `Exception` instance returned normally.
+**and** the cown's `.exception` flag is set to `True`. The simplest way to
+assert a behavior raised is `unwrap()` under `pytest.raises`, which re-raises
+the captured exception verbatim on the test thread:
 
 ```python
 def test_exception_in_behavior(self):
@@ -365,15 +383,15 @@ def test_exception_in_behavior(self):
     def bad(x):
         x.value /= 0          # ZeroDivisionError
 
-    @when(bad)
-    def _(b):
-        send("assert", (b.exception, True))
-        send("assert", (isinstance(b.value, ZeroDivisionError), True))
-        b.value = None         # writing .value clears the exception flag
-
-    self.receive_asserts(2)
+    quiesce(QUIESCE_TIMEOUT)
+    with pytest.raises(ZeroDivisionError):
+        bad.unwrap()
+```
 
+An `Exception` object a behavior **returns** (rather than raises) is just a
+value: `.exception` stays `False` and `unwrap()` returns it normally.
 
+```python
 def test_returned_exception_is_not_flagged(self):
     """An Exception object *returned* from a behavior is just a value."""
     x = Cown(1)
@@ -382,22 +400,19 @@ def test_returned_exception_is_not_flagged(self):
     def returns_exc(x):
         return ValueError("not really an error")
 
-    @when(returns_exc)
-    def _(r):
-        send("assert", (r.exception, False))
-        send("assert", (isinstance(r.value, ValueError), True))
-
-    self.receive_asserts(2)
+    quiesce(QUIESCE_TIMEOUT)
+    result = returns_exc.unwrap()
+    assert isinstance(result, ValueError)
 ```
 
-Notes:
+If you need to inspect the flag from **inside** a downstream behavior (rather
+than unwrap on the test thread), the same rules apply:
 
+- `cown.exception` distinguishes a thrown exception from a returned
+  `Exception` value — assert on it before `isinstance(.value, Exception)`.
 - Writing `cown.value = ...` from inside a behavior **clears** `.exception`.
-- `cown.exception` is also writable inside a behavior, in case you want to
-  manually mark or unmark a cown as carrying an error.
-- Always assert on `.exception` before `isinstance(.value, Exception)` —
-  otherwise a behavior that legitimately returns an `Exception` will be
-  indistinguishable from one that raised.
+- `cown.exception` is also writable inside a behavior, to manually mark or
+  unmark a cown as carrying an error.
 
 ## Pattern 6 — Noticeboard
 
@@ -420,6 +435,11 @@ data from the **same** snapshot — even if other behaviors write in the
 meantime. To see a write made by another behavior, schedule a follow-up
 behavior (typically by chaining via a cown returned from `@when`).
 
+Because the snapshot can only be read from **inside** a behavior, return the
+value read from the behavior so it lands in the result cown `@when` hands back,
+then `unwrap()` it on the test thread after `quiesce()` (Pattern 1). The result
+cown is used only by that one behavior, so it does not affect scheduling.
+
 ```python
 def test_noticeboard_roundtrip(self):
     x = Cown(0)
@@ -432,9 +452,10 @@ def test_noticeboard_roundtrip(self):
     # applied and step2's snapshot sees it.
     @when(x, step1)
     def step2(x, _):
-        send("assert", (notice_read("greeting"), "hello"))
+        return notice_read("greeting")
 
-    self.receive_asserts()
+    quiesce(QUIESCE_TIMEOUT)
+    assert step2.unwrap() == "hello"
 ```
 
 ### Atomic update
@@ -470,9 +491,10 @@ class TestCounter:
 
         @when(x, bump)
         def check(x, _):
-            send("assert", (notice_read("count"), 8))
+            return notice_read("count")
 
-        receive_asserts()
+        quiesce(QUIESCE_TIMEOUT)
+        assert check.unwrap() == 8
 ```
 
 ### Delete via `REMOVED`
@@ -498,9 +520,10 @@ def test_remove_via_update(self):
 
     @when(x, tick)
     def check(x, _):
-        send("assert", ("lives" in noticeboard(), False))
+        return "lives" in noticeboard()
 
-    self.receive_asserts()
+    quiesce(QUIESCE_TIMEOUT)
+    assert check.unwrap() is False
 ```
 
 ### Common noticeboard pitfalls
@@ -509,7 +532,7 @@ def test_remove_via_update(self):
 |---------|-----|
 | Reading a value back inside the **same** behavior that wrote it | The snapshot was taken at the start of the behavior. Chain a follow-up `@when` to observe the write. |
 | Passing a lambda or closure to `notice_update` | They are not picklable. Use a module-level function with `functools.partial`, or an `operator` function. |
-| Asserting in the test body that `noticeboard()` contains a key | Read inside a behavior and `send` the result out — `noticeboard()` and `notice_read()` outside any behavior return a snapshot that is never refreshed. |
+| Asserting in the test body that `noticeboard()` contains a key | Read inside a behavior, return the result, then `quiesce()` and `unwrap()` it — `noticeboard()` and `notice_read()` outside any behavior return a snapshot that is never refreshed. |
 | Writing more than 64 distinct keys | Excess writes are dropped with a logged warning — they do **not** raise. Keep tests within the limit (and `notice_delete` keys you no longer need). |
 
 ## Pattern 7 — Parameterized Tests
@@ -523,11 +546,8 @@ def test_fibonacci(self, n):
     result = fib_parallel(n)
     expected = fib_sequential(n)
 
-    @when(result)
-    def _(r):
-        send("assert", (r.value, expected))
-
-    self.receive_asserts()
+    quiesce(QUIESCE_TIMEOUT)
+    assert result.unwrap() == expected
 ```
 
 ## Pattern 8 — Testing `send`/`receive` Messaging Directly
@@ -575,10 +595,11 @@ def test_object_in_cown(self):
             c.value.increment()
 
     @when(c)
-    def _(c):
-        send("assert", (c.value.n, 10))
+    def final(c):
+        return c.value.n
 
-    self.receive_asserts()
+    quiesce(QUIESCE_TIMEOUT)
+    assert final.unwrap() == 10
 ```
 
 ## Common Pitfalls
@@ -587,12 +608,11 @@ def test_object_in_cown(self):
 |---------|-----|
 | **Parameter count mismatch in `@when`** | The decorated function must have **exactly** as many parameters as `@when` arguments. A mismatch crashes the worker. Use closure variables instead of default arguments to capture extra values. |
 | **Classes/functions defined inside a test** | Behaviors run in sub-interpreters that import the module. Define all classes and functions used in behaviors at **module level** so workers can resolve them. |
-| Asserting in the test body right after `@when` | The behavior hasn't run yet. Use `send`/`receive` to synchronize. |
+| Asserting in the test body right after `@when` | The behavior hasn't run yet. Call `quiesce()` first, then read results with `unwrap()` (Pattern 1). |
 | `receive` without a timeout | If a behavior crashes silently, the test hangs forever. Always pass a timeout (e.g. `RECEIVE_TIMEOUT = 10`) and assert the result is not `TIMEOUT`. |
 | Forgetting `wait()` in teardown | Pending behaviors may leak into the next test class. Always call `wait()` in `teardown_class`. |
-| Reading `cown.value` outside a behavior | A cown must be acquired first. Read values inside `@when` or use `send`/`receive`. |
+| Reading `cown.value` outside a behavior | A cown must be acquired first. After `quiesce()`, use `unwrap()` (which acquires for you); otherwise read inside `@when`. |
 | Using default arguments to capture loop variables | Default args add parameters, breaking the arg-count rule. Use a closure variable instead: `val = i` on a separate line before `@when`. |
-| Mismatched `receive_asserts` count | The count must match the exact number of `send("assert", ...)` calls expected. |
 | Non-XIData-compatible objects in cowns across interpreters | Stick to built-in types or objects that support cross-interpreter data. |
 | Importing `unittest.mock` in a BOC test | The transpiler exports the whole test module for import in every worker sub-interpreter. `unittest.mock` transitively imports `asyncio`, which can deadlock during PEP 684 per-interpreter init (observed on macOS arm64 + Python 3.12/3.13). Use the in-house `mockreplacement.patch_attr` / `Recorder` helpers (see `test/mockreplacement.py`), and import them **inside the test method** — never at module scope, because workers also fail to find `mockreplacement` on their `sys.path` during bootstrap. |
 | Test function names with uppercase letters (N802) | Test names must be lowercase. E.g., `test_t_equals_transpose`, **not** `test_T_equals_transpose`, even when testing a property like `.T`. |
diff --git a/.github/skills/thinking-in-boc/SKILL.md b/.github/skills/thinking-in-boc/SKILL.md
index f12f86b..f3ec12d 100644
--- a/.github/skills/thinking-in-boc/SKILL.md
+++ b/.github/skills/thinking-in-boc/SKILL.md
@@ -114,14 +114,18 @@ later `@when` to enforce happens-after across unrelated data:
 ```python
 @when(x)
 def writer(x):
-    notice_write("k", x.value)
-    notice_sync()                           # commit before returning
+    x.value = compute()
 
-@when(x, writer)                            # waits for writer to finish
-def reader(x, _):
-    assert notice_read("k") == x.value
+@when(y, writer)                            # y is unrelated; writer is the result cown
+def reader(y, w):
+    consume(y.value, w.value)               # runs only after writer finished
 ```
 
+Here `reader` touches its own data (`y`) and would otherwise be free to run
+concurrently with `writer`. Depending on the `writer` result cown serializes
+them: the runtime cannot acquire it until `writer` has returned. `w.value` is
+whatever `writer` returned (`None` in this sketch, which has no `return`).
+
 ### 4. Run when *any* worker is free — `@when()`
 
 `@when()` with no arguments schedules a behavior with no data dependencies.
@@ -176,30 +180,7 @@ cown between chunks, so:
 `prime_factor.py` (`sieve_check` → `sieve_work` → `sieve_check`) is the
 canonical example in this repository.
 
-### 6. Flushing your own queued mutations — `notice_sync()`
-
-The noticeboard mutator runs on its own thread. `notice_write` /
-`notice_update` / `notice_delete` are fire-and-forget. If a *subsequent
-behavior* must observe your noticeboard mutation, call `notice_sync()` at
-the end of the writing behavior:
-
-```python
-@when(x)
-def writer(x):
-    notice_write("k", v)
-    notice_sync()                           # block until commit
-
-@when(x, writer)                            # now reader sees v
-def reader(x, _):
-    assert notice_read("k") == v
-```
-
-`notice_sync()` flushes **only the calling thread's** prior writes. For
-cross-producer ordering, lean on `@when(cowns)` (pattern 2) — let the cown
-graph do the synchronization, and let each writer's `notice_sync()` make
-its own commit visible before it releases its cown.
-
-### 7. Single-assignment rendezvous — the behavior's own result cown
+### 6. Single-assignment rendezvous — the behavior's own result cown
 
 `@when` returns a `Cown` holding whatever the behavior returns. That cown
 *is* your rendezvous — there is no need to allocate a separate `Cown(None)`
diff --git a/.github/workflows/pr_gate.yml b/.github/workflows/pr_gate.yml
index 8bd1cbd..264b184 100644
--- a/.github/workflows/pr_gate.yml
+++ b/.github/workflows/pr_gate.yml
@@ -243,6 +243,10 @@ jobs:
         with:
           python-version: ${{ matrix.python_version }}
           architecture: x86
+          # 32-bit CPython is not pre-installed in the Windows 2025
+          # toolcache, so resolve against the downloadable manifest
+          # (which ships win32 builds) instead of a local cache hit.
+          check-latest: "true"
 
       - name: Python test
         run: pytest -vv -s
diff --git a/.gitignore b/.gitignore
index 19d4474..4496a84 100644
--- a/.gitignore
+++ b/.gitignore
@@ -210,3 +210,4 @@ __marimo__/
 .env*
 .vscode
 .copilot
+results
\ No newline at end of file
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 1f74231..aa60494 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,3 +1,140 @@
+## 2026-06-08 - Version 0.10.0
+A result-reading and documentation release. `Cown.unwrap()` replaces
+ad-hoc context-manager reads of behavior results with a single
+quiescence-guarded call lowered to the C capsule, and the test suite
+moves wholesale to the `quiesce()` + `unwrap()` pattern. `Matrix`
+gains arg-reductions (`argmin` / `argmax`) and an explicit PRNG
+`seed`, and its matmul kernel is re-ordered for cache-friendly
+auto-vectorization (bit-for-bit identical output). The legacy
+`notice_sync` shim is removed in favour of
+`quiesce(noticeboard=True)` for reads and the new `notice_seed`
+for synchronous main-thread seeding.
+
+**New Features**
+
+- **`notice_seed(key, value)`** — a synchronous, main-interpreter-only
+  noticeboard write that commits under the noticeboard mutex *before it
+  returns*, so every behavior scheduled afterwards observes it. Unlike
+  the fire-and-forget `notice_write`, it gives read-your-writes ordering
+  for installing read-mostly configuration before scheduling the
+  behaviors that read it, and it starts the runtime if called first — so
+  seeding can be a program's first bocpy call with no explicit `start()`.
+  It is a plain overwrite and does not provide `notice_update`'s
+  read-modify-write atomicity. Calling it from a worker raises
+  `RuntimeError`.
+- **`Cown.unwrap()`** — return the cown's stored value, or re-raise a
+  captured behavior exception on the caller's thread (Rust
+  `Result::unwrap` shape). Acquires the cown for the read and requires
+  global quiescence (`quiesce` / `wait`) first, raising
+  `RuntimeError` otherwise so a result is never read while its
+  producer is still in flight. Lowered to a C-level
+  `CownCapsule.unwrap`, so a behavior that returns a `Cown`
+  (surfacing downstream as a bare `CownCapsule`) unwraps the same way
+  without rewrapping. `unwrap()` *consumes* the cown: it takes the
+  stored payload by reference and resets the cown to hold `None` before
+  releasing it, so the returned object is never re-serialized back into
+  the cown. This matters for move-typed payloads such as `Matrix`,
+  whose ownership would otherwise be flipped away from the caller on
+  release, leaving an unreadable result. Because the payload is removed,
+  a captured exception is not re-reported when the cown is dropped, and a
+  second `unwrap()` returns `None`. The emptied cown stays schedulable,
+  so a later behavior can refill it.
+- **`Matrix.argmin(axis=None)` / `Matrix.argmax(axis=None)`** — index
+  of the minimum / maximum element, first occurrence on ties. Flat
+  (`axis=None`) returns a row-major flat index as an `int`; `axis=0` /
+  `axis=1` return per-column / per-row index vectors. NaN elements are
+  skipped unless the running extreme starts at NaN, which pins the result
+  to that position (this differs from NumPy, which propagates NaN).
+- **`Matrix.seed(value)`** — classmethod seeding the process-global C
+  PRNG used by `normal()` / `uniform()`, making subsequent draws
+  reproducible when generation stays on a single thread.
+- **`Matrix` pickling** — `Matrix` now supports `pickle` (all
+  protocols) and `copy.deepcopy` via `__reduce__`, so a matrix nested in
+  a pickled container (dict, list, …) round-trips with its neighbours
+  instead of raising `TypeError`. Serialization copies the raw,
+  native-endian, row-major `double` buffer in one block, so the cost is
+  linear in the element count with no per-element Python object churn and
+  every value (including `NaN`, `±inf`, `-0.0`, and subnormals) is
+  preserved bit-for-bit. The current interpreter must own the matrix:
+  pickling one that has been released into a `Cown` raises
+  `RuntimeError`. The encoding is native-endian, so a pickle is not
+  portable across architectures of differing byte order.
+- **`examples/fanout_benchmark.py`** — a dispatch-rate microbenchmark
+  for the fanout workload (a producer that allocates fresh consumer
+  cowns it does not hold and dispatches one `@when` each), surfacing
+  per-worker queue contention (`enqueue_cas_retries`) as the gating
+  signal. Complements the chain workload in `examples/benchmark.py`.
+
+**Improvements**
+
+- **matmul cache-friendly reorder** — `impl_matmul` is re-ordered from
+  `ijk` to `ikj` so the inner loop walks contiguous rows of the
+  right-hand operand and the output, enabling compiler
+  auto-vectorization. Output is bit-for-bit identical (each inner
+  product still accumulates `k` in ascending order); measured ~2.9–3.2×
+  faster on square shapes, ~1.5–1.8× on rectangular ones. A
+  bitwise-reproducibility regression test pins the accumulation order.
+
+**Bug Fixes**
+
+A warm welcome and thank-you to first-time contributor **Shivanand
+Mishra** (@xemishra), who tracked down and fixed a subtle transpiler
+bug this release — exactly the kind of sharp-eyed catch that makes the
+project better.
+
+- **`@when` result assignment dropped for module-level behaviors**
+  (#30, thanks @xemishra) — a behavior defined at module level
+  transpiled without its result cown, so the exported module silently
+  dropped the return value and downstream behaviors could not schedule
+  over it. Fixed, with a regression test guarding the exported-module
+  shape.
+- **Nested `@when` capture** — the transpiler now correctly surfaces a
+  nested `@when`'s free names as the outer behavior's captures and
+  resolves its cown arguments in the outer frame, instead of leaving
+  them to Python's closure machinery where they could not be reached
+  from the worker interpreter.
+- **`Matrix` range/return checks** — added overflow and return-value
+  checks on the `range_read` path uncovered while migrating the
+  matrix tests.
+
+**Breaking Changes**
+
+- **`notice_sync` removed** — the noticeboard-sync shim is gone from
+  `bocpy.__all__`. Use `quiesce(noticeboard=True)` instead, which
+  blocks until in-flight behaviors complete and returns a noticeboard
+  snapshot without tearing the runtime down.
+
+**Documentation**
+
+- Removed the `notice_sync` references from `noticeboard` and the
+  type stubs; documented the NaN tie-break behavior of
+  `argmin` / `argmax`; corrected the happens-after example in the
+  `thinking-in-boc` skill to order across genuinely unrelated data;
+  added a `fanout_benchmark.py` section to the examples README.
+
+**Tests**
+
+- Migrated `test_boc.py`, `test_noticeboard.py`, and the scheduler /
+  pinned-pump suites to the `quiesce()` + `Cown.unwrap()` pattern.
+  Added matmul bitwise-reproducibility and `argmin` / `argmax` NaN
+  regression tests.
+
+**Dependencies**
+
+- Bumped the `github-actions` group (#31, #27, dependabot):
+  `actions/checkout` 6.0.2 → 6.0.3 and `pypa/cibuildwheel`
+  3.4.1 → 4.0.0.
+
+**Internal**
+
+- Large comment scrub across the C extensions, Python runtime, scripts,
+  and tests, followed by a remediation pass that restored load-bearing
+  rationale (memory-ordering fences, UAF guards, deliberate-leak notes,
+  and the vendored Apache-2.0 provenance header) as condensed
+  summaries.
+- Ignored Sphinx-related updates in `dependabot.yml` to keep the docs
+  toolchain pinned.
+
 ## 2026-06-05 - Version 0.9.0
 Main-pinned cowns — a new `PinnedCown` subclass holds its
 value as a plain `PyObject *` on the main interpreter, never
diff --git a/CITATION.cff b/CITATION.cff
index 174d9cc..b938e57 100644
--- a/CITATION.cff
+++ b/CITATION.cff
@@ -5,6 +5,6 @@ authors:
   given-names: "Matthew Alastair"
   orcid: "https://orcid.org/0000-0002-1019-8036"
 title: "bocpy"
-version: 0.9.0
-date-released: 2026-06-05
+version: 0.10.0
+date-released: 2026-06-08
 url: "https://github.com/microsoft/bocpy"
\ No newline at end of file
diff --git a/examples/README.md b/examples/README.md
index dfa1ecb..cec2221 100644
--- a/examples/README.md
+++ b/examples/README.md
@@ -83,3 +83,22 @@ key knobs are summarised below.
 
 See [`scripts/bench_matrix.py`](../scripts/bench_matrix.py) for the
 matrix-arithmetic micro-bench used to guard `_math.c` performance.
+
+## [Fanout Benchmark](fanout_benchmark.py)
+`fanout_benchmark.py` measures the dispatch-rate ceiling for the *fanout*
+workload: a producer behavior that, on every step, allocates a batch of
+fresh consumer cowns it does not hold and dispatches one `@when` per
+consumer before rescheduling itself. Because the producer never holds the
+consumer cowns, every child dispatch exercises the producer-local arm of
+the scheduler, so the benchmark surfaces per-worker queue contention
+(`enqueue_cas_retries`) — complementing `benchmark.py`'s chain workload.
+
+- `--producers`, `--fanout-width`, `--producer-steps` — shape the workload
+  (number of producer cowns, consumers dispatched per step, and steps per
+  producer).
+- `--payload-rows` / `--payload-cols` — size of each consumer's `Matrix`.
+- `--sweep-axis` / `--sweep-values` — sweep one knob across a list of values
+  in a single run.
+- `--repeats`, `--output`, `--quiet`, `--json-stdout` — repeat-count,
+  results-file path, and reporting toggles for batch runs.
+
diff --git a/examples/bank.py b/examples/bank.py
index 3478d39..2110993 100644
--- a/examples/bank.py
+++ b/examples/bank.py
@@ -34,12 +34,6 @@ def do_transfer(src: Cown[Account], dst: Cown[Account]):
         else:
             print("failure")
 
-        # Schedule follow-up behaviors that each acquire only a single
-        # account cown.  These demonstrate that a behavior body can
-        # schedule further behaviors on a subset of its cowns — the
-        # inner behaviors will run after the outer one releases.  The
-        # two inner behaviors are independent (they hold disjoint
-        # cowns) and may run in either order or in parallel.
         @when(src)
         def _(a: Cown[Account]):
             print("src (after transfer):", a.value)
diff --git a/examples/benchmark.py b/examples/benchmark.py
index 8863202..2d919c5 100644
--- a/examples/benchmark.py
+++ b/examples/benchmark.py
@@ -33,11 +33,10 @@
 from typing import Optional
 
 from bocpy import _core
-from bocpy import (Cown, Matrix, notice_write, noticeboard, PinnedCown,
+from bocpy import (Cown, Matrix, notice_seed, notice_write, noticeboard,
+                   PinnedCown,
                    pump, start, wait, when)
 
-# Sentinels for the parent/child JSON protocol.  Uppercase so the
-# transpiler keeps them as module-level constants in the worker export.
 SENTINEL_BEGIN = "---BOCPY-BENCH-BEGIN---"
 SENTINEL_END = "---BOCPY-BENCH-END---"
 SCHEMA_VERSION = 1
@@ -58,11 +57,6 @@ def _physical_cpu_count() -> int:
     return os.cpu_count() or 1
 
 
-# ---------------------------------------------------------------------------
-# Behavior code (chain workload)
-# ---------------------------------------------------------------------------
-
-
 class ChainState:
     """Per-chain mutable state carried inside a ``Cown[ChainState]``.
 
@@ -125,16 +119,7 @@ def schedule_step(state_cown: Cown, window_list: list, group_size: int) -> None:
     @when(state_cown, window_list)
     def _step(state, window):
         cs = state.value
-        # When ``cr_null`` is set, skip the matmul loop entirely.  The
-        # behavior still acquires its window of cowns, mutates
-        # ``ChainState``, and reschedules itself — so the measured
-        # throughput reflects pure BOC runtime overhead (2PL, queue
-        # ops, sub-interpreter crossings, return-cown allocation)
-        # with the application work removed.
         if not noticeboard().get("cr_null", False):
-            # The inner loop's first slot multiplies window[0] by itself.
-            # Intentional — it keeps the per-behavior multiply count
-            # exactly `iters * group_size`.
             for _ in range(cs.iters):
                 for c in window:
                     window[0].value = window[0].value @ c.value
@@ -142,15 +127,17 @@ def _step(state, window):
         cs.count += 1
         cs.head_idx = (cs.head_idx + cs.stride) % cs.ring_size
         if not noticeboard().get("cr_stop", False):
-            # Pass the already-acquired `state` cown wrapper directly
-            # rather than the closure-captured `state_cown` to keep the
-            # capture set minimal.
             schedule_step(state, next_window(cs, group_size), group_size)
 
 
-# ---------------------------------------------------------------------------
-# Configuration and result types (plain data only; no Cowns)
-# ---------------------------------------------------------------------------
+# Mirror of the noticeboard's hard entry cap in boc_noticeboard.c
+# (NB_MAX_ENTRIES). The workload publishes one "ring_{r}" entry per ring
+# plus the NB_RESERVED_KEYS non-ring keys below; validate_config rejects
+# configs that would overflow this rather than dropping entries mid-run.
+NB_CAPACITY = 64
+# Non-ring noticeboard keys: cr_null, cr_stop, final_count,
+# final_count_ts_ns, pinned_dispatches.
+NB_RESERVED_KEYS = 5
 
 
 @dataclass
@@ -190,13 +177,7 @@ class RepeatResult:
     wall_clock_ns_start: int
     scheduler_stats: Optional[list] = None
     queue_stats: Optional[list] = None
-    # ``derived`` holds the post-processed metrics computed from the
-    # per-window scheduler-stats delta (see
-    # ``compute_derived_metrics``).
     derived: Optional[dict] = None
-    # Count of pinned spinner @when dispatches that fired during the
-    # run (0 when ``pinned_spinner`` was off). Lifted out of the
-    # noticeboard at shutdown.
     pinned_dispatches: int = 0
 
 
@@ -213,32 +194,20 @@ class PointResult:
     error: Optional[dict] = None
 
 
-# ---------------------------------------------------------------------------
-# Sizing / validation helpers (parent-side, no BOC required)
-# ---------------------------------------------------------------------------
-
-
 def derive_sizes(cfg: BenchConfig) -> BenchConfig:
     """Auto-size ``rings`` and ``chains_per_ring`` if not overridden.
 
+    Biases toward many rings with few chains each so the benchmark
+    measures scheduler scaling (lots of independent work) rather than
+    contention/starvation within a single oversubscribed ring.
+
     :param cfg: An input config (mutated and returned).
     :return: The same config with ``rings`` / ``chains_per_ring`` set.
     """
     if cfg.chains_per_ring is None:
-        # Use a small per-ring chain count (4) so chains never collide
-        # on adjacent slots as they advance. Independent rings carry
-        # the load instead.
         cfg.chains_per_ring = max(
             1, cfg.ring_size // (cfg.group_size * cfg.stride * 8))
     if cfg.rings is None:
-        # Bias toward more *rings* rather than more chains-per-ring:
-        # chains on the same ring contend for adjacent slots as they
-        # advance, so per-ring concurrency is bounded well below
-        # ``chains_per_ring``. Independent rings, by contrast, never
-        # collide. Provision at least ``workers * 4`` rings so every
-        # worker sees a deep, independent supply of ready chains and
-        # the measured throughput reflects scheduler scaling rather
-        # than workload starvation.
         cfg.rings = max(cfg.workers * 16 // cfg.chains_per_ring,
                         cfg.workers * 4)
     return cfg
@@ -256,6 +225,11 @@ def validate_config(cfg: BenchConfig) -> Optional[str]:
     if cfg.group_size * cfg.stride * 2 > cfg.ring_size:
         return (f"group_size*stride*2 ({cfg.group_size}*{cfg.stride}*2) "
                 f"> ring_size ({cfg.ring_size}); chains would collide")
+    nb_used = cfg.rings + NB_RESERVED_KEYS
+    if nb_used > NB_CAPACITY:
+        return (f"rings ({cfg.rings}) + {NB_RESERVED_KEYS} reserved keys "
+                f"= {nb_used} exceeds noticeboard capacity ({NB_CAPACITY}); "
+                f"lower --workers/--rings or raise --ring-size")
     if cfg.workers < 1:
         return f"workers must be >= 1, got {cfg.workers}"
     if cfg.iters < 1:
@@ -282,11 +256,6 @@ def emit_soft_warnings(cfg: BenchConfig, cpu_count: int) -> None:
               f"hyperthreads will be used)", file=sys.stderr)
 
 
-# ---------------------------------------------------------------------------
-# Workload construction
-# ---------------------------------------------------------------------------
-
-
 def build_workload(cfg: BenchConfig):
     """Build per-ring cowns and per-chain state cowns.
 
@@ -310,9 +279,7 @@ def build_workload(cfg: BenchConfig):
                                     (cfg.payload_rows, cfg.payload_cols)))
                 for _ in range(cfg.ring_size)]
         rings.append(ring)
-        notice_write(f"ring_{r}", ring)
-        # Spread chains evenly across the ring so adjacent chains'
-        # initial windows don't overlap.
+        notice_seed(f"ring_{r}", ring)
         spacing = max(1, cfg.ring_size // cfg.chains_per_ring)
         for k in range(cfg.chains_per_ring):
             head = (k * spacing) % cfg.ring_size
@@ -324,11 +291,6 @@ def build_workload(cfg: BenchConfig):
     return rings, state_cowns
 
 
-# ---------------------------------------------------------------------------
-# Snapshot helpers (used by the measurement flow)
-# ---------------------------------------------------------------------------
-
-
 def schedule_snap(state_cowns: list) -> None:
     """Schedule the final snapshot behavior.
 
@@ -349,13 +311,6 @@ def snap(states):
     notice_write("cr_stop", True)
 
 
-# The pinned cown wraps a one-slot list: ``[count]``. Counting inside
-# the cown's own value keeps the hot per-dispatch path off the
-# noticeboard, so NB_VERSION is not bumped on every spinner iteration
-# (which would invalidate every worker's cached snapshot and skew the
-# worker-throughput measurement). The spinner publishes the final
-# count to the noticeboard exactly once -- on the iteration where it
-# observes ``cr_stop`` and breaks the tail-recursion.
 _PINNED_COUNT = 0
 
 
@@ -386,11 +341,6 @@ def _spinner(p):
             notice_write("pinned_dispatches", p.value[_PINNED_COUNT])
 
 
-# ---------------------------------------------------------------------------
-# Single-point measurement (in-process; one BOC start/wait cycle)
-# ---------------------------------------------------------------------------
-
-
 def run_single_point_body(cfg: BenchConfig, repeat_index: int) -> RepeatResult:
     """Run one chain-ring measurement in a fresh BOC runtime.
 
@@ -404,16 +354,9 @@ def run_single_point_body(cfg: BenchConfig, repeat_index: int) -> RepeatResult:
     :param repeat_index: Index of this repeat for reporting.
     :return: A ``RepeatResult`` with no Cown references.
     """
-    # Start the runtime first: ``build_workload`` writes rings to the
-    # noticeboard, and noticeboard writes require the runtime to be
-    # running.
     start(worker_count=cfg.workers)
     rings, state_cowns = build_workload(cfg)
-    # Publish the null-payload toggle so worker behaviors can read it
-    # from their per-behavior noticeboard snapshot.  Written before the
-    # warmup sleep so the noticeboard thread has flushed it well
-    # before t_measure_start.
-    notice_write("cr_null", cfg.null_payload)
+    notice_seed("cr_null", cfg.null_payload)
     payload_bytes = cfg.payload_rows * cfg.payload_cols * 8
     total_bytes = cfg.rings * cfg.ring_size * payload_bytes
     print(f"workload: chain rings={cfg.rings} ring_size={cfg.ring_size} "
@@ -423,10 +366,6 @@ def run_single_point_body(cfg: BenchConfig, repeat_index: int) -> RepeatResult:
           file=sys.stderr)
 
     try:
-        # Kick off one chain per (ring, chain-slot) pair.  Recompute the
-        # head positions exactly the way `build_workload` chose them:
-        # we cannot read `cs_cown.value` from the main thread because
-        # Cowns are released to the runtime on construction.
         spacing = max(1, cfg.ring_size // cfg.chains_per_ring)
         chain_idx = 0
         for r in range(cfg.rings):
@@ -444,13 +383,6 @@ def run_single_point_body(cfg: BenchConfig, repeat_index: int) -> RepeatResult:
         wall_clock_ns_start = time.time_ns()
         t_measure_start_ns = time.perf_counter_ns()
 
-        # Measurement window. When ``--pinned-spinner`` is set, drive
-        # a tail-recursing @when on a PinnedCown by hand from this
-        # thread via ``pump(max_behaviors=1)``. The spinner's per-body
-        # ``time.sleep(sleep_s)`` self-paces the dispatch rate. The
-        # point is to load the C-level pinned-queue 0->1 wakeup path
-        # (single terminator cv broadcast per pump) and measure
-        # worker-throughput regression under that load.
         if cfg.pinned_spinner:
             pinned = PinnedCown([0])
             schedule_pinned_spinner(pinned, cfg.pinned_spinner_sleep_s)
@@ -462,35 +394,18 @@ def run_single_point_body(cfg: BenchConfig, repeat_index: int) -> RepeatResult:
 
         schedule_snap(state_cowns)
 
-        # Snapshot tagged-queue counters BEFORE wait() tears the
-        # runtime down. Per-tag assignments are rebound on the next
-        # start(), so capture here while they still reflect this run.
         queue_stats_snap = (
             _core.queue_stats() if hasattr(_core, "queue_stats") else None
         )
     finally:
-        # Drop bare-Cown locals before wait(). ``pinned`` (when set)
-        # is intentionally left in scope: ``wait()`` auto-pumps any
-        # remaining pinned bodies on this thread, the spinner
-        # observes ``cr_stop`` from snap, publishes its final count,
-        # and stops re-scheduling so the queue drains.
         del rings
         del state_cowns
-        # ``wait(stats=True, noticeboard=True)`` returns a WaitResult
-        # carrying both the per-worker scheduler_stats snapshot and
-        # the final noticeboard contents. Both are captured AFTER all
-        # behaviors completed but BEFORE the per-worker array and the
-        # noticeboard entries are freed -- the only correct moment.
         wait_result = wait(stats=True, noticeboard=True)
         sched_stats_end = wait_result.stats
         nb_snap = wait_result.noticeboard
 
     total = int(nb_snap.get("final_count", 0))
     pinned_dispatches = int(nb_snap.get("pinned_dispatches", 0))
-    # Use the snap behavior's own write-time so the elapsed_s
-    # numerator denominator pairing matches: ``total`` is the count
-    # snap observed at that instant, ``t_measure_start`` is when the
-    # chains began contributing to it.
     snap_ts_ns = nb_snap.get("final_count_ts_ns")
     if snap_ts_ns is None:
         raise RuntimeError("snap behavior did not publish final_count_ts_ns")
@@ -510,16 +425,6 @@ def run_single_point_body(cfg: BenchConfig, repeat_index: int) -> RepeatResult:
                         pinned_dispatches=pinned_dispatches)
 
 
-# ---------------------------------------------------------------------------
-# Stats-delta + derived metrics
-# ---------------------------------------------------------------------------
-
-
-# Counter fields in ``_core.scheduler_stats()`` that are monotonically
-# increasing per-worker counters and therefore subtractable across two
-# snapshots.  Non-counter fields (``last_steal_attempt_ns``,
-# ``parked``) are carried over from the end-of-window snapshot
-# unchanged because subtracting them is meaningless.
 _COUNTER_FIELDS = (
     "pushed_local",
     "dispatched_to_pending",
@@ -584,10 +489,6 @@ def compute_derived_metrics(stats: Optional[list],
     }
     if not stats:
         return out
-    # Producer worker = the worker with the most local pushes over
-    # the measurement window. For chain that is whichever worker's
-    # queue saw the most ``schedule_fifo`` evicts of ``pending`` to
-    # ``q``.
     pushed_local = [int(w.get("pushed_local", 0)) for w in stats]
     if not pushed_local or max(pushed_local) == 0:
         return out
@@ -610,11 +511,6 @@ def compute_derived_metrics(stats: Optional[list],
     return out
 
 
-# ---------------------------------------------------------------------------
-# Subprocess orchestration
-# ---------------------------------------------------------------------------
-
-
 def cfg_to_argv(cfg: BenchConfig) -> list:
     """Render a ``BenchConfig`` as CLI args for a child invocation.
 
@@ -647,9 +543,6 @@ def cfg_to_argv(cfg: BenchConfig) -> list:
     return args
 
 
-# Sidechannel: the parent passes its --emit-scheduler-stats flag down
-# to the child via an env var so cfg_to_argv stays a pure function of
-# BenchConfig (the flag is a reporting concern, not a workload knob).
 BOCPY_BENCH_EMIT_SCHED_STATS_ENV = "BOCPY_BENCH_EMIT_SCHED_STATS"
 
 
@@ -724,11 +617,6 @@ def _extract_sentinel_payload(stdout: str) -> Optional[dict]:
         return None
 
 
-# ---------------------------------------------------------------------------
-# Sweep orchestration (parent side)
-# ---------------------------------------------------------------------------
-
-
 def cfg_for_axis(base: BenchConfig, axis: str, value) -> BenchConfig:
     """Clone ``base`` with one axis varied to ``value``.
 
@@ -740,7 +628,6 @@ def cfg_for_axis(base: BenchConfig, axis: str, value) -> BenchConfig:
     :return: A fresh ``BenchConfig`` with that axis applied.
     """
     cfg = BenchConfig(**asdict(base))
-    # Reset auto-sized fields so each point recomputes.
     cfg.rings = base.rings
     cfg.chains_per_ring = base.chains_per_ring
     if axis == "workers":
@@ -838,7 +725,7 @@ def run_sweep(axis: str, values: list, base: BenchConfig,
                         error={"message": str(ex), "stderr_tail": ""})
                     points.append(asdict(point))
                     _flush_results(output_path, metadata, sweep_meta, points)
-                    repeats = None  # marker: already appended
+                    repeats = None
                     break
         except KeyboardInterrupt:
             interrupted = True
@@ -926,11 +813,6 @@ def _json_default(obj):
                     "JSON-serializable")
 
 
-# ---------------------------------------------------------------------------
-# Metadata
-# ---------------------------------------------------------------------------
-
-
 def collect_metadata(argv: list, git_sha: Optional[str]) -> dict:
     """Collect metadata for the top of the results JSON.
 
@@ -993,11 +875,6 @@ def _git_sha() -> Optional[str]:
     return None
 
 
-# ---------------------------------------------------------------------------
-# ASCII table renderer
-# ---------------------------------------------------------------------------
-
-
 def render_table(document: dict) -> str:
     """Render a compact ASCII summary table from a results document.
 
@@ -1078,11 +955,6 @@ def _axis_label(axis: str, pt: dict) -> str:
     return "-"
 
 
-# ---------------------------------------------------------------------------
-# CLI
-# ---------------------------------------------------------------------------
-
-
 def parse_payload_token(token: str) -> tuple:
     """Parse a payload token of the form ``"<rows>x<cols>"``.
 
@@ -1274,12 +1146,8 @@ def child_main(args) -> int:
         "pinned_dispatches": rep.pinned_dispatches,
     }
     if args.emit_scheduler_stats:
-        # Read from the snapshot taken INSIDE run_single_point_body,
-        # before wait() freed the per-worker array. Querying _core
-        # here would return empty lists.
         payload["scheduler_stats"] = rep.scheduler_stats or []
         payload["queue_stats"] = rep.queue_stats or []
-    # Always forward derived metrics (small dict; harmless when None).
     if rep.derived is not None:
         payload["derived"] = rep.derived
     sys.stdout.write("\n" + SENTINEL_BEGIN + "\n")
@@ -1302,7 +1170,6 @@ def parent_main(args) -> int:
         print(f"benchmark: {ex}", file=sys.stderr)
         return 2
 
-    # Pre-spawn validation across every sweep point.
     cpu = _physical_cpu_count()
     derived_points = []
     for value in sweep_values:
@@ -1317,13 +1184,9 @@ def parent_main(args) -> int:
 
     git_sha = _git_sha()
 
-    # Sidechannel: forward the emit-scheduler-stats flag to children
-    # via an env var. cfg_to_argv stays a pure function of BenchConfig
-    # because the flag is a reporting concern, not a workload knob.
     if args.emit_scheduler_stats:
         os.environ[BOCPY_BENCH_EMIT_SCHED_STATS_ENV] = "1"
 
-    # Wall-clock estimate for sweep duration.
     startup_slack = 5.0
     est = sum((cfg.duration + cfg.warmup + startup_slack) * base.repeats
               for cfg in derived_points)
diff --git a/examples/boids.py b/examples/boids.py
index 6d3a02c..7b0f712 100644
--- a/examples/boids.py
+++ b/examples/boids.py
@@ -239,11 +239,6 @@ def __init__(self, num_boids: int, width: int, height: int, spacing=50):
         self.spacing = spacing
         self.num_boids = num_boids
         positions, velocities = init_boids(num_boids, width, height)
-        # The per-frame writeback runs on the main thread against
-        # these matrices; per-cell physics runs on workers against
-        # cell-local cowns. ``self.positions`` / ``self.velocities``
-        # alias the same matrix objects for direct main-thread reads
-        # (spatial hashing, drawing).
         self.positions_cown = PinnedCown(positions)
         self.velocities_cown = PinnedCown(velocities)
         self.positions = positions
@@ -261,13 +256,11 @@ def spatial_hashing(self, positions: Matrix):
 
         :param positions: An *N* x 2 matrix of boid positions.
         """
-        # clear cell start
         for i in range(self.num_cells + 1):
             self.cell_start[i] = 0
 
         self.grid_cells.clear()
 
-        # first we count how many entries are in each cell
         for i, pos in enumerate(positions):
             r = int_coord(pos.y, self.spacing)
             c = int_coord(pos.x, self.spacing)
@@ -277,7 +270,6 @@ def spatial_hashing(self, positions: Matrix):
             self.hash_values[i] = h
             self.cell_start[h] += 1
 
-        # perform the cumulative sum
         start = 0
         for i in range(self.num_cells):
             start += self.cell_start[i]
@@ -285,12 +277,8 @@ def spatial_hashing(self, positions: Matrix):
 
         self.cell_start[-1] = start
 
-        # populate the cell entries
         for i in range(self.num_boids):
             h = self.hash_values[i]
-            # the effect is that we fill from the back. Once all
-            # nodes have been placed, the start will be at the
-            # beginning of the cell entries.
             self.cell_start[h] -= 1
             self.cell_entries[self.cell_start[h]] = i
 
@@ -343,10 +331,6 @@ def step(self, width: int, height: int):
         results = [cd.update(self.cell_data, width, height) for cd in cells]
         self.num_behaviors = len(cells)
 
-        # One pinned dispatch per frame -- coarse-grained writeback.
-        # Workers compute per-cell physics in parallel; this single
-        # main-thread behavior batches the global-matrix update once all
-        # cell results are ready.
         @when(results, self.positions_cown, self.velocities_cown)
         def _writeback(per_cell, all_pos, all_vel):
             pos_mat = all_pos.value
@@ -450,8 +434,6 @@ def update(self, delta_time: float):
             result = pump()
             self.pending_updates -= result.executed
             if self.pending_updates > 0:
-                # avoid creating extra work until the previous
-                # update has been applied
                 return
 
             self.pending_updates += 1
@@ -517,8 +499,6 @@ def update(self, delta_time: float):
                              "triangle size (default: 1.0).")
     args = parser.parse_args()
 
-    # Validate at the boundary; downstream code (Matrix sizing, hash modulo,
-    # 1.0/fps) assumes positive values and would crash or silently misbehave.
     if args.boids <= 0:
         parser.error("--boids must be positive")
     if args.width <= 0 or args.height <= 0:
@@ -532,25 +512,14 @@ def update(self, delta_time: float):
     if args.scale <= 0:
         parser.error("--scale must be positive")
 
-    # Start the BOC runtime explicitly so --workers takes effect for every
-    # mode.
     start(worker_count=args.workers)
 
     if args.mode == "video":
         import subprocess
 
-        # Create the window first so we can query the actual framebuffer
-        # dimensions (which may differ from logical size on HiDPI displays).
-        # The overlay (boid count / behavior rate) is suppressed in video
-        # mode so the rendered output stays clean.
         boids = Boids(args.width, args.height, args.boids,
                       show_overlay=False, scale=args.scale)
 
-        # Allow graceful close: override on_close to set a flag and return
-        # True so pyglet does not destroy the window mid-frame. The loop
-        # below honors the flag; the finally block tears the window down.
-        # Use a bocpy-prefixed attribute name to avoid colliding with any
-        # underscore-prefixed pyglet internals.
         boids.bocpy_video_closing = False
 
         def _on_close():
@@ -559,7 +528,6 @@ def _on_close():
 
         boids.on_close = _on_close
 
-        # Determine the real framebuffer size (HiDPI-correct).
         boids.switch_to()
         boids.dispatch_events()
         boids.clear()
@@ -572,8 +540,6 @@ def _on_close():
                   f"(window requested {args.width}x{args.height}); "
                   f"encoding at framebuffer resolution.")
 
-        # Validate frame count BEFORE spawning ffmpeg so we don't leak the
-        # subprocess if the duration/fps combination produces no frames.
         num_frames = int(args.duration * args.fps)
         if num_frames == 0:
             print(f"error: --duration {args.duration} is too short for "
@@ -604,8 +570,6 @@ def _on_close():
             wait()
             return
         except OSError as exc:
-            # Other startup failures (read-only output dir, ENOMEM, etc.)
-            # also need cleanup to avoid leaking the window/runtime.
             print(f"error: failed to start ffmpeg: {exc}")
             boids.close()
             wait()
@@ -632,9 +596,6 @@ def _on_close():
                     boids.behaviors_label.draw()
 
                 buf = pyglet.image.get_buffer_manager().get_color_buffer()
-                # Defensive: framebuffer size must remain stable for the
-                # encoder. If it changes (window manager fiddling, monitor
-                # move) we abort rather than emit garbled frames.
                 if (buf.width, buf.height) != (fb_width, fb_height):
                     print(f"error: framebuffer size changed mid-record "
                           f"({fb_width}x{fb_height} -> "
@@ -668,8 +629,6 @@ def _on_close():
                     except subprocess.TimeoutExpired:
                         pass
             finally:
-                # Always release the pyglet window and BOC runtime, even if
-                # ffmpeg cleanup raised something unexpected.
                 try:
                     boids.close()
                 finally:
@@ -677,9 +636,6 @@ def _on_close():
 
         if ff.returncode != 0:
             if ff.returncode is None:
-                # We tried to kill ffmpeg but it never reaped within 5s after
-                # SIGKILL. The output file (if any) is almost certainly
-                # truncated and missing the libx264 moov atom.
                 print("error: ffmpeg was killed and did not exit; "
                       "output file is likely truncated.")
             else:
@@ -688,9 +644,6 @@ def _on_close():
                 print(ff_stderr.decode("utf-8", errors="replace"), end="")
             return
 
-        # Successful render: report the average behavior rate over the
-        # entire run, computed from cumulative counters (not the rolling
-        # 1s window used for the on-screen overlay).
         if boids.total_elapsed > 0:
             avg_rate = boids.total_behaviors / boids.total_elapsed
             print(f"behavior/s (avg over {boids.total_elapsed:.1f}s of "
diff --git a/examples/dining_philosophers.py b/examples/dining_philosophers.py
index c1a9d60..80db0b8 100644
--- a/examples/dining_philosophers.py
+++ b/examples/dining_philosophers.py
@@ -1,10 +1,10 @@
-"""Dining philosophers demo using cowns and message passing."""
+"""Dining philosophers demo using cowns."""
 
 import argparse
 import logging
 from typing import NamedTuple
 
-from bocpy import Cown, receive, send, wait, when
+from bocpy import Cown, quiesce, wait, when
 
 
 class Fork:
@@ -28,19 +28,17 @@ def eat(self: "Philosopher"):
         """Attempt to eat; reschedule until hunger reaches zero."""
         index = self.index
 
+        # BOC acquires both forks atomically in cown-id order, so the classic deadlock cannot occur.
         @when(self.left, self.right, self.hunger)
         def take_bite(left: Cown[Fork], right: Cown[Fork], hunger: Cown[int]):
             left.value.use()
             right.value.use()
-            send("report", ("bite", index))
+            print(f"Philosopher {index} has taken a bite")
             hunger.value -= 1
             if hunger.value > 0:
                 Philosopher(index, left, right, hunger).eat()
             else:
-                # send the report after the forks have been released
-                @when()
-                def _():
-                    send("report", ("full", index))
+                print(f"Philosopher {index} is full")
 
 
 def main():
@@ -56,15 +54,7 @@ def main():
     for i in range(args.philosophers):
         Philosopher.eat(Philosopher(i, forks[i-1], forks[i], Cown(args.hunger)))
 
-    num_eating = args.philosophers
-    while num_eating > 0:
-        match receive("report"):
-            case ["report", ("bite", index)]:
-                print(f"Philosopher {index} has taken a bite")
-
-            case ["report", ("full", index)]:
-                print(f"Philosopher {index} is full")
-                num_eating -= 1
+    quiesce()
 
     for i, f in enumerate(forks):
         with f as fork:
diff --git a/examples/fanout_benchmark.py b/examples/fanout_benchmark.py
index 475559b..0ab3bf8 100644
--- a/examples/fanout_benchmark.py
+++ b/examples/fanout_benchmark.py
@@ -7,7 +7,7 @@
 1. Allocates ``fanout_width`` **fresh** ``Cown[Matrix]`` consumers
    (the producer does not hold them).
 2. Dispatches ``@when(consumer_i)`` per consumer; each child mutates
-   its own cown and emits a ``"child"`` completion token.
+   its own cown.
 3. Reschedules itself on the producer cown until ``producer_steps``
    steps have run.
 
@@ -36,18 +36,13 @@
 import time
 from typing import Optional
 
-from bocpy import Cown, Matrix, receive, send, start, wait, when
+from bocpy import Cown, Matrix, quiesce, start, wait, when
 
 SENTINEL_BEGIN = "---BOCPY-FANOUT-BEGIN---"
 SENTINEL_END = "---BOCPY-FANOUT-END---"
 SCHEMA_VERSION = 1
 
 
-# ---------------------------------------------------------------------------
-# Behavior code (fanout workload, fresh-cown shape)
-# ---------------------------------------------------------------------------
-
-
 class ProducerState:
     """Per-producer state held inside a ``Cown[ProducerState]``.
 
@@ -83,8 +78,8 @@ def schedule_child(consumer_cown: Cown, child_iters: int) -> None:
     """Schedule one child step on a fresh consumer cown.
 
     The child does ``child_iters`` in-place self-multiplications of
-    its matrix, then emits a ``("child", 1)`` token so the parent
-    can count completions.
+    its matrix. It reports nothing: BOC quiescence is the completion
+    signal, so no per-child token rides the message queue.
 
     :param consumer_cown: The child's exclusively-acquired matrix cown.
     :param child_iters: Inner-loop matmul iterations, captured.
@@ -93,7 +88,6 @@ def schedule_child(consumer_cown: Cown, child_iters: int) -> None:
     def _child(c):
         for _ in range(child_iters):
             c.value = c.value @ c.value
-        send("child", 1)
 
 
 def schedule_producer(p_cown: Cown) -> None:
@@ -101,8 +95,9 @@ def schedule_producer(p_cown: Cown) -> None:
 
     Allocates ``fanout_width`` fresh ``Cown[Matrix]`` consumers,
     dispatches one child per consumer, then either reschedules
-    itself or emits ``("producer_done", producer_id)`` when
-    ``target_steps`` is reached.
+    itself or stops once ``target_steps`` is reached. The running
+    dispatch count is accumulated on the producer's state cown so the
+    parent can cross-check it after quiescence.
 
     The producer holds only ``p_cown``; the fresh consumer cowns are
     not in its acquired set, so each child dispatch takes the
@@ -122,18 +117,10 @@ def _step(producer):
         ps.dispatched += k
         ps.steps += 1
         if ps.steps >= ps.target_steps:
-            send("producer_done", (ps.producer_id, ps.dispatched))
             return
-        # Pass the already-acquired wrapper rather than the
-        # closure-captured ``p_cown`` to keep the capture set minimal.
         schedule_producer(producer)
 
 
-# ---------------------------------------------------------------------------
-# Configuration and result types
-# ---------------------------------------------------------------------------
-
-
 @dataclass
 class FanoutConfig:
     """Plain-data fanout configuration (no Cowns)."""
@@ -174,11 +161,6 @@ class PointResult:
     error: Optional[dict] = None
 
 
-# ---------------------------------------------------------------------------
-# Sizing / validation
-# ---------------------------------------------------------------------------
-
-
 def derive_sizes(cfg: FanoutConfig) -> FanoutConfig:
     """Auto-size ``producers`` and ``fanout_width`` if not overridden.
 
@@ -217,19 +199,14 @@ def validate_config(cfg: FanoutConfig) -> Optional[str]:
     return None
 
 
-# ---------------------------------------------------------------------------
-# Single-point measurement
-# ---------------------------------------------------------------------------
-
-
 def run_single_point_body(cfg: FanoutConfig, repeat_index: int) -> RepeatResult:
     """Run one fanout measurement in a fresh BOC runtime.
 
     Total expected completions = ``producers * fanout_width *
-    producer_steps``. The parent waits for that many ``child`` tokens
-    and ``producers`` ``producer_done`` tokens before tearing the
-    runtime down. ``wait(stats=True)`` returns the per-worker
-    counters captured at shutdown.
+    producer_steps``. The parent blocks on :func:`quiesce` until every
+    producer and child behavior has run, then tears the runtime down.
+    ``wait(stats=True)`` returns the per-worker counters captured at
+    shutdown.
 
     :param cfg: The fully-derived config.
     :param repeat_index: Repeat index for reporting.
@@ -247,7 +224,6 @@ def run_single_point_body(cfg: FanoutConfig, repeat_index: int) -> RepeatResult:
           f"(~{payload_bytes / 1024:.2f} KiB per consumer cown)",
           file=sys.stderr)
 
-    # Allocate producer state cowns.
     producer_cowns = [
         Cown(ProducerState(
             producer_id=pid,
@@ -259,7 +235,6 @@ def run_single_point_body(cfg: FanoutConfig, repeat_index: int) -> RepeatResult:
         for pid in range(cfg.producers)
     ]
 
-    # Generous wall-clock ceiling.
     timeout_s = max(60.0, total_expected * 0.001)
 
     try:
@@ -269,29 +244,19 @@ def run_single_point_body(cfg: FanoutConfig, repeat_index: int) -> RepeatResult:
         for p_cown in producer_cowns:
             schedule_producer(p_cown)
 
-        # Drain child completions.
-        completed = 0
-        while completed < total_expected:
-            msg = receive(["child"], timeout_s)
-            if msg is None or msg[0] != "child":
-                raise RuntimeError(
-                    f"only {completed}/{total_expected} child tokens "
-                    f"received within {timeout_s:.0f}s")
-            completed += 1
-
-        # Drain producer-done acks.
-        producer_dispatched = 0
-        for _ in range(cfg.producers):
-            msg = receive(["producer_done"], timeout_s)
-            if msg is None or msg[0] != "producer_done":
-                raise RuntimeError(
-                    f"producer_done not received within {timeout_s:.0f}s")
-            _, (_pid, count) = msg
-            producer_dispatched += count
+        try:
+            quiesce(timeout_s)
+        except TimeoutError:
+            raise RuntimeError(
+                f"fanout did not quiesce within {timeout_s:.0f}s "
+                f"(expected {total_expected} children)")
 
         t_end = time.perf_counter()
         elapsed_s = t_end - t_measure_start
 
+        completed = total_expected
+        producer_dispatched = sum(
+            p.unwrap().dispatched for p in producer_cowns)
         if producer_dispatched != completed:
             raise RuntimeError(
                 f"dispatched/completed mismatch: dispatched="
@@ -311,11 +276,6 @@ def run_single_point_body(cfg: FanoutConfig, repeat_index: int) -> RepeatResult:
         derived=compute_derived_metrics(sched_stats_end, int(completed)))
 
 
-# ---------------------------------------------------------------------------
-# Derived metrics (dispatch-contention signal)
-# ---------------------------------------------------------------------------
-
-
 def compute_derived_metrics(stats: Optional[list],
                             completed_children: int) -> dict:
     """Compute the dispatch-contention signal from a per-worker stats snapshot.
@@ -380,11 +340,6 @@ def compute_derived_metrics(stats: Optional[list],
     if total_attempts > 0:
         out["idle_ratio"] = total_failures / total_attempts
 
-    # Fairness: distribution of work across workers. We count
-    # popped_local + popped_via_steal per worker — this is what each
-    # worker actually executed (regardless of who pushed it). For a
-    # single-producer fanout the producer worker pushes everything;
-    # fairness measures whether stealing redistributed evenly.
     pops = [
         int(w.get("popped_local", 0)) + int(w.get("popped_via_steal", 0))
         for w in stats
@@ -398,7 +353,6 @@ def compute_derived_metrics(stats: Optional[list],
             out["fairness_cv"] = stdev / mean if mean > 0 else None
         else:
             out["fairness_cv"] = 0.0
-        # Gini: 0 is perfectly equal, 1 is maximally unequal.
         sorted_pops = sorted(pops)
         cum = 0
         weighted = 0
@@ -414,11 +368,6 @@ def compute_derived_metrics(stats: Optional[list],
     return out
 
 
-# ---------------------------------------------------------------------------
-# Subprocess orchestration (one repeat per child, fresh runtime)
-# ---------------------------------------------------------------------------
-
-
 def cfg_to_argv(cfg: FanoutConfig) -> list:
     """Render a ``FanoutConfig`` as CLI args for a child invocation.
 
@@ -496,11 +445,6 @@ def _extract_sentinel_payload(stdout: str) -> Optional[dict]:
         return None
 
 
-# ---------------------------------------------------------------------------
-# Sweep orchestration
-# ---------------------------------------------------------------------------
-
-
 def cfg_for_axis(base: FanoutConfig, axis: str, value) -> FanoutConfig:
     """Clone ``base`` with one axis varied to ``value``.
 
@@ -513,8 +457,6 @@ def cfg_for_axis(base: FanoutConfig, axis: str, value) -> FanoutConfig:
     cfg = FanoutConfig(**asdict(base))
     if axis == "workers":
         cfg.workers = int(value)
-        # Re-derive producers/fanout-width when sweeping workers
-        # unless the user explicitly pinned them at the base.
         if base.producers is None:
             cfg.producers = None
         if base.fanout_width is None:
@@ -672,11 +614,6 @@ def _json_default(obj):
                     "JSON-serializable")
 
 
-# ---------------------------------------------------------------------------
-# Metadata
-# ---------------------------------------------------------------------------
-
-
 def collect_metadata(argv: list, git_sha: Optional[str]) -> dict:
     """Collect metadata for the top of the results JSON."""
     try:
@@ -718,11 +655,6 @@ def _git_sha() -> Optional[str]:
     return None
 
 
-# ---------------------------------------------------------------------------
-# CLI
-# ---------------------------------------------------------------------------
-
-
 def parse_sweep_values(axis: str, raw: Optional[str]) -> list:
     """Parse ``--sweep-values`` per-axis at argparse time.
 
diff --git a/examples/prime_factor.py b/examples/prime_factor.py
index 35b4377..be98793 100644
--- a/examples/prime_factor.py
+++ b/examples/prime_factor.py
@@ -6,11 +6,8 @@
 import math
 import random
 
-from bocpy import (Cown, notice_read, notice_update, notice_write,
-                   receive, send, wait, when)
-
-
-# -- Helpers for notice_update (must be module-level and picklable) -----------
+from bocpy import (Cown, notice_read, notice_seed, notice_update,
+                   notice_write, quiesce, wait, when)
 
 
 def _merge_sieve(existing, new_primes):
@@ -29,7 +26,6 @@ def _merge_sieve(existing, new_primes):
         return new_primes
 
     cutoff = existing[-1]
-    # Binary search for the first new prime beyond the existing sieve
     lo, hi = 0, len(new_primes)
     while lo < hi:
         mid = (lo + hi) // 2
@@ -43,9 +39,6 @@ def _merge_sieve(existing, new_primes):
     return existing + new_primes[lo:]
 
 
-# -- Sieve phase: find primes from random candidates -------------------------
-
-
 class SieveLane:
     """Progress state for one sieve lane."""
 
@@ -71,7 +64,6 @@ def sieve_check(lane: Cown[SieveLane]):
     @when(lane)
     def _(lane):
         if lane.value.remaining <= 0:
-            send("sieve_done", lane.value.found)
             return
 
         sieve_work(lane)
@@ -82,7 +74,7 @@ def sieve_work(lane: Cown[SieveLane]):
     @when(lane)
     def _(lane):
         info = lane.value
-        sieve = list(notice_read("sieve") or [2, 3])
+        sieve = list(notice_read("sieve"))
         new_sieve_primes = []
         count = min(info.batch, info.remaining)
 
@@ -90,7 +82,6 @@ def _(lane):
             c = random.randrange(info.lo, info.hi) | 1
             limit = int(math.isqrt(c)) + 1
 
-            # Extend the local sieve until it covers sqrt(c)
             n = sieve[-1] + 2
             while sieve[-1] < limit:
                 if all(n % p != 0 for p in sieve if p * p <= n):
@@ -98,7 +89,6 @@ def _(lane):
                     new_sieve_primes.append(n)
                 n += 2
 
-            # Test c against the sieve
             is_prime = True
             for p in sieve:
                 if p * p > c:
@@ -119,9 +109,6 @@ def _(lane):
         sieve_check(lane)
 
 
-# -- Factor phase: Pollard's rho with parallel random walks ------------------
-
-
 class RhoLane:
     """State for one Pollard's rho random walk."""
 
@@ -166,7 +153,6 @@ def _(lane):
                 print(f"  lane {info.lane_id} found factor {d}")
                 return
             if d == n:
-                # Cycle with trivial gcd — restart with new constants
                 info.c = random.randrange(1, n)
                 info.x = random.randrange(2, n)
                 info.y = info.x
@@ -178,9 +164,6 @@ def _(lane):
         rho_check(lane, n)
 
 
-# -- Main --------------------------------------------------------------------
-
-
 def main():
     """Sieve for primes, build a semiprime, then factor it in parallel."""
     parser = argparse.ArgumentParser("Prime Factor")
@@ -197,25 +180,32 @@ def main():
 
     logging.basicConfig(level=args.loglevel)
 
-    # Phase 1 — parallel sieve to find primes from random candidates
     lo = 1 << (args.bits - 1)
     hi = (1 << args.bits) - 1
     per_lane = args.candidates // args.lanes
     print(f"sieving {args.candidates} candidates ({args.bits}-bit) "
           f"across {args.lanes} lanes ...")
 
+    # Seed the shared sieve synchronously so every lane reads a populated
+    # "sieve" entry from its first behavior; later growth is merged by the
+    # lanes' worker-side notice_update calls (see _merge_sieve).
+    notice_seed("sieve", [2, 3, 5, 7, 11, 13, 17, 19])
+
+    sieve_lane_cowns = []
     for i in range(args.lanes):
-        lane = Cown(SieveLane(i, per_lane, args.batch, lo, hi))
-        sieve_check(lane)
+        lane_cown = Cown(SieveLane(i, per_lane, args.batch, lo, hi))
+        sieve_check(lane_cown)
+        sieve_lane_cowns.append(lane_cown)
+
+    quiesce()
 
     primes = []
-    for _ in range(args.lanes):
-        _, found = receive("sieve_done")
-        primes.extend(found)
+    for lane_cown in sieve_lane_cowns:
+        with lane_cown as lane:
+            primes.extend(lane.found)
 
     print(f"found {len(primes)} primes")
 
-    # Phase 2 — pick two primes, form a semiprime, and factor it
     p, q = random.sample(primes, 2)
     n = p * q
 
@@ -223,15 +213,8 @@ def main():
     print(f"Pollard's rho with {args.lanes} parallel walks, batch={args.batch}")
 
     for i in range(args.lanes):
-        lane = Cown(RhoLane(i, n, args.batch))
-        rho_check(lane, n)
+        rho_check(Cown(RhoLane(i, n, args.batch)), n)
 
-    # Every rho lane self-terminates once `notice_read("factor")`
-    # returns a value, so the runtime quiesces on its own. `wait()`
-    # is the barrier; `noticeboard=True` lifts the result back
-    # across shutdown. `.get` (not `[]`) so a quiescence with no
-    # factor surfaces as a clean diagnostic rather than a
-    # `KeyError`.
     snap = wait(noticeboard=True)
     factor = snap.get("factor")
     if factor is None:
diff --git a/pyproject.toml b/pyproject.toml
index 01c9492..552404f 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 
 [project]
 name = "bocpy"
-version = "0.9.0"
+version = "0.10.0"
 authors = [
     {name = "bocpy Team", email="bocpy@microsoft.com"}
 ]
diff --git a/scripts/_vendored_warehouse_wheel.py b/scripts/_vendored_warehouse_wheel.py
index d83fbcd..c2dce63 100644
--- a/scripts/_vendored_warehouse_wheel.py
+++ b/scripts/_vendored_warehouse_wheel.py
@@ -56,8 +56,6 @@ def validate_record(wheel_filepath: str) -> bool:
     filename = os.path.basename(wheel_filepath)
     name, version, _ = filename.split("-", 2)
     record_filename = f"{name}-{version}.dist-info/RECORD"
-    # Files that must be missing from 'RECORD',
-    # so we ignore them when cross-checking.
     record_exemptions = {
         f"{name}-{version}.dist-info/RECORD.jws",
         f"{name}-{version}.dist-info/RECORD.p7s",
@@ -66,7 +64,7 @@ def validate_record(wheel_filepath: str) -> bool:
         with zipfile.ZipFile(wheel_filepath) as zfp:
             wheel_record_contents = zfp.read(record_filename).decode()
             record_entries = {
-                fn.replace("\\", "/")  # Normalize Windows path separators.
+                fn.replace("\\", "/")
                 for fn, *_ in csv.reader(wheel_record_contents.splitlines())
             }
             wheel_entries = {
@@ -86,8 +84,6 @@ def validate_record(wheel_filepath: str) -> bool:
         )
     return True
 
-# See:
-# https://packaging.python.org/en/latest/specifications/entry-points/#data-model
 _ENTRY_POINT_NAME_RE = re.compile(r"[\w.-]+")
 
 
@@ -115,8 +111,6 @@ def validate_entrypoints(wheel_filepath: str) -> bool:
     Validation errors are not currently reported via email.
     """
 
-    # See:
-    # <https://packaging.python.org/en/latest/specifications/entry-points/#file-format>
     class CaseSensitiveConfigParser(configparser.ConfigParser):
         optionxform = staticmethod(str)  # type: ignore[assignment]
 
@@ -124,17 +118,14 @@ class CaseSensitiveConfigParser(configparser.ConfigParser):
     name, version, _ = filename.split("-", 2)
     entry_points_filename = f"{name}-{version}.dist-info/entry_points.txt"
 
-    # A wheel might not have an `entry_points.txt` file.
     try:
         with zipfile.ZipFile(wheel_filepath) as zfp:
             entry_points_contents = zfp.read(entry_points_filename).decode()
     except KeyError:
         return True
     except UnicodeError:
-        # `entry_points.txt` must be decodable as UTF-8.
         raise InvalidWheelEntryPointsError("entry_points.txt is not decodable as UTF-8")
 
-    # The Entry Points specification requires `=` as the delimiter.
     parser = CaseSensitiveConfigParser(delimiters=("=",))
     try:
         parser.read_string(entry_points_contents)
@@ -147,12 +138,7 @@ class CaseSensitiveConfigParser(configparser.ConfigParser):
         try:
             section = parser[section_name]
         except KeyError:
-            # `entry_points.txt` might not have these sections.
             continue
         _validate_section(section)
 
-        # TODO: We could consider validating the entry point value as well.
-        # See:
-        # https://packaging.python.org/en/latest/specifications/entry-points/#data-model
-
     return True
diff --git a/scripts/bench_matrix.py b/scripts/bench_matrix.py
index 277565b..46bae19 100644
--- a/scripts/bench_matrix.py
+++ b/scripts/bench_matrix.py
@@ -59,11 +59,10 @@
 
 
 REPS = 11
-MIN_BATCH_NS = 1_000_000  # 1 ms
-# Per-thunk warmup window. Long enough to push past Intel boost transients
-# (PL2 -> PL1 on i7-14700F lands in the 50-150 ms range) so every timed rep
-# is sampled at the sustained P-state, not at the post-idle boost clock.
-WARMUP_NS = 200_000_000  # 200 ms
+MIN_BATCH_NS = 1_000_000  # 1 ms: long enough to dwarf perf_counter resolution
+# 200 ms warm-up pushes past the Intel PL2->PL1 boost transient (50-150 ms on
+# the reference CPU) so every timed rep samples the sustained P-state.
+WARMUP_NS = 200_000_000
 
 
 def _cpu_model() -> str:
@@ -121,22 +120,17 @@ def _tune_batch(thunk: Callable[[], None]) -> int:
         if elapsed >= MIN_BATCH_NS:
             return batch
         batch *= 2
-        if batch > 1 << 24:  # 16M; sanity guard
+        if batch > 1 << 24:  # 16M: sanity guard against a no-op thunk
             return batch
 
 
 def measure(label: str, thunk: Callable[[], None]) -> dict[str, float]:
     """Run the contract-compliant measurement loop and return ns stats."""
-    # Warm-up batch sizing and JIT-of-the-interpreter warm-up.
     batch = _tune_batch(thunk)
 
     gc.collect()
     gc.disable()
     try:
-        # Sustained warmup: run for >= WARMUP_NS to push the CPU past its
-        # post-idle boost transient and into its sustained P-state. Without
-        # this, the first few timed reps land on boost clocks while the rest
-        # land on base clock, producing 5-10% session-to-session drift.
         warmup_deadline = time.perf_counter_ns() + WARMUP_NS
         while time.perf_counter_ns() < warmup_deadline:
             for _ in range(batch):
@@ -187,7 +181,6 @@ def _make_matrix(rows: int, cols: int, seed: int = 0) -> Matrix:
     values: list[float] = []
     n = rows * cols
     for _ in range(n):
-        # Simple LCG; deterministic across runs.
         rng_state = (rng_state * 1103515245 + 12345) & 0x7FFFFFFF
         values.append((rng_state / 0x7FFFFFFF) * 2.0 - 1.0)
     return Matrix(rows, cols, values)
@@ -254,7 +247,6 @@ def bench_vecdot(results: list[dict[str, float]]) -> None:
     """Bench vecdot against (a*b).sum(axis=k) to confirm the fusion win."""
     _print_section("vecdot vs (a*b).sum(axis=k) (fusion win)")
 
-    # Same-shape 1000x3
     a = _make_matrix(1000, 3, seed=4)
     b = _make_matrix(1000, 3, seed=5)
     for axis_label, axis in [("axis=None", None), ("axis=0", 0), ("axis=1", 1)]:
@@ -281,7 +273,6 @@ def bench_vecdot(results: list[dict[str, float]]) -> None:
             ))
             _print_row(results[-1])
 
-    # Row broadcast: 1000x3 . 1x3
     row = _make_matrix(1, 3, seed=6)
     results.append(measure(
         "vecdot(row)            row-broadcast 1000x3 . 1x3 axis=1",
@@ -294,7 +285,6 @@ def bench_vecdot(results: list[dict[str, float]]) -> None:
     ))
     _print_row(results[-1])
 
-    # Column broadcast: 1000x3 . 1000x1
     col = _make_matrix(1000, 1, seed=7)
     results.append(measure(
         "vecdot(col, axis=0)    col-broadcast 1000x3 . 1000x1",
@@ -341,9 +331,6 @@ def bench_cross(results: list[dict[str, float]]) -> None:
     ))
     _print_row(results[-1])
 
-    # Broadcast: single vec reused across the whole batch. These are the
-    # shapes a user typically writes when applying a constant axis-of-rotation
-    # or up-vector to every particle in a batch.
     b_vec3 = _make_matrix(1, 3, seed=26)
     results.append(measure(
         "cross(vec3)        1000x3 (3D row batch, broadcast 1x3)",
@@ -427,8 +414,6 @@ def bench_angle(results: list[dict[str, float]]) -> None:
     ))
     _print_row(results[-1])
 
-    # Python equivalent — pre-extract the data once so the loop measures
-    # math.atan2 + list allocation, not the C accessor cost.
     rows = shape[0]
     xs = [m[i, 0] for i in range(rows)]
     ys = [m[i, 1] for i in range(rows)]
@@ -446,16 +431,6 @@ def py_angle_loop() -> list[float]:
     _print_row(results[-1])
 
 
-# ----------------------------------------------------------------------
-# Sections below cover the rest of the Matrix surface, added to give a
-# point-in-time perf reference for tracking regressions across versions.
-# Order: cheap properties first, then element-wise, then reductions,
-# then binary arithmetic, matmul, reshape/select/copy/clip/allclose,
-# and finally construction & factories. Per-section function names
-# follow the bench_<group> convention used above.
-# ----------------------------------------------------------------------
-
-
 def bench_properties(results: list[dict[str, float]]) -> None:
     """Bench cheap property getters (rows, columns, T) and __len__."""
     _print_section("property getters (rows / columns / T / len)")
@@ -520,7 +495,6 @@ def bench_binary_arithmetic(results: list[dict[str, float]]) -> None:
     row = _make_matrix(1, 100, seed=106)
     col = _make_matrix(1000, 1, seed=107)
 
-    # Add
     results.append(measure(
         f"add scalar              shape={shape} + 1.5",
         lambda a=a: a + 1.5,
@@ -541,7 +515,6 @@ def bench_binary_arithmetic(results: list[dict[str, float]]) -> None:
         lambda a=a, c=col: a + c,
     ))
     _print_row(results[-1])
-    # Subtract
     results.append(measure(
         f"sub scalar              shape={shape} - 1.5",
         lambda a=a: a - 1.5,
@@ -562,7 +535,6 @@ def bench_binary_arithmetic(results: list[dict[str, float]]) -> None:
         lambda a=a, c=col: a - c,
     ))
     _print_row(results[-1])
-    # Multiply
     results.append(measure(
         f"mul scalar              shape={shape} * 1.5",
         lambda a=a: a * 1.5,
@@ -583,7 +555,6 @@ def bench_binary_arithmetic(results: list[dict[str, float]]) -> None:
         lambda a=a, c=col: a * c,
     ))
     _print_row(results[-1])
-    # Divide
     results.append(measure(
         f"div scalar              shape={shape} / 1.5",
         lambda a=a: a / 1.5,
@@ -604,7 +575,6 @@ def bench_binary_arithmetic(results: list[dict[str, float]]) -> None:
         lambda a=a, c=col: a / c,
     ))
     _print_row(results[-1])
-    # In-place same-shape path (separate _math.c branch).
     a_ip = _make_matrix(*shape, seed=104)
     results.append(measure(
         f"iadd same-shape         shape={shape}",
@@ -660,9 +630,7 @@ def bench_select(results: list[dict[str, float]]) -> None:
 
     shape = (1000, 100)
     m = _make_matrix(*shape, seed=108)
-    # 100 indices walking strided through the rows.
     row_idx = list(range(0, 1000, 10))
-    # 50 columns out of 100.
     col_idx = list(range(0, 100, 2))
     results.append(measure(
         f"select rows (100/1000)  shape={shape}",
@@ -693,7 +661,7 @@ def bench_copy_clip_allclose(results: list[dict[str, float]]) -> None:
     ))
     _print_row(results[-1])
 
-    n = _make_matrix(*shape, seed=109)  # identical to m
+    n = _make_matrix(*shape, seed=109)
     results.append(measure(
         f"allclose(m, n)          shape={shape}",
         lambda m=m, n=n: Matrix.allclose(m, n),
@@ -853,7 +821,6 @@ def main(argv: list[str] | None = None) -> int:
 
     results: list[dict[str, float]] = []
 
-    # Existing Matrix surface — order goes cheap → expensive.
     bench_properties(results)
     bench_unary(results)
     bench_aggregations(results)
diff --git a/scripts/build_sbom.py b/scripts/build_sbom.py
index 528479d..d2d7e60 100644
--- a/scripts/build_sbom.py
+++ b/scripts/build_sbom.py
@@ -40,19 +40,10 @@
 import uuid
 import zipfile
 
-# Bumping this version invalidates the ``tools.components`` entry in
-# the SBOM. Keep it in sync with significant changes to the schema or
-# the injection algorithm.
 SBOM_GENERATOR_VERSION = "0.1.0"
 SBOM_FILENAME = "bocpy.cdx.json"
 PEP770_SBOM_SUBDIR = "sboms"
 
-# Stable namespace used to derive a deterministic UUIDv5 ``serialNumber``
-# from the (name, version, git_commit, wheel_filename) tuple. The
-# specific URL string is what makes this namespace stable across builds
-# and across machines — do NOT change it without a coordinated
-# generator-version bump, since every existing SBOM's serial number
-# would change shape.
 _BOCPY_SBOM_NAMESPACE = uuid.uuid5(
     uuid.NAMESPACE_URL, "https://github.com/microsoft/bocpy/sboms"
 )
@@ -77,9 +68,6 @@ def _sbom_timestamp() -> str:
         try:
             epoch = int(raw)
         except ValueError:
-            # Match the upstream reproducible-build spec: a malformed
-            # value is a hard error rather than a silent fall-through,
-            # so that CI surfaces the misconfiguration loudly.
             raise ValueError(
                 f"SOURCE_DATE_EPOCH must be an integer, got {raw!r}"
             ) from None
@@ -200,10 +188,6 @@ def build_sbom_document(
             },
             "component": root_component,
         },
-        # bocpy has zero third-party runtime Python dependencies; the
-        # components list is intentionally empty. System shared libraries
-        # bundled by auditwheel / delocate / delvewheel are not enumerated
-        # here yet (see module docstring).
         "components": [],
         "dependencies": [{"ref": bom_ref, "dependsOn": []}],
     }
@@ -259,9 +243,6 @@ def inject_sbom_into_wheel(
     if not wheel_path.is_file():
         raise FileNotFoundError(wheel_path)
 
-    # We materialise the new wheel in a temporary file alongside the
-    # original so that the final ``shutil.move`` is an atomic rename
-    # on the same filesystem.
     tmp_fd, tmp_name = tempfile.mkstemp(
         prefix=wheel_path.stem + ".",
         suffix=".whl.tmp",
@@ -276,65 +257,23 @@ def inject_sbom_into_wheel(
             sbom_arcname = f"{dist_info}/{PEP770_SBOM_SUBDIR}/{SBOM_FILENAME}"
             record_arcname = f"{dist_info}/RECORD"
 
-            # Collect every entry except the existing RECORD; we
-            # rewrite it last with the new hashes.
-            #
-            # We carry the SOURCE ``ZipInfo`` (not just the filename)
-            # for every entry we copy through, because wheels that
-            # have been through ``auditwheel`` / ``delocate`` /
-            # ``delvewheel`` rely on per-entry ZIP metadata that
-            # ``ZipFile.writestr(arcname, data)`` would silently
-            # drop:
-            #
-            #   * ``external_attr``  — the Unix mode bits in the
-            #     upper 16 bits encode ``S_IFLNK`` for symlinked
-            #     SONAMEs (``libfoo.so.1 -> libfoo.so.1.2.3``).
-            #     Drop them and the install step writes a regular
-            #     file whose contents are the symlink's text.
-            #   * ``create_system`` — tells the reader how to
-            #     interpret ``external_attr`` (Unix vs DOS vs ...).
-            #   * ``compress_type`` — preserves a deliberate
-            #     ``ZIP_STORED`` choice (some wheel builders leave
-            #     pre-compressed ``.so`` payloads uncompressed; a
-            #     naive ``writestr(arcname, data)`` would re-DEFLATE
-            #     them under the destination ZIP's default
-            #     compression).
-            #   * ``date_time`` — reproducible-build timestamps set
-            #     by the upstream wheel builder.
-            #   * ``extra`` / ``internal_attr`` / ``comment`` —
-            #     uncommon but harmless to preserve; some toolchains
-            #     stash Unix UID/GID/mtime extras here.
-            #
-            # We deliberately do NOT copy ``CRC``, ``compress_size``,
-            # ``file_size``, ``header_offset``, or ``flag_bits``:
-            # those are stream-position metadata that ``writestr``
-            # recomputes when the entry is re-emitted. Copying them
-            # would either be silently overridden or, in the case of
-            # ``flag_bits``, leak the source archive's
-            # data-descriptor / streaming bits into a non-streaming
-            # write path.
             entries: list[tuple[zipfile.ZipInfo, bytes]] = []
             sbom_already_present = False
             for info in src.infolist():
                 if info.filename == record_arcname:
                     continue
                 if info.filename == sbom_arcname:
-                    # Tolerate an already-injected SBOM by replacing it,
-                    # but log a line to stderr so re-injection in CI is
-                    # observable. The reinjection path is exercised when
-                    # ``build_sbom.py inject`` is re-run against an
-                    # already-decorated wheel (idempotency check), so
-                    # silencing it would hide a misconfigured repair
-                    # command running the injector twice.
                     sbom_already_present = True
                     continue
                 if info.is_dir():
-                    # PyPI strips trailing-slash entries from the ZIP
-                    # side before comparing RECORD; keeping a row for
-                    # them triggers send_wheel_record_mismatch_email.
+                    # PyPI strips trailing-slash dir entries before comparing
+                    # RECORD; keeping a row for them triggers a mismatch reject.
                     continue
                 with src.open(info) as f:
                     data = f.read()
+                # Carry over the per-entry metadata writestr() would drop
+                # (mode bits, compress_type, date_time, extras); CRC, sizes,
+                # offsets and flag_bits are recomputed on write, not copied.
                 new_info = zipfile.ZipInfo(
                     filename=info.filename, date_time=info.date_time
                 )
@@ -346,16 +285,12 @@ def inject_sbom_into_wheel(
                 new_info.comment = info.comment
                 entries.append((new_info, data))
 
-            # The injected SBOM and the rewritten RECORD are NEW entries
-            # that this script owns, so they use freshly-constructed
-            # ``ZipInfo`` objects with the stdlib defaults
-            # (``-rw-------`` perms, ``ZIP_DEFLATED`` compression).
+            # SBOM and rewritten RECORD are new entries we own: fresh ZipInfo
+            # with stdlib defaults (rw-------, ZIP_DEFLATED).
             sbom_info = zipfile.ZipInfo(filename=sbom_arcname)
             sbom_info.compress_type = zipfile.ZIP_DEFLATED
             entries.append((sbom_info, sbom_bytes))
 
-            # Build the new RECORD: every entry gets a hash row except
-            # RECORD itself (which has empty hash + empty size).
             record_buf = io.StringIO()
             writer = csv.writer(
                 record_buf, delimiter=",", quoting=csv.QUOTE_MINIMAL, lineterminator="\n"
@@ -380,12 +315,8 @@ def inject_sbom_into_wheel(
                 file=sys.stderr,
             )
 
-        # Atomic rename — the wheel either has the SBOM and a fresh
-        # RECORD, or it is byte-identical to before.
         shutil.move(str(tmp_path), str(wheel_path))
     except BaseException:
-        # Clean the side-file on any failure so we don't leak a
-        # corrupted ``*.whl.tmp`` into the dest directory.
         if tmp_path.exists():
             try:
                 tmp_path.unlink()
@@ -424,14 +355,6 @@ def _read_pyproject_metadata(repo_root: Path) -> dict[str, str]:
 
 def _cmd_generate(args: argparse.Namespace) -> int:
     """Implement the ``generate`` subcommand."""
-    # Reproducibility guard. The serialNumber is a UUIDv5 derived
-    # from ``name@version+git_commit+wheel_filename``. If a caller
-    # invokes ``generate`` standalone with neither ``--git-commit``
-    # (defaulted to $GITHUB_SHA, often unset locally) nor
-    # ``--wheel-filename``, every wheel of the same name+version
-    # collapses to the same UUID — defeating the per-wheel-identifier
-    # purpose of deterministic serials. ``inject`` always passes
-    # ``--wheel-filename`` so it is unaffected.
     if not args.git_commit and not args.wheel_filename:
         print(
             "error: build_sbom.py generate requires at least one of: "
@@ -475,11 +398,6 @@ def _cmd_inject(args: argparse.Namespace) -> int:
     else:
         wheel_path = target
 
-    # When --copy-to is given, work on a copy in the destination directory
-    # and leave the original untouched. This is the pattern used on Windows
-    # cibuildwheel jobs where there is no native repair tool, so the
-    # script is responsible for both placing the wheel in ``{dest_dir}``
-    # and injecting the SBOM into it.
     if args.copy_to is not None:
         copy_target_dir = Path(args.copy_to)
         copy_target_dir.mkdir(parents=True, exist_ok=True)
diff --git a/scripts/validate_sbom.py b/scripts/validate_sbom.py
index 4fffe8d..22f1242 100644
--- a/scripts/validate_sbom.py
+++ b/scripts/validate_sbom.py
@@ -41,26 +41,16 @@
 
 SBOM_GLOB = "*.cdx.json"
 
-# Match what ``build_sbom.py`` emits — these invariants must hold for
-# every SBOM bocpy ships. Drift here will be caught at CI time.
 EXPECTED_BOM_FORMAT = "CycloneDX"
 EXPECTED_SPEC_VERSION = "1.6"
 EXPECTED_TOOL_NAME = "build_sbom.py"
 EXPECTED_PURL_PREFIX = "pkg:pypi/bocpy@"
 
-# UUIDv5 serial number per CycloneDX 1.6 (the spec requires the
-# ``urn:uuid:`` prefix). ``build_sbom.py`` derives the serial as
-# ``uuid.uuid5(namespace, "<name>@<version>+<git>+<wheel>")`` so the
-# value is byte-identical across rebuilds of the same source tree
-# (reproducible-build contract). UUIDv5's version digit is ``5`` and
-# the variant nibble is the standard ``[89ab]``.
 _URN_UUID_RE = re.compile(
     r"^urn:uuid:[0-9a-f]{8}-[0-9a-f]{4}-5[0-9a-f]{3}-[89ab][0-9a-f]{3}"
     r"-[0-9a-f]{12}$"
 )
 
-# ISO 8601 UTC timestamp, ``YYYY-MM-DDTHH:MM:SSZ`` — what
-# ``_sbom_timestamp`` emits.
 _TIMESTAMP_RE = re.compile(r"^\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}Z$")
 
 
@@ -90,7 +80,6 @@ def validate_sbom_document(doc: Any) -> None:
     """
     _require(isinstance(doc, dict), "top-level value must be a JSON object")
 
-    # --- Header invariants --------------------------------------------------
     _require(
         doc.get("bomFormat") == EXPECTED_BOM_FORMAT,
         f"bomFormat must be {EXPECTED_BOM_FORMAT!r}, got {doc.get('bomFormat')!r}",
@@ -111,10 +100,9 @@ def validate_sbom_document(doc: Any) -> None:
         f"version must be a positive integer, got {doc.get('version')!r}",
     )
 
-    # --- metadata ----------------------------------------------------------
     metadata = doc.get("metadata")
     _require(isinstance(metadata, dict), "metadata must be an object")
-    assert isinstance(metadata, dict)  # for type-checkers
+    assert isinstance(metadata, dict)
 
     timestamp = _require_str(metadata, "timestamp")
     _require(
@@ -165,7 +153,6 @@ def validate_sbom_document(doc: Any) -> None:
         f"metadata.component.purl must start with {EXPECTED_PURL_PREFIX!r}, got {purl!r}",
     )
 
-    # --- components & dependencies ----------------------------------------
     components = doc.get("components")
     _require(isinstance(components, list), "components must be a list")
 
diff --git a/scripts/validate_wheel.py b/scripts/validate_wheel.py
index 4956745..8c77844 100644
--- a/scripts/validate_wheel.py
+++ b/scripts/validate_wheel.py
@@ -30,9 +30,6 @@
 import sys
 from typing import Iterable
 
-# ``scripts/`` is added to ``sys.path`` for pytest by pyproject.toml,
-# but a direct ``python scripts/validate_wheel.py`` invocation only
-# has the script's own directory on the path — which is what we want.
 from _vendored_warehouse_wheel import (
     InvalidWheelEntryPointsError,
     InvalidWheelRecordError,
diff --git a/setup.py b/setup.py
index 8fda04d..e606662 100644
--- a/setup.py
+++ b/setup.py
@@ -71,10 +71,23 @@
 #
 # Stays at ``-O3``; never ``-Ofast``/``-ffast-math``, which would break
 # IEEE semantics that ``fabs``, ``nearbyint``, and NaN handling depend on.
+#
+# ``-ffp-contract=off`` is required for bit-reproducible results. By
+# default gcc and clang contract an ``a * b + c`` expression into a single
+# fused-multiply-add (one rounding) on targets that have an FMA unit --
+# notably every arm64 chip. x86-64 at the SSE2 baseline has no FMA, so
+# the same source rounds twice there, and the matmul kernel's ascending-k
+# accumulation then diverges by 1 ULP between architectures (see
+# test_matmul_bitwise_reproducible). Turning contraction off makes the
+# multiply and add round separately everywhere, matching the two-rounding
+# reference. It does not inhibit autovectorisation -- NEON still vectorises
+# the loop with separate FMUL/FADD lanes instead of fused FMLA. MSVC's
+# default ``/fp:precise`` does not contract across statements, so the
+# Windows build needs no equivalent flag.
 if sys.platform == "win32":
     _math_extra_compile_args = ["/O2"]
 else:
-    _math_extra_compile_args = ["-O3"]
+    _math_extra_compile_args = ["-O3", "-ffp-contract=off"]
 
 _ext_modules = [
     Extension(
diff --git a/sphinx/source/_static/favicon.ico b/sphinx/source/_static/favicon.ico
new file mode 100644
index 0000000..0b51d5a
Binary files /dev/null and b/sphinx/source/_static/favicon.ico differ
diff --git a/sphinx/source/api.rst b/sphinx/source/api.rst
index 757bfe5..80089d4 100644
--- a/sphinx/source/api.rst
+++ b/sphinx/source/api.rst
@@ -111,11 +111,11 @@ See the :ref:`noticeboard` guide for a conceptual overview, consistency model,
 and worked examples.
 
 .. autofunction:: notice_write
+.. autofunction:: notice_seed
 .. autofunction:: notice_update
 .. autofunction:: notice_delete
 .. autofunction:: noticeboard
 .. autofunction:: notice_read
-.. autofunction:: notice_sync
 .. autodata:: REMOVED
 
 
diff --git a/sphinx/source/c_abi.rst b/sphinx/source/c_abi.rst
index e940d76..4516c5b 100644
--- a/sphinx/source/c_abi.rst
+++ b/sphinx/source/c_abi.rst
@@ -3,6 +3,8 @@
 C ABI
 =====
 
+.. py:currentmodule:: bocpy
+
 This page documents the public C ABI shipped with bocpy. Use it when
 writing a downstream C extension that needs to participate in
 behavior-oriented concurrency at the C level — typically by registering
diff --git a/sphinx/source/conf.py b/sphinx/source/conf.py
index 0e4789b..bca743f 100644
--- a/sphinx/source/conf.py
+++ b/sphinx/source/conf.py
@@ -14,19 +14,45 @@
 project = 'bocpy'
 copyright = '2026, Microsoft'
 author = 'Microsoft'
-release = '0.9.0'
+release = '0.10.0'
 
 # -- General configuration ---------------------------------------------------
 # https://www.sphinx-doc.org/en/master/usage/configuration.html#general-configuration
 
 extensions = [
     "sphinx.ext.autodoc",
+    "sphinx.ext.intersphinx",
     "sphinx.ext.napoleon",
     "sphinx_autodoc_typehints"
 ]
 
 napoleon_use_param = True
 
+# Resolve cross-references to the Python stdlib and a handful of third-party
+# projects we name in docstrings/.rst (e.g. ``typing.Any``,
+# ``setuptools.Extension``). Without these mappings the nitpicky build
+# (``-n``) emits a stream of unresolvable-ref warnings that drown out
+# real issues.
+intersphinx_mapping = {
+    "python": ("https://docs.python.org/3", None),
+    "setuptools": ("https://setuptools.pypa.io/en/latest/", None),
+}
+
+# Known false-positive cross-reference targets when running with ``-n``
+# (nitpicky mode). Both are upstream ``sphinx-autodoc-typehints``
+# quirks:
+#
+# * ``typing.Union`` is registered in the Python inventory as
+#   ``py:class``, but the extension emits ``py:data`` references for
+#   it.
+# * ``bocpy.behaviors.T`` is the :class:`Cown` ``TypeVar``; TypeVars
+#   aren't separately documented, so the autogenerated cross-ref has
+#   no target.
+nitpick_ignore = [
+    ("py:data", "typing.Union"),
+    ("py:class", "bocpy.behaviors.T"),
+]
+
 templates_path = ['_templates']
 exclude_patterns = []
 
@@ -37,6 +63,7 @@
 html_static_path = ['_static']
 
 html_logo = "_static/logo-200.svg"
+html_favicon = "_static/favicon.ico"
 
 # ---------------------------------------------------------------------------
 # Stub-aware autodoc: pull docstrings and signatures from __init__.pyi
diff --git a/sphinx/source/index.rst b/sphinx/source/index.rst
index 026252e..bedb833 100644
--- a/sphinx/source/index.rst
+++ b/sphinx/source/index.rst
@@ -1,6 +1,8 @@
 bocpy documentation
 ===================
 
+.. py:currentmodule:: bocpy
+
 `bocpy <https://github.com/microsoft/bocpy>`_ is a Python library implementing
 **Behavior-Oriented Concurrency (BOC)**.  Programmers wrap shared data in
 **cowns** (concurrently-owned objects) and schedule **behaviors** with the
diff --git a/sphinx/source/noticeboard.rst b/sphinx/source/noticeboard.rst
index 66e9acd..881ed34 100644
--- a/sphinx/source/noticeboard.rst
+++ b/sphinx/source/noticeboard.rst
@@ -223,9 +223,9 @@ deleted, regardless of how many readers have observed the entry.
    :func:`wait` with ``noticeboard=True`` (see "Reading the Final
    State at Shutdown" above) and :func:`quiesce` with
    ``noticeboard=True`` (see "Reading the State Between Rounds"
-   above). Seeding the noticeboard with :func:`notice_write` from
-   the main thread *before* scheduling behaviors is fine and is
-   the recommended pattern for installing read-mostly configuration.
+   above). To install read-mostly configuration from the main thread
+   *before* scheduling behaviors, use :func:`notice_seed` (see
+   "Seeding Before Scheduling" below), which commits synchronously.
 
 Writing and Updating
 --------------------
@@ -278,34 +278,42 @@ Deleting Entries
 
     notice_delete("temporary_flag")
 
-``notice_sync`` (Testing Only)
--------------------------------
 
-:func:`notice_sync` blocks until every mutation the calling thread has
-posted so far has been committed by the noticeboard thread. It exists to
-make the noticeboard's eventual consistency tractable for **tests** — a
-test can write a value, call ``notice_sync()``, and then assert that a
-subsequently scheduled behavior observes the write — not as a primitive
-for application code.
+Seeding Before Scheduling
+-------------------------
 
-.. warning::
+:func:`notice_write` is fire-and-forget: it hands the write to the
+noticeboard thread and returns before the value commits, so a behavior
+scheduled immediately afterwards is *not* guaranteed to observe it. To
+install read-mostly configuration on the main thread *before* scheduling
+the behaviors that read it, use :func:`notice_seed`, which commits
+synchronously under the noticeboard mutex and returns only once the entry
+is live::
+
+    from bocpy import notice_seed, notice_read, when, Cown
+
+    notice_seed("config.threshold", 0.5)   # committed before it returns
+
+    work = Cown(load_work())
 
-   Outside of tests, reaching for ``notice_sync`` is almost always an
-   anti-pattern. The guarantee it provides is much weaker than it looks:
+    @when(work)
+    def _(work):
+        threshold = notice_read("config.threshold")   # always observes 0.5
+        ...
 
-   - It only orders the **calling thread's prior writes** against the
-     **next per-behavior snapshot** taken on any thread. Snapshots are
-     captured once per behavior, so a behavior already executing when
-     ``notice_sync`` returns will keep seeing its existing snapshot.
-   - It does **not** refresh the calling behavior's own snapshot — you
-     cannot ``notice_sync`` and then ``notice_read`` to see your write.
-   - It establishes no happens-before relationship between unrelated
-     behaviors and is not a substitute for cown-mediated ordering.
+:func:`notice_seed` may be called only from the primary interpreter — never
+from inside a ``@when`` body (use :func:`notice_write` there). If the runtime
+is not yet running, the call starts it, so seeding can be the first bocpy call a
+program makes, with no explicit :func:`start`.
 
-   If application code needs read-your-writes ordering, model the shared
-   state as a :class:`Cown`. If you find yourself wanting
-   ``notice_sync`` outside a test, that is a strong signal the noticeboard
-   is the wrong primitive for the problem.
+.. note::
+
+   :func:`notice_seed` is a plain overwrite intended for one-shot seeding
+   *before* concurrent noticeboard mutations are in flight. It does **not**
+   provide the read-modify-write atomicity of :func:`notice_update`, and a
+   seed that races an in-flight :func:`notice_update` on the same key may be
+   lost. Seed once, up front, rather than interleaving seeds with concurrent
+   updates.
 
 
 API Reference
@@ -313,6 +321,8 @@ API Reference
 
 .. autofunction:: notice_write
    :no-index:
+.. autofunction:: notice_seed
+   :no-index:
 .. autofunction:: notice_update
    :no-index:
 .. autofunction:: notice_delete
@@ -321,7 +331,5 @@ API Reference
    :no-index:
 .. autofunction:: notice_read
    :no-index:
-.. autofunction:: notice_sync
-   :no-index:
 .. autodata:: REMOVED
    :no-index:
diff --git a/sphinx/source/sbom.rst b/sphinx/source/sbom.rst
index 4fa93ad..0f7b89f 100644
--- a/sphinx/source/sbom.rst
+++ b/sphinx/source/sbom.rst
@@ -215,7 +215,7 @@ is the only reliable way to predict whether PyPI will accept a wheel:
 only checks RECORD hashes and sizes, and ``check-wheel-contents`` only
 checks layout — none of them runs PyPI's actual acceptance code.
 
-The vendored file's docstring records the exact upstream commit it
+The vendored file's header records the exact upstream commit it
 was synced from and the refresh procedure.  The wheel-integrity job
 in ``.github/workflows/build_wheels.yml`` runs it twice per release:
 once inside ``CIBW_REPAIR_WHEEL_COMMAND`` (so a per-platform build
diff --git a/src/bocpy/__init__.py b/src/bocpy/__init__.py
index 49ad513..002ee72 100644
--- a/src/bocpy/__init__.py
+++ b/src/bocpy/__init__.py
@@ -8,7 +8,7 @@
 from ._core import drain, receive, send, set_tags, TIMEOUT
 from ._math import Matrix
 from .behaviors import (Behaviors, Cown, notice_delete, notice_read,
-                        notice_sync, notice_update, notice_write, noticeboard,
+                        notice_seed, notice_update, notice_write, noticeboard,
                         PinnedCown, pump, PumpResult, quiesce,
                         REMOVED,
                         set_pump_watchdog, set_wait_pump_poll,
@@ -16,23 +16,10 @@
 
 try:
     __version__ = _metadata.version("bocpy")
+# Broad on purpose: namespace-package / frozen / editable installs surface
+# metadata failures other than PackageNotFoundError.
 except Exception as _version_lookup_error:  # pragma: no cover - source checkout w/o install
-    # ``importlib.metadata.version`` normally raises
-    # ``PackageNotFoundError`` for an uninstalled package, but the
-    # broader ``Exception`` net catches namespace-package and
-    # vendored-installer edge cases (Bazel ``py_binary`` zip
-    # imports, Nuitka-frozen apps, some PEP 660 editable hybrids)
-    # that surface as other ``importlib.metadata`` exceptions. The
-    # fallback string lets ``import bocpy`` keep working in those
-    # environments instead of dying at module load. Assign the
-    # fallback first so the WARNING emit — which is best-effort and
-    # itself wrapped — cannot leave ``__version__`` unbound if the
-    # logging stack is broken (closed stderr, misconfigured handler).
     __version__ = "0.0.0+unknown"
-    # Best-effort WARNING naming the exception class so a corrupt
-    # installation does not silently masquerade as a clean source
-    # checkout in downstream version gates / telemetry. Swallow any
-    # logger failure: import-time logging is not load-bearing.
     try:
         _logging.getLogger("bocpy").warning(
             "bocpy package metadata unavailable (%s: %s); "
@@ -84,8 +71,8 @@ def get_sources() -> list[str]:
            "REMOVED", "TIMEOUT",
            "WORKER_COUNT", "__version__", "drain",
            "get_include", "get_sources",
-           "notice_delete", "notice_read",
-           "notice_sync", "notice_update", "notice_write", "noticeboard",
+           "notice_delete", "notice_read", "notice_seed",
+           "notice_update", "notice_write", "noticeboard",
            "pump",
            "quiesce",
            "receive",
diff --git a/src/bocpy/__init__.pyi b/src/bocpy/__init__.pyi
index 853b864..14bfd62 100644
--- a/src/bocpy/__init__.pyi
+++ b/src/bocpy/__init__.pyi
@@ -339,6 +339,34 @@ class Matrix:
             If ``1``, return a *rows* x 1 column vector of row maxima.
         """
 
+    def argmin(self, axis: Optional[int] = None) -> Union[int, "Matrix"]:
+        """Index of the minimum element (first occurrence on ties).
+
+        :param axis: If ``None``, return the flat row-major index of the
+            overall minimum as an ``int``.  If ``0``, return a 1 x *columns*
+            row vector of per-column row indices.  If ``1``, return a
+            *rows* x 1 column vector of per-row column indices.
+
+        .. note::
+           NaN elements are skipped unless the running extreme starts at
+           NaN (element 0 along the reduced axis), which pins the result
+           to that position.  This differs from NumPy, which propagates NaN.
+        """
+
+    def argmax(self, axis: Optional[int] = None) -> Union[int, "Matrix"]:
+        """Index of the maximum element (first occurrence on ties).
+
+        :param axis: If ``None``, return the flat row-major index of the
+            overall maximum as an ``int``.  If ``0``, return a 1 x *columns*
+            row vector of per-column row indices.  If ``1``, return a
+            *rows* x 1 column vector of per-row column indices.
+
+        .. note::
+           NaN elements are skipped unless the running extreme starts at
+           NaN (element 0 along the reduced axis), which pins the result
+           to that position.  This differs from NumPy, which propagates NaN.
+        """
+
     def ceil(self, in_place: bool = False) -> "Matrix":
         """Round each element up to the nearest integer.
 
@@ -383,6 +411,14 @@ class Matrix:
     def copy(self) -> "Matrix":
         """Return a deep copy of this matrix."""
 
+    def __reduce__(self) -> tuple:
+        """Support pickling and :func:`copy.deepcopy`.
+
+        Serializes the matrix to its native-endian raw ``double`` buffer
+        so reconstruction is a single copy with no per-element Python
+        object overhead. The current interpreter must own the matrix.
+        """
+
     def select(self, indices: Union[list[int], tuple[int]], axis=0):
         """Return a new matrix containing only the selected rows or columns.
 
@@ -485,6 +521,20 @@ class Matrix:
             ``float`` instead of a :class:`Matrix`.
         """
 
+    @classmethod
+    def seed(cls, value: int) -> None:
+        """Seed the random generator used by :meth:`normal` and :meth:`uniform`.
+
+        :param value: The seed value.
+
+        .. note::
+           The generator is the process-global C library PRNG shared by
+           every sub-interpreter, so a seed only makes subsequent draws
+           reproducible when random generation stays on a single thread;
+           concurrent draws interleave on the shared state.  The sequence
+           is also not portable across platforms.
+        """
+
     @classmethod
     def vector(cls, values: Sequence[Union[float, int]], as_column=False) -> "Matrix":
         """Create a matrix from a flat sequence of values.
@@ -549,6 +599,41 @@ class Cown(Generic[T]):
     def release(self):
         """Releases the cown."""
 
+    def unwrap(self) -> T:
+        """Consume and return the stored value, or re-raise a captured behavior exception.
+
+        Mirrors Rust's ``Result::unwrap``: on success the stored value
+        is returned; if the cown carries an unhandled behavior
+        exception (``self.exception`` is ``True``) that exception is
+        cleared from the cown and re-raised on the caller's thread.
+
+        ``unwrap`` **consumes** the cown: the stored payload is handed
+        to the caller and the cown is emptied to ``None``. The returned
+        value is therefore owned by the caller, and a second
+        :meth:`unwrap` returns ``None``. Consuming is what makes
+        move-type values (e.g. :class:`Matrix`) usable after the call --
+        the cown no longer aliases the value's single backing store, so
+        the value keeps its ownership on the caller's interpreter rather
+        than being released back into the cown. The emptied cown remains
+        schedulable, so a fresh value may be stored into it again.
+
+        The cown is acquired for the duration of the read, so call
+        :meth:`unwrap` from the caller's thread once the runtime is
+        globally quiescent -- after :func:`quiesce` or :func:`wait`, not
+        merely after this cown's own producer.
+
+        Calling :meth:`unwrap` while behaviors are still in flight
+        raises :class:`RuntimeError`: reading a result before its
+        producer completes would race the worker still mutating the
+        cown. Call :func:`quiesce` (or :func:`wait`) first.
+
+        :returns: The stored value when no exception is held.
+        :rtype: T
+        :raises BaseException: The captured exception, re-raised verbatim
+            with its original type and message.
+        :raises RuntimeError: If the runtime is not quiescent.
+        """
+
     @property
     def exception(self) -> bool:
         """Whether the held value is the result of an unhandled exception."""
@@ -647,7 +732,7 @@ class PinnedCown(Cown[T]):
 
     Nested pumping
         Calling :func:`pump` from inside a pinned-behavior body
-        raises :class:`RuntimeError` (v1).
+        raises :class:`RuntimeError`.
 
     Handle vs. value
         A :class:`PinnedCown` *handle* (the Python wrapper object
@@ -758,7 +843,7 @@ def pump(deadline_ms: Optional[int] = None,
 
     Reentrance
         Not reentrant. Calling from inside a pinned-behavior body
-        raises :class:`RuntimeError` (v1).
+        raises :class:`RuntimeError`.
 
     :param deadline_ms: Wall-clock budget in milliseconds.
         ``None`` for unbounded; otherwise a positive :class:`int`.
@@ -863,6 +948,40 @@ def notice_write(key: str, value: Any) -> None:
     """
 
 
+def notice_seed(key: str, value: Any) -> None:
+    """Synchronously write a value to the noticeboard from the primary interpreter.
+
+    Unlike :func:`notice_write`, this commits **before it returns**: the
+    value is applied under the noticeboard mutex on the calling thread,
+    so once :func:`notice_seed` returns the entry is live and visible to
+    every behavior scheduled afterwards (and to the calling thread's own
+    subsequent :func:`notice_read` calls). It is the recommended way to install
+    read-mostly configuration before scheduling the behaviors that read
+    it.
+
+    If the runtime is not yet running, :func:`notice_seed` starts it,
+    so seeding can be the first bocpy call a program makes — no explicit
+    :func:`start` is required.
+
+    **Primary interpreter only.** Calling :func:`notice_seed` from a
+    worker raises :class:`RuntimeError`; use :func:`notice_write` for
+    fire-and-forget writes from within behaviors.
+
+    It is a plain overwrite intended for *seeding* before concurrent
+    noticeboard mutations are in flight. It does **not** provide the
+    read-modify-write atomicity of :func:`notice_update`, and a seed
+    that races an in-flight :func:`notice_update` on the same key may be
+    lost. Seed once, up front, rather than interleaving seeds with
+    concurrent updates.
+
+    :param key: The noticeboard key (max 63 UTF-8 bytes).
+    :type key: str
+    :param value: The value to store.
+    :type value: Any
+    :raises RuntimeError: If called from a worker interpreter.
+    """
+
+
 def notice_update(key: str, fn: Callable[[Any], Any], default: Any = None) -> None:
     """Atomically update a noticeboard entry.
 
@@ -981,29 +1100,6 @@ def notice_read(key: str, default: Any = None) -> Any:
     """
 
 
-def notice_sync(timeout: Optional[float] = 30.0) -> None:
-    """Block until the caller's prior noticeboard mutations are committed.
-
-    Because :func:`notice_write`, :func:`notice_update`, and
-    :func:`notice_delete` are fire-and-forget, a behavior that wants
-    read-your-writes ordering against a *subsequent* behavior must call
-    ``notice_sync()`` after its writes. By the time this returns, every
-    write/update/delete posted from the calling thread before the call
-    has been applied to the noticeboard.
-
-    The barrier carries **no ordering guarantee** with respect to
-    writes posted from other threads or behaviors interleaved with the
-    caller's; it only flushes the caller's own queued mutations.
-
-    :param timeout: Maximum seconds to wait. ``None`` waits forever.
-        Defaults to 30 seconds.
-    :type timeout: Optional[float]
-    :raises TimeoutError: If the barrier does not complete within
-        *timeout* seconds.
-    :raises RuntimeError: If the runtime is not started.
-    """
-
-
 @overload
 def wait(timeout: Optional[float] = None, *,
          stats: Literal[False] = False,
diff --git a/src/bocpy/_core.c b/src/bocpy/_core.c
index 633fab4..77d3a51 100644
--- a/src/bocpy/_core.c
+++ b/src/bocpy/_core.c
@@ -9,7 +9,6 @@
 #include <assert.h>
 #include <bocpy/bocpy.h>
 
-// Forward declaration — BOCQueue is defined below.
 typedef struct boc_queue BOCQueue;
 
 /// @brief Initialize the park mutex and condition variable for a queue
@@ -49,78 +48,28 @@ const char *BOC_TIMEOUT = "__timeout__";
 const int BOC_CAPACITY = 1024 * 16;
 atomic_int_least64_t BOC_COUNT = 0;
 atomic_int_least64_t BOC_COWN_COUNT = 0;
-// Live pinned-cown count (incremented in the PinnedCownCapsule factory,
-// decremented when a pinned cown's strong refcount reaches zero).
 atomic_int_least64_t PINNED_COWN_COUNT = 0;
 
-// Process-global queue carrying behaviours that touch one or more
-// pinned cowns. Producers (worker / caller threads, via
-// `boc_sched_dispatch` -> `boc_main_pinned_enqueue`) enqueue the
-// prehdr's `bq_node`; the main interpreter drains it from
-// `main_pump_bounded`. Initialised once per process in
-// `_core_module_exec`; never destroyed (kernel objects outlive
-// module unload, matching the BOC_QUEUES / terminator pattern).
 static boc_bq_t MAIN_PINNED_QUEUE;
 
-// Depth of MAIN_PINNED_QUEUE. Bumped by `boc_main_pinned_enqueue`,
-// decremented by the main pump as it consumes nodes. A
-// `wait()`-blocked main thread observes a non-zero depth via the
-// terminator condvar (woken by `terminator_wake_all`). Signed type: MSVC
-// stdatomic has no unsigned variant; depth never goes negative.
 static atomic_int_least64_t MAIN_PINNED_DEPTH = 0;
 
-// Monotonic-ns timestamp of the most recent 0 -> 1 transition of
-// MAIN_PINNED_DEPTH. Used by the main pump as a fairness signal
-// (oldest-pending-pinned-work age). Sampled lock-free; race-tolerant.
 static atomic_int_least64_t MAIN_PINNED_NONEMPTY_SINCE_NS = 0;
 
-// Thread-local re-entry flag for main_pump_bounded. Set true at the
-// start of each pinned-body iteration and cleared in the per-iteration
-// cleanup block, so a nested pump() called from inside a pinned body
-// observes true at gate 3 and is rejected. Thread-local + plain bool
-// (no atomicity needed): only the owning thread reads or writes it.
 static thread_local bool IN_PUMP_BODY = false;
 
-// Monotonic-ns timestamp of the most recent pump iteration completion.
-// Sampled lock-free; race-tolerant. The pump updates it after each
-// iteration so the watchdog can compare against
-// MAIN_PINNED_NONEMPTY_SINCE_NS.
 static atomic_int_least64_t LAST_PUMP_NS = 0;
 
 #ifdef Py_GIL_DISABLED
-// Free-threaded build only: ID (thrd_current() cast to uintptr_t) of
-// the thread that currently owns the main pump CAS. Zero when idle.
-// Gate 2 of main_pump_bounded CAS-acquires this so two distinct
-// threads cannot pump concurrently. Same-thread re-entry leaves the
-// CAS owned by the outer frame; gate 3's IN_PUMP_BODY check rejects
-// the nested call. intptr_t (not uintptr_t) for MSVC: thread-id bits round-trip
-// bit-cast losslessly.
 static atomic_intptr_t MAIN_PUMP_THREAD = 0;
 #endif
 
-// Pump-starvation watchdog config (set via _core.set_pump_watchdog).
-// Both atomics default to "disabled": the watchdog produces no
-// output until the user opts in by calling set_pump_watchdog().
-// Read on the hot path (boc_main_pinned_check_warn runs on pump
-// entry), so the atomics are deliberately the cheapest shape --
-// relaxed reads against ms-resolution counters. 0 means disabled
-// for WATCHDOG_WARN_MS. WATCHDOG_ON_STARVE is only touched from
-// the main interpreter (set_pump_watchdog refuses non-main; the
-// warn callback fires on pump entry which is also main-only); the
-// atomic_intptr_t is for store/load visibility; intptr_t (not uintptr_t) for
-// MSVC, PyObject* round-trips losslessly.
 static atomic_int_least64_t WATCHDOG_WARN_MS = 0;
 static atomic_intptr_t WATCHDOG_ON_STARVE = 0;
-// Monotonic-ns timestamp of the most recent warn log emission, used
-// to rate-limit the warn channel so a slow pump does not flood logs
-// once per call. 0 = never warned in this NONEMPTY_SINCE epoch.
 static atomic_int_least64_t WATCHDOG_LAST_WARN_NS = 0;
 
 #define BOC_SPIN_COUNT 64
-#define BOC_BACKOFF_CAP_NS 1000000 // 1 ms
-
-// #define BOC_REF_TRACKING
-// #define BOC_TRACE
+#define BOC_BACKOFF_CAP_NS 1000000
 
 /// @brief Note in a RecycleQueue.
 typedef struct boc_recycle_node {
@@ -178,7 +127,7 @@ typedef struct boc_queue {
   /// @details Messages which are sent with this tag will be assigned to this
   /// queue. Calls to receive on the tag will attempt to dequeue from this
   /// queue.
-  atomic_intptr_t tag; // (BOCTag *)
+  atomic_intptr_t tag;
 
   /// @brief Number of threads parked on this queue's condvar
   atomic_int_least64_t waiters;
@@ -187,11 +136,6 @@ typedef struct boc_queue {
   /// @brief Condition variable for parking receivers
   BOCCond park_cond;
 
-  // Contention counters. Bumped with BOC_MO_RELAXED inside
-  // boc_enqueue / boc_dequeue. Read by `_core.queue_stats()`. Grouped
-  // and padded so they sit on their own cacheline and do not
-  // false-share with the hot head/tail/state above. Typed via
-  // `boc_compat.h` so the build works on MSVC (which has no `_Atomic`).
   /// @brief CAS retries observed by enqueuers contending on @c tail.
   boc_atomic_u64_t enqueue_cas_retries;
   /// @brief CAS retries observed by dequeuers contending on @c head.
@@ -212,9 +156,6 @@ static BOCQueue BOC_QUEUES[BOC_QUEUE_COUNT];
 static BOCRecycleQueue *BOC_RECYCLE_QUEUE_TAIL = NULL;
 static atomic_intptr_t BOC_RECYCLE_QUEUE_HEAD = 0;
 
-// Platform condvar implementation
-// ---------------------------------------------------------------------------
-
 static inline void boc_park_init(BOCQueue *q) {
   boc_mtx_init(&q->park_mutex);
   cnd_init(&q->park_cond);
@@ -239,8 +180,6 @@ static inline void boc_park_wait(BOCQueue *q) {
   cnd_wait(&q->park_cond, &q->park_mutex);
 }
 
-// Noticeboard function implementations are below object_to_xidata
-
 /// @brief State for the module.
 typedef struct boc_state {
   /// @brief The index (monotonically increasing) for this module.
@@ -414,7 +353,6 @@ static BOCRecycleQueue *BOCRecycleQueue_new(int_least64_t index) {
     return NULL;
   }
 
-  // this is both the stub and the allocated space for the next item
   BOCRecycleNode *node =
       (BOCRecycleNode *)PyMem_RawMalloc(sizeof(BOCRecycleNode));
   if (node == NULL) {
@@ -518,7 +456,6 @@ static PyObject *object_to_xidata(PyObject *value, XIDATA_T **xidata_ptr) {
 
   PyErr_Clear();
 
-  // no native support, fallback to pickle
   PyObject *bytes = _PyPickle_Dumps(value);
   if (bytes == NULL) {
     return NULL;
@@ -536,87 +473,33 @@ static PyObject *object_to_xidata(PyObject *value, XIDATA_T **xidata_ptr) {
   return NULL;
 }
 
-// ---------------------------------------------------------------------------
-// Noticeboard C functions
-// ---------------------------------------------------------------------------
-
-/// @brief Write a key-value pair into the noticeboard under mutex
-/// @details The value is serialized to XIData here (in the main interpreter),
-/// so XIDATA_FREE is always safe to call from the same interpreter. The
-/// optional third argument is a sequence of CownCapsule objects whose
+/// @brief Serialize a value and commit it to the noticeboard under mutex
+/// @details Shared body of @ref _core_noticeboard_write_direct and
+/// @ref _core_noticeboard_seed. The value is serialized to XIData here (in
+/// the main interpreter), so XIDATA_FREE is always safe to call from the
+/// same interpreter. @p cowns is a sequence of CownCapsule objects whose
 /// underlying BOCCowns are referenced by the serialized bytes; the
-/// noticeboard takes a strong reference on each so that they outlive
-/// every reader's pickled view, regardless of whether the original
-/// CownCapsule is dropped by user code.
-/// @param self The module
-/// @param args Tuple of (key: str, value: object[, cowns: sequence])
-/// @return Py_None on success, NULL on error
-static PyObject *_core_noticeboard_write_direct(PyObject *self,
-                                                PyObject *args) {
-  BOC_STATE_SET(self);
-
-  if (BOC_STATE->index != 0) {
-    PyErr_SetString(PyExc_RuntimeError,
-                    "noticeboard_write_direct must be called from the primary "
-                    "interpreter");
-    return NULL;
-  }
-
-  if (noticeboard_check_thread("noticeboard_write_direct") < 0) {
-    return NULL;
-  }
-
-  const char *key;
-  Py_ssize_t key_len;
-  PyObject *value;
-  PyObject *cowns = Py_None;
-
-  if (!PyArg_ParseTuple(args, "s#O|O", &key, &key_len, &value, &cowns)) {
-    return NULL;
-  }
-
-  // Pin the cowns BEFORE serializing so an error here does not leave us
-  // with a stored entry whose cowns can be freed under us.
+/// noticeboard takes a strong reference on each so that they outlive every
+/// reader's pickled view, regardless of whether the original CownCapsule is
+/// dropped by user code. Callers must enforce their own interpreter / thread
+/// preconditions before invoking this helper.
+/// @param key UTF-8 key (NUL-free, up to NB_KEY_SIZE-1 bytes).
+/// @param key_len Length of @p key in bytes (no trailing NUL).
+/// @param value The object to store.
+/// @param cowns Sequence of CownCapsules to pin, or Py_None.
+/// @return Py_None on success, NULL on error (PyErr set).
+static PyObject *nb_serialize_and_commit(const char *key, Py_ssize_t key_len,
+                                         PyObject *value, PyObject *cowns) {
   BOCCown **new_pins = NULL;
   int new_pin_count = 0;
   if (nb_pin_cowns(cowns, &new_pins, &new_pin_count) < 0) {
     return NULL;
   }
 
-  // Serialize the value to XIData in the main interpreter.
-  //
-  // *** DO NOT REMOVE the BOC_NB_CTX toggle below. ***
-  //
-  // The noticeboard is the ONE site that owns its own pin set
-  // (new_pins, populated above by nb_pin_cowns). Any CownCapsule we
-  // pickle as part of this value must therefore use the *borrowing*
-  // reconstructor: the noticeboard entry keeps the inner BOCCown
-  // alive across all readers, and each reader's snapshot takes its
-  // own fresh COWN_INCREF in _cown_capsule_from_pointer_borrowing.
-  //
-  // Without this toggle, CownCapsule_reduce would take a single
-  // COWN_INCREF (inheriting form) per pickle, but the inheriting
-  // reconstructor would *also* be invoked on every reader on every
-  // worker, leading to one consumed INCREF per unpickle and an
-  // eventual COWN_DECREF underflow → UAF.
-  //
-  // The context also carries the pin set so that every CownCapsule the
-  // pickler reduces can be audited against it: any cown the value
-  // reaches via a custom __reduce__ / __getstate__ / copyreg dispatch
-  // but that _gather_pins missed is flagged on ctx.unpinned_count and
-  // the whole write is failed below. This closes the CWE-416 path
-  // where a hidden cown would otherwise produce a borrowing token
-  // whose underlying BOCCown is not held alive by the noticeboard
-  // entry, causing the first reader after the writer's wrapper drops
-  // to resurrect a freed pointer.
-  //
-  // BOC_NB_CTX is thread-local and the noticeboard write thread is
-  // single-threaded by construction (see noticeboard_check_thread
-  // above), so it must be NULL on entry. A non-NULL value would mean
-  // a prior write left a dangling stack pointer behind — a memory-
-  // safety precondition violation, not a recoverable error. The
-  // Py_FatalError fence enforces the invariant in release wheels too.
   if (BOC_NB_CTX != NULL) {
+    // BOC_NB_CTX is thread-local, so it must be NULL on entry; non-NULL means a
+    // prior write on this thread leaked a stack pointer (a memory-safety
+    // precondition violation, not a recoverable error).
     Py_FatalError("noticeboard borrowing context leaked across writes");
   }
   NoticeboardPickleCtx nb_ctx = {
@@ -632,7 +515,6 @@ static PyObject *_core_noticeboard_write_direct(PyObject *self,
     if (xidata != NULL) {
       XIDATA_FREE(xidata);
     }
-    // Roll back the pins we just took.
     for (int i = 0; i < new_pin_count; i++) {
       COWN_DECREF(new_pins[i]);
     }
@@ -643,17 +525,6 @@ static PyObject *_core_noticeboard_write_direct(PyObject *self,
   bool is_pickled = (pickled == Py_True);
   Py_DECREF(pickled);
 
-  // Pin-set audit. If the pickler reduced any CownCapsule whose inner
-  // BOCCown was NOT in our pre-pinned set, the serialized bytes
-  // contain a borrowing token to a cown that nothing in the
-  // noticeboard entry keeps alive. The first reader to resurrect that
-  // pointer after the writer's local Cown wrappers drop would call
-  // COWN_INCREF on freed memory (the cown will have been returned to
-  // the recycler) → CWE-416. Fail closed: discard the bytes, roll
-  // back the pins, raise. The error propagates to the noticeboard
-  // thread's outer try/except in Behaviors.noticeboard(), which logs
-  // a warning and drops the message; the entry is never installed,
-  // so readers cannot observe a partially-pinned snapshot.
   if (nb_ctx.unpinned_count > 0) {
     int unpinned = nb_ctx.unpinned_count;
     XIDATA_FREE(xidata);
@@ -672,21 +543,91 @@ static PyObject *_core_noticeboard_write_direct(PyObject *self,
     return NULL;
   }
 
-  // noticeboard_write takes ownership of xidata + pins on success and
-  // frees them on failure.
   if (noticeboard_write(key, key_len, xidata, is_pickled, new_pins,
                         new_pin_count) < 0) {
     return NULL;
   }
 
-  // Note: this thread's snapshot cache is intentionally NOT cleared.
-  // Within a behavior, a writer must not observe its own write — that
-  // is the no-polling invariant. The cache will be lazily revalidated
-  // at the next behavior boundary (see _core_noticeboard_cache_clear).
-
   Py_RETURN_NONE;
 }
 
+/// @brief Write a key-value pair into the noticeboard under mutex
+/// @details Called only by the registered noticeboard mutator thread (via the
+/// Python @c notice_write helper). Enforces the single-writer invariant so the
+/// read-modify-write in @c notice_update has no TOCTOU window, then delegates
+/// to @ref nb_serialize_and_commit.
+/// @param self The module
+/// @param args Tuple of (key: str, value: object[, cowns: sequence])
+/// @return Py_None on success, NULL on error
+static PyObject *_core_noticeboard_write_direct(PyObject *self,
+                                                PyObject *args) {
+  BOC_STATE_SET(self);
+
+  if (BOC_STATE->index != 0) {
+    PyErr_SetString(PyExc_RuntimeError,
+                    "noticeboard_write_direct must be called from the primary "
+                    "interpreter");
+    return NULL;
+  }
+
+  if (noticeboard_check_thread("noticeboard_write_direct") < 0) {
+    return NULL;
+  }
+
+  const char *key;
+  Py_ssize_t key_len;
+  PyObject *value;
+  PyObject *cowns = Py_None;
+
+  if (!PyArg_ParseTuple(args, "s#O|O", &key, &key_len, &value, &cowns)) {
+    return NULL;
+  }
+
+  return nb_serialize_and_commit(key, key_len, value, cowns);
+}
+
+/// @brief Synchronously seed the noticeboard from the primary interpreter
+/// @details Unlike @ref _core_noticeboard_write_direct, this entry point does
+/// not require the registered mutator thread: it commits directly under
+/// @ref noticeboard_write's mutex from any thread on the primary interpreter
+/// and returns only after the entry is live and @c NB_VERSION has been bumped.
+/// It exists so the primary interpreter can populate the noticeboard before
+/// scheduling behaviors that read it. Worker interpreters are rejected because
+/// XIData is serialized on the calling interpreter here. On success the
+/// calling thread's snapshot cache is re-armed so the seeding thread's own
+/// next @ref noticeboard_snapshot revalidates and observes the write.
+/// @param self The module
+/// @param args Tuple of (key: str, value: object[, cowns: sequence])
+/// @return Py_None on success, NULL on error
+static PyObject *_core_noticeboard_seed(PyObject *self, PyObject *args) {
+  BOC_STATE_SET(self);
+
+  if (BOC_STATE->index != 0) {
+    PyErr_SetString(PyExc_RuntimeError,
+                    "noticeboard_seed must be called from the primary "
+                    "interpreter");
+    return NULL;
+  }
+
+  const char *key;
+  Py_ssize_t key_len;
+  PyObject *value;
+  PyObject *cowns = Py_None;
+
+  if (!PyArg_ParseTuple(args, "s#O|O", &key, &key_len, &value, &cowns)) {
+    return NULL;
+  }
+
+  PyObject *result = nb_serialize_and_commit(key, key_len, value, cowns);
+  if (result != NULL) {
+    // The calling thread has no behavior boundary to re-arm its cache, so clear
+    // the version-checked flag here; otherwise a warm snapshot would keep
+    // returning the pre-seed value and break read-your-writes on this thread.
+    noticeboard_cache_clear_for_behavior();
+  }
+  return result;
+}
+
 /// @brief Return a cached read-only snapshot of the noticeboard
 /// @details Three fast paths, in order:
 ///   1. If @ref NB_VERSION_CHECKED is true, the cached proxy was already
@@ -771,9 +712,6 @@ static PyObject *_core_noticeboard_delete(PyObject *self, PyObject *args) {
     return NULL;
   }
 
-  // Note: this thread's snapshot cache is intentionally NOT cleared;
-  // the no-polling invariant applies equally to deletes.
-
   Py_RETURN_NONE;
 }
 
@@ -840,95 +778,6 @@ static PyObject *_core_clear_noticeboard_thread(PyObject *self,
   Py_RETURN_NONE;
 }
 
-/// @brief Allocate a fresh notice_sync sequence number.
-/// @details Atomically increments @ref NB_SYNC_REQUESTED and returns the
-/// new value. The caller posts @c ("sync", N) on the @c boc_noticeboard
-/// tag and then waits via @ref _core_notice_sync_wait until that sequence
-/// has been processed.
-/// @param self The module (unused)
-/// @param args Unused
-/// @return A Python int with the caller's seq.
-static PyObject *_core_notice_sync_request(PyObject *self,
-                                           PyObject *Py_UNUSED(args)) {
-  BOC_STATE_SET(self);
-  return PyLong_FromLongLong((long long)notice_sync_request());
-}
-
-/// @brief Mark a notice_sync sequence as processed and wake waiters.
-/// @details Called from the noticeboard-thread Python arm when it pops
-/// a @c ("sync", N) sentinel off the queue. Stores @c max(processed, N)
-/// into @ref NB_SYNC_PROCESSED (defensive against any reordering, though
-/// the MPSC tag is FIFO) and broadcasts @ref NB_SYNC_COND.
-/// @param self The module (unused)
-/// @param args A tuple @c (N,) — the sequence number being completed
-/// @return Py_None
-static PyObject *_core_notice_sync_complete(PyObject *self, PyObject *args) {
-  BOC_STATE_SET(self);
-  if (BOC_STATE->index != 0) {
-    PyErr_SetString(PyExc_RuntimeError,
-                    "notice_sync_complete must be called from the primary "
-                    "interpreter");
-    return NULL;
-  }
-  long long seq;
-  if (!PyArg_ParseTuple(args, "L", &seq)) {
-    return NULL;
-  }
-
-  Py_BEGIN_ALLOW_THREADS notice_sync_complete((int_least64_t)seq);
-  Py_END_ALLOW_THREADS
-
-      Py_RETURN_NONE;
-}
-
-/// @brief Block until @p my_seq has been processed by the noticeboard thread.
-/// @details Loops on @ref NB_SYNC_COND under @ref NB_SYNC_MUTEX until
-/// @ref NB_SYNC_PROCESSED is at least @p my_seq, or until @p timeout
-/// seconds elapse. A negative or @c None timeout means wait forever.
-/// Releases the GIL across the wait.
-/// @param self The module (unused)
-/// @param args A tuple @c (my_seq, timeout) — int and float-or-None
-/// @return @c True on success, @c False on timeout.
-static PyObject *_core_notice_sync_wait(PyObject *self, PyObject *args) {
-  BOC_STATE_SET(self);
-  long long my_seq;
-  PyObject *timeout_obj;
-  if (!PyArg_ParseTuple(args, "LO", &my_seq, &timeout_obj)) {
-    return NULL;
-  }
-
-  bool wait_forever = false;
-  double timeout = 0.0;
-  if (timeout_obj == Py_None) {
-    wait_forever = true;
-  } else {
-    timeout = PyFloat_AsDouble(timeout_obj);
-    if (timeout == -1.0 && PyErr_Occurred()) {
-      return NULL;
-    }
-    // Boundary validation: rejects NaN as ValueError, maps +Inf to
-    // wait_forever, clamps negatives to 0. Centralised so future
-    // wait entry points can reuse it.
-    if (boc_validate_finite_timeout(timeout, &timeout, &wait_forever) < 0) {
-      return NULL;
-    }
-  }
-
-  bool ok;
-  Py_BEGIN_ALLOW_THREADS ok =
-      notice_sync_wait((int_least64_t)my_seq, timeout, wait_forever);
-  Py_END_ALLOW_THREADS
-
-      if (ok) {
-    Py_RETURN_TRUE;
-  }
-  Py_RETURN_FALSE;
-}
-
-// ---------------------------------------------------------------------------
-// Terminator entry points.
-// ---------------------------------------------------------------------------
-
 /// @brief Try to register a new behavior with the terminator.
 /// @details Returns the post-increment count on success, or -1 if the
 /// terminator is closed (runtime is shutting down). The double-check of
@@ -997,9 +846,6 @@ static PyObject *_core_terminator_wait(PyObject *self, PyObject *args) {
     if (timeout == -1.0 && PyErr_Occurred()) {
       return NULL;
     }
-    // Boundary validation: rejects NaN as ValueError, maps +Inf to
-    // wait_forever, clamps negatives to 0. Centralised so future
-    // wait entry points can reuse it.
     if (boc_validate_finite_timeout(timeout, &timeout, &wait_forever) < 0) {
       return NULL;
     }
@@ -1107,11 +953,6 @@ static PyObject *_core_terminator_reset(PyObject *self,
   int_least64_t prior_count = 0;
   int_least64_t prior_seeded = 0;
   terminator_reset(&prior_count, &prior_seeded);
-  // Pump-watchdog state: depth and timestamps must return to the
-  // depth==0 baseline so the watchdog does not carry stale "queue
-  // has been non-empty since X" / "last warn at Y" readings into
-  // the next run. MAIN_PINNED_DEPTH is reset by the drain in
-  // stop_workers; the rest live here.
   atomic_store(&MAIN_PINNED_NONEMPTY_SINCE_NS, 0);
   atomic_store(&WATCHDOG_LAST_WARN_NS, 0);
   atomic_store(&LAST_PUMP_NS, 0);
@@ -1172,7 +1013,7 @@ typedef struct boc_cown {
   /// @c request_start_enqueue_inner via @c atomic_exchange on the
   /// 2PL link path; read by successors to discover their
   /// predecessor.
-  atomic_intptr_t last; // (BOCRequest *)
+  atomic_intptr_t last;
   /// @brief Atomic reference count for the cown
   atomic_int_least64_t rc;
   /// @brief Atomic weak reference count for the cown
@@ -1185,7 +1026,6 @@ static inline int_least64_t cown_weak_decref(BOCCown *cown) {
            cown, cown->id, weak_rc);
 
   if (weak_rc == 0) {
-    // reference count is truly zero, we can free the memory
     PyMem_RawFree(cown);
     BOC_REF_TRACKING_REMOVE_COWN();
   }
@@ -1213,8 +1053,6 @@ static inline void report_unhandled_exception(BOCCown *cown) {
   fprintf(stderr, "Cown(%p) contains an unhandled exception: ", cown);
 
   if (cown->value != NULL) {
-    // Owning-interpreter path: the value is real and will be Py_CLEAR'd
-    // by cown_decref_inline below as part of normal teardown.
     PyObject_Print(cown->value, stderr, 0);
     fprintf(stderr, "\n");
     return;
@@ -1226,7 +1064,6 @@ static inline void report_unhandled_exception(BOCCown *cown) {
     return;
   }
 
-  // Pickled path: borrow the value just long enough to print it.
   PyObject *borrowed = xidata_to_object(cown->xidata, cown->pickled);
 
   if (borrowed == NULL) {
@@ -1246,16 +1083,6 @@ static void BOCRecycleQueue_enqueue(BOCRecycleQueue *queue, XIDATA_T *xidata);
 /// @brief Atomic decref for the cown
 /// @param cown the cown to decref
 /// @return the new reference count
-// Within this TU we want every COWN_INCREF / COWN_DECREF callsite below
-// to inline directly into its caller — losing that on the schedule /
-// release hot path costs measurable throughput. Mirror CPython's
-// Py_INCREF (inline header macro) vs _Py_IncRef (out-of-line ABI export)
-// pattern: keep `static inline` bodies as the in-TU implementation,
-// expose extern wrappers under the names declared in `boc_cown.h` for
-// boc_noticeboard.c, and override the macros from boc_cown.h to bind locally to
-// the inline versions. The one earlier callsite (the write_direct error
-// rollback above this point) is on an error path and stays bound to the
-// extern wrapper from boc_cown.h — not hot.
 
 static inline int_least64_t cown_decref_inline(BOCCown *cown) {
   int_least64_t rc = atomic_fetch_add(&cown->rc, -1) - 1;
@@ -1271,27 +1098,13 @@ static inline int_least64_t cown_decref_inline(BOCCown *cown) {
     report_unhandled_exception(cown);
   }
 
-  // we can clear the object and recycle the xidata
   if (cown->value != NULL) {
     if (cown->is_pinned) {
-      // Pinned cowns hold a main-interpreter PyObject* in
-      // ``cown->value``. Running ``Py_CLEAR`` from a worker that
-      // happens to drop the last handle would invoke the value's
-      // destructor on the wrong interpreter — undefined behaviour
-      // under PEP 684 and ``Py_GIL_DISABLED``. The safe ship choice
-      // is a controlled leak: skip the clear here, surface the leak
-      // via ``PyErr_WriteUnraisable`` so it is at least observable
-      // in test runs, and let the main interpreter's process-exit
-      // reclaim the bytes. Callers that want zero leaks must keep
-      // pinned-cown handles on main.
       if (bocpy_interpid() != bocpy_main_interpid()) {
-        // Preserve any pending exception on the calling thread:
-        // ``PyErr_WriteUnraisable(NULL)`` writes-and-clears the
-        // error indicator, so a caller mid-unwind would silently
-        // lose its in-flight exception when this leak path fires
-        // (e.g. interpreter shutdown that raced a worker dropping
-        // the last handle). Fetch around the format/write pair and
-        // restore on the way out.
+        // Off-main last drop: leak the value (skip Py_CLEAR) rather than run
+        // its destructor on the wrong interpreter (PEP 684 UB); Fetch/Restore
+        // keeps a caller's in-flight exception alive across WriteUnraisable's
+        // clear.
         PyObject *prev_exc_type, *prev_exc_val, *prev_exc_tb;
         PyErr_Fetch(&prev_exc_type, &prev_exc_val, &prev_exc_tb);
         PyErr_Format(PyExc_RuntimeError,
@@ -1315,23 +1128,14 @@ static inline int_least64_t cown_decref_inline(BOCCown *cown) {
   if (cown->xidata != NULL) {
     assert(!cown->is_pinned);
 
-    // Deserialize-and-drop encoded xidata so embedded CownCapsule INCREFs
-    // balance on orphan death (CWE-401): CownCapsule_reduce takes an
-    // inheriting COWN_INCREF per embedded BOCCown that is normally
-    // consumed when the bytes are unpickled; running pickle.loads + DECREF
-    // here lets CPython's GC fire the matching COWN_DECREFs recursively.
-    // Gated on `pickled` because native XIData round-trips (e.g. Matrix)
-    // cannot embed CownCapsule and would just waste a pickle round-trip.
     if (cown->pickled) {
-      // Preserve any in-flight error across the deserialize;
-      // PyErr_WriteUnraisable below clears it otherwise.
+      // Deserialize-and-drop to fire the recursive COWN_DECREFs that balance
+      // the COWN_INCREFs baked into the pickle bytes (else those cowns leak).
       PyObject *prev_exc_type, *prev_exc_val, *prev_exc_tb;
       PyErr_Fetch(&prev_exc_type, &prev_exc_val, &prev_exc_tb);
 
       PyObject *drained = xidata_to_object(cown->xidata, true);
       if (drained == NULL) {
-        // Partial unpickle: already-decoded capsules unwind cleanly; tail-end
-        // opcodes leak, same as cown_acquire on failure. Surface as unraisable.
         PyErr_WriteUnraisable(NULL);
       } else {
         Py_DECREF(drained);
@@ -1370,9 +1174,6 @@ static inline int_least64_t cown_incref_inline(BOCCown *cown) {
 /// @brief Out-of-line export consumed by other TUs (see @ref boc_cown.h).
 int_least64_t cown_incref(BOCCown *cown) { return cown_incref_inline(cown); }
 
-// Rebind COWN_INCREF / COWN_DECREF to the inline forms so every
-// remaining callsite below (acquire/release/dispatch hot paths) does
-// not pay an indirect call.
 #undef COWN_INCREF
 #undef COWN_DECREF
 #define COWN_INCREF(c) cown_incref_inline((c))
@@ -1436,9 +1237,6 @@ static BOCCown *BOCCown_new(PyObject *value) {
   cown->exception = false;
   cown->is_pinned = false;
   atomic_store_intptr(&cown->last, 0);
-  // each cown starts with both a strong and weak reference
-  // the weak reference will only be decremented when the strong
-  // reference count is zero.
   atomic_store(&cown->rc, 1);
   atomic_store(&cown->weak_rc, 1);
   cown_set_value(cown, value);
@@ -1464,26 +1262,19 @@ static XIDATA_T *BOCRecycleQueue_dequeue(BOCRecycleQueue *queue,
   intptr_t tail_ptr = (intptr_t)queue->tail;
   intptr_t next_ptr = atomic_load_intptr(&tail->next);
   if (next_ptr == 0) {
-    // two possibilities:
-    // 1. queue is empty
-    // 2. queue is inconsistent
     if (!wait_for_consistency) {
-      // whatever this is can wait until the queue is back in a good state
       return NULL;
     }
 
     if (queue->head == tail_ptr) {
-      // the queue is consistent, but empty
       return NULL;
     }
 
-    // the queue is inconsistent, so we spin/wait for step 3 to complete above
     while (next_ptr == 0) {
       next_ptr = atomic_load_intptr(&tail->next);
     }
   }
 
-  // we can proceed to dequeue the tail
   XIDATA_T *data = tail->xidata;
   queue->tail = (BOCRecycleNode *)next_ptr;
   PyMem_RawFree(tail);
@@ -1547,7 +1338,6 @@ static void BOCRecycleQueue_recycle(BOCRecycleQueue *queue, XIDATA_T *xidata) {
 
   Py_DECREF(xidata_ptr);
 
-  // manual clear
   if (xidata->data != NULL) {
     if (xidata->free != NULL) {
       xidata->free(xidata->data);
@@ -1574,28 +1364,20 @@ static void BOCRecycleQueue_enqueue(BOCRecycleQueue *queue, XIDATA_T *xidata) {
 #endif
 
   if (queue == BOC_STATE->recycle_queue) {
-    // no need to enqueue, this is on the local interpreter
     BOCRecycleQueue_recycle(queue, xidata);
     return;
   }
 
-  // allocate space for the next item
   BOCRecycleNode *node =
       (BOCRecycleNode *)PyMem_RawMalloc(sizeof(BOCRecycleNode));
   node->xidata = NULL;
   atomic_store_intptr(&node->next, 0);
 
-  // step 1: swap the new node in as the new head
   intptr_t node_ptr = (intptr_t)node;
   intptr_t old_head_ptr = atomic_exchange_intptr(&queue->head, node_ptr);
   BOCRecycleNode *old_head = (BOCRecycleNode *)old_head_ptr;
-  // queue is now inconsistent
-  // step 2: store the data in this node. This node is somewhere inside the
-  // queue.
   old_head->xidata = xidata;
-  // step 3: connect everything back together
   atomic_store_intptr(&old_head->next, node_ptr);
-  // queue is consistent
 }
 
 /// @brief Empty out the queue and free the contents
@@ -1813,18 +1595,6 @@ static PyObject *CownCapsule_acquired(PyObject *op,
 /// @return -1 if failure, 0 if success
 static int cown_acquire(BOCCown *cown) {
   if (cown->is_pinned) {
-    // Pinned cowns are permanently owned by main and never serialised:
-    // the structural owner-CAS would also reject worker callers (their
-    // bocpy_interpid() never matches BOCPY_NO_OWNER on a pinned cown),
-    // but the short-circuit avoids a wasted CAS and the xidata-NULL
-    // poisoning guard further down.
-    //
-    // The interpreter check is a runtime guard, not an assert: pinned
-    // acquire must never run off-main (it would race the main pump on
-    // ``cown->value`` without the MCS ordering protecting unpinned
-    // cowns). Release builds promote this to a hard ``RuntimeError``
-    // so a structural bug surfaces deterministically instead of
-    // silently corrupting state.
     if (bocpy_interpid() != bocpy_main_interpid()) {
       PyErr_Format(PyExc_RuntimeError,
                    "cannot acquire pinned cown %p from non-main "
@@ -1841,7 +1611,6 @@ static int cown_acquire(BOCCown *cown) {
   int_least64_t desired = bocpy_interpid();
   if (!atomic_compare_exchange_strong(&cown->owner, &expected, desired)) {
     if (expected == desired) {
-      // already acquired by this interpreter
       return 0;
     }
 
@@ -1852,14 +1621,6 @@ static int cown_acquire(BOCCown *cown) {
     return -1;
   }
 
-  // Poisoned-state guard. If a previous acquire of this
-  // cown failed inside xidata_to_object, we discarded the bytes
-  // (their embedded inherited refcount contributions had already
-  // been partially consumed by pickle's error path) and rolled
-  // owner back to NO_OWNER. Re-running pickle.loads against those
-  // bytes would risk dereferencing freed BOCCown* pointers. Refuse
-  // cleanly so the worker recovery arm marks the result cown with
-  // .exception = True and propagates a deterministic error.
   if (cown->xidata == NULL) {
     atomic_store(&cown->owner, (int_least64_t)BOCPY_NO_OWNER);
     PyErr_SetString(
@@ -1874,14 +1635,6 @@ static int cown_acquire(BOCCown *cown) {
   cown->value = xidata_to_object(cown->xidata, cown->pickled);
 
   if (cown->value == NULL) {
-    // pickle.loads may have partially constructed inner
-    // CownCapsules before failing, consuming inherited refcount
-    // contributions from the encoded BOCCown* pointers. The bytes
-    // are now "poisoned" — a retry could dereference freed memory.
-    // Discard the xidata so any future acquire fails fast via the
-    // cown->xidata == NULL guard above, and surface the original
-    // deserialisation exception unchanged so the worker recovery arm
-    // records it on the failing behavior's result cown.
     PyObject *exc_type, *exc_value, *exc_tb;
     PyErr_Fetch(&exc_type, &exc_value, &exc_tb);
     PyErr_NormalizeException(&exc_type, &exc_value, &exc_tb);
@@ -1890,13 +1643,6 @@ static int cown_acquire(BOCCown *cown) {
     cown->recycle_queue = NULL;
     cown->xidata = NULL;
 
-    /* Defense in depth: if the recycle-queue
-     * enqueue itself raised (Py_CLEAR running a __del__ side effect,
-     * MemoryError from PyMem_RawMalloc-adjacent paths), suppress that
-     * secondary error so PyErr_Restore reinstates the original
-     * deserialisation exception unchanged. Surface a one-line stderr
-     * note so the suppressed error is at least observable.
-     */
     if (PyErr_Occurred()) {
       PySys_WriteStderr(
           "cown_acquire: enqueue/recycle raised while poisoning cown %p; "
@@ -1939,12 +1685,6 @@ static PyObject *CownCapsule_acquire(PyObject *op, PyObject *Py_UNUSED(dummy)) {
 /// @return -1 if error, 0 otherwise
 static int cown_release(BOCCown *cown) {
   if (cown->is_pinned) {
-    // Pinned cowns never serialise out of main; release is a no-op so
-    // the value stays resident in main and the owner stays == main_id.
-    //
-    // Mirror the runtime guard in ``cown_acquire``: a release coming
-    // from a non-main interpreter is a structural bug, surface it as
-    // a hard ``RuntimeError`` instead of relying on debug asserts.
     if (bocpy_interpid() != bocpy_main_interpid()) {
       PyErr_Format(PyExc_RuntimeError,
                    "cannot release pinned cown %p from non-main "
@@ -1960,7 +1700,6 @@ static int cown_release(BOCCown *cown) {
   int_least64_t owner = atomic_load(&cown->owner);
   if (owner != expected) {
     if (owner == BOCPY_NO_OWNER) {
-      // already released
       return 0;
     }
 
@@ -1977,6 +1716,12 @@ static int cown_release(BOCCown *cown) {
   PyObject *pickled = object_to_xidata(cown->value, &cown->xidata);
 
   if (pickled == NULL) {
+    // Free the husk object_to_xidata left attached: it is unregistered, so a
+    // later decref would enqueue it onto a NULL recycle queue.
+    if (cown->xidata != NULL) {
+      XIDATA_FREE(cown->xidata);
+      cown->xidata = NULL;
+    }
     return -1;
   }
 
@@ -1989,7 +1734,6 @@ static int cown_release(BOCCown *cown) {
 
   int_least64_t desired = BOCPY_NO_OWNER;
   if (!atomic_compare_exchange_strong(&cown->owner, &expected, desired)) {
-    // this should never happen
     PyErr_SetString(PyExc_RuntimeError,
                     "Panic: contention on cown during release");
     return -1;
@@ -2024,11 +1768,6 @@ static PyObject *CownCapsule_release(PyObject *op, PyObject *Py_UNUSED(dummy)) {
 /// @return -1 if error, 0 otherwise
 static int cown_disown(BOCCown *cown) {
   if (cown->is_pinned) {
-    // Defense-in-depth. Pinned cowns must never be disowned: the
-    // value lives permanently on main. The existing owner-CAS below
-    // already rejects worker callers because owner is permanently
-    // main_id, but a direct main-thread call would otherwise drop the
-    // value. Treat as a successful no-op.
     assert(cown->owner == bocpy_main_interpid());
     return 0;
   }
@@ -2076,6 +1815,71 @@ static PyObject *CownCapsule_disown(PyObject *op, PyObject *Py_UNUSED(dummy)) {
   Py_RETURN_NONE;
 }
 
+/// @brief Consumes and returns the stored value, or re-raises a captured
+/// behavior exception.
+/// @details Mirrors Rust's @c Result::unwrap. Acquires the cown for the read,
+/// so the runtime must be quiescent: any behavior still in flight could be
+/// mutating the cown, so the call fails fast with @c RuntimeError instead of
+/// racing the worker. In-flight work is detected as
+/// @c TERMINATOR_COUNT - TERMINATOR_SEEDED > 0, which counts behavior holds
+/// independently of the Pyrona seed: this is exact even inside a
+/// seed-dropped window (e.g. a pinned behavior running on the main thread
+/// during @c quiesce / @c wait), where the bare @c count > 1 test would miss
+/// the last in-flight behavior. @b unwrap @b consumes the cown: the stored
+/// payload is taken by reference and the cown is reset to hold @c None before
+/// it is released, so the release re-serializes @c None rather than the
+/// payload. This is required for move-type values (e.g. @c Matrix), which
+/// alias a single backing store: re-serializing such a value through its
+/// registered move callback would flip ownership away from the caller's
+/// interpreter, leaving the returned object unusable. Consuming instead hands
+/// the payload to the caller with its ownership intact and empties the cown to
+/// a still-schedulable, free-anywhere @c None, so a second @c unwrap returns
+/// @c None. On success the consumed payload is returned; if the cown carries an
+/// unhandled behavior exception (flag read before the reset clears it) the
+/// payload is re-raised verbatim on the caller's thread. Lowering this to the
+/// capsule means a behavior that returns a @c Cown (which surfaces downstream
+/// as a bare CownCapsule) can still be unwrapped without rewrapping it in a
+/// Python @c Cown first.
+/// @param op The CownCapsule object
+/// @param Py_UNUSED (ignored)
+/// @return The stored value on success, or NULL with an exception set
+static PyObject *CownCapsule_unwrap(PyObject *op, PyObject *Py_UNUSED(dummy)) {
+  CownCapsuleObject *self = (CownCapsuleObject *)op;
+  BOCCown *cown = self->cown;
+
+  if (terminator_count() - terminator_seeded() > 0) {
+    PyErr_SetString(
+        PyExc_RuntimeError,
+        "Cown.unwrap() called while behaviors are still in flight. "
+        "Call quiesce() (or wait()) first so the producing behavior "
+        "has completed before reading its result.");
+    return NULL;
+  }
+
+  if (cown_acquire(cown) < 0) {
+    return NULL;
+  }
+
+  // Steal the payload and store None before releasing, so the release
+  // re-serializes None: keeps move-type ownership with the caller (see above).
+  PyObject *payload = Py_NewRef(cown->value);
+  bool was_exception = cown->exception;
+  cown_set_value(cown, Py_None);
+
+  if (cown_release(cown) < 0) {
+    Py_DECREF(payload);
+    return NULL;
+  }
+
+  if (!was_exception) {
+    return payload;
+  }
+
+  PyErr_SetObject((PyObject *)Py_TYPE(payload), payload);
+  Py_DECREF(payload);
+  return NULL;
+}
+
 static PyObject *CownCapsule_get_impl(PyObject *op, void *Py_UNUSED(dummy)) {
   return Py_NewRef(op);
 }
@@ -2136,10 +1940,6 @@ static PyObject *CownCapsule_reduce(PyObject *op, PyObject *Py_UNUSED(dummy)) {
     return NULL;
   }
 
-  // Resolve the cached reconstructor on the module state. The cown
-  // capsule type is registered via PyType_FromModuleAndSpec so
-  // PyType_GetModuleState returns the same struct as BOC_STATE without
-  // an import lookup.
   _core_module_state *state =
       (_core_module_state *)PyType_GetModuleState(Py_TYPE(op));
   if (state == NULL) {
@@ -2148,24 +1948,6 @@ static PyObject *CownCapsule_reduce(PyObject *op, PyObject *Py_UNUSED(dummy)) {
     return NULL;
   }
 
-  // Select reconstructor and INCREF the inner BOCCown when inheriting.
-  // The INCREF must be taken BEFORE we hand the pointer to PyTuple_Pack
-  // so that the resulting pickle bytes always carry a live reference;
-  // failure of PyTuple_Pack below rolls it back.
-  //
-  // When BOC_NB_CTX is non-NULL we are inside a noticeboard write and
-  // must (a) use the borrowing reconstructor and (b) verify that this
-  // cown is a member of the caller's pre-pinned set. A miss flags the
-  // ctx so the writer can fail-close after object_to_xidata returns;
-  // we still emit the borrowing token here because the pickler is
-  // already streaming opcodes and the writer will discard the
-  // resulting xidata wholesale (see the pin-set audit in
-  // _core_noticeboard_write_direct).
-  //
-  // The membership check is a linear scan over the pin array. Expected
-  // N is small (typical noticeboard values hold 0–3 cowns); the array
-  // already lives in one cache line up to N=8. If a profile ever shows
-  // this matters, sort new_pins[] in nb_pin_cowns and bsearch here.
   NoticeboardPickleCtx *nb_ctx = BOC_NB_CTX;
   bool borrowing = (nb_ctx != NULL);
   PyObject *reconstructor = borrowing ? state->cown_reconstructor_borrowing
@@ -2184,8 +1966,6 @@ static PyObject *CownCapsule_reduce(PyObject *op, PyObject *Py_UNUSED(dummy)) {
   } else {
     COWN_INCREF(cown);
   }
-  /* Cached pointer owned by the module; PyTuple_Pack INCREFs its
-   * argument so we do not borrow into the result tuple. */
 
   PyObject *args = PyTuple_Pack(2, ptr, pid_obj);
   Py_DECREF(ptr);
@@ -2212,6 +1992,7 @@ static PyMethodDef CownCapsule_methods[] = {
     {"acquire", CownCapsule_acquire, METH_NOARGS, NULL},
     {"release", CownCapsule_release, METH_NOARGS, NULL},
     {"disown", CownCapsule_disown, METH_NOARGS, NULL},
+    {"unwrap", CownCapsule_unwrap, METH_NOARGS, NULL},
     {"__reduce__", CownCapsule_reduce, METH_NOARGS, NULL},
     {NULL} /* Sentinel */
 };
@@ -2456,8 +2237,6 @@ int boc_main_pinned_enqueue(boc_bq_node_t *n) {
   if (prev == 0) {
     atomic_store(&MAIN_PINNED_NONEMPTY_SINCE_NS, boc_now_ns());
   }
-  // Wake any wait()-blocked main thread so it can re-evaluate (the
-  // main pump will drain MAIN_PINNED_QUEUE before re-blocking).
   terminator_wake_all();
   return 0;
 }
@@ -2525,9 +2304,6 @@ static PyObject *_core_set_pump_watchdog(PyObject *self, PyObject *args,
     return NULL;
   }
   atomic_store(&WATCHDOG_WARN_MS, warn_ms);
-  // Swap the callback under refcount discipline. The previous slot
-  // is decref'd after the store so the new readers see the new
-  // callback before the old one can run to zero.
   PyObject *new_cb = (on_starve == Py_None) ? NULL : Py_NewRef(on_starve);
   PyObject *prev =
       (PyObject *)atomic_exchange_intptr(&WATCHDOG_ON_STARVE, (intptr_t)new_cb);
@@ -2605,7 +2381,6 @@ XIDATA_GETDATA_FUNC(_cown_shared) {
 
   PRINTDBG("_cown_shared(%p)\n", cown);
 
-  // all we do to initialise the xidata is store a pointer to the cown
   XIDATA_INIT(xidata, tstate->interp, cown, obj, _new_cown_object);
   return 0;
 }
@@ -2707,13 +2482,11 @@ void _contents_shared_free(void *data) {
 /// @return 0 if successful, -1 otherwise
 int _contents_shared(PyThreadState *tstate, PyObject *obj, XIDATA_T **out_ptr) {
   if (!PySequence_Check(obj)) {
-    // not a sequence
     return -1;
   }
 
   Py_ssize_t num_items = PySequence_Length(obj);
   if (num_items < 0) {
-    // Length not implemented (i.e., not a finite sequence)
     return -1;
   }
 
@@ -2762,7 +2535,6 @@ int _contents_shared(PyThreadState *tstate, PyObject *obj, XIDATA_T **out_ptr) {
              shared, i, item->ob_type->tp_name, item, Py_REFCNT(item));
 
     if (pickled == NULL) {
-      // wasn't possible to convert the object to xidata
       goto error;
     }
 
@@ -2801,8 +2573,6 @@ static BOCQueue *get_queue_for_tag(PyObject *tag) {
     return NULL;
   }
 
-  // First we check to see if we already have cached the queue this tag is
-  // associated with
   BOCQueue *qptr = BOC_QUEUES;
   for (size_t i = 0; i < BOC_QUEUE_COUNT; ++i, ++qptr) {
     if (BOC_STATE->queue_tags[i] != NULL) {
@@ -2811,7 +2581,6 @@ static BOCQueue *get_queue_for_tag(PyObject *tag) {
         BOC_STATE->queue_tags[i] = NULL;
       } else {
         if (tag_compare_with_PyUnicode(BOC_STATE->queue_tags[i], tag) == 0) {
-          // this is the dedicated queue for this tag
           return qptr;
         } else {
           if (PyErr_Occurred() != NULL) {
@@ -2819,99 +2588,53 @@ static BOCQueue *get_queue_for_tag(PyObject *tag) {
           }
         }
 
-        // not the right queue, keep looking
         continue;
       }
     }
 
-    // check to see if another interpreter has used this queue
     int_least64_t expected = BOC_QUEUE_UNASSIGNED;
     int_least64_t desired = BOC_QUEUE_ASSIGNED;
-    // Pre-check the slot state with a non-allocating load before
-    // committing to a `tag_from_PyUnicode` allocation. Iterating
-    // across many already-ASSIGNED slots while looking for the
-    // dedicated queue of a new tag must NOT allocate per iteration:
-    // the CAS would fail on every ASSIGNED slot and the speculative
-    // tag would immediately be `tag_release`d, turning a cold-start
-    // queue scan into O(BOC_QUEUE_COUNT) malloc/free pairs.
-    //
-    // Only attempt the publish-before-CAS allocation when the slot
-    // is actually UNASSIGNED. The CAS that follows is still needed
-    // to win the slot against a racing peer; on CAS loss we tag-
-    // release and fall through to the discovery branch below
-    // exactly as the prior code did.
     int_least64_t observed = atomic_load(&qptr->state);
     if (observed == BOC_QUEUE_UNASSIGNED) {
-      // Allocate the tag *before* the CAS so that an allocation failure
-      // (UTF-8 error / OOM in tag_from_PyUnicode) leaves the slot in
-      // BOC_QUEUE_UNASSIGNED — peer interpreters can re-attempt and we
-      // never publish ASSIGNED-with-NULL-tag (which would wedge readers
-      // in the busy-wait below). The new tag arrives with rc=1; on CAS
-      // loss we tag_release it (the slot is owned by some other peer
-      // who is responsible for publishing their own tag).
       BOCTag *new_tag = tag_from_PyUnicode(tag, qptr);
       if (new_tag == NULL) {
         return NULL;
       }
       if (atomic_compare_exchange_strong(&qptr->state, &expected, desired)) {
-        // we're the first, this is the new dedicated queue for this tag
         PRINTDBG("Assigning ");
         PRINTOBJDBG(tag);
         PRINTFDBG(" to queue %zu\n", i);
-        // Publish the tag pointer with release semantics so the busy-wait
-        // below sees the non-NULL tag after observing ASSIGNED. The tag
-        // already has rc=1 (queue's owning reference). We then add the
-        // per-interpreter cache reference (rc=2). This replaces the prior
-        // rc=0-then-double-INCREF idiom whose incref window allowed a
-        // racing TAG_DECREF to free a freshly published tag.
         atomic_store_intptr(&qptr->tag, (intptr_t)new_tag);
         BOC_STATE->queue_tags[i] = new_tag;
         TAG_INCREF(new_tag);
         return qptr;
       }
 
-      // CAS lost — another interpreter assigned this slot first. Release
-      // our speculative allocation; we'll fall through to the post-CAS
-      // discovery branch below to pick up the winner's tag.
       TAG_DECREF(new_tag);
     } else {
-      // Slot was already ASSIGNED (or DISABLED) when we looked. Mirror
-      // the post-CAS-failure exit values so the discovery branch below
-      // sees the same `expected` it would have gotten from a failed CAS.
       expected = observed;
     }
 
-    // this queue has already been assigned
     if (expected == BOC_QUEUE_DISABLED) {
-      // queue is being reconfigured by set_tags — skip it
       continue;
     }
 
     BOCTag *qtag = (BOCTag *)atomic_load_intptr(&qptr->tag);
     while (qtag == NULL) {
-      // waiting for another interpreter to allocate and assign
       qtag = (BOCTag *)atomic_load_intptr(&qptr->tag);
     }
 
-    // Discovery path: the qptr->tag pointer is owned by the publisher's
-    // queue reference. Add a per-interpreter cache reference.
     BOC_STATE->queue_tags[i] = qtag;
     TAG_INCREF(qtag);
 
     PRINTDBG("Discovered %s at queue %" PRIdLEAST64 "\n", qtag->str, i);
     if (tag_compare_with_PyUnicode(BOC_STATE->queue_tags[i], tag) == 0) {
-      // this is the dedicated queue for this tag
       return qptr;
     } else if (PyErr_Occurred() != NULL) {
       return NULL;
     }
-
-    // not the right queue, keep looking
   }
 
-  // No queue for this tag — dump observed slot state to stderr so that
-  // intermittent failures (e.g. memory-ordering races on weak-memory
-  // architectures) leave a forensic trail even in release builds.
   fprintf(stderr, "[bocpy] get_queue_for_tag: no queue found for tag ");
   PyObject_Print(tag, stderr, Py_PRINT_RAW);
   fprintf(stderr, " (interpreter index=%" PRIdLEAST64 ")\n", BOC_STATE->index);
@@ -2936,13 +2659,6 @@ static BOCQueue *get_queue_for_tag(PyObject *tag) {
 /// @param contents The contents of the message.
 /// @return A message object
 static BOCMessage *boc_message_new(PyObject *tag, PyObject *contents) {
-  // Zero-init so any later boc_message_free on a partially-built
-  // message sees NULL for `tag`, `xidata`, and `recycle_queue` and
-  // safely no-ops the TAG_DECREF / BOCRecycleQueue_enqueue arms.
-  // Without this, callers must remember to PyMem_RawFree (rather
-  // than boc_message_free) on every early-error path that occurs
-  // before the explicit field assignments below — an invariant
-  // that is easy to break when adding new failure points.
   BOCMessage *message = (BOCMessage *)PyMem_RawCalloc(1, sizeof(BOCMessage));
   if (message == NULL) {
     PyErr_NoMemory();
@@ -2952,10 +2668,6 @@ static BOCMessage *boc_message_new(PyObject *tag, PyObject *contents) {
   BOCQueue *qptr = get_queue_for_tag(tag);
   if (qptr == NULL) {
     PyMem_RawFree(message);
-    // Only set the capacity-exhaustion KeyError if get_queue_for_tag
-    // did not already raise (e.g. UnicodeEncodeError on surrogates,
-    // PyMem_RawMalloc OOM in tag_from_PyUnicode). Overwriting a
-    // pending exception masks the true failure cause.
     if (!PyErr_Occurred()) {
       PyErr_Format(PyExc_KeyError,
                    "No queue available for tag %R: tag capacity exceeded", tag);
@@ -2965,19 +2677,12 @@ static BOCMessage *boc_message_new(PyObject *tag, PyObject *contents) {
 
   BOCTag *qtag = (BOCTag *)atomic_load_intptr(&qptr->tag);
   if (qtag == NULL) {
-    // non-assigned tag — allocate one for this message. The new tag
-    // arrives with rc=1; ownership transfers to message->tag and is
-    // released by boc_message_free.
     message->tag = tag_from_PyUnicode(tag, qptr);
     if (message->tag == NULL) {
       PyMem_RawFree(message);
       return NULL;
     }
   } else {
-    // qtag is owned by qptr->tag (publisher's queue reference). Take
-    // a separate owning reference for message->tag so a concurrent
-    // set_tags that swaps qptr->tag and tag_disables the old one
-    // does not free it out from under us.
     message->tag = qtag;
     TAG_INCREF(message->tag);
   }
@@ -3029,17 +2734,13 @@ static BOCMessage *boc_message_new(PyObject *tag, PyObject *contents) {
 static int boc_enqueue(BOCMessage *message) {
   BOCQueue *qptr = message->tag->queue;
 
-  // get the current tail
   int_least64_t tail = atomic_load(&qptr->tail);
   while (true) {
-    // get the current head
     int_least64_t head = atomic_load(&qptr->head);
     if (tail - head >= BOC_CAPACITY) {
-      // the queue is full
       return -1;
     }
 
-    // attempt to enqueue
     if (atomic_compare_exchange_strong(&qptr->tail, &tail, tail + 1)) {
       PRINTDBG("Enqueued %s at q%" PRIdLEAST64 "[%" PRIdLEAST64
                "] (%" PRIdLEAST64 " - %" PRIdLEAST64 " = %" PRIdLEAST64 ")\n",
@@ -3050,10 +2751,8 @@ static int boc_enqueue(BOCMessage *message) {
 
       boc_atomic_fetch_add_u64_explicit(&qptr->pushed_total, 1, BOC_MO_RELAXED);
 
-      // If any receiver is parked on this queue's condvar, wake it.
-      // The seq_cst load synchronizes with the consumer's seq_cst increment
-      // of waiters, ensuring that either we see the waiter and signal, or the
-      // consumer's re-check dequeue (under the same mutex) finds our message.
+      // seq_cst pairs with the consumer's seq_cst waiters increment: either we
+      // see the waiter and signal, or its re-check dequeue finds our message.
       if (atomic_load_explicit(&qptr->waiters, memory_order_seq_cst) > 0) {
         boc_park_lock(qptr);
         boc_park_signal(qptr);
@@ -3063,7 +2762,6 @@ static int boc_enqueue(BOCMessage *message) {
       return 0;
     }
 
-    // someone else got there first, try again
     boc_atomic_fetch_add_u64_explicit(&qptr->enqueue_cas_retries, 1,
                                       BOC_MO_RELAXED);
   }
@@ -3094,21 +2792,17 @@ static int_least64_t boc_dequeue(PyObject *tag, BOCMessage **message) {
   int_least64_t tail = atomic_load(&qptr->tail);
   int_least64_t count = tail - head;
   if (count == 0) {
-    // queue is empty
     return -1;
   }
 
   while (head < tail) {
-    // attempt to dequeue a message
     if (!atomic_compare_exchange_strong(&qptr->head, &head, head + 1)) {
       if (head >= tail) {
-        // queue is empty
         return -1;
       }
 
       PRINTDBG("Unable to dequeue at head=%" PRIdLEAST64 "\n", head);
 
-      // someone else already consumed this, try again
       boc_atomic_fetch_add_u64_explicit(&qptr->dequeue_cas_retries, 1,
                                         BOC_MO_RELAXED);
       tail = atomic_load(&qptr->tail);
@@ -3117,7 +2811,6 @@ static int_least64_t boc_dequeue(PyObject *tag, BOCMessage **message) {
 
     int_least64_t index = head % BOC_CAPACITY;
     while (qptr->messages[index] == NULL) {
-      // spin in case the message has not yet been written
       Py_BEGIN_ALLOW_THREADS thrd_sleep(&SLEEP_TS, NULL);
       Py_END_ALLOW_THREADS
     }
@@ -3199,10 +2892,9 @@ static PyObject *receive_single_tag(PyObject *tag, bool do_timeout,
                                     double end_time, PyObject *after) {
   BOCQueue *qptr = get_queue_for_tag(tag);
   BOCMessage *message = NULL;
-  struct timespec backoff = {0, 1000}; // 1 µs, only used when do_timeout
+  struct timespec backoff = {0, 1000};
 
   while (true) {
-    // Phase 1: Spin
     for (int spin = 0; spin < BOC_SPIN_COUNT; ++spin) {
       BOCRecycleQueue_empty(BOC_STATE->recycle_queue, false);
 
@@ -3220,7 +2912,6 @@ static PyObject *receive_single_tag(PyObject *tag, bool do_timeout,
       }
     }
 
-    // Phase 2a: Timed — exponential backoff (no parking)
     if (do_timeout) {
       if (boc_now_s() > end_time) {
         goto timed_out;
@@ -3235,19 +2926,16 @@ static PyObject *receive_single_tag(PyObject *tag, bool do_timeout,
       continue;
     }
 
-    // Phase 2b: Untimed — park on condvar (indefinite wait)
     if (qptr == NULL) {
       PyErr_Format(PyExc_KeyError, "No message queue found for tag: %R", tag);
       return NULL;
     }
 
     boc_park_lock(qptr);
-    // seq_cst increment synchronizes with the seq_cst load in boc_enqueue,
-    // ensuring that either the producer sees our waiter count and signals,
-    // or our re-check dequeue below finds the producer's message.
+    // seq_cst pairs with boc_enqueue's waiters load (lost-wakeup fence); the
+    // boc_dequeue re-check under the park lock closes the publish/park race.
     atomic_fetch_add_explicit(&qptr->waiters, 1, memory_order_seq_cst);
 
-    // Re-check under lock (prevents lost wake)
     int_least64_t queue_index = boc_dequeue(tag, &message);
     if (queue_index >= 0) {
       atomic_fetch_sub_explicit(&qptr->waiters, 1, memory_order_seq_cst);
@@ -3263,14 +2951,12 @@ static PyObject *receive_single_tag(PyObject *tag, bool do_timeout,
 
     Py_BEGIN_ALLOW_THREADS;
     boc_park_wait(qptr);
-    // Wake: we hold park_mutex but NOT the GIL.
-    // Release mutex BEFORE re-acquiring GIL to avoid ABBA deadlock:
-    // consumer (mutex → GIL) vs producer (GIL → mutex).
+    // Drop the park mutex before re-acquiring the GIL: consumer takes
+    // mutex->GIL, producer takes GIL->mutex; unlocking first avoids ABBA.
     atomic_fetch_sub_explicit(&qptr->waiters, 1, memory_order_seq_cst);
     boc_park_unlock(qptr);
     Py_END_ALLOW_THREADS;
 
-    // Re-resolve queue pointer (set_tags may have reassigned it)
     BOCQueue *new_qptr = get_queue_for_tag(tag);
     if (new_qptr == NULL) {
       PyErr_SetString(PyExc_RuntimeError, "Tag invalidated during receive");
@@ -3326,12 +3012,11 @@ static PyObject *receive_multi_tag(PyObject *tags_fast, Py_ssize_t tags_size,
                                    PyObject *after) {
   BOCMessage *message = NULL;
   size_t tag_index = 0;
-  struct timespec backoff = {0, 1000}; // 1 µs
+  struct timespec backoff = {0, 1000};
 
   while (true) {
     BOCRecycleQueue_empty(BOC_STATE->recycle_queue, false);
 
-    // Round-robin: try one tag per iteration
     PyObject *tag = PySequence_Fast_GET_ITEM(tags_fast, tag_index);
     tag_index = (tag_index + 1) % tags_size;
 
@@ -3453,7 +3138,6 @@ static PyObject *_core_receive(PyObject *module, PyObject *args,
     end_time = boc_now_s() + timeout;
   }
 
-  // Dispatch: single-tag vs multi-tag
   if (tags_fast != NULL) {
     return receive_multi_tag(tags_fast, tags_size, do_timeout, end_time, after);
   }
@@ -3541,9 +3225,6 @@ PyObject *_core_drain(PyObject *module, PyObject *args) {
 /// @brief Atomic counter for BOC behaviors
 atomic_int_least64_t BOC_BEHAVIOR_COUNT = 0;
 
-// Forward declaration so BOCBehavior can hold an array of request pointers;
-// the BOCRequest struct itself is defined further down (next to the request
-// helpers).
 struct boc_request;
 
 /// @brief Encapsulates a behavior's request for a cown.
@@ -3627,19 +3308,6 @@ typedef struct behavior_s {
   int16_t owner_worker_index;
 } BOCBehavior;
 
-// Layout note. The intrusive queue link (`bq_node`) and the OR-fold
-// `pinned` byte live in a scheduler-owned `boc_behavior_prehdr_t`
-// allocated immediately before each BOCBehavior — CPython
-// `_PyGC_Head` / `_Py_AS_GC()` style. See `boc_sched.h` for the
-// prehdr definition and the `BOC_BEHAVIOR_PREHDR(b)` recovery macro.
-// `behavior_new` / `behavior_free` / the token allocator below own
-// the combined allocation; the rest of `_core.c` keeps treating
-// BOCBehavior as an ordinary pointer-indirect struct.
-
-// Recover a BOCBehavior pointer from the prehdr's bq_node. Inverse
-// of `BOC_BEHAVIOR_PREHDR`; used by the worker pop sites that pull
-// nodes out of the scheduler queues. `bq_node` sits at offset 0 of
-// the prehdr, so the cast IS the container_of (no offsetof needed).
 #define BEHAVIOR_FROM_PREHDR_NODE(node)                                        \
   ((BOCBehavior *)(((boc_behavior_prehdr_t *)(node)) + 1))
 
@@ -3651,17 +3319,9 @@ typedef struct behavior_capsule_object {
 #define BehaviorCapsule_CheckExact(op)                                         \
   Py_IS_TYPE((op), BOC_STATE->behavior_capsule_type)
 
-// Forward declaration: defined alongside the request helpers further down.
-// behavior_free uses it to clean up any unreleased request array if a
-// behavior is destroyed without going through behavior_release_all.
 static void request_decref(BOCRequest *request);
 
 BOCBehavior *behavior_new() {
-  // Combined `prehdr + BOCBehavior` allocation per the pre-header
-  // scheme (see `boc_sched.h` and the layout note above the struct).
-  // The returned pointer is past the prehdr so all existing
-  // `BOCBehavior *` consumers stay unchanged; `behavior_free`
-  // recovers the allocation origin via `BOC_BEHAVIOR_PREHDR(b)`.
   void *raw =
       PyMem_RawMalloc(sizeof(boc_behavior_prehdr_t) + sizeof(BOCBehavior));
   if (raw == NULL) {
@@ -3670,10 +3330,6 @@ BOCBehavior *behavior_new() {
   }
 
   boc_behavior_prehdr_t *prehdr = (boc_behavior_prehdr_t *)raw;
-  // Zero the prehdr in full — `pinned`, `_reserved`, and the
-  // intrusive link's `next_in_queue` (the boc_bq_* enqueue path
-  // requires this field to start NULL, and we are still under the
-  // GIL before the behaviour can be reached from any other thread).
   memset(prehdr, 0, sizeof(*prehdr));
 
   BOCBehavior *behavior = (BOCBehavior *)(prehdr + 1);
@@ -3689,9 +3345,6 @@ BOCBehavior *behavior_new() {
   behavior->captures = NULL;
   behavior->requests = NULL;
   behavior->requests_size = 0;
-  // Ordinary behaviours are not fairness tokens. Token allocation
-  // is performed directly in `_core_scheduler_runtime_start` and
-  // bypasses `behavior_new`.
   behavior->is_token = 0;
   behavior->owner_worker_index = -1;
   BOC_REF_TRACKING_ADD_BEHAVIOR();
@@ -3735,11 +3388,6 @@ void behavior_free(BOCBehavior *behavior) {
   }
 
   if (behavior->requests != NULL) {
-    // Defensive cleanup: if a behavior is destroyed without
-    // behavior_release_all having been called (e.g. a scheduling failure
-    // mid-2PL), drop the owner ref on each request. If a successor is
-    // still holding a concurrent ref (unlikely here since the behavior
-    // never linked), the free is deferred until that successor's decref.
     for (Py_ssize_t i = 0; i < behavior->requests_size; ++i) {
       if (behavior->requests[i] != NULL) {
         request_decref(behavior->requests[i]);
@@ -3752,8 +3400,6 @@ void behavior_free(BOCBehavior *behavior) {
     BOCTag_free(behavior->thunk);
   }
 
-  // Free at the combined allocation's origin (one slot before the
-  // BOCBehavior, per the pre-header scheme).
   PyMem_RawFree(BOC_BEHAVIOR_PREHDR(behavior));
   BOC_REF_TRACKING_REMOVE_BEHAVIOR();
 }
@@ -3922,8 +3568,6 @@ static int BehaviorCapsule_init(PyObject *op, PyObject *args,
     return -1;
   }
 
-  // PyMem_RawCalloc with nelem == 0 is implementation-defined (may return
-  // NULL legally), so only treat NULL as failure when args_size > 0.
   behavior->group_ids = PyMem_RawCalloc((size_t)args_size, sizeof(int));
   if (args_size > 0 && behavior->group_ids == NULL) {
     Py_DECREF(cowns);
@@ -3949,11 +3593,6 @@ static int BehaviorCapsule_init(PyObject *op, PyObject *args,
 
   Py_DECREF(cowns_list_fast);
 
-  // Publish the OR-fold to the prehdr. The scheduler reads this via
-  // `boc_behavior_node_is_pinned` from `boc_sched_dispatch` to route
-  // pinned-touching behaviours onto MAIN_PINNED_QUEUE instead of a
-  // worker WSQ. Token behaviours never reach this path (their prehdr
-  // stays zero-initialised by `_core_scheduler_runtime_start`).
   BOC_BEHAVIOR_PREHDR(behavior)->pinned = pinned;
 
   behavior->args = add_vars(cowns, &behavior->args_size);
@@ -3970,10 +3609,8 @@ static int BehaviorCapsule_init(PyObject *op, PyObject *args,
     return -1;
   }
 
-  // We add two additional counts. One for the result, and another so that
-  // the 2PL is finished before we start running the thunk. Without this,
-  // the calls to release at the end of the thunk could race with the calls to
-  // finish_enqueue in the 2PL.
+  // +2 over the cown args: one for the result cown, one held until 2PL phase 2
+  // completes so the thunk's release calls cannot race finish_enqueue.
   behavior->count = (int_least64_t)(behavior->args_size + 2);
 
   return 0;
@@ -4029,9 +3666,6 @@ static int behavior_resolve_one(BOCBehavior *behavior) {
   if (count == 0) {
     BEHAVIOR_INCREF(behavior);
     if (boc_sched_dispatch(&BOC_BEHAVIOR_PREHDR(behavior)->bq_node) < 0) {
-      // Roll back the queue-owned reference we just took. The
-      // dispatch failure means no consumer will ever see this
-      // behavior, so no DECREF will fire from the worker side.
       BEHAVIOR_DECREF(behavior);
       return -1;
     }
@@ -4094,8 +3728,6 @@ static PyObject *BehaviorCapsule_create_requests(PyObject *op,
     return NULL;
   }
 
-  // Result cown always gets a request (it cannot collide with any args
-  // cown — args cowns are user-visible, the result cown is fresh).
   BOCRequest *result_request = request_new_inner(behavior->result);
   if (result_request == NULL) {
     PyMem_RawFree(requests);
@@ -4107,9 +3739,6 @@ static PyObject *BehaviorCapsule_create_requests(PyObject *op,
   BOCCown **ptr = behavior->args;
   for (Py_ssize_t i = 0; i < behavior->args_size; ++i, ++ptr) {
     BOCCown *cown = *ptr;
-    // Linear dedup against the existing entries. args_size is small in
-    // practice (bounded by the cown count of a single @when call), so
-    // O(n^2) here is fine.
     bool seen = false;
     for (Py_ssize_t j = 1; j < count; ++j) {
       if (requests[j]->target == cown) {
@@ -4119,8 +3748,6 @@ static PyObject *BehaviorCapsule_create_requests(PyObject *op,
     }
 
     if (seen) {
-      // Compensate behavior->count for the duplicate that won't enter
-      // the MCS queue (and therefore won't call resolve_one itself).
       if (behavior_resolve_one(behavior) < 0) {
         for (Py_ssize_t k = 0; k < count; ++k) {
           request_decref(requests[k]);
@@ -4142,18 +3769,13 @@ static PyObject *BehaviorCapsule_create_requests(PyObject *op,
     requests[count++] = request;
   }
 
-  // Sort by target so the 2PL enqueue order is deterministic.
   qsort(requests, (size_t)count, sizeof(BOCRequest *), request_cmp_target);
 
-  // Hand ownership of the array to the BOCBehavior.
   behavior->requests = requests;
   behavior->requests_size = count;
 
   PyObject *list = PyList_New(count);
   if (list == NULL) {
-    // Ownership has already been transferred — behavior_free (or
-    // behavior_release_all if the caller still tries to dispatch) will
-    // clean up.
     return NULL;
   }
 
@@ -4174,8 +3796,6 @@ static int behavior_release_all_impl(BOCBehavior *behavior) {
     return 0;
   }
 
-  // Detach the array from the behavior up front so behavior_free's
-  // defensive cleanup will not double-free if anything below raises.
   BOCRequest **requests = behavior->requests;
   Py_ssize_t requests_size = behavior->requests_size;
   behavior->requests = NULL;
@@ -4183,7 +3803,6 @@ static int behavior_release_all_impl(BOCBehavior *behavior) {
 
   for (Py_ssize_t i = 0; i < requests_size; ++i) {
     if (request_release_inner(requests[i]) < 0) {
-      // Free the rest of the array even on error to limit the leak.
       for (Py_ssize_t k = i; k < requests_size; ++k) {
         request_decref(requests[k]);
       }
@@ -4230,18 +3849,8 @@ static PyObject *BehaviorCapsule_schedule(PyObject *op,
   BehaviorCapsuleObject *capsule = (BehaviorCapsuleObject *)op;
   BOCBehavior *behavior = capsule->behavior;
 
-  // Drain the caller's recycle queue opportunistically. The main
-  // interpreter ordinarily drains via its own receive() loop; a worker
-  // that calls @when from inside a behavior body (i.e. is the caller
-  // here) would otherwise have to wait until it returns to
-  // _core_scheduler_worker_pop before reclaiming any xidata pushed onto
-  // its queue by other interpreters. Non-blocking; the recycle queue is
-  // single-consumer (this interpreter), so the drain is safe.
   BOCRecycleQueue_empty(BOC_STATE->recycle_queue, false);
 
-  // Build the request array if it has not already been built (e.g. by an
-  // external caller having invoked create_requests first). create_requests
-  // is idempotent only via its own guard; here we just skip if populated.
   if (behavior->requests == NULL) {
     PyObject *list = BehaviorCapsule_create_requests(op, NULL);
     if (list == NULL) {
@@ -4253,33 +3862,14 @@ static PyObject *BehaviorCapsule_schedule(PyObject *op,
   BOCRequest **requests = behavior->requests;
   Py_ssize_t n = behavior->requests_size;
 
-  // Drop the GIL across the pure-atomic 2PL link/finish span. The
-  // inner ops (atomic_exchange on target->last, atomic_store on prev->next,
-  // BEHAVIOR_INCREF, the spin on prev->scheduled, behavior_resolve_one's
-  // count decrement) touch no Python state. behavior_resolve_one was made
-  // int-returning specifically so it has no Py_RETURN_NONE on the hot path.
-  //
-  // The only Python-state operation reachable from the inner code is the
-  // PyErr_SetString / boc_message_free pair on the count==0 + queue-full
-  // branch. count is sized args_size + 2 by BehaviorCapsule_init, and the
-  // link loop applies at most args_size decrements, so count >= 2 on every
-  // iteration -- the count==0 branch is unreachable here. The final
-  // behavior_resolve_one below runs UNDER the GIL and may legitimately
-  // hit that branch (queue full); it remains the only PyErr surface.
   bool ok = true;
   Py_BEGIN_ALLOW_THREADS for (Py_ssize_t i = 0; i < n; ++i) {
-    // Phase 1: link this request into its cown's MCS queue. The only
-    // failure mode is the unreachable PyErr path documented above; if it
-    // somehow fires, surface it as a generic error after re-acquiring
-    // the GIL (we cannot raise here).
     if (request_start_enqueue_inner(requests[i], behavior) < 0) {
       ok = false;
       break;
     }
   }
   if (ok) {
-    // Phase 2: mark each request scheduled. Pure atomic stores; releases
-    // the spin in any successor that started linking concurrently.
     for (Py_ssize_t i = 0; i < n; ++i) {
       request_finish_enqueue_inner(requests[i]);
     }
@@ -4294,15 +3884,6 @@ static PyObject *BehaviorCapsule_schedule(PyObject *op,
     return NULL;
   }
 
-  // Final resolve_one to account for the +1 the constructor added so
-  // dispatch waits for the 2PL to complete (see BehaviorCapsule_init).
-  // Runs UNDER the GIL: it is the legitimate dispatcher of the start
-  // message and may set a Python exception on a queue-full failure.
-  //
-  // If the resolve_one below hits the runtime-down sentinel inside
-  // @ref boc_sched_dispatch, the BOCRequest chains linked above are
-  // intentionally not unwound; see @ref behavior_resolve_one for
-  // the full rationale.
   if (behavior_resolve_one(behavior) < 0) {
     return NULL;
   }
@@ -4484,13 +4065,11 @@ static PyObject *behavior_execute_impl(BOCBehavior *behavior,
 
   PyObject *group_list = NULL;
   int current_group_id = 0;
-  // args are passed as CownCapsule objects
   for (Py_ssize_t i = 0; i < behavior->args_size; ++i, ++ptr) {
     PyObject *capsule = cown_capsule_wrap(*ptr, false);
     int group_id = behavior->group_ids[i];
 
     if (group_id == current_group_id) {
-      // in a group, append to the current group
       if (PyList_Append(group_list, capsule) < 0) {
         Py_DECREF(thunk_args);
         Py_DECREF(group_list);
@@ -4502,7 +4081,6 @@ static PyObject *behavior_execute_impl(BOCBehavior *behavior,
     }
 
     if (group_list != NULL) {
-      // the current group is complete, add it
       PyTuple_SET_ITEM(thunk_args, arg_idx, group_list);
       arg_idx += 1;
       group_list = NULL;
@@ -4510,13 +4088,11 @@ static PyObject *behavior_execute_impl(BOCBehavior *behavior,
     }
 
     if (group_id > 0) {
-      // singleton
       PyTuple_SET_ITEM(thunk_args, arg_idx, capsule);
       arg_idx += 1;
       continue;
     }
 
-    // new group
     group_list = PyList_New(1);
     if (group_list == NULL) {
       Py_DECREF(thunk_args);
@@ -4528,14 +4104,12 @@ static PyObject *behavior_execute_impl(BOCBehavior *behavior,
   }
 
   if (group_list != NULL) {
-    // the final arg was a group
     PyTuple_SET_ITEM(thunk_args, arg_idx, group_list);
     arg_idx += 1;
     group_list = NULL;
     current_group_id = 0;
   }
 
-  // captures are passed as raw values
   ptr = behavior->captures;
   for (Py_ssize_t i = 0; i < behavior->captures_size; ++i, ++arg_idx, ++ptr) {
     PyObject *value = Py_NewRef((*ptr)->value);
@@ -4563,7 +4137,6 @@ static PyObject *behavior_execute_impl(BOCBehavior *behavior,
   PRINTDBG("Setting result.\n");
 
   if (result != NULL && strcmp(result->ob_type->tp_name, "Cown") == 0) {
-    // attempt to unwrap the cown
     PyObject *capsule = PyObject_GetAttrString(result, "impl");
     Py_DECREF(result);
     result = capsule;
@@ -4632,16 +4205,6 @@ static PyType_Spec BehaviorCapsule_Spec = {
     .flags = Py_TPFLAGS_DEFAULT | Py_TPFLAGS_IMMUTABLETYPE,
     .slots = BehaviorCapsule_slots};
 
-// ---------------------------------------------------------------------------
-// main_pump_bounded: drive the main-pinned queue from the main interpreter.
-// Mirrors `worker.run_behavior`'s layered try/finally so the per-iteration
-// cleanup (IN_PUMP_BODY clear + release pair + terminator_dec) always runs.
-// ---------------------------------------------------------------------------
-
-// Convert a Python deadline_ms (long or None) to an absolute monotonic-ns
-// deadline. Returns 0 when no deadline (None) or on invalid input; callers
-// validate non-None inputs separately so we only reach the conversion path
-// with a known-valid PyLong.
 static uint64_t deadline_or_zero(PyObject *deadline_ms) {
   if (deadline_ms == NULL || deadline_ms == Py_None) {
     return 0;
@@ -4654,8 +4217,6 @@ static uint64_t deadline_or_zero(PyObject *deadline_ms) {
   return boc_now_ns() + (uint64_t)ms * 1000000ULL;
 }
 
-// True if more iterations are permitted by the user's max_behaviors cap.
-// None / non-positive values are treated as unbounded.
 static bool max_behaviors_or_inf(PyObject *max_behaviors, Py_ssize_t executed) {
   if (max_behaviors == NULL || max_behaviors == Py_None) {
     return true;
@@ -4668,24 +4229,10 @@ static bool max_behaviors_or_inf(PyObject *max_behaviors, Py_ssize_t executed) {
   return (long long)executed < lim;
 }
 
-// Release-path error logger. Mirrors worker.run_behavior's outer-finally
-// logger.exception arms (release / release_all): log + swallow so a single
-// misbehaving step cannot strand the runtime.
 static inline void _core_log_release_error(void) {
   PyErr_WriteUnraisable(NULL);
 }
 
-// Pump-starvation watchdog warn-side check. Fires when the pinned
-// queue has been non-empty for at least WATCHDOG_WARN_MS without a
-// pump call making progress. Rate-limited: only one warn emission
-// per non-empty epoch (cleared when the queue drains).
-//
-// Three exit paths:
-//   * watchdog disabled (warn_ms == 0): no work.
-//   * queue empty (NONEMPTY_SINCE_NS == 0): no work.
-//   * threshold not crossed: no work.
-// Otherwise: invoke the user's on_starve callback if set, else log a
-// default warning. Called from the main interpreter only (pump entry).
 static void boc_main_pinned_check_warn(void) {
   uint64_t warn_ms = atomic_load(&WATCHDOG_WARN_MS);
   if (warn_ms == 0) {
@@ -4699,10 +4246,6 @@ static void boc_main_pinned_check_warn(void) {
   if (now_ns < since_ns || (now_ns - since_ns) < warn_ms * 1000000ULL) {
     return;
   }
-  // Rate-limit: one warn per non-empty epoch. The epoch closes when
-  // the queue drains (NONEMPTY_SINCE_NS -> 0); WATCHDOG_LAST_WARN_NS
-  // is reset alongside. A relaxed compare against since_ns suffices
-  // because both are monotonic and only main reads/writes them.
   uint64_t last_warn = atomic_load(&WATCHDOG_LAST_WARN_NS);
   if (last_warn >= since_ns) {
     return;
@@ -4721,7 +4264,6 @@ static void boc_main_pinned_check_warn(void) {
     return;
   }
   if (callback != NULL) {
-    // Severity 0 = warn (raise side would use 1; reserved for future).
     PyObject *res = PyObject_CallFunction(callback, "iO", 0, msg);
     if (res == NULL) {
       PyErr_WriteUnraisable(callback);
@@ -4747,10 +4289,6 @@ static void boc_main_pinned_check_warn(void) {
   Py_DECREF(msg);
 }
 
-// Acquire-failure capture. behavior_acquire_impl returned < 0 with
-// PyErr set. Stash the exception on the result cown so a consumer reading
-// it sees a diagnostic, then clear PyErr so the next iteration starts
-// clean.
 static void handle_pinned_acquire_failure(BOCBehavior *b) {
   PyObject *exc = PyErr_GetRaisedException();
   if (exc == NULL) {
@@ -4766,13 +4304,6 @@ static void handle_pinned_acquire_failure(BOCBehavior *b) {
   Py_DECREF(exc);
 }
 
-// Body-failure capture. behavior_execute_impl returned NULL.
-// Fork on `Exception` vs `BaseException`:
-//   - `Exception`: capture on the result cown, clear PyErr, count as
-//     raised, populate first_err under raise_on_error.
-//   - `BaseException` (KeyboardInterrupt, SystemExit, GeneratorExit):
-//     stash via the out-param. Restored by the caller AFTER per-iteration
-//     cleanup completes, so the cleanup arms run with PyErr clear.
 static void handle_pinned_body_exception(BOCBehavior *b, bool raise_on_error,
                                          PyObject **first_err,
                                          Py_ssize_t *raised,
@@ -4789,15 +4320,11 @@ static void handle_pinned_body_exception(BOCBehavior *b, bool raise_on_error,
     }
     Py_DECREF(exc);
   } else {
-    // Transfer ownership to the caller's stash slot.
     *base_err = exc;
   }
 }
 
 #ifdef Py_GIL_DISABLED
-// Portable thread-id source for the FT-only single-pumper CAS. Linux/BSD
-// use C11 `thrd_current`; macOS exposes pthread directly; Windows uses
-// GetCurrentThreadId. All cast to uintptr_t for atomic storage.
 static inline uintptr_t boc_pump_thread_id(void) {
 #if defined(_WIN32)
   return (uintptr_t)GetCurrentThreadId();
@@ -4850,7 +4377,6 @@ static PyObject *_core_main_pump_bounded(PyObject *Py_UNUSED(self),
   bool pump_cas_owner = false;
 #endif
 
-  // Gate 1: main interpreter only.
   if (bocpy_interpid() != bocpy_main_interpid()) {
     PyErr_SetString(PyExc_RuntimeError,
                     "pump() must be called from the main interpreter");
@@ -4858,7 +4384,6 @@ static PyObject *_core_main_pump_bounded(PyObject *Py_UNUSED(self),
   }
 
 #ifdef Py_GIL_DISABLED
-  // Gate 2: free-threaded single-pumper CAS.
   {
     intptr_t self_id = (intptr_t)boc_pump_thread_id();
     intptr_t expected = 0;
@@ -4871,13 +4396,9 @@ static PyObject *_core_main_pump_bounded(PyObject *Py_UNUSED(self),
                       "free-threaded build");
       goto pump_exit;
     }
-    // expected == self_id: re-entry on same thread; gate 3 rejects
-    // nested calls. pump_cas_owner stays false so we do NOT clear
-    // MAIN_PUMP_THREAD on exit (outer frame still owns it).
   }
 #endif
 
-  // Gate 3: nested pump.
   if (IN_PUMP_BODY) {
     PyErr_SetString(PyExc_RuntimeError,
                     "pump() is not reentrant; cannot be called from "
@@ -4898,8 +4419,6 @@ static PyObject *_core_main_pump_bounded(PyObject *Py_UNUSED(self),
     uint64_t new_depth = atomic_fetch_sub(&MAIN_PINNED_DEPTH, 1) - 1;
     if (new_depth == 0) {
       atomic_store(&MAIN_PINNED_NONEMPTY_SINCE_NS, 0);
-      // Close the watchdog warn epoch: the next time the queue
-      // becomes non-empty, the warn fires fresh.
       atomic_store(&WATCHDOG_LAST_WARN_NS, 0);
     }
 
@@ -4918,21 +4437,9 @@ static PyObject *_core_main_pump_bounded(PyObject *Py_UNUSED(self),
     if (acquired) {
       PyObject *rv = behavior_execute_impl(b, boc_export);
       if (rv == NULL) {
-        // Setup failure inside execute_impl (missing thunk,
-        // allocation error). PyErr is live; capture to the result
-        // cown so a consumer sees the diagnostic, then count it as
-        // a raised Exception (it can only be a non-BaseException).
         handle_pinned_body_exception(b, raise_on_error, &first_err, &raised,
                                      &base_err);
       } else {
-        // Body returned normally OR an exception was captured onto
-        // the result cown. Fork on the stored value type:
-        //   - exception flag set and value is BaseException-but-not-
-        //     Exception (KI/SystemExit/GeneratorExit): stash for
-        //     re-raise AFTER cleanup completes.
-        //   - exception flag set and value is Exception: count as
-        //     raised, populate first_err under raise_on_error.
-        //   - exception flag clear: ordinary return value.
         if (b->result->exception) {
           PyObject *captured = b->result->value;
           if (PyObject_IsInstance(captured, PyExc_Exception) == 1) {
@@ -4941,7 +4448,6 @@ static PyObject *_core_main_pump_bounded(PyObject *Py_UNUSED(self),
               first_err = Py_NewRef(captured);
             }
           } else {
-            // BaseException-but-not-Exception: re-raise after cleanup.
             base_err = Py_NewRef(captured);
           }
         }
@@ -4949,9 +4455,6 @@ static PyObject *_core_main_pump_bounded(PyObject *Py_UNUSED(self),
       }
     }
 
-    // release / release_all ALWAYS run (mirrors worker.run_behavior outer
-    // finally). cown_release tolerates partial-acquire by short-circuiting
-    // NO_OWNER cowns, so we can call it even when acquire failed midway.
     if (behavior_release_impl(b) < 0) {
       _core_log_release_error();
     }
@@ -4959,8 +4462,6 @@ static PyObject *_core_main_pump_bounded(PyObject *Py_UNUSED(self),
       _core_log_release_error();
     }
     terminator_dec();
-    // Drop the queue-owned BEHAVIOR_INCREF from behavior_resolve_one; the
-    // inline pump path has no BehaviorCapsule dealloc to do it for us.
     BEHAVIOR_DECREF(b);
 
     IN_PUMP_BODY = false;
@@ -5014,9 +4515,6 @@ static PyObject *_core_main_pump_bounded(PyObject *Py_UNUSED(self),
 /// @return Python int — number of behaviors drained.
 static PyObject *_core_main_pump_drain_all(PyObject *Py_UNUSED(self),
                                            PyObject *Py_UNUSED(args)) {
-  // Gate: drain runs only from the main interpreter. The pinned queue
-  // is single-consumer by design and the result-cown acquire below
-  // asserts main ownership.
   if (bocpy_interpid() != bocpy_main_interpid()) {
     PyErr_SetString(PyExc_RuntimeError,
                     "main_pump_drain_all must be called from the main "
@@ -5044,11 +4542,6 @@ static PyObject *_core_main_pump_drain_all(PyObject *Py_UNUSED(self),
     if (exc == NULL) {
       PyErr_WriteUnraisable(NULL);
     } else {
-      // Result cown sits in published-and-released state (NO_OWNER,
-      // xidata set, value NULL). Acquire on main, write the
-      // drop-exception, then release back to NO_OWNER so any consumer
-      // reading the result cown observes a deterministic diagnostic.
-      // Mirrors `BehaviorCapsule_set_drop_exception`.
       if (cown_acquire(b->result) < 0) {
         PyErr_WriteUnraisable(NULL);
       } else {
@@ -5061,16 +4554,10 @@ static PyObject *_core_main_pump_drain_all(PyObject *Py_UNUSED(self),
       Py_DECREF(exc);
     }
 
-    // MCS unlink + successor handoff on every request the drained
-    // behavior owned. Must run so dependent behaviors on the same
-    // cowns are advanced (even though we're shutting down, the
-    // request arrays still hold strong refs that need to be dropped).
     if (behavior_release_all_impl(b) < 0) {
       PyErr_WriteUnraisable(NULL);
     }
     terminator_dec();
-    // Drop the queue-owned BEHAVIOR_INCREF from behavior_resolve_one (same as
-    // _core_main_pump_bounded).
     BEHAVIOR_DECREF(b);
     drained++;
   }
@@ -5097,7 +4584,6 @@ XIDATA_GETDATA_FUNC(_behavior_shared) {
   BehaviorCapsuleObject *capsule = (BehaviorCapsuleObject *)obj;
   BOCBehavior *behavior = capsule->behavior;
 
-  // all we do to initialise the xidata is store a pointer to the behavior
   XIDATA_INIT(xidata, tstate->interp, behavior, obj, _new_behavior_object);
   return 0;
 }
@@ -5170,7 +4656,6 @@ static PyObject *request_wrap_borrowed(BOCRequest *request) {
 /// @param request The request to release
 /// @return 0 on success, -1 on error (Python exception set)
 static int request_release_inner(BOCRequest *request) {
-  // This code is effectively a MCS-style queue lock release.
   BOCBehavior *next = (BOCBehavior *)atomic_load_intptr(&request->next);
   if (next == NULL) {
     intptr_t expected_ptr = (intptr_t)request;
@@ -5180,17 +4665,8 @@ static int request_release_inner(BOCRequest *request) {
     }
   }
 
-  // Wait for the next pointer to be set by a successor's
-  // request_start_enqueue_inner. Release the GIL across the spin: the
-  // successor is advancing on another thread, may itself be under
-  // Py_BEGIN_ALLOW_THREADS (see BehaviorCapsule_schedule's link loop),
-  // and should not be blocked here. target->last has already been set
-  // past this request by the successor, so this spin terminates as
-  // soon as the successor's `atomic_store(&prev->next, behavior_ptr)`
-  // is visible. The spin is therefore bounded by another thread's
-  // atomic store; if it failed to terminate the runtime invariants
-  // would already be violated, so there is no useful interrupt to
-  // poll for here.
+  // CAS failed => a successor already swung `last` past us but may not have
+  // published request->next yet; spin until it does.
   Py_BEGIN_ALLOW_THREADS while (true) {
     next = (BOCBehavior *)atomic_load_intptr(&request->next);
     if (next) {
@@ -5226,8 +4702,6 @@ static int request_start_enqueue_inner(BOCRequest *request,
   intptr_t prev_ptr =
       atomic_exchange_intptr(&request->target->last, request_ptr);
   if (prev_ptr == 0) {
-    // there is no prior request queued on the cown, so we can immediately
-    // proceed
     if (behavior_resolve_one(behavior) < 0) {
       return -1;
     }
@@ -5236,40 +4710,21 @@ static int request_start_enqueue_inner(BOCRequest *request,
 
   intptr_t behavior_ptr = (intptr_t)behavior;
   BOCRequest *prev = (BOCRequest *)prev_ptr;
-  // Take a temporary ref on the predecessor request: we are about to
-  // spin on prev->scheduled below, and prev's owning behavior can run
-  // release_all concurrently once we have stored prev->next. Without
-  // this ref, the predecessor could be freed between our store of
-  // prev->next and our next load of prev->scheduled -- a UAF the
-  // distributed-release design must guard against because release runs
-  // on the worker thread, not on the same thread as the link loop. The
-  // matching decref happens after the spin completes. At the moment of
-  // the fetch_add, prev is still in
-  // the MCS queue for this cown (our exchange on target->last showed
-  // prev_ptr there), so prev cannot have been freed yet.
+  // Keep prev alive: once we publish prev->next, its owning behavior may run
+  // behavior_release_all on a worker thread and free it; request_decref drops
+  // this hold after the spin below.
   atomic_fetch_add(&prev->rc, 1);
   assert(atomic_load_intptr(&prev->next) == 0);
   atomic_store_intptr(&prev->next, behavior_ptr);
   PRINTDBG("request->next = bid=%" PRIdLEAST64 "\n", behavior->id);
   BEHAVIOR_INCREF(behavior);
-  // Order note: bocpy stores prev->next BEFORE spinning on
-  // prev->scheduled, the opposite of Verona's Slot::set_next which
-  // observes the predecessor's scheduled flag first. The inversion
-  // is safe because (a) the prev->rc++ above keeps prev alive across
-  // the window where prev's owning behavior may run release_all
-  // concurrently once prev->next is published, preventing the UAF
-  // such ordering would otherwise admit (see the rc-comment block
-  // above); and (b) the behavior dispatch invariant ensures no
-  // successor can run user code until ALL its requests have
-  // completed phase 2 (request_finish_enqueue_inner), so the
-  // predecessor cannot retire the chain prematurely while we spin.
+  // Spin until prev's owner marks it scheduled (2PL phase 2); the rc hold taken
+  // above keeps prev valid while we poll it.
   while (true) {
     if (atomic_load(&prev->scheduled)) {
       break;
     }
   }
-  // Drop the temporary ref; this may be the final ref if the
-  // predecessor's owner has already run release_all.
   request_decref(prev);
 
   return 0;
@@ -5375,14 +4830,11 @@ static PyObject *_core_set_tags(PyObject *module, PyObject *args) {
     return NULL;
   }
 
-  // go queue by queue, disable it, and set the new tag
   BOCQueue *qptr = BOC_QUEUES;
   for (Py_ssize_t i = 0; i < BOC_QUEUE_COUNT; ++i, ++qptr) {
-    // disable the queue
     atomic_store(&qptr->state, BOC_QUEUE_DISABLED);
 
     if (i >= tags_size) {
-      // clear the tags on these unused queues
       BOCTag *oldtag =
           (BOCTag *)atomic_exchange_intptr(&qptr->tag, (intptr_t)NULL);
       if (oldtag != NULL) {
@@ -5406,11 +4858,6 @@ static PyObject *_core_set_tags(PyObject *module, PyObject *args) {
       return NULL;
     }
 
-    // assign a new tag. tag_from_PyUnicode returned with rc=1, which
-    // is exactly the queue's owning reference — no extra TAG_INCREF
-    // is needed here. The previously-installed tag (if any) is
-    // disabled and released so any in-flight messages still holding
-    // owning refs to it can complete and free the tag when done.
     BOCTag *oldtag =
         (BOCTag *)atomic_exchange_intptr(&qptr->tag, (intptr_t)qtag);
     if (oldtag != NULL) {
@@ -5419,7 +4866,6 @@ static PyObject *_core_set_tags(PyObject *module, PyObject *args) {
     }
   }
 
-  // now that all of the queue tags have been updated, drain all messages
   qptr = BOC_QUEUES;
   for (Py_ssize_t i = 0; i < BOC_QUEUE_COUNT; ++i, ++qptr) {
     int_least64_t head = atomic_load(&qptr->head);
@@ -5428,7 +4874,6 @@ static PyObject *_core_set_tags(PyObject *module, PyObject *args) {
       for (int_least64_t i = head; i < tail; ++i) {
         int_least64_t index = i % BOC_CAPACITY;
         while (qptr->messages[index] == NULL) {
-          // spin waiting for the message to be written
           Py_BEGIN_ALLOW_THREADS thrd_sleep(&SLEEP_TS, NULL);
           Py_END_ALLOW_THREADS
         }
@@ -5438,7 +4883,6 @@ static PyObject *_core_set_tags(PyObject *module, PyObject *args) {
       }
     }
 
-    // reset the queue
     atomic_store(&qptr->head, 0);
     atomic_store(&qptr->tail, 0);
     if (i < tags_size) {
@@ -5448,7 +4892,6 @@ static PyObject *_core_set_tags(PyObject *module, PyObject *args) {
     }
   }
 
-  // Wake any receivers parked on condvars so they re-resolve their tags
   qptr = BOC_QUEUES;
   for (Py_ssize_t i = 0; i < BOC_QUEUE_COUNT; ++i, ++qptr) {
     if (atomic_load_explicit(&qptr->waiters, memory_order_seq_cst) > 0) {
@@ -5499,24 +4942,9 @@ static PyObject *_cown_capsule_from_pointer_inheriting(PyObject *module,
     return NULL;
   }
 
-  // Inherit the COWN_INCREF that CownCapsule_reduce took on the writer
-  // side; the pickle bytes carried a live reference and we are
-  // consuming it into this CownCapsule. The dealloc path issues the
-  // matching COWN_DECREF, balancing the original INCREF. No additional
-  // INCREF here — calling COWN_INCREF would double-pin and leak.
-  //
-  // Allocate the capsule BEFORE consuming the inherited pin so an
-  // OOM-failed ``tp_alloc`` does not strand the writer-side INCREF.
-  // The borrowing variant has the same ordering (alloc → INCREF →
-  // assign); keeping the two reconstructors structurally identical
-  // makes the contract easier to audit.
   PyTypeObject *type = BOC_STATE->cown_capsule_type;
   CownCapsuleObject *capsule = (CownCapsuleObject *)type->tp_alloc(type, 0);
   if (capsule == NULL) {
-    // ``tp_alloc`` failed; release the inherited pin so the writer
-    // side's COWN_INCREF in ``CownCapsule_reduce`` is balanced. Without
-    // this, every unpickle-time OOM leaks one strong reference to a
-    // BOCCown that the caller has already let go of.
     COWN_DECREF(cown);
     return NULL;
   }
@@ -5587,10 +5015,6 @@ static PyObject *_cown_capsule_from_pointer_borrowing(PyObject *module,
     return NULL;
   }
 
-  // Take a fresh strong reference for this capsule. Each unpickle is an
-  // independent live reference to the BOCCown; the dealloc path does the
-  // matching COWN_DECREF. The caller must guarantee the BOCCown is still
-  // alive at this point (see CownCapsule_reduce for the contract).
   PyTypeObject *type = BOC_STATE->cown_capsule_type;
   CownCapsuleObject *capsule = (CownCapsuleObject *)type->tp_alloc(type, 0);
   if (capsule == NULL) {
@@ -5652,18 +5076,16 @@ static PyObject *_core_cown_pin_pointers(PyObject *module, PyObject *args) {
     COWN_INCREF(cown);
     PyObject *ptr = PyLong_FromVoidPtr(cown);
     if (ptr == NULL) {
-      // Roll back the ref we just took before joining the cleanup loop.
       COWN_DECREF(cown);
       goto fail;
     }
-    PyList_SET_ITEM(result, i, ptr); // steals ref
+    PyList_SET_ITEM(result, i, ptr);
   }
 
   Py_DECREF(seq);
   return result;
 
 fail:
-  // Drop INCREFs for entries we already pre-pinned (indices 0..i-1).
   for (Py_ssize_t j = 0; j < i; j++) {
     PyObject *ptr_obj = PyList_GET_ITEM(result, j);
     BOCCown *c = (BOCCown *)PyLong_AsVoidPtr(ptr_obj);
@@ -5720,7 +5142,7 @@ static PyObject *_core_scheduler_stats(PyObject *Py_UNUSED(module),
       Py_DECREF(result);
       return NULL;
     }
-    PyList_SET_ITEM(result, i, d); // steals ref
+    PyList_SET_ITEM(result, i, d);
   }
   return result;
 }
@@ -5767,10 +5189,9 @@ static PyObject *_core_queue_stats(PyObject *Py_UNUSED(module),
         boc_atomic_load_u64_explicit(&qptr->popped_total, BOC_MO_RELAXED);
     PyObject *d = Py_BuildValue(
         "{s:n,s:N,s:K,s:K,s:K,s:K}", "queue_index", (Py_ssize_t)qptr->index,
-        "tag", tag_obj, // steals ref
-        "enqueue_cas_retries", (unsigned long long)enq_r, "dequeue_cas_retries",
-        (unsigned long long)deq_r, "pushed_total", (unsigned long long)pushed,
-        "popped_total", (unsigned long long)popped);
+        "tag", tag_obj, "enqueue_cas_retries", (unsigned long long)enq_r,
+        "dequeue_cas_retries", (unsigned long long)deq_r, "pushed_total",
+        (unsigned long long)pushed, "popped_total", (unsigned long long)popped);
     if (d == NULL) {
       Py_DECREF(result);
       return NULL;
@@ -5805,33 +5226,19 @@ static PyObject *_core_scheduler_runtime_start(PyObject *Py_UNUSED(module),
                     "scheduler_runtime_start: worker_count must be >= 0");
     return NULL;
   }
-  // Idempotent shutdown: safe whether or not a previous cycle ran.
   boc_sched_shutdown();
   if (boc_sched_init((Py_ssize_t)n) < 0) {
-    return NULL; // exception already set
+    return NULL;
   }
 
-  // Allocate one fairness-token BOCBehavior per worker. Tokens
-  // are zero-initialised so every refcount / cown-array field is the
-  // safe NULL state, and `is_token = 1` discriminates them at the
-  // worker-pop site. Allocation lives here (and not in
-  // `boc_sched_init`) because `boc_sched.c` deliberately treats
-  // `BOCBehavior` as opaque.
   for (Py_ssize_t i = 0; i < (Py_ssize_t)n; ++i) {
-    // Combined `prehdr + BOCBehavior` allocation per the pre-header
-    // scheme (see `boc_sched.h`). PyMem_RawCalloc zeroes both halves;
-    // the BOCBehavior pointer starts past the prehdr.
     void *token_raw =
         PyMem_RawCalloc(1, sizeof(boc_behavior_prehdr_t) + sizeof(BOCBehavior));
     if (token_raw == NULL) {
-      // Roll back any tokens already installed and tear the runtime
-      // back down so the caller sees a clean failure (no half-init).
       for (Py_ssize_t j = 0; j < i; ++j) {
         boc_bq_node_t *prev_node = boc_sched_get_token_node(j);
         boc_sched_set_token_node(j, NULL);
         if (prev_node != NULL) {
-          // prev_node points at the prehdr's bq_node (offset 0),
-          // so the cast IS the allocation origin.
           PyMem_RawFree((boc_behavior_prehdr_t *)prev_node);
         }
       }
@@ -5841,38 +5248,16 @@ static PyObject *_core_scheduler_runtime_start(PyObject *Py_UNUSED(module),
     }
     boc_behavior_prehdr_t *token_prehdr = (boc_behavior_prehdr_t *)token_raw;
     BOCBehavior *token = (BOCBehavior *)(token_prehdr + 1);
-    // Mark as token. PyMem_RawCalloc has zeroed everything (NULL
-    // thunk/result/args/captures/requests, count == rc == 0,
-    // prehdr->bq_node.next_in_queue == NULL, prehdr->pinned == 0).
-    // The behaviour is never reference-counted via
-    // BEHAVIOR_INCREF/DECREF and never visits the request/cown
-    // machinery; it is recycled in place by the token re-enqueue
-    // path. We give it an `id` of -1 so any diagnostic that prints
-    // `behavior->id` for a token is immediately recognisable.
     token->is_token = 1;
     token->id = -1;
     token->owner_worker_index = (int16_t)i;
     if (boc_sched_set_token_node(i, &token_prehdr->bq_node) < 0) {
-      // worker_index out of range: only possible if WORKER_COUNT
-      // changed under us, which the GIL precludes. Defensive.
       PyMem_RawFree(token_prehdr);
       boc_sched_shutdown();
       PyErr_SetString(PyExc_RuntimeError,
                       "scheduler_runtime_start: token install failed");
       return NULL;
     }
-    // Lazy bootstrap (Verona-faithful): we do NOT enqueue the token
-    // onto the worker's queue here. The worker's
-    // `should_steal_for_fairness` flag is already initialised to
-    // true by `boc_sched_init` (mirrors Verona `core.h:23` —
-    // `should_steal_for_fairness{true}`). The first time the worker
-    // has a non-empty queue and calls `pop_fast`, the fairness gate
-    // routes through `pop_slow`, whose arm re-enqueues this token
-    // from `self->token_work`. From then on the heartbeat is alive
-    // and self-sustaining: every owner-side fairness arm fire
-    // re-enqueues the token, and every token consumption (by owner
-    // or thief) sets the owner's flag back to true via the dispatch
-    // loop in `_core_scheduler_worker_pop`.
   }
 
   Py_RETURN_NONE;
@@ -5887,11 +5272,6 @@ static PyObject *_core_scheduler_runtime_start(PyObject *Py_UNUSED(module),
 /// @return Py_None.
 static PyObject *_core_scheduler_runtime_stop(PyObject *Py_UNUSED(module),
                                               PyObject *Py_UNUSED(dummy)) {
-  // Recover and free per-worker fairness tokens before
-  // `boc_sched_shutdown` frees the worker array. Each token is a
-  // bare `BOCBehavior` allocated by `_core_scheduler_runtime_start`
-  // via PyMem_RawCalloc; it never goes through behavior_free /
-  // BEHAVIOR_DECREF (zero refcount, no captured cowns).
   Py_ssize_t worker_count = boc_sched_worker_count();
   for (Py_ssize_t i = 0; i < worker_count; ++i) {
     boc_bq_node_t *node = boc_sched_get_token_node(i);
@@ -5899,9 +5279,6 @@ static PyObject *_core_scheduler_runtime_stop(PyObject *Py_UNUSED(module),
       continue;
     }
     boc_sched_set_token_node(i, NULL);
-    // The bq_node sits at offset 0 of the prehdr, so the node
-    // pointer IS the allocation origin (per the pre-header scheme;
-    // see `boc_sched.h`). Free at the prehdr.
     PyMem_RawFree((boc_behavior_prehdr_t *)node);
   }
   boc_sched_shutdown();
@@ -5985,34 +5362,13 @@ static PyObject *_core_scheduler_worker_pop(PyObject *Py_UNUSED(module),
                     "scheduler_worker_pop: thread not registered");
     return NULL;
   }
-  // Token-loop. Mirrors Verona `SchedulerThread::run_inner`
-  // (`schedulerthread.h`), which dequeues a `Work*`, executes its
-  // closure, and loops back if the closure was the per-Core
-  // `token_work`. bocpy keeps the loop here (rather than inside
-  // `boc_sched_worker_pop_*`) so the sched TU stays opaque to
-  // `BOCBehavior` layout: only this TU knows how to dereference
-  // `is_token`. The token's "thunk" body is the C-side helper
-  // `boc_sched_set_steal_flag(self, true)` — same effect as the
-  // Verona closure at `core.h:28-32`.
   BOCBehavior *behavior;
   for (;;) {
-    // Drain this worker interpreter's recycle queue. Cross-interpreter
-    // cown_acquire pushes the previous owner's xidata onto THAT owner's
-    // queue; only the owning interpreter is allowed to consume it (the
-    // recycle queue is single-consumer). Without this drain the worker
-    // never reclaims xidata that other workers/the main interpreter
-    // pushed onto its queue, and the corresponding BOCCown weak refs
-    // (taken by BOCRecycleQueue_register on every cown_release) are
-    // never released -- a steady leak of one BOCCown per cross-worker
-    // hop. The legacy receive("boc_behavior") loop drained on every
-    // spin (see receive_single_tag); the distributed-scheduler worker
-    // bypasses receive entirely, so the drain has to live here.
     BOCRecycleQueue_empty(BOC_STATE->recycle_queue, false);
     boc_bq_node_t *n = boc_sched_worker_pop_fast(self);
     if (n == NULL) {
       n = boc_sched_worker_pop_slow(self);
       if (n == NULL) {
-        // pop_slow returns NULL only when stop_requested is set.
         Py_RETURN_NONE;
       }
     }
@@ -6020,22 +5376,6 @@ static PyObject *_core_scheduler_worker_pop(PyObject *Py_UNUSED(module),
     if (!behavior->is_token) {
       break;
     }
-    // Token sentinel: set the OWNING worker's fairness flag, not
-    // ours. The token may have been stolen and is now running on
-    // a thief — but the heartbeat must report back to the owner so
-    // the owner's `pop_slow` fairness arm fires next time it has
-    // local work, re-enqueueing the token from the owner's
-    // `self->token_work` slot. Verona achieves the same effect by
-    // capturing the owning core's `this` in `Closure::make`
-    // (`core.h:24-32`); we use an explicit `owner_worker_index`
-    // field on the token because closures are not free in C.
-    //
-    // The token's `bq_node` is dropped here (NOT re-enqueued by
-    // this thread). The owner's slow-path arm is the only place
-    // that ever re-enqueues a token, and it always uses its own
-    // `token_work` slot — so the bq_node is owner-owned and
-    // single-producer for re-enqueue purposes (no cross-thread
-    // double-enqueue risk).
     boc_sched_worker_t *owner =
         boc_sched_worker_at(behavior->owner_worker_index);
     boc_sched_set_steal_flag(owner, true);
@@ -6046,8 +5386,6 @@ static PyObject *_core_scheduler_worker_pop(PyObject *Py_UNUSED(module),
   if (capsule == NULL) {
     return NULL;
   }
-  // Transfer the queue-owned reference into the capsule. Do NOT
-  // BEHAVIOR_INCREF: the producer already incref'd before dispatch.
   capsule->behavior = behavior;
   return (PyObject *)capsule;
 }
@@ -6092,29 +5430,22 @@ static PyObject *_core_scheduler_drain_all_queues(PyObject *Py_UNUSED(module),
       }
       BOCBehavior *behavior = BEHAVIOR_FROM_PREHDR_NODE(n);
       if (behavior->is_token) {
-        // Token sentinels are not reference-counted and own no
-        // cowns; they live in the per-worker `token_work` slot and
-        // are freed by `_core_scheduler_runtime_stop`. Skip them
-        // here so we don't hand a token to the Python release-all
-        // path (which would dereference NULL request arrays).
         continue;
       }
       BehaviorCapsuleObject *capsule =
           (BehaviorCapsuleObject *)type->tp_alloc(type, 0);
       if (capsule == NULL) {
-        // Rebalance the queue-owned reference we just popped before
-        // bailing — otherwise the behaviour leaks.
         BEHAVIOR_DECREF(behavior);
         Py_DECREF(out);
         return NULL;
       }
-      capsule->behavior = behavior; // ref transferred in
+      capsule->behavior = behavior;
       if (PyList_Append(out, (PyObject *)capsule) < 0) {
         Py_DECREF(capsule);
         Py_DECREF(out);
         return NULL;
       }
-      Py_DECREF(capsule); // list owns it now
+      Py_DECREF(capsule);
     }
   }
   return out;
@@ -6209,6 +5540,11 @@ static PyMethodDef _core_module_methods[] = {
     {"noticeboard_write_direct", _core_noticeboard_write_direct, METH_VARARGS,
      "noticeboard_write_direct($module, key, value, /)"
      "\n--\n\nWrites a key-value pair to the noticeboard."},
+    {"noticeboard_seed", _core_noticeboard_seed, METH_VARARGS,
+     "noticeboard_seed($module, key, value, /)"
+     "\n--\n\nSynchronously writes a key-value pair to the noticeboard "
+     "from the primary interpreter, committing before return. "
+     "Main-interp only."},
     {"noticeboard_snapshot", _core_noticeboard_snapshot, METH_NOARGS,
      "noticeboard_snapshot($module, /)"
      "\n--\n\nReturns a cached snapshot of the noticeboard as a dict."},
@@ -6228,15 +5564,6 @@ static PyMethodDef _core_module_methods[] = {
     {"clear_noticeboard_thread", _core_clear_noticeboard_thread, METH_NOARGS,
      "clear_noticeboard_thread($module, /)"
      "\n--\n\nClears the registered noticeboard mutator thread."},
-    {"notice_sync_request", _core_notice_sync_request, METH_NOARGS,
-     "notice_sync_request($module, /)"
-     "\n--\n\nAllocates a fresh notice_sync sequence number."},
-    {"notice_sync_complete", _core_notice_sync_complete, METH_VARARGS,
-     "notice_sync_complete($module, seq, /)"
-     "\n--\n\nMarks a notice_sync sequence as processed and wakes waiters."},
-    {"notice_sync_wait", _core_notice_sync_wait, METH_VARARGS,
-     "notice_sync_wait($module, seq, timeout, /)"
-     "\n--\n\nBlocks until the given notice_sync sequence is processed."},
     {"terminator_inc", _core_terminator_inc, METH_NOARGS,
      "terminator_inc($module, /)"
      "\n--\n\nIncrement the terminator. Returns new count or -1 if closed."},
@@ -6283,9 +5610,6 @@ static int _core_module_exec(PyObject *module) {
       qptr->messages =
           (BOCMessage **)PyMem_RawCalloc(BOC_CAPACITY, sizeof(BOCMessage *));
       if (qptr->messages == NULL) {
-        // Unwind the queues we already initialised. boc_park_init has
-        // been called for indices [0, i); any messages buffer they hold
-        // must be freed.
         for (size_t j = 0; j < i; ++j) {
           PyMem_RawFree(BOC_QUEUES[j].messages);
           BOC_QUEUES[j].messages = NULL;
@@ -6313,7 +5637,6 @@ static int _core_module_exec(PyObject *module) {
     BOCRecycleQueue *queue_stub =
         (BOCRecycleQueue *)PyMem_RawMalloc(sizeof(BOCRecycleQueue));
     if (queue_stub == NULL) {
-      // Unwind every queue.
       for (size_t i = 0; i < BOC_QUEUE_COUNT; ++i) {
         PyMem_RawFree(BOC_QUEUES[i].messages);
         BOC_QUEUES[i].messages = NULL;
@@ -6329,35 +5652,14 @@ static int _core_module_exec(PyObject *module) {
     atomic_store_intptr(&BOC_RECYCLE_QUEUE_HEAD, (intptr_t)queue_stub);
     BOC_RECYCLE_QUEUE_TAIL = queue_stub;
 
-    // Initialize the noticeboard subsystem (mutex + sync primitives).
-    // noticeboard_init / terminator_init currently return void; if
-    // they ever start failing, this site will need to propagate the
-    // error through `_core_module_exec`.
     noticeboard_init();
 
-    // Initialize the terminator primitives.
-    // The Pyrona seed (count=1, seeded=1) is set by terminator_reset()
-    // when the runtime starts; here we only initialize the kernel objects.
     terminator_init();
 
-    // Initialize the main-pinned dispatch queue. This is a
-    // process-global Verona-style intrusive queue drained by
-    // `main_pump_bounded`. The depth/timestamp counters were
-    // zero-initialised at file scope; only the queue itself needs
-    // explicit init.
     boc_bq_init(&MAIN_PINNED_QUEUE);
 
-    // Initialize the scheduler module with no workers. The
-    // per-worker array stays unallocated and `_core.scheduler_stats()`
-    // returns an empty list until `behaviors.start()` calls
-    // `scheduler_runtime_start` with the real worker count.
     if (boc_sched_init(0) < 0) {
-      // Unwind every globally-allocated subsystem before returning -1
-      // so that the BOC_COUNT == 0 invariant ("first interpreter has
-      // not yet completed module init") is restored.
       noticeboard_destroy();
-      // terminator currently has no destroy entry point; its kernel
-      // objects (mutex + cv) are reusable across init/destroy cycles.
       PyMem_RawFree((void *)BOC_RECYCLE_QUEUE_TAIL);
       BOC_RECYCLE_QUEUE_TAIL = NULL;
       atomic_store_intptr(&BOC_RECYCLE_QUEUE_HEAD, 0);
@@ -6393,13 +5695,6 @@ static int _core_module_exec(PyObject *module) {
 
   state->dumps = PyObject_GetAttrString(state->pickle, "dumps");
   if (state->dumps == NULL) {
-    // Use Py_CLEAR rather than bare Py_DECREF: ``_core_module_clear``
-    // runs on a module that failed its exec slot (multi-phase init
-    // tears the partial module down via the normal collect path), so
-    // a bare DECREF would leave ``state->pickle`` dangling and the
-    // subsequent ``Py_CLEAR(state->pickle)`` in clear would
-    // double-free. Same rationale applies to ``loads`` below and to
-    // the reconstructor block further down.
     Py_CLEAR(state->pickle);
     return -1;
   }
@@ -6447,10 +5742,6 @@ static int _core_module_exec(PyObject *module) {
     return -1;
   }
 
-  // Cache the reconstructor symbols on the module state so
-  // CownCapsule_reduce does not have to PyImport_ImportModule on every
-  // pickle. Both must be bound on the module by m_methods before this
-  // exec slot runs.
   state->cown_reconstructor_inheriting =
       PyObject_GetAttrString(module, "_cown_capsule_from_pointer_inheriting");
   if (state->cown_reconstructor_inheriting == NULL) {
@@ -6459,8 +5750,6 @@ static int _core_module_exec(PyObject *module) {
   state->cown_reconstructor_borrowing =
       PyObject_GetAttrString(module, "_cown_capsule_from_pointer_borrowing");
   if (state->cown_reconstructor_borrowing == NULL) {
-    // Mirror the existing dumps/loads cleanup pattern: drop the
-    // previously acquired strong ref before propagating the error.
     Py_CLEAR(state->cown_reconstructor_inheriting);
     return -1;
   }
@@ -6469,9 +5758,6 @@ static int _core_module_exec(PyObject *module) {
   BOC_STATE = state;
 
   PyModule_AddStringConstant(module, "TIMEOUT", BOC_TIMEOUT);
-  // Wake-reason sentinels returned by ``terminator_wait_pumpable``.
-  // Values mirror ``boc_terminator_wake_reason_t`` so the Python loop
-  // can compare against module-level constants without re-importing.
   if (PyModule_AddIntConstant(module, "TERMINATED", BOC_TERMINATOR_TERMINATED) <
       0) {
     return -1;
@@ -6500,16 +5786,9 @@ static int _core_module_clear(PyObject *module) {
   Py_CLEAR(state->behavior_capsule_type);
   Py_CLEAR(state->cown_reconstructor_inheriting);
   Py_CLEAR(state->cown_reconstructor_borrowing);
-  // The recycle_queue is allocated late in module_exec; it may be NULL if
-  // module_exec returned -1 before reaching BOCRecycleQueue_new(). The
-  // worker recycle queue's xidata_to_cowns dict is owned by this
-  // interpreter and must be cleared here so the GC can collect any
-  // reference cycles anchored through it.
   if (state->recycle_queue != NULL) {
     Py_CLEAR(state->recycle_queue->xidata_to_cowns);
   }
-  // Clear the thread-local snapshot cache so the GC can collect any
-  // reference cycles anchored through the cached dict / proxy.
   noticeboard_drop_local_cache();
   return 0;
 }
@@ -6539,7 +5818,6 @@ void _core_module_free(void *module_ptr) {
   if (remaining == 0) {
     PRINTDBG("All _core modules have been freed, cleaning up\n");
 
-    // last one, clean up
     BOCQueue *qptr = BOC_QUEUES;
     for (size_t i = 0; i < BOC_QUEUE_COUNT; ++i, ++qptr) {
       PyMem_RawFree(qptr->messages);
@@ -6563,11 +5841,8 @@ void _core_module_free(void *module_ptr) {
     BOC_RECYCLE_QUEUE_TAIL = NULL;
     atomic_store_intptr(&BOC_RECYCLE_QUEUE_HEAD, 0);
 
-    // Tear down the noticeboard subsystem (snapshot cache, entries,
-    // pins, mutex, sync primitives).
     noticeboard_destroy();
 
-    // Tear down the scheduler instrumentation skeleton.
     boc_sched_shutdown();
 
     BOC_REF_TRACKING_REPORT();
@@ -6588,8 +5863,6 @@ static int _core_module_traverse(PyObject *module, visitproc visit, void *arg) {
   Py_VISIT(state->behavior_capsule_type);
   Py_VISIT(state->cown_reconstructor_inheriting);
   Py_VISIT(state->cown_reconstructor_borrowing);
-  // recycle_queue is allocated late in module_exec; if exec failed before
-  // reaching BOCRecycleQueue_new() the field is still NULL.
   if (state->recycle_queue != NULL) {
     Py_VISIT(state->recycle_queue->xidata_to_cowns);
   }
diff --git a/src/bocpy/_internal_test.c b/src/bocpy/_internal_test.c
index ed46957..f92b70d 100644
--- a/src/bocpy/_internal_test.c
+++ b/src/bocpy/_internal_test.c
@@ -64,7 +64,7 @@ static struct PyModuleDef moduledef = {
     .m_doc = "Test harness for bocpy internal C primitives "
              "(typed atomics, MPMC queue, ...).",
     .m_size = 0,
-    .m_methods = NULL, // methods are added by registrars in exec slot
+    .m_methods = NULL,
     .m_slots = _internal_test_slots,
 };
 
diff --git a/src/bocpy/_internal_test_atomics.c b/src/bocpy/_internal_test_atomics.c
index 7f24c6b..07ccc10 100644
--- a/src/bocpy/_internal_test_atomics.c
+++ b/src/bocpy/_internal_test_atomics.c
@@ -20,17 +20,13 @@
 
 #include "boc_compat.h"
 
-// Single shared block of atomic slots, accessed by every test entry
-// point through a PyCapsule handle. Cacheline-sized (64B) to avoid
-// false-sharing between the producer and consumer fields when the
-// test spawns multiple threads.
 typedef struct {
-  boc_atomic_u64_t flag;       // 0 → producer not yet ready
-  uint64_t payload;            // plain (non-atomic); guarded by flag
-  boc_atomic_u64_t counter64;  // fetch_add / CAS contention slot
-  boc_atomic_u32_t counter32;  // 32-bit fetch_add contention slot
-  boc_atomic_bool_t bool_slot; // bool exchange / cas test
-  boc_atomic_ptr_t ptr_slot;   // ptr exchange / cas test
+  boc_atomic_u64_t flag;
+  uint64_t payload;
+  boc_atomic_u64_t counter64;
+  boc_atomic_u32_t counter32;
+  boc_atomic_bool_t bool_slot;
+  boc_atomic_ptr_t ptr_slot;
   char _padding[64];
 } hs_state_t;
 
@@ -43,10 +39,6 @@ static hs_state_t *hs_get(PyObject *cap) {
   return (hs_state_t *)PyCapsule_GetPointer(cap, "boc_hs_state");
 }
 
-// ---------------------------------------------------------------------------
-// State setup / inspection.
-// ---------------------------------------------------------------------------
-
 static PyObject *py_make_state(PyObject *Py_UNUSED(self),
                                PyObject *Py_UNUSED(args)) {
   hs_state_t *h = (hs_state_t *)PyMem_RawCalloc(1, sizeof(*h));
@@ -109,10 +101,6 @@ static PyObject *py_load_ptr(PyObject *Py_UNUSED(self), PyObject *cap) {
   return PyLong_FromVoidPtr(v);
 }
 
-// ---------------------------------------------------------------------------
-// Acquire / release handshake (the canonical weak-memory test).
-// ---------------------------------------------------------------------------
-
 static PyObject *py_producer(PyObject *Py_UNUSED(self), PyObject *args) {
   PyObject *cap;
   unsigned long long payload;
@@ -123,11 +111,7 @@ static PyObject *py_producer(PyObject *Py_UNUSED(self), PyObject *args) {
   if (h == NULL) {
     return NULL;
   }
-  Py_BEGIN_ALLOW_THREADS
-      // Plain non-atomic write of the payload, then a release store of
-      // the flag. A consumer that observes flag==1 with an acquire load
-      // MUST see the payload write (acq-rel synchronises-with).
-      h->payload = (uint64_t)payload;
+  Py_BEGIN_ALLOW_THREADS h->payload = (uint64_t)payload;
   boc_atomic_store_u64_explicit(&h->flag, 1, BOC_MO_RELEASE);
   Py_END_ALLOW_THREADS Py_RETURN_NONE;
 }
@@ -139,18 +123,12 @@ static PyObject *py_consumer(PyObject *Py_UNUSED(self), PyObject *cap) {
   }
   uint64_t got;
   Py_BEGIN_ALLOW_THREADS while (
-      boc_atomic_load_u64_explicit(&h->flag, BOC_MO_ACQUIRE) == 0) {
-    // tight spin; the producer thread is the only writer
-  }
+      boc_atomic_load_u64_explicit(&h->flag, BOC_MO_ACQUIRE) == 0) {}
   got = h->payload;
   Py_END_ALLOW_THREADS return PyLong_FromUnsignedLongLong(
       (unsigned long long)got);
 }
 
-// ---------------------------------------------------------------------------
-// Multi-thread fetch_add contention (relaxed counter).
-// ---------------------------------------------------------------------------
-
 static PyObject *py_fetch_add_loop_u64(PyObject *Py_UNUSED(self),
                                        PyObject *args) {
   PyObject *cap;
@@ -185,10 +163,6 @@ static PyObject *py_fetch_add_loop_u32(PyObject *Py_UNUSED(self),
   Py_END_ALLOW_THREADS Py_RETURN_NONE;
 }
 
-// ---------------------------------------------------------------------------
-// Multi-thread CAS contention loop (acq_rel on success, relaxed on failure).
-// ---------------------------------------------------------------------------
-
 static PyObject *py_cas_increment_loop_u64(PyObject *Py_UNUSED(self),
                                            PyObject *args) {
   PyObject *cap;
@@ -204,21 +178,11 @@ static PyObject *py_cas_increment_loop_u64(PyObject *Py_UNUSED(self),
     uint64_t cur = boc_atomic_load_u64_explicit(&h->counter64, BOC_MO_RELAXED);
     while (!boc_atomic_compare_exchange_strong_u64_explicit(
         &h->counter64, &cur, cur + 1, BOC_MO_ACQ_REL, BOC_MO_RELAXED)) {
-      // CAS updates `cur` on failure; loop body is empty.
     }
   }
   Py_END_ALLOW_THREADS Py_RETURN_NONE;
 }
 
-// ---------------------------------------------------------------------------
-// Single-threaded round-trip: every (op, type, order) at least once.
-// ---------------------------------------------------------------------------
-//
-// On Linux the typed API is a thin wrapper around <stdatomic.h>, so this
-// is mostly a "does it compile and link" smoke. On MSVC it exercises the
-// per-order Interlocked* dispatch; on ARM64 MSVC it exercises the
-// __ldar*/__stlr* fast paths.
-
 static int round_trip_u64(void) {
   boc_atomic_u64_t slot = 0;
   const boc_memory_order_t orders[] = {BOC_MO_RELAXED, BOC_MO_ACQUIRE,
@@ -226,18 +190,15 @@ static int round_trip_u64(void) {
                                        BOC_MO_SEQ_CST};
   for (size_t i = 0; i < sizeof(orders) / sizeof(orders[0]); ++i) {
     boc_memory_order_t o = orders[i];
-    // store/load round-trip.
     boc_atomic_store_u64_explicit(&slot, 0x1234567890ABCDEFULL, o);
     if (boc_atomic_load_u64_explicit(&slot, o) != 0x1234567890ABCDEFULL) {
       return -1;
     }
-    // exchange returns previous, installs new.
     uint64_t prev = boc_atomic_exchange_u64_explicit(&slot, 42ULL, o);
     if (prev != 0x1234567890ABCDEFULL ||
         boc_atomic_load_u64_explicit(&slot, o) != 42ULL) {
       return -1;
     }
-    // fetch_add / fetch_sub.
     if (boc_atomic_fetch_add_u64_explicit(&slot, 8ULL, o) != 42ULL ||
         boc_atomic_load_u64_explicit(&slot, o) != 50ULL) {
       return -1;
@@ -246,14 +207,12 @@ static int round_trip_u64(void) {
         boc_atomic_load_u64_explicit(&slot, o) != 45ULL) {
       return -1;
     }
-    // CAS success.
     uint64_t exp = 45ULL;
     if (!boc_atomic_compare_exchange_strong_u64_explicit(&slot, &exp, 99ULL, o,
                                                          BOC_MO_RELAXED) ||
         boc_atomic_load_u64_explicit(&slot, o) != 99ULL) {
       return -1;
     }
-    // CAS failure must update `exp` to the current value.
     exp = 0ULL;
     if (boc_atomic_compare_exchange_strong_u64_explicit(&slot, &exp, 7ULL, o,
                                                         BOC_MO_RELAXED) ||
@@ -389,10 +348,6 @@ static PyObject *py_round_trip(PyObject *Py_UNUSED(self),
   Py_RETURN_NONE;
 }
 
-// ---------------------------------------------------------------------------
-// Registrar.
-// ---------------------------------------------------------------------------
-
 static PyMethodDef methods[] = {
     {"atomics_make_state", py_make_state, METH_NOARGS,
      "Allocate a fresh state slot."},
diff --git a/src/bocpy/_internal_test_bq.c b/src/bocpy/_internal_test_bq.c
index 59bcde0..7165057 100644
--- a/src/bocpy/_internal_test_bq.c
+++ b/src/bocpy/_internal_test_bq.c
@@ -24,10 +24,6 @@
 #include "boc_compat.h"
 #include "boc_sched.h"
 
-// ---------------------------------------------------------------------------
-// Node and queue capsule helpers
-// ---------------------------------------------------------------------------
-
 /// @brief Test node: a `boc_bq_node_t` followed by an integer identity.
 typedef struct {
   boc_bq_node_t node; ///< Link field consumed by `boc_bq_*`.
@@ -41,18 +37,11 @@ static void bq_queue_capsule_destructor(PyObject *capsule) {
   boc_bq_t *q =
       (boc_bq_t *)PyCapsule_GetPointer(capsule, BQ_QUEUE_CAPSULE_NAME);
   if (q != NULL) {
-    // Drain any leftover nodes so destroy_assert_empty does not abort
-    // on a leaked test queue. We do NOT free the nodes here; the
-    // Python side owns them via their own capsules.
     boc_bq_node_t *n;
     while ((n = boc_bq_dequeue(q)) != NULL) {
       (void)n;
     }
     boc_bq_destroy_assert_empty(q);
-    // Raw allocator: bq queues exist precisely to be crossed between
-    // sub-interpreters in production (per-worker queues), so the test
-    // harness uses the same process-global allocator to avoid masking
-    // a cross-interpreter free bug behind a same-interpreter test.
     PyMem_RawFree(q);
   }
 }
@@ -61,7 +50,6 @@ static void bq_node_capsule_destructor(PyObject *capsule) {
   bq_test_node_t *n =
       (bq_test_node_t *)PyCapsule_GetPointer(capsule, BQ_NODE_CAPSULE_NAME);
   if (n != NULL) {
-    // Raw allocator: see bq_queue_capsule_destructor above.
     PyMem_RawFree(n);
   }
 }
@@ -74,10 +62,6 @@ static bq_test_node_t *bq_node_from_capsule(PyObject *capsule) {
   return (bq_test_node_t *)PyCapsule_GetPointer(capsule, BQ_NODE_CAPSULE_NAME);
 }
 
-// ---------------------------------------------------------------------------
-// Methods
-// ---------------------------------------------------------------------------
-
 static PyObject *bq_make_queue(PyObject *Py_UNUSED(self),
                                PyObject *Py_UNUSED(args)) {
   boc_bq_t *q = PyMem_RawMalloc(sizeof(boc_bq_t));
@@ -140,8 +124,6 @@ static PyObject *bq_node_ptr(PyObject *Py_UNUSED(self), PyObject *args) {
   if (n == NULL) {
     return NULL;
   }
-  // bq_test_node_t puts `node` first, so &n == &n->node, but be
-  // explicit for clarity and to keep the test invariant readable.
   return PyLong_FromVoidPtr((void *)&n->node);
 }
 
@@ -185,8 +167,6 @@ static PyObject *bq_dequeue(PyObject *Py_UNUSED(self), PyObject *args) {
   boc_bq_node_t *raw;
   Py_BEGIN_ALLOW_THREADS raw = boc_bq_dequeue(q);
   Py_END_ALLOW_THREADS if (raw == NULL) { Py_RETURN_NONE; }
-  // Recover the embedding test-node and return its id. Tests don't
-  // need the original capsule object back; identity is the contract.
   bq_test_node_t *n = (bq_test_node_t *)raw;
   return PyLong_FromLongLong((long long)n->id);
 }
@@ -211,34 +191,6 @@ static PyObject *bq_dequeue_all(PyObject *Py_UNUSED(self), PyObject *args) {
   if (seg.start == NULL) {
     return list;
   }
-  // Walk the segment via segment_take_one. take_one returns NULL for
-  // three reasons (mpmcq.h:67-89, also documented at
-  // boc_sched.c::boc_sched_steal):
-  //   1. fully empty (impossible here — guarded above),
-  //   2. singleton segment (end == &start->next_in_queue) — append
-  //      start as the tail and return,
-  //   3. broken link: producer P has CASed itself onto the queue
-  //      tail (back.exchange) but has not yet completed the
-  //      "publish next pointer" store. seg.start->next_in_queue
-  //      reads as NULL, but the segment is NOT singleton — there
-  //      is at least one more node the producer is mid-publish.
-  //
-  // Verona's WorkStealingQueue::steal handles case 3 by spreading
-  // the partial segment back across its multi-N WSQ. The bocpy
-  // production caller (boc_sched_steal) handles it by splicing the
-  // partial segment onto self->q, deferring the missing tail to a
-  // subsequent dequeue once the producer's store lands.
-  //
-  // For a test helper there is no other queue to spread/splice
-  // onto, AND the test contract is "every enqueued item is observed
-  // exactly once". The pragmatic answer is to BUSY-SPIN on the
-  // broken next pointer until the producer's store becomes visible.
-  // The producer is mid-call (between `back.exchange` and
-  // `b->store(seg.start, release)` — three instructions wide), so
-  // the spin is bounded by producer scheduling latency. Without
-  // this spin the previous implementation silently dropped the
-  // entire post-broken-link tail, manifesting as the
-  // `[8-100000]` stress test losing 1-227 items per run.
   for (;;) {
     boc_bq_node_t *taken = boc_bq_segment_take_one(&seg);
     if (taken != NULL) {
@@ -252,31 +204,14 @@ static PyObject *bq_dequeue_all(PyObject *Py_UNUSED(self), PyObject *args) {
       Py_DECREF(id);
       continue;
     }
-    // take_one returned NULL. Distinguish singleton from broken-link
-    // (case 1 is impossible; we guarded seg.start != NULL above and
-    // each take_one advances seg.start to a known-non-NULL node).
     if (seg.end == &seg.start->next_in_queue) {
-      // Singleton tail — done.
       break;
     }
-    // Broken-link case: spin until the producer publishes. The wait
-    // is bounded by producer scheduling latency; under TSan or
-    // heavy oversubscription it could be milliseconds, but it is
-    // never unbounded — the producer is mid-call by construction.
-    // Drop the GIL across the spin so other Python threads (e.g.
-    // the other consumer in the stress test) can make progress.
     Py_BEGIN_ALLOW_THREADS while (
         boc_atomic_load_ptr_explicit(&seg.start->next_in_queue,
-                                     BOC_MO_ACQUIRE) == NULL) {
-      // Compiler/CPU hint: tight spin on a single cacheline. No
-      // platform-specific PAUSE intrinsic here — the spin is short
-      // and the cost is dwarfed by GIL re-acquire.
-    }
+                                     BOC_MO_ACQUIRE) == NULL) {}
     Py_END_ALLOW_THREADS
-    // Producer's store is now visible; loop and let take_one walk it.
   }
-  // Append the tail node (seg.start now points at it; its
-  // next_in_queue is NULL by segment-end invariant).
   bq_test_node_t *tail = (bq_test_node_t *)seg.start;
   PyObject *tail_id = PyLong_FromLongLong((long long)tail->id);
   if (tail_id == NULL || PyList_Append(list, tail_id) < 0) {
@@ -303,10 +238,6 @@ static PyObject *bq_is_empty(PyObject *Py_UNUSED(self), PyObject *args) {
   Py_RETURN_FALSE;
 }
 
-// ---------------------------------------------------------------------------
-// Method table and registrar
-// ---------------------------------------------------------------------------
-
 static PyMethodDef bq_methods[] = {
     {"bq_make_queue", bq_make_queue, METH_NOARGS,
      "Create an empty MPMC behaviour queue. Returns a capsule."},
diff --git a/src/bocpy/_internal_test_wsq.c b/src/bocpy/_internal_test_wsq.c
index fba715a..5b3a23b 100644
--- a/src/bocpy/_internal_test_wsq.c
+++ b/src/bocpy/_internal_test_wsq.c
@@ -31,10 +31,6 @@
 #include "boc_compat.h"
 #include "boc_sched.h"
 
-// ---------------------------------------------------------------------------
-// Worker fixture capsule
-// ---------------------------------------------------------------------------
-
 #define WSQ_WORKER_CAPSULE_NAME "bocpy._internal_test.wsq_worker"
 #define WSQ_NODE_CAPSULE_NAME "bocpy._internal_test.wsq_node"
 
@@ -50,12 +46,8 @@ static void wsq_worker_capsule_destructor(PyObject *capsule) {
   if (w == NULL) {
     return;
   }
-  // Drain every sub-queue so destroy_assert_empty does not abort if
-  // a test left items behind. We do NOT free the test nodes here —
-  // they are owned by the Python side via their own capsules.
   for (size_t i = 0; i < (size_t)BOC_WSQ_N; ++i) {
     while (boc_bq_dequeue(&w->q[i]) != NULL) {
-      // discard
     }
     boc_bq_destroy_assert_empty(&w->q[i]);
   }
@@ -75,19 +67,12 @@ static boc_sched_worker_t *wsq_worker_from_capsule(PyObject *capsule) {
                                                     WSQ_WORKER_CAPSULE_NAME);
 }
 
-// ---------------------------------------------------------------------------
-// Methods
-// ---------------------------------------------------------------------------
-
 static PyObject *wsq_n(PyObject *Py_UNUSED(self), PyObject *Py_UNUSED(args)) {
   return PyLong_FromSize_t((size_t)BOC_WSQ_N);
 }
 
 static PyObject *wsq_make_worker(PyObject *Py_UNUSED(self),
                                  PyObject *Py_UNUSED(args)) {
-  // Calloc so all unused worker fields (mutex, cv, ring link, stats,
-  // owner_interp_id, ...) are zero. The WSQ helpers only touch q[]
-  // and the three cursors, all of which we re-init explicitly.
   boc_sched_worker_t *w = PyMem_RawCalloc(1, sizeof(boc_sched_worker_t));
   if (w == NULL) {
     return PyErr_NoMemory();
@@ -255,9 +240,6 @@ static PyObject *wsq_spread_segment_counts(PyObject *Py_UNUSED(self),
     PyErr_SetString(PyExc_ValueError, "length must be positive");
     return NULL;
   }
-  // Allocate L nodes and link them head-to-tail. The link payload
-  // stored in `next_in_queue` is `boc_bq_node_t *`; we use plain
-  // stores via the typed atomic helper to construct the segment.
   wsq_test_node_t **nodes = PyMem_RawCalloc((size_t)length, sizeof(*nodes));
   if (nodes == NULL) {
     return PyErr_NoMemory();
@@ -273,10 +255,6 @@ static PyObject *wsq_spread_segment_counts(PyObject *Py_UNUSED(self),
     }
     nodes[i]->id = (int64_t)i;
   }
-  // Link 0->1->...->L-1; tail's next stays NULL. Relaxed stores
-  // are fine — the segment is private to this thread until we hand
-  // it to enqueue_spread, which uses the queue's release/acquire
-  // protocol on its own.
   for (Py_ssize_t i = 0; i < length - 1; ++i) {
     boc_atomic_store_ptr_explicit(&nodes[i]->node.next_in_queue,
                                   &nodes[i + 1]->node, BOC_MO_RELAXED);
@@ -312,10 +290,6 @@ static PyObject *wsq_spread_segment_counts(PyObject *Py_UNUSED(self),
   return out;
 }
 
-// ---------------------------------------------------------------------------
-// Registrar
-// ---------------------------------------------------------------------------
-
 static PyMethodDef wsq_methods[] = {
     {"wsq_n", wsq_n, METH_NOARGS,
      "Return the compile-time BOC_WSQ_N constant."},
diff --git a/src/bocpy/_math.c b/src/bocpy/_math.c
index 39004f1..a1852cc 100644
--- a/src/bocpy/_math.c
+++ b/src/bocpy/_math.c
@@ -171,14 +171,12 @@ static int impl_transpose_in_place(matrix_impl *matrix) {
   const size_t N = matrix->columns;
 
   if (M == 1 || N == 1) {
-    // vector
     matrix->rows = N;
     matrix->columns = M;
     return update_row_ptrs(matrix);
   }
 
   if (M == N) {
-    // square matrix
     for (size_t r = 0; r < matrix->rows; ++r) {
       for (size_t c = 0; c < r; ++c) {
         double temp = matrix->row_ptrs[r][c];
@@ -346,6 +344,26 @@ enum BinaryOps {
     }                                                                          \
   }
 
+#define DEFINE_BINARY_OUTER(ENUM, STAMP, EXPR)                                 \
+  BOC_CANARY_NOINLINE                                                          \
+  static void impl_##STAMP##_outer(matrix_impl *colvec, matrix_impl *rowvec,   \
+                                   matrix_impl *out) {                         \
+    const size_t M = out->rows;                                                \
+    const size_t N = out->columns;                                             \
+    assert(colvec->rows == M && colvec->columns == 1);                         \
+    assert(rowvec->rows == 1 && rowvec->columns == N);                         \
+    const double *col_ptr = colvec->data;                                      \
+    double *out_ptr = out->data;                                               \
+    for (size_t r = 0; r < M; ++r, ++col_ptr) {                                \
+      const double lhs = *col_ptr;                                             \
+      const double *row_ptr = rowvec->data;                                    \
+      for (size_t c = 0; c < N; ++c, ++row_ptr, ++out_ptr) {                   \
+        const double rhs = *row_ptr;                                           \
+        *out_ptr = (EXPR);                                                     \
+      }                                                                        \
+    }                                                                          \
+  }
+
 #define X(E, S, EX) DEFINE_BINARY_EWISE(E, S, EX)
 BOC_BINARY_OPS(X)
 #undef X
@@ -362,6 +380,10 @@ BOC_BINARY_OPS(X)
 BOC_BINARY_OPS(X)
 #undef X
 
+#define X(E, S, EX) DEFINE_BINARY_OUTER(E, S, EX)
+BOC_BINARY_OPS(X)
+#undef X
+
 static void dispatch_bin_ewise(matrix_impl *lhs, matrix_impl *rhs,
                                matrix_impl *out, enum BinaryOps op) {
   switch (op) {
@@ -418,6 +440,20 @@ static void dispatch_bin_scalar(matrix_impl *matrix, double scalar,
   }
 }
 
+static void dispatch_bin_outer(matrix_impl *colvec, matrix_impl *rowvec,
+                               matrix_impl *out, enum BinaryOps op) {
+  switch (op) {
+#define X(ENUM, STAMP, ...)                                                    \
+  case ENUM:                                                                   \
+    impl_##STAMP##_outer(colvec, rowvec, out);                                 \
+    return;
+    BOC_BINARY_OPS(X)
+#undef X
+  default:
+    fprintf(stderr, "Unknown binary op\n");
+  }
+}
+
 enum AggregateOps {
   Sum = 2000,
   Mean = 2001,
@@ -769,18 +805,20 @@ static void impl_matmul(matrix_impl *lhs, matrix_impl *rhs, matrix_impl *out) {
   assert(M0 == out->rows && N1 == out->columns);
   assert(N0 == rhs->rows);
 
-  double *out_ptr = out->data;
-
+  // ikj (rank-1 update) order: the inner c-loop is a contiguous AXPY over
+  // row-major out and rhs rows, with no loop-carried dependency across c, so
+  // -O3 autovectorises it without -ffast-math. Products are still summed in
+  // ascending-k order per (r, c), so results are bitwise identical to ijk.
+  memset(out->data, 0, out->size * sizeof(double));
   for (size_t r = 0; r < M0; ++r) {
-    for (size_t c = 0; c < N1; ++c, ++out_ptr) {
-      const double *lhs_ptr = lhs->row_ptrs[r];
-      const double *rhs_ptr = rhs->data + c;
-      double sum = 0;
-      for (size_t k = 0; k < N0; ++k, ++lhs_ptr, rhs_ptr += N1) {
-        sum += (*lhs_ptr) * (*rhs_ptr);
+    double *out_row = out->row_ptrs[r];
+    const double *lhs_row = lhs->row_ptrs[r];
+    for (size_t k = 0; k < N0; ++k) {
+      const double a = lhs_row[k];
+      const double *rhs_row = rhs->row_ptrs[k];
+      for (size_t c = 0; c < N1; ++c) {
+        out_row[c] += a * rhs_row[c];
       }
-
-      *out_ptr = sum;
     }
   }
 }
@@ -822,26 +860,37 @@ int range_read(range *range, PyObject *key, size_t length) {
   Py_ssize_t start, stop, step;
   if (PyLong_Check(key)) {
     start = PyLong_AsSsize_t(key);
+    if (start == -1 && PyErr_Occurred()) {
+      return -1;
+    }
     if (start < 0) {
       start += (Py_ssize_t)length;
     }
+    // Out-of-range int: AdjustIndices would wrap a negative start again.
+    if (start < 0 || start >= (Py_ssize_t)length) {
+      PyErr_SetNone(PyExc_IndexError);
+      return -1;
+    }
     stop = start + 1;
     step = 1;
   } else if (PySlice_Check(key)) {
-    PySlice_Unpack(key, &start, &stop, &step);
+    if (PySlice_Unpack(key, &start, &stop, &step) < 0) {
+      return -1;
+    }
   } else {
     PyErr_SetString(PyExc_TypeError, "Key must be a long or a slice");
     return -1;
   }
 
-  PySlice_AdjustIndices((Py_ssize_t)length, &start, &stop, step);
+  Py_ssize_t count =
+      PySlice_AdjustIndices((Py_ssize_t)length, &start, &stop, step);
 
   range->start = start;
   range->stop = stop;
   range->step = step;
-  range->count = (size_t)((range->stop - range->start) / range->step);
+  range->count = (size_t)count;
 
-  if (range->count == 0) {
+  if (count == 0) {
     PyErr_SetNone(PyExc_IndexError);
     return -1;
   }
@@ -983,12 +1027,10 @@ static bool impl_check_acquired(matrix_impl *matrix, bool set_error) {
   return true;
 }
 
-// Forward declarations
-static struct PyModuleDef _math_module;
-
 typedef struct {
   int_least64_t interpid;
   PyTypeObject *matrix_type;
+  PyObject *matrix_unpickle;
 } _math_module_state;
 
 static thread_local _math_module_state *LOCAL_STATE;
@@ -1315,8 +1357,8 @@ static int Matrix_aggregate(PyObject *matrix_op, AxisArg axis,
     return 0;
   }
 
-  /* axis.axis == 1 (row-wise). parse_validate_normalise_axis already
-     rejected anything else. */
+  // Fall-through is axis == 1 (row-wise): parse_validate_normalise_axis
+  // restricts axis to {0, 1}.
   matrix_impl *vector = impl_new(impl->rows, 1);
   if (vector == NULL) {
     return -1;
@@ -1332,9 +1374,6 @@ static int Matrix_aggregate(PyObject *matrix_op, AxisArg axis,
   return 0;
 }
 
-// this macro provides a kind of template for all the aggregate methods to
-// follow, as they are all identical with the exception of the operator
-
 #define MATRIX_AGGREGATE(agg)                                                  \
   static PyObject *Matrix_##agg##_method(PyObject *op, PyObject *args,         \
                                          PyObject *kwds) {                     \
@@ -1361,6 +1400,140 @@ MATRIX_AGGREGATE(Minimum)
 MATRIX_AGGREGATE(Maximum)
 MATRIX_AGGREGATE(MagnitudeSquared)
 
+// Arg-reduction (argmin/argmax) kernels: kept out of the BOC_AGG_OPS X-macro
+// because they must carry the running index, not just a double accumulator.
+// Strict comparisons make the first tied extreme win (NumPy tie-break);
+// per-axis indices are published as doubles (Matrix stores only doubles).
+static Py_ssize_t argextreme_ewise(matrix_impl *m, bool want_max) {
+  const double *p = m->data;
+  double best = p[0];
+  Py_ssize_t best_i = 0;
+  for (size_t i = 1; i < m->size; ++i) {
+    const double v = p[i];
+    if (want_max ? (v > best) : (v < best)) {
+      best = v;
+      best_i = (Py_ssize_t)i;
+    }
+  }
+  return best_i;
+}
+
+static void argextreme_columnwise(matrix_impl *m, bool want_max,
+                                  matrix_impl *out) {
+  const size_t M = m->rows;
+  const size_t N = m->columns;
+  for (size_t c = 0; c < N; ++c) {
+    double best = m->data[c];
+    size_t best_r = 0;
+    for (size_t r = 1; r < M; ++r) {
+      const double v = m->data[r * N + c];
+      if (want_max ? (v > best) : (v < best)) {
+        best = v;
+        best_r = r;
+      }
+    }
+    out->data[c] = (double)best_r;
+  }
+}
+
+static void argextreme_rowwise(matrix_impl *m, bool want_max,
+                               matrix_impl *out) {
+  const size_t M = m->rows;
+  const size_t N = m->columns;
+  for (size_t r = 0; r < M; ++r) {
+    const double *row = m->data + r * N;
+    double best = row[0];
+    size_t best_c = 0;
+    for (size_t c = 1; c < N; ++c) {
+      const double v = row[c];
+      if (want_max ? (v > best) : (v < best)) {
+        best = v;
+        best_c = c;
+      }
+    }
+    out->data[r] = (double)best_c;
+  }
+}
+
+static int Matrix_argextreme(PyObject *matrix_op, AxisArg axis,
+                             PyObject **out_op, bool want_max) {
+  MatrixObject *matrix = (MatrixObject *)matrix_op;
+  matrix_impl *impl = matrix->impl;
+
+  if (!impl_check_acquired(impl, true)) {
+    return -1;
+  }
+
+  // Defensive: the public constructors reject zero-size matrices, but guard
+  // each axis so a future empty-capable path can't read p[0] out of bounds.
+  const char *empty_error = "arg-reduction of an empty matrix is undefined";
+
+  if (!axis.has_axis) {
+    if (impl->size == 0) {
+      PyErr_SetString(PyExc_ValueError, empty_error);
+      return -1;
+    }
+    *out_op = PyLong_FromSsize_t(argextreme_ewise(impl, want_max));
+    return *out_op == NULL ? -1 : 0;
+  }
+
+  if (axis.axis == 0) {
+    if (impl->rows == 0) {
+      PyErr_SetString(PyExc_ValueError, empty_error);
+      return -1;
+    }
+    matrix_impl *vector = impl_new(1, impl->columns);
+    if (vector == NULL) {
+      return -1;
+    }
+    argextreme_columnwise(impl, want_max, vector);
+    *out_op = wrap_matrix(Py_TYPE(matrix_op), vector);
+    if (*out_op == NULL) {
+      impl_free(vector);
+      return -1;
+    }
+    return 0;
+  }
+
+  if (impl->columns == 0) {
+    PyErr_SetString(PyExc_ValueError, empty_error);
+    return -1;
+  }
+  matrix_impl *vector = impl_new(impl->rows, 1);
+  if (vector == NULL) {
+    return -1;
+  }
+  argextreme_rowwise(impl, want_max, vector);
+  *out_op = wrap_matrix(Py_TYPE(matrix_op), vector);
+  if (*out_op == NULL) {
+    impl_free(vector);
+    return -1;
+  }
+  return 0;
+}
+
+#define MATRIX_ARGEXTREME(name, want_max_val)                                  \
+  static PyObject *Matrix_##name##_method(PyObject *op, PyObject *args,        \
+                                          PyObject *kwds) {                    \
+    PyObject *out = NULL;                                                      \
+    PyObject *axis_obj = NULL;                                                 \
+    static char *kwlist[] = {"axis", NULL};                                    \
+    if (!PyArg_ParseTupleAndKeywords(args, kwds, "|O", kwlist, &axis_obj)) {   \
+      return NULL;                                                             \
+    }                                                                          \
+    AxisArg axis;                                                              \
+    if (parse_validate_normalise_axis(axis_obj, &axis) < 0) {                  \
+      return NULL;                                                             \
+    }                                                                          \
+    if (Matrix_argextreme(op, axis, &out, want_max_val) < 0) {                 \
+      return NULL;                                                             \
+    }                                                                          \
+    return out;                                                                \
+  }
+
+MATRIX_ARGEXTREME(argmin, false)
+MATRIX_ARGEXTREME(argmax, true)
+
 enum BroadcastShape { BCAST_NONE = 0, BCAST_ROW, BCAST_COL };
 
 /* --------------------------------------------------------------------------
@@ -1595,7 +1777,6 @@ static PyObject *Matrix_vecdot(PyObject *op, PyObject *args, PyObject *kwds) {
   PyObject *result = NULL;
   matrix_impl *rhs = NULL;
 
-  /* ``other`` is positional-only; ``axis`` accepts both forms. */
   static char *kwlist[] = {"", "axis", NULL};
   if (!PyArg_ParseTupleAndKeywords(args, kwds, "O|O", kwlist, &other, &axis)) {
     return NULL;
@@ -1614,14 +1795,10 @@ static PyObject *Matrix_vecdot(PyObject *op, PyObject *args, PyObject *kwds) {
   matrix_impl *vec_arg = rhs;
   enum BroadcastShape shape;
 
-  /* Shape classification — mirrors Matrix_binary_op's broadcast switch.
-     Two call sites is not enough duplication to warrant a shared helper. */
   if (lhs->rows == rhs->rows && lhs->columns == rhs->columns) {
     shape = BCAST_NONE;
   } else if (lhs->rows == rhs->rows &&
              (lhs->columns == 1 || rhs->columns == 1)) {
-    /* Column-vector broadcast. Canonicalise so the helper sees
-       (matrix, vector). vecdot is commutative — no swap needed. */
     shape = BCAST_COL;
     if (lhs->columns == 1) {
       mat_arg = rhs;
@@ -1629,7 +1806,6 @@ static PyObject *Matrix_vecdot(PyObject *op, PyObject *args, PyObject *kwds) {
     }
   } else if (lhs->columns == rhs->columns &&
              (lhs->rows == 1 || rhs->rows == 1)) {
-    /* Row-vector broadcast — same canonicalisation as above. */
     shape = BCAST_ROW;
     if (lhs->rows == 1) {
       mat_arg = rhs;
@@ -1637,8 +1813,6 @@ static PyObject *Matrix_vecdot(PyObject *op, PyObject *args, PyObject *kwds) {
     }
   } else if ((lhs->rows == 1 || lhs->columns == 1) &&
              (rhs->rows == 1 || rhs->columns == 1) && lhs->size == rhs->size) {
-    /* Both vectors, possibly mixed orientation (1xN vs Nx1): walk the
-       flat buffers in lockstep. No matrix/vector roles, so no swap. */
     shape = BCAST_NONE;
   } else {
     PyErr_Format(PyExc_NotImplementedError,
@@ -1661,8 +1835,8 @@ static PyObject *Matrix_vecdot(PyObject *op, PyObject *args, PyObject *kwds) {
       result = wrap_impl_or_free(out);
     }
   } else {
-    /* axis_arg.axis == 1 (row-wise). parse_validate_normalise_axis
-       already rejected anything else. */
+    // axis == 1 (row-wise): parse_validate_normalise_axis restricts axis
+    // to {0, 1}.
     matrix_impl *out = impl_new(mat_arg->rows, 1);
     if (out != NULL) {
       impl_vecdot_rowwise(mat_arg, vec_arg, out, shape);
@@ -1702,7 +1876,6 @@ static enum CrossAxis classify_cross_axis(const matrix_impl *impl,
                                           bool has_axis, int explicit_axis) {
   const size_t M = impl->rows;
   const size_t N = impl->columns;
-  /* Ambiguous square shapes: axis picks orientation, default is rows. */
   if (M == 2 && N == 2) {
     return (has_axis && explicit_axis == 0) ? CROSS_COLS_2D_2xN
                                             : CROSS_ROWS_2D_Nx2;
@@ -1711,7 +1884,6 @@ static enum CrossAxis classify_cross_axis(const matrix_impl *impl,
     return (has_axis && explicit_axis == 0) ? CROSS_COLS_3D_3xN
                                             : CROSS_ROWS_3D_Nx3;
   }
-  /* Inherently row-oriented scalars: only axis=1 (or no axis) is valid. */
   if (M == 1 && N == 2) {
     if (has_axis && explicit_axis == 0) {
       return CROSS_INVALID;
@@ -1724,7 +1896,6 @@ static enum CrossAxis classify_cross_axis(const matrix_impl *impl,
     }
     return CROSS_SCALAR_3D_1x3;
   }
-  /* Inherently column-oriented scalars: only axis=0 (or no axis) is valid. */
   if (M == 2 && N == 1) {
     if (has_axis && explicit_axis == 1) {
       return CROSS_INVALID;
@@ -1737,9 +1908,6 @@ static enum CrossAxis classify_cross_axis(const matrix_impl *impl,
     }
     return CROSS_SCALAR_3D_3x1;
   }
-  /* Batch shapes with a unique orientation: explicit axis must match.
-     (2x3 and 3x2 remain doubly-valid and fall through to the legacy
-     default selection.) */
   if (N == 2 && M != 3) {
     if (has_axis && explicit_axis == 0) {
       return CROSS_INVALID;
@@ -1764,7 +1932,6 @@ static enum CrossAxis classify_cross_axis(const matrix_impl *impl,
     }
     return CROSS_COLS_3D_3xN;
   }
-  /* Doubly-valid 2x3 / 3x2: legacy default (Nx2 / 2xN wins). */
   if (N == 2) {
     return CROSS_ROWS_2D_Nx2;
   }
@@ -1828,8 +1995,6 @@ static PyObject *Matrix_cross(PyObject *op, PyObject *args, PyObject *kwds) {
     goto done;
   }
 
-  // Scalar inputs: other's orientation is irrelevant, only the flat
-  // element count must match.
   if (flavor == CROSS_SCALAR_2D_1x2 || flavor == CROSS_SCALAR_2D_2x1) {
     if (rhs->size != 2) {
       PyErr_Format(PyExc_NotImplementedError,
@@ -1862,12 +2027,6 @@ static PyObject *Matrix_cross(PyObject *op, PyObject *args, PyObject *kwds) {
     goto done;
   }
 
-  // Batch inputs accept either a same-shape batch or a single 2D/3D
-  // vector (1xK / Kx1) broadcast against every per-vector slot. Cross is
-  // anticommutative, so we deliberately do NOT silently swap operands;
-  // ``self`` must be the batch. Per-branch validation below decides which
-  // mode applies and reports the canonical error if neither fits.
-
   if (flavor == CROSS_ROWS_2D_Nx2) {
     const size_t N = lhs->rows;
     const bool same_shape =
@@ -2039,7 +2198,6 @@ static PyObject *Matrix_cross(PyObject *op, PyObject *args, PyObject *kwds) {
     goto done;
   }
 
-  // Unreachable: every CrossAxis value is handled above.
   PyErr_SetString(PyExc_RuntimeError,
                   "internal: unhandled CrossAxis in Matrix_cross");
 
@@ -2096,8 +2254,8 @@ static int do_normalize(matrix_impl *impl, AxisArg axis, matrix_impl *out) {
     return 0;
   }
 
-  /* axis.axis == 1 (row-wise). parse_validate_normalise_axis already
-     rejected anything else. */
+  // axis == 1 (row-wise): parse_validate_normalise_axis restricts axis
+  // to {0, 1}.
   matrix_impl *divisor = impl_new(impl->rows, 1);
   if (divisor == NULL) {
     return -1;
@@ -2213,7 +2371,6 @@ static void impl_perpendicular_out_of_place(const matrix_impl *impl,
     }
     return;
   }
-  /* VEC2_SCALAR_2x1 or VEC2_COLS_2xN. */
   const double *src_x = impl->data;
   const double *src_y = impl->data + N;
   double *dst_x = out->data;
@@ -2238,7 +2395,6 @@ static void impl_perpendicular_in_place(matrix_impl *impl,
     }
     return;
   }
-  /* VEC2_SCALAR_2x1 or VEC2_COLS_2xN. */
   double *p = impl->data;
   for (size_t c = 0; c < N; ++c, ++p) {
     const double temp = p[0];
@@ -2339,7 +2495,8 @@ static PyObject *Matrix_angle(PyObject *op, PyObject *args, PyObject *kwds) {
     return wrap_impl_or_free(out);
   }
 
-  /* VEC2_COLS_2xN. */
+  // Fall-through is the 2xN column-batch case: classify_vec2_axis already
+  // rejected every shape other than VEC2_ROWS_Nx2 and this one.
   const size_t N = impl->columns;
   matrix_impl *out = impl_new(1, N);
   if (out == NULL) {
@@ -2758,6 +2915,18 @@ static PyObject *Matrix_uniform(PyObject *cls, PyObject *args,
   return wrap_impl_or_free(impl);
 }
 
+static PyObject *Matrix_seed(PyObject *cls, PyObject *args) {
+  unsigned long value;
+
+  if (!PyArg_ParseTuple(args, "k", &value)) {
+    return NULL;
+  }
+
+  srand((unsigned int)value);
+
+  Py_RETURN_NONE;
+}
+
 static PyObject *Matrix_vector(PyObject *cls, PyObject *args) {
   PyObject *sequence = NULL;
   int as_column = 0;
@@ -2810,11 +2979,15 @@ static int unwrap_and_get_shape(PyObject *object, shape *shape,
   return 0;
 }
 
-static PyObject *Matrix_concat(PyObject *cls, PyObject *args) {
+static PyObject *Matrix_concat(PyObject *cls, PyObject *args,
+                               PyObject *kwargs) {
   PyObject *matrices = NULL;
   int axis = 0;
 
-  if (!PyArg_ParseTuple(args, "O|i", &matrices, &axis)) {
+  static char *kwlist[] = {"values", "axis", NULL};
+
+  if (!PyArg_ParseTupleAndKeywords(args, kwargs, "O|i", kwlist, &matrices,
+                                   &axis)) {
     return NULL;
   }
 
@@ -2950,6 +3123,38 @@ static PyObject *Matrix_concat(PyObject *cls, PyObject *args) {
   return wrap_impl_or_free(out);
 }
 
+/// @brief Pickle support: reduce a matrix to its raw double buffer.
+/// @details Returns ``(_matrix_unpickle, (rows, columns, payload))`` where
+///          ``payload`` is the native-endian byte image of the contiguous
+///          row-major ``double`` data. Reconstruction is a single ``memcpy``,
+///          so pickling cost is linear in the element count with no per-element
+///          Python object churn. ``copy.copy`` and ``copy.deepcopy`` route
+///          through the same path. The current interpreter must own the matrix.
+static PyObject *Matrix_reduce(PyObject *op, PyObject *Py_UNUSED(dummy)) {
+  MatrixObject *self = (MatrixObject *)op;
+  matrix_impl *impl = self->impl;
+
+  if (impl == NULL) {
+    PyErr_SetString(PyExc_ValueError, "Cannot pickle an uninitialized matrix");
+    return NULL;
+  }
+
+  if (!impl_check_acquired(impl, true)) {
+    return NULL;
+  }
+
+  PyObject *rebuild = LOCAL_STATE->matrix_unpickle;
+
+  PyObject *payload = PyBytes_FromStringAndSize(
+      (const char *)impl->data, (Py_ssize_t)(impl->size * sizeof(double)));
+  if (payload == NULL) {
+    return NULL;
+  }
+
+  return Py_BuildValue("(O(nnN))", rebuild, (Py_ssize_t)impl->rows,
+                       (Py_ssize_t)impl->columns, payload);
+}
+
 static PyMethodDef Matrix_methods[] = {
     {"transpose", (PyCFunction)Matrix_transpose, METH_VARARGS | METH_KEYWORDS,
      "transpose($self, /, in_place=False)\n--\n\n"
@@ -3010,6 +3215,18 @@ static PyMethodDef Matrix_methods[] = {
      "min($self, /, axis=None)\n--\n\nMinimum of elements."},
     {"max", (PyCFunction)Matrix_Maximum_method, METH_VARARGS | METH_KEYWORDS,
      "max($self, /, axis=None)\n--\n\nMaximum of elements."},
+    {"argmin", (PyCFunction)Matrix_argmin_method, METH_VARARGS | METH_KEYWORDS,
+     "argmin($self, /, axis=None)\n--\n\n"
+     "Index of the minimum element (first occurrence on ties).\n\n"
+     "NaN elements are skipped unless the running extreme starts at NaN\n"
+     "(element 0 along the reduced axis), which pins the result to that\n"
+     "position. This differs from NumPy, which propagates NaN."},
+    {"argmax", (PyCFunction)Matrix_argmax_method, METH_VARARGS | METH_KEYWORDS,
+     "argmax($self, /, axis=None)\n--\n\n"
+     "Index of the maximum element (first occurrence on ties).\n\n"
+     "NaN elements are skipped unless the running extreme starts at NaN\n"
+     "(element 0 along the reduced axis), which pins the result to that\n"
+     "position. This differs from NumPy, which propagates NaN."},
     {"ceil", (PyCFunction)Matrix_Ceil_method, METH_VARARGS | METH_KEYWORDS,
      "ceil($self, /, in_place=False)\n--\n\n"
      "Element-wise ceiling."},
@@ -3030,6 +3247,9 @@ static PyMethodDef Matrix_methods[] = {
      "Clip elements to a range."},
     {"copy", Matrix_copy, METH_NOARGS,
      "copy($self, /)\n--\n\nReturn a deep copy."},
+    {"__reduce__", Matrix_reduce, METH_NOARGS,
+     "__reduce__($self, /)\n--\n\n"
+     "Pickle helper: serialize the matrix to its raw double buffer."},
     {"select", Matrix_select, METH_VARARGS,
      "select($self, indices, /, axis=0)\n--\n\n"
      "Select rows or columns by index."},
@@ -3050,10 +3270,19 @@ static PyMethodDef Matrix_methods[] = {
      METH_VARARGS | METH_KEYWORDS | METH_CLASS,
      "uniform($type, minval=0.0, maxval=1.0, /, size=None)\n--\n\n"
      "Sample from a uniform distribution."},
+    {"seed", Matrix_seed, METH_VARARGS | METH_CLASS,
+     "seed($type, value, /)\n--\n\n"
+     "Seed the random generator used by normal() and uniform().\n\n"
+     "The generator is the process-global C library PRNG shared by every\n"
+     "sub-interpreter, so a seed only makes subsequent draws reproducible\n"
+     "when random generation stays on a single thread; concurrent draws\n"
+     "interleave on the shared state. The sequence is also not portable\n"
+     "across platforms."},
     {"vector", Matrix_vector, METH_VARARGS | METH_CLASS,
      "vector($type, values, /, as_column=False)\n--\n\n"
      "Create a vector from a sequence."},
-    {"concat", Matrix_concat, METH_VARARGS | METH_CLASS,
-     "concat($type, values, /, axis=0)\n--\n\n"
+    {"concat", (PyCFunction)Matrix_concat,
+     METH_VARARGS | METH_KEYWORDS | METH_CLASS,
+     "concat($type, values, axis=0)\n--\n\n"
      "Concatenate matrices along an axis."},
     {NULL} /* Sentinel */
@@ -3297,7 +3526,6 @@ static int Matrix_binary_op(PyObject *lhs_op, PyObject *rhs_op,
   }
 
   if (mat_op != NULL) {
-    // scalar operation
     lhs = unwrap_matrix(mat_op, false);
     if (lhs == NULL) {
       goto error;
@@ -3322,15 +3550,72 @@ static int Matrix_binary_op(PyObject *lhs_op, PyObject *rhs_op,
     goto error;
   }
 
+  // A 1x1 matrix is a scalar: route it through the scalar kernels so it
+  // broadcasts against any shape, exactly like a Python float operand.
+  // Check rhs first so 1x1 op= 1x1 stays in-place-safe.
+  if (rhs->size == 1) {
+    matrix_impl *out = set_output(lhs_op, out_op, inplace);
+    if (out == NULL) {
+      goto error;
+    }
+    dispatch_bin_scalar(lhs, rhs->data[0], out, op);
+    goto exit;
+  }
+  if (lhs->size == 1) {
+    if (inplace) {
+      PyErr_SetString(PyExc_NotImplementedError,
+                      "in-place scalar broadcast would change operand shape");
+      goto error;
+    }
+    matrix_impl *out = set_output(rhs_op, out_op, inplace);
+    if (out == NULL) {
+      goto error;
+    }
+    dispatch_bin_scalar(rhs, lhs->data[0], out, swap_right(op));
+    goto exit;
+  }
+
   const char *mismatch_error = "Dimension mismatch between operands";
 
   if (lhs->rows != rhs->rows) {
     if (lhs->columns != rhs->columns) {
-      PyErr_SetString(PyExc_NotImplementedError, mismatch_error);
-      goto error;
+      matrix_impl *colvec;
+      matrix_impl *rowvec;
+      if (lhs->columns == 1 && rhs->rows == 1) {
+        colvec = lhs;
+        rowvec = rhs;
+      } else if (lhs->rows == 1 && rhs->columns == 1) {
+        colvec = rhs;
+        rowvec = lhs;
+        op = swap_right(op);
+      } else {
+        PyErr_SetString(PyExc_NotImplementedError, mismatch_error);
+        goto error;
+      }
+
+      if (inplace) {
+        PyErr_SetString(PyExc_NotImplementedError,
+                        "in-place outer broadcast would change operand shape");
+        goto error;
+      }
+
+      matrix_impl *out = impl_new(colvec->rows, rowvec->columns);
+      if (out == NULL) {
+        goto error;
+      }
+
+      // Wrap as Py_TYPE(lhs_op) for parity with the elementwise and matmul
+      // paths (preserve the lhs subclass).
+      *out_op = wrap_matrix(Py_TYPE(lhs_op), out);
+      if (*out_op == NULL) {
+        impl_free(out);
+        goto error;
+      }
+
+      dispatch_bin_outer(colvec, rowvec, out, op);
+      goto exit;
     }
 
-    // row-wise
     matrix_impl *matrix;
     matrix_impl *vector;
     if (lhs->rows == 1) {
@@ -3357,7 +3642,6 @@ static int Matrix_binary_op(PyObject *lhs_op, PyObject *rhs_op,
   }
 
   if (lhs->columns != rhs->columns) {
-    // column-wise
     matrix_impl *matrix;
     matrix_impl *vector;
     if (lhs->columns == 1) {
@@ -3383,7 +3667,6 @@ static int Matrix_binary_op(PyObject *lhs_op, PyObject *rhs_op,
     goto exit;
   }
 
-  // element-wise
   matrix_impl *out = set_output(lhs_op, out_op, inplace);
   if (out == NULL) {
     goto error;
@@ -3735,10 +4018,10 @@ static PyObject *Matrix_repr(PyObject *op) {
   size_t length = strlen(prefix);
 
   snprintf(buffer, VALUE_BUFFER_SIZE, "%zu", impl->rows);
-  length += strlen(buffer) + 2; // ", "
+  length += strlen(buffer) + 2;
 
   snprintf(buffer, VALUE_BUFFER_SIZE, "%zu", impl->columns);
-  length += strlen(buffer) + 3; // ", ["
+  length += strlen(buffer) + 3;
 
   for (size_t i = 0; i < impl->size; ++i, ++ptr) {
     snprintf(buffer, VALUE_BUFFER_SIZE, "%g", *ptr);
@@ -3843,7 +4126,6 @@ static PyType_Spec Matrix_Spec = {.name = "bocpy._math.Matrix",
 static PyObject *_new_matrix_object(XIDATA_T *xidata) {
   matrix_impl *impl = (matrix_impl *)xidata->data;
 
-  // take ownership of the C matrix
   int_least64_t expected = BOCPY_NO_OWNER;
   int_least64_t desired = bocpy_interpid();
   if (!atomic_compare_exchange_strong(&impl->owner, &expected, desired)) {
@@ -3854,19 +4136,15 @@ static PyObject *_new_matrix_object(XIDATA_T *xidata) {
     return NULL;
   }
 
-  // Create an instance of MatrixObject using this interpreter's copy of the
-  // type
   PyTypeObject *type = LOCAL_STATE->matrix_type;
   MatrixObject *matrix = (MatrixObject *)type->tp_alloc(type, 0);
   if (matrix == NULL) {
-    // attempt to roll back the ownership change
     int_least64_t rollback_expected = desired;
     desired = BOCPY_NO_OWNER;
     atomic_compare_exchange_strong(&impl->owner, &rollback_expected, desired);
     return NULL;
   }
 
-  // wrap the C matrix
   matrix->impl = impl;
   IMPL_INCREF(impl);
 
@@ -3882,7 +4160,6 @@ XIDATA_GETDATA_FUNC(_matrix_shared) {
   MatrixObject *matrix = (MatrixObject *)obj;
   matrix_impl *impl = matrix->impl;
 
-  // put the underlying C matrix in an ownerless state during transport
   int_least64_t expected = bocpy_interpid();
   int_least64_t desired = BOCPY_NO_OWNER;
   if (!atomic_compare_exchange_strong(&impl->owner, &expected, desired)) {
@@ -3893,12 +4170,66 @@ XIDATA_GETDATA_FUNC(_matrix_shared) {
     return -1;
   }
 
-  // initialize the xidata
   XIDATA_INIT(xidata, tstate->interp, impl, obj, _new_matrix_object);
   return 0;
 }
 
+/// @brief Reconstruct a Matrix from its pickled raw double buffer.
+/// @details Inverse of ``Matrix.__reduce__``: validates the dimensions and the
+///          payload length, then copies the native-endian ``double`` image into
+///          a freshly allocated matrix owned by the current interpreter.
+/// @param args ``(rows, columns, payload)`` where ``payload`` exposes the
+///        buffer protocol.
+/// @return a new MatrixObject reference, or NULL on error
+static PyObject *_matrix_unpickle(PyObject *Py_UNUSED(module), PyObject *args) {
+  Py_ssize_t srows = 0;
+  Py_ssize_t scolumns = 0;
+  Py_buffer payload;
+
+  if (!PyArg_ParseTuple(args, "nny*", &srows, &scolumns, &payload)) {
+    return NULL;
+  }
+
+  if (srows <= 0 || scolumns <= 0) {
+    PyBuffer_Release(&payload);
+    PyErr_SetString(PyExc_ValueError, "Rows and columns must both be > 0");
+    return NULL;
+  }
+
+  size_t rows = (size_t)srows;
+  size_t columns = (size_t)scolumns;
+
+  if (rows > SIZE_MAX / columns) {
+    PyBuffer_Release(&payload);
+    PyErr_SetString(PyExc_ValueError, "Matrix dimensions are too large");
+    return NULL;
+  }
+
+  size_t size = rows * columns;
+  if (size > SIZE_MAX / sizeof(double) ||
+      (size_t)payload.len != size * sizeof(double)) {
+    PyBuffer_Release(&payload);
+    PyErr_SetString(PyExc_ValueError,
+                    "Pickled matrix payload has the wrong length");
+    return NULL;
+  }
+
+  matrix_impl *impl = impl_new(rows, columns);
+  if (impl == NULL) {
+    PyBuffer_Release(&payload);
+    return NULL;
+  }
+
+  memcpy(impl->data, payload.buf, (size_t)payload.len);
+  PyBuffer_Release(&payload);
+
+  return (PyObject *)wrap_impl_or_free(impl);
+}
+
 static PyMethodDef _math_module_methods[] = {
+    {"_matrix_unpickle", _matrix_unpickle, METH_VARARGS,
+     "_matrix_unpickle(rows, columns, payload, /)\n--\n\n"
+     "Internal pickle helper: rebuild a Matrix from its raw byte buffer."},
     {NULL} /* Sentinel */
 };
 
@@ -3915,13 +4246,17 @@ static int _math_module_exec(PyObject *module) {
     return -1;
   }
 
-  // let the XIData system know that the matrix type can be shared
   if (XIDATA_REGISTERCLASS(state->matrix_type, _matrix_shared)) {
     Py_FatalError(
         "could not register MatrixObject for cross-interpreter sharing");
     return -1;
   }
 
+  state->matrix_unpickle = PyObject_GetAttrString(module, "_matrix_unpickle");
+  if (state->matrix_unpickle == NULL) {
+    return -1;
+  }
+
   assert(LOCAL_STATE == NULL);
   LOCAL_STATE = state;
 
@@ -3931,6 +4266,7 @@ static int _math_module_exec(PyObject *module) {
 static int _math_module_clear(PyObject *module) {
   _math_module_state *state = (_math_module_state *)PyModule_GetState(module);
   Py_CLEAR(state->matrix_type);
+  Py_CLEAR(state->matrix_unpickle);
   return 0;
 }
 
@@ -3941,6 +4277,7 @@ static void _math_module_free(void *module) {
 static int _math_module_traverse(PyObject *module, visitproc visit, void *arg) {
   _math_module_state *state = (_math_module_state *)PyModule_GetState(module);
   Py_VISIT(state->matrix_type);
+  Py_VISIT(state->matrix_unpickle);
   return 0;
 }
 
diff --git a/src/bocpy/behaviors.py b/src/bocpy/behaviors.py
index 8124dc7..116bd8e 100644
--- a/src/bocpy/behaviors.py
+++ b/src/bocpy/behaviors.py
@@ -77,32 +77,18 @@ def _default_worker_count() -> int:
 
 WORKER_COUNT: int = _default_worker_count()
 
-# Generous deadline (seconds) for every worker-lifecycle handshake
-# receive (start_workers, stop_workers, _abort_workers). The handshakes
-# normally complete in microseconds; the only reason to wait longer is
-# pathological I/O or a wedged sub-interpreter. Promoting a wedge to a
-# loud failure here means CI fails in minutes instead of hours.
+# Per-handshake receive deadline (s): a wedged sub-interpreter becomes a loud failure, not a silent hang.
 _LIFECYCLE_RECEIVE_TIMEOUT = 120.0
 
-# Self-defence cap on the alternating pump / orphan drain loop in
-# `stop_workers`. A pathological producer that keeps re-feeding
-# MAIN_PINNED_QUEUE between rounds would otherwise wedge teardown
-# forever; on overflow we log and give up rather than spin.
+# Cap on the stop_workers() drain loop so a producer re-feeding the pinned queue cannot wedge teardown forever.
 _MAX_STOP_DRAIN_ROUNDS = 64
 
-# Upper bound on any millisecond-valued pump argument
-# (`deadline_ms`, `warn_ms`). The C side converts ms to ns via
-# `value * 1_000_000`; without a guard, a caller passing
-# `2**63` quietly wraps to a small or negative deadline. The bound
-# corresponds to the largest ms that fits in an int64 once scaled by
-# 1_000_000 — ~9.2e12 ms (~292 years), enough that no real program
-# should hit it but small enough to reject programmer-error inputs
-# like `sys.maxsize` cleanly.
+# Largest ms pump arg that survives the C side's ms*1_000_000 ns scaling without overflowing int64.
 _MAX_PUMP_MS = (1 << 63) // 1_000_000 - 1
 
 T = TypeVar("T")
 
-# Sentinel distinguishing "key absent" from "key is None" in noticeboard updates.
+# Distinguishes "key absent" from "key is None" in noticeboard updates.
 _ABSENT = object()
 
 
@@ -190,6 +176,37 @@ def release(self):
         """Releases the cown."""
         self.impl.release()
 
+    def unwrap(self) -> T:
+        """Consume and return the stored value, or re-raise a captured behavior exception on the caller's thread.
+
+        Mirrors Rust's ``Result::unwrap``: on success the value is
+        returned; if the cown carries an unhandled behavior exception
+        the exception is cleared and re-raised here. ``unwrap``
+        **consumes** the cown -- it hands the stored payload to the
+        caller and empties the cown to ``None`` -- so the returned value
+        is owned by the caller and a second :meth:`unwrap` returns
+        ``None``. Consuming is what makes move-type values (e.g.
+        :class:`Matrix`) usable after the call: the cown no longer
+        aliases the value's single backing store, so the value keeps its
+        ownership on the caller's interpreter instead of being released
+        back into the cown. The emptied cown stays schedulable, so you
+        may store a fresh value into it again. Acquires the cown for the
+        read, so call it from the caller's thread once the runtime is
+        globally quiescent -- after :func:`quiesce` or :func:`wait`, not
+        merely after this cown's own producer.
+
+        Delegates to the C-level :meth:`CownCapsule.unwrap` so a
+        behavior that returns a :class:`Cown` (which surfaces
+        downstream as a bare ``CownCapsule``) can be unwrapped the same
+        way, without rewrapping it in a Python :class:`Cown` first.
+
+        :returns: The stored value when no exception is held.
+        :raises BaseException: The captured exception, re-raised verbatim.
+        :raises RuntimeError: If the runtime is not quiescent (behaviors
+            are still in flight); call :func:`quiesce` or :func:`wait` first.
+        """
+        return self.impl.unwrap()
+
     @property
     def exception(self) -> bool:
         """Whether the held value is the result of an unhandled exception."""
@@ -341,12 +358,7 @@ def __init__(self, value: T):
             no pickling, no XIData round-trip.
         :raises RuntimeError: If called from a non-main interpreter.
         """
-        # Skip super().__init__: the value must not go through XIData.
-        # Thread affinity lives entirely in C: PinnedCownCapsule refuses
-        # non-main construction, and pump's CAS enforces single-pumper
-        # on free-threaded builds. The capsule sets owner = main
-        # interpreter id permanently, which makes worker cown_acquire
-        # structurally fail.
+        # Skip super().__init__: the value must stay a plain PyObject ref, never an XIData round-trip.
         self.impl = _core.PinnedCownCapsule(value)
 
 
@@ -487,12 +499,6 @@ def pump(deadline_ms: Optional[int] = None,
     deadline_ms = _validate_pump_bound("deadline_ms", deadline_ms, ms=True)
     max_behaviors = _validate_pump_bound("max_behaviors", max_behaviors)
 
-    # Pinned behaviors look up their `__behavior__N` thunk on the
-    # runtime's export_module (same shape contract as the worker
-    # bootstrap's `boc_export`). A NULL export here means the runtime
-    # is not initialised -- pinned schedules cannot work in that
-    # state, so fail loud rather than letting every behavior fall
-    # over with `AttributeError` on thunk lookup.
     boc_export = None
     if BEHAVIORS is not None:
         boc_export = getattr(BEHAVIORS, "export_module", None)
@@ -542,16 +548,9 @@ def set_pump_watchdog(warn_ms: Optional[int] = 1000,
     :raises OverflowError: if ``warn_ms`` exceeds the maximum
         representable nanosecond value.
     """
-    # Validate before crossing the C boundary so callers get a clear
-    # TypeError with the offending arg rather than a generic C-side
-    # parse failure.
     if warn_ms is not None:
         if (not isinstance(warn_ms, int) or isinstance(warn_ms, bool)
                 or warn_ms <= 0):
-            # Reject 0 alongside negatives. The C side treats 0 as
-            # the disable sentinel, which would silently turn the
-            # watchdog off and surprise the caller; require explicit
-            # ``None`` to disable.
             raise TypeError(
                 f"warn_ms must be a positive int or None to disable, "
                 f"got {warn_ms!r}")
@@ -584,9 +583,7 @@ def set_wait_pump_poll(ms: int = 50) -> None:
     _WAIT_PUMP_POLL_MS = ms
 
 
-# Re-read on every iteration of the wait() auto-pump loop so a
-# mid-wait `set_wait_pump_poll(...)` change is honoured without
-# restarting the wait.
+# Re-read each wait() auto-pump iteration so a mid-wait set_wait_pump_poll change takes effect without restarting.
 _WAIT_PUMP_POLL_MS = 50
 
 
@@ -608,69 +605,27 @@ def __init__(self, num_workers: Optional[int]):
         self.classes = set()
         self.worker_threads = []
         self.behavior_lookup: Mapping[int, BehaviorInfo] = {}
-        # Main-side namespace holding the transpiled ``__behavior__N``
-        # thunks. :func:`pump` reads this so pinned-behavior bodies
-        # scheduled via ``@when`` resolve on main the same way they
-        # resolve on workers. Populated by :meth:`start`.
+        # Main-side namespace of transpiled __behavior__N thunks; pump resolves pinned bodies against it.
         self.export_module: Optional[types.ModuleType] = None
         self.logger = logging.getLogger("behaviors")
         self.logger.debug("behaviors init")
-        # The runtime has no central scheduler thread. Caller threads do 2PL
-        # inline (whencall -> behavior_schedule), workers release inline,
-        # and the C-level terminator is the only pending counter.
         self.noticeboard = None
         self._noticeboard_start_error: Optional[BaseException] = None
-        # Set to True by stop() once worker shutdown, noticeboard
-        # tear-down, and the C-level noticeboard slot release have
-        # all completed. The warned-stop / drain-error raise from
-        # stop() happens *after* this flips, so wait()/__exit__ can
-        # use the flag to distinguish "stop() raised but the runtime
-        # is dead -- clear the global handle" from "stop() raised
-        # mid-teardown and the runtime is still alive -- retain the
-        # handle so the caller can retry stop()".
+        # True after full teardown; lets wait()/__exit__ tell a dead runtime from one that can retry stop().
         self._teardown_complete = False
-        # Populated by stop_workers() with any release_all() failures
-        # observed during the per-task-queue orphan drain. stop()
-        # consumes the list and clears it; on a clean stop this stays
-        # empty.
         self._stop_drain_errors: list[BaseException] = []
-        # Set True when stop_workers() has run to completion (whether
-        # from the clean path or the noticeboard-timeout branch). A
-        # subsequent stop() retry must NOT re-invoke stop_workers --
-        # the worker pool is gone and `_core.scheduler_request_stop_all`
-        # would block forever waiting for shutdown replies that never
-        # come. The retry path skips straight to the noticeboard
-        # cleanup that the prior attempt could not complete.
+        # True once workers are down; a stop() retry must skip re-stopping or scheduler_request_stop_all hangs.
         self._workers_stopped = False
-        # Per-worker scheduler_stats() snapshot captured at the moment
-        # workers have replied "shutdown" but BEFORE
-        # `_core.scheduler_runtime_stop()` frees the per-worker array.
-        # Surfaced to the caller via `wait(stats=True)`. ``None`` means
-        # no snapshot was captured (e.g. start_workers failed before any
-        # worker registered, or stop_workers raised before reaching the
-        # capture point).
+        # Snapshots taken before the C side frees the data; see wait(stats=True) / wait(noticeboard=True).
         self._final_stats: Optional[list[dict]] = None
-        # Plain-dict snapshot of the noticeboard captured by stop()
-        # after the noticeboard thread has exited but BEFORE
-        # `_core.noticeboard_clear()` frees the entries. Surfaced to
-        # the caller via `wait(noticeboard=True)`. ``None`` means no
-        # snapshot was captured (e.g. the noticeboard-timeout branch
-        # left the thread alive, or start_noticeboard failed).
         self._final_noticeboard: Optional[dict[str, Any]] = None
         self.final_cowns: tuple[Cown, ...] = ()
         self.bid = 0
-        # Set by :meth:`start` to the synthetic linecache key for the
-        # main-side transpiled export, so :meth:`stop` (and the
-        # abort path) can pop the entry symmetrically.
+        # Synthetic linecache key + saved sys.modules['__bocmain__'] so stop()/abort undo start()'s main-side state.
         self._main_export_file: Optional[str] = None
-        # Set by :meth:`start` to the prior value of
-        # ``sys.modules['__bocmain__']`` (None if no entry existed)
-        # so :meth:`stop` can restore it instead of unconditionally
-        # popping a slot we never owned.
         self._installed_bocmain = False
         self._prior_bocmain: Optional[types.ModuleType] = None
-        # (name, path) of the module pinned by start(); used to detect
-        # mismatched re-start requests.
+        # (name, path) of the module pinned by start(); used to detect mismatched re-start requests.
         self._started_module: Optional[tuple[str, str]] = None
 
     def lookup_behavior(self, line_number: int,  max_decorator_stack=32) -> BehaviorInfo:
@@ -698,11 +653,6 @@ def lookup_behavior(self, line_number: int,  max_decorator_stack=32) -> Behavior
         if line_number in self.behavior_lookup:
             return self.behavior_lookup[line_number]
 
-        # Bound the backward search: a decorator stack of depth N
-        # leaves the @when line N below the def line in 3.10, but
-        # realistic stacks are tiny. 32 is plenty and still small
-        # enough to catch a stale-frame mis-resolution before it
-        # silently returns the wrong behavior.
         for offset in range(1, max_decorator_stack + 1):
             if line_number - offset in self.behavior_lookup:
                 return self.behavior_lookup[line_number - offset]
@@ -720,11 +670,7 @@ def teardown_workers(self):
     def start_workers(self):
         """Launch worker interpreters and wait until they signal readiness."""
         def worker():
-            # Every failure path below MUST send a reply on
-            # "boc_behavior": `start_workers` is blocked in a bounded
-            # receive() per worker, and a missed reply turns into a
-            # silent timeout with no traceback about why the worker
-            # died.
+            # Every failure path below MUST reply on "boc_behavior"; start_workers() blocks in a bounded receive().
             import traceback as _tb
             interp = None
             try:
@@ -742,9 +688,6 @@ def worker():
                     interp, dedent(self.worker_script),
                 )
             except BaseException as ex:  # noqa: B036
-                # `run_string` itself raised (distinct from the worker
-                # script raising, which is surfaced via the returned
-                # ExecutionFailed below).
                 _core.send(
                     "boc_behavior",
                     "interpreters.run_string() failed: "
@@ -753,8 +696,7 @@ def worker():
                 result = None
 
             if result is not None:
-                # Truthy result == ExecutionFailed; `.formatted` carries
-                # the traceback captured inside the worker script.
+                # Truthy result == ExecutionFailed; .formatted carries the traceback captured inside the worker.
                 try:
                     formatted = result.formatted
                 except AttributeError:
@@ -783,18 +725,6 @@ def worker():
                     num_errors += 1
 
                 case [_core.TIMEOUT, _]:
-                    # A worker thread failed to send "started" within
-                    # the deadline. Most likely cause: the sub-
-                    # interpreter wedged during `interpreters.create()`
-                    # or `scheduler_worker_register`, or a C-level
-                    # init deadlock blocked the worker before it could
-                    # signal readiness. Without this branch the runtime
-                    # would block indefinitely; raising promotes the
-                    # deadlock to a loud RuntimeError so CI fails fast.
-                    # NOTE: `teardown_workers` may still block on a
-                    # wedged sub-interpreter's `t.join()`; the receive
-                    # timeout at least guarantees we report the failure
-                    # instead of silently hanging at this call site.
                     self.teardown_workers()
                     raise RuntimeError(
                         f"start_workers: worker {i} did not signal "
@@ -831,28 +761,6 @@ def stop_workers(self):
             cown.acquire()
 
         self.logger.debug("stopping workers")
-        # Single C-level fan-out: flips stop_requested on every
-        # worker and signals each cv. Each worker observes the
-        # flag inside scheduler_worker_pop, exits its do_work loop,
-        # and sends "shutdown" back on boc_behavior.
-        #
-        # Once `scheduler_request_stop_all()` has been called the
-        # worker pool is committed to shutting down: re-entering this
-        # function on a retry would issue a second fan-out and then
-        # block forever in `receive("boc_behavior")` waiting for
-        # shutdown replies from workers that have already replied (or
-        # exited). Wrap everything past the fan-out in try/finally
-        # that pins `_workers_stopped = True` so any exception from
-        # the handshake, teardown, drain, or runtime_stop still
-        # routes a subsequent stop() down the retry-only branch.
-        #
-        # The retry-only branch in `stop()` does NOT itself call
-        # `scheduler_runtime_stop`, so we must guarantee it runs here
-        # even when the handshake / teardown / drain above raised --
-        # otherwise the per-worker `WORKERS` array leaks until the
-        # next `start()`. The C-side stop is idempotent (covered by
-        # `test_scheduler_runtime_stop_is_idempotent`), so running it
-        # unconditionally inside `finally` is safe.
         _core.scheduler_request_stop_all()
         try:
             for i in range(self.num_workers):
@@ -860,16 +768,6 @@ def stop_workers(self):
                     "boc_behavior", _LIFECYCLE_RECEIVE_TIMEOUT,
                 )
                 if tag == _core.TIMEOUT:
-                    # A worker failed to reply "shutdown" after the
-                    # C-level stop fan-out. Most likely cause: the
-                    # worker's do_work() loop is wedged inside
-                    # `scheduler_worker_pop`, or a behavior body
-                    # deadlocked. Log and proceed: the outer `finally`
-                    # below still runs `scheduler_runtime_stop`, which
-                    # is idempotent and tears down the C-side state
-                    # regardless. Without this branch stop() would
-                    # block forever and the runtime could never be
-                    # retried.
                     self.logger.error(
                         "stop_workers: worker %d did not reply 'shutdown' "
                         "within %.1fs; proceeding with teardown anyway. "
@@ -883,20 +781,9 @@ def stop_workers(self):
                 _core.send("boc_cleanup", True)
 
             self.teardown_workers()
-            # Alternate `main_pump_drain_all` and
-            # `_drain_orphan_behaviors` until both report empty in
-            # the same iteration. `release_all` inside the orphan
-            # drain dispatches successors through
-            # `boc_sched_dispatch`, whose pinned fast path routes
-            # pinned-bearing successors onto MAIN_PINNED_QUEUE; a
-            # single pump-then-orphan ordering would leave those
-            # successors enqueued and their terminator_inc holds
-            # undecremented, wedging the next `start()`. The cap is
-            # a self-defence against a runaway producer that keeps
-            # re-feeding the queues: log + give up rather than spin
-            # forever. Main-interp only; skip the pump-side drain
-            # on sub-interpreter shutdown paths where the pinned
-            # queue is provably empty (only main can enqueue).
+            # Alternate pump-drain and orphan-drain until both report empty: release_all routes pinned successors
+            # onto the pinned queue, so draining only one would strand them and wedge the next start(). The cap
+            # bounds a runaway producer; only the primary interpreter owns the pinned queue.
             accumulated_drain_errors = []
             try:
                 if _core.is_primary():
@@ -927,21 +814,12 @@ def stop_workers(self):
                     errors_this_round, _ = self._drain_orphan_behaviors()
                     accumulated_drain_errors.extend(errors_this_round)
             finally:
-                # KeyboardInterrupt/SystemExit re-raised mid-drain must
-                # not erase already-captured release_all failures.
-                # extend (not assign) because _drain_orphan_behaviors
-                # also pushes its in-flight errors before the re-raise.
+                # extend (not assign): _drain_orphan_behaviors may have pushed in-flight errors before re-raising.
                 if accumulated_drain_errors:
                     self._stop_drain_errors.extend(
                         accumulated_drain_errors)
         finally:
             try:
-                # Snapshot the per-worker scheduler counters before
-                # the per-worker array is freed. Workers have already
-                # replied "shutdown" and exited their do_work loops,
-                # so their counters are stable. Surfaced to the
-                # caller via `wait(stats=True)`. Best-effort: any
-                # failure here must not block teardown.
                 try:
                     self._final_stats = _core.scheduler_stats()
                 except Exception as snap_ex:
@@ -950,20 +828,8 @@ def stop_workers(self):
                         snap_ex,
                     )
                     self._final_stats = None
-                # Free the per-worker scheduler array now that no
-                # worker thread can observe it. Paired with the
-                # `scheduler_runtime_start` call in `start()`. Run
-                # inside the outer `finally` so the WORKERS array is
-                # reclaimed even when an earlier step raised --
-                # without this the retry-only branch in `stop()`
-                # would never reach this call site.
                 _core.scheduler_runtime_stop()
             finally:
-                # Mark workers as stopped so a retried stop() (after
-                # the noticeboard-timeout branch raises, or after a
-                # failure anywhere in the handshake/teardown/drain
-                # above) does not try to shut down a worker pool that
-                # is already gone.
                 self._workers_stopped = True
         self.logger.debug("workers stopped")
 
@@ -988,16 +854,9 @@ def start_noticeboard(self):
 
         def noticeboard():
             self.logger.debug("starting the noticeboard thread")
-            # Pin this thread as the only legitimate noticeboard mutator.
-            # The C layer rejects write_direct/delete from any other
-            # thread, eliminating the TOCTOU window in the Python-level
-            # read-modify-write performed by noticeboard_update.
             try:
                 _core.set_noticeboard_thread()
             except BaseException as ex:  # noqa: B036
-                # Captured here and re-raised on the starter thread by
-                # start_noticeboard so the runtime fails loudly instead
-                # of silently stranding the noticeboard mutator.
                 self._noticeboard_start_error = ex
                 ready.set()
                 return
@@ -1016,10 +875,7 @@ def noticeboard():
 
                     case ["boc_noticeboard", ("noticeboard_update", key, fn, default)]:
                         try:
-                            # Force a fresh snapshot for this read-modify-write:
-                            # this thread is not a behavior, so the
-                            # default no-polling semantics do not apply here and
-                            # we want to see the latest committed state.
+                            # Fresh snapshot for the RMW: this thread is not a behavior, so clear the cache first.
                             _core.noticeboard_cache_clear()
                             snap = _core.noticeboard_snapshot()
                             current = snap.get(key, _ABSENT)
@@ -1029,14 +885,7 @@ def noticeboard():
                             if new_value is REMOVED:
                                 _core.noticeboard_delete(key)
                             else:
-                                # write_direct bumps NB_VERSION; other readers'
-                                # caches will revalidate at their next behavior
-                                # boundary. Re-pin any cowns reachable from
-                                # the new value (the previous entry's pins are
-                                # released by write_direct). We are on the
-                                # noticeboard thread here so cown_pin_pointers
-                                # is safe — its INCREFs will be transferred
-                                # into the entry by write_direct.
+                                # write_direct transfers these INCREFs into the entry, keeping the cowns alive.
                                 pin_ptrs = _core.cown_pin_pointers(
                                     _gather_pins(new_value))
                                 _core.noticeboard_write_direct(
@@ -1044,8 +893,7 @@ def noticeboard():
                         except Exception as ex:
                             self.logger.warning(f"noticeboard_update({key!r}) failed: {ex}")
                         finally:
-                            # Re-arm the version check for any subsequent
-                            # snapshot call from this thread.
+                            # Re-arm the version check so later snapshots from this thread see committed state.
                             _core.noticeboard_cache_clear()
 
                     case ["boc_noticeboard", ("noticeboard_delete", key)]:
@@ -1054,23 +902,8 @@ def noticeboard():
                         except Exception as ex:
                             self.logger.warning(f"noticeboard_delete({key!r}) failed: {ex}")
 
-                    case ["boc_noticeboard", ("sync", seq)]:
-                        # Barrier sentinel posted by notice_sync(). Marking
-                        # this sequence complete wakes any caller blocked
-                        # in notice_sync_wait. Because the boc_noticeboard
-                        # tag is FIFO per producer, every write/update/delete
-                        # the caller posted before this sentinel has already
-                        # been processed above by the time we get here.
-                        _core.notice_sync_complete(seq)
-
         self.noticeboard = threading.Thread(target=noticeboard)
         self.noticeboard.start()
-        # Block until the thread has either claimed the noticeboard slot
-        # or captured an error. Without this handshake a failed claim
-        # would be invisible: notice_write/update/delete would enqueue
-        # to boc_noticeboard with no consumer, notice_sync() would block
-        # forever, and stop() would observe a non-alive thread and
-        # discard the entire backlog.
         ready.wait()
         if self._noticeboard_start_error is not None:
             err = self._noticeboard_start_error
@@ -1105,17 +938,7 @@ def start(self, module: Optional[tuple[str, str]] = None):
             export = export_module_from_file(module[1])
             module_name = f"{module[0]}"
 
-        # Defence in depth: the transpiler emits identifier-shaped
-        # names, but `module_name` is interpolated into worker
-        # bootstrap source -- reject anything that is not a valid
-        # dotted Python module path at the boundary so a hostile or
-        # malformed name cannot reach the `repr()`-protected
-        # interpolation below. Dotted names (``pkg.sub.mod``) are
-        # accepted because users may invoke bocpy from a
-        # package-qualified module; each dotted component must
-        # itself be a valid identifier. ``__main__`` falls through
-        # naturally because ``"__main__".isidentifier()`` is True
-        # and ``"__main__".split(".") == ["__main__"]``.
+        # module_name is interpolated into worker source; reject non-dotted-identifier names to block injection.
         if not all(part.isidentifier() for part in module_name.split(".")):
             raise ValueError(
                 f"module_name must be a dotted Python module path; "
@@ -1124,18 +947,6 @@ def start(self, module: Optional[tuple[str, str]] = None):
 
         self.behavior_lookup = export.behaviors
 
-        # Compile the transpiled source into a fresh module on the
-        # main interpreter so :func:`pump` can resolve
-        # ``__behavior__N`` thunks the same way workers do. Workers
-        # bootstrap their own copy inside a sub-interpreter
-        # (``_bocpy_mod`` in the worker_script below); main needs an
-        # equivalent namespace because pinned-behavior bodies execute
-        # under ``main_pump_bounded`` on the main interpreter and
-        # ``behavior_execute_impl`` looks up the thunk via
-        # ``PyObject_GetAttrString(boc_export, ...)``. Without this
-        # the lookup falls back to ``sys.modules["__main__"]`` (which
-        # under pytest is the test runner, not the test module) and
-        # every pinned ``@when`` body fails with ``AttributeError``.
         main_export_name = f"__bocpy_main_export__{module_name}"
         main_export_file = f"<bocpy:main:{module_name}>"
         main_export = types.ModuleType(main_export_name)
@@ -1152,17 +963,7 @@ def start(self, module: Optional[tuple[str, str]] = None):
         )
         self.export_module = main_export
 
-        # Embed the transpiled source as a Python string literal
-        # (via ``repr()``) into the worker bootstrap. Each worker
-        # compiles and exec's the literal into a fresh
-        # ``types.ModuleType``; no file is written to disk. The
-        # synthetic filename ``<bocpy:NAME>`` is registered with
-        # ``linecache`` so tracebacks still surface the transpiled
-        # source line. Every interpolated occurrence of the module
-        # name uses ``repr(module_name)`` so quote / backslash /
-        # non-ASCII content cannot break out of the string literal
-        # (the prior path interpolated ``module_name`` raw via
-        # f-string into ``r"..."``).
+        # repr() embeds the transpiled source as a literal; repr(module_name) blocks quote/backslash break-out.
         src_literal = repr(export.code)
         bocmain_alias = "__bocmain__" if module_name == "__main__" else module_name
         sysmod_key = repr(bocmain_alias)
@@ -1171,18 +972,11 @@ def start(self, module: Optional[tuple[str, str]] = None):
         main_start = worker_script.find(WORKER_MAIN_END)
 
         bootstrap = [
-            # The user-module load below is wrapped in try/except so an
-            # import error, syntax error, or wedging top-level statement
-            # surfaces as a traceback on `boc_behavior` instead of a
-            # silent hang. `send` is already imported at the top of
-            # worker.py and is guaranteed available here. The except
-            # block re-raises so `interpreters.run_string` also reports
-            # the failure via its return value.
+            # Wrap the user-module load so import/syntax errors surface on boc_behavior instead of a silent hang.
             "import linecache",
             "import traceback as _bocpy_tb",
             "import types",
-            # Module name is bound outside the try so the diagnostic can
-            # name it even if the src-literal assignment fails.
+            # Bind the module name outside the try so the diagnostic can name it even if the src literal fails.
             f"_bocpy_modname = {sysmod_key}",
             "try:",
             f"    _bocpy_src = {src_literal}",
@@ -1232,59 +1026,21 @@ def start(self, module: Optional[tuple[str, str]] = None):
         )
 
         set_tags(["boc_behavior", "boc_cleanup", "boc_noticeboard"])
-        # Allocate the per-worker scheduler array before spawning any
-        # workers so each worker's first action (registering its slot)
-        # has a non-empty WORKERS array to claim from. Mirrored by
-        # `_core.scheduler_runtime_stop()` in `stop_workers()` after
-        # the workers are joined, and by every abort path below so
-        # the C-side WORKERS array is reclaimed and the next
-        # `start()` does not observe stale per-task queues.
+        # Allocate the WORKERS array before spawning workers so each can claim a slot; freed by scheduler_runtime_stop.
         _core.scheduler_runtime_start(self.num_workers)
         try:
-            # Bring up workers and the noticeboard thread first. We seed
-            # the C-level terminator only after both succeed so a failure
-            # in start_noticeboard (or anywhere between here and the
-            # terminator_reset below) leaves the terminator in its
-            # post-stop() quiescent state (count=0, seeded=0) and the
-            # next start() can proceed cleanly without a drift diagnostic
-            # firing. On a partial-startup failure we also tear the
-            # workers back down so the subsequent start() is not blocked
-            # by stale shutdown handshakes or dangling sub-interpreters.
             self.start_workers()
             try:
                 self.start_noticeboard()
             except BaseException:
-                # Close the terminator first so any sibling thread that
-                # somehow races a whencall during the abort window is
-                # refused at terminator_inc rather than slipping a real
-                # behavior into a per-task queue between our scheduler
-                # stop request and the worker shutdown handshake.
-                # TERMINATOR_CLOSED is 0 on the very first start() of
-                # the process and 1 after any prior stop()/abort;
-                # either way, set it to 1 explicitly. terminator_close()
-                # is idempotent.
+                # Close the terminator first so a racing whencall is refused before the abort tears workers down.
                 _core.terminator_close()
                 self._abort_workers()
                 raise
 
-            # Arm the C-level terminator (count=1 seed, closed=0, seeded=1).
-            # reset() returns the prior (count, seeded) so we can detect a
-            # previous run that died without reaching its reconciliation
-            # point (KeyboardInterrupt, stop() that raised, etc.). We refuse
-            # to start on drift rather than silently clobbering whatever
-            # state was left behind -- the previous run is still leaking
-            # behaviors or cowns and starting fresh would mask the bug.
+            # reset() returns the prior (count, seeded); either non-zero means drift from a crashed run, so refuse.
             prior_count, prior_seeded = _core.terminator_reset()
             if prior_count != 0 or prior_seeded != 0:
-                # We just armed the terminator (count=1, seeded=1, closed=0).
-                # Close it FIRST so any sibling thread that races a
-                # whencall during the abort window is refused before
-                # touching the half-shut-down pool. Then drop our own
-                # seed via terminator_seed_dec so the next start() sees
-                # (count=0, seeded=0) instead of re-firing the same
-                # drift diagnostic forever. Finally tear down workers
-                # and the noticeboard so the next start() can re-spawn
-                # without colliding with the orphans.
                 _core.terminator_close()
                 _core.terminator_seed_dec()
                 self._abort_noticeboard()
@@ -1297,31 +1053,15 @@ def start(self, module: Optional[tuple[str, str]] = None):
                     "Resolve the earlier failure before starting again."
                 )
         except BaseException:
-            # Defence in depth: if any abort path above failed to call
-            # `_core.scheduler_runtime_stop` (or if `start_workers`
-            # raised before reaching the inner try), free the C-side
-            # WORKERS array here. `scheduler_runtime_stop` is
-            # idempotent — calling it twice on a successful abort is
-            # a no-op on the second call.
+            # Defence in depth: free WORKERS in case an abort path missed it (scheduler_runtime_stop is idempotent).
             try:
                 _core.scheduler_runtime_stop()
             except Exception as ex:
                 self.logger.exception(ex)
-            # Drop the __bocmain__ alias if we installed one, so a
-            # follow-up start() observes a clean sys.modules. Same
-            # rationale as in the successful stop() path.
             self._restore_main_aliases()
             raise
 
     def _restore_main_aliases(self):
-        # Symmetric cleanup of the main-side state ``start()`` may
-        # have installed: the synthetic ``linecache`` entry that
-        # backs tracebacks for the transpiled export, and the
-        # ``__bocmain__`` alias used by worker bootstrap to subclass
-        # user classes defined in ``__main__``. Restoring the prior
-        # ``__bocmain__`` (instead of unconditionally popping it)
-        # preserves an alias the host had set before the runtime
-        # started.
         mef = self._main_export_file
         if mef is not None:
             linecache.cache.pop(mef, None)
@@ -1352,9 +1092,6 @@ def _abort_workers(self):
                     "boc_behavior", _LIFECYCLE_RECEIVE_TIMEOUT,
                 )
                 if tag == _core.TIMEOUT:
-                    # Same wedge as in `stop_workers`, on the abort
-                    # path. Continue the abort regardless -- the
-                    # caller is already error-handling a failed start.
                     self.logger.error(
                         "_abort_workers: worker %d did not reply "
                         "'shutdown' within %.1fs; continuing abort.",
@@ -1412,7 +1149,7 @@ def cycle_noticeboard(self, timeout: Optional[float] = None) -> dict[str, Any]:
             _core.noticeboard_cache_clear()
             snap = dict(_core.noticeboard_snapshot())
         finally:
-            # Restart unconditionally so a failed snapshot does not strand the runtime.
+            # Restart unconditionally so a failed snapshot does not strand the runtime without a mutator thread.
             self.start_noticeboard()
         return snap
 
@@ -1428,13 +1165,12 @@ def quiesce(self, timeout: Optional[float] = None) -> bool:
                 "Behaviors.quiesce() must be called from the primary "
                 "interpreter."
             )
-        # Track whether seed_dec actually dropped the seed so we only re-arm it ourselves.
+        # Only re-arm the seed on exit if seed_dec actually dropped it, so we never over-increment.
         seed_dropped = _core.terminator_seed_dec()
         try:
             return self._wait_for_quiescence(timeout)
         finally:
             if seed_dropped:
-                # Re-arm so a future stop()/quiesce() can drop the seed again; CAS 0->1 is idempotent.
                 _core.terminator_seed_inc()
 
     def __enter__(self):
@@ -1472,8 +1208,7 @@ def _auto_pump_loop(self, timeout: Optional[float]) -> bool:
                 _core.main_pump_bounded(
                     None, 64, False, self.export_module,
                 )
-            # WAIT_TIMED_OUT: fall through to the deadline check at
-            # the top of the next iteration.
+            # WAIT_TIMED_OUT falls through to the next iteration's deadline check.
         return True
 
     def _wait_for_quiescence(self, timeout: Optional[float]) -> bool:
@@ -1546,13 +1281,7 @@ def stop(self, timeout: Optional[float] = None):
         after a noticeboard-timeout abort retries only the
         noticeboard drain.
         """
-        # Take down the seed and wait for quiescence. Both
-        # are idempotent so a second stop() / wait() is a no-op.
-        # Compute one deadline up front so each stage gets the *remaining*
-        # budget rather than the original timeout. Without this, a
-        # caller-supplied timeout=T would let terminator_wait, the
-        # noticeboard drain, and stop_workers each consume up to T,
-        # turning the visible upper bound into 3*T.
+        # One deadline up front so each stage gets the remaining budget, not a fresh timeout (else the bound is 3*T).
         if timeout is None:
             deadline = None
         else:
@@ -1563,30 +1292,13 @@ def _remaining():
                 return None
             return max(0.0, deadline - time.monotonic())
 
-        # Idempotent retry: if a prior stop() reached the
-        # noticeboard-timeout branch, it already drove the
-        # terminator to quiescence and shut the workers down.
-        # Re-running ``stop_workers`` would block forever in
-        # ``scheduler_request_stop_all`` waiting for shutdown
-        # replies from a worker pool that is gone. Skip straight
-        # to the noticeboard cleanup the prior attempt could not
-        # complete.
         if not self._workers_stopped:
             _core.terminator_seed_dec()
             self._wait_for_quiescence(_remaining())
 
-            # Post-wait reconciliation. If wait() timed out the count is
-            # still > 0 -- skip the assertion in that case so a partial
-            # teardown does not mask the underlying timeout.
             c_count = _core.terminator_count()
             c_seeded = _core.terminator_seeded()
             quiesced = (c_count == 0 and c_seeded == 0)
-            # Close the terminator unconditionally before any further drain
-            # work. On the clean path this is the documented refusal point;
-            # on the warned path it MUST happen before stop_workers's
-            # orphan drain so a late whencall caller cannot slip a fresh
-            # behavior into a per-task queue between the drain pass and
-            # scheduler_runtime_stop. terminator_close() is idempotent.
             _core.terminator_close()
             if not quiesced:
                 self.logger.warning(
@@ -1596,31 +1308,14 @@ def _remaining():
                     "that elapsed while behaviors were still in flight."
                 )
 
-            # Drain the noticeboard thread.
             _core.send("boc_noticeboard", "shutdown")
             self.noticeboard.join(_remaining())
             if self.noticeboard.is_alive():
-                # join() timed out. The noticeboard thread still owns the
-                # single-writer slot and may be holding NB_MUTEX while
-                # processing an in-flight mutation. We do not call
-                # `clear_noticeboard_thread` / `noticeboard_clear` (those
-                # would race with the live thread), but we MUST still drain
-                # orphan behaviors so the C-side terminator_count returns
-                # to 0 — otherwise a caller-supplied finite timeout that
-                # fires here permanently strands every behavior currently
-                # parked in a per-task queue. Worker shutdown itself does
-                # not touch NB_MUTEX, so it is safe under a wedged
-                # noticeboard thread.
                 try:
                     self.stop_workers()
                 except Exception as drain_ex:
-                    # Surface drain failures via logging; the outer
-                    # RuntimeError below remains the primary failure
-                    # signal because the noticeboard timeout is what got
-                    # us into this branch.
                     self.logger.exception(drain_ex)
-                # Reset the drain errors list so a subsequent stop() does
-                # not double-report; the drain has already happened.
+                # Reset so a later stop() does not double-report; the drain already ran on this branch.
                 self._stop_drain_errors = []
                 raise RuntimeError(
                     "stop(): noticeboard thread did not shut down within "
@@ -1629,60 +1324,23 @@ def _remaining():
                     "is still pinned; a later stop() call may complete "
                     "the cleanup once the in-flight mutation finishes."
                 )
-            # Shut workers down and reset noticeboard ownership.
-            # stop_workers() now owns the orphan-drain (must happen before
-            # the per-task queues are freed); it stashes any release_all
-            # exceptions on `self._stop_drain_errors` for stop() to re-raise.
             self.stop_workers()
             drain_errors = self._stop_drain_errors
             self._stop_drain_errors = []
         else:
-            # Retry path: workers are already gone. Re-attempt the
-            # noticeboard drain that timed out previously. ``join()``
-            # without a timeout waits forever -- by this point the
-            # in-flight noticeboard fn must have finished or the
-            # caller is no closer to making progress than they were
-            # before. We surface the join via a remaining-budget
-            # join so a caller-supplied timeout still bounds the
-            # retry. The ``is_alive()`` check below is best-effort:
-            # if the thread has already exited it skips the
-            # redundant sentinel send. There is a residual TOCTOU
-            # window (alive at check, exits before the send lands)
-            # in which a stale sentinel can linger in the
-            # ``boc_noticeboard`` queue, but correctness rests on
-            # ``Behaviors.start_runtime`` calling ``set_tags(["...",
-            # "boc_noticeboard"])`` on the next ``start()``, which
-            # clears the queue per the public ``set_tags`` contract.
-            # The guard reduces the frequency of the stale-sentinel
-            # case but is not itself the correctness fence.
             if self.noticeboard.is_alive():
                 _core.send("boc_noticeboard", "shutdown")
             self.noticeboard.join(_remaining())
             if self.noticeboard.is_alive():
-                # Still pinned. Re-raise the same diagnostic so the
-                # caller can keep retrying. ``_workers_stopped`` is
-                # unchanged so a subsequent retry stays on this path.
                 raise RuntimeError(
                     "stop(): noticeboard thread still pinned on retry "
                     f"(timeout={timeout!r}). The in-flight mutation "
                     "has not finished; retry once it has."
                 )
             drain_errors = []
-        # The block below is single-shot per Behaviors instance. A
-        # second `stop()` (or `wait()`-triggered re-entry) MUST NOT
-        # re-snapshot, or it would overwrite ``_final_noticeboard``
-        # with ``{}`` because the first call already ran
-        # ``noticeboard_clear()``. This mirrors the natural gating
-        # of ``_final_stats`` inside ``stop_workers()``, which is
-        # itself guarded by ``_workers_stopped``.
         if not self._teardown_complete:
             _core.clear_noticeboard_thread()
-            # Snapshot before clearing. The noticeboard thread has
-            # exited and workers are joined, so entries are stable.
-            # `cache_clear()` is required because the main thread
-            # may hold a stale `noticeboard()` proxy from earlier
-            # user code. Best-effort: any failure must not block
-            # teardown.
+            # Snapshot before clearing while entries are stable; cache_clear() drops a stale main-thread proxy.
             try:
                 _core.noticeboard_cache_clear()
                 self._final_noticeboard = dict(_core.noticeboard_snapshot())
@@ -1692,22 +1350,9 @@ def _remaining():
                 )
                 self._final_noticeboard = None
             _core.noticeboard_clear()
-            # Teardown is complete: workers are joined, the
-            # noticeboard thread has exited, and the C-level slot is
-            # released. The transpiled module is exec'd in-memory in
-            # each worker, so there is no on-disk artifact to clean
-            # up.
             self._teardown_complete = True
-        # Drop the __bocmain__ alias we installed in start() so a
-        # subsequent bocpy.start() observes a clean sys.modules
-        # (and so the main module isn't pinned in sys.modules under
-        # an alias after the runtime has shut down).
         self._restore_main_aliases()
         if drain_errors:
-            # Surface the first failure so the caller sees the leak at
-            # the failure site rather than later as a mysterious
-            # deadlock on the affected cowns. The remaining errors
-            # were logged inside _drain_orphan_behaviors.
             raise RuntimeError(
                 "stop(): release_all failed for "
                 f"{len(drain_errors)} orphan behavior(s) during drain; "
@@ -1753,21 +1398,16 @@ def _drain_orphan_behaviors(self):
         """
         errors = []
         drained_count = 0
-        # KeyboardInterrupt / SystemExit raised mid-drain must not
-        # abort the drain partway -- the orphaned behaviors would
-        # leak their MCS chains and terminator holds, so the next
-        # start() would diagnose terminator drift forever. Capture
-        # them, finish the drain, and re-raise the first after the
-        # loop returns clean.
+        # A KeyboardInterrupt/SystemExit mid-drain must not abort partway: orphaned behaviors would leak their MCS
+        # chains and terminator holds, so the next start() would diagnose drift forever. Defer it, finish the drain,
+        # then re-raise the first after the loop returns clean.
         deferred_base_exc = None
         while True:
             capsules = _core.scheduler_drain_all_queues()
             if not capsules:
                 if deferred_base_exc is not None:
                     if errors:
-                        # Stash current-round errors so a
-                        # KeyboardInterrupt unwinding past stop() does
-                        # not silently erase release_all failures.
+                        # Stash errors so a KeyboardInterrupt unwinding past stop() does not erase release_all failures.
                         self._stop_drain_errors.extend(errors)
                         note = (
                             f"_drain_orphan_behaviors deferred "
@@ -1794,11 +1434,6 @@ def _drain_orphan_behaviors(self):
                     "behavior dropped during stop(); the runtime was "
                     "torn down before this behavior could acquire its cowns"
                 )
-                # Surface the drop to anyone awaiting the result Cown.
-                # Best-effort: failures here only degrade UX (the user
-                # sees None instead of a diagnostic), so log and
-                # continue with release_all so MCS chains still
-                # unwind.
                 try:
                     payload.set_drop_exception(RuntimeError(
                         "behavior dropped during stop(); the runtime "
@@ -1863,13 +1498,8 @@ def whencall(thunk: str, args: list[Union[Cown, list[Cown]]], captures: list[Any
         "whencall:behavior=Behavior(thunk=%s, result=%r, args=%r, captures=%r)",
         thunk, result, args, captures,
     )
-    # Caller threads run the entire 2PL inline. Register with the
-    # C terminator first so a concurrent stop()/terminator_close() will
-    # refuse the schedule rather than racing teardown. Once the
-    # terminator hold is taken, behavior_schedule is infallible past
-    # prepare; any failure during the prepare phase rolls the hold back.
-    # The matching decrement happens on the worker thread once the
-    # behavior body runs.
+    # Take the terminator hold before scheduling so a concurrent stop()/terminator_close() refuses the schedule
+    # rather than racing teardown; the matching dec runs on the worker thread once the body completes.
     if _core.terminator_inc() < 0:
         raise RuntimeError("runtime is shutting down")
     try:
@@ -1915,7 +1545,6 @@ def start(worker_count: Optional[int] = None,
     if not _core.is_primary():
         raise RuntimeError("start() can only be called from the main interpreter")
 
-    # Idempotent: bare start() no-ops; mismatched explicit args raise.
     if BEHAVIORS is not None:
         if worker_count is not None and worker_count != BEHAVIORS.num_workers:
             raise RuntimeError(
@@ -1945,12 +1574,7 @@ def start(worker_count: Optional[int] = None,
     try:
         BEHAVIORS.start(module)
     except BaseException:
-        # Failed startup must not leave a half-initialised Behaviors
-        # instance bound globally: the next @when would skip start()
-        # entirely and run against a runtime whose noticeboard thread
-        # never claimed the C-level slot (or whose workers never
-        # spawned). Reset the slot so the caller can retry once the
-        # underlying cause is cleared.
+        # Clear the global on failure so the next @when re-runs start() instead of using a half-initialised runtime.
         BEHAVIORS = None
         raise
 
@@ -2059,7 +1683,7 @@ def _remaining() -> Optional[float]:
             f"quiesce(): runtime did not reach quiescence within "
             f"timeout={timeout!r}"
         )
-    # Sample stats post-quiescence so the per-worker counts are stable.
+    # Sample stats after quiescence so the per-worker counts are stable.
     stats_snap = list(_core.scheduler_stats()) if stats else None
     nb_snap = BEHAVIORS.cycle_noticeboard(_remaining()) if noticeboard else None
     return _format(stats_snap, nb_snap)
@@ -2070,7 +1694,7 @@ def wait(timeout: Optional[float] = None, *,
     """Block until all behaviors complete, with optional timeout.
 
     When ``stats=True``, captures the per-worker
-    :func:`_core.scheduler_stats` snapshot. When
+    ``_core.scheduler_stats`` snapshot. When
     ``noticeboard=True``, captures the noticeboard contents as a
     plain ``dict`` at the quiescence point (NOT after teardown — the
     two are equivalent in single-caller programs but the quiescence
@@ -2085,7 +1709,7 @@ def wait(timeout: Optional[float] = None, *,
     - both flags: :class:`WaitResult`.
 
     Internally a thin wrapper around :func:`quiesce` +
-    :meth:`Behaviors.stop`; quiescence timeout warns rather than
+    ``Behaviors.stop``; quiescence timeout warns rather than
     raising.
     """
     global BEHAVIORS
@@ -2103,8 +1727,6 @@ def _format(stats_snap, nb_snap):
         return _format([], {})
 
     if BEHAVIORS._teardown_complete:
-        # Idempotent: prior stop() already stashed final snapshots;
-        # return them rather than running on an empty runtime.
         stats_snap = BEHAVIORS._final_stats if BEHAVIORS._final_stats is not None else []
         nb_snap = BEHAVIORS._final_noticeboard if BEHAVIORS._final_noticeboard is not None else {}
         BEHAVIORS = None
@@ -2120,8 +1742,6 @@ def _remaining() -> Optional[float]:
             return None
         return max(0.0, deadline - time.monotonic())
 
-    # quiesce() first for a pre-teardown snapshot; on TimeoutError fall
-    # back to stop()'s post-teardown one (historical warn-and-tear-down).
     quiesce_snapshots = None
     quiesce_timed_out = False
     try:
@@ -2134,17 +1754,11 @@ def _remaining() -> Optional[float]:
             "wait(): quiesce() timed out (%s); proceeding to stop().", ex,
         )
 
-    # Clear BEHAVIORS only if stop() drove the runtime all the
-    # way through teardown (workers joined, noticeboard exited,
-    # C-level noticeboard slot released). On stop()'s
-    # noticeboard-join-timeout path the runtime is intentionally
-    # left running so the caller can diagnose the leak and
-    # retry; nulling the global handle there would strand the
-    # live workers / noticeboard thread with no Python-side
-    # reference.
     try:
         BEHAVIORS.stop(_remaining())
     except BaseException:
+        # Only clear the global once stop() completed teardown; on its noticeboard-join-timeout path the runtime is
+        # left running for a retry, and nulling the handle there would strand the live workers / noticeboard thread.
         if BEHAVIORS._teardown_complete:
             if quiesce_snapshots is not None:
                 BEHAVIORS = None
@@ -2190,8 +1804,6 @@ def _require_noticeboard_ready(key: str, operation: str) -> None:
     _validate_noticeboard_key(key)
 
 
-# Container types we recurse into when scanning a noticeboard value for
-# CownCapsules to pin. Custom user objects are also descended via __dict__.
 _NB_CONTAINER_TYPES = (list, tuple, set, frozenset)
 
 
@@ -2223,7 +1835,6 @@ def _collect_cown_capsules(obj: Any, out: list, seen: set) -> None:
         seen.add(obj_id)
         return
     if isinstance(obj, (str, bytes, bytearray, int, float, bool, type(None))):
-        # Common leaf types: skip cheaply without recording in `seen`.
         return
     seen.add(obj_id)
     if isinstance(obj, dict):
@@ -2235,17 +1846,12 @@ def _collect_cown_capsules(obj: Any, out: list, seen: set) -> None:
         for item in obj:
             _collect_cown_capsules(item, out, seen)
         return
-    # Fall back to inspecting attributes for ordinary user classes. Built-in
-    # opaque objects (e.g. compiled regex patterns) have no __dict__ and are
-    # left alone.
     d = getattr(obj, "__dict__", None)
     if d is not None:
         _collect_cown_capsules(d, out, seen)
-    # Walk __slots__ up the MRO: slot-only classes (e.g. @dataclass(slots=True))
-    # have no __dict__ at all, so cowns stored in slot attributes would
-    # otherwise be silently missed and recycled out from under the
-    # noticeboard entry.
     cls = type(obj)
+    # Walk __slots__ up the MRO too: slot-only classes (e.g. @dataclass(slots=True)) have no __dict__, so cowns in
+    # slot attributes would otherwise be silently missed and recycled out from under the noticeboard entry.
     for klass in cls.__mro__:
         slots = klass.__dict__.get("__slots__")
         if not slots:
@@ -2253,8 +1859,7 @@ def _collect_cown_capsules(obj: Any, out: list, seen: set) -> None:
         if isinstance(slots, str):
             slots = (slots,)
         for name in slots:
-            # __dict__ and __weakref__ are reserved slot names that
-            # expose the mapping / weakref itself; skip them.
+            # __dict__ / __weakref__ are reserved slot names exposing the mapping / weakref itself, not stored values.
             if name in ("__dict__", "__weakref__"):
                 continue
             try:
@@ -2300,21 +1905,64 @@ def notice_write(key: str, value: Any) -> None:
     :type value: Any
     """
     _require_noticeboard_ready(key, "write to")
-    # Gather every CownCapsule reachable from `value` so the noticeboard
-    # can take an independent strong reference on each. We pre-pin them
-    # here on the writer thread (cown_pin_pointers does COWN_INCREF on
-    # each and returns the raw pointers as ints). The pointers ride
-    # along in the message; the noticeboard thread transfers ownership
-    # into the noticeboard entry without an extra INCREF. This closes
-    # the window where the writer behavior could return and drop its
-    # pin list before the noticeboard thread dequeues the message —
-    # without pre-pinning the BOCCowns get freed to the recycle pool
-    # and the unpickle of the value's CownCapsules touches dangling
-    # pointers.
+    # Pre-pin every reachable cown on the writer thread (cown_pin_pointers INCREFs and returns raw pointers): the
+    # noticeboard thread transfers ownership without a second INCREF, closing the window where the writer could drop
+    # its pins before the message is dequeued, freeing the cowns to the recycle pool and leaving dangling pointers.
     pin_ptrs = _core.cown_pin_pointers(_gather_pins(value))
     _core.send("boc_noticeboard", ("noticeboard_write", key, value, pin_ptrs))
 
 
+def notice_seed(key: str, value: Any) -> None:
+    """Synchronously write a value to the noticeboard from the primary interpreter.
+
+    Unlike :func:`notice_write`, this commits **before it returns**:
+    the value is serialized and applied under the noticeboard mutex on
+    the calling thread, so once :func:`notice_seed` returns the entry
+    is live and visible to every behavior scheduled afterwards (and to
+    the calling thread's own subsequent :func:`notice_read`). It is the
+    recommended way to install read-mostly configuration on the
+    noticeboard *before* scheduling the behaviors that read it.
+
+    If the runtime is not yet running, :func:`notice_seed` starts it
+    (just like the first ``@when``), so seeding can be the first bocpy
+    call a program makes — no explicit :func:`start` is required.
+
+    **Primary interpreter only.** :func:`notice_seed` may be called only
+    from the primary interpreter (never from inside a ``@when`` body,
+    which runs on a worker). Calling it from a worker raises
+    :class:`RuntimeError`. Use :func:`notice_write` for fire-and-forget
+    writes from within behaviors.
+
+    :func:`notice_seed` is a plain overwrite and is intended for
+    *seeding* — installing values before the behaviors and noticeboard
+    mutations that read them are in flight. It is **not** a concurrent
+    update primitive: it does not provide the read-modify-write
+    atomicity of :func:`notice_update`, and a seed that races an
+    in-flight :func:`notice_update` on the same key may be lost (the
+    update's read-modify-write can overwrite it). Seed once, up front,
+    rather than interleaving seeds with concurrent updates.
+
+    The noticeboard supports up to 64 distinct keys. Values may embed
+    :class:`Cown` references; the noticeboard keeps each embedded cown
+    alive for as long as the entry remains.
+
+    :param key: The noticeboard key (max 63 UTF-8 bytes).
+    :type key: str
+    :param value: The value to store.
+    :type value: Any
+    :raises RuntimeError: If called from a worker interpreter.
+    """
+    if not _core.is_primary():
+        raise RuntimeError("notice_seed may only be called from the primary interpreter")
+    _validate_noticeboard_key(key)
+    if BEHAVIORS is None:
+        start(module=get_caller_module())
+    # Pre-pin every reachable cown (cown_pin_pointers INCREFs and returns raw pointers); the C entry adopts those refs
+    # under the noticeboard mutex, so the strong refs are taken while the originals are still live.
+    pin_ptrs = _core.cown_pin_pointers(_gather_pins(value))
+    _core.noticeboard_seed(key, value, pin_ptrs)
+
+
 def notice_update(key: str, fn: Callable[[Any], Any], default: Any = None) -> None:
     """Atomically update a noticeboard entry.
 
@@ -2450,34 +2098,3 @@ def notice_read(key: str, default: Any = None) -> Any:
     """
     _validate_noticeboard_key(key)
     return _core.noticeboard_snapshot().get(key, default)
-
-
-def notice_sync(timeout: Optional[float] = 30.0) -> None:
-    """Block until the caller's prior noticeboard mutations are committed.
-
-    Because :func:`notice_write`, :func:`notice_update`, and
-    :func:`notice_delete` are fire-and-forget, a behavior that wants
-    read-your-writes ordering against a *subsequent* behavior must call
-    ``notice_sync()`` after its writes. The call posts a sentinel onto
-    the ``boc_noticeboard`` tag (which is FIFO per producer) and blocks
-    until the noticeboard thread has drained that sentinel. By the time
-    this returns, every write/update/delete posted from the calling
-    thread before the sentinel has been applied to the noticeboard.
-
-    The barrier carries **no ordering guarantee** with respect to
-    writes posted from other threads or behaviors interleaved with the
-    caller's; it only flushes the caller's own queued mutations.
-
-    :param timeout: Maximum seconds to wait. ``None`` waits forever.
-        Defaults to 30 seconds.
-    :type timeout: float or None
-    :raises TimeoutError: If the noticeboard thread does not drain the
-        caller's sentinel within *timeout* seconds.
-    :raises RuntimeError: If the runtime is not started.
-    """
-    if _core.is_primary() and BEHAVIORS is None:
-        raise RuntimeError("cannot notice_sync before the runtime is started")
-    seq = _core.notice_sync_request()
-    _core.send("boc_noticeboard", ("sync", seq))
-    if not _core.notice_sync_wait(seq, timeout):
-        raise TimeoutError(f"notice_sync({timeout}s) timed out waiting for seq={seq}")
diff --git a/src/bocpy/boc_compat.c b/src/bocpy/boc_compat.c
index daf1990..dfae1bf 100644
--- a/src/bocpy/boc_compat.c
+++ b/src/bocpy/boc_compat.c
@@ -100,13 +100,8 @@ void thrd_sleep(const struct timespec *duration, struct timespec *remaining) {
   Sleep(ms);
 }
 
-// ---------------------------------------------------------------------------
-// Physical CPU detection (Windows arm). See boc_compat.h for contract.
-// ---------------------------------------------------------------------------
-
 int boc_physical_cpu_count(void) {
   DWORD len = 0;
-  // First call: query required buffer size.
   GetLogicalProcessorInformationEx(RelationProcessorCore, NULL, &len);
   if (len == 0) {
     return 0;
@@ -121,7 +116,6 @@ int boc_physical_cpu_count(void) {
     free(buf);
     return 0;
   }
-  // One record per physical core; count records.
   int count = 0;
   DWORD off = 0;
   while (off < len) {
@@ -138,17 +132,11 @@ int boc_physical_cpu_count(void) {
 
 #elif defined(__APPLE__)
 
-// ---------------------------------------------------------------------------
-// Physical CPU detection (macOS arm). See boc_compat.h for contract.
-// ---------------------------------------------------------------------------
-
 #include <sys/sysctl.h>
 
 int boc_physical_cpu_count(void) {
   int value = 0;
   size_t len = sizeof(value);
-  // hw.physicalcpu_max reports physical cores in the machine; fall back
-  // to hw.physicalcpu (cores currently available) if unavailable.
   if (sysctlbyname("hw.physicalcpu_max", &value, &len, NULL, 0) == 0 &&
       value > 0) {
     return value;
@@ -162,10 +150,6 @@ int boc_physical_cpu_count(void) {
 
 #else // assume Linux / glibc-compatible
 
-// ---------------------------------------------------------------------------
-// Physical CPU detection (Linux arm). See boc_compat.h for contract.
-// ---------------------------------------------------------------------------
-
 #include <ctype.h>
 #include <dirent.h>
 #include <sched.h>
@@ -193,16 +177,10 @@ static int boc_read_first_sibling(const char *path) {
 }
 
 int boc_physical_cpu_count(void) {
-  // Snapshot the affinity mask first so cgroup / taskset restrictions
-  // are honoured. Without this the count would be host-physical even
-  // inside a container with --cpuset-cpus=0-3.
   cpu_set_t affinity;
   CPU_ZERO(&affinity);
   bool have_affinity = (sched_getaffinity(0, sizeof(affinity), &affinity) == 0);
 
-  // Per-CPU sibling-leader array. Sized to the kernel's maximum
-  // reasonable CPU id; CPU_SETSIZE is 1024 on glibc, which exceeds
-  // any current hardware.
   enum { MAX_CPU = 4096 };
   int leaders[MAX_CPU];
   int leader_count = 0;
@@ -213,8 +191,6 @@ int boc_physical_cpu_count(void) {
   }
   struct dirent *ent;
   while ((ent = readdir(d)) != NULL) {
-    // Match "cpu<digits>" entries only. "cpuidle", "cpufreq", etc.
-    // share the prefix but are not per-CPU dirs.
     if (strncmp(ent->d_name, "cpu", 3) != 0) {
       continue;
     }
@@ -228,9 +204,6 @@ int boc_physical_cpu_count(void) {
       continue;
     }
 
-    // Skip CPUs outside our affinity mask: a worker scheduled on
-    // them could not run, so they don't contribute usable physical
-    // cores from this process's point of view.
     if (have_affinity && !CPU_ISSET((int)cpu_id, &affinity)) {
       continue;
     }
@@ -241,15 +214,10 @@ int boc_physical_cpu_count(void) {
              cpu_id);
     int leader = boc_read_first_sibling(path);
     if (leader < 0) {
-      // Topology unreadable (very old kernel, sysfs not mounted, etc.).
-      // Bail out so the caller falls back to the logical count.
       closedir(d);
       return 0;
     }
 
-    // Linear dedup: physical-core counts on real hardware are small
-    // (< 256 typical, < 1024 on the largest current servers), so
-    // O(n^2) over leaders is fine.
     bool seen = false;
     for (int i = 0; i < leader_count; ++i) {
       if (leaders[i] == leader) {
@@ -274,11 +242,6 @@ int boc_physical_cpu_count(void) {
 double boc_now_s(void) {
   const double S_PER_NS = 1.0e-9;
   struct timespec ts;
-  // Prefer clock_gettime on POSIX: timespec_get requires macOS 10.15+ while
-  // Python's default macOS deployment target is older, producing an
-  // -Wunguarded-availability-new warning. clock_gettime has been available on
-  // macOS since 10.12. Windows UCRT provides timespec_get but not
-  // clock_gettime, so fall back there.
 #ifdef _WIN32
   timespec_get(&ts, TIME_UTC);
 #else
@@ -291,17 +254,12 @@ double boc_now_s(void) {
 
 uint64_t boc_now_ns(void) {
 #ifdef _WIN32
-  // QueryPerformanceCounter is monotonic and high-resolution on every
-  // Windows version we target; the frequency is queried once and
-  // cached because it is constant for the lifetime of the system.
   static LARGE_INTEGER freq = {0};
   if (freq.QuadPart == 0) {
     QueryPerformanceFrequency(&freq);
   }
   LARGE_INTEGER counter;
   QueryPerformanceCounter(&counter);
-  // Convert ticks -> ns without overflow on a 64-bit counter for any
-  // realistic frequency (<= 10 GHz): split into seconds + remainder.
   uint64_t sec = (uint64_t)counter.QuadPart / (uint64_t)freq.QuadPart;
   uint64_t rem = (uint64_t)counter.QuadPart % (uint64_t)freq.QuadPart;
   return sec * 1000000000ULL + (rem * 1000000000ULL) / (uint64_t)freq.QuadPart;
diff --git a/src/bocpy/boc_compat.h b/src/bocpy/boc_compat.h
index 571cd71..203a592 100644
--- a/src/bocpy/boc_compat.h
+++ b/src/bocpy/boc_compat.h
@@ -43,16 +43,6 @@
 #include <stdint.h>
 #include <time.h>
 
-// ---------------------------------------------------------------------------
-// Cross-platform alignas / alignof shim
-// ---------------------------------------------------------------------------
-//
-// Portable C11-style alignment macros. MSVC's `<stdalign.h>` only
-// defines `alignas` / `alignof` when the compiler is invoked in C11
-// mode (`/std:c11` or later); the Python build does not pass that
-// flag, so we map directly to the underlying `__declspec(align(...))`
-// / `__alignof` intrinsics on MSVC and fall back to `<stdalign.h>`
-// elsewhere. C++ TUs always get the standard header.
 #if defined(__cplusplus)
 #include <stdalign.h>
 #elif defined(_MSC_VER)
@@ -80,17 +70,6 @@ typedef union boc_max_align {
   void (*_fp)(void);
 } boc_max_align_t;
 
-// ---------------------------------------------------------------------------
-// Memory-order tags
-// ---------------------------------------------------------------------------
-//
-// Used by the typed `boc_atomic_*_explicit` API below. Defined here
-// (above the platform fork) because both arms reference these tags.
-// Distinct integer constants so the MSVC dispatch can `switch` on
-// them; on POSIX they are mapped to `memory_order_*` by the
-// `boc_mo_to_std` helper inside the POSIX arm. Skip 1 to leave room
-// for `consume`.
-
 typedef enum {
   BOC_MO_RELAXED = 0,
   BOC_MO_ACQUIRE = 2,
@@ -99,16 +78,8 @@ typedef enum {
   BOC_MO_SEQ_CST = 5,
 } boc_memory_order_t;
 
-// ===========================================================================
-// Platform fork: Windows / Apple / other POSIX (Linux).
-// ===========================================================================
-
 #ifdef _WIN32
 
-// ---------------------------------------------------------------------------
-// Windows: headers, thread_local, yield
-// ---------------------------------------------------------------------------
-
 #define WIN32_LEAN_AND_MEAN
 #include <process.h>
 #include <windows.h>
@@ -116,10 +87,6 @@ typedef enum {
 #define thread_local __declspec(thread)
 #define boc_yield() SwitchToThread()
 
-// ---------------------------------------------------------------------------
-// Windows: legacy `atomic_*` polyfill on int_least64_t / intptr_t
-// ---------------------------------------------------------------------------
-
 typedef volatile int_least64_t atomic_int_least64_t;
 typedef volatile intptr_t atomic_intptr_t;
 
@@ -132,19 +99,6 @@ int_least64_t atomic_load(atomic_int_least64_t *ptr);
 int_least64_t atomic_exchange(atomic_int_least64_t *ptr, int_least64_t value);
 void atomic_store(atomic_int_least64_t *ptr, int_least64_t value);
 
-// ----- atomic_intptr_t siblings ---------------------------------------------
-// The MSVC polyfill defines `atomic_intptr_t` and `atomic_int_least64_t` as
-// distinct typedefs; the plain `atomic_load` / `atomic_store` / etc. above
-// only accept `atomic_int_least64_t *`. Without these siblings, code that
-// touches an `atomic_intptr_t` field (e.g. BOCRequest::next, BOCCown::last,
-// BOCRecycleQueue::head, BOCQueue::tag, NB_NOTICEBOARD_TID) would silently
-// pass a mistyped pointer to the int64 polyfill on Windows. On POSIX C11 the
-// same names are aliased to the generic atomic_* macros (which already
-// dispatch on type via _Generic), so user code below is platform-uniform.
-//
-// All Interlocked*Pointer intrinsics on x86/x64 are full barriers; the
-// pointer-width matches `intptr_t` on both Win32 and Win64 (CPython itself
-// requires a sane intptr_t == void* relationship).
 static inline intptr_t atomic_load_intptr(atomic_intptr_t *ptr) { return *ptr; }
 
 static inline void atomic_store_intptr(atomic_intptr_t *ptr, intptr_t value) {
@@ -169,14 +123,6 @@ static inline bool atomic_compare_exchange_strong_intptr(atomic_intptr_t *ptr,
   return false;
 }
 
-// All Interlocked* intrinsics on x86/x64 are full barriers, so the
-// memory_order argument is accepted but ignored.
-// Note: atomic_load_explicit uses a plain volatile read on x64 (TSO
-// provides acquire semantics). On x86 (32-bit), 64-bit volatile reads
-// are not atomic, but the legacy atomic_int_least64_t polyfill here is
-// only used for waiter counts and state fields where torn reads are
-// benign (protected by mutex re-checks). The typed boc_atomic_*_u64
-// API above uses _InterlockedCompareExchange64 for true atomicity.
 #if defined(_M_IX86)
 #define atomic_load_explicit(ptr, order)                                       \
   ((int_least64_t)InterlockedCompareExchange64((ptr), 0, 0))
@@ -193,10 +139,6 @@ static inline bool atomic_compare_exchange_strong_intptr(atomic_intptr_t *ptr,
 #endif
 #define memory_order_seq_cst 0
 
-// ---------------------------------------------------------------------------
-// Windows: BOCMutex / BOCCond on SRWLOCK + CONDITION_VARIABLE
-// ---------------------------------------------------------------------------
-
 typedef SRWLOCK BOCMutex;
 typedef CONDITION_VARIABLE BOCCond;
 
@@ -225,9 +167,6 @@ static inline void cnd_wait(BOCCond *c, BOCMutex *m) {
 /// @param m The mutex (must be held by caller)
 /// @return true if signalled (or spurious wake), false if the timeout expired
 static inline bool cnd_timedwait_s(BOCCond *c, BOCMutex *m, double seconds) {
-  // Negated form catches NaN (every comparison with NaN is false),
-  // which a bare `seconds < 0` test does not. Defence in depth
-  // for the public boundary helper `boc_validate_finite_timeout`.
   if (!(seconds >= 0.0))
     seconds = 0.0;
   DWORD ms = (DWORD)(seconds * 1000.0);
@@ -240,31 +179,11 @@ static inline bool cnd_timedwait_s(BOCCond *c, BOCMutex *m, double seconds) {
 
 void thrd_sleep(const struct timespec *duration, struct timespec *remaining);
 
-// ---------------------------------------------------------------------------
-// Windows: typed `boc_atomic_*_explicit` storage typedefs
-// ---------------------------------------------------------------------------
-//
-// `volatile T` storage with distinct typedefs per width so the
-// dispatch picks the right Interlocked* family. Note these are ordinary
-// `volatile`, NOT C11 `_Atomic` — MSVC's `_Atomic` is gated behind
-// `<stdatomic.h>` (VS 2022 17.5+) which is above bocpy's VS 2019 floor.
-
 typedef volatile uint64_t boc_atomic_u64_t;
 typedef volatile uint32_t boc_atomic_u32_t;
-typedef volatile uint8_t boc_atomic_bool_t; // sizeof(bool) == 1
+typedef volatile uint8_t boc_atomic_bool_t;
 typedef void *volatile boc_atomic_ptr_t;
 
-// ---------------------------------------------------------------------------
-// Windows: typed `boc_atomic_*_explicit` implementations
-// ---------------------------------------------------------------------------
-//
-// Switch on order, dispatch to Interlocked*. On x86/x64 every
-// Interlocked* intrinsic is a full barrier, so all orderings collapse
-// to the unsuffixed form (which is correct for any requested
-// ordering). On ARM64 we pick the matching `_acq`/`_rel`/`_nf`
-// variant. `BOC_MO_ACQ_REL` and `BOC_MO_SEQ_CST` use the unsuffixed
-// (full barrier) form on every target.
-
 #if defined(_M_ARM64)
 #define BOC_IL_LOAD64_ACQ(p)                                                   \
   ((uint64_t)__ldar64((unsigned __int64 const volatile *)(p)))
@@ -280,8 +199,6 @@ typedef void *volatile boc_atomic_ptr_t;
   __stlr8((unsigned __int8 volatile *)(p), (unsigned __int8)(v))
 #endif
 
-// ---- u64 -------------------------------------------------------------------
-
 static inline uint64_t boc_atomic_load_u64_explicit(boc_atomic_u64_t *p,
                                                     boc_memory_order_t order) {
 #if defined(_M_ARM64)
@@ -294,10 +211,6 @@ static inline uint64_t boc_atomic_load_u64_explicit(boc_atomic_u64_t *p,
     return BOC_IL_LOAD64_ACQ(p);
   }
 #elif defined(_M_IX86)
-  // On x86, a 64-bit volatile read is not atomic (two 32-bit loads).
-  // Use _InterlockedCompareExchange64(p, 0, 0) which atomically reads
-  // the value without modifying it (CAS that "replaces" 0 with 0 if
-  // matched; either way returns the current value).
   (void)order;
   return (uint64_t)_InterlockedCompareExchange64((volatile __int64 *)p, 0, 0);
 #else
@@ -322,8 +235,6 @@ static inline void boc_atomic_store_u64_explicit(boc_atomic_u64_t *p,
     return;
   }
 #elif defined(_M_IX86)
-  // On x86, a 64-bit volatile write is not atomic. Use an exchange
-  // via CAS loop to atomically store the value.
   (void)order;
   __int64 old = *((volatile __int64 *)p);
   for (;;) {
@@ -357,8 +268,6 @@ boc_atomic_exchange_u64_explicit(boc_atomic_u64_t *p, uint64_t v,
     return (uint64_t)_InterlockedExchange64((volatile __int64 *)p, (__int64)v);
   }
 #elif defined(_M_IX86)
-  // x86 lacks _InterlockedExchange64; emulate with a CAS loop using
-  // _InterlockedCompareExchange64 (which is available on x86).
   (void)order;
   __int64 old = *((volatile __int64 *)p);
   for (;;) {
@@ -429,7 +338,6 @@ boc_atomic_fetch_add_u64_explicit(boc_atomic_u64_t *p, uint64_t v,
                                                (__int64)v);
   }
 #elif defined(_M_IX86)
-  // x86 lacks _InterlockedExchangeAdd64; emulate with a CAS loop.
   (void)order;
   __int64 old = *((volatile __int64 *)p);
   for (;;) {
@@ -452,8 +360,6 @@ boc_atomic_fetch_sub_u64_explicit(boc_atomic_u64_t *p, uint64_t v,
   return boc_atomic_fetch_add_u64_explicit(p, (uint64_t)(-(int64_t)v), order);
 }
 
-// ---- u32 -------------------------------------------------------------------
-
 static inline uint32_t boc_atomic_load_u32_explicit(boc_atomic_u32_t *p,
                                                     boc_memory_order_t order) {
 #if defined(_M_ARM64)
@@ -574,13 +480,6 @@ boc_atomic_fetch_sub_u32_explicit(boc_atomic_u32_t *p, uint32_t v,
   return boc_atomic_fetch_add_u32_explicit(p, (uint32_t)(-(int32_t)v), order);
 }
 
-// ---- bool (uint8_t storage) ------------------------------------------------
-// MSVC has no Interlocked*8 with order suffixes pre-VS-2022; we use the
-// unsuffixed Interlocked*8 (full barrier) for exchange/cas, which satisfies
-// any requested ordering. Plain volatile load/store on a 1-byte slot is
-// atomic on every supported MSVC target (ARM64 included; the architecture
-// guarantees aligned single-byte access atomicity).
-
 static inline bool boc_atomic_load_bool_explicit(boc_atomic_bool_t *p,
                                                  boc_memory_order_t order) {
 #if defined(_M_ARM64)
@@ -639,13 +538,8 @@ static inline bool boc_atomic_compare_exchange_strong_bool_explicit(
   return false;
 }
 
-// ---- ptr -------------------------------------------------------------------
-
 static inline void *boc_atomic_load_ptr_explicit(boc_atomic_ptr_t *p,
                                                  boc_memory_order_t order) {
-  // InterlockedCompareExchangePointerNoFence is the cleanest way to express
-  // a relaxed atomic pointer load, but a plain volatile read suffices on
-  // every supported target (pointer width matches the natural word size).
   (void)order;
   return (void *)*p;
 }
@@ -685,12 +579,6 @@ static inline bool boc_atomic_compare_exchange_strong_ptr_explicit(
   return false;
 }
 
-// Standalone memory fence. `MemoryBarrier()` is a full hardware
-// barrier on every supported MSVC target (x86, x64, ARM64) and
-// matches the strongest standalone fence we ever need from this
-// helper. Mapping every `BOC_MO_*` to a full barrier is correct
-// (over-strong is safe; under-strong is not) and keeps the
-// implementation a one-liner.
 static inline void boc_atomic_thread_fence_explicit(boc_memory_order_t o) {
   (void)o;
   MemoryBarrier();
@@ -698,10 +586,6 @@ static inline void boc_atomic_thread_fence_explicit(boc_memory_order_t o) {
 
 #else // _WIN32
 
-// ---------------------------------------------------------------------------
-// POSIX (Apple + Linux): shared headers, thread_local, yield, intptr aliases
-// ---------------------------------------------------------------------------
-
 #include <errno.h>
 #include <sched.h>
 #include <stdatomic.h>
@@ -710,11 +594,6 @@ static inline void boc_atomic_thread_fence_explicit(boc_memory_order_t o) {
 #define thread_local _Thread_local
 #define boc_yield() sched_yield()
 
-// On POSIX the C11 atomic_* macros dispatch on type via _Generic, so the
-// `atomic_load(&intptr_var)` form Just Works. The `_intptr` siblings are
-// aliased to the generic forms purely so the source reads the same on
-// every platform; on Windows they expand to dedicated InterlockedXxxPointer
-// shims (see polyfill block above).
 #define atomic_load_intptr(ptr) atomic_load(ptr)
 #define atomic_store_intptr(ptr, val) atomic_store((ptr), (val))
 #define atomic_exchange_intptr(ptr, val) atomic_exchange((ptr), (val))
@@ -723,10 +602,6 @@ static inline void boc_atomic_thread_fence_explicit(boc_memory_order_t o) {
 
 #ifdef __APPLE__
 
-// ---------------------------------------------------------------------------
-// Apple: pthread-based BOCMutex / BOCCond
-// ---------------------------------------------------------------------------
-
 #include <pthread.h>
 #define thrd_sleep nanosleep
 
@@ -758,9 +633,6 @@ static inline void cnd_wait(BOCCond *c, BOCMutex *m) {
 /// @param m The mutex (must be held by caller)
 /// @return true if signalled (or spurious wake), false if the timeout expired
 static inline bool cnd_timedwait_s(BOCCond *c, BOCMutex *m, double seconds) {
-  // Negated form catches NaN (every comparison with NaN is false),
-  // which a bare `seconds < 0` test does not. Defence in depth
-  // for the public boundary helper `boc_validate_finite_timeout`.
   if (!(seconds >= 0.0))
     seconds = 0.0;
   struct timespec ts;
@@ -778,10 +650,6 @@ static inline bool cnd_timedwait_s(BOCCond *c, BOCMutex *m, double seconds) {
 
 #else // __APPLE__
 
-// ---------------------------------------------------------------------------
-// Linux (and other non-Apple POSIX): C11 <threads.h>-based BOCMutex / BOCCond
-// ---------------------------------------------------------------------------
-
 #include <threads.h>
 
 typedef mtx_t BOCMutex;
@@ -794,9 +662,6 @@ static inline void boc_mtx_init(BOCMutex *m) { mtx_init(m, mtx_plain); }
 /// @param m The mutex (must be held by caller)
 /// @return true if signalled (or spurious wake), false if the timeout expired
 static inline bool cnd_timedwait_s(BOCCond *c, BOCMutex *m, double seconds) {
-  // Negated form catches NaN (every comparison with NaN is false),
-  // which a bare `seconds < 0` test does not. Defence in depth
-  // for the public boundary helper `boc_validate_finite_timeout`.
   if (!(seconds >= 0.0))
     seconds = 0.0;
   struct timespec ts;
@@ -814,13 +679,6 @@ static inline bool cnd_timedwait_s(BOCCond *c, BOCMutex *m, double seconds) {
 
 #endif // __APPLE__
 
-// ---------------------------------------------------------------------------
-// POSIX: typed `boc_atomic_*_explicit` API on top of <stdatomic.h>
-// ---------------------------------------------------------------------------
-//
-// The compiler folds these wrappers away. Legacy `atomic_*` callers
-// are unaffected; the new API is purely additive.
-
 typedef _Atomic uint64_t boc_atomic_u64_t;
 typedef _Atomic uint32_t boc_atomic_u32_t;
 typedef _Atomic bool boc_atomic_bool_t;
@@ -866,8 +724,6 @@ BOC_ATOMIC_OPS_(u64, uint64_t, boc_atomic_u64_t)
 BOC_ATOMIC_OPS_(u32, uint32_t, boc_atomic_u32_t)
 BOC_ATOMIC_OPS_(bool, bool, boc_atomic_bool_t)
 
-// `ptr` carries `void *` payload but the underlying storage is
-// `_Atomic(void *)`; cast at the API edge to keep call sites clean.
 static inline void *boc_atomic_load_ptr_explicit(boc_atomic_ptr_t *p,
                                                  boc_memory_order_t o) {
   return atomic_load_explicit(p, boc_mo_to_std(o));
@@ -904,19 +760,12 @@ BOC_ATOMIC_FETCH_OPS_(u32, uint32_t, boc_atomic_u32_t)
 #undef BOC_ATOMIC_OPS_
 #undef BOC_ATOMIC_FETCH_OPS_
 
-// Standalone memory fence. POSIX delegates to `atomic_thread_fence`
-// from `<stdatomic.h>`; the helper exists so MSVC can express the
-// same operation via `MemoryBarrier()` without C11 atomics.
 static inline void boc_atomic_thread_fence_explicit(boc_memory_order_t o) {
   atomic_thread_fence(boc_mo_to_std(o));
 }
 
 #endif // _WIN32
 
-// ===========================================================================
-// Cross-platform monotonic time / sleep helpers
-// ===========================================================================
-
 /// @brief Returns the current time as double-precision seconds.
 /// @return the current time
 double boc_now_s(void);
@@ -969,51 +818,18 @@ uint64_t boc_now_ns(void);
 /// @param ns Nanoseconds to sleep. Zero is a no-op return.
 void boc_sleep_ns(uint64_t ns);
 
-// ===========================================================================
-// Cross-platform timeout-validation helper
-// ===========================================================================
-//
-// Public boundary helper for the @c terminator_wait / @c notice_sync_wait
-// entry points. Centralising the NaN/Inf/negative classification here
-// keeps the policy in one place: NaN is a programmer error and surfaces
-// as @c ValueError; +Inf is "wait forever"; negative is clamped to 0
-// (no-wait, returns immediately). Without this, NaN passed straight
-// to @c cnd_timedwait_s would compute @c DWORD ms via @c (DWORD)(NaN *
-// 1000.0) — undefined behaviour on Windows and a wedged-forever wait
-// on POSIX.
-//
-// Returns 0 on success (with @p *wait_forever set); -1 on failure with
-// a Python exception set.
-
 static inline int boc_validate_finite_timeout(double seconds,
                                               double *out_seconds,
                                               bool *out_wait_forever) {
-  // NaN: a comparison with NaN is always false, so `seconds == seconds`
-  // is the canonical portable NaN check (no math.h dependency).
   if (seconds != seconds) {
     PyErr_SetString(PyExc_ValueError, "timeout must not be NaN");
     return -1;
   }
-  // +Inf or any value that the cnd_timedwait clamp would treat as
-  // "wait forever" maps to wait_forever=true. Use a finite sentinel
-  // (DBL_MAX) rather than HUGE_VAL to keep the helper free of math.h
-  // — the operational meaning is identical.
-  //
-  // We clamp at 1e9 seconds (~31.7 years) rather than DBL_MAX so
-  // any caller-supplied value that would overflow `time_t` (signed
-  // 32-bit on some platforms: ~68 years) or the `DWORD` millisecond
-  // arg to Win32 `SleepConditionVariableSRW` (max ~49 days) also
-  // routes through the wait-forever path. Operationally a 31-year
-  // wait is indistinguishable from "wait forever" for any realistic
-  // bocpy caller, and the clamp is the only safe way to avoid
-  // platform-dependent overflow into a sub-second wait or UB.
   if (seconds > 1e9) {
     *out_seconds = 0.0;
     *out_wait_forever = true;
     return 0;
   }
-  // Negative: caller asked for "no wait". Clamp to 0 and return; the
-  // wait helpers will short-circuit with a timeout immediately.
   if (seconds < 0.0) {
     *out_seconds = 0.0;
     *out_wait_forever = false;
diff --git a/src/bocpy/boc_noticeboard.c b/src/bocpy/boc_noticeboard.c
index 7c6f4be..624c347 100644
--- a/src/bocpy/boc_noticeboard.c
+++ b/src/bocpy/boc_noticeboard.c
@@ -9,17 +9,11 @@
 ///   - The per-thread snapshot cache (dict, proxy, version, checked
 ///     flag).
 ///   - The single-writer thread-identity check (@c NB_NOTICEBOARD_TID).
-///   - The notice_sync barrier primitives (@c NB_SYNC_REQUESTED,
-///     @c NB_SYNC_PROCESSED, @c NB_SYNC_MUTEX, @c NB_SYNC_COND).
 
 #include "boc_noticeboard.h"
 
 #include <string.h>
 
-// ---------------------------------------------------------------------------
-// File-scope state.
-// ---------------------------------------------------------------------------
-
 /// @brief A single noticeboard entry.
 typedef struct nb_entry {
   /// @brief The key for this entry (null-terminated UTF-8).
@@ -64,38 +58,17 @@ static thread_local PyObject *NB_SNAPSHOT_PROXY = NULL;
 ///        unset.
 static atomic_intptr_t NB_NOTICEBOARD_TID = 0;
 
-/// @brief Monotonic counter incremented by every notice_sync caller.
-static atomic_int_least64_t NB_SYNC_REQUESTED = 0;
-
-/// @brief Highest sequence number processed by the noticeboard thread.
-static atomic_int_least64_t NB_SYNC_PROCESSED = 0;
-
-/// @brief Mutex protecting NB_SYNC_COND.
-static BOCMutex NB_SYNC_MUTEX;
-
-/// @brief Condition variable signalled when NB_SYNC_PROCESSED advances.
-static BOCCond NB_SYNC_COND;
-
-// ---------------------------------------------------------------------------
-// Module init / teardown.
-// ---------------------------------------------------------------------------
-
 void noticeboard_init(void) {
   memset(&NB, 0, sizeof(NB));
   boc_mtx_init(&NB.mutex);
-  boc_mtx_init(&NB_SYNC_MUTEX);
-  cnd_init(&NB_SYNC_COND);
 }
 
 void noticeboard_destroy(void) {
-  // Drop the calling thread's snapshot cache before freeing entries.
   Py_CLEAR(NB_SNAPSHOT_PROXY);
   Py_CLEAR(NB_SNAPSHOT_CACHE);
   NB_SNAPSHOT_VERSION = -1;
   NB_VERSION_CHECKED = false;
 
-  // Collect entries to free after releasing the mutex — XIDATA_FREE
-  // and COWN_DECREF can run Python __del__ which may re-enter.
   XIDATA_T *to_free[NB_MAX_ENTRIES];
   int to_free_count = 0;
   BOCCown **to_unpin[NB_MAX_ENTRIES];
@@ -131,17 +104,8 @@ void noticeboard_destroy(void) {
   }
 
   mtx_destroy(&NB.mutex);
-  // NB_SYNC_MUTEX / NB_SYNC_COND are SRWLOCK / CONDITION_VARIABLE on
-  // Windows (no destroy needed) and pthread / mtx_t on POSIX (handled
-  // by mtx_destroy / cnd_destroy in boc_compat.h shims). The original
-  // _core.c module-free path never destroyed these; preserve that
-  // behaviour to keep the symbol-additions-only invariant.
 }
 
-// ---------------------------------------------------------------------------
-// Single-writer thread-identity check.
-// ---------------------------------------------------------------------------
-
 int noticeboard_check_thread(const char *op_name) {
   uintptr_t owner = (uintptr_t)atomic_load_intptr(&NB_NOTICEBOARD_TID);
   if (owner == 0) {
@@ -159,10 +123,6 @@ int noticeboard_check_thread(const char *op_name) {
 int noticeboard_set_thread(void) {
   intptr_t expected = 0;
   intptr_t self_id = (intptr_t)(uintptr_t)PyThread_get_thread_ident();
-  // One-shot per runtime: refuse if the slot is already owned.
-  // noticeboard_clear_thread() resets NB_NOTICEBOARD_TID to 0 at
-  // stop(), so a fresh start() cycle is fine. This closes the
-  // hijack-the-mutator-slot hole identified by the security lens.
   if (!atomic_compare_exchange_strong_intptr(&NB_NOTICEBOARD_TID, &expected,
                                              self_id)) {
     PyErr_SetString(PyExc_RuntimeError,
@@ -177,10 +137,6 @@ void noticeboard_clear_thread(void) {
   (void)atomic_exchange_intptr(&NB_NOTICEBOARD_TID, (intptr_t)0);
 }
 
-// ---------------------------------------------------------------------------
-// Snapshot cache primitives.
-// ---------------------------------------------------------------------------
-
 void noticeboard_drop_local_cache(void) {
   Py_CLEAR(NB_SNAPSHOT_PROXY);
   Py_CLEAR(NB_SNAPSHOT_CACHE);
@@ -190,10 +146,6 @@ void noticeboard_drop_local_cache(void) {
 
 void noticeboard_cache_clear_for_behavior(void) { NB_VERSION_CHECKED = false; }
 
-// ---------------------------------------------------------------------------
-// Pin helper.
-// ---------------------------------------------------------------------------
-
 int nb_pin_cowns(PyObject *cowns, BOCCown ***out_array, int *out_count) {
   *out_array = NULL;
   *out_count = 0;
@@ -226,11 +178,6 @@ int nb_pin_cowns(PyObject *cowns, BOCCown ***out_array, int *out_count) {
     PyObject *item = PySequence_Fast_GET_ITEM(seq, i);
     BOCCown *cown = (BOCCown *)PyLong_AsVoidPtr(item);
     if (cown == NULL) {
-      // PyLong_AsVoidPtr returns NULL both on error and for integer 0.
-      // Reject both paths explicitly: a NULL pin would be dereferenced
-      // downstream (COWN_DECREF on NULL is UB), and an integer 0 is
-      // indistinguishable from a crafted attacker pin pointing at the
-      // zero page.
       if (!PyErr_Occurred()) {
         PyErr_SetString(PyExc_ValueError,
                         "noticeboard pin list must not contain NULL / "
@@ -251,7 +198,6 @@ int nb_pin_cowns(PyObject *cowns, BOCCown ***out_array, int *out_count) {
   return 0;
 
 fail:
-  // Release every transferred ref the writer pre-INCREFed for us.
   for (int i = 0; i < taken; i++) {
     COWN_DECREF(pins[i]);
   }
@@ -269,10 +215,6 @@ int nb_pin_cowns(PyObject *cowns, BOCCown ***out_array, int *out_count) {
   return -1;
 }
 
-// ---------------------------------------------------------------------------
-// Mutations.
-// ---------------------------------------------------------------------------
-
 int noticeboard_write(const char *key, Py_ssize_t key_len, XIDATA_T *xidata,
                       bool pickled, BOCCown **pins, int pin_count) {
   if (key_len >= NB_KEY_SIZE) {
@@ -310,9 +252,6 @@ int noticeboard_write(const char *key, Py_ssize_t key_len, XIDATA_T *xidata,
     target->pinned_count = 0;
   }
 
-  // Stash old value and old pins to free after releasing the mutex —
-  // XIDATA_FREE / COWN_DECREF may invoke Python __del__ which could
-  // re-enter the noticeboard.
   XIDATA_T *old_value = target->value;
   BOCCown **old_pins = target->pinned_cowns;
   int old_pin_count = target->pinned_count;
@@ -338,7 +277,6 @@ int noticeboard_write(const char *key, Py_ssize_t key_len, XIDATA_T *xidata,
   return 0;
 
 fail:
-  // Roll back: free the new XIData and decref the new pins.
   if (xidata != NULL) {
     XIDATA_FREE(xidata);
   }
@@ -439,24 +377,15 @@ void noticeboard_clear(void) {
     PyMem_RawFree(to_unpin[i]);
   }
 
-  // Drop this thread's cache so a subsequent same-thread snapshot
-  // does not reuse a stale proxy. Other threads will revalidate via
-  // NB_VERSION.
   noticeboard_drop_local_cache();
 }
 
-// ---------------------------------------------------------------------------
-// Snapshot.
-// ---------------------------------------------------------------------------
-
 PyObject *noticeboard_snapshot(PyObject *loads) {
   if (NB_SNAPSHOT_PROXY != NULL) {
     if (NB_VERSION_CHECKED) {
-      // Within-behavior repeat call: same proxy, no atomic load.
       Py_INCREF(NB_SNAPSHOT_PROXY);
       return NB_SNAPSHOT_PROXY;
     }
-    // First snapshot call this behavior: do exactly one version check.
     int_least64_t current = atomic_load(&NB_VERSION);
     if (current == NB_SNAPSHOT_VERSION) {
       NB_VERSION_CHECKED = true;
@@ -471,22 +400,10 @@ PyObject *noticeboard_snapshot(PyObject *loads) {
     return NULL;
   }
 
-  // Deferred entries: pickled values whose bytes were extracted under
-  // mutex but need unpickling outside the lock.
   PyObject *deferred_keys[NB_MAX_ENTRIES];
   PyObject *deferred_bytes[NB_MAX_ENTRIES];
   int deferred_count = 0;
 
-  // Keepalive pins: while we hold the mutex we take an extra
-  // COWN_INCREF on every pin reachable from a deferred (pickled)
-  // entry. The bytes we are about to unpickle outside the mutex
-  // contain raw BOCCown pointers whose validity depends on the
-  // entry's pin list. Without this extra ref, a concurrent writer
-  // could overwrite the entry the instant we drop the mutex, release
-  // the old pins, and free the BOCCowns before we touch them — UAF
-  // in _cown_capsule_from_pointer. Released after the deferred
-  // unpickling completes. Each deferred entry contributes a heap-
-  // allocated pin pointer array sized to its pin count.
   BOCCown **keepalive_pins[NB_MAX_ENTRIES];
   int keepalive_counts[NB_MAX_ENTRIES];
   for (int i = 0; i < NB_MAX_ENTRIES; i++) {
@@ -496,9 +413,6 @@ PyObject *noticeboard_snapshot(PyObject *loads) {
 
   mtx_lock(&NB.mutex);
 
-  // Capture the noticeboard version while still holding the mutex so
-  // that no concurrent writer can bump it between snapshot completion
-  // and version capture.
   int_least64_t built_version = atomic_load(&NB_VERSION);
 
   for (int i = 0; i < NB.count; i++) {
@@ -507,7 +421,6 @@ PyObject *noticeboard_snapshot(PyObject *loads) {
       continue;
     }
 
-    // XIDATA_NEWOBJECT is lightweight (no Python code execution).
     PyObject *raw = XIDATA_NEWOBJECT(entry->value);
     if (raw == NULL) {
       mtx_unlock(&NB.mutex);
@@ -522,7 +435,6 @@ PyObject *noticeboard_snapshot(PyObject *loads) {
     }
 
     if (!entry->pickled) {
-      // Non-pickled: add directly to dict.
       if (PyDict_SetItem(dict, key, raw) < 0) {
         Py_DECREF(key);
         Py_DECREF(raw);
@@ -532,9 +444,6 @@ PyObject *noticeboard_snapshot(PyObject *loads) {
       Py_DECREF(key);
       Py_DECREF(raw);
     } else {
-      // Pickled: defer unpickling to outside the mutex. Take a fresh
-      // COWN_INCREF on every pin so the BOCCowns referenced by the
-      // bytes survive past mtx_unlock — see keepalive_pins comment.
       if (entry->pinned_count > 0) {
         BOCCown **pins = (BOCCown **)PyMem_RawMalloc(sizeof(BOCCown *) *
                                                      entry->pinned_count);
@@ -560,7 +469,6 @@ PyObject *noticeboard_snapshot(PyObject *loads) {
 
   mtx_unlock(&NB.mutex);
 
-  // Unpickle deferred entries outside the mutex.
   for (int i = 0; i < deferred_count; i++) {
     PyObject *value = PyObject_CallOneArg(loads, deferred_bytes[i]);
     Py_DECREF(deferred_bytes[i]);
@@ -569,13 +477,10 @@ PyObject *noticeboard_snapshot(PyObject *loads) {
     if (value == NULL) {
       Py_DECREF(deferred_keys[i]);
       deferred_keys[i] = NULL;
-      // Clean up remaining deferred entries.
       for (int j = i + 1; j < deferred_count; j++) {
         Py_DECREF(deferred_keys[j]);
         Py_DECREF(deferred_bytes[j]);
       }
-      // Release every keepalive pin (including the one for this
-      // entry).
       for (int j = 0; j < deferred_count; j++) {
         if (keepalive_pins[j] != NULL) {
           for (int k = 0; k < keepalive_counts[j]; k++) {
@@ -612,9 +517,6 @@ PyObject *noticeboard_snapshot(PyObject *loads) {
     Py_DECREF(deferred_keys[i]);
     Py_DECREF(value);
 
-    // Successful unpickle: the snapshot dict (and its CownCapsules)
-    // now hold their own refs on every BOCCown referenced by the
-    // bytes. Drop our keepalive pin for this entry.
     if (keepalive_pins[i] != NULL) {
       for (int k = 0; k < keepalive_counts[i]; k++) {
         COWN_DECREF(keepalive_pins[i][k]);
@@ -630,9 +532,6 @@ PyObject *noticeboard_snapshot(PyObject *loads) {
     return NULL;
   }
 
-  // The proxy holds a strong reference to dict; we keep our own as
-  // well so that the dict is reachable for direct mutation in the
-  // rebuild path and the proxy survives at least as long as the dict.
   NB_SNAPSHOT_CACHE = dict;
   NB_SNAPSHOT_PROXY = proxy;
   NB_SNAPSHOT_VERSION = built_version;
@@ -655,48 +554,3 @@ PyObject *noticeboard_snapshot(PyObject *loads) {
   Py_DECREF(dict);
   return NULL;
 }
-
-// ---------------------------------------------------------------------------
-// notice_sync barrier.
-// ---------------------------------------------------------------------------
-
-int_least64_t notice_sync_request(void) {
-  return atomic_fetch_add(&NB_SYNC_REQUESTED, 1) + 1;
-}
-
-void notice_sync_complete(int_least64_t seq) {
-  mtx_lock(&NB_SYNC_MUTEX);
-  // Defense in depth: with a single noticeboard thread draining the
-  // FIFO boc_noticeboard tag, `seq` arrives strictly monotonically
-  // and a plain `atomic_store(seq)` would be correct. We keep the
-  // max-of pattern so that if a future change introduces a second
-  // mutator thread or any out-of-order delivery, NB_SYNC_PROCESSED
-  // can never regress and unblock waiters early.
-  int_least64_t cur = atomic_load(&NB_SYNC_PROCESSED);
-  if (seq > cur) {
-    atomic_store(&NB_SYNC_PROCESSED, seq);
-  }
-  cnd_broadcast(&NB_SYNC_COND);
-  mtx_unlock(&NB_SYNC_MUTEX);
-}
-
-bool notice_sync_wait(int_least64_t seq, double timeout, bool wait_forever) {
-  bool ok = true;
-  double end_time = wait_forever ? 0.0 : boc_now_s() + timeout;
-
-  mtx_lock(&NB_SYNC_MUTEX);
-  while (atomic_load(&NB_SYNC_PROCESSED) < seq) {
-    if (!wait_forever) {
-      double now = boc_now_s();
-      if (now >= end_time) {
-        ok = false;
-        break;
-      }
-      cnd_timedwait_s(&NB_SYNC_COND, &NB_SYNC_MUTEX, end_time - now);
-    } else {
-      cnd_wait(&NB_SYNC_COND, &NB_SYNC_MUTEX);
-    }
-  }
-  mtx_unlock(&NB_SYNC_MUTEX);
-  return ok;
-}
diff --git a/src/bocpy/boc_noticeboard.h b/src/bocpy/boc_noticeboard.h
index 1352cea..cc77098 100644
--- a/src/bocpy/boc_noticeboard.h
+++ b/src/bocpy/boc_noticeboard.h
@@ -20,8 +20,7 @@
 /// API (@ref noticeboard_snapshot, @ref nb_pin_cowns,
 /// @ref noticeboard_write, @ref noticeboard_delete) set a Python
 /// exception and return -1 / NULL on failure. Functions that are
-/// pure C (@ref noticeboard_clear,
-/// @ref notice_sync_*) cannot fail.
+/// pure C (@ref noticeboard_clear) cannot fail.
 
 #ifndef BOCPY_NOTICEBOARD_H
 #define BOCPY_NOTICEBOARD_H
@@ -42,7 +41,7 @@
 /// @brief Maximum size of a key, including the trailing NUL byte.
 #define NB_KEY_SIZE 64
 
-/// @brief Initialize the noticeboard's mutex and notice_sync primitives.
+/// @brief Initialize the noticeboard's mutex.
 /// @details Called once at module init.
 void noticeboard_init(void);
 
@@ -136,18 +135,4 @@ void noticeboard_clear(void);
 ///         (PyErr set).
 PyObject *noticeboard_snapshot(PyObject *loads);
 
-/// @brief Reserve a fresh notice_sync sequence number.
-int_least64_t notice_sync_request(void);
-
-/// @brief Mark @p seq as processed and wake any @ref notice_sync_wait
-///        callers.
-void notice_sync_complete(int_least64_t seq);
-
-/// @brief Block the calling thread until @p seq has been processed.
-/// @param seq The sequence number returned by @ref notice_sync_request.
-/// @param timeout Maximum wait in seconds. Ignored if @p wait_forever.
-/// @param wait_forever If true, ignore @p timeout and wait until signalled.
-/// @return true if @p seq has been processed, false on timeout.
-bool notice_sync_wait(int_least64_t seq, double timeout, bool wait_forever);
-
 #endif // BOCPY_NOTICEBOARD_H
diff --git a/src/bocpy/boc_sched.c b/src/bocpy/boc_sched.c
index 2d12b85..aa2fa52 100644
--- a/src/bocpy/boc_sched.c
+++ b/src/bocpy/boc_sched.c
@@ -1,13 +1,3 @@
-// boc_sched.c — Work-stealing scheduler.
-//
-// Owns the per-worker MPMC queues, parking protocol, work-stealing,
-// and per-worker fairness tokens.
-//
-// Verona reference: `verona-rt/src/rt/sched/schedulerstats.h` (counter
-// POD subset), `mpmcq.h` (MPMC queue), `schedulerthread.h`
-// (`get_work` / `try_steal` / `steal`), `threadpool.h` (per-start
-// `incarnation` counter; pause/unpause epoch protocol), and
-// `core.h` (fairness token).
 
 #include "boc_sched.h"
 
@@ -19,51 +9,34 @@
 
 #include <Python.h>
 
-// ===========================================================================
-// Verona MPMC behaviour queue (`boc_bq_*`) — port of
-// `verona-rt/src/rt/sched/mpmcq.h`. Memory orderings match Verona
-// line-for-line. Cited line numbers refer to that file.
-// ===========================================================================
-
 void boc_bq_init(boc_bq_t *q) {
-  // Empty representation: back == &front, front == NULL (mpmcq.h:33-37).
-  // Use relaxed stores during init: callers must publish the queue
-  // through their own release edge before any thread observes it.
   boc_atomic_store_ptr_explicit(&q->front, NULL, BOC_MO_RELAXED);
   boc_atomic_store_ptr_explicit(&q->back, (void *)&q->front, BOC_MO_RELAXED);
 }
 
 void boc_bq_destroy_assert_empty(boc_bq_t *q) {
-  // Mirrors ~MPMCQ (mpmcq.h:213-217).
   assert(boc_bq_is_empty(q));
   (void)q;
 }
 
 boc_bq_node_t *boc_bq_acquire_front(boc_bq_t *q) {
-  // Mirrors MPMCQ::acquire_front (mpmcq.h:41-56).
   BOC_SCHED_YIELD();
 
-  // Nothing in the queue (mpmcq.h:46).
+  // Relaxed probe is a fast-path skip; the ACQUIRE exchange is the real fence
+  // that claims exclusive ownership of the front chain (mpmcq.h).
   if (boc_atomic_load_ptr_explicit(&q->front, BOC_MO_RELAXED) == NULL) {
     return NULL;
   }
 
   BOC_SCHED_YIELD();
 
-  // Remove head element. This is like locking the queue for other
-  // removals (mpmcq.h:55).
   return (boc_bq_node_t *)boc_atomic_exchange_ptr_explicit(&q->front, NULL,
                                                            BOC_MO_ACQUIRE);
 }
 
 void boc_bq_enqueue_segment(boc_bq_t *q, boc_bq_segment_t s) {
-  // Mirrors MPMCQ::enqueue_segment (mpmcq.h:97-115).
   BOC_SCHED_YIELD();
 
-  // The element we are writing into must have its next pointer NULL
-  // before the back-exchange (mpmcq.h:103); writes to the segment's
-  // tail link use relaxed because the publish below carries the
-  // happens-before edge.
   boc_atomic_store_ptr_explicit(s.end, NULL, BOC_MO_RELAXED);
 
   BOC_SCHED_YIELD();
@@ -73,41 +46,31 @@ void boc_bq_enqueue_segment(boc_bq_t *q, boc_bq_segment_t s) {
 
   BOC_SCHED_YIELD();
 
-  // The previous back's slot must currently be NULL (its enqueuer set
-  // it that way); we now publish our segment's start there with a
-  // release store so consumers reading through next_in_queue with
-  // acquire see all the segment's writes (mpmcq.h:113).
   assert(boc_atomic_load_ptr_explicit(b, BOC_MO_RELAXED) == NULL);
   boc_atomic_store_ptr_explicit(b, s.start, BOC_MO_RELEASE);
 }
 
 void boc_bq_enqueue(boc_bq_t *q, boc_bq_node_t *n) {
-  // Mirrors MPMCQ::enqueue (mpmcq.h:118-121).
   boc_bq_segment_t s = {n, &n->next_in_queue};
   boc_bq_enqueue_segment(q, s);
 }
 
 void boc_bq_enqueue_front(boc_bq_t *q, boc_bq_node_t *n) {
-  // Mirrors MPMCQ::enqueue_front (mpmcq.h:123-135).
   boc_bq_node_t *old_front = boc_bq_acquire_front(q);
   if (old_front == NULL) {
-    // Post to back (mpmcq.h:128).
     boc_bq_enqueue(q, n);
     return;
   }
 
-  // Link into the front (mpmcq.h:132-134).
   boc_atomic_store_ptr_explicit(&n->next_in_queue, old_front, BOC_MO_RELAXED);
   boc_atomic_store_ptr_explicit(&q->front, n, BOC_MO_RELEASE);
 }
 
 boc_bq_node_t *boc_bq_dequeue(boc_bq_t *q) {
-  // Mirrors MPMCQ::dequeue (mpmcq.h:140-184).
   boc_bq_node_t *old_front = boc_bq_acquire_front(q);
 
   BOC_SCHED_YIELD();
 
-  // Queue is empty or someone else is stealing (mpmcq.h:147-150).
   if (old_front == NULL) {
     return NULL;
   }
@@ -118,17 +81,12 @@ boc_bq_node_t *boc_bq_dequeue(boc_bq_t *q) {
   BOC_SCHED_YIELD();
 
   if (new_front != NULL) {
-    // Remove one element from the queue (mpmcq.h:158-160).
     boc_atomic_store_ptr_explicit(&q->front, new_front, BOC_MO_RELEASE);
     return old_front;
   }
 
   BOC_SCHED_YIELD();
 
-  // Queue contains a single element, attempt to close the queue
-  // (mpmcq.h:165-176). The expected `back` value is the address of the
-  // singleton node's `next_in_queue` slot; the desired value is the
-  // address of `q->front`, restoring the empty representation.
   void *expected = (void *)&old_front->next_in_queue;
   if (boc_atomic_compare_exchange_strong_ptr_explicit(
           &q->back, &expected, (void *)&q->front, BOC_MO_ACQ_REL,
@@ -138,17 +96,15 @@ boc_bq_node_t *boc_bq_dequeue(boc_bq_t *q) {
 
   BOC_SCHED_YIELD();
 
-  // Failed to close the queue, something is being added; restore the
-  // front and let the caller retry (mpmcq.h:181-183).
+  // Lost the back-CAS race: a concurrent enqueue is mid-publish, so restore
+  // front and return NULL (caller retries), not a node whose link is unstable.
   boc_atomic_store_ptr_explicit(&q->front, old_front, BOC_MO_RELEASE);
   return NULL;
 }
 
 boc_bq_segment_t boc_bq_dequeue_all(boc_bq_t *q) {
-  // Mirrors MPMCQ::dequeue_all (mpmcq.h:189-203).
   boc_bq_node_t *old_front = boc_bq_acquire_front(q);
 
-  // Queue is empty or someone else is popping (mpmcq.h:194-197).
   if (old_front == NULL) {
     boc_bq_segment_t empty = {NULL, NULL};
     return empty;
@@ -167,7 +123,6 @@ boc_bq_segment_t boc_bq_dequeue_all(boc_bq_t *q) {
 }
 
 boc_bq_node_t *boc_bq_segment_take_one(boc_bq_segment_t *s) {
-  // Mirrors MPMCQ::Segment::take_one (mpmcq.h:67-89).
   boc_bq_node_t *n = s->start;
   if (n == NULL) {
     return NULL;
@@ -186,24 +141,10 @@ boc_bq_node_t *boc_bq_segment_take_one(boc_bq_segment_t *s) {
 }
 
 bool boc_bq_is_empty(boc_bq_t *q) {
-  // Mirrors MPMCQ::is_empty (mpmcq.h:206-210).
   BOC_SCHED_YIELD();
   return boc_atomic_load_ptr_explicit(&q->back, BOC_MO_RELAXED) == &q->front;
 }
 
-// ===========================================================================
-// Per-worker scheduler state
-// ===========================================================================
-
-// The per-worker struct (`boc_sched_worker_t`) is defined in `boc_sched.h`
-// so dispatch and pop call sites can refer to its fields without an
-// extra indirection. Cacheline padding and `static_assert`s live with
-// the type definition.
-
-// ---------------------------------------------------------------------------
-// File-scope state
-// ---------------------------------------------------------------------------
-
 /// @brief Per-worker array, length @ref WORKER_COUNT. NULL when the
 ///        scheduler module is in the down state.
 static boc_sched_worker_t *WORKERS = NULL;
@@ -232,22 +173,6 @@ static boc_atomic_u64_t WORKER_COUNT = 0;
 /// happen with no concurrent producers.
 static boc_atomic_u64_t INCARNATION = 0;
 
-// ---------------------------------------------------------------------------
-// Per-thread state (TLS)
-// ---------------------------------------------------------------------------
-//
-// Each scheduler-aware thread (worker sub-interpreter, or any other
-// thread that calls boc_sched_dispatch from a worker context) keeps
-// its dispatch state in TLS slots rather than in `boc_sched_worker_t`
-// fields. The bocpy precedent: this matches `boc_noticeboard.c`'s
-// `nb_cache_*` thread-locals. Verona equivalent: the same fields
-// are members of `SchedulerThread`, which is itself one-per-OS-thread
-// — TLS is the same effect with one fewer indirection.
-//
-// All slots use the `boc_compat.h` `thread_local` macro (`_Thread_local`
-// on POSIX, `__declspec(thread)` on MSVC) with the **default** TLS
-// model.
-
 /// @brief This thread's worker handle, or NULL if the thread has not
 ///        called @ref boc_sched_worker_register.
 /// @details Read by the producer-locality fast path of
@@ -295,52 +220,18 @@ static thread_local size_t rr_incarnation = 0;
 /// registered as workers — they never call @c try_steal.
 static thread_local boc_sched_worker_t *steal_victim = NULL;
 
-// ---------------------------------------------------------------------------
-// Worker registration counter
-// ---------------------------------------------------------------------------
-//
-// Atomic so multiple worker threads racing to claim slots in
-// `boc_sched_worker_register` do not collide. Reset to zero in
-// `boc_sched_init` so re-entry (`start()`/`wait()`/`start()`) starts
-// fresh at slot 0. Read with relaxed ordering — the consumers that
-// care about happens-before edges (the `current_worker` TLS write
-// and any subsequent dispatch) sequence themselves through
-// `WORKERS[slot]` which is itself zero-initialised by `boc_sched_init`
-// before this counter is reset.
-
 static boc_atomic_u32_t REGISTERED_COUNT = 0;
 
-// ---------------------------------------------------------------------------
-// Park/unpark protocol epochs
-// ---------------------------------------------------------------------------
-//
-// Port of Verona's two-epoch `pause`/`unpause` protocol
-// (`verona-rt/src/rt/sched/threadpool.h:282-379`).
-//
-// `PAUSE_EPOCH` is bumped (seq_cst) by a parker before its
-// `check_for_work` walk and `cv_mu` re-check; this is the
-// "speak now" point that forces any concurrent producer into the
-// CAS arm. `UNPAUSE_EPOCH` is CAS'd forward by a producer that
-// observes `PAUSE_EPOCH > UNPAUSE_EPOCH`; the CAS winner takes
-// responsibility for issuing one wake. `PARKED_COUNT` is a
-// fast-path skip — if zero, the producer's targeted-signal arm
-// does not need to consult the epochs at all.
-//
-// Reset to zero in `boc_sched_init`/`boc_sched_shutdown` so a fresh
-// runtime cycle starts with the invariant `PAUSE_EPOCH == UNPAUSE_EPOCH`
-// (no parker has spoken; producers take the fast arm).
-
+/// @brief Park-epoch handshake. A worker about to park bumps @ref PAUSE_EPOCH
+/// (seq-cst), then re-scans for work; a dispatcher CAS-advances
+/// @ref UNPAUSE_EPOCH to match, and the single CAS winner wakes every parked
+/// worker via @ref boc_sched_unpause_all. The seq-cst bump is the lost-wakeup
+/// fence: it is totally ordered with the dispatcher's PAUSE_EPOCH acquire-load.
 static boc_atomic_u64_t PAUSE_EPOCH = 0;
 static boc_atomic_u64_t UNPAUSE_EPOCH = 0;
 static boc_atomic_u32_t PARKED_COUNT = 0;
 
-// ---------------------------------------------------------------------------
-// Public API
-// ---------------------------------------------------------------------------
-
 int boc_sched_init(Py_ssize_t worker_count) {
-  // Defensive: refuse a leak if init is called twice without an
-  // intervening shutdown.
   if (WORKERS != NULL) {
     PyErr_SetString(PyExc_RuntimeError,
                     "boc_sched_init called without prior shutdown");
@@ -354,17 +245,6 @@ int boc_sched_init(Py_ssize_t worker_count) {
   }
 
   if (worker_count > 0) {
-    // PyMem_RawCalloc (not PyMem_Calloc): the WORKERS array is
-    // process-global and is touched by every sub-interpreter worker
-    // thread. Since CPython 3.12 the object/Mem allocators are
-    // per-interpreter, so an allocation made in interpreter A would
-    // be invalid (and unfreeable) from interpreter B. The raw
-    // allocator is process-wide and GIL-independent. Zero-init gives
-    // every counter, every typed atomic slot (boc_compat.h
-    // `boc_atomic_*_t` are layout-compatible with the underlying
-    // scalar; zero is the well-defined "false" / NULL / 0 state on
-    // every supported platform), and every reserved slot the correct
-    // starting value.
     WORKERS = (boc_sched_worker_t *)PyMem_RawCalloc((size_t)worker_count,
                                                     sizeof(boc_sched_worker_t));
     if (WORKERS == NULL) {
@@ -372,17 +252,8 @@ int boc_sched_init(Py_ssize_t worker_count) {
       return -1;
     }
 
-    // Per-worker non-trivial initialisation: bq queue, mutex,
-    // condvar, owner-interp placeholder, and the ring-link.
-    // Mutex and condvar wrappers come from `boc_compat.h` (pthread on
-    // POSIX, SRWLock / CONDITION_VARIABLE on MSVC).
     for (Py_ssize_t i = 0; i < worker_count; ++i) {
       boc_sched_worker_t *w = &WORKERS[i];
-      // Initialise all N sub-queues of the WSQ. Cursors are
-      // zero-initialised by the parent `PyMem_RawCalloc` of the
-      // WORKERS array; we re-set them here to make the invariant
-      // explicit and survive any future move to non-zeroing
-      // allocators.
       for (size_t j = 0; j < (size_t)BOC_WSQ_N; ++j) {
         boc_bq_init(&w->q[j]);
       }
@@ -391,54 +262,20 @@ int boc_sched_init(Py_ssize_t worker_count) {
       w->steal_index.idx = 0;
       boc_mtx_init(&w->cv_mu);
       cnd_init(&w->cv);
-      // owner_interp_id is set when the worker calls
-      // `boc_sched_worker_register`. -1 means "not yet registered".
       w->owner_interp_id = -1;
-      // Ring-link: i -> i+1, last wraps to 0. Immutable after this
-      // point.
       w->next_in_ring = &WORKERS[(i + 1) % worker_count];
-      // Verona `core.h:23`: `should_steal_for_fairness{true}` — every
-      // freshly-constructed Core starts with the flag set, so the
-      // first `get_work` call on each worker takes the fairness arm
-      // (which is what enqueues the token into the queue for the
-      // first time; nothing else seeds it). Release-store so a
-      // worker thread that subsequently reads it under acquire sees
-      // the initialised value.
       boc_atomic_store_bool_explicit(&w->should_steal_for_fairness, true,
                                      BOC_MO_RELEASE);
     }
   }
 
-  // Initial publish of WORKER_COUNT and INCARNATION. On the GIL
-  // build no concurrent producers can exist at this point (workers
-  // have not been spawned yet, and `start()` is single-threaded
-  // under the GIL), so plain stores would suffice. On the
-  // free-threaded build (PEP 703) an off-worker producer surviving
-  // a prior stop()/start() cycle can ACQUIRE-load WORKER_COUNT in
-  // `boc_sched_dispatch` and see the new non-zero value here. RELAXED
-  // stores would only synchronise that ACQUIRE with the previous
-  // shutdown's WORKER_COUNT = 0 RELEASE, leaving no happens-before
-  // edge with the per-slot `boc_bq_init` / `boc_mtx_init` writes
-  // above -- the producer could legally read `wc > 0` and then
-  // dereference a `WORKERS[i]` whose mutex is still in pre-init
-  // bytewise state. The same hazard applies to the INCARNATION
-  // re-seed: a producer ACQUIRE-loading the new incarnation must
-  // observe the new WORKERS pointer, not whatever was cached. Use
-  // RELEASE so init and shutdown publish-pair symmetrically with
-  // the dispatch-side ACQUIRE on every cycle.
   boc_atomic_store_u64_explicit(&WORKER_COUNT, (uint64_t)worker_count,
                                 BOC_MO_RELEASE);
   boc_atomic_store_u64_explicit(
       &INCARNATION,
       boc_atomic_load_u64_explicit(&INCARNATION, BOC_MO_RELAXED) + 1,
       BOC_MO_RELEASE);
-  // Re-entry safety: every start cycle starts slot allocation at 0.
-  // Done after WORKER_COUNT/WORKERS are valid so a racing register()
-  // (none expected at this point because workers have not been
-  // spawned yet, but defensively correct) sees a consistent state.
   boc_atomic_store_u32_explicit(&REGISTERED_COUNT, 0, BOC_MO_RELAXED);
-  // Park/unpark protocol epochs: a fresh runtime cycle starts with
-  // the invariant PAUSE_EPOCH == UNPAUSE_EPOCH (no parker has spoken).
   boc_atomic_store_u64_explicit(&PAUSE_EPOCH, 0, BOC_MO_RELAXED);
   boc_atomic_store_u64_explicit(&UNPAUSE_EPOCH, 0, BOC_MO_RELAXED);
   boc_atomic_store_u32_explicit(&PARKED_COUNT, 0, BOC_MO_RELAXED);
@@ -446,42 +283,18 @@ int boc_sched_init(Py_ssize_t worker_count) {
 }
 
 void boc_sched_shutdown(void) {
-  // Order matters for the off-worker dispatch race.
-  // Off-worker producers in `boc_sched_dispatch` acquire-load
-  // WORKER_COUNT and treat 0 as the runtime-down sentinel. We must
-  // therefore publish WORKER_COUNT = 0 (and bump INCARNATION to
-  // self-invalidate any cached `rr_nonlocal` TLS in off-worker
-  // threads) BEFORE freeing the WORKERS array, otherwise a racing
-  // dispatch could dereference a freed slot.
   Py_ssize_t old_count =
       (Py_ssize_t)boc_atomic_load_u64_explicit(&WORKER_COUNT, BOC_MO_RELAXED);
-  // Release-store: pairs with the acquire-load in the off-worker
-  // arm of `boc_sched_dispatch`. A producer that observes
-  // WORKER_COUNT == 0 must NOT then observe a freed WORKERS slot;
-  // RELEASE here + ACQUIRE there gives that happens-before edge
-  // without an explicit `atomic_thread_fence`.
+  // RELEASE store of 0 pairs with the dispatch-side ACQUIRE load: a producer
+  // that observes wc==0 cannot then dereference a freed WORKERS slot.
   boc_atomic_store_u64_explicit(&WORKER_COUNT, 0, BOC_MO_RELEASE);
-  // Bump the incarnation so any thread-local `rr_nonlocal` cached
-  // by off-worker producers becomes self-invalidating; pairs with
-  // the acquire-load in `boc_sched_dispatch`. Doing this here (in
-  // addition to `boc_sched_init`) closes the start/stop/start
-  // window where a producer's TLS still holds the prior
-  // incarnation's worker pointer. RELEASE-store mirrors the
-  // WORKER_COUNT = 0 store above.
   boc_atomic_store_u64_explicit(
       &INCARNATION,
       boc_atomic_load_u64_explicit(&INCARNATION, BOC_MO_RELAXED) + 1,
       BOC_MO_RELEASE);
-  // No standalone fence needed: the RELEASE stores above already
-  // establish the happens-before edge with the dispatch-side
-  // ACQUIRE loads. Pairs with the acquire-load in the dispatch
-  // path.
   if (WORKERS != NULL) {
-    // Per-worker teardown in reverse order. The bq must be empty at
-    // this point — `boc_bq_destroy_assert_empty` aborts if not.
     for (Py_ssize_t i = old_count - 1; i >= 0; --i) {
       boc_sched_worker_t *w = &WORKERS[i];
-      // Tear down all N sub-queues; each must be empty.
       for (size_t j = 0; j < (size_t)BOC_WSQ_N; ++j) {
         boc_bq_destroy_assert_empty(&w->q[j]);
       }
@@ -491,9 +304,6 @@ void boc_sched_shutdown(void) {
     PyMem_RawFree(WORKERS);
     WORKERS = NULL;
   }
-  // Reset the registration counter so external observers see a
-  // clean post-stop state. Symmetric with the reset in
-  // `boc_sched_init`.
   boc_atomic_store_u32_explicit(&REGISTERED_COUNT, 0, BOC_MO_RELAXED);
 }
 
@@ -520,11 +330,6 @@ int boc_sched_stats_snapshot(Py_ssize_t worker_index, boc_sched_stats_t *out) {
   if (worker_index < 0 || worker_index >= wc) {
     return -1;
   }
-  // Best-effort relaxed snapshot. Each field is read independently;
-  // the snapshot may observe individual counter values from
-  // different points in time. Counters are monotonic, so a torn
-  // read between fields can only under-report -- never produce a
-  // value greater than the true count.
   const boc_sched_stats_atomic_t *src = &WORKERS[worker_index].stats;
   out->pushed_local = boc_atomic_load_u64_explicit(
       (boc_atomic_u64_t *)&src->pushed_local, BOC_MO_RELAXED);
@@ -559,74 +364,34 @@ size_t boc_sched_incarnation_get(void) {
   return (size_t)boc_atomic_load_u64_explicit(&INCARNATION, BOC_MO_RELAXED);
 }
 
-// ---------------------------------------------------------------------------
-// Per-worker registration
-// ---------------------------------------------------------------------------
-
 Py_ssize_t boc_sched_worker_register(void) {
-  // Allocate the next slot. Returns the *previous* value, so the
-  // first caller gets 0. Relaxed is fine: the only writer this races
-  // with is itself; downstream consumers reach the slot through a
-  // subsequent TLS write or through `boc_sched_stats_snapshot`, both
-  // of which are sequenced after this call returns.
   uint32_t slot =
       boc_atomic_fetch_add_u32_explicit(&REGISTERED_COUNT, 1, BOC_MO_RELAXED);
   Py_ssize_t wc =
       (Py_ssize_t)boc_atomic_load_u64_explicit(&WORKER_COUNT, BOC_MO_RELAXED);
   if ((Py_ssize_t)slot >= wc) {
-    // Over-registration: roll back the counter so a subsequent
-    // (legitimate) registration would still succeed if a slot frees.
-    // Keeps the `registered_count == worker_count` invariant clean
-    // after a successful run.
     boc_atomic_fetch_sub_u32_explicit(&REGISTERED_COUNT, 1, BOC_MO_RELAXED);
     return -1;
   }
 
-  // Stamp the slot's owner-witness with the calling sub-interpreter
-  // id. This is a debug aid and the wrong-thread assert hook;
-  // nothing reads it on a hot path.
   PyInterpreterState *interp = PyInterpreterState_Get();
   WORKERS[slot].owner_interp_id = (Py_ssize_t)PyInterpreterState_GetID(interp);
 
-  // Install the TLS handle. From here on, any dispatch / pop call on
-  // this thread finds its worker in O(1) without consulting the
-  // WORKERS array.
   current_worker = &WORKERS[slot];
 
-  // Seed the consumer-side batch budget so the first pop on this
-  // thread can take pending without first draining the queue. The
-  // zero default would otherwise mis-classify the first pop as
-  // batch-exhausted and break Verona's `next_work` priority.
   batch = BOC_BQ_BATCH_SIZE;
-  // Clear the steal victim cursor: it is lazy-initialised on the
-  // first try_steal call. A stale TLS pointer from a previous
-  // start cycle would point into a freed worker array.
   steal_victim = NULL;
   return (Py_ssize_t)slot;
 }
 
 boc_sched_worker_t *boc_sched_current_worker(void) { return current_worker; }
 
-// ---------------------------------------------------------------------------
-// Park/unpark protocol implementation
-// ---------------------------------------------------------------------------
-
-// Forward declaration: the slow steal helper is defined further down
-// (with `try_steal` and the quiescence-window machinery). `pop_slow`
-// calls it between the local-queue dequeue and the park, matching
-// Verona's `get_work` ordering (`schedulerthread.h:122-167`).
 static boc_bq_node_t *boc_sched_steal(boc_sched_worker_t *self);
 
 void boc_sched_signal_one(boc_sched_worker_t *target) {
   if (target == NULL) {
     return;
   }
-  // Lock-then-signal: under cv_mu we serialise against the parker's
-  // epoch re-check. If the parker is between its re-check and the
-  // `parked = true` store, our signal would otherwise be lost; the
-  // mutex acquisition forces us to wait until either the parker has
-  // committed to sleep (and `cnd_signal` will wake it) or has bailed
-  // out (and our signal is harmless).
   mtx_lock(&target->cv_mu);
   cnd_signal(&target->cv);
   mtx_unlock(&target->cv_mu);
@@ -638,23 +403,12 @@ void boc_sched_unpause_all(boc_sched_worker_t *self) {
   if (self == NULL || wc == 0) {
     return;
   }
-  // Cheap early-out: if no worker is parked, the walk would do
-  // WORKER_COUNT acquire-loads for nothing. The relaxed load is
-  // sufficient because a producer that observed PARKED_COUNT == 0
-  // and a parker that subsequently parks would, on the next
-  // producer's CAS-arm entry, re-publish (pe != ue forces another
-  // CAS and wake attempt). The protocol explicitly tolerates a
-  // stale zero here.
   if (boc_atomic_load_u32_explicit(&PARKED_COUNT, BOC_MO_RELAXED) == 0) {
     return;
   }
-  // Broadcast wake: walk the entire ring starting from
-  // self->next_in_ring and signal every parked worker. Mirrors
-  // Verona's ThreadSync::unpause_all (threadsync.h:108-128,
-  // threadpool.h:367-373). Without the broadcast, a burst of
-  // producer publishes that all CAS-lose against a single winner
-  // would leave N-1 parkers asleep until they each happen to be
-  // signal-targeted by some later off-worker dispatch.
+  // The lone dispatcher that won the UNPAUSE_EPOCH CAS must wake the whole
+  // ring: CAS losers skip this broadcast and signal at most their own target,
+  // so every other parked worker would otherwise stay asleep.
   boc_sched_worker_t *w = self->next_in_ring;
   for (Py_ssize_t i = 0; i < wc; ++i) {
     if (boc_atomic_load_bool_explicit(&w->parked, BOC_MO_ACQUIRE)) {
@@ -670,63 +424,21 @@ void boc_sched_worker_request_stop_all(void) {
   }
   Py_ssize_t wc =
       (Py_ssize_t)boc_atomic_load_u64_explicit(&WORKER_COUNT, BOC_MO_RELAXED);
-  // Phase 1: set stop_requested on every worker (release store so a
-  // worker waking from cnd_wait observes the flag with acquire).
   for (Py_ssize_t i = 0; i < wc; ++i) {
     boc_atomic_store_bool_explicit(&WORKERS[i].stop_requested, true,
                                    BOC_MO_RELEASE);
   }
-  // Phase 2: signal every worker's condvar under its mutex. We use
-  // signal-per-worker rather than broadcast on a global condvar
-  // because the bocpy precedent is per-queue waiters; each worker
-  // has its own cv. The mutex acquisition serialises against any
-  // parker that is between its epoch re-check and the cnd_wait call.
   for (Py_ssize_t i = 0; i < wc; ++i) {
     boc_sched_signal_one(&WORKERS[i]);
   }
 }
 
 boc_bq_node_t *boc_sched_worker_pop_slow(boc_sched_worker_t *self) {
-  // stop_requested is checked at the top of every loop iteration,
-  // BEFORE any pause_epoch bump, so a worker exiting on shutdown
-  // does not advance pause_epoch past unpause_epoch.
   for (;;) {
     if (boc_atomic_load_bool_explicit(&self->stop_requested, BOC_MO_ACQUIRE)) {
       return NULL;
     }
 
-    // ----- Steal-for-fairness arm -----
-    //
-    // Verona `schedulerthread.h::get_work:143-162`. When the
-    // fairness flag is set AND the local queue has at least one
-    // visible item, attempt one steal pass *before* draining the
-    // local queue. If the steal succeeds we still re-enqueue the
-    // token and return the stolen item; if it fails we fall through
-    // to the local dequeue. The flag is cleared *before* the token
-    // re-enqueue (Verona note: "Set the flag before rescheduling
-    // the token so that we don't have a race"). The token itself is
-    // installed by `_core_scheduler_runtime_start` and is never
-    // freed by this path; re-enqueue is a node operation only.
-    //
-    // Runs BEFORE the defensive `pending` check so the
-    // batch==0-forced-queue-drain fall-through from `pop_fast`
-    // (which leaves `pending` set when the gate trips) still pays
-    // the fairness tax.
-    //
-    // **WSQ cadence sensitivity.** The token is re-enqueued via
-    // `boc_wsq_enqueue` below, which pushes round-robin via
-    // `enqueue_index` and so rotates the token across the worker's
-    // `BOC_WSQ_N` sub-queues over time. Owner-side
-    // `boc_wsq_dequeue` scans sub-queues in `dequeue_index` order,
-    // so the token's consumption rate (and therefore the
-    // fairness-arm cadence) is proportional to the cursor
-    // desynchronisation between `enqueue_index` and `dequeue_index`
-    // rather than to absolute local work. This matches verona's
-    // design (verona's `Core` carries the same `WrapIndex<N>`
-    // cursors and re-enqueues its fairness token via `enqueue`); a
-    // regression that pinned the token to one sub-queue would shift
-    // `fairness_arm_fires` by a factor of `BOC_WSQ_N` without any
-    // test failure today.
     if (boc_atomic_load_bool_explicit(&self->should_steal_for_fairness,
                                       BOC_MO_ACQUIRE) &&
         !boc_wsq_is_empty(self)) {
@@ -745,15 +457,6 @@ boc_bq_node_t *boc_sched_worker_pop_slow(boc_sched_worker_t *self) {
       }
     }
 
-    // Defensive: under normal flow `pop_fast` exhausts pending
-    // before falling through to `pop_slow`, but the fairness gate
-    // in `pop_fast` returns NULL without consuming `pending` when
-    // it fires, and a future caller may enter slow-path directly
-    // (e.g. test harness). Honour pending first so we never park
-    // while an unconsumed thread-local item is sitting on this
-    // thread. Mirrors `pop_fast`'s pending-fallback branch: reset
-    // `batch` and bump `popped_local` to preserve the near-identity
-    // documented in @ref boc_sched_stats_t.
     if (pending != NULL) {
       boc_bq_node_t *n = pending;
       pending = NULL;
@@ -763,15 +466,6 @@ boc_bq_node_t *boc_sched_worker_pop_slow(boc_sched_worker_t *self) {
       return n;
     }
 
-    // ----- Local-queue dequeue -----
-    //
-    // Verona `get_work:165`. With the fairness arm cleared (or
-    // skipped) this is the primary work source. Mirrors `pop_fast`'s
-    // wsq-dequeue success: reset `batch` and bump `popped_local` so
-    // the global stats invariant holds. The `batch_resets` bump is
-    // intentionally omitted here -- this branch is only reached
-    // when the `pending` check above found NULL, so there was no
-    // pending to "reset over".
     boc_bq_node_t *n = boc_wsq_dequeue(self);
     if (n != NULL) {
       batch = BOC_BQ_BATCH_SIZE;
@@ -780,47 +474,18 @@ boc_bq_node_t *boc_sched_worker_pop_slow(boc_sched_worker_t *self) {
       return n;
     }
 
-    // ----- Empty-queue steal arm -----
-    //
-    // Verona `get_work:171-178`: an empty local queue is treated
-    // "like receiving a token" — try a steal directly. bocpy bundles
-    // the multi-victim ring + quiescence-window backoff into
-    // `boc_sched_steal`; if it returns non-NULL we have a stolen
-    // node (and the splice contract has already moved any remainder
-    // onto self->q). Returning NULL is the signal to commit to the
-    // park below.
     n = boc_sched_steal(self);
     if (n != NULL) {
       return n;
     }
 
-    // ----- Park-attempt -----
-    //
-    // Snapshot UNPAUSE_EPOCH BEFORE bumping PAUSE_EPOCH (mirrors
-    // Verona `threadpool.h::pause:283-285`). The pre-bump snapshot
-    // closes a lost-wakeup race: a producer that publishes between
-    // our bump and the snapshot would otherwise advance UNPAUSE_EPOCH
-    // to the new pause_epoch, but our (post-bump) snapshot would
-    // already see the advanced value, causing the cv_mu re-check
-    // below to compare equal and park anyway, consuming the wake.
-    // With the pre-bump snapshot, the producer's CAS must advance
-    // past `ue_snap`, and the re-check observes the inequality and
-    // bails out of the park. Relaxed is sufficient because the
-    // seq_cst fetch_add on PAUSE_EPOCH that follows provides the
-    // total order with the producer's load of PAUSE_EPOCH.
     uint64_t ue_snap =
         boc_atomic_load_u64_explicit(&UNPAUSE_EPOCH, BOC_MO_RELAXED);
 
-    // Bump PAUSE_EPOCH so any concurrent producer sees pe != ue and
-    // is forced into the CAS arm. seq_cst is required: the increment
-    // must totally-order with the producer's load-acquire of
-    // PAUSE_EPOCH.
+    // Publish park intent (seq-cst) BEFORE the final work re-scan: a dispatcher
+    // that enqueued after our last scan must see this and bump UNPAUSE_EPOCH.
     boc_atomic_fetch_add_u64_explicit(&PAUSE_EPOCH, 1, BOC_MO_SEQ_CST);
 
-    // check_for_work: walks ALL workers via
-    // `boc_sched_any_work_visible()`. Cheap: one acquire-load per
-    // queue, no global lock. A parker that observes work anywhere
-    // in the ring re-loops and either dequeues locally or steals.
 #if BOC_HAVE_TRY_STEAL
     if (boc_sched_any_work_visible()) {
       continue;
@@ -831,23 +496,13 @@ boc_bq_node_t *boc_sched_worker_pop_slow(boc_sched_worker_t *self) {
     }
 #endif
 
-    // Final epoch re-check under cv_mu. Drops the GIL across the
-    // wait so other Python work can proceed. terminator_count is
-    // NOT consulted here — quiescence is transient; only
-    // stop_requested causes exit.
     Py_BEGIN_ALLOW_THREADS mtx_lock(&self->cv_mu);
     if (boc_atomic_load_bool_explicit(&self->stop_requested, BOC_MO_ACQUIRE)) {
       mtx_unlock(&self->cv_mu);
     } else if (boc_atomic_load_u64_explicit(&UNPAUSE_EPOCH, BOC_MO_ACQUIRE) !=
                ue_snap) {
-      // A producer caught up between our epoch bump and the lock;
-      // skip the wait and re-loop.
       mtx_unlock(&self->cv_mu);
     } else {
-      // Bump the cumulative `parked` counter before the actual
-      // wait so a snapshot from another thread sees the entry
-      // even if the wait blocks indefinitely. Live PARKED_COUNT
-      // tracks current depth; stats.parked tracks total entries.
       boc_atomic_fetch_add_u64_explicit(&self->stats.parked, 1, BOC_MO_RELAXED);
       boc_atomic_store_bool_explicit(&self->parked, true, BOC_MO_RELEASE);
       boc_atomic_fetch_add_u32_explicit(&PARKED_COUNT, 1, BOC_MO_ACQ_REL);
@@ -860,20 +515,11 @@ boc_bq_node_t *boc_sched_worker_pop_slow(boc_sched_worker_t *self) {
   }
 }
 
-// ---------------------------------------------------------------------------
-// Dispatch + fast-path pop
-// ---------------------------------------------------------------------------
-
 boc_bq_node_t *boc_sched_worker_pop_fast(boc_sched_worker_t *self) {
   if (self == NULL) {
     return NULL;
   }
 
-  // BATCH_SIZE fairness: take pending only while batch > 0. When
-  // batch hits 0, fall through to the queue so a producer-local
-  // chain (which evicts every prior pending into the queue) cannot
-  // run newest-first forever and starve the older queued items.
-  // Verona `schedulerthread.h:122-138`.
   if (pending != NULL && batch > 0) {
     boc_bq_node_t *n = pending;
     pending = NULL;
@@ -883,18 +529,6 @@ boc_bq_node_t *boc_sched_worker_pop_fast(boc_sched_worker_t *self) {
     return n;
   }
 
-  // ----- Steal-for-fairness gate (Verona schedulerthread.h:143) -----
-  //
-  // Verona's `get_work` runs the fairness arm AFTER consuming
-  // `next_work` (≈ `pending`) but BEFORE draining the local queue.
-  // We mirror that order here: a busy worker steadily draining its
-  // own queue still pays the per-token-period fairness tax, by
-  // routing through `pop_slow` (which owns the arm body —
-  // re-enqueue token, attempt steal, clear flag).
-  //
-  // Returning NULL here costs the caller one extra function-call
-  // (`pop_slow`) per fairness period; the arm itself has the same
-  // cost it has always had.
   if (boc_atomic_load_bool_explicit(&self->should_steal_for_fairness,
                                     BOC_MO_ACQUIRE) &&
       !boc_wsq_is_empty(self)) {
@@ -903,11 +537,6 @@ boc_bq_node_t *boc_sched_worker_pop_fast(boc_sched_worker_t *self) {
 
   boc_bq_node_t *n = boc_wsq_dequeue(self);
   if (n != NULL) {
-    // Any successful queue dequeue resets the budget; if pending was
-    // bypassed because batch had hit 0, count this as a batch_reset
-    // for the fairness exit-criterion test. (A first-time pop with
-    // an empty pending also resets the budget but does not bump the
-    // counter — there was no fast path to bypass.)
     if (pending != NULL) {
       boc_atomic_fetch_add_u64_explicit(&self->stats.batch_resets, 1,
                                         BOC_MO_RELAXED);
@@ -918,10 +547,6 @@ boc_bq_node_t *boc_sched_worker_pop_fast(boc_sched_worker_t *self) {
     return n;
   }
 
-  // Queue is empty. If pending is set we exhausted the batch budget
-  // but have nothing else to fall back on — take pending and reset.
-  // Without this branch a single-worker chain would loop into
-  // pop_slow and park the worker against its own pending item.
   if (pending != NULL) {
     boc_bq_node_t *p = pending;
     pending = NULL;
@@ -937,14 +562,6 @@ boc_bq_node_t *boc_sched_worker_pop_fast(boc_sched_worker_t *self) {
 int boc_sched_dispatch(boc_bq_node_t *n) {
   boc_sched_worker_t *self = current_worker;
 
-  // Off-worker runtime-down gate. Must run BEFORE the pinned fast
-  // path so a pinned `@when` racing teardown is rejected the same
-  // way as an unpinned one — otherwise the pinned arm would drop
-  // the node onto MAIN_PINNED_QUEUE post-`terminator_close` with
-  // no rollback, leaking an undecremented `terminator_inc` and
-  // wedging the next `wait()`. Workers releasing successors of
-  // already-acquired behaviours skip this check (they bypass
-  // `self == NULL`) so mid-run dispatch is unaffected.
   if (self == NULL) {
     Py_ssize_t wc =
         (Py_ssize_t)boc_atomic_load_u64_explicit(&WORKER_COUNT, BOC_MO_ACQUIRE);
@@ -958,14 +575,6 @@ int boc_sched_dispatch(boc_bq_node_t *n) {
     }
   }
 
-  // Pinned-routing fast path. The OR-fold pinned byte was set by
-  // `BehaviorCapsule_init` from the per-arg `BOCCown::is_pinned`
-  // classification. Read it via the scheduler-public prehdr accessor
-  // (no knowledge of BOCBehavior layout required) and divert pinned
-  // behaviours onto the process-global main-pinned queue. The cold
-  // path transfers entirely to `boc_main_pinned_enqueue` (defined
-  // in `_core.c`); the worker-dispatch arms below run only when the
-  // behaviour has no pinned cowns.
   if (boc_behavior_node_is_pinned(n)) {
     return boc_main_pinned_enqueue(n);
   }
@@ -973,41 +582,17 @@ int boc_sched_dispatch(boc_bq_node_t *n) {
   boc_sched_worker_t *target;
 
   if (self != NULL) {
-    // Producer-local arm (Verona schedule_fifo).
-    // Always evict the prior `pending` to the local queue and
-    // install `n` as the new pending. The eviction (not the install)
-    // is what bumps `pushed_local`: the queue push is the externally
-    // visible event for stats purposes; replacing pending with no
-    // prior occupant is a free local handoff that costs nothing
-    // measurable.
     if (pending != NULL) {
       boc_wsq_enqueue(self, pending);
       boc_atomic_fetch_add_u64_explicit(&self->stats.pushed_local, 1,
                                         BOC_MO_RELAXED);
     } else {
-      // Producer-locality bypass: dispatch into an empty `pending`
-      // slot. No queue push, no atomic queue-side state mutation,
-      // but bump `dispatched_to_pending` so the dispatched-work
-      // total remains globally reconcilable as
-      // `Σ pushed_local + Σ dispatched_to_pending + Σ pushed_remote
-      // == Σ popped_local + Σ popped_via_steal`. Without this
-      // bump the queue's `pushed_local` underreports total
-      // dispatched work whenever steady-state pop-then-dispatch
-      // keeps `pending` empty most cycles.
       boc_atomic_fetch_add_u64_explicit(&self->stats.dispatched_to_pending, 1,
                                         BOC_MO_RELAXED);
     }
     pending = n;
     target = self;
   } else {
-    // Off-worker arm: round-robin over the worker ring.
-    //
-    // The runtime-down gate at the top of `boc_sched_dispatch`
-    // already rejected the case where `WORKER_COUNT == 0`, so by
-    // the time we get here at least one worker slot is live.
-    // INCARNATION still needs an acquire load so a stale
-    // `rr_nonlocal` from a prior incarnation is refreshed before
-    // we dereference it.
     size_t inc_now =
         (size_t)boc_atomic_load_u64_explicit(&INCARNATION, BOC_MO_ACQUIRE);
     if (rr_nonlocal == NULL || rr_incarnation != inc_now) {
@@ -1021,30 +606,15 @@ int boc_sched_dispatch(boc_bq_node_t *n) {
     rr_nonlocal = rr_nonlocal->next_in_ring;
   }
 
-  // ---- Slow arm: pause/unpause-aware wake -----------------------------
-  //
-  // Producer half of the parking protocol. Loaded with acquire so
-  // the parker's seq_cst PAUSE_EPOCH bump is observed in order. If
-  // pe == ue the fast path is taken (no parker is racing); otherwise
-  // CAS UNPAUSE_EPOCH forward and, on CAS-win, broadcast-wake every
-  // parked peer.
   uint64_t pe = boc_atomic_load_u64_explicit(&PAUSE_EPOCH, BOC_MO_ACQUIRE);
   uint64_t ue = boc_atomic_load_u64_explicit(&UNPAUSE_EPOCH, BOC_MO_ACQUIRE);
   if (pe != ue) {
     if (boc_atomic_compare_exchange_strong_u64_explicit(
             &UNPAUSE_EPOCH, &ue, pe, BOC_MO_ACQ_REL, BOC_MO_ACQUIRE)) {
-      // Walk from `target` so the wake prefers a peer rather than
-      // the worker we just published to (which is either us or the
-      // round-robin target — both cases are awake or about to be
-      // signalled). For off-worker dispatch `self` is NULL so we
-      // pass `target` directly; for producer-local we pass `self`.
       boc_sched_unpause_all(self != NULL ? self : target);
     }
   }
 
-  // Targeted wake when crossing to a different worker. Producer-
-  // local dispatch (target == self) skips this: the producer thread
-  // is the worker that will run the work, so it cannot be parked.
   if (self == NULL || target != self) {
     boc_sched_signal_one(target);
   }
@@ -1052,27 +622,6 @@ int boc_sched_dispatch(boc_bq_node_t *n) {
   return 0;
 }
 
-// ---------------------------------------------------------------------------
-// Work stealing (`try_steal`)
-// ---------------------------------------------------------------------------
-//
-// Port of the work-stealing primitive from
-// `verona-rt/src/rt/sched/schedulerthread.h::try_steal` plus the
-// underlying queue-level steal at
-// `verona-rt/src/rt/sched/workstealingqueue.h::steal`. Each worker
-// owns a `boc_bq_t q[BOC_WSQ_N]` sub-queue array; this thief reads
-// the victim's sub-queue indexed by `self->steal_index` (verona's
-// `this->steal_index`) and `enqueue_spread`s the remainder across
-// its own N sub-queues to dilute thief-vs-thief contention on
-// subsequent steals.
-//
-// `boc_sched_try_steal` is the **single-victim** fast attempt: at
-// most one `dequeue_all` call against `victim->q[steal_index]`,
-// then the per-thread victim cursor advances unconditionally so the
-// next attempt visits a different victim regardless of outcome. The
-// slow multi-victim loop with quiescence timeout (Verona's
-// `steal()`) follows.
-
 /// @brief Single-victim work-stealing attempt for @p self.
 /// @details Reads the per-thread @c steal_victim cursor (lazy-
 /// initialised to @c self->next_in_ring), tries to steal one node
@@ -1109,35 +658,19 @@ int boc_sched_dispatch(boc_bq_node_t *n) {
 ///         park.
 
 static boc_bq_node_t *boc_sched_try_steal(boc_sched_worker_t *self) {
-  // Lazy-init the cursor on first use. WORKER_COUNT == 0 cannot
-  // happen here because every caller has a registered self handle.
   if (steal_victim == NULL) {
     steal_victim = self->next_in_ring;
   }
 
   boc_sched_worker_t *victim = steal_victim;
-  // Advance the victim cursor unconditionally. Verona does this
-  // after the steal call (whether the call returned work or not);
-  // placing the store before the work-doing code keeps the function
-  // tail-clean (no bookkeeping on the success path).
   steal_victim = steal_victim->next_in_ring;
 
-  // Stamp the monotonic timestamp before any other bookkeeping so
-  // a snapshot taken concurrently observes the entry even if the
-  // call returns NULL early (self-victim, empty victim, etc.).
-  // Relaxed is fine: the field is diagnostic; readers tolerate a
-  // torn read between this store and the snapshot's load.
   boc_atomic_store_u64_explicit(&self->stats.last_steal_attempt_ns,
                                 boc_now_ns(), BOC_MO_RELAXED);
 
   boc_atomic_fetch_add_u64_explicit(&self->stats.steal_attempts, 1,
                                     BOC_MO_RELAXED);
 
-  // Don't steal from yourself (Verona `WorkStealingQueue::steal`
-  // self-check: `if (&victim == this) { ++steal_index; return
-  // nullptr; }`). Counts as a failure for diagnostic purposes — a
-  // single-worker runtime will see steal_failures == steal_attempts
-  // which is the expected steady state.
   if (victim == self) {
     boc_wsq_pre_inc(&self->steal_index);
     boc_atomic_fetch_add_u64_explicit(&self->stats.steal_failures, 1,
@@ -1145,114 +678,40 @@ static boc_bq_node_t *boc_sched_try_steal(boc_sched_worker_t *self) {
     return NULL;
   }
 
-  // Pick the victim's sub-queue indexed by *this thief's*
-  // steal_index (verona: `victim.queues[steal_index]`, where the
-  // index belongs to the calling WSQ — the thief). The cursor is
-  // touched only by `self`, so no atomic is needed.
   size_t vidx = self->steal_index.idx;
   boc_bq_segment_t seg = boc_bq_dequeue_all(&victim->q[vidx]);
 
-  // Try to take the head off the segment.
   boc_bq_node_t *r = boc_bq_segment_take_one(&seg);
   if (r == NULL) {
-    // take_one returns NULL for three reasons (mpmcq.h:67-89):
-    //   1. fully empty segment (start == NULL, end == NULL),
-    //   2. single-element segment (end == &start->next_in_queue),
-    //   3. first link in segment not yet visible (start != NULL,
-    //      next_in_queue still NULL).
-    //
-    // Case 1: nothing to steal — return NULL. Verona's
-    // `WorkStealingQueue::steal` `if (ls.end == nullptr) return
-    // nullptr;`.
     if (seg.end == NULL) {
       boc_atomic_fetch_add_u64_explicit(&self->stats.steal_failures, 1,
                                         BOC_MO_RELAXED);
       return NULL;
     }
-    // Case 2: the segment IS our stolen node — verona returns
-    // `ls.start` directly without spreading anything (there is no
-    // remainder). `workstealingqueue.h:107-108`.
     if (seg.start != NULL && seg.end == &seg.start->next_in_queue) {
       r = seg.start;
       boc_atomic_fetch_add_u64_explicit(&self->stats.popped_via_steal, 1,
                                         BOC_MO_RELAXED);
       return r;
     }
-    // Case 3: take_one observed start != NULL but start->next not
-    // yet visible (the producer has done `back.exchange` but not
-    // yet published the next pointer). The segment is "owned" by
-    // us (acquire_front succeeded inside dequeue_all) and we
-    // cannot safely splice it back into the victim mid-link.
-    //
-    // Verona faithful: `WorkStealingQueue::steal` falls through to
-    // `enqueue_spread(ls); return r;` here, with `r == nullptr`.
-    // We do the same — spread the partial segment onto our own
-    // sub-queues and return NULL so the caller re-loops to its own
-    // dequeue.
     boc_wsq_enqueue_spread(self, seg);
     boc_atomic_fetch_add_u64_explicit(&self->stats.steal_failures, 1,
                                       BOC_MO_RELAXED);
     return NULL;
   }
 
-  // Common case: head taken; spread the rest across self's N
-  // sub-queues so subsequent thieves stealing from self see N
-  // independent targets instead of one. Verona:
-  // `enqueue_spread(ls); return r;`.
   boc_wsq_enqueue_spread(self, seg);
   boc_atomic_fetch_add_u64_explicit(&self->stats.popped_via_steal, 1,
                                     BOC_MO_RELAXED);
   return r;
 }
 
-// ---------------------------------------------------------------------------
-// Slow steal loop
-// ---------------------------------------------------------------------------
-//
-// Port of `verona-rt/src/rt/sched/schedulerthread.h::steal` adapted
-// for bocpy's parking protocol. The main differences:
-//
-//   * Verona has no separate park primitive: its `steal()` busy-spins
-//     with a TSC-quiescence backoff and only commits to the global
-//     `pause` state after the timeout. bocpy already has a condvar
-//     park, so the slow loop's job is *not* to outwait contention —
-//     it just gives a producer a small pre-park grace window in case
-//     work is about to be published, then returns NULL so the caller
-//     (`pop_slow`) parks under cv_mu.
-//
-//   * Verona walks `running` (a flag flipped by the global pause()
-//     side); bocpy walks `self->stop_requested` (per-worker, set by
-//     `boc_sched_worker_request_stop_all`).
-//
-//   * Verona uses TSC ticks (`DefaultPal::tick`) for the quiescence
-//     gate; bocpy uses @ref boc_now_ns (CLOCK_MONOTONIC on POSIX,
-//     QueryPerformanceCounter on Windows).
-//
-// Loop shape (per round):
-//   1. stop_requested check.
-//   2. yield (BOC_SCHED_YIELD).
-//   3. own queue dequeue (catch work that another thread published
-//      onto our q since the last pop attempt).
-//   4. one full ring of `try_steal` calls (bounded at
-//      `WORKER_COUNT - 1` distinct victims; self-victim is skipped
-//      and counted as a failure).
-//   5. on miss, sample the monotonic clock; if the elapsed time
-//      since loop entry exceeds @ref BOC_STEAL_QUIESCENCE_NS,
-//      return NULL → caller parks. Otherwise sleep briefly and
-//      retry.
-//
-// The constant @ref BOC_STEAL_QUIESCENCE_NS is a tunable; 100µs
-// matches Verona's `TSC_QUIESCENCE_TIMEOUT` order of magnitude on
-// contemporary CPUs. The pre-park backoff is a `nanosleep`-style
-// short sleep rather than a busy spin so two parked workers do not
-// race their own backoff loops to 100% CPU.
-
 #ifndef BOC_STEAL_QUIESCENCE_NS
-#define BOC_STEAL_QUIESCENCE_NS 100000ULL // 100µs
+#define BOC_STEAL_QUIESCENCE_NS 100000ULL
 #endif
 
 #ifndef BOC_STEAL_BACKOFF_NS
-#define BOC_STEAL_BACKOFF_NS 5000ULL // 5µs sleep between rounds
+#define BOC_STEAL_BACKOFF_NS 5000ULL
 #endif
 
 /// @brief Multi-victim steal with a brief quiescence window.
@@ -1277,10 +736,6 @@ static boc_bq_node_t *boc_sched_steal(boc_sched_worker_t *self) {
   Py_ssize_t wc =
       (Py_ssize_t)boc_atomic_load_u64_explicit(&WORKER_COUNT, BOC_MO_RELAXED);
   if (wc <= 1) {
-    // No peers to steal from. Skip the whole loop — the caller
-    // will park immediately, which is the only sensible behaviour
-    // on a single-worker runtime. We do not bump steal_attempts
-    // here: the call did not actually visit a victim.
     return NULL;
   }
 
@@ -1293,19 +748,11 @@ static boc_bq_node_t *boc_sched_steal(boc_sched_worker_t *self) {
 
     BOC_SCHED_YIELD();
 
-    // Own-queue catch (Verona schedulerthread.h:269-272).
     boc_bq_node_t *n = boc_wsq_dequeue(self);
     if (n != NULL) {
       return n;
     }
 
-    // One full ring of try_steal. WORKER_COUNT - 1 visits is
-    // enough to attempt every distinct peer once; the cursor
-    // advances inside try_steal so successive calls see different
-    // victims. self-victim is automatically skipped (and counted
-    // as a steal_failure) so a single loop iteration may visit
-    // self once when WORKER_COUNT == 2 (cursor 0→1→0) — that is
-    // benign, the worst case is one wasted check.
     for (Py_ssize_t i = 0; i < wc - 1; ++i) {
       n = boc_sched_try_steal(self);
       if (n != NULL) {
@@ -1313,57 +760,20 @@ static boc_bq_node_t *boc_sched_steal(boc_sched_worker_t *self) {
       }
     }
 
-    // Quiescence gate: if the window has expired, give up and let
-    // the caller park. Without this gate we would either busy-spin
-    // forever (waste CPU) or have no preemption between unrelated
-    // workers (subtle starvation under the GIL). The window must
-    // be short enough that a worker waiting one quiescence-period
-    // does not hurt latency-sensitive workloads; 100µs is well
-    // below any realistic behaviour body and matches Verona's
-    // TSC_QUIESCENCE_TIMEOUT in order of magnitude.
     if (boc_now_ns() >= deadline) {
       return NULL;
     }
 
-    // Brief sleep so two concurrently-failing thieves do not pin
-    // their cores. Using `boc_sleep_ns` (boc_compat.h) rather than
-    // `sched_yield` because we want a hard backoff: a yield is
-    // ineffective when there is no other runnable thread (the
-    // case during quiescence).
     boc_sleep_ns(BOC_STEAL_BACKOFF_NS);
   }
 }
 
-// ---------------------------------------------------------------------------
-// Per-worker fairness token (`token_work`)
-// ---------------------------------------------------------------------------
-//
-// `token_work` is a `boc_atomic_ptr_t` slot inside `boc_sched_worker_t`.
-// The token itself is a `BOCBehavior` allocated by
-// `_core_scheduler_runtime_start` (which is the only TU that knows
-// the `BOCBehavior` layout); this TU treats it as an opaque
-// `boc_bq_node_t *`. Lifecycle:
-//
-//   * `_core_scheduler_runtime_start` calls `boc_sched_init` then, for
-//     every worker, allocates a token `BOCBehavior` (zero-initialised,
-//     `is_token = 1`) and installs `&token->bq_node` here.
-//   * `_core_scheduler_runtime_stop` calls `boc_sched_get_token_node`
-//     for each worker to recover the pointer, frees the `BOCBehavior`,
-//     then calls `boc_sched_shutdown`.
-//
-// The slot is never freed by `boc_sched_shutdown` — that would require
-// this TU to dereference a `BOCBehavior`, breaking the layered
-// boundary. Releasing it before shutdown is a `_core.c` responsibility.
-
 int boc_sched_set_token_node(Py_ssize_t worker_index, boc_bq_node_t *node) {
   Py_ssize_t wc =
       (Py_ssize_t)boc_atomic_load_u64_explicit(&WORKER_COUNT, BOC_MO_RELAXED);
   if (worker_index < 0 || worker_index >= wc) {
     return -1;
   }
-  // Release-store: a worker thread later doing an acquire-load on
-  // `token_work` (e.g. token re-enqueue path) must observe the
-  // node and any of its initialised fields written by the producer.
   boc_atomic_store_ptr_explicit(&WORKERS[worker_index].token_work, (void *)node,
                                 BOC_MO_RELEASE);
   return 0;
@@ -1383,10 +793,6 @@ void boc_sched_set_steal_flag(boc_sched_worker_t *self, bool value) {
   if (self == NULL) {
     return;
   }
-  // Release-store: pairs with the acquire-load at the top of the
-  // fairness arm in `boc_sched_worker_pop_slow`. Verona equivalent
-  // is the closure body in `core.h:28-32`
-  // (`this->should_steal_for_fairness = true`).
   boc_atomic_store_bool_explicit(&self->should_steal_for_fairness, value,
                                  BOC_MO_RELEASE);
 }
@@ -1394,13 +800,6 @@ void boc_sched_set_steal_flag(boc_sched_worker_t *self, bool value) {
 bool boc_sched_any_work_visible(void) {
   Py_ssize_t wc =
       (Py_ssize_t)boc_atomic_load_u64_explicit(&WORKER_COUNT, BOC_MO_RELAXED);
-  // Walk the full worker array. `boc_bq_is_empty` is an acquire-
-  // load on the queue's `front` pointer — cheap, no global lock.
-  // The walk is racy by design (a producer publishing onto a
-  // queue we have already passed will force itself through the
-  // CAS arm of the parker protocol; see `unpause_all`), so a
-  // stale `false` is acceptable: the epoch re-check under `cv_mu`
-  // catches it before the parker sleeps.
   for (Py_ssize_t i = 0; i < wc; ++i) {
     if (!boc_wsq_is_empty(&WORKERS[i])) {
       return true;
diff --git a/src/bocpy/boc_sched.h b/src/bocpy/boc_sched.h
index 9e93442..d97b4ea 100644
--- a/src/bocpy/boc_sched.h
+++ b/src/bocpy/boc_sched.h
@@ -24,22 +24,6 @@
 
 #include "boc_compat.h"
 
-// ---------------------------------------------------------------------------
-// Verona MPMC behaviour queue (`boc_bq_*`)
-// ---------------------------------------------------------------------------
-//
-// Port of `verona-rt/src/rt/sched/mpmcq.h`. Memory orderings match
-// `mpmcq.h` line-for-line; deviations are called out in the
-// doc-comments.
-//
-// The queue is intrusive: each node carries an `_Atomic` link
-// (`boc_bq_node_t::next_in_queue`). Production users embed a
-// `boc_bq_node_t` field (see `boc_behavior_prehdr_t::bq_node` below
-// for the BOCBehavior case) and pass its address to the
-// enqueue/dequeue API; the queue never dereferences anything other
-// than the link, so larger user-defined payloads are reached via
-// container_of-style arithmetic at the call site.
-
 /// @brief Verona-style intrusive link node.
 /// @details Embedded at offset 0 of @c boc_behavior_prehdr_t (see
 /// below); the prehdr sits immediately before each @c BOCBehavior.
@@ -106,8 +90,6 @@ static const size_t BOC_BQ_BATCH_SIZE = 100;
 #define BOC_SCHED_YIELD() ((void)0)
 #endif
 
-// --- Lifecycle -------------------------------------------------------------
-
 /// @brief Initialise an empty queue in place.
 /// @details Sets `back == &front` and `front == NULL`. Safe to call
 /// on a zeroed allocation.
@@ -120,8 +102,6 @@ void boc_bq_init(boc_bq_t *q);
 /// @param q The queue to destroy (must be non-NULL).
 void boc_bq_destroy_assert_empty(boc_bq_t *q);
 
-// --- Producers -------------------------------------------------------------
-
 /// @brief Enqueue a single node at the back of the queue.
 /// @details Equivalent to `boc_bq_enqueue_segment({n, &n->next_in_queue})`.
 /// The node's `next_in_queue` is overwritten. Mirrors `MPMCQ::enqueue`
@@ -144,8 +124,6 @@ void boc_bq_enqueue_segment(boc_bq_t *q, boc_bq_segment_t s);
 /// @param n The node to insert (must be non-NULL).
 void boc_bq_enqueue_front(boc_bq_t *q, boc_bq_node_t *n);
 
-// --- Consumers -------------------------------------------------------------
-
 /// @brief Try to dequeue a single node from the front.
 /// @details May spuriously return NULL even when the queue is non-
 /// empty (concurrent enqueuer mid-link). Callers must be prepared to
@@ -182,8 +160,6 @@ boc_bq_node_t *boc_bq_acquire_front(boc_bq_t *q);
 /// @return The detached head, or NULL.
 boc_bq_node_t *boc_bq_segment_take_one(boc_bq_segment_t *s);
 
-// --- Inspection ------------------------------------------------------------
-
 /// @brief Best-effort emptiness test.
 /// @details Mirrors `MPMCQ::is_empty` (mpmcq.h:206-210). Result may
 /// be stale by the time the caller acts on it.
@@ -191,34 +167,6 @@ boc_bq_node_t *boc_bq_segment_take_one(boc_bq_segment_t *s);
 /// @return @c true if the queue currently appears empty.
 bool boc_bq_is_empty(boc_bq_t *q);
 
-// ---------------------------------------------------------------------------
-// Scheduler-visible behaviour pre-header (`boc_behavior_prehdr_t`)
-// ---------------------------------------------------------------------------
-//
-// Pre-header sitting immediately *before* each BOCBehavior
-// allocation (CPython `_PyGC_Head` / `_Py_AS_GC()` style). Holds
-// the fields the scheduler needs to inspect without including
-// BOCBehavior's private definition: the intrusive queue link and
-// the OR-fold pinned byte set by `BehaviorCapsule_init` from the
-// per-arg cown classification (`is_pinned`).
-//
-// Why a pre-header instead of fields on BOCBehavior. The dispatch
-// path in `boc_sched.c` receives a `boc_bq_node_t *` and must read
-// the pinned byte to route pinned behaviours onto the main-thread
-// queue. BOCBehavior's struct definition is private to `_core.c`,
-// so the alternatives are (a) leak the full struct via this header,
-// (b) call through a function pointer on the hot path, or (c)
-// hard-code an `offsetof(BOCBehavior, pinned)` magic number in
-// `boc_sched.c` and protect it with `static_assert` mirrors. The
-// pre-header avoids all three: the scheduler reads `pinned` via a
-// normal struct field access, and `bq_node` at offset 0 makes the
-// container_of cast trivial and impossible to drift.
-//
-// `_core.c` owns allocation: `behavior_new` calls
-// `PyMem_RawMalloc(sizeof(prehdr) + sizeof(BOCBehavior))`, zeroes
-// the prehdr, and returns the pointer past it. Recovery on the free
-// path uses `BOC_BEHAVIOR_PREHDR(b)` to walk back.
-
 /// @brief Scheduler-visible pre-header attached to every behaviour.
 /// @details Allocated in front of each @c BOCBehavior; sits in the
 /// same cache line as the intrusive link the scheduler dereferences
@@ -275,29 +223,6 @@ static inline uint8_t boc_behavior_node_is_pinned(const boc_bq_node_t *n) {
 ///         contract).
 int boc_main_pinned_enqueue(boc_bq_node_t *n);
 
-// ---------------------------------------------------------------------------
-// Verona work-stealing queue cursors (`boc_wsq_*`)
-// ---------------------------------------------------------------------------
-//
-// Port of `verona-rt/src/rt/sched/workstealingqueue.h` and
-// `ds/wrapindex.h`. A WSQ is N independent `boc_bq_t` sub-queues
-// indexed by three plain-`size_t` cursors:
-//   - `enqueue_index`: producer side; pre-increment then push.
-//   - `dequeue_index`: owner pop side; pre-increment then pop, try
-//                       all N before declaring empty.
-//   - `steal_index`: thief side; selects which of the *victim*'s
-//                     sub-queues to drain in a steal attempt.
-//
-// All three cursors are owned by the worker that owns the WSQ.
-// `enqueue_index` is touched by every thread that pushes onto this
-// worker (including remote producers). The race on it is benign:
-// (1) `size_t` aligned loads/stores are atomic at the hardware level
-// on every ISA bocpy supports; (2) `(idx + 1) % N` is always in
-// `[0, N)` regardless of what value was read; (3) the underlying
-// `boc_bq_t` is multi-producer-safe; (4) the only observable effect
-// is distribution quality, bounded by concurrent-producer count.
-// Verona-rt accepts the same race; we make no deviation.
-
 /// @brief Number of sub-queues per worker WSQ.
 /// @details Matches verona-rt's `WorkStealingQueue<4>` template
 /// instantiation in `core.h`. Tunable at compile time.
@@ -342,10 +267,6 @@ static inline size_t boc_wsq_post_dec(boc_wsq_cursor_t *c) {
   return r;
 }
 
-// ---------------------------------------------------------------------------
-// Scheduler instrumentation
-// ---------------------------------------------------------------------------
-
 /// @brief Per-worker statistics counter block (POD).
 ///
 /// All fields are plain @c uint64_t so a snapshot is a memcpy. Counters
@@ -475,25 +396,6 @@ typedef struct boc_sched_stats_atomic {
   boc_atomic_u64_t fairness_arm_fires;
 } boc_sched_stats_atomic_t;
 
-// ---------------------------------------------------------------------------
-// Per-worker scheduler state (`boc_sched_worker_t`)
-// ---------------------------------------------------------------------------
-//
-// Holds the per-worker MPMC queue, the fairness-token slot
-// (`token_work` / `should_steal_for_fairness`), the parking-protocol
-// `cv_mu` / `cv` pair (`boc_compat.h` `BOCMutex` / `BOCCond`, pthread on
-// POSIX, SRWLock on MSVC), the ring-link `next_in_ring` pointer, the
-// per-worker counter block, and a reserved terminator-delta slot.
-// Atomics use the typed `boc_compat.h` shim (`boc_atomic_*_t` +
-// `boc_atomic_*_explicit`) so the layout compiles identically on POSIX
-// and MSVC ARM64.
-//
-// Cacheline-aligned at the type level (`alignas(BOC_SCHED_CACHELINE)`)
-// and a trailing pad rounds the size up to the next cacheline so that
-// arrays of workers do not false-share between adjacent slots. The
-// pad size is computed from a `_payload` helper struct so it tracks
-// the platform-dependent sizes of `BOCMutex` / `BOCCond` automatically.
-
 #ifndef BOC_SCHED_CACHELINE
 #define BOC_SCHED_CACHELINE 64
 #endif
@@ -614,16 +516,6 @@ static_assert(sizeof(boc_sched_worker_t) % BOC_SCHED_CACHELINE == 0,
 static_assert(alignof(boc_sched_worker_t) >= BOC_SCHED_CACHELINE,
               "boc_sched_worker_t must be cacheline-aligned");
 
-// ---------------------------------------------------------------------------
-// Verona work-stealing queue helpers (`boc_wsq_*`)
-// ---------------------------------------------------------------------------
-//
-// Inline routing wrappers around the per-worker WSQ. They mirror
-// verona-rt's `WorkStealingQueue<N>` member functions one-for-one;
-// the underlying `boc_bq_*` MPMCQ is unchanged. Each wrapper takes a
-// `boc_sched_worker_t *` rather than a bare `boc_bq_t *` because the
-// cursor lives on the worker.
-
 /// @brief Push a single node onto a worker's WSQ.
 /// @details Mirrors `WorkStealingQueue::enqueue` (verona-rt
 /// `workstealingqueue.h`): pre-increments @c enqueue_index then
@@ -700,11 +592,6 @@ static inline void boc_wsq_enqueue_spread(boc_sched_worker_t *self,
     }
     boc_wsq_enqueue(self, n);
   }
-  // Tail residual: verona pushes the final segment unconditionally
-  // onto a single sub-queue via `++enqueue_index`. With N=4 and
-  // typical steal segments of dozens of nodes, the spreading has
-  // already happened; the tail is at most a singleton (or a
-  // mid-link partial we could not drain).
   size_t idx = boc_wsq_pre_inc(&self->enqueue_index);
   boc_bq_enqueue_segment(&self->q[idx], ls);
 }
@@ -776,10 +663,6 @@ int boc_sched_stats_snapshot(Py_ssize_t worker_index, boc_sched_stats_t *out);
 ///         `threadpool.h:40` precedent: not @c _Atomic).
 size_t boc_sched_incarnation_get(void);
 
-// ---------------------------------------------------------------------------
-// Per-worker registration
-// ---------------------------------------------------------------------------
-
 /// @brief Atomically claim a worker slot for the calling thread.
 /// @details Allocates the next free slot in @ref WORKERS using an
 /// internal atomic counter that is reset on every @ref boc_sched_init.
@@ -804,13 +687,6 @@ size_t boc_sched_incarnation_get(void);
 ///         if no free slot remains. No Python exception is set on -1.
 Py_ssize_t boc_sched_worker_register(void);
 
-// ---------------------------------------------------------------------------
-// Park / unpark protocol
-// ---------------------------------------------------------------------------
-//
-// Port of Verona's two-epoch `pause`/`unpause` protocol from
-// `verona-rt/src/rt/sched/threadpool.h:282-379`.
-
 /// @brief Pop the next behaviour for the calling worker, blocking
 ///        until work arrives or shutdown is requested.
 /// @details Implements the parker side of the protocol. The
@@ -875,15 +751,6 @@ void boc_sched_signal_one(boc_sched_worker_t *target);
 /// @c thread_local mirror.
 boc_sched_worker_t *boc_sched_current_worker(void);
 
-// ---------------------------------------------------------------------------
-// Dispatch + fast-path pop
-// ---------------------------------------------------------------------------
-//
-// @ref boc_sched_dispatch is the producer-side entry point. Production
-// callers in @c _core.c invoke it as
-// @c boc_sched_dispatch(&behavior->bq_node); test code reaches it via
-// @c _core.scheduler_dispatch_node / @c _core.scheduler_pop_fast.
-
 /// @brief Schedule a behaviour for execution.
 /// @details Producer-side dispatch with two arms (chosen by whether
 /// the calling thread is registered as a worker):
@@ -936,14 +803,6 @@ int boc_sched_dispatch(boc_bq_node_t *n);
 ///         queue are both empty.
 boc_bq_node_t *boc_sched_worker_pop_fast(boc_sched_worker_t *self);
 
-// ---------------------------------------------------------------------------
-// Build-time feature gate
-// ---------------------------------------------------------------------------
-//
-// `BOC_HAVE_TRY_STEAL` toggles the parker's `check_for_work` walk
-// between "inspect own queue only" (off) and "walk the full ring"
-// (on). Defined unconditionally here; the off mode is reserved for
-// debugging and is not part of any supported build.
 #define BOC_HAVE_TRY_STEAL 1
 
 /// @brief Test whether any worker's queue currently has visible work.
@@ -970,21 +829,6 @@ boc_bq_node_t *boc_sched_worker_pop_fast(boc_sched_worker_t *self);
 /// @return @c true if at least one worker has visible queue work.
 bool boc_sched_any_work_visible(void);
 
-// ---------------------------------------------------------------------------
-// Per-worker fairness token (`token_work`)
-// ---------------------------------------------------------------------------
-//
-// Each worker owns a `BOCBehavior`-shaped sentinel whose `is_token`
-// discriminator is set to 1. The token is allocated by
-// `_core_scheduler_runtime_start` (because it knows the
-// `BOCBehavior` layout) and installed into the worker's
-// `token_work` slot via @ref boc_sched_set_token_node. On every
-// successful pop, the dispatch site checks `is_token`; if set, the
-// popping worker flips its `should_steal_for_fairness` flag and
-// re-enqueues the token instead of running user code. Verona ports:
-// `Core::token_work` (`core.h:22-37`), token-thunk dequeue
-// (`schedulerthread.h::run_inner`).
-
 /// @brief Install the per-worker fairness token's queue node.
 /// @details Stores @p node into @c WORKERS[worker_index].token_work
 /// using @c BOC_MO_RELEASE so a subsequent acquire-load on a worker
diff --git a/src/bocpy/boc_tags.c b/src/bocpy/boc_tags.c
index c0395c1..6c94be0 100644
--- a/src/bocpy/boc_tags.c
+++ b/src/bocpy/boc_tags.c
@@ -27,8 +27,6 @@ BOCTag *tag_from_PyUnicode(PyObject *unicode, BOCQueue *queue) {
   Py_ssize_t size = -1;
   const char *str = PyUnicode_AsUTF8AndSize(unicode, &size);
   if (str == NULL) {
-    // PyUnicode_AsUTF8AndSize sets the exception (UnicodeEncodeError on
-    // surrogates, etc.). Free the partial allocation before returning.
     PyMem_RawFree(tag);
     return NULL;
   }
@@ -44,11 +42,6 @@ BOCTag *tag_from_PyUnicode(PyObject *unicode, BOCQueue *queue) {
 
   memcpy(tag->str, str, tag->size + 1);
   tag->queue = queue;
-  // Return with rc = 1: callers receive an owning reference. The prior
-  // rc = 0 idiom required every caller to TAG_INCREF immediately after
-  // the publish-store, but the publish-then-incref window left the
-  // tag visible to peers at rc = 0 and a racing TAG_DECREF could free
-  // it before the publisher's INCREF ran.
   atomic_store(&tag->rc, 1);
   atomic_store(&tag->disabled, 0);
 
diff --git a/src/bocpy/boc_tags.h b/src/bocpy/boc_tags.h
index c2dd774..9ef0b47 100644
--- a/src/bocpy/boc_tags.h
+++ b/src/bocpy/boc_tags.h
@@ -77,16 +77,6 @@ int tag_compare_with_utf8(BOCTag *lhs, const char *rhs_str,
 /// @return -1 if before, 1 if after, 0 if equivalent. -2 on error.
 int tag_compare_with_PyUnicode(BOCTag *lhs, PyObject *rhs_op);
 
-// ---------------------------------------------------------------------------
-// Hot-path inlines.
-//
-// These were `static` in `_core.c` and called via the TAG_INCREF /
-// TAG_DECREF macros on the send / receive / set_tags paths. Promoting
-// them to `static inline` in this header preserves the inlining when
-// the macros are used from any including TU (and matches CPython's
-// `Py_INCREF` / `Py_DECREF` header-inline pattern).
-// ---------------------------------------------------------------------------
-
 static inline int_least64_t tag_decref(BOCTag *tag) {
   int_least64_t rc = atomic_fetch_add(&tag->rc, -1) - 1;
   if (rc == 0) {
diff --git a/src/bocpy/boc_terminator.c b/src/bocpy/boc_terminator.c
index 1ea4c4e..5457800 100644
--- a/src/bocpy/boc_terminator.c
+++ b/src/bocpy/boc_terminator.c
@@ -25,9 +25,6 @@ static BOCMutex TERMINATOR_MUTEX;
 static BOCCond TERMINATOR_COND;
 
 void terminator_init(void) {
-  // The Pyrona seed (count=1, seeded=1) is set by terminator_reset()
-  // when the runtime starts; here we only initialize the kernel
-  // objects.
   boc_mtx_init(&TERMINATOR_MUTEX);
   cnd_init(&TERMINATOR_COND);
 }
@@ -38,6 +35,8 @@ int_least64_t terminator_inc(void) {
   }
   int_least64_t newval = atomic_fetch_add(&TERMINATOR_COUNT, 1) + 1;
   if (atomic_load(&TERMINATOR_CLOSED)) {
+    // close() raced in after our first check: undo, and broadcast on a
+    // 0-transition since close()'s own wake predated our increment.
     int_least64_t after = atomic_fetch_add(&TERMINATOR_COUNT, -1) - 1;
     if (after == 0) {
       mtx_lock(&TERMINATOR_MUTEX);
@@ -96,8 +95,6 @@ bool terminator_seed_dec(void) {
 }
 
 bool terminator_seed_inc(void) {
-  // CAS 0->1: single-shot inc; no broadcast needed (terminator_wait only wakes
-  // on count==0).
   int_least64_t expected = 0;
   if (atomic_compare_exchange_strong(&TERMINATOR_SEEDED, &expected, 1)) {
     atomic_fetch_add(&TERMINATOR_COUNT, 1);
@@ -107,12 +104,6 @@ bool terminator_seed_inc(void) {
 }
 
 void terminator_reset(int_least64_t *prior_count, int_least64_t *prior_seeded) {
-  // Fence: raise the closed bit before we touch anything else so any
-  // stray thread still holding a reference to the previous runtime
-  // (e.g. a late whencall call) is refused by terminator_inc rather
-  // than slipping a new behavior past the reset boundary. We clear
-  // the bit again at the end, once the new COUNT/SEEDED values have
-  // been published, so a fresh start() sees closed=0.
   atomic_store(&TERMINATOR_CLOSED, 1);
   mtx_lock(&TERMINATOR_MUTEX);
   *prior_count = atomic_load(&TERMINATOR_COUNT);
diff --git a/src/bocpy/transpiler.py b/src/bocpy/transpiler.py
index 9d1ebb9..3922d60 100644
--- a/src/bocpy/transpiler.py
+++ b/src/bocpy/transpiler.py
@@ -66,52 +66,62 @@ def __init__(self, known_vars: Set[str],
         self.known_vars: Set[str] = known_vars
         self.when_aliases: Set[str] = when_aliases
         self.bocpy_module_aliases: Set[str] = bocpy_module_aliases
+        self._entered: bool = False
 
     def clear(self):
         """Reset the tracked state between function visits."""
         self.local_vars.clear()
         self.used_vars.clear()
         self.captured_vars.clear()
+        self._entered = False
 
     def visit_FunctionDef(self, node):  # noqa: N802
-        """Collect locals and recurse to find captured variables."""
-        for arg in node.args.args:
-            self.local_vars.add(arg.arg)
-
-        if node.args.vararg:
-            self.local_vars.add(node.args.vararg.arg)
-
-        if node.args.kwarg:
-            self.local_vars.add(node.args.kwarg.arg)
-
-        for stmt in node.body:
-            if isinstance(stmt, (ast.FunctionDef, ast.AsyncFunctionDef)):
-                self.local_vars.add(stmt.name)
-                # A nested @when is rewritten by WhenTransformer into a
-                # whencall(...) at this position. The cown arguments and the
-                # capture tuple are evaluated in *this* (outer) frame, so any
-                # free names they reference must appear in the outer
-                # behavior's captures. Plain nested def's keep their normal
-                # opaque treatment because Python's own closure handles them.
-                if _has_when_decorator(stmt, self.when_aliases,
-                                       self.bocpy_module_aliases):
-                    inner = CapturedVariableFinder(
-                        self.known_vars,
-                        when_aliases=self.when_aliases,
-                        bocpy_module_aliases=self.bocpy_module_aliases,
-                    )
-                    inner.visit(stmt)
-                    self.used_vars |= inner.captured_vars
-                    for dec in stmt.decorator_list:
-                        if _is_when_call(dec, self.when_aliases,
-                                         self.bocpy_module_aliases):
-                            for arg in dec.args:
-                                self.visit(arg)
-                continue
+        """Find captured variables.
+
+        First call (the root behavior): collect its params as locals and
+        recurse. Every later call is a nested def discovered during recursion:
+        record its name, and for a nested ``@when`` surface its free names as
+        the outer behavior's captures (its cown args and capture tuple are
+        evaluated in the outer frame). Plain nested defs are left to Python's
+        own closure.
+        """
+        if not self._entered:
+            self._entered = True
+
+            for arg in node.args.args:
+                self.local_vars.add(arg.arg)
+
+            if node.args.vararg:
+                self.local_vars.add(node.args.vararg.arg)
 
-            self.generic_visit(stmt)
+            if node.args.kwarg:
+                self.local_vars.add(node.args.kwarg.arg)
 
-        self.captured_vars = self.used_vars - self.local_vars - self.known_vars
+            for stmt in node.body:
+                self.visit(stmt)
+
+            self.captured_vars = self.used_vars - self.local_vars - self.known_vars
+            return
+
+        self.local_vars.add(node.name)
+
+        if _has_when_decorator(node, self.when_aliases,
+                               self.bocpy_module_aliases):
+            # Nested @when is rewritten to a whencall() evaluated here, so its
+            # free names must join the outer captures and its cown args are
+            # resolved in this frame.
+            inner = CapturedVariableFinder(
+                self.known_vars,
+                when_aliases=self.when_aliases,
+                bocpy_module_aliases=self.bocpy_module_aliases,
+            )
+            inner.visit(node)
+            self.used_vars |= inner.captured_vars
+            for dec in node.decorator_list:
+                if _is_when_call(dec, self.when_aliases,
+                                 self.bocpy_module_aliases):
+                    for arg in dec.args:
+                        self.visit(arg)
 
     visit_AsyncFunctionDef = visit_FunctionDef  # noqa: N815
 
@@ -126,9 +136,6 @@ def visit_Name(self, node: ast.Name):  # noqa: N802
 
     def visit_ExceptHandler(self, node: ast.ExceptHandler):  # noqa: N802
         """Treat ``except ... as X`` binding as a local, not a capture."""
-        # ``except ... as X`` (and ``try ... except* ... as X``) bind X
-        # on ``ExceptHandler.name`` as a plain identifier, not an
-        # ``ast.Name(Store)`` node, so the Name visitor never sees it.
         if node.name:
             self.local_vars.add(node.name)
         self.generic_visit(node)
@@ -147,15 +154,7 @@ def __init__(self):
         self.functions = set()
         self.imports = set()
         self.constants = set()
-        # Names that bind to ``bocpy.when`` (populated by
-        # ``visit_ImportFrom``). Always starts with the bare name
-        # ``"when"`` so a synthetic test or partial source still
-        # matches the historical literal-name spelling; the import
-        # visitor adds any explicit ``as`` alias to the set.
         self.when_aliases: set = {"when"}
-        # Names that bind to the ``bocpy`` module (populated by
-        # ``visit_Import``). Used so ``@alias.when(...)`` is
-        # recognised as a behavior decorator.
         self.bocpy_module_aliases: set = set()
 
     def known_vars(self):
@@ -226,10 +225,6 @@ def _record_constant_targets(self, targets):
     def visit_Assign(self, node: ast.Assign):  # noqa: N802
         """Add module-level constants."""
         if isinstance(node.value, ast.Constant):
-            # Constant assignments survive in the export. Record every
-            # target name (including chained ``A = B = 1`` and tuple
-            # ``A, B = 1, 2``) so the decorator validator can resolve
-            # them.
             self._record_constant_targets(node.targets)
             return node
 
@@ -239,7 +234,6 @@ def visit_Assign(self, node: ast.Assign):  # noqa: N802
         name = node.targets[0]
 
         if isinstance(name, ast.Name):
-            # use naming convention to allow some non-constant values as well
             if name.id.isupper():
                 self.constants.add(name.id)
                 return node
@@ -276,13 +270,6 @@ def visit_Module(self, node: ast.Module):  # noqa: N802
 
             new_body.append(new_value)
 
-        # If the user only spelled ``import bocpy [as alias]`` we never
-        # injected ``whencall`` into a ``from bocpy import`` statement,
-        # but the generated ``__behavior__N`` rewrite still emits
-        # ``whencall(...)`` as a bare ``Name``. Prepend an explicit
-        # import so worker resolution succeeds. No-op when ``whencall``
-        # is already imported or when no bocpy import is present (in
-        # which case nothing in the exported module would call it).
         if (self.bocpy_module_aliases and "whencall" not in self.imports):
             inject = ast.ImportFrom(
                 module="bocpy",
@@ -307,22 +294,6 @@ class WhenTransformer(ast.NodeTransformer):
     the function with a call to `whencall` for that behavior.
     """
 
-    # Best-effort early warning for stdlib decorators that produce
-    # non-callable descriptors at module scope (``staticmethod``,
-    # ``classmethod``, ``property``). Applied below ``@when``, these
-    # would silently break worker dispatch — the generated
-    # ``__behavior__N`` is invoked as a plain function on the worker,
-    # but the descriptor is not callable that way; ``property`` even
-    # raises ``TypeError`` at import time.
-    #
-    # This is **not** a correctness guarantee. The transpiler can only
-    # see decorator *syntax*, not what the expression evaluates to at
-    # import time on the worker, so any third-party decorator with the
-    # same shape (e.g., ``functools.cached_property``, custom
-    # descriptor factories) will slip through. Treat the set below as a
-    # convenience: a precise, actionable error for the few stdlib names
-    # we can recognise from the AST. Users applying exotic decorators
-    # below ``@when`` are on their own.
     _BANNED_BELOW_DECORATORS = frozenset({"staticmethod", "classmethod", "property"})
 
     def __init__(self, known_vars: set, path: str, module_scope_names: set,
@@ -380,7 +351,6 @@ def collect_targets(target: ast.AST, into: set) -> None:
 
         def visit(node: ast.AST) -> None:
             if isinstance(node, ast.Lambda):
-                # Defaults are evaluated in the *outer* scope.
                 for d in node.args.defaults:
                     visit(d)
                 for d in node.args.kw_defaults:
@@ -395,8 +365,6 @@ def visit(node: ast.AST) -> None:
                                  ast.GeneratorExp, ast.DictComp)):
                 local: set = set()
                 for i, gen in enumerate(node.generators):
-                    # The *first* iter is evaluated in the enclosing
-                    # scope; later iters see prior targets.
                     if i == 0:
                         visit(gen.iter)
                     else:
@@ -463,15 +431,12 @@ def visit_FunctionDef(self, node: ast.FunctionDef):  # noqa: N802
         if when_dec is None:
             return self.generic_visit(node)
 
-        # Reject async functions — there is no event loop on workers.
         if isinstance(node, ast.AsyncFunctionDef):
             raise SyntaxError(
                 "@when does not support async functions",
                 (self.path, node.lineno, node.col_offset, None),
             )
 
-        # Reject decorators above @when — they would wrap the
-        # scheduling call (a Cown), not the behavior body.
         when_idx = node.decorator_list.index(when_dec)
         if when_idx > 0:
             bad = node.decorator_list[0]
@@ -483,23 +448,9 @@ def visit_FunctionDef(self, node: ast.FunctionDef):  # noqa: N802
                 (self.path, bad.lineno, bad.col_offset, None),
             )
 
-        # first create a deep copy of the function
         behavior_node = copy.deepcopy(node)
         ast.copy_location(behavior_node, node)
 
-        # Extras-as-captures: positional parameters declared beyond the
-        # cown count are captured by name from the caller's frame. This
-        # supports two idioms transparently:
-        #   * the canonical Python loop-snapshot ``def b(c, i=i)`` —
-        #     defaults align with the *tail* of ``args.args``; the
-        #     default name becomes the capture source.
-        #   * the rename form ``def b(c, x=y)`` — capture by ``y``,
-        #     bind into param ``x``.
-        # Undefaulted trailing positionals (``def b(c, factor)``) are
-        # captured by the parameter's own name. Non-Name defaults and
-        # defaults landing on cown positions are rejected up front so a
-        # broken signature surfaces at export time, not as a confusing
-        # worker TypeError.
         extras_captures: list[str] = []
         n_cowns = len(when_dec.args)
         all_params = behavior_node.args.args
@@ -528,40 +479,24 @@ def visit_FunctionDef(self, node: ast.FunctionDef):  # noqa: N802
                 )
             extras_captures.append(dflt.id)
 
-        # Strip defaults so the worker never tries to evaluate them.
-        # The captured values are passed positionally by ``whencall``.
         behavior_node.args.defaults = []
 
-        # find all the captured variables. These will need to be passed
-        # to the behavior as additional arguments, as the closure will
-        # no longer function properly. Extras (already in args.args) are
-        # in ``local_vars`` thanks to the finder's param walk, so they
-        # will not be re-classified as body free-vars.
         self.cap_finder.clear()
         self.cap_finder.visit(behavior_node)
-        # __file__ is rewritten to a string constant by visit_Name below,
-        # so it must not be added to the parameter list as a capture.
         body_captures = [c for c in self.cap_finder.captured_vars
                          if c != "__file__"]
 
-        # add the body captures as trailing parameters; extras are
-        # already part of the user's signature.
         for name in body_captures:
             behavior_node.args.args.append(ast.Name(id=name))
 
         captures = extras_captures + body_captures
 
-        # Remove only @when decorators; other decorators compose with
-        # the behavior body and are preserved in the exported module.
         behavior_node.decorator_list = [
             d for d in behavior_node.decorator_list
             if not _is_when_call(d, self.when_aliases,
                                  self.bocpy_module_aliases)
         ]
 
-        # Reject descriptor-producing decorators that would silently
-        # break worker dispatch when applied to a module-level
-        # ``__behavior__N`` (the worker calls it as a plain function).
         for dec in behavior_node.decorator_list:
             banned = None
             if isinstance(dec, ast.Name) and dec.id in self._BANNED_BELOW_DECORATORS:
@@ -577,27 +512,16 @@ def visit_FunctionDef(self, node: ast.FunctionDef):  # noqa: N802
                     (self.path, dec.lineno, dec.col_offset, None),
                 )
 
-        # Validate that remaining decorator expressions only reference
-        # names available at module scope in the worker. Walk only
-        # *free* variables — names bound by ``Lambda`` /
-        # comprehension / generator-expression scopes inside the
-        # decorator are local and must not be flagged.
         for dec in behavior_node.decorator_list:
             self._validate_decorator_names(dec)
 
-        # deal with any recursive behaviors within this behavior
         behavior_node = self.visit(behavior_node)
 
-        # assign a unique name
         behavior_node.name = f"__behavior__{len(self.behaviors)}"
 
-        # add the node to our list of behavior function nodes
         ast.fix_missing_locations(behavior_node)
         self.nodes.append(behavior_node)
 
-        # this allows name and capture lookup for execution of behaviors
-        # from the primary interpreter (using source line numbers from the
-        # frame)
         self.behaviors[when_dec.lineno] = BehaviorInfo(behavior_node.name, captures)
 
         args = [ast.Constant(value=behavior_node.name),
@@ -623,12 +547,6 @@ def visit_FunctionDef(self, node: ast.FunctionDef):  # noqa: N802
                                            ("behaviors", Mapping[int, BehaviorInfo])])
 
 
-# Module-level dunders (__name__, __doc__, __package__, __spec__, __loader__)
-# are exposed via __builtins__, but inside a behavior they should refer to the
-# *user* module's value, not the worker's exported module. Removing them from
-# `known_vars` lets the capture mechanism pick them up from the call-site
-# frame's globals at runtime. __file__ is handled separately via inlining in
-# WhenTransformer.visit_Name.
 MODULE_DUNDERS = {"__name__", "__doc__", "__package__",
                   "__spec__", "__loader__"}
 
diff --git a/src/bocpy/worker.py b/src/bocpy/worker.py
index 893c694..24fe1b2 100644
--- a/src/bocpy/worker.py
+++ b/src/bocpy/worker.py
@@ -10,24 +10,11 @@
 index = _core.index()
 logger = logging.getLogger(f"worker{index}")
 
-# Generous deadline (seconds) for the boc_cleanup handshake. The main
-# thread normally sends `True` immediately after collecting every
-# worker's "shutdown" reply, so this only fires if `stop_workers`
-# itself wedged. Proceeding with cleanup on timeout prevents this
-# sub-interpreter from outliving the runtime, which would otherwise
-# block `interpreters.destroy()` -> `t.join()` in `teardown_workers`.
 _CLEANUP_RECEIVE_TIMEOUT = 120.0
 
 
 boc_export = None
 
-# The boc_export module and any of its classes which are needed for
-# unpickling are loaded and aliased within these tags when the worker
-# script is generated. The transpiled source is embedded as a Python
-# string literal (via ``repr()``) and exec'd into a fresh
-# ``types.ModuleType``; a ``linecache`` entry under a synthetic
-# filename ``<bocpy:NAME>`` keeps tracebacks pointing at the
-# transpiled source line. No on-disk artifact is created.
 
 # BEGIN boc_export
 # END boc_export
@@ -54,20 +41,11 @@ def run_behavior(behavior):
                 behavior.acquire()
                 acquired = True
             except Exception as ex:
-                # acquire() / cache_clear() failed before the body ran.
-                # The MCS chain is still linked (behavior_schedule
-                # established the links on the caller thread), so the
-                # outer finally below MUST run release/release_all to
-                # unwind it -- otherwise every successor blocks forever.
-                # Mark the result Cown so a caller awaiting it sees a
-                # diagnostic instead of a permanent None.
                 logger.exception(ex)
                 try:
                     behavior.set_exception(ex)
                 except Exception as inner:
                     logger.exception(inner)
-                # Fall through: `acquired` is False, so we skip execute()
-                # but still run the release pair in the outer finally.
 
             if acquired:
                 try:
@@ -76,34 +54,15 @@ def run_behavior(behavior):
                     logger.exception(ex)
                     behavior.set_exception(ex)
         finally:
-            # Runs on every path: clean acquire, failed acquire, normal
-            # body return, body Exception, OR body KI/SystemExit (which
-            # propagates after this finally completes).
-            #
-            # acquire() is sequential (result -> args -> captures) and
-            # bails on first failure, so on a partial-success raise some
-            # cowns are owned by this worker and some are not. release()
-            # is tolerant (it short-circuits NO_OWNER cowns), so calling
-            # it here releases the ones we did acquire before
-            # release_all hands the request to a successor.
             try:
                 behavior.release()
             except Exception as ex:
                 logger.exception(ex)
-            # Release the request array on the worker thread instead of
-            # round-tripping ("release", capsule) through the (now-gone)
-            # central scheduler thread.
             try:
                 behavior.release_all()
             except Exception as ex:
                 logger.exception(ex)
     finally:
-        # Drop the terminator hold unconditionally. If anything above
-        # raised (Exception or BaseException), failing to decrement
-        # here would leave wait() hung forever. Log and swallow
-        # Exception so a single misbehaving step cannot strand the
-        # runtime; KI/SystemExit from terminator_dec itself is
-        # extraordinarily unlikely (pure C atomic) and would propagate.
         try:
             _core.terminator_dec()
         except Exception as ex:
@@ -114,11 +73,6 @@ def do_work():
     """Main worker loop receiving behaviors or shutdown messages."""
     try:
         logger.debug("worker starting")
-        # Claim a scheduler slot and stamp the per-thread TLS handle
-        # before announcing readiness. Subsequent dispatch / pop paths
-        # rely on this slot being installed. If registration fails
-        # (over-spawn vs. scheduler_runtime_start), surface the error
-        # so start_workers stops waiting.
         try:
             slot = _core.scheduler_worker_register()
             logger.debug("registered scheduler slot %d", slot)
@@ -129,10 +83,6 @@ def do_work():
         send("boc_behavior", "started")
         while True:
             try:
-                # scheduler_worker_pop blocks on the worker's own
-                # condvar (with the GIL released). It returns None
-                # only when scheduler_request_stop_all has been
-                # called by stop_workers.
                 behavior = _core.scheduler_worker_pop()
                 if behavior is None:
                     logger.debug("scheduler stop signal received")
@@ -140,24 +90,14 @@ def do_work():
                 run_behavior(behavior)
                 behavior = None
             except (KeyboardInterrupt, SystemExit):
-                # Propagate so the worker can wind down: the outer
-                # try/finally still sends "shutdown" before the
-                # interpreter exits, so stop_workers does not hang.
                 raise
             except Exception as ex:
-                # A regular Exception inside run_behavior or
-                # scheduler_worker_pop must not break the loop -- if
-                # it did, this worker would exit without sending its
-                # "shutdown" reply and stop_workers would block forever
-                # waiting for it.
                 logger.exception(ex)
 
         logger.debug("worker stopped")
     except Exception as ex:
         logger.exception(ex)
     finally:
-        # Always tell stop_workers we are leaving the loop, even on an
-        # unexpected exception, so it never hangs in receive("boc_behavior").
         try:
             send("boc_behavior", "shutdown")
         except Exception as ex:
@@ -173,10 +113,6 @@ def cleanup():
     try:
         tag, _ = receive("boc_cleanup", _CLEANUP_RECEIVE_TIMEOUT)
         if tag == _core.TIMEOUT:
-            # The main thread never sent the boc_cleanup signal --
-            # `stop_workers` is wedged. Log and proceed with the
-            # local cown drain anyway so this sub-interpreter can
-            # be destroyed and its thread joined.
             logger.warning(
                 "cleanup: boc_cleanup signal not received within %.1fs; "
                 "proceeding with cown recycle so the sub-interpreter "
@@ -187,10 +123,6 @@ def cleanup():
         orphan_cowns = _core.cowns()
         if len(orphan_cowns) != 0:
             logger.debug("acquiring orphan cowns")
-            # at this stage all behaviors have exited, but it may be the case
-            # that some cowns are released but associated with this interpreter.
-            # by acquiring them, we ensure that the XIData objects have been
-            # freed _before_ this interpreter is destroyed.
             for cown in orphan_cowns:
                 if cown is not None:
                     cown.acquire()
@@ -205,27 +137,12 @@ def cleanup():
 try:
     do_work()
 finally:
-    # Always run cleanup, even if do_work() bubbled out a
-    # KeyboardInterrupt / SystemExit / PythonFinalizationError.
-    # Skipping cleanup leaves XIData objects live inside this
-    # sub-interpreter; subsequent destruction then fails with
-    # "interpreter has live cross-interpreter data" and the
-    # worker pool teardown blocks.
-    #
-    # The post-cleanup `sys.modules` clears below are also
-    # destruction-critical on Python 3.12 and prior, so they live in
-    # an inner `finally` that runs even if `cleanup()` itself raises
-    # a BaseException (e.g. KeyboardInterrupt parking inside
-    # `receive("boc_cleanup")`, or PythonFinalizationError out of
-    # `_core.recycle()`). Skipping them re-introduces the
-    # subinterpreter-destruction wedge in mirror image.
+    # cleanup() must run on any BaseException: leftover live cross-interpreter
+    # data makes interpreters.destroy() fail and hangs pool teardown.
     try:
         cleanup()
     finally:
         logger = None
-        # in Python 3.12 and prior, the threading module can cause
-        # issues with subinterpreter destruction. `pop(..., None)`
-        # is used instead of `del` so a module already removed by
-        # an earlier failure path does not raise KeyError here.
+        # Python <=3.12: leaving these in sys.modules wedges sub-interpreter destruction.
         for _modname in ("logging", "threading"):
             sys.modules.pop(_modname, None)
diff --git a/templates/c_abi_consumer/pyproject.toml b/templates/c_abi_consumer/pyproject.toml
index 7e40b20..29e1f0f 100644
--- a/templates/c_abi_consumer/pyproject.toml
+++ b/templates/c_abi_consumer/pyproject.toml
@@ -4,11 +4,11 @@
 # so the build resolves headers against the bocpy install actually
 # being tested. See README.md.
 #
-# The compatible-release bound (~=0.9) keeps the template aligned with
+# The compatible-release bound (~=0.10) keeps the template aligned with
 # the public C ABI it was authored against; bump it in lock-step with
 # ``[project].version`` in the root ``pyproject.toml`` (see the
 # ``finalize-pr`` skill).
-requires = ["setuptools", "wheel", "bocpy~=0.9"]
+requires = ["setuptools", "wheel", "bocpy~=0.10"]
 build-backend = "setuptools.build_meta"
 
 [project]
@@ -16,4 +16,4 @@ name = "bocpy-c-abi-consumer"
 version = "0.0.0"
 description = "Smoke test and canonical downstream template for the bocpy public C ABI."
 requires-python = ">=3.10"
-dependencies = ["bocpy~=0.9"]
+dependencies = ["bocpy~=0.10"]
diff --git a/templates/c_abi_consumer/test/test_consumer.py b/templates/c_abi_consumer/test/test_consumer.py
index 885b3fb..4412e65 100644
--- a/templates/c_abi_consumer/test/test_consumer.py
+++ b/templates/c_abi_consumer/test/test_consumer.py
@@ -4,7 +4,7 @@
 ``bocpy.get_sources()`` proves the ABI compiles. Importing it proves
 the headers and atomic shim link correctly. The headline test then
 shows that downstream extensions can ride the cross-interpreter
-machinery — ``Cown``, ``@when`` and ``send``/``receive`` — for free:
+machinery — ``Cown`` and ``@when`` — for free:
 
   * A ``Counter`` (a downstream type registered via
     ``XIDATA_REGISTERCLASS``) is wrapped in a ``Cown``.
@@ -15,33 +15,22 @@
     count.
   * The terminal behavior reads ``c.value.address`` and
     ``c.value.count`` *inside* the ``@when`` (where it owns the cown
-    under the proto-Region discipline) and ``send``s the assertion
-    pairs ``(addr, expected_addr)`` and ``(count >= TARGET, True)``
-    back to the main thread. The test ``receive``s them and fails if
-    either pair disagrees, proving the impl pointer survived every
+    under the proto-Region discipline) and writes the observed
+    ``(address, count)`` pair into a dedicated result ``Cown`` it also
+    holds. After ``quiesce()`` the main thread reads that pair with
+    ``Cown.unwrap`` and fails if the address drifted or the count
+    never reached the target — proving the impl pointer survived every
     XIData hop and the consumer callback fired on every dispatch.
 
-Together this exercises the real BOC scheduler, the real worker
-handoff, and the real MPSC message queue — not just an in-process
-round-trip of a single XIData callback.
+Together this exercises the real BOC scheduler and the real worker
+handoff — not just an in-process round-trip of a single XIData
+callback.
 """
 
-# Top-level, unconditional import. The transpiler propagates module-
-# level ``import`` statements into worker sub-interpreters, where the
-# extension's per-interpreter exec slot must run before the consumer
-# callback can dereference its ``LOCAL_STATE``. ``pytest.importorskip``
-# is a runtime call the transpiler does not see, so it would leave the
-# worker without the probe and segfault on the first reconstruction.
 import _bocpy_probe
 import pytest
 
-from bocpy import Cown, drain, receive, send, TIMEOUT, wait, when
-
-
-# --- construction smoke checks -------------------------------------------
-#
-# These do not need BOC. They just confirm the extension built and that
-# the per-interpreter exec slot ran on the main interpreter.
+from bocpy import Cown, quiesce, wait, when
 
 
 def test_counter_construction():
@@ -69,88 +58,53 @@ def test_counter_uninitialised_raises():
         c.refcount
 
 
-# --- BOC-driven XIData round-trip ----------------------------------------
-
 TARGET = 5
-RECEIVE_TIMEOUT = 10
 
+QUIESCE_TIMEOUT = 10
 
-def _step(c, expected_addr):
+
+def _step(c, result):
     """Schedule one round of the tail loop.
 
     Defined at module level so the transpiler can resolve it from the
     worker interpreter when the recursive call inside the behavior is
-    executed. ``expected_addr`` is closed over by value at schedule
-    time (the transpiler snapshots captures into a tuple) so the
-    terminal behavior can compare it against the impl pointer it
-    observes from inside the worker.
+    executed. The terminal behavior records the observed
+    ``(address, count)`` pair into ``result`` so the main thread can
+    read it with ``Cown.unwrap`` after ``quiesce()``.
     """
-    @when(c)
-    def _(c):
-        # Counter follows proto-Region semantics: only the interpreter
-        # currently owning the cown may inspect ``c.value``. Do all
-        # checks here, inside the @when, where ownership is held.
+    @when(c, result)
+    def _(c, result):
         addr = c.value.address
         count = c.value.count
         if count < TARGET:
-            _step(c, expected_addr)
+            _step(c, result)
         else:
-            # Identity check: the impl pointer must survive every
-            # @when handoff in the tail loop. If XIData ever lost it,
-            # ``addr`` would not match the cown's original address.
-            send("assert", (addr, expected_addr))
-            # Progress check: the consumer callback bumps ``count`` on
-            # every reconstruction, so by the terminal behavior we
-            # must have round-tripped at least TARGET times.
-            send("assert", (count >= TARGET, True))
+            result.value = (addr, count)
 
 
 class TestBOCRoundtrip:
-    """BOC-driven round-trip of a ``Counter`` cown via ``@when`` + send."""
+    """BOC-driven round-trip of a ``Counter`` cown via ``@when``."""
 
     @classmethod
     def teardown_class(cls):
         """Drain pending behaviors so the runtime can shut cleanly."""
         wait()
 
-    def receive_asserts(self, count):
-        """Collect ``count`` assertion messages and fail on mismatch.
-
-        Mirrors the helper from .github/skills/testing-with-boc — uses
-        a timeout so a stalled behavior fails the test loudly instead
-        of hanging, and drains the queue on the way out.
-        """
-        failed = None
-        timed_out = False
-        try:
-            for _ in range(count):
-                result = receive("assert", RECEIVE_TIMEOUT)
-                if result[0] == TIMEOUT:
-                    timed_out = True
-                    break
-                _, (actual, expected) = result
-                if failed is None and actual != expected:
-                    failed = (actual, expected)
-        finally:
-            drain("assert")
-
-        assert not timed_out, (
-            "tail-recursive @when chain never reached its terminal "
-            "send('assert', ...). Either XIData round-trip is not "
-            "incrementing the counter or the behavior chain stalled.")
-        if failed is not None:
-            actual, expected = failed
-            assert actual == expected, f"expected {expected!r}, got {actual!r}"
-
-    def test_tail_loop_roundtrips_counter_through_when_and_send(self):
+    def test_tail_loop_roundtrips_counter_through_when(self):
         """Ship a Counter cown through a tail-recursive @when chain."""
         counter = _bocpy_probe.Counter()
         expected_addr = counter.address
         c = Cown(counter)
+        result = Cown(None)
+
+        _step(c, result)
 
-        _step(c, expected_addr)
+        quiesce(QUIESCE_TIMEOUT)
+        observed_addr, observed_count = result.unwrap()
 
-        # Two asserts from the terminal behavior: address identity
-        # and count progress. ``receive_asserts`` blocks until both
-        # arrive (or times out), so no extra sentinel is needed.
-        self.receive_asserts(2)
+        assert observed_addr == expected_addr, (
+            f"impl pointer drifted: expected {expected_addr!r}, "
+            f"got {observed_addr!r}")
+        assert observed_count >= TARGET, (
+            f"count only reached {observed_count}, expected >= {TARGET}; "
+            "the XIData round-trip did not increment the counter")
diff --git a/templates/downstream-agent-bootstrap-prompt.md b/templates/downstream-agent-bootstrap-prompt.md
index 974a0ee..cecd1f8 100644
--- a/templates/downstream-agent-bootstrap-prompt.md
+++ b/templates/downstream-agent-bootstrap-prompt.md
@@ -276,7 +276,6 @@ The five replacement patterns you should know cold:
 | `notice_write(key, value)` | Non-blocking write. |
 | `notice_update(key, fn, default)` | Atomic read-modify-write. `fn` must be picklable (module-level function or `functools.partial`). Return `REMOVED` to delete. |
 | `notice_delete(key)` | Non-blocking delete. |
-| `notice_sync()` | Flush this thread's pending noticeboard writes before releasing the current cown. Use when a downstream behavior must observe your write. |
 | `REMOVED` | Sentinel for deleting via `notice_update`. |
 | `wait(timeout)` | Block until all scheduled behaviors complete; stops the runtime. |
 | `start(workers, export_dir, module)` | Manually start the runtime (auto-called on first `@when`). |
diff --git a/test/test_boc.py b/test/test_boc.py
index 83cfc34..74f193d 100644
--- a/test/test_boc.py
+++ b/test/test_boc.py
@@ -5,49 +5,22 @@
 import random
 import sys
 import threading
+import time
 import traceback
 from typing import NamedTuple
 
 import pytest
 
-from bocpy import (Cown, drain, notice_sync, notice_write, noticeboard,
-                   receive, send, start, TIMEOUT, wait, when)
+from bocpy import (Cown, drain, notice_write,
+                   quiesce, receive, send, start, TIMEOUT, wait, when)
 from bocpy._core import CownCapsule
 
 RECEIVE_TIMEOUT = 10
 
+QUIESCE_TIMEOUT = 5
 
-GLOBAL_FACTOR = 7
-
-
-def receive_asserts(count=1):
-    """Drain all expected assertion messages, then fail on first mismatch.
 
-    The "assert" queue is always drained before returning so that leftover
-    messages from a failing test do not leak into subsequent tests in CI.
-    """
-    failed = None
-    timed_out = False
-    try:
-        for _ in range(count):
-            result = receive("assert", RECEIVE_TIMEOUT)
-            if result[0] == TIMEOUT:
-                timed_out = True
-                break
-            _, (actual, expected) = result
-            if failed is None and actual != expected:
-                failed = (actual, expected)
-    finally:
-        drain("assert")
-
-    assert not timed_out, (
-        "Timed out waiting for an 'assert' message from a behavior. "
-        "Check that every @when arg count matches the decorated "
-        "function's parameter count."
-    )
-    if failed is not None:
-        actual, expected = failed
-        assert actual == expected, f"expected {expected!r}, got {actual!r}"
+GLOBAL_FACTOR = 7
 
 
 class Multiplier:
@@ -269,63 +242,63 @@ def test_simple_dispatch(self):
         y = simple(x)
         assert isinstance(y, Cown)
 
-        @when(y)
-        def _(y):
-            send("assert", (y.value, 2))
-
-        receive_asserts()
+        quiesce(QUIESCE_TIMEOUT)
+        assert y.unwrap() == 2
 
     def test_nested_dispatch(self):
         """Ensure nested behaviors see updated state."""
         x = Cown(1)
-        y = nested(x)
+        nested(x)
 
-        # Only assert the final state. The intermediate value of x is racy:
-        # the inner nested_triple is scheduled on x from inside nested_double
-        # and may run before or after a behavior the main thread enqueues on
-        # x, depending on worker timing.
-        @when(x, y)
-        def check_double(x, y):
-            @when(x, y.value)
-            def check_triple(x, _inner):
-                send("assert", (x.value, 6))
-
-        receive_asserts()
+        quiesce(QUIESCE_TIMEOUT)
+        assert x.unwrap() == 6
 
     def test_exception(self):
         """Exceptions propagate as values in behaviors."""
         x = Cown(1)
         y = exception(x)
 
-        @when(y)
-        def _(y):
-            send("assert", (isinstance(y.value, ZeroDivisionError), True))
-            y.value = None
+        quiesce(QUIESCE_TIMEOUT)
+        with pytest.raises(ZeroDivisionError):
+            y.unwrap()
+
+    def test_unwrap_consumes_value(self):
+        """unwrap() consumes the cown: a second unwrap returns None."""
+        x = Cown(1)
+        y = simple(x)
 
-        receive_asserts()
+        quiesce(QUIESCE_TIMEOUT)
+        assert y.unwrap() == 2
+        assert y.unwrap() is None
 
     def test_two_cown_coordination(self):
         """Move value between two cowns with coordinated when."""
         x = Cown(100)
         y = Cown(0)
 
-        def check(c: Cown, value: int):
+        def read(c: Cown):
             @when(c)
-            def do_check(c):
-                send("assert", (c.value, value))
+            def do_read(c):
+                return c.value
 
-        check(x, 100)
-        check(y, 0)
+            return do_read
+
+        x_before = read(x)
+        y_before = read(y)
 
         @when(x, y)
         def _(x, y):
             y.value += 50
             x.value -= 50
 
-        check(x, 50)
-        check(y, 50)
+        x_after = read(x)
+        y_after = read(y)
 
-        receive_asserts(4)
+        quiesce(QUIESCE_TIMEOUT)
+        assert x_before.unwrap() == 100
+        assert y_before.unwrap() == 0
+        assert x_after.unwrap() == 50
+        assert y_after.unwrap() == 50
 
     def test_classes(self, num_philosophers=5, hunger=4):
         """Simulate dining philosophers and verify fork usage."""
@@ -339,12 +312,18 @@ def test_classes(self, num_philosophers=5, hunger=4):
                 case ["report", ("full", _)]:
                     num_eating -= 1
 
-        for _, f in enumerate(forks):
+        readers = []
+        for f in forks:
             @when(f)
-            def _(f):
-                send("assert", (f.value.uses, 2*f.value.hunger))
+            def read_fork(f):
+                return (f.value.uses, 2 * f.value.hunger)
+
+            readers.append(read_fork)
 
-        receive_asserts(num_philosophers)
+        quiesce(QUIESCE_TIMEOUT)
+        for r in readers:
+            uses, expected = r.unwrap()
+            assert uses == expected
 
     @pytest.mark.parametrize("n", [1, 10, 15])
     def test_variable_termination(self, n: int):
@@ -352,22 +331,16 @@ def test_variable_termination(self, n: int):
         result = fib_parallel(n)
         expected = fib_sequential(n)
 
-        @when(result)
-        def check(result):
-            send("assert", (result.value, expected))
-
-        receive_asserts()
+        quiesce(QUIESCE_TIMEOUT)
+        assert result.unwrap() == expected
 
     def test_cown_grouping(self):
         """Verify cown grouping returns correct sums."""
         expected, results = cown_grouping()
 
-        @when(results)
-        def check(results: list[Cown]):
-            for r in results:
-                send("assert", (r.value, expected))
-
-        receive_asserts(len(results))
+        quiesce(QUIESCE_TIMEOUT)
+        for r in results:
+            assert r.unwrap() == expected
 
     def test_grouped_cown_mutation(self):
         """Write to cowns within a group and verify mutations stick."""
@@ -380,10 +353,10 @@ def double_all(group: list[Cown[int]]):
 
         @when(cowns)
         def verify(group: list[Cown[int]]):
-            for i, c in enumerate(group):
-                send("assert", (c.value, i * 2))
+            return [c.value for c in group]
 
-        receive_asserts(5)
+        quiesce(QUIESCE_TIMEOUT)
+        assert verify.unwrap() == [i * 2 for i in range(5)]
 
     def test_group_and_single_mutation(self):
         """Mutate a group and a single cown in the same behavior."""
@@ -397,15 +370,16 @@ def accumulate(group: list[Cown[int]], t: Cown[int]):
                 c.value = 0
 
         @when(total)
-        def check_total(t):
-            send("assert", (t.value, 6))
+        def read_total(t):
+            return t.value
 
         @when(items)
-        def check_zeroed(group: list[Cown[int]]):
-            for c in group:
-                send("assert", (c.value, 0))
+        def read_items(group: list[Cown[int]]):
+            return [c.value for c in group]
 
-        receive_asserts(4)
+        quiesce(QUIESCE_TIMEOUT)
+        assert read_total.unwrap() == 6
+        assert read_items.unwrap() == [0, 0, 0]
 
     def test_behavior_chain(self):
         """Chain three behaviors where each result feeds the next."""
@@ -413,21 +387,18 @@ def test_behavior_chain(self):
 
         @when(x)
         def step1(x):
-            return x.value + 3          # 5
+            return x.value + 3
 
         @when(step1)
         def step2(s1):
-            return s1.value * 4         # 20
+            return s1.value * 4
 
         @when(step2)
         def step3(s2):
-            return s2.value - 7         # 13
-
-        @when(step3)
-        def check(s3):
-            send("assert", (s3.value, 13))
+            return s2.value - 7
 
-        receive_asserts()
+        quiesce(QUIESCE_TIMEOUT)
+        assert step3.unwrap() == 13
 
     def test_contention(self):
         """Many behaviors on the same cown serialize correctly."""
@@ -440,10 +411,11 @@ def _(c):
                 c.value += 1
 
         @when(counter)
-        def check(c):
-            send("assert", (c.value, n))
+        def read(c):
+            return c.value
 
-        receive_asserts()
+        quiesce(QUIESCE_TIMEOUT)
+        assert read.unwrap() == n
 
     def test_exception_type_error(self):
         """Verify TypeError inside a behavior is captured in the result cown."""
@@ -451,14 +423,11 @@ def test_exception_type_error(self):
 
         @when(x)
         def bad(x):
-            return x.value + 1          # str + int -> TypeError
+            return x.value + 1
 
-        @when(bad)
-        def check(b):
-            send("assert", (isinstance(b.value, TypeError), True))
-            b.value = None
-
-        receive_asserts()
+        quiesce(QUIESCE_TIMEOUT)
+        with pytest.raises(TypeError):
+            bad.unwrap()
 
     def test_exception_key_error(self):
         """Verify KeyError inside a behavior is captured in the result cown."""
@@ -466,14 +435,11 @@ def test_exception_key_error(self):
 
         @when(x)
         def bad(x):
-            return x.value["missing"]   # KeyError
+            return x.value["missing"]
 
-        @when(bad)
-        def check(b):
-            send("assert", (isinstance(b.value, KeyError), True))
-            b.value = None
-
-        receive_asserts()
+        quiesce(QUIESCE_TIMEOUT)
+        with pytest.raises(KeyError):
+            bad.unwrap()
 
     def test_complex_object_repeated_mutation(self):
         """Multiple sequential behaviors mutate the same object in a cown."""
@@ -487,10 +453,11 @@ def _(a):
                 a.value.add(val_to_add)  # noqa: B023
 
         @when(acc)
-        def check(a):
-            send("assert", (sorted(a.value.items), list(range(10))))
+        def read(a):
+            return sorted(a.value.items)
 
-        receive_asserts()
+        quiesce(QUIESCE_TIMEOUT)
+        assert read.unwrap() == list(range(10))
 
     def test_duplicate_cown_same_twice(self):
         """Same cown passed twice to @when completes without deadlock."""
@@ -500,11 +467,8 @@ def test_duplicate_cown_same_twice(self):
         def add(a, b):
             return a.value + b.value
 
-        @when(add)
-        def check(r):
-            send("assert", (r.value, 10))
-
-        receive_asserts()
+        quiesce(QUIESCE_TIMEOUT)
+        assert add.unwrap() == 10
 
     def test_duplicate_cown_same_thrice(self):
         """Same cown passed three times to @when completes without deadlock."""
@@ -514,11 +478,8 @@ def test_duplicate_cown_same_thrice(self):
         def triple(a, b, d):
             return a.value + b.value + d.value
 
-        @when(triple)
-        def check(r):
-            send("assert", (r.value, 9))
-
-        receive_asserts()
+        quiesce(QUIESCE_TIMEOUT)
+        assert triple.unwrap() == 9
 
     def test_duplicate_cown_non_adjacent(self):
         """Non-adjacent duplicate cowns in @when complete correctly."""
@@ -529,11 +490,8 @@ def test_duplicate_cown_non_adjacent(self):
         def mixed(x, y, z):
             return x.value + y.value + z.value
 
-        @when(mixed)
-        def check(r):
-            send("assert", (r.value, 40))
-
-        receive_asserts()
+        quiesce(QUIESCE_TIMEOUT)
+        assert mixed.unwrap() == 40
 
     def test_duplicate_cown_in_group(self):
         """Duplicate cowns within a group complete without deadlock."""
@@ -543,11 +501,8 @@ def test_duplicate_cown_in_group(self):
         def group_sum(group):
             return sum(g.value for g in group)
 
-        @when(group_sum)
-        def check(r):
-            send("assert", (r.value, 14))
-
-        receive_asserts()
+        quiesce(QUIESCE_TIMEOUT)
+        assert group_sum.unwrap() == 14
 
     def test_duplicate_cown_mutation(self):
         """Mutating a cown passed twice reflects same underlying value."""
@@ -558,11 +513,8 @@ def mutate(a, b):
             a.value = 42
             return b.value
 
-        @when(mutate)
-        def check(r):
-            send("assert", (r.value, 42))
-
-        receive_asserts()
+        quiesce(QUIESCE_TIMEOUT)
+        assert mutate.unwrap() == 42
 
     def test_cown_of_cown_direct(self):
         """CownCapsule as direct child of a Cown survives release/acquire."""
@@ -571,9 +523,10 @@ def test_cown_of_cown_direct(self):
 
         @when(outer)
         def read_outer(o):
-            send("assert", (type(o.value).__name__, "Cown"))
+            return type(o.value).__name__
 
-        receive_asserts()
+        quiesce(QUIESCE_TIMEOUT)
+        assert read_outer.unwrap() == "Cown"
 
     def test_cown_of_cown_access_inner(self):
         """Inner cown's value is accessible after outer round-trip."""
@@ -582,9 +535,10 @@ def test_cown_of_cown_access_inner(self):
 
         @when(outer, inner)
         def check_both(o, i):
-            send("assert", (i.value, 99))
+            return i.value
 
-        receive_asserts()
+        quiesce(QUIESCE_TIMEOUT)
+        assert check_both.unwrap() == 99
 
     def test_cown_of_cown_in_container(self):
         """CownCapsule nested in a dict survives pickle round-trip."""
@@ -593,9 +547,10 @@ def test_cown_of_cown_in_container(self):
 
         @when(outer)
         def check_container(o):
-            send("assert", (type(o.value["key"]).__name__, "Cown"))
+            return type(o.value["key"]).__name__
 
-        receive_asserts()
+        quiesce(QUIESCE_TIMEOUT)
+        assert check_container.unwrap() == "Cown"
 
     def test_cown_of_cown_schedule_inner(self):
         """Extract inner cown from outer and schedule a behavior on it."""
@@ -612,9 +567,112 @@ def schedule_on_inner(r):
 
             @when(inner_cown)
             def read_inner(i):
-                send("assert", (i.value, 10))
+                return i.value
+
+            return read_inner
+
+        quiesce(QUIESCE_TIMEOUT)
+        assert schedule_on_inner.unwrap().unwrap() == 10
+
 
-        receive_asserts()
+class TestUnwrap:
+    """Cown.unwrap() under the quiesce() result-reading scheme."""
+
+    @classmethod
+    def teardown_class(cls):
+        wait()
+
+    def test_unwrap_returns_value(self):
+        """unwrap() returns a worker-produced value after quiesce()."""
+        x = Cown(3)
+        y = simple(x)
+
+        quiesce(QUIESCE_TIMEOUT)
+        assert y.unwrap() == 6
+
+    def test_unwrap_reraises_behavior_exception(self):
+        """unwrap() re-raises a captured exception verbatim on the caller."""
+        x = Cown(1)
+
+        @when(x)
+        def boom(x):
+            assert x.value == 999, f"expected 999, got {x.value}"
+
+        quiesce(QUIESCE_TIMEOUT)
+        with pytest.raises(AssertionError, match="expected 999, got 1"):
+            boom.unwrap()
+
+    def test_unwrap_clears_exception_after_consuming(self):
+        """A consumed exception is cleared: a second unwrap() returns None."""
+        x = Cown(1)
+
+        @when(x)
+        def boom(x):
+            raise ValueError("once")
+
+        quiesce(QUIESCE_TIMEOUT)
+        with pytest.raises(ValueError, match="once"):
+            boom.unwrap()
+        assert boom.unwrap() is None
+
+    def test_unwrap_returned_exception_is_a_value(self):
+        """An Exception *returned* (not raised) is a value, so unwrap() returns it."""
+        x = Cown(1)
+
+        @when(x)
+        def returns_exc(x):
+            return ValueError("just a value")
+
+        quiesce(QUIESCE_TIMEOUT)
+        result = returns_exc.unwrap()
+        assert isinstance(result, ValueError)
+        assert str(result) == "just a value"
+
+    def test_unwrap_in_flight_raises(self):
+        """unwrap() before quiesce(), while work is in flight, raises RuntimeError."""
+        x = Cown(0)
+
+        @when(x)
+        def slow(x):
+            time.sleep(0.2)
+            return x.value + 1
+
+        with pytest.raises(RuntimeError, match="still in flight"):
+            slow.unwrap()
+
+        quiesce(QUIESCE_TIMEOUT)
+        assert slow.unwrap() == 1
+
+    def test_unwrap_rejects_last_behavior_in_seed_dropped_window(self):
+        """unwrap() rejects an in-flight behavior even while the seed is dropped.
+
+        During quiesce()/wait() the Pyrona seed is dropped, so a single
+        in-flight behavior leaves ``terminator_count == 1`` -- the same
+        value as a fully quiesced, seed-armed runtime. The guard keys
+        off ``count - seeded`` rather than ``count > 1`` so it still
+        rejects this case. Simulated by poking the terminator into the
+        seed-dropped + one-hold state on the primary interpreter.
+        """
+        from bocpy import _core
+
+        x = Cown(0)
+        result = simple(x)
+        quiesce(QUIESCE_TIMEOUT)
+
+        seed_dropped = _core.terminator_seed_dec()
+        held = _core.terminator_inc() >= 0
+        try:
+            assert _core.terminator_count() == 1
+            assert _core.terminator_seeded() == 0
+            with pytest.raises(RuntimeError, match="still in flight"):
+                result.unwrap()
+        finally:
+            if held:
+                _core.terminator_dec()
+            if seed_dropped:
+                _core.terminator_seed_inc()
+
+        assert result.unwrap() == 0
 
 
 class TestGlobalCapture:
@@ -631,11 +689,8 @@ def test_method_captures_global_via_local(self):
         x = Cown(5)
         result = m.multiply(x)
 
-        @when(result)
-        def _(r):
-            send("assert", (r.value, 5 * GLOBAL_FACTOR))
-
-        receive_asserts()
+        quiesce(QUIESCE_TIMEOUT)
+        assert result.unwrap() == 5 * GLOBAL_FACTOR
 
     def test_method_captures_global_directly(self):
         """A method's @when captures a module-level global by name."""
@@ -643,11 +698,8 @@ def test_method_captures_global_directly(self):
         x = Cown(3)
         result = m.multiply_direct(x)
 
-        @when(result)
-        def _(r):
-            send("assert", (r.value, 3 * GLOBAL_FACTOR))
-
-        receive_asserts()
+        quiesce(QUIESCE_TIMEOUT)
+        assert result.unwrap() == 3 * GLOBAL_FACTOR
 
     @pytest.mark.parametrize("value", [1, 10, 100])
     def test_method_captures_global_parametrized(self, value):
@@ -656,11 +708,8 @@ def test_method_captures_global_parametrized(self, value):
         x = Cown(value)
         result = m.multiply_direct(x)
 
-        @when(result)
-        def _(r):
-            send("assert", (r.value, value * GLOBAL_FACTOR))  # noqa: B023
-
-        receive_asserts()
+        quiesce(QUIESCE_TIMEOUT)
+        assert result.unwrap() == value * GLOBAL_FACTOR
 
 
 class TestCownAcquireDeserialiseFailure:
@@ -703,15 +752,14 @@ def test_acquire_rollback_surfaces_exception(self):
 
         @when(bad)
         def use_bad(b):
-            # This body never runs — acquire fails first.
-            send("assert", (b.value, "unreachable"))
+            return b.value
 
         @when(use_bad)
         def check(b):
-            send("assert", (b.exception, True))
-            send("assert", (isinstance(b.value, ZeroDivisionError), True))
+            return (b.exception, isinstance(b.value, ZeroDivisionError))
 
-        receive_asserts(2)
+        quiesce(QUIESCE_TIMEOUT)
+        assert check.unwrap() == (True, True)
 
 
 class TestCownInCown:
@@ -756,9 +804,12 @@ def b(c1):
 
             @when(c1, c2)
             def c(c1, c2):
-                send("assert", (c2.value.key, 20))
+                return c2.value.key
 
-        receive_asserts(1)
+            return c
+
+        quiesce(QUIESCE_TIMEOUT)
+        assert b.unwrap().unwrap() == 20
 
     def test_cown_chain_through_message_queue(self):
         """Cown sent via message queue must survive sender's release.
@@ -780,10 +831,6 @@ def test_cown_chain_through_message_queue(self):
         def sender(a):
             inner = Cown(42)
             send("cown_chain", inner)
-            # ``inner`` goes out of scope when ``sender`` returns;
-            # only the encoded pickle bytes inside the queued
-            # message reference the underlying BOCCown after that
-            # point.
 
         @when(anchor)
         def receiver(a):
@@ -792,9 +839,12 @@ def receiver(a):
 
             @when(payload)
             def use(c):
-                send("assert", (c.value, 42))
+                return c.value
+
+            return use
 
-        receive_asserts(1)
+        quiesce(QUIESCE_TIMEOUT)
+        assert receiver.unwrap().unwrap() == 42
         drain("cown_chain")
 
     def test_cown_of_cown_fuzz_container_shapes(self):
@@ -813,14 +863,19 @@ def test_cown_of_cown_fuzz_container_shapes(self):
         * ``__slots__``-only instance
         * 2-level ``Cown[Cown[T]]`` chain
 
-        Every trial sends exactly one ``(value, expected)`` tuple
-        on the ``"assert"`` queue; the receive loop drains all 50
-        at the end via :func:`receive_asserts`.
+        Every trial returns its leaf value through a result-cown chain;
+        after :func:`quiesce` each leaf is read with :meth:`Cown.unwrap`.
+        The 2-level ``Cown[Cown[T]]`` shape adds one extra result-cown
+        layer, so its leaf is one ``unwrap()`` deeper.
         """
         n_trials = 50
         n_shapes = 7
         rng = random.Random(0xC0C0)
+        # The last shape index is the Cown(inner) deep case (the else branch
+        # below); keep them in sync if shapes are reordered.
+        deep_shape = n_shapes - 1
 
+        results = []
         for trial in range(n_trials):
             shape = rng.randrange(n_shapes)
             expected = trial * 1000 + 17
@@ -844,41 +899,50 @@ def make(o):
                 else:
                     o.value = Cown(inner)
 
+            if shape == deep_shape:
+                @when(outer)
+                def verify_deep(o):
+                    wrapping = o.value
+
+                    @when(wrapping)
+                    def peel(wc):
+                        leaf = wc.value
+
+                        @when(leaf)
+                        def check_nested(c):
+                            return c.value
+
+                        return check_nested
+
+                    return peel
+
+                results.append((shape, expected, verify_deep))
+                continue
+
             @when(outer)
             def verify(o):
                 container = o.value
-                inner_c = None
-                if isinstance(container, list):
-                    inner_c = container[0]
-                elif isinstance(container, tuple):
+                if isinstance(container, (list, tuple)):
                     inner_c = container[0]
                 elif isinstance(container, dict):
                     inner_c = container["k"]
-                elif isinstance(container, DataClassSlots):
-                    inner_c = container.c
-                elif isinstance(container, (DictOnly, SlotsOnly)):
+                else:
                     inner_c = container.c
-                elif isinstance(container, Cown):
-                    # 2-level chain: schedule against the wrapping
-                    # cown to peel the layer, then check the leaf.
-                    @when(container)
-                    def peel(wc):
-                        leaf = wc.value
 
-                        @when(leaf)
-                        def check_nested(c):
-                            send("assert", (c.value, expected))  # noqa: B023
-                else:
-                    raise AssertionError(
-                        f"unhandled shape {type(container)!r}"
-                    )
+                @when(inner_c)
+                def check(c):
+                    return c.value
+
+                return check
 
-                if inner_c is not None:
-                    @when(inner_c)
-                    def check(c):
-                        send("assert", (c.value, expected))  # noqa: B023
+            results.append((shape, expected, verify))
 
-        receive_asserts(n_trials)
+        quiesce(QUIESCE_TIMEOUT)
+        for shape, expected, verify in results:
+            leaf = verify.unwrap()
+            if shape == deep_shape:
+                leaf = leaf.unwrap()
+            assert leaf.unwrap() == expected
 
     def test_cached_snapshot_survives_entry_overwrite(self):
         """Borrowing reconstructor: snapshot's CownCapsule owns its own ref.
@@ -897,38 +961,28 @@ def test_cached_snapshot_survives_entry_overwrite(self):
 
         @when(anchor)
         def write_initial(a):
-            # ``inner`` arrives here as a capture-tuple reference.
-            # F821: closure variable is ``del``-ed in the enclosing
-            # scope before the behavior runs; the transpiler has
-            # already snapshotted it into the capture tuple.
-            notice_write("k", inner)  # noqa: B023, F821
-            notice_sync()
-
-        @when(anchor, write_initial)
-        def read_then_overwrite(a, _w):
-            snap = noticeboard()
-            stash["k"] = snap["k"]
-            # Overwrite the entry — releases the noticeboard's
-            # nb_pin_cowns +1 on the original inner. After this point
-            # the only strong reference to the original BOCCown is
-            # the one taken by the borrowing reconstructor when
-            # ``snap["k"]`` was materialised above.
+            notice_write("k", inner)  # noqa: F821
+
+        snap = quiesce(QUIESCE_TIMEOUT, noticeboard=True)
+        stash["k"] = snap["k"]
+        del snap
+
+        del inner
+
+        @when(anchor)
+        def overwrite(a):
             notice_write("k", "unrelated_value")
-            notice_sync()
 
-            @when(anchor)
-            def use_stashed(a2):
-                stashed = stash["k"]
+        quiesce(QUIESCE_TIMEOUT, noticeboard=True)
 
-                @when(stashed)
-                def check(s):
-                    send("assert", (s.value, 20))
+        stashed = stash["k"]
 
-        # Drop the main-thread reference; the capture tuple snapshotted
-        # by the ``write_initial`` schedule above still holds it until
-        # that behavior runs.
-        del inner
-        receive_asserts(1)
+        @when(stashed)
+        def check(s):
+            return s.value
+
+        quiesce(QUIESCE_TIMEOUT)
+        assert check.unwrap() == 20
 
 
 class TestAcquireFailureTerminal:
@@ -963,57 +1017,51 @@ def test_three_waiters_after_decode_failure(self):
 
         @when(bad)
         def use_bad_1(b):
-            # Body must not run.
-            send("assert", (b.value, "unreachable_1"))
+            return b.value
 
         @when(bad)
         def use_bad_2(b):
-            send("assert", (b.value, "unreachable_2"))
+            return b.value
 
         @when(bad)
         def use_bad_3(b):
-            send("assert", (b.value, "unreachable_3"))
+            return b.value
 
         @when(use_bad_1)
         def check_1(b):
-            send("assert", (b.exception, True))
-            send("assert", (isinstance(b.value, ZeroDivisionError), True))
+            return (b.exception, isinstance(b.value, ZeroDivisionError))
 
         @when(use_bad_2)
         def check_2(b):
-            send("assert", (b.exception, True))
-            send("assert", (isinstance(b.value, RuntimeError), True))
-            send("assert", ("permanently unavailable" in str(b.value), True))
-            send("repeat_msg", str(b.value))
+            return (
+                b.exception,
+                isinstance(b.value, RuntimeError),
+                "permanently unavailable" in str(b.value),
+                str(b.value),
+            )
 
         @when(use_bad_3)
         def check_3(b):
-            send("assert", (b.exception, True))
-            send("assert", (isinstance(b.value, RuntimeError), True))
-            send("assert", ("permanently unavailable" in str(b.value), True))
-            send("repeat_msg", str(b.value))
-
-        # 2 from check_1 + 3 from check_2 + 3 from check_3.
-        receive_asserts(8)
-
-        # The two terminal-state messages must be byte-identical so a
-        # future regression that started producing per-waiter messages
-        # (e.g. by formatting in the calling interpreter's id) is
-        # caught.
-        msgs = []
-        try:
-            for _ in range(2):
-                result = receive("repeat_msg", RECEIVE_TIMEOUT)
-                assert result[0] != TIMEOUT, (
-                    "timed out waiting for repeat_msg from check_2/check_3"
-                )
-                msgs.append(result[1])
-        finally:
-            drain("repeat_msg")
-        assert msgs[0] == msgs[1], (
-            f"terminal-state messages diverged: {msgs[0]!r} != {msgs[1]!r}"
+            return (
+                b.exception,
+                isinstance(b.value, RuntimeError),
+                "permanently unavailable" in str(b.value),
+                str(b.value),
+            )
+
+        quiesce(QUIESCE_TIMEOUT)
+
+        assert check_1.unwrap() == (True, True)
+
+        c2 = check_2.unwrap()
+        c3 = check_3.unwrap()
+        assert c2[:3] == (True, True, True)
+        assert c3[:3] == (True, True, True)
+
+        assert c2[3] == c3[3], (
+            f"terminal-state messages diverged: {c2[3]!r} != {c3[3]!r}"
         )
-        assert "permanently unavailable" in msgs[0], msgs[0]
+        assert "permanently unavailable" in c2[3], c2[3]
 
 
 class TestBehaviorCapsuleArgsSize:
@@ -1042,13 +1090,9 @@ def test_zero_args_behavior_capsule(self):
         """BehaviorCapsule with empty args list must construct cleanly."""
         from bocpy import start as _start_runtime
         from bocpy._core import BehaviorCapsule
-        # start() is idempotent; no try/except needed on re-entry.
         _start_runtime()
 
         result = Cown(None)
-        # Empty args list — args_size == 0. The
-        # ``args_size > 0 && group_ids == NULL`` guard avoids a
-        # spurious failure if PyMem_RawCalloc(0, ...) returns NULL.
         capsule = BehaviorCapsule(
             "__behavior_zero_args__",
             result.impl,
@@ -1061,13 +1105,9 @@ def test_large_args_behavior_capsule(self):
         """BehaviorCapsule with many args constructs and group_ids works."""
         from bocpy import start as _start_runtime
         from bocpy._core import BehaviorCapsule
-        # start() is idempotent; no try/except needed on re-entry.
         _start_runtime()
 
         result = Cown(None)
-        # 32 distinct cowns with distinct group_ids. Exercises the
-        # group_ids[i] = group_id loop that NULL-derefs without
-        # the alloc check on OOM.
         cowns = [Cown(i) for i in range(32)]
         args = [(i, c.impl) for i, c in enumerate(cowns)]
 
@@ -1098,11 +1138,12 @@ def bad(x):
 
         @when(bad)
         def check(b):
-            send("assert", (b.exception, True))
-            send("assert", (isinstance(b.value, ZeroDivisionError), True))
+            flags = (b.exception, isinstance(b.value, ZeroDivisionError))
             b.value = None
+            return flags
 
-        receive_asserts(2)
+        quiesce(QUIESCE_TIMEOUT)
+        assert check.unwrap() == (True, True)
 
     def test_exception_flag_on_return(self):
         """Returned Exception object has .exception False."""
@@ -1114,10 +1155,10 @@ def returns_exc(x):
 
         @when(returns_exc)
         def check(r):
-            send("assert", (r.exception, False))
-            send("assert", (isinstance(r.value, ValueError), True))
+            return (r.exception, isinstance(r.value, ValueError))
 
-        receive_asserts(2)
+        quiesce(QUIESCE_TIMEOUT)
+        assert check.unwrap() == (False, True)
 
     def test_exception_flag_cleared_on_value_write(self):
         """Writing .value clears the exception flag."""
@@ -1129,11 +1170,13 @@ def bad(x):
 
         @when(bad)
         def check(b):
-            send("assert", (b.exception, True))
+            before = b.exception
             b.value = "fixed"
-            send("assert", (b.exception, False))
+            after = b.exception
+            return (before, after)
 
-        receive_asserts(2)
+        quiesce(QUIESCE_TIMEOUT)
+        assert check.unwrap() == (True, False)
 
     def test_exception_flag_manual_set_clear(self):
         """Manual .exception set and clear works."""
@@ -1141,13 +1184,15 @@ def test_exception_flag_manual_set_clear(self):
 
         @when(x)
         def check(x):
-            send("assert", (x.exception, False))
+            s0 = x.exception
             x.exception = True
-            send("assert", (x.exception, True))
+            s1 = x.exception
             x.exception = False
-            send("assert", (x.exception, False))
+            s2 = x.exception
+            return (s0, s1, s2)
 
-        receive_asserts(3)
+        quiesce(QUIESCE_TIMEOUT)
+        assert check.unwrap() == (False, True, False)
 
     def test_returned_exception_no_unhandled_report(self, capsys):
         """Returned Exception doesn't trigger unhandled exception report."""
@@ -1159,10 +1204,10 @@ def returns_exc(x):
 
         @when(returns_exc)
         def check(r):
-            send("assert", (r.exception, False))
-            send("assert", (isinstance(r.value, ValueError), True))
+            return (r.exception, isinstance(r.value, ValueError))
 
-        receive_asserts(2)
+        quiesce(QUIESCE_TIMEOUT)
+        assert check.unwrap() == (False, True)
         wait()
         captured = capsys.readouterr()
         assert "unhandled exception" not in captured.err.lower()
@@ -1189,13 +1234,11 @@ def test_non_ascii_literal_in_behavior(self):
         x = Cown(0)
 
         @when(x)
-        def _(x):
-            # "€" (U+20AC) is 3 bytes in UTF-8 and a single byte 0x80 in
-            # cp1252; if the export file is not written as UTF-8 the
-            # worker fails to import this module.
-            send("assert", ("€", "€"))
+        def euro(x):
+            return "€"
 
-        receive_asserts()
+        quiesce(QUIESCE_TIMEOUT)
+        assert euro.unwrap() == "€"
 
 
 class TestModuleDunderCapture:
@@ -1220,10 +1263,11 @@ def test_name_resolves_to_user_module(self):
         expected = __name__
 
         @when(x)
-        def _(x):
-            send("assert", (__name__, expected))  # noqa: B023
+        def read_name(x):
+            return __name__
 
-        receive_asserts()
+        quiesce(QUIESCE_TIMEOUT)
+        assert read_name.unwrap() == expected
 
     def test_package_resolves_to_user_module(self):
         """__package__ inside a behavior matches the user module's value."""
@@ -1231,25 +1275,11 @@ def test_package_resolves_to_user_module(self):
         expected = __package__
 
         @when(x)
-        def _(x):
-            send("assert", (__package__, expected))  # noqa: B023
-
-        receive_asserts()
+        def read_package(x):
+            return __package__
 
-
-# ---------------------------------------------------------------------------
-# Cross-worker scheduling and cown-identity round-trip invariants.
-#
-# These two properties of the BOC runtime are not asserted directly by
-# any of the @when / Cown / capture tests above:
-#
-#   1. With workers >= 2, behaviors really run on more than one worker
-#      thread. Without this, every "parallel" workload degenerates to
-#      single-threaded throughput.
-#   2. A Cown round-tripped through XIData into a worker arrives back
-#      as a CownCapsule. This exercises the XIData round-trip path
-#      that the 2PL dedup machinery relies on.
-# ---------------------------------------------------------------------------
+        quiesce(QUIESCE_TIMEOUT)
+        assert read_package.unwrap() == expected
 
 
 class TestCrossWorker:
@@ -1257,12 +1287,8 @@ class TestCrossWorker:
 
     @classmethod
     def teardown_class(cls):
-        """Drain leftover tagged messages so subsequent tests start clean."""
-        for tag in ("probe_tid", "probe_id"):
-            try:
-                drain(tag)
-            except Exception:
-                pass
+        """Drain the runtime after the cross-worker probes."""
+        wait()
 
     def test_two_workers_observe_distinct_thread_ids(self):
         """At workers=2, >=2 distinct worker thread ids must appear."""
@@ -1276,21 +1302,19 @@ def test_two_workers_observe_distinct_thread_ids(self):
         cells = [Cown(0) for _ in range(tid_samples)]
 
         start(worker_count=2)
-        try:
-            for c in cells:
-                @when(c)
-                def _tid(_c):
-                    send("probe_tid", threading.get_ident())
-        finally:
-            del cells
-            wait()
+        readers = []
+        for c in cells:
+            @when(c)
+            def _tid(_c):
+                deadline = time.perf_counter() + 0.01
+                while time.perf_counter() < deadline:
+                    pass
+                return threading.get_ident()
+
+            readers.append(_tid)
 
-        thread_ids = set()
-        for _ in range(tid_samples):
-            msg = receive(["probe_tid"], RECEIVE_TIMEOUT)
-            assert msg is not None and msg[0] != TIMEOUT, (
-                "thread-id probe timed out")
-            thread_ids.add(msg[1])
+        quiesce(QUIESCE_TIMEOUT)
+        thread_ids = {r.unwrap() for r in readers}
 
         assert len(thread_ids) >= 2, (
             f"only {len(thread_ids)} distinct worker thread id observed "
@@ -1314,23 +1338,18 @@ def test_cown_round_trips_through_xidata(self):
         seen = {}
 
         start(worker_count=2)
-        try:
-            for idx, cell in enumerate(ring):
-                # The transpiler auto-captures `idx` and `cell` as free
-                # variables; do NOT use the `idx=idx` default-arg trick
-                # — it confuses the worker module export.
-                @when(cell)
-                def _probe(c):
-                    send("probe_id", (idx, c))  # noqa: B023
-            for _ in range(ring_size):
-                msg = receive(["probe_id"], RECEIVE_TIMEOUT)
-                assert msg is not None and msg[0] != TIMEOUT, (
-                    "identity probe timed out")
-                _, (probe_idx, probe_cown) = msg
-                seen[probe_idx] = probe_cown
-        finally:
-            del ring
-            wait()
+        probes = []
+        for idx, cell in enumerate(ring):
+            @when(cell)
+            def _probe(c):
+                return (idx, c)  # noqa: B023
+
+            probes.append(_probe)
+
+        quiesce(QUIESCE_TIMEOUT)
+        for p in probes:
+            probe_idx, probe_cown = p.unwrap()
+            seen[probe_idx] = probe_cown
 
         for idx in range(ring_size):
             observed = seen.get(idx)
@@ -1378,22 +1397,17 @@ def test_traceback_resolves_via_linecache(self):
         """
         c = Cown(0)
         start(worker_count=2)
-        try:
-            @when(c)
-            def _b(c):  # noqa: B023
-                try:
-                    raise RuntimeError("synthetic-from-test-traceback")
-                except RuntimeError:
-                    send("tb_done", traceback.format_exc())
-            tag, tb_str = receive(["tb_done"], RECEIVE_TIMEOUT)
-            assert tag != TIMEOUT, "traceback probe timed out"
-        finally:
-            drain("tb_done")
-            wait()
 
-        # The traceback must reference the synthetic bootstrap
-        # filename ``<bocpy:__bocmain__>`` (the test module is the
-        # worker's __main__ alias).
+        @when(c)
+        def _b(c):
+            try:
+                raise RuntimeError("synthetic-from-test-traceback")
+            except RuntimeError:
+                return traceback.format_exc()
+
+        quiesce(QUIESCE_TIMEOUT)
+        tb_str = _b.unwrap()
+
         assert "<bocpy:" in tb_str, (
             f"traceback did not reference synthetic filename; got:\n{tb_str}"
         )
@@ -1408,28 +1422,10 @@ def test_tricky_source_round_trips(self):
         """
         c = Cown(0)
         start(worker_count=2)
-        try:
-            @when(c)
-            def _(c):  # noqa: B023
-                # 1. Non-ASCII identifier-class literal
-                # 2. Embedded quotes of every flavour
-                # 3. Triple-quoted string literal
-                # 4. Backslash and raw-string-style content
-                # 5. Surrogate-free Unicode (U+1F600 grinning face)
-                # 6. NUL byte in a literal — repr() must escape it
-                payload = (
-                    "héllo",
-                    'mix "single" and \'double\' quotes',
-                    """triple-quoted with embedded "quote" and 'apostrophe'""",
-                    r"raw \n not a newline",
-                    'back\\slash and "escaped quote"',
-                    "emoji \U0001F600 in literal",
-                    "with\x00nul",
-                )
-                send("tricky_done", payload)
-            tag, payload = receive(["tricky_done"], RECEIVE_TIMEOUT)
-            assert tag != TIMEOUT, "tricky-source probe timed out"
-            assert payload == (
+
+        @when(c)
+        def _tricky(c):
+            return (
                 "héllo",
                 'mix "single" and \'double\' quotes',
                 """triple-quoted with embedded "quote" and 'apostrophe'""",
@@ -1437,10 +1433,18 @@ def _(c):  # noqa: B023
                 'back\\slash and "escaped quote"',
                 "emoji \U0001F600 in literal",
                 "with\x00nul",
-            ), f"payload round-trip mismatch: {payload!r}"
-        finally:
-            drain("tricky_done")
-            wait()
+            )
+
+        quiesce(QUIESCE_TIMEOUT)
+        assert _tricky.unwrap() == (
+            "héllo",
+            'mix "single" and \'double\' quotes',
+            """triple-quoted with embedded "quote" and 'apostrophe'""",
+            r"raw \n not a newline",
+            'back\\slash and "escaped quote"',
+            "emoji \U0001F600 in literal",
+            "with\x00nul",
+        ), "payload round-trip mismatch"
 
     def test_module_name_with_quote_rejected(self):
         """``module_name`` containing a double-quote is rejected at start().
@@ -1452,29 +1456,16 @@ def test_module_name_with_quote_rejected(self):
         is still nonsensical and the boundary check refuses it with
         a ``ValueError``.
         """
-        # Reach Behaviors.start directly so we can pass an arbitrary
-        # module name. We cannot use the public ``bocpy.start()``
-        # entry point because it overrides ``module`` from the
-        # caller's frame.
         from bocpy import behaviors as _behaviors
 
-        wait()  # ensure no live runtime
+        wait()
         b = _behaviors.Behaviors(2)
-        # Provide a path that exists so export_module_from_file does not
-        # raise on FileNotFoundError before reaching the validation.
-        # The transpiler will parse this test file itself; the body
-        # never runs because the validation fires first.
         with pytest.raises(ValueError, match="dotted Python module path"):
             b.start(module=('a"b', __file__))
 
 
-# ---------------------------------------------------------------------------
-# NaN/Inf timeout helper
-# ---------------------------------------------------------------------------
-
-
 class TestTimeoutValidation:
-    """Boundary validation for wait/notice_sync_wait timeouts.
+    """Boundary validation for wait timeouts.
 
     The C-level ``boc_validate_finite_timeout`` helper rejects NaN with
     ``ValueError``, treats ``+Inf`` as "wait forever", and clamps
@@ -1493,12 +1484,6 @@ def test_terminator_wait_nan_timeout_raises_value_error(self):
         with pytest.raises(ValueError, match="NaN"):
             _core.terminator_wait(float("nan"))
 
-    def test_notice_sync_wait_nan_timeout_raises_value_error(self):
-        """NaN timeout to ``_core.notice_sync_wait`` raises ``ValueError``."""
-        from bocpy import _core
-        with pytest.raises(ValueError, match="NaN"):
-            _core.notice_sync_wait(0, float("nan"))
-
     def test_wait_inf_timeout_blocks_until_done(self):
         """``+Inf`` timeout treats wait as "wait forever" and returns once done.
 
@@ -1507,74 +1492,81 @@ def test_wait_inf_timeout_blocks_until_done(self):
         than blocking. The point is that it does *not* raise.
         """
         from bocpy import _core
-        # No runtime has incremented the terminator, so this returns at
-        # once. The test exists to assert +Inf is accepted (not ValueError).
         assert _core.terminator_wait(float("inf")) is True
 
     def test_terminator_wait_negative_timeout_returns_immediately(self):
         """Negative timeout to ``_core.terminator_wait`` is mapped to wait_forever.
 
-        bocpy's existing convention treats negatives as "wait forever"
-        (matching the historical Python-side semantics). The new
-        validator preserves that behaviour for negatives — only NaN is
-        upgraded to a hard error. With no live runtime the terminator
+        bocpy treats negative timeouts as "wait forever"; the timeout
+        validator keeps that behaviour and turns only NaN into a hard
+        error. With no live runtime the terminator
         is already at 0, so this returns immediately either way.
         """
         from bocpy import _core
-        # Returns True immediately because count is already 0.
         assert _core.terminator_wait(-1.0) is True
 
 
-# ---------------------------------------------------------------------------
-# BaseException discipline
-# ---------------------------------------------------------------------------
-
-
 class TestBaseExceptionDiscipline:
-    """KeyboardInterrupt in a @when body releases the cown.
-
-    Without ``finally``-based cleanup, ``except Exception`` arms in
-    ``worker.py`` and the orphan-drain loop in ``behaviors.py``
-    silently let ``KeyboardInterrupt`` / ``SystemExit`` escape past
-    the per-iteration cleanup. The MCS chain would stay linked, the
-    cown would stay owned, and every successor on it would strand.
+    """A ``BaseException`` escaping a @when body still releases the cown.
+
+    The worker's per-behavior cleanup and the orphan-drain loop in
+    ``behaviors.py`` must release cowns in a ``finally``, not only
+    under ``except Exception``. ``KeyboardInterrupt`` and
+    ``SystemExit`` derive from ``BaseException`` (not ``Exception``),
+    so an ``except Exception`` arm lets them escape past the
+    per-iteration cleanup: the MCS chain stays linked, the cown stays
+    owned, and every successor on it strands.
+
+    Note: these tests *explicitly* ``raise KeyboardInterrupt`` (from a
+    behavior body or a patched drain hook) — they do not simulate a
+    Ctrl-C / SIGINT. A signal-driven ``KeyboardInterrupt`` can only
+    surface on the main thread of the main interpreter, never inside a
+    worker sub-interpreter; ``KeyboardInterrupt`` is used here purely as
+    the canonical non-``Exception`` ``BaseException`` for the cleanup path.
     """
 
     @classmethod
     def teardown_class(cls):
         wait()
-        drain("ki_done")
-
-    def test_keyboard_interrupt_during_worker_releases_cown(self):
-        """A ``KeyboardInterrupt`` from a @when body releases the cown.
 
-        Schedules a behavior that raises ``KeyboardInterrupt``, then
-        a follow-on behavior on the same cown. If the
-        ``finally``-based release / release_all chain is wired
-        correctly, the follow-on runs and the test sees its message.
-        Otherwise the cown is stranded and ``receive`` times out.
+    def test_base_exception_from_worker_body_releases_cown(self):
+        """An explicitly-raised ``BaseException`` releases the cown.
+
+        Schedules a behavior that does ``raise KeyboardInterrupt`` (a
+        ``BaseException``, not an ``Exception``), then a follow-on
+        behavior on the same cown. If the worker's ``finally``-based
+        release / release_all chain is wired correctly, the cown is
+        released even though the escaping exception is not an
+        ``Exception``, so the follow-on runs and ``_follow.unwrap()``
+        returns its value. If cleanup were gated on ``except
+        Exception`` the cown would stay owned and ``quiesce`` would
+        time out.
+
+        The worker captures the escaped ``BaseException`` onto
+        ``_raise``'s result cown, so ``_raise.unwrap()`` re-raises it;
+        consuming it here also keeps it from being reported as
+        unhandled at teardown.
         """
         wait()
         start(worker_count=2)
-        try:
-            c = Cown(0)
 
-            @when(c)
-            def _raise(c):
-                raise KeyboardInterrupt("intentional KI")
+        c = Cown(0)
 
-            @when(c)
-            def _follow(c):
-                send("ki_done", "ok")
+        @when(c)
+        def _raise(c):
+            raise KeyboardInterrupt("intentional KI")
 
-            tag, payload = receive("ki_done", RECEIVE_TIMEOUT)
-            assert tag != TIMEOUT, (
-                "follow-on never ran -- cown was not released after KI"
-            )
-            assert payload == "ok"
-        finally:
-            drain("ki_done")
-            wait()
+        @when(c)
+        def _follow(c):
+            return "ok"
+
+        quiesce(QUIESCE_TIMEOUT)
+        with pytest.raises(KeyboardInterrupt, match="intentional KI"):
+            _raise.unwrap()
+        assert _follow.unwrap() == "ok", (
+            "follow-on never ran -- cown was not released after the "
+            "BaseException escaped the body"
+        )
 
     def test_keyboard_interrupt_during_orphan_drain_completes_drain(self):
         """KI mid-drain still drains the remaining orphans.
@@ -1585,23 +1577,13 @@ def test_keyboard_interrupt_during_orphan_drain_completes_drain(self):
         orphans before the deferred KI is re-raised, so no MCS chain or
         terminator hold leaks.
         """
-        # NOT ``unittest.mock``: workers re-import this module, and
-        # mock pulls in asyncio which can deadlock under PEP 684 init
-        # on macOS arm64.
         from mockreplacement import patch_attr
 
         from bocpy import behaviors as _behaviors
 
         wait()
-        # Build a Behaviors directly so we can drive _drain_orphan_behaviors
-        # against synthetic capsules without standing up the full runtime.
         b = _behaviors.Behaviors(2)
 
-        # Synthetic capsule that records its release_all call. We do
-        # NOT actually inject these into the C scheduler queue; instead
-        # we monkey-patch `_core.scheduler_drain_all_queues` to return
-        # them, and patch `_core.terminator_dec` to be a no-op so the
-        # test does not touch global C state.
         class FakeCapsule:
             def __init__(self):
                 self.set_drop_called = False
@@ -1616,14 +1598,11 @@ def release_all(self):
         cap_ki = FakeCapsule()
         cap_ok = FakeCapsule()
 
-        # First call returns both capsules; second call returns [] so
-        # the drain loop terminates cleanly.
         drain_returns = [[cap_ki, cap_ok], []]
 
         def fake_drain():
             return drain_returns.pop(0) if drain_returns else []
 
-        # Make set_drop_exception on cap_ki raise KI; cap_ok works normally.
         original_set_drop = FakeCapsule.set_drop_exception
 
         def patched_set_drop(self, exc):
@@ -1645,13 +1624,9 @@ def _fake_terminator_dec(*args, **kwargs):
             with pytest.raises(KeyboardInterrupt, match="orphan-drain KI"):
                 b._drain_orphan_behaviors()
 
-        # cap_ok must still have had its release_all called -- the KI on
-        # cap_ki did not abort the drain partway.
         assert cap_ok.released, (
             "second orphan was not drained -- KI aborted the loop"
         )
-        # cap_ki's release_all was attempted too (the KI was raised
-        # from set_drop_exception, which runs *before* release_all).
         assert cap_ki.released
 
 
@@ -1687,11 +1662,8 @@ def test_decorator_modifies_return_value(self):
         def doubled_plus_one(x):
             return x.value * 2
 
-        @when(doubled_plus_one)
-        def _(result):
-            send("assert", (result.value, 21))
-
-        receive_asserts()
+        quiesce(QUIESCE_TIMEOUT)
+        assert doubled_plus_one.unwrap() == 21
 
     def test_stacked_below_decorators_apply_in_order(self):
         """Stacked below-decorators compose innermost-first on the worker.
@@ -1708,11 +1680,8 @@ def test_stacked_below_decorators_apply_in_order(self):
         def composed(x):
             return x.value
 
-        @when(composed)
-        def _(result):
-            send("assert", (result.value, 22))
-
-        receive_asserts()
+        quiesce(QUIESCE_TIMEOUT)
+        assert composed.unwrap() == 22
 
     def test_below_decorator_inside_nested_when(self):
         """A nested ``@when`` body may itself carry a below-decorator."""
@@ -1726,11 +1695,10 @@ def outer(x):
             def inner(y):
                 return y.value
 
-            @when(inner)
-            def _(result):
-                send("assert", (result.value, 8))
+            return inner
 
-        receive_asserts()
+        quiesce(QUIESCE_TIMEOUT)
+        assert outer.unwrap().unwrap() == 8
 
 
 class TestLoopDefaultCapture:
@@ -1744,12 +1712,17 @@ def teardown_class(cls):
     def test_loop_default_captures_per_iteration_value(self):
         """``i=i`` captures the loop value at schedule time, not at execution."""
         c = Cown(0)
+        readers = []
         for i in range(4):
             @when(c)
-            def _(c, i=i):
-                send("assert", (i, i))
+            def read(c, i=i):
+                return i
+
+            readers.append(read)
 
-        receive_asserts(4)
+        quiesce(QUIESCE_TIMEOUT)
+        for idx, r in enumerate(readers):
+            assert r.unwrap() == idx
 
     def test_rename_default_binds_into_param(self):
         """``def b(c, x=y)`` — capture ``y`` from caller, bind into ``x``."""
@@ -1757,7 +1730,8 @@ def test_rename_default_binds_into_param(self):
         y = 99
 
         @when(c)
-        def _(c, x=y):
-            send("assert", (x, 99))
+        def read(c, x=y):
+            return x
 
-        receive_asserts()
+        quiesce(QUIESCE_TIMEOUT)
+        assert read.unwrap() == 99
diff --git a/test/test_build_sbom.py b/test/test_build_sbom.py
index 64baf06..299addb 100644
--- a/test/test_build_sbom.py
+++ b/test/test_build_sbom.py
@@ -24,10 +24,6 @@
 REPO_ROOT = Path(__file__).resolve().parent.parent
 
 
-# ---------------------------------------------------------------------------
-# Helpers
-# ---------------------------------------------------------------------------
-
 DIST = "bocpy"
 VERSION = "0.6.0"
 DIST_INFO = f"{DIST}-{VERSION}.dist-info"
@@ -87,11 +83,6 @@ def _read_record_rows(wheel_path: Path) -> list[tuple[str, str, str]]:
     return rows
 
 
-# ---------------------------------------------------------------------------
-# Pure-function tests
-# ---------------------------------------------------------------------------
-
-
 def test_build_sbom_document_minimal_shape():
     """A no-extras document carries the required CycloneDX fields."""
     doc = build_sbom.build_sbom_document(
@@ -116,9 +107,8 @@ def test_build_sbom_document_minimal_shape():
     assert root["purl"] == "pkg:pypi/bocpy@0.6.0"
     assert root["bom-ref"] == "pkg:pypi/bocpy@0.6.0"
     assert root["licenses"] == [{"license": {"id": "MIT"}}]
-    assert "properties" not in root  # no git sha, no wheel filename
+    assert "properties" not in root
 
-    # bocpy has zero third-party runtime deps.
     assert doc["components"] == []
     assert doc["dependencies"] == [
         {"ref": "pkg:pypi/bocpy@0.6.0", "dependsOn": []}
@@ -160,17 +150,9 @@ def test_build_sbom_document_serialises_to_stable_json():
     )
     serialised = json.dumps(doc, indent=2, sort_keys=True)
     reloaded = json.loads(serialised)
-    # ``serialNumber`` is deterministic for a fixed input set (see
-    # the determinism tests below); JSON round-trip should therefore
-    # be lossless for every field including the serial.
     assert doc == reloaded
 
 
-# ---------------------------------------------------------------------------
-# Wheel-injection round-trip
-# ---------------------------------------------------------------------------
-
-
 def test_inject_sbom_round_trip(tmp_path: Path) -> None:
     """Injecting an SBOM lands at the PEP 770 path with a correct RECORD."""
     wheel_path = tmp_path / f"{DIST}-{VERSION}-cp314-cp314-linux_x86_64.whl"
@@ -197,21 +179,15 @@ def test_inject_sbom_round_trip(tmp_path: Path) -> None:
     with zipfile.ZipFile(wheel_path, "r") as wheel:
         names = wheel.namelist()
         assert sbom_arc in names
-        # Original payload entries are still present and untouched.
         assert f"{DIST}/__init__.py" in names
         assert f"{DIST}/_core.so" in names
         assert wheel.read(f"{DIST}/_core.so") == PROBE_PAYLOAD
-        # The injected SBOM is exactly the bytes we passed in.
         assert wheel.read(sbom_arc) == sbom_bytes
 
-    # RECORD covers every data entry (including the new SBOM) and has
-    # an empty row for itself.
     rows = _read_record_rows(wheel_path)
     paths = [r[0] for r in rows]
     assert sbom_arc in paths
-    # The very last row is the empty-hash entry for RECORD itself.
     assert rows[-1] == (f"{DIST_INFO}/RECORD", "", "")
-    # Verify every other row's hash matches the actual archive bytes.
     with zipfile.ZipFile(wheel_path, "r") as wheel:
         for path, hash_spec, size in rows[:-1]:
             data = wheel.read(path)
@@ -231,14 +207,11 @@ def test_inject_sbom_replaces_existing_sbom(tmp_path: Path) -> None:
     wheel_path = tmp_path / f"{DIST}-{VERSION}-cp314-cp314-linux_x86_64.whl"
     _build_probe_wheel(wheel_path)
 
-    # First injection.
     build_sbom.inject_sbom_into_wheel(wheel_path, b'{"first": true}\n')
-    # Second injection with different bytes.
     build_sbom.inject_sbom_into_wheel(wheel_path, b'{"second": true}\n')
 
     with zipfile.ZipFile(wheel_path, "r") as wheel:
         sbom_arc = f"{DIST_INFO}/sboms/bocpy.cdx.json"
-        # No duplicates.
         assert wheel.namelist().count(sbom_arc) == 1
         assert wheel.read(sbom_arc) == b'{"second": true}\n'
 
@@ -258,7 +231,6 @@ def test_inject_sbom_does_not_leave_tmp_on_failure(
     wheel_path = tmp_path / f"{DIST}-{VERSION}-cp314-cp314-linux_x86_64.whl"
     _build_probe_wheel(wheel_path)
 
-    # Force ``shutil.move`` to blow up after the new wheel is written.
     def _boom(src: str, dst: str) -> str:
         raise OSError("disk full")
 
@@ -271,11 +243,6 @@ def _boom(src: str, dst: str) -> str:
     assert leftover == [], f"tmp files leaked: {leftover}"
 
 
-# ---------------------------------------------------------------------------
-# pyproject.toml metadata smoke
-# ---------------------------------------------------------------------------
-
-
 def test_read_pyproject_metadata_round_trip() -> None:
     """The metadata reader returns the fields the SBOM generator needs."""
     meta = build_sbom._read_pyproject_metadata(REPO_ROOT)
@@ -287,11 +254,6 @@ def test_read_pyproject_metadata_round_trip() -> None:
     assert meta["vcs"].startswith("https://")
 
 
-# ---------------------------------------------------------------------------
-# CLI ``inject`` mode
-# ---------------------------------------------------------------------------
-
-
 def test_cli_inject_copy_to_leaves_original_alone(tmp_path: Path) -> None:
     """``inject --copy-to DIR`` copies, injects into the copy, and leaves the source pristine."""
     src_dir = tmp_path / "src"
@@ -316,10 +278,8 @@ def test_cli_inject_copy_to_leaves_original_alone(tmp_path: Path) -> None:
     )
     assert rc == 0
 
-    # Source wheel untouched.
     assert wheel_path.read_bytes() == original_bytes
 
-    # Destination wheel has the embedded SBOM under the PEP 770 path.
     copied = dest_dir / wheel_path.name
     assert copied.is_file()
     with zipfile.ZipFile(copied, "r") as wheel:
@@ -338,16 +298,6 @@ def test_cli_inject_copy_to_leaves_original_alone(tmp_path: Path) -> None:
     assert props["cdx:python:wheel_filename"] == wheel_path.name
 
 
-# ---------------------------------------------------------------------------
-# ``inject_sbom_into_wheel`` must preserve per-entry ZIP metadata
-# (``external_attr``, ``create_system``, ``compress_type``,
-# ``date_time``). ``auditwheel``/``delocate`` wheels rely on the upper
-# 16 bits of ``external_attr`` to mark symlinked SONAMEs; losing them
-# turns ``libfoo.so.1 -> libfoo.so.1.2.3`` into a regular file whose
-# contents are the link target's text.
-# ---------------------------------------------------------------------------
-
-
 def _build_attr_probe_wheel(path: Path) -> None:
     """Build a wheel whose entries exercise the preservation contract.
 
@@ -365,17 +315,14 @@ def _build_attr_probe_wheel(path: Path) -> None:
 
     init_info = zipfile.ZipInfo(filename=f"{DIST}/__init__.py", date_time=pinned_dt)
     init_info.compress_type = zipfile.ZIP_DEFLATED
-    init_info.create_system = 3  # Unix
-    init_info.external_attr = (0o644 & 0xFFFF) << 16  # -rw-r--r--
+    init_info.create_system = 3
+    init_info.external_attr = (0o644 & 0xFFFF) << 16
 
     so_info = zipfile.ZipInfo(filename=f"{DIST}/_core.so", date_time=pinned_dt)
-    so_info.compress_type = zipfile.ZIP_STORED  # deliberately uncompressed
+    so_info.compress_type = zipfile.ZIP_STORED
     so_info.create_system = 3
-    so_info.external_attr = (0o755 & 0xFFFF) << 16  # -rwxr-xr-x
+    so_info.external_attr = (0o755 & 0xFFFF) << 16
 
-    # Symlink: external_attr upper 16 bits = stat mode with S_IFLNK set;
-    # payload bytes are the link target string. This is exactly the
-    # shape that auditwheel emits for vendored SONAMEs.
     link_info = zipfile.ZipInfo(
         filename=f"{DIST}/libprobe.so.1", date_time=pinned_dt
     )
@@ -434,8 +381,6 @@ def test_inject_sbom_preserves_per_entry_zip_attributes(tmp_path: Path) -> None:
     wheel_path = tmp_path / f"{DIST}-{VERSION}-cp314-cp314-manylinux_2_28_x86_64.whl"
     _build_attr_probe_wheel(wheel_path)
 
-    # Capture the source metadata BEFORE injection so we know what to
-    # check for afterward.
     with zipfile.ZipFile(wheel_path, "r") as src:
         before = {info.filename: info for info in src.infolist()}
 
@@ -444,44 +389,34 @@ def test_inject_sbom_preserves_per_entry_zip_attributes(tmp_path: Path) -> None:
     with zipfile.ZipFile(wheel_path, "r") as wheel:
         after = {info.filename: info for info in wheel.infolist()}
 
-    # Every original entry is still present.
     for arcname in before:
         assert arcname in after, f"entry {arcname!r} dropped by injector"
 
-    # 1. Symlink bit on libprobe.so.1 survives — the auditwheel
-    #    SONAME case.
     sym = after[f"{DIST}/libprobe.so.1"]
     sym_mode = (sym.external_attr >> 16) & 0xFFFF
     assert stat.S_ISLNK(sym_mode), (
         f"S_IFLNK lost on symlink entry: external_attr=0x{sym.external_attr:08x}, "
         f"high-bits mode=0o{sym_mode:o}"
     )
-    # Permission bits on the symlink also preserved.
     assert (sym_mode & 0o777) == 0o777
 
-    # 2. Executable bit on the .so survives, and the entry stays
-    #    ZIP_STORED — no silent re-DEFLATE.
     so = after[f"{DIST}/_core.so"]
     assert ((so.external_attr >> 16) & 0o777) == 0o755
     assert so.compress_type == zipfile.ZIP_STORED, (
         f"ZIP_STORED entry was recompressed: got {so.compress_type!r}"
     )
 
-    # 3. Regular module: mode bits, create_system, and date_time
-    #    all preserved.
     init = after[f"{DIST}/__init__.py"]
     init_src = before[f"{DIST}/__init__.py"]
     assert ((init.external_attr >> 16) & 0o777) == 0o644
     assert init.create_system == init_src.create_system == 3
     assert init.date_time == init_src.date_time == (2024, 1, 1, 12, 0, 0)
 
-    # 4. The injected SBOM landed and is reachable.
     sbom_arc = f"{DIST_INFO}/sboms/bocpy.cdx.json"
     assert sbom_arc in after
     with zipfile.ZipFile(wheel_path, "r") as wheel:
         assert wheel.read(sbom_arc) == b'{"v": 1}\n'
 
-    # 5. RECORD covers every entry and self-row is empty.
     rows = _read_record_rows(wheel_path)
     paths = [r[0] for r in rows]
     assert f"{DIST}/libprobe.so.1" in paths
@@ -489,16 +424,6 @@ def test_inject_sbom_preserves_per_entry_zip_attributes(tmp_path: Path) -> None:
     assert rows[-1] == (f"{DIST_INFO}/RECORD", "", "")
 
 
-# ---------------------------------------------------------------------------
-# SBOMs must be deterministic for a fixed input set so wheel hashes
-# do not drift across rebuilds of the same source tree. `serialNumber`
-# is a UUIDv5 derived from
-# `<name>@<version>+<git_commit>+<wheel_filename>` under a stable
-# bocpy namespace; `metadata.timestamp` honours `SOURCE_DATE_EPOCH`
-# when set, per the freedesktop reproducible-build convention.
-# ---------------------------------------------------------------------------
-
-
 def _sbom_inputs() -> dict:
     """Fixed input set for the determinism tests."""
     return dict(
@@ -523,14 +448,11 @@ def test_build_sbom_document_is_byte_identical_for_same_inputs(
     source, drifting the wheel hash across rebuilds. The deterministic
     UUIDv5 serial + ``SOURCE_DATE_EPOCH`` path keep them byte-stable.
     """
-    # 2024-01-01T12:00:00Z (chosen to also exercise an even-minute,
-    # even-hour rounding so off-by-one errors are visible).
     monkeypatch.setenv("SOURCE_DATE_EPOCH", "1704110400")
 
     doc1 = build_sbom.build_sbom_document(**_sbom_inputs())
     doc2 = build_sbom.build_sbom_document(**_sbom_inputs())
 
-    # JSON-encode under the same settings the injection path uses.
     bytes1 = (json.dumps(doc1, indent=2, sort_keys=True) + "\n").encode()
     bytes2 = (json.dumps(doc2, indent=2, sort_keys=True) + "\n").encode()
     assert bytes1 == bytes2, (
@@ -540,17 +462,10 @@ def test_build_sbom_document_is_byte_identical_for_same_inputs(
         f"vs {doc2['metadata']['timestamp']!r}"
     )
 
-    # The pinned epoch round-trips through ``strftime`` exactly.
     assert doc1["metadata"]["timestamp"] == "2024-01-01T12:00:00Z"
 
-    # The serial number is UUIDv5 (version digit ``5`` and standard
-    # variant ``[89ab]``), which the validator pins.
     serial = doc1["serialNumber"]
     assert serial.startswith("urn:uuid:")
-    # The version digit lives at the start of the UUID's third group:
-    # "urn:uuid:XXXXXXXX-XXXX-VXXX-VXXX-XXXXXXXXXXXX"
-    #  012345678 90123456 7 8901 2 3
-    #                              ^ index 23
     assert serial[23] == "5", (
         f"expected UUIDv5 serial, got version digit {serial[23]!r} in {serial!r}"
     )
@@ -588,9 +503,6 @@ def test_build_sbom_document_falls_back_to_now_without_source_date_epoch(
     monkeypatch.delenv("SOURCE_DATE_EPOCH", raising=False)
     doc = build_sbom.build_sbom_document(**_sbom_inputs())
     ts = doc["metadata"]["timestamp"]
-    # ``YYYY-MM-DDTHH:MM:SSZ`` shape; the validator regex is the
-    # canonical reference but we duplicate the assertion here so a
-    # build_sbom-only regression fires close to the source.
     assert len(ts) == 20
     assert ts[4] == ts[7] == "-"
     assert ts[10] == "T"
@@ -623,15 +535,9 @@ def test_validate_sbom_accepts_deterministic_serial() -> None:
     import validate_sbom
 
     doc = build_sbom.build_sbom_document(**_sbom_inputs())
-    # Must not raise.
     validate_sbom.validate_sbom_document(doc)
 
 
-# ---------------------------------------------------------------------------
-# CLI ``generate`` mode — distinguisher guard
-# ---------------------------------------------------------------------------
-
-
 def test_cli_generate_requires_distinguisher_when_neither_provided(
     tmp_path: Path, monkeypatch: pytest.MonkeyPatch, capsys: pytest.CaptureFixture[str],
 ) -> None:
@@ -731,11 +637,6 @@ def test_cli_generate_accepts_wheel_filename_alone(
     assert out_path.is_file()
 
 
-# ---------------------------------------------------------------------------
-# PyPI RECORD-mismatch regression (the 0.7.0 warning email bug)
-# ---------------------------------------------------------------------------
-
-
 def _build_probe_wheel_with_dir_entries(path: Path) -> None:
     """Build a probe wheel that includes explicit ZIP directory entries.
 
@@ -758,8 +659,6 @@ def _build_probe_wheel_with_dir_entries(path: Path) -> None:
 
     record_buf = io.StringIO()
     writer = csv.writer(record_buf, lineterminator="\n")
-    # Mimic the pre-fix injector exactly: emit a RECORD row for every
-    # ZIP entry, including the directories (empty hash, size 0).
     for arcname, data in file_entries:
         writer.writerow(_record_row(arcname, data))
     for arcname in dir_entries:
@@ -806,17 +705,13 @@ def test_inject_sbom_strips_directory_entries(tmp_path: Path) -> None:
     )
     _build_probe_wheel_with_dir_entries(wheel_path)
 
-    # Sanity check: the pre-injection wheel reproduces the 0.7.0
-    # failure (i.e. our regression fixture is faithful).
     with pytest.raises(InvalidWheelRecordError):
         validate_record(str(wheel_path))
 
     build_sbom.inject_sbom_into_wheel(wheel_path, b'{"v": 1}\n')
 
-    # 1. PyPI's validator now accepts the wheel.
     validate_record(str(wheel_path))
 
-    # 2. No directory entries leaked into the new ZIP or RECORD.
     with zipfile.ZipFile(wheel_path, "r") as wheel:
         for info in wheel.infolist():
             assert not info.is_dir(), (
diff --git a/test/test_compat_atomics.py b/test/test_compat_atomics.py
index 893f5c2..2c07856 100644
--- a/test/test_compat_atomics.py
+++ b/test/test_compat_atomics.py
@@ -23,8 +23,6 @@
     reason="internal test extension not built (set BOCPY_BUILD_INTERNAL_TESTS=1 and reinstall)",
 )
 
-# Bind the atomics-domain methods under the historical `ca.*` name so
-# the body of this file stays readable and untouched.
 ca = SimpleNamespace(
     make_state=_it.atomics_make_state,
     reset=_it.atomics_reset,
@@ -65,11 +63,6 @@ def test_reset_zeros_all_slots():
     assert ca.load_counter32(h) == 0
 
 
-# ---------------------------------------------------------------------------
-# Acquire / release handshake
-# ---------------------------------------------------------------------------
-
-
 @pytest.mark.parametrize("payload", [
     1,
     0xDEADBEEF,
@@ -116,11 +109,6 @@ def consume(h=h, result=result):
         assert result == [payload], f"iteration {i}: expected {payload:#x}, got {result[0]:#x}"
 
 
-# ---------------------------------------------------------------------------
-# Multi-thread fetch_add contention
-# ---------------------------------------------------------------------------
-
-
 @pytest.mark.parametrize("threads,per_thread", [
     (2, 50_000),
     (4, 50_000),
@@ -164,11 +152,6 @@ def test_fetch_add_u32_contention(threads, per_thread):
     assert ca.load_counter32(h) == threads * per_thread
 
 
-# ---------------------------------------------------------------------------
-# Multi-thread CAS contention
-# ---------------------------------------------------------------------------
-
-
 @pytest.mark.parametrize("threads,per_thread", [
     (2, 25_000),
     (4, 25_000),
diff --git a/test/test_internal_mpmcq.py b/test/test_internal_mpmcq.py
index 6722310..4f8227e 100644
--- a/test/test_internal_mpmcq.py
+++ b/test/test_internal_mpmcq.py
@@ -18,11 +18,6 @@
 )
 
 
-# ---------------------------------------------------------------------------
-# Single-threaded sanity
-# ---------------------------------------------------------------------------
-
-
 def test_empty_on_construction_and_after_drain():
     """A fresh queue is empty, and remains empty after a drain cycle."""
     q = bq.bq_make_queue()
@@ -98,11 +93,6 @@ def test_dequeue_all_returns_fifo_segment():
     assert bq.bq_is_empty(q)
 
 
-# ---------------------------------------------------------------------------
-# Multi-producer stress
-# ---------------------------------------------------------------------------
-
-
 @pytest.mark.parametrize("producers,per_producer", [(8, 20_000)])
 def test_mpmc_stress_no_loss_no_dup(producers, per_producer):
     """Many producers, two consumers (one dequeue + one dequeue_all loop).
@@ -116,9 +106,6 @@ def test_mpmc_stress_no_loss_no_dup(producers, per_producer):
     total = producers * per_producer
     q = bq.bq_make_queue()
 
-    # Pre-allocate every node up front (alloc under GIL is not what we
-    # want to stress). Encode (producer_id, sequence) in a single int
-    # so the consumer side can verify per-producer FIFO ordering.
     nodes = [
         [bq.bq_make_node(p * per_producer + i) for i in range(per_producer)]
         for p in range(producers)
@@ -159,8 +146,6 @@ def dequeue_all_consumer() -> None:
     for t in prods:
         t.join()
 
-    # Drain remainder under stop signal.
-    # Spin until consumers report all values seen, then stop them.
     import time
     deadline = time.monotonic() + 30.0
     while time.monotonic() < deadline:
@@ -172,7 +157,6 @@ def dequeue_all_consumer() -> None:
     cons1.join()
     cons2.join()
 
-    # Final mop-up in case the consumer threads exited mid-segment.
     while True:
         v = bq.bq_dequeue(q)
         if v is None:
@@ -186,11 +170,4 @@ def dequeue_all_consumer() -> None:
         "values do not form 0..N-1 — duplication or corruption"
     )
 
-    # Note: we deliberately do NOT assert per-producer FIFO on `seen`.
-    # Even though MPMCQ preserves enqueue order at the dequeue point,
-    # `seen` is appended under a lock by two concurrent consumers, so
-    # its order reflects lock-acquisition order, not dequeue order.
-    # The invariant under test is that every value appears exactly
-    # once — no losses, no duplicates.
-
     assert bq.bq_is_empty(q)
diff --git a/test/test_internal_wsq.py b/test/test_internal_wsq.py
index 54c6c77..ea7704f 100644
--- a/test/test_internal_wsq.py
+++ b/test/test_internal_wsq.py
@@ -18,11 +18,6 @@
 WSQ_N = _it.wsq_n()
 
 
-# ---------------------------------------------------------------------------
-# Cursor arithmetic
-# ---------------------------------------------------------------------------
-
-
 def test_pre_inc_uniform_over_full_cycles():
     """`boc_wsq_pre_inc` must distribute uniformly over k = N * K calls."""
     K = 1000  # noqa: N806
@@ -34,13 +29,11 @@ def test_pre_inc_uniform_over_full_cycles():
 def test_pre_inc_first_indices():
     """First N pre-increments must visit indices 1, 2, ..., N-1, 0."""
     counts = _it.wsq_pre_inc_histogram(WSQ_N)
-    # Every index hit exactly once over a full cycle (regardless of order).
     assert counts == [1] * WSQ_N
 
 
 def test_pre_inc_partial_cycle_within_bounds():
     """A partial cycle hits a contiguous prefix of indices."""
-    # k = N - 1: indices 1..N-1 each receive 1, index 0 receives 0.
     counts = _it.wsq_pre_inc_histogram(WSQ_N - 1)
     assert counts[0] == 0
     for i in range(1, WSQ_N):
@@ -50,18 +43,11 @@ def test_pre_inc_partial_cycle_within_bounds():
 def test_post_dec_first_returns_zero_then_wraps():
     """`boc_wsq_post_dec` returns the *pre*-decrement index."""
     seq = _it.wsq_post_dec_sequence(WSQ_N + 2)
-    # First call: cursor was 0 -> returns 0, advances to N-1.
     assert seq[0] == 0
-    # Then N-1, N-2, ..., 0 (wrap), N-1, N-2.
     expected = [0] + list(range(WSQ_N - 1, -1, -1)) + [WSQ_N - 1]
     assert seq == expected[: len(seq)]
 
 
-# ---------------------------------------------------------------------------
-# Single-node enqueue distribution
-# ---------------------------------------------------------------------------
-
-
 def test_enqueue_round_robin_full_cycles():
     """N*K single pushes hit every sub-queue exactly K times."""
     K = 256  # noqa: N806
@@ -73,11 +59,10 @@ def test_enqueue_round_robin_full_cycles():
 
 def test_enqueue_partial_cycle_distribution():
     """A non-multiple-of-N push count distributes within ±1 across sub-queues."""
-    K = 7  # noqa: N806  7 pushes, N=4 -> [1, 2, 2, 2] in some rotation.
+    K = 7  # noqa: N806
     w = _it.wsq_make_worker()
     counts = _it.wsq_enqueue_drain_counts(w, K)
     assert sum(counts) == K
-    # Max-min must be <= 1: round-robin gives near-uniform.
     assert max(counts) - min(counts) <= 1
 
 
@@ -88,11 +73,6 @@ def test_enqueue_zero_pushes_leaves_all_empty():
     assert counts == [0] * WSQ_N
 
 
-# ---------------------------------------------------------------------------
-# enqueue_spread distribution invariant
-# ---------------------------------------------------------------------------
-
-
 def test_spread_preserves_total_count():
     """All L nodes from a stolen segment land somewhere across the WSQ."""
     for length in (1, 2, 3, WSQ_N, WSQ_N + 1, 4 * WSQ_N, 100):
@@ -108,10 +88,8 @@ def test_spread_distributes_long_segment_uniformly():
     w = _it.wsq_make_worker()
     counts = _it.wsq_spread_segment_counts(w, length)
     assert sum(counts) == length
-    # Every sub-queue must receive at least one node.
     assert all(c >= 1 for c in counts), (
         f"some sub-queue starved: {counts}")
-    # Spread is near-uniform: max-min <= 1 for an exact multiple of N.
     assert max(counts) - min(counts) <= 1, (
         f"long-segment spread non-uniform: {counts}")
 
diff --git a/test/test_matrix.py b/test/test_matrix.py
index a831440..5f494e0 100644
--- a/test/test_matrix.py
+++ b/test/test_matrix.py
@@ -1,17 +1,18 @@
 """Tests for the bocpy Matrix class using fuzzed inputs across multiple sizes."""
 
+import copy
 import math
+import pickle
 import random
+import struct
 import sys
 
 import pytest
 
-from bocpy import Cown, drain, Matrix, receive, send, TIMEOUT, wait, when
+from bocpy import Cown, Matrix, quiesce, wait, when
 
+QUIESCE_TIMEOUT = 5
 
-# ---------------------------------------------------------------------------
-# Fixtures – fuzzed inputs covering a range of matrix sizes
-# ---------------------------------------------------------------------------
 
 MATRIX_SIZES = [
     (1, 1),
@@ -60,11 +61,6 @@ def mat_pair(shape, rng):
     return Matrix(rows, cols, vals_a), Matrix(rows, cols, vals_b)
 
 
-# ---------------------------------------------------------------------------
-# Construction & properties
-# ---------------------------------------------------------------------------
-
-
 class TestConstruction:
     """Tests for Matrix construction and initialization."""
 
@@ -95,7 +91,6 @@ def test_list_init(self, mat, shape, random_values):
         rows, cols = shape
         assert mat.rows == rows
         assert mat.columns == cols
-        # spot-check individual elements
         for i in range(rows):
             for j in range(cols):
                 assert mat[i, j] == pytest.approx(random_values[i * cols + j])
@@ -116,11 +111,6 @@ def test_wrong_value_count(self, shape):
             Matrix(rows, cols, [1.0] * (rows * cols + 1))
 
 
-# ---------------------------------------------------------------------------
-# Factory functions
-# ---------------------------------------------------------------------------
-
-
 class TestFactories:
     """Tests for factory functions (zeros, ones, normal, uniform)."""
 
@@ -159,10 +149,34 @@ def test_normal_defaults(self):
         val = Matrix.normal()
         assert isinstance(val, float)
 
+    def test_seed_makes_uniform_reproducible(self):
+        """Seeding before uniform() reproduces the same matrix."""
+        Matrix.seed(12345)
+        a = Matrix.uniform(0.0, 1.0, size=(4, 4))
+        Matrix.seed(12345)
+        b = Matrix.uniform(0.0, 1.0, size=(4, 4))
+        assert Matrix.allclose(a, b)
+
+    def test_seed_makes_normal_reproducible(self):
+        """Seeding before normal() reproduces the same matrix."""
+        Matrix.seed(999)
+        a = Matrix.normal(0.0, 1.0, size=(5, 3))
+        Matrix.seed(999)
+        b = Matrix.normal(0.0, 1.0, size=(5, 3))
+        assert Matrix.allclose(a, b)
+
+    def test_different_seeds_differ(self):
+        """Different seeds produce different sequences."""
+        Matrix.seed(1)
+        a = Matrix.uniform(0.0, 1.0, size=(8, 8))
+        Matrix.seed(2)
+        b = Matrix.uniform(0.0, 1.0, size=(8, 8))
+        assert not Matrix.allclose(a, b)
 
-# ---------------------------------------------------------------------------
-# Indexing / subscript
-# ---------------------------------------------------------------------------
+    def test_seed_requires_argument(self):
+        """seed() with no argument raises TypeError."""
+        with pytest.raises(TypeError):
+            Matrix.seed()
 
 
 class TestIndexing:
@@ -192,7 +206,6 @@ def test_row_slice(self, mat, shape):
             pytest.skip("need ≥2 rows")
         row_mat = mat[0]
         if cols == 1:
-            # single-column matrix: indexing one row returns a scalar
             assert isinstance(row_mat, float)
         else:
             assert row_mat.rows == 1
@@ -204,11 +217,6 @@ def test_len_returns_rows(self, mat, shape):
         assert len(mat) == rows
 
 
-# ---------------------------------------------------------------------------
-# Arithmetic operators
-# ---------------------------------------------------------------------------
-
-
 class TestArithmetic:
     """Tests for element-wise arithmetic operators."""
 
@@ -243,7 +251,6 @@ def test_elementwise_divide(self, mat_pair, shape):
         """Verify element-wise matrix division."""
         a, b = mat_pair
         rows, cols = shape
-        # ensure no zeros in divisor
         for i in range(rows):
             for j in range(cols):
                 if b[i, j] == 0.0:
@@ -274,11 +281,6 @@ def test_scalar_multiply(self, mat, shape, rng):
                 assert c[i, j] == pytest.approx(mat[i, j] * val)
 
 
-# ---------------------------------------------------------------------------
-# In-place operators
-# ---------------------------------------------------------------------------
-
-
 class TestInplaceOps:
     """Tests for in-place arithmetic operators."""
 
@@ -335,9 +337,76 @@ def test_itruediv(self, shape, rng):
                 assert a[i, j] == pytest.approx(expected[i * cols + j])
 
 
-# ---------------------------------------------------------------------------
-# Matrix multiply (@)
-# ---------------------------------------------------------------------------
+class TestOneByOneBroadcast:
+    """A 1x1 matrix acts as a scalar and broadcasts against any shape."""
+
+    OPS = [
+        ("add", lambda a, b: a + b),
+        ("sub", lambda a, b: a - b),
+        ("mul", lambda a, b: a * b),
+        ("div", lambda a, b: a / b),
+    ]
+
+    @pytest.mark.parametrize("name,op", OPS, ids=[o[0] for o in OPS])
+    def test_matrix_op_scalar_matrix(self, name, op, mat, shape, rng):
+        """``MxN op 1x1`` matches the same op against a Python float."""
+        scalar = rng.uniform(1, 10)
+        result = op(mat, Matrix(1, 1, scalar))
+        rows, cols = shape
+        assert result.rows == rows and result.columns == cols
+        for i in range(rows):
+            for j in range(cols):
+                assert result[i, j] == pytest.approx(op(mat[i, j], scalar))
+
+    @pytest.mark.parametrize("name,op", OPS, ids=[o[0] for o in OPS])
+    def test_scalar_matrix_op_matrix(self, name, op, mat, shape, rng):
+        """``1x1 op MxN`` (1x1 on the left) evaluates ``scalar op element``, not the reverse."""
+        scalar = rng.uniform(1, 10)
+        result = op(Matrix(1, 1, scalar), mat)
+        rows, cols = shape
+        assert result.rows == rows and result.columns == cols
+        for i in range(rows):
+            for j in range(cols):
+                assert result[i, j] == pytest.approx(op(scalar, mat[i, j]))
+
+    @pytest.mark.parametrize("name,op", OPS, ids=[o[0] for o in OPS])
+    def test_matches_python_float_operand(self, name, op, mat, shape, rng):
+        """A 1x1 operand yields results exactly equal (no tolerance) to a Python float operand."""
+        scalar = rng.uniform(1, 10)
+        from_matrix = op(mat, Matrix(1, 1, scalar))
+        from_float = op(mat, scalar)
+        rows, cols = shape
+        for i in range(rows):
+            for j in range(cols):
+                assert from_matrix[i, j] == from_float[i, j]
+
+    def test_inplace_matrix_op_scalar_matrix(self, shape, rng):
+        """``MxN += 1x1`` (in-place form) adds the scalar to every element; only ``+=`` is exercised."""
+        rows, cols = shape
+        vals = [rng.uniform(-50, 50) for _ in range(rows * cols)]
+        scalar = rng.uniform(1, 10)
+        a = Matrix(rows, cols, vals)
+        a += Matrix(1, 1, scalar)
+        for i in range(rows):
+            for j in range(cols):
+                assert a[i, j] == pytest.approx(vals[i * cols + j] + scalar)
+
+    def test_inplace_scalar_matrix_op_matrix_rejected(self):
+        """``1x1 op= MxN`` would change the operand shape and is rejected."""
+        a = Matrix(1, 1, 5.0)
+        b = Matrix(2, 3, [1, 2, 3, 4, 5, 6])
+        with pytest.raises(NotImplementedError,
+                           match="in-place scalar broadcast"):
+            a += b
+
+    def test_one_by_one_op_one_by_one(self):
+        """``1x1 op 1x1`` stays elementwise and yields a 1x1 result."""
+        a = Matrix(1, 1, 7.0)
+        b = Matrix(1, 1, 2.0)
+        assert (a + b)[0, 0] == pytest.approx(9.0)
+        assert (a - b)[0, 0] == pytest.approx(5.0)
+        assert (a * b)[0, 0] == pytest.approx(14.0)
+        assert (a / b)[0, 0] == pytest.approx(3.5)
 
 
 class TestMatmul:
@@ -370,10 +439,35 @@ def test_matmul_values(self, matmul_pair):
                 expected = sum(a[i, p] * b[p, j] for p in range(k))
                 assert c[i, j] == pytest.approx(expected, rel=1e-9)
 
+    def test_matmul_bitwise_reproducible(self):
+        """matmul is deterministic and accumulates k in ascending order.
+
+        Guards the ikj loop reorder: the inner product for each output
+        element must still sum ``p = 0..k-1`` in order, so the result is
+        bit-for-bit identical to an ascending-p Python reference and to a
+        repeat run. A future loop reorder that changed accumulation order
+        would perturb the low bits and trip this test.
+        """
+        rng = random.Random(0xBEEF)
+        m, k, n = 6, 7, 5
+        vals_a = [rng.uniform(-10, 10) for _ in range(m * k)]
+        vals_b = [rng.uniform(-10, 10) for _ in range(k * n)]
+        a = Matrix(m, k, vals_a)
+        b = Matrix(k, n, vals_b)
+
+        c1 = a @ b
+        c2 = a @ b
 
-# ---------------------------------------------------------------------------
-# Transpose
-# ---------------------------------------------------------------------------
+        def bits(x):
+            return struct.pack("<d", x)
+
+        for i in range(m):
+            for j in range(n):
+                acc = 0.0
+                for p in range(k):
+                    acc += vals_a[i * k + p] * vals_b[p * n + j]
+                assert bits(c1[i, j]) == bits(acc)
+                assert bits(c2[i, j]) == bits(c1[i, j])
 
 
 @pytest.mark.parametrize("in_place_mode", [False, True], ids=["copy", "in_place"])
@@ -417,17 +511,11 @@ def test_return_value_contract(self, shape, random_values, in_place_mode):
             assert result is m
         else:
             assert result is not m
-            # Source must be untouched.
             for i in range(rows):
                 for j in range(cols):
                     assert m[i, j] == pytest.approx(random_values[i * cols + j])
 
 
-# ---------------------------------------------------------------------------
-# Aggregation: sum, mean, magnitude
-# ---------------------------------------------------------------------------
-
-
 class TestAggregation:
     """Tests for sum, mean, and magnitude aggregations."""
 
@@ -556,11 +644,6 @@ def test_invalid_axis_raises(self, mat):
             mat.magnitude_squared(2)
 
 
-# ---------------------------------------------------------------------------
-# length property
-# ---------------------------------------------------------------------------
-
-
 class TestLengthProperty:
     """Tests for the read-only `length` property."""
 
@@ -596,8 +679,6 @@ def test_length_is_property_not_method(self):
 class TestVecdot:
     """Tests for `vecdot(other, axis=None)`."""
 
-    # ---- Hand-computed goldens (same-shape) -----------------------------
-
     def test_same_shape_total_golden(self):
         """1x3 . 1x3 axis=None matches hand-computed sum-of-products."""
         a = Matrix(1, 3, [1.0, 2.0, 3.0])
@@ -621,13 +702,10 @@ def test_same_shape_axis1_golden(self):
         assert out.rows == 1 and out.columns == 1
         assert out[0, 0] == pytest.approx(32.0)
 
-    # ---- Broadcast: row-vector (1xN) ------------------------------------
-
     def test_row_broadcast_total_golden(self):
         """2x3 . 1x3 axis=None sums per-row dot products."""
         m = Matrix(2, 3, [1.0, 2.0, 3.0, 4.0, 5.0, 6.0])
         rv = Matrix(1, 3, [2.0, 3.0, 4.0])
-        # row0: 1*2+2*3+3*4=20; row1: 4*2+5*3+6*4=47; total 67
         assert m.vecdot(rv) == pytest.approx(67.0)
 
     def test_row_broadcast_axis0_golden(self):
@@ -648,13 +726,10 @@ def test_row_broadcast_axis1_golden(self):
         for i, want in enumerate([20.0, 47.0]):
             assert out[i, 0] == pytest.approx(want)
 
-    # ---- Broadcast: column-vector (Mx1) ---------------------------------
-
     def test_col_broadcast_total_golden(self):
         """2x3 . 2x1 axis=None scales each row by its scalar and sums."""
         m = Matrix(2, 3, [1.0, 2.0, 3.0, 4.0, 5.0, 6.0])
         cv = Matrix(2, 1, [10.0, 20.0])
-        # row0: (1+2+3)*10=60; row1: (4+5+6)*20=300; total 360
         assert m.vecdot(cv) == pytest.approx(360.0)
 
     def test_col_broadcast_axis0_golden(self):
@@ -675,8 +750,6 @@ def test_col_broadcast_axis1_golden(self):
         for i, want in enumerate([60.0, 300.0]):
             assert out[i, 0] == pytest.approx(want)
 
-    # ---- Vector-vector tolerance ----------------------------------------
-
     def test_vector_vector_same_orientation(self):
         """1xN . 1xN returns a scalar."""
         a = Matrix(1, 4, [1.0, 2.0, 3.0, 4.0])
@@ -687,11 +760,8 @@ def test_vector_vector_mixed_orientation(self):
         """1xN . Nx1 walks the flat buffers and returns a scalar."""
         a = Matrix(1, 3, [1.0, 2.0, 3.0])
         c = Matrix(3, 1, [1.0, 2.0, 3.0])
-        # 1*1 + 2*2 + 3*3 = 14
         assert a.vecdot(c) == pytest.approx(14.0)
 
-    # ---- Error paths ----------------------------------------------------
-
     def test_vector_length_mismatch_raises(self):
         """Mismatched vector lengths surface the dimension-mismatch error."""
         a = Matrix(1, 2, [1.0, 2.0])
@@ -720,8 +790,6 @@ def test_axis_wrong_type_raises(self):
         with pytest.raises(TypeError, match="axis must be an int or None"):
             a.vecdot(b, axis="hello")
 
-    # ---- Result types ---------------------------------------------------
-
     def test_axis_none_returns_float(self):
         """`vecdot(..., axis=None)` returns a Python float."""
         a = Matrix(1, 3, [1.0, 2.0, 3.0])
@@ -735,8 +803,6 @@ def test_axis_int_returns_matrix(self):
         assert isinstance(a.vecdot(b, 0), Matrix)
         assert isinstance(a.vecdot(b, 1), Matrix)
 
-    # ---- Equivalence fuzz: vecdot == (a * b) reduced --------------------
-
     def test_equivalence_axis_none(self, mat_pair):
         """`a.vecdot(b)` equals `(a * b).sum()` for any same-shape pair."""
         a, b = mat_pair
@@ -758,8 +824,6 @@ def test_equivalence_axis1(self, mat_pair):
         for i in range(ax.rows):
             assert ax[i, 0] == pytest.approx(ref[i, 0])
 
-    # ---- Commutativity (catches a missing canonicalisation swap) --------
-
     def test_vecdot_commutative_row_broadcast(self):
         """`mat.vecdot(row_vec) == row_vec.vecdot(mat)` across all axes."""
         m = Matrix(3, 4, list(range(12)))
@@ -784,8 +848,6 @@ def test_vecdot_commutative_col_broadcast(self):
                 for j in range(a.columns):
                     assert a[i, j] == pytest.approx(b[i, j])
 
-    # ---- Refcount / impl-leak coverage --------------------------------
-
     def test_vecdot_does_not_free_self_impl(self):
         """Repeated calls do not drop `self->impl`'s C-internal refcount.
 
@@ -802,12 +864,9 @@ def test_vecdot_does_not_free_self_impl(self):
             mat.vecdot(other)
             mat.vecdot(other, axis=0)
             mat.vecdot(other, axis=1)
-        # If impl was freed, ANY of these would crash or read garbage.
         assert (mat.rows, mat.columns) == (1, 3)
         assert mat.magnitude() == pytest.approx(math.sqrt(14.0))
 
-    # ---- Keyword-form smoke -------------------------------------------
-
     def test_vecdot_keyword_axis_matches_positional(self):
         """`mat.vecdot(other, axis=1)` matches `mat.vecdot(other, 1)`."""
         a = Matrix(1, 3, [1.0, 2.0, 3.0])
@@ -821,16 +880,9 @@ def test_vecdot_keyword_axis_none_matches_default(self):
         assert a.vecdot(b, axis=None) == pytest.approx(a.vecdot(b))
 
 
-# ---------------------------------------------------------------------------
-# Cross product
-# ---------------------------------------------------------------------------
-
-
 class TestCross:
     """Tests for the 2D / 3D ``cross`` method."""
 
-    # ---- 2D ---------------------------------------------------------------
-
     def test_2d_returns_float(self):
         """``[1,2].cross([3,4]) == 1*4 - 2*3 == -2.0`` and returns a float."""
         a = Matrix(1, 2, [1.0, 2.0])
@@ -861,8 +913,6 @@ def test_2d_anticommutativity_fuzz(self):
             b = Matrix(1, 2, ys)
             assert a.cross(b) == pytest.approx(-b.cross(a))
 
-    # ---- 3D ---------------------------------------------------------------
-
     def test_3d_basis_ijk_identity(self):
         """i x j = k, j x k = i, k x i = j."""
         i = Matrix(1, 3, [1.0, 0.0, 0.0])
@@ -907,7 +957,6 @@ def test_3d_row_in_row_out(self):
         b = Matrix(1, 3, [4.0, 5.0, 6.0])
         out = a.cross(b)
         assert (out.rows, out.columns) == (1, 3)
-        # (2*6-3*5, 3*4-1*6, 1*5-2*4) = (-3, 6, -3)
         assert out[0, 0] == pytest.approx(-3.0)
         assert out[0, 1] == pytest.approx(6.0)
         assert out[0, 2] == pytest.approx(-3.0)
@@ -934,8 +983,6 @@ def test_3d_other_orientation_irrelevant(self):
         for idx in range(3):
             assert out_row[0, idx] == pytest.approx(out_col[0, idx])
 
-    # ---- 2D batches -------------------------------------------------------
-
     def test_2d_rows_batch_nx2(self):
         """3x2 self cross 3x2 other -> 3x1 column of per-row scalars."""
         a = Matrix(3, 2, [1.0, 2.0,
@@ -947,10 +994,9 @@ def test_2d_rows_batch_nx2(self):
         out = a.cross(b)
         assert isinstance(out, Matrix)
         assert (out.rows, out.columns) == (3, 1)
-        # Row i: a[i,0]*b[i,1] - a[i,1]*b[i,0]
-        assert out[0, 0] == pytest.approx(1.0 * 8.0 - 2.0 * 7.0)   # -6
-        assert out[1, 0] == pytest.approx(3.0 * 10.0 - 4.0 * 9.0)  # -6
-        assert out[2, 0] == pytest.approx(5.0 * 12.0 - 6.0 * 11.0)  # -6
+        assert out[0, 0] == pytest.approx(1.0 * 8.0 - 2.0 * 7.0)
+        assert out[1, 0] == pytest.approx(3.0 * 10.0 - 4.0 * 9.0)
+        assert out[2, 0] == pytest.approx(5.0 * 12.0 - 6.0 * 11.0)
 
     def test_2d_cols_batch_2xn(self):
         """2x3 self cross 2x3 other -> 1x3 row of per-column scalars."""
@@ -961,7 +1007,6 @@ def test_2d_cols_batch_2xn(self):
         out = a.cross(b)
         assert isinstance(out, Matrix)
         assert (out.rows, out.columns) == (1, 3)
-        # Col j: a[0,j]*b[1,j] - a[1,j]*b[0,j]
         assert out[0, 0] == pytest.approx(1.0 * 8.0 - 2.0 * 7.0)
         assert out[0, 1] == pytest.approx(3.0 * 10.0 - 4.0 * 9.0)
         assert out[0, 2] == pytest.approx(5.0 * 12.0 - 6.0 * 11.0)
@@ -979,8 +1024,6 @@ def test_2d_rows_batch_anticommutativity_fuzz(self):
         for i in range(n):
             assert ab[i, 0] == pytest.approx(-ba[i, 0])
 
-    # ---- 3D batches -------------------------------------------------------
-
     def test_3d_rows_batch_nx3(self):
         """3x3 row batch with axis=1 -> 3x3 of per-row cross products."""
         a = Matrix(3, 3, [1.0, 0.0, 0.0,
@@ -991,22 +1034,18 @@ def test_3d_rows_batch_nx3(self):
                           4.0, 5.0, 6.0])
         out = a.cross(b, axis=1)
         assert (out.rows, out.columns) == (3, 3)
-        # i x j = k
         assert out[0, 0] == pytest.approx(0.0)
         assert out[0, 1] == pytest.approx(0.0)
         assert out[0, 2] == pytest.approx(1.0)
-        # j x k = i
         assert out[1, 0] == pytest.approx(1.0)
         assert out[1, 1] == pytest.approx(0.0)
         assert out[1, 2] == pytest.approx(0.0)
-        # (1,2,3) x (4,5,6) = (-3, 6, -3)
         assert out[2, 0] == pytest.approx(-3.0)
         assert out[2, 1] == pytest.approx(6.0)
         assert out[2, 2] == pytest.approx(-3.0)
 
     def test_3d_cols_batch_3xn(self):
         """3x3 col batch with axis=0 -> 3x3 of per-column cross products."""
-        # Columns are the same three vector pairs as the row test.
         a = Matrix(3, 3, [1.0, 0.0, 1.0,
                           0.0, 1.0, 2.0,
                           0.0, 0.0, 3.0])
@@ -1015,15 +1054,12 @@ def test_3d_cols_batch_3xn(self):
                           0.0, 1.0, 6.0])
         out = a.cross(b, axis=0)
         assert (out.rows, out.columns) == (3, 3)
-        # Column 0: i x j = k
         assert out[0, 0] == pytest.approx(0.0)
         assert out[1, 0] == pytest.approx(0.0)
         assert out[2, 0] == pytest.approx(1.0)
-        # Column 1: j x k = i
         assert out[0, 1] == pytest.approx(1.0)
         assert out[1, 1] == pytest.approx(0.0)
         assert out[2, 1] == pytest.approx(0.0)
-        # Column 2: (1,2,3) x (4,5,6) = (-3, 6, -3)
         assert out[0, 2] == pytest.approx(-3.0)
         assert out[1, 2] == pytest.approx(6.0)
         assert out[2, 2] == pytest.approx(-3.0)
@@ -1057,8 +1093,6 @@ def test_3d_rows_batch_anticommutativity_fuzz(self):
             for j in range(3):
                 assert ab[i, j] == pytest.approx(-ba[i, j])
 
-    # ---- Broadcast (single vec broadcast against batch self) --------------
-
     def test_2d_rows_batch_broadcast_other_row(self):
         """Nx2 self cross 1x2 other -> Mx1 of per-row scalars (vec reused)."""
         a = Matrix(3, 2, [1.0, 2.0,
@@ -1067,7 +1101,6 @@ def test_2d_rows_batch_broadcast_other_row(self):
         b = Matrix(1, 2, [7.0, 8.0])
         out = a.cross(b)
         assert (out.rows, out.columns) == (3, 1)
-        # Row i: a[i,0]*8 - a[i,1]*7
         assert out[0, 0] == pytest.approx(1.0 * 8.0 - 2.0 * 7.0)
         assert out[1, 0] == pytest.approx(3.0 * 8.0 - 4.0 * 7.0)
         assert out[2, 0] == pytest.approx(5.0 * 8.0 - 6.0 * 7.0)
@@ -1091,7 +1124,6 @@ def test_2d_cols_batch_broadcast_other_col(self):
         b = Matrix(2, 1, [7.0, 8.0])
         out = a.cross(b)
         assert (out.rows, out.columns) == (1, 3)
-        # Col j: a[0,j]*8 - a[1,j]*7
         assert out[0, 0] == pytest.approx(1.0 * 8.0 - 2.0 * 7.0)
         assert out[0, 1] == pytest.approx(3.0 * 8.0 - 4.0 * 7.0)
         assert out[0, 2] == pytest.approx(5.0 * 8.0 - 6.0 * 7.0)
@@ -1115,15 +1147,12 @@ def test_3d_rows_batch_broadcast_other_row(self):
         b = Matrix(1, 3, [1.0, 1.0, 1.0])
         out = a.cross(b, axis=1)
         assert (out.rows, out.columns) == (3, 3)
-        # i x (1,1,1) = (0, -1, 1) since (0*1 - 0*1, 0*1 - 1*1, 1*1 - 0*1)
         assert out[0, 0] == pytest.approx(0.0)
         assert out[0, 1] == pytest.approx(-1.0)
         assert out[0, 2] == pytest.approx(1.0)
-        # j x (1,1,1) = (1, 0, -1)
         assert out[1, 0] == pytest.approx(1.0)
         assert out[1, 1] == pytest.approx(0.0)
         assert out[1, 2] == pytest.approx(-1.0)
-        # k x (1,1,1) = (-1, 1, 0)
         assert out[2, 0] == pytest.approx(-1.0)
         assert out[2, 1] == pytest.approx(1.0)
         assert out[2, 2] == pytest.approx(0.0)
@@ -1143,22 +1172,18 @@ def test_3d_rows_batch_broadcast_other_col(self):
 
     def test_3d_cols_batch_broadcast_other_col(self):
         """3xN self cross 3x1 other -> 3xN matches per-column cross."""
-        # Columns are i, j, k.
         a = Matrix(3, 3, [1.0, 0.0, 0.0,
                           0.0, 1.0, 0.0,
                           0.0, 0.0, 1.0])
         b = Matrix(3, 1, [1.0, 1.0, 1.0])
         out = a.cross(b, axis=0)
         assert (out.rows, out.columns) == (3, 3)
-        # Column 0 = i x (1,1,1) = (0, -1, 1)
         assert out[0, 0] == pytest.approx(0.0)
         assert out[1, 0] == pytest.approx(-1.0)
         assert out[2, 0] == pytest.approx(1.0)
-        # Column 1 = j x (1,1,1) = (1, 0, -1)
         assert out[0, 1] == pytest.approx(1.0)
         assert out[1, 1] == pytest.approx(0.0)
         assert out[2, 1] == pytest.approx(-1.0)
-        # Column 2 = k x (1,1,1) = (-1, 1, 0)
         assert out[0, 2] == pytest.approx(-1.0)
         assert out[1, 2] == pytest.approx(1.0)
         assert out[2, 2] == pytest.approx(0.0)
@@ -1214,11 +1239,8 @@ def test_2x2_broadcast_against_1x2_other(self):
         assert out[0, 0] == pytest.approx(1.0 * 6.0 - 2.0 * 5.0)
         assert out[1, 0] == pytest.approx(3.0 * 6.0 - 4.0 * 5.0)
 
-    # ---- Broadcast: rejected directions -----------------------------------
-
     def test_reverse_broadcast_vector_self_batch_other_raises(self):
         """Cross is anticommutative; reverse broadcast (vec.cross(batch)) is rejected."""
-        # 1x3 self vs Nx3 other — self is the scalar flavor, rhs size != 3
         a = Matrix(1, 3, [1.0, 2.0, 3.0])
         b = Matrix(5, 3, [float(i) for i in range(15)])
         with pytest.raises(NotImplementedError,
@@ -1227,7 +1249,6 @@ def test_reverse_broadcast_vector_self_batch_other_raises(self):
 
     def test_broadcast_wrong_size_raises(self):
         """Broadcast other must have the matching flat size (2 or 3)."""
-        # Nx3 self, 1x2 other (size 2, not 3)
         a = Matrix(5, 3, [float(i) for i in range(15)])
         b = Matrix(1, 2, [1.0, 2.0])
         with pytest.raises(NotImplementedError,
@@ -1236,15 +1257,12 @@ def test_broadcast_wrong_size_raises(self):
 
     def test_broadcast_other_must_be_vector_raises(self):
         """Non-vector other with matching size still rejected (no inferred shape)."""
-        # Nx3 self, 3x3 other but N != 3
         a = Matrix(5, 3, [float(i) for i in range(15)])
         b = Matrix(3, 3, [float(i) for i in range(9)])
         with pytest.raises(NotImplementedError,
                            match=r"cross: .* incompatible with rhs \d+x\d+"):
             a.cross(b)
 
-    # ---- Ambiguous square shapes ------------------------------------------
-
     def test_2x2_default_per_row(self):
         """2x2 default treats rows as 2D vectors (matches perpendicular/angle)."""
         a = Matrix(2, 2, [1.0, 2.0,
@@ -1264,7 +1282,6 @@ def test_2x2_axis_0_per_col(self):
                           7.0, 8.0])
         out = a.cross(b, axis=0)
         assert (out.rows, out.columns) == (1, 2)
-        # Col 0: 1*7 - 3*5 = -8; Col 1: 2*8 - 4*6 = -8
         assert out[0, 0] == pytest.approx(1.0 * 7.0 - 3.0 * 5.0)
         assert out[0, 1] == pytest.approx(2.0 * 8.0 - 4.0 * 6.0)
 
@@ -1287,7 +1304,6 @@ def test_3x3_default_per_row(self):
                           1.0, 0.0, 0.0])
         out = a.cross(b)
         assert (out.rows, out.columns) == (3, 3)
-        # i x j = k; j x k = i; k x i = j
         assert out[0, 0] == pytest.approx(0.0)
         assert out[0, 2] == pytest.approx(1.0)
         assert out[1, 0] == pytest.approx(1.0)
@@ -1295,7 +1311,6 @@ def test_3x3_default_per_row(self):
 
     def test_3x3_axis_0_per_col(self):
         """3x3 with axis=0 treats columns as 3D vectors."""
-        # Same vectors as above but transposed.
         a = Matrix(3, 3, [1.0, 0.0, 0.0,
                           0.0, 1.0, 0.0,
                           0.0, 0.0, 1.0])
@@ -1304,13 +1319,10 @@ def test_3x3_axis_0_per_col(self):
                           0.0, 1.0, 0.0])
         out = a.cross(b, axis=0)
         assert (out.rows, out.columns) == (3, 3)
-        # Column 0: i x j = k
         assert out[0, 0] == pytest.approx(0.0)
         assert out[1, 0] == pytest.approx(0.0)
         assert out[2, 0] == pytest.approx(1.0)
-        # Column 1: j x k = i
         assert out[0, 1] == pytest.approx(1.0)
-        # Column 2: k x i = j
         assert out[1, 2] == pytest.approx(1.0)
 
     def test_axis_negative_normalizes(self):
@@ -1344,8 +1356,6 @@ def test_axis_none_keyword_matches_default(self):
             for c in range(3):
                 assert explicit[r, c] == pytest.approx(default[r, c])
 
-    # ---- Errors -----------------------------------------------------------
-
     @pytest.mark.parametrize("rows,cols", [(1, 4), (4, 1), (1, 1), (1, 5), (4, 4), (4, 5)])
     def test_invalid_shape_raises(self, rows, cols):
         """Shapes that aren't 1x2/2x1/Nx2/2xN/1x3/3x1/Nx3/3xN raise NotImplementedError."""
@@ -1381,8 +1391,6 @@ def test_size_mismatch_batch_raises(self):
                            match=r"cross: Nx3 batch lhs \d+x\d+ incompatible with rhs \d+x\d+"):
             a.cross(b)
 
-    # ---- Exception propagation through a behavior -------------------------
-
     def test_in_behavior_propagates_exception(self):
         """A cross-shape error inside ``@when`` lands on the result cown."""
         a = Cown(Matrix(1, 4, [1.0, 2.0, 3.0, 4.0]))
@@ -1399,11 +1407,6 @@ def result(a):  # noqa: D401 — short behavior
         assert "cross requires a 2D or 3D vector" in str(result.value)
 
 
-# ---------------------------------------------------------------------------
-# Normalize
-# ---------------------------------------------------------------------------
-
-
 @pytest.mark.parametrize("in_place_mode", [False, True], ids=["copy", "in_place"])
 class TestNormalize:
     """Tests for the ``normalize`` method.
@@ -1519,7 +1522,6 @@ def test_normalize_finite_for_nonzero_inputs(self, in_place_mode):
                 continue
             cases += 1
             for axis in (None, 0, 1):
-                # In-place mutates and would taint subsequent calls, so rebuild per axis.
                 m = Matrix(4, 3, values)
                 if axis is None:
                     n = m.normalize(in_place=in_place_mode)
@@ -1536,11 +1538,9 @@ def test_normalize_keyword_axis(self, in_place_mode):
         mat_no = Matrix(2, 2, [3.0, 4.0, 6.0, 8.0])
         positional_kw = mat_kw.normalize(axis=1, in_place=in_place_mode)
         no_axis = mat_no.normalize(in_place=in_place_mode)
-        # axis=1 normalizes each row independently — both rows go to (0.6, 0.8).
         for r in range(2):
             assert positional_kw[r, 0] == pytest.approx(0.6)
             assert positional_kw[r, 1] == pytest.approx(0.8)
-        # Sanity: total normalize gives a different result (single magnitude).
         assert no_axis[0, 0] != pytest.approx(positional_kw[0, 0])
 
     def test_in_place_matches_copy(self, in_place_mode):
@@ -1563,11 +1563,6 @@ def test_in_place_matches_copy(self, in_place_mode):
                         assert clone[r, c] == pytest.approx(expected[r, c])
 
 
-# ---------------------------------------------------------------------------
-# Perpendicular (2D)
-# ---------------------------------------------------------------------------
-
-
 @pytest.mark.parametrize("in_place_mode", [False, True], ids=["copy", "in_place"])
 class TestPerpendicular:
     """Tests for the ``perpendicular`` method.
@@ -1660,7 +1655,6 @@ def test_2x2_default_is_per_row(self, in_place_mode):
         """Default for the ambiguous 2x2 shape is per-row (decision #3)."""
         m = Matrix(2, 2, [1.0, 2.0, 3.0, 4.0])
         p = m.perpendicular(in_place=in_place_mode)
-        # Row 0 (1, 2) -> (-2, 1); row 1 (3, 4) -> (-4, 3).
         assert p[0, 0] == pytest.approx(-2.0)
         assert p[0, 1] == pytest.approx(1.0)
         assert p[1, 0] == pytest.approx(-4.0)
@@ -1670,7 +1664,6 @@ def test_2x2_axis0_explicit_per_column(self, in_place_mode):
         """Explicit ``axis=0`` overrides the 2x2 default to per-column."""
         m = Matrix(2, 2, [1.0, 2.0, 3.0, 4.0])
         p = m.perpendicular(axis=0, in_place=in_place_mode)
-        # Col 0 (1, 3) -> (-3, 1); col 1 (2, 4) -> (-4, 2).
         assert p[0, 0] == pytest.approx(-3.0)
         assert p[1, 0] == pytest.approx(1.0)
         assert p[0, 1] == pytest.approx(-4.0)
@@ -1757,11 +1750,6 @@ def test_in_place_matches_copy(self, in_place_mode):
                         assert clone[r, c] == pytest.approx(expected[r, c])
 
 
-# ---------------------------------------------------------------------------
-# Angle (atan2)
-# ---------------------------------------------------------------------------
-
-
 class TestAngle:
     """Tests for the ``angle`` method (``atan2(y, x)`` per 2D vector)."""
 
@@ -1809,7 +1797,7 @@ def test_matches_python_atan2_fuzz(self):
         """Per-element equivalence against ``math.atan2`` for random Nx2."""
         rng = random.Random(80)
         for _ in range(50):
-            rows = rng.randint(3, 8)  # Skip Nx2 with N in {1, 2} (scalar / ambiguous).
+            rows = rng.randint(3, 8)
             values = []
             expected = []
             for _r in range(rows):
@@ -1835,7 +1823,6 @@ def test_2x2_axis0_explicit_per_column(self):
         m = Matrix(2, 2, [1.0, 0.0, 0.0, 1.0])
         a = m.angle(axis=0)
         assert (a.rows, a.columns) == (1, 2)
-        # Col 0 (x=1, y=0) -> 0; col 1 (x=0, y=1) -> pi/2.
         assert a[0, 0] == pytest.approx(0.0)
         assert a[0, 1] == pytest.approx(math.pi / 2.0)
 
@@ -1880,13 +1867,6 @@ def test_axis_none_keyword(self):
             assert explicit[r, 0] == pytest.approx(default[r, 0])
 
 
-# ---------------------------------------------------------------------------
-# Axis decoder & shape disambiguation
-# ---------------------------------------------------------------------------
-
-
-# Methods that accept ``axis`` as a keyword argument. All matrix methods now
-# uniformly accept ``axis=`` (or positional, where the signature allows it).
 _AXIS_METHODS = [
     (lambda: Matrix(2, 2, [1.0, 0.0, 0.0, 1.0]), "normalize"),
     (lambda: Matrix(2, 2, [1.0, 2.0, 3.0, 4.0]), "perpendicular"),
@@ -1937,7 +1917,7 @@ def test_overflow_axis_above_long_range(self, factory, method):
     @pytest.mark.parametrize("factory,method", _AXIS_METHODS,
                              ids=[m for _, m in _AXIS_METHODS])
     def test_former_sentinel_no_longer_silent(self, factory, method):
-        """``axis=-1000`` (the historical NO_AXIS sentinel) now raises rather than silently meaning "no axis"."""
+        """``axis=-1000`` raises rather than being silently treated as a no-axis sentinel."""
         mat = factory()
         with pytest.raises(NotImplementedError, match="axis must be -2, -1, 0, or 1"):
             getattr(mat, method)(axis=-1000)
@@ -1950,7 +1930,6 @@ def test_axis_keyword_matches_positional(self, factory, method):
         mat_pos = factory()
         kw = getattr(mat_kw, method)(axis=1)
         pos = getattr(mat_pos, method)(1)
-        # Both forms succeeded; for methods returning a Matrix, results agree.
         if isinstance(kw, Matrix):
             assert (kw.rows, kw.columns) == (pos.rows, pos.columns)
             for r in range(kw.rows):
@@ -2031,8 +2010,6 @@ def test_rejected_axis_does_not_mutate_in_place(
 class TestShapeDisambiguation:
     """Explicit-axis contradictions on unique-orientation shapes raise rather than silently fall through."""
 
-    # ---- perpendicular / angle (vec2 classifier) ----
-
     @pytest.mark.parametrize("method", ["perpendicular", "angle"])
     def test_1x2_axis0_rejected(self, method):
         """``1x2`` is row-oriented; ``axis=0`` contradicts and raises."""
@@ -2065,8 +2042,6 @@ def test_2xN_axis1_rejected(self, method):  # noqa: N802 (shape name)
                            match=f"{method} requires a 2D vector or Nx2 or 2xN matrix"):
             getattr(m, method)(axis=1)
 
-    # ---- cross (cross classifier) ----
-
     def test_cross_1x2_axis0_rejected(self):
         """1x2 scalar 2D rejects ``axis=0``."""
         a = Matrix(1, 2, [1.0, 2.0])
@@ -2123,8 +2098,6 @@ def test_cross_3xN_axis1_rejected(self):  # noqa: N802 (shape name)
         with pytest.raises(NotImplementedError, match="cross requires a 2D or 3D vector"):
             a.cross(b, axis=1)
 
-    # ---- doubly-valid shapes still work both ways ----
-
     def test_2x2_both_axes_accepted(self):
         """``2x2`` is ambiguous: both axes succeed (axis picks orientation)."""
         m = Matrix(2, 2, [1.0, 0.0, 0.0, 1.0])
@@ -2133,7 +2106,7 @@ def test_2x2_both_axes_accepted(self):
         assert m.angle(axis=0) is not None
         assert m.angle(axis=1) is not None
 
-    def test_cross_2x3_axis_ignored_always_2D_batch(self):  # noqa: N802 (shape name)
+    def test_cross_2x3_axis_ignored_always_2D_batch(self):  # noqa: N802
         """``2x3`` always uses the 2D-batch interpretation; ``axis=`` is silently ignored.
 
         Pins the doubly-valid contract: a ``2x3`` input could in principle
@@ -2147,16 +2120,14 @@ def test_cross_2x3_axis_ignored_always_2D_batch(self):  # noqa: N802 (shape name
         default = a.cross(b)
         with_axis_0 = a.cross(b, axis=0)
         with_axis_1 = a.cross(b, axis=1)
-        # Shape pinned: 2D-batch of three z-scalars laid out per column.
         assert (default.rows, default.columns) == (1, 3)
         assert (with_axis_0.rows, with_axis_0.columns) == (1, 3)
         assert (with_axis_1.rows, with_axis_1.columns) == (1, 3)
-        # axis= is a no-op on this shape.
         for c in range(3):
             assert with_axis_0[0, c] == pytest.approx(default[0, c])
             assert with_axis_1[0, c] == pytest.approx(default[0, c])
 
-    def test_cross_3x2_axis_ignored_always_2D_batch(self):  # noqa: N802 (shape name)
+    def test_cross_3x2_axis_ignored_always_2D_batch(self):  # noqa: N802
         """``3x2`` always uses the 2D-batch interpretation; ``axis=`` is silently ignored.
 
         Mirror of the ``2x3`` case for the row-batch orientation. Both
@@ -2176,11 +2147,6 @@ def test_cross_3x2_axis_ignored_always_2D_batch(self):  # noqa: N802 (shape name
             assert with_axis_1[r, 0] == pytest.approx(default[r, 0])
 
 
-# ---------------------------------------------------------------------------
-# Element-wise unary operations
-# ---------------------------------------------------------------------------
-
-
 @pytest.mark.parametrize("in_place_mode", [False, True], ids=["copy", "in_place"])
 class TestUnaryOps:
     """Tests for element-wise unary operations.
@@ -2275,11 +2241,6 @@ def test_abs_operator(self, mat):
         assert Matrix.allclose(abs(mat), mat.abs())
 
 
-# ---------------------------------------------------------------------------
-# allclose
-# ---------------------------------------------------------------------------
-
-
 class TestAllclose:
     """Tests for the allclose comparison function."""
 
@@ -2313,11 +2274,6 @@ def test_within_tolerance(self, shape, rng):
         assert Matrix.allclose(a, b)
 
 
-# ---------------------------------------------------------------------------
-# String representation (smoke tests)
-# ---------------------------------------------------------------------------
-
-
 class TestRepr:
     """Smoke tests for string representations."""
 
@@ -2334,11 +2290,6 @@ def test_repr_does_not_crash(self, mat):
         assert len(r) > 0
 
 
-# ---------------------------------------------------------------------------
-# Edge cases & properties
-# ---------------------------------------------------------------------------
-
-
 class TestEdgeCases:
     """Tests for edge cases and algebraic properties."""
 
@@ -2389,11 +2340,6 @@ def test_transpose_matmul_symmetry(self):
         assert Matrix.allclose(lhs, rhs)
 
 
-# ---------------------------------------------------------------------------
-# Select
-# ---------------------------------------------------------------------------
-
-
 class TestSelect:
     """Tests for Matrix.select() — row and column sub-selection."""
 
@@ -2512,10 +2458,6 @@ def test_select_invalid_axis_raises(self):
             m.select([0], 2)
 
 
-# ---------------------------------------------------------------------------
-# Vector specializations
-# ---------------------------------------------------------------------------
-
 VECTOR_LENGTHS = [1, 3, 5, 10, 32]
 
 
@@ -2550,14 +2492,12 @@ def test_row_vector_item(self, n):
         v = Matrix(1, n, vals)
         row = v[0]
         if n == 1:
-            # 1x1 matrix: single-element row returns a float
             assert isinstance(row, float)
             assert row == pytest.approx(vals[0])
         else:
             assert isinstance(row, Matrix)
             assert row.rows == 1
             assert row.columns == n
-        # Two-index access always works for element retrieval
         for i in range(n):
             assert v[0, i] == pytest.approx(vals[i])
 
@@ -2759,11 +2699,9 @@ def test_assign_row_vector_to_slice(self):
         m = Matrix.zeros((5, 3))
         v = Matrix(1, 3, [1.0, 2.0, 3.0])
         m[1:4, :] = v
-        # rows 0 and 4 should remain zero
         for j in range(3):
             assert m[0, j] == pytest.approx(0.0)
             assert m[4, j] == pytest.approx(0.0)
-        # rows 1-3 should be the broadcast vector
         for i in range(1, 4):
             for j in range(3):
                 assert m[i, j] == pytest.approx(v[0, j])
@@ -2773,11 +2711,9 @@ def test_assign_column_vector_to_slice(self):
         m = Matrix.zeros((3, 5))
         v = Matrix(3, 1, [7.0, 8.0, 9.0])
         m[:, 1:4] = v
-        # columns 0 and 4 should remain zero
         for i in range(3):
             assert m[i, 0] == pytest.approx(0.0)
             assert m[i, 4] == pytest.approx(0.0)
-        # columns 1-3 should be the broadcast vector
         for i in range(3):
             for j in range(1, 4):
                 assert m[i, j] == pytest.approx(v[i, 0])
@@ -2966,11 +2902,6 @@ def test_vector_t_property(self, n):
         assert Matrix.allclose(col.T, row)
 
 
-# ---------------------------------------------------------------------------
-# Min / Max aggregation
-# ---------------------------------------------------------------------------
-
-
 class TestMinMax:
     """Tests for min() and max() aggregation methods."""
 
@@ -3045,11 +2976,6 @@ def test_max_all_same(self, shape):
         assert m.max() == pytest.approx(7.5)
 
 
-# ---------------------------------------------------------------------------
-# Clip
-# ---------------------------------------------------------------------------
-
-
 class TestClip:
     """Tests for the clip() method."""
 
@@ -3110,11 +3036,6 @@ def test_clip_invalid_range(self):
             m.clip(10.0, 0.0)
 
 
-# ---------------------------------------------------------------------------
-# Copy
-# ---------------------------------------------------------------------------
-
-
 class TestCopy:
     """Tests for the copy() method."""
 
@@ -3136,13 +3057,110 @@ def test_copy_is_independent(self, shape, rng):
         original = Matrix(rows, cols, vals)
         c = original.copy()
         c[0, 0] = 999999.0
-        # Original should be unchanged
         assert original[0, 0] == pytest.approx(vals[0])
 
 
-# ---------------------------------------------------------------------------
-# Matrix.vector() factory
-# ---------------------------------------------------------------------------
+class TestPickle:
+    """Round-trip checks for ``pickle`` and ``copy.deepcopy`` on ``Matrix``."""
+
+    @staticmethod
+    def _bit_image(matrix):
+        """Native-endian bytes of every element, row-major, for bit-exact compares."""
+        pack_double = struct.Struct("d").pack
+        chunks = []
+        for row in range(matrix.rows):
+            chunks.extend(pack_double(matrix[row, col]) for col in range(matrix.columns))
+        return b"".join(chunks)
+
+    # Doubles whose exact bit patterns must survive a round trip: NaN, both
+    # infinities, negative zero, the smallest subnormal (5e-324), and plain 1.5.
+    SPECIAL_VALUES = [
+        float("nan"),
+        float("inf"),
+        float("-inf"),
+        -0.0,
+        5e-324,
+        1.5,
+    ]
+
+    @pytest.mark.parametrize("proto", range(pickle.HIGHEST_PROTOCOL + 1))
+    def test_pickle_roundtrip_bit_exact(self, mat, shape, proto):
+        """Every pickle protocol round-trips the shape and the exact element bytes."""
+        revived = pickle.loads(pickle.dumps(mat, protocol=proto))
+        assert shape == (revived.rows, revived.columns)
+        assert self._bit_image(mat) == self._bit_image(revived)
+
+    @pytest.mark.parametrize("proto", range(pickle.HIGHEST_PROTOCOL + 1))
+    def test_pickle_special_values_bit_exact(self, proto):
+        """The ``SPECIAL_VALUES`` row keeps its exact bytes through each pickle protocol."""
+        specials = Matrix(1, len(self.SPECIAL_VALUES), self.SPECIAL_VALUES)
+        revived = pickle.loads(pickle.dumps(specials, protocol=proto))
+        assert self._bit_image(specials) == self._bit_image(revived)
+
+    def test_pickle_roundtrip_independent(self, shape, rng):
+        """Writing to an unpickled matrix leaves the source matrix untouched."""
+        n_rows, n_cols = shape
+        samples = [rng.uniform(-50, 50) for _ in range(n_rows * n_cols)]
+        source = Matrix(n_rows, n_cols, samples)
+        revived = pickle.loads(pickle.dumps(source))
+        revived[0, 0] = 123456.0
+        assert source[0, 0] == pytest.approx(samples[0])
+
+    def test_pickle_inside_container(self, mat):
+        """A dict holding a matrix, an int and a list of matrices pickles intact."""
+        bundle = {"m": mat, "n": 7, "nested": [Matrix(1, 1, [9.0])]}
+        revived = pickle.loads(pickle.dumps(bundle))
+        assert revived["n"] == 7
+        assert self._bit_image(revived["m"]) == self._bit_image(mat)
+        assert revived["nested"][0][0, 0] == pytest.approx(9.0)
+
+    def test_deepcopy_bit_exact(self, mat, shape):
+        """``copy.deepcopy`` yields the same shape and the exact element bytes."""
+        duplicate = copy.deepcopy(mat)
+        assert shape == (duplicate.rows, duplicate.columns)
+        assert self._bit_image(mat) == self._bit_image(duplicate)
+
+    def test_deepcopy_special_values_bit_exact(self):
+        """The ``SPECIAL_VALUES`` row keeps its exact bytes through ``copy.deepcopy``."""
+        specials = Matrix(1, len(self.SPECIAL_VALUES), self.SPECIAL_VALUES)
+        duplicate = copy.deepcopy(specials)
+        assert self._bit_image(specials) == self._bit_image(duplicate)
+
+    def test_deepcopy_is_independent(self, shape, rng):
+        """Writing to a deep copy leaves the source matrix untouched."""
+        n_rows, n_cols = shape
+        samples = [rng.uniform(-50, 50) for _ in range(n_rows * n_cols)]
+        source = Matrix(n_rows, n_cols, samples)
+        duplicate = copy.deepcopy(source)
+        duplicate[0, 0] = 654321.0
+        assert source[0, 0] == pytest.approx(samples[0])
+
+    def test_unpickle_rejects_bad_length(self):
+        """``_matrix_unpickle`` raises ValueError for an 8-byte payload with a 2x2 shape."""
+        from bocpy import _math
+
+        with pytest.raises(ValueError):
+            _math._matrix_unpickle(2, 2, b"\x00" * 8)
+
+    def test_unpickle_rejects_bad_dimensions(self):
+        """``_matrix_unpickle`` raises ValueError when the first dimension is zero."""
+        from bocpy import _math
+
+        with pytest.raises(ValueError):
+            _math._matrix_unpickle(0, 2, b"")
+
+    def test_unpickle_rejects_wrong_payload_type(self):
+        """``_matrix_unpickle`` raises TypeError when the payload is a ``str``."""
+        from bocpy import _math
+
+        with pytest.raises(TypeError):
+            _math._matrix_unpickle(1, 1, "not-bytes")
+
+    def test_reduce_rejects_uninitialized(self):
+        """A matrix made by ``__new__`` alone raises ValueError from ``__reduce__``."""
+        hollow = Matrix.__new__(Matrix)
+        with pytest.raises(ValueError):
+            hollow.__reduce__()
 
 
 class TestVector:
@@ -3220,11 +3238,6 @@ def test_vector_as_column_false_is_row(self):
             assert v[0, i] == pytest.approx(vals[i])
 
 
-# ---------------------------------------------------------------------------
-# concat
-# ---------------------------------------------------------------------------
-
-
 class TestConcat:
     """Tests for Matrix.concat() — concatenation along rows or columns."""
 
@@ -3321,6 +3334,24 @@ def test_concat_negative_axis(self):
         result_neg = Matrix.concat([a, b], -1)
         assert Matrix.allclose(result_pos, result_neg)
 
+    def test_concat_axis_as_kwarg(self):
+        """Keyword ``axis=1`` concatenates exactly like positional ``1``."""
+        left = Matrix(2, 2, [1.0, 2.0, 3.0, 4.0])
+        right = Matrix(2, 2, [5.0, 6.0, 7.0, 8.0])
+        parts = [left, right]
+        keyword_result = Matrix.concat(parts, axis=1)
+        positional_result = Matrix.concat(parts, 1)
+        assert (keyword_result.rows, keyword_result.columns) == (2, 4)
+        assert Matrix.allclose(keyword_result, positional_result)
+
+    def test_concat_values_as_kwarg(self):
+        """Keyword ``values=`` stacks rows (axis 0) identically to the positional call."""
+        top = Matrix(1, 2, [1.0, 2.0])
+        bottom = Matrix(1, 2, [3.0, 4.0])
+        stacked = Matrix.concat(values=[top, bottom], axis=0)
+        assert (stacked.rows, stacked.columns) == (2, 2)
+        assert Matrix.allclose(stacked, Matrix.concat([top, bottom], 0))
+
     def test_concat_empty_returns_none(self):
         """Concatenating an empty list returns None."""
         result = Matrix.concat([])
@@ -3374,9 +3405,207 @@ def test_concat_columns_preserves_values_large(self):
                 assert result[i, j + 3] == pytest.approx(b[i, j])
 
 
-# ---------------------------------------------------------------------------
-# allclose with equal_nan
-# ---------------------------------------------------------------------------
+def _flat_argextreme(values, want_max):
+    """Index of the first max (``want_max``) or min; strict compares keep earlier ties."""
+    winner, incumbent = 0, values[0]
+    for idx, candidate in enumerate(values):
+        improved = candidate > incumbent if want_max else candidate < incumbent
+        if improved:
+            incumbent = candidate
+            winner = idx
+    return winner
+
+
+class TestArgExtreme:
+    """Checks ``Matrix.argmin`` / ``Matrix.argmax`` with and without ``axis``."""
+
+    def test_argmin_no_axis_golden(self):
+        """Without ``axis``, argmin is an ``int`` row-major position (3 here)."""
+        grid = Matrix(2, 3, [3.0, 1.0, 2.0, 0.0, 9.0, 4.0])
+        assert isinstance(grid.argmin(), int)
+        assert grid.argmin() == 3
+
+    def test_argmax_no_axis_golden(self):
+        """Without ``axis``, argmax is an ``int`` row-major position (4 here)."""
+        grid = Matrix(2, 3, [3.0, 1.0, 2.0, 0.0, 9.0, 4.0])
+        assert isinstance(grid.argmax(), int)
+        assert grid.argmax() == 4
+
+    def test_argmin_ties_first_occurrence(self):
+        """Two equal minima resolve to the lower flat index."""
+        line = Matrix(1, 4, [5.0, 1.0, 1.0, 5.0])
+        assert line.argmin() == 1
+
+    def test_argmax_ties_first_occurrence(self):
+        """Two equal maxima resolve to the lower flat index."""
+        line = Matrix(1, 4, [9.0, 3.0, 9.0, 1.0])
+        assert line.argmax() == 0
+
+    def test_argmin_axis0_golden(self):
+        """``argmin(axis=0)`` gives a 1x3 matrix holding one row index per column."""
+        grid = Matrix(2, 3, [3.0, 1.0, 2.0, 0.0, 9.0, 4.0])
+        picks = grid.argmin(axis=0)
+        assert (picks.rows, picks.columns) == (1, 3)
+        per_column = [picks[0, col] for col in range(3)]
+        assert per_column == [1.0, 0.0, 0.0]
+
+    def test_argmax_axis1_golden(self):
+        """``argmax(axis=1)`` gives a 2x1 matrix holding one column index per row."""
+        grid = Matrix(2, 3, [3.0, 1.0, 2.0, 0.0, 9.0, 4.0])
+        picks = grid.argmax(axis=1)
+        assert (picks.rows, picks.columns) == (2, 1)
+        per_row = [picks[row, 0] for row in range(2)]
+        assert per_row == [0.0, 1.0]
+
+    def test_argmin_negative_axis(self):
+        """``axis=-1`` matches ``axis=1`` and ``axis=-2`` matches ``axis=0``."""
+        grid = Matrix(3, 2, [4.0, 1.0, 2.0, 8.0, 7.0, 3.0])
+        assert Matrix.allclose(grid.argmin(axis=-1), grid.argmin(axis=1))
+        assert Matrix.allclose(grid.argmin(axis=-2), grid.argmin(axis=0))
+
+    @pytest.mark.parametrize("want_max", [False, True])
+    def test_argextreme_no_axis_fuzz(self, mat, shape, random_values, want_max):
+        """Flat argmin/argmax of the ``mat`` fixture equals ``_flat_argextreme``."""
+        found = (mat.argmax if want_max else mat.argmin)()
+        assert found == _flat_argextreme(random_values, want_max)
+
+    @pytest.mark.parametrize("want_max", [False, True])
+    def test_argextreme_axis0_fuzz(self, mat, shape, random_values, want_max):
+        """Each column's arg-extreme equals ``_flat_argextreme`` of that column."""
+        n_rows, n_cols = shape
+        found = (mat.argmax if want_max else mat.argmin)(axis=0)
+        assert found.rows == 1
+        assert found.columns == n_cols
+        for col in range(n_cols):
+            column = [random_values[row * n_cols + col] for row in range(n_rows)]
+            assert found[0, col] == _flat_argextreme(column, want_max)
+
+    @pytest.mark.parametrize("want_max", [False, True])
+    def test_argextreme_axis1_fuzz(self, mat, shape, random_values, want_max):
+        """Each row's arg-extreme equals ``_flat_argextreme`` of that row."""
+        n_rows, n_cols = shape
+        found = (mat.argmax if want_max else mat.argmin)(axis=1)
+        assert found.rows == n_rows
+        assert found.columns == 1
+        for row in range(n_rows):
+            row_values = [random_values[row * n_cols + col] for col in range(n_cols)]
+            assert found[row, 0] == _flat_argextreme(row_values, want_max)
+
+    def test_argmin_invalid_axis_raises(self):
+        """``axis=2`` on a 2x2 matrix raises NotImplementedError."""
+        grid = Matrix(2, 2, [1.0, 2.0, 3.0, 4.0])
+        with pytest.raises(NotImplementedError):
+            grid.argmin(axis=2)
+
+    def test_argextreme_nan_in_middle_is_skipped(self):
+        """A NaN that is not the running extreme is skipped (NumPy would return its index)."""
+        not_a_number = float("nan")
+        line = Matrix(1, 4, [3.0, not_a_number, 1.0, 2.0])
+        assert line.argmin() == 2
+        assert line.argmax() == 0
+
+    def test_argextreme_leading_nan_pins_result(self):
+        """A NaN at element 0 pins both results to index 0: no strict compare beats NaN."""
+        not_a_number = float("nan")
+        line = Matrix(1, 3, [not_a_number, 1.0, 2.0])
+        assert line.argmin() == 0
+        assert line.argmax() == 0
+
+
+def _outer_op(op, row_vals, col_vals):
+    """Reference RxC outer table: ``out[r][c] = op(col_vals[r], row_vals[c])``."""
+    return [[op(col_item, row_item) for row_item in row_vals]
+            for col_item in col_vals]
+
+
+class TestOuterBroadcast:
+    """Element-wise operators between an Rx1 column vector and a 1xC row vector."""
+
+    @pytest.mark.parametrize("rows,cols", [(2, 3), (4, 1), (1, 4), (5, 6)])
+    def test_outer_multiply_col_times_row(self, rows, cols):
+        """``col * row`` is RxC with ``out[r, c] == col[r] * row[c]``."""
+        col_vals = [r + 1.0 for r in range(rows)]
+        row_vals = [10.0 * (c + 1) for c in range(cols)]
+        column = Matrix(rows, 1, col_vals)
+        across = Matrix(1, cols, row_vals)
+        product = column * across
+        assert (product.rows, product.columns) == (rows, cols)
+        wanted = _outer_op(lambda lhs, rhs: lhs * rhs, row_vals, col_vals)
+        for r, wanted_row in enumerate(wanted):
+            for c, want in enumerate(wanted_row):
+                assert product[r, c] == pytest.approx(want)
+
+    @pytest.mark.parametrize("rows,cols", [(2, 3), (5, 6)])
+    def test_outer_multiply_row_times_col(self, rows, cols):
+        """``row * col`` commutes: same RxC shape and values as ``col * row``."""
+        col_vals = [r + 1.0 for r in range(rows)]
+        row_vals = [10.0 * (c + 1) for c in range(cols)]
+        column = Matrix(rows, 1, col_vals)
+        across = Matrix(1, cols, row_vals)
+        product = across * column
+        assert (product.rows, product.columns) == (rows, cols)
+        wanted = _outer_op(lambda lhs, rhs: lhs * rhs, row_vals, col_vals)
+        for r, wanted_row in enumerate(wanted):
+            for c, want in enumerate(wanted_row):
+                assert product[r, c] == pytest.approx(want)
+
+    @pytest.mark.parametrize("rows,cols", [(2, 3), (5, 6)])
+    def test_outer_subtract_preserves_operand_order(self, rows, cols):
+        """``col - row`` gives ``out[r, c] == col[r] - row[c]``."""
+        col_vals = [r + 1.0 for r in range(rows)]
+        row_vals = [10.0 * (c + 1) for c in range(cols)]
+        column = Matrix(rows, 1, col_vals)
+        across = Matrix(1, cols, row_vals)
+        difference = column - across
+        wanted = _outer_op(lambda lhs, rhs: lhs - rhs, row_vals, col_vals)
+        for r, wanted_row in enumerate(wanted):
+            for c, want in enumerate(wanted_row):
+                assert difference[r, c] == pytest.approx(want)
+
+    @pytest.mark.parametrize("rows,cols", [(2, 3), (5, 6)])
+    def test_outer_subtract_row_minus_col(self, rows, cols):
+        """``row - col`` gives ``out[r, c] == row[c] - col[r]``."""
+        col_vals = [r + 1.0 for r in range(rows)]
+        row_vals = [10.0 * (c + 1) for c in range(cols)]
+        column = Matrix(rows, 1, col_vals)
+        across = Matrix(1, cols, row_vals)
+        difference = across - column
+        for r, subtrahend in enumerate(col_vals):
+            for c, minuend in enumerate(row_vals):
+                assert difference[r, c] == pytest.approx(minuend - subtrahend)
+
+    @pytest.mark.parametrize("rows,cols", [(2, 3), (5, 6)])
+    def test_outer_divide_operand_order(self, rows, cols):
+        """``col / row`` gives ``out[r, c] == col[r] / row[c]``."""
+        col_vals = [r + 2.0 for r in range(rows)]
+        row_vals = [c + 3.0 for c in range(cols)]
+        column = Matrix(rows, 1, col_vals)
+        across = Matrix(1, cols, row_vals)
+        quotient = column / across
+        for r, numerator in enumerate(col_vals):
+            for c, denominator in enumerate(row_vals):
+                assert quotient[r, c] == pytest.approx(numerator / denominator)
+
+    @pytest.mark.parametrize("rows,cols", [(2, 3), (5, 6)])
+    def test_outer_add_commutative(self, rows, cols):
+        """``col + row`` and ``row + col`` agree under ``Matrix.allclose``."""
+        column = Matrix(rows, 1, [r + 1.0 for r in range(rows)])
+        across = Matrix(1, cols, [10.0 * (c + 1) for c in range(cols)])
+        assert Matrix.allclose(column + across, across + column)
+
+    def test_outer_inplace_raises(self):
+        """``col *= row`` on 3x1 and 1x4 operands raises NotImplementedError."""
+        column = Matrix(3, 1, [1.0, 2.0, 3.0])
+        across = Matrix(1, 4, [1.0, 2.0, 3.0, 4.0])
+        with pytest.raises(NotImplementedError):
+            column *= across
+
+    def test_incompatible_shapes_still_raise(self):
+        """Multiplying a 1x3 row vector by a 2x4 matrix raises NotImplementedError."""
+        narrow = Matrix(1, 3, [1.0, 2.0, 3.0])
+        wide = Matrix(2, 4, 1.0)
+        with pytest.raises(NotImplementedError):
+            _ = narrow * wide
 
 
 class TestAllcloseExtended:
@@ -3418,15 +3647,9 @@ def test_allclose_rtol(self):
         """allclose with relative tolerance."""
         a = Matrix(1, 3, [100.0, 200.0, 300.0])
         b = Matrix(1, 3, [101.0, 202.0, 303.0])
-        # rtol=0.02 → tolerance at 100 is 2.0, at 200 is 4.0, at 300 is 6.0
         assert Matrix.allclose(a, b, rtol=0.02, atol=0.0)
 
 
-# ---------------------------------------------------------------------------
-# Matrix.uniform() defaults
-# ---------------------------------------------------------------------------
-
-
 class TestUniformDefaults:
     """Tests for Matrix.uniform() with default arguments."""
 
@@ -3449,11 +3672,6 @@ def test_uniform_custom_range_no_size(self):
             assert 5.0 <= val < 10.0
 
 
-# ---------------------------------------------------------------------------
-# Matrix iteration (non-vector)
-# ---------------------------------------------------------------------------
-
-
 class TestMatrixIteration:
     """Tests for __iter__ on multi-row (non-vector) matrices."""
 
@@ -3486,11 +3704,6 @@ def test_iter_single_column_matrix(self):
             assert got == pytest.approx(expected)
 
 
-# ---------------------------------------------------------------------------
-# Scalar binary arithmetic (int / float with Matrix, both orderings)
-# ---------------------------------------------------------------------------
-
-
 class TestScalarBinaryArithmetic:
     """Verify scalar arithmetic in both orderings.
 
@@ -3565,7 +3778,6 @@ def test_matrix_div_scalar(self, mat, shape, scalar):
     def test_scalar_div_matrix(self, shape, rng, scalar):
         """scalar / mat (reflected divide) is scalar divided by each element."""
         rows, cols = shape
-        # avoid zeros in the matrix
         vals = [rng.uniform(1, 50) for _ in range(rows * cols)]
         m = Matrix(rows, cols, vals)
         c = scalar / m
@@ -3622,11 +3834,6 @@ def test_itruediv_scalar(self, shape, rng, scalar):
                 assert m[i, j] == pytest.approx(vals[i * cols + j] / scalar)
 
 
-# ---------------------------------------------------------------------------
-# List / tuple as row-vector operand in binary arithmetic
-# ---------------------------------------------------------------------------
-
-
 class TestListTupleBinaryArithmetic:
     """Verify list/tuple broadcast arithmetic.
 
@@ -3765,11 +3972,6 @@ def test_inplace_mul_tuple(self):
                 assert m[i, j] == pytest.approx((float(i * 3 + j) + 1) * v[j])
 
 
-# ---------------------------------------------------------------------------
-# List / tuple for __setitem__ (setting rows / columns / slices)
-# ---------------------------------------------------------------------------
-
-
 class TestListTupleAssignment:
     """Verify list/tuple assignment into matrix slices.
 
@@ -3839,20 +4041,12 @@ def test_set_int_scalar_to_element(self):
         m = Matrix.zeros((3, 3))
         m[1, 2] = 99
         assert m[1, 2] == pytest.approx(99.0)
-        # Others remain zero
         assert m[0, 0] == pytest.approx(0.0)
 
 
-# ---------------------------------------------------------------------------
-# x, y, z, w properties
-# ---------------------------------------------------------------------------
-
-
 class TestXYZWProperties:
     """Tests for the x, y, z, w shorthand properties that alias data[0..3]."""
 
-    # -- getter tests --
-
     def test_x_getter_1x1(self):
         """x on a 1x1 matrix returns data[0]."""
         m = Matrix(1, 1, [42.0])
@@ -3911,15 +4105,12 @@ def test_xyzw_on_2d_matrix(self):
         assert m.z == pytest.approx(3.0)
         assert m.w == pytest.approx(4.0)
 
-    # -- setter tests --
-
     def test_x_setter(self):
         """Setting x modifies data[0]."""
         m = Matrix(1, 4, [1.0, 2.0, 3.0, 4.0])
         m.x = 99.0
         assert m.x == pytest.approx(99.0)
         assert m[0, 0] == pytest.approx(99.0)
-        # other elements unchanged
         assert m.y == pytest.approx(2.0)
 
     def test_y_setter(self):
@@ -3980,8 +4171,6 @@ def test_setter_with_int(self):
         assert m.z == pytest.approx(3.0)
         assert m.w == pytest.approx(4.0)
 
-    # -- IndexError for undersized matrices --
-
     def test_y_getter_raises_on_1_element(self):
         """y raises IndexError when the matrix has fewer than 2 elements."""
         m = Matrix(1, 1, [5.0])
@@ -4018,8 +4207,6 @@ def test_w_setter_raises_on_3_elements(self):
         with pytest.raises(IndexError):
             m.w = 10.0
 
-    # -- roundtrip: set then get --
-
     def test_xyzw_roundtrip(self):
         """Set all four properties and read them back."""
         m = Matrix(1, 4, 0.0)
@@ -4032,8 +4219,6 @@ def test_xyzw_roundtrip(self):
         assert m.z == pytest.approx(100.0)
         assert m.w == pytest.approx(-0.001)
 
-    # -- x always works (even on 1-element matrix) --
-
     def test_x_on_scalar_matrix(self):
         """x works on a 1x1 matrix (the minimum size)."""
         m = Matrix(1, 1, [7.7])
@@ -4041,8 +4226,6 @@ def test_x_on_scalar_matrix(self):
         m.x = -3.3
         assert m.x == pytest.approx(-3.3)
 
-    # -- verify independence from subscript indexing --
-
     def test_x_matches_subscript(self):
         """x/y/z/w match two-index subscript access on the same positions."""
         vals = [11.0, 22.0, 33.0, 44.0, 55.0]
@@ -4064,11 +4247,6 @@ def test_xyzw_getter_parametrized(self, size):
         assert m.w == pytest.approx(vals[3])
 
 
-# ---------------------------------------------------------------------------
-# Negative indexing
-# ---------------------------------------------------------------------------
-
-
 class TestNegativeIndexing:
     """Tests for negative integer indices in __getitem__ and __setitem__.
 
@@ -4098,7 +4276,6 @@ def test_negative_element_index(self):
     def test_negative_row_negative_col(self):
         """m[-2, -3] accesses the correct element."""
         m = Matrix(3, 4, [float(i) for i in range(12)])
-        # row -2 = row 1, col -3 = col 1 → element 1*4+1 = 5
         assert m[-2, -3] == pytest.approx(5.0)
 
     def test_set_negative_indices(self):
@@ -4114,11 +4291,6 @@ def test_negative_row_column_vector(self):
         assert m[-3] == pytest.approx(30.0)
 
 
-# ---------------------------------------------------------------------------
-# Slice indexing
-# ---------------------------------------------------------------------------
-
-
 class TestSliceIndexing:
     """Tests for slice-based __getitem__ on matrices."""
 
@@ -4162,10 +4334,43 @@ def test_row_and_column_slice(self):
             for j in range(2):
                 assert sub[i, j] == pytest.approx(m[i + 1, j + 2])
 
+    def test_reversed_step_slice(self):
+        """A ``[::-1]`` slice returns every row, last row first."""
+        m = Matrix(4, 2, [float(v) for v in range(8)])
+        flipped = m[::-1]
+        assert flipped.rows == 4
+        assert flipped.columns == 2
+        for dst, src in enumerate(reversed(range(4))):
+            for col in range(2):
+                assert flipped[dst, col] == pytest.approx(m[src, col])
+
+    def test_negative_bound_slice(self):
+        """A ``[-2:]`` slice keeps only the final two rows, in order."""
+        m = Matrix(4, 3, [float(v) for v in range(12)])
+        tail = m[-2:]
+        assert tail.rows == 2
+        assert tail.columns == 3
+        for dst, src in enumerate(range(2, 4)):
+            for col in range(3):
+                assert tail[dst, col] == pytest.approx(m[src, col])
+
+    @pytest.mark.parametrize("sl", [
+        slice(-1, 1),
+        slice(1, 1),
+        slice(3, 1),
+        slice(2, 0),
+    ])
+    def test_empty_slice_raises_indexerror(self, sl):
+        """Row slices that select nothing fail with IndexError instead of crashing."""
+        grid = Matrix(4, 3, [float(v) for v in range(12)])
+        with pytest.raises(IndexError):
+            _ = grid[sl]
 
-# ---------------------------------------------------------------------------
-# Repr roundtrip
-# ---------------------------------------------------------------------------
+    def test_empty_column_slice_raises_indexerror(self):
+        """A column slice that selects nothing fails with IndexError, not a crash."""
+        grid = Matrix(4, 3, [float(v) for v in range(12)])
+        with pytest.raises(IndexError):
+            _ = grid[:, -1:0]
 
 
 class TestReprFormat:
@@ -4191,11 +4396,6 @@ def test_repr_small_matrix_roundtrip(self):
         assert Matrix.allclose(m, m2)
 
 
-# ---------------------------------------------------------------------------
-# Matmul dimension mismatch
-# ---------------------------------------------------------------------------
-
-
 class TestMatmulErrors:
     """Tests for error handling in matrix multiplication."""
 
@@ -4207,11 +4407,6 @@ def test_matmul_incompatible_shapes_raises(self):
             _ = a @ b
 
 
-# ---------------------------------------------------------------------------
-# Normal distribution statistics
-# ---------------------------------------------------------------------------
-
-
 class TestNormalDistribution:
     """Statistical sanity checks for Matrix.normal()."""
 
@@ -4227,11 +4422,6 @@ def test_normal_zero_stddev(self):
         assert Matrix.allclose(m, expected)
 
 
-# ---------------------------------------------------------------------------
-# Uniform distribution bounds with matrix output
-# ---------------------------------------------------------------------------
-
-
 class TestUniformDistributionMatrix:
     """Verify uniform() matrix output respects the given bounds."""
 
@@ -4243,11 +4433,6 @@ def test_uniform_matrix_in_range(self):
         assert m.max() < hi
 
 
-# ---------------------------------------------------------------------------
-# Matrix inside a Cown — ownership semantics
-# ---------------------------------------------------------------------------
-
-
 class TestMatrixInCown:
     """A Matrix placed in a Cown is released and cannot be accessed directly."""
 
@@ -4292,6 +4477,25 @@ def test_transpose_raises(self):
         with pytest.raises(RuntimeError):
             m.transpose()
 
+    def test_pickle_raises(self):
+        """``pickle.dumps`` on a matrix already handed to a Cown raises RuntimeError."""
+        released = Matrix(3, 3, [float(v) for v in range(9)])
+        Cown(released)
+        with pytest.raises(RuntimeError):
+            pickle.dumps(released)
+
+    def test_second_cown_raises_not_segfault(self):
+        """A matrix already given to one Cown cannot be wrapped by another.
+
+        Once the first Cown takes the matrix it is left at NO_OWNER; the
+        second Cown then has no way to re-serialize the move-typed payload,
+        so the failure has to be a RuntimeError and not a crash.
+        """
+        released = Matrix(3, 3, [float(v) for v in range(9)])
+        Cown(released)
+        with pytest.raises(RuntimeError):
+            Cown(released)
+
     def test_x_getter_raises(self):
         """Reading .x on an unacquired matrix raises RuntimeError."""
         m = Matrix(3, 3, [float(i) for i in range(9)])
@@ -4344,15 +4548,80 @@ def test_sequential_acquires(self):
             val.x = 10.0
             assert val.x == pytest.approx(10.0)
         with c as val:
-            # should see the mutation from previous block
             assert val.x == pytest.approx(10.0)
             val.y = 20.0
             assert val.y == pytest.approx(20.0)
 
 
-# ---------------------------------------------------------------------------
-# Vector methods invoked from inside a @when behavior
-# ---------------------------------------------------------------------------
+class TestUnwrapMatrix:
+    """unwrap() on a Cown[Matrix] hands back a caller-owned, readable matrix.
+
+    unwrap consumes the cown, so the returned matrix keeps its ownership and
+    is readable by the caller.
+    """
+
+    @classmethod
+    def teardown_class(cls):
+        wait()
+
+    def test_unwrap_returns_readable_matrix(self):
+        """The unwrapped matrix is owned by the caller and readable."""
+        m = Matrix(2, 2, [1.0, 2.0, 3.0, 4.0])
+        c = Cown(m)
+
+        @when(c)
+        def _(c):
+            c.value[0, 0] = 42.0
+
+        quiesce(QUIESCE_TIMEOUT)
+        res = c.unwrap()
+        assert res.acquired is True
+        assert res[0, 0] == pytest.approx(42.0)
+        assert res[1, 1] == pytest.approx(4.0)
+
+    def test_unwrap_consumes_cown(self):
+        """A second unwrap of the same cown returns None (consumed)."""
+        m = Matrix(1, 3, [5.0, 6.0, 7.0])
+        c = Cown(m)
+
+        @when(c)
+        def _(c):
+            c.value[0, 0] = 99.0
+
+        quiesce(QUIESCE_TIMEOUT)
+        first = c.unwrap()
+        assert first.acquired is True
+        assert first[0, 0] == pytest.approx(99.0)
+        assert c.unwrap() is None
+
+    def test_emptied_cown_is_reschedulable(self):
+        """After consuming, the cown still accepts a fresh behavior."""
+        m = Matrix(1, 2, [1.0, 2.0])
+        c = Cown(m)
+
+        @when(c)
+        def _(c):
+            c.value[0, 0] = 3.0
+
+        quiesce(QUIESCE_TIMEOUT)
+        taken = c.unwrap()
+        assert taken.acquired is True
+        assert taken[0, 0] == pytest.approx(3.0)
+
+        # The cown now holds None; a fresh behavior must still acquire it and install a new matrix.
+        @when(c)
+        def _(c):
+            assert c.value is None
+            c.value = Matrix(1, 2, [10.0, 20.0])
+
+        @when(c)
+        def _(c):
+            c.value[0, 1] = 30.0
+
+        quiesce(QUIESCE_TIMEOUT)
+        again = c.unwrap()
+        assert again[0, 0] == pytest.approx(10.0)
+        assert again[0, 1] == pytest.approx(30.0)
 
 
 class TestVectorMethodsInCown:
@@ -4360,50 +4629,16 @@ class TestVectorMethodsInCown:
 
     Mirrors the in-process Matrix-vector tests but routes every call
     through the worker dispatch path so Matrix XIData round-trip plus
-    in-cown mutation are both exercised. Assertions are shipped out of
-    the behaviors via ``send("assert", ...)`` and collected on the test
-    thread by :meth:`receive_asserts`, per the project's BOC testing
-    convention; reading ``result.value`` directly from the test thread
-    would violate cown ownership.
+    in-cown mutation are both exercised. Each behavior returns its
+    result; the test thread calls :func:`quiesce` and then reads the
+    result via :meth:`Cown.unwrap`, which re-raises any exception the
+    behavior captured.
     """
 
-    RECEIVE_TIMEOUT = 10
-
     @classmethod
     def teardown_class(cls):
         wait()
 
-    def receive_asserts(self, count=1):
-        """Collect ``count`` ('assert', (actual, expected)) messages.
-
-        Drains the queue on exit so a failure in one test does not leak
-        residual messages into the next.
-        """
-        failed = None
-        timed_out = False
-        try:
-            for _ in range(count):
-                result = receive("assert", self.RECEIVE_TIMEOUT)
-                if result[0] == TIMEOUT:
-                    timed_out = True
-                    break
-                _, (actual, expected) = result
-                if failed is None and actual != expected:
-                    failed = (actual, expected)
-        finally:
-            drain("assert")
-
-        assert not timed_out, (
-            "Timed out waiting for an 'assert' message from a behavior. "
-            "Check that every @when arg count matches the decorated "
-            "function's parameter count."
-        )
-        if failed is not None:
-            actual, expected = failed
-            assert actual == expected, f"expected {expected!r}, got {actual!r}"
-
-    # ---- scalar-returning -------------------------------------------------
-
     def test_vecdot_in_behavior(self):
         """``[1,2,3]·[4,5,6] == 32`` via worker dispatch."""
         a = Cown(Matrix(1, 3, [1.0, 2.0, 3.0]))
@@ -4413,12 +4648,8 @@ def test_vecdot_in_behavior(self):
         def result(a):
             return a.value.vecdot(b)
 
-        @when(result)
-        def _(r):
-            send("assert", (r.exception, False))
-            send("assert", (r.value == pytest.approx(32.0), True))
-
-        self.receive_asserts(2)
+        quiesce(QUIESCE_TIMEOUT)
+        assert result.unwrap() == pytest.approx(32.0)
 
     def test_length_in_behavior(self):
         """``length`` getter ``[3, 4] == 5`` via worker dispatch (it's a property)."""
@@ -4428,12 +4659,8 @@ def test_length_in_behavior(self):
         def result(v):
             return v.value.length
 
-        @when(result)
-        def _(r):
-            send("assert", (r.exception, False))
-            send("assert", (r.value == pytest.approx(5.0), True))
-
-        self.receive_asserts(2)
+        quiesce(QUIESCE_TIMEOUT)
+        assert result.unwrap() == pytest.approx(5.0)
 
     def test_magnitude_squared_in_behavior(self):
         """``magnitude_squared([3, 4]) == 25`` via worker dispatch."""
@@ -4443,12 +4670,8 @@ def test_magnitude_squared_in_behavior(self):
         def result(v):
             return v.value.magnitude_squared()
 
-        @when(result)
-        def _(r):
-            send("assert", (r.exception, False))
-            send("assert", (r.value == pytest.approx(25.0), True))
-
-        self.receive_asserts(2)
+        quiesce(QUIESCE_TIMEOUT)
+        assert result.unwrap() == pytest.approx(25.0)
 
     def test_angle_in_behavior(self):
         """``angle([0, 1]) == pi/2`` via worker dispatch."""
@@ -4458,14 +4681,8 @@ def test_angle_in_behavior(self):
         def result(v):
             return v.value.angle()
 
-        @when(result)
-        def _(r):
-            send("assert", (r.exception, False))
-            send("assert", (r.value == pytest.approx(math.pi / 2.0), True))
-
-        self.receive_asserts(2)
-
-    # ---- matrix-returning (copy form) ------------------------------------
+        quiesce(QUIESCE_TIMEOUT)
+        assert result.unwrap() == pytest.approx(math.pi / 2.0)
 
     def test_cross_3d_in_behavior(self):
         """``[1,2,3] × [4,5,6] == [-3, 6, -3]`` via worker dispatch."""
@@ -4477,14 +4694,11 @@ def result(a):
             out = a.value.cross(b)
             return (out[0, 0], out[0, 1], out[0, 2])
 
-        @when(result)
-        def _(r):
-            send("assert", (r.exception, False))
-            send("assert", (r.value[0] == pytest.approx(-3.0), True))
-            send("assert", (r.value[1] == pytest.approx(6.0), True))
-            send("assert", (r.value[2] == pytest.approx(-3.0), True))
-
-        self.receive_asserts(4)
+        quiesce(QUIESCE_TIMEOUT)
+        out = result.unwrap()
+        assert out[0] == pytest.approx(-3.0)
+        assert out[1] == pytest.approx(6.0)
+        assert out[2] == pytest.approx(-3.0)
 
     def test_normalize_copy_in_behavior(self):
         """``normalize([3, 4]) == [0.6, 0.8]``; original cown value untouched."""
@@ -4495,16 +4709,12 @@ def result(v):
             n = v.value.normalize()
             return (n[0, 0], n[0, 1], v.value[0, 0], v.value[0, 1])
 
-        @when(result)
-        def _(r):
-            send("assert", (r.exception, False))
-            n0, n1, src0, src1 = r.value
-            send("assert", (n0 == pytest.approx(0.6), True))
-            send("assert", (n1 == pytest.approx(0.8), True))
-            send("assert", (src0 == pytest.approx(3.0), True))
-            send("assert", (src1 == pytest.approx(4.0), True))
-
-        self.receive_asserts(5)
+        quiesce(QUIESCE_TIMEOUT)
+        n0, n1, src0, src1 = result.unwrap()
+        assert n0 == pytest.approx(0.6)
+        assert n1 == pytest.approx(0.8)
+        assert src0 == pytest.approx(3.0)
+        assert src1 == pytest.approx(4.0)
 
     def test_perpendicular_copy_in_behavior(self):
         """``perpendicular([1, 0]) == [0, 1]``; original cown value untouched."""
@@ -4515,18 +4725,12 @@ def result(v):
             p = v.value.perpendicular()
             return (p[0, 0], p[0, 1], v.value[0, 0], v.value[0, 1])
 
-        @when(result)
-        def _(r):
-            send("assert", (r.exception, False))
-            p0, p1, src0, src1 = r.value
-            send("assert", (p0 == pytest.approx(0.0), True))
-            send("assert", (p1 == pytest.approx(1.0), True))
-            send("assert", (src0 == pytest.approx(1.0), True))
-            send("assert", (src1 == pytest.approx(0.0), True))
-
-        self.receive_asserts(5)
-
-    # ---- in-place mutators -----------------------------------------------
+        quiesce(QUIESCE_TIMEOUT)
+        p0, p1, src0, src1 = result.unwrap()
+        assert p0 == pytest.approx(0.0)
+        assert p1 == pytest.approx(1.0)
+        assert src0 == pytest.approx(1.0)
+        assert src1 == pytest.approx(0.0)
 
     def test_normalize_in_place_in_behavior(self):
         """``normalize(in_place=True)`` mutates the matrix held by the cown."""
@@ -4540,13 +4744,10 @@ def _(v):
         def check(v):
             return (v.value[0, 0], v.value[0, 1])
 
-        @when(check)
-        def _(r):
-            send("assert", (r.exception, False))
-            send("assert", (r.value[0] == pytest.approx(0.6), True))
-            send("assert", (r.value[1] == pytest.approx(0.8), True))
-
-        self.receive_asserts(3)
+        quiesce(QUIESCE_TIMEOUT)
+        r0, r1 = check.unwrap()
+        assert r0 == pytest.approx(0.6)
+        assert r1 == pytest.approx(0.8)
 
     def test_perpendicular_in_place_in_behavior(self):
         """``perpendicular(in_place=True)`` mutates the matrix held by the cown."""
@@ -4560,13 +4761,10 @@ def _(v):
         def check(v):
             return (v.value[0, 0], v.value[0, 1])
 
-        @when(check)
-        def _(r):
-            send("assert", (r.exception, False))
-            send("assert", (r.value[0] == pytest.approx(0.0), True))
-            send("assert", (r.value[1] == pytest.approx(1.0), True))
-
-        self.receive_asserts(3)
+        quiesce(QUIESCE_TIMEOUT)
+        r0, r1 = check.unwrap()
+        assert r0 == pytest.approx(0.0)
+        assert r1 == pytest.approx(1.0)
 
     def test_negate_in_place_in_behavior(self):
         """``negate(in_place=True)`` mutates the matrix held by the cown."""
@@ -4580,11 +4778,8 @@ def _(v):
         def check(v):
             return (v.value[0, 0], v.value[0, 1], v.value[0, 2])
 
-        @when(check)
-        def _(r):
-            send("assert", (r.exception, False))
-            send("assert", (r.value[0] == pytest.approx(-1.0), True))
-            send("assert", (r.value[1] == pytest.approx(2.0), True))
-            send("assert", (r.value[2] == pytest.approx(-3.0), True))
-
-        self.receive_asserts(4)
+        quiesce(QUIESCE_TIMEOUT)
+        r0, r1, r2 = check.unwrap()
+        assert r0 == pytest.approx(-1.0)
+        assert r1 == pytest.approx(2.0)
+        assert r2 == pytest.approx(-3.0)
diff --git a/test/test_message_queue.py b/test/test_message_queue.py
index 01f2d0a..5cb1461 100644
--- a/test/test_message_queue.py
+++ b/test/test_message_queue.py
@@ -1,19 +1,8 @@
-"""Tests for the underlying message queue system.
-
-Exercises the low-level queue mechanics: automatic tag-to-queue assignment,
-FIFO ordering, selective receive, capacity limits, set_tags management,
-high-volume throughput, concurrent producers/consumers, error handling,
-and queue reset behavior.
-
-The message queue supports two modes:
-- **Automatic assignment**: tags are assigned to queues on first use, up to the
-  hard limit of ``MAX_QUEUES`` (16).
-- **Explicit assignment via set_tags()**: pre-assigns tags to queues, clears all
-  pending messages, and resets queue state.
-
-Both modes are tested here.  Tests that do *not* need set_tags rely purely on
-the auto-assignment path and use a ``set_tags([])`` call only in the fixture to
-ensure a clean slate.
+"""Tests for the low-level message queue.
+
+Covers tag-to-queue assignment (automatic and explicit via set_tags),
+FIFO ordering, selective receive, timeouts, drain, error handling,
+capacity limits, throughput, and concurrent producers/consumers.
 """
 
 import random
@@ -24,19 +13,9 @@
 
 from bocpy import drain, receive, send, set_tags, TIMEOUT
 
-
-# ---------------------------------------------------------------------------
-# Constants
-# ---------------------------------------------------------------------------
-
-# Maximum number of dedicated queues supported by the C layer (BOC_QUEUE_COUNT).
 MAX_QUEUES = 16
 
 
-# ---------------------------------------------------------------------------
-# Fixtures
-# ---------------------------------------------------------------------------
-
 @pytest.fixture(autouse=True)
 def reset_queues():
     """Reset all queues before and after every test.
@@ -50,11 +29,6 @@ def reset_queues():
     set_tags([])
 
 
-# ===================================================================
-# Auto-assignment: tags acquire queues on first send/receive
-# ===================================================================
-
-
 class TestAutoAssignment:
     """Verify that tags are automatically assigned to queues on first use."""
 
@@ -88,7 +62,7 @@ def test_exceeding_max_auto_tags(self):
         """Auto-assigning more than MAX_QUEUES tags raises KeyError."""
         tags = [f"aover{i}" for i in range(MAX_QUEUES)]
         for t in tags:
-            send(t, "fill")  # fills all 16 slots
+            send(t, "fill")
         with pytest.raises(KeyError, match="tag capacity exceeded"):
             send("one_too_many", "boom")
 
@@ -102,11 +76,6 @@ def test_reuse_same_tag(self):
         assert v2 == 2
 
 
-# ===================================================================
-# FIFO ordering within a single queue
-# ===================================================================
-
-
 class TestFIFO:
     """Ensure per-tag FIFO delivery via auto-assigned queues."""
 
@@ -128,11 +97,6 @@ def test_fifo_large_burst(self):
             assert val == i
 
 
-# ===================================================================
-# Selective receive (tag filtering)
-# ===================================================================
-
-
 class TestSelectiveReceive:
     """Receive should pick the first matching tag and leave others."""
 
@@ -142,7 +106,6 @@ def test_skip_unmatched(self):
         send("sel_want", "yes")
         _, val = receive("sel_want", 1)
         assert val == "yes"
-        # "sel_skip" message still available
         _, val = receive("sel_skip", 1)
         assert val == "no"
 
@@ -188,11 +151,6 @@ def test_single_str_tag(self):
         assert val == 99
 
 
-# ===================================================================
-# Timeout behavior
-# ===================================================================
-
-
 class TestTimeout:
     """Timeout and after-callback edge cases."""
 
@@ -217,7 +175,7 @@ def delayed_send():
 
         t = threading.Thread(target=delayed_send)
         t.start()
-        tag, val = receive("neg_blk", 5)  # generous upper bound
+        tag, val = receive("neg_blk", 5)
         t.join()
         assert tag == "neg_blk"
         assert val == "arrived"
@@ -247,11 +205,6 @@ def test_after_returns_custom_values(self):
         assert val == {"k": "v"}
 
 
-# ===================================================================
-# Error handling
-# ===================================================================
-
-
 class TestErrors:
     """Error paths in send/receive."""
 
@@ -283,11 +236,9 @@ def test_send_unpaired_surrogate_tag_no_leak(self):
         slot is still usable by sending a normal tag through the
         queue afterwards.
         """
-        bad_tag = "\ud800"  # lone high surrogate
+        bad_tag = "\ud800"
         with pytest.raises(UnicodeEncodeError):
             send(bad_tag, "payload")
-        # Sanity: the queue subsystem is still functional after the
-        # failed attempt.
         send("post_surrogate_ok", "ok")
         _, val = receive("post_surrogate_ok", 1)
         assert val == "ok"
@@ -305,15 +256,9 @@ def test_set_tags_unpaired_surrogate_no_leak(self):
         """
         with pytest.raises(UnicodeEncodeError):
             set_tags(["ok_tag", "\ud800"])
-        # Restore queues to a usable state for the rest of the suite.
         set_tags([])
 
 
-# ===================================================================
-# Queue isolation
-# ===================================================================
-
-
 class TestQueueIsolation:
     """Messages on different tags do not interfere."""
 
@@ -343,11 +288,6 @@ def test_drain_one_tag_leaves_others(self):
         assert val == "b1"
 
 
-# ===================================================================
-# Payload round-trip fidelity
-# ===================================================================
-
-
 class TestPayloadFidelity:
     """Ensure various Python types survive the send/receive round-trip."""
 
@@ -380,11 +320,6 @@ def test_payload_roundtrip(self, payload):
         assert val == payload
 
 
-# ===================================================================
-# High-volume / throughput
-# ===================================================================
-
-
 class TestThroughput:
     """High-volume message passing to stress the ring buffer."""
 
@@ -419,11 +354,6 @@ def producer():
         assert results == list(range(n))
 
 
-# ===================================================================
-# Concurrent producers
-# ===================================================================
-
-
 class TestConcurrentProducers:
     """Multiple threads sending to the same queue concurrently."""
 
@@ -484,11 +414,6 @@ def producer(pid):
             )
 
 
-# ===================================================================
-# Cross-thread send/receive
-# ===================================================================
-
-
 class TestCrossThread:
     """Messages crossing thread boundaries."""
 
@@ -542,11 +467,6 @@ def sender(i):
         assert values == set(range(n))
 
 
-# ===================================================================
-# set_tags — explicit tag management
-# ===================================================================
-
-
 class TestSetTags:
     """Tests for the optional set_tags() management function."""
 
@@ -564,7 +484,6 @@ def test_set_tags_clears_all(self):
         for i in range(MAX_QUEUES):
             send(f"clr{i}", f"stale_{i}")
         set_tags([f"fresh{i}" for i in range(MAX_QUEUES)])
-        # New tags start empty.
         tag, val = receive("fresh0", 0)
         assert tag == TIMEOUT
 
@@ -608,7 +527,6 @@ def test_set_tags_empty_resets_to_auto_assign(self):
         set_tags(["pre"])
         send("pre", "msg")
         set_tags([])
-        # All queues are now unassigned; auto-assignment kicks in.
         send("new_auto", "works")
         tag, val = receive("new_auto", 1)
         assert tag == "new_auto"
@@ -622,11 +540,6 @@ def test_set_tags_then_overflow(self):
             send("extra_tag_beyond_limit", "boom")
 
 
-# ===================================================================
-# set_tags — idempotence and repeated calls
-# ===================================================================
-
-
 class TestSetTagsRepeated:
     """Calling set_tags multiple times should be safe."""
 
@@ -635,7 +548,6 @@ def test_double_set_tags(self):
         set_tags(["first"])
         send("first", "msg1")
         set_tags(["second"])
-        # "first" messages are gone; nothing sent on "second" yet.
         tag, val = receive("second", 0)
         assert tag == TIMEOUT
 
@@ -661,11 +573,6 @@ def test_rapid_set_tags_cycles(self):
             assert val == cycle
 
 
-# ===================================================================
-# Worker-pool integration (match/case receive loop)
-# ===================================================================
-
-
 class TestWorkerPool:
     """Multi-worker batch-processing pattern with shutdown protocol."""
 
@@ -715,11 +622,6 @@ def worker():
         assert result == (max_value * (max_value - 1)) // 2
 
 
-# ===================================================================
-# drain(): clear pending messages for specific tags
-# ===================================================================
-
-
 class TestDrain:
     """Verify that drain() removes pending messages for the specified tags."""
 
@@ -760,9 +662,9 @@ def test_drain_leaves_other_tags(self):
     def test_drain_empty_tag(self):
         """Draining a tag with no pending messages is a no-op."""
         send("d_empty", "x")
-        receive("d_empty", 1)  # consume the only message
+        receive("d_empty", 1)
 
-        drain(["d_empty"])  # should not raise
+        drain(["d_empty"])
 
         assert receive("d_empty", 0.1)[0] == TIMEOUT
 
@@ -809,11 +711,6 @@ def test_drain_non_string_tag_raises(self):
             drain([123])
 
 
-# ===================================================================
-# Spin-then-park: lost-wake stress
-# ===================================================================
-
-
 class TestLostWakeStress:
     """Verify that the spin-then-park strategy never loses a wake signal."""
 
@@ -874,11 +771,6 @@ def delayed_send():
         assert val == iteration
 
 
-# ===================================================================
-# Spin-then-park: multi-tag receive
-# ===================================================================
-
-
 class TestMultiTagBackoff:
     """Multi-tag receive correctness under the exponential backoff path."""
 
@@ -956,11 +848,6 @@ def producer(tag, offset):
             assert sorted(per_tag[t]) == expected
 
 
-# ===================================================================
-# Spin-then-park: timeout accuracy
-# ===================================================================
-
-
 class TestTimeoutAccuracy:
     """Verify that timed receives return within a reasonable time window."""
 
@@ -982,7 +869,7 @@ def test_timeout_upper_bound(self, timeout):
         tag, _ = receive("ta_upper", timeout)
         elapsed = time.monotonic() - start
         assert tag == TIMEOUT
-        upper = timeout + 0.1  # 100 ms grace for scheduling jitter
+        upper = timeout + 0.1
         assert elapsed <= upper, (
             f"Returned too late: {elapsed:.4f}s > {upper:.4f}s"
         )
diff --git a/test/test_noticeboard.py b/test/test_noticeboard.py
index 22c5c24..a6a31c2 100644
--- a/test/test_noticeboard.py
+++ b/test/test_noticeboard.py
@@ -4,44 +4,13 @@
 
 import pytest
 
-from bocpy import (Cown, drain, notice_delete, notice_read, notice_sync,
-                   notice_update, notice_write, noticeboard,
-                   receive,
-                   REMOVED, send, start, TIMEOUT, wait, when)
+from bocpy import (Cown, notice_delete, notice_read,
+                   notice_seed, notice_update, notice_write, noticeboard,
+                   quiesce, REMOVED, start, wait, when)
 import bocpy._core as _core
 
 
-RECEIVE_TIMEOUT = 10
-
-
-def receive_asserts(count=1):
-    """Drain all expected assertion messages, then fail on first mismatch.
-
-    The "assert" queue is always drained before returning so that leftover
-    messages from a failing test do not leak into subsequent tests in CI.
-    """
-    failed = None
-    timed_out = False
-    try:
-        for _ in range(count):
-            result = receive("assert", RECEIVE_TIMEOUT)
-            if result[0] == TIMEOUT:
-                timed_out = True
-                break
-            _, (actual, expected) = result
-            if failed is None and actual != expected:
-                failed = (actual, expected)
-    finally:
-        drain("assert")
-
-    assert not timed_out, (
-        "Timed out waiting for an 'assert' message from a behavior. "
-        "Check that every @when arg count matches the decorated "
-        "function's parameter count."
-    )
-    if failed is not None:
-        actual, expected = failed
-        assert actual == expected, f"expected {expected!r}, got {actual!r}"
+QUIESCE_TIMEOUT = 5
 
 
 class TestNoticeboard:
@@ -59,14 +28,9 @@ def test_write_then_read_roundtrip(self):
         @when(x)
         def step1(x):
             notice_write("greeting", "hello")
-            notice_sync()
-
-        @when(x, step1)
-        def step2(x, _):
-            snap = noticeboard()
-            send("assert", (snap.get("greeting"), "hello"))
 
-        receive_asserts()
+        snap = quiesce(QUIESCE_TIMEOUT, noticeboard=True)
+        assert snap.get("greeting") == "hello"
 
     def test_write_overwrite(self):
         """Overwriting a key replaces the previous value."""
@@ -75,31 +39,26 @@ def test_write_overwrite(self):
         @when(x)
         def step1(x):
             notice_write("counter", 10)
-            notice_sync()
 
         @when(x, step1)
         def step2(x, _):
             notice_write("counter", 20)
-            notice_sync()
 
-        @when(x, step2)
-        def step3(x, _):
-            snap = noticeboard()
-            send("assert", (snap.get("counter"), 20))
-
-        receive_asserts()
+        snap = quiesce(QUIESCE_TIMEOUT, noticeboard=True)
+        assert snap.get("counter") == 20
 
     def test_snapshot_returns_mapping(self):
         """Snapshot returns a read-only mapping even with no writes."""
         x = Cown(0)
 
         @when(x)
-        def _(x):
+        def probe(x):
             from collections.abc import Mapping
             snap = noticeboard()
-            send("assert", (isinstance(snap, Mapping), True))
+            return isinstance(snap, Mapping)
 
-        receive_asserts()
+        quiesce(QUIESCE_TIMEOUT)
+        assert probe.unwrap() is True
 
     def test_multiple_keys(self):
         """Multiple keys can coexist in the noticeboard."""
@@ -110,16 +69,9 @@ def step1(x):
             notice_write("a", 1)
             notice_write("b", 2)
             notice_write("c", 3)
-            notice_sync()
-
-        @when(x, step1)
-        def step2(x, _):
-            snap = noticeboard()
-            send("assert", (snap.get("a"), 1))
-            send("assert", (snap.get("b"), 2))
-            send("assert", (snap.get("c"), 3))
 
-        receive_asserts(3)
+        snap = quiesce(QUIESCE_TIMEOUT, noticeboard=True)
+        assert (snap.get("a"), snap.get("b"), snap.get("c")) == (1, 2, 3)
 
     def test_frozen_snapshot(self):
         """Snapshot is frozen: a write after snapshot doesn't change it."""
@@ -128,19 +80,20 @@ def test_frozen_snapshot(self):
         @when(x)
         def step1(x):
             notice_write("val", 100)
-            notice_sync()
 
-        @when(x, step1)
-        def step2(x, _):
+        quiesce(QUIESCE_TIMEOUT, noticeboard=True)
+
+        @when(x)
+        def step2(x):
             snap1 = noticeboard()
             notice_write("val", 200)
-            notice_sync()
             snap2 = noticeboard()
-            # Both calls in the same behavior return the same cached snapshot
-            send("assert", (snap1.get("val"), 100))
-            send("assert", (snap1.get("val"), snap2.get("val")))
+            return (snap1.get("val"), snap2.get("val"))
 
-        receive_asserts(2)
+        quiesce(QUIESCE_TIMEOUT)
+        val1, val2 = step2.unwrap()
+        assert val1 == 100
+        assert val1 == val2
 
     def test_snapshot_cache_cleared_between_behaviors(self):
         """Each behavior gets a fresh snapshot, not the previous one's cache."""
@@ -149,21 +102,26 @@ def test_snapshot_cache_cleared_between_behaviors(self):
         @when(x)
         def step1(x):
             notice_write("seq", 1)
-            notice_sync()
 
-        @when(x, step1)
-        def step2(x, _):
+        quiesce(QUIESCE_TIMEOUT, noticeboard=True)
+
+        @when(x)
+        def step2(x):
             snap = noticeboard()
-            send("assert", (snap.get("seq"), 1))
+            seq = snap.get("seq")
             notice_write("seq", 2)
-            notice_sync()
+            return seq
 
-        @when(x, step2)
-        def step3(x, _):
+        quiesce(QUIESCE_TIMEOUT, noticeboard=True)
+
+        @when(x)
+        def step3(x):
             snap = noticeboard()
-            send("assert", (snap.get("seq"), 2))
+            return snap.get("seq")
 
-        receive_asserts(2)
+        quiesce(QUIESCE_TIMEOUT)
+        assert step2.unwrap() == 1
+        assert step3.unwrap() == 2
 
     def test_picklable_value(self):
         """Complex (picklable) values round-trip through the noticeboard."""
@@ -172,14 +130,9 @@ def test_picklable_value(self):
         @when(x)
         def step1(x):
             notice_write("data", [1, 2, 3])
-            notice_sync()
 
-        @when(x)
-        def step2(x):
-            snap = noticeboard()
-            send("assert", (snap.get("data"), [1, 2, 3]))
-
-        receive_asserts()
+        snap = quiesce(QUIESCE_TIMEOUT, noticeboard=True)
+        assert snap.get("data") == [1, 2, 3]
 
     def test_set_value_forces_pickle_path(self):
         """A set is not natively shareable and must take the pickle path."""
@@ -188,14 +141,9 @@ def test_set_value_forces_pickle_path(self):
         @when(x)
         def step1(x):
             notice_write("tags", {1, 2, 3})
-            notice_sync()
-
-        @when(x, step1)
-        def step2(x, _):
-            snap = noticeboard()
-            send("assert", (snap.get("tags"), {1, 2, 3}))
 
-        receive_asserts()
+        snap = quiesce(QUIESCE_TIMEOUT, noticeboard=True)
+        assert snap.get("tags") == {1, 2, 3}
 
     def test_int_value(self):
         """Integer values (native cross-interpreter) round-trip correctly."""
@@ -204,14 +152,9 @@ def test_int_value(self):
         @when(x)
         def step1(x):
             notice_write("num", 42)
-            notice_sync()
-
-        @when(x, step1)
-        def step2(x, _):
-            snap = noticeboard()
-            send("assert", (snap.get("num"), 42))
 
-        receive_asserts()
+        snap = quiesce(QUIESCE_TIMEOUT, noticeboard=True)
+        assert snap.get("num") == 42
 
     def test_none_value(self):
         """None round-trips through the noticeboard."""
@@ -220,15 +163,10 @@ def test_none_value(self):
         @when(x)
         def step1(x):
             notice_write("empty", None)
-            notice_sync()
-
-        @when(x, step1)
-        def step2(x, _):
-            snap = noticeboard()
-            send("assert", ("empty" in snap, True))
-            send("assert", (snap["empty"], None))
 
-        receive_asserts(2)
+        snap = quiesce(QUIESCE_TIMEOUT, noticeboard=True)
+        assert "empty" in snap
+        assert snap["empty"] is None
 
     def test_notice_read_existing_key(self):
         """notice_read returns the value for an existing key."""
@@ -237,33 +175,37 @@ def test_notice_read_existing_key(self):
         @when(x)
         def step1(x):
             notice_write("color", "blue")
-            notice_sync()
 
-        @when(x, step1)
-        def step2(x, _):
-            send("assert", (notice_read("color"), "blue"))
+        quiesce(QUIESCE_TIMEOUT, noticeboard=True)
+
+        @when(x)
+        def step2(x):
+            return notice_read("color")
 
-        receive_asserts()
+        quiesce(QUIESCE_TIMEOUT)
+        assert step2.unwrap() == "blue"
 
     def test_notice_read_missing_key_default(self):
         """notice_read returns None for a missing key by default."""
         x = Cown(0)
 
         @when(x)
-        def _(x):
-            send("assert", (notice_read("nonexistent"), None))
+        def probe(x):
+            return notice_read("nonexistent")
 
-        receive_asserts()
+        quiesce(QUIESCE_TIMEOUT)
+        assert probe.unwrap() is None
 
     def test_notice_read_missing_key_custom_default(self):
         """notice_read returns the custom default for a missing key."""
         x = Cown(0)
 
         @when(x)
-        def _(x):
-            send("assert", (notice_read("nonexistent", 42), 42))
+        def probe(x):
+            return notice_read("nonexistent", 42)
 
-        receive_asserts()
+        quiesce(QUIESCE_TIMEOUT)
+        assert probe.unwrap() == 42
 
     def test_notice_read_uses_cached_snapshot(self):
         """Two notice_read calls in the same behavior use the same snapshot."""
@@ -272,18 +214,19 @@ def test_notice_read_uses_cached_snapshot(self):
         @when(x)
         def step1(x):
             notice_write("tick", 1)
-            notice_sync()
 
-        @when(x, step1)
-        def step2(x, _):
+        quiesce(QUIESCE_TIMEOUT, noticeboard=True)
+
+        @when(x)
+        def step2(x):
             val1 = notice_read("tick")
             notice_write("tick", 99)
-            notice_sync()
             val2 = notice_read("tick")
-            # Both reads see the cached snapshot, not the new write
-            send("assert", (val1, val2))
+            return (val1, val2)
 
-        receive_asserts()
+        quiesce(QUIESCE_TIMEOUT)
+        val1, val2 = step2.unwrap()
+        assert val1 == val2
 
 
 class TestNoticeboardBoundary:
@@ -301,35 +244,27 @@ def setup_method(self):
     def test_max_key_length_63_bytes(self):
         """A key of exactly 63 UTF-8 bytes is accepted."""
         x = Cown(0)
-        long_key = "k" * 63  # exactly 63 bytes
+        long_key = "k" * 63
 
         @when(x)
         def step1(x):
             notice_write(long_key, "ok")
-            notice_sync()
 
-        @when(x, step1)
-        def step2(x, _):
-            val = notice_read(long_key)
-            send("assert", (val, "ok"))
-
-        receive_asserts()
+        snap = quiesce(QUIESCE_TIMEOUT, noticeboard=True)
+        assert snap.get(long_key) == "ok"
 
     def test_key_length_64_bytes_rejected(self):
         """A key of 64 UTF-8 bytes is rejected with ValueError."""
         x = Cown(0)
-        too_long = "k" * 64  # 64 bytes, exceeds 63-byte limit
+        too_long = "k" * 64
 
         @when(x)
-        def _(x):
-            try:
-                notice_write(too_long, "fail")
-                notice_sync()
-                send("assert", (False, True))  # should not reach here
-            except ValueError:
-                send("assert", (True, True))
+        def probe(x):
+            notice_write(too_long, "fail")
 
-        receive_asserts()
+        quiesce(QUIESCE_TIMEOUT)
+        with pytest.raises(ValueError):
+            probe.unwrap()
 
     def test_64_entries_accepted(self):
         """The noticeboard accepts up to 64 distinct keys."""
@@ -339,16 +274,11 @@ def test_64_entries_accepted(self):
         def step1(x):
             for i in range(64):
                 notice_write(f"slot{i}", i)
-                notice_sync()
-
-        @when(x, step1)
-        def step2(x, _):
-            snap = noticeboard()
-            send("assert", (len(snap) >= 64, True))
-            send("assert", (snap.get("slot0"), 0))
-            send("assert", (snap.get("slot63"), 63))
 
-        receive_asserts(3)
+        snap = quiesce(QUIESCE_TIMEOUT, noticeboard=True)
+        assert len(snap) >= 64
+        assert snap.get("slot0") == 0
+        assert snap.get("slot63") == 63
 
     def test_65th_entry_silently_dropped(self):
         """The 65th distinct key is silently dropped by the noticeboard thread."""
@@ -358,51 +288,37 @@ def test_65th_entry_silently_dropped(self):
         def step1(x):
             for i in range(65):
                 notice_write(f"cap{i}", i)
-                notice_sync()
-
-        @when(x, step1)
-        def step2(x, _):
-            snap = noticeboard()
-            # Only 64 entries should be present; the 65th is dropped
-            cap_keys = [k for k in snap if k.startswith("cap")]
-            send("assert", (len(cap_keys), 64))
-            # The first 64 keys (cap0..cap63) should be present
-            send("assert", (snap.get("cap0"), 0))
-            send("assert", (snap.get("cap63"), 63))
-            # The 65th key (cap64) should be missing
-            send("assert", ("cap64" not in snap, True))
 
-        receive_asserts(4)
+        snap = quiesce(QUIESCE_TIMEOUT, noticeboard=True)
+        cap_keys = [k for k in snap if k.startswith("cap")]
+        assert len(cap_keys) == 64
+        assert snap.get("cap0") == 0
+        assert snap.get("cap63") == 63
+        assert "cap64" not in snap
 
     def test_write_non_string_key_rejected(self):
         """Non-string key raises TypeError."""
         x = Cown(0)
 
         @when(x)
-        def _(x):
-            try:
-                notice_write(123, "value")
-                notice_sync()
-                send("assert", (False, True))
-            except TypeError:
-                send("assert", (True, True))
+        def probe(x):
+            notice_write(123, "value")
 
-        receive_asserts()
+        quiesce(QUIESCE_TIMEOUT)
+        with pytest.raises(TypeError):
+            probe.unwrap()
 
     def test_key_with_nul_rejected(self):
         """A key containing NUL is rejected with ValueError."""
         x = Cown(0)
 
         @when(x)
-        def _(x):
-            try:
-                notice_write("a\x00b", "value")
-                notice_sync()
-                send("assert", (False, True))
-            except ValueError:
-                send("assert", (True, True))
+        def probe(x):
+            notice_write("a\x00b", "value")
 
-        receive_asserts()
+        quiesce(QUIESCE_TIMEOUT)
+        with pytest.raises(ValueError):
+            probe.unwrap()
 
 
 class TestNoticeboardConcurrency:
@@ -425,23 +341,12 @@ def test_concurrent_writes_from_independent_behaviors(self):
             @when(cowns[i])
             def writer(c):
                 notice_write(f"cw_{c.value}", c.value * 10)
-                # Block this behavior until the write commits, so the
-                # reader (which acquires every cown below) is guaranteed
-                # to observe it.
-                notice_sync()
-
-        # The reader requires every writer cown, so it cannot run until
-        # every writer behavior has returned — and notice_sync() above
-        # ensures each writer's mutation is committed before it returns.
-        @when(cowns)
-        def reader(cowns):
-            snap = noticeboard()
-            count = sum(1 for k in snap if k.startswith("cw_"))
-            send("assert", (count, 8))
-            send("assert", (snap.get("cw_0"), 0))
-            send("assert", (snap.get("cw_7"), 70))
 
-        receive_asserts(3)
+        snap = quiesce(QUIESCE_TIMEOUT, noticeboard=True)
+        count = sum(1 for k in snap if k.startswith("cw_"))
+        assert count == 8
+        assert snap.get("cw_0") == 0
+        assert snap.get("cw_7") == 70
 
 
 class TestNoticeboardUTF8:
@@ -459,37 +364,27 @@ def setup_method(self):
     def test_multibyte_key_within_limit(self):
         """A 3-byte character at byte position 60 fits within 63-byte limit."""
         x = Cown(0)
-        # "€" is 3 UTF-8 bytes; 60 ASCII + 3 = 63 bytes total
         key_63 = "a" * 60 + "€"
 
         @when(x)
         def step1(x):
             notice_write(key_63, "ok")
-            notice_sync()
 
-        @when(x, step1)
-        def step2(x, _):
-            val = notice_read(key_63)
-            send("assert", (val, "ok"))
-
-        receive_asserts()
+        snap = quiesce(QUIESCE_TIMEOUT, noticeboard=True)
+        assert snap.get(key_63) == "ok"
 
     def test_multibyte_key_exceeds_limit(self):
         """A 3-byte character at byte position 61 exceeds the 63-byte limit."""
         x = Cown(0)
-        # 61 ASCII + 3 = 64 bytes total, exceeds limit
         key_64 = "a" * 61 + "€"
 
         @when(x)
-        def _(x):
-            try:
-                notice_write(key_64, "fail")
-                notice_sync()
-                send("assert", (False, True))
-            except ValueError:
-                send("assert", (True, True))
+        def probe(x):
+            notice_write(key_64, "fail")
 
-        receive_asserts()
+        quiesce(QUIESCE_TIMEOUT)
+        with pytest.raises(ValueError):
+            probe.unwrap()
 
 
 class TestNoticeboardRestart:
@@ -502,31 +397,22 @@ def test_noticeboard_empty_after_restart(self):
         @when(x)
         def step1(x):
             notice_write("before_restart", 42)
-            notice_sync()
-
-        @when(x)
-        def step2(x):
-            snap = noticeboard()
-            send("assert", (snap.get("before_restart"), 42))
 
-        receive_asserts()
+        snap = quiesce(QUIESCE_TIMEOUT, noticeboard=True)
+        assert snap.get("before_restart") == 42
         wait()
 
-        # Start fresh — noticeboard should be cleared by stop()
         y = Cown(0)
 
         @when(y)
-        def check(y):
-            snap = noticeboard()
-            send("assert", ("before_restart" not in snap, True))
+        def step2(y):
+            pass
 
-        receive_asserts()
+        snap2 = quiesce(QUIESCE_TIMEOUT, noticeboard=True)
+        assert "before_restart" not in snap2
         wait()
 
 
-# Module-level helpers for notice_update tests (must be picklable).
-
-
 def _increment(x):
     """Return x + 1."""
     return x + 1
@@ -578,18 +464,13 @@ def test_basic_increment(self):
         @when(x)
         def step1(x):
             notice_write("counter", 10)
-            notice_sync()
 
         @when(x, step1)
         def step2(x, _):
             notice_update("counter", _increment)
-            notice_sync()
-
-        @when(x, step2)
-        def step3(x, _):
-            send("assert", (notice_read("counter"), 11))
 
-        receive_asserts()
+        snap = quiesce(QUIESCE_TIMEOUT, noticeboard=True)
+        assert snap.get("counter") == 11
 
     def test_default_on_absent_key(self):
         """Update a missing key uses the default value."""
@@ -598,13 +479,9 @@ def test_default_on_absent_key(self):
         @when(x)
         def step1(x):
             notice_update("missing", _add_ten, default=0)
-            notice_sync()
 
-        @when(x, step1)
-        def step2(x, _):
-            send("assert", (notice_read("missing"), 10))
-
-        receive_asserts()
+        snap = quiesce(QUIESCE_TIMEOUT, noticeboard=True)
+        assert snap.get("missing") == 10
 
     def test_none_sentinel(self):
         """A key holding None is distinguished from an absent key."""
@@ -613,20 +490,13 @@ def test_none_sentinel(self):
         @when(x)
         def step1(x):
             notice_write("k", None)
-            notice_sync()
 
         @when(x, step1)
         def step2(x, _):
             notice_update("k", _wrap_value, default="WRONG")
-            notice_sync()
-
-        @when(x, step2)
-        def step3(x, _):
-            val = notice_read("k")
-            # fn should have received None (the stored value), not "WRONG"
-            send("assert", (val, (None, "seen")))
 
-        receive_asserts()
+        snap = quiesce(QUIESCE_TIMEOUT, noticeboard=True)
+        assert snap.get("k") == (None, "seen")
 
     def test_concurrent_updates(self):
         """Multiple independent behaviors updating the same key."""
@@ -637,45 +507,33 @@ def test_concurrent_updates(self):
             @when(cowns[i])
             def writer(c):
                 notice_update("counter", _increment, default=0)
-                notice_sync()
 
-        # Reader requires every writer cown -> runs only after every
-        # writer behavior returns -> after every notice_sync() commits.
-        @when(cowns)
-        def reader(_):
-            send("assert", (notice_read("counter"), n))
-
-        receive_asserts()
+        snap = quiesce(QUIESCE_TIMEOUT, noticeboard=True)
+        assert snap.get("counter") == n
 
     def test_key_validation_type(self):
         """Non-string key raises TypeError."""
         x = Cown(0)
 
         @when(x)
-        def _(x):
-            try:
-                notice_update(123, _increment)
-                notice_sync()
-                send("assert", (False, True))
-            except TypeError:
-                send("assert", (True, True))
+        def probe(x):
+            notice_update(123, _increment)
 
-        receive_asserts()
+        quiesce(QUIESCE_TIMEOUT)
+        with pytest.raises(TypeError):
+            probe.unwrap()
 
     def test_fn_not_callable(self):
         """Non-callable fn raises TypeError."""
         x = Cown(0)
 
         @when(x)
-        def _(x):
-            try:
-                notice_update("key", "not_callable")
-                notice_sync()
-                send("assert", (False, True))
-            except TypeError:
-                send("assert", (True, True))
+        def probe(x):
+            notice_update("key", "not_callable")
 
-        receive_asserts()
+        quiesce(QUIESCE_TIMEOUT)
+        with pytest.raises(TypeError):
+            probe.unwrap()
 
     def test_fn_raises_keeps_previous_value(self):
         """If fn raises, the key retains its previous value."""
@@ -684,18 +542,13 @@ def test_fn_raises_keeps_previous_value(self):
         @when(x)
         def step1(x):
             notice_write("safe", 42)
-            notice_sync()
 
         @when(x, step1)
         def step2(x, _):
             notice_update("safe", _div_by_zero)
-            notice_sync()
-
-        @when(x, step2)
-        def step3(x, _):
-            send("assert", (notice_read("safe"), 42))
 
-        receive_asserts()
+        snap = quiesce(QUIESCE_TIMEOUT, noticeboard=True)
+        assert snap.get("safe") == 42
 
     def test_functools_partial(self):
         """functools.partial with a builtin works as fn."""
@@ -704,13 +557,9 @@ def test_functools_partial(self):
         @when(x)
         def step1(x):
             notice_update("best", partial(max, 42), default=0)
-            notice_sync()
 
-        @when(x, step1)
-        def step2(x, _):
-            send("assert", (notice_read("best"), 42))
-
-        receive_asserts()
+        snap = quiesce(QUIESCE_TIMEOUT, noticeboard=True)
+        assert snap.get("best") == 42
 
 
 class TestNoticeboardReadOnly:
@@ -728,20 +577,23 @@ def test_snapshot_mutation_rejected(self):
         @when(x)
         def step1(x):
             notice_write("immut", 1)
-            notice_sync()
 
-        @when(x, step1)
-        def step2(x, _):
+        quiesce(QUIESCE_TIMEOUT, noticeboard=True)
+
+        @when(x)
+        def step2(x):
             snap = noticeboard()
+            raised = False
             try:
                 snap["immut"] = 999
-                send("assert", (False, True))  # should not reach here
             except TypeError:
-                send("assert", (True, True))
-            # Original value is unaffected
-            send("assert", (notice_read("immut"), 1))
+                raised = True
+            return (raised, notice_read("immut"))
 
-        receive_asserts(2)
+        quiesce(QUIESCE_TIMEOUT)
+        raised, original = step2.unwrap()
+        assert raised is True
+        assert original == 1
 
     def test_snapshot_del_rejected(self):
         """Deleting a key from the snapshot raises TypeError."""
@@ -750,18 +602,19 @@ def test_snapshot_del_rejected(self):
         @when(x)
         def step1(x):
             notice_write("del_test", 42)
-            notice_sync()
 
         @when(x, step1)
         def step2(x, _):
             snap = noticeboard()
+            raised = False
             try:
                 del snap["del_test"]
-                send("assert", (False, True))
             except TypeError:
-                send("assert", (True, True))
+                raised = True
+            return raised
 
-        receive_asserts()
+        quiesce(QUIESCE_TIMEOUT)
+        assert step2.unwrap() is True
 
 
 class TestNoticeboardPreRuntime:
@@ -782,13 +635,11 @@ def test_notice_write_before_start(self):
         """notice_write raises RuntimeError before the runtime is started."""
         with pytest.raises(RuntimeError, match="cannot write to the noticeboard"):
             notice_write("key", "value")
-            notice_sync()
 
     def test_notice_update_before_start(self):
         """notice_update raises RuntimeError before the runtime is started."""
         with pytest.raises(RuntimeError, match="cannot update the noticeboard"):
             notice_update("key", _increment)
-            notice_sync()
 
 
 class TestNoticeboardFireAndForget:
@@ -810,17 +661,12 @@ def test_write_persists_after_behavior_failure(self):
         @when(x)
         def failing(x):
             notice_write("survivor", 42)
-            notice_sync()
             raise ValueError("intentional failure")
 
-        @when(x, failing)
-        def check(x, _):
-            send("assert", (notice_read("survivor"), 42))
-
-        receive_asserts()
-
-
-# Module-level helpers for notice_delete / REMOVED tests.
+        snap = quiesce(QUIESCE_TIMEOUT, noticeboard=True)
+        assert snap.get("survivor") == 42
+        with pytest.raises(ValueError, match="intentional failure"):
+            failing.unwrap()
 
 
 def _read_ring_first_value(_ignored):
@@ -890,39 +736,35 @@ def setup_method(self):
 
     def test_ring_of_cowns_survives_writer_dropping_reference(self):
         """A list of cowns on the noticeboard is usable after writer drops it."""
-        # Build a small ring of cowns in a behavior, publish it to the
-        # noticeboard, then drop every local reference to the ring on the
-        # writer side. The noticeboard becomes the only thing keeping the
-        # cowns alive across worker reads.
         x = Cown(0)
 
         @when(x)
         def writer(x):
             ring = [Cown(i * 10) for i in range(8)]
             notice_write("ring", ring)
-            notice_sync()
-            # Local goes out of scope at function return — only the
-            # noticeboard's pin is left.
 
-        @when(x, writer)
-        def first_read(x, _):
+        quiesce(QUIESCE_TIMEOUT, noticeboard=True)
+
+        @when(x)
+        def first_read(x):
             ring = noticeboard()["ring"]
-            send("assert", (len(ring), 8))
+            return len(ring)
 
         @when(x, first_read)
         def second_read(x, _):
             ring = noticeboard()["ring"]
-            send("assert", (len(ring), 8))
+            return len(ring)
 
         @when(x, second_read)
         def acquire_first(x, _):
             ring = noticeboard()["ring"]
-            # Acquire the first cown for read; this dereferences the
-            # underlying BOCCown and would assert if it had been freed.
             with ring[0] as v:
-                send("assert", (v, 0))
+                return v
 
-        receive_asserts(count=3)
+        quiesce(QUIESCE_TIMEOUT)
+        assert first_read.unwrap() == 8
+        assert second_read.unwrap() == 8
+        assert acquire_first.unwrap() == 0
 
     def test_overwrite_releases_old_cown_pins(self):
         """Overwriting a noticeboard entry releases the old entry's pins."""
@@ -932,21 +774,22 @@ def test_overwrite_releases_old_cown_pins(self):
         def first_write(x):
             first = [Cown(i) for i in range(4)]
             notice_write("ring", first)
-            notice_sync()
 
         @when(x, first_write)
         def second_write(x, _):
             second = [Cown(100 + i) for i in range(4)]
             notice_write("ring", second)
-            notice_sync()
 
-        @when(x, second_write)
-        def check(x, _):
+        quiesce(QUIESCE_TIMEOUT, noticeboard=True)
+
+        @when(x)
+        def check(x):
             ring = noticeboard()["ring"]
             with ring[0] as v:
-                send("assert", (v, 100))
+                return v
 
-        receive_asserts()
+        quiesce(QUIESCE_TIMEOUT)
+        assert check.unwrap() == 100
 
     def test_delete_releases_cown_pins(self):
         """notice_delete drops the entry's pins; a fresh write reuses the slot."""
@@ -956,34 +799,33 @@ def test_delete_releases_cown_pins(self):
         def initial_write(x):
             ring = [Cown(i) for i in range(3)]
             notice_write("ring", ring)
-            notice_sync()
 
         @when(x, initial_write)
         def remove_entry(x, _):
             notice_delete("ring")
-            notice_sync()
 
-        # The delete is non-blocking; verify in a subsequent behavior so
-        # the noticeboard thread has had a chance to process the message and the
-        # per-behavior snapshot cache is rebuilt.
-        @when(x, remove_entry)
-        def verify_gone(x, _):
-            send("assert", ("ring" in noticeboard(), False))
+        quiesce(QUIESCE_TIMEOUT, noticeboard=True)
+
+        @when(x)
+        def verify_gone(x):
+            return "ring" in noticeboard()
 
-        # After delete + new write the noticeboard reads the new entry.
         @when(x, verify_gone)
         def write_new(x, _):
             new_ring = [Cown(999)]
             notice_write("ring", new_ring)
-            notice_sync()
 
-        @when(x, write_new)
-        def check_new(x, _):
+        quiesce(QUIESCE_TIMEOUT, noticeboard=True)
+
+        @when(x)
+        def check_new(x):
             ring = noticeboard()["ring"]
             with ring[0] as v:
-                send("assert", (v, 999))
+                return v
 
-        receive_asserts(count=2)
+        quiesce(QUIESCE_TIMEOUT)
+        assert verify_gone.unwrap() is False
+        assert check_new.unwrap() == 999
 
     def test_slot_only_holder_cown_survives_writer(self):
         """Cowns reachable through ``__slots__`` are pinned by the noticeboard.
@@ -1001,18 +843,19 @@ def test_slot_only_holder_cown_survives_writer(self):
         def writer(x):
             holder = SlotHolder(Cown(12345), "first")
             notice_write("slot_holder", holder)
-            notice_sync()
-            # Local goes out of scope at function return -- only the
-            # noticeboard's pin should keep the inner Cown alive.
 
-        @when(x, writer)
-        def read_back(x, _):
+        quiesce(QUIESCE_TIMEOUT, noticeboard=True)
+
+        @when(x)
+        def read_back(x):
             holder = noticeboard()["slot_holder"]
-            send("assert", (holder.label, "first"))
             with holder.cown as v:
-                send("assert", (v, 12345))
+                return (holder.label, v)
 
-        receive_asserts(count=2)
+        quiesce(QUIESCE_TIMEOUT)
+        label, v = read_back.unwrap()
+        assert label == "first"
+        assert v == 12345
 
     def test_slot_subclass_cown_survives_writer(self):
         """Cowns reachable through an MRO chain of ``__slots__`` are pinned.
@@ -1027,18 +870,20 @@ def test_slot_subclass_cown_survives_writer(self):
         def writer(x):
             holder = SlotSubclass(Cown(7777), "sub", Cown(8888))
             notice_write("slot_sub", holder)
-            notice_sync()
 
-        @when(x, writer)
-        def read_back(x, _):
+        quiesce(QUIESCE_TIMEOUT, noticeboard=True)
+
+        @when(x)
+        def read_back(x):
             holder = noticeboard()["slot_sub"]
-            send("assert", (holder.label, "sub"))
-            with holder.cown as v:
-                send("assert", (v, 7777))
-            with holder.extra as v:
-                send("assert", (v, 8888))
+            with holder.cown as v1, holder.extra as v2:
+                return (holder.label, v1, v2)
 
-        receive_asserts(count=3)
+        quiesce(QUIESCE_TIMEOUT)
+        label, v1, v2 = read_back.unwrap()
+        assert label == "sub"
+        assert v1 == 7777
+        assert v2 == 8888
 
 
 class TestNoticeboardSnapshotImmutable:
@@ -1062,18 +907,14 @@ def test_snapshot_is_mappingproxy(self):
         @when(x)
         def setup_then_check(x):
             notice_write("k", "v")
-            notice_sync()
 
         @when(x, setup_then_check)
         def check(x, _):
             snap = noticeboard()
-            # Avoid importing MappingProxyType inside the behavior — the
-            # transpiler would capture the symbol and pickling the
-            # ``mappingproxy`` builtin class fails. Compare by type name
-            # instead.
-            send("assert", (type(snap).__name__, "mappingproxy"))
+            return type(snap).__name__
 
-        receive_asserts()
+        quiesce(QUIESCE_TIMEOUT)
+        assert check.unwrap() == "mappingproxy"
 
     def test_snapshot_rejects_mutation(self):
         """Attempting to mutate the snapshot raises TypeError."""
@@ -1082,18 +923,19 @@ def test_snapshot_rejects_mutation(self):
         @when(x)
         def writer(x):
             notice_write("k", "v")
-            notice_sync()
 
         @when(x, writer)
         def check(x, _):
             snap = noticeboard()
+            raised = False
             try:
                 snap["k"] = "new"  # type: ignore[index]
-                send("assert", ("no-error", "TypeError"))
             except TypeError:
-                send("assert", ("TypeError", "TypeError"))
+                raised = True
+            return raised
 
-        receive_asserts()
+        quiesce(QUIESCE_TIMEOUT)
+        assert check.unwrap() is True
 
 
 class TestNoticeboardThreadOnly:
@@ -1102,16 +944,14 @@ class TestNoticeboardThreadOnly:
     @classmethod
     def setup_class(cls):
         """Start the runtime so that NB_NOTICEBOARD_TID is registered."""
-        # A trivial behavior is enough to spin up the runtime. After
-        # this point any direct C-level write/delete from the main
-        # thread must be rejected.
         x = Cown(0)
 
         @when(x)
         def _noop(x):
-            send("assert", (1, 1))
+            return 1
 
-        receive_asserts()
+        quiesce(QUIESCE_TIMEOUT)
+        assert _noop.unwrap() == 1
 
     @classmethod
     def teardown_class(cls):
@@ -1147,19 +987,13 @@ def test_delete_existing_key(self):
         @when(x)
         def step1(x):
             notice_write("doomed", 99)
-            notice_sync()
 
         @when(x, step1)
         def step2(x, _):
             notice_delete("doomed")
-            notice_sync()
-
-        @when(x, step2)
-        def check(x, _):
-            snap = noticeboard()
-            send("assert", ("doomed" not in snap, True))
 
-        receive_asserts()
+        snap = quiesce(QUIESCE_TIMEOUT, noticeboard=True)
+        assert "doomed" not in snap
 
     def test_delete_absent_key_is_noop(self):
         """notice_delete on a missing key is a silent no-op."""
@@ -1169,13 +1003,9 @@ def test_delete_absent_key_is_noop(self):
         def step1(x):
             notice_write("keeper", "safe")
             notice_delete("nonexistent")
-            notice_sync()
-
-        @when(x, step1)
-        def check(x, _):
-            send("assert", (notice_read("keeper"), "safe"))
 
-        receive_asserts()
+        snap = quiesce(QUIESCE_TIMEOUT, noticeboard=True)
+        assert snap.get("keeper") == "safe"
 
     def test_update_fn_returns_removed(self):
         """When fn returns REMOVED, the entry is deleted."""
@@ -1184,19 +1014,13 @@ def test_update_fn_returns_removed(self):
         @when(x)
         def step1(x):
             notice_write("target", 42)
-            notice_sync()
 
         @when(x, step1)
         def step2(x, _):
             notice_update("target", _return_removed)
-            notice_sync()
-
-        @when(x, step2)
-        def check(x, _):
-            snap = noticeboard()
-            send("assert", ("target" not in snap, True))
 
-        receive_asserts()
+        snap = quiesce(QUIESCE_TIMEOUT, noticeboard=True)
+        assert "target" not in snap
 
     def test_update_conditional_remove(self):
         """REMOVED only triggers when fn actually returns it."""
@@ -1205,19 +1029,13 @@ def test_update_conditional_remove(self):
         @when(x)
         def step1(x):
             notice_write("val", 50)
-            notice_sync()
 
-        # 50 <= 100, so fn returns 51
         @when(x, step1)
         def step2(x, _):
             notice_update("val", _conditionally_remove)
-            notice_sync()
-
-        @when(x, step2)
-        def check1(x, _):
-            send("assert", (notice_read("val"), 51))
 
-        receive_asserts()
+        snap = quiesce(QUIESCE_TIMEOUT, noticeboard=True)
+        assert snap.get("val") == 51
 
     def test_update_conditional_remove_triggers(self):
         """REMOVED triggers when value exceeds threshold."""
@@ -1226,20 +1044,13 @@ def test_update_conditional_remove_triggers(self):
         @when(x)
         def step1(x):
             notice_write("val", 200)
-            notice_sync()
 
-        # 200 > 100, so fn returns REMOVED
         @when(x, step1)
         def step2(x, _):
             notice_update("val", _conditionally_remove)
-            notice_sync()
-
-        @when(x, step2)
-        def check(x, _):
-            snap = noticeboard()
-            send("assert", ("val" not in snap, True))
 
-        receive_asserts()
+        snap = quiesce(QUIESCE_TIMEOUT, noticeboard=True)
+        assert "val" not in snap
 
     def test_removed_then_update_uses_default(self):
         """After deletion, notice_update uses the default value."""
@@ -1248,23 +1059,17 @@ def test_removed_then_update_uses_default(self):
         @when(x)
         def step1(x):
             notice_write("counter", 10)
-            notice_sync()
 
         @when(x, step1)
         def step2(x, _):
             notice_delete("counter")
-            notice_sync()
 
         @when(x, step2)
         def step3(x, _):
             notice_update("counter", _increment, default=0)
-            notice_sync()
 
-        @when(x, step3)
-        def check(x, _):
-            send("assert", (notice_read("counter"), 1))
-
-        receive_asserts()
+        snap = quiesce(QUIESCE_TIMEOUT, noticeboard=True)
+        assert snap.get("counter") == 1
 
     def test_delete_frees_capacity(self):
         """Deleting an entry frees a slot for a new entry."""
@@ -1274,25 +1079,17 @@ def test_delete_frees_capacity(self):
         def fill(x):
             for i in range(64):
                 notice_write(f"k{i}", i)
-            notice_sync()
 
         @when(x, fill)
         def delete_one(x, _):
             notice_delete("k0")
-            notice_sync()
 
         @when(x, delete_one)
         def add_new(x, _):
             notice_write("new_key", "hello")
-            notice_sync()
-
-        @when(x, add_new)
-        def check(x, _):
-            snap = noticeboard()
-            present = "new_key" in snap and "k0" not in snap
-            send("assert", (present, True))
 
-        receive_asserts()
+        snap = quiesce(QUIESCE_TIMEOUT, noticeboard=True)
+        assert "new_key" in snap and "k0" not in snap
 
 
 class TestNoticeDeletePreRuntime:
@@ -1310,7 +1107,6 @@ def test_notice_delete_before_start(self):
         """notice_delete raises RuntimeError before the runtime is started."""
         with pytest.raises(RuntimeError, match="cannot delete from the noticeboard"):
             notice_delete("key")
-            notice_sync()
 
 
 class TestNoticeDeleteValidation:
@@ -1323,7 +1119,7 @@ def teardown_class(cls):
 
     def test_notice_delete_non_string_key(self):
         """notice_delete raises TypeError for non-string key."""
-        x = Cown(0)  # triggers runtime start
+        x = Cown(0)
 
         @when(x)
         def _(x):
@@ -1331,7 +1127,6 @@ def _(x):
 
         with pytest.raises(TypeError, match="noticeboard key must be a str"):
             notice_delete(123)
-            notice_sync()
 
 
 class TestRemovedSentinel:
@@ -1382,83 +1177,46 @@ def test_self_write_invisible_within_behavior(self):
         @when(x)
         def step1(x):
             notice_write("self", "before")
-            notice_sync()
 
-        @when(x, step1)
-        def step2(x, _):
+        quiesce(QUIESCE_TIMEOUT, noticeboard=True)
+
+        @when(x)
+        def step2(x):
             before = notice_read("self")
             notice_write("self", "after")
-            notice_sync()
             after_same_behavior = notice_read("self")
-            send("assert", (before, "before"))
-            send("assert", (after_same_behavior, "before"))
+            return (before, after_same_behavior)
 
-        @when(x, step2)
-        def step3(x, _):
-            # New behavior — must see the committed write.
-            send("assert", (notice_read("self"), "after"))
+        quiesce(QUIESCE_TIMEOUT, noticeboard=True)
 
-        receive_asserts(3)
+        @when(x)
+        def step3(x):
+            return notice_read("self")
+
+        quiesce(QUIESCE_TIMEOUT)
+        before, after_same_behavior = step2.unwrap()
+        assert before == "before"
+        assert after_same_behavior == "before"
+        assert step3.unwrap() == "after"
 
     def test_cross_behavior_visibility_preserved(self):
-        """Sanity: write in A is visible in B (no regression vs baseline)."""
+        """Sanity: a write in A is visible to a later behavior after a barrier."""
         x = Cown(0)
 
         @when(x)
         def writer(x):
             notice_write("xv", "from_A")
-            notice_sync()
-
-        @when(x, writer)
-        def reader(x, _):
-            send("assert", (notice_read("xv"), "from_A"))
 
-        receive_asserts()
+        quiesce(QUIESCE_TIMEOUT, noticeboard=True)
 
+        @when(x)
+        def reader(x):
+            return notice_read("xv")
 
-class TestNoticeSyncReturnType:
-    """Pin the documented return type of ``notice_sync()`` (None)."""
-
-    @classmethod
-    def teardown_class(cls):
-        """Drain the runtime after the suite."""
-        wait()
+        quiesce(QUIESCE_TIMEOUT)
+        assert reader.unwrap() == "from_A"
 
-    def test_returns_none_inside_behavior(self):
-        x = Cown(0)
 
-        @when(x)
-        def _(x):
-            notice_write("rk", 1)
-            result = notice_sync()
-            send("assert", (result, None))
-
-        receive_asserts()
-
-
-# ---------------------------------------------------------------------------
-# Pin-walker audit: cowns hidden from the walker must not survive a write.
-# ---------------------------------------------------------------------------
-#
-# The pin walker (``_gather_pins`` / ``_collect_cown_capsules``) traverses
-# ``__dict__``, ``__slots__`` (up the MRO), and the standard container
-# protocols (dict/list/tuple/set/frozenset). A value whose pickler reaches
-# a cown by *any other route* — module-level cache lookup, closure capture,
-# ``copyreg.dispatch_table``, custom ``__reduce__`` / ``__getstate__`` —
-# would, without the audit, produce a borrowing token whose underlying
-# ``BOCCown`` is not held alive by the noticeboard entry's pin set. The
-# first reader to resurrect that pointer after the writer's local wrapper
-# drops would touch freed memory (CWE-416).
-#
-# The audit checks every ``CownCapsule_reduce`` against the caller's pin
-# set during the borrowing pickle and fails the whole write closed if any
-# cown is unaccounted for. These tests pin that contract.
-
-
-# Module-level state for the hidden-cown reducer. The class can be pickled
-# by sub-interpreters (it's importable), but the inner cown is fetched via
-# the module cache rather than stored as an attribute — so the walker
-# cannot see it.
 _HIDDEN_CACHE: dict = {}
 
 
@@ -1519,12 +1277,6 @@ def setup_method(self):
         _HIDDEN_CACHE.clear()
 
     def teardown_method(self):
-        # Symmetric clear so a strong reference to the hidden cown
-        # (the dict value) does not linger past the test that created
-        # it. Without this, the cown survives until the next test's
-        # setup_method runs — long enough to alias with subsequent
-        # noticeboard activity if the suite is reordered or a new
-        # test imports _HiddenCownToken from another module.
         _HIDDEN_CACHE.clear()
 
     def test_hidden_cown_rejected_and_entry_not_installed(self, caplog):
@@ -1534,21 +1286,12 @@ def test_hidden_cown_rejected_and_entry_not_installed(self, caplog):
         @when(x)
         def writer(x):
             hidden_cown = Cown(42)
-            # _HIDDEN_CACHE entry created as a side effect; only the key
-            # is reachable through __dict__, so _gather_pins returns [].
             token = _HiddenCownToken(7, hidden_cown)
             notice_write("hidden", token)
-            notice_sync()
-
-        @when(x, writer)
-        def reader(x, _):
-            # The audit fires on the noticeboard thread; the exception
-            # is caught and logged at WARNING. The behavioural assertion
-            # is that the entry never landed.
-            send("assert", (notice_read("hidden"), None))
 
         with caplog.at_level("WARNING", logger="behaviors"):
-            receive_asserts()
+            snap = quiesce(QUIESCE_TIMEOUT, noticeboard=True)
+            assert snap.get("hidden") is None
 
         matching = [
             r for r in caplog.records
@@ -1577,19 +1320,25 @@ def writer(x):
             a = Cown("a")
             b = Cown("b")
             notice_write("pair", _VisibleCownPair(a, b))
-            notice_sync()
 
-        @when(x, writer)
-        def reader(x, _):
+        quiesce(QUIESCE_TIMEOUT, noticeboard=True)
+
+        @when(x)
+        def reader(x):
             pair = notice_read("pair")
-            # Snapshot returned a value, the value is the right type,
-            # and both embedded cowns survived as live CownCapsules.
-            send("assert", (pair is not None, True))
-            send("assert", (isinstance(pair, _VisibleCownPair), True))
-            send("assert", (isinstance(pair.a, Cown), True))
-            send("assert", (isinstance(pair.b, Cown), True))
+            return (
+                pair is not None,
+                isinstance(pair, _VisibleCownPair),
+                isinstance(pair.a, Cown),
+                isinstance(pair.b, Cown),
+            )
 
-        receive_asserts(4)
+        quiesce(QUIESCE_TIMEOUT)
+        not_none, is_pair, a_cown, b_cown = reader.unwrap()
+        assert not_none is True
+        assert is_pair is True
+        assert a_cown is True
+        assert b_cown is True
 
 
 class TestWaitNoticeboardCapture:
@@ -1602,8 +1351,8 @@ class TestWaitNoticeboardCapture:
     ``wait()`` itself is the quiescence barrier *and* drains the
     ``boc_noticeboard`` queue (the shutdown sentinel is FIFO behind
     every prior mutation), so the test bodies do not need an extra
-    ``send``/``receive`` handshake or a ``notice_sync()`` before
-    calling ``wait(noticeboard=True)``.
+    ``send``/``receive`` handshake before calling
+    ``wait(noticeboard=True)``.
     """
 
     @classmethod
@@ -1613,7 +1362,7 @@ def teardown_class(cls):
 
     def test_wait_noticeboard_true_returns_final_state(self):
         """The captured dict contains everything a behavior wrote."""
-        wait()  # baseline: runtime down
+        wait()
         x = Cown(0)
 
         @when(x)
@@ -1636,15 +1385,13 @@ def _(x):
             notice_write("k", "v")
 
         snap = wait(noticeboard=True)
-        # Plain dict means we can mutate locally without disturbing
-        # the now-cleared runtime.
         snap["local_only"] = True
         assert snap["local_only"] is True
         assert snap.get("k") == "v"
 
     def test_wait_noticeboard_true_runtime_never_started(self):
         """Empty dict when the runtime was never up."""
-        wait()  # ensure runtime is down
+        wait()
         assert wait(noticeboard=True) == {}
 
     def test_wait_noticeboard_true_empty_noticeboard(self):
@@ -1695,7 +1442,6 @@ def _(x):
 
         result = wait(stats=True, noticeboard=True)
         assert isinstance(result, WaitResult), type(result)
-        # Tuple-shape access still works (NamedTuple).
         stats_snap, nb_snap = result
         assert isinstance(stats_snap, list), type(stats_snap)
         assert isinstance(nb_snap, dict), type(nb_snap)
@@ -1749,11 +1495,6 @@ def writer(x):
             notice_write("keep", 1)
             notice_write("drop", 2)
 
-        # A second behavior runs the delete -- and because it is
-        # ordered after ``writer`` via the cown chain, the delete
-        # is guaranteed to land after the writes (FIFO ordering on
-        # the noticeboard queue alone is not enough across separate
-        # behaviors, which the cown chain provides).
         @when(x, writer)
         def deleter(x, _):
             notice_delete("drop")
@@ -1774,7 +1515,6 @@ def _(x):
         snap1 = wait(noticeboard=True)
         assert snap1.get("session1") is True
 
-        # New session; the previous session's data must be gone.
         y = Cown(0)
 
         @when(y)
@@ -1801,7 +1541,6 @@ def test_wait_noticeboard_survives_explicit_stop(self):
         def _(x):
             notice_write("k", "v")
 
-        # Explicit stop drives quiescence and captures the snapshot.
         inst = B.BEHAVIORS
         assert inst is not None
         inst.stop()
@@ -1809,7 +1548,129 @@ def _(x):
             inst._final_noticeboard
         )
 
-        # Now ``wait(noticeboard=True)`` re-enters ``stop()``; the
-        # captured snapshot must survive the second pass.
         snap = wait(noticeboard=True)
         assert snap == {"k": "v"}, snap
+
+
+class TestNoticeSeed:
+    """Tests for the synchronous main-thread notice_seed write."""
+
+    @classmethod
+    def setup_class(cls):
+        """Start the runtime so the noticeboard thread is registered."""
+        start()
+
+    @classmethod
+    def teardown_class(cls):
+        """Ensure runtime is drained after suite."""
+        wait()
+
+    def setup_method(self):
+        """Clear the noticeboard before each test."""
+        _core.noticeboard_clear()
+
+    def test_seed_visible_to_subsequent_behavior(self):
+        """A behavior scheduled after notice_seed observes the value."""
+        notice_seed("cfg", {"threshold": 7})
+        x = Cown(0)
+
+        @when(x)
+        def reader(x):
+            return notice_read("cfg", {}).get("threshold", -1)
+
+        quiesce(QUIESCE_TIMEOUT)
+        assert reader.unwrap() == 7
+
+    def test_seed_commits_before_return(self):
+        """notice_seed commits synchronously: a fresh snapshot sees it at once."""
+        notice_seed("now", 123)
+        assert _core.noticeboard_snapshot().get("now") == 123
+
+    def test_seed_visible_after_warm_main_cache(self):
+        """A seed is visible to the seeding thread even after its cache is warm.
+
+        The main thread has no behavior boundary to re-arm its snapshot
+        cache, so a read taken before the seed must not mask the seeded
+        value on the next read.
+        """
+        notice_seed("warm", 0)
+        assert notice_read("warm") == 0
+        notice_seed("warm", 1)
+        assert notice_read("warm") == 1
+
+    def test_seed_overwrite_last_wins(self):
+        """Seeding the same key twice keeps the last value."""
+        notice_seed("counter", 1)
+        notice_seed("counter", 2)
+        x = Cown(0)
+
+        @when(x)
+        def reader(x):
+            return notice_read("counter")
+
+        quiesce(QUIESCE_TIMEOUT)
+        assert reader.unwrap() == 2
+
+    def test_seed_rejected_in_worker(self):
+        """notice_seed raises RuntimeError when called from a behavior body."""
+        x = Cown(0)
+
+        @when(x)
+        def offender(x):
+            notice_seed("k", "v")
+
+        quiesce(QUIESCE_TIMEOUT)
+        with pytest.raises(RuntimeError, match="primary interpreter"):
+            offender.unwrap()
+
+    def test_seed_invalid_key_raises_on_main(self):
+        """An over-long key fails fast on the calling thread."""
+        with pytest.raises(ValueError):
+            notice_seed("k" * 64, "v")
+
+    def test_seed_embedded_cown_survives(self):
+        """A cown seeded from main outlives the writer's reference."""
+        notice_seed("ring", [Cown(i * 10) for i in range(4)])
+        x = Cown(0)
+
+        @when(x)
+        def reader(x):
+            ring = noticeboard()["ring"]
+            with ring[2] as v:
+                return (len(ring), v)
+
+        quiesce(QUIESCE_TIMEOUT)
+        size, value = reader.unwrap()
+        assert size == 4
+        assert value == 20
+
+
+class TestNoticeSeedAutoStart:
+    """notice_seed starts the runtime when it is the first bocpy call."""
+
+    @classmethod
+    def setup_class(cls):
+        """Guarantee a stopped runtime so the seed must auto-start it."""
+        wait()
+
+    @classmethod
+    def teardown_class(cls):
+        """Ensure runtime is drained after suite."""
+        wait()
+
+    def test_seed_auto_starts_runtime(self):
+        """Seeding with no prior start() brings the runtime up and commits."""
+        import bocpy.behaviors as B
+
+        assert B.BEHAVIORS is None
+        notice_seed("boot", "ready")
+        assert B.BEHAVIORS is not None
+
+        x = Cown(0)
+
+        @when(x)
+        def reader(x):
+            return notice_read("boot")
+
+        quiesce(QUIESCE_TIMEOUT)
+        assert reader.unwrap() == "ready"
diff --git a/test/test_pinned_pump.py b/test/test_pinned_pump.py
index 10534d3..5dbaa76 100644
--- a/test/test_pinned_pump.py
+++ b/test/test_pinned_pump.py
@@ -19,9 +19,11 @@
   ``max_behaviors``, ``raise_on_error``, BaseException propagation,
   and reentry rejection.
 
-All tests follow the standard ``send("assert", ...)`` / ``receive``
-/ trailing-``wait()`` idiom documented in
-``.github/skills/testing-with-boc/SKILL.md``.
+Tests read results back through ``quiesce()`` + :meth:`Cown.unwrap`
+(behaviors return the value under test); worker-scheduled pinned
+``@when`` chains are double-unwrapped. The handful of cross-interpreter
+handoffs that genuinely need the message queue (the round-trip
+producer/consumer split) keep ``send`` / ``receive``.
 """
 
 from __future__ import annotations
@@ -35,27 +37,22 @@
 from bocpy import (
     _core,
     Cown,
-    drain,
-    notice_sync,
     notice_update,
     notice_write,
     noticeboard,
     PinnedCown,
     pump,
     quiesce,
-    receive,
-    send,
     set_pump_watchdog,
     set_wait_pump_poll,
     start,
-    TIMEOUT,
     wait,
     when,
 )
 from bocpy import behaviors as _behaviors
 
 
-RECEIVE_TIMEOUT = 10
+QUIESCE_TIMEOUT = 10
 
 
 def _replace_with(new_value, _old):
@@ -82,36 +79,6 @@ def __repr__(self) -> str:
         return "<NotPicklable sentinel>"
 
 
-def receive_asserts(count=1):
-    """Drain all expected assertion messages, then fail on first mismatch.
-
-    The "assert" queue is always drained before returning so that leftover
-    messages from a failing test do not leak into subsequent tests in CI.
-    """
-    failed = None
-    timed_out = False
-    try:
-        for _ in range(count):
-            result = receive("assert", RECEIVE_TIMEOUT)
-            if result[0] == TIMEOUT:
-                timed_out = True
-                break
-            _, (actual, expected) = result
-            if failed is None and actual != expected:
-                failed = (actual, expected)
-    finally:
-        drain("assert")
-
-    assert not timed_out, (
-        "Timed out waiting for an 'assert' message from a behavior. "
-        "Check that every @when arg count matches the decorated "
-        "function's parameter count."
-    )
-    if failed is not None:
-        actual, expected = failed
-        assert actual == expected, f"expected {expected!r}, got {actual!r}"
-
-
 class TestPinnedCownBasics:
     """PinnedCown construction invariants (no schedule, no pump)."""
 
@@ -132,15 +99,16 @@ def test_pinned_value_identity_and_no_pickle(self):
         obj_id = id(obj)
         pc = PinnedCown(obj)
 
+        readers = []
         for _ in range(64):
             @when(pc)
             def _body(pc):
-                send("assert", (id(pc.value), obj_id))
+                return id(pc.value)
+            readers.append(_body)
 
-        # quiesce() so worker sub-interpreters survive until
-        # receive_asserts reads their messages.
-        quiesce()
-        receive_asserts(64)
+        quiesce(QUIESCE_TIMEOUT)
+        for r in readers:
+            assert r.unwrap() == obj_id
 
     def test_pinned_destruct_after_construction_only(self):
         """Drop a pinned cown immediately after construction.
@@ -158,30 +126,18 @@ def test_pinned_destruct_after_construction_only(self):
     def test_pinned_cown_off_main_raises(self):
         """``PinnedCown(...)`` from a worker raises ``RuntimeError``.
 
-        The except-clause cannot bind the exception to a name -- the
-        transpiler's free-variable scan treats ``ExceptHandler.name``
-        as a capture and frame-walking cannot resolve it. Capture
-        the type name and a substring of the message into plain
-        locals inside the except block and ship them through the
-        standard ``"assert"`` tag.
+        The behavior runs on a worker (no pinned cown in its request
+        set), constructs a ``PinnedCown`` off-main, and the resulting
+        ``RuntimeError`` is captured on the result cown. ``unwrap``
+        re-raises it on the main thread.
         """
         @when()
-        def _():
-            exc_type_name = "no-raise"
-            msg_mentions_main = False
-            try:
-                PinnedCown(object())
-            except RuntimeError as ex:
-                exc_type_name = "RuntimeError"
-                msg_mentions_main = "main interpreter" in str(ex)
+        def _probe():
+            PinnedCown(object())
 
-            send("assert", (
-                (exc_type_name, msg_mentions_main),
-                ("RuntimeError", True),
-            ))
-
-        quiesce()
-        receive_asserts()
+        quiesce(QUIESCE_TIMEOUT)
+        with pytest.raises(RuntimeError, match="main interpreter"):
+            _probe.unwrap()
 
 
 class TestPinnedCownsAutoDrain:
@@ -191,18 +147,17 @@ class TestPinnedCownsAutoDrain:
     def teardown_class(cls):
         wait()
 
-    # wait() with pinned cowns auto-drains.
     def test_wait_auto_drains_pinned(self):
-        """A pinned behavior scheduled before wait() runs on main without pump()."""
+        """A pinned behavior scheduled before quiesce() runs on main without pump()."""
         pc = PinnedCown({"hits": 0})
 
         @when(pc)
         def _body(pc):
             pc.value["hits"] += 1
-            send("assert", ("ran", "ran"))
+            return pc.value["hits"]
 
-        wait()
-        receive_asserts()
+        quiesce(QUIESCE_TIMEOUT)
+        assert _body.unwrap() == 1
 
     def test_wait_pinned_cown_in_cown(self):
         pc = PinnedCown({"hits": 0})
@@ -213,18 +168,15 @@ def _wrapper(w):
             @when(w.value)
             def _body(pc):
                 pc.value["hits"] += 1
-                send("assert", ("ran", "ran"))
+                return pc.value["hits"]
+            return _body
 
-        wait()
-        receive_asserts()
+        quiesce(QUIESCE_TIMEOUT)
+        assert _wrapper.unwrap().unwrap() == 1
 
     def test_main_pump_drain_all_marks_result_cowns(self):
         """``_core.main_pump_drain_all`` pops every entry and marks each result Cown with a shutdown RuntimeError."""
-        # 8 distinct cowns: same-cown behaviours serialise via MCS and only the head sits in MAIN_PINNED_QUEUE.
         pcs = [PinnedCown(0) for _ in range(8)]
-        # Capture each @when's result Cown (the value returned by the
-        # decorator) so we can inspect its exception/value after the
-        # drain runs.
         results = []
         for pc in pcs:
             @when(pc)
@@ -232,7 +184,6 @@ def _body(pc):
                 pc.value += 1
             results.append(_body)
 
-        # Precondition: all 8 still queued (no pump has run yet).
         assert _core.main_pump_queue_depth() == 8
 
         drained = _core.main_pump_drain_all()
@@ -242,7 +193,6 @@ def _body(pc):
         )
         assert _core.main_pump_queue_depth() == 0
 
-        # Re-acquire each result via the Cown context manager; *every* one must carry the drop-exception.
         for result in results:
             with result:
                 assert result.exception is True, (
@@ -258,52 +208,40 @@ def _body(pc):
                     f"shutdown: {result.value!r}"
                 )
 
-    # stop() with pending pinned work: drain runs.
     def test_stop_drains_pinned_queue(self):
         """An explicit stop() should leave MAIN_PINNED_QUEUE empty.
 
         Also verifies the transpiler's per-iteration capture of ``i``:
-        a final pinned behaviour reads ``pc.value`` and ships the
-        tuple back via ``send("final", ...)``. A regression that
-        late-bound ``i`` at body-execution time would yield ``(3, 3,
-        3, 3)`` instead of ``(0, 1, 2, 3)``.
+        a final pinned behaviour reads ``pc.value`` and writes the
+        tuple to the noticeboard. A regression that late-bound ``i``
+        at body-execution time would yield ``(3, 3, 3, 3)`` instead of
+        ``(0, 1, 2, 3)``.
         """
         pc = PinnedCown([])
         for i in range(4):
             @when(pc)
             def _body(pc):
                 pc.value.append(i)  # noqa: B023
-                send("assert", ("ran", "ran"))
 
         @when(pc)
         def _final(pc):
-            send("final", tuple(pc.value))
+            notice_write("pinned_final", tuple(pc.value))
 
-        wait()
-        receive_asserts(4)
-        final_tag, final_payload = receive("final", RECEIVE_TIMEOUT)
-        try:
-            assert final_tag != TIMEOUT, (
-                "timed out waiting for the final pinned behaviour"
-            )
-            assert final_payload == (0, 1, 2, 3), (
-                f"per-iteration capture of i broke: expected "
-                f"(0, 1, 2, 3), got {final_payload!r}"
-            )
-        finally:
-            drain("final")
+        snap = wait(noticeboard=True)
+        assert snap["pinned_final"] == (0, 1, 2, 3), (
+            f"per-iteration capture of i broke: expected "
+            f"(0, 1, 2, 3), got {snap['pinned_final']!r}"
+        )
         assert _core.main_pump_queue_depth() == 0
 
-    # shutdown_no_disown: refcount of pinned value preserved.
     def test_shutdown_does_not_disown_pinned_value(self):
         """The Python value inside a PinnedCown must outlive stop().
 
-        Schedule a pinned behaviour that records ``sys.getrefcount`` of the
-        underlying value before and after the body runs; after ``wait()``
-        completes, the value should still be reachable (no disown / no
-        XIData round-trip). The test reads the value via a fresh
-        PinnedCown handle inside a follow-up behaviour rather than from
-        test code so we don't reach across the shutdown boundary.
+        Schedule a pinned behaviour that reads the underlying value's
+        identity and contents; after ``quiesce()`` completes, the value
+        should still be reachable (no disown / no XIData round-trip).
+        The body runs on main via the pump, so ``id(pc.value)`` is
+        directly comparable to the value captured at construction.
         """
         v = ["sentinel"]
         pc = PinnedCown(v)
@@ -311,13 +249,14 @@ def test_shutdown_does_not_disown_pinned_value(self):
 
         @when(pc)
         def _body(pc):
-            # Value identity preserved across acquire/release.
-            send("assert", (id(pc.value), v_id))
+            id_matches = id(pc.value) == v_id
             pc.value.append("post-acquire")
-            send("assert", (pc.value, ["sentinel", "post-acquire"]))
+            return (id_matches, list(pc.value))
 
-        wait()
-        receive_asserts(2)
+        quiesce(QUIESCE_TIMEOUT)
+        id_matches, contents = _body.unwrap()
+        assert id_matches is True
+        assert contents == ["sentinel", "post-acquire"]
 
 
 class TestPinnedCownsManualPump:
@@ -340,7 +279,7 @@ def test_pump_max_behaviors_caps_drain(self):
         for pc in pcs:
             @when(pc)
             def _body(pc):
-                send("assert", ("ran", "ran"))
+                pass
 
         assert _core.main_pump_queue_depth() == 10
 
@@ -354,8 +293,6 @@ def _body(pc):
         assert rest.executed == 7
         assert _core.main_pump_queue_depth() == 0
 
-        receive_asserts(10)
-
     def test_pump_deadline_caps_drain(self):
         """``deadline_ms`` may trip before the queue drains.
 
@@ -368,7 +305,7 @@ def test_pump_deadline_caps_drain(self):
         for pc in pcs:
             @when(pc)
             def _body(pc):
-                send("assert", ("ran", "ran"))
+                pass
 
         result = pump(deadline_ms=1)
         assert result.raised == 0
@@ -382,7 +319,6 @@ def _body(pc):
             assert result.executed == 50
 
         assert _core.main_pump_queue_depth() == 0
-        receive_asserts(50)
 
     def test_pump_raise_on_error_re_raises(self):
         """``raise_on_error`` re-raises the first body Exception.
@@ -412,7 +348,7 @@ def test_pump_propagates_base_exception(self):
 
         After the first body's :class:`KeyboardInterrupt` re-raises,
         the second behavior is still queued; a follow-up unbounded
-        pump drains it and surfaces its ``send``.
+        pump drains it.
         """
         pc1 = PinnedCown("payload1")
         pc2 = PinnedCown("payload2")
@@ -423,7 +359,7 @@ def _body1(pc):
 
         @when(pc2)
         def _body2(pc):
-            send("assert", ("survivor-ran", "survivor-ran"))
+            pass
 
         assert _core.main_pump_queue_depth() == 2
 
@@ -434,36 +370,28 @@ def _body2(pc):
 
         rest = pump()
         assert rest.executed == 1
-        receive_asserts(1)
 
     def test_pump_rejects_nested_call(self):
         """``pump()`` from inside a pinned body raises ``RuntimeError``.
 
-        The body wraps the inner ``pump()`` in a try/except and ships
-        the captured type name + message-substring through the
-        standard ``"assert"`` tag, so the outer pump observes
+        The body wraps the inner ``pump()`` in a try/except and returns
+        whether the re-entrancy guard fired, so the outer pump observes
         ``raised == 0``.
         """
         pc = PinnedCown("nest")
 
         @when(pc)
         def _attempt(pc):
-            exc_type_name = "no-raise"
-            msg_says_reentrant = False
             try:
                 pump()
+                return False
             except RuntimeError as ex:
-                exc_type_name = "RuntimeError"
-                msg_says_reentrant = "not reentrant" in str(ex)
-            send("assert", (
-                (exc_type_name, msg_says_reentrant),
-                ("RuntimeError", True),
-            ))
+                return "not reentrant" in str(ex)
 
         result = pump()
         assert result.executed == 1
         assert result.raised == 0
-        receive_asserts()
+        assert _attempt.unwrap() is True
 
 
 class TestPumpArgValidation:
@@ -481,7 +409,6 @@ class TestPumpArgValidation:
     def teardown_class(cls):
         wait()
 
-    # Type rejection (incl. 0).
     @pytest.mark.parametrize("bad", [0, -1, -1000, 1.5, "1", True, False])
     def test_pump_deadline_ms_rejects_bad_input(self, bad):
         """Non-None / non-int / non-positive / bool ``deadline_ms`` raises."""
@@ -494,9 +421,6 @@ def test_pump_max_behaviors_rejects_bad_input(self, bad):
         with pytest.raises(TypeError, match="max_behaviors"):
             pump(max_behaviors=bad)
 
-    # The overflow cap is gated on the explicit ``ms=True``
-    # kwarg, not a name-string heuristic. A non-ms bound named
-    # ``max_behaviors`` must NOT trip the cap even at huge values.
     def test_validator_non_ms_bound_not_capped(self):
         """A non-ms bound passes through the validator without OverflowError."""
         huge = _behaviors._MAX_PUMP_MS * 1000 + 1
@@ -527,8 +451,6 @@ class TestPumpRuntimeRequired:
 
     def test_pump_before_start_raises_runtimeerror(self):
         """Without a live ``BEHAVIORS``, :func:`pump` raises immediately."""
-        # Ensure the runtime is fully torn down: a prior test in the
-        # session may have left BEHAVIORS populated.
         assert _behaviors.BEHAVIORS is None, (
             "expected runtime to be stopped before this test; previous "
             "test did not call wait() in teardown"
@@ -536,18 +458,15 @@ def test_pump_before_start_raises_runtimeerror(self):
 
         with pytest.raises(RuntimeError, match="bocpy.start"):
             pump()
-        # Stillborn pump must not start the runtime as a side effect.
         assert _behaviors.BEHAVIORS is None
 
 
-# set_wait_pump_poll picked up mid-wait.
 def test_set_wait_pump_poll_re_read():
     """``_WAIT_PUMP_POLL_MS`` is re-read on every auto-pump iteration."""
     set_wait_pump_poll(50)
     assert _behaviors._WAIT_PUMP_POLL_MS == 50
     set_wait_pump_poll(5)
     assert _behaviors._WAIT_PUMP_POLL_MS == 5
-    # restore default
     set_wait_pump_poll(50)
 
 
@@ -562,8 +481,6 @@ def test_set_wait_pump_poll_validation():
     with pytest.raises(TypeError):
         set_wait_pump_poll(True)
 
-# Sanity: the new C constants exist with the expected integer values.
-
 
 def test_terminator_wake_reason_constants():
     assert _core.TERMINATED == 0
@@ -571,14 +488,11 @@ def test_terminator_wake_reason_constants():
     assert _core.WAIT_TIMED_OUT == 2
 
 
-# Sanity: terminator_wait_pumpable returns TERMINATED when no work is in flight.
 def test_terminator_wait_pumpable_terminated_when_empty():
-    # No outstanding behaviours: count must be 0 -> TERMINATED.
     reason = _core.terminator_wait_pumpable(0.01)
     assert reason == _core.TERMINATED
 
 
-# Sanity: main_pump_drain_all on an empty queue returns 0 and is a no-op.
 def test_main_pump_drain_all_empty():
     assert _core.main_pump_drain_all() == 0
     assert _core.main_pump_queue_depth() == 0
@@ -608,95 +522,99 @@ def test_handle_round_trip_via_worker_closure(self):
 
         @when(unrelated)
         def _ship(u):
-            send("assert", (_core.cown_is_pinned(pc.impl), True))
-
             @when(pc)
             def _on_main(pc):
                 pc.value.append("main-ran")
-                send("assert", (pc.value, ["main-ran"]))
+                return pc.value
+            return (_core.cown_is_pinned(pc.impl), _on_main)
 
-        quiesce()
-        receive_asserts(2)
+        quiesce(QUIESCE_TIMEOUT)
+        is_pinned, on_main = _ship.unwrap()
+        assert is_pinned is True
+        assert on_main.unwrap() == ["main-ran"]
 
     def test_pinned_via_noticeboard_write(self):
         """``notice_write("k", PinnedCown(x))`` round-trips to a worker reader."""
         start()
         pc = PinnedCown([])
         notice_write("t5_pc", pc)
-        notice_sync()
+        quiesce(QUIESCE_TIMEOUT, noticeboard=True)
 
         unrelated = Cown(0)
 
         @when(unrelated)
         def _reader(u):
             h = noticeboard()["t5_pc"]
-            send("assert", (_core.cown_is_pinned(h.impl), True))
 
             @when(h)
             def _on_main(h):
                 h.value.append("via-noticeboard")
-                send("assert", (h.value, ["via-noticeboard"]))
+                return h.value
+            return (_core.cown_is_pinned(h.impl), _on_main)
 
-        wait()
-        receive_asserts(2)
+        quiesce(QUIESCE_TIMEOUT)
+        is_pinned, on_main = _reader.unwrap()
+        assert is_pinned is True
+        assert on_main.unwrap() == ["via-noticeboard"]
 
     def test_pinned_list_via_noticeboard(self):
         """A worker pulls handles out of a list payload and chains pinned @whens."""
         start()
         pcs = [PinnedCown([]), PinnedCown([])]
         notice_write("t6_pcs", pcs)
-        notice_sync()
+        quiesce(QUIESCE_TIMEOUT, noticeboard=True)
 
         unrelated = Cown(0)
 
         @when(unrelated)
         def _reader(u):
             handles = noticeboard()["t6_pcs"]
-            send("assert", (len(handles), 2))
+            pins = []
+            bodies = []
             for i, h in enumerate(handles):
-                send("assert", (_core.cown_is_pinned(h.impl), True))
+                pins.append(_core.cown_is_pinned(h.impl))
 
                 @when(h)
                 def _on_main(h, i=i):
                     h.value.append(("chain", i))
-                    send("assert", (h.value, [("chain", i)]))
+                    return h.value
+                bodies.append(_on_main)
+            return (len(handles), pins, bodies)
 
-        wait()
-        # 1 length assert + 2 is_pinned asserts + 2 body asserts.
-        receive_asserts(5)
+        quiesce(QUIESCE_TIMEOUT)
+        n, pins, bodies = _reader.unwrap()
+        assert n == 2
+        assert pins == [True, True]
+        assert bodies[0].unwrap() == [("chain", 0)]
+        assert bodies[1].unwrap() == [("chain", 1)]
 
     def test_pinned_nested_in_regular_cown_value(self):
         """``Cown({"pc": PinnedCown(x), ...})`` -- worker extracts the inner handle."""
         pc = PinnedCown([])
         outer = Cown({"pc": pc, "tag": "wrap"})
-        # Pass the expected literal in as a closure capture: the
-        # transpiler ships it via the captures tuple so the string
-        # arrives in the worker with its own ownership, sidestepping a
-        # 3.13 debug-build interned-string teardown bug that bites
-        # comparisons against literals round-tripped through
-        # ``o.value[...]``.
-        expected_tag = "wrap"
 
         @when(outer)
         def _worker(o):
             inner = o.value["pc"]
-            send("assert", (_core.cown_is_pinned(inner.impl), True))
-            send("assert", (o.value["tag"], expected_tag))
 
             @when(inner)
             def _on_main(inner):
                 inner.value.append("from-nested")
-                send("assert", (inner.value, ["from-nested"]))
+                return inner.value
+            return (_core.cown_is_pinned(inner.impl), o.value["tag"], _on_main)
 
-        quiesce()
-        receive_asserts(3)
+        quiesce(QUIESCE_TIMEOUT)
+        is_pinned, tag, on_main = _worker.unwrap()
+        assert is_pinned is True
+        assert tag == "wrap"
+        assert on_main.unwrap() == ["from-nested"]
 
     def test_two_workers_share_pinned_handle_via_noticeboard(self):
         """Two workers each read the same pinned handle; both pinned bodies run on main."""
         start()
         pc = PinnedCown([])
         notice_write("t16_pc", pc)
-        notice_sync()
+        quiesce(QUIESCE_TIMEOUT, noticeboard=True)
 
         u1 = Cown(0)
         u2 = Cown(0)
@@ -704,56 +622,57 @@ def test_two_workers_share_pinned_handle_via_noticeboard(self):
         @when(u1)
         def _w1(u):
             h = noticeboard()["t16_pc"]
-            send("assert", (_core.cown_is_pinned(h.impl), True))
 
             @when(h)
             def _body(h):
                 h.value.append("w1")
-                send("assert", (h.value[-1], "w1"))
+                return h.value[-1]
+            return (_core.cown_is_pinned(h.impl), _body)
 
         @when(u2)
         def _w2(u):
             h = noticeboard()["t16_pc"]
-            send("assert", (_core.cown_is_pinned(h.impl), True))
 
             @when(h)
             def _body(h):
                 h.value.append("w2")
-                send("assert", (h.value[-1], "w2"))
+                return h.value[-1]
+            return (_core.cown_is_pinned(h.impl), _body)
 
-        wait()
-        # 2 is_pinned asserts + 2 body asserts.
-        receive_asserts(4)
+        quiesce(QUIESCE_TIMEOUT)
+        p1, b1 = _w1.unwrap()
+        p2, b2 = _w2.unwrap()
+        assert p1 is True and p2 is True
+        assert b1.unwrap() == "w1"
+        assert b2.unwrap() == "w2"
 
-        # Both workers mutated the *same* pinned value -- strong evidence
-        # that both handles resolved to the same underlying capsule.
         sentinel = PinnedCown(None)
 
         @when(sentinel)
         def _inspect(_s):
-            content = sorted(pc.value)
-            send("assert", (content, ["w1", "w2"]))
+            return sorted(pc.value)
 
-        wait()
-        receive_asserts()
+        quiesce(QUIESCE_TIMEOUT)
+        assert _inspect.unwrap() == ["w1", "w2"]
 
     def test_pinned_via_notice_update(self):
         """``notice_update`` with a pinned producer; readers see the pinned handle."""
         start()
         pc = PinnedCown([])
         notice_update("t16b_pc", partial(_replace_with, pc), default=None)
-        notice_sync()
+        quiesce(QUIESCE_TIMEOUT, noticeboard=True)
 
         unrelated = Cown(0)
 
         @when(unrelated)
         def _reader(u):
             h = noticeboard()["t16b_pc"]
-            send("assert", (h is not None, True))
-            send("assert", (_core.cown_is_pinned(h.impl), True))
+            return (h is not None, _core.cown_is_pinned(h.impl))
 
-        wait()
-        receive_asserts(2)
+        quiesce(QUIESCE_TIMEOUT)
+        not_none, is_pinned = _reader.unwrap()
+        assert not_none is True
+        assert is_pinned is True
 
     def test_body_raise_drains_queue(self):
         """A raising pinned body marks its result cown and the queue still drains."""
@@ -765,18 +684,15 @@ def raiser(pc):
             raise RuntimeError("planned-failure")
 
         @when(pc_ok)
-        def _survivor(pc):
-            send("assert", ("survived", "survived"))
+        def survivor(pc):
+            return "survived"
 
-        @when(raiser)
-        def _inspect(r):
-            send("assert", (r.exception, True))
-
-        quiesce()
-        receive_asserts(2)
+        quiesce(QUIESCE_TIMEOUT)
+        with pytest.raises(RuntimeError, match="planned-failure"):
+            raiser.unwrap()
+        assert survivor.unwrap() == "survived"
         assert _core.main_pump_queue_depth() == 0
 
-    # Mixed pinned/unpinned routing.
     @pytest.mark.parametrize("kind,expected_on_main", [
         (("p", "p"), True),
         (("p", "u"), True),
@@ -794,17 +710,17 @@ def test_mixed_request_set_routes_to_main_iff_pinned(
             a, b = cowns
 
             @when(a, b)
-            def _body(a, b, expected_on_main=expected_on_main):
-                send("assert", (_core.is_primary(), expected_on_main))
+            def _body(a, b):
+                return _core.is_primary()
         else:
             a, b, c = cowns
 
             @when(a, b, c)
-            def _body(a, b, c, expected_on_main=expected_on_main):
-                send("assert", (_core.is_primary(), expected_on_main))
+            def _body(a, b, c):
+                return _core.is_primary()
 
-        quiesce()
-        receive_asserts()
+        quiesce(QUIESCE_TIMEOUT)
+        assert _body.unwrap() is expected_on_main
 
 
 class TestPinnedWatchdog:
@@ -821,11 +737,8 @@ def teardown_class(cls):
         wait()
 
     def teardown_method(self, method):
-        # Reset watchdog state so a leaked threshold cannot poison the
-        # next test. ``None`` disables the sampler.
         set_pump_watchdog(warn_ms=None, on_starve=None)
 
-    # Warn fires after starvation threshold.
     def test_warn_only_fires_on_starvation(self):
         """``warn_ms`` invokes on_starve once after the threshold elapses."""
         warns = []
@@ -839,19 +752,14 @@ def on_starve(severity, message):
 
         @when(pc)
         def _body(pc):
-            send("assert", ("ran", "ran"))
+            pass
 
-        # Let the queue sit non-empty past warn_ms before the pump runs.
         time.sleep(0.15)
-        # auto-pump drains the body; check_warn samples at pump entry
-        # and sees age > 50ms.
-        quiesce()
-        receive_asserts()
+        quiesce(QUIESCE_TIMEOUT)
 
         assert any(s == 0 for s, _ in warns), (
             f"expected warn (severity 0) in {warns!r}")
 
-    # Unpinned-only window leaves warn untripped.
     def test_unpinned_only_window_does_not_trip_watchdog(self):
         """Watchdog gates on pinned-queue age, not on total work time."""
         warns = []
@@ -865,28 +773,19 @@ def on_starve(severity, message):
         for _ in range(8):
             @when(c)
             def _busy(c):
-                # Per-behaviour sleep adds up to ~160 ms total worker
-                # time, well past warn_ms. The pinned queue remains
-                # empty throughout, so NONEMPTY_SINCE_NS stays 0.
                 time.sleep(0.02)
-                send("assert", ("ran", "ran"))
         quiesce()
-        receive_asserts(8)
         assert warns == [], (
             f"warn must not fire across unpinned-only window, got {warns!r}")
 
-        # Now schedule a pinned @when. The pinned queue was empty
-        # across the unpinned window, so age = 0 < warn_ms.
         pc = PinnedCown(0)
 
         @when(pc)
         def _body(pc):
-            send("assert", ("pinned-ok", "pinned-ok"))
+            pass
 
-        quiesce()
-        receive_asserts()
+        quiesce(QUIESCE_TIMEOUT)
 
-    # Reconfigure-after-first-pinned.
     def test_reconfigure_after_first_pinned(self):
         """``set_pump_watchdog`` succeeds after live pinned work exists.
 
@@ -900,9 +799,8 @@ def test_reconfigure_after_first_pinned(self):
 
         @when(pc)
         def _body(pc):
-            send("assert", ("ran", "ran"))
+            pass
 
-        # Reconfigure mid-flight; must not raise.
         warns = []
 
         def on_starve(severity, message):
@@ -910,10 +808,7 @@ def on_starve(severity, message):
 
         set_pump_watchdog(warn_ms=200, on_starve=on_starve)
 
-        quiesce()
-        receive_asserts()
-        # The replaced callback may or may not have fired depending on
-        # exact timing; either way no exception escapes quiesce().
+        quiesce(QUIESCE_TIMEOUT)
 
 
 class TestPumpWatchdogOverflow:
@@ -934,7 +829,6 @@ def test_set_pump_watchdog_ms_overflow(self):
         too_big = _behaviors._MAX_PUMP_MS + 1
         with pytest.raises(OverflowError, match="warn_ms"):
             set_pump_watchdog(warn_ms=too_big)
-        # Restore defaults so we don't leak watchdog state into later tests.
         set_pump_watchdog(warn_ms=1000, on_starve=None)
 
 
@@ -994,11 +888,8 @@ def release_all(self):
         with pytest.raises(KeyboardInterrupt) as ei:
             b._drain_orphan_behaviors()
 
-        # Both failures survive: the ValueError that was already in
-        # the local list, and the KI that triggered the re-raise.
         assert len(b._stop_drain_errors) == 2
         assert isinstance(b._stop_drain_errors[0], ValueError)
         assert isinstance(b._stop_drain_errors[1], KeyboardInterrupt)
-        # The re-raised KI carries a note pointing at the stashed list.
         notes = getattr(ei.value, "__notes__", []) or []
         assert any("2 release_all error" in n for n in notes), notes
diff --git a/test/test_public_c_abi.py b/test/test_public_c_abi.py
index dc2d64e..290c361 100644
--- a/test/test_public_c_abi.py
+++ b/test/test_public_c_abi.py
@@ -30,19 +30,16 @@
 
 EXPECTED_PUBLIC_C_FILES = {"bocpy.h", "xidata.h", "bocpy_msvc.c"}
 
-# Filename extensions a wheel install of bocpy is allowed to ship.
 _ALLOWED_SHIPPED_EXTS = {
-    ".py",       # source modules
-    ".pyc",      # bytecode in __pycache__
-    ".pyi",      # type stubs
-    ".so",       # Linux/BSD compiled extensions
-    ".pyd",      # Windows compiled extensions
-    ".dylib",    # macOS dynamic libraries (defensive)
-    ".dll",      # Windows dynamic libraries (defensive)
-    ".txt",      # bocpy.examples ships menu.txt / cheese.txt
+    ".py",
+    ".pyc",
+    ".pyi",
+    ".so",
+    ".pyd",
+    ".dylib",
+    ".dll",
+    ".txt",
 }
-# Filenames (full basename, no extension) that are allowed even
-# though they don't match _ALLOWED_SHIPPED_EXTS.
 _ALLOWED_SHIPPED_NAMES = {"py.typed"}
 
 EXPECTED_ATOMIC_NAMES = {
@@ -53,11 +50,6 @@
 }
 
 
-# ---------------------------------------------------------------------------
-# get_include / get_sources
-# ---------------------------------------------------------------------------
-
-
 def test_get_include_points_at_headers():
     inc = bocpy.get_include()
     assert os.path.isabs(inc)
@@ -75,11 +67,6 @@ def test_get_sources_shape():
         assert sources == []
 
 
-# ---------------------------------------------------------------------------
-# Wheel allow-list (no internal .h / .c leaks)
-# ---------------------------------------------------------------------------
-
-
 def _assert_only_public_artefacts(package_dir: str) -> None:
     """Walk ``package_dir`` and assert every shipped file is allowed.
 
@@ -138,11 +125,6 @@ def test_wheel_allowlist_rejects_unknown_extension(tmp_path):
     assert "secrets.json" in str(exc_info.value)
 
 
-# ---------------------------------------------------------------------------
-# MSVC atomic bodies in lockstep (boc_compat.c vs bocpy_msvc.c)
-# ---------------------------------------------------------------------------
-
-
 _MARKER_BEGIN = "/* @atomic-bodies-begin */"
 _MARKER_END = "/* @atomic-bodies-end */"
 
@@ -170,11 +152,6 @@ def test_msvc_bodies_in_lockstep():
     assert a == b, "marker regions differ — atomic bodies have drifted"
 
 
-# ---------------------------------------------------------------------------
-# Static prototype/body parameter-signature parity (bocpy.h vs bocpy_msvc.c)
-# ---------------------------------------------------------------------------
-
-
 def _extract_atomic_signatures(text: str) -> dict[str, str]:
     """Return the parameter list of every atomic declaration in ``text``.
 
diff --git a/test/test_scheduler_integration.py b/test/test_scheduler_integration.py
index eed5864..2e834ec 100644
--- a/test/test_scheduler_integration.py
+++ b/test/test_scheduler_integration.py
@@ -34,15 +34,10 @@
 
 import bocpy
 from bocpy import _core
-from bocpy import Cown, drain, receive, send, TIMEOUT, wait, when
+from bocpy import Cown, quiesce, wait, when
 
 
-RECEIVE_TIMEOUT = 30
-
-
-# ---------------------------------------------------------------------------
-# Module-level helpers (must be importable by worker sub-interpreters)
-# ---------------------------------------------------------------------------
+QUIESCE_TIMEOUT = 5
 
 
 class _Counter:
@@ -64,18 +59,12 @@ def _ensure_quiesced():
     bocpy.wait()
 
 
-# ---------------------------------------------------------------------------
-# Runtime re-entry
-# ---------------------------------------------------------------------------
-
-
 class TestRuntimeReentry:
     """``start()`` / ``wait()`` / ``start()`` runs two clean workloads."""
 
     @classmethod
     def teardown_class(cls):
         wait()
-        drain("done")
 
     def test_start_wait_start_runs_two_workloads(self):
         """Two independent workloads bracketed by start/wait/start/wait.
@@ -88,7 +77,6 @@ def test_start_wait_start_runs_two_workloads(self):
         """
         _ensure_quiesced()
 
-        # First workload.
         bocpy.start(worker_count=2)
         try:
             c = Cown(_Counter())
@@ -96,17 +84,17 @@ def test_start_wait_start_runs_two_workloads(self):
                 @when(c)
                 def _(c):
                     c.value.count += 1
-                    send("done", c.value.count)
-            for _ in range(50):
-                tag, _payload = receive("done", RECEIVE_TIMEOUT)
-                assert tag != TIMEOUT, "first workload stalled"
+
+            @when(c)
+            def first(c):
+                return c.value.count
+            quiesce(QUIESCE_TIMEOUT)
+            assert first.unwrap() == 50, "first workload stalled"
         finally:
-            drain("done")
             wait()
 
         assert _core.scheduler_stats() == []
 
-        # Second workload after teardown — must come up clean.
         bocpy.start(worker_count=2)
         try:
             c = Cown(_Counter())
@@ -114,26 +102,23 @@ def _(c):
                 @when(c)
                 def _(c):
                     c.value.count += 1
-                    send("done", c.value.count)
-            for _ in range(50):
-                tag, _payload = receive("done", RECEIVE_TIMEOUT)
-                assert tag != TIMEOUT, "second workload stalled"
+
+            @when(c)
+            def second(c):
+                return c.value.count
+            quiesce(QUIESCE_TIMEOUT)
+            assert second.unwrap() == 50, "second workload stalled"
         finally:
-            drain("done")
             wait()
 
 
-# ---------------------------------------------------------------------------
-# Paired-release on uncaught body exception
-# ---------------------------------------------------------------------------
-
-
 def _raising_step(c):
     """Body that raises ``RuntimeError`` after touching the cown."""
     @when(c)
     def _(c):
         c.value.count += 1
         raise RuntimeError("intentional failure")
+    return _
 
 
 def _follow_on(c):
@@ -141,7 +126,8 @@ def _follow_on(c):
     @when(c)
     def _(c):
         c.value.count += 1
-        send("done", c.value.count)
+        return c.value.count
+    return _
 
 
 class TestPairedRelease:
@@ -150,7 +136,6 @@ class TestPairedRelease:
     @classmethod
     def teardown_class(cls):
         wait()
-        drain("done")
 
     def test_cown_reacquirable_after_uncaught_exception(self):
         """A failing behaviour releases its cown so the next one runs.
@@ -159,30 +144,25 @@ def test_cown_reacquirable_after_uncaught_exception(self):
         funnels it to ``Cown.set_exception``, then runs the
         release/release_all pair. If the release path were broken the
         follow-on ``@when(c)`` would block forever; the test would
-        time out on ``receive`` instead of returning a count of 2.
+        time out on ``quiesce`` instead of returning a count of 2.
         """
         _ensure_quiesced()
         bocpy.start(worker_count=2)
         try:
             c = Cown(_Counter())
-            _raising_step(c)
-            _follow_on(c)
+            raising = _raising_step(c)
+            follow = _follow_on(c)
+            quiesce(QUIESCE_TIMEOUT)
 
-            tag, payload = receive("done", RECEIVE_TIMEOUT)
-            assert tag != TIMEOUT, (
+            with pytest.raises(RuntimeError, match="intentional failure"):
+                raising.unwrap()
+            assert follow.unwrap() == 2, (
                 "cown was not re-acquired after an uncaught exception"
             )
-            assert payload == 2, payload
         finally:
-            drain("done")
             wait()
 
 
-# ---------------------------------------------------------------------------
-# Over-registration contract on scheduler_worker_register
-# ---------------------------------------------------------------------------
-
-
 def test_over_registration_raises_runtime_error():
     """An extra register() beyond worker_count must raise RuntimeError.
 
@@ -192,7 +172,6 @@ def test_over_registration_raises_runtime_error():
     """
     bocpy.start()
     try:
-        # Workers have already registered; one more must fail.
         with pytest.raises(RuntimeError, match="over-registration"):
             _core.scheduler_worker_register()
     finally:
diff --git a/test/test_scheduler_stats.py b/test/test_scheduler_stats.py
index f605c5d..910d6fe 100644
--- a/test/test_scheduler_stats.py
+++ b/test/test_scheduler_stats.py
@@ -44,7 +44,7 @@
 
 def test_scheduler_stats_empty_when_runtime_down():
     """With the runtime down, the snapshot must be an empty list."""
-    wait()  # ensure runtime is down
+    wait()
     stats = _core.scheduler_stats()
     assert isinstance(stats, list)
     assert stats == []
@@ -58,7 +58,7 @@ def test_wait_returns_final_snapshot():
     correct way to read the counters for the session that just
     ended.
     """
-    wait()  # baseline
+    wait()
     W = 2  # noqa: N806
     bocpy.start(worker_count=W)
     c = Cown(0)
@@ -67,18 +67,13 @@ def test_wait_returns_final_snapshot():
     def _(c):
         pass
 
-    # `wait()` blocks on the C-level terminator until every scheduled
-    # behavior has decremented it -- no send/receive handshake needed
-    # to know the behavior body actually ran.
     snapshot = wait(stats=True)
     assert isinstance(snapshot, list)
     assert len(snapshot) == W, snapshot
     for s in snapshot:
         assert SCHEDULER_FIELDS == set(s.keys()), s
-    # At least one push happened across the pool.
     assert sum(s["pushed_local"] + s["dispatched_to_pending"]
                + s["pushed_remote"] for s in snapshot) >= 1
-    # And the per-worker array is gone now.
     assert _core.scheduler_stats() == []
 
 
@@ -86,7 +81,6 @@ def test_wait_stats_default_returns_none():
     """`wait()` without `stats=True` returns ``None`` (back-compat)."""
     wait()
     assert wait() is None
-    # Even with a real session, default still returns None.
     bocpy.start(worker_count=2)
     c = Cown(0)
 
@@ -126,7 +120,6 @@ def _(c):
     total_remote = sum(s["pushed_remote"] for s in snap)
     total_pending = sum(s["dispatched_to_pending"] for s in snap)
     assert total_remote >= N, snap
-    # No producer-local arm was ever taken, so pending stays at 0.
     assert total_pending == 0, snap
 
 
@@ -162,7 +155,6 @@ def _inner(i):
 def test_queue_stats_reflects_set_tags_and_traffic():
     """`queue_stats` should expose tagged queues with monotonic counters."""
     set_tags(["t_one", "t_two"])
-    # Drain in case a previous test sent on these tags.
     drain(["t_one", "t_two"])
 
     before = _core.queue_stats()
@@ -189,7 +181,6 @@ def test_queue_stats_reflects_set_tags_and_traffic():
     by_tag_after = {q["tag"]: q for q in after}
     assert by_tag_after["t_one"]["pushed_total"] == pushed_before + 2
     assert by_tag_after["t_one"]["popped_total"] == popped_before + 1
-    # Other tag must not move.
     assert (by_tag_after["t_two"]["pushed_total"]
             == by_tag_before["t_two"]["pushed_total"])
     assert (by_tag_after["t_two"]["popped_total"]
@@ -210,14 +201,12 @@ def test_queue_stats_monotonic_and_no_side_effect():
     s2 = by_tag(snap2)
     s3 = by_tag(snap3)
 
-    # No traffic between snapshots → counters are stable.
     for tag in s1:
         assert s2[tag]["pushed_total"] == s1[tag]["pushed_total"]
         assert s2[tag]["popped_total"] == s1[tag]["popped_total"]
         assert s3[tag]["pushed_total"] == s1[tag]["pushed_total"]
         assert s3[tag]["popped_total"] == s1[tag]["popped_total"]
 
-    # And calling scheduler_stats does not perturb queue_stats either.
     _ = _core.scheduler_stats()
     snap4 = _core.queue_stats()
     s4 = by_tag(snap4)
@@ -242,7 +231,5 @@ def test_drain_does_not_decrement_pushed_or_popped_total():
     drain(["t_drain"])
     after = next(q for q in _core.queue_stats() if q["tag"] == "t_drain")
 
-    # Drain pulls the messages out via boc_dequeue, so popped_total
-    # advances. pushed_total must not retreat.
     assert after["pushed_total"] == before["pushed_total"]
     assert after["popped_total"] >= before["popped_total"]
diff --git a/test/test_scheduler_steal.py b/test/test_scheduler_steal.py
index 6153fe4..a3c9587 100644
--- a/test/test_scheduler_steal.py
+++ b/test/test_scheduler_steal.py
@@ -25,8 +25,8 @@
 ``examples/`` and at the data-structure level by
 ``test_internal_wsq.py``.
 
-All tests follow the same module-level helper / receive-pattern
-discipline as the other scheduler integration tests (see
+All tests follow the same module-level helper discipline as the
+other scheduler integration tests (see
 ``test_scheduler_integration.py``), because behaviours run on
 worker sub-interpreters that import this module to resolve symbols.
 """
@@ -36,16 +36,7 @@
 import pytest
 
 import bocpy
-from bocpy import _core
-from bocpy import Cown, drain, receive, send, TIMEOUT, wait, when
-
-
-RECEIVE_TIMEOUT = 30
-
-
-# ---------------------------------------------------------------------------
-# Module-level helpers (must be importable by worker sub-interpreters)
-# ---------------------------------------------------------------------------
+from bocpy import Cown, wait, when
 
 
 class _Counter:
@@ -63,19 +54,7 @@ def _ensure_quiesced():
     bocpy.wait()
 
 
-def _fanout_done(c_pin, marker):
-    """Final ``@when`` extracted to a helper.
-
-    Inlining inside ``_fanout_kickoff`` would trigger the transpiler
-    nested-capture gap (outer ``marker`` not forwarded into the inner
-    behaviour's capture tuple).
-    """
-    @when(c_pin)
-    def _(c_pin):
-        send("done", marker)
-
-
-def _fanout_kickoff(c_pin, work_cowns, marker):
+def _fanout_kickoff(c_pin, work_cowns):
     """Fan ``len(work_cowns)`` independent behaviours onto the kickoff worker.
 
     The kickoff is dispatched from the main thread and lands on
@@ -95,12 +74,6 @@ def _(c_pin):
             @when(wc)
             def _(wc):
                 wc.value.count += 1
-        _fanout_done(c_pin, marker)
-
-
-# ---------------------------------------------------------------------------
-# Token-work fairness sanity
-# ---------------------------------------------------------------------------
 
 
 class TestStealFairnessSanity:
@@ -109,7 +82,6 @@ class TestStealFairnessSanity:
     @classmethod
     def teardown_class(cls):
         wait()
-        drain("done")
 
     def test_fanout_exceeding_batch_size_provokes_steal_attempts(self):
         """K > BATCH_SIZE (=100) must produce non-zero steal_attempts.
@@ -126,28 +98,16 @@ def test_fanout_exceeding_batch_size_provokes_steal_attempts(self):
         """
         _ensure_quiesced()
         W = 4  # noqa: N806
-        K = 300  # > BOC_BQ_BATCH_SIZE (100)  # noqa: N806
+        K = 300  # noqa: N806
         bocpy.start(worker_count=W)
-        try:
-            c_pin = Cown(_Counter())
-            work_cowns = [Cown(_Counter()) for _ in range(K)]
-            _fanout_kickoff(c_pin, work_cowns, "fairness-done")
-
-            tag, _payload = receive("done", RECEIVE_TIMEOUT)
-            assert tag != TIMEOUT, "kickoff failed to complete"
+        c_pin = Cown(_Counter())
+        work_cowns = [Cown(_Counter()) for _ in range(K)]
+        _fanout_kickoff(c_pin, work_cowns)
 
-            stats = _core.scheduler_stats()
-        finally:
-            drain("done")
-            wait()
+        stats = wait(stats=True)
 
         assert len(stats) == W, stats
         total_attempts = sum(s["steal_attempts"] for s in stats)
-        # The exact distribution of attempts across workers depends
-        # on scheduling races; we only assert the aggregate is
-        # non-zero. ``last_steal_attempt_ns`` on at least one worker
-        # must also be non-zero (it's stamped on every try_steal
-        # entry).
         assert total_attempts > 0, (
             f"no steal_attempts recorded — fairness/empty-queue arms "
             f"never fired: {stats}"
@@ -158,11 +118,6 @@ def test_fanout_exceeding_batch_size_provokes_steal_attempts(self):
         )
 
 
-# ---------------------------------------------------------------------------
-# Empty-queue race: workers with no work must park
-# ---------------------------------------------------------------------------
-
-
 class TestStealEmptyQueueNoSpin:
     """W workers, 0 work — every worker must park in cnd_wait."""
 
@@ -193,20 +148,15 @@ def test_empty_queue_does_not_spin(self):
         _ensure_quiesced()
         W = 4  # noqa: N806
         bocpy.start(worker_count=W)
-        try:
-            # Brief warm-up so workers actually reach pop_slow and
-            # commit to parking before we start measuring.
-            time.sleep(0.05)
-
-            wall_start = time.monotonic()
-            cpu_start = time.process_time()
-            time.sleep(0.30)
-            wall_elapsed = time.monotonic() - wall_start
-            cpu_elapsed = time.process_time() - cpu_start
-
-            stats = _core.scheduler_stats()
-        finally:
-            wait()
+        time.sleep(0.05)
+
+        wall_start = time.monotonic()
+        cpu_start = time.process_time()
+        time.sleep(0.30)
+        wall_elapsed = time.monotonic() - wall_start
+        cpu_elapsed = time.process_time() - cpu_start
+
+        stats = wait(stats=True)
 
         ratio = cpu_elapsed / wall_elapsed
         assert ratio < 0.5, (
diff --git a/test/test_scheduling_stress.py b/test/test_scheduling_stress.py
index 4a9abd0..2161e89 100644
--- a/test/test_scheduling_stress.py
+++ b/test/test_scheduling_stress.py
@@ -5,8 +5,9 @@
 with @when behaviors is brittle (workers run in sub-interpreters and the
 test thread cannot directly observe per-cown state).
 
-Each test ships its results out via send/receive so the test thread can
-synchronize with completion.
+Each test reads its results back through reader behaviors and
+``quiesce()`` + ``Cown.unwrap()`` so the test thread can synchronize
+with completion.
 """
 
 import os
@@ -15,50 +16,31 @@
 
 import bocpy
 from bocpy import _core
-from bocpy import Cown, drain, receive, send, TIMEOUT, wait, when
+from bocpy import Cown, quiesce, wait, when
 import bocpy.behaviors as _behaviors
 
-# NOTE: do NOT import ``mockreplacement`` (or ``unittest.mock``) at
-# module scope. The transpiler exports this whole module into every
-# worker sub-interpreter; module-level imports therefore run inside
-# every worker, and ``mockreplacement`` is not on the worker
-# ``sys.path``. The handful of tests that need it import it locally.
 
+# Do NOT import ``mockreplacement`` (or ``unittest.mock``) at module scope: the
+# transpiler exports this whole module into every worker sub-interpreter, where
+# ``mockreplacement`` is not on ``sys.path``. Tests that need it import it locally.
+QUIESCE_TIMEOUT = 30
 
-RECEIVE_TIMEOUT = 30
 
+def _read_back(cowns):
+    """Schedule one reader behavior per cown.
 
-# ---------------------------------------------------------------------------
-# Helpers (module-level so workers can import them)
-# ---------------------------------------------------------------------------
-
-
-def _drain_done():
-    """Drop any leftover 'done' messages between tests."""
-    drain("done")
-
-
-def _collect_done(expected: int, timeout: int = RECEIVE_TIMEOUT):
-    """Block until `expected` 'done' messages arrive; return their payloads.
-
-    Fails the test with a clear message on timeout instead of hanging.
+    Each reader returns ``(idx, count)`` for its cown; the readers run
+    after every increment already queued on that cown (FIFO per cown),
+    so unwrapping their result cowns after ``quiesce()`` yields the
+    final counts. Returns the list of result cowns.
     """
-    payloads = []
-    timed_out = False
-    try:
-        for _ in range(expected):
-            tag, payload = receive("done", timeout)
-            if tag == TIMEOUT:
-                timed_out = True
-                break
-            payloads.append(payload)
-    finally:
-        drain("done")
-    assert not timed_out, (
-        f"Timed out waiting for 'done' messages: got {len(payloads)} of "
-        f"{expected}. A behavior likely failed to schedule or run."
-    )
-    return payloads
+    readers = []
+    for idx, c in enumerate(cowns):
+        @when(c)
+        def _(c):
+            return (idx, c.value.count)  # noqa: B023
+        readers.append(_)
+    return readers
 
 
 class Counter:
@@ -75,18 +57,12 @@ def __init__(self):
         self.count = 0
 
 
-# ---------------------------------------------------------------------------
-# Fan-out: N behaviors over M cowns, disjoint and overlapping
-# ---------------------------------------------------------------------------
-
-
 class TestSchedulingFanOut:
     """N behaviors fan out across M cowns; each cown's count is an oracle."""
 
     @classmethod
     def teardown_class(cls):
         wait()
-        _drain_done()
 
     @pytest.mark.parametrize("n,m", [(1000, 32), (200, 4), (500, 1)])
     def test_disjoint_fan_out(self, n: int, m: int):
@@ -100,17 +76,12 @@ def test_disjoint_fan_out(self, n: int, m: int):
             def _(c):
                 c.value.count += 1
 
-        # Read each counter back through a behavior and report it.
-        for idx, c in enumerate(cowns):
-            @when(c)
-            def _(c):
-                send("done", (idx, c.value.count))  # noqa: B023
-
-        results = _collect_done(m)
+        readers = _read_back(cowns)
+        quiesce(QUIESCE_TIMEOUT)
+        results = [r.unwrap() for r in readers]
 
         per_cown = {idx: count for idx, count in results}
         assert sum(per_cown.values()) == n, per_cown
-        # Each cown should see exactly its round-robin share.
         for idx in range(m):
             expected_share = n // m + (1 if idx < n % m else 0)
             assert per_cown[idx] == expected_share, (idx, per_cown)
@@ -132,28 +103,19 @@ def _(a, b):
                 a.value.count += 1
                 b.value.count += 1
 
-        for idx, c in enumerate(cowns):
-            @when(c)
-            def _(c):
-                send("done", (idx, c.value.count))  # noqa: B023
-
-        results = _collect_done(m)
+        readers = _read_back(cowns)
+        quiesce(QUIESCE_TIMEOUT)
+        results = [r.unwrap() for r in readers]
         total = sum(count for _, count in results)
         assert total == 2 * n, results
 
 
-# ---------------------------------------------------------------------------
-# Sustained load: long-running schedule that must drain via wait()
-# ---------------------------------------------------------------------------
-
-
 class TestSchedulingSustainedLoad:
     """Schedule a large bounded workload and ensure it completes."""
 
     @classmethod
     def teardown_class(cls):
         wait()
-        _drain_done()
 
     def test_bounded_completion(self):
-        """Schedule many behaviors; each reports done; wait collects them all.
+        """Schedule many behaviors; after quiesce the counts must sum to n.
@@ -171,18 +133,12 @@ def test_bounded_completion(self):
             @when(target)
             def _(c):
                 c.value.count += 1
-                send("done", 1)
-
-        # Use a generous timeout proportional to n; wait fails noisily if a
-        # behavior is dropped.
-        timeout = max(RECEIVE_TIMEOUT, n // 100)
-        payloads = _collect_done(n, timeout=timeout)
-        assert len(payloads) == n
 
-
-# ---------------------------------------------------------------------------
-# Dedup regression: @when(c, c) must run exactly once per scheduling
-# ---------------------------------------------------------------------------
+        timeout = max(QUIESCE_TIMEOUT, n // 100)
+        readers = _read_back(cowns)
+        quiesce(timeout)
+        total = sum(count for _, count in (r.unwrap() for r in readers))
+        assert total == n
 
 
 class TestSchedulingDedup:
@@ -191,28 +147,25 @@ class TestSchedulingDedup:
     @classmethod
     def teardown_class(cls):
         wait()
-        _drain_done()
 
     def test_when_same_cown_twice_runs_once(self):
         """@when(c, c) schedules exactly one behavior invocation."""
         c = Cown(Counter())
 
         @when(c, c)
-        def _(a, b):
-            # a and b are separate Python wrappers but back the same cown,
-            # so they observe the same underlying value object.
+        def identity(a, b):
             a.value.count += 1
-            send("done", a.value is b.value)
+            return a.value is b.value
 
-        payloads = _collect_done(1)
-        # Both parameters should expose the same underlying value.
-        assert payloads == [True]
+        quiesce(QUIESCE_TIMEOUT)
+        assert identity.unwrap() is True
 
         @when(c)
-        def _(c):
-            send("done", c.value.count)
+        def reader(c):
+            return c.value.count
 
-        [count] = _collect_done(1)
+        quiesce(QUIESCE_TIMEOUT)
+        count = reader.unwrap()
         assert count == 1, f"dedup failed: counter={count}"
 
     def test_when_repeated_cown_many_times(self):
@@ -226,18 +179,14 @@ def _(a, b):
                 a.value.count += 1
 
         @when(c)
-        def _(c):
-            send("done", c.value.count)
+        def reader(c):
+            return c.value.count
 
-        [count] = _collect_done(1)
+        quiesce(QUIESCE_TIMEOUT)
+        count = reader.unwrap()
         assert count == n, f"expected {n}, got {count}"
 
 
-# ---------------------------------------------------------------------------
-# Drain-with-recycle-flush: terminator + recycle invariant after wait()
-# ---------------------------------------------------------------------------
-
-
 class TestSchedulingDrainRecycleFlush:
     """Verify the terminator and recycle queue invariants after ``wait()``.
 
@@ -245,20 +194,17 @@ class TestSchedulingDrainRecycleFlush:
     must return to zero and a forced recycle-queue flush must be a no-op
     (no double-frees, no live entries left).
 
-    An earlier draft of this test also wanted a per-BOCBehavior refcount
-    assertion,
-    but that counter is only exposed under the compile-time
-    ``BOC_REF_TRACKING`` build flag. The terminator counter is a strict
-    superset for the leak-detection purpose: every behavior takes one
-    terminator hold via ``whencall`` and releases it on the worker thread
-    after ``behavior_release_all``, so a behavior that is leaked (or whose
-    release is dropped) keeps the count above zero.
+    The per-BOCBehavior refcount is only exposed under the compile-time
+    ``BOC_REF_TRACKING`` build flag, so it is not asserted here. The
+    terminator counter is a strict superset for leak detection: every
+    behavior takes one terminator hold via ``whencall`` and releases it on
+    the worker thread after ``behavior_release_all``, so a behavior that is
+    leaked (or whose release is dropped) keeps the count above zero.
     """
 
     @classmethod
     def teardown_class(cls):
         wait()
-        _drain_done()
 
     def test_terminator_returns_to_zero_after_wait(self):
         """Schedule N disjoint behaviors; wait(); count must be 0."""
@@ -271,12 +217,11 @@ def test_terminator_returns_to_zero_after_wait(self):
             @when(target)
             def _(c):
                 c.value.count += 1
-                send("done", 1)
 
-        payloads = _collect_done(n)
-        assert len(payloads) == n
-        # wait() drains and stops; terminator_count() should observe a
-        # quiesced runtime. A non-zero value indicates a leaked hold.
+        readers = _read_back(cowns)
+        quiesce(QUIESCE_TIMEOUT)
+        total = sum(count for _, count in (r.unwrap() for r in readers))
+        assert total == n
         wait()
         assert _core.terminator_count() == 0
 
@@ -288,21 +233,17 @@ def test_recycle_after_wait_is_idempotent(self):
             @when(c)
             def _(c):
                 c.value.count += 1
-                send("done", 1)
 
-        _collect_done(len(cowns))
+        readers = _read_back(cowns)
+        quiesce(QUIESCE_TIMEOUT)
+        total = sum(count for _, count in (r.unwrap() for r in readers))
+        assert total == len(cowns)
         wait()
-        # Two flushes back-to-back: the second must be a no-op.
         _core.recycle()
         _core.recycle()
         assert _core.terminator_count() == 0
 
 
-# ---------------------------------------------------------------------------
-# whencall rollback: a failed behavior_schedule must release the terminator
-# ---------------------------------------------------------------------------
-
-
 class TestWhencallRollback:
     """Verify that a failed ``behavior_schedule`` releases its terminator hold.
 
@@ -317,18 +258,9 @@ class TestWhencallRollback:
     @classmethod
     def teardown_class(cls):
         wait()
-        _drain_done()
 
     def _baseline(self):
-        # Drive the runtime to a quiesced state with no outstanding holds.
         wait()
-        # Trigger a fresh start without scheduling anything. start()
-        # leaves the terminator at (count=1, seeded=1) -- the seed
-        # contribution that wait()/stop() drops via terminator_seed_dec.
-        # We do not schedule a probe behavior here because the worker's
-        # release/decrement happens after the behavior body returns and
-        # there is no synchronisation point that proves the decrement
-        # has landed before the test thread snapshots the count.
         from bocpy import start as _start_runtime
         _start_runtime()
 
@@ -336,11 +268,6 @@ def test_rollback_after_schedule_raises(self):
         """A raising ``BehaviorCapsule.schedule`` must leave terminator_count at 0."""
         self._baseline()
 
-        # After _baseline the runtime is alive (start() ran) but no
-        # behaviors are in flight. The terminator still carries the
-        # seed contribution (count == 1, seeded == 1) until stop().
-        # whencall increments above the seed and a clean rollback must
-        # bring count back to exactly the pre-call value.
         before = _core.terminator_count()
 
         from mockreplacement import patch_attr, Recorder
@@ -362,27 +289,20 @@ def _(c):
                     c.value.count += 1
             assert info.value is sentinel
 
-        # The mocked failure must not leave a dangling terminator hold:
-        # whencall caught the raise and called terminator_dec.
         assert _core.terminator_count() == before
-        # And the runtime should still be usable for fresh behaviors.
         c2 = Cown(Counter())
 
         @when(c2)
-        def _(c):
+        def probe(c):
             c.value.count += 1
-            send("done", 1)
+            return c.value.count
 
-        _collect_done(1)
+        quiesce(QUIESCE_TIMEOUT)
+        assert probe.unwrap() == 1
         wait()
         assert _core.terminator_count() == 0
 
 
-# ---------------------------------------------------------------------------
-# stop()-vs-schedule race: a closed terminator must reject new whencalls
-# ---------------------------------------------------------------------------
-
-
 class TestStopVsScheduleRace:
     """Verify that ``stop()`` fences subsequent ``whencall`` attempts.
 
@@ -395,27 +315,21 @@ class TestStopVsScheduleRace:
     @classmethod
     def teardown_class(cls):
         wait()
-        _drain_done()
 
     def test_terminator_inc_refuses_after_close(self):
         """``terminator_inc`` returns -1 once ``terminator_close`` has run."""
-        # wait() quiesces the runtime and runs terminator_close internally,
-        # leaving (count=0, seeded=0, closed=1). A direct terminator_inc
-        # call from the test thread must therefore be refused.
         wait()
         rc = _core.terminator_inc()
         assert rc < 0, f"terminator_inc returned {rc}, expected -1"
 
-        # The runtime must still be restartable on the next @when. The
-        # Behaviors.start() path runs terminator_reset which raises drift
-        # only if our refused inc somehow took effect (it must not have).
         c = Cown(Counter())
 
         @when(c)
-        def _(c):
-            send("done", 1)
+        def probe(c):
+            return True
 
-        _collect_done(1)
+        quiesce(QUIESCE_TIMEOUT)
+        assert probe.unwrap() is True
         wait()
         assert _core.terminator_count() == 0
 
@@ -429,15 +343,14 @@ def test_whencall_raises_after_close(self):
         targets the same underlying C function via the Python module
         binding the whencall helper actually consults.
         """
-        # First make sure the runtime is alive so @when does not try to
-        # restart it during the patched call.
         c0 = Cown(Counter())
 
         @when(c0)
-        def _(c):
-            send("done", 1)
+        def probe(c):
+            return True
 
-        _collect_done(1)
+        quiesce(QUIESCE_TIMEOUT)
+        assert probe.unwrap() is True
 
         from mockreplacement import patch_attr
 
@@ -453,18 +366,10 @@ def _refuse_inc(*args, **kwargs):
                 def _(c):
                     c.value.count += 1
 
-        # whencall short-circuited at terminator_inc; no hold leaked,
-        # no behavior_schedule was called.
         wait()
         assert _core.terminator_count() == 0
 
 
-# ---------------------------------------------------------------------------
-# Worker error-path resilience: a failing behavior body must not strand
-# wait() or take a worker out of rotation.
-# ---------------------------------------------------------------------------
-
-
 class _Boom(Exception):
     """Sentinel exception raised by the worker-resilience tests."""
 
@@ -495,7 +400,6 @@ class TestWorkerErrorPath:
     @classmethod
     def teardown_class(cls):
         wait()
-        _drain_done()
 
     def test_raising_body_does_not_strand_wait(self):
         """A single raising behavior must let ``wait()`` complete."""
@@ -524,9 +428,9 @@ def test_workers_survive_many_raising_behaviors(self):
         """N raising behaviors must not take any worker out of rotation.
 
         Schedule far more raising behaviors than workers, then schedule
-        a follow-up batch of well-behaved behaviors that emit on
-        ``done``. If any worker had broken out of its loop, we would
-        miss messages and ``_collect_done`` would time out.
+        a follow-up batch of well-behaved behaviors that return their
+        index. If any worker had broken out of its loop, the follow-up
+        result cowns would never resolve and ``quiesce`` would time out.
         """
         n_raising = 200
         n_followup = 50
@@ -538,23 +442,20 @@ def _(c):
                 _raise_boom(c)
 
         followup_cowns = [Cown(Counter()) for _ in range(n_followup)]
+        readers = []
         for i, c in enumerate(followup_cowns):
             @when(c)
             def _(c):
-                send("done", i)  # noqa: B023
+                return i  # noqa: B023
+            readers.append(_)
 
-        payloads = _collect_done(n_followup)
-        assert sorted(payloads) == list(range(n_followup))
+        quiesce(QUIESCE_TIMEOUT)
+        payloads = sorted(r.unwrap() for r in readers)
+        assert payloads == list(range(n_followup))
         wait()
         assert _core.terminator_count() == 0
 
 
-# ---------------------------------------------------------------------------
-# Noticeboard startup handshake: a failed set_noticeboard_thread() must be
-# surfaced on the calling thread, not silently strand the runtime.
-# ---------------------------------------------------------------------------
-
-
 class TestNoticeboardStartupHandshake:
     """Verify that a failed noticeboard claim surfaces on the starter thread.
 
@@ -568,11 +469,9 @@ class TestNoticeboardStartupHandshake:
     @classmethod
     def teardown_class(cls):
         wait()
-        _drain_done()
 
     def test_failed_claim_raises_on_start(self):
         """``start()`` must raise if ``set_noticeboard_thread`` raises."""
-        # Quiesce any prior runtime so the next @when triggers a fresh start.
         wait()
 
         from mockreplacement import patch_attr
@@ -591,31 +490,20 @@ def _raise_sentinel(*args, **kwargs):
                 def _(c):
                     c.value.count += 1
 
-        # The failed start must reset the global runtime slot so the
-        # next @when triggers a fresh start() rather than reusing the
-        # half-initialised Behaviors instance whose noticeboard thread
-        # is already dead.
         assert _behaviors.BEHAVIORS is None
 
-        # The runtime must be re-startable once the synthetic failure is
-        # withdrawn. A successful @when proves the next start_noticeboard
-        # claimed the slot cleanly.
         c2 = Cown(Counter())
 
         @when(c2)
-        def _(c):
-            send("done", 1)
+        def probe(c):
+            return True
 
-        _collect_done(1)
+        quiesce(QUIESCE_TIMEOUT)
+        assert probe.unwrap() is True
         wait()
         assert _core.terminator_count() == 0
 
 
-# ---------------------------------------------------------------------------
-# Chain-ring stress, parameterised over worker_count
-# ---------------------------------------------------------------------------
-
-
 class TestChainRingPerWorkerCount:
     """Long ring of overlapping pair-locks under varied worker counts.
 
@@ -635,7 +523,6 @@ class TestChainRingPerWorkerCount:
     @classmethod
     def teardown_class(cls):
         wait()
-        _drain_done()
 
     @pytest.mark.parametrize("worker_count", [1, 2, 4, 8])
     def test_chain_ring(self, worker_count: int):
@@ -672,22 +559,12 @@ def _(a, b):
                     a.value.count += 1
                     b.value.count += 1
 
-            # Read each counter back through a behaviour so the test
-            # thread observes the final value after all increments
-            # have committed.
-            for idx, c in enumerate(cowns):
-                @when(c)
-                def _(c):
-                    send("done", (idx, c.value.count))  # noqa: B023
-
-            results = _collect_done(ring_size)
+            readers = _read_back(cowns)
+            quiesce(QUIESCE_TIMEOUT)
+            results = [r.unwrap() for r in readers]
             total = sum(count for _, count in results)
             assert total == 2 * ring_length, (worker_count, results)
         finally:
-            _drain_done()
-            # `wait(stats=True)` returns the snapshot captured before
-            # the per-worker array is freed, so we don't need a
-            # pre-wait `_core.scheduler_stats()` call.
             stats = wait(stats=True)
             assert _core.terminator_count() == 0
 
@@ -695,23 +572,12 @@ def _(c):
         total_local = sum(s["popped_local"] for s in stats)
         total_stolen = sum(s["popped_via_steal"] for s in stats)
         total_pops = total_local + total_stolen
-        # Every behaviour that completes was popped exactly once, so
-        # `total_pops` must reach the dispatched count. We don't need
-        # an exact equality (last-mile read-back behaviours and the
-        # warm-up handshake also count, and the fairness token's
-        # pop-side accounting biases the totals; see
-        # `boc_sched_stats_t` in `boc_sched.h`), only a sanity floor.
         assert total_pops >= ring_length, (
             f"W={worker_count}: only {total_pops} pops recorded "
             f"for {ring_length} dispatched behaviours"
         )
 
 
-# ---------------------------------------------------------------------------
-# Orphan-drain mitigation: set_drop_exception on stop()-orphaned results
-# ---------------------------------------------------------------------------
-
-
 class TestOrphanDropException:
     """Verify the orphan-drain mitigation surfaces RuntimeError on result Cowns.
 
@@ -739,22 +605,15 @@ class TestOrphanDropException:
     @classmethod
     def teardown_class(cls):
         wait()
-        _drain_done()
 
     def test_set_drop_exception_marks_result_cown(self):
         """C-method: ``set_drop_exception`` writes value and flag, leaves cown released."""
-        # Drive the runtime to a known state and ensure it is alive
-        # (BehaviorCapsule construction touches per-module C state).
         wait()
         from bocpy import start as _start_runtime
         _start_runtime()
 
         result = Cown(None)
         arg = Cown(Counter())
-        # Construct a BehaviorCapsule without scheduling it. The thunk
-        # name does not need to resolve because we never call
-        # ``execute`` — set_drop_exception only touches the result
-        # cown.
         capsule = _core.BehaviorCapsule(
             "__behavior_never_called__",
             result.impl,
@@ -765,16 +624,11 @@ def test_set_drop_exception_marks_result_cown(self):
         drop = RuntimeError("orphaned during stop()")
         capsule.set_drop_exception(drop)
 
-        # The result Cown must now be in the published-and-released
-        # state with the exception flag set so a post-stop() consumer
-        # can acquire it and observe the failure.
         result.acquire()
         try:
             assert result.exception is True, (
                 "set_drop_exception must mark the result Cown's exception flag"
             )
-            # Value goes through xidata pickle/unpickle on release/acquire,
-            # so identity is not preserved; check type and message.
             assert isinstance(result.value, RuntimeError), (
                 f"expected RuntimeError, got {type(result.value).__name__}"
             )
@@ -792,12 +646,7 @@ def test_drain_orphan_invokes_set_drop_exception(self):
 
         from mockreplacement import patch_attr, Recorder
 
-        # Build a recorder so the orphan-drain path can call the
-        # documented methods on it (``set_drop_exception``,
-        # ``release_all``) and we can assert on the recorded calls.
         fake_capsule = Recorder("orphan_capsule")
-        # Single-shot drain: first call returns one fake orphan, the
-        # second call returns [] so the drain loop terminates.
         drain_results = [[fake_capsule], []]
 
         def _fake_drain():
@@ -824,9 +673,6 @@ def _fake_terminator_dec(*args, **kwargs):
             f"expected exactly one capsule drained; got {drained_count}"
         )
         fake_capsule.set_drop_exception.assert_called_once()
-        # The argument must be a RuntimeError carrying a stop()
-        # diagnostic; the orphan drain UX contract requires the
-        # message reference "stop()" so users can grep for it.
         sent_arg = fake_capsule.set_drop_exception.call_args[0][0]
         assert isinstance(sent_arg, RuntimeError), (
             f"expected RuntimeError, got {type(sent_arg).__name__}"
@@ -837,11 +683,6 @@ def _fake_terminator_dec(*args, **kwargs):
         fake_capsule.release_all.assert_called_once()
 
 
-# ---------------------------------------------------------------------------
-# Dispatch after runtime stop must surface
-# ---------------------------------------------------------------------------
-
-
 class TestDispatchAfterRuntimeStop:
     """``boc_sched_dispatch`` must raise once the runtime is torn down.
 
@@ -865,19 +706,12 @@ class TestDispatchAfterRuntimeStop:
 
     @classmethod
     def teardown_class(cls):
-        # Ensure the runtime is up for any subsequent test class.
         wait()
-        _drain_done()
 
     def test_schedule_after_runtime_stop_raises(self):
         """A ``@when`` after ``scheduler_runtime_stop`` raises and rolls back."""
-        # Bring the runtime to a clean post-stop state.
         wait()
 
-        # We need WORKER_COUNT == 0 at the C level. ``wait()`` ran
-        # ``stop_workers`` which already called ``scheduler_runtime_stop``,
-        # so the runtime is down. ``scheduler_stats()`` returns an
-        # empty list iff the per-worker array has been freed.
         assert _core.scheduler_stats() == [], (
             "scheduler_runtime_stop should have left WORKER_COUNT == 0"
         )
@@ -889,22 +723,8 @@ def test_schedule_after_runtime_stop_raises(self):
             f"seeded={before_seeded}"
         )
 
-        # Bypass the auto-start in the @when fast path by reaching
-        # whencall directly with a Cown whose runtime has been
-        # explicitly stopped. The trick: re-close the terminator and
-        # force WORKER_COUNT to 0 at the same time. We arm both by
-        # going through the public start/wait cycle which leaves
-        # exactly that state. Then we drive a behavior through
-        # ``_core.BehaviorCapsule(...).schedule()`` directly so the
-        # auto-start gate in ``behaviors.py`` cannot wake the
-        # runtime back up between our setup and the dispatch.
         c = Cown(Counter())
 
-        # Build a behavior capsule by hand so the auto-start path
-        # in ``@when`` does not fire. ``_core.BehaviorCapsule``
-        # takes (thunk_name, result_impl, cowns_with_groups,
-        # captures); ``cowns_with_groups`` is a list of
-        # (group_id, cown_impl) tuples mirroring whencall.
         result = Cown(None)
         capsule = _core.BehaviorCapsule(
             "__nonexistent_thunk__",
@@ -913,38 +733,18 @@ def test_schedule_after_runtime_stop_raises(self):
             [],
         )
 
-        # The terminator is closed after wait(); we must arm it for
-        # this single dispatch attempt the same way whencall would,
-        # then prove the dispatch failure rolls our hold back.
-        # terminator_inc would refuse a closed terminator, so we
-        # seed it via terminator_reset (count=1, seeded=1, closed=0)
-        # to mimic an alive runtime, then forcibly bring
-        # WORKER_COUNT back to 0 by NOT calling start().
         prior_count, prior_seeded = _core.terminator_reset()
-        # The reset returned the post-wait quiesced state; arm the
-        # terminator for our synthetic schedule attempt.
         rc = _core.terminator_inc()
         assert rc >= 0, f"terminator_inc unexpectedly refused: {rc}"
 
         try:
-            # Direct schedule. With WORKER_COUNT == 0 the off-worker
-            # dispatch arm in boc_sched_dispatch must surface a
-            # RuntimeError rather than silently dropping the node.
             with pytest.raises(RuntimeError, match="bocpy runtime is not running"):
                 capsule.schedule()
-            # whencall's try/except in behaviors.py would now call
-            # terminator_dec; we mirror that here so the count
-            # returns to its pre-arm state.
             _core.terminator_dec()
         finally:
-            # Drop the seed contribution from terminator_reset and
-            # close the terminator so subsequent tests starting
-            # fresh see a clean baseline.
             _core.terminator_seed_dec()
             _core.terminator_close()
 
-        # All holds rolled back: count is back to 0 and the
-        # surviving runtime state is clean.
         assert _core.terminator_count() == 0, (
             "schedule failure must roll back the synthetic terminator hold"
         )
@@ -966,41 +766,27 @@ def test_scheduler_runtime_stop_is_idempotent(self):
         exercising the second-call path the docstring claims to
         defend.
         """
-        # Bring the runtime down to a clean baseline.
         wait()
-        # Force a genuine runtime cycle: schedule one behaviour so
-        # ``Behaviors.start()`` allocates ``WORKERS``, then ``wait()``
-        # again so ``stop_workers`` performs the *first* real
-        # ``scheduler_runtime_stop`` call. Without this step the
-        # idempotency assertions below would all hit the
-        # ``WORKERS == NULL`` early-out and pass vacuously.
         c = Cown(Counter())
 
         @when(c)
-        def _(c):
-            send("done", 1)
+        def probe(c):
+            return True
 
-        _collect_done(1)
-        # While the runtime is still alive, ``scheduler_stats()`` is
-        # non-empty — this proves the runtime really did come up and
-        # the next ``wait()`` will perform a load-bearing
-        # ``scheduler_runtime_stop``.
+        quiesce(QUIESCE_TIMEOUT)
+        assert probe.unwrap() is True
         live_stats = _core.scheduler_stats()
         assert live_stats, (
             "runtime must be alive before tearing it down so the first "
             f"scheduler_runtime_stop has work to do; got {live_stats!r}"
         )
         wait()
-        # First (real) call already happened inside ``stop_workers()``.
-        # The array is freed and ``scheduler_stats()`` is empty.
         assert _core.scheduler_stats() == [], (
             "wait() should have left WORKER_COUNT == 0"
         )
-        # A second explicit call must be a no-op (no crash, no error).
         _core.scheduler_runtime_stop()
         assert _core.scheduler_stats() == [], (
             "second scheduler_runtime_stop must leave WORKER_COUNT == 0"
         )
-        # And a third, for good measure.
         _core.scheduler_runtime_stop()
         assert _core.scheduler_stats() == []
diff --git a/test/test_stop_retry_composition.py b/test/test_stop_retry_composition.py
index c8b60df..2375e7c 100644
--- a/test/test_stop_retry_composition.py
+++ b/test/test_stop_retry_composition.py
@@ -32,21 +32,13 @@
 
 import bocpy
 from bocpy import _core
-from bocpy import Cown, drain, notice_update, receive, send, TIMEOUT, wait, when
+from bocpy import Cown, notice_update, quiesce, wait, when
 
 
-RECEIVE_TIMEOUT = 10
-# Slow-fn duration: long enough that ``wait(timeout=0.1)`` reliably
-# hits the noticeboard-join timeout, short enough that the test does
-# not bloat the suite.
+QUIESCE_TIMEOUT = 10
 SLOW_FN_SECONDS = 0.6
 
 
-# ---------------------------------------------------------------------------
-# Module-level helpers (must be picklable across the boc_noticeboard queue).
-# ---------------------------------------------------------------------------
-
-
 def _slow_update_fn(_x):
     """Sleep on the noticeboard thread, then return a fresh value.
 
@@ -59,11 +51,6 @@ def _slow_update_fn(_x):
     return 1
 
 
-# ---------------------------------------------------------------------------
-# Stop-timeout and retry composition test
-# ---------------------------------------------------------------------------
-
-
 class TestStopTimeoutAndRetry:
     """Stop-timeout abort followed by clean retry.
 
@@ -87,24 +74,16 @@ class TestStopTimeoutAndRetry:
 
     @classmethod
     def teardown_class(cls):
-        """Drain the runtime and any leftover messages."""
+        """Drain the runtime to a clean state."""
         wait()
-        drain("retry_done")
 
     def test_stop_timeout_then_retry(self):
         """Time out on a slow noticeboard fn, then retry start() cleanly."""
-        # Begin from a known-clean state.
         wait()
 
-        # ----- Step 1: schedule a slow notice_update -----
         bocpy.start(worker_count=1)
         try:
             notice_update("retry_key", _slow_update_fn, default=0)
-            # Yield long enough for the noticeboard thread to
-            # dequeue the update and enter ``time.sleep``. Without
-            # this, on a very fast machine ``wait(timeout=0.1)``
-            # could race the message dequeue and the noticeboard
-            # thread would shut down cleanly inside the 0.1s budget.
             time.sleep(0.05)
         except BaseException:
             try:
@@ -113,41 +92,21 @@ def test_stop_timeout_then_retry(self):
                 pass
             raise
 
-        # ----- Step 2: stop times out, but the orphan drain still ran -----
         with pytest.raises(RuntimeError, match="noticeboard thread did not shut down"):
             wait(timeout=0.1)
 
-        # The orphan drain ran before the raise, so the C-side
-        # terminator_count is back to 0. Without that drain the
-        # count would still reflect in-flight @when traffic and
-        # the next start() would diagnose terminator drift.
         assert _core.terminator_count() == 0, (
             "terminator_count is non-zero after wait(timeout=0.1) "
             "timed out on the noticeboard join. The orphan drain "
             "did not run before the RuntimeError."
         )
 
-        # ----- Step 3: drain the slow fn, finish teardown -----
-        # The retry path in ``stop()`` calls
-        # ``noticeboard.join(_remaining())``. We invoke ``wait()``
-        # with no timeout here, so ``_remaining()`` returns ``None``
-        # and the join is unbounded -- the second ``wait()`` blocks
-        # deterministically until the slow fn completes and the
-        # noticeboard thread exits, with no ``time.sleep`` slack
-        # required. A retry that supplied a finite ``timeout=`` would
-        # see a bounded join and would still need explicit
-        # synchronisation to guarantee the slow fn has completed.
         wait()
 
-        # ----- Step 4: fresh start + schedule -----
-        # If the scheduler_runtime_stop pairing on abort paths or
-        # the dispatch-failure-observable change were regressed,
-        # this start() / @when cycle would either crash or hang.
         bocpy.start(worker_count=2)
         try:
             self._run_fresh_when()
         finally:
-            drain("retry_done")
             wait()
 
     def _run_fresh_when(self):
@@ -160,16 +119,7 @@ def _run_fresh_when(self):
 
         @when(fresh)
         def _(c):
-            send("retry_done", ("fresh_ran", c.value))
+            return c.value
 
-        tag, payload = receive("retry_done", RECEIVE_TIMEOUT)
-        assert tag != TIMEOUT, (
-            "@when on a fresh Cown after retry never ran -- the "
-            "scheduler did not re-arm cleanly after the "
-            "timed-out stop()"
-        )
-        assert payload == ("fresh_ran", 0), (
-            f"unexpected payload {payload!r} from fresh @when; a "
-            "'cannot acquire cown' error here would indicate a "
-            "leaked owner from the prior runtime"
-        )
+        quiesce(QUIESCE_TIMEOUT)
+        assert fresh.unwrap() == 0
diff --git a/test/test_transpiler.py b/test/test_transpiler.py
index 87bf90b..284433b 100644
--- a/test/test_transpiler.py
+++ b/test/test_transpiler.py
@@ -7,9 +7,6 @@
 from bocpy.transpiler import BOCModuleTransformer, CapturedVariableFinder, export_module
 
 
-# ── CapturedVariableFinder ──────────────────────────────────────────────
-
-
 class TestCapturedParams:
     """Function parameters must never appear as captured variables."""
 
@@ -71,10 +68,6 @@ def helper():
         """) == set()
 
     def test_except_as_name_excluded(self):
-        # ``except ... as X`` binds X via ``ExceptHandler.name`` (a
-        # plain identifier, not an ``ast.Name(Store)`` node). The
-        # finder must still treat it as local so a subsequent ``str(X)``
-        # read is not classified as a capture.
         assert "ex" not in self._captures("""\
             def f():
                 try:
@@ -130,8 +123,6 @@ def _captures(source, known_vars=frozenset()):
         return finder.captured_vars
 
     def test_inner_when_capture_propagates(self):
-        # `marker` is referenced only inside the nested @when body, but must
-        # be captured by the outer behavior so the inner whencall can see it.
         caps = self._captures("""\
             def outer(c):
                 @when(c)
@@ -141,8 +132,6 @@ def _(c):
         assert "marker" in caps
 
     def test_inner_when_decorator_arg_propagates(self):
-        # The cown argument to the nested @when is evaluated in the outer
-        # frame, so it must also be captured.
         caps = self._captures("""\
             def outer():
                 @when(other_cown)
@@ -152,7 +141,6 @@ def _(x):
         assert "other_cown" in caps
 
     def test_inner_when_locals_not_captured(self):
-        # Names that are local/params of the inner @when should NOT leak out.
         caps = self._captures("""\
             def outer():
                 @when(c)
@@ -163,8 +151,6 @@ def _(c):
         assert caps == {"c"}
 
     def test_plain_nested_def_unchanged(self):
-        # A plain (non-@when) nested def keeps its opaque treatment: names
-        # used only inside its body do not surface in the outer's captures.
         caps = self._captures("""\
             def outer():
                 def helper():
@@ -173,8 +159,6 @@ def helper():
         assert caps == set()
 
     def test_deeply_nested_when_propagates(self):
-        # A name referenced in a doubly-nested @when must propagate all the
-        # way out to the top-level behavior.
         caps = self._captures("""\
             def outer(c):
                 @when(c)
@@ -185,6 +169,44 @@ def _(c):
         """, known_vars={"when", "use"})
         assert "deep_marker" in caps
 
+    def test_when_inside_for_loop_name_not_captured(self):
+        caps = self._captures("""\
+            def outer(handles):
+                bodies = []
+                for h in handles:
+                    @when(h)
+                    def body(h):
+                        pass
+                    bodies.append(body)
+                return bodies
+        """, known_vars={"when"})
+        assert caps == set()
+
+    def test_when_inside_for_loop_capture_propagates(self):
+        caps = self._captures("""\
+            def outer(handles):
+                for h in handles:
+                    @when(h)
+                    def body(h):
+                        use(marker)
+        """, known_vars={"when", "use"})
+        assert "marker" in caps
+
+    def test_when_inside_nested_blocks_capture_propagates(self):
+        caps = self._captures("""\
+            def outer(items):
+                for x in items:
+                    if x:
+                        with ctx():
+                            @when(other_cown)
+                            def inner(c):
+                                use(marker)
+                return inner
+        """, known_vars={"when", "use", "ctx"})
+        assert "other_cown" in caps
+        assert "marker" in caps
+        assert "inner" not in caps
+
     def test_mixed_locals_and_captures(self):
         caps = self._captures("""\
             def f(a):
@@ -214,9 +236,6 @@ def test_clear_resets_between_visits(self):
         assert "a" not in finder.captured_vars
 
 
-# ── BOCModuleTransformer ────────────────────────────────────────────────
-
-
 class TestModuleTransformerImports:
     """Import handling: recording names and whencall injection."""
 
@@ -254,7 +273,6 @@ def test_whencall_not_duplicated_when_present(self):
     def test_whencall_injected_when_aliased(self):
         t, tree = self._transform("from bocpy import whencall as wc, Cown")
         aliases = [(a.name, a.asname) for a in tree.body[0].names]
-        # Original aliased import kept, plus bare whencall injected
         assert ("whencall", "wc") in aliases
         assert ("whencall", None) in aliases
         assert "wc" in t.imports
@@ -352,9 +370,6 @@ def test_bare_expression_filtered(self):
         assert len(tree.body) == 0
 
 
-# ── export_module (full pipeline) ───────────────────────────────────────
-
-
 class TestExportBehaviorNaming:
     """Behaviors are renamed to __behavior__N with sequential numbering."""
 
@@ -417,7 +432,6 @@ def scaled(x):
         """)
         info = list(result.behaviors.values())[0]
         assert "factor" in info.captures
-        # factor must appear as a parameter in the generated behavior def
         sig = result.code.split("def __behavior__0(")[1].split("):")[0]
         assert "factor" in sig
 
@@ -764,10 +778,6 @@ def test_file_replaced_with_absolute_path(self):
             def f(x):
                 return __file__
         """, path=path)
-        # Walk the generated AST and confirm __file__ has been replaced
-        # with the absolute source path as a string constant. Substring
-        # matching against the unparsed source is platform-fragile because
-        # backslashes in Windows paths get escaped during unparse.
         expected = os.path.abspath(path)
         gen_tree = ast.parse(result.code)
         constants = [
@@ -838,6 +848,27 @@ def inner(x):
         """)
         assert len(result.behaviors) == 2
 
+    def test_nested_when_in_for_loop(self):
+        result = self._export("""\
+            from bocpy import when, whencall, Cown
+
+            x = Cown(1)
+
+            @when(x)
+            def outer(x):
+                bodies = []
+                for i, h in enumerate(x.value):
+                    @when(h)
+                    def inner(h, i=i):
+                        return (i, h.value)
+                    bodies.append(inner)
+                return bodies
+        """)
+        assert len(result.behaviors) == 2
+        for info in result.behaviors.values():
+            assert "inner" not in info.captures
+        assert "Cannot resolve" not in result.code
+
 
 class TestExportMetadata:
     """ExportResult carries class, function, and behavior metadata."""
@@ -876,9 +907,6 @@ def f(x):
         assert line > 0
 
 
-# ── Import alias tests ──────────────────────────────────────────────────
-
-
 class TestImportAlias:
     """Aliased imports must not appear as captured variables."""
 
@@ -923,9 +951,6 @@ def use_alias(x):
             )
 
 
-# ── Defaults-as-captures (loop-snapshot idiom) ──────────────────────────
-
-
 class TestDefaultsAsCaptures:
     """``def b(c, i=i)`` and ``def b(c, x=y)`` hoist defaults to captures."""
 
@@ -946,7 +971,6 @@ def b(c, i=i):
         """)
         info = list(result.behaviors.values())[0]
         assert info.captures == ["i"]
-        # Default must be stripped from the exported behavior.
         gen_tree = ast.parse(result.code)
         for node in ast.walk(gen_tree):
             if isinstance(node, ast.FunctionDef) and node.name.startswith("__behavior__"):
@@ -1001,7 +1025,6 @@ def b(c, i=i):
                     return i * factor
         """)
         info = list(result.behaviors.values())[0]
-        # Extras come first, then body captures.
         assert info.captures == ["i", "factor"]
 
     def test_non_name_default_rejected(self):
@@ -1037,9 +1060,6 @@ def b(c=c):
             raise AssertionError("expected SyntaxError for default on cown position")
 
 
-# ── @when alias support ─────────────────────────────────────────────────
-
-
 class TestWhenAlias:
     """Aliased ``when`` decorators are detected and rewritten."""
 
@@ -1060,7 +1080,6 @@ def b(c):
         """)
         names = [info.name for info in result.behaviors.values()]
         assert names == ["__behavior__0"]
-        # The aliased decorator must be stripped from the behavior.
         gen_tree = ast.parse(result.code)
         for node in ast.walk(gen_tree):
             if isinstance(node, ast.FunctionDef) and node.name.startswith("__behavior__"):
@@ -1079,7 +1098,6 @@ def b(c):
         """)
         names = [info.name for info in result.behaviors.values()]
         assert names == ["__behavior__0"]
-        # whencall must be auto-imported when only ``import bocpy`` is present.
         assert "from bocpy import whencall" in result.code
         gen_tree = ast.parse(result.code)
         for node in ast.walk(gen_tree):
@@ -1107,14 +1125,17 @@ def b(c):
 class TestWhenResultAssignment:
     """@when-decorated functions must produce a name = whencall(...) assignment.
 
-    Regression: WhenTransformer.visit_FunctionDef was returning
-    ``ast.Expr(ast.Assign(...))``, an ast.Assign statement incorrectly
-    wrapped in ast.Expr. visit_Module filters out every ast.Expr node (to
-    drop bare expression-statement whencall results), so the wrapping caused
-    every @when result assignment to be silently dropped from the exported
-    module. Any code that read .value, checked .exception, or chained
-    behaviors on the result was operating on None with no error at schedule
-    time.
+    WhenTransformer.visit_FunctionDef returns an ast.Assign so the behavior
+    result is bound to the function's name. visit_Module filters out every
+    ast.Expr node (to drop bare expression-statement whencall results), so an
+    assignment wrapped in ast.Expr would be discarded and any code reading
+    .value, checking .exception, or chaining behaviors on the result would
+    operate on None.
+
+    visit_Module only filters at module scope; a @when nested inside a
+    function, a method, or another behavior emits its assignment into a
+    function body that visit_Module never inspects. The nesting-level
+    tests below lock in assignment preservation at every depth.
     """
 
     @staticmethod
@@ -1139,7 +1160,7 @@ def my_task(x):
         )
 
     def test_result_is_ast_assign_not_expr(self):
-        """The whencall node returned by visit_FunctionDef must be an ast.Assign, not an ast.Expr wrapping an ast.Assign.
+        """visit_FunctionDef must return an ast.Assign, not an ast.Expr wrapping one.
 
         visit_Module filters out all ast.Expr nodes; an ast.Expr return
         would silently drop the assignment.
@@ -1223,3 +1244,133 @@ def my_task(x):
         raise AssertionError(
             "no assignment for 'my_task' found in exported AST"
         )
+
+    def test_result_assigned_inside_function(self):
+        """A @when inside a plain function keeps its assignment in the body."""
+        result = self._export("""\
+            from bocpy import when, whencall, Cown
+
+            x = Cown(1)
+
+            def run():
+                @when(x)
+                def task(x):
+                    return x.value
+                return task
+        """)
+        assert "task = whencall(" in result.code, (
+            "in-function @when result assignment was dropped;\n"
+            f"generated code:\n{result.code}"
+        )
+
+    def test_result_assigned_inside_method(self):
+        """A @when inside a method keeps its assignment in the method body."""
+        result = self._export("""\
+            from bocpy import when, whencall, Cown
+
+            x = Cown(1)
+
+            class Driver:
+                def run(self):
+                    @when(x)
+                    def task(x):
+                        return x.value
+                    return task
+        """)
+        assert "task = whencall(" in result.code, (
+            "in-method @when result assignment was dropped;\n"
+            f"generated code:\n{result.code}"
+        )
+
+    def test_result_assigned_inside_nested_function(self):
+        """A @when two function levels deep keeps its assignment."""
+        result = self._export("""\
+            from bocpy import when, whencall, Cown
+
+            x = Cown(1)
+
+            def outer():
+                def inner():
+                    @when(x)
+                    def task(x):
+                        return x.value
+                    return task
+                return inner
+        """)
+        assert "task = whencall(" in result.code, (
+            "deeply-nested-function @when result assignment was dropped;\n"
+            f"generated code:\n{result.code}"
+        )
+
+    def test_result_assigned_inside_for_loop_in_function(self):
+        """A @when inside a for loop inside a function keeps its assignment."""
+        result = self._export("""\
+            from bocpy import when, whencall, Cown
+
+            handles = [Cown(1), Cown(2)]
+
+            def run():
+                bodies = []
+                for h in handles:
+                    @when(h)
+                    def task(h):
+                        return h.value
+                    bodies.append(task)
+                return bodies
+        """)
+        assert "task = whencall(" in result.code, (
+            "for-loop-in-function @when result assignment was dropped;\n"
+            f"generated code:\n{result.code}"
+        )
+
+    def test_nested_when_both_results_assigned(self):
+        """A nested @when assigns both the outer (module) and inner (in-body) results.
+
+        The outer @when sits at module scope; the inner @when's assignment
+        lands inside the extracted ``__behavior__`` body. Both must survive.
+        """
+        result = self._export("""\
+            from bocpy import when, whencall, Cown
+
+            x = Cown(1)
+
+            @when(x)
+            def outer(x):
+                @when(x)
+                def inner(x):
+                    return x.value
+                return inner
+        """)
+        assert "outer = whencall(" in result.code, (
+            "outer (module-level) @when result assignment was dropped;\n"
+            f"generated code:\n{result.code}"
+        )
+        assert "inner = whencall(" in result.code, (
+            "inner (in-behavior) @when result assignment was dropped;\n"
+            f"generated code:\n{result.code}"
+        )
+
+    def test_nested_when_inside_function_both_results_assigned(self):
+        """A nested @when wholly inside a function assigns both results in-body."""
+        result = self._export("""\
+            from bocpy import when, whencall, Cown
+
+            x = Cown(1)
+
+            def run():
+                @when(x)
+                def outer(x):
+                    @when(x)
+                    def inner(x):
+                        return x.value
+                    return inner
+                return outer
+        """)
+        assert "outer = whencall(" in result.code, (
+            "in-function outer @when result assignment was dropped;\n"
+            f"generated code:\n{result.code}"
+        )
+        assert "inner = whencall(" in result.code, (
+            "in-behavior inner @when result assignment was dropped;\n"
+            f"generated code:\n{result.code}"
+        )
diff --git a/test/test_validate_sbom.py b/test/test_validate_sbom.py
index cdfb602..ec1042f 100644
--- a/test/test_validate_sbom.py
+++ b/test/test_validate_sbom.py
@@ -23,11 +23,6 @@
 import validate_sbom
 
 
-# ---------------------------------------------------------------------------
-# Helpers
-# ---------------------------------------------------------------------------
-
-
 def _good_doc() -> dict:
     """A freshly built CycloneDX 1.6 SBOM produced by ``build_sbom.py``.
 
@@ -46,11 +41,6 @@ def _good_doc() -> dict:
     )
 
 
-# ---------------------------------------------------------------------------
-# Happy path
-# ---------------------------------------------------------------------------
-
-
 def test_validate_sbom_document_accepts_build_sbom_output():
     """The generator and validator must agree on the wire format."""
     validate_sbom.validate_sbom_document(_good_doc())
@@ -62,19 +52,12 @@ def test_validate_sbom_file_accepts_round_trip(tmp_path: Path):
     validate_sbom.validate_sbom_file(sbom)
 
 
-# ---------------------------------------------------------------------------
-# Header invariants (bomFormat / specVersion / serialNumber / version)
-# ---------------------------------------------------------------------------
-
-
 @pytest.mark.parametrize(
     "mutation",
     [
-        # (key, replacement_value, expected_substring_in_error)
         ("bomFormat", "SPDX", "bomFormat"),
         ("specVersion", "1.5", "specVersion"),
         ("serialNumber", "urn:uuid:not-a-uuid", "serialNumber"),
-        # version-1 UUID still rejected by the UUIDv5 regex
         ("serialNumber", "urn:uuid:11111111-1111-1111-1111-111111111111",
          "serialNumber"),
         ("version", 0, "version"),
@@ -89,11 +72,6 @@ def test_validate_sbom_document_rejects_bad_header(mutation):
         validate_sbom.validate_sbom_document(doc)
 
 
-# ---------------------------------------------------------------------------
-# Metadata invariants
-# ---------------------------------------------------------------------------
-
-
 def test_metadata_must_be_object():
     doc = _good_doc()
     doc["metadata"] = []
@@ -103,9 +81,6 @@ def test_metadata_must_be_object():
 
 def test_timestamp_must_match_iso_z_format():
     doc = _good_doc()
-    # Missing the ``Z`` suffix and using ``+00:00`` instead — same instant,
-    # different lexical form. The validator pins the format because
-    # build_sbom.py commits to a specific shape.
     doc["metadata"]["timestamp"] = "2026-05-28T12:00:00+00:00"
     with pytest.raises(validate_sbom.ValidationError, match="timestamp"):
         validate_sbom.validate_sbom_document(doc)
@@ -142,14 +117,8 @@ def test_root_component_type_must_be_library():
         validate_sbom.validate_sbom_document(doc)
 
 
-# ---------------------------------------------------------------------------
-# components / dependencies invariants
-# ---------------------------------------------------------------------------
-
-
 def test_dependencies_must_reference_root_component():
     doc = _good_doc()
-    # Replace with a dependencies block that points at a different ref.
     doc["dependencies"] = [{"ref": "pkg:pypi/other@1.0", "dependsOn": []}]
     with pytest.raises(validate_sbom.ValidationError, match="dependencies"):
         validate_sbom.validate_sbom_document(doc)
@@ -162,11 +131,6 @@ def test_components_field_must_be_list():
         validate_sbom.validate_sbom_document(doc)
 
 
-# ---------------------------------------------------------------------------
-# File-level + CLI
-# ---------------------------------------------------------------------------
-
-
 def test_validate_sbom_file_reports_invalid_json(tmp_path: Path):
     bad = tmp_path / "bad.cdx.json"
     bad.write_text("{this is not json", encoding="utf-8")
@@ -226,7 +190,5 @@ def test_good_doc_helper_is_deep_copyable():
     b = _good_doc()
     a["metadata"]["component"]["name"] = "mutated"
     assert b["metadata"]["component"]["name"] == "bocpy"
-    # And the validator still accepts the unmutated copy:
     validate_sbom.validate_sbom_document(b)
-    # While rejecting nothing here — exercising copy semantics only.
     _ = copy.deepcopy(a)
diff --git a/test/test_validate_wheel.py b/test/test_validate_wheel.py
index cec2d41..b09ae6e 100644
--- a/test/test_validate_wheel.py
+++ b/test/test_validate_wheel.py
@@ -33,11 +33,6 @@
 WHEEL_NAME = f"{DIST}-{VERSION}-cp314-cp314-manylinux_2_28_x86_64.whl"
 
 
-# ---------------------------------------------------------------------------
-# Wheel builders
-# ---------------------------------------------------------------------------
-
-
 def _record_row(arcname: str, data: bytes) -> tuple[str, str, str]:
     digest = hashlib.sha256(data).digest()
     b64 = base64.urlsafe_b64encode(digest).rstrip(b"=").decode("ascii")
@@ -110,17 +105,11 @@ def _write_wheel_raw(
         wheel.writestr(record_arc, record_text)
 
 
-# ---------------------------------------------------------------------------
-# Happy path
-# ---------------------------------------------------------------------------
-
-
 def test_clean_wheel_passes(tmp_path):
     """A faithfully-built probe wheel passes both checks."""
     wheel_path = tmp_path / WHEEL_NAME
     _write_wheel(wheel_path, _wheel_metadata_entries())
 
-    # No exception means OK.
     validate_wheel.validate_wheel_file(wheel_path)
 
 
@@ -137,7 +126,7 @@ def test_main_returns_zero_on_clean_wheel(tmp_path, capsys):
 
 def test_main_accepts_directory_input(tmp_path, capsys):
     """Passing a directory expands to every *.whl in it."""
-    (tmp_path / WHEEL_NAME).touch()  # placeholder so glob matches the dir
+    (tmp_path / WHEEL_NAME).touch()
     wheel_a = tmp_path / WHEEL_NAME
     wheel_b = tmp_path / f"{DIST}-{VERSION}-cp314-cp314-linux_aarch64.whl"
     _write_wheel(wheel_a, _wheel_metadata_entries())
@@ -150,11 +139,6 @@ def test_main_accepts_directory_input(tmp_path, capsys):
     assert out.count("OK") == 2
 
 
-# ---------------------------------------------------------------------------
-# Negative cases: validate_record
-# ---------------------------------------------------------------------------
-
-
 def test_record_with_directory_entries_is_rejected(tmp_path):
     """Regression for the 0.7.0 PyPI warning.
 
@@ -165,7 +149,6 @@ def test_record_with_directory_entries_is_rejected(tmp_path):
     """
     wheel_path = tmp_path / WHEEL_NAME
     entries = _wheel_metadata_entries()
-    # RECORD claims an extra directory entry that PyPI ignores in the wheel.
     bad_rows = [_record_row(arc, data) for arc, data in entries]
     empty_hash = (
         "sha256="
@@ -183,7 +166,6 @@ def test_record_missing_an_entry_is_rejected(tmp_path):
     """An honest file present in the ZIP but absent from RECORD fails."""
     wheel_path = tmp_path / WHEEL_NAME
     entries = _wheel_metadata_entries()
-    # Build RECORD that omits the last entry.
     short_rows = [_record_row(arc, data) for arc, data in entries[:-1]]
     _write_wheel(wheel_path, entries, record_rows=short_rows)
 
@@ -210,7 +192,6 @@ def test_missing_record_file_raises(tmp_path):
     with zipfile.ZipFile(wheel_path, "w", zipfile.ZIP_DEFLATED) as wheel:
         for arcname, data in entries:
             wheel.writestr(arcname, data)
-        # Deliberately do NOT write RECORD.
 
     with pytest.raises(MissingWheelRecordError):
         validate_wheel.validate_wheel_file(wheel_path)
@@ -222,11 +203,8 @@ def test_record_with_jws_signature_is_exempt(tmp_path):
     entries = _wheel_metadata_entries() + [
         (f"{DIST_INFO}/RECORD.jws", b"<signature bytes>"),
     ]
-    # RECORD must list everything except RECORD itself and the signature.
     visible_entries = entries[:-1]
     _write_wheel(wheel_path, visible_entries, record_rows=None)
-    # Re-open and append the signature to the existing ZIP without
-    # adding it to RECORD (mirroring how signing tools work).
     with zipfile.ZipFile(wheel_path, "a", zipfile.ZIP_DEFLATED) as wheel:
         wheel.writestr(f"{DIST_INFO}/RECORD.jws", b"<signature bytes>")
 
@@ -247,11 +225,6 @@ def test_main_returns_nonzero_on_failure(tmp_path, capsys):
     assert "RECORD mismatch" in err
 
 
-# ---------------------------------------------------------------------------
-# Negative cases: validate_entrypoints
-# ---------------------------------------------------------------------------
-
-
 def test_valid_entry_points_pass(tmp_path):
     """A well-formed entry_points.txt is accepted."""
     wheel_path = tmp_path / WHEEL_NAME
diff --git a/test/test_version.py b/test/test_version.py
index 426e43e..1655a51 100644
--- a/test/test_version.py
+++ b/test/test_version.py
@@ -5,7 +5,7 @@
 
 try:
     import tomllib  # type: ignore[import-not-found]
-except ModuleNotFoundError:  # Python 3.10
+except ModuleNotFoundError:
     import tomli as tomllib  # type: ignore[import-not-found, no-redef]
 
 import bocpy
@@ -46,19 +46,6 @@ def test_version_in_dunder_all():
     assert "__version__" in bocpy.__all__
 
 
-# ---------------------------------------------------------------------------
-# Corrupt-installation fallback must log a WARNING.
-# ---------------------------------------------------------------------------
-#
-# When a metadata lookup raises, ``__version__`` falls back to
-# ``"0.0.0+unknown"``. Without a diagnostic a broken installation
-# would look identical to a clean source-checkout import in downstream
-# version gates / wheel telemetry, so the fallback path logs a WARNING
-# naming the exception class. We pin that contract via a subprocess
-# so the test does not need to reload ``bocpy`` (which would tear down
-# the C runtime mid-suite).
-
-
 def test_version_fallback_emits_warning(tmp_path):
     """When ``_metadata.version`` raises, the fallback path logs a WARNING."""
     import subprocess
@@ -106,7 +93,6 @@ def _explode(name):
     assert "VERSION=0.0.0+unknown" in result.stdout, (
         f"expected fallback version string in subprocess output; got:\n{result.stdout}"
     )
-    # The log line names the exception class and includes the fallback string.
     assert "bocpy package metadata unavailable" in result.stdout
     assert "RuntimeError" in result.stdout
     assert "0.0.0+unknown" in result.stdout