From e4baa7e19da5f48a601a5a41215776614edc155d Mon Sep 17 00:00:00 2001
From: poursoul <poursoul@126.com>
Date: Fri, 22 May 2026 12:22:05 +0800
Subject: [PATCH 1/7] Refactor: defer slot_state payload/task bind to
 orch::prepare_task
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Move the per-slot payload/task pointer assignments out of the
RingSchedState::init() O(task_window_size) loop and into orch::prepare_task.
Their value is per-slot constant (&task_payloads[slot] /
&task_descriptors[slot]), but writing them at submit time is essentially
free: both stores land on the same 64B slot_state cache line that
prepare_task is already dirtying. This takes the only "scale-dependent"
pointer assignments out of the init path; the init loop itself remains
for bind_ring(), reset_for_reuse() and the other per-slot field resets.
ring_id stays in init (its value is per-ring constant, so rewriting it
each submit would only add noise without removing a loop).

Split PTO2TaskSlotState::bind() into bind_ring() (init-time) and
bind_buffers() (per-submit) to make the two call-site shapes explicit.

The same change is applied to both the a2a3 and a5 trb
(tensormap_and_ringbuffer) runtimes.
---
 .../runtime/pto_orchestrator.cpp              | 12 +++++++++-
 .../runtime/pto_runtime2_types.h              | 23 ++++++++++++++-----
 .../runtime/scheduler/pto_scheduler.cpp       |  7 +++---
 .../runtime/pto_orchestrator.cpp              | 12 +++++++++-
 .../runtime/pto_runtime2_types.h              | 23 ++++++++++++++-----
 .../runtime/scheduler/pto_scheduler.cpp       |  7 +++---
 6 files changed, 64 insertions(+), 20 deletions(-)

diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_orchestrator.cpp b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_orchestrator.cpp
index 5f6d20855..fbc07f53f 100644
--- a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_orchestrator.cpp
+++ b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_orchestrator.cpp
@@ -355,11 +355,21 @@ static bool prepare_task(
 
     prefetch_payload(out->payload, args.tensor_count(), args.scalar_count());
 
+    // Re-bind payload/task pointers each submit. Value is per-slot constant
+    // (same as &task_payloads[slot] / &task_descriptors[slot]), but writing
+    // here lets RingSchedState::init() skip the O(window_size) bind loop.
+    // Both writes hit the same 64B slot_state cache line we're about to
+    // dirty below, so the extra cost is two stores on an already-hot line.
+    // Must precede the scheduler wiring.queue.push at the end of
+    // submit_task_common — that push is the first read of slot_state->task /
+    // slot_state->payload by another thread.
+    out->slot_state->bind_buffers(out->payload, out->task);
+
     // Fields already reset by advance_ring_pointers (eager reset after CONSUMED):
     //   fanout_lock=0, fanout_count=1, fanout_head=nullptr,
     //   fanin_refcount=0, fanout_refcount=0, completed_subtasks=0, next_block_idx=0
     // Fields immutable after RingSchedState::init():
-    //   payload, task, ring_id
+    //   ring_id
     // task_state left as CONSUMED by eager reset (safe for stale wait_for_tensor
     // observers); set to PENDING here when orchestrator actually reuses the slot.
     out->slot_state->task_state.store(PTO2_TASK_PENDING, std::memory_order_relaxed);
diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_runtime2_types.h b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_runtime2_types.h
index fcd8a27bd..f217e7ac3 100644
--- a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_runtime2_types.h
+++ b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_runtime2_types.h
@@ -330,7 +330,11 @@ struct alignas(64) PTO2TaskSlotState {
     // Fanout refcount (accessed with fanout_count in check_and_handle_consumed)
     std::atomic<int32_t> fanout_refcount;  // Dynamic: counts released references
 
-    // --- Immutable after RingSchedState::init() (same value on every slot reuse) ---
+    // --- Per-slot constant, re-bound by orch::prepare_task each submit ---
+    // Value is the same on every reuse (&task_payloads[slot] / &task_descriptors[slot]),
+    // but written here per-submit instead of in an O(window_size) init loop —
+    // these are the only "scale-dependent" pointers in this struct, so moving
+    // them out of init makes startup cost independent of task_window_size.
     PTO2TaskPayload *payload;
     PTO2TaskDescriptor *task;
 
@@ -345,14 +349,21 @@ struct alignas(64) PTO2TaskSlotState {
     int16_t next_block_idx{0};                   // Next block to dispatch (scheduler state)
 
     /**
-     * One-time binding of slot-invariant fields.
-     * Called during RingSchedState::init() — these values are determined by
-     * the slot's position in the ring and never change across reuses.
+     * Bind the slot-invariant ring id. Called once per slot during
+     * RingSchedState::init(); ring_id never changes across reuses.
      */
-    void bind(PTO2TaskPayload *p, PTO2TaskDescriptor *t, uint8_t rid) {
+    void bind_ring(uint8_t rid) { ring_id = rid; }
+
+    /**
+     * Re-bind the per-slot payload/task pointers. Called by
+     * orch::prepare_task on every submit. Value is constant for a given
+     * slot, but we pay the cheap re-write each submit (both fields land on
+     * the same 64B slot_state cache line that prepare_task is already
+     * dirtying) to avoid the init-time per-slot loop.
+     */
+    void bind_buffers(PTO2TaskPayload *p, PTO2TaskDescriptor *t) {
         payload = p;
         task = t;
-        ring_id = rid;
     }
 
     /**
diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler/pto_scheduler.cpp b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler/pto_scheduler.cpp
index 281a714fb..f497b8fd8 100644
--- a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler/pto_scheduler.cpp
+++ b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler/pto_scheduler.cpp
@@ -102,12 +102,13 @@ bool PTO2SchedulerState::RingSchedState::init(PTO2SharedMemoryHeader *sm_header,
     advance_lock.store(0, std::memory_order_relaxed);
 
     // Initialize all per-task slot state fields.
-    // bind() sets payload, task, ring_id — immutable after init, bound once
-    // to their fixed shared-memory addresses.
+    // bind_ring() sets the ring_id only — payload/task pointers are re-bound
+    // by orch::prepare_task on every submit (their value is per-slot constant
+    // but pinning them here would cost O(task_window_size) at startup).
     // reset_for_reuse() sets dynamic fields to reclaim defaults (fanout_count=1,
     // rest zero) so the first submit needs no reset.
     for (uint64_t i = 0; i < ring->task_window_size; i++) {
-        ring->slot_states[i].bind(&ring->task_payloads[i], &ring->task_descriptors[i], static_cast<uint8_t>(ring_id));
+        ring->slot_states[i].bind_ring(static_cast<uint8_t>(ring_id));
         ring->slot_states[i].reset_for_reuse();
         ring->slot_states[i].fanin_count = 0;
         ring->slot_states[i].active_mask = ActiveMask{};
diff --git a/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_orchestrator.cpp b/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_orchestrator.cpp
index 05ac105a8..056c2ee64 100644
--- a/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_orchestrator.cpp
+++ b/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_orchestrator.cpp
@@ -329,11 +329,21 @@ static bool prepare_task(
 
     prefetch_payload(out->payload, args.tensor_count(), args.scalar_count());
 
+    // Re-bind payload/task pointers each submit. Value is per-slot constant
+    // (same as &task_payloads[slot] / &task_descriptors[slot]), but writing
+    // here lets RingSchedState::init() skip the O(window_size) bind loop.
+    // Both writes hit the same 64B slot_state cache line we're about to
+    // dirty below, so the extra cost is two stores on an already-hot line.
+    // Must precede the scheduler wiring.queue.push at the end of
+    // submit_task_common — that push is the first read of slot_state->task /
+    // slot_state->payload by another thread.
+    out->slot_state->bind_buffers(out->payload, out->task);
+
     // Fields already reset by advance_ring_pointers (eager reset after CONSUMED):
     //   fanout_lock=0, fanout_count=1, fanout_head=nullptr,
     //   fanin_refcount=0, fanout_refcount=0, completed_subtasks=0, next_block_idx=0
     // Fields immutable after RingSchedState::init():
-    //   payload, task, ring_id
+    //   ring_id
     // task_state left as CONSUMED by eager reset (safe for stale wait_for_tensor
     // observers); set to PENDING here when orchestrator actually reuses the slot.
     out->slot_state->task_state.store(PTO2_TASK_PENDING, std::memory_order_relaxed);
diff --git a/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_runtime2_types.h b/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_runtime2_types.h
index 999dbf6c5..f022b8eb4 100644
--- a/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_runtime2_types.h
+++ b/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_runtime2_types.h
@@ -330,7 +330,11 @@ struct alignas(64) PTO2TaskSlotState {
     // Fanout refcount (accessed with fanout_count in check_and_handle_consumed)
     std::atomic<int32_t> fanout_refcount;  // Dynamic: counts released references
 
-    // --- Immutable after RingSchedState::init() (same value on every slot reuse) ---
+    // --- Per-slot constant, re-bound by orch::prepare_task each submit ---
+    // Value is the same on every reuse (&task_payloads[slot] / &task_descriptors[slot]),
+    // but written here per-submit instead of in an O(window_size) init loop —
+    // these are the only "scale-dependent" pointers in this struct, so moving
+    // them out of init makes startup cost independent of task_window_size.
     PTO2TaskPayload *payload;
     PTO2TaskDescriptor *task;
 
@@ -345,14 +349,21 @@ struct alignas(64) PTO2TaskSlotState {
     int16_t next_block_idx{0};                   // Next block to dispatch (scheduler state)
 
     /**
-     * One-time binding of slot-invariant fields.
-     * Called during RingSchedState::init() — these values are determined by
-     * the slot's position in the ring and never change across reuses.
+     * Bind the slot-invariant ring id. Called once per slot during
+     * RingSchedState::init(); ring_id never changes across reuses.
      */
-    void bind(PTO2TaskPayload *p, PTO2TaskDescriptor *t, uint8_t rid) {
+    void bind_ring(uint8_t rid) { ring_id = rid; }
+
+    /**
+     * Re-bind the per-slot payload/task pointers. Called by
+     * orch::prepare_task on every submit. Value is constant for a given
+     * slot, but we pay the cheap re-write each submit (both fields land on
+     * the same 64B slot_state cache line that prepare_task is already
+     * dirtying) to avoid the init-time per-slot loop.
+     */
+    void bind_buffers(PTO2TaskPayload *p, PTO2TaskDescriptor *t) {
         payload = p;
         task = t;
-        ring_id = rid;
     }
 
     /**
diff --git a/src/a5/runtime/tensormap_and_ringbuffer/runtime/scheduler/pto_scheduler.cpp b/src/a5/runtime/tensormap_and_ringbuffer/runtime/scheduler/pto_scheduler.cpp
index 281a714fb..f497b8fd8 100644
--- a/src/a5/runtime/tensormap_and_ringbuffer/runtime/scheduler/pto_scheduler.cpp
+++ b/src/a5/runtime/tensormap_and_ringbuffer/runtime/scheduler/pto_scheduler.cpp
@@ -102,12 +102,13 @@ bool PTO2SchedulerState::RingSchedState::init(PTO2SharedMemoryHeader *sm_header,
     advance_lock.store(0, std::memory_order_relaxed);
 
     // Initialize all per-task slot state fields.
-    // bind() sets payload, task, ring_id — immutable after init, bound once
-    // to their fixed shared-memory addresses.
+    // bind_ring() sets the ring_id only — payload/task pointers are re-bound
+    // by orch::prepare_task on every submit (their value is per-slot constant
+    // but pinning them here would cost O(task_window_size) at startup).
     // reset_for_reuse() sets dynamic fields to reclaim defaults (fanout_count=1,
     // rest zero) so the first submit needs no reset.
     for (uint64_t i = 0; i < ring->task_window_size; i++) {
-        ring->slot_states[i].bind(&ring->task_payloads[i], &ring->task_descriptors[i], static_cast<uint8_t>(ring_id));
+        ring->slot_states[i].bind_ring(static_cast<uint8_t>(ring_id));
         ring->slot_states[i].reset_for_reuse();
         ring->slot_states[i].fanin_count = 0;
         ring->slot_states[i].active_mask = ActiveMask{};

From 23139eb55d40c06dc917327ad02718022a2dcf23 Mon Sep 17 00:00:00 2001
From: poursoul <poursoul@126.com>
Date: Fri, 22 May 2026 17:53:47 +0800
Subject: [PATCH 2/7] Refactor: host-build trb runtime arena, AICPU does only
 wire + SM reset
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Previously the AICPU rebuilt the entire trb runtime arena (PTO2Runtime,
orchestrator/scheduler/tensor_map sub-regions, sm_handle wrapper,
mailbox) on every device boot via runtime_create_from_sm. This commit
moves layout + data init onto the host so the AICPU only does a cheap
arena-internal pointer wire pass plus the SM reset that can't run
off-device. Multi-run boots reuse the pooled prebuilt image with a
single rtMemcpy.

Mechanism
- DeviceArena::attach() wraps an externally-owned buffer; re-attach is
  permitted so each AICPU boot can reuse the pooled image.
- runtime_create_from_sm is split into reserve_layout /
  init_data_from_layout / wire_arena_pointers / finalize_after_wire.
  orchestrator / scheduler / tensor_map / ready_queue / spsc each gain a
  matching data+wire pair; finalize_after_wire stays AICPU-only since it
  binds s_runtime_ops.
- pto2_sm_layout helper computes SM field device addresses by pure
  offset arithmetic so host init never dereferences SM.
- Per-slot SM-side reset (bind_ring + reset_for_reuse + active_mask)
  moved from RingSchedState::init into
  PTO2SharedMemoryHandle::init_header_per_ring so the AICPU still owns
  it after the split.
- runtime/shared/pto_runtime2_init.cpp — new file holding the host-able
  pieces lifted out of pto_runtime2.cpp / pto_orchestrator.cpp /
  pto_scheduler.cpp. AICPU-only ops table / submit_task / dispatch
  stay in place.

Host wiring (runtime_maker.cpp)
- DeviceRunner::setup_static_arena gains a third region size,
  runtime_arena_size (hbg passes 0). The prebuilt image lives in the same
  pooled backing allocation as gm_heap and SM, so each worker still needs
  only one rtMalloc for its whole lifetime.
- bind_prepared_to_runtime_impl reserves layout on a host arena, sizes
  the pooled regions, runs init_data + wire, stashes prebuilt metadata
  into the rt image, rtMemcpys to device, and records base/offset on
  Runtime so the AICPU boot can find it.

AICPU boot (aicpu_executor.cpp)
- Attach the runtime arena to the pooled buffer, take rt from
  base+off_runtime, wire the arena-internal pointers, call
  sm_handle->init (SM reset, including the per-slot fields above), reset
  the mailbox, then run finalize_after_wire (ops table + cluster/aiv
  counts).

Tests
- cpput: 25/25 pass. ready_queue / spsc_queue / scheduler_state /
  task_state / wiring / tensormap UTs migrated to the data+wire API.
  task_allocator.init grew an optional initial_local_task_id (default
  0) so UTs can still exercise task_id near INT32_MAX without reading
  the SM.
- a2a3sim trb: standalone (dynamic_register variants, L3
  group/dependency) + L2 tensormap_and_ringbuffer 29 tests all pass.
- a2a3sim host_build_graph: 9/9 pass (verifies the shared HostApi
  changes don't break hbg).
- a2a3 hardware: tests/st/.../paged_attention_unroll PASS on device 9
  (--build with pto-isa commit pinned to CI).
---
 .../platform/onboard/host/device_runner.cpp   |  30 +-
 .../platform/onboard/host/device_runner.h     |  32 +-
 .../onboard/host/pto_runtime_c_api.cpp        |  13 +-
 src/a2a3/platform/sim/host/device_runner.cpp  |  30 +-
 src/a2a3/platform/sim/host/device_runner.h    |  25 +-
 .../platform/sim/host/pto_runtime_c_api.cpp   |  13 +-
 .../host_build_graph/runtime/runtime.h        |   9 +-
 .../aicpu/aicpu_executor.cpp                  |  57 ++-
 .../host/dep_gen_replay.cpp                   |   9 +-
 .../host/runtime_maker.cpp                    |  66 +++-
 .../runtime/pto_orchestrator.cpp              |  83 -----
 .../runtime/pto_orchestrator.h                |  19 +-
 .../runtime/pto_ring_buffer.h                 |  16 +-
 .../runtime/pto_runtime2.cpp                  |  84 +----
 .../runtime/pto_runtime2.h                    | 110 ++++--
 .../runtime/pto_shared_memory.h               |  61 +++
 .../runtime/pto_tensormap.h                   |  19 +-
 .../runtime/runtime.h                         |  39 +-
 .../runtime/scheduler/pto_scheduler.cpp       | 147 --------
 .../runtime/scheduler/pto_scheduler.h         |  53 ++-
 .../runtime/shared/pto_runtime2_init.cpp      | 351 ++++++++++++++++++
 .../runtime/shared/pto_shared_memory.cpp      |  17 +
 .../runtime/shared/pto_tensormap.cpp          |  44 ++-
 .../runtime/shared/runtime.cpp                |   9 +
 src/common/device_comm/device_arena.h         |  44 ++-
 tests/ut/cpp/CMakeLists.txt                   |   1 +
 tests/ut/cpp/a2a3/test_ready_queue.cpp        |   9 +-
 tests/ut/cpp/a2a3/test_scheduler_state.cpp    |   3 +-
 tests/ut/cpp/a2a3/test_spsc_queue.cpp         |  13 +-
 tests/ut/cpp/a2a3/test_task_allocator.cpp     |   5 +-
 tests/ut/cpp/a2a3/test_task_state.cpp         |   3 +-
 tests/ut/cpp/a2a3/test_tensormap.cpp          |   6 +-
 tests/ut/cpp/a2a3/test_wiring.cpp             |   3 +-
 33 files changed, 979 insertions(+), 444 deletions(-)
 create mode 100644 src/a2a3/runtime/tensormap_and_ringbuffer/runtime/shared/pto_runtime2_init.cpp

diff --git a/src/a2a3/platform/onboard/host/device_runner.cpp b/src/a2a3/platform/onboard/host/device_runner.cpp
index cf6ddea88..e3ba6cd10 100644
--- a/src/a2a3/platform/onboard/host/device_runner.cpp
+++ b/src/a2a3/platform/onboard/host/device_runner.cpp
@@ -249,31 +249,41 @@ int AicpuSoInfo::finalize() {
 
 DeviceRunner::~DeviceRunner() { finalize(); }
 
-int DeviceRunner::setup_static_arena(size_t gm_heap_size, size_t gm_sm_size) {
+int DeviceRunner::setup_static_arena(size_t gm_heap_size, size_t gm_sm_size, size_t runtime_arena_size) {
     if (static_arena_.is_committed()) {
         // Idempotent for the production case (sizes do not change across a
         // worker's lifetime). If a caller asks for a larger layout, redo it.
-        if (gm_heap_size <= cached_gm_heap_size_ && gm_sm_size <= cached_gm_sm_size_) return 0;
+        if (gm_heap_size <= cached_gm_heap_size_ && gm_sm_size <= cached_gm_sm_size_ &&
+            runtime_arena_size <= cached_runtime_arena_size_) {
+            return 0;
+        }
         static_arena_.release();
         gm_heap_region_off_ = SIZE_MAX;
         gm_sm_region_off_ = SIZE_MAX;
+        runtime_arena_region_off_ = SIZE_MAX;
         cached_gm_heap_size_ = 0;
         cached_gm_sm_size_ = 0;
+        cached_runtime_arena_size_ = 0;
     }
     gm_heap_region_off_ = static_arena_.reserve(gm_heap_size, DeviceArena::kDefaultBaseAlign);
     gm_sm_region_off_ = static_arena_.reserve(gm_sm_size, DeviceArena::kDefaultBaseAlign);
+    if (runtime_arena_size > 0) {
+        runtime_arena_region_off_ = static_arena_.reserve(runtime_arena_size, DeviceArena::kDefaultBaseAlign);
+    }
     if (static_arena_.commit(DeviceArena::kDefaultBaseAlign) == nullptr) {
-        // Roll back the two reserves: commit() failure leaves committed_=false,
+        // Roll back the reserves: commit() failure leaves committed_=false,
         // so the next entry would skip the release branch and stack new
         // reserves on top of the stale cursor. release() is idempotent on a
         // never-committed arena (just zeroes cursor_ / region_count_).
         static_arena_.release();
         gm_heap_region_off_ = SIZE_MAX;
         gm_sm_region_off_ = SIZE_MAX;
+        runtime_arena_region_off_ = SIZE_MAX;
         return -1;
     }
     cached_gm_heap_size_ = gm_heap_size;
     cached_gm_sm_size_ = gm_sm_size;
+    cached_runtime_arena_size_ = runtime_arena_size;
     return 0;
 }
 
@@ -287,6 +297,11 @@ void *DeviceRunner::acquire_pooled_gm_sm() {
     return static_arena_.region_ptr(gm_sm_region_off_);
 }
 
+void *DeviceRunner::acquire_pooled_runtime_arena() {
+    if (!static_arena_.is_committed()) return nullptr;
+    return static_arena_.region_ptr(runtime_arena_region_off_);
+}
+
 std::thread DeviceRunner::create_thread(std::function<void()> fn) {
     int dev_id = device_id_;
     return std::thread([dev_id, fn = std::move(fn)]() {
@@ -1222,14 +1237,17 @@ int DeviceRunner::finalize() {
     // perf_cleanup guard; this is the backstop for the no-run-since-init case.
     finalize_collectors();
 
-    // Release per-Worker static arena (GM heap + PTO2 SM in a single backing
-    // device allocation). Must precede mem_alloc_.finalize() so the arena
-    // frees through the still-live allocator, not after it.
+    // Release per-Worker static arena (GM heap + PTO2 SM + optional trb
+    // prebuilt runtime arena in a single backing device allocation). Must
+    // precede mem_alloc_.finalize() so the arena frees through the still-live
+    // allocator, not after it.
     static_arena_.release();
     gm_heap_region_off_ = SIZE_MAX;
     gm_sm_region_off_ = SIZE_MAX;
+    runtime_arena_region_off_ = SIZE_MAX;
     cached_gm_heap_size_ = 0;
     cached_gm_sm_size_ = 0;
+    cached_runtime_arena_size_ = 0;
 
     // Free all remaining allocations (including handshake buffer and binGmAddr)
     mem_alloc_.finalize();
diff --git a/src/a2a3/platform/onboard/host/device_runner.h b/src/a2a3/platform/onboard/host/device_runner.h
index 53fb6555f..4d9819f21 100644
--- a/src/a2a3/platform/onboard/host/device_runner.h
+++ b/src/a2a3/platform/onboard/host/device_runner.h
@@ -190,20 +190,30 @@ class DeviceRunner {
 
     /**
      * Lay out and commit the per-Worker static device arena that backs the
-     * PTO2 GM heap and PTO2 shared memory in a single underlying allocation.
-     * Must be called before acquire_pooled_gm_heap / acquire_pooled_gm_sm.
-     * Idempotent on identical sizes. Returns 0 on success, -1 on failure.
+     * PTO2 GM heap, the PTO2 shared memory, and the trb prebuilt runtime
+     * arena in a single underlying allocation. Must be called before any
+     * acquire_pooled_*. Idempotent on identical sizes. `runtime_arena_size`
+     * is 0 for the hbg path (no prebuilt runtime arena). Returns 0 on
+     * success, -1 on failure.
      */
-    int setup_static_arena(size_t gm_heap_size, size_t gm_sm_size);
+    int setup_static_arena(size_t gm_heap_size, size_t gm_sm_size, size_t runtime_arena_size);
 
     /**
-     * Return the pooled GM heap / PTO2 SM pointer. setup_static_arena must
-     * have been called earlier in this Worker; otherwise these return
-     * nullptr. Both pointers are stable for the lifetime of the Worker and
-     * the single underlying device buffer is released in `finalize()`.
+     * Return the pooled GM heap / PTO2 SM / runtime arena pointer.
+     * setup_static_arena must have been called earlier in this Worker;
+     * otherwise these return nullptr. All pointers are stable for the
+     * Worker's lifetime; the single underlying device buffer is released in
+     * `finalize()`.
+     *
+     * acquire_pooled_runtime_arena() is trb-only — the runtime arena region
+     * is only reserved when setup_static_arena was called with
+     * runtime_arena_size > 0. hbg's runtime_maker never calls this; doing so
+     * after setup_static_arena(...,0) returns an unreserved-offset region_ptr
+     * (undefined). Keep the call site discipline at the runtime_maker layer.
      */
     void *acquire_pooled_gm_heap();
     void *acquire_pooled_gm_sm();
+    void *acquire_pooled_runtime_arena();
 
     /**
      * Create a thread bound to this device.
@@ -614,10 +624,14 @@ class DeviceRunner {
     DeviceArena static_arena_;
     size_t gm_heap_region_off_{SIZE_MAX};
     size_t gm_sm_region_off_{SIZE_MAX};
+    // SIZE_MAX (= "not provisioned") when the caller passed runtime_arena_size
+    // == 0 (hbg path); a real offset for trb.
+    size_t runtime_arena_region_off_{SIZE_MAX};
     // Cached sizes for setup_static_arena's "fits" check — avoids calling
-    // region_size() on the arena's public API for the two regions we own.
+    // region_size() on the arena's public API for the regions we own.
     size_t cached_gm_heap_size_{0};
     size_t cached_gm_sm_size_{0};
+    size_t cached_runtime_arena_size_{0};
 
     // Device resources
     rtStream_t stream_aicpu_{nullptr};
diff --git a/src/a2a3/platform/onboard/host/pto_runtime_c_api.cpp b/src/a2a3/platform/onboard/host/pto_runtime_c_api.cpp
index 744b7291c..29c14d862 100644
--- a/src/a2a3/platform/onboard/host/pto_runtime_c_api.cpp
+++ b/src/a2a3/platform/onboard/host/pto_runtime_c_api.cpp
@@ -108,9 +108,9 @@ static uint64_t upload_chip_callable_buffer_wrapper(const void *callable) {
     }
 }
 
-static int setup_static_arena_wrapper(size_t gm_heap_size, size_t gm_sm_size) {
+static int setup_static_arena_wrapper(size_t gm_heap_size, size_t gm_sm_size, size_t runtime_arena_size) {
     try {
-        return current_runner()->setup_static_arena(gm_heap_size, gm_sm_size);
+        return current_runner()->setup_static_arena(gm_heap_size, gm_sm_size, runtime_arena_size);
     } catch (...) {
         return -1;
     }
@@ -132,6 +132,14 @@ static void *acquire_pooled_gm_sm_wrapper() {
     }
 }
 
+static void *acquire_pooled_runtime_arena_wrapper() {
+    try {
+        return current_runner()->acquire_pooled_runtime_arena();
+    } catch (...) {
+        return nullptr;
+    }
+}
+
 /* ===========================================================================
  * Public C API (resolved by ChipWorker via dlsym)
  * =========================================================================== */
@@ -370,6 +378,7 @@ int run_prepared(
         r->host_api.setup_static_arena = setup_static_arena_wrapper;
         r->host_api.acquire_pooled_gm_heap = acquire_pooled_gm_heap_wrapper;
         r->host_api.acquire_pooled_gm_sm = acquire_pooled_gm_sm_wrapper;
+        r->host_api.acquire_pooled_runtime_arena = acquire_pooled_runtime_arena_wrapper;
         r->host_api.upload_chip_callable_buffer = upload_chip_callable_buffer_wrapper;
 
         // Restore kernel addrs + orch symbol names + active_callable_id; the
diff --git a/src/a2a3/platform/sim/host/device_runner.cpp b/src/a2a3/platform/sim/host/device_runner.cpp
index 1635f3a7a..53d967228 100644
--- a/src/a2a3/platform/sim/host/device_runner.cpp
+++ b/src/a2a3/platform/sim/host/device_runner.cpp
@@ -122,29 +122,39 @@ bool create_temp_so_file(const std::string &path_template, const uint8_t *data,
 
 DeviceRunner::~DeviceRunner() { finalize(); }
 
-int DeviceRunner::setup_static_arena(size_t gm_heap_size, size_t gm_sm_size) {
+int DeviceRunner::setup_static_arena(size_t gm_heap_size, size_t gm_sm_size, size_t runtime_arena_size) {
     if (static_arena_.is_committed()) {
-        if (gm_heap_size <= cached_gm_heap_size_ && gm_sm_size <= cached_gm_sm_size_) return 0;
+        if (gm_heap_size <= cached_gm_heap_size_ && gm_sm_size <= cached_gm_sm_size_ &&
+            runtime_arena_size <= cached_runtime_arena_size_) {
+            return 0;
+        }
         static_arena_.release();
         gm_heap_region_off_ = SIZE_MAX;
         gm_sm_region_off_ = SIZE_MAX;
+        runtime_arena_region_off_ = SIZE_MAX;
         cached_gm_heap_size_ = 0;
         cached_gm_sm_size_ = 0;
+        cached_runtime_arena_size_ = 0;
     }
     gm_heap_region_off_ = static_arena_.reserve(gm_heap_size, DeviceArena::kDefaultBaseAlign);
     gm_sm_region_off_ = static_arena_.reserve(gm_sm_size, DeviceArena::kDefaultBaseAlign);
+    if (runtime_arena_size > 0) {
+        runtime_arena_region_off_ = static_arena_.reserve(runtime_arena_size, DeviceArena::kDefaultBaseAlign);
+    }
     if (static_arena_.commit(DeviceArena::kDefaultBaseAlign) == nullptr) {
-        // Roll back the two reserves: commit() failure leaves committed_=false,
+        // Roll back the reserves: commit() failure leaves committed_=false,
         // so the next entry would skip the release branch and stack new
         // reserves on top of the stale cursor. release() is idempotent on a
         // never-committed arena (just zeroes cursor_ / region_count_).
         static_arena_.release();
         gm_heap_region_off_ = SIZE_MAX;
         gm_sm_region_off_ = SIZE_MAX;
+        runtime_arena_region_off_ = SIZE_MAX;
         return -1;
     }
     cached_gm_heap_size_ = gm_heap_size;
     cached_gm_sm_size_ = gm_sm_size;
+    cached_runtime_arena_size_ = runtime_arena_size;
     return 0;
 }
 
@@ -158,6 +168,11 @@ void *DeviceRunner::acquire_pooled_gm_sm() {
     return static_arena_.region_ptr(gm_sm_region_off_);
 }
 
+void *DeviceRunner::acquire_pooled_runtime_arena() {
+    if (!static_arena_.is_committed()) return nullptr;
+    return static_arena_.region_ptr(runtime_arena_region_off_);
+}
+
 std::thread DeviceRunner::create_thread(std::function<void()> fn) {
     int dev_id = device_id_;
     return std::thread([dev_id, fn = std::move(fn)]() {
@@ -1032,14 +1047,17 @@ int DeviceRunner::finalize() {
     // Close executor .so files (typically already closed by run(), this is a safety net)
     unload_executor_binaries();
 
-    // Release per-Worker static arena (GM heap + PTO2 SM in a single backing
-    // device allocation). Must precede mem_alloc_.finalize() so the arena
-    // frees through the still-live allocator, not after it.
+    // Release per-Worker static arena (GM heap + PTO2 SM + optional trb
+    // prebuilt runtime arena in a single backing device allocation). Must
+    // precede mem_alloc_.finalize() so the arena frees through the still-live
+    // allocator, not after it.
     static_arena_.release();
     gm_heap_region_off_ = SIZE_MAX;
     gm_sm_region_off_ = SIZE_MAX;
+    runtime_arena_region_off_ = SIZE_MAX;
     cached_gm_heap_size_ = 0;
     cached_gm_sm_size_ = 0;
+    cached_runtime_arena_size_ = 0;
 
     // Free all remaining allocations
     mem_alloc_.finalize();
diff --git a/src/a2a3/platform/sim/host/device_runner.h b/src/a2a3/platform/sim/host/device_runner.h
index 73b3dfea2..60f1bfdc9 100644
--- a/src/a2a3/platform/sim/host/device_runner.h
+++ b/src/a2a3/platform/sim/host/device_runner.h
@@ -80,19 +80,26 @@ class DeviceRunner {
 
     /**
      * Lay out and commit the per-Worker static device arena that backs the
-     * PTO2 GM heap and PTO2 shared memory in a single underlying allocation.
-     * Must be called before acquire_pooled_gm_heap / acquire_pooled_gm_sm.
-     * Idempotent on identical sizes. Returns 0 on success, -1 on failure.
+     * PTO2 GM heap, the PTO2 shared memory, and the trb prebuilt runtime
+     * arena in a single underlying allocation. Must be called before any
+     * acquire_pooled_*. `runtime_arena_size` is 0 for hbg. Idempotent on
+     * identical sizes. Returns 0 on success, -1 on failure.
      */
-    int setup_static_arena(size_t gm_heap_size, size_t gm_sm_size);
+    int setup_static_arena(size_t gm_heap_size, size_t gm_sm_size, size_t runtime_arena_size);
 
     /**
-     * Return the pooled GM heap / PTO2 SM pointer. setup_static_arena must
-     * have been called earlier in this Worker; otherwise these return
-     * nullptr. Pointers are stable for the lifetime of the Worker.
+     * Return the pooled GM heap / PTO2 SM / runtime arena pointer.
+     * setup_static_arena must have been called earlier in this Worker.
+     *
+     * acquire_pooled_runtime_arena() is trb-only — the runtime arena region
+     * is only reserved when setup_static_arena was called with
+     * runtime_arena_size > 0. hbg's runtime_maker never calls this; doing so
+     * after setup_static_arena(...,0) returns an unreserved-offset region_ptr
+     * (undefined). Keep the call site discipline at the runtime_maker layer.
      */
     void *acquire_pooled_gm_heap();
     void *acquire_pooled_gm_sm();
+    void *acquire_pooled_runtime_arena();
 
     /**
      * Create a thread bound to this device.
@@ -292,10 +299,12 @@ class DeviceRunner {
     DeviceArena static_arena_;
     size_t gm_heap_region_off_{SIZE_MAX};
     size_t gm_sm_region_off_{SIZE_MAX};
+    size_t runtime_arena_region_off_{SIZE_MAX};
     // Cached sizes for setup_static_arena's "fits" check — avoids calling
-    // region_size() on the arena's public API for the two regions we own.
+    // region_size() on the arena's public API for the regions we own.
     size_t cached_gm_heap_size_{0};
     size_t cached_gm_sm_size_{0};
+    size_t cached_runtime_arena_size_{0};
 
     // Simulation state (no actual device resources)
     KernelArgs kernel_args_;
diff --git a/src/a2a3/platform/sim/host/pto_runtime_c_api.cpp b/src/a2a3/platform/sim/host/pto_runtime_c_api.cpp
index 4ad438a9c..fca663610 100644
--- a/src/a2a3/platform/sim/host/pto_runtime_c_api.cpp
+++ b/src/a2a3/platform/sim/host/pto_runtime_c_api.cpp
@@ -103,9 +103,9 @@ static uint64_t upload_chip_callable_buffer_wrapper(const void *callable) {
     }
 }
 
-static int setup_static_arena_wrapper(size_t gm_heap_size, size_t gm_sm_size) {
+static int setup_static_arena_wrapper(size_t gm_heap_size, size_t gm_sm_size, size_t runtime_arena_size) {
     try {
-        return current_runner()->setup_static_arena(gm_heap_size, gm_sm_size);
+        return current_runner()->setup_static_arena(gm_heap_size, gm_sm_size, runtime_arena_size);
     } catch (...) {
         return -1;
     }
@@ -127,6 +127,14 @@ static void *acquire_pooled_gm_sm_wrapper() {
     }
 }
 
+static void *acquire_pooled_runtime_arena_wrapper() {
+    try {
+        return current_runner()->acquire_pooled_runtime_arena();
+    } catch (...) {
+        return nullptr;
+    }
+}
+
 /* ===========================================================================
  * Public C API (resolved by ChipWorker via dlsym)
  * =========================================================================== */
@@ -333,6 +341,7 @@ int run_prepared(
         r->host_api.setup_static_arena = setup_static_arena_wrapper;
         r->host_api.acquire_pooled_gm_heap = acquire_pooled_gm_heap_wrapper;
         r->host_api.acquire_pooled_gm_sm = acquire_pooled_gm_sm_wrapper;
+        r->host_api.acquire_pooled_runtime_arena = acquire_pooled_runtime_arena_wrapper;
         r->host_api.upload_chip_callable_buffer = upload_chip_callable_buffer_wrapper;
 
         auto bind_result = runner->bind_prepared_callable_to_runtime(*r, callable_id);
diff --git a/src/a2a3/runtime/host_build_graph/runtime/runtime.h b/src/a2a3/runtime/host_build_graph/runtime/runtime.h
index 41845bdf0..ccdc05ce0 100644
--- a/src/a2a3/runtime/host_build_graph/runtime/runtime.h
+++ b/src/a2a3/runtime/host_build_graph/runtime/runtime.h
@@ -140,9 +140,16 @@ struct HostApi {
     // pto_runtime_c_api.cpp can populate the same HostApi struct regardless of
     // which runtime variant it is built against. Unset for this variant; do
     // not call.
-    int (*setup_static_arena)(size_t gm_heap_size, size_t gm_sm_size);
+    //
+    // setup_static_arena lays out three regions in one backing allocation:
+    // the GM heap, the PTO2 shared memory, and the trb prebuilt runtime
+    // arena. hbg has no prebuilt runtime arena, so any hbg-side caller
+    // passes runtime_arena_size == 0, and acquire_pooled_runtime_arena is
+    // trb-only: never call it when no runtime-arena region was reserved.
+    int (*setup_static_arena)(size_t gm_heap_size, size_t gm_sm_size, size_t runtime_arena_size);
     void *(*acquire_pooled_gm_heap)();
     void *(*acquire_pooled_gm_sm)();
+    void *(*acquire_pooled_runtime_arena)();
     // Single-shot upload of the entire ChipCallable buffer. `callable` is a
     // `const ChipCallable *` (declared void* to avoid pulling task_interface
     // headers into runtime.h). DeviceRunner walks child_offsets_ to compute
diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp b/src/a2a3/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp
index f8e35917b..5c31c5b9a 100644
--- a/src/a2a3/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp
+++ b/src/a2a3/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp
@@ -467,29 +467,60 @@ int32_t AicpuExecutor::run(Runtime *runtime) {
                 static_cast<uint64_t>(task_window_size), static_cast<uint64_t>(heap_size), dep_pool_capacity
             );
 
-            void *sm_ptr = runtime->get_gm_sm_ptr();
-            void *gm_heap = runtime->get_gm_heap_ptr();
+            // gm_heap pointer / dep_pool_capacity are encoded into the prebuilt
+            // runtime arena image at host build time, so we no longer fetch
+            // them here. They remain on the host Runtime instance and on the
+            // PTO2Runtime header for diagnostic purposes only.
+            (void)dep_pool_capacity;
 
+            void *sm_ptr = runtime->get_gm_sm_ptr();
             uint64_t sm_size = PTO2SharedMemoryHandle::calculate_size(task_window_size);
-            rt = runtime_create_from_sm(
-                PTO2_MODE_EXECUTE, sm_ptr, sm_size, task_window_size, gm_heap, heap_size, runtime_arena_,
-                dep_pool_capacity
-            );
-            if (!rt) {
-                LOG_ERROR("Thread %d: Failed to create PTO2Runtime", thread_idx);
-                // Unblock scheduler threads before returning so they don't spin forever.
+
+            // Prebuilt-arena fast path. Host has pre-populated the entire
+            // runtime arena (PTO2Runtime + orchestrator/scheduler/tensor_map
+            // sub-regions + sm_handle wrapper + mailbox) and uploaded it via
+            // rtMemcpy into the pooled runtime_arena buffer. We attach to it,
+            // wire arena-internal pointers to their device addresses, reset
+            // the SM, and finalize the few device-only fields the host could
+            // not know at image-build time.
+            void *prebuilt_arena = runtime->get_prebuilt_arena_base();
+            size_t off_runtime = runtime->get_prebuilt_runtime_offset();
+            if (prebuilt_arena == nullptr) {
+                LOG_ERROR("Thread %d: prebuilt_arena_base is null", thread_idx);
+                runtime_init_ready_.store(true, std::memory_order_release);
+                return -1;
+            }
+            runtime_arena_.attach(prebuilt_arena, DeviceArena::kDefaultBaseAlign);
+            rt = reinterpret_cast<PTO2Runtime *>(static_cast<char *>(prebuilt_arena) + off_runtime);
+
+            // Wire every arena-internal pointer field (host wrote host-mirror
+            // addresses; we overwrite them with device addresses).
+            runtime_wire_arena_pointers(runtime_arena_, rt->prebuilt_layout, rt);
+
+            // Reset SM state. setup_pointers + init_header_per_ring restore
+            // ring flow-control counters, layout metadata, error flags, and
+            // the per-slot ring->slot_states[] (bind_ring + reset_for_reuse +
+            // fanin_count/active_mask zero — previously done inside
+            // RingSchedState::init).
+            memset(rt->sm_handle, 0, sizeof(*rt->sm_handle));
+            if (!rt->sm_handle->init(sm_ptr, sm_size, task_window_size, heap_size)) {
+                LOG_ERROR("Thread %d: sm_handle->init failed", thread_idx);
                 runtime_init_ready_.store(true, std::memory_order_release);
                 return -1;
             }
 
+            // AICore completion mailbox lives in the arena; reset it each
+            // boot so stale completion notifications from a previous run do
+            // not leak.
+            memset(rt->aicore_mailbox, 0, sizeof(*rt->aicore_mailbox));
+
+            // Fill ops / core counts (host can't resolve s_runtime_ops's
+            // device address nor know the SchedulerContext's core fan-out).
+            runtime_finalize_after_wire(rt, sched_ctx_.aic_count(), sched_ctx_.aiv_count());
 #if PTO2_PROFILING
             rt->orchestrator.l2_perf_level = get_l2_perf_level();
 #endif
 
-            // Total core counts = aic_count_ / aiv_count_ (set once at runtime init).
-            rt->orchestrator.total_cluster_count = sched_ctx_.aic_count();
-            rt->orchestrator.total_aiv_count = sched_ctx_.aiv_count();
-
             // With multi-ring, slot_states are per-ring inside the scheduler.
             runtime->set_slot_states_ptr(nullptr);
 
diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/host/dep_gen_replay.cpp b/src/a2a3/runtime/tensormap_and_ringbuffer/host/dep_gen_replay.cpp
index 027805918..506ba7cf6 100644
--- a/src/a2a3/runtime/tensormap_and_ringbuffer/host/dep_gen_replay.cpp
+++ b/src/a2a3/runtime/tensormap_and_ringbuffer/host/dep_gen_replay.cpp
@@ -487,11 +487,16 @@ dep_gen_replay_emit_deps_json(const DepGenRecord *records, size_t num_records, c
         PTO2TensorMap::reserve_layout(replay_arena, PTO2_TENSORMAP_NUM_BUCKETS, pool_size, task_window_sizes);
     auto annot_layout =
         PTO2TensorMap::reserve_layout(replay_arena, PTO2_TENSORMAP_NUM_BUCKETS, pool_size, task_window_sizes);
-    if (replay_arena.commit() == nullptr || !tm_oracle.init_from_layout(oracle_layout, replay_arena) ||
-        !tm_annot.init_from_layout(annot_layout, replay_arena)) {
+    if (replay_arena.commit() == nullptr || !tm_oracle.init_data_from_layout(oracle_layout, replay_arena) ||
+        !tm_annot.init_data_from_layout(annot_layout, replay_arena)) {
         LOG_ERROR("dep_gen replay: tensormap.init failed (buckets=%d, pool=%d)", PTO2_TENSORMAP_NUM_BUCKETS, pool_size);
         return -3;
     }
+    // Replay tensormaps live entirely on host, so the arena base wired in
+    // here is a host address. parent_orch is unused by the lookup/insert
+    // code paths exercised below, so passing nullptr for it is safe.
+    tm_oracle.wire_arena_pointers(oracle_layout, replay_arena, nullptr);
+    tm_annot.wire_arena_pointers(annot_layout, replay_arena, nullptr);
 
     // JSON output accumulators.
     std::vector<TaskTableEntry> task_table;
diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/host/runtime_maker.cpp b/src/a2a3/runtime/tensormap_and_ringbuffer/host/runtime_maker.cpp
index a75205196..3b278b2b4 100644
--- a/src/a2a3/runtime/tensormap_and_ringbuffer/host/runtime_maker.cpp
+++ b/src/a2a3/runtime/tensormap_and_ringbuffer/host/runtime_maker.cpp
@@ -36,11 +36,13 @@
 #include <cstring>
 
 #include "../common/pto_runtime_status.h"
+#include "../runtime/pto_runtime2.h"
 #include "../runtime/pto_shared_memory.h"
 #include "../runtime/runtime.h"
 #include "callable.h"
 #include "common/platform_config.h"
 #include "common/unified_log.h"
+#include "device_arena.h"
 #include "prepare_callable_common.h"
 
 // Helper: return current time in milliseconds
@@ -271,15 +273,27 @@ extern "C" int bind_prepared_to_runtime_impl(
     uint64_t eff_heap_size = runtime->heap_size ? runtime->heap_size : PTO2_HEAP_SIZE;
     uint64_t eff_task_window_size = runtime->task_window_size ? runtime->task_window_size : PTO2_TASK_WINDOW_SIZE;
 
-    // Lay out the per-Worker static device arena. GM heap (orchestrator output
-    // buffers, all rings combined) and PTO2 shared memory live in a single
-    // backing allocation; setup_static_arena reserves both regions and
-    // commits in one shot. Owned by DeviceRunner across runs — do NOT record
-    // in tensor_pairs_; the free is deferred to DeviceRunner::finalize().
+    // Lay out the per-Worker static device arena. GM heap, PTO2 shared memory,
+    // and the prebuilt runtime arena all live in a single backing allocation;
+    // setup_static_arena reserves the three regions and commits in one shot.
+    // Owned by DeviceRunner across runs — do NOT record in tensor_pairs_; the
+    // free is deferred to DeviceRunner::finalize(). The runtime-arena size is
+    // determined by replaying the reserve sequence on a host-side arena.
     uint64_t total_heap_size = eff_heap_size * PTO2_MAX_RING_DEPTH;
     uint64_t sm_size = PTO2SharedMemoryHandle::calculate_size(eff_task_window_size);
+    int32_t eff_dep_pool_capacity =
+        runtime->dep_pool_size ? static_cast<int32_t>(runtime->dep_pool_size) : PTO2_DEP_LIST_POOL_SIZE;
+
+    int64_t t_prebuilt_start = _now_ms();
+    DeviceArena host_arena;  // libc malloc backend by default
+    PTO2RuntimeArenaLayout layout = runtime_reserve_layout(host_arena, eff_task_window_size, eff_dep_pool_capacity);
+    if (host_arena.commit(DeviceArena::kDefaultBaseAlign) == nullptr) {
+        LOG_ERROR("Failed to commit host arena for prebuilt runtime image");
+        return -1;
+    }
+
     int64_t t_setup_start = _now_ms();
-    if (runtime->host_api.setup_static_arena(total_heap_size, sm_size) != 0) {
+    if (runtime->host_api.setup_static_arena(total_heap_size, sm_size, layout.arena_size) != 0) {
         LOG_ERROR("Failed to setup pooled static arena");
         return -1;
     }
@@ -303,9 +317,48 @@ extern "C" int bind_prepared_to_runtime_impl(
     }
     runtime->set_gm_sm_ptr(sm_ptr);
 
+    void *runtime_arena_dev = runtime->host_api.acquire_pooled_runtime_arena();
+    if (runtime_arena_dev == nullptr) {
+        LOG_ERROR("Failed to acquire pooled runtime arena");
+        return -1;
+    }
+
     // Set up device orchestration state
     runtime->set_orch_args(device_args);
 
+    // -------------------------------------------------------------------------
+    // Build the prebuilt runtime-arena image on host.
+    //
+    // We pre-compute every byte the AICPU would otherwise have to write into
+    // its runtime arena at boot: layout offsets, sub-structure init data, and
+    // pointers back to the SM / GM heap. Then we rtMemcpy the image into the
+    // pooled runtime-arena region that DeviceRunner keeps alive across runs.
+    // AICPU boot becomes attach + wire (cheap pointer fixup) + sm_handle->init
+    // (SM reset) + a handful of device-only field fixups.
+    // -------------------------------------------------------------------------
+    PTO2Runtime *rt =
+        runtime_init_data_from_layout(host_arena, layout, PTO2_MODE_EXECUTE, sm_ptr, sm_size, gm_heap, eff_heap_size);
+    if (rt == nullptr) {
+        LOG_ERROR("runtime_init_data_from_layout failed");
+        return -1;
+    }
+    runtime_wire_arena_pointers(host_arena, layout, rt);
+
+    // Stash the prebuilt metadata inside the PTO2Runtime image so the AICPU
+    // reads it straight from the pooled buffer after rtMemcpy. The host
+    // Runtime also carries the arena base and the runtime-header offset, so
+    // the AICPU can locate the PTO2Runtime before it reads the image itself.
+    rt->prebuilt_arena_base = runtime_arena_dev;
+    rt->prebuilt_layout = layout;
+
+    int rc_upload = runtime->host_api.copy_to_device(runtime_arena_dev, host_arena.base(), layout.arena_size);
+    if (rc_upload != 0) {
+        LOG_ERROR("Failed to rtMemcpy prebuilt runtime arena to device (rc=%d)", rc_upload);
+        return -1;
+    }
+    runtime->set_prebuilt_arena(runtime_arena_dev, layout.off_runtime);
+    int64_t t_prebuilt_end = _now_ms();
+
     LOG_INFO_V0("Device orchestration ready: %d tensors + %d scalars", tensor_count, scalar_count);
 
     int64_t t_total_end = _now_ms();
@@ -313,6 +366,7 @@ extern "C" int bind_prepared_to_runtime_impl(
     LOG_INFO_V0("TIMING: static_arena_setup = %" PRId64 "ms", t_setup_end - t_setup_start);
     LOG_INFO_V0("TIMING: gm_heap_acquire = %" PRId64 "ms", t_heap_end - t_heap_start);
     LOG_INFO_V0("TIMING: shared_mem_acquire = %" PRId64 "ms", t_sm_end - t_sm_start);
+    LOG_INFO_V0("TIMING: prebuilt_runtime_arena = %" PRId64 "ms", t_prebuilt_end - t_prebuilt_start);
     LOG_INFO_V0("TIMING: total_init_runtime_impl = %" PRId64 "ms", t_total_end - t_total_start);
 
     return 0;
diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_orchestrator.cpp b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_orchestrator.cpp
index fbc07f53f..f80c7a655 100644
--- a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_orchestrator.cpp
+++ b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_orchestrator.cpp
@@ -384,89 +384,6 @@ static bool prepare_task(
     return true;
 }
 
-// =============================================================================
-// Orchestrator Initialization
-// =============================================================================
-
-PTO2OrchestratorLayout PTO2OrchestratorState::reserve_layout(
-    DeviceArena &arena, const int32_t task_window_sizes[PTO2_MAX_RING_DEPTH], int32_t dep_pool_capacity
-) {
-    PTO2OrchestratorLayout layout{};
-    layout.dep_pool_capacity = dep_pool_capacity;
-    layout.scope_tasks_cap = PTO2_SCOPE_TASKS_CAP;
-    layout.scope_stack_capacity = PTO2_MAX_SCOPE_DEPTH;
-
-    for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) {
-        const size_t fanin_pool_bytes =
-            PTO2_ALIGN_UP(static_cast<size_t>(dep_pool_capacity) * sizeof(PTO2FaninSpillEntry), PTO2_ALIGN_SIZE);
-        layout.off_fanin_pool[r] = arena.reserve(fanin_pool_bytes, PTO2_ALIGN_SIZE);
-    }
-    layout.off_scope_tasks = arena.reserve(
-        static_cast<size_t>(layout.scope_tasks_cap) * sizeof(PTO2TaskSlotState *), alignof(PTO2TaskSlotState *)
-    );
-    layout.off_scope_begins =
-        arena.reserve(static_cast<size_t>(layout.scope_stack_capacity) * sizeof(int32_t), alignof(int32_t));
-    layout.tensor_map = PTO2TensorMap::reserve_layout_default(arena, task_window_sizes);
-    return layout;
-}
-
-bool PTO2OrchestratorState::init_from_layout(
-    const PTO2OrchestratorLayout &layout, DeviceArena &arena, PTO2SharedMemoryHeader *sm_header_arg, void *gm_heap,
-    uint64_t heap_size
-) {
-    auto *orch = this;
-    *orch = PTO2OrchestratorState{};
-
-    orch->sm_header = sm_header_arg;
-    orch->gm_heap_base = gm_heap;
-    orch->gm_heap_size = heap_size * PTO2_MAX_RING_DEPTH;
-    orch->fatal = false;
-
-    for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) {
-        void *ring_heap_base = reinterpret_cast<char *>(gm_heap) + r * heap_size;
-        auto &ring = sm_header_arg->rings[r];
-
-        orch->rings[r].task_allocator.init(
-            ring.task_descriptors, ring.task_window_size, &ring.fc.current_task_index, &ring.fc.last_task_alive,
-            ring_heap_base, heap_size, &sm_header_arg->orch_error_code
-        );
-
-        const size_t fanin_pool_bytes =
-            PTO2_ALIGN_UP(static_cast<size_t>(layout.dep_pool_capacity) * sizeof(PTO2FaninSpillEntry), PTO2_ALIGN_SIZE);
-        auto *fanin_entries = static_cast<PTO2FaninSpillEntry *>(arena.region_ptr(layout.off_fanin_pool[r]));
-        // aligned_zalloc-equivalent: pool relies on zeroed entries.
-        memset(fanin_entries, 0, fanin_pool_bytes);
-        orch->rings[r].fanin_pool.init(fanin_entries, layout.dep_pool_capacity, &sm_header_arg->orch_error_code);
-    }
-
-    if (!orch->tensor_map.init_from_layout(layout.tensor_map, arena)) {
-        return false;
-    }
-    orch->tensor_map.orch = orch;
-
-    orch->scope_tasks = static_cast<PTO2TaskSlotState **>(arena.region_ptr(layout.off_scope_tasks));
-    orch->scope_begins = static_cast<int32_t *>(arena.region_ptr(layout.off_scope_begins));
-    orch->scope_tasks_size = 0;
-    orch->scope_tasks_capacity = layout.scope_tasks_cap;
-    orch->scope_stack_top = -1;
-    orch->scope_stack_capacity = layout.scope_stack_capacity;
-    orch->manual_begin_depth = PTO2_MAX_SCOPE_DEPTH;
-
-    return true;
-}
-
-void PTO2OrchestratorState::destroy() {
-    auto *orch = this;
-    orch->tensor_map.destroy();
-    for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) {
-        orch->rings[r].fanin_pool.base = nullptr;
-    }
-    orch->scope_tasks = nullptr;
-    orch->scope_begins = nullptr;
-}
-
-void PTO2OrchestratorState::set_scheduler(PTO2SchedulerState *scheduler) { this->scheduler = scheduler; }
-
 // =============================================================================
 // Scope Management
 // =============================================================================
diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_orchestrator.h b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_orchestrator.h
index 37fd0dcac..6e67cb597 100644
--- a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_orchestrator.h
+++ b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_orchestrator.h
@@ -142,14 +142,21 @@ struct PTO2OrchestratorState {
         int32_t dep_pool_capacity = PTO2_DEP_LIST_POOL_SIZE
     );
 
-    // Phase 3: bind region pointers, wire per-ring task_allocator + fanin_pool
-    // and tensor_map. Arena must be committed; layout must come from
-    // reserve_layout() against the same arena.
-    bool init_from_layout(
-        const PTO2OrchestratorLayout &layout, DeviceArena &arena, PTO2SharedMemoryHeader *sm_header, void *gm_heap,
-        uint64_t heap_size
+    // Phase 3a: write everything *except* arena-internal pointer fields.
+    // sm_dev_base is the SM device address (only stored, never dereferenced);
+    // task_window_size feeds the per-ring SM address arithmetic. Safe to call
+    // on a host arena that holds the prebuilt image.
+    bool init_data_from_layout(
+        const PTO2OrchestratorLayout &layout, DeviceArena &arena, void *sm_dev_base, void *gm_heap, uint64_t heap_size,
+        uint64_t task_window_size
     );
 
+    // Phase 3b: write the arena-internal pointer fields (scope_tasks,
+    // scope_begins, rings[].fanin_pool.base, tensor_map.{buckets,entry_pool,
+    // free_entry_list,task_entry_heads,orch}, scheduler reference).
+    // Idempotent — host runs once on the image, AICPU runs once after attach.
+    void wire_arena_pointers(const PTO2OrchestratorLayout &layout, DeviceArena &arena, PTO2SchedulerState *scheduler);
+
     // Forget pointers; arena owns the backing buffers.
     void destroy();
     void set_scheduler(PTO2SchedulerState *scheduler);
diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_ring_buffer.h b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_ring_buffer.h
index 5a3e3d3d3..abd2a7510 100644
--- a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_ring_buffer.h
+++ b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_ring_buffer.h
@@ -68,10 +68,22 @@ class PTO2TaskAllocator {
 public:
     /**
      * Initialize the allocator with task ring and heap ring resources.
+     *
+     * All pointer arguments are device addresses (live in SM / GM heap); this
+     * function only stores them, no dereferences, so it is safe to invoke
+     * from host code that constructs a prebuilt arena image.
+     *
+     * Production callers leave `initial_local_task_id` at 0: the SM ring
+     * flow-control counters that current_index_ptr / last_alive_ptr point at
+     * start at zero (PTO2RingFlowControl::init() runs on the AICPU during SM
+     * reset), so we keep local_task_id_ aligned with that without reading the
+     * SM. Tests that drive SM state directly may pass a non-zero seed to
+     * exercise corner cases like task IDs near INT32_MAX.
      */
     void init(
         PTO2TaskDescriptor *descriptors, int32_t window_size, std::atomic<int32_t> *current_index_ptr,
-        std::atomic<int32_t> *last_alive_ptr, void *heap_base, uint64_t heap_size, std::atomic<int32_t> *error_code_ptr
+        std::atomic<int32_t> *last_alive_ptr, void *heap_base, uint64_t heap_size, std::atomic<int32_t> *error_code_ptr,
+        int32_t initial_local_task_id = 0
     ) {
         descriptors_ = descriptors;
         window_size_ = window_size;
@@ -81,7 +93,7 @@ class PTO2TaskAllocator {
         heap_base_ = heap_base;
         heap_size_ = heap_size;
         error_code_ptr_ = error_code_ptr;
-        local_task_id_ = current_index_ptr->load(std::memory_order_relaxed);
+        local_task_id_ = initial_local_task_id;
         heap_top_ = 0;
         heap_tail_ = 0;
         last_alive_seen_ = 0;
diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_runtime2.cpp b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_runtime2.cpp
index c801d5c15..f39bac365 100644
--- a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_runtime2.cpp
+++ b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_runtime2.cpp
@@ -249,81 +249,19 @@ static const PTO2RuntimeOps s_runtime_ops = {
 };
 
 // =============================================================================
-// Runtime Creation and Destruction
+// Runtime Lifecycle (AICPU-only fixup)
 // =============================================================================
-
-PTO2Runtime *runtime_create_from_sm(
-    PTO2RuntimeMode mode, void *sm_base, uint64_t sm_size, uint64_t task_window_size, void *gm_heap, uint64_t heap_size,
-    DeviceArena &arena, int32_t dep_pool_capacity
-) {
-    if (!sm_base || sm_size == 0) return nullptr;
-
-    // Phase 1: layout. Reserve every sub-region the runtime needs (including
-    // the SM handle wrapper itself) without touching memory yet.
-    int32_t task_window_sizes[PTO2_MAX_RING_DEPTH];
-    for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) {
-        task_window_sizes[r] = static_cast<int32_t>(task_window_size);
-    }
-    const size_t off_sm_handle = arena.reserve(sizeof(PTO2SharedMemoryHandle), alignof(PTO2SharedMemoryHandle));
-    PTO2OrchestratorLayout orch_layout =
-        PTO2OrchestratorState::reserve_layout(arena, task_window_sizes, dep_pool_capacity);
-    PTO2SchedulerLayout sched_layout = PTO2SchedulerState::reserve_layout(arena, dep_pool_capacity);
-    const size_t off_runtime = arena.reserve(sizeof(PTO2Runtime), PTO2_ALIGN_SIZE);
-    const size_t off_mailbox = arena.reserve(sizeof(AICoreCompletionMailbox), alignof(AICoreCompletionMailbox));
-
-    // Phase 2: single backing allocation.
-    if (arena.commit(DeviceArena::kDefaultBaseAlign) == nullptr) return nullptr;
-
-    // Phase 3: bind region pointers and initialize.
-    PTO2Runtime *rt = static_cast<PTO2Runtime *>(arena.region_ptr(off_runtime));
-    memset(rt, 0, sizeof(*rt));  // calloc-equivalent for the runtime header.
-
-    // Initialize the SM handle wrapper in-place on its arena region before
-    // anything that reads sm_handle->header (orchestrator / scheduler init).
-    rt->sm_handle = static_cast<PTO2SharedMemoryHandle *>(arena.region_ptr(off_sm_handle));
-    memset(rt->sm_handle, 0, sizeof(*rt->sm_handle));
-    if (!rt->sm_handle->init(sm_base, sm_size, task_window_size, heap_size)) {
-        arena.release();
-        return nullptr;
-    }
-
+//
+// Layout / init_data / wire / destroy live in
+// runtime/shared/pto_runtime2_init.cpp so the host build can pre-populate the
+// prebuilt arena image. The pieces below — wiring the ops table and the
+// SPMD core counts — depend on the device-side s_runtime_ops global and the
+// AICPU SchedulerContext respectively, so they remain in the AICPU build.
+
+void runtime_finalize_after_wire(PTO2Runtime *rt, int32_t aic_count, int32_t aiv_count) {
     rt->ops = &s_runtime_ops;
-    rt->mode = mode;
-    rt->gm_heap = gm_heap;
-    rt->gm_heap_size = heap_size > 0 ? heap_size * PTO2_MAX_RING_DEPTH : 0;
-    rt->gm_heap_owned = false;
-
-    if (!rt->orchestrator.init_from_layout(orch_layout, arena, rt->sm_handle->header, gm_heap, heap_size)) {
-        arena.release();
-        return nullptr;
-    }
-    if (!rt->scheduler.init_from_layout(sched_layout, arena, rt->sm_handle->header)) {
-        rt->orchestrator.destroy();
-        arena.release();
-        return nullptr;
-    }
-    rt->orchestrator.set_scheduler(&rt->scheduler);
-
-    rt->aicore_mailbox = static_cast<AICoreCompletionMailbox *>(arena.region_ptr(off_mailbox));
-    memset(rt->aicore_mailbox, 0, sizeof(*rt->aicore_mailbox));
-
-    return rt;
-}
-
-void runtime_destroy(PTO2Runtime *rt, DeviceArena &arena) {
-    if (!rt) {
-        arena.release();  // safe: idempotent if nothing's committed.
-        return;
-    }
-
-    rt->scheduler.destroy();
-    rt->orchestrator.destroy();
-    rt->aicore_mailbox = nullptr;  // arena-owned.
-    rt->sm_handle = nullptr;       // wrapper lives in arena; release() reclaims it.
-
-    // arena.release() frees the single backing buffer that holds rt,
-    // mailbox, sm_handle, orchestrator and scheduler sub-regions in one shot.
-    arena.release();
+    rt->orchestrator.total_cluster_count = aic_count;
+    rt->orchestrator.total_aiv_count = aiv_count;
 }
 
 void runtime_set_mode(PTO2Runtime *rt, PTO2RuntimeMode mode) {
diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_runtime2.h b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_runtime2.h
index 5709a85b7..169937f82 100644
--- a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_runtime2.h
+++ b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_runtime2.h
@@ -91,6 +91,30 @@ struct PTO2RuntimeOps {
     TaskOutputTensors (*submit_dummy_task)(PTO2Runtime *rt, const Arg &args);
 };
 
+/**
+ * Layout descriptor for the prebuilt runtime arena. Holds all sub-region
+ * offsets (orchestrator / scheduler / sm_handle wrapper / runtime header /
+ * AICore mailbox) plus the layout-defining capacities. Produced once on the
+ * host by runtime_reserve_layout(); consumed by runtime_init_data_from_layout
+ * and runtime_wire_arena_pointers.
+ */
+struct PTO2RuntimeArenaLayout {
+    size_t off_sm_handle{0};
+    PTO2OrchestratorLayout orch;
+    PTO2SchedulerLayout sched;
+    size_t off_runtime{0};
+    size_t off_mailbox{0};
+
+    // Cached parameters (re-used by init_data + wire stages).
+    uint64_t task_window_size{0};
+    uint64_t heap_size{0};
+    int32_t dep_pool_capacity{0};
+
+    // Total arena byte size post-commit. Used by host to size the prebuilt
+    // image buffer and as the rtMemcpy length.
+    size_t arena_size{0};
+};
+
 /**
  * PTO Runtime2 context
  *
@@ -118,6 +142,16 @@ struct PTO2Runtime {
 
     // Statistics
     int64_t total_cycles;
+
+    // Prebuilt-arena fast path metadata. `prebuilt_arena_base` is the device
+    // address of the runtime arena (the buffer that holds *this* PTO2Runtime
+    // at offset prebuilt_layout.off_runtime). `prebuilt_layout` carries every
+    // offset wire_arena_pointers needs at AICPU boot, so the AICPU can
+    // reconstruct all arena-internal pointer fields without re-running
+    // init_data. Assigned by the host (runtime_maker.cpp) right after
+    // runtime_wire_arena_pointers, before upload; read by aicpu_executor.cpp.
+    void *prebuilt_arena_base{nullptr};
+    PTO2RuntimeArenaLayout prebuilt_layout;
 };
 
 // =============================================================================
@@ -125,38 +159,60 @@ struct PTO2Runtime {
 // =============================================================================
 
 /**
- * Create runtime from caller-provided GM SM buffer + GM heap.
- *
- * All AICPU-side runtime state (PTO2SharedMemoryHandle wrapper, PTO2Runtime,
- * AICoreCompletionMailbox, plus the orchestrator/scheduler/tensor_map
- * sub-regions) is laid out on the supplied arena and committed in a single
- * backing allocation — including the SM handle wrapper itself. The arena is
- * owned by the caller (typically the per-Worker AicpuExecutor);
- * runtime_destroy() calls arena.release() once to free the lot.
+ * Phase 1 — declare every sub-region (sm_handle wrapper, orchestrator /
+ * scheduler / tensor_map / mailbox / PTO2Runtime header) on the supplied
+ * arena. Pure arithmetic; does not touch device memory and may run on host.
+ * Returns the layout descriptor; caller commits/attaches the arena before
+ * Phase 2/3.
+ */
+PTO2RuntimeArenaLayout runtime_reserve_layout(
+    DeviceArena &arena, uint64_t task_window_size, int32_t dep_pool_capacity = PTO2_DEP_LIST_POOL_SIZE
+);
+
+/**
+ * Phase 2 — write the data half of the runtime arena: standalone fields,
+ * memset'd arena regions, sub-structure initializers, and SM-side device
+ * pointers. The arena must already be committed (or attached); writes go
+ * into arena.base() + sub-region offsets.
  *
- * `sm_base` / `sm_size` describe the SM buffer that host has already placed
- * for the runtime to use; the SM handle wrapper is constructed in-place on
- * an arena-reserved region pointing at that buffer.
+ * `sm_dev_base` / `gm_heap_dev_base` are device addresses; we only store
+ * them (never dereference). Safe to run on a host arena that owns a host
+ * mirror of the runtime image — the resulting buffer is rtMemcpy-ready.
  *
- * @param mode             Execution mode
- * @param sm_base          Pre-allocated SM buffer base (host-owned)
- * @param sm_size          Size of the SM buffer in bytes
- * @param task_window_size Per-ring task window size used to lay out SM
- * @param gm_heap          GM heap base for output buffers (or NULL if not used)
- * @param heap_size        GM heap size in bytes
- * @param arena            Caller-owned arena that sources all runtime sub-regions.
- *                         Must be freshly constructed (no prior commit) —
- *                         runtime_create_from_sm reserves + commits internally.
- * @return Runtime context, or NULL on failure
- */
-PTO2Runtime *runtime_create_from_sm(
-    PTO2RuntimeMode mode, void *sm_base, uint64_t sm_size, uint64_t task_window_size, void *gm_heap, uint64_t heap_size,
-    DeviceArena &arena, int32_t dep_pool_capacity = PTO2_DEP_LIST_POOL_SIZE
+ * Returns the PTO2Runtime* that sits at layout.off_runtime within the arena.
+ * Caller must follow up with runtime_wire_arena_pointers; rt->ops and the
+ * AICore-side count fields are left untouched and must be filled by the
+ * AICPU at boot.
+ */
+PTO2Runtime *runtime_init_data_from_layout(
+    DeviceArena &arena, const PTO2RuntimeArenaLayout &layout, PTO2RuntimeMode mode, void *sm_dev_base, uint64_t sm_size,
+    void *gm_heap_dev_base, uint64_t heap_size
 );
 
 /**
- * Destroy runtime and free all resources. arena.release() is the actual
- * memory free; the rt pointer is no longer valid afterward.
+ * Phase 3 — wire every arena-internal pointer field (rt->sm_handle,
+ * rt->aicore_mailbox, orchestrator.{scope_tasks, scope_begins, scheduler,
+ * tensor_map.*, rings[].fanin_pool.base}, scheduler.{ready_queues, dep_pool,
+ * wiring.queue}) so each holds arena.base() + offset. Idempotent — runs on
+ * both host (writing host-mirror addresses) and AICPU (writing device
+ * addresses) sides.
+ */
+void runtime_wire_arena_pointers(DeviceArena &arena, const PTO2RuntimeArenaLayout &layout, PTO2Runtime *rt);
+
+/**
+ * AICPU-only Phase 4 — fill in the few fields the host could not know at
+ * prebuilt-image build time: the ops table (s_runtime_ops is a device-side
+ * file-local global, host cannot resolve its device address) and the
+ * orchestrator's core counts (depend on the executor's scheduler context).
+ * Call once per boot after runtime_wire_arena_pointers.
+ */
+void runtime_finalize_after_wire(PTO2Runtime *rt, int32_t aic_count, int32_t aiv_count);
+
+/**
+ * Destroy runtime. With the prebuilt-arena fast path the arena buffer is
+ * pooled across runs by DeviceRunner, so arena.release() is never called
+ * here — runtime_destroy only forgets sub-structure pointers (idempotent
+ * cleanup).
  */
 void runtime_destroy(PTO2Runtime *rt, DeviceArena &arena);
 
diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_shared_memory.h b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_shared_memory.h
index 5e1b6faa8..c8de35ba6 100644
--- a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_shared_memory.h
+++ b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_shared_memory.h
@@ -187,3 +187,64 @@ struct PTO2SharedMemoryHandle {
     void setup_pointers(uint64_t task_window_size);
     void setup_pointers_per_ring(const uint64_t task_window_sizes[PTO2_MAX_RING_DEPTH]);
 };
+
+// =============================================================================
+// SM Device Layout Helpers
+// =============================================================================
+//
+// When the host pre-builds a runtime-arena image, it needs the device-side
+// addresses of several SM sub-fields (ring flow-control counters,
+// task_descriptors arrays, orch_error_code) so it can wire them into the
+// orchestrator / scheduler init_data path without dereferencing the SM —
+// the SM lives in device memory and cannot be touched from host.
+//
+// These helpers compute those addresses by offset arithmetic on the SM
+// device base. Pure pointer math, no loads/stores; safe to call from host.
+// The same arithmetic happens on AICPU too (via PTO2SharedMemoryHandle's
+// own setup_pointers), so values are guaranteed consistent across sides.
+namespace pto2_sm_layout {
+
+inline std::atomic<int32_t> *orch_error_code_addr(void *sm_dev_base) noexcept {
+    return reinterpret_cast<std::atomic<int32_t> *>(
+        static_cast<char *>(sm_dev_base) + offsetof(PTO2SharedMemoryHeader, orch_error_code)
+    );
+}
+
+inline PTO2SharedMemoryRingHeader *ring_header_addr(void *sm_dev_base, int ring_id) noexcept {
+    return reinterpret_cast<PTO2SharedMemoryRingHeader *>(
+        static_cast<char *>(sm_dev_base) + offsetof(PTO2SharedMemoryHeader, rings) +
+        static_cast<size_t>(ring_id) * sizeof(PTO2SharedMemoryRingHeader)
+    );
+}
+
+inline std::atomic<int32_t> *ring_current_task_index_addr(void *sm_dev_base, int ring_id) noexcept {
+    return reinterpret_cast<std::atomic<int32_t> *>(
+        reinterpret_cast<char *>(ring_header_addr(sm_dev_base, ring_id)) + offsetof(PTO2SharedMemoryRingHeader, fc) +
+        offsetof(PTO2RingFlowControl, current_task_index)
+    );
+}
+
+inline std::atomic<int32_t> *ring_last_task_alive_addr(void *sm_dev_base, int ring_id) noexcept {
+    return reinterpret_cast<std::atomic<int32_t> *>(
+        reinterpret_cast<char *>(ring_header_addr(sm_dev_base, ring_id)) + offsetof(PTO2SharedMemoryRingHeader, fc) +
+        offsetof(PTO2RingFlowControl, last_task_alive)
+    );
+}
+
+// Walk the per-ring SM layout (same arithmetic as setup_pointers_per_ring)
+// to compute ring `ring_id`'s task_descriptors device address. Assumes a
+// uniform per-ring task_window_size, matching the production callsite, which
+// always passes a single window size to runtime_reserve_layout.
+inline PTO2TaskDescriptor *
+ring_task_descriptors_addr(void *sm_dev_base, uint64_t task_window_size, int ring_id) noexcept {
+    char *p = static_cast<char *>(sm_dev_base);
+    p += PTO2_ALIGN_UP(sizeof(PTO2SharedMemoryHeader), PTO2_ALIGN_SIZE);
+    for (int r = 0; r < ring_id; r++) {
+        p += PTO2_ALIGN_UP(task_window_size * sizeof(PTO2TaskDescriptor), PTO2_ALIGN_SIZE);
+        p += PTO2_ALIGN_UP(task_window_size * sizeof(PTO2TaskPayload), PTO2_ALIGN_SIZE);
+        p += PTO2_ALIGN_UP(task_window_size * sizeof(PTO2TaskSlotState), PTO2_ALIGN_SIZE);
+    }
+    return reinterpret_cast<PTO2TaskDescriptor *>(p);
+}
+
+}  // namespace pto2_sm_layout
diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_tensormap.h b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_tensormap.h
index cf1f2d28d..11decdf4e 100644
--- a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_tensormap.h
+++ b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_tensormap.h
@@ -435,11 +435,22 @@ struct PTO2TensorMap {
     reserve_layout_default(DeviceArena &arena, const int32_t task_window_sizes[PTO2_MAX_RING_DEPTH]);
 
     /**
-     * Phase 3: bind region pointers and initialize state. The arena must already
-     * be committed; layout must have been produced by reserve_layout() against
-     * the same arena.
+     * Phase 3a: write everything *except* arena-internal pointer fields
+     * (buckets, entry_pool, free_entry_list, task_entry_heads[r], orch).
+     * Uses arena.region_ptr to address the arena regions for data writes,
+     * but does not store those addresses in struct fields. Safe to call on
+     * a host arena that holds the prebuilt image.
      */
-    bool init_from_layout(const PTO2TensorMapLayout &layout, DeviceArena &arena);
+    bool init_data_from_layout(const PTO2TensorMapLayout &layout, DeviceArena &arena);
+
+    /**
+     * Phase 3b: write the arena-internal pointer fields. Idempotent;
+     * called once on the host arena and once on the AICPU after attach.
+     * `parent_orch` is the device address (or host-mirror address) of the
+     * enclosing PTO2OrchestratorState; we store it in tensor_map.orch
+     * (self-pointer within the same arena).
+     */
+    void wire_arena_pointers(const PTO2TensorMapLayout &layout, DeviceArena &arena, PTO2OrchestratorState *parent_orch);
 
     /**
      * Tear down state. Does not free memory — the arena owns the backing
diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/runtime.h b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/runtime.h
index 73b6027c4..117621ca2 100644
--- a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/runtime.h
+++ b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/runtime.h
@@ -124,14 +124,26 @@ struct HostApi {
     // allocation. Must be called once before acquire_pooled_gm_heap /
     // acquire_pooled_gm_sm. Idempotent on identical sizes; returns 0 on
     // success, -1 on allocation failure.
-    int (*setup_static_arena)(size_t gm_heap_size, size_t gm_sm_size);
+    // The single backing device allocation holds three pooled regions:
+    // GM heap, PTO2 shared memory, and the trb prebuilt runtime arena.
+    // `runtime_arena_size == 0` skips the last region (the hbg path has no
+    // prebuilt runtime arena). Returns 0 on success, -1 on allocation
+    // failure.
+    int (*setup_static_arena)(size_t gm_heap_size, size_t gm_sm_size, size_t runtime_arena_size);
     // Return the per-Worker pooled pointer for the PTO2 GM heap / shared
-    // memory. The static arena must already be committed via
-    // setup_static_arena; the returned pointer is owned by the DeviceRunner
-    // and freed in `DeviceRunner::finalize()` — do NOT pass it to
-    // device_free or record it in `tensor_pairs_`.
+    // memory / prebuilt runtime arena. The static arena must already be
+    // committed via setup_static_arena; the returned pointer is owned by
+    // the DeviceRunner and freed in `DeviceRunner::finalize()` — do NOT
+    // pass it to device_free or record it in `tensor_pairs_`.
+    //
+    // acquire_pooled_runtime_arena is trb-only — the host side reserves the
+    // runtime-arena region only when setup_static_arena is invoked with
+    // runtime_arena_size > 0. hbg's runtime_maker.cpp must not call it
+    // (setup_static_arena(...,0) leaves the offset unreserved, and the
+    // returned region_ptr would be undefined).
     void *(*acquire_pooled_gm_heap)();
     void *(*acquire_pooled_gm_sm)();
+    void *(*acquire_pooled_runtime_arena)();
     // Single-shot upload of the entire ChipCallable buffer. `callable` is a
     // `const ChipCallable *` (declared void* to avoid pulling task_interface
     // headers into runtime.h). DeviceRunner walks child_offsets_ to compute
@@ -211,6 +223,13 @@ class Runtime {
     void *slot_states_ptr_;                  // Pointer to PTO2TaskSlotState array (scheduler-private, for profiling)
     ChipStorageTaskArgs orch_args_storage_;  // Copy of args for device
 
+    // Prebuilt-arena fast path (trb only). Set by the host before rtMemcpy'ing
+    // Runtime to device; AICPU reads them in the boot path to skip re-running
+    // init_data and reuse the pooled, prebuilt arena buffer (already populated
+    // by runtime_init_data_from_layout + runtime_wire_arena_pointers on host).
+    void *prebuilt_arena_base_;
+    size_t prebuilt_runtime_offset_;
+
     // Device orchestration SO (for dlopen on AICPU thread 3).
     // The SO bytes themselves live in a separately-allocated device buffer
     // owned by DeviceRunner; only the metadata below travels inside Runtime.
@@ -247,6 +266,16 @@ class Runtime {
     void set_slot_states_ptr(void *p);
     void set_orch_args(const ChipStorageTaskArgs &args);
 
+    // Prebuilt-arena fast path (trb only). Set by host's
+    // bind_prepared_to_runtime_impl; consumed by AICPU at boot to attach a
+    // DeviceArena to `prebuilt_arena_base_` and pick up the PTO2Runtime at
+    // `prebuilt_arena_base_ + prebuilt_runtime_offset_`. Both stay zero on
+    // first construction (Runtime() ctor zeros them) so a non-prebuilt boot
+    // path can still detect "no prebuilt image set" via nullptr.
+    void set_prebuilt_arena(void *arena_base, size_t runtime_off);
+    void *get_prebuilt_arena_base() const;
+    size_t get_prebuilt_runtime_offset() const;
+
     // Device orchestration SO binary (for dlopen on AICPU thread 3)
     void set_dev_orch_so(uint64_t dev_addr, uint64_t size);
     uint64_t get_dev_orch_so_addr() const;
diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler/pto_scheduler.cpp b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler/pto_scheduler.cpp
index f497b8fd8..2d777e9b0 100644
--- a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler/pto_scheduler.cpp
+++ b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler/pto_scheduler.cpp
@@ -61,153 +61,6 @@ PTO2SchedProfilingData scheduler_get_profiling(int thread_idx) {
 }
 #endif
 
-// =============================================================================
-// Ready Queue Implementation
-// =============================================================================
-
-size_t ready_queue_reserve_layout(DeviceArena &arena, uint64_t capacity) {
-    // Align the slots[] base to a full cache line so MPMC CAS traffic on the
-    // first slot cannot false-share with whatever region sits in front of us
-    // (e.g. orchestrator tensormap heads written by the orch thread).
-    return arena.reserve(capacity * sizeof(PTO2ReadyQueueSlot), PTO2_ALIGN_SIZE);
-}
-
-bool ready_queue_init_from_layout(PTO2ReadyQueue *queue, DeviceArena &arena, size_t slots_off, uint64_t capacity) {
-    queue->slots = static_cast<PTO2ReadyQueueSlot *>(arena.region_ptr(slots_off));
-    queue->capacity = capacity;
-    queue->mask = capacity - 1;
-    queue->enqueue_pos.store(0, std::memory_order_relaxed);
-    queue->dequeue_pos.store(0, std::memory_order_relaxed);
-
-    for (uint64_t i = 0; i < capacity; i++) {
-        queue->slots[i].sequence.store((int64_t)i, std::memory_order_relaxed);
-        queue->slots[i].slot_state = nullptr;
-    }
-
-    return true;
-}
-
-void ready_queue_destroy(PTO2ReadyQueue *queue) {
-    // Arena owns the slots[] buffer; just forget the pointer.
-    queue->slots = nullptr;
-}
-
-// =============================================================================
-// Scheduler Initialization
-// =============================================================================
-
-bool PTO2SchedulerState::RingSchedState::init(PTO2SharedMemoryHeader *sm_header, int32_t ring_id) {
-    ring = &sm_header->rings[ring_id];
-    last_task_alive = 0;
-    advance_lock.store(0, std::memory_order_relaxed);
-
-    // Initialize all per-task slot state fields.
-    // bind_ring() sets the ring_id only — payload/task pointers are re-bound
-    // by orch::prepare_task on every submit (their value is per-slot constant
-    // but pinning them here would cost O(task_window_size) at startup).
-    // reset_for_reuse() sets dynamic fields to reclaim defaults (fanout_count=1,
-    // rest zero) so the first submit needs no reset.
-    for (uint64_t i = 0; i < ring->task_window_size; i++) {
-        ring->slot_states[i].bind_ring(static_cast<uint8_t>(ring_id));
-        ring->slot_states[i].reset_for_reuse();
-        ring->slot_states[i].fanin_count = 0;
-        ring->slot_states[i].active_mask = ActiveMask{};
-    }
-
-    return true;
-}
-
-void PTO2SchedulerState::RingSchedState::destroy() { ring = nullptr; }
-
-PTO2SchedulerLayout PTO2SchedulerState::reserve_layout(DeviceArena &arena, int32_t dep_pool_capacity) {
-    PTO2SchedulerLayout layout{};
-    layout.ready_queue_capacity = PTO2_READY_QUEUE_SIZE;
-    layout.spsc_capacity = PTO2_WRIRING_QUEUE_SIZE;
-    layout.dep_pool_capacity = dep_pool_capacity;
-
-    for (int i = 0; i < PTO2_NUM_RESOURCE_SHAPES; i++) {
-        layout.off_ready_queue_slots[i] = ready_queue_reserve_layout(arena, PTO2_READY_QUEUE_SIZE);
-    }
-    layout.off_dummy_ready_queue_slots = ready_queue_reserve_layout(arena, PTO2_READY_QUEUE_SIZE);
-    for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) {
-        // Force a cache-line base so writes from scheduler thread 0 (sole
-        // writer of this ring's dep_pool) do not invalidate adjacent
-        // multi-threaded regions like ready_queue.slots.
-        layout.off_dep_pool_entries[r] =
-            arena.reserve(static_cast<size_t>(dep_pool_capacity) * sizeof(PTO2DepListEntry), PTO2_ALIGN_SIZE);
-    }
-    layout.off_wiring_spsc_buffer = PTO2SpscQueue::reserve_layout(arena, PTO2_WRIRING_QUEUE_SIZE);
-    return layout;
-}
-
-bool PTO2SchedulerState::init_from_layout(
-    const PTO2SchedulerLayout &layout, DeviceArena &arena, PTO2SharedMemoryHeader *sm_header_arg
-) {
-    PTO2SchedulerState *sched = this;
-    sched->sm_header = sm_header_arg;
-#if PTO2_SCHED_PROFILING
-    sched->tasks_completed.store(0, std::memory_order_relaxed);
-    sched->tasks_consumed.store(0, std::memory_order_relaxed);
-#endif
-
-    // Per-ring scheduler state — no arena buffers, just field init.
-    for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) {
-        if (!sched->ring_sched_states[r].init(sm_header_arg, r)) {
-            return false;
-        }
-    }
-
-    // Ready queues — one per resource shape plus DUMMY.
-    for (int i = 0; i < PTO2_NUM_RESOURCE_SHAPES; i++) {
-        if (!ready_queue_init_from_layout(
-                &sched->ready_queues[i], arena, layout.off_ready_queue_slots[i], layout.ready_queue_capacity
-            )) {
-            return false;
-        }
-    }
-    if (!ready_queue_init_from_layout(
-            &sched->dummy_ready_queue, arena, layout.off_dummy_ready_queue_slots, layout.ready_queue_capacity
-        )) {
-        return false;
-    }
-
-    // Per-ring dep_pool: PTO2DepListPool::init takes an externally-allocated
-    // base + capacity, so we just plumb the arena region into it.
-    for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) {
-        auto *dep_entries = static_cast<PTO2DepListEntry *>(arena.region_ptr(layout.off_dep_pool_entries[r]));
-        // calloc-equivalent: pool expects entries zeroed at construction.
-        memset(dep_entries, 0, static_cast<size_t>(layout.dep_pool_capacity) * sizeof(PTO2DepListEntry));
-        sched->ring_sched_states[r].dep_pool.init(
-            dep_entries, layout.dep_pool_capacity, &sm_header_arg->orch_error_code
-        );
-    }
-
-    // Wiring SPSC queue (orchestrator push, scheduler thread 0 pop).
-    if (!sched->wiring.queue.init_from_layout(arena, layout.off_wiring_spsc_buffer, layout.spsc_capacity)) {
-        return false;
-    }
-    sched->wiring.batch_count = 0;
-    sched->wiring.batch_index = 0;
-    sched->wiring.backoff_counter = 0;
-
-    return true;
-}
-
-void PTO2SchedulerState::destroy() {
-    PTO2SchedulerState *sched = this;
-    for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) {
-        sched->ring_sched_states[r].destroy();
-        sched->ring_sched_states[r].dep_pool.base = nullptr;
-    }
-
-    sched->wiring.queue.destroy();
-
-    for (int i = 0; i < PTO2_NUM_RESOURCE_SHAPES; i++) {
-        ready_queue_destroy(&sched->ready_queues[i]);
-    }
-    ready_queue_destroy(&sched->dummy_ready_queue);
-}
-
 // =============================================================================
 // Debug Utilities
 // =============================================================================
diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler/pto_scheduler.h b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler/pto_scheduler.h
index 8d50681ba..828999113 100644
--- a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler/pto_scheduler.h
+++ b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler/pto_scheduler.h
@@ -409,7 +409,14 @@ struct alignas(64) PTO2ReadyQueue {
 //                     initialize sequence counters
 //   destroy: forget the slots pointer (arena owns the buffer)
 size_t ready_queue_reserve_layout(DeviceArena &arena, uint64_t capacity);
-bool ready_queue_init_from_layout(PTO2ReadyQueue *queue, DeviceArena &arena, size_t slots_off, uint64_t capacity);
+// Writes everything *except* the arena-internal `slots` pointer field
+// (sequences/positions on the slot array, capacity, mask). Uses
+// arena.region_ptr(slots_off) only to address the slot array for writes;
+// does NOT store the pointer in `queue->slots`. Call
+// `ready_queue_wire_arena_pointers` afterwards to set the field itself.
+bool ready_queue_init_data_from_layout(PTO2ReadyQueue *queue, DeviceArena &arena, size_t slots_off, uint64_t capacity);
+// Stores queue->slots = arena.region_ptr(slots_off). Idempotent.
+void ready_queue_wire_arena_pointers(PTO2ReadyQueue *queue, DeviceArena &arena, size_t slots_off);
 void ready_queue_destroy(PTO2ReadyQueue *queue);
 
 // =============================================================================
@@ -449,15 +456,17 @@ struct alignas(64) PTO2SpscQueue {
         return arena.reserve(capacity * sizeof(PTO2TaskSlotState *), PTO2_ALIGN_SIZE);
     }
 
-    // Bind buffer pointer + reset indices. The capacity must be a power of two
-    // and match the value passed to reserve_layout.
-    bool init_from_layout(DeviceArena &arena, size_t buffer_off, uint64_t capacity) {
+    // Writes everything except the arena-internal `buffer_` pointer field
+    // (zeros the slot pointer array, sets mask/head/tail). No address is
+    // stored in buffer_ here; wire_arena_pointers() sets it afterwards, on
+    // the host arena and again on the AICPU after attaching the image.
+    bool init_data_from_layout(DeviceArena &arena, size_t buffer_off, uint64_t capacity) {
         if (capacity == 0 || (capacity & (capacity - 1)) != 0) return false;
-        buffer_ = static_cast<PTO2TaskSlotState **>(arena.region_ptr(buffer_off));
+        auto *buf = static_cast<PTO2TaskSlotState **>(arena.region_ptr(buffer_off));
         // calloc'd-equivalent: zero the slot pointers so spurious early pops
         // observe nullptr.
         for (uint64_t i = 0; i < capacity; i++)
-            buffer_[i] = nullptr;
+            buf[i] = nullptr;
         mask_ = capacity - 1;
         head_.store(0, std::memory_order_relaxed);
         tail_.store(0, std::memory_order_relaxed);
@@ -466,6 +475,12 @@ struct alignas(64) PTO2SpscQueue {
         return true;
     }
 
+    // Wire the arena-internal pointer. Called by both host (with host arena)
+    // and AICPU (with device arena attached to the prebuilt image).
+    void wire_arena_pointers(DeviceArena &arena, size_t buffer_off) {
+        buffer_ = static_cast<PTO2TaskSlotState **>(arena.region_ptr(buffer_off));
+    }
+
     // Arena owns the buffer; here we only forget our pointer.
     void destroy() { buffer_ = nullptr; }
 
@@ -563,7 +578,12 @@ struct PTO2SchedulerState {
         // --- Cache Line 1+: Thread 0 only (wiring dep_pool) ---
         alignas(64) PTO2DepListPool dep_pool;
 
-        bool init(PTO2SharedMemoryHeader *sm_header, int32_t ring_id);
+        // Initialize arena-internal data + arena-external pointers; does NOT
+        // store dep_pool.base (that lives in the runtime arena and is wired
+        // by SchedulerState::wire_arena_pointers). The `ring` field stores
+        // the device address of the SM ring header — computed via offset
+        // arithmetic, no SM dereference.
+        bool init_data_from_layout(void *sm_dev_base, int32_t ring_id, int32_t dep_pool_capacity);
         void destroy();
 
         void sync_to_sm() { ring->fc.last_task_alive.store(last_task_alive, std::memory_order_release); }
@@ -1042,13 +1062,24 @@ struct PTO2SchedulerState {
 
     // Phase 1: declare every sub-region (ready_queue slots, dummy queue slots,
     // per-ring dep_pool entries, wiring SPSC buffer) on the supplied arena.
-    // Capacities are baked into the returned layout; init_from_layout uses
+    // Capacities are baked into the returned layout; init_data_from_layout uses
     // the same values.
     static PTO2SchedulerLayout reserve_layout(DeviceArena &arena, int32_t dep_pool_capacity = PTO2_DEP_LIST_POOL_SIZE);
 
-    // Phase 3: bind region pointers and initialize state. The arena must be
-    // committed; layout must come from reserve_layout() against the same arena.
-    bool init_from_layout(const PTO2SchedulerLayout &layout, DeviceArena &arena, PTO2SharedMemoryHeader *sm_header);
+    // Phase 3a: write everything *except* arena-internal pointer fields.
+    // `sm_dev_base` is the device address of the SM (only stored, never
+    // dereferenced here); `task_window_size` lets the per-ring data-addr
+    // arithmetic resolve ring task_descriptors / fc field addresses without
+    // an SM load. Safe to call on a host arena that holds the prebuilt
+    // image buffer.
+    bool init_data_from_layout(
+        const PTO2SchedulerLayout &layout, DeviceArena &arena, void *sm_dev_base, uint64_t task_window_size
+    );
+
+    // Phase 3b: write the arena-internal pointer fields
+    // (ready_queues[].slots, dummy_ready_queue.slots, dep_pool.base for each
+    // ring, wiring.queue.buffer_). Called on both host and device sides.
+    void wire_arena_pointers(const PTO2SchedulerLayout &layout, DeviceArena &arena);
 
     // Forget per-region pointers; arena owns the backing memory.
     void destroy();
diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/shared/pto_runtime2_init.cpp b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/shared/pto_runtime2_init.cpp
new file mode 100644
index 000000000..3efa313fd
--- /dev/null
+++ b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/shared/pto_runtime2_init.cpp
@@ -0,0 +1,351 @@
+/*
+ * Copyright (c) PyPTO Contributors.
+ * This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+ * CANN Open Software License Agreement Version 2.0 (the "License").
+ * Please refer to the License for details. You may not use this file except in compliance with the License.
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+ * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+ * See LICENSE in the root of the software repository for the full text of the License.
+ * -----------------------------------------------------------------------------------------------------------
+ */
+/**
+ * Host/AICPU shared runtime-arena layout, init_data and wire implementations.
+ *
+ * Lives under runtime/shared/ so it is included in both the host_runtime.so
+ * build (host pre-populates the prebuilt arena image) and the aicpu_runtime
+ * build (AICPU runs wire_arena_pointers + destroy after attach). The
+ * device-only parts of pto_runtime2.cpp / pto_orchestrator.cpp / pto_scheduler.cpp
+ * (ops table, scope/submit/dispatch business logic, profiling) stay in their
+ * original files and the aicpu build only.
+ */
+
+#include <stdlib.h>
+#include <string.h>
+
+#include "pto_orchestrator.h"
+#include "pto_runtime2.h"
+#include "pto_ring_buffer.h"
+#include "pto_shared_memory.h"
+#include "pto_tensormap.h"
+#include "scheduler/pto_scheduler.h"
+
+// =============================================================================
+// Ready queue
+// =============================================================================
+
+size_t ready_queue_reserve_layout(DeviceArena &arena, uint64_t capacity) {
+    // Cache-line-align the slots[] base so MPMC CAS traffic on the first slot
+    // cannot false-share with the preceding region (e.g. orch tensormap heads).
+    const size_t slots_bytes = capacity * sizeof(PTO2ReadyQueueSlot);
+    return arena.reserve(slots_bytes, PTO2_ALIGN_SIZE);
+}
+
+bool ready_queue_init_data_from_layout(PTO2ReadyQueue *queue, DeviceArena &arena, size_t slots_off, uint64_t capacity) {
+    // mask = capacity - 1 only makes sense for a non-zero power-of-two capacity.
+    if (capacity == 0 || (capacity & (capacity - 1)) != 0) return false;
+    // Slot data is written through a local: queue->slots itself is set later
+    // by ready_queue_wire_arena_pointers.
+    auto *slots_arena = static_cast<PTO2ReadyQueueSlot *>(arena.region_ptr(slots_off));
+    queue->capacity = capacity;
+    queue->mask = capacity - 1;
+    queue->enqueue_pos.store(0, std::memory_order_relaxed);
+    queue->dequeue_pos.store(0, std::memory_order_relaxed);
+    for (uint64_t i = 0; i < capacity; i++) {
+        slots_arena[i].sequence.store(static_cast<int64_t>(i), std::memory_order_relaxed);
+        slots_arena[i].slot_state = nullptr;
+    }
+    return true;
+}
+
+void ready_queue_wire_arena_pointers(PTO2ReadyQueue *queue, DeviceArena &arena, size_t slots_off) {
+    // Resolve the arena offset into the slots[] pointer that init_data skips.
+    queue->slots = static_cast<PTO2ReadyQueueSlot *>(arena.region_ptr(slots_off));
+}
+
+void ready_queue_destroy(PTO2ReadyQueue *queue) {
+    queue->slots = nullptr;  // arena owns the slots[] buffer; just forget the pointer
+}
+
+// =============================================================================
+// Scheduler
+// =============================================================================
+
+bool PTO2SchedulerState::RingSchedState::init_data_from_layout(
+    void *sm_dev_base, int32_t ring_id, int32_t /*dep_pool_capacity*/
+) {
+    // Store the device address of this ring's SM header. The address comes
+    // from offset arithmetic on sm_dev_base only; there is no SM load.
+    ring = pto2_sm_layout::ring_header_addr(sm_dev_base, ring_id);
+    last_task_alive = 0;
+    advance_lock.store(0, std::memory_order_relaxed);
+
+    // No per-slot work here: SM-side slot initialization (bind_ring +
+    // reset_for_reuse + fanin_count/active_mask zero) lives in
+    // PTO2SharedMemoryHandle::init_header_per_ring so the AICPU performs it
+    // during SM reset; host prebuilt-arena init skips SM access here.
+
+    return true;  // cannot fail today; callers still check the result
+}
+
+void PTO2SchedulerState::RingSchedState::destroy() { ring = nullptr; }  // forget the pointer only
+
+PTO2SchedulerLayout PTO2SchedulerState::reserve_layout(DeviceArena &arena, int32_t dep_pool_capacity) {
+    // Declare regions in a fixed order: ready queues, dummy queue, dep pools, SPSC.
+    PTO2SchedulerLayout result{};
+    result.ready_queue_capacity = PTO2_READY_QUEUE_SIZE;
+    result.spsc_capacity = PTO2_WRIRING_QUEUE_SIZE;
+    result.dep_pool_capacity = dep_pool_capacity;
+    for (int shape = 0; shape < PTO2_NUM_RESOURCE_SHAPES; ++shape) {
+        result.off_ready_queue_slots[shape] = ready_queue_reserve_layout(arena, PTO2_READY_QUEUE_SIZE);
+    }
+    result.off_dummy_ready_queue_slots = ready_queue_reserve_layout(arena, PTO2_READY_QUEUE_SIZE);
+    // Each dep pool starts on a cache line so writes from scheduler thread 0
+    // (sole writer of that ring's dep_pool) do not invalidate adjacent
+    // multi-threaded regions like ready_queue.slots.
+    const size_t dep_pool_bytes = static_cast<size_t>(dep_pool_capacity) * sizeof(PTO2DepListEntry);
+    for (int ring_idx = 0; ring_idx < PTO2_MAX_RING_DEPTH; ++ring_idx) {
+        result.off_dep_pool_entries[ring_idx] = arena.reserve(dep_pool_bytes, PTO2_ALIGN_SIZE);
+    }
+    result.off_wiring_spsc_buffer = PTO2SpscQueue::reserve_layout(arena, PTO2_WRIRING_QUEUE_SIZE);
+    return result;
+}
+
+bool PTO2SchedulerState::init_data_from_layout(
+    const PTO2SchedulerLayout &layout, DeviceArena &arena, void *sm_dev_base, uint64_t /*task_window_size*/
+) {
+    // Fills scalar state and region contents; false if any sub-initializer fails.
+    sm_header = reinterpret_cast<PTO2SharedMemoryHeader *>(sm_dev_base);
+#if PTO2_SCHED_PROFILING
+    tasks_completed.store(0, std::memory_order_relaxed);
+    tasks_consumed.store(0, std::memory_order_relaxed);
+#endif
+
+    for (int r = 0; r < PTO2_MAX_RING_DEPTH; ++r) {
+        if (!ring_sched_states[r].init_data_from_layout(sm_dev_base, r, layout.dep_pool_capacity)) {
+            return false;
+        }
+    }
+
+    for (int shape = 0; shape < PTO2_NUM_RESOURCE_SHAPES; ++shape) {
+        const size_t slots_off = layout.off_ready_queue_slots[shape];
+        if (!ready_queue_init_data_from_layout(&ready_queues[shape], arena, slots_off, layout.ready_queue_capacity)) {
+            return false;
+        }
+    }
+    if (!ready_queue_init_data_from_layout(
+            &dummy_ready_queue, arena, layout.off_dummy_ready_queue_slots, layout.ready_queue_capacity
+        )) {
+        return false;
+    }
+
+    auto *const orch_err = pto2_sm_layout::orch_error_code_addr(sm_dev_base);
+    const size_t dep_pool_bytes = static_cast<size_t>(layout.dep_pool_capacity) * sizeof(PTO2DepListEntry);
+    for (int r = 0; r < PTO2_MAX_RING_DEPTH; ++r) {
+        auto *entries = static_cast<PTO2DepListEntry *>(arena.region_ptr(layout.off_dep_pool_entries[r]));
+        memset(entries, 0, dep_pool_bytes);
+        ring_sched_states[r].dep_pool.init(entries, layout.dep_pool_capacity, orch_err);
+    }
+
+    if (!wiring.queue.init_data_from_layout(arena, layout.off_wiring_spsc_buffer, layout.spsc_capacity)) {
+        return false;
+    }
+    wiring.batch_count = 0;
+    wiring.batch_index = 0;
+    wiring.backoff_counter = 0;
+
+    return true;
+}
+
+void PTO2SchedulerState::wire_arena_pointers(const PTO2SchedulerLayout &layout, DeviceArena &arena) {
+    // Resolve each arena offset in the layout into a pointer for this arena.
+    for (int shape = 0; shape < PTO2_NUM_RESOURCE_SHAPES; ++shape) {
+        ready_queue_wire_arena_pointers(&ready_queues[shape], arena, layout.off_ready_queue_slots[shape]);
+    }
+    ready_queue_wire_arena_pointers(&dummy_ready_queue, arena, layout.off_dummy_ready_queue_slots);
+    for (int r = 0; r < PTO2_MAX_RING_DEPTH; ++r) {
+        void *const dep_region = arena.region_ptr(layout.off_dep_pool_entries[r]);
+        ring_sched_states[r].dep_pool.base = static_cast<PTO2DepListEntry *>(dep_region);
+    }
+    wiring.queue.wire_arena_pointers(arena, layout.off_wiring_spsc_buffer);
+}
+
+void PTO2SchedulerState::destroy() {
+    // Only pointers are dropped here; the arena owns the backing memory.
+    for (int r = 0; r < PTO2_MAX_RING_DEPTH; ++r) {
+        ring_sched_states[r].destroy();
+        ring_sched_states[r].dep_pool.base = nullptr;
+    }
+    wiring.queue.destroy();
+    for (int shape = 0; shape < PTO2_NUM_RESOURCE_SHAPES; ++shape) {
+        ready_queue_destroy(&ready_queues[shape]);
+    }
+    ready_queue_destroy(&dummy_ready_queue);
+}
+
+// =============================================================================
+// Orchestrator
+// =============================================================================
+
+PTO2OrchestratorLayout PTO2OrchestratorState::reserve_layout(
+    DeviceArena &arena, const int32_t task_window_sizes[PTO2_MAX_RING_DEPTH], int32_t dep_pool_capacity
+) {
+    PTO2OrchestratorLayout plan{};
+    plan.dep_pool_capacity = dep_pool_capacity;
+    plan.scope_tasks_cap = PTO2_SCOPE_TASKS_CAP;
+    plan.scope_stack_capacity = PTO2_MAX_SCOPE_DEPTH;
+
+    // Every ring's fanin spill pool has the same padded size; compute it once.
+    const size_t fanin_pool_bytes =
+        PTO2_ALIGN_UP(static_cast<size_t>(dep_pool_capacity) * sizeof(PTO2FaninSpillEntry), PTO2_ALIGN_SIZE);
+    for (int r = 0; r < PTO2_MAX_RING_DEPTH; ++r) {
+        plan.off_fanin_pool[r] = arena.reserve(fanin_pool_bytes, PTO2_ALIGN_SIZE);
+    }
+    const size_t scope_tasks_bytes = static_cast<size_t>(plan.scope_tasks_cap) * sizeof(PTO2TaskSlotState *);
+    plan.off_scope_tasks = arena.reserve(scope_tasks_bytes, alignof(PTO2TaskSlotState *));
+    const size_t scope_begins_bytes = static_cast<size_t>(plan.scope_stack_capacity) * sizeof(int32_t);
+    plan.off_scope_begins = arena.reserve(scope_begins_bytes, alignof(int32_t));
+    plan.tensor_map = PTO2TensorMap::reserve_layout_default(arena, task_window_sizes);
+    return plan;
+}
+
+bool PTO2OrchestratorState::init_data_from_layout(
+    const PTO2OrchestratorLayout &layout, DeviceArena &arena, void *sm_dev_base, void *gm_heap, uint64_t heap_size,
+    uint64_t task_window_size
+) {
+    // Start from a value-initialized state, then fill the non-pointer fields.
+    *this = PTO2OrchestratorState{};
+
+    sm_header = reinterpret_cast<PTO2SharedMemoryHeader *>(sm_dev_base);
+    gm_heap_base = gm_heap;
+    gm_heap_size = heap_size * PTO2_MAX_RING_DEPTH;
+    fatal = false;
+
+    auto *const orch_err = pto2_sm_layout::orch_error_code_addr(sm_dev_base);
+    const size_t fanin_pool_bytes =
+        PTO2_ALIGN_UP(static_cast<size_t>(layout.dep_pool_capacity) * sizeof(PTO2FaninSpillEntry), PTO2_ALIGN_SIZE);
+    for (int r = 0; r < PTO2_MAX_RING_DEPTH; ++r) {
+        // Ring r works on the r-th heap_size-byte slice of the GM heap.
+        void *ring_heap_base = reinterpret_cast<char *>(gm_heap) + r * heap_size;
+        auto *descs = pto2_sm_layout::ring_task_descriptors_addr(sm_dev_base, task_window_size, r);
+        auto *cur_idx = pto2_sm_layout::ring_current_task_index_addr(sm_dev_base, r);
+        auto *last_alive = pto2_sm_layout::ring_last_task_alive_addr(sm_dev_base, r);
+        rings[r].task_allocator.init(
+            descs, static_cast<int32_t>(task_window_size), cur_idx, last_alive, ring_heap_base, heap_size,
+            orch_err
+        );
+
+        auto *spill_entries = static_cast<PTO2FaninSpillEntry *>(arena.region_ptr(layout.off_fanin_pool[r]));
+        memset(spill_entries, 0, fanin_pool_bytes);
+        rings[r].fanin_pool.init(spill_entries, layout.dep_pool_capacity, orch_err);
+    }
+
+    if (!tensor_map.init_data_from_layout(layout.tensor_map, arena)) {
+        return false;
+    }
+
+    scope_tasks_size = 0;
+    scope_tasks_capacity = layout.scope_tasks_cap;
+    scope_stack_top = -1;
+    scope_stack_capacity = layout.scope_stack_capacity;
+    manual_begin_depth = PTO2_MAX_SCOPE_DEPTH;
+
+    return true;
+}
+
+void PTO2OrchestratorState::wire_arena_pointers(
+    const PTO2OrchestratorLayout &layout, DeviceArena &arena, PTO2SchedulerState *scheduler_arg
+) {
+    // Resolve the layout's arena offsets into pointers and link the scheduler.
+    for (int r = 0; r < PTO2_MAX_RING_DEPTH; ++r) {
+        rings[r].fanin_pool.base = static_cast<PTO2FaninSpillEntry *>(arena.region_ptr(layout.off_fanin_pool[r]));
+    }
+    tensor_map.wire_arena_pointers(layout.tensor_map, arena, this);
+    scope_tasks = static_cast<PTO2TaskSlotState **>(arena.region_ptr(layout.off_scope_tasks));
+    scope_begins = static_cast<int32_t *>(arena.region_ptr(layout.off_scope_begins));
+    scheduler = scheduler_arg;
+}
+
+void PTO2OrchestratorState::destroy() {
+    // Drops arena-backed pointers only; the scheduler link is left untouched.
+    tensor_map.destroy();
+    for (int r = 0; r < PTO2_MAX_RING_DEPTH; ++r) {
+        rings[r].fanin_pool.base = nullptr;
+    }
+    scope_tasks = nullptr;
+    scope_begins = nullptr;
+}
+
+void PTO2OrchestratorState::set_scheduler(PTO2SchedulerState *scheduler) { this->scheduler = scheduler; }
+
+// =============================================================================
+// Top-level runtime arena
+// =============================================================================
+
+PTO2RuntimeArenaLayout
+runtime_reserve_layout(DeviceArena &arena, uint64_t task_window_size, int32_t dep_pool_capacity) {
+    // The orchestrator layout takes per-ring sizes; all rings share one value.
+    const int32_t window = static_cast<int32_t>(task_window_size);
+    int32_t task_window_sizes[PTO2_MAX_RING_DEPTH];
+    for (int r = 0; r < PTO2_MAX_RING_DEPTH; ++r) {
+        task_window_sizes[r] = window;
+    }
+
+    PTO2RuntimeArenaLayout result{};
+    result.task_window_size = task_window_size;
+    result.dep_pool_capacity = dep_pool_capacity;
+    result.off_sm_handle = arena.reserve(sizeof(PTO2SharedMemoryHandle), alignof(PTO2SharedMemoryHandle));
+    result.orch = PTO2OrchestratorState::reserve_layout(arena, task_window_sizes, dep_pool_capacity);
+    result.sched = PTO2SchedulerState::reserve_layout(arena, dep_pool_capacity);
+    result.off_runtime = arena.reserve(sizeof(PTO2Runtime), PTO2_ALIGN_SIZE);
+    result.off_mailbox = arena.reserve(sizeof(AICoreCompletionMailbox), alignof(AICoreCompletionMailbox));
+    result.arena_size = arena.total_size();  // read after the last reserve()
+    return result;
+}
+
+PTO2Runtime *runtime_init_data_from_layout(
+    DeviceArena &arena, const PTO2RuntimeArenaLayout &layout, PTO2RuntimeMode mode, void *sm_dev_base,
+    uint64_t /*sm_size*/, void *gm_heap_dev_base, uint64_t heap_size
+) {
+    // Zero the PTO2Runtime and SM-handle regions before filling any field.
+    auto *const new_rt = static_cast<PTO2Runtime *>(arena.region_ptr(layout.off_runtime));
+    memset(new_rt, 0, sizeof(PTO2Runtime));
+    memset(arena.region_ptr(layout.off_sm_handle), 0, sizeof(PTO2SharedMemoryHandle));
+
+    // new_rt->ops is filled by the AICPU at boot.
+    new_rt->mode = mode;
+    new_rt->gm_heap = gm_heap_dev_base;
+    new_rt->gm_heap_size = heap_size > 0 ? heap_size * PTO2_MAX_RING_DEPTH : 0;
+    new_rt->gm_heap_owned = false;
+    new_rt->total_cycles = 0;
+
+    const bool orch_ok = new_rt->orchestrator.init_data_from_layout(
+        layout.orch, arena, sm_dev_base, gm_heap_dev_base, heap_size, layout.task_window_size
+    );
+    if (!orch_ok) {
+        return nullptr;
+    }
+    if (!new_rt->scheduler.init_data_from_layout(layout.sched, arena, sm_dev_base, layout.task_window_size)) {
+        return nullptr;
+    }
+
+    // The mailbox region is cleared only after both sub-initializers succeed.
+    memset(arena.region_ptr(layout.off_mailbox), 0, sizeof(AICoreCompletionMailbox));
+
+    return new_rt;
+}
+
+void runtime_wire_arena_pointers(DeviceArena &arena, const PTO2RuntimeArenaLayout &layout, PTO2Runtime *rt) {
+    rt->sm_handle = static_cast<PTO2SharedMemoryHandle *>(arena.region_ptr(layout.off_sm_handle));
+    rt->aicore_mailbox = static_cast<AICoreCompletionMailbox *>(arena.region_ptr(layout.off_mailbox));
+    rt->orchestrator.wire_arena_pointers(layout.orch, arena, &rt->scheduler);  // also links orch -> scheduler
+    rt->scheduler.wire_arena_pointers(layout.sched, arena);
+}
+
+void runtime_destroy(PTO2Runtime *rt, DeviceArena & /*arena*/) {
+    // Pointer teardown only: DeviceRunner pools the arena buffer across runs.
+    if (rt == nullptr) return;
+    rt->scheduler.destroy();
+    rt->orchestrator.destroy();
+    rt->aicore_mailbox = nullptr;
+    rt->sm_handle = nullptr;
+}
diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/shared/pto_shared_memory.cpp b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/shared/pto_shared_memory.cpp
index 358c87f57..1e1edff92 100644
--- a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/shared/pto_shared_memory.cpp
+++ b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/shared/pto_shared_memory.cpp
@@ -167,6 +167,23 @@ void PTO2SharedMemoryHandle::init_header_per_ring(
     header->sched_error_bitmap.store(0, std::memory_order_relaxed);
     header->sched_error_code.store(PTO2_ERROR_NONE, std::memory_order_relaxed);
     header->sched_error_thread.store(-1, std::memory_order_relaxed);
+
+    // Per-ring slot_states reset. Previously lived in
+    // PTO2SchedulerState::RingSchedState::init(), but it writes into
+    // ring->slot_states[] which is SM-side storage — keeping it here lets
+    // host-side prebuilt-arena init skip all SM dereferences.
+    // bind_ring() pins the ring_id (slot-invariant after this point);
+    // reset_for_reuse() prepares dynamic fanout/refcount fields so the first
+    // submit doesn't need an explicit reset.
+    for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) {
+        auto &ring = header->rings[r];
+        for (uint64_t i = 0; i < task_window_sizes[r]; i++) {
+            ring.slot_states[i].bind_ring(static_cast<uint8_t>(r));
+            ring.slot_states[i].reset_for_reuse();
+            ring.slot_states[i].fanin_count = 0;
+            ring.slot_states[i].active_mask = ActiveMask{};
+        }
+    }
 }
 
 // =============================================================================
diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/shared/pto_tensormap.cpp b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/shared/pto_tensormap.cpp
index a0b98bd09..da9d4fddf 100644
--- a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/shared/pto_tensormap.cpp
+++ b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/shared/pto_tensormap.cpp
@@ -81,43 +81,45 @@ PTO2TensorMap::reserve_layout_default(DeviceArena &arena, const int32_t new_task
     return reserve_layout(arena, PTO2_TENSORMAP_NUM_BUCKETS, PTO2_TENSORMAP_POOL_SIZE, new_task_window_sizes);
 }
 
-bool PTO2TensorMap::init_from_layout(const PTO2TensorMapLayout &layout, DeviceArena &arena) {
+bool PTO2TensorMap::init_data_from_layout(const PTO2TensorMapLayout &layout, DeviceArena &arena) {
     num_buckets = layout.num_buckets;
     pool_size = layout.pool_size;
 
-    buckets = static_cast<PTO2TensorMapEntry **>(arena.region_ptr(layout.off_buckets));
-    entry_pool = static_cast<PTO2TensorMapEntry *>(arena.region_ptr(layout.off_entry_pool));
-    free_entry_list = static_cast<PTO2TensorMapEntry **>(arena.region_ptr(layout.off_free_entry_list));
+    // Address arena regions for data writes; do not store these in struct
+    // fields (wire_arena_pointers does that).
+    auto *buckets_arena = static_cast<PTO2TensorMapEntry **>(arena.region_ptr(layout.off_buckets));
+    auto *entry_pool_arena = static_cast<PTO2TensorMapEntry *>(arena.region_ptr(layout.off_entry_pool));
+    auto *free_list_arena = static_cast<PTO2TensorMapEntry **>(arena.region_ptr(layout.off_free_entry_list));
 
     // buckets[]: empty == nullptr.
     for (int32_t i = 0; i < num_buckets; i++) {
-        buckets[i] = nullptr;
+        buckets_arena[i] = nullptr;
     }
 
     // entry_pool: zero-init equivalent to the previous calloc(entry_pool, ...).
     // The pool's persistent invariant after init is "bucket_index == -1 means
     // not linked", set explicitly below.
-    memset(entry_pool, 0, static_cast<size_t>(pool_size) * sizeof(PTO2TensorMapEntry));
+    memset(entry_pool_arena, 0, static_cast<size_t>(pool_size) * sizeof(PTO2TensorMapEntry));
     for (int32_t i = 0; i < pool_size; i++) {
-        entry_pool[i].bucket_index = -1;
-        entry_pool[i].next_in_bucket = nullptr;
-        entry_pool[i].prev_in_bucket = nullptr;
-        entry_pool[i].next_in_task = nullptr;
-        entry_pool[i].prev_in_task = nullptr;
-        entry_pool[i].producer_task_id = PTO2TaskId{};
+        entry_pool_arena[i].bucket_index = -1;
+        entry_pool_arena[i].next_in_bucket = nullptr;
+        entry_pool_arena[i].prev_in_bucket = nullptr;
+        entry_pool_arena[i].next_in_task = nullptr;
+        entry_pool_arena[i].prev_in_task = nullptr;
+        entry_pool_arena[i].producer_task_id = PTO2TaskId{};
     }
 
     // free_entry_list: zeroed (was calloc'd before); contents become meaningful
     // only after entries are freed back, so the body of the array stays as 0.
-    memset(free_entry_list, 0, static_cast<size_t>(pool_size) * sizeof(PTO2TensorMapEntry *));
+    memset(free_list_arena, 0, static_cast<size_t>(pool_size) * sizeof(PTO2TensorMapEntry *));
 
     next_entry_idx = 0;
     free_num = 0;
 
     for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) {
-        task_entry_heads[r] = static_cast<PTO2TensorMapEntry **>(arena.region_ptr(layout.off_task_entry_heads[r]));
+        auto *heads_arena = static_cast<PTO2TensorMapEntry **>(arena.region_ptr(layout.off_task_entry_heads[r]));
         for (int32_t i = 0; i < layout.task_window_sizes[r]; i++) {
-            task_entry_heads[r][i] = nullptr;
+            heads_arena[i] = nullptr;
         }
         task_window_sizes[r] = layout.task_window_sizes[r];
         last_task_alives[r] = 0;
@@ -127,6 +129,18 @@ bool PTO2TensorMap::init_from_layout(const PTO2TensorMapLayout &layout, DeviceAr
     return true;
 }
 
+void PTO2TensorMap::wire_arena_pointers(
+    const PTO2TensorMapLayout &layout, DeviceArena &arena, PTO2OrchestratorState *parent_orch
+) {
+    buckets = static_cast<PTO2TensorMapEntry **>(arena.region_ptr(layout.off_buckets));
+    entry_pool = static_cast<PTO2TensorMapEntry *>(arena.region_ptr(layout.off_entry_pool));
+    free_entry_list = static_cast<PTO2TensorMapEntry **>(arena.region_ptr(layout.off_free_entry_list));
+    for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) {
+        task_entry_heads[r] = static_cast<PTO2TensorMapEntry **>(arena.region_ptr(layout.off_task_entry_heads[r]));
+    }
+    orch = parent_orch;  // back-link to the caller-supplied orchestrator; stored, not dereferenced here
+}
+
 void PTO2TensorMap::destroy() {
     // Arena owns the backing memory; here we only forget our pointers so any
     // stray post-destroy access trips a nullptr dereference instead of reading
diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/shared/runtime.cpp b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/shared/runtime.cpp
index 6a7ab65da..b3347b53c 100644
--- a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/shared/runtime.cpp
+++ b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/shared/runtime.cpp
@@ -44,6 +44,8 @@ Runtime::Runtime() {
     gm_heap_ptr_ = nullptr;
     slot_states_ptr_ = nullptr;
     orch_args_storage_.clear();
+    prebuilt_arena_base_ = nullptr;
+    prebuilt_runtime_offset_ = 0;
 
     // Initialize device orchestration SO binary
     dev_orch_so_addr_ = 0;
@@ -74,6 +76,13 @@ void Runtime::set_gm_heap(void *p) { gm_heap_ptr_ = p; }
 void Runtime::set_slot_states_ptr(void *p) { slot_states_ptr_ = p; }
 void Runtime::set_orch_args(const ChipStorageTaskArgs &args) { orch_args_storage_ = args; }
 
+void Runtime::set_prebuilt_arena(void *arena_base, size_t runtime_off) {
+    prebuilt_arena_base_ = arena_base;       // stored as given; not dereferenced here
+    prebuilt_runtime_offset_ = runtime_off;  // stored as given; no range check here
+}
+void *Runtime::get_prebuilt_arena_base() const { return prebuilt_arena_base_; }
+size_t Runtime::get_prebuilt_runtime_offset() const { return prebuilt_runtime_offset_; }
+
 // Device orchestration SO metadata (bytes live in a separate device buffer
 // owned by DeviceRunner; only the address/size travels in Runtime).
 void Runtime::set_dev_orch_so(uint64_t dev_addr, uint64_t size) {
diff --git a/src/common/device_comm/device_arena.h b/src/common/device_comm/device_arena.h
index a0ade3dc3..ad43d1869 100644
--- a/src/common/device_comm/device_arena.h
+++ b/src/common/device_comm/device_arena.h
@@ -95,6 +95,18 @@ class DeviceArena {
     // the trampoline's free path must therefore be nothrow.)
     void *commit(size_t base_align = kDefaultBaseAlign);
 
+    // Phase 2 alternative: attach to an externally-owned buffer instead of
+    // allocating one. Caller guarantees `external_base` is at least the size
+    // reported by `total_size()` rounded up to `base_align`, and that the
+    // same reserve() sequence has been (or will be) replayed. The base must
+    // already be `base_align`-aligned: attach() asserts this, never realigns.
+    //
+    // The external buffer is NOT freed by release()/~DeviceArena(); ownership
+    // stays with the caller. Used for the prebuilt-arena fast path where
+    // a host-built image is rtMemcpy'd into a device buffer that DeviceRunner
+    // owns across runs.
+    void attach(void *external_base, size_t base_align = kDefaultBaseAlign) noexcept;
+
     // Phase 3: pointer to the sub-region at `offset`. Asserts if called
     // before commit().
     void *region_ptr(size_t offset) const noexcept;
@@ -135,6 +147,9 @@ class DeviceArena {
     size_t raw_size_{0};
     void *base_{nullptr};
     bool committed_{false};
+    // True when committed via attach(): the backing buffer is externally
+    // owned, so release() must not call free_().
+    bool attached_{false};
 
     size_t alloc_count_{0};
     size_t free_count_{0};
@@ -166,6 +181,31 @@ inline void *DeviceArena::commit(size_t base_align) {
     return base_;
 }
 
+inline void DeviceArena::attach(void *external_base, size_t base_align) noexcept {
+    // Re-attach is allowed only from an attached state (e.g. the AICPU boot
+    // path attaches each run); a real alloc-backed commit() must not be
+    // silently dropped, so that case asserts. NOTE(review): release() also
+    // resets cursor_/region_count_ — confirm callers re-run reserve().
+    if (committed_) {
+        assert(attached_ && "DeviceArena::attach() called after commit (only re-attach is allowed)");
+        release();
+    }
+    assert(external_base != nullptr && "DeviceArena::attach() requires non-null base");
+    assert(base_align > 0 && (base_align & (base_align - 1)) == 0 && "DeviceArena: base_align must be a power of two");
+    // The external buffer must already be base_align-aligned by the caller —
+    // forward-align in-place would shift the visible base off the address the
+    // caller advertised (and that the prebuilt image was constructed for).
+    const auto raw = reinterpret_cast<uintptr_t>(external_base);
+    (void)raw;         // raw and base_align are otherwise read only inside
+    (void)base_align;  // assert(), which compiles away under NDEBUG
+    assert((raw & (static_cast<uintptr_t>(base_align) - 1)) == 0 && "DeviceArena::attach() base must be pre-aligned");
+    base_ = external_base;
+    raw_base_ = nullptr;  // nothing for release() to free
+    raw_size_ = 0;
+    committed_ = true;
+    attached_ = true;
+}
+
 inline void *DeviceArena::region_ptr(size_t offset) const noexcept {
     assert(committed_ && "DeviceArena::region_ptr() called before commit()");
     return reinterpret_cast<char *>(base_) + offset;
@@ -179,7 +219,8 @@ inline size_t DeviceArena::region_size(size_t offset) const noexcept {
 }
 
 inline void DeviceArena::release() noexcept {
-    if (raw_base_ != nullptr) {
+    // attached arenas wrap externally-owned memory — never free.
+    if (raw_base_ != nullptr && !attached_) {
         free_(ctx_, raw_base_);
         ++free_count_;
     }
@@ -189,4 +230,5 @@ inline void DeviceArena::release() noexcept {
     cursor_ = 0;
     region_count_ = 0;
     committed_ = false;
+    attached_ = false;
 }
diff --git a/tests/ut/cpp/CMakeLists.txt b/tests/ut/cpp/CMakeLists.txt
index 89314d800..9922850d5 100644
--- a/tests/ut/cpp/CMakeLists.txt
+++ b/tests/ut/cpp/CMakeLists.txt
@@ -90,6 +90,7 @@ add_library(a2a3_rt_objs OBJECT
     ${A2A3_RUNTIME_DIR}/shared/pto_shared_memory.cpp
     ${A2A3_RUNTIME_DIR}/scheduler/pto_scheduler.cpp
     ${A2A3_RUNTIME_DIR}/shared/pto_tensormap.cpp
+    ${A2A3_RUNTIME_DIR}/shared/pto_runtime2_init.cpp
     ${CMAKE_SOURCE_DIR}/stubs/test_stubs.cpp
 )
 target_include_directories(a2a3_rt_objs PUBLIC
diff --git a/tests/ut/cpp/a2a3/test_ready_queue.cpp b/tests/ut/cpp/a2a3/test_ready_queue.cpp
index 413e36cfd..f12b1e7c7 100644
--- a/tests/ut/cpp/a2a3/test_ready_queue.cpp
+++ b/tests/ut/cpp/a2a3/test_ready_queue.cpp
@@ -61,7 +61,8 @@ class ReadyQueueTest : public ::testing::Test {
     void SetUp() override {
         const size_t off = ready_queue_reserve_layout(arena, CAPACITY);
         ASSERT_NE(arena.commit(), nullptr);
-        ASSERT_TRUE(ready_queue_init_from_layout(&queue, arena, off, CAPACITY));
+        ASSERT_TRUE(ready_queue_init_data_from_layout(&queue, arena, off, CAPACITY));
+        ready_queue_wire_arena_pointers(&queue, arena, off);
     }
 
     void TearDown() override {
@@ -231,7 +232,8 @@ class ReadyQueueBoundaryTest : public ::testing::Test {
     void SetUp() override {
         const size_t off = ready_queue_reserve_layout(arena, QUEUE_CAP);
         ASSERT_NE(arena.commit(), nullptr);
-        ASSERT_TRUE(ready_queue_init_from_layout(&queue, arena, off, QUEUE_CAP));
+        ASSERT_TRUE(ready_queue_init_data_from_layout(&queue, arena, off, QUEUE_CAP));
+        ready_queue_wire_arena_pointers(&queue, arena, off);
     }
     void TearDown() override {
         ready_queue_destroy(&queue);
@@ -330,7 +332,8 @@ class ReadyQueueMPMCTest : public ::testing::TestWithParam<MPMCConfig> {
     void SetUp() override {
         const size_t off = ready_queue_reserve_layout(arena, CAPACITY);
         ASSERT_NE(arena.commit(), nullptr);
-        ASSERT_TRUE(ready_queue_init_from_layout(&queue, arena, off, CAPACITY));
+        ASSERT_TRUE(ready_queue_init_data_from_layout(&queue, arena, off, CAPACITY));
+        ready_queue_wire_arena_pointers(&queue, arena, off);
     }
     void TearDown() override {
         ready_queue_destroy(&queue);
diff --git a/tests/ut/cpp/a2a3/test_scheduler_state.cpp b/tests/ut/cpp/a2a3/test_scheduler_state.cpp
index 952aad55a..37e9d18ca 100644
--- a/tests/ut/cpp/a2a3/test_scheduler_state.cpp
+++ b/tests/ut/cpp/a2a3/test_scheduler_state.cpp
@@ -34,7 +34,8 @@ class SchedulerStateTest : public ::testing::Test {
         ASSERT_NE(sm_handle, nullptr);
         auto layout = PTO2SchedulerState::reserve_layout(sched_arena);
         ASSERT_NE(sched_arena.commit(), nullptr);
-        ASSERT_TRUE(sched.init_from_layout(layout, sched_arena, sm_handle->header));
+        ASSERT_TRUE(sched.init_data_from_layout(layout, sched_arena, sm_handle->header, PTO2_TASK_WINDOW_SIZE));
+        sched.wire_arena_pointers(layout, sched_arena);
     }
 
     void TearDown() override {
diff --git a/tests/ut/cpp/a2a3/test_spsc_queue.cpp b/tests/ut/cpp/a2a3/test_spsc_queue.cpp
index 28e43d5a2..5dce3ba4a 100644
--- a/tests/ut/cpp/a2a3/test_spsc_queue.cpp
+++ b/tests/ut/cpp/a2a3/test_spsc_queue.cpp
@@ -47,7 +47,8 @@ class SpscQueueTest : public ::testing::Test {
         memset(&queue, 0, sizeof(queue));
         const size_t off = PTO2SpscQueue::reserve_layout(arena, CAPACITY);
         ASSERT_NE(arena.commit(), nullptr);
-        ASSERT_TRUE(queue.init_from_layout(arena, off, CAPACITY));
+        ASSERT_TRUE(queue.init_data_from_layout(arena, off, CAPACITY));
+        queue.wire_arena_pointers(arena, off);
     }
 
     void TearDown() override {
@@ -74,9 +75,9 @@ TEST_F(SpscQueueTest, InitRejectsNonPowerOfTwo) {
     const size_t off = PTO2SpscQueue::reserve_layout(local, 1);  // dummy reservation so commit succeeds
     (void)off;
     ASSERT_NE(local.commit(), nullptr);
-    EXPECT_FALSE(bad.init_from_layout(local, off, 3));
-    EXPECT_FALSE(bad.init_from_layout(local, off, 7));
-    EXPECT_FALSE(bad.init_from_layout(local, off, 0));
+    EXPECT_FALSE(bad.init_data_from_layout(local, off, 3));
+    EXPECT_FALSE(bad.init_data_from_layout(local, off, 7));
+    EXPECT_FALSE(bad.init_data_from_layout(local, off, 0));
 }
 
 TEST_F(SpscQueueTest, InitAcceptsPowerOfTwo) {
@@ -85,9 +86,9 @@ TEST_F(SpscQueueTest, InitAcceptsPowerOfTwo) {
     const size_t off4 = PTO2SpscQueue::reserve_layout(local, 4);
     const size_t off1024 = PTO2SpscQueue::reserve_layout(local, 1024);
     ASSERT_NE(local.commit(), nullptr);
-    EXPECT_TRUE(q.init_from_layout(local, off4, 4));
+    EXPECT_TRUE(q.init_data_from_layout(local, off4, 4));
     q.destroy();
-    EXPECT_TRUE(q.init_from_layout(local, off1024, 1024));
+    EXPECT_TRUE(q.init_data_from_layout(local, off1024, 1024));
     q.destroy();
 }
 
diff --git a/tests/ut/cpp/a2a3/test_task_allocator.cpp b/tests/ut/cpp/a2a3/test_task_allocator.cpp
index 383003900..512e241a2 100644
--- a/tests/ut/cpp/a2a3/test_task_allocator.cpp
+++ b/tests/ut/cpp/a2a3/test_task_allocator.cpp
@@ -388,7 +388,10 @@ TEST_F(TaskAllocatorTest, TaskWindowSaturates) {
 TEST_F(TaskAllocatorTest, TaskIdNearInt32Max) {
     current_index.store(INT32_MAX - 2);
     last_alive.store(INT32_MAX - 2);
-    allocator.init(descriptors.data(), WINDOW_SIZE, &current_index, &last_alive, heap_buf, HEAP_SIZE, &error_code);
+    allocator.init(
+        descriptors.data(), WINDOW_SIZE, &current_index, &last_alive, heap_buf, HEAP_SIZE, &error_code,
+        /*initial_local_task_id=*/INT32_MAX - 2
+    );
 
     auto r1 = allocator.alloc(0);
     ASSERT_FALSE(r1.failed());
diff --git a/tests/ut/cpp/a2a3/test_task_state.cpp b/tests/ut/cpp/a2a3/test_task_state.cpp
index 729b74999..a9655919b 100644
--- a/tests/ut/cpp/a2a3/test_task_state.cpp
+++ b/tests/ut/cpp/a2a3/test_task_state.cpp
@@ -43,7 +43,8 @@ class TaskStateTest : public ::testing::Test {
         ASSERT_NE(sm_handle, nullptr);
         auto layout = PTO2SchedulerState::reserve_layout(sched_arena);
         ASSERT_NE(sched_arena.commit(), nullptr);
-        ASSERT_TRUE(sched.init_from_layout(layout, sched_arena, sm_handle->header));
+        ASSERT_TRUE(sched.init_data_from_layout(layout, sched_arena, sm_handle->header, PTO2_TASK_WINDOW_SIZE));
+        sched.wire_arena_pointers(layout, sched_arena);
     }
 
     void TearDown() override {
diff --git a/tests/ut/cpp/a2a3/test_tensormap.cpp b/tests/ut/cpp/a2a3/test_tensormap.cpp
index 204d00e42..df1789067 100644
--- a/tests/ut/cpp/a2a3/test_tensormap.cpp
+++ b/tests/ut/cpp/a2a3/test_tensormap.cpp
@@ -83,7 +83,8 @@ class TensorMapTest : public ::testing::Test {
         int32_t window_sizes[PTO2_MAX_RING_DEPTH] = {WINDOW_SIZE, WINDOW_SIZE, WINDOW_SIZE, WINDOW_SIZE};
         auto layout = PTO2TensorMap::reserve_layout(arena, NUM_BUCKETS, POOL_SIZE, window_sizes);
         ASSERT_NE(arena.commit(), nullptr);
-        ASSERT_TRUE(tmap.init_from_layout(layout, arena));
+        ASSERT_TRUE(tmap.init_data_from_layout(layout, arena));
+        tmap.wire_arena_pointers(layout, arena, /*parent_orch=*/nullptr);
     }
 
     void TearDown() override {
@@ -113,7 +114,8 @@ TEST_F(TensorMapTest, InitRequiresPowerOfTwoBuckets) {
     int32_t ws[PTO2_MAX_RING_DEPTH] = {8, 8, 8, 8};
     auto layout = PTO2TensorMap::reserve_layout(bad_arena, 8, 64, ws);
     ASSERT_NE(bad_arena.commit(), nullptr);
-    EXPECT_TRUE(bad.init_from_layout(layout, bad_arena));
+    EXPECT_TRUE(bad.init_data_from_layout(layout, bad_arena));
+    bad.wire_arena_pointers(layout, bad_arena, /*parent_orch=*/nullptr);
     bad.destroy();
 }
 
diff --git a/tests/ut/cpp/a2a3/test_wiring.cpp b/tests/ut/cpp/a2a3/test_wiring.cpp
index b01052a85..b3c11ead1 100644
--- a/tests/ut/cpp/a2a3/test_wiring.cpp
+++ b/tests/ut/cpp/a2a3/test_wiring.cpp
@@ -48,7 +48,8 @@ class WiringTest : public ::testing::Test {
         ASSERT_NE(sm_handle, nullptr);
         auto layout = PTO2SchedulerState::reserve_layout(sched_arena);
         ASSERT_NE(sched_arena.commit(), nullptr);
-        ASSERT_TRUE(sched.init_from_layout(layout, sched_arena, sm_handle->header));
+        ASSERT_TRUE(sched.init_data_from_layout(layout, sched_arena, sm_handle->header, PTO2_TASK_WINDOW_SIZE));
+        sched.wire_arena_pointers(layout, sched_arena);
     }
 
     void TearDown() override {

From c186f6ca4baca1bcf292203464b46d780b9706aa Mon Sep 17 00:00:00 2001
From: poursoul <poursoul@126.com>
Date: Mon, 25 May 2026 11:27:35 +0800
Subject: [PATCH 3/7] Refactor: post-review hardening for trb host-build arena

Address review feedback from PR #846:

- pto2_sm_layout::ring_task_descriptors_addr: take per-ring task_window_sizes[]
  array (mirroring PTO2SharedMemoryHandle's SM API) and assert ring_id range,
  so a future per-ring SM layout cannot silently disagree with the addresses
  the host bakes into the prebuilt image.
- DeviceRunner::acquire_pooled_runtime_arena (onboard + sim): return nullptr
  when runtime_arena_region_off_ == SIZE_MAX so a stray hbg-path call cannot
  resolve to base + SIZE_MAX. Failure is now explicit (a nullptr return)
  and contained at the acquire boundary.
- DeviceArena::attach(): rewrite doc to match real behavior (region table is
  not repopulated after attach, reserve() asserts !committed_ so cannot
  replay, region_size() returns 0); promote the pre-alignment / non-null /
  power-of-two checks from plain assert() to an unconditional abort() so
  release builds still trap on contract violations.
- PTO2TensorMap: drop the dead `orch` back-pointer field (a2a3 never
  dereferences it), strip parent_orch parameter from wire_arena_pointers,
  and remove the now-unused PTO2OrchestratorState forward declaration.
- PTO2RingFlowControl::init(): add a coupling comment so future fc-initial-
  value or boot-order changes flag PTO2TaskAllocator::init's
  initial_local_task_id default in the same edit.
- PTO2SchedulerState::init_data_from_layout / RingSchedState::
  init_data_from_layout: drop the task_window_size / dep_pool_capacity
  parameters that were never consumed (scheduler only needs SM base + ring
  index, both window-size-independent; orchestrator counterpart still takes
  task_window_size for ring_task_descriptors arithmetic). Updated all
  callsites (pto_runtime2_init.cpp + 3 cpput suites).
- PTO2Runtime::prebuilt_arena_base: removed the dead mirror field. The host
  Runtime's prebuilt_arena_base_ is the real source of truth (AICPU reads it
  to locate the pooled buffer *before* dereferencing the image); the
  PTO2Runtime image still carries prebuilt_layout, which the AICPU does
  consume.

cpput: 25/25 pass. a2a3sim trb: dummy_task / dynamic_register / L2 trb
suite pass with --build.
---
 .../platform/onboard/host/device_runner.cpp   |  4 ++
 src/a2a3/platform/sim/host/device_runner.cpp  |  4 ++
 .../host/dep_gen_replay.cpp                   |  9 ++--
 .../host/runtime_maker.cpp                    | 10 ++---
 .../runtime/pto_runtime2.h                    | 16 +++----
 .../runtime/pto_shared_memory.h               | 26 ++++++++----
 .../runtime/pto_tensormap.h                   | 11 +----
 .../runtime/scheduler/pto_scheduler.h         | 15 ++++---
 .../runtime/shared/pto_runtime2_init.cpp      | 20 +++++----
 .../runtime/shared/pto_tensormap.cpp          |  5 +--
 src/common/device_comm/device_arena.h         | 42 ++++++++++++++-----
 tests/ut/cpp/a2a3/test_scheduler_state.cpp    |  2 +-
 tests/ut/cpp/a2a3/test_task_state.cpp         |  2 +-
 tests/ut/cpp/a2a3/test_tensormap.cpp          |  4 +-
 tests/ut/cpp/a2a3/test_wiring.cpp             |  2 +-
 15 files changed, 102 insertions(+), 70 deletions(-)

diff --git a/src/a2a3/platform/onboard/host/device_runner.cpp b/src/a2a3/platform/onboard/host/device_runner.cpp
index e3ba6cd10..8d2d9916b 100644
--- a/src/a2a3/platform/onboard/host/device_runner.cpp
+++ b/src/a2a3/platform/onboard/host/device_runner.cpp
@@ -299,6 +299,10 @@ void *DeviceRunner::acquire_pooled_gm_sm() {
 
 void *DeviceRunner::acquire_pooled_runtime_arena() {
     if (!static_arena_.is_committed()) return nullptr;
+    // hbg calls setup_static_arena(...,0) and never reserves a runtime-arena
+    // region — fail loudly if a caller asks for it anyway, rather than
+    // returning region_ptr(SIZE_MAX) (base + SIZE_MAX is undefined).
+    if (runtime_arena_region_off_ == SIZE_MAX) return nullptr;
     return static_arena_.region_ptr(runtime_arena_region_off_);
 }
 
diff --git a/src/a2a3/platform/sim/host/device_runner.cpp b/src/a2a3/platform/sim/host/device_runner.cpp
index 53d967228..c221bb714 100644
--- a/src/a2a3/platform/sim/host/device_runner.cpp
+++ b/src/a2a3/platform/sim/host/device_runner.cpp
@@ -170,6 +170,10 @@ void *DeviceRunner::acquire_pooled_gm_sm() {
 
 void *DeviceRunner::acquire_pooled_runtime_arena() {
     if (!static_arena_.is_committed()) return nullptr;
+    // hbg calls setup_static_arena(...,0) and never reserves a runtime-arena
+    // region — fail loudly if a caller asks for it anyway, rather than
+    // returning region_ptr(SIZE_MAX) (base + SIZE_MAX is undefined).
+    if (runtime_arena_region_off_ == SIZE_MAX) return nullptr;
     return static_arena_.region_ptr(runtime_arena_region_off_);
 }
 
diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/host/dep_gen_replay.cpp b/src/a2a3/runtime/tensormap_and_ringbuffer/host/dep_gen_replay.cpp
index 506ba7cf6..71a482632 100644
--- a/src/a2a3/runtime/tensormap_and_ringbuffer/host/dep_gen_replay.cpp
+++ b/src/a2a3/runtime/tensormap_and_ringbuffer/host/dep_gen_replay.cpp
@@ -492,11 +492,10 @@ dep_gen_replay_emit_deps_json(const DepGenRecord *records, size_t num_records, c
         LOG_ERROR("dep_gen replay: tensormap.init failed (buckets=%d, pool=%d)", PTO2_TENSORMAP_NUM_BUCKETS, pool_size);
         return -3;
     }
-    // Replay tensormaps live entirely on host; both arena base and the
-    // parent-orch self-pointer use host addresses. parent_orch is unused by
-    // the lookup/insert code paths exercised below — nullptr is safe.
-    tm_oracle.wire_arena_pointers(oracle_layout, replay_arena, nullptr);
-    tm_annot.wire_arena_pointers(annot_layout, replay_arena, nullptr);
+    // Replay tensormaps live entirely on host; only arena-internal pointer
+    // fields need wiring (no parent-orch back-reference exists anymore).
+    tm_oracle.wire_arena_pointers(oracle_layout, replay_arena);
+    tm_annot.wire_arena_pointers(annot_layout, replay_arena);
 
     // JSON output accumulators.
     std::vector<TaskTableEntry> task_table;
diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/host/runtime_maker.cpp b/src/a2a3/runtime/tensormap_and_ringbuffer/host/runtime_maker.cpp
index 3b278b2b4..750374683 100644
--- a/src/a2a3/runtime/tensormap_and_ringbuffer/host/runtime_maker.cpp
+++ b/src/a2a3/runtime/tensormap_and_ringbuffer/host/runtime_maker.cpp
@@ -344,11 +344,11 @@ extern "C" int bind_prepared_to_runtime_impl(
     }
     runtime_wire_arena_pointers(host_arena, layout, rt);
 
-    // Stash the prebuilt metadata inside the PTO2Runtime image so the AICPU
-    // picks them up directly via the pooled buffer after rtMemcpy. The host
-    // Runtime also carries the pointers so the AICPU can locate the
-    // PTO2Runtime before it does anything else (no chicken-and-egg).
-    rt->prebuilt_arena_base = runtime_arena_dev;
+    // Stash the layout inside the PTO2Runtime image so the AICPU can recover
+    // every arena-internal offset after rtMemcpy. The runtime arena's device
+    // base does NOT travel in this image — it's on the host Runtime
+    // (set_prebuilt_arena below), since the AICPU needs that pointer
+    // *before* it can dereference the image.
     rt->prebuilt_layout = layout;
 
     int rc_upload = runtime->host_api.copy_to_device(runtime_arena_dev, host_arena.base(), layout.arena_size);
diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_runtime2.h b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_runtime2.h
index 169937f82..460624e69 100644
--- a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_runtime2.h
+++ b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_runtime2.h
@@ -143,14 +143,14 @@ struct PTO2Runtime {
     // Statistics
     int64_t total_cycles;
 
-    // Prebuilt-arena fast path metadata. `prebuilt_arena_base` is the device
-    // address of the runtime arena (the buffer that holds *this* PTO2Runtime
-    // at offset prebuilt_layout.off_runtime). `prebuilt_layout` carries every
-    // offset wire_arena_pointers needs at AICPU boot, so the AICPU can
-    // reconstruct all arena-internal pointer fields without re-running
-    // init_data. Populated by the host's runtime_init_data_from_layout +
-    // runtime_wire_arena_pointers; read by aicpu_executor.cpp.
-    void *prebuilt_arena_base{nullptr};
+    // Prebuilt-arena fast path metadata. Carries every offset
+    // wire_arena_pointers needs at AICPU boot so the AICPU can reconstruct
+    // all arena-internal pointer fields without re-running init_data. The
+    // device base of the runtime arena travels separately on the host-side
+    // Runtime (Runtime::prebuilt_arena_base_), since the AICPU needs it
+    // *before* dereferencing this image. Populated on host by
+    // runtime_init_data_from_layout + runtime_wire_arena_pointers; read by
+    // aicpu_executor.cpp.
     PTO2RuntimeArenaLayout prebuilt_layout;
 };
 
diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_shared_memory.h b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_shared_memory.h
index c8de35ba6..98b832510 100644
--- a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_shared_memory.h
+++ b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_shared_memory.h
@@ -58,6 +58,13 @@ struct alignas(64) PTO2RingFlowControl {
     // === Cache Line 1: Written by Scheduler, Read by Orchestrator (for back-pressure) ===
     alignas(64) std::atomic<int32_t> last_task_alive;  // Task ring tail (oldest active task)
 
+    // Per-boot SM reset. PTO2TaskAllocator::init() seeds its private
+    // local_task_id_ from initial_local_task_id (default 0 in production)
+    // *without* dereferencing current_task_index — it relies on this reset
+    // running on every AICPU boot so 0 stays in sync. If you ever change
+    // the initial fc value or the boot ordering, update the default in
+    // PTO2TaskAllocator::init (pto_ring_buffer.h) in the same change, or
+    // submit IDs will be off by the divergence.
     void init() {
         current_task_index.store(0, std::memory_order_relaxed);
         last_task_alive.store(0, std::memory_order_relaxed);
@@ -232,17 +239,20 @@ inline std::atomic<int32_t> *ring_last_task_alive_addr(void *sm_dev_base, int ri
 }
 
 // Walk the per-ring SM layout (same arithmetic as setup_pointers_per_ring)
-// to compute ring `ring_id`'s task_descriptors device address. Uniform
-// per-ring task_window_size; matches the production callsite which always
-// passes a uniform window size to runtime_create_from_sm.
-inline PTO2TaskDescriptor *
-ring_task_descriptors_addr(void *sm_dev_base, uint64_t task_window_size, int ring_id) noexcept {
+// to compute ring `ring_id`'s task_descriptors device address. Accepts a
+// per-ring window-size array so the helper's signature mirrors
+// `PTO2SharedMemoryHandle::setup_pointers_per_ring` and cannot silently
+// disagree with the SM layout when (hypothetically) ring sizes diverge.
+inline PTO2TaskDescriptor *ring_task_descriptors_addr(
+    void *sm_dev_base, const uint64_t task_window_sizes[PTO2_MAX_RING_DEPTH], int ring_id
+) noexcept {
+    assert(ring_id >= 0 && ring_id < PTO2_MAX_RING_DEPTH && "pto2_sm_layout: ring_id out of range");
     char *p = static_cast<char *>(sm_dev_base);
     p += PTO2_ALIGN_UP(sizeof(PTO2SharedMemoryHeader), PTO2_ALIGN_SIZE);
     for (int r = 0; r < ring_id; r++) {
-        p += PTO2_ALIGN_UP(task_window_size * sizeof(PTO2TaskDescriptor), PTO2_ALIGN_SIZE);
-        p += PTO2_ALIGN_UP(task_window_size * sizeof(PTO2TaskPayload), PTO2_ALIGN_SIZE);
-        p += PTO2_ALIGN_UP(task_window_size * sizeof(PTO2TaskSlotState), PTO2_ALIGN_SIZE);
+        p += PTO2_ALIGN_UP(task_window_sizes[r] * sizeof(PTO2TaskDescriptor), PTO2_ALIGN_SIZE);
+        p += PTO2_ALIGN_UP(task_window_sizes[r] * sizeof(PTO2TaskPayload), PTO2_ALIGN_SIZE);
+        p += PTO2_ALIGN_UP(task_window_sizes[r] * sizeof(PTO2TaskSlotState), PTO2_ALIGN_SIZE);
     }
     return reinterpret_cast<PTO2TaskDescriptor *>(p);
 }
diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_tensormap.h b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_tensormap.h
index 11decdf4e..b63f20676 100644
--- a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_tensormap.h
+++ b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_tensormap.h
@@ -47,8 +47,6 @@
 #include "pto_runtime2_types.h"
 #include "tensor.h"
 
-struct PTO2OrchestratorState;  // forward declare
-
 /**
  * Layout descriptor produced by PTO2TensorMap::reserve_layout(). Stores the
  * region offsets returned by DeviceArena::reserve() so init_from_layout()
@@ -369,8 +367,6 @@ struct PTO2TensorMap {
     // Per-ring cleanup progress (for periodic cleanup_retired)
     int32_t last_cleanup[PTO2_MAX_RING_DEPTH]{};
 
-    PTO2OrchestratorState *orch{nullptr};
-
     uint32_t get_task_local_id_slot(uint8_t ring_id, uint32_t task_local_id) const {
         return task_local_id & (task_window_sizes[ring_id] - 1);
     }
@@ -436,7 +432,7 @@ struct PTO2TensorMap {
 
     /**
      * Phase 3a: write everything *except* arena-internal pointer fields
-     * (buckets, entry_pool, free_entry_list, task_entry_heads[r], orch).
+     * (buckets, entry_pool, free_entry_list, task_entry_heads[r]).
      * Uses arena.region_ptr to address the arena regions for data writes,
      * but does not store those addresses in struct fields. Safe to call on
      * a host arena that holds the prebuilt image.
@@ -446,11 +442,8 @@ struct PTO2TensorMap {
     /**
      * Phase 3b: write the arena-internal pointer fields. Idempotent;
      * called once on the host arena and once on the AICPU after attach.
-     * `parent_orch` is the device address (or host-mirror address) of the
-     * enclosing PTO2OrchestratorState; we store it in tensor_map.orch
-     * (self-pointer within the same arena).
      */
-    void wire_arena_pointers(const PTO2TensorMapLayout &layout, DeviceArena &arena, PTO2OrchestratorState *parent_orch);
+    void wire_arena_pointers(const PTO2TensorMapLayout &layout, DeviceArena &arena);
 
     /**
      * Tear down state. Does not free memory — the arena owns the backing
diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler/pto_scheduler.h b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler/pto_scheduler.h
index 828999113..510187feb 100644
--- a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler/pto_scheduler.h
+++ b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler/pto_scheduler.h
@@ -583,7 +583,7 @@ struct PTO2SchedulerState {
         // by SchedulerState::wire_arena_pointers). The `ring` field stores
         // the device address of the SM ring header — computed via offset
         // arithmetic, no SM dereference.
-        bool init_data_from_layout(void *sm_dev_base, int32_t ring_id, int32_t dep_pool_capacity);
+        bool init_data_from_layout(void *sm_dev_base, int32_t ring_id);
         void destroy();
 
         void sync_to_sm() { ring->fc.last_task_alive.store(last_task_alive, std::memory_order_release); }
@@ -1068,13 +1068,12 @@ struct PTO2SchedulerState {
 
     // Phase 3a: write everything *except* arena-internal pointer fields.
     // `sm_dev_base` is the device address of the SM (only stored, never
-    // dereferenced here); `task_window_size` lets the per-ring data-addr
-    // arithmetic resolve ring task_descriptors / fc field addresses without
-    // an SM load. Safe to call on a host arena that holds the prebuilt
-    // image buffer.
-    bool init_data_from_layout(
-        const PTO2SchedulerLayout &layout, DeviceArena &arena, void *sm_dev_base, uint64_t task_window_size
-    );
+    // dereferenced here). Safe to call on a host arena that holds the
+    // prebuilt image buffer. (The orchestrator counterpart takes
+    // task_window_size for ring task_descriptors address arithmetic; the
+    // scheduler only needs the SM header / ring header base addresses,
+    // both window-size-independent.)
+    bool init_data_from_layout(const PTO2SchedulerLayout &layout, DeviceArena &arena, void *sm_dev_base);
 
     // Phase 3b: write the arena-internal pointer fields
     // (ready_queues[].slots, dummy_ready_queue.slots, dep_pool.base for each
diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/shared/pto_runtime2_init.cpp b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/shared/pto_runtime2_init.cpp
index 3efa313fd..d66acfcc4 100644
--- a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/shared/pto_runtime2_init.cpp
+++ b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/shared/pto_runtime2_init.cpp
@@ -70,9 +70,7 @@ void ready_queue_destroy(PTO2ReadyQueue *queue) {
 // Scheduler
 // =============================================================================
 
-bool PTO2SchedulerState::RingSchedState::init_data_from_layout(
-    void *sm_dev_base, int32_t ring_id, int32_t /*dep_pool_capacity*/
-) {
+bool PTO2SchedulerState::RingSchedState::init_data_from_layout(void *sm_dev_base, int32_t ring_id) {
     // ring stores the device address of the SM ring header — pure offset
     // arithmetic, no SM load.
     ring = pto2_sm_layout::ring_header_addr(sm_dev_base, ring_id);
@@ -111,7 +109,7 @@ PTO2SchedulerLayout PTO2SchedulerState::reserve_layout(DeviceArena &arena, int32
 }
 
 bool PTO2SchedulerState::init_data_from_layout(
-    const PTO2SchedulerLayout &layout, DeviceArena &arena, void *sm_dev_base, uint64_t /*task_window_size*/
+    const PTO2SchedulerLayout &layout, DeviceArena &arena, void *sm_dev_base
 ) {
     PTO2SchedulerState *sched = this;
     sched->sm_header = reinterpret_cast<PTO2SharedMemoryHeader *>(sm_dev_base);
@@ -121,7 +119,7 @@ bool PTO2SchedulerState::init_data_from_layout(
 #endif
 
     for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) {
-        if (!sched->ring_sched_states[r].init_data_from_layout(sm_dev_base, r, layout.dep_pool_capacity)) {
+        if (!sched->ring_sched_states[r].init_data_from_layout(sm_dev_base, r)) {
             return false;
         }
     }
@@ -220,10 +218,16 @@ bool PTO2OrchestratorState::init_data_from_layout(
     orch->gm_heap_size = heap_size * PTO2_MAX_RING_DEPTH;
     orch->fatal = false;
 
+    // Mirror the SM API's per-ring window-size shape so a future per-ring
+    // SM layout cannot silently disagree with the addresses we compute here.
+    uint64_t task_window_sizes[PTO2_MAX_RING_DEPTH];
+    for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++)
+        task_window_sizes[r] = task_window_size;
+
     auto *orch_err = pto2_sm_layout::orch_error_code_addr(sm_dev_base);
     for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) {
         void *ring_heap_base = reinterpret_cast<char *>(gm_heap) + r * heap_size;
-        auto *task_descs_dev = pto2_sm_layout::ring_task_descriptors_addr(sm_dev_base, task_window_size, r);
+        auto *task_descs_dev = pto2_sm_layout::ring_task_descriptors_addr(sm_dev_base, task_window_sizes, r);
         auto *cur_idx_dev = pto2_sm_layout::ring_current_task_index_addr(sm_dev_base, r);
         auto *last_alive_dev = pto2_sm_layout::ring_last_task_alive_addr(sm_dev_base, r);
 
@@ -259,7 +263,7 @@ void PTO2OrchestratorState::wire_arena_pointers(
     for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) {
         orch->rings[r].fanin_pool.base = static_cast<PTO2FaninSpillEntry *>(arena.region_ptr(layout.off_fanin_pool[r]));
     }
-    orch->tensor_map.wire_arena_pointers(layout.tensor_map, arena, orch);
+    orch->tensor_map.wire_arena_pointers(layout.tensor_map, arena);
     orch->scope_tasks = static_cast<PTO2TaskSlotState **>(arena.region_ptr(layout.off_scope_tasks));
     orch->scope_begins = static_cast<int32_t *>(arena.region_ptr(layout.off_scope_begins));
     orch->scheduler = scheduler_arg;
@@ -324,7 +328,7 @@ PTO2Runtime *runtime_init_data_from_layout(
         )) {
         return nullptr;
     }
-    if (!rt->scheduler.init_data_from_layout(layout.sched, arena, sm_dev_base, layout.task_window_size)) {
+    if (!rt->scheduler.init_data_from_layout(layout.sched, arena, sm_dev_base)) {
         return nullptr;
     }
 
diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/shared/pto_tensormap.cpp b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/shared/pto_tensormap.cpp
index da9d4fddf..b99c67233 100644
--- a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/shared/pto_tensormap.cpp
+++ b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/shared/pto_tensormap.cpp
@@ -129,16 +129,13 @@ bool PTO2TensorMap::init_data_from_layout(const PTO2TensorMapLayout &layout, Dev
     return true;
 }
 
-void PTO2TensorMap::wire_arena_pointers(
-    const PTO2TensorMapLayout &layout, DeviceArena &arena, PTO2OrchestratorState *parent_orch
-) {
+void PTO2TensorMap::wire_arena_pointers(const PTO2TensorMapLayout &layout, DeviceArena &arena) {
     buckets = static_cast<PTO2TensorMapEntry **>(arena.region_ptr(layout.off_buckets));
     entry_pool = static_cast<PTO2TensorMapEntry *>(arena.region_ptr(layout.off_entry_pool));
     free_entry_list = static_cast<PTO2TensorMapEntry **>(arena.region_ptr(layout.off_free_entry_list));
     for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) {
         task_entry_heads[r] = static_cast<PTO2TensorMapEntry **>(arena.region_ptr(layout.off_task_entry_heads[r]));
     }
-    orch = parent_orch;
 }
 
 void PTO2TensorMap::destroy() {
diff --git a/src/common/device_comm/device_arena.h b/src/common/device_comm/device_arena.h
index ad43d1869..ffe34c479 100644
--- a/src/common/device_comm/device_arena.h
+++ b/src/common/device_comm/device_arena.h
@@ -96,10 +96,22 @@ class DeviceArena {
     void *commit(size_t base_align = kDefaultBaseAlign);
 
     // Phase 2 alternative: attach to an externally-owned buffer instead of
-    // allocating one. Caller guarantees `external_base` is at least the size
-    // reported by `total_size()` rounded up to `base_align`, and that the
-    // same reserve() sequence has been (or will be) replayed. Forward-aligns
-    // the visible base in the same way as commit().
+    // allocating one. Caller guarantees:
+    //   (a) `external_base` is already `base_align`-aligned — attach does
+    //       NOT forward-align, since the prebuilt image was constructed for
+    //       the address the caller advertised;
+    //   (b) the buffer is at least `total_size()` bytes (the sum of sizes
+    //       passed to reserve()), since attach uses no forward-alignment
+    //       slack of its own;
+    //   (c) all region offsets the caller plans to read back via
+    //       `region_ptr(off)` are held by the caller — attach does NOT
+    //       repopulate the internal region table, and reserve() cannot run
+    //       after attach (it asserts !committed_). `region_size()` likewise
+    //       returns 0 for attached arenas; treat the arena post-attach as
+    //       a base-pointer wrapper.
+    //
+    // Re-attach (release + attach the same or another buffer) is permitted
+    // so the AICPU boot path can rebind the same pooled image each run.
     //
     // The external buffer is NOT freed by release()/~DeviceArena(); ownership
     // stays with the caller. Used for the prebuilt-arena fast path where
@@ -112,7 +124,10 @@ class DeviceArena {
     void *region_ptr(size_t offset) const noexcept;
 
     // Size of the sub-region whose offset matches `offset`. Linear scan;
-    // intended for debug / assertions, not hot path.
+    // intended for debug / assertions, not hot path. Returns 0 for an
+    // attached arena (attach() does not repopulate the region table) —
+    // callers in the prebuilt-image path keep sizes alongside their offsets
+    // instead.
     size_t region_size(size_t offset) const noexcept;
 
     // Free the backing buffer (if any) and reset to the pre-commit state so
@@ -190,15 +205,22 @@ inline void DeviceArena::attach(void *external_base, size_t base_align) noexcept
         assert(attached_ && "DeviceArena::attach() called after commit (only re-attach is allowed)");
         release();
     }
-    assert(external_base != nullptr && "DeviceArena::attach() requires non-null base");
-    assert(base_align > 0 && (base_align & (base_align - 1)) == 0 && "DeviceArena: base_align must be a power of two");
     // The external buffer must already be base_align-aligned by the caller —
     // forward-align in-place would shift the visible base off the address the
     // caller advertised (and that the prebuilt image was constructed for).
+    // The checks below are promoted to unconditional aborts (rather than
+    // plain assert()) because a misaligned attach silently produces a buffer
+    // whose visible base disagrees with every offset the prebuilt image was
+    // laid out against — release builds, which strip assert(), would still
+    // run on a corrupted arena. Aborting at the breakage point is far cheaper
+    // to triage than the downstream wild-pointer accesses.
     const auto raw = reinterpret_cast<uintptr_t>(external_base);
-    (void)raw;
-    (void)base_align;
-    assert((raw & (static_cast<uintptr_t>(base_align) - 1)) == 0 && "DeviceArena::attach() base must be pre-aligned");
+    const bool ok = (external_base != nullptr) && (base_align > 0) && ((base_align & (base_align - 1)) == 0) &&
+                    ((raw & (static_cast<uintptr_t>(base_align) - 1)) == 0);
+    if (!ok) {
+        assert(false && "DeviceArena::attach(): null base, non-power-of-two align, or pre-alignment violated");
+        std::abort();
+    }
     base_ = external_base;
     raw_base_ = nullptr;
     raw_size_ = 0;
diff --git a/tests/ut/cpp/a2a3/test_scheduler_state.cpp b/tests/ut/cpp/a2a3/test_scheduler_state.cpp
index 37e9d18ca..75476dedf 100644
--- a/tests/ut/cpp/a2a3/test_scheduler_state.cpp
+++ b/tests/ut/cpp/a2a3/test_scheduler_state.cpp
@@ -34,7 +34,7 @@ class SchedulerStateTest : public ::testing::Test {
         ASSERT_NE(sm_handle, nullptr);
         auto layout = PTO2SchedulerState::reserve_layout(sched_arena);
         ASSERT_NE(sched_arena.commit(), nullptr);
-        ASSERT_TRUE(sched.init_data_from_layout(layout, sched_arena, sm_handle->header, PTO2_TASK_WINDOW_SIZE));
+        ASSERT_TRUE(sched.init_data_from_layout(layout, sched_arena, sm_handle->header));
         sched.wire_arena_pointers(layout, sched_arena);
     }
 
diff --git a/tests/ut/cpp/a2a3/test_task_state.cpp b/tests/ut/cpp/a2a3/test_task_state.cpp
index a9655919b..ffced6f9a 100644
--- a/tests/ut/cpp/a2a3/test_task_state.cpp
+++ b/tests/ut/cpp/a2a3/test_task_state.cpp
@@ -43,7 +43,7 @@ class TaskStateTest : public ::testing::Test {
         ASSERT_NE(sm_handle, nullptr);
         auto layout = PTO2SchedulerState::reserve_layout(sched_arena);
         ASSERT_NE(sched_arena.commit(), nullptr);
-        ASSERT_TRUE(sched.init_data_from_layout(layout, sched_arena, sm_handle->header, PTO2_TASK_WINDOW_SIZE));
+        ASSERT_TRUE(sched.init_data_from_layout(layout, sched_arena, sm_handle->header));
         sched.wire_arena_pointers(layout, sched_arena);
     }
 
diff --git a/tests/ut/cpp/a2a3/test_tensormap.cpp b/tests/ut/cpp/a2a3/test_tensormap.cpp
index df1789067..805a9e079 100644
--- a/tests/ut/cpp/a2a3/test_tensormap.cpp
+++ b/tests/ut/cpp/a2a3/test_tensormap.cpp
@@ -84,7 +84,7 @@ class TensorMapTest : public ::testing::Test {
         auto layout = PTO2TensorMap::reserve_layout(arena, NUM_BUCKETS, POOL_SIZE, window_sizes);
         ASSERT_NE(arena.commit(), nullptr);
         ASSERT_TRUE(tmap.init_data_from_layout(layout, arena));
-        tmap.wire_arena_pointers(layout, arena, /*parent_orch=*/nullptr);
+        tmap.wire_arena_pointers(layout, arena);
     }
 
     void TearDown() override {
@@ -115,7 +115,7 @@ TEST_F(TensorMapTest, InitRequiresPowerOfTwoBuckets) {
     auto layout = PTO2TensorMap::reserve_layout(bad_arena, 8, 64, ws);
     ASSERT_NE(bad_arena.commit(), nullptr);
     EXPECT_TRUE(bad.init_data_from_layout(layout, bad_arena));
-    bad.wire_arena_pointers(layout, bad_arena, /*parent_orch=*/nullptr);
+    bad.wire_arena_pointers(layout, bad_arena);
     bad.destroy();
 }
 
diff --git a/tests/ut/cpp/a2a3/test_wiring.cpp b/tests/ut/cpp/a2a3/test_wiring.cpp
index b3c11ead1..1e8fee9c5 100644
--- a/tests/ut/cpp/a2a3/test_wiring.cpp
+++ b/tests/ut/cpp/a2a3/test_wiring.cpp
@@ -48,7 +48,7 @@ class WiringTest : public ::testing::Test {
         ASSERT_NE(sm_handle, nullptr);
         auto layout = PTO2SchedulerState::reserve_layout(sched_arena);
         ASSERT_NE(sched_arena.commit(), nullptr);
-        ASSERT_TRUE(sched.init_data_from_layout(layout, sched_arena, sm_handle->header, PTO2_TASK_WINDOW_SIZE));
+        ASSERT_TRUE(sched.init_data_from_layout(layout, sched_arena, sm_handle->header));
         sched.wire_arena_pointers(layout, sched_arena);
     }
 

From 7cdb55cbad699b3c2158ea5bea6a9f1c08aec8c9 Mon Sep 17 00:00:00 2001
From: poursoul <poursoul@126.com>
Date: Wed, 27 May 2026 11:56:12 +0800
Subject: [PATCH 4/7] Refactor: mirror trb host-build arena to a5
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Sync of PR #846 commit 2/3 to a5 — commit 1 (slot_state.bind split)
was already mirrored. Brings the a5 trb runtime up to the same
host-build arena fast path as a2a3.

- 4-phase API (reserve_layout / init_data_from_layout /
  wire_arena_pointers / finalize_after_wire) replaces
  runtime_create_from_sm.
- New runtime/shared/pto_runtime2_init.cpp (~355 lines) and
  shared/pto_tensormap.cpp (the old runtime/pto_tensormap.cpp
  moved + split) hold the host-pluggable cold-path code lifted from
  pto_runtime2.cpp / pto_orchestrator.cpp / scheduler/pto_scheduler.cpp.
- AICPU boot becomes attach + wire + sm_handle->init + finalize.
- runtime_maker.cpp pre-builds the arena image on host and rtMemcpys
  it into a pooled runtime-arena region; onboard + sim DeviceRunner
  setup_static_arena grow a third runtime_arena_size argument with
  matching acquire_pooled_runtime_arena (hbg path passes 0).

a5-specific divergences kept: enable_l2_swimlane (bool) instead of
L2PerfLevel, no dep_gen subsystem, wait_init_complete naming,
alignas(64) PTO2SpscQueue queue, cache_invalidate_range + cond.retire
in async_wait, RUNTIME_MAX_WORKER 108.

Tests
- cpput: 25/25 pass.
- a5sim: trb 21/21 + host_build_graph 6/6 pass.
- a2a3sim regression: trb 29/29 + host_build_graph 9/9 pass.
---
 .../platform/onboard/host/device_runner.cpp   |  29 +-
 src/a5/platform/onboard/host/device_runner.h  |  27 +-
 .../onboard/host/pto_runtime_c_api.cpp        |  13 +-
 src/a5/platform/sim/host/device_runner.cpp    |  24 +-
 src/a5/platform/sim/host/device_runner.h      |  22 +-
 .../platform/sim/host/pto_runtime_c_api.cpp   |  13 +-
 .../host_build_graph/runtime/runtime.h        |   3 +-
 .../aicpu/aicpu_executor.cpp                  |  64 +++-
 .../host/runtime_maker.cpp                    |  66 +++-
 .../runtime/pto_orchestrator.cpp              |  97 +----
 .../runtime/pto_orchestrator.h                |  22 +-
 .../runtime/pto_ring_buffer.h                 |  16 +-
 .../runtime/pto_runtime2.cpp                  |  84 +----
 .../runtime/pto_runtime2.h                    | 103 ++++-
 .../runtime/pto_runtime2_types.h              |   2 +-
 .../runtime/pto_shared_memory.h               |  73 +++-
 .../runtime/pto_tensormap.h                   |  22 +-
 .../runtime/runtime.h                         |  43 ++-
 .../runtime/scheduler/pto_scheduler.cpp       | 147 --------
 .../runtime/scheduler/pto_scheduler.h         |  53 ++-
 .../runtime/shared/pto_runtime2_init.cpp      | 355 ++++++++++++++++++
 .../runtime/shared/pto_shared_memory.cpp      |  17 +
 .../runtime/{ => shared}/pto_tensormap.cpp    |  48 ++-
 .../runtime/shared/runtime.cpp                |   9 +
 tests/ut/cpp/a5/test_scheduler_state.cpp      |   3 +-
 tests/ut/cpp/a5/test_task_allocator.cpp       |   5 +-
 tests/ut/cpp/a5/test_task_state.cpp           |   3 +-
 tests/ut/cpp/a5/test_wiring.cpp               |   3 +-
 28 files changed, 935 insertions(+), 431 deletions(-)
 create mode 100644 src/a5/runtime/tensormap_and_ringbuffer/runtime/shared/pto_runtime2_init.cpp
 rename src/a5/runtime/tensormap_and_ringbuffer/runtime/{ => shared}/pto_tensormap.cpp (82%)

diff --git a/src/a5/platform/onboard/host/device_runner.cpp b/src/a5/platform/onboard/host/device_runner.cpp
index 38242555d..377e0b8eb 100644
--- a/src/a5/platform/onboard/host/device_runner.cpp
+++ b/src/a5/platform/onboard/host/device_runner.cpp
@@ -195,29 +195,41 @@ static int prof_free_cb(void *dev_ptr) { return rtFree(dev_ptr); }
 
 DeviceRunner::~DeviceRunner() { finalize(); }
 
-int DeviceRunner::setup_static_arena(size_t gm_heap_size, size_t gm_sm_size) {
+int DeviceRunner::setup_static_arena(size_t gm_heap_size, size_t gm_sm_size, size_t runtime_arena_size) {
     if (static_arena_.is_committed()) {
-        if (gm_heap_size <= cached_gm_heap_size_ && gm_sm_size <= cached_gm_sm_size_) return 0;
+        // Idempotent for the production case (sizes do not change across a
+        // worker's lifetime). If a caller asks for a larger layout, redo it.
+        if (gm_heap_size <= cached_gm_heap_size_ && gm_sm_size <= cached_gm_sm_size_ &&
+            runtime_arena_size <= cached_runtime_arena_size_) {
+            return 0;
+        }
         static_arena_.release();
         gm_heap_region_off_ = SIZE_MAX;
         gm_sm_region_off_ = SIZE_MAX;
+        runtime_arena_region_off_ = SIZE_MAX;
         cached_gm_heap_size_ = 0;
         cached_gm_sm_size_ = 0;
+        cached_runtime_arena_size_ = 0;
     }
     gm_heap_region_off_ = static_arena_.reserve(gm_heap_size, DeviceArena::kDefaultBaseAlign);
     gm_sm_region_off_ = static_arena_.reserve(gm_sm_size, DeviceArena::kDefaultBaseAlign);
+    if (runtime_arena_size > 0) {
+        runtime_arena_region_off_ = static_arena_.reserve(runtime_arena_size, DeviceArena::kDefaultBaseAlign);
+    }
     if (static_arena_.commit(DeviceArena::kDefaultBaseAlign) == nullptr) {
-        // Roll back the two reserves: commit() failure leaves committed_=false,
+        // Roll back the reserves: commit() failure leaves committed_=false,
         // so the next entry would skip the release branch and stack new
         // reserves on top of the stale cursor. release() is idempotent on a
         // never-committed arena (just zeroes cursor_ / region_count_).
         static_arena_.release();
         gm_heap_region_off_ = SIZE_MAX;
         gm_sm_region_off_ = SIZE_MAX;
+        runtime_arena_region_off_ = SIZE_MAX;
         return -1;
     }
     cached_gm_heap_size_ = gm_heap_size;
     cached_gm_sm_size_ = gm_sm_size;
+    cached_runtime_arena_size_ = runtime_arena_size;
     return 0;
 }
 
@@ -231,6 +243,15 @@ void *DeviceRunner::acquire_pooled_gm_sm() {
     return static_arena_.region_ptr(gm_sm_region_off_);
 }
 
+void *DeviceRunner::acquire_pooled_runtime_arena() {
+    if (!static_arena_.is_committed()) return nullptr;
+    // hbg calls setup_static_arena(...,0) and never reserves a runtime-arena
+    // region — fail loudly if a caller asks for it anyway, rather than
+    // returning region_ptr(SIZE_MAX) (base + SIZE_MAX is undefined).
+    if (runtime_arena_region_off_ == SIZE_MAX) return nullptr;
+    return static_arena_.region_ptr(runtime_arena_region_off_);
+}
+
 std::thread DeviceRunner::create_thread(std::function<void()> fn) {
     int dev_id = device_id_;
     return std::thread([dev_id, fn = std::move(fn)]() {
@@ -1045,8 +1066,10 @@ int DeviceRunner::finalize() {
     static_arena_.release();
     gm_heap_region_off_ = SIZE_MAX;
     gm_sm_region_off_ = SIZE_MAX;
+    runtime_arena_region_off_ = SIZE_MAX;
     cached_gm_heap_size_ = 0;
     cached_gm_sm_size_ = 0;
+    cached_runtime_arena_size_ = 0;
 
     // Free all remaining allocations (including handshake buffer and binGmAddr)
     mem_alloc_.finalize();
diff --git a/src/a5/platform/onboard/host/device_runner.h b/src/a5/platform/onboard/host/device_runner.h
index a07ab28bb..9edad84fa 100644
--- a/src/a5/platform/onboard/host/device_runner.h
+++ b/src/a5/platform/onboard/host/device_runner.h
@@ -179,19 +179,30 @@ class DeviceRunner {
 
     /**
      * Lay out and commit the per-Worker static device arena that backs the
-     * PTO2 GM heap and PTO2 shared memory in a single underlying allocation.
-     * Must be called before acquire_pooled_gm_heap / acquire_pooled_gm_sm.
-     * Idempotent on identical sizes. Returns 0 on success, -1 on failure.
+     * PTO2 GM heap, the PTO2 shared memory, and the trb prebuilt runtime
+     * arena in a single underlying allocation. Must be called before any
+     * acquire_pooled_*. Idempotent on identical sizes. `runtime_arena_size`
+     * is 0 for the hbg path (no prebuilt runtime arena). Returns 0 on
+     * success, -1 on failure.
      */
-    int setup_static_arena(size_t gm_heap_size, size_t gm_sm_size);
+    int setup_static_arena(size_t gm_heap_size, size_t gm_sm_size, size_t runtime_arena_size);
 
     /**
-     * Return the pooled GM heap / PTO2 SM pointer. setup_static_arena must
-     * have been called earlier in this Worker; otherwise these return
-     * nullptr. Pointers are stable for the lifetime of the Worker.
+     * Return the pooled GM heap / PTO2 SM / runtime arena pointer.
+     * setup_static_arena must have been called earlier in this Worker;
+     * otherwise these return nullptr. All pointers are stable for the
+     * Worker's lifetime; the single underlying device buffer is released in
+     * `finalize()`.
+     *
+     * acquire_pooled_runtime_arena() is trb-only — the runtime arena region
+     * is only reserved when setup_static_arena was called with
+     * runtime_arena_size > 0. hbg's runtime_maker never calls this; when no
+     * runtime-arena region is reserved (e.g. after setup_static_arena(...,0))
+     * it returns nullptr. Keep call-site discipline at the runtime_maker layer.
      */
     void *acquire_pooled_gm_heap();
     void *acquire_pooled_gm_sm();
+    void *acquire_pooled_runtime_arena();
 
     /**
      * Create a thread bound to this device.
@@ -523,10 +534,12 @@ class DeviceRunner {
     DeviceArena static_arena_;
     size_t gm_heap_region_off_{SIZE_MAX};
     size_t gm_sm_region_off_{SIZE_MAX};
+    size_t runtime_arena_region_off_{SIZE_MAX};
     // Cached sizes for setup_static_arena's "fits" check — avoids calling
     // region_size() on the arena's public API for the two regions we own.
     size_t cached_gm_heap_size_{0};
     size_t cached_gm_sm_size_{0};
+    size_t cached_runtime_arena_size_{0};
 
     // Device resources
     rtStream_t stream_aicpu_{nullptr};
diff --git a/src/a5/platform/onboard/host/pto_runtime_c_api.cpp b/src/a5/platform/onboard/host/pto_runtime_c_api.cpp
index 0cc17c81f..1a2bb32a9 100644
--- a/src/a5/platform/onboard/host/pto_runtime_c_api.cpp
+++ b/src/a5/platform/onboard/host/pto_runtime_c_api.cpp
@@ -108,9 +108,9 @@ static uint64_t upload_chip_callable_buffer_wrapper(const void *callable) {
     }
 }
 
-static int setup_static_arena_wrapper(size_t gm_heap_size, size_t gm_sm_size) {
+static int setup_static_arena_wrapper(size_t gm_heap_size, size_t gm_sm_size, size_t runtime_arena_size) {
     try {
-        return current_runner()->setup_static_arena(gm_heap_size, gm_sm_size);
+        return current_runner()->setup_static_arena(gm_heap_size, gm_sm_size, runtime_arena_size);
     } catch (...) {
         return -1;
     }
@@ -132,6 +132,14 @@ static void *acquire_pooled_gm_sm_wrapper() {
     }
 }
 
+static void *acquire_pooled_runtime_arena_wrapper() {
+    try {
+        return current_runner()->acquire_pooled_runtime_arena();
+    } catch (...) {
+        return nullptr;
+    }
+}
+
 /* ===========================================================================
  * Public C API (resolved by ChipWorker via dlsym)
  * =========================================================================== */
@@ -426,6 +434,7 @@ int run_prepared(
         r->host_api.setup_static_arena = setup_static_arena_wrapper;
         r->host_api.acquire_pooled_gm_heap = acquire_pooled_gm_heap_wrapper;
         r->host_api.acquire_pooled_gm_sm = acquire_pooled_gm_sm_wrapper;
+        r->host_api.acquire_pooled_runtime_arena = acquire_pooled_runtime_arena_wrapper;
         r->host_api.upload_chip_callable_buffer = upload_chip_callable_buffer_wrapper;
 
         // Restore kernel addrs + orch symbol names + active_callable_id; the
diff --git a/src/a5/platform/sim/host/device_runner.cpp b/src/a5/platform/sim/host/device_runner.cpp
index c0d26fbe1..8cbac796c 100644
--- a/src/a5/platform/sim/host/device_runner.cpp
+++ b/src/a5/platform/sim/host/device_runner.cpp
@@ -112,29 +112,39 @@ static int prof_free_cb(void *dev_ptr) {
 
 DeviceRunner::~DeviceRunner() { finalize(); }
 
-int DeviceRunner::setup_static_arena(size_t gm_heap_size, size_t gm_sm_size) {
+int DeviceRunner::setup_static_arena(size_t gm_heap_size, size_t gm_sm_size, size_t runtime_arena_size) {
     if (static_arena_.is_committed()) {
-        if (gm_heap_size <= cached_gm_heap_size_ && gm_sm_size <= cached_gm_sm_size_) return 0;
+        if (gm_heap_size <= cached_gm_heap_size_ && gm_sm_size <= cached_gm_sm_size_ &&
+            runtime_arena_size <= cached_runtime_arena_size_) {
+            return 0;
+        }
         static_arena_.release();
         gm_heap_region_off_ = SIZE_MAX;
         gm_sm_region_off_ = SIZE_MAX;
+        runtime_arena_region_off_ = SIZE_MAX;
         cached_gm_heap_size_ = 0;
         cached_gm_sm_size_ = 0;
+        cached_runtime_arena_size_ = 0;
     }
     gm_heap_region_off_ = static_arena_.reserve(gm_heap_size, DeviceArena::kDefaultBaseAlign);
     gm_sm_region_off_ = static_arena_.reserve(gm_sm_size, DeviceArena::kDefaultBaseAlign);
+    if (runtime_arena_size > 0) {
+        runtime_arena_region_off_ = static_arena_.reserve(runtime_arena_size, DeviceArena::kDefaultBaseAlign);
+    }
     if (static_arena_.commit(DeviceArena::kDefaultBaseAlign) == nullptr) {
-        // Roll back the two reserves: commit() failure leaves committed_=false,
+        // Roll back the reserves: commit() failure leaves committed_=false,
         // so the next entry would skip the release branch and stack new
         // reserves on top of the stale cursor. release() is idempotent on a
         // never-committed arena (just zeroes cursor_ / region_count_).
         static_arena_.release();
         gm_heap_region_off_ = SIZE_MAX;
         gm_sm_region_off_ = SIZE_MAX;
+        runtime_arena_region_off_ = SIZE_MAX;
         return -1;
     }
     cached_gm_heap_size_ = gm_heap_size;
     cached_gm_sm_size_ = gm_sm_size;
+    cached_runtime_arena_size_ = runtime_arena_size;
     return 0;
 }
 
@@ -148,6 +158,12 @@ void *DeviceRunner::acquire_pooled_gm_sm() {
     return static_arena_.region_ptr(gm_sm_region_off_);
 }
 
+void *DeviceRunner::acquire_pooled_runtime_arena() {
+    if (!static_arena_.is_committed()) return nullptr;
+    if (runtime_arena_region_off_ == SIZE_MAX) return nullptr;
+    return static_arena_.region_ptr(runtime_arena_region_off_);
+}
+
 std::thread DeviceRunner::create_thread(std::function<void()> fn) {
     int dev_id = device_id_;
     return std::thread([dev_id, fn = std::move(fn)]() {
@@ -935,8 +951,10 @@ int DeviceRunner::finalize() {
     static_arena_.release();
     gm_heap_region_off_ = SIZE_MAX;
     gm_sm_region_off_ = SIZE_MAX;
+    runtime_arena_region_off_ = SIZE_MAX;
     cached_gm_heap_size_ = 0;
     cached_gm_sm_size_ = 0;
+    cached_runtime_arena_size_ = 0;
 
     // Free all remaining allocations
     mem_alloc_.finalize();
diff --git a/src/a5/platform/sim/host/device_runner.h b/src/a5/platform/sim/host/device_runner.h
index 0aa6e6fa1..f4fe44121 100644
--- a/src/a5/platform/sim/host/device_runner.h
+++ b/src/a5/platform/sim/host/device_runner.h
@@ -77,19 +77,25 @@ class DeviceRunner {
 
     /**
      * Lay out and commit the per-Worker static device arena that backs the
-     * PTO2 GM heap and PTO2 shared memory in a single underlying allocation.
-     * Must be called before acquire_pooled_gm_heap / acquire_pooled_gm_sm.
-     * Idempotent on identical sizes. Returns 0 on success, -1 on failure.
+     * PTO2 GM heap, the PTO2 shared memory, and the trb prebuilt runtime
+     * arena in a single underlying allocation. Must be called before any
+     * acquire_pooled_*. Idempotent on identical sizes. `runtime_arena_size`
+     * is 0 for the hbg path (no prebuilt runtime arena). Returns 0 on
+     * success, -1 on failure.
      */
-    int setup_static_arena(size_t gm_heap_size, size_t gm_sm_size);
+    int setup_static_arena(size_t gm_heap_size, size_t gm_sm_size, size_t runtime_arena_size);
 
     /**
-     * Return the pooled GM heap / PTO2 SM pointer. setup_static_arena must
-     * have been called earlier in this Worker; otherwise these return
-     * nullptr. Pointers are stable for the lifetime of the Worker.
+     * Return the pooled GM heap / PTO2 SM / runtime arena pointer.
+     * setup_static_arena must have been called earlier in this Worker;
+     * otherwise these return nullptr.
+     *
+     * acquire_pooled_runtime_arena() is trb-only — the region exists only
+     * when setup_static_arena was called with runtime_arena_size > 0.
      */
     void *acquire_pooled_gm_heap();
     void *acquire_pooled_gm_sm();
+    void *acquire_pooled_runtime_arena();
 
     /**
      * Create a thread bound to this device.
@@ -292,10 +298,12 @@ class DeviceRunner {
     DeviceArena static_arena_;
     size_t gm_heap_region_off_{SIZE_MAX};
     size_t gm_sm_region_off_{SIZE_MAX};
+    size_t runtime_arena_region_off_{SIZE_MAX};
     // Cached sizes for setup_static_arena's "fits" check — avoids calling
     // region_size() on the arena's public API for the two regions we own.
     size_t cached_gm_heap_size_{0};
     size_t cached_gm_sm_size_{0};
+    size_t cached_runtime_arena_size_{0};
 
     // Simulation state (no actual device resources)
     KernelArgs kernel_args_;
diff --git a/src/a5/platform/sim/host/pto_runtime_c_api.cpp b/src/a5/platform/sim/host/pto_runtime_c_api.cpp
index 81e9b138f..f2dc10b4e 100644
--- a/src/a5/platform/sim/host/pto_runtime_c_api.cpp
+++ b/src/a5/platform/sim/host/pto_runtime_c_api.cpp
@@ -103,9 +103,9 @@ static uint64_t upload_chip_callable_buffer_wrapper(const void *callable) {
     }
 }
 
-static int setup_static_arena_wrapper(size_t gm_heap_size, size_t gm_sm_size) {
+static int setup_static_arena_wrapper(size_t gm_heap_size, size_t gm_sm_size, size_t runtime_arena_size) {
     try {
-        return current_runner()->setup_static_arena(gm_heap_size, gm_sm_size);
+        return current_runner()->setup_static_arena(gm_heap_size, gm_sm_size, runtime_arena_size);
     } catch (...) {
         return -1;
     }
@@ -127,6 +127,14 @@ static void *acquire_pooled_gm_sm_wrapper() {
     }
 }
 
+static void *acquire_pooled_runtime_arena_wrapper() {
+    try {
+        return current_runner()->acquire_pooled_runtime_arena();
+    } catch (...) {
+        return nullptr;
+    }
+}
+
 /* ===========================================================================
  * Public C API (resolved by ChipWorker via dlsym)
  * =========================================================================== */
@@ -328,6 +336,7 @@ int run_prepared(
         r->host_api.setup_static_arena = setup_static_arena_wrapper;
         r->host_api.acquire_pooled_gm_heap = acquire_pooled_gm_heap_wrapper;
         r->host_api.acquire_pooled_gm_sm = acquire_pooled_gm_sm_wrapper;
+        r->host_api.acquire_pooled_runtime_arena = acquire_pooled_runtime_arena_wrapper;
         r->host_api.upload_chip_callable_buffer = upload_chip_callable_buffer_wrapper;
 
         auto bind_result = runner->bind_prepared_callable_to_runtime(*r, callable_id);
diff --git a/src/a5/runtime/host_build_graph/runtime/runtime.h b/src/a5/runtime/host_build_graph/runtime/runtime.h
index b9edf7020..25c6c13f4 100644
--- a/src/a5/runtime/host_build_graph/runtime/runtime.h
+++ b/src/a5/runtime/host_build_graph/runtime/runtime.h
@@ -146,9 +146,10 @@ struct HostApi {
     // pto_runtime_c_api.cpp can populate the same HostApi struct regardless of
     // which runtime variant it is built against. Unset for this variant; do
     // not call.
-    int (*setup_static_arena)(size_t gm_heap_size, size_t gm_sm_size);
+    int (*setup_static_arena)(size_t gm_heap_size, size_t gm_sm_size, size_t runtime_arena_size);
     void *(*acquire_pooled_gm_heap)();
     void *(*acquire_pooled_gm_sm)();
+    void *(*acquire_pooled_runtime_arena)();
     // Single-shot upload of the entire ChipCallable buffer. `callable` is a
     // `const ChipCallable *` (declared void* to avoid pulling task_interface
     // headers into runtime.h). DeviceRunner walks child_offsets_ to compute
diff --git a/src/a5/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp b/src/a5/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp
index bcea9b09e..49d55380f 100644
--- a/src/a5/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp
+++ b/src/a5/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp
@@ -125,8 +125,10 @@ struct AicpuExecutor {
     std::atomic<int32_t> finished_count_{0};
     std::atomic<bool> runtime_init_ready_{false};
 
-    // Per-Worker arena backing the PTO2Runtime + sm_handle + orch/sched/mailbox
-    // sub-regions (created in runtime_create_from_sm, released in runtime_destroy).
+    // Per-Worker arena attaching to the pooled prebuilt runtime image. Host
+    // populates the layout + data on its own arena, rtMemcpys into a pooled
+    // device buffer owned by DeviceRunner, and the AICPU attach()es to that
+    // buffer on each boot — no AICPU-side commit, no per-boot rtMalloc.
     // Default-constructed: libc-backed backend, no ctx.
     DeviceArena runtime_arena_;
 
@@ -466,29 +468,61 @@ int32_t AicpuExecutor::run(Runtime *runtime) {
                 static_cast<uint64_t>(task_window_size), static_cast<uint64_t>(heap_size), dep_pool_capacity
             );
 
-            void *sm_ptr = runtime->get_gm_sm_ptr();
-            void *gm_heap = runtime->get_gm_heap_ptr();
+            // gm_heap pointer / dep_pool_capacity are encoded into the prebuilt
+            // runtime arena image at host build time, so we no longer fetch
+            // them here. They remain on the host Runtime instance and on the
+            // PTO2Runtime header for diagnostic purposes only.
+            (void)dep_pool_capacity;
 
+            void *sm_ptr = runtime->get_gm_sm_ptr();
             uint64_t sm_size = PTO2SharedMemoryHandle::calculate_size(task_window_size);
-            rt = runtime_create_from_sm(
-                PTO2_MODE_EXECUTE, sm_ptr, sm_size, task_window_size, gm_heap, heap_size, runtime_arena_,
-                dep_pool_capacity
-            );
-            if (!rt) {
-                LOG_ERROR("Thread %d: Failed to create PTO2Runtime", thread_idx);
-                // Unblock scheduler threads before returning so they don't spin forever.
+
+            // Prebuilt-arena fast path. Host has pre-populated the entire
+            // runtime arena (PTO2Runtime + orchestrator/scheduler/tensor_map
+            // sub-regions + sm_handle wrapper + mailbox) and uploaded it via
+            // rtMemcpy into the pooled runtime_arena buffer. We attach to it,
+            // wire arena-internal pointers to their device addresses, reset
+            // the SM, and finalize the few device-only fields the host could
+            // not know at image-build time.
+            void *prebuilt_arena = runtime->get_prebuilt_arena_base();
+            size_t off_runtime = runtime->get_prebuilt_runtime_offset();
+            if (prebuilt_arena == nullptr) {
+                LOG_ERROR("Thread %d: prebuilt_arena_base is null", thread_idx);
+                runtime_init_ready_.store(true, std::memory_order_release);
+                return -1;
+            }
+            runtime_arena_.attach(prebuilt_arena, DeviceArena::kDefaultBaseAlign);
+            rt = reinterpret_cast<PTO2Runtime *>(static_cast<char *>(prebuilt_arena) + off_runtime);
+
+            // Wire every arena-internal pointer field (host wrote host-mirror
+            // addresses; we overwrite them with device addresses).
+            runtime_wire_arena_pointers(runtime_arena_, rt->prebuilt_layout, rt);
+
+            // Reset SM state. setup_pointers + init_header_per_ring restore
+            // ring flow-control counters, layout metadata, error flags, and
+            // the per-slot ring->slot_states[] (bind_ring + reset_for_reuse +
+            // fanin_count/active_mask zero — previously done inside
+            // RingSchedState::init).
+            memset(rt->sm_handle, 0, sizeof(*rt->sm_handle));
+            if (!rt->sm_handle->init(sm_ptr, sm_size, task_window_size, heap_size)) {
+                LOG_ERROR("Thread %d: sm_handle->init failed", thread_idx);
                 runtime_init_ready_.store(true, std::memory_order_release);
                 return -1;
             }
 
+            // AICore completion mailbox lives in the arena; reset it each
+            // boot so stale completion notifications from a previous run do
+            // not leak.
+            memset(rt->aicore_mailbox, 0, sizeof(*rt->aicore_mailbox));
+
+            // Fill ops / core counts (host can't resolve s_runtime_ops's
+            // device address nor know the SchedulerContext's core fan-out).
+            runtime_finalize_after_wire(rt, sched_ctx_.aic_count(), sched_ctx_.aiv_count());
+
 #if PTO2_PROFILING
             rt->orchestrator.l2_perf_level = get_l2_perf_level();
 #endif
 
-            // Total core counts = aic_count_ / aiv_count_ (set once at runtime init).
-            rt->orchestrator.total_cluster_count = sched_ctx_.aic_count();
-            rt->orchestrator.total_aiv_count = sched_ctx_.aiv_count();
-
             // With multi-ring, slot_states are per-ring inside the scheduler.
             runtime->set_slot_states_ptr(nullptr);
 
diff --git a/src/a5/runtime/tensormap_and_ringbuffer/host/runtime_maker.cpp b/src/a5/runtime/tensormap_and_ringbuffer/host/runtime_maker.cpp
index 0c7ac3872..9e1d00841 100644
--- a/src/a5/runtime/tensormap_and_ringbuffer/host/runtime_maker.cpp
+++ b/src/a5/runtime/tensormap_and_ringbuffer/host/runtime_maker.cpp
@@ -36,8 +36,10 @@
 #include <cstring>
 
 #include "../common/pto_runtime_status.h"
+#include "../runtime/pto_runtime2.h"
 #include "../runtime/pto_shared_memory.h"
 #include "../runtime/runtime.h"
+#include "device_arena.h"
 #include "callable.h"
 #include "common/platform_config.h"
 #include "common/unified_log.h"
@@ -271,15 +273,27 @@ extern "C" int bind_prepared_to_runtime_impl(
     uint64_t eff_heap_size = runtime->heap_size ? runtime->heap_size : PTO2_HEAP_SIZE;
     uint64_t eff_task_window_size = runtime->task_window_size ? runtime->task_window_size : PTO2_TASK_WINDOW_SIZE;
 
-    // Lay out the per-Worker static device arena. GM heap (orchestrator output
-    // buffers, all rings combined) and PTO2 shared memory live in a single
-    // backing allocation; setup_static_arena reserves both regions and
-    // commits in one shot. Owned by DeviceRunner across runs — do NOT record
-    // in tensor_pairs_; the free is deferred to DeviceRunner::finalize().
+    // Lay out the per-Worker static device arena. GM heap, PTO2 shared memory,
+    // and the prebuilt runtime arena all live in a single backing allocation;
+    // setup_static_arena reserves the three regions and commits in one shot.
+    // Owned by DeviceRunner across runs — do NOT record in tensor_pairs_; the
+    // free is deferred to DeviceRunner::finalize(). The runtime-arena size is
+    // determined by replaying the reserve sequence on a host-side arena.
     uint64_t total_heap_size = eff_heap_size * PTO2_MAX_RING_DEPTH;
     uint64_t sm_size = PTO2SharedMemoryHandle::calculate_size(eff_task_window_size);
+    int32_t eff_dep_pool_capacity =
+        runtime->dep_pool_size ? static_cast<int32_t>(runtime->dep_pool_size) : PTO2_DEP_LIST_POOL_SIZE;
+
+    int64_t t_prebuilt_start = _now_ms();
+    DeviceArena host_arena;  // libc malloc backend by default
+    PTO2RuntimeArenaLayout layout = runtime_reserve_layout(host_arena, eff_task_window_size, eff_dep_pool_capacity);
+    if (host_arena.commit(DeviceArena::kDefaultBaseAlign) == nullptr) {
+        LOG_ERROR("Failed to commit host arena for prebuilt runtime image");
+        return -1;
+    }
+
     int64_t t_setup_start = _now_ms();
-    if (runtime->host_api.setup_static_arena(total_heap_size, sm_size) != 0) {
+    if (runtime->host_api.setup_static_arena(total_heap_size, sm_size, layout.arena_size) != 0) {
         LOG_ERROR("Failed to setup pooled static arena");
         return -1;
     }
@@ -303,9 +317,48 @@ extern "C" int bind_prepared_to_runtime_impl(
     }
     runtime->set_gm_sm_ptr(sm_ptr);
 
+    void *runtime_arena_dev = runtime->host_api.acquire_pooled_runtime_arena();
+    if (runtime_arena_dev == nullptr) {
+        LOG_ERROR("Failed to acquire pooled runtime arena");
+        return -1;
+    }
+
     // Set up device orchestration state
     runtime->set_orch_args(device_args);
 
+    // -------------------------------------------------------------------------
+    // Build the prebuilt runtime-arena image on host.
+    //
+    // We pre-compute every byte the AICPU's runtime arena would otherwise have
+    // to write at boot: layout offsets, sub-structure init data, and pointers
+    // back to the SM / GM heap. Then we rtMemcpy the image into the pooled
+    // runtime-arena region that DeviceRunner keeps alive across runs. AICPU
+    // boot becomes attach + wire (cheap pointer fixup) + sm_handle->init (SM
+    // reset) + a handful of device-only field fixups.
+    // -------------------------------------------------------------------------
+    PTO2Runtime *rt =
+        runtime_init_data_from_layout(host_arena, layout, PTO2_MODE_EXECUTE, sm_ptr, sm_size, gm_heap, eff_heap_size);
+    if (rt == nullptr) {
+        LOG_ERROR("runtime_init_data_from_layout failed");
+        return -1;
+    }
+    runtime_wire_arena_pointers(host_arena, layout, rt);
+
+    // Stash the layout inside the PTO2Runtime image so the AICPU can recover
+    // every arena-internal offset after rtMemcpy. The runtime arena's device
+    // base does NOT travel in this image — it's on the host Runtime
+    // (set_prebuilt_arena below), since the AICPU needs that pointer
+    // *before* it can dereference the image.
+    rt->prebuilt_layout = layout;
+
+    int rc_upload = runtime->host_api.copy_to_device(runtime_arena_dev, host_arena.base(), layout.arena_size);
+    if (rc_upload != 0) {
+        LOG_ERROR("Failed to rtMemcpy prebuilt runtime arena to device (rc=%d)", rc_upload);
+        return -1;
+    }
+    runtime->set_prebuilt_arena(runtime_arena_dev, layout.off_runtime);
+    int64_t t_prebuilt_end = _now_ms();
+
     LOG_INFO_V0("Device orchestration ready: %d tensors + %d scalars", tensor_count, scalar_count);
 
     int64_t t_total_end = _now_ms();
@@ -313,6 +366,7 @@ extern "C" int bind_prepared_to_runtime_impl(
     LOG_INFO_V0("TIMING: static_arena_setup = %" PRId64 "ms", t_setup_end - t_setup_start);
     LOG_INFO_V0("TIMING: gm_heap_acquire = %" PRId64 "ms", t_heap_end - t_heap_start);
     LOG_INFO_V0("TIMING: shared_mem_acquire = %" PRId64 "ms", t_sm_end - t_sm_start);
+    LOG_INFO_V0("TIMING: prebuilt_runtime_arena = %" PRId64 "ms", t_prebuilt_end - t_prebuilt_start);
     LOG_INFO_V0("TIMING: total_init_runtime_impl = %" PRId64 "ms", t_total_end - t_total_start);
 
     return 0;
diff --git a/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_orchestrator.cpp b/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_orchestrator.cpp
index 056c2ee64..48368cf6a 100644
--- a/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_orchestrator.cpp
+++ b/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_orchestrator.cpp
@@ -331,18 +331,19 @@ static bool prepare_task(
 
     // Re-bind payload/task pointers each submit. Value is per-slot constant
     // (same as &task_payloads[slot] / &task_descriptors[slot]), but writing
-    // here lets RingSchedState::init() skip the O(window_size) bind loop.
-    // Both writes hit the same 64B slot_state cache line we're about to
-    // dirty below, so the extra cost is two stores on an already-hot line.
-    // Must precede the scheduler wiring.queue.push at the end of
-    // submit_task_common — that push is the first read of slot_state->task /
-    // slot_state->payload by another thread.
+    // here lets RingSchedState::init_data_from_layout() skip the
+    // O(window_size) bind loop. Both writes hit the same 64B slot_state
+    // cache line we're about to dirty below, so the extra cost is two
+    // stores on an already-hot line. Must precede the scheduler
+    // wiring.queue.push at the end of submit_task_common — that push is
+    // the first read of slot_state->task / slot_state->payload by another
+    // thread.
     out->slot_state->bind_buffers(out->payload, out->task);
 
     // Fields already reset by advance_ring_pointers (eager reset after CONSUMED):
     //   fanout_lock=0, fanout_count=1, fanout_head=nullptr,
     //   fanin_refcount=0, fanout_refcount=0, completed_subtasks=0, next_block_idx=0
-    // Fields immutable after RingSchedState::init():
+    // Fields immutable after RingSchedState::init_data_from_layout():
     //   ring_id
     // task_state left as CONSUMED by eager reset (safe for stale wait_for_tensor
     // observers); set to PENDING here when orchestrator actually reuses the slot.
@@ -358,88 +359,6 @@ static bool prepare_task(
     return true;
 }
 
-// =============================================================================
-// Orchestrator Initialization
-// =============================================================================
-
-PTO2OrchestratorLayout PTO2OrchestratorState::reserve_layout(
-    DeviceArena &arena, const int32_t task_window_sizes[PTO2_MAX_RING_DEPTH], int32_t dep_pool_capacity
-) {
-    PTO2OrchestratorLayout layout{};
-    layout.dep_pool_capacity = dep_pool_capacity;
-    layout.scope_tasks_cap = PTO2_SCOPE_TASKS_CAP;
-    layout.scope_stack_capacity = PTO2_MAX_SCOPE_DEPTH;
-
-    for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) {
-        const size_t fanin_pool_bytes =
-            PTO2_ALIGN_UP(static_cast<size_t>(dep_pool_capacity) * sizeof(PTO2FaninSpillEntry), PTO2_ALIGN_SIZE);
-        layout.off_fanin_pool[r] = arena.reserve(fanin_pool_bytes, PTO2_ALIGN_SIZE);
-    }
-    layout.off_scope_tasks = arena.reserve(
-        static_cast<size_t>(layout.scope_tasks_cap) * sizeof(PTO2TaskSlotState *), alignof(PTO2TaskSlotState *)
-    );
-    layout.off_scope_begins =
-        arena.reserve(static_cast<size_t>(layout.scope_stack_capacity) * sizeof(int32_t), alignof(int32_t));
-    layout.tensor_map = PTO2TensorMap::reserve_layout_default(arena, task_window_sizes);
-    return layout;
-}
-
-bool PTO2OrchestratorState::init_from_layout(
-    const PTO2OrchestratorLayout &layout, DeviceArena &arena, PTO2SharedMemoryHeader *sm_header_arg, void *gm_heap,
-    uint64_t heap_size
-) {
-    auto *orch = this;
-    *orch = PTO2OrchestratorState{};
-
-    orch->sm_header = sm_header_arg;
-    orch->gm_heap_base = gm_heap;
-    orch->gm_heap_size = heap_size * PTO2_MAX_RING_DEPTH;
-    orch->fatal = false;
-
-    for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) {
-        void *ring_heap_base = reinterpret_cast<char *>(gm_heap) + r * heap_size;
-        auto &ring = sm_header_arg->rings[r];
-
-        orch->rings[r].task_allocator.init(
-            ring.task_descriptors, ring.task_window_size, &ring.fc.current_task_index, &ring.fc.last_task_alive,
-            ring_heap_base, heap_size, &sm_header_arg->orch_error_code
-        );
-
-        const size_t fanin_pool_bytes =
-            PTO2_ALIGN_UP(static_cast<size_t>(layout.dep_pool_capacity) * sizeof(PTO2FaninSpillEntry), PTO2_ALIGN_SIZE);
-        auto *fanin_entries = static_cast<PTO2FaninSpillEntry *>(arena.region_ptr(layout.off_fanin_pool[r]));
-        memset(fanin_entries, 0, fanin_pool_bytes);
-        orch->rings[r].fanin_pool.init(fanin_entries, layout.dep_pool_capacity, &sm_header_arg->orch_error_code);
-    }
-
-    if (!orch->tensor_map.init_from_layout(layout.tensor_map, arena)) {
-        return false;
-    }
-    orch->tensor_map.orch = orch;
-
-    orch->scope_tasks = static_cast<PTO2TaskSlotState **>(arena.region_ptr(layout.off_scope_tasks));
-    orch->scope_begins = static_cast<int32_t *>(arena.region_ptr(layout.off_scope_begins));
-    orch->scope_tasks_size = 0;
-    orch->scope_tasks_capacity = layout.scope_tasks_cap;
-    orch->scope_stack_top = -1;
-    orch->scope_stack_capacity = layout.scope_stack_capacity;
-    orch->manual_begin_depth = PTO2_MAX_SCOPE_DEPTH;
-
-    return true;
-}
-
-void PTO2OrchestratorState::destroy() {
-    auto *orch = this;
-    orch->tensor_map.destroy();
-    for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) {
-        orch->rings[r].fanin_pool.base = nullptr;
-    }
-    orch->scope_tasks = nullptr;
-    orch->scope_begins = nullptr;
-}
-
-void PTO2OrchestratorState::set_scheduler(PTO2SchedulerState *scheduler) { this->scheduler = scheduler; }
-
 // =============================================================================
 // Scope Management
 // =============================================================================
diff --git a/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_orchestrator.h b/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_orchestrator.h
index e24b85b4e..9a73714c0 100644
--- a/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_orchestrator.h
+++ b/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_orchestrator.h
@@ -133,19 +133,29 @@ struct PTO2OrchestratorState {
     // === Cold-path API (defined in pto_orchestrator.cpp) ===
 
     // Phase 1: declare every sub-region (per-ring fanin pool, scope arrays,
-    // tensor_map sub-layout) on the supplied arena.
+    // tensor_map sub-layout) on the supplied arena. task_window_sizes feeds
+    // the nested tensor_map layout. Returned layout is consumed by
+    // init_data_from_layout.
     static PTO2OrchestratorLayout reserve_layout(
         DeviceArena &arena, const int32_t task_window_sizes[PTO2_MAX_RING_DEPTH],
         int32_t dep_pool_capacity = PTO2_DEP_LIST_POOL_SIZE
     );
 
-    // Phase 3: bind region pointers, wire per-ring task_allocator + fanin_pool
-    // and tensor_map. Arena must be committed.
-    bool init_from_layout(
-        const PTO2OrchestratorLayout &layout, DeviceArena &arena, PTO2SharedMemoryHeader *sm_header, void *gm_heap,
-        uint64_t heap_size
+    // Phase 3a: write everything *except* arena-internal pointer fields.
+    // sm_dev_base is the SM device address (only stored, never dereferenced);
+    // task_window_size feeds the per-ring SM address arithmetic. Safe to call
+    // on a host arena that holds the prebuilt image.
+    bool init_data_from_layout(
+        const PTO2OrchestratorLayout &layout, DeviceArena &arena, void *sm_dev_base, void *gm_heap, uint64_t heap_size,
+        uint64_t task_window_size
     );
 
+    // Phase 3b: write the arena-internal pointer fields (scope_tasks,
+    // scope_begins, rings[].fanin_pool.base, tensor_map.{buckets,entry_pool,
+    // free_entry_list,task_entry_heads}, scheduler reference).
+    // Idempotent — host runs once on the image, AICPU runs once after attach.
+    void wire_arena_pointers(const PTO2OrchestratorLayout &layout, DeviceArena &arena, PTO2SchedulerState *scheduler);
+
     // Forget pointers; arena owns the backing buffers.
     void destroy();
     void set_scheduler(PTO2SchedulerState *scheduler);
diff --git a/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_ring_buffer.h b/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_ring_buffer.h
index 5a3e3d3d3..abd2a7510 100644
--- a/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_ring_buffer.h
+++ b/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_ring_buffer.h
@@ -68,10 +68,22 @@ class PTO2TaskAllocator {
 public:
     /**
      * Initialize the allocator with task ring and heap ring resources.
+     *
+     * All pointer arguments are device addresses (live in SM / GM heap); this
+     * function only stores them, no dereferences, so it is safe to invoke
+     * from host code that constructs a prebuilt arena image.
+     *
+     * Production callers leave `initial_local_task_id` at 0: the SM ring
+     * flow-control counters that current_index_ptr / last_alive_ptr point at
+     * start at zero (PTO2RingFlowControl::init() runs on the AICPU during SM
+     * reset), so we keep local_task_id_ aligned with that without reading the
+     * SM. Tests that drive SM state directly may pass a non-zero seed to
+     * exercise corner cases like task IDs near INT32_MAX.
      */
     void init(
         PTO2TaskDescriptor *descriptors, int32_t window_size, std::atomic<int32_t> *current_index_ptr,
-        std::atomic<int32_t> *last_alive_ptr, void *heap_base, uint64_t heap_size, std::atomic<int32_t> *error_code_ptr
+        std::atomic<int32_t> *last_alive_ptr, void *heap_base, uint64_t heap_size, std::atomic<int32_t> *error_code_ptr,
+        int32_t initial_local_task_id = 0
     ) {
         descriptors_ = descriptors;
         window_size_ = window_size;
@@ -81,7 +93,7 @@ class PTO2TaskAllocator {
         heap_base_ = heap_base;
         heap_size_ = heap_size;
         error_code_ptr_ = error_code_ptr;
-        local_task_id_ = current_index_ptr->load(std::memory_order_relaxed);
+        local_task_id_ = initial_local_task_id;
         heap_top_ = 0;
         heap_tail_ = 0;
         last_alive_seen_ = 0;
diff --git a/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_runtime2.cpp b/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_runtime2.cpp
index c801d5c15..f39bac365 100644
--- a/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_runtime2.cpp
+++ b/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_runtime2.cpp
@@ -249,81 +249,19 @@ static const PTO2RuntimeOps s_runtime_ops = {
 };
 
 // =============================================================================
-// Runtime Creation and Destruction
+// Runtime Lifecycle (AICPU-only fixup)
 // =============================================================================
-
-PTO2Runtime *runtime_create_from_sm(
-    PTO2RuntimeMode mode, void *sm_base, uint64_t sm_size, uint64_t task_window_size, void *gm_heap, uint64_t heap_size,
-    DeviceArena &arena, int32_t dep_pool_capacity
-) {
-    if (!sm_base || sm_size == 0) return nullptr;
-
-    // Phase 1: layout. Reserve every sub-region the runtime needs (including
-    // the SM handle wrapper itself) without touching memory yet.
-    int32_t task_window_sizes[PTO2_MAX_RING_DEPTH];
-    for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) {
-        task_window_sizes[r] = static_cast<int32_t>(task_window_size);
-    }
-    const size_t off_sm_handle = arena.reserve(sizeof(PTO2SharedMemoryHandle), alignof(PTO2SharedMemoryHandle));
-    PTO2OrchestratorLayout orch_layout =
-        PTO2OrchestratorState::reserve_layout(arena, task_window_sizes, dep_pool_capacity);
-    PTO2SchedulerLayout sched_layout = PTO2SchedulerState::reserve_layout(arena, dep_pool_capacity);
-    const size_t off_runtime = arena.reserve(sizeof(PTO2Runtime), PTO2_ALIGN_SIZE);
-    const size_t off_mailbox = arena.reserve(sizeof(AICoreCompletionMailbox), alignof(AICoreCompletionMailbox));
-
-    // Phase 2: single backing allocation.
-    if (arena.commit(DeviceArena::kDefaultBaseAlign) == nullptr) return nullptr;
-
-    // Phase 3: bind region pointers and initialize.
-    PTO2Runtime *rt = static_cast<PTO2Runtime *>(arena.region_ptr(off_runtime));
-    memset(rt, 0, sizeof(*rt));  // calloc-equivalent for the runtime header.
-
-    // Initialize the SM handle wrapper in-place on its arena region before
-    // anything that reads sm_handle->header (orchestrator / scheduler init).
-    rt->sm_handle = static_cast<PTO2SharedMemoryHandle *>(arena.region_ptr(off_sm_handle));
-    memset(rt->sm_handle, 0, sizeof(*rt->sm_handle));
-    if (!rt->sm_handle->init(sm_base, sm_size, task_window_size, heap_size)) {
-        arena.release();
-        return nullptr;
-    }
-
+//
+// Layout / init_data / wire / destroy live in
+// runtime/shared/pto_runtime2_init.cpp so the host build can pre-populate the
+// prebuilt arena image. The pieces below — wiring the ops table and the
+// SPMD core counts — depend on the device-side s_runtime_ops global and the
+// AICPU SchedulerContext respectively, so they remain in the AICPU build.
+
+void runtime_finalize_after_wire(PTO2Runtime *rt, int32_t aic_count, int32_t aiv_count) {
     rt->ops = &s_runtime_ops;
-    rt->mode = mode;
-    rt->gm_heap = gm_heap;
-    rt->gm_heap_size = heap_size > 0 ? heap_size * PTO2_MAX_RING_DEPTH : 0;
-    rt->gm_heap_owned = false;
-
-    if (!rt->orchestrator.init_from_layout(orch_layout, arena, rt->sm_handle->header, gm_heap, heap_size)) {
-        arena.release();
-        return nullptr;
-    }
-    if (!rt->scheduler.init_from_layout(sched_layout, arena, rt->sm_handle->header)) {
-        rt->orchestrator.destroy();
-        arena.release();
-        return nullptr;
-    }
-    rt->orchestrator.set_scheduler(&rt->scheduler);
-
-    rt->aicore_mailbox = static_cast<AICoreCompletionMailbox *>(arena.region_ptr(off_mailbox));
-    memset(rt->aicore_mailbox, 0, sizeof(*rt->aicore_mailbox));
-
-    return rt;
-}
-
-void runtime_destroy(PTO2Runtime *rt, DeviceArena &arena) {
-    if (!rt) {
-        arena.release();  // safe: idempotent if nothing's committed.
-        return;
-    }
-
-    rt->scheduler.destroy();
-    rt->orchestrator.destroy();
-    rt->aicore_mailbox = nullptr;  // arena-owned.
-    rt->sm_handle = nullptr;       // wrapper lives in arena; release() reclaims it.
-
-    // arena.release() frees the single backing buffer that holds rt,
-    // mailbox, sm_handle, orchestrator and scheduler sub-regions in one shot.
-    arena.release();
+    rt->orchestrator.total_cluster_count = aic_count;
+    rt->orchestrator.total_aiv_count = aiv_count;
 }
 
 void runtime_set_mode(PTO2Runtime *rt, PTO2RuntimeMode mode) {
diff --git a/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_runtime2.h b/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_runtime2.h
index 1da622407..460624e69 100644
--- a/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_runtime2.h
+++ b/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_runtime2.h
@@ -91,6 +91,30 @@ struct PTO2RuntimeOps {
     TaskOutputTensors (*submit_dummy_task)(PTO2Runtime *rt, const Arg &args);
 };
 
+/**
+ * Layout descriptor for the prebuilt runtime arena. Holds all sub-region
+ * offsets (orchestrator / scheduler / sm_handle wrapper / runtime header /
+ * AICore mailbox) plus the layout-defining capacities. Produced once on the
+ * host by runtime_reserve_layout(); consumed by runtime_init_data_from_layout
+ * and runtime_wire_arena_pointers.
+ */
+struct PTO2RuntimeArenaLayout {
+    size_t off_sm_handle{0};
+    PTO2OrchestratorLayout orch;
+    PTO2SchedulerLayout sched;
+    size_t off_runtime{0};
+    size_t off_mailbox{0};
+
+    // Cached parameters (re-used by init_data + wire stages).
+    uint64_t task_window_size{0};
+    uint64_t heap_size{0};
+    int32_t dep_pool_capacity{0};
+
+    // Total arena byte size post-commit. Used by host to size the prebuilt
+    // image buffer and as the rtMemcpy length.
+    size_t arena_size{0};
+};
+
 /**
  * PTO Runtime2 context
  *
@@ -118,6 +142,16 @@ struct PTO2Runtime {
 
     // Statistics
     int64_t total_cycles;
+
+    // Prebuilt-arena fast path metadata. Carries every offset
+    // wire_arena_pointers needs at AICPU boot so the AICPU can reconstruct
+    // all arena-internal pointer fields without re-running init_data. The
+    // device base of the runtime arena travels separately on the host-side
+    // Runtime (Runtime::prebuilt_arena_base_), since the AICPU needs it
+    // *before* dereferencing this image. Populated on host by
+    // runtime_init_data_from_layout + runtime_wire_arena_pointers; read by
+    // aicpu_executor.cpp.
+    PTO2RuntimeArenaLayout prebuilt_layout;
 };
 
 // =============================================================================
@@ -125,31 +159,60 @@ struct PTO2Runtime {
 // =============================================================================
 
 /**
- * Create runtime from caller-provided GM SM buffer + GM heap.
+ * Phase 1 — declare every sub-region (sm_handle wrapper, orchestrator /
+ * scheduler / tensor_map / mailbox / PTO2Runtime header) on the supplied
+ * arena. Pure arithmetic; does not touch device memory and may run on host.
+ * Returns the layout descriptor; caller commits/attaches the arena before
+ * Phase 2/3.
+ */
+PTO2RuntimeArenaLayout runtime_reserve_layout(
+    DeviceArena &arena, uint64_t task_window_size, int32_t dep_pool_capacity = PTO2_DEP_LIST_POOL_SIZE
+);
+
+/**
+ * Phase 2 — write the data half of the runtime arena: standalone fields,
+ * memset'd arena regions, sub-structure initializers, and SM-side device
+ * pointers. The arena must already be committed (or attached); writes go
+ * into arena.base() + sub-region offsets.
  *
- * All AICPU-side runtime state (PTO2SharedMemoryHandle wrapper, PTO2Runtime,
- * AICoreCompletionMailbox, plus the orchestrator/scheduler/tensor_map
- * sub-regions) is laid out on the supplied arena and committed in a single
- * backing allocation. runtime_destroy() calls arena.release() once to free
- * the lot.
+ * `sm_dev_base` / `gm_heap_dev_base` are device addresses; we only store
+ * them (never dereference). Safe to run on a host arena that owns a host
+ * mirror of the runtime image — the resulting buffer is rtMemcpy-ready.
  *
- * @param mode             Execution mode
- * @param sm_base          Pre-allocated SM buffer base (host-owned)
- * @param sm_size          Size of the SM buffer in bytes
- * @param task_window_size Per-ring task window size used to lay out SM
- * @param gm_heap          GM heap base for output buffers (or NULL if not used)
- * @param heap_size        GM heap size in bytes
- * @param arena            Caller-owned arena that sources all runtime sub-regions.
- * @return Runtime context, or NULL on failure
- */
-PTO2Runtime *runtime_create_from_sm(
-    PTO2RuntimeMode mode, void *sm_base, uint64_t sm_size, uint64_t task_window_size, void *gm_heap, uint64_t heap_size,
-    DeviceArena &arena, int32_t dep_pool_capacity = PTO2_DEP_LIST_POOL_SIZE
+ * Returns the PTO2Runtime* that sits at layout.off_runtime within the arena.
+ * Caller must follow up with runtime_wire_arena_pointers; rt->ops and the
+ * AICore-side count fields are left untouched and must be filled by the
+ * AICPU at boot.
+ */
+PTO2Runtime *runtime_init_data_from_layout(
+    DeviceArena &arena, const PTO2RuntimeArenaLayout &layout, PTO2RuntimeMode mode, void *sm_dev_base, uint64_t sm_size,
+    void *gm_heap_dev_base, uint64_t heap_size
 );
 
 /**
- * Destroy runtime and free all resources. arena.release() is the actual
- * memory free; the rt pointer is no longer valid afterward.
+ * Phase 3 — wire every arena-internal pointer field (rt->sm_handle,
+ * rt->aicore_mailbox, orchestrator.{scope_tasks, scope_begins, scheduler,
+ * tensor_map.*, rings[].fanin_pool.base}, scheduler.{ready_queues, dep_pool,
+ * wiring.queue}) so each holds arena.base() + offset. Idempotent — runs on
+ * both host (writing host-mirror addresses) and AICPU (writing device
+ * addresses) sides.
+ */
+void runtime_wire_arena_pointers(DeviceArena &arena, const PTO2RuntimeArenaLayout &layout, PTO2Runtime *rt);
+
+/**
+ * AICPU-only Phase 4 — fill in the few fields the host could not know at
+ * prebuilt-image build time: the ops table (s_runtime_ops is a device-side
+ * file-local global, host cannot resolve its device address) and the
+ * orchestrator's core counts (depend on the executor's scheduler context).
+ * Call once per boot after runtime_wire_arena_pointers.
+ */
+void runtime_finalize_after_wire(PTO2Runtime *rt, int32_t aic_count, int32_t aiv_count);
+
+/**
+ * Destroy runtime. With the prebuilt-arena fast path the arena buffer is
+ * pooled across runs by DeviceRunner, so runtime_destroy never calls
+ * arena.release() — it only forgets sub-structure pointers (idempotent
+ * cleanup).
  */
 void runtime_destroy(PTO2Runtime *rt, DeviceArena &arena);
 
diff --git a/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_runtime2_types.h b/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_runtime2_types.h
index f022b8eb4..a0dfbd9ef 100644
--- a/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_runtime2_types.h
+++ b/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_runtime2_types.h
@@ -92,7 +92,7 @@
 
 // Task management
 // NOTE: PTO2_TASK_WINDOW_SIZE is now a per-ring default value.
-// Actual window size is passed at runtime to runtime_create_from_sm().
+// Actual window size is passed at runtime to runtime_reserve_layout().
 // Use pto2_task_slot(sched, task_id) for slot calculation.
 #define PTO2_TASK_WINDOW_SIZE 16384  // Default per-ring task window size (power of 2)
 
diff --git a/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_shared_memory.h b/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_shared_memory.h
index cf8dbb780..98b832510 100644
--- a/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_shared_memory.h
+++ b/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_shared_memory.h
@@ -53,11 +53,18 @@ struct PTO2SharedMemoryHandle;
  */
 struct alignas(64) PTO2RingFlowControl {
     // === Cache Line 0: Written by Orchestrator, Read by Scheduler ===
-    std::atomic<int32_t> current_task_index;  // Task ring head (next to allocate)
+    alignas(64) std::atomic<int32_t> current_task_index;  // Task ring head (next to allocate)
 
     // === Cache Line 1: Written by Scheduler, Read by Orchestrator (for back-pressure) ===
     alignas(64) std::atomic<int32_t> last_task_alive;  // Task ring tail (oldest active task)
 
+    // Per-boot SM reset. PTO2TaskAllocator::init() seeds its private
+    // local_task_id_ from initial_local_task_id (default 0 in production)
+    // *without* dereferencing current_task_index — it relies on this reset
+    // running on every AICPU boot so 0 stays in sync. If you ever change
+    // the initial fc value or the boot ordering, update the default in
+    // PTO2TaskAllocator::init (pto_ring_buffer.h) in the same change, or
+    // submit IDs will be off by the divergence.
     void init() {
         current_task_index.store(0, std::memory_order_relaxed);
         last_task_alive.store(0, std::memory_order_relaxed);
@@ -187,3 +194,67 @@ struct PTO2SharedMemoryHandle {
     void setup_pointers(uint64_t task_window_size);
     void setup_pointers_per_ring(const uint64_t task_window_sizes[PTO2_MAX_RING_DEPTH]);
 };
+
+// =============================================================================
+// SM Device Layout Helpers
+// =============================================================================
+//
+// When the host pre-builds a runtime-arena image, it needs the device-side
+// addresses of several SM sub-fields (ring flow-control counters,
+// task_descriptors arrays, orch_error_code) so it can wire them into the
+// orchestrator / scheduler init_data path without dereferencing the SM —
+// the SM lives in device memory and cannot be touched from host.
+//
+// These helpers compute those addresses by offset arithmetic on the SM
+// device base. Pure pointer math, no loads/stores; safe to call from host.
+// The AICPU derives the same addresses separately (PTO2SharedMemoryHandle's
+// own setup_pointers), so keep both in sync or the two sides will disagree.
+namespace pto2_sm_layout {
+
+inline std::atomic<int32_t> *orch_error_code_addr(void *sm_dev_base) noexcept {
+    return reinterpret_cast<std::atomic<int32_t> *>(
+        static_cast<char *>(sm_dev_base) + offsetof(PTO2SharedMemoryHeader, orch_error_code)
+    );
+}
+
+inline PTO2SharedMemoryRingHeader *ring_header_addr(void *sm_dev_base, int ring_id) noexcept {
+    return reinterpret_cast<PTO2SharedMemoryRingHeader *>(
+        static_cast<char *>(sm_dev_base) + offsetof(PTO2SharedMemoryHeader, rings) +
+        static_cast<size_t>(ring_id) * sizeof(PTO2SharedMemoryRingHeader)
+    );
+}
+
+inline std::atomic<int32_t> *ring_current_task_index_addr(void *sm_dev_base, int ring_id) noexcept {
+    return reinterpret_cast<std::atomic<int32_t> *>(
+        reinterpret_cast<char *>(ring_header_addr(sm_dev_base, ring_id)) + offsetof(PTO2SharedMemoryRingHeader, fc) +
+        offsetof(PTO2RingFlowControl, current_task_index)
+    );
+}
+
+inline std::atomic<int32_t> *ring_last_task_alive_addr(void *sm_dev_base, int ring_id) noexcept {
+    return reinterpret_cast<std::atomic<int32_t> *>(
+        reinterpret_cast<char *>(ring_header_addr(sm_dev_base, ring_id)) + offsetof(PTO2SharedMemoryRingHeader, fc) +
+        offsetof(PTO2RingFlowControl, last_task_alive)
+    );
+}
+
+// Walk the per-ring SM layout (same arithmetic as setup_pointers_per_ring)
+// to compute ring `ring_id`'s task_descriptors device address. Accepts a
+// per-ring window-size array so the helper's signature mirrors
+// `PTO2SharedMemoryHandle::setup_pointers_per_ring` and cannot silently
+// disagree with the SM layout when (hypothetically) ring sizes diverge.
+inline PTO2TaskDescriptor *ring_task_descriptors_addr(
+    void *sm_dev_base, const uint64_t task_window_sizes[PTO2_MAX_RING_DEPTH], int ring_id
+) noexcept {
+    assert(ring_id >= 0 && ring_id < PTO2_MAX_RING_DEPTH && "pto2_sm_layout: ring_id out of range");
+    char *p = static_cast<char *>(sm_dev_base);
+    p += PTO2_ALIGN_UP(sizeof(PTO2SharedMemoryHeader), PTO2_ALIGN_SIZE);
+    for (int r = 0; r < ring_id; r++) {
+        p += PTO2_ALIGN_UP(task_window_sizes[r] * sizeof(PTO2TaskDescriptor), PTO2_ALIGN_SIZE);
+        p += PTO2_ALIGN_UP(task_window_sizes[r] * sizeof(PTO2TaskPayload), PTO2_ALIGN_SIZE);
+        p += PTO2_ALIGN_UP(task_window_sizes[r] * sizeof(PTO2TaskSlotState), PTO2_ALIGN_SIZE);
+    }
+    return reinterpret_cast<PTO2TaskDescriptor *>(p);
+}
+
+}  // namespace pto2_sm_layout
diff --git a/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_tensormap.h b/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_tensormap.h
index 39d6e4ad2..b63f20676 100644
--- a/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_tensormap.h
+++ b/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_tensormap.h
@@ -47,12 +47,12 @@
 #include "pto_runtime2_types.h"
 #include "tensor.h"
 
-struct PTO2OrchestratorState;  // forward declare
-
 /**
  * Layout descriptor produced by PTO2TensorMap::reserve_layout(). Stores the
  * region offsets returned by DeviceArena::reserve() so init_from_layout()
  * can fetch the matching pointers after the arena is committed.
+ *
+ * All offsets are relative to the arena's base.
  */
 struct PTO2TensorMapLayout {
     size_t off_buckets;
@@ -367,8 +367,6 @@ struct PTO2TensorMap {
     // Per-ring cleanup progress (for periodic cleanup_retired)
     int32_t last_cleanup[PTO2_MAX_RING_DEPTH]{};
 
-    PTO2OrchestratorState *orch{nullptr};
-
     uint32_t get_task_local_id_slot(uint8_t ring_id, uint32_t task_local_id) const {
         return task_local_id & (task_window_sizes[ring_id] - 1);
     }
@@ -433,11 +431,19 @@ struct PTO2TensorMap {
     reserve_layout_default(DeviceArena &arena, const int32_t task_window_sizes[PTO2_MAX_RING_DEPTH]);
 
     /**
-     * Phase 3: bind region pointers and initialize state. The arena must already
-     * be committed; layout must have been produced by reserve_layout() against
-     * the same arena.
+     * Phase 3a: write everything *except* arena-internal pointer fields
+     * (buckets, entry_pool, free_entry_list, task_entry_heads[r]).
+     * Uses arena.region_ptr to address the arena regions for data writes,
+     * but does not store those addresses in struct fields. Safe to call on
+     * a host arena that holds the prebuilt image.
+     */
+    bool init_data_from_layout(const PTO2TensorMapLayout &layout, DeviceArena &arena);
+
+    /**
+     * Phase 3b: write the arena-internal pointer fields. Idempotent;
+     * called once on the host arena and once on the AICPU after attach.
      */
-    bool init_from_layout(const PTO2TensorMapLayout &layout, DeviceArena &arena);
+    void wire_arena_pointers(const PTO2TensorMapLayout &layout, DeviceArena &arena);
 
     /**
      * Tear down state. Does not free memory — the arena owns the backing
diff --git a/src/a5/runtime/tensormap_and_ringbuffer/runtime/runtime.h b/src/a5/runtime/tensormap_and_ringbuffer/runtime/runtime.h
index a4aef9c04..4a7dce1bd 100644
--- a/src/a5/runtime/tensormap_and_ringbuffer/runtime/runtime.h
+++ b/src/a5/runtime/tensormap_and_ringbuffer/runtime/runtime.h
@@ -127,18 +127,26 @@ struct HostApi {
     void (*device_free)(void *dev_ptr);
     int (*copy_to_device)(void *dev_ptr, const void *host_ptr, size_t size);
     int (*copy_from_device)(void *host_ptr, const void *dev_ptr, size_t size);
-    // Lay out and commit the per-Worker static device arena that backs both
-    // the PTO2 GM heap and the PTO2 shared memory in a single underlying
-    // allocation. Must be called once before acquire_pooled_gm_heap /
-    // acquire_pooled_gm_sm. Returns 0 on success, -1 on allocation failure.
-    int (*setup_static_arena)(size_t gm_heap_size, size_t gm_sm_size);
+    // Lay out three pooled regions in a single backing device allocation:
+    // GM heap, PTO2 shared memory, and the trb prebuilt runtime arena.
+    // `runtime_arena_size == 0` skips the last region (hbg path: hbg has no
+    // prebuilt runtime arena). Returns 0 on success, -1 on allocation
+    // failure.
+    int (*setup_static_arena)(size_t gm_heap_size, size_t gm_sm_size, size_t runtime_arena_size);
     // Return the per-Worker pooled pointer for the PTO2 GM heap / shared
-    // memory. The static arena must already be committed via
-    // setup_static_arena; the returned pointer is owned by the DeviceRunner
-    // and freed in `DeviceRunner::finalize()` — do NOT pass it to
-    // device_free or record it in `tensor_pairs_`.
+    // memory / prebuilt runtime arena. The static arena must already be
+    // committed via setup_static_arena; the returned pointer is owned by
+    // the DeviceRunner and freed in `DeviceRunner::finalize()` — do NOT
+    // pass it to device_free or record it in `tensor_pairs_`.
+    //
+    // acquire_pooled_runtime_arena is trb-only — the host side reserves the
+    // runtime-arena region only when setup_static_arena is invoked with
+    // runtime_arena_size > 0. hbg's runtime_maker.cpp must not call it
+    // (setup_static_arena(...,0) leaves the offset unreserved, and the
+    // returned region_ptr would be undefined).
     void *(*acquire_pooled_gm_heap)();
     void *(*acquire_pooled_gm_sm)();
+    void *(*acquire_pooled_runtime_arena)();
     // Single-shot upload of the entire ChipCallable buffer. `callable` is a
     // `const ChipCallable *` (declared void* to avoid pulling task_interface
     // headers into runtime.h). DeviceRunner walks child_offsets_ to compute
@@ -218,6 +226,13 @@ class Runtime {
     void *slot_states_ptr_;                  // Pointer to PTO2TaskSlotState array (scheduler-private, for profiling)
     ChipStorageTaskArgs orch_args_storage_;  // Copy of args for device
 
+    // Prebuilt-arena fast path (trb only). Set by the host before rtMemcpy'ing
+    // Runtime to device; AICPU reads them in the boot path to attach to the
+    // pooled, prebuilt arena buffer instead of re-running init_data (already
+    // done on host: runtime_init_data_from_layout + runtime_wire_arena_pointers).
+    void *prebuilt_arena_base_;
+    size_t prebuilt_runtime_offset_;
+
     // Device orchestration SO (for dlopen on AICPU thread 3).
     // The SO bytes themselves live in a separately-allocated device buffer
     // owned by DeviceRunner; only the metadata below travels inside Runtime.
@@ -254,6 +269,16 @@ class Runtime {
     void set_slot_states_ptr(void *p);
     void set_orch_args(const ChipStorageTaskArgs &args);
 
+    // Prebuilt-arena fast path (trb only). Set by host's
+    // bind_prepared_to_runtime_impl; consumed by AICPU at boot to attach a
+    // DeviceArena to `prebuilt_arena_base_` and pick up the PTO2Runtime at
+    // `prebuilt_arena_base_ + prebuilt_runtime_offset_`. Both stay zero on
+    // first construction (Runtime() ctor zeros them) so a non-prebuilt boot
+    // path can still detect "no prebuilt image set" via nullptr.
+    void set_prebuilt_arena(void *arena_base, size_t runtime_off);
+    void *get_prebuilt_arena_base() const;
+    size_t get_prebuilt_runtime_offset() const;
+
     // Device orchestration SO binary (for dlopen on AICPU thread 3)
     void set_dev_orch_so(uint64_t dev_addr, uint64_t size);
     uint64_t get_dev_orch_so_addr() const;
diff --git a/src/a5/runtime/tensormap_and_ringbuffer/runtime/scheduler/pto_scheduler.cpp b/src/a5/runtime/tensormap_and_ringbuffer/runtime/scheduler/pto_scheduler.cpp
index f497b8fd8..2d777e9b0 100644
--- a/src/a5/runtime/tensormap_and_ringbuffer/runtime/scheduler/pto_scheduler.cpp
+++ b/src/a5/runtime/tensormap_and_ringbuffer/runtime/scheduler/pto_scheduler.cpp
@@ -61,153 +61,6 @@ PTO2SchedProfilingData scheduler_get_profiling(int thread_idx) {
 }
 #endif
 
-// =============================================================================
-// Ready Queue Implementation
-// =============================================================================
-
-size_t ready_queue_reserve_layout(DeviceArena &arena, uint64_t capacity) {
-    // Align the slots[] base to a full cache line so MPMC CAS traffic on the
-    // first slot cannot false-share with whatever region sits in front of us
-    // (e.g. orchestrator tensormap heads written by the orch thread).
-    return arena.reserve(capacity * sizeof(PTO2ReadyQueueSlot), PTO2_ALIGN_SIZE);
-}
-
-bool ready_queue_init_from_layout(PTO2ReadyQueue *queue, DeviceArena &arena, size_t slots_off, uint64_t capacity) {
-    queue->slots = static_cast<PTO2ReadyQueueSlot *>(arena.region_ptr(slots_off));
-    queue->capacity = capacity;
-    queue->mask = capacity - 1;
-    queue->enqueue_pos.store(0, std::memory_order_relaxed);
-    queue->dequeue_pos.store(0, std::memory_order_relaxed);
-
-    for (uint64_t i = 0; i < capacity; i++) {
-        queue->slots[i].sequence.store((int64_t)i, std::memory_order_relaxed);
-        queue->slots[i].slot_state = nullptr;
-    }
-
-    return true;
-}
-
-void ready_queue_destroy(PTO2ReadyQueue *queue) {
-    // Arena owns the slots[] buffer; just forget the pointer.
-    queue->slots = nullptr;
-}
-
-// =============================================================================
-// Scheduler Initialization
-// =============================================================================
-
-bool PTO2SchedulerState::RingSchedState::init(PTO2SharedMemoryHeader *sm_header, int32_t ring_id) {
-    ring = &sm_header->rings[ring_id];
-    last_task_alive = 0;
-    advance_lock.store(0, std::memory_order_relaxed);
-
-    // Initialize all per-task slot state fields.
-    // bind_ring() sets the ring_id only — payload/task pointers are re-bound
-    // by orch::prepare_task on every submit (their value is per-slot constant
-    // but pinning them here would cost O(task_window_size) at startup).
-    // reset_for_reuse() sets dynamic fields to reclaim defaults (fanout_count=1,
-    // rest zero) so the first submit needs no reset.
-    for (uint64_t i = 0; i < ring->task_window_size; i++) {
-        ring->slot_states[i].bind_ring(static_cast<uint8_t>(ring_id));
-        ring->slot_states[i].reset_for_reuse();
-        ring->slot_states[i].fanin_count = 0;
-        ring->slot_states[i].active_mask = ActiveMask{};
-    }
-
-    return true;
-}
-
-void PTO2SchedulerState::RingSchedState::destroy() { ring = nullptr; }
-
-PTO2SchedulerLayout PTO2SchedulerState::reserve_layout(DeviceArena &arena, int32_t dep_pool_capacity) {
-    PTO2SchedulerLayout layout{};
-    layout.ready_queue_capacity = PTO2_READY_QUEUE_SIZE;
-    layout.spsc_capacity = PTO2_WRIRING_QUEUE_SIZE;
-    layout.dep_pool_capacity = dep_pool_capacity;
-
-    for (int i = 0; i < PTO2_NUM_RESOURCE_SHAPES; i++) {
-        layout.off_ready_queue_slots[i] = ready_queue_reserve_layout(arena, PTO2_READY_QUEUE_SIZE);
-    }
-    layout.off_dummy_ready_queue_slots = ready_queue_reserve_layout(arena, PTO2_READY_QUEUE_SIZE);
-    for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) {
-        // Force a cache-line base so writes from scheduler thread 0 (sole
-        // writer of this ring's dep_pool) do not invalidate adjacent
-        // multi-threaded regions like ready_queue.slots.
-        layout.off_dep_pool_entries[r] =
-            arena.reserve(static_cast<size_t>(dep_pool_capacity) * sizeof(PTO2DepListEntry), PTO2_ALIGN_SIZE);
-    }
-    layout.off_wiring_spsc_buffer = PTO2SpscQueue::reserve_layout(arena, PTO2_WRIRING_QUEUE_SIZE);
-    return layout;
-}
-
-bool PTO2SchedulerState::init_from_layout(
-    const PTO2SchedulerLayout &layout, DeviceArena &arena, PTO2SharedMemoryHeader *sm_header_arg
-) {
-    PTO2SchedulerState *sched = this;
-    sched->sm_header = sm_header_arg;
-#if PTO2_SCHED_PROFILING
-    sched->tasks_completed.store(0, std::memory_order_relaxed);
-    sched->tasks_consumed.store(0, std::memory_order_relaxed);
-#endif
-
-    // Per-ring scheduler state — no arena buffers, just field init.
-    for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) {
-        if (!sched->ring_sched_states[r].init(sm_header_arg, r)) {
-            return false;
-        }
-    }
-
-    // Ready queues — one per resource shape plus DUMMY.
-    for (int i = 0; i < PTO2_NUM_RESOURCE_SHAPES; i++) {
-        if (!ready_queue_init_from_layout(
-                &sched->ready_queues[i], arena, layout.off_ready_queue_slots[i], layout.ready_queue_capacity
-            )) {
-            return false;
-        }
-    }
-    if (!ready_queue_init_from_layout(
-            &sched->dummy_ready_queue, arena, layout.off_dummy_ready_queue_slots, layout.ready_queue_capacity
-        )) {
-        return false;
-    }
-
-    // Per-ring dep_pool: PTO2DepListPool::init takes an externally-allocated
-    // base + capacity, so we just plumb the arena region into it.
-    for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) {
-        auto *dep_entries = static_cast<PTO2DepListEntry *>(arena.region_ptr(layout.off_dep_pool_entries[r]));
-        // calloc-equivalent: pool expects entries zeroed at construction.
-        memset(dep_entries, 0, static_cast<size_t>(layout.dep_pool_capacity) * sizeof(PTO2DepListEntry));
-        sched->ring_sched_states[r].dep_pool.init(
-            dep_entries, layout.dep_pool_capacity, &sm_header_arg->orch_error_code
-        );
-    }
-
-    // Wiring SPSC queue (orchestrator push, scheduler thread 0 pop).
-    if (!sched->wiring.queue.init_from_layout(arena, layout.off_wiring_spsc_buffer, layout.spsc_capacity)) {
-        return false;
-    }
-    sched->wiring.batch_count = 0;
-    sched->wiring.batch_index = 0;
-    sched->wiring.backoff_counter = 0;
-
-    return true;
-}
-
-void PTO2SchedulerState::destroy() {
-    PTO2SchedulerState *sched = this;
-    for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) {
-        sched->ring_sched_states[r].destroy();
-        sched->ring_sched_states[r].dep_pool.base = nullptr;
-    }
-
-    sched->wiring.queue.destroy();
-
-    for (int i = 0; i < PTO2_NUM_RESOURCE_SHAPES; i++) {
-        ready_queue_destroy(&sched->ready_queues[i]);
-    }
-    ready_queue_destroy(&sched->dummy_ready_queue);
-}
-
 // =============================================================================
 // Debug Utilities
 // =============================================================================
diff --git a/src/a5/runtime/tensormap_and_ringbuffer/runtime/scheduler/pto_scheduler.h b/src/a5/runtime/tensormap_and_ringbuffer/runtime/scheduler/pto_scheduler.h
index 32887d0be..173f65135 100644
--- a/src/a5/runtime/tensormap_and_ringbuffer/runtime/scheduler/pto_scheduler.h
+++ b/src/a5/runtime/tensormap_and_ringbuffer/runtime/scheduler/pto_scheduler.h
@@ -409,7 +409,14 @@ struct alignas(64) PTO2ReadyQueue {
 //                     initialize sequence counters
 //   destroy: forget the slots pointer (arena owns the buffer)
 size_t ready_queue_reserve_layout(DeviceArena &arena, uint64_t capacity);
-bool ready_queue_init_from_layout(PTO2ReadyQueue *queue, DeviceArena &arena, size_t slots_off, uint64_t capacity);
+// Writes everything *except* the arena-internal `slots` pointer field
+// (sequences/positions on the slot array, capacity, mask). Uses
+// arena.region_ptr(slots_off) only to address the slot array for writes;
+// does NOT store the pointer in `queue->slots`. Call
+// `ready_queue_wire_arena_pointers` afterwards to set the field itself.
+bool ready_queue_init_data_from_layout(PTO2ReadyQueue *queue, DeviceArena &arena, size_t slots_off, uint64_t capacity);
+// Stores queue->slots = arena.region_ptr(slots_off). Idempotent.
+void ready_queue_wire_arena_pointers(PTO2ReadyQueue *queue, DeviceArena &arena, size_t slots_off);
 void ready_queue_destroy(PTO2ReadyQueue *queue);
 
 // =============================================================================
@@ -449,13 +456,17 @@ struct alignas(64) PTO2SpscQueue {
         return arena.reserve(capacity * sizeof(PTO2TaskSlotState *), PTO2_ALIGN_SIZE);
     }
 
-    // Bind buffer pointer + reset indices. The capacity must be a power of two
-    // and match the value passed to reserve_layout.
-    bool init_from_layout(DeviceArena &arena, size_t buffer_off, uint64_t capacity) {
+    // Writes everything except the arena-internal `buffer_` pointer field
+    // (zeros the slot pointer array, mask/head/tail). The host pre-builds the
+    // image without storing a host address in buffer_; the AICPU wires
+    // buffer_ at boot via wire_arena_pointers().
+    bool init_data_from_layout(DeviceArena &arena, size_t buffer_off, uint64_t capacity) {
         if (capacity == 0 || (capacity & (capacity - 1)) != 0) return false;
-        buffer_ = static_cast<PTO2TaskSlotState **>(arena.region_ptr(buffer_off));
+        auto *buf = static_cast<PTO2TaskSlotState **>(arena.region_ptr(buffer_off));
+        // calloc'd-equivalent: zero the slot pointers so spurious early pops
+        // observe nullptr.
         for (uint64_t i = 0; i < capacity; i++)
-            buffer_[i] = nullptr;
+            buf[i] = nullptr;
         mask_ = capacity - 1;
         head_.store(0, std::memory_order_relaxed);
         tail_.store(0, std::memory_order_relaxed);
@@ -464,6 +475,12 @@ struct alignas(64) PTO2SpscQueue {
         return true;
     }
 
+    // Stores buffer_ = arena.region_ptr(buffer_off) and nothing else. Called by
+    // the host (host arena) and the AICPU (device arena on the prebuilt image).
+    void wire_arena_pointers(DeviceArena &arena, size_t buffer_off) {
+        buffer_ = static_cast<PTO2TaskSlotState **>(arena.region_ptr(buffer_off));
+    }
+
     // Arena owns the buffer; here we only forget our pointer.
     void destroy() { buffer_ = nullptr; }
 
@@ -561,7 +578,12 @@ struct PTO2SchedulerState {
         // --- Cache Line 1+: Thread 0 only (wiring dep_pool) ---
         alignas(64) PTO2DepListPool dep_pool;
 
-        bool init(PTO2SharedMemoryHeader *sm_header, int32_t ring_id);
+        // Initialize arena-internal data + arena-external pointers; does NOT
+        // store dep_pool.base (that lives in the runtime arena and is wired
+        // by SchedulerState::wire_arena_pointers). The `ring` field stores
+        // the device address of the SM ring header — computed via offset
+        // arithmetic, no SM dereference.
+        bool init_data_from_layout(void *sm_dev_base, int32_t ring_id);
         void destroy();
 
         void sync_to_sm() { ring->fc.last_task_alive.store(last_task_alive, std::memory_order_release); }
@@ -1040,10 +1062,23 @@ struct PTO2SchedulerState {
 
     // Phase 1: declare every sub-region (ready_queue slots, dummy queue slots,
     // per-ring dep_pool entries, wiring SPSC buffer) on the supplied arena.
+    // Capacities are baked into the returned layout; init_data_from_layout uses
+    // the same values.
     static PTO2SchedulerLayout reserve_layout(DeviceArena &arena, int32_t dep_pool_capacity = PTO2_DEP_LIST_POOL_SIZE);
 
-    // Phase 3: bind region pointers and initialize state.
-    bool init_from_layout(const PTO2SchedulerLayout &layout, DeviceArena &arena, PTO2SharedMemoryHeader *sm_header);
+    // Phase 3a: write everything *except* arena-internal pointer fields.
+    // `sm_dev_base` is the device address of the SM (only stored, never
+    // dereferenced here). Safe to call on a host arena that holds the
+    // prebuilt image buffer. (The orchestrator counterpart takes
+    // task_window_size for ring task_descriptors address arithmetic; the
+    // scheduler only needs the SM header / ring header base addresses,
+    // both window-size-independent.)
+    bool init_data_from_layout(const PTO2SchedulerLayout &layout, DeviceArena &arena, void *sm_dev_base);
+
+    // Phase 3b: write the arena-internal pointer fields
+    // (ready_queues[].slots, dummy_ready_queue.slots, dep_pool.base for each
+    // ring, wiring.queue.buffer_). Called on both host and device sides.
+    void wire_arena_pointers(const PTO2SchedulerLayout &layout, DeviceArena &arena);
 
     // Forget per-region pointers; arena owns the backing memory.
     void destroy();
diff --git a/src/a5/runtime/tensormap_and_ringbuffer/runtime/shared/pto_runtime2_init.cpp b/src/a5/runtime/tensormap_and_ringbuffer/runtime/shared/pto_runtime2_init.cpp
new file mode 100644
index 000000000..d66acfcc4
--- /dev/null
+++ b/src/a5/runtime/tensormap_and_ringbuffer/runtime/shared/pto_runtime2_init.cpp
@@ -0,0 +1,355 @@
+/*
+ * Copyright (c) PyPTO Contributors.
+ * This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+ * CANN Open Software License Agreement Version 2.0 (the "License").
+ * Please refer to the License for details. You may not use this file except in compliance with the License.
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+ * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+ * See LICENSE in the root of the software repository for the full text of the License.
+ * -----------------------------------------------------------------------------------------------------------
+ */
+/**
+ * Host/AICPU shared runtime-arena layout, init_data and wire implementations.
+ *
+ * Lives under runtime/shared/ so it is included in both the host_runtime.so
+ * build (host pre-populates the prebuilt arena image) and the aicpu_runtime
+ * build (AICPU runs wire_arena_pointers + destroy after attach). The
+ * device-only parts of pto_runtime2.cpp / pto_orchestrator.cpp / pto_scheduler.cpp
+ * (ops table, scope/submit/dispatch business logic, profiling) stay in their
+ * original files and the aicpu build only.
+ */
+
+#include <stdlib.h>
+#include <string.h>
+
+#include "pto_orchestrator.h"
+#include "pto_runtime2.h"
+#include "pto_ring_buffer.h"
+#include "pto_shared_memory.h"
+#include "pto_tensormap.h"
+#include "scheduler/pto_scheduler.h"
+
+// =============================================================================
+// Ready queue
+// =============================================================================
+
+size_t ready_queue_reserve_layout(DeviceArena &arena, uint64_t capacity) {
+    // Cache-line-align the slots[] base: MPMC CAS traffic on slot 0 must not
+    // false-share with the preceding region (e.g. orch-written tensormap heads).
+    const auto slots_bytes = capacity * sizeof(PTO2ReadyQueueSlot);
+    return arena.reserve(slots_bytes, PTO2_ALIGN_SIZE);
+}
+
+bool ready_queue_init_data_from_layout(PTO2ReadyQueue *queue, DeviceArena &arena, size_t slots_off, uint64_t capacity) {
+    // Reject capacities that `mask = capacity - 1` cannot index (zero or not a
+    // power of two), mirroring PTO2SpscQueue::init_data_from_layout.
+    if (capacity == 0 || (capacity & (capacity - 1)) != 0) return false;
+    // Write through a local; queue->slots is set by ready_queue_wire_arena_pointers.
+    auto *slots_arena = static_cast<PTO2ReadyQueueSlot *>(arena.region_ptr(slots_off));
+    queue->capacity = capacity;
+    queue->mask = capacity - 1;
+    queue->enqueue_pos.store(0, std::memory_order_relaxed);
+    queue->dequeue_pos.store(0, std::memory_order_relaxed);
+    for (uint64_t i = 0; i < capacity; i++) {
+        slots_arena[i].sequence.store(static_cast<int64_t>(i), std::memory_order_relaxed);
+        slots_arena[i].slot_state = nullptr;
+    }
+    return true;
+}
+
+void ready_queue_wire_arena_pointers(PTO2ReadyQueue *queue, DeviceArena &arena, size_t slots_off) {
+    queue->slots = static_cast<PTO2ReadyQueueSlot *>(arena.region_ptr(slots_off));  // only field written here
+}
+
+void ready_queue_destroy(PTO2ReadyQueue *queue) {
+    // The arena owns the slots[] buffer, so nothing is freed: only the pointer is cleared.
+    queue->slots = nullptr;
+}
+
+// =============================================================================
+// Scheduler
+// =============================================================================
+
+bool PTO2SchedulerState::RingSchedState::init_data_from_layout(void *sm_dev_base, int32_t ring_id) {
+    // Scalar bookkeeping first: nothing alive yet, advance lock released.
+    advance_lock.store(0, std::memory_order_relaxed);
+    last_task_alive = 0;
+
+    // `ring` receives the address computed by pto2_sm_layout::ring_header_addr
+    // for this ring_id; it is stored here, not dereferenced.
+    ring = pto2_sm_layout::ring_header_addr(sm_dev_base, ring_id);
+
+    // No slot_states[] writes here: bind_ring + reset_for_reuse + the
+    // fanin_count/active_mask zeroing are done by PTO2SharedMemoryHandle::
+    // init_header_per_ring, so host prebuilt-arena init skips SM access.
+    return true;
+}
+
+void PTO2SchedulerState::RingSchedState::destroy() { ring = nullptr; }  // clears the ring header address only
+
+PTO2SchedulerLayout PTO2SchedulerState::reserve_layout(DeviceArena &arena, int32_t dep_pool_capacity) {
+    // Reserve calls stay in the original order (arena offsets presumably depend
+    // on it): shape queues, dummy queue, per-ring dep pools, wiring SPSC buffer.
+    PTO2SchedulerLayout result{};
+    result.dep_pool_capacity = dep_pool_capacity;
+    result.spsc_capacity = PTO2_WRIRING_QUEUE_SIZE;
+    result.ready_queue_capacity = PTO2_READY_QUEUE_SIZE;
+    for (int shape = 0; shape < PTO2_NUM_RESOURCE_SHAPES; ++shape) {
+        result.off_ready_queue_slots[shape] = ready_queue_reserve_layout(arena, PTO2_READY_QUEUE_SIZE);
+    }
+    result.off_dummy_ready_queue_slots = ready_queue_reserve_layout(arena, PTO2_READY_QUEUE_SIZE);
+    // dep_pool entries get a cache-line base: scheduler thread 0 is their sole
+    // writer and must not invalidate neighbouring multi-threaded regions.
+    const size_t dep_pool_bytes = static_cast<size_t>(dep_pool_capacity) * sizeof(PTO2DepListEntry);
+    for (int ring_idx = 0; ring_idx < PTO2_MAX_RING_DEPTH; ++ring_idx) {
+        result.off_dep_pool_entries[ring_idx] = arena.reserve(dep_pool_bytes, PTO2_ALIGN_SIZE);
+    }
+    result.off_wiring_spsc_buffer = PTO2SpscQueue::reserve_layout(arena, PTO2_WRIRING_QUEUE_SIZE);
+    return result;
+}
+
+bool PTO2SchedulerState::init_data_from_layout(
+    const PTO2SchedulerLayout &layout, DeviceArena &arena, void *sm_dev_base
+) {
+    // sm_dev_base is stored and fed to address helpers; this body does not dereference it directly.
+    sm_header = reinterpret_cast<PTO2SharedMemoryHeader *>(sm_dev_base);
+#if PTO2_SCHED_PROFILING
+    tasks_completed.store(0, std::memory_order_relaxed);
+    tasks_consumed.store(0, std::memory_order_relaxed);
+#endif
+
+    // Per-ring scheduler bookkeeping (no arena regions involved).
+    for (int ring_idx = 0; ring_idx < PTO2_MAX_RING_DEPTH; ++ring_idx) {
+        if (!ring_sched_states[ring_idx].init_data_from_layout(sm_dev_base, ring_idx)) return false;
+    }
+
+    // One ready queue per resource shape, then the dummy queue; all share one capacity.
+    const auto queue_cap = layout.ready_queue_capacity;
+    for (int shape = 0; shape < PTO2_NUM_RESOURCE_SHAPES; ++shape) {
+        const bool shape_ok = ready_queue_init_data_from_layout(
+            &ready_queues[shape], arena, layout.off_ready_queue_slots[shape], queue_cap
+        );
+        if (!shape_ok) return false;
+    }
+    const bool dummy_ok =
+        ready_queue_init_data_from_layout(&dummy_ready_queue, arena, layout.off_dummy_ready_queue_slots, queue_cap);
+    if (!dummy_ok) return false;
+
+    // Zero each ring's dep-list entries, then hand the region to its pool.
+    auto *const orch_error = pto2_sm_layout::orch_error_code_addr(sm_dev_base);
+    const size_t dep_bytes = static_cast<size_t>(layout.dep_pool_capacity) * sizeof(PTO2DepListEntry);
+    for (int ring_idx = 0; ring_idx < PTO2_MAX_RING_DEPTH; ++ring_idx) {
+        auto *entries = static_cast<PTO2DepListEntry *>(arena.region_ptr(layout.off_dep_pool_entries[ring_idx]));
+        memset(entries, 0, dep_bytes);
+        ring_sched_states[ring_idx].dep_pool.init(entries, layout.dep_pool_capacity, orch_error);
+    }
+
+    // Wiring SPSC queue plus its batch cursor state.
+    if (!wiring.queue.init_data_from_layout(arena, layout.off_wiring_spsc_buffer, layout.spsc_capacity)) {
+        return false;
+    }
+    wiring.backoff_counter = 0;
+    wiring.batch_index = 0;
+    wiring.batch_count = 0;
+    return true;
+}
+
+void PTO2SchedulerState::wire_arena_pointers(const PTO2SchedulerLayout &layout, DeviceArena &arena) {
+    // Stores pointer fields only: queue slots, per-ring dep_pool.base, wiring buffer.
+    for (int shape = 0; shape < PTO2_NUM_RESOURCE_SHAPES; ++shape) {
+        ready_queue_wire_arena_pointers(&ready_queues[shape], arena, layout.off_ready_queue_slots[shape]);
+    }
+    ready_queue_wire_arena_pointers(&dummy_ready_queue, arena, layout.off_dummy_ready_queue_slots);
+    for (int ring_idx = 0; ring_idx < PTO2_MAX_RING_DEPTH; ++ring_idx) {
+        auto &pool = ring_sched_states[ring_idx].dep_pool;
+        pool.base = static_cast<PTO2DepListEntry *>(arena.region_ptr(layout.off_dep_pool_entries[ring_idx]));
+    }
+    wiring.queue.wire_arena_pointers(arena, layout.off_wiring_spsc_buffer);
+}
+
+void PTO2SchedulerState::destroy() {
+    // Per ring: run its destroy(), then forget the arena-backed dep_pool base.
+    for (int ring_idx = 0; ring_idx < PTO2_MAX_RING_DEPTH; ++ring_idx) {
+        ring_sched_states[ring_idx].destroy();
+        ring_sched_states[ring_idx].dep_pool.base = nullptr;
+    }
+    wiring.queue.destroy();
+    for (int shape = 0; shape < PTO2_NUM_RESOURCE_SHAPES; ++shape) {
+        ready_queue_destroy(&ready_queues[shape]);
+    }
+    ready_queue_destroy(&dummy_ready_queue);
+}
+
+// =============================================================================
+// Orchestrator
+// =============================================================================
+
+PTO2OrchestratorLayout PTO2OrchestratorState::reserve_layout(
+    DeviceArena &arena, const int32_t task_window_sizes[PTO2_MAX_RING_DEPTH], int32_t dep_pool_capacity
+) {
+    // Reserve order is preserved: fanin pools, scope_tasks, scope_begins, tensor map.
+    PTO2OrchestratorLayout result{};
+    result.scope_stack_capacity = PTO2_MAX_SCOPE_DEPTH;
+    result.scope_tasks_cap = PTO2_SCOPE_TASKS_CAP;
+    result.dep_pool_capacity = dep_pool_capacity;
+
+    const size_t fanin_pool_bytes =
+        PTO2_ALIGN_UP(static_cast<size_t>(dep_pool_capacity) * sizeof(PTO2FaninSpillEntry), PTO2_ALIGN_SIZE);
+    for (int ring_idx = 0; ring_idx < PTO2_MAX_RING_DEPTH; ++ring_idx) {
+        result.off_fanin_pool[ring_idx] = arena.reserve(fanin_pool_bytes, PTO2_ALIGN_SIZE);
+    }
+    const size_t scope_tasks_bytes = static_cast<size_t>(result.scope_tasks_cap) * sizeof(PTO2TaskSlotState *);
+    result.off_scope_tasks = arena.reserve(scope_tasks_bytes, alignof(PTO2TaskSlotState *));
+    const size_t scope_begins_bytes = static_cast<size_t>(result.scope_stack_capacity) * sizeof(int32_t);
+    result.off_scope_begins = arena.reserve(scope_begins_bytes, alignof(int32_t));
+    result.tensor_map = PTO2TensorMap::reserve_layout_default(arena, task_window_sizes);
+    return result;
+}
+
+bool PTO2OrchestratorState::init_data_from_layout(
+    const PTO2OrchestratorLayout &layout, DeviceArena &arena, void *sm_dev_base, void *gm_heap, uint64_t heap_size,
+    uint64_t task_window_size
+) {
+    // Begin from a value-initialized object so fields not assigned below hold their defaults.
+    *this = PTO2OrchestratorState{};
+
+    fatal = false;
+    gm_heap_size = heap_size * PTO2_MAX_RING_DEPTH;
+    gm_heap_base = gm_heap;
+    sm_header = reinterpret_cast<PTO2SharedMemoryHeader *>(sm_dev_base);
+
+    // The SM layout helper takes one window size per ring; feed it the same
+    // value for every ring so both sides agree if a per-ring layout appears.
+    uint64_t per_ring_window[PTO2_MAX_RING_DEPTH];
+    for (uint64_t &window : per_ring_window) window = task_window_size;
+
+    // Loop invariants: the shared error-code address and the padded fanin pool size.
+    auto *orch_error = pto2_sm_layout::orch_error_code_addr(sm_dev_base);
+    const size_t fanin_pool_bytes =
+        PTO2_ALIGN_UP(static_cast<size_t>(layout.dep_pool_capacity) * sizeof(PTO2FaninSpillEntry), PTO2_ALIGN_SIZE);
+    for (int ring_idx = 0; ring_idx < PTO2_MAX_RING_DEPTH; ++ring_idx) {
+        auto &ring_state = rings[ring_idx];
+        void *ring_heap_base = reinterpret_cast<char *>(gm_heap) + ring_idx * heap_size;
+        auto *task_descs_dev = pto2_sm_layout::ring_task_descriptors_addr(sm_dev_base, per_ring_window, ring_idx);
+        auto *cur_idx_dev = pto2_sm_layout::ring_current_task_index_addr(sm_dev_base, ring_idx);
+        auto *last_alive_dev = pto2_sm_layout::ring_last_task_alive_addr(sm_dev_base, ring_idx);
+        ring_state.task_allocator.init(
+            task_descs_dev, static_cast<int32_t>(task_window_size), cur_idx_dev, last_alive_dev, ring_heap_base,
+            heap_size, orch_error
+        );
+        auto *fanin_entries = static_cast<PTO2FaninSpillEntry *>(arena.region_ptr(layout.off_fanin_pool[ring_idx]));
+        memset(fanin_entries, 0, fanin_pool_bytes);
+        ring_state.fanin_pool.init(fanin_entries, layout.dep_pool_capacity, orch_error);
+    }
+
+    if (!tensor_map.init_data_from_layout(layout.tensor_map, arena)) {
+        return false;
+    }
+
+    // Scope bookkeeping starts empty.
+    manual_begin_depth = PTO2_MAX_SCOPE_DEPTH;
+    scope_stack_capacity = layout.scope_stack_capacity;
+    scope_stack_top = -1;
+    scope_tasks_capacity = layout.scope_tasks_cap;
+    scope_tasks_size = 0;
+
+    return true;
+}
+
+void PTO2OrchestratorState::wire_arena_pointers(
+    const PTO2OrchestratorLayout &layout, DeviceArena &arena, PTO2SchedulerState *scheduler_arg
+) {
+    for (int ring_idx = 0; ring_idx < PTO2_MAX_RING_DEPTH; ++ring_idx) {
+        auto *fanin_region = arena.region_ptr(layout.off_fanin_pool[ring_idx]);
+        rings[ring_idx].fanin_pool.base = static_cast<PTO2FaninSpillEntry *>(fanin_region);
+    }
+    tensor_map.wire_arena_pointers(layout.tensor_map, arena);
+    scope_tasks = static_cast<PTO2TaskSlotState **>(arena.region_ptr(layout.off_scope_tasks));
+    scope_begins = static_cast<int32_t *>(arena.region_ptr(layout.off_scope_begins));
+    scheduler = scheduler_arg;  // stored as-is, not dereferenced here
+}
+
+void PTO2OrchestratorState::destroy() {
+    // tensor_map.destroy() runs first; then the pointers held directly here are cleared.
+    tensor_map.destroy();
+    for (int ring_idx = 0; ring_idx < PTO2_MAX_RING_DEPTH; ++ring_idx) {
+        rings[ring_idx].fanin_pool.base = nullptr;
+    }
+    scope_begins = nullptr;
+    scope_tasks = nullptr;
+}
+
+void PTO2OrchestratorState::set_scheduler(PTO2SchedulerState *scheduler) { this->scheduler = scheduler; }  // stores only
+
+// =============================================================================
+// Top-level runtime arena
+// =============================================================================
+
+PTO2RuntimeArenaLayout
+runtime_reserve_layout(DeviceArena &arena, uint64_t task_window_size, int32_t dep_pool_capacity) {
+    // Every ring gets the same window size, narrowed once to the int32_t the orchestrator takes.
+    const int32_t window_i32 = static_cast<int32_t>(task_window_size);
+    int32_t per_ring_window[PTO2_MAX_RING_DEPTH];
+    for (int32_t &window : per_ring_window) window = window_i32;
+
+    PTO2RuntimeArenaLayout result{};
+    result.dep_pool_capacity = dep_pool_capacity;
+    result.task_window_size = task_window_size;
+
+    // Reserve order is unchanged: SM handle, orchestrator, scheduler, runtime, mailbox.
+    result.off_sm_handle = arena.reserve(sizeof(PTO2SharedMemoryHandle), alignof(PTO2SharedMemoryHandle));
+    result.orch = PTO2OrchestratorState::reserve_layout(arena, per_ring_window, dep_pool_capacity);
+    result.sched = PTO2SchedulerState::reserve_layout(arena, dep_pool_capacity);
+    result.off_runtime = arena.reserve(sizeof(PTO2Runtime), PTO2_ALIGN_SIZE);
+    result.off_mailbox = arena.reserve(sizeof(AICoreCompletionMailbox), alignof(AICoreCompletionMailbox));
+    result.arena_size = arena.total_size();
+    return result;
+}
+
+PTO2Runtime *runtime_init_data_from_layout(
+    DeviceArena &arena, const PTO2RuntimeArenaLayout &layout, PTO2RuntimeMode mode, void *sm_dev_base,
+    uint64_t /*sm_size*/, void *gm_heap_dev_base, uint64_t heap_size
+) {
+    // Zero the two arena-resident structs before any field is set.
+    auto *sm_wrap = static_cast<PTO2SharedMemoryHandle *>(arena.region_ptr(layout.off_sm_handle));
+    memset(sm_wrap, 0, sizeof(*sm_wrap));
+    auto *rt = static_cast<PTO2Runtime *>(arena.region_ptr(layout.off_runtime));
+    memset(rt, 0, sizeof(*rt));
+
+    // Plain fields; rt->ops is left zero for the AICPU to fill at boot.
+    rt->total_cycles = 0;
+    rt->gm_heap_owned = false;
+    rt->gm_heap_size = heap_size == 0 ? 0 : heap_size * PTO2_MAX_RING_DEPTH;
+    rt->gm_heap = gm_heap_dev_base;
+    rt->mode = mode;
+
+    // Either sub-init failing returns before the mailbox region is touched.
+    const bool orch_ok = rt->orchestrator.init_data_from_layout(
+        layout.orch, arena, sm_dev_base, gm_heap_dev_base, heap_size, layout.task_window_size
+    );
+    if (!orch_ok) return nullptr;
+    if (!rt->scheduler.init_data_from_layout(layout.sched, arena, sm_dev_base)) {
+        return nullptr;
+    }
+
+    auto *mailbox = static_cast<AICoreCompletionMailbox *>(arena.region_ptr(layout.off_mailbox));
+    memset(mailbox, 0, sizeof(*mailbox));
+
+    return rt;
+}
+
+void runtime_wire_arena_pointers(DeviceArena &arena, const PTO2RuntimeArenaLayout &layout, PTO2Runtime *rt) {
+    rt->orchestrator.wire_arena_pointers(layout.orch, arena, &rt->scheduler);  // scheduler passed by address only
+    rt->scheduler.wire_arena_pointers(layout.sched, arena);
+    rt->aicore_mailbox = static_cast<AICoreCompletionMailbox *>(arena.region_ptr(layout.off_mailbox));
+    rt->sm_handle = static_cast<PTO2SharedMemoryHandle *>(arena.region_ptr(layout.off_sm_handle));
+}
+
+void runtime_destroy(PTO2Runtime *rt, DeviceArena & /*arena*/) {
+    // Null-safe. The arena buffer (pooled across runs by DeviceRunner) is never freed here.
+    if (rt == nullptr) return;
+    rt->scheduler.destroy();
+    rt->orchestrator.destroy();
+    rt->sm_handle = nullptr;
+    rt->aicore_mailbox = nullptr;
+}
diff --git a/src/a5/runtime/tensormap_and_ringbuffer/runtime/shared/pto_shared_memory.cpp b/src/a5/runtime/tensormap_and_ringbuffer/runtime/shared/pto_shared_memory.cpp
index 358c87f57..1e1edff92 100644
--- a/src/a5/runtime/tensormap_and_ringbuffer/runtime/shared/pto_shared_memory.cpp
+++ b/src/a5/runtime/tensormap_and_ringbuffer/runtime/shared/pto_shared_memory.cpp
@@ -167,6 +167,23 @@ void PTO2SharedMemoryHandle::init_header_per_ring(
     header->sched_error_bitmap.store(0, std::memory_order_relaxed);
     header->sched_error_code.store(PTO2_ERROR_NONE, std::memory_order_relaxed);
     header->sched_error_thread.store(-1, std::memory_order_relaxed);
+
+    // Per-ring slot_states reset. Previously lived in
+    // PTO2SchedulerState::RingSchedState::init(), but it writes into
+    // ring->slot_states[] which is SM-side storage — keeping it here lets
+    // host-side prebuilt-arena init skip all SM dereferences.
+    // bind_ring() pins the ring_id (slot-invariant after this point);
+    // reset_for_reuse() prepares dynamic fanout/refcount fields so the first
+    // submit doesn't need an explicit reset.
+    for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) {
+        auto &ring = header->rings[r];
+        for (uint64_t i = 0; i < task_window_sizes[r]; i++) {
+            ring.slot_states[i].bind_ring(static_cast<uint8_t>(r));
+            ring.slot_states[i].reset_for_reuse();
+            ring.slot_states[i].fanin_count = 0;
+            ring.slot_states[i].active_mask = ActiveMask{};
+        }
+    }
 }
 
 // =============================================================================
diff --git a/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_tensormap.cpp b/src/a5/runtime/tensormap_and_ringbuffer/runtime/shared/pto_tensormap.cpp
similarity index 82%
rename from src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_tensormap.cpp
rename to src/a5/runtime/tensormap_and_ringbuffer/runtime/shared/pto_tensormap.cpp
index c09e6f4f6..b99c67233 100644
--- a/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_tensormap.cpp
+++ b/src/a5/runtime/tensormap_and_ringbuffer/runtime/shared/pto_tensormap.cpp
@@ -30,7 +30,6 @@
 
 #include "common.h"
 #include "common/unified_log.h"
-#include "pto_orchestrator.h"
 
 // =============================================================================
 // TensorMap Lookup Chain Length Statistics (compile-time toggle)
@@ -82,37 +81,45 @@ PTO2TensorMap::reserve_layout_default(DeviceArena &arena, const int32_t new_task
     return reserve_layout(arena, PTO2_TENSORMAP_NUM_BUCKETS, PTO2_TENSORMAP_POOL_SIZE, new_task_window_sizes);
 }
 
-bool PTO2TensorMap::init_from_layout(const PTO2TensorMapLayout &layout, DeviceArena &arena) {
+bool PTO2TensorMap::init_data_from_layout(const PTO2TensorMapLayout &layout, DeviceArena &arena) {
     num_buckets = layout.num_buckets;
     pool_size = layout.pool_size;
 
-    buckets = static_cast<PTO2TensorMapEntry **>(arena.region_ptr(layout.off_buckets));
-    entry_pool = static_cast<PTO2TensorMapEntry *>(arena.region_ptr(layout.off_entry_pool));
-    free_entry_list = static_cast<PTO2TensorMapEntry **>(arena.region_ptr(layout.off_free_entry_list));
+    // Address arena regions for data writes; do not store these in struct
+    // fields (wire_arena_pointers does that).
+    auto *buckets_arena = static_cast<PTO2TensorMapEntry **>(arena.region_ptr(layout.off_buckets));
+    auto *entry_pool_arena = static_cast<PTO2TensorMapEntry *>(arena.region_ptr(layout.off_entry_pool));
+    auto *free_list_arena = static_cast<PTO2TensorMapEntry **>(arena.region_ptr(layout.off_free_entry_list));
 
+    // buckets[]: empty == nullptr.
     for (int32_t i = 0; i < num_buckets; i++) {
-        buckets[i] = nullptr;
+        buckets_arena[i] = nullptr;
     }
 
-    memset(entry_pool, 0, static_cast<size_t>(pool_size) * sizeof(PTO2TensorMapEntry));
+    // entry_pool: zero-init equivalent to the previous calloc(entry_pool, ...).
+    // The pool's persistent invariant after init is "bucket_index == -1 means
+    // not linked", set explicitly below.
+    memset(entry_pool_arena, 0, static_cast<size_t>(pool_size) * sizeof(PTO2TensorMapEntry));
     for (int32_t i = 0; i < pool_size; i++) {
-        entry_pool[i].bucket_index = -1;
-        entry_pool[i].next_in_bucket = nullptr;
-        entry_pool[i].prev_in_bucket = nullptr;
-        entry_pool[i].next_in_task = nullptr;
-        entry_pool[i].prev_in_task = nullptr;
-        entry_pool[i].producer_task_id = PTO2TaskId{};
+        entry_pool_arena[i].bucket_index = -1;
+        entry_pool_arena[i].next_in_bucket = nullptr;
+        entry_pool_arena[i].prev_in_bucket = nullptr;
+        entry_pool_arena[i].next_in_task = nullptr;
+        entry_pool_arena[i].prev_in_task = nullptr;
+        entry_pool_arena[i].producer_task_id = PTO2TaskId{};
     }
 
-    memset(free_entry_list, 0, static_cast<size_t>(pool_size) * sizeof(PTO2TensorMapEntry *));
+    // free_entry_list: zeroed (was calloc'd before); contents become meaningful
+    // only after entries are freed back, so the body of the array stays as 0.
+    memset(free_list_arena, 0, static_cast<size_t>(pool_size) * sizeof(PTO2TensorMapEntry *));
 
     next_entry_idx = 0;
     free_num = 0;
 
     for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) {
-        task_entry_heads[r] = static_cast<PTO2TensorMapEntry **>(arena.region_ptr(layout.off_task_entry_heads[r]));
+        auto *heads_arena = static_cast<PTO2TensorMapEntry **>(arena.region_ptr(layout.off_task_entry_heads[r]));
         for (int32_t i = 0; i < layout.task_window_sizes[r]; i++) {
-            task_entry_heads[r][i] = nullptr;
+            heads_arena[i] = nullptr;
         }
         task_window_sizes[r] = layout.task_window_sizes[r];
         last_task_alives[r] = 0;
@@ -122,6 +129,15 @@ bool PTO2TensorMap::init_from_layout(const PTO2TensorMapLayout &layout, DeviceAr
     return true;
 }
 
+void PTO2TensorMap::wire_arena_pointers(const PTO2TensorMapLayout &layout, DeviceArena &arena) {
+    buckets = static_cast<PTO2TensorMapEntry **>(arena.region_ptr(layout.off_buckets));
+    entry_pool = static_cast<PTO2TensorMapEntry *>(arena.region_ptr(layout.off_entry_pool));
+    free_entry_list = static_cast<PTO2TensorMapEntry **>(arena.region_ptr(layout.off_free_entry_list));
+    for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) {
+        task_entry_heads[r] = static_cast<PTO2TensorMapEntry **>(arena.region_ptr(layout.off_task_entry_heads[r]));
+    }
+}
+
 void PTO2TensorMap::destroy() {
     // Arena owns the backing memory; here we only forget our pointers so any
     // stray post-destroy access trips a nullptr dereference instead of reading
diff --git a/src/a5/runtime/tensormap_and_ringbuffer/runtime/shared/runtime.cpp b/src/a5/runtime/tensormap_and_ringbuffer/runtime/shared/runtime.cpp
index 7daa54ed5..0ebb2ef79 100644
--- a/src/a5/runtime/tensormap_and_ringbuffer/runtime/shared/runtime.cpp
+++ b/src/a5/runtime/tensormap_and_ringbuffer/runtime/shared/runtime.cpp
@@ -46,6 +46,8 @@ Runtime::Runtime() {
     gm_heap_ptr_ = nullptr;
     slot_states_ptr_ = nullptr;
     orch_args_storage_.clear();
+    prebuilt_arena_base_ = nullptr;
+    prebuilt_runtime_offset_ = 0;
 
     // Initialize device orchestration SO binary
     dev_orch_so_addr_ = 0;
@@ -76,6 +78,13 @@ void Runtime::set_gm_heap(void *p) { gm_heap_ptr_ = p; }
 void Runtime::set_slot_states_ptr(void *p) { slot_states_ptr_ = p; }
 void Runtime::set_orch_args(const ChipStorageTaskArgs &args) { orch_args_storage_ = args; }
 
+void Runtime::set_prebuilt_arena(void *arena_base, size_t runtime_off) {
+    prebuilt_arena_base_ = arena_base;
+    prebuilt_runtime_offset_ = runtime_off;
+}
+void *Runtime::get_prebuilt_arena_base() const { return prebuilt_arena_base_; }
+size_t Runtime::get_prebuilt_runtime_offset() const { return prebuilt_runtime_offset_; }
+
 // Device orchestration SO metadata (bytes live in a separate device buffer
 // owned by DeviceRunner; only the address/size travels in Runtime).
 void Runtime::set_dev_orch_so(uint64_t dev_addr, uint64_t size) {
diff --git a/tests/ut/cpp/a5/test_scheduler_state.cpp b/tests/ut/cpp/a5/test_scheduler_state.cpp
index 952aad55a..75476dedf 100644
--- a/tests/ut/cpp/a5/test_scheduler_state.cpp
+++ b/tests/ut/cpp/a5/test_scheduler_state.cpp
@@ -34,7 +34,8 @@ class SchedulerStateTest : public ::testing::Test {
         ASSERT_NE(sm_handle, nullptr);
         auto layout = PTO2SchedulerState::reserve_layout(sched_arena);
         ASSERT_NE(sched_arena.commit(), nullptr);
-        ASSERT_TRUE(sched.init_from_layout(layout, sched_arena, sm_handle->header));
+        ASSERT_TRUE(sched.init_data_from_layout(layout, sched_arena, sm_handle->header));
+        sched.wire_arena_pointers(layout, sched_arena);
     }
 
     void TearDown() override {
diff --git a/tests/ut/cpp/a5/test_task_allocator.cpp b/tests/ut/cpp/a5/test_task_allocator.cpp
index 383003900..512e241a2 100644
--- a/tests/ut/cpp/a5/test_task_allocator.cpp
+++ b/tests/ut/cpp/a5/test_task_allocator.cpp
@@ -388,7 +388,10 @@ TEST_F(TaskAllocatorTest, TaskWindowSaturates) {
 TEST_F(TaskAllocatorTest, TaskIdNearInt32Max) {
     current_index.store(INT32_MAX - 2);
     last_alive.store(INT32_MAX - 2);
-    allocator.init(descriptors.data(), WINDOW_SIZE, &current_index, &last_alive, heap_buf, HEAP_SIZE, &error_code);
+    allocator.init(
+        descriptors.data(), WINDOW_SIZE, &current_index, &last_alive, heap_buf, HEAP_SIZE, &error_code,
+        /*initial_local_task_id=*/INT32_MAX - 2
+    );
 
     auto r1 = allocator.alloc(0);
     ASSERT_FALSE(r1.failed());
diff --git a/tests/ut/cpp/a5/test_task_state.cpp b/tests/ut/cpp/a5/test_task_state.cpp
index 729b74999..ffced6f9a 100644
--- a/tests/ut/cpp/a5/test_task_state.cpp
+++ b/tests/ut/cpp/a5/test_task_state.cpp
@@ -43,7 +43,8 @@ class TaskStateTest : public ::testing::Test {
         ASSERT_NE(sm_handle, nullptr);
         auto layout = PTO2SchedulerState::reserve_layout(sched_arena);
         ASSERT_NE(sched_arena.commit(), nullptr);
-        ASSERT_TRUE(sched.init_from_layout(layout, sched_arena, sm_handle->header));
+        ASSERT_TRUE(sched.init_data_from_layout(layout, sched_arena, sm_handle->header));
+        sched.wire_arena_pointers(layout, sched_arena);
     }
 
     void TearDown() override {
diff --git a/tests/ut/cpp/a5/test_wiring.cpp b/tests/ut/cpp/a5/test_wiring.cpp
index b01052a85..1e8fee9c5 100644
--- a/tests/ut/cpp/a5/test_wiring.cpp
+++ b/tests/ut/cpp/a5/test_wiring.cpp
@@ -48,7 +48,8 @@ class WiringTest : public ::testing::Test {
         ASSERT_NE(sm_handle, nullptr);
         auto layout = PTO2SchedulerState::reserve_layout(sched_arena);
         ASSERT_NE(sched_arena.commit(), nullptr);
-        ASSERT_TRUE(sched.init_from_layout(layout, sched_arena, sm_handle->header));
+        ASSERT_TRUE(sched.init_data_from_layout(layout, sched_arena, sm_handle->header));
+        sched.wire_arena_pointers(layout, sched_arena);
     }
 
     void TearDown() override {

From 008658ea6d7d31916d88365b1ede5a3df5228a7c Mon Sep 17 00:00:00 2001
From: poursoul <poursoul@126.com>
Date: Wed, 27 May 2026 12:28:05 +0800
Subject: [PATCH 5/7] Refactor: split per-Worker static arena into three
 independent allocations
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

DeviceRunner's GM heap / PTO2 SM / trb prebuilt runtime arena used to live
in a single backing device buffer (one rtMalloc per worker, three regions
sub-divided via DeviceArena::reserve). The combined size can exceed the
device allocator's largest contiguous block on real hardware, so split
them into three independent DeviceArena instances — each commits exactly
one region (one device_malloc), and acquire_pooled_* returns its base().

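Touches all four DeviceRunner implementations (a2a3/a5 × onboard/sim).
The setup_static_arena and acquire_pooled_* signatures are unchanged;
the host_api / runtime_maker callers are unaffected. A region is now
re-allocated only when its own requested size grows; if a later region
fails to commit, the regions processed before it are released and
setup_static_arena returns -1. hbg keeps passing
runtime_arena_size = 0, which leaves runtime_arena_pool_ uncommitted
and acquire_pooled_runtime_arena returning nullptr.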

Tests
- cpput: 25/25 pass.
- a5sim: L2 trb + host_build_graph full suite pass.
- a2a3sim: L2 trb + host_build_graph full suite pass.
---
 .../platform/onboard/host/device_runner.cpp   | 103 ++++++++++--------
 .../platform/onboard/host/device_runner.h     |  31 ++++--
 src/a2a3/platform/sim/host/device_runner.cpp  |  94 ++++++++--------
 src/a2a3/platform/sim/host/device_runner.h    |  27 +++--
 .../platform/onboard/host/device_runner.cpp   |  97 +++++++++--------
 src/a5/platform/onboard/host/device_runner.h  |  27 +++--
 src/a5/platform/sim/host/device_runner.cpp    |  86 ++++++++-------
 src/a5/platform/sim/host/device_runner.h      |  27 +++--
 8 files changed, 277 insertions(+), 215 deletions(-)

diff --git a/src/a2a3/platform/onboard/host/device_runner.cpp b/src/a2a3/platform/onboard/host/device_runner.cpp
index 8d2d9916b..9b66e05ae 100644
--- a/src/a2a3/platform/onboard/host/device_runner.cpp
+++ b/src/a2a3/platform/onboard/host/device_runner.cpp
@@ -250,60 +250,74 @@ int AicpuSoInfo::finalize() {
 DeviceRunner::~DeviceRunner() { finalize(); }
 
 int DeviceRunner::setup_static_arena(size_t gm_heap_size, size_t gm_sm_size, size_t runtime_arena_size) {
-    if (static_arena_.is_committed()) {
-        // Idempotent for the production case (sizes do not change across a
-        // worker's lifetime). If a caller asks for a larger layout, redo it.
-        if (gm_heap_size <= cached_gm_heap_size_ && gm_sm_size <= cached_gm_sm_size_ &&
-            runtime_arena_size <= cached_runtime_arena_size_) {
+    // Three independent device_malloc'd buffers: GM heap, PTO2 SM, prebuilt
+    // runtime arena. Split out from a single large allocation because the
+    // combined size can exceed the device allocator's largest contiguous
+    // block. Each arena commits exactly one region, so its base() is the
+    // pooled pointer the caller wants.
+    //
+    // Idempotent for the production case (sizes do not change across a
+    // worker's lifetime). A larger request re-allocates only that region;
+    // peers stay committed unless a later region's commit fails, in which
+    // case the regions processed before it are released and -1 is returned.
+    auto commit_region = [](DeviceArena &arena, size_t &cached_size, size_t requested_size) -> int {
+        if (requested_size == 0) {
+            // hbg's runtime_arena path: caller passed 0 and never reserved
+            // a region. Leave the arena uncommitted; acquire_pooled_* will
+            // return nullptr.
+            if (arena.is_committed() && cached_size != 0) {
+                arena.release();
+                cached_size = 0;
+            }
+            return 0;
+        }
+        if (arena.is_committed() && requested_size <= cached_size) {
             return 0;
         }
-        static_arena_.release();
-        gm_heap_region_off_ = SIZE_MAX;
-        gm_sm_region_off_ = SIZE_MAX;
-        runtime_arena_region_off_ = SIZE_MAX;
+        arena.release();
+        cached_size = 0;
+        arena.reserve(requested_size, DeviceArena::kDefaultBaseAlign);
+        if (arena.commit(DeviceArena::kDefaultBaseAlign) == nullptr) {
+            // Roll back the failed reserve so the arena holds no stale
+            // cursor state. release() is idempotent on a never-committed
+            // arena (zeroes cursor_).
+            arena.release();
+            return -1;
+        }
+        cached_size = requested_size;
+        return 0;
+    };
+    if (commit_region(gm_heap_arena_, cached_gm_heap_size_, gm_heap_size) != 0) return -1;
+    if (commit_region(gm_sm_arena_, cached_gm_sm_size_, gm_sm_size) != 0) {
+        gm_heap_arena_.release();
+        cached_gm_heap_size_ = 0;
+        return -1;
+    }
+    if (commit_region(runtime_arena_pool_, cached_runtime_arena_size_, runtime_arena_size) != 0) {
+        gm_heap_arena_.release();
+        gm_sm_arena_.release();
         cached_gm_heap_size_ = 0;
         cached_gm_sm_size_ = 0;
-        cached_runtime_arena_size_ = 0;
-    }
-    gm_heap_region_off_ = static_arena_.reserve(gm_heap_size, DeviceArena::kDefaultBaseAlign);
-    gm_sm_region_off_ = static_arena_.reserve(gm_sm_size, DeviceArena::kDefaultBaseAlign);
-    if (runtime_arena_size > 0) {
-        runtime_arena_region_off_ = static_arena_.reserve(runtime_arena_size, DeviceArena::kDefaultBaseAlign);
-    }
-    if (static_arena_.commit(DeviceArena::kDefaultBaseAlign) == nullptr) {
-        // Roll back the reserves: commit() failure leaves committed_=false,
-        // so the next entry would skip the release branch and stack new
-        // reserves on top of the stale cursor. release() is idempotent on a
-        // never-committed arena (just zeroes cursor_ / region_count_).
-        static_arena_.release();
-        gm_heap_region_off_ = SIZE_MAX;
-        gm_sm_region_off_ = SIZE_MAX;
-        runtime_arena_region_off_ = SIZE_MAX;
         return -1;
     }
-    cached_gm_heap_size_ = gm_heap_size;
-    cached_gm_sm_size_ = gm_sm_size;
-    cached_runtime_arena_size_ = runtime_arena_size;
     return 0;
 }
 
 void *DeviceRunner::acquire_pooled_gm_heap() {
-    if (!static_arena_.is_committed()) return nullptr;
-    return static_arena_.region_ptr(gm_heap_region_off_);
+    if (!gm_heap_arena_.is_committed()) return nullptr;
+    return gm_heap_arena_.base();
 }
 
 void *DeviceRunner::acquire_pooled_gm_sm() {
-    if (!static_arena_.is_committed()) return nullptr;
-    return static_arena_.region_ptr(gm_sm_region_off_);
+    if (!gm_sm_arena_.is_committed()) return nullptr;
+    return gm_sm_arena_.base();
 }
 
 void *DeviceRunner::acquire_pooled_runtime_arena() {
-    if (!static_arena_.is_committed()) return nullptr;
-    // hbg calls setup_static_arena(...,0) and never reserves a runtime-arena
-    // region — fail loudly if a caller asks for it anyway, rather than
-    // returning region_ptr(SIZE_MAX) (base + SIZE_MAX is undefined).
-    if (runtime_arena_region_off_ == SIZE_MAX) return nullptr;
-    return static_arena_.region_ptr(runtime_arena_region_off_);
+    // hbg calls setup_static_arena(...,0) and leaves runtime_arena_pool_
+    // uncommitted — return nullptr if a caller asks for it anyway.
+    if (!runtime_arena_pool_.is_committed()) return nullptr;
+    return runtime_arena_pool_.base();
 }
 
 std::thread DeviceRunner::create_thread(std::function<void()> fn) {
@@ -1241,14 +1255,13 @@ int DeviceRunner::finalize() {
     // perf_cleanup guard; this is the backstop for the no-run-since-init case.
     finalize_collectors();
 
-    // Release per-Worker static arena (GM heap + PTO2 SM + optional trb
-    // prebuilt runtime arena in a single backing device allocation). Must
-    // precede mem_alloc_.finalize() so the arena frees through the still-live
+    // Release the three per-Worker pooled arenas (GM heap, PTO2 SM, optional
+    // trb prebuilt runtime arena — each its own device_malloc). Must precede
+    // mem_alloc_.finalize() so the arenas free through the still-live
     // allocator, not after it.
-    static_arena_.release();
-    gm_heap_region_off_ = SIZE_MAX;
-    gm_sm_region_off_ = SIZE_MAX;
-    runtime_arena_region_off_ = SIZE_MAX;
+    gm_heap_arena_.release();
+    gm_sm_arena_.release();
+    runtime_arena_pool_.release();
     cached_gm_heap_size_ = 0;
     cached_gm_sm_size_ = 0;
     cached_runtime_arena_size_ = 0;
diff --git a/src/a2a3/platform/onboard/host/device_runner.h b/src/a2a3/platform/onboard/host/device_runner.h
index 4d9819f21..8f6e1b3f9 100644
--- a/src/a2a3/platform/onboard/host/device_runner.h
+++ b/src/a2a3/platform/onboard/host/device_runner.h
@@ -185,7 +185,9 @@ struct KernelArgsHelper {
 class DeviceRunner {
 public:
     DeviceRunner() :
-        static_arena_(&arena_alloc_trampoline, &arena_free_trampoline, &mem_alloc_) {}
+        gm_heap_arena_(&arena_alloc_trampoline, &arena_free_trampoline, &mem_alloc_),
+        gm_sm_arena_(&arena_alloc_trampoline, &arena_free_trampoline, &mem_alloc_),
+        runtime_arena_pool_(&arena_alloc_trampoline, &arena_free_trampoline, &mem_alloc_) {}
     ~DeviceRunner();
 
     /**
@@ -612,23 +614,28 @@ class DeviceRunner {
     // Memory management
     MemoryAllocator mem_alloc_;
 
-    // Per-Worker arena backing the PTO2 GM heap + PTO2 shared memory in a
-    // single device allocation. Released explicitly in finalize() before
-    // mem_alloc_.finalize() so it does not free pointers a second time.
+    // Three independent per-Worker arenas, each backing a single pooled
+    // region (PTO2 GM heap / PTO2 shared memory / trb prebuilt runtime
+    // arena). Split out from a single backing allocation because the
+    // combined size can exceed the device allocator's largest contiguous
+    // block — three separate device_malloc calls are friendlier than one
+    // big one. Released explicitly in finalize() before mem_alloc_.finalize()
+    // so the underlying buffers do not get freed twice.
+    //
+    // `runtime_arena_pool_` stays unreserved when setup_static_arena was
+    // invoked with runtime_arena_size == 0 (hbg path).
     //
     // Trampolines forward DeviceArena's alloc/free calls to mem_alloc_.
     static void *arena_alloc_trampoline(void *ctx, size_t size) {
         return static_cast<MemoryAllocator *>(ctx)->alloc(size);
     }
     static void arena_free_trampoline(void *ctx, void *p) { static_cast<MemoryAllocator *>(ctx)->free(p); }
-    DeviceArena static_arena_;
-    size_t gm_heap_region_off_{SIZE_MAX};
-    size_t gm_sm_region_off_{SIZE_MAX};
-    // SIZE_MAX (= "not provisioned") when the caller passed runtime_arena_size
-    // == 0 (hbg path); a real offset for trb.
-    size_t runtime_arena_region_off_{SIZE_MAX};
-    // Cached sizes for setup_static_arena's "fits" check — avoids calling
-    // region_size() on the arena's public API for the regions we own.
+    DeviceArena gm_heap_arena_;
+    DeviceArena gm_sm_arena_;
+    DeviceArena runtime_arena_pool_;
+    // Cached sizes for setup_static_arena's "fits" check — avoids re-allocating
+    // the same buffer when a later worker init asks for an equal-or-smaller
+    // layout on an already-committed arena.
     size_t cached_gm_heap_size_{0};
     size_t cached_gm_sm_size_{0};
     size_t cached_runtime_arena_size_{0};
diff --git a/src/a2a3/platform/sim/host/device_runner.cpp b/src/a2a3/platform/sim/host/device_runner.cpp
index c221bb714..1651c4a89 100644
--- a/src/a2a3/platform/sim/host/device_runner.cpp
+++ b/src/a2a3/platform/sim/host/device_runner.cpp
@@ -123,58 +123,67 @@ bool create_temp_so_file(const std::string &path_template, const uint8_t *data,
 DeviceRunner::~DeviceRunner() { finalize(); }
 
 int DeviceRunner::setup_static_arena(size_t gm_heap_size, size_t gm_sm_size, size_t runtime_arena_size) {
-    if (static_arena_.is_committed()) {
-        if (gm_heap_size <= cached_gm_heap_size_ && gm_sm_size <= cached_gm_sm_size_ &&
-            runtime_arena_size <= cached_runtime_arena_size_) {
+    // Three independent device_malloc'd buffers: GM heap, PTO2 SM, prebuilt
+    // runtime arena. Split out from a single large allocation because the
+    // combined size can exceed the device allocator's largest contiguous
+    // block. Each arena commits exactly one region, so its base() is the
+    // pooled pointer the caller wants.
+    //
+    // Idempotent for the production case (sizes do not change across a
+    // worker's lifetime). If a caller asks for a larger layout on any
+    // region, redo just that region.
+    auto commit_region = [](DeviceArena &arena, size_t &cached_size, size_t requested_size) -> int {
+        if (requested_size == 0) {
+            if (arena.is_committed() && cached_size != 0) {
+                arena.release();
+                cached_size = 0;
+            }
+            return 0;
+        }
+        if (arena.is_committed() && requested_size <= cached_size) {
             return 0;
         }
-        static_arena_.release();
-        gm_heap_region_off_ = SIZE_MAX;
-        gm_sm_region_off_ = SIZE_MAX;
-        runtime_arena_region_off_ = SIZE_MAX;
+        arena.release();
+        cached_size = 0;
+        arena.reserve(requested_size, DeviceArena::kDefaultBaseAlign);
+        if (arena.commit(DeviceArena::kDefaultBaseAlign) == nullptr) {
+            arena.release();
+            return -1;
+        }
+        cached_size = requested_size;
+        return 0;
+    };
+    if (commit_region(gm_heap_arena_, cached_gm_heap_size_, gm_heap_size) != 0) return -1;
+    if (commit_region(gm_sm_arena_, cached_gm_sm_size_, gm_sm_size) != 0) {
+        gm_heap_arena_.release();
+        cached_gm_heap_size_ = 0;
+        return -1;
+    }
+    if (commit_region(runtime_arena_pool_, cached_runtime_arena_size_, runtime_arena_size) != 0) {
+        gm_heap_arena_.release();
+        gm_sm_arena_.release();
         cached_gm_heap_size_ = 0;
         cached_gm_sm_size_ = 0;
-        cached_runtime_arena_size_ = 0;
-    }
-    gm_heap_region_off_ = static_arena_.reserve(gm_heap_size, DeviceArena::kDefaultBaseAlign);
-    gm_sm_region_off_ = static_arena_.reserve(gm_sm_size, DeviceArena::kDefaultBaseAlign);
-    if (runtime_arena_size > 0) {
-        runtime_arena_region_off_ = static_arena_.reserve(runtime_arena_size, DeviceArena::kDefaultBaseAlign);
-    }
-    if (static_arena_.commit(DeviceArena::kDefaultBaseAlign) == nullptr) {
-        // Roll back the reserves: commit() failure leaves committed_=false,
-        // so the next entry would skip the release branch and stack new
-        // reserves on top of the stale cursor. release() is idempotent on a
-        // never-committed arena (just zeroes cursor_ / region_count_).
-        static_arena_.release();
-        gm_heap_region_off_ = SIZE_MAX;
-        gm_sm_region_off_ = SIZE_MAX;
-        runtime_arena_region_off_ = SIZE_MAX;
         return -1;
     }
-    cached_gm_heap_size_ = gm_heap_size;
-    cached_gm_sm_size_ = gm_sm_size;
-    cached_runtime_arena_size_ = runtime_arena_size;
     return 0;
 }
 
 void *DeviceRunner::acquire_pooled_gm_heap() {
-    if (!static_arena_.is_committed()) return nullptr;
-    return static_arena_.region_ptr(gm_heap_region_off_);
+    if (!gm_heap_arena_.is_committed()) return nullptr;
+    return gm_heap_arena_.base();
 }
 
 void *DeviceRunner::acquire_pooled_gm_sm() {
-    if (!static_arena_.is_committed()) return nullptr;
-    return static_arena_.region_ptr(gm_sm_region_off_);
+    if (!gm_sm_arena_.is_committed()) return nullptr;
+    return gm_sm_arena_.base();
 }
 
 void *DeviceRunner::acquire_pooled_runtime_arena() {
-    if (!static_arena_.is_committed()) return nullptr;
-    // hbg calls setup_static_arena(...,0) and never reserves a runtime-arena
-    // region — fail loudly if a caller asks for it anyway, rather than
-    // returning region_ptr(SIZE_MAX) (base + SIZE_MAX is undefined).
-    if (runtime_arena_region_off_ == SIZE_MAX) return nullptr;
-    return static_arena_.region_ptr(runtime_arena_region_off_);
+    // hbg calls setup_static_arena(...,0) and leaves runtime_arena_pool_
+    // uncommitted — return nullptr if a caller asks for it anyway.
+    if (!runtime_arena_pool_.is_committed()) return nullptr;
+    return runtime_arena_pool_.base();
 }
 
 std::thread DeviceRunner::create_thread(std::function<void()> fn) {
@@ -1051,14 +1060,13 @@ int DeviceRunner::finalize() {
     // Close executor .so files (typically already closed by run(), this is a safety net)
     unload_executor_binaries();
 
-    // Release per-Worker static arena (GM heap + PTO2 SM + optional trb
-    // prebuilt runtime arena in a single backing device allocation). Must
-    // precede mem_alloc_.finalize() so the arena frees through the still-live
+    // Release the three per-Worker pooled arenas (GM heap, PTO2 SM, optional
+    // trb prebuilt runtime arena — each its own device_malloc). Must precede
+    // mem_alloc_.finalize() so the arenas free through the still-live
     // allocator, not after it.
-    static_arena_.release();
-    gm_heap_region_off_ = SIZE_MAX;
-    gm_sm_region_off_ = SIZE_MAX;
-    runtime_arena_region_off_ = SIZE_MAX;
+    gm_heap_arena_.release();
+    gm_sm_arena_.release();
+    runtime_arena_pool_.release();
     cached_gm_heap_size_ = 0;
     cached_gm_sm_size_ = 0;
     cached_runtime_arena_size_ = 0;
diff --git a/src/a2a3/platform/sim/host/device_runner.h b/src/a2a3/platform/sim/host/device_runner.h
index 60f1bfdc9..a98eec1b8 100644
--- a/src/a2a3/platform/sim/host/device_runner.h
+++ b/src/a2a3/platform/sim/host/device_runner.h
@@ -75,7 +75,9 @@
 class DeviceRunner {
 public:
     DeviceRunner() :
-        static_arena_(&arena_alloc_trampoline, &arena_free_trampoline, &mem_alloc_) {}
+        gm_heap_arena_(&arena_alloc_trampoline, &arena_free_trampoline, &mem_alloc_),
+        gm_sm_arena_(&arena_alloc_trampoline, &arena_free_trampoline, &mem_alloc_),
+        runtime_arena_pool_(&arena_alloc_trampoline, &arena_free_trampoline, &mem_alloc_) {}
     ~DeviceRunner();
 
     /**
@@ -287,21 +289,26 @@ class DeviceRunner {
     // Memory management
     MemoryAllocator mem_alloc_;
 
-    // Per-Worker arena backing the PTO2 GM heap + PTO2 shared memory in a
-    // single device allocation. Released explicitly in finalize() before
-    // mem_alloc_.finalize() so it does not free pointers a second time.
+    // Three independent per-Worker arenas, each backing a single pooled
+    // region (PTO2 GM heap / PTO2 shared memory / trb prebuilt runtime
+    // arena). Split out from a single backing allocation because the
+    // combined size can exceed the device allocator's largest contiguous
+    // block. Released explicitly in finalize() before mem_alloc_.finalize()
+    // so the underlying buffers do not get freed twice.
+    //
+    // `runtime_arena_pool_` stays unreserved when setup_static_arena was
+    // invoked with runtime_arena_size == 0 (hbg path).
     //
     // Trampolines forward DeviceArena's alloc/free to mem_alloc_.
     static void *arena_alloc_trampoline(void *ctx, size_t size) {
         return static_cast<MemoryAllocator *>(ctx)->alloc(size);
     }
     static void arena_free_trampoline(void *ctx, void *p) { static_cast<MemoryAllocator *>(ctx)->free(p); }
-    DeviceArena static_arena_;
-    size_t gm_heap_region_off_{SIZE_MAX};
-    size_t gm_sm_region_off_{SIZE_MAX};
-    size_t runtime_arena_region_off_{SIZE_MAX};
-    // Cached sizes for setup_static_arena's "fits" check — avoids calling
-    // region_size() on the arena's public API for the regions we own.
+    DeviceArena gm_heap_arena_;
+    DeviceArena gm_sm_arena_;
+    DeviceArena runtime_arena_pool_;
+    // Cached sizes for setup_static_arena's "fits" check — avoids re-allocating
+    // a buffer when a later worker init asks for an equal-or-smaller layout.
     size_t cached_gm_heap_size_{0};
     size_t cached_gm_sm_size_{0};
     size_t cached_runtime_arena_size_{0};
diff --git a/src/a5/platform/onboard/host/device_runner.cpp b/src/a5/platform/onboard/host/device_runner.cpp
index 377e0b8eb..4a26e1056 100644
--- a/src/a5/platform/onboard/host/device_runner.cpp
+++ b/src/a5/platform/onboard/host/device_runner.cpp
@@ -196,60 +196,67 @@ static int prof_free_cb(void *dev_ptr) { return rtFree(dev_ptr); }
 DeviceRunner::~DeviceRunner() { finalize(); }
 
 int DeviceRunner::setup_static_arena(size_t gm_heap_size, size_t gm_sm_size, size_t runtime_arena_size) {
-    if (static_arena_.is_committed()) {
-        // Idempotent for the production case (sizes do not change across a
-        // worker's lifetime). If a caller asks for a larger layout, redo it.
-        if (gm_heap_size <= cached_gm_heap_size_ && gm_sm_size <= cached_gm_sm_size_ &&
-            runtime_arena_size <= cached_runtime_arena_size_) {
+    // Three independent device_malloc'd buffers: GM heap, PTO2 SM, prebuilt
+    // runtime arena. Split out from a single large allocation because the
+    // combined size can exceed the device allocator's largest contiguous
+    // block. Each arena commits exactly one region, so its base() is the
+    // pooled pointer the caller wants.
+    //
+    // Idempotent for the production case (sizes do not change across a
+    // worker's lifetime). If a caller asks for a larger layout on any
+    // region, redo just that region.
+    auto commit_region = [](DeviceArena &arena, size_t &cached_size, size_t requested_size) -> int {
+        if (requested_size == 0) {
+            if (arena.is_committed() && cached_size != 0) {
+                arena.release();
+                cached_size = 0;
+            }
             return 0;
         }
-        static_arena_.release();
-        gm_heap_region_off_ = SIZE_MAX;
-        gm_sm_region_off_ = SIZE_MAX;
-        runtime_arena_region_off_ = SIZE_MAX;
+        if (arena.is_committed() && requested_size <= cached_size) {
+            return 0;
+        }
+        arena.release();
+        cached_size = 0;
+        arena.reserve(requested_size, DeviceArena::kDefaultBaseAlign);
+        if (arena.commit(DeviceArena::kDefaultBaseAlign) == nullptr) {
+            arena.release();
+            return -1;
+        }
+        cached_size = requested_size;
+        return 0;
+    };
+    if (commit_region(gm_heap_arena_, cached_gm_heap_size_, gm_heap_size) != 0) return -1;
+    if (commit_region(gm_sm_arena_, cached_gm_sm_size_, gm_sm_size) != 0) {
+        gm_heap_arena_.release();
+        cached_gm_heap_size_ = 0;
+        return -1;
+    }
+    if (commit_region(runtime_arena_pool_, cached_runtime_arena_size_, runtime_arena_size) != 0) {
+        gm_heap_arena_.release();
+        gm_sm_arena_.release();
         cached_gm_heap_size_ = 0;
         cached_gm_sm_size_ = 0;
-        cached_runtime_arena_size_ = 0;
-    }
-    gm_heap_region_off_ = static_arena_.reserve(gm_heap_size, DeviceArena::kDefaultBaseAlign);
-    gm_sm_region_off_ = static_arena_.reserve(gm_sm_size, DeviceArena::kDefaultBaseAlign);
-    if (runtime_arena_size > 0) {
-        runtime_arena_region_off_ = static_arena_.reserve(runtime_arena_size, DeviceArena::kDefaultBaseAlign);
-    }
-    if (static_arena_.commit(DeviceArena::kDefaultBaseAlign) == nullptr) {
-        // Roll back the reserves: commit() failure leaves committed_=false,
-        // so the next entry would skip the release branch and stack new
-        // reserves on top of the stale cursor. release() is idempotent on a
-        // never-committed arena (just zeroes cursor_ / region_count_).
-        static_arena_.release();
-        gm_heap_region_off_ = SIZE_MAX;
-        gm_sm_region_off_ = SIZE_MAX;
-        runtime_arena_region_off_ = SIZE_MAX;
         return -1;
     }
-    cached_gm_heap_size_ = gm_heap_size;
-    cached_gm_sm_size_ = gm_sm_size;
-    cached_runtime_arena_size_ = runtime_arena_size;
     return 0;
 }
 
 void *DeviceRunner::acquire_pooled_gm_heap() {
-    if (!static_arena_.is_committed()) return nullptr;
-    return static_arena_.region_ptr(gm_heap_region_off_);
+    if (!gm_heap_arena_.is_committed()) return nullptr;
+    return gm_heap_arena_.base();
 }
 
 void *DeviceRunner::acquire_pooled_gm_sm() {
-    if (!static_arena_.is_committed()) return nullptr;
-    return static_arena_.region_ptr(gm_sm_region_off_);
+    if (!gm_sm_arena_.is_committed()) return nullptr;
+    return gm_sm_arena_.base();
 }
 
 void *DeviceRunner::acquire_pooled_runtime_arena() {
-    if (!static_arena_.is_committed()) return nullptr;
-    // hbg calls setup_static_arena(...,0) and never reserves a runtime-arena
-    // region — fail loudly if a caller asks for it anyway, rather than
-    // returning region_ptr(SIZE_MAX) (base + SIZE_MAX is undefined).
-    if (runtime_arena_region_off_ == SIZE_MAX) return nullptr;
-    return static_arena_.region_ptr(runtime_arena_region_off_);
+    // hbg calls setup_static_arena(...,0) and leaves runtime_arena_pool_
+    // uncommitted — return nullptr if a caller asks for it anyway.
+    if (!runtime_arena_pool_.is_committed()) return nullptr;
+    return runtime_arena_pool_.base();
 }
 
 std::thread DeviceRunner::create_thread(std::function<void()> fn) {
@@ -1060,13 +1067,13 @@ int DeviceRunner::finalize() {
         pmu_collector_.finalize(/*unregister_cb=*/nullptr, prof_free_cb);
     }
 
-    // Release per-Worker static arena (GM heap + PTO2 SM in a single backing
-    // device allocation). Must precede mem_alloc_.finalize() so the arena
-    // frees through the still-live allocator, not after it.
-    static_arena_.release();
-    gm_heap_region_off_ = SIZE_MAX;
-    gm_sm_region_off_ = SIZE_MAX;
-    runtime_arena_region_off_ = SIZE_MAX;
+    // Release the three per-Worker pooled arenas (GM heap, PTO2 SM, optional
+    // trb prebuilt runtime arena — each its own device_malloc). Must precede
+    // mem_alloc_.finalize() so the arenas free through the still-live
+    // allocator, not after it.
+    gm_heap_arena_.release();
+    gm_sm_arena_.release();
+    runtime_arena_pool_.release();
     cached_gm_heap_size_ = 0;
     cached_gm_sm_size_ = 0;
     cached_runtime_arena_size_ = 0;
diff --git a/src/a5/platform/onboard/host/device_runner.h b/src/a5/platform/onboard/host/device_runner.h
index 9edad84fa..754514fe5 100644
--- a/src/a5/platform/onboard/host/device_runner.h
+++ b/src/a5/platform/onboard/host/device_runner.h
@@ -174,7 +174,9 @@ struct KernelArgsHelper {
 class DeviceRunner {
 public:
     DeviceRunner() :
-        static_arena_(&arena_alloc_trampoline, &arena_free_trampoline, &mem_alloc_) {}
+        gm_heap_arena_(&arena_alloc_trampoline, &arena_free_trampoline, &mem_alloc_),
+        gm_sm_arena_(&arena_alloc_trampoline, &arena_free_trampoline, &mem_alloc_),
+        runtime_arena_pool_(&arena_alloc_trampoline, &arena_free_trampoline, &mem_alloc_) {}
     ~DeviceRunner();
 
     /**
@@ -522,21 +524,26 @@ class DeviceRunner {
     // Memory management
     MemoryAllocator mem_alloc_;
 
-    // Per-Worker arena backing the PTO2 GM heap + PTO2 shared memory in a
-    // single device allocation. Released explicitly in finalize() before
-    // mem_alloc_.finalize() so it does not free pointers a second time.
+    // Three independent per-Worker arenas, each backing a single pooled
+    // region (PTO2 GM heap / PTO2 shared memory / trb prebuilt runtime
+    // arena). Split out from a single backing allocation because the
+    // combined size can exceed the device allocator's largest contiguous
+    // block. Released explicitly in finalize() before mem_alloc_.finalize()
+    // so the underlying buffers do not get freed twice.
+    //
+    // `runtime_arena_pool_` stays unreserved when setup_static_arena was
+    // invoked with runtime_arena_size == 0 (hbg path).
     //
     // Trampolines forward DeviceArena's alloc/free calls to mem_alloc_.
     static void *arena_alloc_trampoline(void *ctx, size_t size) {
         return static_cast<MemoryAllocator *>(ctx)->alloc(size);
     }
     static void arena_free_trampoline(void *ctx, void *p) { static_cast<MemoryAllocator *>(ctx)->free(p); }
-    DeviceArena static_arena_;
-    size_t gm_heap_region_off_{SIZE_MAX};
-    size_t gm_sm_region_off_{SIZE_MAX};
-    size_t runtime_arena_region_off_{SIZE_MAX};
-    // Cached sizes for setup_static_arena's "fits" check — avoids calling
-    // region_size() on the arena's public API for the two regions we own.
+    DeviceArena gm_heap_arena_;
+    DeviceArena gm_sm_arena_;
+    DeviceArena runtime_arena_pool_;
+    // Cached sizes for setup_static_arena's "fits" check — avoids re-allocating
+    // a buffer when a later worker init asks for an equal-or-smaller layout.
     size_t cached_gm_heap_size_{0};
     size_t cached_gm_sm_size_{0};
     size_t cached_runtime_arena_size_{0};
diff --git a/src/a5/platform/sim/host/device_runner.cpp b/src/a5/platform/sim/host/device_runner.cpp
index 8cbac796c..a20b9d44d 100644
--- a/src/a5/platform/sim/host/device_runner.cpp
+++ b/src/a5/platform/sim/host/device_runner.cpp
@@ -113,55 +113,61 @@ static int prof_free_cb(void *dev_ptr) {
 DeviceRunner::~DeviceRunner() { finalize(); }
 
 int DeviceRunner::setup_static_arena(size_t gm_heap_size, size_t gm_sm_size, size_t runtime_arena_size) {
-    if (static_arena_.is_committed()) {
-        if (gm_heap_size <= cached_gm_heap_size_ && gm_sm_size <= cached_gm_sm_size_ &&
-            runtime_arena_size <= cached_runtime_arena_size_) {
+    // Three independent device_malloc'd buffers: GM heap, PTO2 SM, prebuilt
+    // runtime arena. Split out from a single large allocation because the
+    // combined size can exceed the device allocator's largest contiguous
+    // block. Each arena commits exactly one region, so its base() is the
+    // pooled pointer the caller wants.
+    auto commit_region = [](DeviceArena &arena, size_t &cached_size, size_t requested_size) -> int {
+        if (requested_size == 0) {
+            if (arena.is_committed() && cached_size != 0) {
+                arena.release();
+                cached_size = 0;
+            }
+            return 0;
+        }
+        if (arena.is_committed() && requested_size <= cached_size) {
             return 0;
         }
-        static_arena_.release();
-        gm_heap_region_off_ = SIZE_MAX;
-        gm_sm_region_off_ = SIZE_MAX;
-        runtime_arena_region_off_ = SIZE_MAX;
+        arena.release();
+        cached_size = 0;
+        arena.reserve(requested_size, DeviceArena::kDefaultBaseAlign);
+        if (arena.commit(DeviceArena::kDefaultBaseAlign) == nullptr) {
+            arena.release();
+            return -1;
+        }
+        cached_size = requested_size;
+        return 0;
+    };
+    if (commit_region(gm_heap_arena_, cached_gm_heap_size_, gm_heap_size) != 0) return -1;
+    if (commit_region(gm_sm_arena_, cached_gm_sm_size_, gm_sm_size) != 0) {
+        gm_heap_arena_.release();
+        cached_gm_heap_size_ = 0;
+        return -1;
+    }
+    if (commit_region(runtime_arena_pool_, cached_runtime_arena_size_, runtime_arena_size) != 0) {
+        gm_heap_arena_.release();
+        gm_sm_arena_.release();
         cached_gm_heap_size_ = 0;
         cached_gm_sm_size_ = 0;
-        cached_runtime_arena_size_ = 0;
-    }
-    gm_heap_region_off_ = static_arena_.reserve(gm_heap_size, DeviceArena::kDefaultBaseAlign);
-    gm_sm_region_off_ = static_arena_.reserve(gm_sm_size, DeviceArena::kDefaultBaseAlign);
-    if (runtime_arena_size > 0) {
-        runtime_arena_region_off_ = static_arena_.reserve(runtime_arena_size, DeviceArena::kDefaultBaseAlign);
-    }
-    if (static_arena_.commit(DeviceArena::kDefaultBaseAlign) == nullptr) {
-        // Roll back the reserves: commit() failure leaves committed_=false,
-        // so the next entry would skip the release branch and stack new
-        // reserves on top of the stale cursor. release() is idempotent on a
-        // never-committed arena (just zeroes cursor_ / region_count_).
-        static_arena_.release();
-        gm_heap_region_off_ = SIZE_MAX;
-        gm_sm_region_off_ = SIZE_MAX;
-        runtime_arena_region_off_ = SIZE_MAX;
         return -1;
     }
-    cached_gm_heap_size_ = gm_heap_size;
-    cached_gm_sm_size_ = gm_sm_size;
-    cached_runtime_arena_size_ = runtime_arena_size;
     return 0;
 }
 
 void *DeviceRunner::acquire_pooled_gm_heap() {
-    if (!static_arena_.is_committed()) return nullptr;
-    return static_arena_.region_ptr(gm_heap_region_off_);
+    if (!gm_heap_arena_.is_committed()) return nullptr;
+    return gm_heap_arena_.base();
 }
 
 void *DeviceRunner::acquire_pooled_gm_sm() {
-    if (!static_arena_.is_committed()) return nullptr;
-    return static_arena_.region_ptr(gm_sm_region_off_);
+    if (!gm_sm_arena_.is_committed()) return nullptr;
+    return gm_sm_arena_.base();
 }
 
 void *DeviceRunner::acquire_pooled_runtime_arena() {
-    if (!static_arena_.is_committed()) return nullptr;
-    if (runtime_arena_region_off_ == SIZE_MAX) return nullptr;
-    return static_arena_.region_ptr(runtime_arena_region_off_);
+    if (!runtime_arena_pool_.is_committed()) return nullptr;
+    return runtime_arena_pool_.base();
 }
 
 std::thread DeviceRunner::create_thread(std::function<void()> fn) {
@@ -945,13 +951,13 @@ int DeviceRunner::finalize() {
     // Close executor .so files (typically already closed by run(), this is a safety net)
     unload_executor_binaries();
 
-    // Release per-Worker static arena (GM heap + PTO2 SM in a single backing
-    // device allocation). Must precede mem_alloc_.finalize() so the arena
-    // frees through the still-live allocator, not after it.
-    static_arena_.release();
-    gm_heap_region_off_ = SIZE_MAX;
-    gm_sm_region_off_ = SIZE_MAX;
-    runtime_arena_region_off_ = SIZE_MAX;
+    // Release the three per-Worker pooled arenas (GM heap, PTO2 SM, optional
+    // trb prebuilt runtime arena — each its own device_malloc). Must precede
+    // mem_alloc_.finalize() so the arenas free through the still-live
+    // allocator, not after it.
+    gm_heap_arena_.release();
+    gm_sm_arena_.release();
+    runtime_arena_pool_.release();
     cached_gm_heap_size_ = 0;
     cached_gm_sm_size_ = 0;
     cached_runtime_arena_size_ = 0;
diff --git a/src/a5/platform/sim/host/device_runner.h b/src/a5/platform/sim/host/device_runner.h
index f4fe44121..468fd6e44 100644
--- a/src/a5/platform/sim/host/device_runner.h
+++ b/src/a5/platform/sim/host/device_runner.h
@@ -72,7 +72,9 @@
 class DeviceRunner {
 public:
     DeviceRunner() :
-        static_arena_(&arena_alloc_trampoline, &arena_free_trampoline, &mem_alloc_) {}
+        gm_heap_arena_(&arena_alloc_trampoline, &arena_free_trampoline, &mem_alloc_),
+        gm_sm_arena_(&arena_alloc_trampoline, &arena_free_trampoline, &mem_alloc_),
+        runtime_arena_pool_(&arena_alloc_trampoline, &arena_free_trampoline, &mem_alloc_) {}
     ~DeviceRunner();
 
     /**
@@ -286,21 +288,26 @@ class DeviceRunner {
     // Memory management
     MemoryAllocator mem_alloc_;
 
-    // Per-Worker arena backing the PTO2 GM heap + PTO2 shared memory in a
-    // single device allocation. Released explicitly in finalize() before
-    // mem_alloc_.finalize() so it does not free pointers a second time.
+    // Three independent per-Worker arenas, each backing a single pooled
+    // region (PTO2 GM heap / PTO2 shared memory / trb prebuilt runtime
+    // arena). Split out from a single backing allocation because the
+    // combined size can exceed the device allocator's largest contiguous
+    // block. Released explicitly in finalize() before mem_alloc_.finalize()
+    // so the underlying buffers do not get freed twice.
+    //
+    // `runtime_arena_pool_` stays unreserved when setup_static_arena was
+    // invoked with runtime_arena_size == 0 (hbg path).
     //
     // Trampolines forward DeviceArena's alloc/free to mem_alloc_.
     static void *arena_alloc_trampoline(void *ctx, size_t size) {
         return static_cast<MemoryAllocator *>(ctx)->alloc(size);
     }
     static void arena_free_trampoline(void *ctx, void *p) { static_cast<MemoryAllocator *>(ctx)->free(p); }
-    DeviceArena static_arena_;
-    size_t gm_heap_region_off_{SIZE_MAX};
-    size_t gm_sm_region_off_{SIZE_MAX};
-    size_t runtime_arena_region_off_{SIZE_MAX};
-    // Cached sizes for setup_static_arena's "fits" check — avoids calling
-    // region_size() on the arena's public API for the two regions we own.
+    DeviceArena gm_heap_arena_;
+    DeviceArena gm_sm_arena_;
+    DeviceArena runtime_arena_pool_;
+    // Cached sizes for setup_static_arena's "fits" check — avoids re-allocating
+    // a buffer when a later worker init asks for an equal-or-smaller layout.
     size_t cached_gm_heap_size_{0};
     size_t cached_gm_sm_size_{0};
     size_t cached_runtime_arena_size_{0};

From bd5bf35a6c837223702f82c91e7470afdb0df907 Mon Sep 17 00:00:00 2001
From: poursoul <poursoul@126.com>
Date: Wed, 27 May 2026 16:27:03 +0800
Subject: [PATCH 6/7] Refactor: post-review cleanups for trb host-build arena
 PR
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Address review feedback covering doc / comment consistency and a small set
of behavioral symmetry items between a2a3 and a5 trb runtimes:

- pto_orchestrator.h: drop the stale ",orch" from the wire_arena_pointers
  comment on a2a3 (PTO2TensorMap::orch was removed in 75f2562c but a2a3
  kept the comment lagging behind the a5 mirror).
- runtime.h / device_runner.h (both arches): refresh the
  setup_static_arena / acquire_pooled_* docblocks. Drop the orphan
  pre-split prose ("backs both the PTO2 GM heap and the PTO2 shared
  memory in a single underlying allocation") and the "doing so returns
  an unreserved-offset region_ptr (undefined)" wording that no longer
  matches the three-independent-arenas split — acquire_pooled_runtime_arena
  now returns a well-defined nullptr on the hbg path.
- a5 device_runner.{h,cpp}: restore the rationale comments that the a5
  mirror lost when it copied a2a3's earlier shape — three separate
  device_malloc calls being friendlier than one big one, hbg's
  runtime_arena_size == 0 contract, commit() failure rollback
  invariants, idempotent peer-arena policy. Keeps the why-this-way
  notes symmetric with a2a3.
- a5 runtime.h: fix the RUNTIME_MAX_ORCH_SO_SIZE comment that claimed
  "1MB" while the macro expands to 4MB.
- a5 pto_orchestrator.cpp: drop the defensive guard in runtime_emit that
  skipped the dependency when prod_state->task was null or its task_id
  did not match producer_task_id. PTO2TensorMap lookup chain truncation
  already guarantees producer_task_id >= last_task_alive, and producers
  reach the tensormap only after prepare_task has bound the slot.
  Matches the a2a3 shape that relies on the same invariants.
- a5 cpput: migrate the three stale UTs (test_ready_queue,
  test_spsc_queue, test_tensormap) to the new 4-phase reserve_layout /
  init_data_from_layout / wire_arena_pointers API. Wire them and the
  previously-orphaned a5 trb UTs into CMakeLists.txt behind a new
  a5_rt_objs OBJECT library + add_a5_runtime_test helper (mirrors
  a2a3_rt_objs). Target names carry the test_a5_ prefix to avoid
  clashing with hierarchical / a2a3 unprefixed test names.

Tests
- cpput: 35/35 pass (25 a2a3 + 10 newly enabled a5 trb).
- a5sim: full sim suite passes.
- a2a3sim: full sim suite passes (regression).
---
 .../platform/onboard/host/device_runner.h     | 25 +++++----
 src/a2a3/platform/sim/host/device_runner.h    | 20 +++----
 .../runtime/pto_orchestrator.h                |  2 +-
 .../runtime/runtime.h                         | 32 +++++------
 .../platform/onboard/host/device_runner.cpp   |  9 +++-
 src/a5/platform/onboard/host/device_runner.h  | 28 +++++-----
 src/a5/platform/sim/host/device_runner.cpp    | 11 ++++
 src/a5/platform/sim/host/device_runner.h      | 22 ++++----
 .../runtime/pto_orchestrator.cpp              |  3 --
 .../runtime/runtime.h                         | 29 +++++-----
 tests/ut/cpp/CMakeLists.txt                   | 54 +++++++++++++++++++
 tests/ut/cpp/a5/test_ready_queue.cpp          | 42 ++++++++++++---
 tests/ut/cpp/a5/test_spsc_queue.cpp           | 32 ++++++++---
 tests/ut/cpp/a5/test_tensormap.cpp            | 23 ++++++--
 14 files changed, 228 insertions(+), 104 deletions(-)

diff --git a/src/a2a3/platform/onboard/host/device_runner.h b/src/a2a3/platform/onboard/host/device_runner.h
index 8f6e1b3f9..93501a916 100644
--- a/src/a2a3/platform/onboard/host/device_runner.h
+++ b/src/a2a3/platform/onboard/host/device_runner.h
@@ -191,27 +191,26 @@ class DeviceRunner {
     ~DeviceRunner();
 
     /**
-     * Lay out and commit the per-Worker static device arena that backs the
-     * PTO2 GM heap, the PTO2 shared memory, and the trb prebuilt runtime
-     * arena in a single underlying allocation. Must be called before any
-     * acquire_pooled_*. Idempotent on identical sizes. `runtime_arena_size`
-     * is 0 for the hbg path (no prebuilt runtime arena). Returns 0 on
-     * success, -1 on failure.
+     * Commit the three per-Worker pooled regions (PTO2 GM heap, PTO2 shared
+     * memory, trb prebuilt runtime arena) as three independent device
+     * allocations. Must be called before any acquire_pooled_*. Idempotent
+     * on identical sizes. `runtime_arena_size` is 0 for the hbg path (no
+     * prebuilt runtime arena) — the corresponding arena stays uncommitted.
+     * Returns 0 on success, -1 on failure.
      */
     int setup_static_arena(size_t gm_heap_size, size_t gm_sm_size, size_t runtime_arena_size);
 
     /**
      * Return the pooled GM heap / PTO2 SM / runtime arena pointer.
-     * setup_static_arena must have been called earlier in this Worker;
+     * setup_static_arena must have already committed the relevant region;
      * otherwise these return nullptr. All pointers are stable for the
-     * Worker's lifetime; the single underlying device buffer is released in
-     * `finalize()`.
+     * Worker's lifetime; the three underlying device buffers are released
+     * in `finalize()`.
      *
      * acquire_pooled_runtime_arena() is trb-only — the runtime arena region
-     * is only reserved when setup_static_arena was called with
-     * runtime_arena_size > 0. hbg's runtime_maker never calls this; doing so
-     * after setup_static_arena(...,0) returns an unreserved-offset region_ptr
-     * (undefined). Keep the call site discipline at the runtime_maker layer.
+     * is only committed when setup_static_arena was called with
+     * runtime_arena_size > 0. Calling it on the hbg path
+     * (setup_static_arena(...,0)) returns nullptr (well-defined).
      */
     void *acquire_pooled_gm_heap();
     void *acquire_pooled_gm_sm();
diff --git a/src/a2a3/platform/sim/host/device_runner.h b/src/a2a3/platform/sim/host/device_runner.h
index a98eec1b8..46ee45913 100644
--- a/src/a2a3/platform/sim/host/device_runner.h
+++ b/src/a2a3/platform/sim/host/device_runner.h
@@ -81,23 +81,23 @@ class DeviceRunner {
     ~DeviceRunner();
 
     /**
-     * Lay out and commit the per-Worker static device arena that backs the
-     * PTO2 GM heap, the PTO2 shared memory, and the trb prebuilt runtime
-     * arena in a single underlying allocation. Must be called before any
-     * acquire_pooled_*. `runtime_arena_size` is 0 for hbg. Idempotent on
-     * identical sizes. Returns 0 on success, -1 on failure.
+     * Commit the three per-Worker pooled regions (PTO2 GM heap, PTO2 shared
+     * memory, trb prebuilt runtime arena) as three independent device
+     * allocations. Must be called before any acquire_pooled_*.
+     * `runtime_arena_size` is 0 for the hbg path (leaves that arena
+     * uncommitted). Idempotent on identical sizes. Returns 0 on success,
+     * -1 on failure.
      */
     int setup_static_arena(size_t gm_heap_size, size_t gm_sm_size, size_t runtime_arena_size);
 
     /**
      * Return the pooled GM heap / PTO2 SM / runtime arena pointer.
-     * setup_static_arena must have been called earlier in this Worker.
+     * setup_static_arena must have already committed the relevant region.
      *
      * acquire_pooled_runtime_arena() is trb-only — the runtime arena region
-     * is only reserved when setup_static_arena was called with
-     * runtime_arena_size > 0. hbg's runtime_maker never calls this; doing so
-     * after setup_static_arena(...,0) returns an unreserved-offset region_ptr
-     * (undefined). Keep the call site discipline at the runtime_maker layer.
+     * is only committed when setup_static_arena was called with
+     * runtime_arena_size > 0. Calling it on the hbg path
+     * (setup_static_arena(...,0)) returns nullptr (well-defined).
      */
     void *acquire_pooled_gm_heap();
     void *acquire_pooled_gm_sm();
diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_orchestrator.h b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_orchestrator.h
index 6e67cb597..7dd47b19a 100644
--- a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_orchestrator.h
+++ b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_orchestrator.h
@@ -153,7 +153,7 @@ struct PTO2OrchestratorState {
 
     // Phase 3b: write the arena-internal pointer fields (scope_tasks,
     // scope_begins, rings[].fanin_pool.base, tensor_map.{buckets,entry_pool,
-    // free_entry_list,task_entry_heads,orch}, scheduler reference).
+    // free_entry_list,task_entry_heads}, scheduler reference).
     // Idempotent — host runs once on the image, AICPU runs once after attach.
     void wire_arena_pointers(const PTO2OrchestratorLayout &layout, DeviceArena &arena, PTO2SchedulerState *scheduler);
 
diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/runtime.h b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/runtime.h
index 117621ca2..8e1bb1567 100644
--- a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/runtime.h
+++ b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/runtime.h
@@ -119,28 +119,22 @@ struct HostApi {
     void (*device_free)(void *dev_ptr);
     int (*copy_to_device)(void *dev_ptr, const void *host_ptr, size_t size);
     int (*copy_from_device)(void *host_ptr, const void *dev_ptr, size_t size);
-    // Lay out and commit the per-Worker static device arena that backs both
-    // the PTO2 GM heap and the PTO2 shared memory in a single underlying
-    // allocation. Must be called once before acquire_pooled_gm_heap /
-    // acquire_pooled_gm_sm. Idempotent on identical sizes; returns 0 on
-    // success, -1 on allocation failure.
-    // Lay out three pooled regions in a single backing device allocation:
-    // GM heap, PTO2 shared memory, and the trb prebuilt runtime arena.
-    // `runtime_arena_size == 0` skips the last region (hbg path: hbg has no
-    // prebuilt runtime arena). Returns 0 on success, -1 on allocation
-    // failure.
+    // Commit the three per-Worker pooled regions (PTO2 GM heap, PTO2 shared
+    // memory, trb prebuilt runtime arena) as three independent device
+    // allocations. `runtime_arena_size == 0` skips the third region (hbg
+    // path: hbg has no prebuilt runtime arena). Idempotent on identical
+    // sizes; returns 0 on success, -1 on allocation failure.
     int (*setup_static_arena)(size_t gm_heap_size, size_t gm_sm_size, size_t runtime_arena_size);
     // Return the per-Worker pooled pointer for the PTO2 GM heap / shared
-    // memory / prebuilt runtime arena. The static arena must already be
-    // committed via setup_static_arena; the returned pointer is owned by
-    // the DeviceRunner and freed in `DeviceRunner::finalize()` — do NOT
-    // pass it to device_free or record it in `tensor_pairs_`.
+    // memory / prebuilt runtime arena. setup_static_arena must have already
+    // committed the relevant region; the returned pointer is owned by the
+    // DeviceRunner and freed in `DeviceRunner::finalize()` — do NOT pass it
+    // to device_free or record it in `tensor_pairs_`.
     //
-    // acquire_pooled_runtime_arena is trb-only — the host side reserves the
-    // runtime-arena region only when setup_static_arena is invoked with
-    // runtime_arena_size > 0. hbg's runtime_maker.cpp must not call it
-    // (setup_static_arena(...,0) leaves the offset unreserved, and the
-    // returned region_ptr would be undefined).
+    // acquire_pooled_runtime_arena is trb-only — the runtime-arena region is
+    // only committed when setup_static_arena was invoked with
+    // runtime_arena_size > 0. Calling it on the hbg path
+    // (setup_static_arena(...,0)) returns nullptr (not undefined).
     void *(*acquire_pooled_gm_heap)();
     void *(*acquire_pooled_gm_sm)();
     void *(*acquire_pooled_runtime_arena)();
diff --git a/src/a5/platform/onboard/host/device_runner.cpp b/src/a5/platform/onboard/host/device_runner.cpp
index 4a26e1056..b8dc9bb46 100644
--- a/src/a5/platform/onboard/host/device_runner.cpp
+++ b/src/a5/platform/onboard/host/device_runner.cpp
@@ -204,9 +204,13 @@ int DeviceRunner::setup_static_arena(size_t gm_heap_size, size_t gm_sm_size, siz
     //
     // Idempotent for the production case (sizes do not change across a
     // worker's lifetime). If a caller asks for a larger layout on any
-    // region, redo just that region.
+    // region, redo just that region — already-committed peers stay alive
+    // so their callers don't have to re-acquire.
     auto commit_region = [](DeviceArena &arena, size_t &cached_size, size_t requested_size) -> int {
         if (requested_size == 0) {
+            // hbg's runtime_arena path: caller passed 0 and never reserved
+            // a region. Leave the arena uncommitted; acquire_pooled_* will
+            // return nullptr.
             if (arena.is_committed() && cached_size != 0) {
                 arena.release();
                 cached_size = 0;
@@ -220,6 +224,9 @@ int DeviceRunner::setup_static_arena(size_t gm_heap_size, size_t gm_sm_size, siz
         cached_size = 0;
         arena.reserve(requested_size, DeviceArena::kDefaultBaseAlign);
         if (arena.commit(DeviceArena::kDefaultBaseAlign) == nullptr) {
+            // commit() failure leaves committed_=false, so the next entry's
+            // is_committed() guard skips the release branch. release() is
+            // idempotent on a never-committed arena (zeroes cursor_).
             arena.release();
             return -1;
         }
diff --git a/src/a5/platform/onboard/host/device_runner.h b/src/a5/platform/onboard/host/device_runner.h
index 754514fe5..0d8cc0397 100644
--- a/src/a5/platform/onboard/host/device_runner.h
+++ b/src/a5/platform/onboard/host/device_runner.h
@@ -180,27 +180,26 @@ class DeviceRunner {
     ~DeviceRunner();
 
     /**
-     * Lay out and commit the per-Worker static device arena that backs the
-     * PTO2 GM heap, the PTO2 shared memory, and the trb prebuilt runtime
-     * arena in a single underlying allocation. Must be called before any
-     * acquire_pooled_*. Idempotent on identical sizes. `runtime_arena_size`
-     * is 0 for the hbg path (no prebuilt runtime arena). Returns 0 on
-     * success, -1 on failure.
+     * Commit the three per-Worker pooled regions (PTO2 GM heap, PTO2 shared
+     * memory, trb prebuilt runtime arena) as three independent device
+     * allocations. Must be called before any acquire_pooled_*. Idempotent
+     * on identical sizes. `runtime_arena_size` is 0 for the hbg path (no
+     * prebuilt runtime arena) — the corresponding arena stays uncommitted.
+     * Returns 0 on success, -1 on failure.
      */
     int setup_static_arena(size_t gm_heap_size, size_t gm_sm_size, size_t runtime_arena_size);
 
     /**
      * Return the pooled GM heap / PTO2 SM / runtime arena pointer.
-     * setup_static_arena must have been called earlier in this Worker;
+     * setup_static_arena must have already committed the relevant region;
      * otherwise these return nullptr. All pointers are stable for the
-     * Worker's lifetime; the single underlying device buffer is released in
-     * `finalize()`.
+     * Worker's lifetime; the three underlying device buffers are released
+     * in `finalize()`.
      *
      * acquire_pooled_runtime_arena() is trb-only — the runtime arena region
-     * is only reserved when setup_static_arena was called with
-     * runtime_arena_size > 0. hbg's runtime_maker never calls this; doing so
-     * after setup_static_arena(...,0) returns an unreserved-offset region_ptr
-     * (undefined). Keep the call site discipline at the runtime_maker layer.
+     * is only committed when setup_static_arena was called with
+     * runtime_arena_size > 0. Calling it on the hbg path
+     * (setup_static_arena(...,0)) returns nullptr (well-defined).
      */
     void *acquire_pooled_gm_heap();
     void *acquire_pooled_gm_sm();
@@ -528,7 +527,8 @@ class DeviceRunner {
     // region (PTO2 GM heap / PTO2 shared memory / trb prebuilt runtime
     // arena). Split out from a single backing allocation because the
     // combined size can exceed the device allocator's largest contiguous
-    // block. Released explicitly in finalize() before mem_alloc_.finalize()
+    // block — three separate device_malloc calls are friendlier than one
+    // big one. Released explicitly in finalize() before mem_alloc_.finalize()
     // so the underlying buffers do not get freed twice.
     //
     // `runtime_arena_pool_` stays unreserved when setup_static_arena was
diff --git a/src/a5/platform/sim/host/device_runner.cpp b/src/a5/platform/sim/host/device_runner.cpp
index a20b9d44d..fe3e938e1 100644
--- a/src/a5/platform/sim/host/device_runner.cpp
+++ b/src/a5/platform/sim/host/device_runner.cpp
@@ -118,8 +118,16 @@ int DeviceRunner::setup_static_arena(size_t gm_heap_size, size_t gm_sm_size, siz
     // combined size can exceed the device allocator's largest contiguous
     // block. Each arena commits exactly one region, so its base() is the
     // pooled pointer the caller wants.
+    //
+    // Idempotent for the production case (sizes do not change across a
+    // worker's lifetime). If a caller asks for a larger layout on any
+    // region, redo just that region — already-committed peers stay alive
+    // so their callers don't have to re-acquire.
     auto commit_region = [](DeviceArena &arena, size_t &cached_size, size_t requested_size) -> int {
         if (requested_size == 0) {
+            // hbg's runtime_arena path: caller passed 0 and never reserved
+            // a region. Leave the arena uncommitted; acquire_pooled_* will
+            // return nullptr.
             if (arena.is_committed() && cached_size != 0) {
                 arena.release();
                 cached_size = 0;
@@ -133,6 +141,9 @@ int DeviceRunner::setup_static_arena(size_t gm_heap_size, size_t gm_sm_size, siz
         cached_size = 0;
         arena.reserve(requested_size, DeviceArena::kDefaultBaseAlign);
         if (arena.commit(DeviceArena::kDefaultBaseAlign) == nullptr) {
+            // commit() failure leaves committed_=false, so the next entry's
+            // is_committed() guard skips the release branch. release() is
+            // idempotent on a never-committed arena (zeroes cursor_).
             arena.release();
             return -1;
         }
diff --git a/src/a5/platform/sim/host/device_runner.h b/src/a5/platform/sim/host/device_runner.h
index 468fd6e44..59b685572 100644
--- a/src/a5/platform/sim/host/device_runner.h
+++ b/src/a5/platform/sim/host/device_runner.h
@@ -78,22 +78,23 @@ class DeviceRunner {
     ~DeviceRunner();
 
     /**
-     * Lay out and commit the per-Worker static device arena that backs the
-     * PTO2 GM heap, the PTO2 shared memory, and the trb prebuilt runtime
-     * arena in a single underlying allocation. Must be called before any
-     * acquire_pooled_*. Idempotent on identical sizes. `runtime_arena_size`
-     * is 0 for the hbg path (no prebuilt runtime arena). Returns 0 on
-     * success, -1 on failure.
+     * Commit the three per-Worker pooled regions (PTO2 GM heap, PTO2 shared
+     * memory, trb prebuilt runtime arena) as three independent device
+     * allocations. Must be called before any acquire_pooled_*. Idempotent
+     * on identical sizes. `runtime_arena_size` is 0 for the hbg path
+     * (leaves that arena uncommitted). Returns 0 on success, -1 on
+     * failure.
      */
     int setup_static_arena(size_t gm_heap_size, size_t gm_sm_size, size_t runtime_arena_size);
 
     /**
      * Return the pooled GM heap / PTO2 SM / runtime arena pointer.
-     * setup_static_arena must have been called earlier in this Worker;
+     * setup_static_arena must have already committed the relevant region;
      * otherwise these return nullptr.
      *
-     * acquire_pooled_runtime_arena() is trb-only — the region exists only
-     * when setup_static_arena was called with runtime_arena_size > 0.
+     * acquire_pooled_runtime_arena() is trb-only — the region is only
+     * committed when setup_static_arena was called with
+     * runtime_arena_size > 0. Calling it on the hbg path returns nullptr.
      */
     void *acquire_pooled_gm_heap();
     void *acquire_pooled_gm_sm();
@@ -292,7 +293,8 @@ class DeviceRunner {
     // region (PTO2 GM heap / PTO2 shared memory / trb prebuilt runtime
     // arena). Split out from a single backing allocation because the
     // combined size can exceed the device allocator's largest contiguous
-    // block. Released explicitly in finalize() before mem_alloc_.finalize()
+    // block — three separate device_malloc calls are friendlier than one
+    // big one. Released explicitly in finalize() before mem_alloc_.finalize()
     // so the underlying buffers do not get freed twice.
     //
     // `runtime_arena_pool_` stays unreserved when setup_static_arena was
diff --git a/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_orchestrator.cpp b/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_orchestrator.cpp
index 48368cf6a..c937fd986 100644
--- a/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_orchestrator.cpp
+++ b/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_orchestrator.cpp
@@ -507,9 +507,6 @@ static TaskOutputTensors submit_task_common(
     auto runtime_emit = [&](PTO2TaskId producer_task_id) -> bool {
         PTO2TaskSlotState *prod_state =
             &orch->sm_header->rings[producer_task_id.ring()].get_slot_state_by_task_id(producer_task_id.local());
-        if (prod_state->task == nullptr || prod_state->task->task_id != producer_task_id) {
-            return true;  // producer slot reused for a different task — dep is moot
-        }
         return append_fanin_or_fail(orch, prod_state, &fanin_builder, ring_id);
     };
 
diff --git a/src/a5/runtime/tensormap_and_ringbuffer/runtime/runtime.h b/src/a5/runtime/tensormap_and_ringbuffer/runtime/runtime.h
index 4a7dce1bd..4a690e8ca 100644
--- a/src/a5/runtime/tensormap_and_ringbuffer/runtime/runtime.h
+++ b/src/a5/runtime/tensormap_and_ringbuffer/runtime/runtime.h
@@ -48,7 +48,7 @@
 #define RUNTIME_MAX_ARGS 128
 #define RUNTIME_MAX_WORKER 108  // 36 AIC + 72 AIV cores
 #define RUNTIME_MAX_FUNC_ID 1024
-#define RUNTIME_MAX_ORCH_SO_SIZE (4 * 1024 * 1024)  // 1MB max for orchestration SO
+#define RUNTIME_MAX_ORCH_SO_SIZE (4 * 1024 * 1024)  // 4MB max for orchestration SO
 #define RUNTIME_MAX_ORCH_SYMBOL_NAME 64
 
 // Default ready queue shards: one shard per worker thread (total minus orchestrator)
@@ -127,23 +127,22 @@ struct HostApi {
     void (*device_free)(void *dev_ptr);
     int (*copy_to_device)(void *dev_ptr, const void *host_ptr, size_t size);
     int (*copy_from_device)(void *host_ptr, const void *dev_ptr, size_t size);
-    // Lay out three pooled regions in a single backing device allocation:
-    // GM heap, PTO2 shared memory, and the trb prebuilt runtime arena.
-    // `runtime_arena_size == 0` skips the last region (hbg path: hbg has no
-    // prebuilt runtime arena). Returns 0 on success, -1 on allocation
-    // failure.
+    // Commit the three per-Worker pooled regions (PTO2 GM heap, PTO2 shared
+    // memory, trb prebuilt runtime arena) as three independent device
+    // allocations. `runtime_arena_size == 0` skips the third region (hbg
+    // path: hbg has no prebuilt runtime arena). Idempotent on identical
+    // sizes; returns 0 on success, -1 on allocation failure.
     int (*setup_static_arena)(size_t gm_heap_size, size_t gm_sm_size, size_t runtime_arena_size);
     // Return the per-Worker pooled pointer for the PTO2 GM heap / shared
-    // memory / prebuilt runtime arena. The static arena must already be
-    // committed via setup_static_arena; the returned pointer is owned by
-    // the DeviceRunner and freed in `DeviceRunner::finalize()` — do NOT
-    // pass it to device_free or record it in `tensor_pairs_`.
+    // memory / prebuilt runtime arena. setup_static_arena must have already
+    // committed the relevant region; the returned pointer is owned by the
+    // DeviceRunner and freed in `DeviceRunner::finalize()` — do NOT pass it
+    // to device_free or record it in `tensor_pairs_`.
     //
-    // acquire_pooled_runtime_arena is trb-only — the host side reserves the
-    // runtime-arena region only when setup_static_arena is invoked with
-    // runtime_arena_size > 0. hbg's runtime_maker.cpp must not call it
-    // (setup_static_arena(...,0) leaves the offset unreserved, and the
-    // returned region_ptr would be undefined).
+    // acquire_pooled_runtime_arena is trb-only — the runtime-arena region is
+    // only committed when setup_static_arena was invoked with
+    // runtime_arena_size > 0. Calling it on the hbg path
+    // (setup_static_arena(...,0)) returns nullptr (not undefined).
     void *(*acquire_pooled_gm_heap)();
     void *(*acquire_pooled_gm_sm)();
     void *(*acquire_pooled_runtime_arena)();
diff --git a/tests/ut/cpp/CMakeLists.txt b/tests/ut/cpp/CMakeLists.txt
index 9922850d5..39cf5977b 100644
--- a/tests/ut/cpp/CMakeLists.txt
+++ b/tests/ut/cpp/CMakeLists.txt
@@ -194,6 +194,45 @@ function(add_a5_test name src)
     add_test(NAME ${name} COMMAND ${name})
 endfunction()
 
+# ---------------------------------------------------------------------------
+# A5 runtime sources, mirroring a2a3_rt_objs. Bundled into an OBJECT library
+# so the runtime .cpp files compile once and the resulting .o files are
+# reused across every a5 runtime test executable.
+# ---------------------------------------------------------------------------
+set(A5_RUNTIME_DIR ${CMAKE_SOURCE_DIR}/../../../src/a5/runtime/tensormap_and_ringbuffer/runtime)
+
+add_library(a5_rt_objs OBJECT
+    ${A5_RUNTIME_DIR}/pto_ring_buffer.cpp
+    ${A5_RUNTIME_DIR}/shared/pto_shared_memory.cpp
+    ${A5_RUNTIME_DIR}/scheduler/pto_scheduler.cpp
+    ${A5_RUNTIME_DIR}/shared/pto_tensormap.cpp
+    ${A5_RUNTIME_DIR}/shared/pto_runtime2_init.cpp
+    ${CMAKE_SOURCE_DIR}/stubs/test_stubs.cpp
+)
+target_include_directories(a5_rt_objs PUBLIC
+    ${CMAKE_SOURCE_DIR}/../../../src/a5/runtime/tensormap_and_ringbuffer/orchestration
+    ${CMAKE_SOURCE_DIR}/../../../src/a5/runtime/tensormap_and_ringbuffer/runtime
+    ${CMAKE_SOURCE_DIR}/../../../src/a5/runtime/tensormap_and_ringbuffer/common
+    ${CMAKE_SOURCE_DIR}/../../../src/a5/platform/include
+    ${CMAKE_SOURCE_DIR}/../../../src/common/task_interface
+    ${CMAKE_SOURCE_DIR}/../../../src/common/log/include
+    ${CMAKE_SOURCE_DIR}/../../../src/common/device_comm
+)
+target_compile_options(a5_rt_objs PUBLIC -D_GLIBCXX_USE_CXX11_ABI=0)
+
+function(add_a5_runtime_test name src)
+    add_executable(${name} ${src})
+    target_include_directories(${name} PRIVATE ${GTEST_INCLUDE_DIRS})
+    target_link_libraries(${name} PRIVATE
+        a5_rt_objs
+        ${GTEST_MAIN_LIB}
+        ${GTEST_LIB}
+        pthread
+    )
+    add_test(NAME ${name} COMMAND ${name})
+    set_tests_properties(${name} PROPERTIES LABELS "no_hardware")
+endfunction()
+
 function(add_task_interface_test name src)
     add_executable(${name} ${src})
     target_include_directories(${name} PRIVATE
@@ -314,6 +353,21 @@ add_a2a3_runtime_test(test_wiring           a2a3/test_wiring.cpp)
 # ---------------------------------------------------------------------------
 add_a5_test(test_a5_fatal a5/test_a5_fatal.cpp)
 
+# A5 trb runtime UTs — mirror of a2a3 trb runtime UTs, link against a5_rt_objs.
+# Target names carry the a5_ prefix because hierarchical/test_tensormap (and
+# the unprefixed a2a3 runtime targets test_scheduler_state / test_ready_queue
+# / ...) already own those bare names.
+add_a5_runtime_test(test_a5_task_allocator   a5/test_task_allocator.cpp)
+add_a5_runtime_test(test_a5_dep_list_pool    a5/test_dep_list_pool.cpp)
+add_a5_runtime_test(test_a5_scheduler_state  a5/test_scheduler_state.cpp)
+add_a5_runtime_test(test_a5_task_state       a5/test_task_state.cpp)
+add_a5_runtime_test(test_a5_ready_queue      a5/test_ready_queue.cpp)
+add_a5_runtime_test(test_a5_shared_memory    a5/test_shared_memory.cpp)
+add_a5_runtime_test(test_a5_tensormap        a5/test_tensormap.cpp)
+add_a5_runtime_test(test_a5_fanin_pool       a5/test_fanin_pool.cpp)
+add_a5_runtime_test(test_a5_spsc_queue       a5/test_spsc_queue.cpp)
+add_a5_runtime_test(test_a5_wiring           a5/test_wiring.cpp)
+
 # Host logger silent/off behavior — no runtime deps, just compile host_log.cpp
 # alongside the test (faster than dlopen'ing libsimpler_log.so for a unit test).
 set(SIMPLER_LOG_DIR ${CMAKE_SOURCE_DIR}/../../../src/common/log)
diff --git a/tests/ut/cpp/a5/test_ready_queue.cpp b/tests/ut/cpp/a5/test_ready_queue.cpp
index 9dea3ae94..f12b1e7c7 100644
--- a/tests/ut/cpp/a5/test_ready_queue.cpp
+++ b/tests/ut/cpp/a5/test_ready_queue.cpp
@@ -44,6 +44,7 @@
 #include <thread>
 #include <vector>
 
+#include "device_arena.h"
 #include "scheduler/pto_scheduler.h"
 
 // =============================================================================
@@ -55,10 +56,19 @@ class ReadyQueueTest : public ::testing::Test {
     static constexpr uint64_t CAPACITY = 16;  // Power of 2
 
     PTO2ReadyQueue queue;
+    DeviceArena arena;
 
-    void SetUp() override { ASSERT_TRUE(ready_queue_init(&queue, CAPACITY)); }
+    void SetUp() override {
+        const size_t off = ready_queue_reserve_layout(arena, CAPACITY);
+        ASSERT_NE(arena.commit(), nullptr);
+        ASSERT_TRUE(ready_queue_init_data_from_layout(&queue, arena, off, CAPACITY));
+        ready_queue_wire_arena_pointers(&queue, arena, off);
+    }
 
-    void TearDown() override { ready_queue_destroy(&queue); }
+    void TearDown() override {
+        ready_queue_destroy(&queue);
+        arena.release();
+    }
 };
 
 // =============================================================================
@@ -217,8 +227,18 @@ class ReadyQueueBoundaryTest : public ::testing::Test {
     PTO2ReadyQueue queue{};
     PTO2TaskSlotState dummy[8]{};
 
-    void SetUp() override { ASSERT_TRUE(ready_queue_init(&queue, QUEUE_CAP)); }
-    void TearDown() override { ready_queue_destroy(&queue); }
+    DeviceArena arena;
+
+    void SetUp() override {
+        const size_t off = ready_queue_reserve_layout(arena, QUEUE_CAP);
+        ASSERT_NE(arena.commit(), nullptr);
+        ASSERT_TRUE(ready_queue_init_data_from_layout(&queue, arena, off, QUEUE_CAP));
+        ready_queue_wire_arena_pointers(&queue, arena, off);
+    }
+    void TearDown() override {
+        ready_queue_destroy(&queue);
+        arena.release();
+    }
 };
 
 TEST_F(ReadyQueueBoundaryTest, ExactCapacityFillDrain) {
@@ -307,8 +327,18 @@ class ReadyQueueMPMCTest : public ::testing::TestWithParam<MPMCConfig> {
     static constexpr uint64_t CAPACITY = 1024;
     PTO2ReadyQueue queue;
 
-    void SetUp() override { ASSERT_TRUE(ready_queue_init(&queue, CAPACITY)); }
-    void TearDown() override { ready_queue_destroy(&queue); }
+    DeviceArena arena;
+
+    void SetUp() override {
+        const size_t off = ready_queue_reserve_layout(arena, CAPACITY);
+        ASSERT_NE(arena.commit(), nullptr);
+        ASSERT_TRUE(ready_queue_init_data_from_layout(&queue, arena, off, CAPACITY));
+        ready_queue_wire_arena_pointers(&queue, arena, off);
+    }
+    void TearDown() override {
+        ready_queue_destroy(&queue);
+        arena.release();
+    }
 };
 
 TEST_P(ReadyQueueMPMCTest, NoDuplicateNoLoss) {
diff --git a/tests/ut/cpp/a5/test_spsc_queue.cpp b/tests/ut/cpp/a5/test_spsc_queue.cpp
index a2c80ca05..5dce3ba4a 100644
--- a/tests/ut/cpp/a5/test_spsc_queue.cpp
+++ b/tests/ut/cpp/a5/test_spsc_queue.cpp
@@ -27,6 +27,7 @@
 #include <thread>
 #include <vector>
 
+#include "device_arena.h"
 #include "scheduler/pto_scheduler.h"
 
 // =============================================================================
@@ -38,15 +39,22 @@ class SpscQueueTest : public ::testing::Test {
     static constexpr uint64_t CAPACITY = 16;  // must be power of 2
 
     PTO2SpscQueue queue{};
+    DeviceArena arena;
     // Dummy slot states used as push values
     alignas(64) PTO2TaskSlotState slots[64]{};
 
     void SetUp() override {
         memset(&queue, 0, sizeof(queue));
-        ASSERT_TRUE(queue.init(CAPACITY));
+        const size_t off = PTO2SpscQueue::reserve_layout(arena, CAPACITY);
+        ASSERT_NE(arena.commit(), nullptr);
+        ASSERT_TRUE(queue.init_data_from_layout(arena, off, CAPACITY));
+        queue.wire_arena_pointers(arena, off);
     }
 
-    void TearDown() override { queue.destroy(); }
+    void TearDown() override {
+        queue.destroy();
+        arena.release();
+    }
 };
 
 // =============================================================================
@@ -60,17 +68,27 @@ TEST_F(SpscQueueTest, InitValidState) {
 }
 
 TEST_F(SpscQueueTest, InitRejectsNonPowerOfTwo) {
+    // init_data_from_layout rejects non-power-of-two capacities. Use a fresh
+    // local arena (the fixture's is already committed) since reserve runs before commit.
     PTO2SpscQueue bad{};
-    EXPECT_FALSE(bad.init(3));
-    EXPECT_FALSE(bad.init(7));
-    EXPECT_FALSE(bad.init(0));
+    DeviceArena local;
+    const size_t off = PTO2SpscQueue::reserve_layout(local, 1);  // dummy reservation so commit succeeds
+    (void)off;
+    ASSERT_NE(local.commit(), nullptr);
+    EXPECT_FALSE(bad.init_data_from_layout(local, off, 3));
+    EXPECT_FALSE(bad.init_data_from_layout(local, off, 7));
+    EXPECT_FALSE(bad.init_data_from_layout(local, off, 0));
 }
 
 TEST_F(SpscQueueTest, InitAcceptsPowerOfTwo) {
     PTO2SpscQueue q{};
-    EXPECT_TRUE(q.init(4));
+    DeviceArena local;
+    const size_t off4 = PTO2SpscQueue::reserve_layout(local, 4);
+    const size_t off1024 = PTO2SpscQueue::reserve_layout(local, 1024);
+    ASSERT_NE(local.commit(), nullptr);
+    EXPECT_TRUE(q.init_data_from_layout(local, off4, 4));
     q.destroy();
-    EXPECT_TRUE(q.init(1024));
+    EXPECT_TRUE(q.init_data_from_layout(local, off1024, 1024));
     q.destroy();
 }
 
diff --git a/tests/ut/cpp/a5/test_tensormap.cpp b/tests/ut/cpp/a5/test_tensormap.cpp
index 10eef0317..805a9e079 100644
--- a/tests/ut/cpp/a5/test_tensormap.cpp
+++ b/tests/ut/cpp/a5/test_tensormap.cpp
@@ -28,6 +28,7 @@
 #include <set>
 #include <vector>
 
+#include "device_arena.h"
 #include "pto_orchestration_api.h"
 #include "pto_tensormap.h"
 
@@ -76,13 +77,20 @@ class TensorMapTest : public ::testing::Test {
     static constexpr int32_t WINDOW_SIZE = 32;
 
     PTO2TensorMap tmap{};
+    DeviceArena arena;
 
     void SetUp() override {
         int32_t window_sizes[PTO2_MAX_RING_DEPTH] = {WINDOW_SIZE, WINDOW_SIZE, WINDOW_SIZE, WINDOW_SIZE};
-        ASSERT_TRUE(tmap.init(NUM_BUCKETS, POOL_SIZE, window_sizes));
+        auto layout = PTO2TensorMap::reserve_layout(arena, NUM_BUCKETS, POOL_SIZE, window_sizes);
+        ASSERT_NE(arena.commit(), nullptr);
+        ASSERT_TRUE(tmap.init_data_from_layout(layout, arena));
+        tmap.wire_arena_pointers(layout, arena);
     }
 
-    void TearDown() override { tmap.destroy(); }
+    void TearDown() override {
+        tmap.destroy();
+        arena.release();
+    }
 };
 
 // =============================================================================
@@ -98,11 +106,16 @@ TEST_F(TensorMapTest, InitValidState) {
 }
 
 TEST_F(TensorMapTest, InitRequiresPowerOfTwoBuckets) {
+    // Non-power-of-2 bucket counts trip an always_assert inside reserve_layout
+    // (asserting EXPECT_DEATH is impossible in release builds where
+    // always_assert may compile out). Smoke-test only the success path here.
     PTO2TensorMap bad{};
+    DeviceArena bad_arena;
     int32_t ws[PTO2_MAX_RING_DEPTH] = {8, 8, 8, 8};
-    EXPECT_FALSE(bad.init(3, 64, ws)) << "non-power-of-2 bucket count must fail";
-    EXPECT_FALSE(bad.init(7, 64, ws));
-    EXPECT_TRUE(bad.init(8, 64, ws));
+    auto layout = PTO2TensorMap::reserve_layout(bad_arena, 8, 64, ws);
+    ASSERT_NE(bad_arena.commit(), nullptr);
+    EXPECT_TRUE(bad.init_data_from_layout(layout, bad_arena));
+    bad.wire_arena_pointers(layout, bad_arena);
     bad.destroy();
 }
 

From 7a1036a081d352caadc1ecaac23bf9c27a37604e Mon Sep 17 00:00:00 2001
From: poursoul <poursoul@126.com>
Date: Wed, 27 May 2026 17:27:55 +0800
Subject: [PATCH 7/7] Fix: address CodeRabbit review feedback on trb host-build
 arena PR

- setup_static_arena (a2a3 onboard + a5 sim mirrors): drop the late-region
  failure paths that released already-committed peer arenas. Callers may
  hold pooled pointers from earlier successful regions; tearing the peers
  down on a later resize failure turns those pointers into dangling refs,
  contradicting the lambda's "already-committed peers stay alive" invariant.

- DeviceRunner::finalize (a2a3 sim, a5 onboard): move the lazily-allocated
  device_wall_dev_ptr_ free above mem_alloc_.finalize() (and above
  rtDeviceReset on a5). free_tensor() routes through mem_alloc_.free(),
  so freeing after finalize was a use-after-finalize on the allocator
  state; on a5 it would also run after the device runtime had been reset.

- bind_prepared_to_runtime_impl (a2a3 + a5 runtime_maker): reject env-derived
  PTO2_RING_DEP_POOL values above INT32_MAX before narrowing to int32_t,
  rather than silently truncating into a corrupt layout sizing.

- test_a5_tensormap: rename InitRequiresPowerOfTwoBuckets to
  InitWithPowerOfTwoBucketsSucceeds and reword the comment. The earlier
  name was misleading because the body only exercises the success path
  (bucket count 8); the reject path is gated by always_assert and can't
  be reliably EXPECT_DEATH-tested in release builds.

Tests
- cpput: 35/35 pass (including renamed a5 tensormap test).
---
 .../platform/onboard/host/device_runner.cpp   | 17 ++++----------
 src/a2a3/platform/sim/host/device_runner.cpp  | 13 +++++++----
 .../host/runtime_maker.cpp                    | 12 ++++++++--
 .../platform/onboard/host/device_runner.cpp   | 14 +++++++----
 src/a5/platform/sim/host/device_runner.cpp    | 17 ++++----------
 .../host/runtime_maker.cpp                    | 12 ++++++++--
 tests/ut/cpp/a5/test_tensormap.cpp            | 23 ++++++++++---------
 7 files changed, 59 insertions(+), 49 deletions(-)

diff --git a/src/a2a3/platform/onboard/host/device_runner.cpp b/src/a2a3/platform/onboard/host/device_runner.cpp
index 9b66e05ae..e3e1cfc2d 100644
--- a/src/a2a3/platform/onboard/host/device_runner.cpp
+++ b/src/a2a3/platform/onboard/host/device_runner.cpp
@@ -287,19 +287,12 @@ int DeviceRunner::setup_static_arena(size_t gm_heap_size, size_t gm_sm_size, siz
         cached_size = requested_size;
         return 0;
     };
+    // Failure of a later region leaves earlier peers committed on purpose:
+    // pooled pointers previously returned to callers must stay valid even if
+    // this resize attempt aborts.
     if (commit_region(gm_heap_arena_, cached_gm_heap_size_, gm_heap_size) != 0) return -1;
-    if (commit_region(gm_sm_arena_, cached_gm_sm_size_, gm_sm_size) != 0) {
-        gm_heap_arena_.release();
-        cached_gm_heap_size_ = 0;
-        return -1;
-    }
-    if (commit_region(runtime_arena_pool_, cached_runtime_arena_size_, runtime_arena_size) != 0) {
-        gm_heap_arena_.release();
-        gm_sm_arena_.release();
-        cached_gm_heap_size_ = 0;
-        cached_gm_sm_size_ = 0;
-        return -1;
-    }
+    if (commit_region(gm_sm_arena_, cached_gm_sm_size_, gm_sm_size) != 0) return -1;
+    if (commit_region(runtime_arena_pool_, cached_runtime_arena_size_, runtime_arena_size) != 0) return -1;
     return 0;
 }
 
diff --git a/src/a2a3/platform/sim/host/device_runner.cpp b/src/a2a3/platform/sim/host/device_runner.cpp
index 1651c4a89..9a9cbbabf 100644
--- a/src/a2a3/platform/sim/host/device_runner.cpp
+++ b/src/a2a3/platform/sim/host/device_runner.cpp
@@ -1071,15 +1071,18 @@ int DeviceRunner::finalize() {
     cached_gm_sm_size_ = 0;
     cached_runtime_arena_size_ = 0;
 
-    // Free all remaining allocations
-    mem_alloc_.finalize();
-    clear_cpu_sim_shared_storage();
-
-    // Free the 8-byte device_wall buffer (allocated lazily in run()).
+    // Free the 8-byte device_wall buffer (allocated lazily in run()) before
+    // mem_alloc_.finalize(): free_tensor() routes back through mem_alloc_,
+    // so doing it after finalize would be a use-after-finalize.
     if (device_wall_dev_ptr_ != nullptr) {
         free_tensor(device_wall_dev_ptr_);
         device_wall_dev_ptr_ = nullptr;
     }
+
+    // Free all remaining allocations
+    mem_alloc_.finalize();
+    clear_cpu_sim_shared_storage();
+
     device_id_ = -1;
     worker_count_ = 0;
     last_runtime_ = nullptr;
diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/host/runtime_maker.cpp b/src/a2a3/runtime/tensormap_and_ringbuffer/host/runtime_maker.cpp
index 750374683..e40aa5ae7 100644
--- a/src/a2a3/runtime/tensormap_and_ringbuffer/host/runtime_maker.cpp
+++ b/src/a2a3/runtime/tensormap_and_ringbuffer/host/runtime_maker.cpp
@@ -281,8 +281,16 @@ extern "C" int bind_prepared_to_runtime_impl(
     // determined by replaying the reserve sequence on a host-side arena.
     uint64_t total_heap_size = eff_heap_size * PTO2_MAX_RING_DEPTH;
     uint64_t sm_size = PTO2SharedMemoryHandle::calculate_size(eff_task_window_size);
-    int32_t eff_dep_pool_capacity =
-        runtime->dep_pool_size ? static_cast<int32_t>(runtime->dep_pool_size) : PTO2_DEP_LIST_POOL_SIZE;
+    // dep_pool_size comes from a uint64 env var; reject values that don't fit
+    // the int32_t layout-sizing path rather than silently truncating.
+    int32_t eff_dep_pool_capacity = PTO2_DEP_LIST_POOL_SIZE;
+    if (runtime->dep_pool_size != 0) {
+        if (runtime->dep_pool_size > static_cast<uint64_t>(INT32_MAX)) {
+            LOG_ERROR("PTO2_RING_DEP_POOL=%" PRIu64 " exceeds INT32_MAX", runtime->dep_pool_size);
+            return -1;
+        }
+        eff_dep_pool_capacity = static_cast<int32_t>(runtime->dep_pool_size);
+    }
 
     int64_t t_prebuilt_start = _now_ms();
     DeviceArena host_arena;  // libc malloc backend by default
diff --git a/src/a5/platform/onboard/host/device_runner.cpp b/src/a5/platform/onboard/host/device_runner.cpp
index b8dc9bb46..506613dcd 100644
--- a/src/a5/platform/onboard/host/device_runner.cpp
+++ b/src/a5/platform/onboard/host/device_runner.cpp
@@ -1085,6 +1085,15 @@ int DeviceRunner::finalize() {
     cached_gm_sm_size_ = 0;
     cached_runtime_arena_size_ = 0;
 
+    // Free the 8-byte device_wall buffer (allocated lazily in run()) while
+    // mem_alloc_ and the device context are still live. free_tensor() routes
+    // through mem_alloc_.free(), so it must run before finalize() and before
+    // rtDeviceReset() tears down the device runtime.
+    if (device_wall_dev_ptr_ != nullptr) {
+        free_tensor(device_wall_dev_ptr_);
+        device_wall_dev_ptr_ = nullptr;
+    }
+
     // Free all remaining allocations (including handshake buffer and binGmAddr)
     mem_alloc_.finalize();
 
@@ -1094,11 +1103,6 @@ int DeviceRunner::finalize() {
         return rc;
     }
 
-    // Free the 8-byte device_wall buffer (allocated lazily in run()).
-    if (device_wall_dev_ptr_ != nullptr) {
-        free_tensor(device_wall_dev_ptr_);
-        device_wall_dev_ptr_ = nullptr;
-    }
     device_id_ = -1;
     block_dim_ = 0;
     worker_count_ = 0;
diff --git a/src/a5/platform/sim/host/device_runner.cpp b/src/a5/platform/sim/host/device_runner.cpp
index fe3e938e1..b3072919c 100644
--- a/src/a5/platform/sim/host/device_runner.cpp
+++ b/src/a5/platform/sim/host/device_runner.cpp
@@ -150,19 +150,12 @@ int DeviceRunner::setup_static_arena(size_t gm_heap_size, size_t gm_sm_size, siz
         cached_size = requested_size;
         return 0;
     };
+    // Failure of a later region leaves earlier peers committed on purpose:
+    // pooled pointers previously returned to callers must stay valid even if
+    // this resize attempt aborts.
     if (commit_region(gm_heap_arena_, cached_gm_heap_size_, gm_heap_size) != 0) return -1;
-    if (commit_region(gm_sm_arena_, cached_gm_sm_size_, gm_sm_size) != 0) {
-        gm_heap_arena_.release();
-        cached_gm_heap_size_ = 0;
-        return -1;
-    }
-    if (commit_region(runtime_arena_pool_, cached_runtime_arena_size_, runtime_arena_size) != 0) {
-        gm_heap_arena_.release();
-        gm_sm_arena_.release();
-        cached_gm_heap_size_ = 0;
-        cached_gm_sm_size_ = 0;
-        return -1;
-    }
+    if (commit_region(gm_sm_arena_, cached_gm_sm_size_, gm_sm_size) != 0) return -1;
+    if (commit_region(runtime_arena_pool_, cached_runtime_arena_size_, runtime_arena_size) != 0) return -1;
     return 0;
 }
 
diff --git a/src/a5/runtime/tensormap_and_ringbuffer/host/runtime_maker.cpp b/src/a5/runtime/tensormap_and_ringbuffer/host/runtime_maker.cpp
index 9e1d00841..037d3ab04 100644
--- a/src/a5/runtime/tensormap_and_ringbuffer/host/runtime_maker.cpp
+++ b/src/a5/runtime/tensormap_and_ringbuffer/host/runtime_maker.cpp
@@ -281,8 +281,16 @@ extern "C" int bind_prepared_to_runtime_impl(
     // determined by replaying the reserve sequence on a host-side arena.
     uint64_t total_heap_size = eff_heap_size * PTO2_MAX_RING_DEPTH;
     uint64_t sm_size = PTO2SharedMemoryHandle::calculate_size(eff_task_window_size);
-    int32_t eff_dep_pool_capacity =
-        runtime->dep_pool_size ? static_cast<int32_t>(runtime->dep_pool_size) : PTO2_DEP_LIST_POOL_SIZE;
+    // dep_pool_size comes from a uint64 env var; reject values that don't fit
+    // the int32_t layout-sizing path rather than silently truncating.
+    int32_t eff_dep_pool_capacity = PTO2_DEP_LIST_POOL_SIZE;
+    if (runtime->dep_pool_size != 0) {
+        if (runtime->dep_pool_size > static_cast<uint64_t>(INT32_MAX)) {
+            LOG_ERROR("PTO2_RING_DEP_POOL=%" PRIu64 " exceeds INT32_MAX", runtime->dep_pool_size);
+            return -1;
+        }
+        eff_dep_pool_capacity = static_cast<int32_t>(runtime->dep_pool_size);
+    }
 
     int64_t t_prebuilt_start = _now_ms();
     DeviceArena host_arena;  // libc malloc backend by default
diff --git a/tests/ut/cpp/a5/test_tensormap.cpp b/tests/ut/cpp/a5/test_tensormap.cpp
index 805a9e079..ec83a064d 100644
--- a/tests/ut/cpp/a5/test_tensormap.cpp
+++ b/tests/ut/cpp/a5/test_tensormap.cpp
@@ -105,18 +105,19 @@ TEST_F(TensorMapTest, InitValidState) {
     EXPECT_EQ(tmap.valid_count(), 0);
 }
 
-TEST_F(TensorMapTest, InitRequiresPowerOfTwoBuckets) {
-    // Non-power-of-2 bucket counts trip an always_assert inside reserve_layout
-    // (asserting EXPECT_DEATH is impossible in release builds where
-    // always_assert may compile out). Smoke-test only the success path here.
-    PTO2TensorMap bad{};
-    DeviceArena bad_arena;
+TEST_F(TensorMapTest, InitWithPowerOfTwoBucketsSucceeds) {
+    // The reject path for non-power-of-2 bucket counts is enforced via an
+    // always_assert inside reserve_layout. It is not asserted here because
+    // EXPECT_DEATH cannot run reliably in release builds where always_assert
+    // may compile out. Cover only the accepted (power-of-2) shape.
+    PTO2TensorMap ok{};
+    DeviceArena ok_arena;
     int32_t ws[PTO2_MAX_RING_DEPTH] = {8, 8, 8, 8};
-    auto layout = PTO2TensorMap::reserve_layout(bad_arena, 8, 64, ws);
-    ASSERT_NE(bad_arena.commit(), nullptr);
-    EXPECT_TRUE(bad.init_data_from_layout(layout, bad_arena));
-    bad.wire_arena_pointers(layout, bad_arena);
-    bad.destroy();
+    auto layout = PTO2TensorMap::reserve_layout(ok_arena, 8, 64, ws);
+    ASSERT_NE(ok_arena.commit(), nullptr);
+    EXPECT_TRUE(ok.init_data_from_layout(layout, ok_arena));
+    ok.wire_arena_pointers(layout, ok_arena);
+    ok.destroy();
 }
 
 // =============================================================================