From e4baa7e19da5f48a601a5a41215776614edc155d Mon Sep 17 00:00:00 2001 From: poursoul Date: Fri, 22 May 2026 12:22:05 +0800 Subject: [PATCH 1/7] Refactor: defer slot_state payload/task bind to orch::prepare_task MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Move the per-slot payload/task pointer assignments out of the RingSchedState::init() O(task_window_size) loop and into orch::prepare_task. Their value is per-slot constant (&task_payloads[slot] / &task_descriptors[slot]) but writing them at submit time, on the same 64B slot_state cache line prepare_task is already dirtying, is essentially free — while removing the only "scale-dependent" pointer assignments from the init path. ring_id stays in init (its value is per-ring constant, so rewriting it each submit would only add noise without removing a loop). Split PTO2TaskSlotState::bind() into bind_ring() (init-time) and bind_buffers() (per-submit) to make the two call-site shapes explicit. Mirrored across both a2a3 and a5 trb runtimes. --- .../runtime/pto_orchestrator.cpp | 12 +++++++++- .../runtime/pto_runtime2_types.h | 23 ++++++++++++++----- .../runtime/scheduler/pto_scheduler.cpp | 7 +++--- .../runtime/pto_orchestrator.cpp | 12 +++++++++- .../runtime/pto_runtime2_types.h | 23 ++++++++++++++----- .../runtime/scheduler/pto_scheduler.cpp | 7 +++--- 6 files changed, 64 insertions(+), 20 deletions(-) diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_orchestrator.cpp b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_orchestrator.cpp index 5f6d20855..fbc07f53f 100644 --- a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_orchestrator.cpp +++ b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_orchestrator.cpp @@ -355,11 +355,21 @@ static bool prepare_task( prefetch_payload(out->payload, args.tensor_count(), args.scalar_count()); + // Re-bind payload/task pointers each submit. 
Value is per-slot constant + // (same as &task_payloads[slot] / &task_descriptors[slot]), but writing + // here lets RingSchedState::init() skip the O(window_size) bind loop. + // Both writes hit the same 64B slot_state cache line we're about to + // dirty below, so the extra cost is two stores on an already-hot line. + // Must precede the scheduler wiring.queue.push at the end of + // submit_task_common — that push is the first read of slot_state->task / + // slot_state->payload by another thread. + out->slot_state->bind_buffers(out->payload, out->task); + // Fields already reset by advance_ring_pointers (eager reset after CONSUMED): // fanout_lock=0, fanout_count=1, fanout_head=nullptr, // fanin_refcount=0, fanout_refcount=0, completed_subtasks=0, next_block_idx=0 // Fields immutable after RingSchedState::init(): - // payload, task, ring_id + // ring_id // task_state left as CONSUMED by eager reset (safe for stale wait_for_tensor // observers); set to PENDING here when orchestrator actually reuses the slot. 
out->slot_state->task_state.store(PTO2_TASK_PENDING, std::memory_order_relaxed); diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_runtime2_types.h b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_runtime2_types.h index fcd8a27bd..f217e7ac3 100644 --- a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_runtime2_types.h +++ b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_runtime2_types.h @@ -330,7 +330,11 @@ struct alignas(64) PTO2TaskSlotState { // Fanout refcount (accessed with fanout_count in check_and_handle_consumed) std::atomic fanout_refcount; // Dynamic: counts released references - // --- Immutable after RingSchedState::init() (same value on every slot reuse) --- + // --- Per-slot constant, re-bound by orch::prepare_task each submit --- + // Value is the same on every reuse (&task_payloads[slot] / &task_descriptors[slot]), + // but written here per-submit instead of in an O(window_size) init loop — + // these are the only "scale-dependent" pointers in this struct, so moving + // them out of init makes startup cost independent of task_window_size. PTO2TaskPayload *payload; PTO2TaskDescriptor *task; @@ -345,14 +349,21 @@ struct alignas(64) PTO2TaskSlotState { int16_t next_block_idx{0}; // Next block to dispatch (scheduler state) /** - * One-time binding of slot-invariant fields. - * Called during RingSchedState::init() — these values are determined by - * the slot's position in the ring and never change across reuses. + * Bind the slot-invariant ring id. Called once per slot during + * RingSchedState::init(); ring_id never changes across reuses. */ - void bind(PTO2TaskPayload *p, PTO2TaskDescriptor *t, uint8_t rid) { + void bind_ring(uint8_t rid) { ring_id = rid; } + + /** + * Re-bind the per-slot payload/task pointers. Called by + * orch::prepare_task on every submit. 
Value is constant for a given + * slot, but we pay the cheap re-write each submit (both fields land on + * the same 64B slot_state cache line that prepare_task is already + * dirtying) to avoid the init-time per-slot loop. + */ + void bind_buffers(PTO2TaskPayload *p, PTO2TaskDescriptor *t) { payload = p; task = t; - ring_id = rid; } /** diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler/pto_scheduler.cpp b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler/pto_scheduler.cpp index 281a714fb..f497b8fd8 100644 --- a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler/pto_scheduler.cpp +++ b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler/pto_scheduler.cpp @@ -102,12 +102,13 @@ bool PTO2SchedulerState::RingSchedState::init(PTO2SharedMemoryHeader *sm_header, advance_lock.store(0, std::memory_order_relaxed); // Initialize all per-task slot state fields. - // bind() sets payload, task, ring_id — immutable after init, bound once - // to their fixed shared-memory addresses. + // bind_ring() sets the ring_id only — payload/task pointers are re-bound + // by orch::prepare_task on every submit (their value is per-slot constant + // but pinning them here would cost O(task_window_size) at startup). // reset_for_reuse() sets dynamic fields to reclaim defaults (fanout_count=1, // rest zero) so the first submit needs no reset. 
for (uint64_t i = 0; i < ring->task_window_size; i++) { - ring->slot_states[i].bind(&ring->task_payloads[i], &ring->task_descriptors[i], static_cast(ring_id)); + ring->slot_states[i].bind_ring(static_cast(ring_id)); ring->slot_states[i].reset_for_reuse(); ring->slot_states[i].fanin_count = 0; ring->slot_states[i].active_mask = ActiveMask{}; diff --git a/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_orchestrator.cpp b/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_orchestrator.cpp index 05ac105a8..056c2ee64 100644 --- a/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_orchestrator.cpp +++ b/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_orchestrator.cpp @@ -329,11 +329,21 @@ static bool prepare_task( prefetch_payload(out->payload, args.tensor_count(), args.scalar_count()); + // Re-bind payload/task pointers each submit. Value is per-slot constant + // (same as &task_payloads[slot] / &task_descriptors[slot]), but writing + // here lets RingSchedState::init() skip the O(window_size) bind loop. + // Both writes hit the same 64B slot_state cache line we're about to + // dirty below, so the extra cost is two stores on an already-hot line. + // Must precede the scheduler wiring.queue.push at the end of + // submit_task_common — that push is the first read of slot_state->task / + // slot_state->payload by another thread. + out->slot_state->bind_buffers(out->payload, out->task); + // Fields already reset by advance_ring_pointers (eager reset after CONSUMED): // fanout_lock=0, fanout_count=1, fanout_head=nullptr, // fanin_refcount=0, fanout_refcount=0, completed_subtasks=0, next_block_idx=0 // Fields immutable after RingSchedState::init(): - // payload, task, ring_id + // ring_id // task_state left as CONSUMED by eager reset (safe for stale wait_for_tensor // observers); set to PENDING here when orchestrator actually reuses the slot. 
out->slot_state->task_state.store(PTO2_TASK_PENDING, std::memory_order_relaxed); diff --git a/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_runtime2_types.h b/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_runtime2_types.h index 999dbf6c5..f022b8eb4 100644 --- a/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_runtime2_types.h +++ b/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_runtime2_types.h @@ -330,7 +330,11 @@ struct alignas(64) PTO2TaskSlotState { // Fanout refcount (accessed with fanout_count in check_and_handle_consumed) std::atomic fanout_refcount; // Dynamic: counts released references - // --- Immutable after RingSchedState::init() (same value on every slot reuse) --- + // --- Per-slot constant, re-bound by orch::prepare_task each submit --- + // Value is the same on every reuse (&task_payloads[slot] / &task_descriptors[slot]), + // but written here per-submit instead of in an O(window_size) init loop — + // these are the only "scale-dependent" pointers in this struct, so moving + // them out of init makes startup cost independent of task_window_size. PTO2TaskPayload *payload; PTO2TaskDescriptor *task; @@ -345,14 +349,21 @@ struct alignas(64) PTO2TaskSlotState { int16_t next_block_idx{0}; // Next block to dispatch (scheduler state) /** - * One-time binding of slot-invariant fields. - * Called during RingSchedState::init() — these values are determined by - * the slot's position in the ring and never change across reuses. + * Bind the slot-invariant ring id. Called once per slot during + * RingSchedState::init(); ring_id never changes across reuses. */ - void bind(PTO2TaskPayload *p, PTO2TaskDescriptor *t, uint8_t rid) { + void bind_ring(uint8_t rid) { ring_id = rid; } + + /** + * Re-bind the per-slot payload/task pointers. Called by + * orch::prepare_task on every submit. 
Value is constant for a given + * slot, but we pay the cheap re-write each submit (both fields land on + * the same 64B slot_state cache line that prepare_task is already + * dirtying) to avoid the init-time per-slot loop. + */ + void bind_buffers(PTO2TaskPayload *p, PTO2TaskDescriptor *t) { payload = p; task = t; - ring_id = rid; } /** diff --git a/src/a5/runtime/tensormap_and_ringbuffer/runtime/scheduler/pto_scheduler.cpp b/src/a5/runtime/tensormap_and_ringbuffer/runtime/scheduler/pto_scheduler.cpp index 281a714fb..f497b8fd8 100644 --- a/src/a5/runtime/tensormap_and_ringbuffer/runtime/scheduler/pto_scheduler.cpp +++ b/src/a5/runtime/tensormap_and_ringbuffer/runtime/scheduler/pto_scheduler.cpp @@ -102,12 +102,13 @@ bool PTO2SchedulerState::RingSchedState::init(PTO2SharedMemoryHeader *sm_header, advance_lock.store(0, std::memory_order_relaxed); // Initialize all per-task slot state fields. - // bind() sets payload, task, ring_id — immutable after init, bound once - // to their fixed shared-memory addresses. + // bind_ring() sets the ring_id only — payload/task pointers are re-bound + // by orch::prepare_task on every submit (their value is per-slot constant + // but pinning them here would cost O(task_window_size) at startup). // reset_for_reuse() sets dynamic fields to reclaim defaults (fanout_count=1, // rest zero) so the first submit needs no reset. 
for (uint64_t i = 0; i < ring->task_window_size; i++) { - ring->slot_states[i].bind(&ring->task_payloads[i], &ring->task_descriptors[i], static_cast(ring_id)); + ring->slot_states[i].bind_ring(static_cast(ring_id)); ring->slot_states[i].reset_for_reuse(); ring->slot_states[i].fanin_count = 0; ring->slot_states[i].active_mask = ActiveMask{}; From 23139eb55d40c06dc917327ad02718022a2dcf23 Mon Sep 17 00:00:00 2001 From: poursoul Date: Fri, 22 May 2026 17:53:47 +0800 Subject: [PATCH 2/7] Refactor: host-build trb runtime arena, AICPU does only wire + SM reset MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Previously the AICPU rebuilt the entire trb runtime arena (PTO2Runtime, orchestrator/scheduler/tensor_map sub-regions, sm_handle wrapper, mailbox) on every device boot via runtime_create_from_sm. This commit moves layout + data init onto the host so the AICPU only does a cheap arena-internal pointer wire pass plus the SM reset that can't run off-device. Multi-run boots reuse the pooled prebuilt image with a single rtMemcpy. Mechanism - DeviceArena::attach() wraps an externally-owned buffer; re-attach is permitted so each AICPU boot can reuse the pooled image. - runtime_create_from_sm split into reserve_layout / init_data_from_layout / wire_arena_pointers / finalize_after_wire. orchestrator / scheduler / tensor_map / ready_queue / spsc gain matching data+wire pairs; finalize_after_wire stays AICPU-only since it binds s_runtime_ops. - pto2_sm_layout helper computes SM field device addresses by pure offset arithmetic so host init never dereferences SM. - Per-slot SM-side reset (bind_ring + reset_for_reuse + active_mask) moved from RingSchedState::init into PTO2SharedMemoryHandle::init_header_per_ring so the AICPU still owns it after the split. - runtime/shared/pto_runtime2_init.cpp — new file holding the host-able pieces lifted out of pto_runtime2.cpp / pto_orchestrator.cpp / pto_scheduler.cpp. 
AICPU-only ops table / submit_task / dispatch stay in place. Host wiring (runtime_maker.cpp) - DeviceRunner::setup_static_arena gains a third parameter, runtime_arena_size, which reserves a third region in the static arena when non-zero (hbg passes 0 and gets no such region). The prebuilt image lives in the same pooled backing allocation as gm_heap and SM, so each worker still needs only one rtMalloc for its whole lifetime. - bind_prepared_to_runtime_impl reserves layout on a host arena, sizes the pooled regions, runs init_data + wire, stashes prebuilt metadata into the rt image, rtMemcpys the image to the device, and records base/offset on Runtime so the AICPU boot can find it. AICPU boot (aicpu_executor.cpp) - The boot sequence is: attach the runtime arena to the pooled buffer, take rt from base+off_runtime, wire arena-internal pointers, call sm_handle->init (SM reset including the per-slot fields above), reset the mailbox, then call finalize_after_wire (ops table + cluster/aiv counts). Tests - cpput: 25/25 pass. ready_queue / spsc_queue / scheduler_state / task_state / wiring / tensormap UTs migrated to the data+wire API. task_allocator.init grew an optional initial_local_task_id parameter (default 0) so UTs can still exercise task_id near INT32_MAX without reading the SM. - a2a3sim trb: standalone (dynamic_register variants, L3 group/dependency) + L2 tensormap_and_ringbuffer 29 tests all pass. - a2a3sim host_build_graph: 9/9 pass (verifies the shared HostApi changes don't break hbg). - a2a3 hardware: tests/st/.../paged_attention_unroll PASS on device 9 (--build with pto-isa commit pinned to CI). 
--- .../platform/onboard/host/device_runner.cpp | 30 +- .../platform/onboard/host/device_runner.h | 32 +- .../onboard/host/pto_runtime_c_api.cpp | 13 +- src/a2a3/platform/sim/host/device_runner.cpp | 30 +- src/a2a3/platform/sim/host/device_runner.h | 25 +- .../platform/sim/host/pto_runtime_c_api.cpp | 13 +- .../host_build_graph/runtime/runtime.h | 9 +- .../aicpu/aicpu_executor.cpp | 57 ++- .../host/dep_gen_replay.cpp | 9 +- .../host/runtime_maker.cpp | 66 +++- .../runtime/pto_orchestrator.cpp | 83 ----- .../runtime/pto_orchestrator.h | 19 +- .../runtime/pto_ring_buffer.h | 16 +- .../runtime/pto_runtime2.cpp | 84 +---- .../runtime/pto_runtime2.h | 110 ++++-- .../runtime/pto_shared_memory.h | 61 +++ .../runtime/pto_tensormap.h | 19 +- .../runtime/runtime.h | 39 +- .../runtime/scheduler/pto_scheduler.cpp | 147 -------- .../runtime/scheduler/pto_scheduler.h | 53 ++- .../runtime/shared/pto_runtime2_init.cpp | 351 ++++++++++++++++++ .../runtime/shared/pto_shared_memory.cpp | 17 + .../runtime/shared/pto_tensormap.cpp | 44 ++- .../runtime/shared/runtime.cpp | 9 + src/common/device_comm/device_arena.h | 44 ++- tests/ut/cpp/CMakeLists.txt | 1 + tests/ut/cpp/a2a3/test_ready_queue.cpp | 9 +- tests/ut/cpp/a2a3/test_scheduler_state.cpp | 3 +- tests/ut/cpp/a2a3/test_spsc_queue.cpp | 13 +- tests/ut/cpp/a2a3/test_task_allocator.cpp | 5 +- tests/ut/cpp/a2a3/test_task_state.cpp | 3 +- tests/ut/cpp/a2a3/test_tensormap.cpp | 6 +- tests/ut/cpp/a2a3/test_wiring.cpp | 3 +- 33 files changed, 979 insertions(+), 444 deletions(-) create mode 100644 src/a2a3/runtime/tensormap_and_ringbuffer/runtime/shared/pto_runtime2_init.cpp diff --git a/src/a2a3/platform/onboard/host/device_runner.cpp b/src/a2a3/platform/onboard/host/device_runner.cpp index cf6ddea88..e3ba6cd10 100644 --- a/src/a2a3/platform/onboard/host/device_runner.cpp +++ b/src/a2a3/platform/onboard/host/device_runner.cpp @@ -249,31 +249,41 @@ int AicpuSoInfo::finalize() { DeviceRunner::~DeviceRunner() { finalize(); } -int 
DeviceRunner::setup_static_arena(size_t gm_heap_size, size_t gm_sm_size) { +int DeviceRunner::setup_static_arena(size_t gm_heap_size, size_t gm_sm_size, size_t runtime_arena_size) { if (static_arena_.is_committed()) { // Idempotent for the production case (sizes do not change across a // worker's lifetime). If a caller asks for a larger layout, redo it. - if (gm_heap_size <= cached_gm_heap_size_ && gm_sm_size <= cached_gm_sm_size_) return 0; + if (gm_heap_size <= cached_gm_heap_size_ && gm_sm_size <= cached_gm_sm_size_ && + runtime_arena_size <= cached_runtime_arena_size_) { + return 0; + } static_arena_.release(); gm_heap_region_off_ = SIZE_MAX; gm_sm_region_off_ = SIZE_MAX; + runtime_arena_region_off_ = SIZE_MAX; cached_gm_heap_size_ = 0; cached_gm_sm_size_ = 0; + cached_runtime_arena_size_ = 0; } gm_heap_region_off_ = static_arena_.reserve(gm_heap_size, DeviceArena::kDefaultBaseAlign); gm_sm_region_off_ = static_arena_.reserve(gm_sm_size, DeviceArena::kDefaultBaseAlign); + if (runtime_arena_size > 0) { + runtime_arena_region_off_ = static_arena_.reserve(runtime_arena_size, DeviceArena::kDefaultBaseAlign); + } if (static_arena_.commit(DeviceArena::kDefaultBaseAlign) == nullptr) { - // Roll back the two reserves: commit() failure leaves committed_=false, + // Roll back the reserves: commit() failure leaves committed_=false, // so the next entry would skip the release branch and stack new // reserves on top of the stale cursor. release() is idempotent on a // never-committed arena (just zeroes cursor_ / region_count_). 
static_arena_.release(); gm_heap_region_off_ = SIZE_MAX; gm_sm_region_off_ = SIZE_MAX; + runtime_arena_region_off_ = SIZE_MAX; return -1; } cached_gm_heap_size_ = gm_heap_size; cached_gm_sm_size_ = gm_sm_size; + cached_runtime_arena_size_ = runtime_arena_size; return 0; } @@ -287,6 +297,11 @@ void *DeviceRunner::acquire_pooled_gm_sm() { return static_arena_.region_ptr(gm_sm_region_off_); } +void *DeviceRunner::acquire_pooled_runtime_arena() { + if (!static_arena_.is_committed()) return nullptr; + return static_arena_.region_ptr(runtime_arena_region_off_); +} + std::thread DeviceRunner::create_thread(std::function fn) { int dev_id = device_id_; return std::thread([dev_id, fn = std::move(fn)]() { @@ -1222,14 +1237,17 @@ int DeviceRunner::finalize() { // perf_cleanup guard; this is the backstop for the no-run-since-init case. finalize_collectors(); - // Release per-Worker static arena (GM heap + PTO2 SM in a single backing - // device allocation). Must precede mem_alloc_.finalize() so the arena - // frees through the still-live allocator, not after it. + // Release per-Worker static arena (GM heap + PTO2 SM + optional trb + // prebuilt runtime arena in a single backing device allocation). Must + // precede mem_alloc_.finalize() so the arena frees through the still-live + // allocator, not after it. 
static_arena_.release(); gm_heap_region_off_ = SIZE_MAX; gm_sm_region_off_ = SIZE_MAX; + runtime_arena_region_off_ = SIZE_MAX; cached_gm_heap_size_ = 0; cached_gm_sm_size_ = 0; + cached_runtime_arena_size_ = 0; // Free all remaining allocations (including handshake buffer and binGmAddr) mem_alloc_.finalize(); diff --git a/src/a2a3/platform/onboard/host/device_runner.h b/src/a2a3/platform/onboard/host/device_runner.h index 53fb6555f..4d9819f21 100644 --- a/src/a2a3/platform/onboard/host/device_runner.h +++ b/src/a2a3/platform/onboard/host/device_runner.h @@ -190,20 +190,30 @@ class DeviceRunner { /** * Lay out and commit the per-Worker static device arena that backs the - * PTO2 GM heap and PTO2 shared memory in a single underlying allocation. - * Must be called before acquire_pooled_gm_heap / acquire_pooled_gm_sm. - * Idempotent on identical sizes. Returns 0 on success, -1 on failure. + * PTO2 GM heap, the PTO2 shared memory, and the trb prebuilt runtime + * arena in a single underlying allocation. Must be called before any + * acquire_pooled_*. Idempotent on identical sizes. `runtime_arena_size` + * is 0 for the hbg path (no prebuilt runtime arena). Returns 0 on + * success, -1 on failure. */ - int setup_static_arena(size_t gm_heap_size, size_t gm_sm_size); + int setup_static_arena(size_t gm_heap_size, size_t gm_sm_size, size_t runtime_arena_size); /** - * Return the pooled GM heap / PTO2 SM pointer. setup_static_arena must - * have been called earlier in this Worker; otherwise these return - * nullptr. Both pointers are stable for the lifetime of the Worker and - * the single underlying device buffer is released in `finalize()`. + * Return the pooled GM heap / PTO2 SM / runtime arena pointer. + * setup_static_arena must have been called earlier in this Worker; + * otherwise these return nullptr. All pointers are stable for the + * Worker's lifetime; the single underlying device buffer is released in + * `finalize()`. 
+ * + * acquire_pooled_runtime_arena() is trb-only — the runtime arena region + * is only reserved when setup_static_arena was called with + * runtime_arena_size > 0. hbg's runtime_maker never calls this; doing so + * after setup_static_arena(...,0) returns an unreserved-offset region_ptr + * (undefined). Keep the call site discipline at the runtime_maker layer. */ void *acquire_pooled_gm_heap(); void *acquire_pooled_gm_sm(); + void *acquire_pooled_runtime_arena(); /** * Create a thread bound to this device. @@ -614,10 +624,14 @@ class DeviceRunner { DeviceArena static_arena_; size_t gm_heap_region_off_{SIZE_MAX}; size_t gm_sm_region_off_{SIZE_MAX}; + // SIZE_MAX (= "not provisioned") when the caller passed runtime_arena_size + // == 0 (hbg path); a real offset for trb. + size_t runtime_arena_region_off_{SIZE_MAX}; // Cached sizes for setup_static_arena's "fits" check — avoids calling - // region_size() on the arena's public API for the two regions we own. + // region_size() on the arena's public API for the regions we own. size_t cached_gm_heap_size_{0}; size_t cached_gm_sm_size_{0}; + size_t cached_runtime_arena_size_{0}; // Device resources rtStream_t stream_aicpu_{nullptr}; diff --git a/src/a2a3/platform/onboard/host/pto_runtime_c_api.cpp b/src/a2a3/platform/onboard/host/pto_runtime_c_api.cpp index 744b7291c..29c14d862 100644 --- a/src/a2a3/platform/onboard/host/pto_runtime_c_api.cpp +++ b/src/a2a3/platform/onboard/host/pto_runtime_c_api.cpp @@ -108,9 +108,9 @@ static uint64_t upload_chip_callable_buffer_wrapper(const void *callable) { } } -static int setup_static_arena_wrapper(size_t gm_heap_size, size_t gm_sm_size) { +static int setup_static_arena_wrapper(size_t gm_heap_size, size_t gm_sm_size, size_t runtime_arena_size) { try { - return current_runner()->setup_static_arena(gm_heap_size, gm_sm_size); + return current_runner()->setup_static_arena(gm_heap_size, gm_sm_size, runtime_arena_size); } catch (...) 
{ return -1; } @@ -132,6 +132,14 @@ static void *acquire_pooled_gm_sm_wrapper() { } } +static void *acquire_pooled_runtime_arena_wrapper() { + try { + return current_runner()->acquire_pooled_runtime_arena(); + } catch (...) { + return nullptr; + } +} + /* =========================================================================== * Public C API (resolved by ChipWorker via dlsym) * =========================================================================== */ @@ -370,6 +378,7 @@ int run_prepared( r->host_api.setup_static_arena = setup_static_arena_wrapper; r->host_api.acquire_pooled_gm_heap = acquire_pooled_gm_heap_wrapper; r->host_api.acquire_pooled_gm_sm = acquire_pooled_gm_sm_wrapper; + r->host_api.acquire_pooled_runtime_arena = acquire_pooled_runtime_arena_wrapper; r->host_api.upload_chip_callable_buffer = upload_chip_callable_buffer_wrapper; // Restore kernel addrs + orch symbol names + active_callable_id; the diff --git a/src/a2a3/platform/sim/host/device_runner.cpp b/src/a2a3/platform/sim/host/device_runner.cpp index 1635f3a7a..53d967228 100644 --- a/src/a2a3/platform/sim/host/device_runner.cpp +++ b/src/a2a3/platform/sim/host/device_runner.cpp @@ -122,29 +122,39 @@ bool create_temp_so_file(const std::string &path_template, const uint8_t *data, DeviceRunner::~DeviceRunner() { finalize(); } -int DeviceRunner::setup_static_arena(size_t gm_heap_size, size_t gm_sm_size) { +int DeviceRunner::setup_static_arena(size_t gm_heap_size, size_t gm_sm_size, size_t runtime_arena_size) { if (static_arena_.is_committed()) { - if (gm_heap_size <= cached_gm_heap_size_ && gm_sm_size <= cached_gm_sm_size_) return 0; + if (gm_heap_size <= cached_gm_heap_size_ && gm_sm_size <= cached_gm_sm_size_ && + runtime_arena_size <= cached_runtime_arena_size_) { + return 0; + } static_arena_.release(); gm_heap_region_off_ = SIZE_MAX; gm_sm_region_off_ = SIZE_MAX; + runtime_arena_region_off_ = SIZE_MAX; cached_gm_heap_size_ = 0; cached_gm_sm_size_ = 0; + cached_runtime_arena_size_ = 0; } 
gm_heap_region_off_ = static_arena_.reserve(gm_heap_size, DeviceArena::kDefaultBaseAlign); gm_sm_region_off_ = static_arena_.reserve(gm_sm_size, DeviceArena::kDefaultBaseAlign); + if (runtime_arena_size > 0) { + runtime_arena_region_off_ = static_arena_.reserve(runtime_arena_size, DeviceArena::kDefaultBaseAlign); + } if (static_arena_.commit(DeviceArena::kDefaultBaseAlign) == nullptr) { - // Roll back the two reserves: commit() failure leaves committed_=false, + // Roll back the reserves: commit() failure leaves committed_=false, // so the next entry would skip the release branch and stack new // reserves on top of the stale cursor. release() is idempotent on a // never-committed arena (just zeroes cursor_ / region_count_). static_arena_.release(); gm_heap_region_off_ = SIZE_MAX; gm_sm_region_off_ = SIZE_MAX; + runtime_arena_region_off_ = SIZE_MAX; return -1; } cached_gm_heap_size_ = gm_heap_size; cached_gm_sm_size_ = gm_sm_size; + cached_runtime_arena_size_ = runtime_arena_size; return 0; } @@ -158,6 +168,11 @@ void *DeviceRunner::acquire_pooled_gm_sm() { return static_arena_.region_ptr(gm_sm_region_off_); } +void *DeviceRunner::acquire_pooled_runtime_arena() { + if (!static_arena_.is_committed()) return nullptr; + return static_arena_.region_ptr(runtime_arena_region_off_); +} + std::thread DeviceRunner::create_thread(std::function fn) { int dev_id = device_id_; return std::thread([dev_id, fn = std::move(fn)]() { @@ -1032,14 +1047,17 @@ int DeviceRunner::finalize() { // Close executor .so files (typically already closed by run(), this is a safety net) unload_executor_binaries(); - // Release per-Worker static arena (GM heap + PTO2 SM in a single backing - // device allocation). Must precede mem_alloc_.finalize() so the arena - // frees through the still-live allocator, not after it. + // Release per-Worker static arena (GM heap + PTO2 SM + optional trb + // prebuilt runtime arena in a single backing device allocation). 
Must + // precede mem_alloc_.finalize() so the arena frees through the still-live + // allocator, not after it. static_arena_.release(); gm_heap_region_off_ = SIZE_MAX; gm_sm_region_off_ = SIZE_MAX; + runtime_arena_region_off_ = SIZE_MAX; cached_gm_heap_size_ = 0; cached_gm_sm_size_ = 0; + cached_runtime_arena_size_ = 0; // Free all remaining allocations mem_alloc_.finalize(); diff --git a/src/a2a3/platform/sim/host/device_runner.h b/src/a2a3/platform/sim/host/device_runner.h index 73b3dfea2..60f1bfdc9 100644 --- a/src/a2a3/platform/sim/host/device_runner.h +++ b/src/a2a3/platform/sim/host/device_runner.h @@ -80,19 +80,26 @@ class DeviceRunner { /** * Lay out and commit the per-Worker static device arena that backs the - * PTO2 GM heap and PTO2 shared memory in a single underlying allocation. - * Must be called before acquire_pooled_gm_heap / acquire_pooled_gm_sm. - * Idempotent on identical sizes. Returns 0 on success, -1 on failure. + * PTO2 GM heap, the PTO2 shared memory, and the trb prebuilt runtime + * arena in a single underlying allocation. Must be called before any + * acquire_pooled_*. `runtime_arena_size` is 0 for hbg. Idempotent on + * identical sizes. Returns 0 on success, -1 on failure. */ - int setup_static_arena(size_t gm_heap_size, size_t gm_sm_size); + int setup_static_arena(size_t gm_heap_size, size_t gm_sm_size, size_t runtime_arena_size); /** - * Return the pooled GM heap / PTO2 SM pointer. setup_static_arena must - * have been called earlier in this Worker; otherwise these return - * nullptr. Pointers are stable for the lifetime of the Worker. + * Return the pooled GM heap / PTO2 SM / runtime arena pointer. + * setup_static_arena must have been called earlier in this Worker. + * + * acquire_pooled_runtime_arena() is trb-only — the runtime arena region + * is only reserved when setup_static_arena was called with + * runtime_arena_size > 0. 
hbg's runtime_maker never calls this; doing so + * after setup_static_arena(...,0) returns an unreserved-offset region_ptr + * (undefined). Keep the call site discipline at the runtime_maker layer. */ void *acquire_pooled_gm_heap(); void *acquire_pooled_gm_sm(); + void *acquire_pooled_runtime_arena(); /** * Create a thread bound to this device. @@ -292,10 +299,12 @@ class DeviceRunner { DeviceArena static_arena_; size_t gm_heap_region_off_{SIZE_MAX}; size_t gm_sm_region_off_{SIZE_MAX}; + size_t runtime_arena_region_off_{SIZE_MAX}; // Cached sizes for setup_static_arena's "fits" check — avoids calling - // region_size() on the arena's public API for the two regions we own. + // region_size() on the arena's public API for the regions we own. size_t cached_gm_heap_size_{0}; size_t cached_gm_sm_size_{0}; + size_t cached_runtime_arena_size_{0}; // Simulation state (no actual device resources) KernelArgs kernel_args_; diff --git a/src/a2a3/platform/sim/host/pto_runtime_c_api.cpp b/src/a2a3/platform/sim/host/pto_runtime_c_api.cpp index 4ad438a9c..fca663610 100644 --- a/src/a2a3/platform/sim/host/pto_runtime_c_api.cpp +++ b/src/a2a3/platform/sim/host/pto_runtime_c_api.cpp @@ -103,9 +103,9 @@ static uint64_t upload_chip_callable_buffer_wrapper(const void *callable) { } } -static int setup_static_arena_wrapper(size_t gm_heap_size, size_t gm_sm_size) { +static int setup_static_arena_wrapper(size_t gm_heap_size, size_t gm_sm_size, size_t runtime_arena_size) { try { - return current_runner()->setup_static_arena(gm_heap_size, gm_sm_size); + return current_runner()->setup_static_arena(gm_heap_size, gm_sm_size, runtime_arena_size); } catch (...) { return -1; } @@ -127,6 +127,14 @@ static void *acquire_pooled_gm_sm_wrapper() { } } +static void *acquire_pooled_runtime_arena_wrapper() { + try { + return current_runner()->acquire_pooled_runtime_arena(); + } catch (...) 
{ + return nullptr; + } +} + /* =========================================================================== * Public C API (resolved by ChipWorker via dlsym) * =========================================================================== */ @@ -333,6 +341,7 @@ int run_prepared( r->host_api.setup_static_arena = setup_static_arena_wrapper; r->host_api.acquire_pooled_gm_heap = acquire_pooled_gm_heap_wrapper; r->host_api.acquire_pooled_gm_sm = acquire_pooled_gm_sm_wrapper; + r->host_api.acquire_pooled_runtime_arena = acquire_pooled_runtime_arena_wrapper; r->host_api.upload_chip_callable_buffer = upload_chip_callable_buffer_wrapper; auto bind_result = runner->bind_prepared_callable_to_runtime(*r, callable_id); diff --git a/src/a2a3/runtime/host_build_graph/runtime/runtime.h b/src/a2a3/runtime/host_build_graph/runtime/runtime.h index 41845bdf0..ccdc05ce0 100644 --- a/src/a2a3/runtime/host_build_graph/runtime/runtime.h +++ b/src/a2a3/runtime/host_build_graph/runtime/runtime.h @@ -140,9 +140,16 @@ struct HostApi { // pto_runtime_c_api.cpp can populate the same HostApi struct regardless of // which runtime variant it is built against. Unset for this variant; do // not call. - int (*setup_static_arena)(size_t gm_heap_size, size_t gm_sm_size); + // PTO2 static-arena hooks. The host_build_graph runtime does not currently + // use these — the fields exist only so the platform layer's + // pto_runtime_c_api.cpp can populate the same HostApi struct regardless of + // which runtime variant it is built against. Unset for this variant; do + // not call. hbg-side callers pass runtime_arena_size == 0 (hbg has no + // prebuilt runtime arena). + int (*setup_static_arena)(size_t gm_heap_size, size_t gm_sm_size, size_t runtime_arena_size); void *(*acquire_pooled_gm_heap)(); void *(*acquire_pooled_gm_sm)(); + void *(*acquire_pooled_runtime_arena)(); // Single-shot upload of the entire ChipCallable buffer. 
`callable` is a // `const ChipCallable *` (declared void* to avoid pulling task_interface // headers into runtime.h). DeviceRunner walks child_offsets_ to compute diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp b/src/a2a3/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp index f8e35917b..5c31c5b9a 100644 --- a/src/a2a3/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp +++ b/src/a2a3/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp @@ -467,29 +467,60 @@ int32_t AicpuExecutor::run(Runtime *runtime) { static_cast(task_window_size), static_cast(heap_size), dep_pool_capacity ); - void *sm_ptr = runtime->get_gm_sm_ptr(); - void *gm_heap = runtime->get_gm_heap_ptr(); + // gm_heap pointer / dep_pool_capacity are encoded into the prebuilt + // runtime arena image at host build time, so we no longer fetch + // them here. They remain on the host Runtime instance and on the + // PTO2Runtime header for diagnostic purposes only. + (void)dep_pool_capacity; + void *sm_ptr = runtime->get_gm_sm_ptr(); uint64_t sm_size = PTO2SharedMemoryHandle::calculate_size(task_window_size); - rt = runtime_create_from_sm( - PTO2_MODE_EXECUTE, sm_ptr, sm_size, task_window_size, gm_heap, heap_size, runtime_arena_, - dep_pool_capacity - ); - if (!rt) { - LOG_ERROR("Thread %d: Failed to create PTO2Runtime", thread_idx); - // Unblock scheduler threads before returning so they don't spin forever. + + // Prebuilt-arena fast path. Host has pre-populated the entire + // runtime arena (PTO2Runtime + orchestrator/scheduler/tensor_map + // sub-regions + sm_handle wrapper + mailbox) and uploaded it via + // rtMemcpy into the pooled runtime_arena buffer. We attach to it, + // wire arena-internal pointers to their device addresses, reset + // the SM, and finalize the few device-only fields the host could + // not know at image-build time. 
+ void *prebuilt_arena = runtime->get_prebuilt_arena_base(); + size_t off_runtime = runtime->get_prebuilt_runtime_offset(); + if (prebuilt_arena == nullptr) { + LOG_ERROR("Thread %d: prebuilt_arena_base is null", thread_idx); + runtime_init_ready_.store(true, std::memory_order_release); + return -1; + } + runtime_arena_.attach(prebuilt_arena, DeviceArena::kDefaultBaseAlign); + rt = reinterpret_cast(static_cast(prebuilt_arena) + off_runtime); + + // Wire every arena-internal pointer field (host wrote host-mirror + // addresses; we overwrite them with device addresses). + runtime_wire_arena_pointers(runtime_arena_, rt->prebuilt_layout, rt); + + // Reset SM state. setup_pointers + init_header_per_ring restore + // ring flow-control counters, layout metadata, error flags, and + // the per-slot ring->slot_states[] (bind_ring + reset_for_reuse + + // fanin_count/active_mask zero — previously done inside + // RingSchedState::init). + memset(rt->sm_handle, 0, sizeof(*rt->sm_handle)); + if (!rt->sm_handle->init(sm_ptr, sm_size, task_window_size, heap_size)) { + LOG_ERROR("Thread %d: sm_handle->init failed", thread_idx); runtime_init_ready_.store(true, std::memory_order_release); return -1; } + // AICore completion mailbox lives in the arena; reset it each + // boot so stale completion notifications from a previous run do + // not leak. + memset(rt->aicore_mailbox, 0, sizeof(*rt->aicore_mailbox)); + + // Fill ops / core counts (host can't resolve s_runtime_ops's + // device address nor know the SchedulerContext's core fan-out). + runtime_finalize_after_wire(rt, sched_ctx_.aic_count(), sched_ctx_.aiv_count()); #if PTO2_PROFILING rt->orchestrator.l2_perf_level = get_l2_perf_level(); #endif - // Total core counts = aic_count_ / aiv_count_ (set once at runtime init). - rt->orchestrator.total_cluster_count = sched_ctx_.aic_count(); - rt->orchestrator.total_aiv_count = sched_ctx_.aiv_count(); - // With multi-ring, slot_states are per-ring inside the scheduler. 
runtime->set_slot_states_ptr(nullptr); diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/host/dep_gen_replay.cpp b/src/a2a3/runtime/tensormap_and_ringbuffer/host/dep_gen_replay.cpp index 027805918..506ba7cf6 100644 --- a/src/a2a3/runtime/tensormap_and_ringbuffer/host/dep_gen_replay.cpp +++ b/src/a2a3/runtime/tensormap_and_ringbuffer/host/dep_gen_replay.cpp @@ -487,11 +487,16 @@ dep_gen_replay_emit_deps_json(const DepGenRecord *records, size_t num_records, c PTO2TensorMap::reserve_layout(replay_arena, PTO2_TENSORMAP_NUM_BUCKETS, pool_size, task_window_sizes); auto annot_layout = PTO2TensorMap::reserve_layout(replay_arena, PTO2_TENSORMAP_NUM_BUCKETS, pool_size, task_window_sizes); - if (replay_arena.commit() == nullptr || !tm_oracle.init_from_layout(oracle_layout, replay_arena) || - !tm_annot.init_from_layout(annot_layout, replay_arena)) { + if (replay_arena.commit() == nullptr || !tm_oracle.init_data_from_layout(oracle_layout, replay_arena) || + !tm_annot.init_data_from_layout(annot_layout, replay_arena)) { LOG_ERROR("dep_gen replay: tensormap.init failed (buckets=%d, pool=%d)", PTO2_TENSORMAP_NUM_BUCKETS, pool_size); return -3; } + // Replay tensormaps live entirely on host; both arena base and the + // parent-orch self-pointer use host addresses. parent_orch is unused by + // the lookup/insert code paths exercised below — nullptr is safe. + tm_oracle.wire_arena_pointers(oracle_layout, replay_arena, nullptr); + tm_annot.wire_arena_pointers(annot_layout, replay_arena, nullptr); // JSON output accumulators. 
std::vector task_table; diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/host/runtime_maker.cpp b/src/a2a3/runtime/tensormap_and_ringbuffer/host/runtime_maker.cpp index a75205196..3b278b2b4 100644 --- a/src/a2a3/runtime/tensormap_and_ringbuffer/host/runtime_maker.cpp +++ b/src/a2a3/runtime/tensormap_and_ringbuffer/host/runtime_maker.cpp @@ -36,11 +36,13 @@ #include #include "../common/pto_runtime_status.h" +#include "../runtime/pto_runtime2.h" #include "../runtime/pto_shared_memory.h" #include "../runtime/runtime.h" #include "callable.h" #include "common/platform_config.h" #include "common/unified_log.h" +#include "device_arena.h" #include "prepare_callable_common.h" // Helper: return current time in milliseconds @@ -271,15 +273,27 @@ extern "C" int bind_prepared_to_runtime_impl( uint64_t eff_heap_size = runtime->heap_size ? runtime->heap_size : PTO2_HEAP_SIZE; uint64_t eff_task_window_size = runtime->task_window_size ? runtime->task_window_size : PTO2_TASK_WINDOW_SIZE; - // Lay out the per-Worker static device arena. GM heap (orchestrator output - // buffers, all rings combined) and PTO2 shared memory live in a single - // backing allocation; setup_static_arena reserves both regions and - // commits in one shot. Owned by DeviceRunner across runs — do NOT record - // in tensor_pairs_; the free is deferred to DeviceRunner::finalize(). + // Lay out the per-Worker static device arena. GM heap, PTO2 shared memory, + // and the prebuilt runtime arena all live in a single backing allocation; + // setup_static_arena reserves the three regions and commits in one shot. + // Owned by DeviceRunner across runs — do NOT record in tensor_pairs_; the + // free is deferred to DeviceRunner::finalize(). The runtime-arena size is + // determined by replaying the reserve sequence on a host-side arena. 
uint64_t total_heap_size = eff_heap_size * PTO2_MAX_RING_DEPTH; uint64_t sm_size = PTO2SharedMemoryHandle::calculate_size(eff_task_window_size); + int32_t eff_dep_pool_capacity = + runtime->dep_pool_size ? static_cast(runtime->dep_pool_size) : PTO2_DEP_LIST_POOL_SIZE; + + int64_t t_prebuilt_start = _now_ms(); + DeviceArena host_arena; // libc malloc backend by default + PTO2RuntimeArenaLayout layout = runtime_reserve_layout(host_arena, eff_task_window_size, eff_dep_pool_capacity); + if (host_arena.commit(DeviceArena::kDefaultBaseAlign) == nullptr) { + LOG_ERROR("Failed to commit host arena for prebuilt runtime image"); + return -1; + } + int64_t t_setup_start = _now_ms(); - if (runtime->host_api.setup_static_arena(total_heap_size, sm_size) != 0) { + if (runtime->host_api.setup_static_arena(total_heap_size, sm_size, layout.arena_size) != 0) { LOG_ERROR("Failed to setup pooled static arena"); return -1; } @@ -303,9 +317,48 @@ extern "C" int bind_prepared_to_runtime_impl( } runtime->set_gm_sm_ptr(sm_ptr); + void *runtime_arena_dev = runtime->host_api.acquire_pooled_runtime_arena(); + if (runtime_arena_dev == nullptr) { + LOG_ERROR("Failed to acquire pooled runtime arena"); + return -1; + } + // Set up device orchestration state runtime->set_orch_args(device_args); + // ------------------------------------------------------------------------- + // Build the prebuilt runtime-arena image on host. + // + // We pre-compute every byte the AICPU's runtime arena would otherwise have + // to write at boot: layout offsets, sub-structure init data, and pointers + // back to the SM / GM heap. Then we rtMemcpy the image into the pooled + // runtime-arena region that DeviceRunner keeps alive across runs. AICPU + // boot becomes attach + wire (cheap pointer fixup) + sm_handle->init (SM + // reset) + a handful of device-only field fixups. 
+ // ------------------------------------------------------------------------- + PTO2Runtime *rt = + runtime_init_data_from_layout(host_arena, layout, PTO2_MODE_EXECUTE, sm_ptr, sm_size, gm_heap, eff_heap_size); + if (rt == nullptr) { + LOG_ERROR("runtime_init_data_from_layout failed"); + return -1; + } + runtime_wire_arena_pointers(host_arena, layout, rt); + + // Stash the prebuilt metadata inside the PTO2Runtime image so the AICPU + // picks them up directly via the pooled buffer after rtMemcpy. The host + // Runtime also carries the pointers so the AICPU can locate the + // PTO2Runtime before it does anything else (no chicken-and-egg). + rt->prebuilt_arena_base = runtime_arena_dev; + rt->prebuilt_layout = layout; + + int rc_upload = runtime->host_api.copy_to_device(runtime_arena_dev, host_arena.base(), layout.arena_size); + if (rc_upload != 0) { + LOG_ERROR("Failed to rtMemcpy prebuilt runtime arena to device (rc=%d)", rc_upload); + return -1; + } + runtime->set_prebuilt_arena(runtime_arena_dev, layout.off_runtime); + int64_t t_prebuilt_end = _now_ms(); + LOG_INFO_V0("Device orchestration ready: %d tensors + %d scalars", tensor_count, scalar_count); int64_t t_total_end = _now_ms(); @@ -313,6 +366,7 @@ extern "C" int bind_prepared_to_runtime_impl( LOG_INFO_V0("TIMING: static_arena_setup = %" PRId64 "ms", t_setup_end - t_setup_start); LOG_INFO_V0("TIMING: gm_heap_acquire = %" PRId64 "ms", t_heap_end - t_heap_start); LOG_INFO_V0("TIMING: shared_mem_acquire = %" PRId64 "ms", t_sm_end - t_sm_start); + LOG_INFO_V0("TIMING: prebuilt_runtime_arena = %" PRId64 "ms", t_prebuilt_end - t_prebuilt_start); LOG_INFO_V0("TIMING: total_init_runtime_impl = %" PRId64 "ms", t_total_end - t_total_start); return 0; diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_orchestrator.cpp b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_orchestrator.cpp index fbc07f53f..f80c7a655 100644 --- a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_orchestrator.cpp +++ 
b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_orchestrator.cpp @@ -384,89 +384,6 @@ static bool prepare_task( return true; } -// ============================================================================= -// Orchestrator Initialization -// ============================================================================= - -PTO2OrchestratorLayout PTO2OrchestratorState::reserve_layout( - DeviceArena &arena, const int32_t task_window_sizes[PTO2_MAX_RING_DEPTH], int32_t dep_pool_capacity -) { - PTO2OrchestratorLayout layout{}; - layout.dep_pool_capacity = dep_pool_capacity; - layout.scope_tasks_cap = PTO2_SCOPE_TASKS_CAP; - layout.scope_stack_capacity = PTO2_MAX_SCOPE_DEPTH; - - for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) { - const size_t fanin_pool_bytes = - PTO2_ALIGN_UP(static_cast(dep_pool_capacity) * sizeof(PTO2FaninSpillEntry), PTO2_ALIGN_SIZE); - layout.off_fanin_pool[r] = arena.reserve(fanin_pool_bytes, PTO2_ALIGN_SIZE); - } - layout.off_scope_tasks = arena.reserve( - static_cast(layout.scope_tasks_cap) * sizeof(PTO2TaskSlotState *), alignof(PTO2TaskSlotState *) - ); - layout.off_scope_begins = - arena.reserve(static_cast(layout.scope_stack_capacity) * sizeof(int32_t), alignof(int32_t)); - layout.tensor_map = PTO2TensorMap::reserve_layout_default(arena, task_window_sizes); - return layout; -} - -bool PTO2OrchestratorState::init_from_layout( - const PTO2OrchestratorLayout &layout, DeviceArena &arena, PTO2SharedMemoryHeader *sm_header_arg, void *gm_heap, - uint64_t heap_size -) { - auto *orch = this; - *orch = PTO2OrchestratorState{}; - - orch->sm_header = sm_header_arg; - orch->gm_heap_base = gm_heap; - orch->gm_heap_size = heap_size * PTO2_MAX_RING_DEPTH; - orch->fatal = false; - - for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) { - void *ring_heap_base = reinterpret_cast(gm_heap) + r * heap_size; - auto &ring = sm_header_arg->rings[r]; - - orch->rings[r].task_allocator.init( - ring.task_descriptors, ring.task_window_size, &ring.fc.current_task_index, 
&ring.fc.last_task_alive, - ring_heap_base, heap_size, &sm_header_arg->orch_error_code - ); - - const size_t fanin_pool_bytes = - PTO2_ALIGN_UP(static_cast(layout.dep_pool_capacity) * sizeof(PTO2FaninSpillEntry), PTO2_ALIGN_SIZE); - auto *fanin_entries = static_cast(arena.region_ptr(layout.off_fanin_pool[r])); - // aligned_zalloc-equivalent: pool relies on zeroed entries. - memset(fanin_entries, 0, fanin_pool_bytes); - orch->rings[r].fanin_pool.init(fanin_entries, layout.dep_pool_capacity, &sm_header_arg->orch_error_code); - } - - if (!orch->tensor_map.init_from_layout(layout.tensor_map, arena)) { - return false; - } - orch->tensor_map.orch = orch; - - orch->scope_tasks = static_cast(arena.region_ptr(layout.off_scope_tasks)); - orch->scope_begins = static_cast(arena.region_ptr(layout.off_scope_begins)); - orch->scope_tasks_size = 0; - orch->scope_tasks_capacity = layout.scope_tasks_cap; - orch->scope_stack_top = -1; - orch->scope_stack_capacity = layout.scope_stack_capacity; - orch->manual_begin_depth = PTO2_MAX_SCOPE_DEPTH; - - return true; -} - -void PTO2OrchestratorState::destroy() { - auto *orch = this; - orch->tensor_map.destroy(); - for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) { - orch->rings[r].fanin_pool.base = nullptr; - } - orch->scope_tasks = nullptr; - orch->scope_begins = nullptr; -} - -void PTO2OrchestratorState::set_scheduler(PTO2SchedulerState *scheduler) { this->scheduler = scheduler; } - // ============================================================================= // Scope Management // ============================================================================= diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_orchestrator.h b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_orchestrator.h index 37fd0dcac..6e67cb597 100644 --- a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_orchestrator.h +++ b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_orchestrator.h @@ -142,14 +142,21 @@ struct PTO2OrchestratorState 
{ int32_t dep_pool_capacity = PTO2_DEP_LIST_POOL_SIZE ); - // Phase 3: bind region pointers, wire per-ring task_allocator + fanin_pool - // and tensor_map. Arena must be committed; layout must come from - // reserve_layout() against the same arena. - bool init_from_layout( - const PTO2OrchestratorLayout &layout, DeviceArena &arena, PTO2SharedMemoryHeader *sm_header, void *gm_heap, - uint64_t heap_size + // Phase 3a: write everything *except* arena-internal pointer fields. + // sm_dev_base is the SM device address (only stored, never dereferenced); + // task_window_size feeds the per-ring SM address arithmetic. Safe to call + // on a host arena that holds the prebuilt image. + bool init_data_from_layout( + const PTO2OrchestratorLayout &layout, DeviceArena &arena, void *sm_dev_base, void *gm_heap, uint64_t heap_size, + uint64_t task_window_size ); + // Phase 3b: write the arena-internal pointer fields (scope_tasks, + // scope_begins, rings[].fanin_pool.base, tensor_map.{buckets,entry_pool, + // free_entry_list,task_entry_heads,orch}, scheduler reference). + // Idempotent — host runs once on the image, AICPU runs once after attach. + void wire_arena_pointers(const PTO2OrchestratorLayout &layout, DeviceArena &arena, PTO2SchedulerState *scheduler); + // Forget pointers; arena owns the backing buffers. void destroy(); void set_scheduler(PTO2SchedulerState *scheduler); diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_ring_buffer.h b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_ring_buffer.h index 5a3e3d3d3..abd2a7510 100644 --- a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_ring_buffer.h +++ b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_ring_buffer.h @@ -68,10 +68,22 @@ class PTO2TaskAllocator { public: /** * Initialize the allocator with task ring and heap ring resources. 
+ * + * All pointer arguments are device addresses (live in SM / GM heap); this + * function only stores them, no dereferences, so it is safe to invoke + * from host code that constructs a prebuilt arena image. + * + * Production callers leave `initial_local_task_id` at 0: the SM ring + * flow-control counters that current_index_ptr / last_alive_ptr point at + * start at zero (PTO2RingFlowControl::init() runs on the AICPU during SM + * reset), so we keep local_task_id_ aligned with that without reading the + * SM. Tests that drive SM state directly may pass a non-zero seed to + * exercise corner cases like task IDs near INT32_MAX. */ void init( PTO2TaskDescriptor *descriptors, int32_t window_size, std::atomic *current_index_ptr, - std::atomic *last_alive_ptr, void *heap_base, uint64_t heap_size, std::atomic *error_code_ptr + std::atomic *last_alive_ptr, void *heap_base, uint64_t heap_size, std::atomic *error_code_ptr, + int32_t initial_local_task_id = 0 ) { descriptors_ = descriptors; window_size_ = window_size; @@ -81,7 +93,7 @@ class PTO2TaskAllocator { heap_base_ = heap_base; heap_size_ = heap_size; error_code_ptr_ = error_code_ptr; - local_task_id_ = current_index_ptr->load(std::memory_order_relaxed); + local_task_id_ = initial_local_task_id; heap_top_ = 0; heap_tail_ = 0; last_alive_seen_ = 0; diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_runtime2.cpp b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_runtime2.cpp index c801d5c15..f39bac365 100644 --- a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_runtime2.cpp +++ b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_runtime2.cpp @@ -249,81 +249,19 @@ static const PTO2RuntimeOps s_runtime_ops = { }; // ============================================================================= -// Runtime Creation and Destruction +// Runtime Lifecycle (AICPU-only fixup) // ============================================================================= - -PTO2Runtime *runtime_create_from_sm( 
- PTO2RuntimeMode mode, void *sm_base, uint64_t sm_size, uint64_t task_window_size, void *gm_heap, uint64_t heap_size, - DeviceArena &arena, int32_t dep_pool_capacity -) { - if (!sm_base || sm_size == 0) return nullptr; - - // Phase 1: layout. Reserve every sub-region the runtime needs (including - // the SM handle wrapper itself) without touching memory yet. - int32_t task_window_sizes[PTO2_MAX_RING_DEPTH]; - for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) { - task_window_sizes[r] = static_cast(task_window_size); - } - const size_t off_sm_handle = arena.reserve(sizeof(PTO2SharedMemoryHandle), alignof(PTO2SharedMemoryHandle)); - PTO2OrchestratorLayout orch_layout = - PTO2OrchestratorState::reserve_layout(arena, task_window_sizes, dep_pool_capacity); - PTO2SchedulerLayout sched_layout = PTO2SchedulerState::reserve_layout(arena, dep_pool_capacity); - const size_t off_runtime = arena.reserve(sizeof(PTO2Runtime), PTO2_ALIGN_SIZE); - const size_t off_mailbox = arena.reserve(sizeof(AICoreCompletionMailbox), alignof(AICoreCompletionMailbox)); - - // Phase 2: single backing allocation. - if (arena.commit(DeviceArena::kDefaultBaseAlign) == nullptr) return nullptr; - - // Phase 3: bind region pointers and initialize. - PTO2Runtime *rt = static_cast(arena.region_ptr(off_runtime)); - memset(rt, 0, sizeof(*rt)); // calloc-equivalent for the runtime header. - - // Initialize the SM handle wrapper in-place on its arena region before - // anything that reads sm_handle->header (orchestrator / scheduler init). - rt->sm_handle = static_cast(arena.region_ptr(off_sm_handle)); - memset(rt->sm_handle, 0, sizeof(*rt->sm_handle)); - if (!rt->sm_handle->init(sm_base, sm_size, task_window_size, heap_size)) { - arena.release(); - return nullptr; - } - +// +// Layout / init_data / wire / destroy live in +// runtime/shared/pto_runtime2_init.cpp so the host build can pre-populate the +// prebuilt arena image. 
The pieces below — wiring the ops table and the +// SPMD core counts — depend on the device-side s_runtime_ops global and the +// AICPU SchedulerContext respectively, so they remain in the AICPU build. + +void runtime_finalize_after_wire(PTO2Runtime *rt, int32_t aic_count, int32_t aiv_count) { rt->ops = &s_runtime_ops; - rt->mode = mode; - rt->gm_heap = gm_heap; - rt->gm_heap_size = heap_size > 0 ? heap_size * PTO2_MAX_RING_DEPTH : 0; - rt->gm_heap_owned = false; - - if (!rt->orchestrator.init_from_layout(orch_layout, arena, rt->sm_handle->header, gm_heap, heap_size)) { - arena.release(); - return nullptr; - } - if (!rt->scheduler.init_from_layout(sched_layout, arena, rt->sm_handle->header)) { - rt->orchestrator.destroy(); - arena.release(); - return nullptr; - } - rt->orchestrator.set_scheduler(&rt->scheduler); - - rt->aicore_mailbox = static_cast(arena.region_ptr(off_mailbox)); - memset(rt->aicore_mailbox, 0, sizeof(*rt->aicore_mailbox)); - - return rt; -} - -void runtime_destroy(PTO2Runtime *rt, DeviceArena &arena) { - if (!rt) { - arena.release(); // safe: idempotent if nothing's committed. - return; - } - - rt->scheduler.destroy(); - rt->orchestrator.destroy(); - rt->aicore_mailbox = nullptr; // arena-owned. - rt->sm_handle = nullptr; // wrapper lives in arena; release() reclaims it. - - // arena.release() frees the single backing buffer that holds rt, - // mailbox, sm_handle, orchestrator and scheduler sub-regions in one shot. 
- arena.release(); + rt->orchestrator.total_cluster_count = aic_count; + rt->orchestrator.total_aiv_count = aiv_count; } void runtime_set_mode(PTO2Runtime *rt, PTO2RuntimeMode mode) { diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_runtime2.h b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_runtime2.h index 5709a85b7..169937f82 100644 --- a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_runtime2.h +++ b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_runtime2.h @@ -91,6 +91,30 @@ struct PTO2RuntimeOps { TaskOutputTensors (*submit_dummy_task)(PTO2Runtime *rt, const Arg &args); }; +/** + * Layout descriptor for the prebuilt runtime arena. Holds all sub-region + * offsets (orchestrator / scheduler / sm_handle wrapper / runtime header / + * AICore mailbox) plus the layout-defining capacities. Produced once on the + * host by runtime_reserve_layout(); consumed by runtime_init_data_from_layout + * and runtime_wire_arena_pointers. + */ +struct PTO2RuntimeArenaLayout { + size_t off_sm_handle{0}; + PTO2OrchestratorLayout orch; + PTO2SchedulerLayout sched; + size_t off_runtime{0}; + size_t off_mailbox{0}; + + // Cached parameters (re-used by init_data + wire stages). + uint64_t task_window_size{0}; + uint64_t heap_size{0}; + int32_t dep_pool_capacity{0}; + + // Total arena byte size post-commit. Used by host to size the prebuilt + // image buffer and as the rtMemcpy length. + size_t arena_size{0}; +}; + /** * PTO Runtime2 context * @@ -118,6 +142,16 @@ struct PTO2Runtime { // Statistics int64_t total_cycles; + + // Prebuilt-arena fast path metadata. `prebuilt_arena_base` is the device + // address of the runtime arena (the buffer that holds *this* PTO2Runtime + // at offset prebuilt_layout.off_runtime). `prebuilt_layout` carries every + // offset wire_arena_pointers needs at AICPU boot, so the AICPU can + // reconstruct all arena-internal pointer fields without re-running + // init_data. 
Populated by the host's runtime_init_data_from_layout + + // runtime_wire_arena_pointers; read by aicpu_executor.cpp. + void *prebuilt_arena_base{nullptr}; + PTO2RuntimeArenaLayout prebuilt_layout; }; // ============================================================================= @@ -125,38 +159,60 @@ struct PTO2Runtime { // ============================================================================= /** - * Create runtime from caller-provided GM SM buffer + GM heap. - * - * All AICPU-side runtime state (PTO2SharedMemoryHandle wrapper, PTO2Runtime, - * AICoreCompletionMailbox, plus the orchestrator/scheduler/tensor_map - * sub-regions) is laid out on the supplied arena and committed in a single - * backing allocation — including the SM handle wrapper itself. The arena is - * owned by the caller (typically the per-Worker AicpuExecutor); - * runtime_destroy() calls arena.release() once to free the lot. + * Phase 1 — declare every sub-region (sm_handle wrapper, orchestrator / + * scheduler / tensor_map / mailbox / PTO2Runtime header) on the supplied + * arena. Pure arithmetic; does not touch device memory and may run on host. + * Returns the layout descriptor; caller commits/attaches the arena before + * Phase 2/3. + */ +PTO2RuntimeArenaLayout runtime_reserve_layout( + DeviceArena &arena, uint64_t task_window_size, int32_t dep_pool_capacity = PTO2_DEP_LIST_POOL_SIZE +); + +/** + * Phase 2 — write the data half of the runtime arena: standalone fields, + * memset'd arena regions, sub-structure initializers, and SM-side device + * pointers. The arena must already be committed (or attached); writes go + * into arena.base() + sub-region offsets. * - * `sm_base` / `sm_size` describe the SM buffer that host has already placed - * for the runtime to use; the SM handle wrapper is constructed in-place on - * an arena-reserved region pointing at that buffer. + * `sm_dev_base` / `gm_heap_dev_base` are device addresses; we only store + * them (never dereference). 
Safe to run on a host arena that owns a host + * mirror of the runtime image — the resulting buffer is rtMemcpy-ready. * - * @param mode Execution mode - * @param sm_base Pre-allocated SM buffer base (host-owned) - * @param sm_size Size of the SM buffer in bytes - * @param task_window_size Per-ring task window size used to lay out SM - * @param gm_heap GM heap base for output buffers (or NULL if not used) - * @param heap_size GM heap size in bytes - * @param arena Caller-owned arena that sources all runtime sub-regions. - * Must be freshly constructed (no prior commit) — - * runtime_create_from_sm reserves + commits internally. - * @return Runtime context, or NULL on failure - */ -PTO2Runtime *runtime_create_from_sm( - PTO2RuntimeMode mode, void *sm_base, uint64_t sm_size, uint64_t task_window_size, void *gm_heap, uint64_t heap_size, - DeviceArena &arena, int32_t dep_pool_capacity = PTO2_DEP_LIST_POOL_SIZE + * Returns the PTO2Runtime* that sits at layout.off_runtime within the arena. + * Caller must follow up with runtime_wire_arena_pointers; rt->ops and the + * AICore-side count fields are left untouched and must be filled by the + * AICPU at boot. + */ +PTO2Runtime *runtime_init_data_from_layout( + DeviceArena &arena, const PTO2RuntimeArenaLayout &layout, PTO2RuntimeMode mode, void *sm_dev_base, uint64_t sm_size, + void *gm_heap_dev_base, uint64_t heap_size ); /** - * Destroy runtime and free all resources. arena.release() is the actual - * memory free; the rt pointer is no longer valid afterward. + * Phase 3 — wire every arena-internal pointer field (rt->sm_handle, + * rt->aicore_mailbox, orchestrator.{scope_tasks, scope_begins, scheduler, + * tensor_map.*, rings[].fanin_pool.base}, scheduler.{ready_queues, dep_pool, + * wiring.queue}) so each holds arena.base() + offset. Idempotent — runs on + * both host (writing host-mirror addresses) and AICPU (writing device + * addresses) sides. 
+ */ +void runtime_wire_arena_pointers(DeviceArena &arena, const PTO2RuntimeArenaLayout &layout, PTO2Runtime *rt); + +/** + * AICPU-only Phase 4 — fill in the few fields the host could not know at + * prebuilt-image build time: the ops table (s_runtime_ops is a device-side + * file-local global, host cannot resolve its device address) and the + * orchestrator's core counts (depend on the executor's scheduler context). + * Call once per boot after runtime_wire_arena_pointers. + */ +void runtime_finalize_after_wire(PTO2Runtime *rt, int32_t aic_count, int32_t aiv_count); + +/** + * Destroy runtime. With the prebuilt-arena fast path the arena buffer is + * pooled across runs by DeviceRunner, so we never call arena.release() + * here — the destructor only forgets sub-structure pointers (idempotent + * cleanup). */ void runtime_destroy(PTO2Runtime *rt, DeviceArena &arena); diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_shared_memory.h b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_shared_memory.h index 5e1b6faa8..c8de35ba6 100644 --- a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_shared_memory.h +++ b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_shared_memory.h @@ -187,3 +187,64 @@ struct PTO2SharedMemoryHandle { void setup_pointers(uint64_t task_window_size); void setup_pointers_per_ring(const uint64_t task_window_sizes[PTO2_MAX_RING_DEPTH]); }; + +// ============================================================================= +// SM Device Layout Helpers +// ============================================================================= +// +// When the host pre-builds a runtime-arena image, it needs the device-side +// addresses of several SM sub-fields (ring flow-control counters, +// task_descriptors arrays, orch_error_code) so it can wire them into the +// orchestrator / scheduler init_data path without dereferencing the SM — +// the SM lives in device memory and cannot be touched from host. 
+// +// These helpers compute those addresses by offset arithmetic on the SM +// device base. Pure pointer math, no loads/stores; safe to call from host. +// The same arithmetic happens on AICPU too (via PTO2SharedMemoryHandle's +// own setup_pointers), so values are guaranteed consistent across sides. +namespace pto2_sm_layout { + +inline std::atomic *orch_error_code_addr(void *sm_dev_base) noexcept { + return reinterpret_cast *>( + static_cast(sm_dev_base) + offsetof(PTO2SharedMemoryHeader, orch_error_code) + ); +} + +inline PTO2SharedMemoryRingHeader *ring_header_addr(void *sm_dev_base, int ring_id) noexcept { + return reinterpret_cast( + static_cast(sm_dev_base) + offsetof(PTO2SharedMemoryHeader, rings) + + static_cast(ring_id) * sizeof(PTO2SharedMemoryRingHeader) + ); +} + +inline std::atomic *ring_current_task_index_addr(void *sm_dev_base, int ring_id) noexcept { + return reinterpret_cast *>( + reinterpret_cast(ring_header_addr(sm_dev_base, ring_id)) + offsetof(PTO2SharedMemoryRingHeader, fc) + + offsetof(PTO2RingFlowControl, current_task_index) + ); +} + +inline std::atomic *ring_last_task_alive_addr(void *sm_dev_base, int ring_id) noexcept { + return reinterpret_cast *>( + reinterpret_cast(ring_header_addr(sm_dev_base, ring_id)) + offsetof(PTO2SharedMemoryRingHeader, fc) + + offsetof(PTO2RingFlowControl, last_task_alive) + ); +} + +// Walk the per-ring SM layout (same arithmetic as setup_pointers_per_ring) +// to compute ring `ring_id`'s task_descriptors device address. Uniform +// per-ring task_window_size; matches the production callsite which always +// passes a uniform window size to runtime_reserve_layout.
+inline PTO2TaskDescriptor * +ring_task_descriptors_addr(void *sm_dev_base, uint64_t task_window_size, int ring_id) noexcept { + char *p = static_cast(sm_dev_base); + p += PTO2_ALIGN_UP(sizeof(PTO2SharedMemoryHeader), PTO2_ALIGN_SIZE); + for (int r = 0; r < ring_id; r++) { + p += PTO2_ALIGN_UP(task_window_size * sizeof(PTO2TaskDescriptor), PTO2_ALIGN_SIZE); + p += PTO2_ALIGN_UP(task_window_size * sizeof(PTO2TaskPayload), PTO2_ALIGN_SIZE); + p += PTO2_ALIGN_UP(task_window_size * sizeof(PTO2TaskSlotState), PTO2_ALIGN_SIZE); + } + return reinterpret_cast(p); +} + +} // namespace pto2_sm_layout diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_tensormap.h b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_tensormap.h index cf1f2d28d..11decdf4e 100644 --- a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_tensormap.h +++ b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_tensormap.h @@ -435,11 +435,22 @@ struct PTO2TensorMap { reserve_layout_default(DeviceArena &arena, const int32_t task_window_sizes[PTO2_MAX_RING_DEPTH]); /** - * Phase 3: bind region pointers and initialize state. The arena must already - * be committed; layout must have been produced by reserve_layout() against - * the same arena. + * Phase 3a: write everything *except* arena-internal pointer fields + * (buckets, entry_pool, free_entry_list, task_entry_heads[r], orch). + * Uses arena.region_ptr to address the arena regions for data writes, + * but does not store those addresses in struct fields. Safe to call on + * a host arena that holds the prebuilt image. */ - bool init_from_layout(const PTO2TensorMapLayout &layout, DeviceArena &arena); + bool init_data_from_layout(const PTO2TensorMapLayout &layout, DeviceArena &arena); + + /** + * Phase 3b: write the arena-internal pointer fields. Idempotent; + * called once on the host arena and once on the AICPU after attach. 
+ * `parent_orch` is the device address (or host-mirror address) of the + * enclosing PTO2OrchestratorState; we store it in tensor_map.orch + * (self-pointer within the same arena). + */ + void wire_arena_pointers(const PTO2TensorMapLayout &layout, DeviceArena &arena, PTO2OrchestratorState *parent_orch); /** * Tear down state. Does not free memory — the arena owns the backing diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/runtime.h b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/runtime.h index 73b6027c4..117621ca2 100644 --- a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/runtime.h +++ b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/runtime.h @@ -124,14 +124,26 @@ struct HostApi { // allocation. Must be called once before acquire_pooled_gm_heap / // acquire_pooled_gm_sm. Idempotent on identical sizes; returns 0 on // success, -1 on allocation failure. - int (*setup_static_arena)(size_t gm_heap_size, size_t gm_sm_size); + // Lay out three pooled regions in a single backing device allocation: + // GM heap, PTO2 shared memory, and the trb prebuilt runtime arena. + // `runtime_arena_size == 0` skips the last region (hbg has no prebuilt + // runtime arena); otherwise call this before acquire_pooled_runtime_arena. + // Returns 0 on success, -1 on allocation failure. + int (*setup_static_arena)(size_t gm_heap_size, size_t gm_sm_size, size_t runtime_arena_size); // Return the per-Worker pooled pointer for the PTO2 GM heap / shared - // memory. The static arena must already be committed via - // setup_static_arena; the returned pointer is owned by the DeviceRunner - // and freed in `DeviceRunner::finalize()` — do NOT pass it to - // device_free or record it in `tensor_pairs_`. + // memory / prebuilt runtime arena. The static arena must already be + // committed via setup_static_arena; the returned pointer is owned by + // the DeviceRunner and freed in `DeviceRunner::finalize()` — do NOT + // pass it to device_free or record it in `tensor_pairs_`.
+ // + // acquire_pooled_runtime_arena is trb-only — the host side reserves the + // runtime-arena region only when setup_static_arena is invoked with + // runtime_arena_size > 0. hbg's runtime_maker.cpp must not call it + // (setup_static_arena(...,0) leaves the offset unreserved, and the + // returned region_ptr would be undefined). void *(*acquire_pooled_gm_heap)(); void *(*acquire_pooled_gm_sm)(); + void *(*acquire_pooled_runtime_arena)(); // Single-shot upload of the entire ChipCallable buffer. `callable` is a // `const ChipCallable *` (declared void* to avoid pulling task_interface // headers into runtime.h). DeviceRunner walks child_offsets_ to compute @@ -211,6 +223,13 @@ class Runtime { void *slot_states_ptr_; // Pointer to PTO2TaskSlotState array (scheduler-private, for profiling) ChipStorageTaskArgs orch_args_storage_; // Copy of args for device + // Prebuilt-arena fast path (trb only). Set by the host before rtMemcpy'ing + // Runtime to device; AICPU reads them in the boot path to skip + // runtime_create_from_sm and reuse the pooled, prebuilt arena buffer + // (already populated by runtime_init_data_from_layout + wire on host). + void *prebuilt_arena_base_; + size_t prebuilt_runtime_offset_; + // Device orchestration SO (for dlopen on AICPU thread 3). // The SO bytes themselves live in a separately-allocated device buffer // owned by DeviceRunner; only the metadata below travels inside Runtime. @@ -247,6 +266,16 @@ class Runtime { void set_slot_states_ptr(void *p); void set_orch_args(const ChipStorageTaskArgs &args); + // Prebuilt-arena fast path (trb only). Set by host's + // bind_prepared_to_runtime_impl; consumed by AICPU at boot to attach a + // DeviceArena to `prebuilt_arena_base_` and pick up the PTO2Runtime at + // `prebuilt_arena_base_ + prebuilt_runtime_offset_`. Both stay zero on + // first construction (Runtime() ctor zeros them) so a non-prebuilt boot + // path can still detect "no prebuilt image set" via nullptr. 
+ void set_prebuilt_arena(void *arena_base, size_t runtime_off); + void *get_prebuilt_arena_base() const; + size_t get_prebuilt_runtime_offset() const; + // Device orchestration SO binary (for dlopen on AICPU thread 3) void set_dev_orch_so(uint64_t dev_addr, uint64_t size); uint64_t get_dev_orch_so_addr() const; diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler/pto_scheduler.cpp b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler/pto_scheduler.cpp index f497b8fd8..2d777e9b0 100644 --- a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler/pto_scheduler.cpp +++ b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler/pto_scheduler.cpp @@ -61,153 +61,6 @@ PTO2SchedProfilingData scheduler_get_profiling(int thread_idx) { } #endif -// ============================================================================= -// Ready Queue Implementation -// ============================================================================= - -size_t ready_queue_reserve_layout(DeviceArena &arena, uint64_t capacity) { - // Align the slots[] base to a full cache line so MPMC CAS traffic on the - // first slot cannot false-share with whatever region sits in front of us - // (e.g. orchestrator tensormap heads written by the orch thread). 
- return arena.reserve(capacity * sizeof(PTO2ReadyQueueSlot), PTO2_ALIGN_SIZE); -} - -bool ready_queue_init_from_layout(PTO2ReadyQueue *queue, DeviceArena &arena, size_t slots_off, uint64_t capacity) { - queue->slots = static_cast(arena.region_ptr(slots_off)); - queue->capacity = capacity; - queue->mask = capacity - 1; - queue->enqueue_pos.store(0, std::memory_order_relaxed); - queue->dequeue_pos.store(0, std::memory_order_relaxed); - - for (uint64_t i = 0; i < capacity; i++) { - queue->slots[i].sequence.store((int64_t)i, std::memory_order_relaxed); - queue->slots[i].slot_state = nullptr; - } - - return true; -} - -void ready_queue_destroy(PTO2ReadyQueue *queue) { - // Arena owns the slots[] buffer; just forget the pointer. - queue->slots = nullptr; -} - -// ============================================================================= -// Scheduler Initialization -// ============================================================================= - -bool PTO2SchedulerState::RingSchedState::init(PTO2SharedMemoryHeader *sm_header, int32_t ring_id) { - ring = &sm_header->rings[ring_id]; - last_task_alive = 0; - advance_lock.store(0, std::memory_order_relaxed); - - // Initialize all per-task slot state fields. - // bind_ring() sets the ring_id only — payload/task pointers are re-bound - // by orch::prepare_task on every submit (their value is per-slot constant - // but pinning them here would cost O(task_window_size) at startup). - // reset_for_reuse() sets dynamic fields to reclaim defaults (fanout_count=1, - // rest zero) so the first submit needs no reset. 
- for (uint64_t i = 0; i < ring->task_window_size; i++) { - ring->slot_states[i].bind_ring(static_cast(ring_id)); - ring->slot_states[i].reset_for_reuse(); - ring->slot_states[i].fanin_count = 0; - ring->slot_states[i].active_mask = ActiveMask{}; - } - - return true; -} - -void PTO2SchedulerState::RingSchedState::destroy() { ring = nullptr; } - -PTO2SchedulerLayout PTO2SchedulerState::reserve_layout(DeviceArena &arena, int32_t dep_pool_capacity) { - PTO2SchedulerLayout layout{}; - layout.ready_queue_capacity = PTO2_READY_QUEUE_SIZE; - layout.spsc_capacity = PTO2_WRIRING_QUEUE_SIZE; - layout.dep_pool_capacity = dep_pool_capacity; - - for (int i = 0; i < PTO2_NUM_RESOURCE_SHAPES; i++) { - layout.off_ready_queue_slots[i] = ready_queue_reserve_layout(arena, PTO2_READY_QUEUE_SIZE); - } - layout.off_dummy_ready_queue_slots = ready_queue_reserve_layout(arena, PTO2_READY_QUEUE_SIZE); - for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) { - // Force a cache-line base so writes from scheduler thread 0 (sole - // writer of this ring's dep_pool) do not invalidate adjacent - // multi-threaded regions like ready_queue.slots. - layout.off_dep_pool_entries[r] = - arena.reserve(static_cast(dep_pool_capacity) * sizeof(PTO2DepListEntry), PTO2_ALIGN_SIZE); - } - layout.off_wiring_spsc_buffer = PTO2SpscQueue::reserve_layout(arena, PTO2_WRIRING_QUEUE_SIZE); - return layout; -} - -bool PTO2SchedulerState::init_from_layout( - const PTO2SchedulerLayout &layout, DeviceArena &arena, PTO2SharedMemoryHeader *sm_header_arg -) { - PTO2SchedulerState *sched = this; - sched->sm_header = sm_header_arg; -#if PTO2_SCHED_PROFILING - sched->tasks_completed.store(0, std::memory_order_relaxed); - sched->tasks_consumed.store(0, std::memory_order_relaxed); -#endif - - // Per-ring scheduler state — no arena buffers, just field init. 
- for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) { - if (!sched->ring_sched_states[r].init(sm_header_arg, r)) { - return false; - } - } - - // Ready queues — one per resource shape plus DUMMY. - for (int i = 0; i < PTO2_NUM_RESOURCE_SHAPES; i++) { - if (!ready_queue_init_from_layout( - &sched->ready_queues[i], arena, layout.off_ready_queue_slots[i], layout.ready_queue_capacity - )) { - return false; - } - } - if (!ready_queue_init_from_layout( - &sched->dummy_ready_queue, arena, layout.off_dummy_ready_queue_slots, layout.ready_queue_capacity - )) { - return false; - } - - // Per-ring dep_pool: PTO2DepListPool::init takes an externally-allocated - // base + capacity, so we just plumb the arena region into it. - for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) { - auto *dep_entries = static_cast(arena.region_ptr(layout.off_dep_pool_entries[r])); - // calloc-equivalent: pool expects entries zeroed at construction. - memset(dep_entries, 0, static_cast(layout.dep_pool_capacity) * sizeof(PTO2DepListEntry)); - sched->ring_sched_states[r].dep_pool.init( - dep_entries, layout.dep_pool_capacity, &sm_header_arg->orch_error_code - ); - } - - // Wiring SPSC queue (orchestrator push, scheduler thread 0 pop). 
- if (!sched->wiring.queue.init_from_layout(arena, layout.off_wiring_spsc_buffer, layout.spsc_capacity)) { - return false; - } - sched->wiring.batch_count = 0; - sched->wiring.batch_index = 0; - sched->wiring.backoff_counter = 0; - - return true; -} - -void PTO2SchedulerState::destroy() { - PTO2SchedulerState *sched = this; - for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) { - sched->ring_sched_states[r].destroy(); - sched->ring_sched_states[r].dep_pool.base = nullptr; - } - - sched->wiring.queue.destroy(); - - for (int i = 0; i < PTO2_NUM_RESOURCE_SHAPES; i++) { - ready_queue_destroy(&sched->ready_queues[i]); - } - ready_queue_destroy(&sched->dummy_ready_queue); -} - // ============================================================================= // Debug Utilities // ============================================================================= diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler/pto_scheduler.h b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler/pto_scheduler.h index 8d50681ba..828999113 100644 --- a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler/pto_scheduler.h +++ b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler/pto_scheduler.h @@ -409,7 +409,14 @@ struct alignas(64) PTO2ReadyQueue { // initialize sequence counters // destroy: forget the slots pointer (arena owns the buffer) size_t ready_queue_reserve_layout(DeviceArena &arena, uint64_t capacity); -bool ready_queue_init_from_layout(PTO2ReadyQueue *queue, DeviceArena &arena, size_t slots_off, uint64_t capacity); +// Writes everything *except* the arena-internal `slots` pointer field +// (sequences/positions on the slot array, capacity, mask). Uses +// arena.region_ptr(slots_off) only to address the slot array for writes; +// does NOT store the pointer in `queue->slots`. Call +// `ready_queue_wire_arena_pointers` afterwards to set the field itself. 
+bool ready_queue_init_data_from_layout(PTO2ReadyQueue *queue, DeviceArena &arena, size_t slots_off, uint64_t capacity); +// Stores queue->slots = arena.region_ptr(slots_off). Idempotent. +void ready_queue_wire_arena_pointers(PTO2ReadyQueue *queue, DeviceArena &arena, size_t slots_off); void ready_queue_destroy(PTO2ReadyQueue *queue); // ============================================================================= @@ -449,15 +456,17 @@ struct alignas(64) PTO2SpscQueue { return arena.reserve(capacity * sizeof(PTO2TaskSlotState *), PTO2_ALIGN_SIZE); } - // Bind buffer pointer + reset indices. The capacity must be a power of two - // and match the value passed to reserve_layout. - bool init_from_layout(DeviceArena &arena, size_t buffer_off, uint64_t capacity) { + // Writes everything except the arena-internal `buffer_` pointer field + // (zeros the slot pointer array, mask/head/tail). The host pre-builds the + // image without storing a host address in buffer_; the AICPU wires + // buffer_ at boot via wire_arena_pointers(). + bool init_data_from_layout(DeviceArena &arena, size_t buffer_off, uint64_t capacity) { if (capacity == 0 || (capacity & (capacity - 1)) != 0) return false; - buffer_ = static_cast(arena.region_ptr(buffer_off)); + auto *buf = static_cast(arena.region_ptr(buffer_off)); // calloc'd-equivalent: zero the slot pointers so spurious early pops // observe nullptr. for (uint64_t i = 0; i < capacity; i++) - buffer_[i] = nullptr; + buf[i] = nullptr; mask_ = capacity - 1; head_.store(0, std::memory_order_relaxed); tail_.store(0, std::memory_order_relaxed); @@ -466,6 +475,12 @@ struct alignas(64) PTO2SpscQueue { return true; } + // Wire the arena-internal pointer. Called by both host (with host arena) + // and AICPU (with device arena attached to the prebuilt image). + void wire_arena_pointers(DeviceArena &arena, size_t buffer_off) { + buffer_ = static_cast(arena.region_ptr(buffer_off)); + } + // Arena owns the buffer; here we only forget our pointer. 
+ void destroy() { buffer_ = nullptr; } @@ -563,7 +578,12 @@ struct PTO2SchedulerState { // --- Cache Line 1+: Thread 0 only (wiring dep_pool) --- alignas(64) PTO2DepListPool dep_pool; - bool init(PTO2SharedMemoryHeader *sm_header, int32_t ring_id); + // Initialize arena-internal data + arena-external pointers; does NOT + // store dep_pool.base (that lives in the runtime arena and is wired + // by PTO2SchedulerState::wire_arena_pointers). The `ring` field stores + // the device address of the SM ring header — computed via offset + // arithmetic, no SM dereference. + bool init_data_from_layout(void *sm_dev_base, int32_t ring_id, int32_t dep_pool_capacity); void destroy(); void sync_to_sm() { ring->fc.last_task_alive.store(last_task_alive, std::memory_order_release); } @@ -1042,13 +1062,24 @@ struct PTO2SchedulerState { // Phase 1: declare every sub-region (ready_queue slots, dummy queue slots, // per-ring dep_pool entries, wiring SPSC buffer) on the supplied arena. - // Capacities are baked into the returned layout; init_from_layout uses + // Capacities are baked into the returned layout; init_data_from_layout uses // the same values. static PTO2SchedulerLayout reserve_layout(DeviceArena &arena, int32_t dep_pool_capacity = PTO2_DEP_LIST_POOL_SIZE); - // Phase 3: bind region pointers and initialize state. The arena must be - // committed; layout must come from reserve_layout() against the same arena. - bool init_from_layout(const PTO2SchedulerLayout &layout, DeviceArena &arena, PTO2SharedMemoryHeader *sm_header); + // Phase 3a: write everything *except* arena-internal pointer fields. + // `sm_dev_base` is the device address of the SM (only stored, never + // dereferenced here). `task_window_size` is currently unused by the + // scheduler: the ring-header / orch_error_code addresses it derives are + // fixed offsets from `sm_dev_base`. Safe to call on a host arena that + // holds the prebuilt image buffer.
+ bool init_data_from_layout( + const PTO2SchedulerLayout &layout, DeviceArena &arena, void *sm_dev_base, uint64_t task_window_size + ); + + // Phase 3b: write the arena-internal pointer fields + // (ready_queues[].slots, dummy_ready_queue.slots, dep_pool.base for each + // ring, wiring.queue.buffer_). Called on both host and device sides. + void wire_arena_pointers(const PTO2SchedulerLayout &layout, DeviceArena &arena); // Forget per-region pointers; arena owns the backing memory. void destroy(); diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/shared/pto_runtime2_init.cpp b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/shared/pto_runtime2_init.cpp new file mode 100644 index 000000000..3efa313fd --- /dev/null +++ b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/shared/pto_runtime2_init.cpp @@ -0,0 +1,351 @@ +/* + * Copyright (c) PyPTO Contributors. + * This program is free software, you can redistribute it and/or modify it under the terms and conditions of + * CANN Open Software License Agreement Version 2.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + * ----------------------------------------------------------------------------------------------------------- + */ +/** + * Host/AICPU shared runtime-arena layout, init_data and wire implementations. + * + * Lives under runtime/shared/ so it is included in both the host_runtime.so + * build (host pre-populates the prebuilt arena image) and the aicpu_runtime + * build (AICPU runs wire_arena_pointers + destroy after attach). 
The + * device-only parts of pto_runtime2.cpp / pto_orchestrator.cpp / pto_scheduler.cpp + * (ops table, scope/submit/dispatch business logic, profiling) stay in their + * original files and the aicpu build only. + */ + +#include +#include + +#include "pto_orchestrator.h" +#include "pto_runtime2.h" +#include "pto_ring_buffer.h" +#include "pto_shared_memory.h" +#include "pto_tensormap.h" +#include "scheduler/pto_scheduler.h" + +// ============================================================================= +// Ready queue +// ============================================================================= + +size_t ready_queue_reserve_layout(DeviceArena &arena, uint64_t capacity) { + // Align the slots[] base to a full cache line so MPMC CAS traffic on the + // first slot cannot false-share with whatever region sits in front of us + // (e.g. orchestrator tensormap heads written by the orch thread). + return arena.reserve(capacity * sizeof(PTO2ReadyQueueSlot), PTO2_ALIGN_SIZE); +} + +bool ready_queue_init_data_from_layout(PTO2ReadyQueue *queue, DeviceArena &arena, size_t slots_off, uint64_t capacity) { + // Address the slots region for data writes without storing the pointer in + // queue->slots — that field is set by ready_queue_wire_arena_pointers. + auto *slots_arena = static_cast(arena.region_ptr(slots_off)); + queue->capacity = capacity; + queue->mask = capacity - 1; + queue->enqueue_pos.store(0, std::memory_order_relaxed); + queue->dequeue_pos.store(0, std::memory_order_relaxed); + + for (uint64_t i = 0; i < capacity; i++) { + slots_arena[i].sequence.store((int64_t)i, std::memory_order_relaxed); + slots_arena[i].slot_state = nullptr; + } + + return true; +} + +void ready_queue_wire_arena_pointers(PTO2ReadyQueue *queue, DeviceArena &arena, size_t slots_off) { + queue->slots = static_cast(arena.region_ptr(slots_off)); +} + +void ready_queue_destroy(PTO2ReadyQueue *queue) { + // Arena owns the slots[] buffer; just forget the pointer. 
+ queue->slots = nullptr; +} + +// ============================================================================= +// Scheduler +// ============================================================================= + +bool PTO2SchedulerState::RingSchedState::init_data_from_layout( + void *sm_dev_base, int32_t ring_id, int32_t /*dep_pool_capacity*/ +) { + // ring stores the device address of the SM ring header — pure offset + // arithmetic, no SM load. + ring = pto2_sm_layout::ring_header_addr(sm_dev_base, ring_id); + last_task_alive = 0; + advance_lock.store(0, std::memory_order_relaxed); + + // Per-slot SM-side initialization (bind_ring + reset_for_reuse + + // fanin_count/active_mask zero) lives in PTO2SharedMemoryHandle:: + // init_header_per_ring so the AICPU performs it during SM reset; host + // prebuilt-arena init skips SM access here. + + return true; +} + +void PTO2SchedulerState::RingSchedState::destroy() { ring = nullptr; } + +PTO2SchedulerLayout PTO2SchedulerState::reserve_layout(DeviceArena &arena, int32_t dep_pool_capacity) { + PTO2SchedulerLayout layout{}; + layout.ready_queue_capacity = PTO2_READY_QUEUE_SIZE; + layout.spsc_capacity = PTO2_WRIRING_QUEUE_SIZE; + layout.dep_pool_capacity = dep_pool_capacity; + + for (int i = 0; i < PTO2_NUM_RESOURCE_SHAPES; i++) { + layout.off_ready_queue_slots[i] = ready_queue_reserve_layout(arena, PTO2_READY_QUEUE_SIZE); + } + layout.off_dummy_ready_queue_slots = ready_queue_reserve_layout(arena, PTO2_READY_QUEUE_SIZE); + for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) { + // Force a cache-line base so writes from scheduler thread 0 (sole + // writer of this ring's dep_pool) do not invalidate adjacent + // multi-threaded regions like ready_queue.slots. 
+ layout.off_dep_pool_entries[r] = + arena.reserve(static_cast(dep_pool_capacity) * sizeof(PTO2DepListEntry), PTO2_ALIGN_SIZE); + } + layout.off_wiring_spsc_buffer = PTO2SpscQueue::reserve_layout(arena, PTO2_WRIRING_QUEUE_SIZE); + return layout; +} + +bool PTO2SchedulerState::init_data_from_layout( + const PTO2SchedulerLayout &layout, DeviceArena &arena, void *sm_dev_base, uint64_t /*task_window_size*/ +) { + PTO2SchedulerState *sched = this; + sched->sm_header = reinterpret_cast(sm_dev_base); +#if PTO2_SCHED_PROFILING + sched->tasks_completed.store(0, std::memory_order_relaxed); + sched->tasks_consumed.store(0, std::memory_order_relaxed); +#endif + + for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) { + if (!sched->ring_sched_states[r].init_data_from_layout(sm_dev_base, r, layout.dep_pool_capacity)) { + return false; + } + } + + for (int i = 0; i < PTO2_NUM_RESOURCE_SHAPES; i++) { + if (!ready_queue_init_data_from_layout( + &sched->ready_queues[i], arena, layout.off_ready_queue_slots[i], layout.ready_queue_capacity + )) { + return false; + } + } + if (!ready_queue_init_data_from_layout( + &sched->dummy_ready_queue, arena, layout.off_dummy_ready_queue_slots, layout.ready_queue_capacity + )) { + return false; + } + + auto *orch_err = pto2_sm_layout::orch_error_code_addr(sm_dev_base); + for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) { + auto *dep_entries = static_cast(arena.region_ptr(layout.off_dep_pool_entries[r])); + memset(dep_entries, 0, static_cast(layout.dep_pool_capacity) * sizeof(PTO2DepListEntry)); + sched->ring_sched_states[r].dep_pool.init(dep_entries, layout.dep_pool_capacity, orch_err); + } + + if (!sched->wiring.queue.init_data_from_layout(arena, layout.off_wiring_spsc_buffer, layout.spsc_capacity)) { + return false; + } + sched->wiring.batch_count = 0; + sched->wiring.batch_index = 0; + sched->wiring.backoff_counter = 0; + + return true; +} + +void PTO2SchedulerState::wire_arena_pointers(const PTO2SchedulerLayout &layout, DeviceArena &arena) { + 
PTO2SchedulerState *sched = this; + for (int i = 0; i < PTO2_NUM_RESOURCE_SHAPES; i++) { + ready_queue_wire_arena_pointers(&sched->ready_queues[i], arena, layout.off_ready_queue_slots[i]); + } + ready_queue_wire_arena_pointers(&sched->dummy_ready_queue, arena, layout.off_dummy_ready_queue_slots); + for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) { + sched->ring_sched_states[r].dep_pool.base = + static_cast(arena.region_ptr(layout.off_dep_pool_entries[r])); + } + sched->wiring.queue.wire_arena_pointers(arena, layout.off_wiring_spsc_buffer); +} + +void PTO2SchedulerState::destroy() { + PTO2SchedulerState *sched = this; + for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) { + sched->ring_sched_states[r].destroy(); + sched->ring_sched_states[r].dep_pool.base = nullptr; + } + sched->wiring.queue.destroy(); + for (int i = 0; i < PTO2_NUM_RESOURCE_SHAPES; i++) { + ready_queue_destroy(&sched->ready_queues[i]); + } + ready_queue_destroy(&sched->dummy_ready_queue); +} + +// ============================================================================= +// Orchestrator +// ============================================================================= + +PTO2OrchestratorLayout PTO2OrchestratorState::reserve_layout( + DeviceArena &arena, const int32_t task_window_sizes[PTO2_MAX_RING_DEPTH], int32_t dep_pool_capacity +) { + PTO2OrchestratorLayout layout{}; + layout.dep_pool_capacity = dep_pool_capacity; + layout.scope_tasks_cap = PTO2_SCOPE_TASKS_CAP; + layout.scope_stack_capacity = PTO2_MAX_SCOPE_DEPTH; + + for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) { + const size_t fanin_pool_bytes = + PTO2_ALIGN_UP(static_cast(dep_pool_capacity) * sizeof(PTO2FaninSpillEntry), PTO2_ALIGN_SIZE); + layout.off_fanin_pool[r] = arena.reserve(fanin_pool_bytes, PTO2_ALIGN_SIZE); + } + layout.off_scope_tasks = arena.reserve( + static_cast(layout.scope_tasks_cap) * sizeof(PTO2TaskSlotState *), alignof(PTO2TaskSlotState *) + ); + layout.off_scope_begins = + 
arena.reserve(static_cast(layout.scope_stack_capacity) * sizeof(int32_t), alignof(int32_t)); + layout.tensor_map = PTO2TensorMap::reserve_layout_default(arena, task_window_sizes); + return layout; +} + +bool PTO2OrchestratorState::init_data_from_layout( + const PTO2OrchestratorLayout &layout, DeviceArena &arena, void *sm_dev_base, void *gm_heap, uint64_t heap_size, + uint64_t task_window_size +) { + auto *orch = this; + *orch = PTO2OrchestratorState{}; + + orch->sm_header = reinterpret_cast(sm_dev_base); + orch->gm_heap_base = gm_heap; + orch->gm_heap_size = heap_size * PTO2_MAX_RING_DEPTH; + orch->fatal = false; + + auto *orch_err = pto2_sm_layout::orch_error_code_addr(sm_dev_base); + for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) { + void *ring_heap_base = reinterpret_cast(gm_heap) + r * heap_size; + auto *task_descs_dev = pto2_sm_layout::ring_task_descriptors_addr(sm_dev_base, task_window_size, r); + auto *cur_idx_dev = pto2_sm_layout::ring_current_task_index_addr(sm_dev_base, r); + auto *last_alive_dev = pto2_sm_layout::ring_last_task_alive_addr(sm_dev_base, r); + + orch->rings[r].task_allocator.init( + task_descs_dev, static_cast(task_window_size), cur_idx_dev, last_alive_dev, ring_heap_base, + heap_size, orch_err + ); + + const size_t fanin_pool_bytes = + PTO2_ALIGN_UP(static_cast(layout.dep_pool_capacity) * sizeof(PTO2FaninSpillEntry), PTO2_ALIGN_SIZE); + auto *fanin_entries = static_cast(arena.region_ptr(layout.off_fanin_pool[r])); + memset(fanin_entries, 0, fanin_pool_bytes); + orch->rings[r].fanin_pool.init(fanin_entries, layout.dep_pool_capacity, orch_err); + } + + if (!orch->tensor_map.init_data_from_layout(layout.tensor_map, arena)) { + return false; + } + + orch->scope_tasks_size = 0; + orch->scope_tasks_capacity = layout.scope_tasks_cap; + orch->scope_stack_top = -1; + orch->scope_stack_capacity = layout.scope_stack_capacity; + orch->manual_begin_depth = PTO2_MAX_SCOPE_DEPTH; + + return true; +} + +void PTO2OrchestratorState::wire_arena_pointers( + 
const PTO2OrchestratorLayout &layout, DeviceArena &arena, PTO2SchedulerState *scheduler_arg +) { + auto *orch = this; + for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) { + orch->rings[r].fanin_pool.base = static_cast(arena.region_ptr(layout.off_fanin_pool[r])); + } + orch->tensor_map.wire_arena_pointers(layout.tensor_map, arena, orch); + orch->scope_tasks = static_cast(arena.region_ptr(layout.off_scope_tasks)); + orch->scope_begins = static_cast(arena.region_ptr(layout.off_scope_begins)); + orch->scheduler = scheduler_arg; +} + +void PTO2OrchestratorState::destroy() { + auto *orch = this; + orch->tensor_map.destroy(); + for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) { + orch->rings[r].fanin_pool.base = nullptr; + } + orch->scope_tasks = nullptr; + orch->scope_begins = nullptr; +} + +void PTO2OrchestratorState::set_scheduler(PTO2SchedulerState *scheduler) { this->scheduler = scheduler; } + +// ============================================================================= +// Top-level runtime arena +// ============================================================================= + +PTO2RuntimeArenaLayout +runtime_reserve_layout(DeviceArena &arena, uint64_t task_window_size, int32_t dep_pool_capacity) { + PTO2RuntimeArenaLayout layout{}; + layout.task_window_size = task_window_size; + layout.dep_pool_capacity = dep_pool_capacity; + + int32_t task_window_sizes[PTO2_MAX_RING_DEPTH]; + for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) { + task_window_sizes[r] = static_cast(task_window_size); + } + + layout.off_sm_handle = arena.reserve(sizeof(PTO2SharedMemoryHandle), alignof(PTO2SharedMemoryHandle)); + layout.orch = PTO2OrchestratorState::reserve_layout(arena, task_window_sizes, dep_pool_capacity); + layout.sched = PTO2SchedulerState::reserve_layout(arena, dep_pool_capacity); + layout.off_runtime = arena.reserve(sizeof(PTO2Runtime), PTO2_ALIGN_SIZE); + layout.off_mailbox = arena.reserve(sizeof(AICoreCompletionMailbox), alignof(AICoreCompletionMailbox)); + + layout.arena_size = 
arena.total_size(); + return layout; +} + +PTO2Runtime *runtime_init_data_from_layout( + DeviceArena &arena, const PTO2RuntimeArenaLayout &layout, PTO2RuntimeMode mode, void *sm_dev_base, + uint64_t /*sm_size*/, void *gm_heap_dev_base, uint64_t heap_size +) { + PTO2Runtime *rt = static_cast(arena.region_ptr(layout.off_runtime)); + memset(rt, 0, sizeof(*rt)); + + auto *sm_wrap = static_cast(arena.region_ptr(layout.off_sm_handle)); + memset(sm_wrap, 0, sizeof(*sm_wrap)); + + // rt->ops is filled by the AICPU at boot. + rt->mode = mode; + rt->gm_heap = gm_heap_dev_base; + rt->gm_heap_size = heap_size > 0 ? heap_size * PTO2_MAX_RING_DEPTH : 0; + rt->gm_heap_owned = false; + rt->total_cycles = 0; + + if (!rt->orchestrator.init_data_from_layout( + layout.orch, arena, sm_dev_base, gm_heap_dev_base, heap_size, layout.task_window_size + )) { + return nullptr; + } + if (!rt->scheduler.init_data_from_layout(layout.sched, arena, sm_dev_base, layout.task_window_size)) { + return nullptr; + } + + auto *mailbox = static_cast(arena.region_ptr(layout.off_mailbox)); + memset(mailbox, 0, sizeof(*mailbox)); + + return rt; +} + +void runtime_wire_arena_pointers(DeviceArena &arena, const PTO2RuntimeArenaLayout &layout, PTO2Runtime *rt) { + rt->sm_handle = static_cast(arena.region_ptr(layout.off_sm_handle)); + rt->aicore_mailbox = static_cast(arena.region_ptr(layout.off_mailbox)); + rt->orchestrator.wire_arena_pointers(layout.orch, arena, &rt->scheduler); + rt->scheduler.wire_arena_pointers(layout.sched, arena); +} + +void runtime_destroy(PTO2Runtime *rt, DeviceArena & /*arena*/) { + // Arena buffer is pooled across runs by DeviceRunner — never freed here. 
+ if (!rt) return; + rt->scheduler.destroy(); + rt->orchestrator.destroy(); + rt->aicore_mailbox = nullptr; + rt->sm_handle = nullptr; +} diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/shared/pto_shared_memory.cpp b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/shared/pto_shared_memory.cpp index 358c87f57..1e1edff92 100644 --- a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/shared/pto_shared_memory.cpp +++ b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/shared/pto_shared_memory.cpp @@ -167,6 +167,23 @@ void PTO2SharedMemoryHandle::init_header_per_ring( header->sched_error_bitmap.store(0, std::memory_order_relaxed); header->sched_error_code.store(PTO2_ERROR_NONE, std::memory_order_relaxed); header->sched_error_thread.store(-1, std::memory_order_relaxed); + + // Per-ring slot_states reset. Previously lived in + // PTO2SchedulerState::RingSchedState::init(), but it writes into + // ring->slot_states[] which is SM-side storage — keeping it here lets + // host-side prebuilt-arena init skip all SM dereferences. + // bind_ring() pins the ring_id (slot-invariant after this point); + // reset_for_reuse() prepares dynamic fanout/refcount fields so the first + // submit doesn't need an explicit reset. 
+ for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) { + auto &ring = header->rings[r]; + for (uint64_t i = 0; i < task_window_sizes[r]; i++) { + ring.slot_states[i].bind_ring(static_cast(r)); + ring.slot_states[i].reset_for_reuse(); + ring.slot_states[i].fanin_count = 0; + ring.slot_states[i].active_mask = ActiveMask{}; + } + } } // ============================================================================= diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/shared/pto_tensormap.cpp b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/shared/pto_tensormap.cpp index a0b98bd09..da9d4fddf 100644 --- a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/shared/pto_tensormap.cpp +++ b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/shared/pto_tensormap.cpp @@ -81,43 +81,45 @@ PTO2TensorMap::reserve_layout_default(DeviceArena &arena, const int32_t new_task return reserve_layout(arena, PTO2_TENSORMAP_NUM_BUCKETS, PTO2_TENSORMAP_POOL_SIZE, new_task_window_sizes); } -bool PTO2TensorMap::init_from_layout(const PTO2TensorMapLayout &layout, DeviceArena &arena) { +bool PTO2TensorMap::init_data_from_layout(const PTO2TensorMapLayout &layout, DeviceArena &arena) { num_buckets = layout.num_buckets; pool_size = layout.pool_size; - buckets = static_cast(arena.region_ptr(layout.off_buckets)); - entry_pool = static_cast(arena.region_ptr(layout.off_entry_pool)); - free_entry_list = static_cast(arena.region_ptr(layout.off_free_entry_list)); + // Address arena regions for data writes; do not store these in struct + // fields (wire_arena_pointers does that). + auto *buckets_arena = static_cast(arena.region_ptr(layout.off_buckets)); + auto *entry_pool_arena = static_cast(arena.region_ptr(layout.off_entry_pool)); + auto *free_list_arena = static_cast(arena.region_ptr(layout.off_free_entry_list)); // buckets[]: empty == nullptr. 
for (int32_t i = 0; i < num_buckets; i++) { - buckets[i] = nullptr; + buckets_arena[i] = nullptr; } // entry_pool: zero-init equivalent to the previous calloc(entry_pool, ...). // The pool's persistent invariant after init is "bucket_index == -1 means // not linked", set explicitly below. - memset(entry_pool, 0, static_cast(pool_size) * sizeof(PTO2TensorMapEntry)); + memset(entry_pool_arena, 0, static_cast(pool_size) * sizeof(PTO2TensorMapEntry)); for (int32_t i = 0; i < pool_size; i++) { - entry_pool[i].bucket_index = -1; - entry_pool[i].next_in_bucket = nullptr; - entry_pool[i].prev_in_bucket = nullptr; - entry_pool[i].next_in_task = nullptr; - entry_pool[i].prev_in_task = nullptr; - entry_pool[i].producer_task_id = PTO2TaskId{}; + entry_pool_arena[i].bucket_index = -1; + entry_pool_arena[i].next_in_bucket = nullptr; + entry_pool_arena[i].prev_in_bucket = nullptr; + entry_pool_arena[i].next_in_task = nullptr; + entry_pool_arena[i].prev_in_task = nullptr; + entry_pool_arena[i].producer_task_id = PTO2TaskId{}; } // free_entry_list: zeroed (was calloc'd before); contents become meaningful // only after entries are freed back, so the body of the array stays as 0. 
- memset(free_entry_list, 0, static_cast(pool_size) * sizeof(PTO2TensorMapEntry *)); + memset(free_list_arena, 0, static_cast(pool_size) * sizeof(PTO2TensorMapEntry *)); next_entry_idx = 0; free_num = 0; for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) { - task_entry_heads[r] = static_cast(arena.region_ptr(layout.off_task_entry_heads[r])); + auto *heads_arena = static_cast(arena.region_ptr(layout.off_task_entry_heads[r])); for (int32_t i = 0; i < layout.task_window_sizes[r]; i++) { - task_entry_heads[r][i] = nullptr; + heads_arena[i] = nullptr; } task_window_sizes[r] = layout.task_window_sizes[r]; last_task_alives[r] = 0; @@ -127,6 +129,18 @@ bool PTO2TensorMap::init_from_layout(const PTO2TensorMapLayout &layout, DeviceAr return true; } +void PTO2TensorMap::wire_arena_pointers( + const PTO2TensorMapLayout &layout, DeviceArena &arena, PTO2OrchestratorState *parent_orch +) { + buckets = static_cast(arena.region_ptr(layout.off_buckets)); + entry_pool = static_cast(arena.region_ptr(layout.off_entry_pool)); + free_entry_list = static_cast(arena.region_ptr(layout.off_free_entry_list)); + for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) { + task_entry_heads[r] = static_cast(arena.region_ptr(layout.off_task_entry_heads[r])); + } + orch = parent_orch; +} + void PTO2TensorMap::destroy() { // Arena owns the backing memory; here we only forget our pointers so any // stray post-destroy access trips a nullptr dereference instead of reading diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/shared/runtime.cpp b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/shared/runtime.cpp index 6a7ab65da..b3347b53c 100644 --- a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/shared/runtime.cpp +++ b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/shared/runtime.cpp @@ -44,6 +44,8 @@ Runtime::Runtime() { gm_heap_ptr_ = nullptr; slot_states_ptr_ = nullptr; orch_args_storage_.clear(); + prebuilt_arena_base_ = nullptr; + prebuilt_runtime_offset_ = 0; // Initialize device 
orchestration SO binary dev_orch_so_addr_ = 0; @@ -74,6 +76,13 @@ void Runtime::set_gm_heap(void *p) { gm_heap_ptr_ = p; } void Runtime::set_slot_states_ptr(void *p) { slot_states_ptr_ = p; } void Runtime::set_orch_args(const ChipStorageTaskArgs &args) { orch_args_storage_ = args; } +void Runtime::set_prebuilt_arena(void *arena_base, size_t runtime_off) { + prebuilt_arena_base_ = arena_base; + prebuilt_runtime_offset_ = runtime_off; +} +void *Runtime::get_prebuilt_arena_base() const { return prebuilt_arena_base_; } +size_t Runtime::get_prebuilt_runtime_offset() const { return prebuilt_runtime_offset_; } + // Device orchestration SO metadata (bytes live in a separate device buffer // owned by DeviceRunner; only the address/size travels in Runtime). void Runtime::set_dev_orch_so(uint64_t dev_addr, uint64_t size) { diff --git a/src/common/device_comm/device_arena.h b/src/common/device_comm/device_arena.h index a0ade3dc3..ad43d1869 100644 --- a/src/common/device_comm/device_arena.h +++ b/src/common/device_comm/device_arena.h @@ -95,6 +95,18 @@ class DeviceArena { // the trampoline's free path must therefore be nothrow.) void *commit(size_t base_align = kDefaultBaseAlign); + // Phase 2 alternative: attach to an externally-owned buffer instead of + // allocating one. Caller guarantees `external_base` is at least the size + // reported by `total_size()` rounded up to `base_align`, and that the + // same reserve() sequence has been (or will be) replayed. Forward-aligns + // the visible base in the same way as commit(). + // + // The external buffer is NOT freed by release()/~DeviceArena(); ownership + // stays with the caller. Used for the prebuilt-arena fast path where + // a host-built image is rtMemcpy'd into a device buffer that DeviceRunner + // owns across runs. + void attach(void *external_base, size_t base_align = kDefaultBaseAlign) noexcept; + // Phase 3: pointer to the sub-region at `offset`. Asserts if called // before commit(). 
void *region_ptr(size_t offset) const noexcept; @@ -135,6 +147,9 @@ class DeviceArena { size_t raw_size_{0}; void *base_{nullptr}; bool committed_{false}; + // True when committed via attach(): the backing buffer is externally + // owned, so release() must not call free_(). + bool attached_{false}; size_t alloc_count_{0}; size_t free_count_{0}; @@ -166,6 +181,31 @@ inline void *DeviceArena::commit(size_t base_align) { return base_; } +inline void DeviceArena::attach(void *external_base, size_t base_align) noexcept { + // Re-attach (e.g. AICPU boot path attaches each run) is fine: only an + // attached state can be "re-attached" — release() it first to keep + // semantics tight. A real commit() (alloc-backed) must not be silently + // dropped, so still trap on that. + if (committed_) { + assert(attached_ && "DeviceArena::attach() called after commit (only re-attach is allowed)"); + release(); + } + assert(external_base != nullptr && "DeviceArena::attach() requires non-null base"); + assert(base_align > 0 && (base_align & (base_align - 1)) == 0 && "DeviceArena: base_align must be a power of two"); + // The external buffer must already be base_align-aligned by the caller — + // forward-align in-place would shift the visible base off the address the + // caller advertised (and that the prebuilt image was constructed for). 
+ const auto raw = reinterpret_cast(external_base); + (void)raw; + (void)base_align; + assert((raw & (static_cast(base_align) - 1)) == 0 && "DeviceArena::attach() base must be pre-aligned"); + base_ = external_base; + raw_base_ = nullptr; + raw_size_ = 0; + committed_ = true; + attached_ = true; +} + inline void *DeviceArena::region_ptr(size_t offset) const noexcept { assert(committed_ && "DeviceArena::region_ptr() called before commit()"); return reinterpret_cast(base_) + offset; @@ -179,7 +219,8 @@ inline size_t DeviceArena::region_size(size_t offset) const noexcept { } inline void DeviceArena::release() noexcept { - if (raw_base_ != nullptr) { + // attached arenas wrap externally-owned memory — never free. + if (raw_base_ != nullptr && !attached_) { free_(ctx_, raw_base_); ++free_count_; } @@ -189,4 +230,5 @@ inline void DeviceArena::release() noexcept { cursor_ = 0; region_count_ = 0; committed_ = false; + attached_ = false; } diff --git a/tests/ut/cpp/CMakeLists.txt b/tests/ut/cpp/CMakeLists.txt index 89314d800..9922850d5 100644 --- a/tests/ut/cpp/CMakeLists.txt +++ b/tests/ut/cpp/CMakeLists.txt @@ -90,6 +90,7 @@ add_library(a2a3_rt_objs OBJECT ${A2A3_RUNTIME_DIR}/shared/pto_shared_memory.cpp ${A2A3_RUNTIME_DIR}/scheduler/pto_scheduler.cpp ${A2A3_RUNTIME_DIR}/shared/pto_tensormap.cpp + ${A2A3_RUNTIME_DIR}/shared/pto_runtime2_init.cpp ${CMAKE_SOURCE_DIR}/stubs/test_stubs.cpp ) target_include_directories(a2a3_rt_objs PUBLIC diff --git a/tests/ut/cpp/a2a3/test_ready_queue.cpp b/tests/ut/cpp/a2a3/test_ready_queue.cpp index 413e36cfd..f12b1e7c7 100644 --- a/tests/ut/cpp/a2a3/test_ready_queue.cpp +++ b/tests/ut/cpp/a2a3/test_ready_queue.cpp @@ -61,7 +61,8 @@ class ReadyQueueTest : public ::testing::Test { void SetUp() override { const size_t off = ready_queue_reserve_layout(arena, CAPACITY); ASSERT_NE(arena.commit(), nullptr); - ASSERT_TRUE(ready_queue_init_from_layout(&queue, arena, off, CAPACITY)); + ASSERT_TRUE(ready_queue_init_data_from_layout(&queue, arena, 
off, CAPACITY)); + ready_queue_wire_arena_pointers(&queue, arena, off); } void TearDown() override { @@ -231,7 +232,8 @@ class ReadyQueueBoundaryTest : public ::testing::Test { void SetUp() override { const size_t off = ready_queue_reserve_layout(arena, QUEUE_CAP); ASSERT_NE(arena.commit(), nullptr); - ASSERT_TRUE(ready_queue_init_from_layout(&queue, arena, off, QUEUE_CAP)); + ASSERT_TRUE(ready_queue_init_data_from_layout(&queue, arena, off, QUEUE_CAP)); + ready_queue_wire_arena_pointers(&queue, arena, off); } void TearDown() override { ready_queue_destroy(&queue); @@ -330,7 +332,8 @@ class ReadyQueueMPMCTest : public ::testing::TestWithParam { void SetUp() override { const size_t off = ready_queue_reserve_layout(arena, CAPACITY); ASSERT_NE(arena.commit(), nullptr); - ASSERT_TRUE(ready_queue_init_from_layout(&queue, arena, off, CAPACITY)); + ASSERT_TRUE(ready_queue_init_data_from_layout(&queue, arena, off, CAPACITY)); + ready_queue_wire_arena_pointers(&queue, arena, off); } void TearDown() override { ready_queue_destroy(&queue); diff --git a/tests/ut/cpp/a2a3/test_scheduler_state.cpp b/tests/ut/cpp/a2a3/test_scheduler_state.cpp index 952aad55a..37e9d18ca 100644 --- a/tests/ut/cpp/a2a3/test_scheduler_state.cpp +++ b/tests/ut/cpp/a2a3/test_scheduler_state.cpp @@ -34,7 +34,8 @@ class SchedulerStateTest : public ::testing::Test { ASSERT_NE(sm_handle, nullptr); auto layout = PTO2SchedulerState::reserve_layout(sched_arena); ASSERT_NE(sched_arena.commit(), nullptr); - ASSERT_TRUE(sched.init_from_layout(layout, sched_arena, sm_handle->header)); + ASSERT_TRUE(sched.init_data_from_layout(layout, sched_arena, sm_handle->header, PTO2_TASK_WINDOW_SIZE)); + sched.wire_arena_pointers(layout, sched_arena); } void TearDown() override { diff --git a/tests/ut/cpp/a2a3/test_spsc_queue.cpp b/tests/ut/cpp/a2a3/test_spsc_queue.cpp index 28e43d5a2..5dce3ba4a 100644 --- a/tests/ut/cpp/a2a3/test_spsc_queue.cpp +++ b/tests/ut/cpp/a2a3/test_spsc_queue.cpp @@ -47,7 +47,8 @@ class SpscQueueTest 
: public ::testing::Test { memset(&queue, 0, sizeof(queue)); const size_t off = PTO2SpscQueue::reserve_layout(arena, CAPACITY); ASSERT_NE(arena.commit(), nullptr); - ASSERT_TRUE(queue.init_from_layout(arena, off, CAPACITY)); + ASSERT_TRUE(queue.init_data_from_layout(arena, off, CAPACITY)); + queue.wire_arena_pointers(arena, off); } void TearDown() override { @@ -74,9 +75,9 @@ TEST_F(SpscQueueTest, InitRejectsNonPowerOfTwo) { const size_t off = PTO2SpscQueue::reserve_layout(local, 1); // dummy reservation so commit succeeds (void)off; ASSERT_NE(local.commit(), nullptr); - EXPECT_FALSE(bad.init_from_layout(local, off, 3)); - EXPECT_FALSE(bad.init_from_layout(local, off, 7)); - EXPECT_FALSE(bad.init_from_layout(local, off, 0)); + EXPECT_FALSE(bad.init_data_from_layout(local, off, 3)); + EXPECT_FALSE(bad.init_data_from_layout(local, off, 7)); + EXPECT_FALSE(bad.init_data_from_layout(local, off, 0)); } TEST_F(SpscQueueTest, InitAcceptsPowerOfTwo) { @@ -85,9 +86,9 @@ TEST_F(SpscQueueTest, InitAcceptsPowerOfTwo) { const size_t off4 = PTO2SpscQueue::reserve_layout(local, 4); const size_t off1024 = PTO2SpscQueue::reserve_layout(local, 1024); ASSERT_NE(local.commit(), nullptr); - EXPECT_TRUE(q.init_from_layout(local, off4, 4)); + EXPECT_TRUE(q.init_data_from_layout(local, off4, 4)); q.destroy(); - EXPECT_TRUE(q.init_from_layout(local, off1024, 1024)); + EXPECT_TRUE(q.init_data_from_layout(local, off1024, 1024)); q.destroy(); } diff --git a/tests/ut/cpp/a2a3/test_task_allocator.cpp b/tests/ut/cpp/a2a3/test_task_allocator.cpp index 383003900..512e241a2 100644 --- a/tests/ut/cpp/a2a3/test_task_allocator.cpp +++ b/tests/ut/cpp/a2a3/test_task_allocator.cpp @@ -388,7 +388,10 @@ TEST_F(TaskAllocatorTest, TaskWindowSaturates) { TEST_F(TaskAllocatorTest, TaskIdNearInt32Max) { current_index.store(INT32_MAX - 2); last_alive.store(INT32_MAX - 2); - allocator.init(descriptors.data(), WINDOW_SIZE, &current_index, &last_alive, heap_buf, HEAP_SIZE, &error_code); + allocator.init( + 
descriptors.data(), WINDOW_SIZE, &current_index, &last_alive, heap_buf, HEAP_SIZE, &error_code, + /*initial_local_task_id=*/INT32_MAX - 2 + ); auto r1 = allocator.alloc(0); ASSERT_FALSE(r1.failed()); diff --git a/tests/ut/cpp/a2a3/test_task_state.cpp b/tests/ut/cpp/a2a3/test_task_state.cpp index 729b74999..a9655919b 100644 --- a/tests/ut/cpp/a2a3/test_task_state.cpp +++ b/tests/ut/cpp/a2a3/test_task_state.cpp @@ -43,7 +43,8 @@ class TaskStateTest : public ::testing::Test { ASSERT_NE(sm_handle, nullptr); auto layout = PTO2SchedulerState::reserve_layout(sched_arena); ASSERT_NE(sched_arena.commit(), nullptr); - ASSERT_TRUE(sched.init_from_layout(layout, sched_arena, sm_handle->header)); + ASSERT_TRUE(sched.init_data_from_layout(layout, sched_arena, sm_handle->header, PTO2_TASK_WINDOW_SIZE)); + sched.wire_arena_pointers(layout, sched_arena); } void TearDown() override { diff --git a/tests/ut/cpp/a2a3/test_tensormap.cpp b/tests/ut/cpp/a2a3/test_tensormap.cpp index 204d00e42..df1789067 100644 --- a/tests/ut/cpp/a2a3/test_tensormap.cpp +++ b/tests/ut/cpp/a2a3/test_tensormap.cpp @@ -83,7 +83,8 @@ class TensorMapTest : public ::testing::Test { int32_t window_sizes[PTO2_MAX_RING_DEPTH] = {WINDOW_SIZE, WINDOW_SIZE, WINDOW_SIZE, WINDOW_SIZE}; auto layout = PTO2TensorMap::reserve_layout(arena, NUM_BUCKETS, POOL_SIZE, window_sizes); ASSERT_NE(arena.commit(), nullptr); - ASSERT_TRUE(tmap.init_from_layout(layout, arena)); + ASSERT_TRUE(tmap.init_data_from_layout(layout, arena)); + tmap.wire_arena_pointers(layout, arena, /*parent_orch=*/nullptr); } void TearDown() override { @@ -113,7 +114,8 @@ TEST_F(TensorMapTest, InitRequiresPowerOfTwoBuckets) { int32_t ws[PTO2_MAX_RING_DEPTH] = {8, 8, 8, 8}; auto layout = PTO2TensorMap::reserve_layout(bad_arena, 8, 64, ws); ASSERT_NE(bad_arena.commit(), nullptr); - EXPECT_TRUE(bad.init_from_layout(layout, bad_arena)); + EXPECT_TRUE(bad.init_data_from_layout(layout, bad_arena)); + bad.wire_arena_pointers(layout, bad_arena, /*parent_orch=*/nullptr); 
bad.destroy(); } diff --git a/tests/ut/cpp/a2a3/test_wiring.cpp b/tests/ut/cpp/a2a3/test_wiring.cpp index b01052a85..b3c11ead1 100644 --- a/tests/ut/cpp/a2a3/test_wiring.cpp +++ b/tests/ut/cpp/a2a3/test_wiring.cpp @@ -48,7 +48,8 @@ class WiringTest : public ::testing::Test { ASSERT_NE(sm_handle, nullptr); auto layout = PTO2SchedulerState::reserve_layout(sched_arena); ASSERT_NE(sched_arena.commit(), nullptr); - ASSERT_TRUE(sched.init_from_layout(layout, sched_arena, sm_handle->header)); + ASSERT_TRUE(sched.init_data_from_layout(layout, sched_arena, sm_handle->header, PTO2_TASK_WINDOW_SIZE)); + sched.wire_arena_pointers(layout, sched_arena); } void TearDown() override { From c186f6ca4baca1bcf292203464b46d780b9706aa Mon Sep 17 00:00:00 2001 From: poursoul Date: Mon, 25 May 2026 11:27:35 +0800 Subject: [PATCH 3/7] Refactor: post-review hardening for trb host-build arena Address review feedback from PR #846: - pto2_sm_layout::ring_task_descriptors_addr: take per-ring task_window_sizes[] array (mirroring PTO2SharedMemoryHandle's SM API) and assert ring_id range, so a future per-ring SM layout cannot silently disagree with the addresses the host bakes into the prebuilt image. - DeviceRunner::acquire_pooled_runtime_arena (onboard + sim): return nullptr when runtime_arena_region_off_ == SIZE_MAX so a stray hbg-path call cannot resolve to base + SIZE_MAX. Failure is now loud and contained at the acquire boundary. - DeviceArena::attach(): rewrite doc to match real behavior (region table is not repopulated after attach, reserve() asserts !committed_ so cannot replay, region_size() returns 0); promote the pre-alignment / non-null / power-of-two checks from plain assert() to an unconditional abort() so release builds still trap on contract violations. - PTO2TensorMap: drop the dead `orch` back-pointer field (a2a3 never dereferences it), strip parent_orch parameter from wire_arena_pointers, and remove the now-unused PTO2OrchestratorState forward declaration. 
- PTO2RingFlowControl::init(): add a coupling comment so future fc-initial- value or boot-order changes flag PTO2TaskAllocator::init's initial_local_task_id default in the same edit. - PTO2SchedulerState::init_data_from_layout / RingSchedState:: init_data_from_layout: drop the task_window_size / dep_pool_capacity parameters that were never consumed (scheduler only needs SM base + ring index, both window-size-independent; orchestrator counterpart still takes task_window_size for ring_task_descriptors arithmetic). Updated all callsites (pto_runtime2_init.cpp + 4 cpput suites). - PTO2Runtime::prebuilt_arena_base: removed the dead mirror field. The host Runtime's prebuilt_arena_base_ is the real source of truth (AICPU reads it to locate the pooled buffer *before* dereferencing the image); the PTO2Runtime image still carries prebuilt_layout, which the AICPU does consume. cpput: 25/25 pass. a2a3sim trb: dummy_task / dynamic_register / L2 trb suite pass with --build. --- .../platform/onboard/host/device_runner.cpp | 4 ++ src/a2a3/platform/sim/host/device_runner.cpp | 4 ++ .../host/dep_gen_replay.cpp | 9 ++-- .../host/runtime_maker.cpp | 10 ++--- .../runtime/pto_runtime2.h | 16 +++---- .../runtime/pto_shared_memory.h | 26 ++++++++---- .../runtime/pto_tensormap.h | 11 +---- .../runtime/scheduler/pto_scheduler.h | 15 ++++--- .../runtime/shared/pto_runtime2_init.cpp | 20 +++++---- .../runtime/shared/pto_tensormap.cpp | 5 +-- src/common/device_comm/device_arena.h | 42 ++++++++++++++----- tests/ut/cpp/a2a3/test_scheduler_state.cpp | 2 +- tests/ut/cpp/a2a3/test_task_state.cpp | 2 +- tests/ut/cpp/a2a3/test_tensormap.cpp | 4 +- tests/ut/cpp/a2a3/test_wiring.cpp | 2 +- 15 files changed, 102 insertions(+), 70 deletions(-) diff --git a/src/a2a3/platform/onboard/host/device_runner.cpp b/src/a2a3/platform/onboard/host/device_runner.cpp index e3ba6cd10..8d2d9916b 100644 --- a/src/a2a3/platform/onboard/host/device_runner.cpp +++ b/src/a2a3/platform/onboard/host/device_runner.cpp @@ 
-299,6 +299,10 @@ void *DeviceRunner::acquire_pooled_gm_sm() { void *DeviceRunner::acquire_pooled_runtime_arena() { if (!static_arena_.is_committed()) return nullptr; + // hbg calls setup_static_arena(...,0) and never reserves a runtime-arena + // region — fail loudly if a caller asks for it anyway, rather than + // returning region_ptr(SIZE_MAX) (base + SIZE_MAX is undefined). + if (runtime_arena_region_off_ == SIZE_MAX) return nullptr; return static_arena_.region_ptr(runtime_arena_region_off_); } diff --git a/src/a2a3/platform/sim/host/device_runner.cpp b/src/a2a3/platform/sim/host/device_runner.cpp index 53d967228..c221bb714 100644 --- a/src/a2a3/platform/sim/host/device_runner.cpp +++ b/src/a2a3/platform/sim/host/device_runner.cpp @@ -170,6 +170,10 @@ void *DeviceRunner::acquire_pooled_gm_sm() { void *DeviceRunner::acquire_pooled_runtime_arena() { if (!static_arena_.is_committed()) return nullptr; + // hbg calls setup_static_arena(...,0) and never reserves a runtime-arena + // region — fail loudly if a caller asks for it anyway, rather than + // returning region_ptr(SIZE_MAX) (base + SIZE_MAX is undefined). + if (runtime_arena_region_off_ == SIZE_MAX) return nullptr; return static_arena_.region_ptr(runtime_arena_region_off_); } diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/host/dep_gen_replay.cpp b/src/a2a3/runtime/tensormap_and_ringbuffer/host/dep_gen_replay.cpp index 506ba7cf6..71a482632 100644 --- a/src/a2a3/runtime/tensormap_and_ringbuffer/host/dep_gen_replay.cpp +++ b/src/a2a3/runtime/tensormap_and_ringbuffer/host/dep_gen_replay.cpp @@ -492,11 +492,10 @@ dep_gen_replay_emit_deps_json(const DepGenRecord *records, size_t num_records, c LOG_ERROR("dep_gen replay: tensormap.init failed (buckets=%d, pool=%d)", PTO2_TENSORMAP_NUM_BUCKETS, pool_size); return -3; } - // Replay tensormaps live entirely on host; both arena base and the - // parent-orch self-pointer use host addresses. 
parent_orch is unused by - // the lookup/insert code paths exercised below — nullptr is safe. - tm_oracle.wire_arena_pointers(oracle_layout, replay_arena, nullptr); - tm_annot.wire_arena_pointers(annot_layout, replay_arena, nullptr); + // Replay tensormaps live entirely on host; only arena-internal pointer + // fields need wiring (no parent-orch back-reference exists anymore). + tm_oracle.wire_arena_pointers(oracle_layout, replay_arena); + tm_annot.wire_arena_pointers(annot_layout, replay_arena); // JSON output accumulators. std::vector task_table; diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/host/runtime_maker.cpp b/src/a2a3/runtime/tensormap_and_ringbuffer/host/runtime_maker.cpp index 3b278b2b4..750374683 100644 --- a/src/a2a3/runtime/tensormap_and_ringbuffer/host/runtime_maker.cpp +++ b/src/a2a3/runtime/tensormap_and_ringbuffer/host/runtime_maker.cpp @@ -344,11 +344,11 @@ extern "C" int bind_prepared_to_runtime_impl( } runtime_wire_arena_pointers(host_arena, layout, rt); - // Stash the prebuilt metadata inside the PTO2Runtime image so the AICPU - // picks them up directly via the pooled buffer after rtMemcpy. The host - // Runtime also carries the pointers so the AICPU can locate the - // PTO2Runtime before it does anything else (no chicken-and-egg). - rt->prebuilt_arena_base = runtime_arena_dev; + // Stash the layout inside the PTO2Runtime image so the AICPU can recover + // every arena-internal offset after rtMemcpy. The runtime arena's device + // base does NOT travel in this image — it's on the host Runtime + // (set_prebuilt_arena below), since the AICPU needs that pointer + // *before* it can dereference the image. 
rt->prebuilt_layout = layout; int rc_upload = runtime->host_api.copy_to_device(runtime_arena_dev, host_arena.base(), layout.arena_size); diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_runtime2.h b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_runtime2.h index 169937f82..460624e69 100644 --- a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_runtime2.h +++ b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_runtime2.h @@ -143,14 +143,14 @@ struct PTO2Runtime { // Statistics int64_t total_cycles; - // Prebuilt-arena fast path metadata. `prebuilt_arena_base` is the device - // address of the runtime arena (the buffer that holds *this* PTO2Runtime - // at offset prebuilt_layout.off_runtime). `prebuilt_layout` carries every - // offset wire_arena_pointers needs at AICPU boot, so the AICPU can - // reconstruct all arena-internal pointer fields without re-running - // init_data. Populated by the host's runtime_init_data_from_layout + - // runtime_wire_arena_pointers; read by aicpu_executor.cpp. - void *prebuilt_arena_base{nullptr}; + // Prebuilt-arena fast path metadata. Carries every offset + // wire_arena_pointers needs at AICPU boot so the AICPU can reconstruct + // all arena-internal pointer fields without re-running init_data. The + // device base of the runtime arena travels separately on the host-side + // Runtime (Runtime::prebuilt_arena_base_), since the AICPU needs it + // *before* dereferencing this image. Populated on host by + // runtime_init_data_from_layout + runtime_wire_arena_pointers; read by + // aicpu_executor.cpp. 
PTO2RuntimeArenaLayout prebuilt_layout; }; diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_shared_memory.h b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_shared_memory.h index c8de35ba6..98b832510 100644 --- a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_shared_memory.h +++ b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_shared_memory.h @@ -58,6 +58,13 @@ struct alignas(64) PTO2RingFlowControl { // === Cache Line 1: Written by Scheduler, Read by Orchestrator (for back-pressure) === alignas(64) std::atomic last_task_alive; // Task ring tail (oldest active task) + // Per-boot SM reset. PTO2TaskAllocator::init() seeds its private + // local_task_id_ from initial_local_task_id (default 0 in production) + // *without* dereferencing current_task_index — it relies on this reset + // running on every AICPU boot so 0 stays in sync. If you ever change + // the initial fc value or the boot ordering, update the default in + // PTO2TaskAllocator::init (pto_ring_buffer.h) in the same change, or + // submit IDs will be off by the divergence. void init() { current_task_index.store(0, std::memory_order_relaxed); last_task_alive.store(0, std::memory_order_relaxed); @@ -232,17 +239,20 @@ inline std::atomic *ring_last_task_alive_addr(void *sm_dev_base, int ri } // Walk the per-ring SM layout (same arithmetic as setup_pointers_per_ring) -// to compute ring `ring_id`'s task_descriptors device address. Uniform -// per-ring task_window_size; matches the production callsite which always -// passes a uniform window size to runtime_create_from_sm. -inline PTO2TaskDescriptor * -ring_task_descriptors_addr(void *sm_dev_base, uint64_t task_window_size, int ring_id) noexcept { +// to compute ring `ring_id`'s task_descriptors device address. 
Accepts a +// per-ring window-size array so the helper's signature mirrors +// `PTO2SharedMemoryHandle::setup_pointers_per_ring` and cannot silently +// disagree with the SM layout when (hypothetically) ring sizes diverge. +inline PTO2TaskDescriptor *ring_task_descriptors_addr( + void *sm_dev_base, const uint64_t task_window_sizes[PTO2_MAX_RING_DEPTH], int ring_id +) noexcept { + assert(ring_id >= 0 && ring_id < PTO2_MAX_RING_DEPTH && "pto2_sm_layout: ring_id out of range"); char *p = static_cast(sm_dev_base); p += PTO2_ALIGN_UP(sizeof(PTO2SharedMemoryHeader), PTO2_ALIGN_SIZE); for (int r = 0; r < ring_id; r++) { - p += PTO2_ALIGN_UP(task_window_size * sizeof(PTO2TaskDescriptor), PTO2_ALIGN_SIZE); - p += PTO2_ALIGN_UP(task_window_size * sizeof(PTO2TaskPayload), PTO2_ALIGN_SIZE); - p += PTO2_ALIGN_UP(task_window_size * sizeof(PTO2TaskSlotState), PTO2_ALIGN_SIZE); + p += PTO2_ALIGN_UP(task_window_sizes[r] * sizeof(PTO2TaskDescriptor), PTO2_ALIGN_SIZE); + p += PTO2_ALIGN_UP(task_window_sizes[r] * sizeof(PTO2TaskPayload), PTO2_ALIGN_SIZE); + p += PTO2_ALIGN_UP(task_window_sizes[r] * sizeof(PTO2TaskSlotState), PTO2_ALIGN_SIZE); } return reinterpret_cast(p); } diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_tensormap.h b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_tensormap.h index 11decdf4e..b63f20676 100644 --- a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_tensormap.h +++ b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_tensormap.h @@ -47,8 +47,6 @@ #include "pto_runtime2_types.h" #include "tensor.h" -struct PTO2OrchestratorState; // forward declare - /** * Layout descriptor produced by PTO2TensorMap::reserve_layout(). 
Stores the * region offsets returned by DeviceArena::reserve() so init_from_layout() @@ -369,8 +367,6 @@ struct PTO2TensorMap { // Per-ring cleanup progress (for periodic cleanup_retired) int32_t last_cleanup[PTO2_MAX_RING_DEPTH]{}; - PTO2OrchestratorState *orch{nullptr}; - uint32_t get_task_local_id_slot(uint8_t ring_id, uint32_t task_local_id) const { return task_local_id & (task_window_sizes[ring_id] - 1); } @@ -436,7 +432,7 @@ struct PTO2TensorMap { /** * Phase 3a: write everything *except* arena-internal pointer fields - * (buckets, entry_pool, free_entry_list, task_entry_heads[r], orch). + * (buckets, entry_pool, free_entry_list, task_entry_heads[r]). * Uses arena.region_ptr to address the arena regions for data writes, * but does not store those addresses in struct fields. Safe to call on * a host arena that holds the prebuilt image. @@ -446,11 +442,8 @@ struct PTO2TensorMap { /** * Phase 3b: write the arena-internal pointer fields. Idempotent; * called once on the host arena and once on the AICPU after attach. - * `parent_orch` is the device address (or host-mirror address) of the - * enclosing PTO2OrchestratorState; we store it in tensor_map.orch - * (self-pointer within the same arena). */ - void wire_arena_pointers(const PTO2TensorMapLayout &layout, DeviceArena &arena, PTO2OrchestratorState *parent_orch); + void wire_arena_pointers(const PTO2TensorMapLayout &layout, DeviceArena &arena); /** * Tear down state. Does not free memory — the arena owns the backing diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler/pto_scheduler.h b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler/pto_scheduler.h index 828999113..510187feb 100644 --- a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler/pto_scheduler.h +++ b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler/pto_scheduler.h @@ -583,7 +583,7 @@ struct PTO2SchedulerState { // by SchedulerState::wire_arena_pointers). 
The `ring` field stores // the device address of the SM ring header — computed via offset // arithmetic, no SM dereference. - bool init_data_from_layout(void *sm_dev_base, int32_t ring_id, int32_t dep_pool_capacity); + bool init_data_from_layout(void *sm_dev_base, int32_t ring_id); void destroy(); void sync_to_sm() { ring->fc.last_task_alive.store(last_task_alive, std::memory_order_release); } @@ -1068,13 +1068,12 @@ struct PTO2SchedulerState { // Phase 3a: write everything *except* arena-internal pointer fields. // `sm_dev_base` is the device address of the SM (only stored, never - // dereferenced here); `task_window_size` lets the per-ring data-addr - // arithmetic resolve ring task_descriptors / fc field addresses without - // an SM load. Safe to call on a host arena that holds the prebuilt - // image buffer. - bool init_data_from_layout( - const PTO2SchedulerLayout &layout, DeviceArena &arena, void *sm_dev_base, uint64_t task_window_size - ); + // dereferenced here). Safe to call on a host arena that holds the + // prebuilt image buffer. (The orchestrator counterpart takes + // task_window_size for ring task_descriptors address arithmetic; the + // scheduler only needs the SM header / ring header base addresses, + // both window-size-independent.) 
+ bool init_data_from_layout(const PTO2SchedulerLayout &layout, DeviceArena &arena, void *sm_dev_base); // Phase 3b: write the arena-internal pointer fields // (ready_queues[].slots, dummy_ready_queue.slots, dep_pool.base for each diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/shared/pto_runtime2_init.cpp b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/shared/pto_runtime2_init.cpp index 3efa313fd..d66acfcc4 100644 --- a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/shared/pto_runtime2_init.cpp +++ b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/shared/pto_runtime2_init.cpp @@ -70,9 +70,7 @@ void ready_queue_destroy(PTO2ReadyQueue *queue) { // Scheduler // ============================================================================= -bool PTO2SchedulerState::RingSchedState::init_data_from_layout( - void *sm_dev_base, int32_t ring_id, int32_t /*dep_pool_capacity*/ -) { +bool PTO2SchedulerState::RingSchedState::init_data_from_layout(void *sm_dev_base, int32_t ring_id) { // ring stores the device address of the SM ring header — pure offset // arithmetic, no SM load. 
ring = pto2_sm_layout::ring_header_addr(sm_dev_base, ring_id); @@ -111,7 +109,7 @@ PTO2SchedulerLayout PTO2SchedulerState::reserve_layout(DeviceArena &arena, int32 } bool PTO2SchedulerState::init_data_from_layout( - const PTO2SchedulerLayout &layout, DeviceArena &arena, void *sm_dev_base, uint64_t /*task_window_size*/ + const PTO2SchedulerLayout &layout, DeviceArena &arena, void *sm_dev_base ) { PTO2SchedulerState *sched = this; sched->sm_header = reinterpret_cast(sm_dev_base); @@ -121,7 +119,7 @@ bool PTO2SchedulerState::init_data_from_layout( #endif for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) { - if (!sched->ring_sched_states[r].init_data_from_layout(sm_dev_base, r, layout.dep_pool_capacity)) { + if (!sched->ring_sched_states[r].init_data_from_layout(sm_dev_base, r)) { return false; } } @@ -220,10 +218,16 @@ bool PTO2OrchestratorState::init_data_from_layout( orch->gm_heap_size = heap_size * PTO2_MAX_RING_DEPTH; orch->fatal = false; + // Mirror the SM API's per-ring window-size shape so a future per-ring + // SM layout cannot silently disagree with the addresses we compute here. 
+ uint64_t task_window_sizes[PTO2_MAX_RING_DEPTH]; + for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) + task_window_sizes[r] = task_window_size; + auto *orch_err = pto2_sm_layout::orch_error_code_addr(sm_dev_base); for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) { void *ring_heap_base = reinterpret_cast(gm_heap) + r * heap_size; - auto *task_descs_dev = pto2_sm_layout::ring_task_descriptors_addr(sm_dev_base, task_window_size, r); + auto *task_descs_dev = pto2_sm_layout::ring_task_descriptors_addr(sm_dev_base, task_window_sizes, r); auto *cur_idx_dev = pto2_sm_layout::ring_current_task_index_addr(sm_dev_base, r); auto *last_alive_dev = pto2_sm_layout::ring_last_task_alive_addr(sm_dev_base, r); @@ -259,7 +263,7 @@ void PTO2OrchestratorState::wire_arena_pointers( for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) { orch->rings[r].fanin_pool.base = static_cast(arena.region_ptr(layout.off_fanin_pool[r])); } - orch->tensor_map.wire_arena_pointers(layout.tensor_map, arena, orch); + orch->tensor_map.wire_arena_pointers(layout.tensor_map, arena); orch->scope_tasks = static_cast(arena.region_ptr(layout.off_scope_tasks)); orch->scope_begins = static_cast(arena.region_ptr(layout.off_scope_begins)); orch->scheduler = scheduler_arg; @@ -324,7 +328,7 @@ PTO2Runtime *runtime_init_data_from_layout( )) { return nullptr; } - if (!rt->scheduler.init_data_from_layout(layout.sched, arena, sm_dev_base, layout.task_window_size)) { + if (!rt->scheduler.init_data_from_layout(layout.sched, arena, sm_dev_base)) { return nullptr; } diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/shared/pto_tensormap.cpp b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/shared/pto_tensormap.cpp index da9d4fddf..b99c67233 100644 --- a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/shared/pto_tensormap.cpp +++ b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/shared/pto_tensormap.cpp @@ -129,16 +129,13 @@ bool PTO2TensorMap::init_data_from_layout(const PTO2TensorMapLayout &layout, Dev return true; } 
-void PTO2TensorMap::wire_arena_pointers( - const PTO2TensorMapLayout &layout, DeviceArena &arena, PTO2OrchestratorState *parent_orch -) { +void PTO2TensorMap::wire_arena_pointers(const PTO2TensorMapLayout &layout, DeviceArena &arena) { buckets = static_cast(arena.region_ptr(layout.off_buckets)); entry_pool = static_cast(arena.region_ptr(layout.off_entry_pool)); free_entry_list = static_cast(arena.region_ptr(layout.off_free_entry_list)); for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) { task_entry_heads[r] = static_cast(arena.region_ptr(layout.off_task_entry_heads[r])); } - orch = parent_orch; } void PTO2TensorMap::destroy() { diff --git a/src/common/device_comm/device_arena.h b/src/common/device_comm/device_arena.h index ad43d1869..ffe34c479 100644 --- a/src/common/device_comm/device_arena.h +++ b/src/common/device_comm/device_arena.h @@ -96,10 +96,22 @@ class DeviceArena { void *commit(size_t base_align = kDefaultBaseAlign); // Phase 2 alternative: attach to an externally-owned buffer instead of - // allocating one. Caller guarantees `external_base` is at least the size - // reported by `total_size()` rounded up to `base_align`, and that the - // same reserve() sequence has been (or will be) replayed. Forward-aligns - // the visible base in the same way as commit(). + // allocating one. Caller guarantees: + // (a) `external_base` is already `base_align`-aligned — attach does + // NOT forward-align, since the prebuilt image was constructed for + // the address the caller advertised; + // (b) the buffer is at least `total_size()` bytes (the sum of sizes + // passed to reserve()), since attach uses no forward-alignment + // slack of its own; + // (c) all region offsets the caller plans to read back via + // `region_ptr(off)` are held by the caller — attach does NOT + // repopulate the internal region table, and reserve() cannot run + // after attach (it asserts !committed_). 
`region_size()` likewise + // returns 0 for attached arenas; treat the arena post-attach as + // a base-pointer wrapper. + // + // Re-attach (release + attach the same or another buffer) is permitted + // so the AICPU boot path can rebind the same pooled image each run. // // The external buffer is NOT freed by release()/~DeviceArena(); ownership // stays with the caller. Used for the prebuilt-arena fast path where @@ -112,7 +124,10 @@ class DeviceArena { void *region_ptr(size_t offset) const noexcept; // Size of the sub-region whose offset matches `offset`. Linear scan; - // intended for debug / assertions, not hot path. + // intended for debug / assertions, not hot path. Returns 0 for an + // attached arena (attach() does not repopulate the region table) — + // callers in the prebuilt-image path keep sizes alongside their offsets + // instead. size_t region_size(size_t offset) const noexcept; // Free the backing buffer (if any) and reset to the pre-commit state so @@ -190,15 +205,22 @@ inline void DeviceArena::attach(void *external_base, size_t base_align) noexcept assert(attached_ && "DeviceArena::attach() called after commit (only re-attach is allowed)"); release(); } - assert(external_base != nullptr && "DeviceArena::attach() requires non-null base"); - assert(base_align > 0 && (base_align & (base_align - 1)) == 0 && "DeviceArena: base_align must be a power of two"); // The external buffer must already be base_align-aligned by the caller — // forward-align in-place would shift the visible base off the address the // caller advertised (and that the prebuilt image was constructed for). + // The checks below are promoted to unconditional aborts (rather than + // plain assert()) because a misaligned attach silently produces a buffer + // whose visible base disagrees with every offset the prebuilt image was + // laid out against — release builds, which strip assert(), would still + // run on a corrupted arena. 
Aborting at the breakage point is far cheaper + // to triage than the downstream wild-pointer accesses. const auto raw = reinterpret_cast(external_base); - (void)raw; - (void)base_align; - assert((raw & (static_cast(base_align) - 1)) == 0 && "DeviceArena::attach() base must be pre-aligned"); + const bool ok = (external_base != nullptr) && (base_align > 0) && ((base_align & (base_align - 1)) == 0) && + ((raw & (static_cast(base_align) - 1)) == 0); + if (!ok) { + assert(false && "DeviceArena::attach(): null base, non-power-of-two align, or pre-alignment violated"); + std::abort(); + } base_ = external_base; raw_base_ = nullptr; raw_size_ = 0; diff --git a/tests/ut/cpp/a2a3/test_scheduler_state.cpp b/tests/ut/cpp/a2a3/test_scheduler_state.cpp index 37e9d18ca..75476dedf 100644 --- a/tests/ut/cpp/a2a3/test_scheduler_state.cpp +++ b/tests/ut/cpp/a2a3/test_scheduler_state.cpp @@ -34,7 +34,7 @@ class SchedulerStateTest : public ::testing::Test { ASSERT_NE(sm_handle, nullptr); auto layout = PTO2SchedulerState::reserve_layout(sched_arena); ASSERT_NE(sched_arena.commit(), nullptr); - ASSERT_TRUE(sched.init_data_from_layout(layout, sched_arena, sm_handle->header, PTO2_TASK_WINDOW_SIZE)); + ASSERT_TRUE(sched.init_data_from_layout(layout, sched_arena, sm_handle->header)); sched.wire_arena_pointers(layout, sched_arena); } diff --git a/tests/ut/cpp/a2a3/test_task_state.cpp b/tests/ut/cpp/a2a3/test_task_state.cpp index a9655919b..ffced6f9a 100644 --- a/tests/ut/cpp/a2a3/test_task_state.cpp +++ b/tests/ut/cpp/a2a3/test_task_state.cpp @@ -43,7 +43,7 @@ class TaskStateTest : public ::testing::Test { ASSERT_NE(sm_handle, nullptr); auto layout = PTO2SchedulerState::reserve_layout(sched_arena); ASSERT_NE(sched_arena.commit(), nullptr); - ASSERT_TRUE(sched.init_data_from_layout(layout, sched_arena, sm_handle->header, PTO2_TASK_WINDOW_SIZE)); + ASSERT_TRUE(sched.init_data_from_layout(layout, sched_arena, sm_handle->header)); sched.wire_arena_pointers(layout, sched_arena); } diff --git 
a/tests/ut/cpp/a2a3/test_tensormap.cpp b/tests/ut/cpp/a2a3/test_tensormap.cpp index df1789067..805a9e079 100644 --- a/tests/ut/cpp/a2a3/test_tensormap.cpp +++ b/tests/ut/cpp/a2a3/test_tensormap.cpp @@ -84,7 +84,7 @@ class TensorMapTest : public ::testing::Test { auto layout = PTO2TensorMap::reserve_layout(arena, NUM_BUCKETS, POOL_SIZE, window_sizes); ASSERT_NE(arena.commit(), nullptr); ASSERT_TRUE(tmap.init_data_from_layout(layout, arena)); - tmap.wire_arena_pointers(layout, arena, /*parent_orch=*/nullptr); + tmap.wire_arena_pointers(layout, arena); } void TearDown() override { @@ -115,7 +115,7 @@ TEST_F(TensorMapTest, InitRequiresPowerOfTwoBuckets) { auto layout = PTO2TensorMap::reserve_layout(bad_arena, 8, 64, ws); ASSERT_NE(bad_arena.commit(), nullptr); EXPECT_TRUE(bad.init_data_from_layout(layout, bad_arena)); - bad.wire_arena_pointers(layout, bad_arena, /*parent_orch=*/nullptr); + bad.wire_arena_pointers(layout, bad_arena); bad.destroy(); } diff --git a/tests/ut/cpp/a2a3/test_wiring.cpp b/tests/ut/cpp/a2a3/test_wiring.cpp index b3c11ead1..1e8fee9c5 100644 --- a/tests/ut/cpp/a2a3/test_wiring.cpp +++ b/tests/ut/cpp/a2a3/test_wiring.cpp @@ -48,7 +48,7 @@ class WiringTest : public ::testing::Test { ASSERT_NE(sm_handle, nullptr); auto layout = PTO2SchedulerState::reserve_layout(sched_arena); ASSERT_NE(sched_arena.commit(), nullptr); - ASSERT_TRUE(sched.init_data_from_layout(layout, sched_arena, sm_handle->header, PTO2_TASK_WINDOW_SIZE)); + ASSERT_TRUE(sched.init_data_from_layout(layout, sched_arena, sm_handle->header)); sched.wire_arena_pointers(layout, sched_arena); } From 7cdb55cbad699b3c2158ea5bea6a9f1c08aec8c9 Mon Sep 17 00:00:00 2001 From: poursoul Date: Wed, 27 May 2026 11:56:12 +0800 Subject: [PATCH 4/7] Refactor: mirror trb host-build arena to a5 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Sync of PR #846 commit 2/3 to a5 — commit 1 (slot_state.bind split) was already mirrored. 
Brings the a5 trb runtime up to the same host-build arena fast path as a2a3. - 4-phase API (reserve_layout / init_data_from_layout / wire_arena_pointers / finalize_after_wire) replaces runtime_create_from_sm. - New runtime/shared/pto_runtime2_init.cpp (~355 lines) and shared/pto_tensormap.cpp (the old runtime/pto_tensormap.cpp moved + split) hold the host-pluggable cold-path lifted from pto_runtime2.cpp / pto_orchestrator.cpp / scheduler/pto_scheduler.cpp. - AICPU boot becomes attach + wire + sm_handle->init + finalize. - runtime_maker.cpp pre-builds the arena image on host and rtMemcpys it into a pooled runtime-arena region; onboard + sim DeviceRunner setup_static_arena grow a third runtime_arena_size argument with matching acquire_pooled_runtime_arena (hbg path passes 0). a5-specific divergences kept: enable_l2_swimlane (bool) instead of L2PerfLevel, no dep_gen subsystem, wait_init_complete naming, alignas(64) PTO2SpscQueue queue, cache_invalidate_range + cond.retire in async_wait, RUNTIME_MAX_WORKER 108. Tests - cpput: 25/25 pass. - a5sim: trb 21/21 + host_build_graph 6/6 pass. - a2a3sim regression: trb 29/29 + host_build_graph 9/9 pass. 
--- .../platform/onboard/host/device_runner.cpp | 29 +- src/a5/platform/onboard/host/device_runner.h | 27 +- .../onboard/host/pto_runtime_c_api.cpp | 13 +- src/a5/platform/sim/host/device_runner.cpp | 24 +- src/a5/platform/sim/host/device_runner.h | 22 +- .../platform/sim/host/pto_runtime_c_api.cpp | 13 +- .../host_build_graph/runtime/runtime.h | 3 +- .../aicpu/aicpu_executor.cpp | 64 +++- .../host/runtime_maker.cpp | 66 +++- .../runtime/pto_orchestrator.cpp | 97 +---- .../runtime/pto_orchestrator.h | 22 +- .../runtime/pto_ring_buffer.h | 16 +- .../runtime/pto_runtime2.cpp | 84 +---- .../runtime/pto_runtime2.h | 103 ++++- .../runtime/pto_runtime2_types.h | 2 +- .../runtime/pto_shared_memory.h | 73 +++- .../runtime/pto_tensormap.h | 22 +- .../runtime/runtime.h | 43 ++- .../runtime/scheduler/pto_scheduler.cpp | 147 -------- .../runtime/scheduler/pto_scheduler.h | 53 ++- .../runtime/shared/pto_runtime2_init.cpp | 355 ++++++++++++++++++ .../runtime/shared/pto_shared_memory.cpp | 17 + .../runtime/{ => shared}/pto_tensormap.cpp | 48 ++- .../runtime/shared/runtime.cpp | 9 + tests/ut/cpp/a5/test_scheduler_state.cpp | 3 +- tests/ut/cpp/a5/test_task_allocator.cpp | 5 +- tests/ut/cpp/a5/test_task_state.cpp | 3 +- tests/ut/cpp/a5/test_wiring.cpp | 3 +- 28 files changed, 935 insertions(+), 431 deletions(-) create mode 100644 src/a5/runtime/tensormap_and_ringbuffer/runtime/shared/pto_runtime2_init.cpp rename src/a5/runtime/tensormap_and_ringbuffer/runtime/{ => shared}/pto_tensormap.cpp (82%) diff --git a/src/a5/platform/onboard/host/device_runner.cpp b/src/a5/platform/onboard/host/device_runner.cpp index 38242555d..377e0b8eb 100644 --- a/src/a5/platform/onboard/host/device_runner.cpp +++ b/src/a5/platform/onboard/host/device_runner.cpp @@ -195,29 +195,41 @@ static int prof_free_cb(void *dev_ptr) { return rtFree(dev_ptr); } DeviceRunner::~DeviceRunner() { finalize(); } -int DeviceRunner::setup_static_arena(size_t gm_heap_size, size_t gm_sm_size) { +int 
DeviceRunner::setup_static_arena(size_t gm_heap_size, size_t gm_sm_size, size_t runtime_arena_size) { if (static_arena_.is_committed()) { - if (gm_heap_size <= cached_gm_heap_size_ && gm_sm_size <= cached_gm_sm_size_) return 0; + // Idempotent for the production case (sizes do not change across a + // worker's lifetime). If a caller asks for a larger layout, redo it. + if (gm_heap_size <= cached_gm_heap_size_ && gm_sm_size <= cached_gm_sm_size_ && + runtime_arena_size <= cached_runtime_arena_size_) { + return 0; + } static_arena_.release(); gm_heap_region_off_ = SIZE_MAX; gm_sm_region_off_ = SIZE_MAX; + runtime_arena_region_off_ = SIZE_MAX; cached_gm_heap_size_ = 0; cached_gm_sm_size_ = 0; + cached_runtime_arena_size_ = 0; } gm_heap_region_off_ = static_arena_.reserve(gm_heap_size, DeviceArena::kDefaultBaseAlign); gm_sm_region_off_ = static_arena_.reserve(gm_sm_size, DeviceArena::kDefaultBaseAlign); + if (runtime_arena_size > 0) { + runtime_arena_region_off_ = static_arena_.reserve(runtime_arena_size, DeviceArena::kDefaultBaseAlign); + } if (static_arena_.commit(DeviceArena::kDefaultBaseAlign) == nullptr) { - // Roll back the two reserves: commit() failure leaves committed_=false, + // Roll back the reserves: commit() failure leaves committed_=false, // so the next entry would skip the release branch and stack new // reserves on top of the stale cursor. release() is idempotent on a // never-committed arena (just zeroes cursor_ / region_count_). 
static_arena_.release(); gm_heap_region_off_ = SIZE_MAX; gm_sm_region_off_ = SIZE_MAX; + runtime_arena_region_off_ = SIZE_MAX; return -1; } cached_gm_heap_size_ = gm_heap_size; cached_gm_sm_size_ = gm_sm_size; + cached_runtime_arena_size_ = runtime_arena_size; return 0; } @@ -231,6 +243,15 @@ void *DeviceRunner::acquire_pooled_gm_sm() { return static_arena_.region_ptr(gm_sm_region_off_); } +void *DeviceRunner::acquire_pooled_runtime_arena() { + if (!static_arena_.is_committed()) return nullptr; + // hbg calls setup_static_arena(...,0) and never reserves a runtime-arena + // region — fail loudly if a caller asks for it anyway, rather than + // returning region_ptr(SIZE_MAX) (base + SIZE_MAX is undefined). + if (runtime_arena_region_off_ == SIZE_MAX) return nullptr; + return static_arena_.region_ptr(runtime_arena_region_off_); +} + std::thread DeviceRunner::create_thread(std::function fn) { int dev_id = device_id_; return std::thread([dev_id, fn = std::move(fn)]() { @@ -1045,8 +1066,10 @@ int DeviceRunner::finalize() { static_arena_.release(); gm_heap_region_off_ = SIZE_MAX; gm_sm_region_off_ = SIZE_MAX; + runtime_arena_region_off_ = SIZE_MAX; cached_gm_heap_size_ = 0; cached_gm_sm_size_ = 0; + cached_runtime_arena_size_ = 0; // Free all remaining allocations (including handshake buffer and binGmAddr) mem_alloc_.finalize(); diff --git a/src/a5/platform/onboard/host/device_runner.h b/src/a5/platform/onboard/host/device_runner.h index a07ab28bb..9edad84fa 100644 --- a/src/a5/platform/onboard/host/device_runner.h +++ b/src/a5/platform/onboard/host/device_runner.h @@ -179,19 +179,30 @@ class DeviceRunner { /** * Lay out and commit the per-Worker static device arena that backs the - * PTO2 GM heap and PTO2 shared memory in a single underlying allocation. - * Must be called before acquire_pooled_gm_heap / acquire_pooled_gm_sm. - * Idempotent on identical sizes. Returns 0 on success, -1 on failure. 
+ * PTO2 GM heap, the PTO2 shared memory, and the trb prebuilt runtime + * arena in a single underlying allocation. Must be called before any + * acquire_pooled_*. Idempotent on identical sizes. `runtime_arena_size` + * is 0 for the hbg path (no prebuilt runtime arena). Returns 0 on + * success, -1 on failure. */ - int setup_static_arena(size_t gm_heap_size, size_t gm_sm_size); + int setup_static_arena(size_t gm_heap_size, size_t gm_sm_size, size_t runtime_arena_size); /** - * Return the pooled GM heap / PTO2 SM pointer. setup_static_arena must - * have been called earlier in this Worker; otherwise these return - * nullptr. Pointers are stable for the lifetime of the Worker. + * Return the pooled GM heap / PTO2 SM / runtime arena pointer. + * setup_static_arena must have been called earlier in this Worker; + * otherwise these return nullptr. All pointers are stable for the + * Worker's lifetime; the single underlying device buffer is released in + * `finalize()`. + * + * acquire_pooled_runtime_arena() is trb-only — the runtime arena region + * is only reserved when setup_static_arena was called with + * runtime_arena_size > 0. hbg's runtime_maker never calls this; doing so + * after setup_static_arena(...,0) returns nullptr (no region was reserved), + * not an undefined pointer. Keep the call site discipline at the runtime_maker layer. */ void *acquire_pooled_gm_heap(); void *acquire_pooled_gm_sm(); + void *acquire_pooled_runtime_arena(); /** * Create a thread bound to this device. @@ -523,10 +534,12 @@ class DeviceRunner { DeviceArena static_arena_; size_t gm_heap_region_off_{SIZE_MAX}; size_t gm_sm_region_off_{SIZE_MAX}; + size_t runtime_arena_region_off_{SIZE_MAX}; // Cached sizes for setup_static_arena's "fits" check — avoids calling // region_size() on the arena's public API for the two regions we own. 
size_t cached_gm_heap_size_{0}; size_t cached_gm_sm_size_{0}; + size_t cached_runtime_arena_size_{0}; // Device resources rtStream_t stream_aicpu_{nullptr}; diff --git a/src/a5/platform/onboard/host/pto_runtime_c_api.cpp b/src/a5/platform/onboard/host/pto_runtime_c_api.cpp index 0cc17c81f..1a2bb32a9 100644 --- a/src/a5/platform/onboard/host/pto_runtime_c_api.cpp +++ b/src/a5/platform/onboard/host/pto_runtime_c_api.cpp @@ -108,9 +108,9 @@ static uint64_t upload_chip_callable_buffer_wrapper(const void *callable) { } } -static int setup_static_arena_wrapper(size_t gm_heap_size, size_t gm_sm_size) { +static int setup_static_arena_wrapper(size_t gm_heap_size, size_t gm_sm_size, size_t runtime_arena_size) { try { - return current_runner()->setup_static_arena(gm_heap_size, gm_sm_size); + return current_runner()->setup_static_arena(gm_heap_size, gm_sm_size, runtime_arena_size); } catch (...) { return -1; } @@ -132,6 +132,14 @@ static void *acquire_pooled_gm_sm_wrapper() { } } +static void *acquire_pooled_runtime_arena_wrapper() { + try { + return current_runner()->acquire_pooled_runtime_arena(); + } catch (...) 
{ + return nullptr; + } +} + /* =========================================================================== * Public C API (resolved by ChipWorker via dlsym) * =========================================================================== */ @@ -426,6 +434,7 @@ int run_prepared( r->host_api.setup_static_arena = setup_static_arena_wrapper; r->host_api.acquire_pooled_gm_heap = acquire_pooled_gm_heap_wrapper; r->host_api.acquire_pooled_gm_sm = acquire_pooled_gm_sm_wrapper; + r->host_api.acquire_pooled_runtime_arena = acquire_pooled_runtime_arena_wrapper; r->host_api.upload_chip_callable_buffer = upload_chip_callable_buffer_wrapper; // Restore kernel addrs + orch symbol names + active_callable_id; the diff --git a/src/a5/platform/sim/host/device_runner.cpp b/src/a5/platform/sim/host/device_runner.cpp index c0d26fbe1..8cbac796c 100644 --- a/src/a5/platform/sim/host/device_runner.cpp +++ b/src/a5/platform/sim/host/device_runner.cpp @@ -112,29 +112,39 @@ static int prof_free_cb(void *dev_ptr) { DeviceRunner::~DeviceRunner() { finalize(); } -int DeviceRunner::setup_static_arena(size_t gm_heap_size, size_t gm_sm_size) { +int DeviceRunner::setup_static_arena(size_t gm_heap_size, size_t gm_sm_size, size_t runtime_arena_size) { if (static_arena_.is_committed()) { - if (gm_heap_size <= cached_gm_heap_size_ && gm_sm_size <= cached_gm_sm_size_) return 0; + if (gm_heap_size <= cached_gm_heap_size_ && gm_sm_size <= cached_gm_sm_size_ && + runtime_arena_size <= cached_runtime_arena_size_) { + return 0; + } static_arena_.release(); gm_heap_region_off_ = SIZE_MAX; gm_sm_region_off_ = SIZE_MAX; + runtime_arena_region_off_ = SIZE_MAX; cached_gm_heap_size_ = 0; cached_gm_sm_size_ = 0; + cached_runtime_arena_size_ = 0; } gm_heap_region_off_ = static_arena_.reserve(gm_heap_size, DeviceArena::kDefaultBaseAlign); gm_sm_region_off_ = static_arena_.reserve(gm_sm_size, DeviceArena::kDefaultBaseAlign); + if (runtime_arena_size > 0) { + runtime_arena_region_off_ = 
static_arena_.reserve(runtime_arena_size, DeviceArena::kDefaultBaseAlign); + } if (static_arena_.commit(DeviceArena::kDefaultBaseAlign) == nullptr) { - // Roll back the two reserves: commit() failure leaves committed_=false, + // Roll back the reserves: commit() failure leaves committed_=false, // so the next entry would skip the release branch and stack new // reserves on top of the stale cursor. release() is idempotent on a // never-committed arena (just zeroes cursor_ / region_count_). static_arena_.release(); gm_heap_region_off_ = SIZE_MAX; gm_sm_region_off_ = SIZE_MAX; + runtime_arena_region_off_ = SIZE_MAX; return -1; } cached_gm_heap_size_ = gm_heap_size; cached_gm_sm_size_ = gm_sm_size; + cached_runtime_arena_size_ = runtime_arena_size; return 0; } @@ -148,6 +158,12 @@ void *DeviceRunner::acquire_pooled_gm_sm() { return static_arena_.region_ptr(gm_sm_region_off_); } +void *DeviceRunner::acquire_pooled_runtime_arena() { + if (!static_arena_.is_committed()) return nullptr; + if (runtime_arena_region_off_ == SIZE_MAX) return nullptr; + return static_arena_.region_ptr(runtime_arena_region_off_); +} + std::thread DeviceRunner::create_thread(std::function fn) { int dev_id = device_id_; return std::thread([dev_id, fn = std::move(fn)]() { @@ -935,8 +951,10 @@ int DeviceRunner::finalize() { static_arena_.release(); gm_heap_region_off_ = SIZE_MAX; gm_sm_region_off_ = SIZE_MAX; + runtime_arena_region_off_ = SIZE_MAX; cached_gm_heap_size_ = 0; cached_gm_sm_size_ = 0; + cached_runtime_arena_size_ = 0; // Free all remaining allocations mem_alloc_.finalize(); diff --git a/src/a5/platform/sim/host/device_runner.h b/src/a5/platform/sim/host/device_runner.h index 0aa6e6fa1..f4fe44121 100644 --- a/src/a5/platform/sim/host/device_runner.h +++ b/src/a5/platform/sim/host/device_runner.h @@ -77,19 +77,25 @@ class DeviceRunner { /** * Lay out and commit the per-Worker static device arena that backs the - * PTO2 GM heap and PTO2 shared memory in a single underlying allocation. 
- * Must be called before acquire_pooled_gm_heap / acquire_pooled_gm_sm. - * Idempotent on identical sizes. Returns 0 on success, -1 on failure. + * PTO2 GM heap, the PTO2 shared memory, and the trb prebuilt runtime + * arena in a single underlying allocation. Must be called before any + * acquire_pooled_*. Idempotent on identical sizes. `runtime_arena_size` + * is 0 for the hbg path (no prebuilt runtime arena). Returns 0 on + * success, -1 on failure. */ - int setup_static_arena(size_t gm_heap_size, size_t gm_sm_size); + int setup_static_arena(size_t gm_heap_size, size_t gm_sm_size, size_t runtime_arena_size); /** - * Return the pooled GM heap / PTO2 SM pointer. setup_static_arena must - * have been called earlier in this Worker; otherwise these return - * nullptr. Pointers are stable for the lifetime of the Worker. + * Return the pooled GM heap / PTO2 SM / runtime arena pointer. + * setup_static_arena must have been called earlier in this Worker; + * otherwise these return nullptr. + * + * acquire_pooled_runtime_arena() is trb-only — the region exists only + * when setup_static_arena was called with runtime_arena_size > 0. */ void *acquire_pooled_gm_heap(); void *acquire_pooled_gm_sm(); + void *acquire_pooled_runtime_arena(); /** * Create a thread bound to this device. @@ -292,10 +298,12 @@ class DeviceRunner { DeviceArena static_arena_; size_t gm_heap_region_off_{SIZE_MAX}; size_t gm_sm_region_off_{SIZE_MAX}; + size_t runtime_arena_region_off_{SIZE_MAX}; // Cached sizes for setup_static_arena's "fits" check — avoids calling // region_size() on the arena's public API for the two regions we own. 
size_t cached_gm_heap_size_{0}; size_t cached_gm_sm_size_{0}; + size_t cached_runtime_arena_size_{0}; // Simulation state (no actual device resources) KernelArgs kernel_args_; diff --git a/src/a5/platform/sim/host/pto_runtime_c_api.cpp b/src/a5/platform/sim/host/pto_runtime_c_api.cpp index 81e9b138f..f2dc10b4e 100644 --- a/src/a5/platform/sim/host/pto_runtime_c_api.cpp +++ b/src/a5/platform/sim/host/pto_runtime_c_api.cpp @@ -103,9 +103,9 @@ static uint64_t upload_chip_callable_buffer_wrapper(const void *callable) { } } -static int setup_static_arena_wrapper(size_t gm_heap_size, size_t gm_sm_size) { +static int setup_static_arena_wrapper(size_t gm_heap_size, size_t gm_sm_size, size_t runtime_arena_size) { try { - return current_runner()->setup_static_arena(gm_heap_size, gm_sm_size); + return current_runner()->setup_static_arena(gm_heap_size, gm_sm_size, runtime_arena_size); } catch (...) { return -1; } @@ -127,6 +127,14 @@ static void *acquire_pooled_gm_sm_wrapper() { } } +static void *acquire_pooled_runtime_arena_wrapper() { + try { + return current_runner()->acquire_pooled_runtime_arena(); + } catch (...) 
{ + return nullptr; + } +} + /* =========================================================================== * Public C API (resolved by ChipWorker via dlsym) * =========================================================================== */ @@ -328,6 +336,7 @@ int run_prepared( r->host_api.setup_static_arena = setup_static_arena_wrapper; r->host_api.acquire_pooled_gm_heap = acquire_pooled_gm_heap_wrapper; r->host_api.acquire_pooled_gm_sm = acquire_pooled_gm_sm_wrapper; + r->host_api.acquire_pooled_runtime_arena = acquire_pooled_runtime_arena_wrapper; r->host_api.upload_chip_callable_buffer = upload_chip_callable_buffer_wrapper; auto bind_result = runner->bind_prepared_callable_to_runtime(*r, callable_id); diff --git a/src/a5/runtime/host_build_graph/runtime/runtime.h b/src/a5/runtime/host_build_graph/runtime/runtime.h index b9edf7020..25c6c13f4 100644 --- a/src/a5/runtime/host_build_graph/runtime/runtime.h +++ b/src/a5/runtime/host_build_graph/runtime/runtime.h @@ -146,9 +146,10 @@ struct HostApi { // pto_runtime_c_api.cpp can populate the same HostApi struct regardless of // which runtime variant it is built against. Unset for this variant; do // not call. - int (*setup_static_arena)(size_t gm_heap_size, size_t gm_sm_size); + int (*setup_static_arena)(size_t gm_heap_size, size_t gm_sm_size, size_t runtime_arena_size); void *(*acquire_pooled_gm_heap)(); void *(*acquire_pooled_gm_sm)(); + void *(*acquire_pooled_runtime_arena)(); // Single-shot upload of the entire ChipCallable buffer. `callable` is a // `const ChipCallable *` (declared void* to avoid pulling task_interface // headers into runtime.h). 
DeviceRunner walks child_offsets_ to compute diff --git a/src/a5/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp b/src/a5/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp index bcea9b09e..49d55380f 100644 --- a/src/a5/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp +++ b/src/a5/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp @@ -125,8 +125,10 @@ struct AicpuExecutor { std::atomic finished_count_{0}; std::atomic runtime_init_ready_{false}; - // Per-Worker arena backing the PTO2Runtime + sm_handle + orch/sched/mailbox - // sub-regions (created in runtime_create_from_sm, released in runtime_destroy). + // Per-Worker arena attaching to the pooled prebuilt runtime image. Host + // populates the layout + data on its own arena, rtMemcpys into a pooled + // device buffer owned by DeviceRunner, and the AICPU attach()es to that + // buffer on each boot — no AICPU-side commit, no per-boot rtMalloc. // Default-constructed: libc-backed backend, no ctx. DeviceArena runtime_arena_; @@ -466,29 +468,61 @@ int32_t AicpuExecutor::run(Runtime *runtime) { static_cast(task_window_size), static_cast(heap_size), dep_pool_capacity ); - void *sm_ptr = runtime->get_gm_sm_ptr(); - void *gm_heap = runtime->get_gm_heap_ptr(); + // gm_heap pointer / dep_pool_capacity are encoded into the prebuilt + // runtime arena image at host build time, so we no longer fetch + // them here. They remain on the host Runtime instance and on the + // PTO2Runtime header for diagnostic purposes only. + (void)dep_pool_capacity; + void *sm_ptr = runtime->get_gm_sm_ptr(); uint64_t sm_size = PTO2SharedMemoryHandle::calculate_size(task_window_size); - rt = runtime_create_from_sm( - PTO2_MODE_EXECUTE, sm_ptr, sm_size, task_window_size, gm_heap, heap_size, runtime_arena_, - dep_pool_capacity - ); - if (!rt) { - LOG_ERROR("Thread %d: Failed to create PTO2Runtime", thread_idx); - // Unblock scheduler threads before returning so they don't spin forever. + + // Prebuilt-arena fast path. 
Host has pre-populated the entire + // runtime arena (PTO2Runtime + orchestrator/scheduler/tensor_map + // sub-regions + sm_handle wrapper + mailbox) and uploaded it via + // rtMemcpy into the pooled runtime_arena buffer. We attach to it, + // wire arena-internal pointers to their device addresses, reset + // the SM, and finalize the few device-only fields the host could + // not know at image-build time. + void *prebuilt_arena = runtime->get_prebuilt_arena_base(); + size_t off_runtime = runtime->get_prebuilt_runtime_offset(); + if (prebuilt_arena == nullptr) { + LOG_ERROR("Thread %d: prebuilt_arena_base is null", thread_idx); + runtime_init_ready_.store(true, std::memory_order_release); + return -1; + } + runtime_arena_.attach(prebuilt_arena, DeviceArena::kDefaultBaseAlign); + rt = reinterpret_cast(static_cast(prebuilt_arena) + off_runtime); + + // Wire every arena-internal pointer field (host wrote host-mirror + // addresses; we overwrite them with device addresses). + runtime_wire_arena_pointers(runtime_arena_, rt->prebuilt_layout, rt); + + // Reset SM state. setup_pointers + init_header_per_ring restore + // ring flow-control counters, layout metadata, error flags, and + // the per-slot ring->slot_states[] (bind_ring + reset_for_reuse + + // fanin_count/active_mask zero — previously done inside + // RingSchedState::init). + memset(rt->sm_handle, 0, sizeof(*rt->sm_handle)); + if (!rt->sm_handle->init(sm_ptr, sm_size, task_window_size, heap_size)) { + LOG_ERROR("Thread %d: sm_handle->init failed", thread_idx); runtime_init_ready_.store(true, std::memory_order_release); return -1; } + // AICore completion mailbox lives in the arena; reset it each + // boot so stale completion notifications from a previous run do + // not leak. + memset(rt->aicore_mailbox, 0, sizeof(*rt->aicore_mailbox)); + + // Fill ops / core counts (host can't resolve s_runtime_ops's + // device address nor know the SchedulerContext's core fan-out). 
+ runtime_finalize_after_wire(rt, sched_ctx_.aic_count(), sched_ctx_.aiv_count()); + #if PTO2_PROFILING rt->orchestrator.l2_perf_level = get_l2_perf_level(); #endif - // Total core counts = aic_count_ / aiv_count_ (set once at runtime init). - rt->orchestrator.total_cluster_count = sched_ctx_.aic_count(); - rt->orchestrator.total_aiv_count = sched_ctx_.aiv_count(); - // With multi-ring, slot_states are per-ring inside the scheduler. runtime->set_slot_states_ptr(nullptr); diff --git a/src/a5/runtime/tensormap_and_ringbuffer/host/runtime_maker.cpp b/src/a5/runtime/tensormap_and_ringbuffer/host/runtime_maker.cpp index 0c7ac3872..9e1d00841 100644 --- a/src/a5/runtime/tensormap_and_ringbuffer/host/runtime_maker.cpp +++ b/src/a5/runtime/tensormap_and_ringbuffer/host/runtime_maker.cpp @@ -36,8 +36,10 @@ #include #include "../common/pto_runtime_status.h" +#include "../runtime/pto_runtime2.h" #include "../runtime/pto_shared_memory.h" #include "../runtime/runtime.h" +#include "device_arena.h" #include "callable.h" #include "common/platform_config.h" #include "common/unified_log.h" @@ -271,15 +273,27 @@ extern "C" int bind_prepared_to_runtime_impl( uint64_t eff_heap_size = runtime->heap_size ? runtime->heap_size : PTO2_HEAP_SIZE; uint64_t eff_task_window_size = runtime->task_window_size ? runtime->task_window_size : PTO2_TASK_WINDOW_SIZE; - // Lay out the per-Worker static device arena. GM heap (orchestrator output - // buffers, all rings combined) and PTO2 shared memory live in a single - // backing allocation; setup_static_arena reserves both regions and - // commits in one shot. Owned by DeviceRunner across runs — do NOT record - // in tensor_pairs_; the free is deferred to DeviceRunner::finalize(). + // Lay out the per-Worker static device arena. GM heap, PTO2 shared memory, + // and the prebuilt runtime arena all live in a single backing allocation; + // setup_static_arena reserves the three regions and commits in one shot. 
+ // Owned by DeviceRunner across runs — do NOT record in tensor_pairs_; the + // free is deferred to DeviceRunner::finalize(). The runtime-arena size is + // determined by replaying the reserve sequence on a host-side arena. uint64_t total_heap_size = eff_heap_size * PTO2_MAX_RING_DEPTH; uint64_t sm_size = PTO2SharedMemoryHandle::calculate_size(eff_task_window_size); + int32_t eff_dep_pool_capacity = + runtime->dep_pool_size ? static_cast(runtime->dep_pool_size) : PTO2_DEP_LIST_POOL_SIZE; + + int64_t t_prebuilt_start = _now_ms(); + DeviceArena host_arena; // libc malloc backend by default + PTO2RuntimeArenaLayout layout = runtime_reserve_layout(host_arena, eff_task_window_size, eff_dep_pool_capacity); + if (host_arena.commit(DeviceArena::kDefaultBaseAlign) == nullptr) { + LOG_ERROR("Failed to commit host arena for prebuilt runtime image"); + return -1; + } + int64_t t_setup_start = _now_ms(); - if (runtime->host_api.setup_static_arena(total_heap_size, sm_size) != 0) { + if (runtime->host_api.setup_static_arena(total_heap_size, sm_size, layout.arena_size) != 0) { LOG_ERROR("Failed to setup pooled static arena"); return -1; } @@ -303,9 +317,48 @@ extern "C" int bind_prepared_to_runtime_impl( } runtime->set_gm_sm_ptr(sm_ptr); + void *runtime_arena_dev = runtime->host_api.acquire_pooled_runtime_arena(); + if (runtime_arena_dev == nullptr) { + LOG_ERROR("Failed to acquire pooled runtime arena"); + return -1; + } + // Set up device orchestration state runtime->set_orch_args(device_args); + // ------------------------------------------------------------------------- + // Build the prebuilt runtime-arena image on host. + // + // We pre-compute every byte the AICPU's runtime arena would otherwise have + // to write at boot: layout offsets, sub-structure init data, and pointers + // back to the SM / GM heap. Then we rtMemcpy the image into the pooled + // runtime-arena region that DeviceRunner keeps alive across runs. 
AICPU + // boot becomes attach + wire (cheap pointer fixup) + sm_handle->init (SM + // reset) + a handful of device-only field fixups. + // ------------------------------------------------------------------------- + PTO2Runtime *rt = + runtime_init_data_from_layout(host_arena, layout, PTO2_MODE_EXECUTE, sm_ptr, sm_size, gm_heap, eff_heap_size); + if (rt == nullptr) { + LOG_ERROR("runtime_init_data_from_layout failed"); + return -1; + } + runtime_wire_arena_pointers(host_arena, layout, rt); + + // Stash the layout inside the PTO2Runtime image so the AICPU can recover + // every arena-internal offset after rtMemcpy. The runtime arena's device + // base does NOT travel in this image — it's on the host Runtime + // (set_prebuilt_arena below), since the AICPU needs that pointer + // *before* it can dereference the image. + rt->prebuilt_layout = layout; + + int rc_upload = runtime->host_api.copy_to_device(runtime_arena_dev, host_arena.base(), layout.arena_size); + if (rc_upload != 0) { + LOG_ERROR("Failed to rtMemcpy prebuilt runtime arena to device (rc=%d)", rc_upload); + return -1; + } + runtime->set_prebuilt_arena(runtime_arena_dev, layout.off_runtime); + int64_t t_prebuilt_end = _now_ms(); + LOG_INFO_V0("Device orchestration ready: %d tensors + %d scalars", tensor_count, scalar_count); int64_t t_total_end = _now_ms(); @@ -313,6 +366,7 @@ extern "C" int bind_prepared_to_runtime_impl( LOG_INFO_V0("TIMING: static_arena_setup = %" PRId64 "ms", t_setup_end - t_setup_start); LOG_INFO_V0("TIMING: gm_heap_acquire = %" PRId64 "ms", t_heap_end - t_heap_start); LOG_INFO_V0("TIMING: shared_mem_acquire = %" PRId64 "ms", t_sm_end - t_sm_start); + LOG_INFO_V0("TIMING: prebuilt_runtime_arena = %" PRId64 "ms", t_prebuilt_end - t_prebuilt_start); LOG_INFO_V0("TIMING: total_init_runtime_impl = %" PRId64 "ms", t_total_end - t_total_start); return 0; diff --git a/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_orchestrator.cpp 
b/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_orchestrator.cpp index 056c2ee64..48368cf6a 100644 --- a/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_orchestrator.cpp +++ b/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_orchestrator.cpp @@ -331,18 +331,19 @@ static bool prepare_task( // Re-bind payload/task pointers each submit. Value is per-slot constant // (same as &task_payloads[slot] / &task_descriptors[slot]), but writing - // here lets RingSchedState::init() skip the O(window_size) bind loop. - // Both writes hit the same 64B slot_state cache line we're about to - // dirty below, so the extra cost is two stores on an already-hot line. - // Must precede the scheduler wiring.queue.push at the end of - // submit_task_common — that push is the first read of slot_state->task / - // slot_state->payload by another thread. + // here lets RingSchedState::init_data_from_layout() skip the + // O(window_size) bind loop. Both writes hit the same 64B slot_state + // cache line we're about to dirty below, so the extra cost is two + // stores on an already-hot line. Must precede the scheduler + // wiring.queue.push at the end of submit_task_common — that push is + // the first read of slot_state->task / slot_state->payload by another + // thread. out->slot_state->bind_buffers(out->payload, out->task); // Fields already reset by advance_ring_pointers (eager reset after CONSUMED): // fanout_lock=0, fanout_count=1, fanout_head=nullptr, // fanin_refcount=0, fanout_refcount=0, completed_subtasks=0, next_block_idx=0 - // Fields immutable after RingSchedState::init(): + // Fields immutable after RingSchedState::init_data_from_layout(): // ring_id // task_state left as CONSUMED by eager reset (safe for stale wait_for_tensor // observers); set to PENDING here when orchestrator actually reuses the slot. 
@@ -358,88 +359,6 @@ static bool prepare_task( return true; } -// ============================================================================= -// Orchestrator Initialization -// ============================================================================= - -PTO2OrchestratorLayout PTO2OrchestratorState::reserve_layout( - DeviceArena &arena, const int32_t task_window_sizes[PTO2_MAX_RING_DEPTH], int32_t dep_pool_capacity -) { - PTO2OrchestratorLayout layout{}; - layout.dep_pool_capacity = dep_pool_capacity; - layout.scope_tasks_cap = PTO2_SCOPE_TASKS_CAP; - layout.scope_stack_capacity = PTO2_MAX_SCOPE_DEPTH; - - for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) { - const size_t fanin_pool_bytes = - PTO2_ALIGN_UP(static_cast(dep_pool_capacity) * sizeof(PTO2FaninSpillEntry), PTO2_ALIGN_SIZE); - layout.off_fanin_pool[r] = arena.reserve(fanin_pool_bytes, PTO2_ALIGN_SIZE); - } - layout.off_scope_tasks = arena.reserve( - static_cast(layout.scope_tasks_cap) * sizeof(PTO2TaskSlotState *), alignof(PTO2TaskSlotState *) - ); - layout.off_scope_begins = - arena.reserve(static_cast(layout.scope_stack_capacity) * sizeof(int32_t), alignof(int32_t)); - layout.tensor_map = PTO2TensorMap::reserve_layout_default(arena, task_window_sizes); - return layout; -} - -bool PTO2OrchestratorState::init_from_layout( - const PTO2OrchestratorLayout &layout, DeviceArena &arena, PTO2SharedMemoryHeader *sm_header_arg, void *gm_heap, - uint64_t heap_size -) { - auto *orch = this; - *orch = PTO2OrchestratorState{}; - - orch->sm_header = sm_header_arg; - orch->gm_heap_base = gm_heap; - orch->gm_heap_size = heap_size * PTO2_MAX_RING_DEPTH; - orch->fatal = false; - - for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) { - void *ring_heap_base = reinterpret_cast(gm_heap) + r * heap_size; - auto &ring = sm_header_arg->rings[r]; - - orch->rings[r].task_allocator.init( - ring.task_descriptors, ring.task_window_size, &ring.fc.current_task_index, &ring.fc.last_task_alive, - ring_heap_base, heap_size, 
&sm_header_arg->orch_error_code - ); - - const size_t fanin_pool_bytes = - PTO2_ALIGN_UP(static_cast(layout.dep_pool_capacity) * sizeof(PTO2FaninSpillEntry), PTO2_ALIGN_SIZE); - auto *fanin_entries = static_cast(arena.region_ptr(layout.off_fanin_pool[r])); - memset(fanin_entries, 0, fanin_pool_bytes); - orch->rings[r].fanin_pool.init(fanin_entries, layout.dep_pool_capacity, &sm_header_arg->orch_error_code); - } - - if (!orch->tensor_map.init_from_layout(layout.tensor_map, arena)) { - return false; - } - orch->tensor_map.orch = orch; - - orch->scope_tasks = static_cast(arena.region_ptr(layout.off_scope_tasks)); - orch->scope_begins = static_cast(arena.region_ptr(layout.off_scope_begins)); - orch->scope_tasks_size = 0; - orch->scope_tasks_capacity = layout.scope_tasks_cap; - orch->scope_stack_top = -1; - orch->scope_stack_capacity = layout.scope_stack_capacity; - orch->manual_begin_depth = PTO2_MAX_SCOPE_DEPTH; - - return true; -} - -void PTO2OrchestratorState::destroy() { - auto *orch = this; - orch->tensor_map.destroy(); - for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) { - orch->rings[r].fanin_pool.base = nullptr; - } - orch->scope_tasks = nullptr; - orch->scope_begins = nullptr; -} - -void PTO2OrchestratorState::set_scheduler(PTO2SchedulerState *scheduler) { this->scheduler = scheduler; } - // ============================================================================= // Scope Management // ============================================================================= diff --git a/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_orchestrator.h b/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_orchestrator.h index e24b85b4e..9a73714c0 100644 --- a/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_orchestrator.h +++ b/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_orchestrator.h @@ -133,19 +133,29 @@ struct PTO2OrchestratorState { // === Cold-path API (defined in pto_orchestrator.cpp) === // Phase 1: declare every sub-region (per-ring fanin pool, scope 
arrays, - // tensor_map sub-layout) on the supplied arena. + // tensor_map sub-layout) on the supplied arena. task_window_sizes feeds + // the nested tensor_map layout. Returned layout is consumed by + // init_data_from_layout. static PTO2OrchestratorLayout reserve_layout( DeviceArena &arena, const int32_t task_window_sizes[PTO2_MAX_RING_DEPTH], int32_t dep_pool_capacity = PTO2_DEP_LIST_POOL_SIZE ); - // Phase 3: bind region pointers, wire per-ring task_allocator + fanin_pool - // and tensor_map. Arena must be committed. - bool init_from_layout( - const PTO2OrchestratorLayout &layout, DeviceArena &arena, PTO2SharedMemoryHeader *sm_header, void *gm_heap, - uint64_t heap_size + // Phase 3a: write everything *except* arena-internal pointer fields. + // sm_dev_base is the SM device address (only stored, never dereferenced); + // task_window_size feeds the per-ring SM address arithmetic. Safe to call + // on a host arena that holds the prebuilt image. + bool init_data_from_layout( + const PTO2OrchestratorLayout &layout, DeviceArena &arena, void *sm_dev_base, void *gm_heap, uint64_t heap_size, + uint64_t task_window_size ); + // Phase 3b: write the arena-internal pointer fields (scope_tasks, + // scope_begins, rings[].fanin_pool.base, tensor_map.{buckets,entry_pool, + // free_entry_list,task_entry_heads}, scheduler reference). + // Idempotent — host runs once on the image, AICPU runs once after attach. + void wire_arena_pointers(const PTO2OrchestratorLayout &layout, DeviceArena &arena, PTO2SchedulerState *scheduler); + // Forget pointers; arena owns the backing buffers. 
void destroy(); void set_scheduler(PTO2SchedulerState *scheduler); diff --git a/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_ring_buffer.h b/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_ring_buffer.h index 5a3e3d3d3..abd2a7510 100644 --- a/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_ring_buffer.h +++ b/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_ring_buffer.h @@ -68,10 +68,22 @@ class PTO2TaskAllocator { public: /** * Initialize the allocator with task ring and heap ring resources. + * + * All pointer arguments are device addresses (live in SM / GM heap); this + * function only stores them, no dereferences, so it is safe to invoke + * from host code that constructs a prebuilt arena image. + * + * Production callers leave `initial_local_task_id` at 0: the SM ring + * flow-control counters that current_index_ptr / last_alive_ptr point at + * start at zero (PTO2RingFlowControl::init() runs on the AICPU during SM + * reset), so we keep local_task_id_ aligned with that without reading the + * SM. Tests that drive SM state directly may pass a non-zero seed to + * exercise corner cases like task IDs near INT32_MAX. 
*/ void init( PTO2TaskDescriptor *descriptors, int32_t window_size, std::atomic *current_index_ptr, - std::atomic *last_alive_ptr, void *heap_base, uint64_t heap_size, std::atomic *error_code_ptr + std::atomic *last_alive_ptr, void *heap_base, uint64_t heap_size, std::atomic *error_code_ptr, + int32_t initial_local_task_id = 0 ) { descriptors_ = descriptors; window_size_ = window_size; @@ -81,7 +93,7 @@ class PTO2TaskAllocator { heap_base_ = heap_base; heap_size_ = heap_size; error_code_ptr_ = error_code_ptr; - local_task_id_ = current_index_ptr->load(std::memory_order_relaxed); + local_task_id_ = initial_local_task_id; heap_top_ = 0; heap_tail_ = 0; last_alive_seen_ = 0; diff --git a/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_runtime2.cpp b/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_runtime2.cpp index c801d5c15..f39bac365 100644 --- a/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_runtime2.cpp +++ b/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_runtime2.cpp @@ -249,81 +249,19 @@ static const PTO2RuntimeOps s_runtime_ops = { }; // ============================================================================= -// Runtime Creation and Destruction +// Runtime Lifecycle (AICPU-only fixup) // ============================================================================= - -PTO2Runtime *runtime_create_from_sm( - PTO2RuntimeMode mode, void *sm_base, uint64_t sm_size, uint64_t task_window_size, void *gm_heap, uint64_t heap_size, - DeviceArena &arena, int32_t dep_pool_capacity -) { - if (!sm_base || sm_size == 0) return nullptr; - - // Phase 1: layout. Reserve every sub-region the runtime needs (including - // the SM handle wrapper itself) without touching memory yet. 
- int32_t task_window_sizes[PTO2_MAX_RING_DEPTH]; - for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) { - task_window_sizes[r] = static_cast(task_window_size); - } - const size_t off_sm_handle = arena.reserve(sizeof(PTO2SharedMemoryHandle), alignof(PTO2SharedMemoryHandle)); - PTO2OrchestratorLayout orch_layout = - PTO2OrchestratorState::reserve_layout(arena, task_window_sizes, dep_pool_capacity); - PTO2SchedulerLayout sched_layout = PTO2SchedulerState::reserve_layout(arena, dep_pool_capacity); - const size_t off_runtime = arena.reserve(sizeof(PTO2Runtime), PTO2_ALIGN_SIZE); - const size_t off_mailbox = arena.reserve(sizeof(AICoreCompletionMailbox), alignof(AICoreCompletionMailbox)); - - // Phase 2: single backing allocation. - if (arena.commit(DeviceArena::kDefaultBaseAlign) == nullptr) return nullptr; - - // Phase 3: bind region pointers and initialize. - PTO2Runtime *rt = static_cast(arena.region_ptr(off_runtime)); - memset(rt, 0, sizeof(*rt)); // calloc-equivalent for the runtime header. - - // Initialize the SM handle wrapper in-place on its arena region before - // anything that reads sm_handle->header (orchestrator / scheduler init). - rt->sm_handle = static_cast(arena.region_ptr(off_sm_handle)); - memset(rt->sm_handle, 0, sizeof(*rt->sm_handle)); - if (!rt->sm_handle->init(sm_base, sm_size, task_window_size, heap_size)) { - arena.release(); - return nullptr; - } - +// +// Layout / init_data / wire / destroy live in +// runtime/shared/pto_runtime2_init.cpp so the host build can pre-populate the +// prebuilt arena image. The pieces below — wiring the ops table and the +// SPMD core counts — depend on the device-side s_runtime_ops global and the +// AICPU SchedulerContext respectively, so they remain in the AICPU build. + +void runtime_finalize_after_wire(PTO2Runtime *rt, int32_t aic_count, int32_t aiv_count) { rt->ops = &s_runtime_ops; - rt->mode = mode; - rt->gm_heap = gm_heap; - rt->gm_heap_size = heap_size > 0 ? 
heap_size * PTO2_MAX_RING_DEPTH : 0; - rt->gm_heap_owned = false; - - if (!rt->orchestrator.init_from_layout(orch_layout, arena, rt->sm_handle->header, gm_heap, heap_size)) { - arena.release(); - return nullptr; - } - if (!rt->scheduler.init_from_layout(sched_layout, arena, rt->sm_handle->header)) { - rt->orchestrator.destroy(); - arena.release(); - return nullptr; - } - rt->orchestrator.set_scheduler(&rt->scheduler); - - rt->aicore_mailbox = static_cast(arena.region_ptr(off_mailbox)); - memset(rt->aicore_mailbox, 0, sizeof(*rt->aicore_mailbox)); - - return rt; -} - -void runtime_destroy(PTO2Runtime *rt, DeviceArena &arena) { - if (!rt) { - arena.release(); // safe: idempotent if nothing's committed. - return; - } - - rt->scheduler.destroy(); - rt->orchestrator.destroy(); - rt->aicore_mailbox = nullptr; // arena-owned. - rt->sm_handle = nullptr; // wrapper lives in arena; release() reclaims it. - - // arena.release() frees the single backing buffer that holds rt, - // mailbox, sm_handle, orchestrator and scheduler sub-regions in one shot. - arena.release(); + rt->orchestrator.total_cluster_count = aic_count; + rt->orchestrator.total_aiv_count = aiv_count; } void runtime_set_mode(PTO2Runtime *rt, PTO2RuntimeMode mode) { diff --git a/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_runtime2.h b/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_runtime2.h index 1da622407..460624e69 100644 --- a/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_runtime2.h +++ b/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_runtime2.h @@ -91,6 +91,30 @@ struct PTO2RuntimeOps { TaskOutputTensors (*submit_dummy_task)(PTO2Runtime *rt, const Arg &args); }; +/** + * Layout descriptor for the prebuilt runtime arena. Holds all sub-region + * offsets (orchestrator / scheduler / sm_handle wrapper / runtime header / + * AICore mailbox) plus the layout-defining capacities. 
Produced once on the + * host by runtime_reserve_layout(); consumed by runtime_init_data_from_layout + * and runtime_wire_arena_pointers. + */ +struct PTO2RuntimeArenaLayout { + size_t off_sm_handle{0}; + PTO2OrchestratorLayout orch; + PTO2SchedulerLayout sched; + size_t off_runtime{0}; + size_t off_mailbox{0}; + + // Cached parameters (re-used by init_data + wire stages). + uint64_t task_window_size{0}; + uint64_t heap_size{0}; + int32_t dep_pool_capacity{0}; + + // Total arena byte size post-commit. Used by host to size the prebuilt + // image buffer and as the rtMemcpy length. + size_t arena_size{0}; +}; + /** * PTO Runtime2 context * @@ -118,6 +142,16 @@ struct PTO2Runtime { // Statistics int64_t total_cycles; + + // Prebuilt-arena fast path metadata. Carries every offset + // wire_arena_pointers needs at AICPU boot so the AICPU can reconstruct + // all arena-internal pointer fields without re-running init_data. The + // device base of the runtime arena travels separately on the host-side + // Runtime (Runtime::prebuilt_arena_base_), since the AICPU needs it + // *before* dereferencing this image. Populated on host by + // runtime_init_data_from_layout + runtime_wire_arena_pointers; read by + // aicpu_executor.cpp. + PTO2RuntimeArenaLayout prebuilt_layout; }; // ============================================================================= @@ -125,31 +159,60 @@ struct PTO2Runtime { // ============================================================================= /** - * Create runtime from caller-provided GM SM buffer + GM heap. + * Phase 1 — declare every sub-region (sm_handle wrapper, orchestrator / + * scheduler / tensor_map / mailbox / PTO2Runtime header) on the supplied + * arena. Pure arithmetic; does not touch device memory and may run on host. + * Returns the layout descriptor; caller commits/attaches the arena before + * Phase 2/3. 
+ */ +PTO2RuntimeArenaLayout runtime_reserve_layout( + DeviceArena &arena, uint64_t task_window_size, int32_t dep_pool_capacity = PTO2_DEP_LIST_POOL_SIZE +); + +/** + * Phase 2 — write the data half of the runtime arena: standalone fields, + * memset'd arena regions, sub-structure initializers, and SM-side device + * pointers. The arena must already be committed (or attached); writes go + * into arena.base() + sub-region offsets. * - * All AICPU-side runtime state (PTO2SharedMemoryHandle wrapper, PTO2Runtime, - * AICoreCompletionMailbox, plus the orchestrator/scheduler/tensor_map - * sub-regions) is laid out on the supplied arena and committed in a single - * backing allocation. runtime_destroy() calls arena.release() once to free - * the lot. + * `sm_dev_base` / `gm_heap_dev_base` are device addresses; we only store + * them (never dereference). Safe to run on a host arena that owns a host + * mirror of the runtime image — the resulting buffer is rtMemcpy-ready. * - * @param mode Execution mode - * @param sm_base Pre-allocated SM buffer base (host-owned) - * @param sm_size Size of the SM buffer in bytes - * @param task_window_size Per-ring task window size used to lay out SM - * @param gm_heap GM heap base for output buffers (or NULL if not used) - * @param heap_size GM heap size in bytes - * @param arena Caller-owned arena that sources all runtime sub-regions. - * @return Runtime context, or NULL on failure - */ -PTO2Runtime *runtime_create_from_sm( - PTO2RuntimeMode mode, void *sm_base, uint64_t sm_size, uint64_t task_window_size, void *gm_heap, uint64_t heap_size, - DeviceArena &arena, int32_t dep_pool_capacity = PTO2_DEP_LIST_POOL_SIZE + * Returns the PTO2Runtime* that sits at layout.off_runtime within the arena. + * Caller must follow up with runtime_wire_arena_pointers; rt->ops and the + * AICore-side count fields are left untouched and must be filled by the + * AICPU at boot. 
+ */ +PTO2Runtime *runtime_init_data_from_layout( + DeviceArena &arena, const PTO2RuntimeArenaLayout &layout, PTO2RuntimeMode mode, void *sm_dev_base, uint64_t sm_size, + void *gm_heap_dev_base, uint64_t heap_size ); /** - * Destroy runtime and free all resources. arena.release() is the actual - * memory free; the rt pointer is no longer valid afterward. + * Phase 3 — wire every arena-internal pointer field (rt->sm_handle, + * rt->aicore_mailbox, orchestrator.{scope_tasks, scope_begins, scheduler, + * tensor_map.*, rings[].fanin_pool.base}, scheduler.{ready_queues, dep_pool, + * wiring.queue}) so each holds arena.base() + offset. Idempotent — runs on + * both host (writing host-mirror addresses) and AICPU (writing device + * addresses) sides. + */ +void runtime_wire_arena_pointers(DeviceArena &arena, const PTO2RuntimeArenaLayout &layout, PTO2Runtime *rt); + +/** + * AICPU-only Phase 4 — fill in the few fields the host could not know at + * prebuilt-image build time: the ops table (s_runtime_ops is a device-side + * file-local global, host cannot resolve its device address) and the + * orchestrator's core counts (depend on the executor's scheduler context). + * Call once per boot after runtime_wire_arena_pointers. + */ +void runtime_finalize_after_wire(PTO2Runtime *rt, int32_t aic_count, int32_t aiv_count); + +/** + * Destroy runtime. With the prebuilt-arena fast path the arena buffer is + * pooled across runs by DeviceRunner, so we never call arena.release() + * here — the destructor only forgets sub-structure pointers (idempotent + * cleanup). 
*/ void runtime_destroy(PTO2Runtime *rt, DeviceArena &arena); diff --git a/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_runtime2_types.h b/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_runtime2_types.h index f022b8eb4..a0dfbd9ef 100644 --- a/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_runtime2_types.h +++ b/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_runtime2_types.h @@ -92,7 +92,7 @@ // Task management // NOTE: PTO2_TASK_WINDOW_SIZE is now a per-ring default value. -// Actual window size is passed at runtime to runtime_create_from_sm(). +// Actual window size is passed at runtime to runtime_reserve_layout(). // Use pto2_task_slot(sched, task_id) for slot calculation. #define PTO2_TASK_WINDOW_SIZE 16384 // Default per-ring task window size (power of 2) diff --git a/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_shared_memory.h b/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_shared_memory.h index cf8dbb780..98b832510 100644 --- a/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_shared_memory.h +++ b/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_shared_memory.h @@ -53,11 +53,18 @@ struct PTO2SharedMemoryHandle; */ struct alignas(64) PTO2RingFlowControl { // === Cache Line 0: Written by Orchestrator, Read by Scheduler === - std::atomic current_task_index; // Task ring head (next to allocate) + alignas(64) std::atomic current_task_index; // Task ring head (next to allocate) // === Cache Line 1: Written by Scheduler, Read by Orchestrator (for back-pressure) === alignas(64) std::atomic last_task_alive; // Task ring tail (oldest active task) + // Per-boot SM reset. PTO2TaskAllocator::init() seeds its private + // local_task_id_ from initial_local_task_id (default 0 in production) + // *without* dereferencing current_task_index — it relies on this reset + // running on every AICPU boot so 0 stays in sync. 
If you ever change + // the initial fc value or the boot ordering, update the default in + // PTO2TaskAllocator::init (pto_ring_buffer.h) in the same change, or + // submit IDs will be off by the divergence. void init() { current_task_index.store(0, std::memory_order_relaxed); last_task_alive.store(0, std::memory_order_relaxed); @@ -187,3 +194,67 @@ struct PTO2SharedMemoryHandle { void setup_pointers(uint64_t task_window_size); void setup_pointers_per_ring(const uint64_t task_window_sizes[PTO2_MAX_RING_DEPTH]); }; + +// ============================================================================= +// SM Device Layout Helpers +// ============================================================================= +// +// When the host pre-builds a runtime-arena image, it needs the device-side +// addresses of several SM sub-fields (ring flow-control counters, +// task_descriptors arrays, orch_error_code) so it can wire them into the +// orchestrator / scheduler init_data path without dereferencing the SM — +// the SM lives in device memory and cannot be touched from host. +// +// These helpers compute those addresses by offset arithmetic on the SM +// device base. Pure pointer math, no loads/stores; safe to call from host. +// The same arithmetic happens on AICPU too (via PTO2SharedMemoryHandle's +// own setup_pointers), so values are guaranteed consistent across sides. 
+namespace pto2_sm_layout { + +inline std::atomic *orch_error_code_addr(void *sm_dev_base) noexcept { + return reinterpret_cast *>( + static_cast(sm_dev_base) + offsetof(PTO2SharedMemoryHeader, orch_error_code) + ); +} + +inline PTO2SharedMemoryRingHeader *ring_header_addr(void *sm_dev_base, int ring_id) noexcept { + return reinterpret_cast( + static_cast(sm_dev_base) + offsetof(PTO2SharedMemoryHeader, rings) + + static_cast(ring_id) * sizeof(PTO2SharedMemoryRingHeader) + ); +} + +inline std::atomic *ring_current_task_index_addr(void *sm_dev_base, int ring_id) noexcept { + return reinterpret_cast *>( + reinterpret_cast(ring_header_addr(sm_dev_base, ring_id)) + offsetof(PTO2SharedMemoryRingHeader, fc) + + offsetof(PTO2RingFlowControl, current_task_index) + ); +} + +inline std::atomic *ring_last_task_alive_addr(void *sm_dev_base, int ring_id) noexcept { + return reinterpret_cast *>( + reinterpret_cast(ring_header_addr(sm_dev_base, ring_id)) + offsetof(PTO2SharedMemoryRingHeader, fc) + + offsetof(PTO2RingFlowControl, last_task_alive) + ); +} + +// Walk the per-ring SM layout (same arithmetic as setup_pointers_per_ring) +// to compute ring `ring_id`'s task_descriptors device address. Accepts a +// per-ring window-size array so the helper's signature mirrors +// `PTO2SharedMemoryHandle::setup_pointers_per_ring` and cannot silently +// disagree with the SM layout when (hypothetically) ring sizes diverge. 
+inline PTO2TaskDescriptor *ring_task_descriptors_addr( + void *sm_dev_base, const uint64_t task_window_sizes[PTO2_MAX_RING_DEPTH], int ring_id +) noexcept { + assert(ring_id >= 0 && ring_id < PTO2_MAX_RING_DEPTH && "pto2_sm_layout: ring_id out of range"); + char *p = static_cast(sm_dev_base); + p += PTO2_ALIGN_UP(sizeof(PTO2SharedMemoryHeader), PTO2_ALIGN_SIZE); + for (int r = 0; r < ring_id; r++) { + p += PTO2_ALIGN_UP(task_window_sizes[r] * sizeof(PTO2TaskDescriptor), PTO2_ALIGN_SIZE); + p += PTO2_ALIGN_UP(task_window_sizes[r] * sizeof(PTO2TaskPayload), PTO2_ALIGN_SIZE); + p += PTO2_ALIGN_UP(task_window_sizes[r] * sizeof(PTO2TaskSlotState), PTO2_ALIGN_SIZE); + } + return reinterpret_cast(p); +} + +} // namespace pto2_sm_layout diff --git a/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_tensormap.h b/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_tensormap.h index 39d6e4ad2..b63f20676 100644 --- a/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_tensormap.h +++ b/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_tensormap.h @@ -47,12 +47,12 @@ #include "pto_runtime2_types.h" #include "tensor.h" -struct PTO2OrchestratorState; // forward declare - /** * Layout descriptor produced by PTO2TensorMap::reserve_layout(). Stores the * region offsets returned by DeviceArena::reserve() so init_from_layout() * can fetch the matching pointers after the arena is committed. + * + * All offsets are relative to the arena's base. 
*/ struct PTO2TensorMapLayout { size_t off_buckets; @@ -367,8 +367,6 @@ struct PTO2TensorMap { // Per-ring cleanup progress (for periodic cleanup_retired) int32_t last_cleanup[PTO2_MAX_RING_DEPTH]{}; - PTO2OrchestratorState *orch{nullptr}; - uint32_t get_task_local_id_slot(uint8_t ring_id, uint32_t task_local_id) const { return task_local_id & (task_window_sizes[ring_id] - 1); } @@ -433,11 +431,19 @@ struct PTO2TensorMap { reserve_layout_default(DeviceArena &arena, const int32_t task_window_sizes[PTO2_MAX_RING_DEPTH]); /** - * Phase 3: bind region pointers and initialize state. The arena must already - * be committed; layout must have been produced by reserve_layout() against - * the same arena. + * Phase 3a: write everything *except* arena-internal pointer fields + * (buckets, entry_pool, free_entry_list, task_entry_heads[r]). + * Uses arena.region_ptr to address the arena regions for data writes, + * but does not store those addresses in struct fields. Safe to call on + * a host arena that holds the prebuilt image. + */ + bool init_data_from_layout(const PTO2TensorMapLayout &layout, DeviceArena &arena); + + /** + * Phase 3b: write the arena-internal pointer fields. Idempotent; + * called once on the host arena and once on the AICPU after attach. */ - bool init_from_layout(const PTO2TensorMapLayout &layout, DeviceArena &arena); + void wire_arena_pointers(const PTO2TensorMapLayout &layout, DeviceArena &arena); /** * Tear down state. 
Does not free memory — the arena owns the backing diff --git a/src/a5/runtime/tensormap_and_ringbuffer/runtime/runtime.h b/src/a5/runtime/tensormap_and_ringbuffer/runtime/runtime.h index a4aef9c04..4a7dce1bd 100644 --- a/src/a5/runtime/tensormap_and_ringbuffer/runtime/runtime.h +++ b/src/a5/runtime/tensormap_and_ringbuffer/runtime/runtime.h @@ -127,18 +127,26 @@ struct HostApi { void (*device_free)(void *dev_ptr); int (*copy_to_device)(void *dev_ptr, const void *host_ptr, size_t size); int (*copy_from_device)(void *host_ptr, const void *dev_ptr, size_t size); - // Lay out and commit the per-Worker static device arena that backs both - // the PTO2 GM heap and the PTO2 shared memory in a single underlying - // allocation. Must be called once before acquire_pooled_gm_heap / - // acquire_pooled_gm_sm. Returns 0 on success, -1 on allocation failure. - int (*setup_static_arena)(size_t gm_heap_size, size_t gm_sm_size); + // Lay out three pooled regions in a single backing device allocation: + // GM heap, PTO2 shared memory, and the trb prebuilt runtime arena. + // `runtime_arena_size == 0` skips the last region (hbg path: hbg has no + // prebuilt runtime arena). Returns 0 on success, -1 on allocation + // failure. + int (*setup_static_arena)(size_t gm_heap_size, size_t gm_sm_size, size_t runtime_arena_size); // Return the per-Worker pooled pointer for the PTO2 GM heap / shared - // memory. The static arena must already be committed via - // setup_static_arena; the returned pointer is owned by the DeviceRunner - // and freed in `DeviceRunner::finalize()` — do NOT pass it to - // device_free or record it in `tensor_pairs_`. + // memory / prebuilt runtime arena. The static arena must already be + // committed via setup_static_arena; the returned pointer is owned by + // the DeviceRunner and freed in `DeviceRunner::finalize()` — do NOT + // pass it to device_free or record it in `tensor_pairs_`. 
+ // + // acquire_pooled_runtime_arena is trb-only — the host side reserves the + // runtime-arena region only when setup_static_arena is invoked with + // runtime_arena_size > 0. hbg's runtime_maker.cpp must not call it + // (setup_static_arena(...,0) leaves the offset unreserved, and the + // returned region_ptr would be undefined). void *(*acquire_pooled_gm_heap)(); void *(*acquire_pooled_gm_sm)(); + void *(*acquire_pooled_runtime_arena)(); // Single-shot upload of the entire ChipCallable buffer. `callable` is a // `const ChipCallable *` (declared void* to avoid pulling task_interface // headers into runtime.h). DeviceRunner walks child_offsets_ to compute @@ -218,6 +226,13 @@ class Runtime { void *slot_states_ptr_; // Pointer to PTO2TaskSlotState array (scheduler-private, for profiling) ChipStorageTaskArgs orch_args_storage_; // Copy of args for device + // Prebuilt-arena fast path (trb only). Set by the host before rtMemcpy'ing + // Runtime to device; AICPU reads them in the boot path to skip + // runtime_create_from_sm and reuse the pooled, prebuilt arena buffer + // (already populated by runtime_init_data_from_layout + wire on host). + void *prebuilt_arena_base_; + size_t prebuilt_runtime_offset_; + // Device orchestration SO (for dlopen on AICPU thread 3). // The SO bytes themselves live in a separately-allocated device buffer // owned by DeviceRunner; only the metadata below travels inside Runtime. @@ -254,6 +269,16 @@ class Runtime { void set_slot_states_ptr(void *p); void set_orch_args(const ChipStorageTaskArgs &args); + // Prebuilt-arena fast path (trb only). Set by host's + // bind_prepared_to_runtime_impl; consumed by AICPU at boot to attach a + // DeviceArena to `prebuilt_arena_base_` and pick up the PTO2Runtime at + // `prebuilt_arena_base_ + prebuilt_runtime_offset_`. Both stay zero on + // first construction (Runtime() ctor zeros them) so a non-prebuilt boot + // path can still detect "no prebuilt image set" via nullptr. 
+ void set_prebuilt_arena(void *arena_base, size_t runtime_off); + void *get_prebuilt_arena_base() const; + size_t get_prebuilt_runtime_offset() const; + // Device orchestration SO binary (for dlopen on AICPU thread 3) void set_dev_orch_so(uint64_t dev_addr, uint64_t size); uint64_t get_dev_orch_so_addr() const; diff --git a/src/a5/runtime/tensormap_and_ringbuffer/runtime/scheduler/pto_scheduler.cpp b/src/a5/runtime/tensormap_and_ringbuffer/runtime/scheduler/pto_scheduler.cpp index f497b8fd8..2d777e9b0 100644 --- a/src/a5/runtime/tensormap_and_ringbuffer/runtime/scheduler/pto_scheduler.cpp +++ b/src/a5/runtime/tensormap_and_ringbuffer/runtime/scheduler/pto_scheduler.cpp @@ -61,153 +61,6 @@ PTO2SchedProfilingData scheduler_get_profiling(int thread_idx) { } #endif -// ============================================================================= -// Ready Queue Implementation -// ============================================================================= - -size_t ready_queue_reserve_layout(DeviceArena &arena, uint64_t capacity) { - // Align the slots[] base to a full cache line so MPMC CAS traffic on the - // first slot cannot false-share with whatever region sits in front of us - // (e.g. orchestrator tensormap heads written by the orch thread). 
- return arena.reserve(capacity * sizeof(PTO2ReadyQueueSlot), PTO2_ALIGN_SIZE); -} - -bool ready_queue_init_from_layout(PTO2ReadyQueue *queue, DeviceArena &arena, size_t slots_off, uint64_t capacity) { - queue->slots = static_cast(arena.region_ptr(slots_off)); - queue->capacity = capacity; - queue->mask = capacity - 1; - queue->enqueue_pos.store(0, std::memory_order_relaxed); - queue->dequeue_pos.store(0, std::memory_order_relaxed); - - for (uint64_t i = 0; i < capacity; i++) { - queue->slots[i].sequence.store((int64_t)i, std::memory_order_relaxed); - queue->slots[i].slot_state = nullptr; - } - - return true; -} - -void ready_queue_destroy(PTO2ReadyQueue *queue) { - // Arena owns the slots[] buffer; just forget the pointer. - queue->slots = nullptr; -} - -// ============================================================================= -// Scheduler Initialization -// ============================================================================= - -bool PTO2SchedulerState::RingSchedState::init(PTO2SharedMemoryHeader *sm_header, int32_t ring_id) { - ring = &sm_header->rings[ring_id]; - last_task_alive = 0; - advance_lock.store(0, std::memory_order_relaxed); - - // Initialize all per-task slot state fields. - // bind_ring() sets the ring_id only — payload/task pointers are re-bound - // by orch::prepare_task on every submit (their value is per-slot constant - // but pinning them here would cost O(task_window_size) at startup). - // reset_for_reuse() sets dynamic fields to reclaim defaults (fanout_count=1, - // rest zero) so the first submit needs no reset. 
- for (uint64_t i = 0; i < ring->task_window_size; i++) { - ring->slot_states[i].bind_ring(static_cast(ring_id)); - ring->slot_states[i].reset_for_reuse(); - ring->slot_states[i].fanin_count = 0; - ring->slot_states[i].active_mask = ActiveMask{}; - } - - return true; -} - -void PTO2SchedulerState::RingSchedState::destroy() { ring = nullptr; } - -PTO2SchedulerLayout PTO2SchedulerState::reserve_layout(DeviceArena &arena, int32_t dep_pool_capacity) { - PTO2SchedulerLayout layout{}; - layout.ready_queue_capacity = PTO2_READY_QUEUE_SIZE; - layout.spsc_capacity = PTO2_WRIRING_QUEUE_SIZE; - layout.dep_pool_capacity = dep_pool_capacity; - - for (int i = 0; i < PTO2_NUM_RESOURCE_SHAPES; i++) { - layout.off_ready_queue_slots[i] = ready_queue_reserve_layout(arena, PTO2_READY_QUEUE_SIZE); - } - layout.off_dummy_ready_queue_slots = ready_queue_reserve_layout(arena, PTO2_READY_QUEUE_SIZE); - for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) { - // Force a cache-line base so writes from scheduler thread 0 (sole - // writer of this ring's dep_pool) do not invalidate adjacent - // multi-threaded regions like ready_queue.slots. - layout.off_dep_pool_entries[r] = - arena.reserve(static_cast(dep_pool_capacity) * sizeof(PTO2DepListEntry), PTO2_ALIGN_SIZE); - } - layout.off_wiring_spsc_buffer = PTO2SpscQueue::reserve_layout(arena, PTO2_WRIRING_QUEUE_SIZE); - return layout; -} - -bool PTO2SchedulerState::init_from_layout( - const PTO2SchedulerLayout &layout, DeviceArena &arena, PTO2SharedMemoryHeader *sm_header_arg -) { - PTO2SchedulerState *sched = this; - sched->sm_header = sm_header_arg; -#if PTO2_SCHED_PROFILING - sched->tasks_completed.store(0, std::memory_order_relaxed); - sched->tasks_consumed.store(0, std::memory_order_relaxed); -#endif - - // Per-ring scheduler state — no arena buffers, just field init. 
- for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) { - if (!sched->ring_sched_states[r].init(sm_header_arg, r)) { - return false; - } - } - - // Ready queues — one per resource shape plus DUMMY. - for (int i = 0; i < PTO2_NUM_RESOURCE_SHAPES; i++) { - if (!ready_queue_init_from_layout( - &sched->ready_queues[i], arena, layout.off_ready_queue_slots[i], layout.ready_queue_capacity - )) { - return false; - } - } - if (!ready_queue_init_from_layout( - &sched->dummy_ready_queue, arena, layout.off_dummy_ready_queue_slots, layout.ready_queue_capacity - )) { - return false; - } - - // Per-ring dep_pool: PTO2DepListPool::init takes an externally-allocated - // base + capacity, so we just plumb the arena region into it. - for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) { - auto *dep_entries = static_cast(arena.region_ptr(layout.off_dep_pool_entries[r])); - // calloc-equivalent: pool expects entries zeroed at construction. - memset(dep_entries, 0, static_cast(layout.dep_pool_capacity) * sizeof(PTO2DepListEntry)); - sched->ring_sched_states[r].dep_pool.init( - dep_entries, layout.dep_pool_capacity, &sm_header_arg->orch_error_code - ); - } - - // Wiring SPSC queue (orchestrator push, scheduler thread 0 pop). 
- if (!sched->wiring.queue.init_from_layout(arena, layout.off_wiring_spsc_buffer, layout.spsc_capacity)) { - return false; - } - sched->wiring.batch_count = 0; - sched->wiring.batch_index = 0; - sched->wiring.backoff_counter = 0; - - return true; -} - -void PTO2SchedulerState::destroy() { - PTO2SchedulerState *sched = this; - for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) { - sched->ring_sched_states[r].destroy(); - sched->ring_sched_states[r].dep_pool.base = nullptr; - } - - sched->wiring.queue.destroy(); - - for (int i = 0; i < PTO2_NUM_RESOURCE_SHAPES; i++) { - ready_queue_destroy(&sched->ready_queues[i]); - } - ready_queue_destroy(&sched->dummy_ready_queue); -} - // ============================================================================= // Debug Utilities // ============================================================================= diff --git a/src/a5/runtime/tensormap_and_ringbuffer/runtime/scheduler/pto_scheduler.h b/src/a5/runtime/tensormap_and_ringbuffer/runtime/scheduler/pto_scheduler.h index 32887d0be..173f65135 100644 --- a/src/a5/runtime/tensormap_and_ringbuffer/runtime/scheduler/pto_scheduler.h +++ b/src/a5/runtime/tensormap_and_ringbuffer/runtime/scheduler/pto_scheduler.h @@ -409,7 +409,14 @@ struct alignas(64) PTO2ReadyQueue { // initialize sequence counters // destroy: forget the slots pointer (arena owns the buffer) size_t ready_queue_reserve_layout(DeviceArena &arena, uint64_t capacity); -bool ready_queue_init_from_layout(PTO2ReadyQueue *queue, DeviceArena &arena, size_t slots_off, uint64_t capacity); +// Writes everything *except* the arena-internal `slots` pointer field +// (sequences/positions on the slot array, capacity, mask). Uses +// arena.region_ptr(slots_off) only to address the slot array for writes; +// does NOT store the pointer in `queue->slots`. Call +// `ready_queue_wire_arena_pointers` afterwards to set the field itself. 
+bool ready_queue_init_data_from_layout(PTO2ReadyQueue *queue, DeviceArena &arena, size_t slots_off, uint64_t capacity); +// Stores queue->slots = arena.region_ptr(slots_off). Idempotent. +void ready_queue_wire_arena_pointers(PTO2ReadyQueue *queue, DeviceArena &arena, size_t slots_off); void ready_queue_destroy(PTO2ReadyQueue *queue); // ============================================================================= @@ -449,13 +456,17 @@ struct alignas(64) PTO2SpscQueue { return arena.reserve(capacity * sizeof(PTO2TaskSlotState *), PTO2_ALIGN_SIZE); } - // Bind buffer pointer + reset indices. The capacity must be a power of two - // and match the value passed to reserve_layout. - bool init_from_layout(DeviceArena &arena, size_t buffer_off, uint64_t capacity) { + // Writes everything except the arena-internal `buffer_` pointer field + // (the zeroed slot pointer array, mask_, head_, tail_). This step stores + // no address in buffer_; wire_arena_pointers() sets it afterwards, on the + // host (host arena) and again on the AICPU (device arena). + bool init_data_from_layout(DeviceArena &arena, size_t buffer_off, uint64_t capacity) { if (capacity == 0 || (capacity & (capacity - 1)) != 0) return false; - buffer_ = static_cast(arena.region_ptr(buffer_off)); + auto *buf = static_cast(arena.region_ptr(buffer_off)); + // calloc'd-equivalent: zero the slot pointers so spurious early pops + // observe nullptr. for (uint64_t i = 0; i < capacity; i++) - buffer_[i] = nullptr; + buf[i] = nullptr; mask_ = capacity - 1; head_.store(0, std::memory_order_relaxed); tail_.store(0, std::memory_order_relaxed); @@ -464,6 +475,12 @@ struct alignas(64) PTO2SpscQueue { return true; } + // Wire the arena-internal pointer. Called by both host (with host arena) + // and AICPU (with device arena attached to the prebuilt image). + void wire_arena_pointers(DeviceArena &arena, size_t buffer_off) { + buffer_ = static_cast(arena.region_ptr(buffer_off)); + } + // Arena owns the buffer; here we only forget our pointer.
void destroy() { buffer_ = nullptr; } @@ -561,7 +578,12 @@ struct PTO2SchedulerState { // --- Cache Line 1+: Thread 0 only (wiring dep_pool) --- alignas(64) PTO2DepListPool dep_pool; - bool init(PTO2SharedMemoryHeader *sm_header, int32_t ring_id); + // Initialize arena-internal data + arena-external pointers; does NOT + // store dep_pool.base (that lives in the runtime arena and is wired + // by SchedulerState::wire_arena_pointers). The `ring` field stores + // the device address of the SM ring header — computed via offset + // arithmetic, no SM dereference. + bool init_data_from_layout(void *sm_dev_base, int32_t ring_id); void destroy(); void sync_to_sm() { ring->fc.last_task_alive.store(last_task_alive, std::memory_order_release); } @@ -1040,10 +1062,23 @@ struct PTO2SchedulerState { // Phase 1: declare every sub-region (ready_queue slots, dummy queue slots, // per-ring dep_pool entries, wiring SPSC buffer) on the supplied arena. + // Capacities are baked into the returned layout; init_data_from_layout uses + // the same values. static PTO2SchedulerLayout reserve_layout(DeviceArena &arena, int32_t dep_pool_capacity = PTO2_DEP_LIST_POOL_SIZE); - // Phase 3: bind region pointers and initialize state. - bool init_from_layout(const PTO2SchedulerLayout &layout, DeviceArena &arena, PTO2SharedMemoryHeader *sm_header); + // Phase 3a: write everything *except* arena-internal pointer fields. + // `sm_dev_base` is the device address of the SM (only stored, never + // dereferenced here). Safe to call on a host arena that holds the + // prebuilt image buffer. (The orchestrator counterpart takes + // task_window_size for ring task_descriptors address arithmetic; the + // scheduler only needs the SM header / ring header base addresses, + // both window-size-independent.) 
+ bool init_data_from_layout(const PTO2SchedulerLayout &layout, DeviceArena &arena, void *sm_dev_base); + + // Phase 3b: write the arena-internal pointer fields + // (ready_queues[].slots, dummy_ready_queue.slots, dep_pool.base for each + // ring, wiring.queue.buffer_). Called on both host and device sides. + void wire_arena_pointers(const PTO2SchedulerLayout &layout, DeviceArena &arena); // Forget per-region pointers; arena owns the backing memory. void destroy(); diff --git a/src/a5/runtime/tensormap_and_ringbuffer/runtime/shared/pto_runtime2_init.cpp b/src/a5/runtime/tensormap_and_ringbuffer/runtime/shared/pto_runtime2_init.cpp new file mode 100644 index 000000000..d66acfcc4 --- /dev/null +++ b/src/a5/runtime/tensormap_and_ringbuffer/runtime/shared/pto_runtime2_init.cpp @@ -0,0 +1,355 @@ +/* + * Copyright (c) PyPTO Contributors. + * This program is free software, you can redistribute it and/or modify it under the terms and conditions of + * CANN Open Software License Agreement Version 2.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + * ----------------------------------------------------------------------------------------------------------- + */ +/** + * Host/AICPU shared runtime-arena layout, init_data and wire implementations. + * + * Lives under runtime/shared/ so it is included in both the host_runtime.so + * build (host pre-populates the prebuilt arena image) and the aicpu_runtime + * build (AICPU runs wire_arena_pointers + destroy after attach). 
The + * device-only parts of pto_runtime2.cpp / pto_orchestrator.cpp / pto_scheduler.cpp + * (ops table, scope/submit/dispatch business logic, profiling) stay in their + * original files and the aicpu build only. + */ + +#include +#include + +#include "pto_orchestrator.h" +#include "pto_runtime2.h" +#include "pto_ring_buffer.h" +#include "pto_shared_memory.h" +#include "pto_tensormap.h" +#include "scheduler/pto_scheduler.h" + +// ============================================================================= +// Ready queue +// ============================================================================= + +size_t ready_queue_reserve_layout(DeviceArena &arena, uint64_t capacity) { + // Align the slots[] base to a full cache line so MPMC CAS traffic on the + // first slot cannot false-share with whatever region sits in front of us + // (e.g. orchestrator tensormap heads written by the orch thread). + return arena.reserve(capacity * sizeof(PTO2ReadyQueueSlot), PTO2_ALIGN_SIZE); +} + +bool ready_queue_init_data_from_layout(PTO2ReadyQueue *queue, DeviceArena &arena, size_t slots_off, uint64_t capacity) { + // Address the slots region for data writes without storing the pointer in + // queue->slots — that field is set by ready_queue_wire_arena_pointers. + auto *slots_arena = static_cast(arena.region_ptr(slots_off)); + queue->capacity = capacity; + queue->mask = capacity - 1; + queue->enqueue_pos.store(0, std::memory_order_relaxed); + queue->dequeue_pos.store(0, std::memory_order_relaxed); + + for (uint64_t i = 0; i < capacity; i++) { + slots_arena[i].sequence.store((int64_t)i, std::memory_order_relaxed); + slots_arena[i].slot_state = nullptr; + } + + return true; +} + +void ready_queue_wire_arena_pointers(PTO2ReadyQueue *queue, DeviceArena &arena, size_t slots_off) { + queue->slots = static_cast(arena.region_ptr(slots_off)); +} + +void ready_queue_destroy(PTO2ReadyQueue *queue) { + // Arena owns the slots[] buffer; just forget the pointer. 
+ queue->slots = nullptr; +} + +// ============================================================================= +// Scheduler +// ============================================================================= + +bool PTO2SchedulerState::RingSchedState::init_data_from_layout(void *sm_dev_base, int32_t ring_id) { + // ring stores the device address of the SM ring header — pure offset + // arithmetic, no SM load. + ring = pto2_sm_layout::ring_header_addr(sm_dev_base, ring_id); + last_task_alive = 0; + advance_lock.store(0, std::memory_order_relaxed); + + // Per-slot SM-side initialization (bind_ring + reset_for_reuse + + // fanin_count/active_mask zero) lives in PTO2SharedMemoryHandle:: + // init_header_per_ring so the AICPU performs it during SM reset; host + // prebuilt-arena init skips SM access here. + + return true; +} + +void PTO2SchedulerState::RingSchedState::destroy() { ring = nullptr; } + +PTO2SchedulerLayout PTO2SchedulerState::reserve_layout(DeviceArena &arena, int32_t dep_pool_capacity) { + PTO2SchedulerLayout layout{}; + layout.ready_queue_capacity = PTO2_READY_QUEUE_SIZE; + layout.spsc_capacity = PTO2_WRIRING_QUEUE_SIZE; + layout.dep_pool_capacity = dep_pool_capacity; + + for (int i = 0; i < PTO2_NUM_RESOURCE_SHAPES; i++) { + layout.off_ready_queue_slots[i] = ready_queue_reserve_layout(arena, PTO2_READY_QUEUE_SIZE); + } + layout.off_dummy_ready_queue_slots = ready_queue_reserve_layout(arena, PTO2_READY_QUEUE_SIZE); + for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) { + // Force a cache-line base so writes from scheduler thread 0 (sole + // writer of this ring's dep_pool) do not invalidate adjacent + // multi-threaded regions like ready_queue.slots. 
+ layout.off_dep_pool_entries[r] = + arena.reserve(static_cast(dep_pool_capacity) * sizeof(PTO2DepListEntry), PTO2_ALIGN_SIZE); + } + layout.off_wiring_spsc_buffer = PTO2SpscQueue::reserve_layout(arena, PTO2_WRIRING_QUEUE_SIZE); + return layout; +} + +bool PTO2SchedulerState::init_data_from_layout( + const PTO2SchedulerLayout &layout, DeviceArena &arena, void *sm_dev_base +) { + PTO2SchedulerState *sched = this; + sched->sm_header = reinterpret_cast(sm_dev_base); +#if PTO2_SCHED_PROFILING + sched->tasks_completed.store(0, std::memory_order_relaxed); + sched->tasks_consumed.store(0, std::memory_order_relaxed); +#endif + + for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) { + if (!sched->ring_sched_states[r].init_data_from_layout(sm_dev_base, r)) { + return false; + } + } + + for (int i = 0; i < PTO2_NUM_RESOURCE_SHAPES; i++) { + if (!ready_queue_init_data_from_layout( + &sched->ready_queues[i], arena, layout.off_ready_queue_slots[i], layout.ready_queue_capacity + )) { + return false; + } + } + if (!ready_queue_init_data_from_layout( + &sched->dummy_ready_queue, arena, layout.off_dummy_ready_queue_slots, layout.ready_queue_capacity + )) { + return false; + } + + auto *orch_err = pto2_sm_layout::orch_error_code_addr(sm_dev_base); + for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) { + auto *dep_entries = static_cast(arena.region_ptr(layout.off_dep_pool_entries[r])); + memset(dep_entries, 0, static_cast(layout.dep_pool_capacity) * sizeof(PTO2DepListEntry)); + sched->ring_sched_states[r].dep_pool.init(dep_entries, layout.dep_pool_capacity, orch_err); + } + + if (!sched->wiring.queue.init_data_from_layout(arena, layout.off_wiring_spsc_buffer, layout.spsc_capacity)) { + return false; + } + sched->wiring.batch_count = 0; + sched->wiring.batch_index = 0; + sched->wiring.backoff_counter = 0; + + return true; +} + +void PTO2SchedulerState::wire_arena_pointers(const PTO2SchedulerLayout &layout, DeviceArena &arena) { + PTO2SchedulerState *sched = this; + for (int i = 0; i < 
PTO2_NUM_RESOURCE_SHAPES; i++) { + ready_queue_wire_arena_pointers(&sched->ready_queues[i], arena, layout.off_ready_queue_slots[i]); + } + ready_queue_wire_arena_pointers(&sched->dummy_ready_queue, arena, layout.off_dummy_ready_queue_slots); + for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) { + sched->ring_sched_states[r].dep_pool.base = + static_cast(arena.region_ptr(layout.off_dep_pool_entries[r])); + } + sched->wiring.queue.wire_arena_pointers(arena, layout.off_wiring_spsc_buffer); +} + +void PTO2SchedulerState::destroy() { + PTO2SchedulerState *sched = this; + for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) { + sched->ring_sched_states[r].destroy(); + sched->ring_sched_states[r].dep_pool.base = nullptr; + } + sched->wiring.queue.destroy(); + for (int i = 0; i < PTO2_NUM_RESOURCE_SHAPES; i++) { + ready_queue_destroy(&sched->ready_queues[i]); + } + ready_queue_destroy(&sched->dummy_ready_queue); +} + +// ============================================================================= +// Orchestrator +// ============================================================================= + +PTO2OrchestratorLayout PTO2OrchestratorState::reserve_layout( + DeviceArena &arena, const int32_t task_window_sizes[PTO2_MAX_RING_DEPTH], int32_t dep_pool_capacity +) { + PTO2OrchestratorLayout layout{}; + layout.dep_pool_capacity = dep_pool_capacity; + layout.scope_tasks_cap = PTO2_SCOPE_TASKS_CAP; + layout.scope_stack_capacity = PTO2_MAX_SCOPE_DEPTH; + + for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) { + const size_t fanin_pool_bytes = + PTO2_ALIGN_UP(static_cast(dep_pool_capacity) * sizeof(PTO2FaninSpillEntry), PTO2_ALIGN_SIZE); + layout.off_fanin_pool[r] = arena.reserve(fanin_pool_bytes, PTO2_ALIGN_SIZE); + } + layout.off_scope_tasks = arena.reserve( + static_cast(layout.scope_tasks_cap) * sizeof(PTO2TaskSlotState *), alignof(PTO2TaskSlotState *) + ); + layout.off_scope_begins = + arena.reserve(static_cast(layout.scope_stack_capacity) * sizeof(int32_t), alignof(int32_t)); + 
layout.tensor_map = PTO2TensorMap::reserve_layout_default(arena, task_window_sizes); + return layout; +} + +bool PTO2OrchestratorState::init_data_from_layout( + const PTO2OrchestratorLayout &layout, DeviceArena &arena, void *sm_dev_base, void *gm_heap, uint64_t heap_size, + uint64_t task_window_size +) { + auto *orch = this; + *orch = PTO2OrchestratorState{}; + + orch->sm_header = reinterpret_cast(sm_dev_base); + orch->gm_heap_base = gm_heap; + orch->gm_heap_size = heap_size * PTO2_MAX_RING_DEPTH; + orch->fatal = false; + + // Mirror the SM API's per-ring window-size shape so a future per-ring + // SM layout cannot silently disagree with the addresses we compute here. + uint64_t task_window_sizes[PTO2_MAX_RING_DEPTH]; + for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) + task_window_sizes[r] = task_window_size; + + auto *orch_err = pto2_sm_layout::orch_error_code_addr(sm_dev_base); + for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) { + void *ring_heap_base = reinterpret_cast(gm_heap) + r * heap_size; + auto *task_descs_dev = pto2_sm_layout::ring_task_descriptors_addr(sm_dev_base, task_window_sizes, r); + auto *cur_idx_dev = pto2_sm_layout::ring_current_task_index_addr(sm_dev_base, r); + auto *last_alive_dev = pto2_sm_layout::ring_last_task_alive_addr(sm_dev_base, r); + + orch->rings[r].task_allocator.init( + task_descs_dev, static_cast(task_window_size), cur_idx_dev, last_alive_dev, ring_heap_base, + heap_size, orch_err + ); + + const size_t fanin_pool_bytes = + PTO2_ALIGN_UP(static_cast(layout.dep_pool_capacity) * sizeof(PTO2FaninSpillEntry), PTO2_ALIGN_SIZE); + auto *fanin_entries = static_cast(arena.region_ptr(layout.off_fanin_pool[r])); + memset(fanin_entries, 0, fanin_pool_bytes); + orch->rings[r].fanin_pool.init(fanin_entries, layout.dep_pool_capacity, orch_err); + } + + if (!orch->tensor_map.init_data_from_layout(layout.tensor_map, arena)) { + return false; + } + + orch->scope_tasks_size = 0; + orch->scope_tasks_capacity = layout.scope_tasks_cap; + 
orch->scope_stack_top = -1; + orch->scope_stack_capacity = layout.scope_stack_capacity; + orch->manual_begin_depth = PTO2_MAX_SCOPE_DEPTH; + + return true; +} + +void PTO2OrchestratorState::wire_arena_pointers( + const PTO2OrchestratorLayout &layout, DeviceArena &arena, PTO2SchedulerState *scheduler_arg +) { + auto *orch = this; + for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) { + orch->rings[r].fanin_pool.base = static_cast(arena.region_ptr(layout.off_fanin_pool[r])); + } + orch->tensor_map.wire_arena_pointers(layout.tensor_map, arena); + orch->scope_tasks = static_cast(arena.region_ptr(layout.off_scope_tasks)); + orch->scope_begins = static_cast(arena.region_ptr(layout.off_scope_begins)); + orch->scheduler = scheduler_arg; +} + +void PTO2OrchestratorState::destroy() { + auto *orch = this; + orch->tensor_map.destroy(); + for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) { + orch->rings[r].fanin_pool.base = nullptr; + } + orch->scope_tasks = nullptr; + orch->scope_begins = nullptr; +} + +void PTO2OrchestratorState::set_scheduler(PTO2SchedulerState *scheduler) { this->scheduler = scheduler; } + +// ============================================================================= +// Top-level runtime arena +// ============================================================================= + +PTO2RuntimeArenaLayout +runtime_reserve_layout(DeviceArena &arena, uint64_t task_window_size, int32_t dep_pool_capacity) { + PTO2RuntimeArenaLayout layout{}; + layout.task_window_size = task_window_size; + layout.dep_pool_capacity = dep_pool_capacity; + + int32_t task_window_sizes[PTO2_MAX_RING_DEPTH]; + for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) { + task_window_sizes[r] = static_cast(task_window_size); + } + + layout.off_sm_handle = arena.reserve(sizeof(PTO2SharedMemoryHandle), alignof(PTO2SharedMemoryHandle)); + layout.orch = PTO2OrchestratorState::reserve_layout(arena, task_window_sizes, dep_pool_capacity); + layout.sched = PTO2SchedulerState::reserve_layout(arena, 
dep_pool_capacity); + layout.off_runtime = arena.reserve(sizeof(PTO2Runtime), PTO2_ALIGN_SIZE); + layout.off_mailbox = arena.reserve(sizeof(AICoreCompletionMailbox), alignof(AICoreCompletionMailbox)); + + layout.arena_size = arena.total_size(); + return layout; +} + +PTO2Runtime *runtime_init_data_from_layout( + DeviceArena &arena, const PTO2RuntimeArenaLayout &layout, PTO2RuntimeMode mode, void *sm_dev_base, + uint64_t /*sm_size*/, void *gm_heap_dev_base, uint64_t heap_size +) { + PTO2Runtime *rt = static_cast(arena.region_ptr(layout.off_runtime)); + memset(rt, 0, sizeof(*rt)); + + auto *sm_wrap = static_cast(arena.region_ptr(layout.off_sm_handle)); + memset(sm_wrap, 0, sizeof(*sm_wrap)); + + // rt->ops is filled by the AICPU at boot. + rt->mode = mode; + rt->gm_heap = gm_heap_dev_base; + rt->gm_heap_size = heap_size > 0 ? heap_size * PTO2_MAX_RING_DEPTH : 0; + rt->gm_heap_owned = false; + rt->total_cycles = 0; + + if (!rt->orchestrator.init_data_from_layout( + layout.orch, arena, sm_dev_base, gm_heap_dev_base, heap_size, layout.task_window_size + )) { + return nullptr; + } + if (!rt->scheduler.init_data_from_layout(layout.sched, arena, sm_dev_base)) { + return nullptr; + } + + auto *mailbox = static_cast(arena.region_ptr(layout.off_mailbox)); + memset(mailbox, 0, sizeof(*mailbox)); + + return rt; +} + +void runtime_wire_arena_pointers(DeviceArena &arena, const PTO2RuntimeArenaLayout &layout, PTO2Runtime *rt) { + rt->sm_handle = static_cast(arena.region_ptr(layout.off_sm_handle)); + rt->aicore_mailbox = static_cast(arena.region_ptr(layout.off_mailbox)); + rt->orchestrator.wire_arena_pointers(layout.orch, arena, &rt->scheduler); + rt->scheduler.wire_arena_pointers(layout.sched, arena); +} + +void runtime_destroy(PTO2Runtime *rt, DeviceArena & /*arena*/) { + // Arena buffer is pooled across runs by DeviceRunner — never freed here. 
+ if (!rt) return; + rt->scheduler.destroy(); + rt->orchestrator.destroy(); + rt->aicore_mailbox = nullptr; + rt->sm_handle = nullptr; +} diff --git a/src/a5/runtime/tensormap_and_ringbuffer/runtime/shared/pto_shared_memory.cpp b/src/a5/runtime/tensormap_and_ringbuffer/runtime/shared/pto_shared_memory.cpp index 358c87f57..1e1edff92 100644 --- a/src/a5/runtime/tensormap_and_ringbuffer/runtime/shared/pto_shared_memory.cpp +++ b/src/a5/runtime/tensormap_and_ringbuffer/runtime/shared/pto_shared_memory.cpp @@ -167,6 +167,23 @@ void PTO2SharedMemoryHandle::init_header_per_ring( header->sched_error_bitmap.store(0, std::memory_order_relaxed); header->sched_error_code.store(PTO2_ERROR_NONE, std::memory_order_relaxed); header->sched_error_thread.store(-1, std::memory_order_relaxed); + + // Per-ring slot_states reset. Previously lived in + // PTO2SchedulerState::RingSchedState::init(), but it writes into + // ring->slot_states[] which is SM-side storage — keeping it here lets + // host-side prebuilt-arena init skip all SM dereferences. + // bind_ring() pins the ring_id (slot-invariant after this point); + // reset_for_reuse() prepares dynamic fanout/refcount fields so the first + // submit doesn't need an explicit reset. 
+ for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) { + auto &ring = header->rings[r]; + for (uint64_t i = 0; i < task_window_sizes[r]; i++) { + ring.slot_states[i].bind_ring(static_cast(r)); + ring.slot_states[i].reset_for_reuse(); + ring.slot_states[i].fanin_count = 0; + ring.slot_states[i].active_mask = ActiveMask{}; + } + } } // ============================================================================= diff --git a/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_tensormap.cpp b/src/a5/runtime/tensormap_and_ringbuffer/runtime/shared/pto_tensormap.cpp similarity index 82% rename from src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_tensormap.cpp rename to src/a5/runtime/tensormap_and_ringbuffer/runtime/shared/pto_tensormap.cpp index c09e6f4f6..b99c67233 100644 --- a/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_tensormap.cpp +++ b/src/a5/runtime/tensormap_and_ringbuffer/runtime/shared/pto_tensormap.cpp @@ -30,7 +30,6 @@ #include "common.h" #include "common/unified_log.h" -#include "pto_orchestrator.h" // ============================================================================= // TensorMap Lookup Chain Length Statistics (compile-time toggle) @@ -82,37 +81,45 @@ PTO2TensorMap::reserve_layout_default(DeviceArena &arena, const int32_t new_task return reserve_layout(arena, PTO2_TENSORMAP_NUM_BUCKETS, PTO2_TENSORMAP_POOL_SIZE, new_task_window_sizes); } -bool PTO2TensorMap::init_from_layout(const PTO2TensorMapLayout &layout, DeviceArena &arena) { +bool PTO2TensorMap::init_data_from_layout(const PTO2TensorMapLayout &layout, DeviceArena &arena) { num_buckets = layout.num_buckets; pool_size = layout.pool_size; - buckets = static_cast(arena.region_ptr(layout.off_buckets)); - entry_pool = static_cast(arena.region_ptr(layout.off_entry_pool)); - free_entry_list = static_cast(arena.region_ptr(layout.off_free_entry_list)); + // Address arena regions for data writes; do not store these in struct + // fields (wire_arena_pointers does that). 
+ auto *buckets_arena = static_cast(arena.region_ptr(layout.off_buckets)); + auto *entry_pool_arena = static_cast(arena.region_ptr(layout.off_entry_pool)); + auto *free_list_arena = static_cast(arena.region_ptr(layout.off_free_entry_list)); + // buckets[]: empty == nullptr. for (int32_t i = 0; i < num_buckets; i++) { - buckets[i] = nullptr; + buckets_arena[i] = nullptr; } - memset(entry_pool, 0, static_cast(pool_size) * sizeof(PTO2TensorMapEntry)); + // entry_pool: zero-init equivalent to the previous calloc(entry_pool, ...). + // The pool's persistent invariant after init is "bucket_index == -1 means + // not linked", set explicitly below. + memset(entry_pool_arena, 0, static_cast(pool_size) * sizeof(PTO2TensorMapEntry)); for (int32_t i = 0; i < pool_size; i++) { - entry_pool[i].bucket_index = -1; - entry_pool[i].next_in_bucket = nullptr; - entry_pool[i].prev_in_bucket = nullptr; - entry_pool[i].next_in_task = nullptr; - entry_pool[i].prev_in_task = nullptr; - entry_pool[i].producer_task_id = PTO2TaskId{}; + entry_pool_arena[i].bucket_index = -1; + entry_pool_arena[i].next_in_bucket = nullptr; + entry_pool_arena[i].prev_in_bucket = nullptr; + entry_pool_arena[i].next_in_task = nullptr; + entry_pool_arena[i].prev_in_task = nullptr; + entry_pool_arena[i].producer_task_id = PTO2TaskId{}; } - memset(free_entry_list, 0, static_cast(pool_size) * sizeof(PTO2TensorMapEntry *)); + // free_entry_list: zeroed (was calloc'd before); contents become meaningful + // only after entries are freed back, so the body of the array stays as 0. 
+ memset(free_list_arena, 0, static_cast(pool_size) * sizeof(PTO2TensorMapEntry *)); next_entry_idx = 0; free_num = 0; for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) { - task_entry_heads[r] = static_cast(arena.region_ptr(layout.off_task_entry_heads[r])); + auto *heads_arena = static_cast(arena.region_ptr(layout.off_task_entry_heads[r])); for (int32_t i = 0; i < layout.task_window_sizes[r]; i++) { - task_entry_heads[r][i] = nullptr; + heads_arena[i] = nullptr; } task_window_sizes[r] = layout.task_window_sizes[r]; last_task_alives[r] = 0; @@ -122,6 +129,15 @@ bool PTO2TensorMap::init_from_layout(const PTO2TensorMapLayout &layout, DeviceAr return true; } +void PTO2TensorMap::wire_arena_pointers(const PTO2TensorMapLayout &layout, DeviceArena &arena) { + buckets = static_cast(arena.region_ptr(layout.off_buckets)); + entry_pool = static_cast(arena.region_ptr(layout.off_entry_pool)); + free_entry_list = static_cast(arena.region_ptr(layout.off_free_entry_list)); + for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) { + task_entry_heads[r] = static_cast(arena.region_ptr(layout.off_task_entry_heads[r])); + } +} + void PTO2TensorMap::destroy() { // Arena owns the backing memory; here we only forget our pointers so any // stray post-destroy access trips a nullptr dereference instead of reading diff --git a/src/a5/runtime/tensormap_and_ringbuffer/runtime/shared/runtime.cpp b/src/a5/runtime/tensormap_and_ringbuffer/runtime/shared/runtime.cpp index 7daa54ed5..0ebb2ef79 100644 --- a/src/a5/runtime/tensormap_and_ringbuffer/runtime/shared/runtime.cpp +++ b/src/a5/runtime/tensormap_and_ringbuffer/runtime/shared/runtime.cpp @@ -46,6 +46,8 @@ Runtime::Runtime() { gm_heap_ptr_ = nullptr; slot_states_ptr_ = nullptr; orch_args_storage_.clear(); + prebuilt_arena_base_ = nullptr; + prebuilt_runtime_offset_ = 0; // Initialize device orchestration SO binary dev_orch_so_addr_ = 0; @@ -76,6 +78,13 @@ void Runtime::set_gm_heap(void *p) { gm_heap_ptr_ = p; } void Runtime::set_slot_states_ptr(void *p) { 
slot_states_ptr_ = p; } void Runtime::set_orch_args(const ChipStorageTaskArgs &args) { orch_args_storage_ = args; } +void Runtime::set_prebuilt_arena(void *arena_base, size_t runtime_off) { + prebuilt_arena_base_ = arena_base; + prebuilt_runtime_offset_ = runtime_off; +} +void *Runtime::get_prebuilt_arena_base() const { return prebuilt_arena_base_; } +size_t Runtime::get_prebuilt_runtime_offset() const { return prebuilt_runtime_offset_; } + // Device orchestration SO metadata (bytes live in a separate device buffer // owned by DeviceRunner; only the address/size travels in Runtime). void Runtime::set_dev_orch_so(uint64_t dev_addr, uint64_t size) { diff --git a/tests/ut/cpp/a5/test_scheduler_state.cpp b/tests/ut/cpp/a5/test_scheduler_state.cpp index 952aad55a..75476dedf 100644 --- a/tests/ut/cpp/a5/test_scheduler_state.cpp +++ b/tests/ut/cpp/a5/test_scheduler_state.cpp @@ -34,7 +34,8 @@ class SchedulerStateTest : public ::testing::Test { ASSERT_NE(sm_handle, nullptr); auto layout = PTO2SchedulerState::reserve_layout(sched_arena); ASSERT_NE(sched_arena.commit(), nullptr); - ASSERT_TRUE(sched.init_from_layout(layout, sched_arena, sm_handle->header)); + ASSERT_TRUE(sched.init_data_from_layout(layout, sched_arena, sm_handle->header)); + sched.wire_arena_pointers(layout, sched_arena); } void TearDown() override { diff --git a/tests/ut/cpp/a5/test_task_allocator.cpp b/tests/ut/cpp/a5/test_task_allocator.cpp index 383003900..512e241a2 100644 --- a/tests/ut/cpp/a5/test_task_allocator.cpp +++ b/tests/ut/cpp/a5/test_task_allocator.cpp @@ -388,7 +388,10 @@ TEST_F(TaskAllocatorTest, TaskWindowSaturates) { TEST_F(TaskAllocatorTest, TaskIdNearInt32Max) { current_index.store(INT32_MAX - 2); last_alive.store(INT32_MAX - 2); - allocator.init(descriptors.data(), WINDOW_SIZE, &current_index, &last_alive, heap_buf, HEAP_SIZE, &error_code); + allocator.init( + descriptors.data(), WINDOW_SIZE, &current_index, &last_alive, heap_buf, HEAP_SIZE, &error_code, + /*initial_local_task_id=*/INT32_MAX - 2 +
); auto r1 = allocator.alloc(0); ASSERT_FALSE(r1.failed()); diff --git a/tests/ut/cpp/a5/test_task_state.cpp b/tests/ut/cpp/a5/test_task_state.cpp index 729b74999..ffced6f9a 100644 --- a/tests/ut/cpp/a5/test_task_state.cpp +++ b/tests/ut/cpp/a5/test_task_state.cpp @@ -43,7 +43,8 @@ class TaskStateTest : public ::testing::Test { ASSERT_NE(sm_handle, nullptr); auto layout = PTO2SchedulerState::reserve_layout(sched_arena); ASSERT_NE(sched_arena.commit(), nullptr); - ASSERT_TRUE(sched.init_from_layout(layout, sched_arena, sm_handle->header)); + ASSERT_TRUE(sched.init_data_from_layout(layout, sched_arena, sm_handle->header)); + sched.wire_arena_pointers(layout, sched_arena); } void TearDown() override { diff --git a/tests/ut/cpp/a5/test_wiring.cpp b/tests/ut/cpp/a5/test_wiring.cpp index b01052a85..1e8fee9c5 100644 --- a/tests/ut/cpp/a5/test_wiring.cpp +++ b/tests/ut/cpp/a5/test_wiring.cpp @@ -48,7 +48,8 @@ class WiringTest : public ::testing::Test { ASSERT_NE(sm_handle, nullptr); auto layout = PTO2SchedulerState::reserve_layout(sched_arena); ASSERT_NE(sched_arena.commit(), nullptr); - ASSERT_TRUE(sched.init_from_layout(layout, sched_arena, sm_handle->header)); + ASSERT_TRUE(sched.init_data_from_layout(layout, sched_arena, sm_handle->header)); + sched.wire_arena_pointers(layout, sched_arena); } void TearDown() override { From 008658ea6d7d31916d88365b1ede5a3df5228a7c Mon Sep 17 00:00:00 2001 From: poursoul Date: Wed, 27 May 2026 12:28:05 +0800 Subject: [PATCH 5/7] Refactor: split per-Worker static arena into three independent allocations MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit DeviceRunner's GM heap / PTO2 SM / trb prebuilt runtime arena used to live in a single backing device buffer (one rtMalloc per worker, three regions sub-divided via DeviceArena::reserve). 
The combined size can exceed the device allocator's largest contiguous block on real hardware, so split into three independent DeviceArena instances — each commits exactly one region (one device_malloc), and acquire_pooled_* returns its base(). Touches all four DeviceRunner implementations (a2a3/a5 × onboard/sim). The setup_static_arena and acquire_pooled_* signatures are unchanged; the host_api / runtime_maker callers are unaffected. hbg keeps passing runtime_arena_size = 0, which leaves runtime_arena_pool_ uncommitted and acquire_pooled_runtime_arena returning nullptr. Tests - cpput: 25/25 pass. - a5sim: L2 trb + host_build_graph full suite pass. - a2a3sim: L2 trb + host_build_graph full suite pass. --- .../platform/onboard/host/device_runner.cpp | 103 ++++++++++-------- .../platform/onboard/host/device_runner.h | 31 ++++-- src/a2a3/platform/sim/host/device_runner.cpp | 94 ++++++++-------- src/a2a3/platform/sim/host/device_runner.h | 27 +++-- .../platform/onboard/host/device_runner.cpp | 97 +++++++++-------- src/a5/platform/onboard/host/device_runner.h | 27 +++-- src/a5/platform/sim/host/device_runner.cpp | 86 ++++++++------- src/a5/platform/sim/host/device_runner.h | 27 +++-- 8 files changed, 277 insertions(+), 215 deletions(-) diff --git a/src/a2a3/platform/onboard/host/device_runner.cpp b/src/a2a3/platform/onboard/host/device_runner.cpp index 8d2d9916b..9b66e05ae 100644 --- a/src/a2a3/platform/onboard/host/device_runner.cpp +++ b/src/a2a3/platform/onboard/host/device_runner.cpp @@ -250,60 +250,74 @@ int AicpuSoInfo::finalize() { DeviceRunner::~DeviceRunner() { finalize(); } int DeviceRunner::setup_static_arena(size_t gm_heap_size, size_t gm_sm_size, size_t runtime_arena_size) { - if (static_arena_.is_committed()) { - // Idempotent for the production case (sizes do not change across a - // worker's lifetime). If a caller asks for a larger layout, redo it. 
- if (gm_heap_size <= cached_gm_heap_size_ && gm_sm_size <= cached_gm_sm_size_ && - runtime_arena_size <= cached_runtime_arena_size_) { + // Three independent device_malloc'd buffers: GM heap, PTO2 SM, prebuilt + // runtime arena. Split out from a single large allocation because the + // combined size can exceed the device allocator's largest contiguous + // block. Each arena commits exactly one region, so its base() is the + // pooled pointer the caller wants. + // + // Idempotent for the production case (sizes do not change across a + // worker's lifetime). If a caller asks for a larger layout on any + // region, redo just that region — already-committed peers stay alive + // so their callers don't have to re-acquire. + auto commit_region = [](DeviceArena &arena, size_t &cached_size, size_t requested_size) -> int { + if (requested_size == 0) { + // hbg's runtime_arena path: caller passed 0 and never reserved + // a region. Leave the arena uncommitted; acquire_pooled_* will + // return nullptr. + if (arena.is_committed() && cached_size != 0) { + arena.release(); + cached_size = 0; + } + return 0; + } + if (arena.is_committed() && requested_size <= cached_size) { return 0; } - static_arena_.release(); - gm_heap_region_off_ = SIZE_MAX; - gm_sm_region_off_ = SIZE_MAX; - runtime_arena_region_off_ = SIZE_MAX; + arena.release(); + cached_size = 0; + arena.reserve(requested_size, DeviceArena::kDefaultBaseAlign); + if (arena.commit(DeviceArena::kDefaultBaseAlign) == nullptr) { + // commit() failure leaves committed_=false, so the next entry's + // is_committed() guard skips the release branch. release() is + // idempotent on a never-committed arena (zeroes cursor_). 
+ arena.release(); + return -1; + } + cached_size = requested_size; + return 0; + }; + if (commit_region(gm_heap_arena_, cached_gm_heap_size_, gm_heap_size) != 0) return -1; + if (commit_region(gm_sm_arena_, cached_gm_sm_size_, gm_sm_size) != 0) { + gm_heap_arena_.release(); + cached_gm_heap_size_ = 0; + return -1; + } + if (commit_region(runtime_arena_pool_, cached_runtime_arena_size_, runtime_arena_size) != 0) { + gm_heap_arena_.release(); + gm_sm_arena_.release(); cached_gm_heap_size_ = 0; cached_gm_sm_size_ = 0; - cached_runtime_arena_size_ = 0; - } - gm_heap_region_off_ = static_arena_.reserve(gm_heap_size, DeviceArena::kDefaultBaseAlign); - gm_sm_region_off_ = static_arena_.reserve(gm_sm_size, DeviceArena::kDefaultBaseAlign); - if (runtime_arena_size > 0) { - runtime_arena_region_off_ = static_arena_.reserve(runtime_arena_size, DeviceArena::kDefaultBaseAlign); - } - if (static_arena_.commit(DeviceArena::kDefaultBaseAlign) == nullptr) { - // Roll back the reserves: commit() failure leaves committed_=false, - // so the next entry would skip the release branch and stack new - // reserves on top of the stale cursor. release() is idempotent on a - // never-committed arena (just zeroes cursor_ / region_count_). 
- static_arena_.release(); - gm_heap_region_off_ = SIZE_MAX; - gm_sm_region_off_ = SIZE_MAX; - runtime_arena_region_off_ = SIZE_MAX; return -1; } - cached_gm_heap_size_ = gm_heap_size; - cached_gm_sm_size_ = gm_sm_size; - cached_runtime_arena_size_ = runtime_arena_size; return 0; } void *DeviceRunner::acquire_pooled_gm_heap() { - if (!static_arena_.is_committed()) return nullptr; - return static_arena_.region_ptr(gm_heap_region_off_); + if (!gm_heap_arena_.is_committed()) return nullptr; + return gm_heap_arena_.base(); } void *DeviceRunner::acquire_pooled_gm_sm() { - if (!static_arena_.is_committed()) return nullptr; - return static_arena_.region_ptr(gm_sm_region_off_); + if (!gm_sm_arena_.is_committed()) return nullptr; + return gm_sm_arena_.base(); } void *DeviceRunner::acquire_pooled_runtime_arena() { - if (!static_arena_.is_committed()) return nullptr; - // hbg calls setup_static_arena(...,0) and never reserves a runtime-arena - // region — fail loudly if a caller asks for it anyway, rather than - // returning region_ptr(SIZE_MAX) (base + SIZE_MAX is undefined). - if (runtime_arena_region_off_ == SIZE_MAX) return nullptr; - return static_arena_.region_ptr(runtime_arena_region_off_); + // hbg calls setup_static_arena(...,0) and leaves runtime_arena_pool_ + // uncommitted — fail loudly if a caller asks for it anyway. + if (!runtime_arena_pool_.is_committed()) return nullptr; + return runtime_arena_pool_.base(); } std::thread DeviceRunner::create_thread(std::function fn) { @@ -1241,14 +1255,13 @@ int DeviceRunner::finalize() { // perf_cleanup guard; this is the backstop for the no-run-since-init case. finalize_collectors(); - // Release per-Worker static arena (GM heap + PTO2 SM + optional trb - // prebuilt runtime arena in a single backing device allocation). 
Must - // precede mem_alloc_.finalize() so the arena frees through the still-live + // Release the three per-Worker pooled arenas (GM heap, PTO2 SM, optional + // trb prebuilt runtime arena — each its own device_malloc). Must precede + // mem_alloc_.finalize() so the arenas free through the still-live // allocator, not after it. - static_arena_.release(); - gm_heap_region_off_ = SIZE_MAX; - gm_sm_region_off_ = SIZE_MAX; - runtime_arena_region_off_ = SIZE_MAX; + gm_heap_arena_.release(); + gm_sm_arena_.release(); + runtime_arena_pool_.release(); cached_gm_heap_size_ = 0; cached_gm_sm_size_ = 0; cached_runtime_arena_size_ = 0; diff --git a/src/a2a3/platform/onboard/host/device_runner.h b/src/a2a3/platform/onboard/host/device_runner.h index 4d9819f21..8f6e1b3f9 100644 --- a/src/a2a3/platform/onboard/host/device_runner.h +++ b/src/a2a3/platform/onboard/host/device_runner.h @@ -185,7 +185,9 @@ struct KernelArgsHelper { class DeviceRunner { public: DeviceRunner() : - static_arena_(&arena_alloc_trampoline, &arena_free_trampoline, &mem_alloc_) {} + gm_heap_arena_(&arena_alloc_trampoline, &arena_free_trampoline, &mem_alloc_), + gm_sm_arena_(&arena_alloc_trampoline, &arena_free_trampoline, &mem_alloc_), + runtime_arena_pool_(&arena_alloc_trampoline, &arena_free_trampoline, &mem_alloc_) {} ~DeviceRunner(); /** @@ -612,23 +614,28 @@ class DeviceRunner { // Memory management MemoryAllocator mem_alloc_; - // Per-Worker arena backing the PTO2 GM heap + PTO2 shared memory in a - // single device allocation. Released explicitly in finalize() before - // mem_alloc_.finalize() so it does not free pointers a second time. + // Three independent per-Worker arenas, each backing a single pooled + // region (PTO2 GM heap / PTO2 shared memory / trb prebuilt runtime + // arena). Split out from a single backing allocation because the + // combined size can exceed the device allocator's largest contiguous + // block — three separate device_malloc calls are friendlier than one + // big one. 
Released explicitly in finalize() before mem_alloc_.finalize() + // so the underlying buffers do not get freed twice. + // + // `runtime_arena_pool_` stays unreserved when setup_static_arena was + // invoked with runtime_arena_size == 0 (hbg path). // // Trampolines forward DeviceArena's alloc/free calls to mem_alloc_. static void *arena_alloc_trampoline(void *ctx, size_t size) { return static_cast(ctx)->alloc(size); } static void arena_free_trampoline(void *ctx, void *p) { static_cast(ctx)->free(p); } - DeviceArena static_arena_; - size_t gm_heap_region_off_{SIZE_MAX}; - size_t gm_sm_region_off_{SIZE_MAX}; - // SIZE_MAX (= "not provisioned") when the caller passed runtime_arena_size - // == 0 (hbg path); a real offset for trb. - size_t runtime_arena_region_off_{SIZE_MAX}; - // Cached sizes for setup_static_arena's "fits" check — avoids calling - // region_size() on the arena's public API for the regions we own. + DeviceArena gm_heap_arena_; + DeviceArena gm_sm_arena_; + DeviceArena runtime_arena_pool_; + // Cached sizes for setup_static_arena's "fits" check — avoids re-allocating + // the same buffer when a later worker init asks for an equal-or-smaller + // layout on an already-committed arena. 
size_t cached_gm_heap_size_{0}; size_t cached_gm_sm_size_{0}; size_t cached_runtime_arena_size_{0}; diff --git a/src/a2a3/platform/sim/host/device_runner.cpp b/src/a2a3/platform/sim/host/device_runner.cpp index c221bb714..1651c4a89 100644 --- a/src/a2a3/platform/sim/host/device_runner.cpp +++ b/src/a2a3/platform/sim/host/device_runner.cpp @@ -123,58 +123,67 @@ bool create_temp_so_file(const std::string &path_template, const uint8_t *data, DeviceRunner::~DeviceRunner() { finalize(); } int DeviceRunner::setup_static_arena(size_t gm_heap_size, size_t gm_sm_size, size_t runtime_arena_size) { - if (static_arena_.is_committed()) { - if (gm_heap_size <= cached_gm_heap_size_ && gm_sm_size <= cached_gm_sm_size_ && - runtime_arena_size <= cached_runtime_arena_size_) { + // Three independent device_malloc'd buffers: GM heap, PTO2 SM, prebuilt + // runtime arena. Split out from a single large allocation because the + // combined size can exceed the device allocator's largest contiguous + // block. Each arena commits exactly one region, so its base() is the + // pooled pointer the caller wants. + // + // Idempotent for the production case (sizes do not change across a + // worker's lifetime). If a caller asks for a larger layout on any + // region, redo just that region. 
+ auto commit_region = [](DeviceArena &arena, size_t &cached_size, size_t requested_size) -> int { + if (requested_size == 0) { + if (arena.is_committed() && cached_size != 0) { + arena.release(); + cached_size = 0; + } + return 0; + } + if (arena.is_committed() && requested_size <= cached_size) { return 0; } - static_arena_.release(); - gm_heap_region_off_ = SIZE_MAX; - gm_sm_region_off_ = SIZE_MAX; - runtime_arena_region_off_ = SIZE_MAX; + arena.release(); + cached_size = 0; + arena.reserve(requested_size, DeviceArena::kDefaultBaseAlign); + if (arena.commit(DeviceArena::kDefaultBaseAlign) == nullptr) { + arena.release(); + return -1; + } + cached_size = requested_size; + return 0; + }; + if (commit_region(gm_heap_arena_, cached_gm_heap_size_, gm_heap_size) != 0) return -1; + if (commit_region(gm_sm_arena_, cached_gm_sm_size_, gm_sm_size) != 0) { + gm_heap_arena_.release(); + cached_gm_heap_size_ = 0; + return -1; + } + if (commit_region(runtime_arena_pool_, cached_runtime_arena_size_, runtime_arena_size) != 0) { + gm_heap_arena_.release(); + gm_sm_arena_.release(); cached_gm_heap_size_ = 0; cached_gm_sm_size_ = 0; - cached_runtime_arena_size_ = 0; - } - gm_heap_region_off_ = static_arena_.reserve(gm_heap_size, DeviceArena::kDefaultBaseAlign); - gm_sm_region_off_ = static_arena_.reserve(gm_sm_size, DeviceArena::kDefaultBaseAlign); - if (runtime_arena_size > 0) { - runtime_arena_region_off_ = static_arena_.reserve(runtime_arena_size, DeviceArena::kDefaultBaseAlign); - } - if (static_arena_.commit(DeviceArena::kDefaultBaseAlign) == nullptr) { - // Roll back the reserves: commit() failure leaves committed_=false, - // so the next entry would skip the release branch and stack new - // reserves on top of the stale cursor. release() is idempotent on a - // never-committed arena (just zeroes cursor_ / region_count_). 
- static_arena_.release(); - gm_heap_region_off_ = SIZE_MAX; - gm_sm_region_off_ = SIZE_MAX; - runtime_arena_region_off_ = SIZE_MAX; return -1; } - cached_gm_heap_size_ = gm_heap_size; - cached_gm_sm_size_ = gm_sm_size; - cached_runtime_arena_size_ = runtime_arena_size; return 0; } void *DeviceRunner::acquire_pooled_gm_heap() { - if (!static_arena_.is_committed()) return nullptr; - return static_arena_.region_ptr(gm_heap_region_off_); + if (!gm_heap_arena_.is_committed()) return nullptr; + return gm_heap_arena_.base(); } void *DeviceRunner::acquire_pooled_gm_sm() { - if (!static_arena_.is_committed()) return nullptr; - return static_arena_.region_ptr(gm_sm_region_off_); + if (!gm_sm_arena_.is_committed()) return nullptr; + return gm_sm_arena_.base(); } void *DeviceRunner::acquire_pooled_runtime_arena() { - if (!static_arena_.is_committed()) return nullptr; - // hbg calls setup_static_arena(...,0) and never reserves a runtime-arena - // region — fail loudly if a caller asks for it anyway, rather than - // returning region_ptr(SIZE_MAX) (base + SIZE_MAX is undefined). - if (runtime_arena_region_off_ == SIZE_MAX) return nullptr; - return static_arena_.region_ptr(runtime_arena_region_off_); + // hbg calls setup_static_arena(...,0) and leaves runtime_arena_pool_ + // uncommitted — fail loudly if a caller asks for it anyway. + if (!runtime_arena_pool_.is_committed()) return nullptr; + return runtime_arena_pool_.base(); } std::thread DeviceRunner::create_thread(std::function fn) { @@ -1051,14 +1060,13 @@ int DeviceRunner::finalize() { // Close executor .so files (typically already closed by run(), this is a safety net) unload_executor_binaries(); - // Release per-Worker static arena (GM heap + PTO2 SM + optional trb - // prebuilt runtime arena in a single backing device allocation). 
Must - // precede mem_alloc_.finalize() so the arena frees through the still-live + // Release the three per-Worker pooled arenas (GM heap, PTO2 SM, optional + // trb prebuilt runtime arena — each its own device_malloc). Must precede + // mem_alloc_.finalize() so the arenas free through the still-live // allocator, not after it. - static_arena_.release(); - gm_heap_region_off_ = SIZE_MAX; - gm_sm_region_off_ = SIZE_MAX; - runtime_arena_region_off_ = SIZE_MAX; + gm_heap_arena_.release(); + gm_sm_arena_.release(); + runtime_arena_pool_.release(); cached_gm_heap_size_ = 0; cached_gm_sm_size_ = 0; cached_runtime_arena_size_ = 0; diff --git a/src/a2a3/platform/sim/host/device_runner.h b/src/a2a3/platform/sim/host/device_runner.h index 60f1bfdc9..a98eec1b8 100644 --- a/src/a2a3/platform/sim/host/device_runner.h +++ b/src/a2a3/platform/sim/host/device_runner.h @@ -75,7 +75,9 @@ class DeviceRunner { public: DeviceRunner() : - static_arena_(&arena_alloc_trampoline, &arena_free_trampoline, &mem_alloc_) {} + gm_heap_arena_(&arena_alloc_trampoline, &arena_free_trampoline, &mem_alloc_), + gm_sm_arena_(&arena_alloc_trampoline, &arena_free_trampoline, &mem_alloc_), + runtime_arena_pool_(&arena_alloc_trampoline, &arena_free_trampoline, &mem_alloc_) {} ~DeviceRunner(); /** @@ -287,21 +289,26 @@ class DeviceRunner { // Memory management MemoryAllocator mem_alloc_; - // Per-Worker arena backing the PTO2 GM heap + PTO2 shared memory in a - // single device allocation. Released explicitly in finalize() before - // mem_alloc_.finalize() so it does not free pointers a second time. + // Three independent per-Worker arenas, each backing a single pooled + // region (PTO2 GM heap / PTO2 shared memory / trb prebuilt runtime + // arena). Split out from a single backing allocation because the + // combined size can exceed the device allocator's largest contiguous + // block. Released explicitly in finalize() before mem_alloc_.finalize() + // so the underlying buffers do not get freed twice. 
+ // + // `runtime_arena_pool_` stays unreserved when setup_static_arena was + // invoked with runtime_arena_size == 0 (hbg path). // // Trampolines forward DeviceArena's alloc/free to mem_alloc_. static void *arena_alloc_trampoline(void *ctx, size_t size) { return static_cast(ctx)->alloc(size); } static void arena_free_trampoline(void *ctx, void *p) { static_cast(ctx)->free(p); } - DeviceArena static_arena_; - size_t gm_heap_region_off_{SIZE_MAX}; - size_t gm_sm_region_off_{SIZE_MAX}; - size_t runtime_arena_region_off_{SIZE_MAX}; - // Cached sizes for setup_static_arena's "fits" check — avoids calling - // region_size() on the arena's public API for the regions we own. + DeviceArena gm_heap_arena_; + DeviceArena gm_sm_arena_; + DeviceArena runtime_arena_pool_; + // Cached sizes for setup_static_arena's "fits" check — avoids re-allocating + // a buffer when a later worker init asks for an equal-or-smaller layout. size_t cached_gm_heap_size_{0}; size_t cached_gm_sm_size_{0}; size_t cached_runtime_arena_size_{0}; diff --git a/src/a5/platform/onboard/host/device_runner.cpp b/src/a5/platform/onboard/host/device_runner.cpp index 377e0b8eb..4a26e1056 100644 --- a/src/a5/platform/onboard/host/device_runner.cpp +++ b/src/a5/platform/onboard/host/device_runner.cpp @@ -196,60 +196,67 @@ static int prof_free_cb(void *dev_ptr) { return rtFree(dev_ptr); } DeviceRunner::~DeviceRunner() { finalize(); } int DeviceRunner::setup_static_arena(size_t gm_heap_size, size_t gm_sm_size, size_t runtime_arena_size) { - if (static_arena_.is_committed()) { - // Idempotent for the production case (sizes do not change across a - // worker's lifetime). If a caller asks for a larger layout, redo it. - if (gm_heap_size <= cached_gm_heap_size_ && gm_sm_size <= cached_gm_sm_size_ && - runtime_arena_size <= cached_runtime_arena_size_) { + // Three independent device_malloc'd buffers: GM heap, PTO2 SM, prebuilt + // runtime arena. 
Split out from a single large allocation because the + // combined size can exceed the device allocator's largest contiguous + // block. Each arena commits exactly one region, so its base() is the + // pooled pointer the caller wants. + // + // Idempotent for the production case (sizes do not change across a + // worker's lifetime). If a caller asks for a larger layout on any + // region, redo just that region. + auto commit_region = [](DeviceArena &arena, size_t &cached_size, size_t requested_size) -> int { + if (requested_size == 0) { + if (arena.is_committed() && cached_size != 0) { + arena.release(); + cached_size = 0; + } return 0; } - static_arena_.release(); - gm_heap_region_off_ = SIZE_MAX; - gm_sm_region_off_ = SIZE_MAX; - runtime_arena_region_off_ = SIZE_MAX; + if (arena.is_committed() && requested_size <= cached_size) { + return 0; + } + arena.release(); + cached_size = 0; + arena.reserve(requested_size, DeviceArena::kDefaultBaseAlign); + if (arena.commit(DeviceArena::kDefaultBaseAlign) == nullptr) { + arena.release(); + return -1; + } + cached_size = requested_size; + return 0; + }; + if (commit_region(gm_heap_arena_, cached_gm_heap_size_, gm_heap_size) != 0) return -1; + if (commit_region(gm_sm_arena_, cached_gm_sm_size_, gm_sm_size) != 0) { + gm_heap_arena_.release(); + cached_gm_heap_size_ = 0; + return -1; + } + if (commit_region(runtime_arena_pool_, cached_runtime_arena_size_, runtime_arena_size) != 0) { + gm_heap_arena_.release(); + gm_sm_arena_.release(); cached_gm_heap_size_ = 0; cached_gm_sm_size_ = 0; - cached_runtime_arena_size_ = 0; - } - gm_heap_region_off_ = static_arena_.reserve(gm_heap_size, DeviceArena::kDefaultBaseAlign); - gm_sm_region_off_ = static_arena_.reserve(gm_sm_size, DeviceArena::kDefaultBaseAlign); - if (runtime_arena_size > 0) { - runtime_arena_region_off_ = static_arena_.reserve(runtime_arena_size, DeviceArena::kDefaultBaseAlign); - } - if (static_arena_.commit(DeviceArena::kDefaultBaseAlign) == nullptr) { - // Roll back 
the reserves: commit() failure leaves committed_=false, - // so the next entry would skip the release branch and stack new - // reserves on top of the stale cursor. release() is idempotent on a - // never-committed arena (just zeroes cursor_ / region_count_). - static_arena_.release(); - gm_heap_region_off_ = SIZE_MAX; - gm_sm_region_off_ = SIZE_MAX; - runtime_arena_region_off_ = SIZE_MAX; return -1; } - cached_gm_heap_size_ = gm_heap_size; - cached_gm_sm_size_ = gm_sm_size; - cached_runtime_arena_size_ = runtime_arena_size; return 0; } void *DeviceRunner::acquire_pooled_gm_heap() { - if (!static_arena_.is_committed()) return nullptr; - return static_arena_.region_ptr(gm_heap_region_off_); + if (!gm_heap_arena_.is_committed()) return nullptr; + return gm_heap_arena_.base(); } void *DeviceRunner::acquire_pooled_gm_sm() { - if (!static_arena_.is_committed()) return nullptr; - return static_arena_.region_ptr(gm_sm_region_off_); + if (!gm_sm_arena_.is_committed()) return nullptr; + return gm_sm_arena_.base(); } void *DeviceRunner::acquire_pooled_runtime_arena() { - if (!static_arena_.is_committed()) return nullptr; - // hbg calls setup_static_arena(...,0) and never reserves a runtime-arena - // region — fail loudly if a caller asks for it anyway, rather than - // returning region_ptr(SIZE_MAX) (base + SIZE_MAX is undefined). - if (runtime_arena_region_off_ == SIZE_MAX) return nullptr; - return static_arena_.region_ptr(runtime_arena_region_off_); + // hbg calls setup_static_arena(...,0) and leaves runtime_arena_pool_ + // uncommitted — fail loudly if a caller asks for it anyway. + if (!runtime_arena_pool_.is_committed()) return nullptr; + return runtime_arena_pool_.base(); } std::thread DeviceRunner::create_thread(std::function fn) { @@ -1060,13 +1067,13 @@ int DeviceRunner::finalize() { pmu_collector_.finalize(/*unregister_cb=*/nullptr, prof_free_cb); } - // Release per-Worker static arena (GM heap + PTO2 SM in a single backing - // device allocation). 
Must precede mem_alloc_.finalize() so the arena - // frees through the still-live allocator, not after it. - static_arena_.release(); - gm_heap_region_off_ = SIZE_MAX; - gm_sm_region_off_ = SIZE_MAX; - runtime_arena_region_off_ = SIZE_MAX; + // Release the three per-Worker pooled arenas (GM heap, PTO2 SM, optional + // trb prebuilt runtime arena — each its own device_malloc). Must precede + // mem_alloc_.finalize() so the arenas free through the still-live + // allocator, not after it. + gm_heap_arena_.release(); + gm_sm_arena_.release(); + runtime_arena_pool_.release(); cached_gm_heap_size_ = 0; cached_gm_sm_size_ = 0; cached_runtime_arena_size_ = 0; diff --git a/src/a5/platform/onboard/host/device_runner.h b/src/a5/platform/onboard/host/device_runner.h index 9edad84fa..754514fe5 100644 --- a/src/a5/platform/onboard/host/device_runner.h +++ b/src/a5/platform/onboard/host/device_runner.h @@ -174,7 +174,9 @@ struct KernelArgsHelper { class DeviceRunner { public: DeviceRunner() : - static_arena_(&arena_alloc_trampoline, &arena_free_trampoline, &mem_alloc_) {} + gm_heap_arena_(&arena_alloc_trampoline, &arena_free_trampoline, &mem_alloc_), + gm_sm_arena_(&arena_alloc_trampoline, &arena_free_trampoline, &mem_alloc_), + runtime_arena_pool_(&arena_alloc_trampoline, &arena_free_trampoline, &mem_alloc_) {} ~DeviceRunner(); /** @@ -522,21 +524,26 @@ class DeviceRunner { // Memory management MemoryAllocator mem_alloc_; - // Per-Worker arena backing the PTO2 GM heap + PTO2 shared memory in a - // single device allocation. Released explicitly in finalize() before - // mem_alloc_.finalize() so it does not free pointers a second time. + // Three independent per-Worker arenas, each backing a single pooled + // region (PTO2 GM heap / PTO2 shared memory / trb prebuilt runtime + // arena). Split out from a single backing allocation because the + // combined size can exceed the device allocator's largest contiguous + // block. 
Released explicitly in finalize() before mem_alloc_.finalize() + // so the underlying buffers do not get freed twice. + // + // `runtime_arena_pool_` stays unreserved when setup_static_arena was + // invoked with runtime_arena_size == 0 (hbg path). // // Trampolines forward DeviceArena's alloc/free calls to mem_alloc_. static void *arena_alloc_trampoline(void *ctx, size_t size) { return static_cast(ctx)->alloc(size); } static void arena_free_trampoline(void *ctx, void *p) { static_cast(ctx)->free(p); } - DeviceArena static_arena_; - size_t gm_heap_region_off_{SIZE_MAX}; - size_t gm_sm_region_off_{SIZE_MAX}; - size_t runtime_arena_region_off_{SIZE_MAX}; - // Cached sizes for setup_static_arena's "fits" check — avoids calling - // region_size() on the arena's public API for the two regions we own. + DeviceArena gm_heap_arena_; + DeviceArena gm_sm_arena_; + DeviceArena runtime_arena_pool_; + // Cached sizes for setup_static_arena's "fits" check — avoids re-allocating + // a buffer when a later worker init asks for an equal-or-smaller layout. size_t cached_gm_heap_size_{0}; size_t cached_gm_sm_size_{0}; size_t cached_runtime_arena_size_{0}; diff --git a/src/a5/platform/sim/host/device_runner.cpp b/src/a5/platform/sim/host/device_runner.cpp index 8cbac796c..a20b9d44d 100644 --- a/src/a5/platform/sim/host/device_runner.cpp +++ b/src/a5/platform/sim/host/device_runner.cpp @@ -113,55 +113,61 @@ static int prof_free_cb(void *dev_ptr) { DeviceRunner::~DeviceRunner() { finalize(); } int DeviceRunner::setup_static_arena(size_t gm_heap_size, size_t gm_sm_size, size_t runtime_arena_size) { - if (static_arena_.is_committed()) { - if (gm_heap_size <= cached_gm_heap_size_ && gm_sm_size <= cached_gm_sm_size_ && - runtime_arena_size <= cached_runtime_arena_size_) { + // Three independent device_malloc'd buffers: GM heap, PTO2 SM, prebuilt + // runtime arena. 
Split out from a single large allocation because the + // combined size can exceed the device allocator's largest contiguous + // block. Each arena commits exactly one region, so its base() is the + // pooled pointer the caller wants. + auto commit_region = [](DeviceArena &arena, size_t &cached_size, size_t requested_size) -> int { + if (requested_size == 0) { + if (arena.is_committed() && cached_size != 0) { + arena.release(); + cached_size = 0; + } + return 0; + } + if (arena.is_committed() && requested_size <= cached_size) { return 0; } - static_arena_.release(); - gm_heap_region_off_ = SIZE_MAX; - gm_sm_region_off_ = SIZE_MAX; - runtime_arena_region_off_ = SIZE_MAX; + arena.release(); + cached_size = 0; + arena.reserve(requested_size, DeviceArena::kDefaultBaseAlign); + if (arena.commit(DeviceArena::kDefaultBaseAlign) == nullptr) { + arena.release(); + return -1; + } + cached_size = requested_size; + return 0; + }; + if (commit_region(gm_heap_arena_, cached_gm_heap_size_, gm_heap_size) != 0) return -1; + if (commit_region(gm_sm_arena_, cached_gm_sm_size_, gm_sm_size) != 0) { + gm_heap_arena_.release(); + cached_gm_heap_size_ = 0; + return -1; + } + if (commit_region(runtime_arena_pool_, cached_runtime_arena_size_, runtime_arena_size) != 0) { + gm_heap_arena_.release(); + gm_sm_arena_.release(); cached_gm_heap_size_ = 0; cached_gm_sm_size_ = 0; - cached_runtime_arena_size_ = 0; - } - gm_heap_region_off_ = static_arena_.reserve(gm_heap_size, DeviceArena::kDefaultBaseAlign); - gm_sm_region_off_ = static_arena_.reserve(gm_sm_size, DeviceArena::kDefaultBaseAlign); - if (runtime_arena_size > 0) { - runtime_arena_region_off_ = static_arena_.reserve(runtime_arena_size, DeviceArena::kDefaultBaseAlign); - } - if (static_arena_.commit(DeviceArena::kDefaultBaseAlign) == nullptr) { - // Roll back the reserves: commit() failure leaves committed_=false, - // so the next entry would skip the release branch and stack new - // reserves on top of the stale cursor. 
release() is idempotent on a - // never-committed arena (just zeroes cursor_ / region_count_). - static_arena_.release(); - gm_heap_region_off_ = SIZE_MAX; - gm_sm_region_off_ = SIZE_MAX; - runtime_arena_region_off_ = SIZE_MAX; return -1; } - cached_gm_heap_size_ = gm_heap_size; - cached_gm_sm_size_ = gm_sm_size; - cached_runtime_arena_size_ = runtime_arena_size; return 0; } void *DeviceRunner::acquire_pooled_gm_heap() { - if (!static_arena_.is_committed()) return nullptr; - return static_arena_.region_ptr(gm_heap_region_off_); + if (!gm_heap_arena_.is_committed()) return nullptr; + return gm_heap_arena_.base(); } void *DeviceRunner::acquire_pooled_gm_sm() { - if (!static_arena_.is_committed()) return nullptr; - return static_arena_.region_ptr(gm_sm_region_off_); + if (!gm_sm_arena_.is_committed()) return nullptr; + return gm_sm_arena_.base(); } void *DeviceRunner::acquire_pooled_runtime_arena() { - if (!static_arena_.is_committed()) return nullptr; - if (runtime_arena_region_off_ == SIZE_MAX) return nullptr; - return static_arena_.region_ptr(runtime_arena_region_off_); + if (!runtime_arena_pool_.is_committed()) return nullptr; + return runtime_arena_pool_.base(); } std::thread DeviceRunner::create_thread(std::function fn) { @@ -945,13 +951,13 @@ int DeviceRunner::finalize() { // Close executor .so files (typically already closed by run(), this is a safety net) unload_executor_binaries(); - // Release per-Worker static arena (GM heap + PTO2 SM in a single backing - // device allocation). Must precede mem_alloc_.finalize() so the arena - // frees through the still-live allocator, not after it. - static_arena_.release(); - gm_heap_region_off_ = SIZE_MAX; - gm_sm_region_off_ = SIZE_MAX; - runtime_arena_region_off_ = SIZE_MAX; + // Release the three per-Worker pooled arenas (GM heap, PTO2 SM, optional + // trb prebuilt runtime arena — each its own device_malloc). 
Must precede + // mem_alloc_.finalize() so the arenas free through the still-live + // allocator, not after it. + gm_heap_arena_.release(); + gm_sm_arena_.release(); + runtime_arena_pool_.release(); cached_gm_heap_size_ = 0; cached_gm_sm_size_ = 0; cached_runtime_arena_size_ = 0; diff --git a/src/a5/platform/sim/host/device_runner.h b/src/a5/platform/sim/host/device_runner.h index f4fe44121..468fd6e44 100644 --- a/src/a5/platform/sim/host/device_runner.h +++ b/src/a5/platform/sim/host/device_runner.h @@ -72,7 +72,9 @@ class DeviceRunner { public: DeviceRunner() : - static_arena_(&arena_alloc_trampoline, &arena_free_trampoline, &mem_alloc_) {} + gm_heap_arena_(&arena_alloc_trampoline, &arena_free_trampoline, &mem_alloc_), + gm_sm_arena_(&arena_alloc_trampoline, &arena_free_trampoline, &mem_alloc_), + runtime_arena_pool_(&arena_alloc_trampoline, &arena_free_trampoline, &mem_alloc_) {} ~DeviceRunner(); /** @@ -286,21 +288,26 @@ class DeviceRunner { // Memory management MemoryAllocator mem_alloc_; - // Per-Worker arena backing the PTO2 GM heap + PTO2 shared memory in a - // single device allocation. Released explicitly in finalize() before - // mem_alloc_.finalize() so it does not free pointers a second time. + // Three independent per-Worker arenas, each backing a single pooled + // region (PTO2 GM heap / PTO2 shared memory / trb prebuilt runtime + // arena). Split out from a single backing allocation because the + // combined size can exceed the device allocator's largest contiguous + // block. Released explicitly in finalize() before mem_alloc_.finalize() + // so the underlying buffers do not get freed twice. + // + // `runtime_arena_pool_` stays unreserved when setup_static_arena was + // invoked with runtime_arena_size == 0 (hbg path). // // Trampolines forward DeviceArena's alloc/free to mem_alloc_. 
static void *arena_alloc_trampoline(void *ctx, size_t size) { return static_cast(ctx)->alloc(size); } static void arena_free_trampoline(void *ctx, void *p) { static_cast(ctx)->free(p); } - DeviceArena static_arena_; - size_t gm_heap_region_off_{SIZE_MAX}; - size_t gm_sm_region_off_{SIZE_MAX}; - size_t runtime_arena_region_off_{SIZE_MAX}; - // Cached sizes for setup_static_arena's "fits" check — avoids calling - // region_size() on the arena's public API for the two regions we own. + DeviceArena gm_heap_arena_; + DeviceArena gm_sm_arena_; + DeviceArena runtime_arena_pool_; + // Cached sizes for setup_static_arena's "fits" check — avoids re-allocating + // a buffer when a later worker init asks for an equal-or-smaller layout. size_t cached_gm_heap_size_{0}; size_t cached_gm_sm_size_{0}; size_t cached_runtime_arena_size_{0}; From bd5bf35a6c837223702f82c91e7470afdb0df907 Mon Sep 17 00:00:00 2001 From: poursoul Date: Wed, 27 May 2026 16:27:03 +0800 Subject: [PATCH 6/7] Refactor: post-review cleanups for trb host-build arena PR MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Address review feedback covering doc / comment consistency and a small set of behavioral symmetry items between a2a3 and a5 trb runtimes: - pto_orchestrator.h: drop the stale ",orch" from the wire_arena_pointers comment on a2a3 (PTO2TensorMap::orch was removed in 75f2562c but a2a3 kept the comment lagging behind the a5 mirror). - runtime.h / device_runner.h (both arches): refresh the setup_static_arena / acquire_pooled_* docblocks. Drop the orphan pre-split prose ("backs both the PTO2 GM heap and the PTO2 shared memory in a single underlying allocation") and the "doing so returns an unreserved-offset region_ptr (undefined)" wording that no longer matches the three-independent-arenas split — acquire_pooled_runtime_arena now returns a well-defined nullptr on the hbg path. 
- a5 device_runner.{h,cpp}: restore the rationale comments that the a5 mirror lost when it copied a2a3's earlier shape — three separate device_malloc calls being friendlier than one big one, hbg's runtime_arena_size == 0 contract, commit() failure rollback invariants, idempotent peer-arena policy. Keeps the why-this-way notes symmetric with a2a3. - a5 runtime.h: fix the RUNTIME_MAX_ORCH_SO_SIZE comment that claimed "1MB" while the macro expands to 4MB. - a5 pto_orchestrator.cpp: drop the prod_state->task null / task_id defensive guard. PTO2TensorMap lookup chain truncation already guarantees producer_task_id >= last_task_alive, and producers reach the tensormap only after prepare_task has bound the slot. Matches the a2a3 shape that relies on the same invariants. - a5 cpput: migrate the three stale UTs (test_ready_queue, test_spsc_queue, test_tensormap) to the new 4-phase reserve_layout / init_data_from_layout / wire_arena_pointers API. Wire them and the previously-orphaned a5 trb UTs into CMakeLists.txt behind a new a5_rt_objs OBJECT library + add_a5_runtime_test helper (mirrors a2a3_rt_objs). Target names carry the test_a5_ prefix to avoid clashing with hierarchical / a2a3 unprefixed test names. Tests - cpput: 35/35 pass (25 a2a3 + 10 newly enabled a5 trb). - a5sim: full sim suite passes. - a2a3sim: full sim suite passes (regression). 
--- .../platform/onboard/host/device_runner.h | 25 +++++---- src/a2a3/platform/sim/host/device_runner.h | 20 +++---- .../runtime/pto_orchestrator.h | 2 +- .../runtime/runtime.h | 32 +++++------ .../platform/onboard/host/device_runner.cpp | 9 +++- src/a5/platform/onboard/host/device_runner.h | 28 +++++----- src/a5/platform/sim/host/device_runner.cpp | 11 ++++ src/a5/platform/sim/host/device_runner.h | 22 ++++---- .../runtime/pto_orchestrator.cpp | 3 -- .../runtime/runtime.h | 29 +++++----- tests/ut/cpp/CMakeLists.txt | 54 +++++++++++++++++++ tests/ut/cpp/a5/test_ready_queue.cpp | 42 ++++++++++++--- tests/ut/cpp/a5/test_spsc_queue.cpp | 32 ++++++++--- tests/ut/cpp/a5/test_tensormap.cpp | 23 ++++++-- 14 files changed, 228 insertions(+), 104 deletions(-) diff --git a/src/a2a3/platform/onboard/host/device_runner.h b/src/a2a3/platform/onboard/host/device_runner.h index 8f6e1b3f9..93501a916 100644 --- a/src/a2a3/platform/onboard/host/device_runner.h +++ b/src/a2a3/platform/onboard/host/device_runner.h @@ -191,27 +191,26 @@ class DeviceRunner { ~DeviceRunner(); /** - * Lay out and commit the per-Worker static device arena that backs the - * PTO2 GM heap, the PTO2 shared memory, and the trb prebuilt runtime - * arena in a single underlying allocation. Must be called before any - * acquire_pooled_*. Idempotent on identical sizes. `runtime_arena_size` - * is 0 for the hbg path (no prebuilt runtime arena). Returns 0 on - * success, -1 on failure. + * Commit the three per-Worker pooled regions (PTO2 GM heap, PTO2 shared + * memory, trb prebuilt runtime arena) as three independent device + * allocations. Must be called before any acquire_pooled_*. Idempotent + * on identical sizes. `runtime_arena_size` is 0 for the hbg path (no + * prebuilt runtime arena) — the corresponding arena stays uncommitted. + * Returns 0 on success, -1 on failure. 
*/ int setup_static_arena(size_t gm_heap_size, size_t gm_sm_size, size_t runtime_arena_size); /** * Return the pooled GM heap / PTO2 SM / runtime arena pointer. - * setup_static_arena must have been called earlier in this Worker; + * setup_static_arena must have already committed the relevant region; * otherwise these return nullptr. All pointers are stable for the - * Worker's lifetime; the single underlying device buffer is released in - * `finalize()`. + * Worker's lifetime; the three underlying device buffers are released + * in `finalize()`. * * acquire_pooled_runtime_arena() is trb-only — the runtime arena region - * is only reserved when setup_static_arena was called with - * runtime_arena_size > 0. hbg's runtime_maker never calls this; doing so - * after setup_static_arena(...,0) returns an unreserved-offset region_ptr - * (undefined). Keep the call site discipline at the runtime_maker layer. + * is only committed when setup_static_arena was called with + * runtime_arena_size > 0. Calling it on the hbg path + * (setup_static_arena(...,0)) returns nullptr (well-defined). */ void *acquire_pooled_gm_heap(); void *acquire_pooled_gm_sm(); diff --git a/src/a2a3/platform/sim/host/device_runner.h b/src/a2a3/platform/sim/host/device_runner.h index a98eec1b8..46ee45913 100644 --- a/src/a2a3/platform/sim/host/device_runner.h +++ b/src/a2a3/platform/sim/host/device_runner.h @@ -81,23 +81,23 @@ class DeviceRunner { ~DeviceRunner(); /** - * Lay out and commit the per-Worker static device arena that backs the - * PTO2 GM heap, the PTO2 shared memory, and the trb prebuilt runtime - * arena in a single underlying allocation. Must be called before any - * acquire_pooled_*. `runtime_arena_size` is 0 for hbg. Idempotent on - * identical sizes. Returns 0 on success, -1 on failure. + * Commit the three per-Worker pooled regions (PTO2 GM heap, PTO2 shared + * memory, trb prebuilt runtime arena) as three independent device + * allocations. 
Must be called before any acquire_pooled_*. + * `runtime_arena_size` is 0 for the hbg path (leaves that arena + * uncommitted). Idempotent on identical sizes. Returns 0 on success, + * -1 on failure. */ int setup_static_arena(size_t gm_heap_size, size_t gm_sm_size, size_t runtime_arena_size); /** * Return the pooled GM heap / PTO2 SM / runtime arena pointer. - * setup_static_arena must have been called earlier in this Worker. + * setup_static_arena must have already committed the relevant region. * * acquire_pooled_runtime_arena() is trb-only — the runtime arena region - * is only reserved when setup_static_arena was called with - * runtime_arena_size > 0. hbg's runtime_maker never calls this; doing so - * after setup_static_arena(...,0) returns an unreserved-offset region_ptr - * (undefined). Keep the call site discipline at the runtime_maker layer. + * is only committed when setup_static_arena was called with + * runtime_arena_size > 0. Calling it on the hbg path + * (setup_static_arena(...,0)) returns nullptr (well-defined). */ void *acquire_pooled_gm_heap(); void *acquire_pooled_gm_sm(); diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_orchestrator.h b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_orchestrator.h index 6e67cb597..7dd47b19a 100644 --- a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_orchestrator.h +++ b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_orchestrator.h @@ -153,7 +153,7 @@ struct PTO2OrchestratorState { // Phase 3b: write the arena-internal pointer fields (scope_tasks, // scope_begins, rings[].fanin_pool.base, tensor_map.{buckets,entry_pool, - // free_entry_list,task_entry_heads,orch}, scheduler reference). + // free_entry_list,task_entry_heads}, scheduler reference). // Idempotent — host runs once on the image, AICPU runs once after attach. 
void wire_arena_pointers(const PTO2OrchestratorLayout &layout, DeviceArena &arena, PTO2SchedulerState *scheduler); diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/runtime.h b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/runtime.h index 117621ca2..8e1bb1567 100644 --- a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/runtime.h +++ b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/runtime.h @@ -119,28 +119,22 @@ struct HostApi { void (*device_free)(void *dev_ptr); int (*copy_to_device)(void *dev_ptr, const void *host_ptr, size_t size); int (*copy_from_device)(void *host_ptr, const void *dev_ptr, size_t size); - // Lay out and commit the per-Worker static device arena that backs both - // the PTO2 GM heap and the PTO2 shared memory in a single underlying - // allocation. Must be called once before acquire_pooled_gm_heap / - // acquire_pooled_gm_sm. Idempotent on identical sizes; returns 0 on - // success, -1 on allocation failure. - // Lay out three pooled regions in a single backing device allocation: - // GM heap, PTO2 shared memory, and the trb prebuilt runtime arena. - // `runtime_arena_size == 0` skips the last region (hbg path: hbg has no - // prebuilt runtime arena). Returns 0 on success, -1 on allocation - // failure. + // Commit the three per-Worker pooled regions (PTO2 GM heap, PTO2 shared + // memory, trb prebuilt runtime arena) as three independent device + // allocations. `runtime_arena_size == 0` skips the third region (hbg + // path: hbg has no prebuilt runtime arena). Idempotent on identical + // sizes; returns 0 on success, -1 on allocation failure. int (*setup_static_arena)(size_t gm_heap_size, size_t gm_sm_size, size_t runtime_arena_size); // Return the per-Worker pooled pointer for the PTO2 GM heap / shared - // memory / prebuilt runtime arena. 
The static arena must already be - // committed via setup_static_arena; the returned pointer is owned by - // the DeviceRunner and freed in `DeviceRunner::finalize()` — do NOT - // pass it to device_free or record it in `tensor_pairs_`. + // memory / prebuilt runtime arena. setup_static_arena must have already + // committed the relevant region; the returned pointer is owned by the + // DeviceRunner and freed in `DeviceRunner::finalize()` — do NOT pass it + // to device_free or record it in `tensor_pairs_`. // - // acquire_pooled_runtime_arena is trb-only — the host side reserves the - // runtime-arena region only when setup_static_arena is invoked with - // runtime_arena_size > 0. hbg's runtime_maker.cpp must not call it - // (setup_static_arena(...,0) leaves the offset unreserved, and the - // returned region_ptr would be undefined). + // acquire_pooled_runtime_arena is trb-only — the runtime-arena region is + // only committed when setup_static_arena was invoked with + // runtime_arena_size > 0. Calling it on the hbg path + // (setup_static_arena(...,0)) returns nullptr (not undefined). void *(*acquire_pooled_gm_heap)(); void *(*acquire_pooled_gm_sm)(); void *(*acquire_pooled_runtime_arena)(); diff --git a/src/a5/platform/onboard/host/device_runner.cpp b/src/a5/platform/onboard/host/device_runner.cpp index 4a26e1056..b8dc9bb46 100644 --- a/src/a5/platform/onboard/host/device_runner.cpp +++ b/src/a5/platform/onboard/host/device_runner.cpp @@ -204,9 +204,13 @@ int DeviceRunner::setup_static_arena(size_t gm_heap_size, size_t gm_sm_size, siz // // Idempotent for the production case (sizes do not change across a // worker's lifetime). If a caller asks for a larger layout on any - // region, redo just that region. + // region, redo just that region — already-committed peers stay alive + // so their callers don't have to re-acquire. 
auto commit_region = [](DeviceArena &arena, size_t &cached_size, size_t requested_size) -> int { if (requested_size == 0) { + // hbg's runtime_arena path: caller passed 0 and never reserved + // a region. Leave the arena uncommitted; acquire_pooled_* will + // return nullptr. if (arena.is_committed() && cached_size != 0) { arena.release(); cached_size = 0; @@ -220,6 +224,9 @@ int DeviceRunner::setup_static_arena(size_t gm_heap_size, size_t gm_sm_size, siz cached_size = 0; arena.reserve(requested_size, DeviceArena::kDefaultBaseAlign); if (arena.commit(DeviceArena::kDefaultBaseAlign) == nullptr) { + // commit() failure leaves committed_=false but a stale reserve + // cursor behind. Reset it now; release() is idempotent on a + // never-committed arena (zeroes cursor_). arena.release(); return -1; } diff --git a/src/a5/platform/onboard/host/device_runner.h b/src/a5/platform/onboard/host/device_runner.h index 754514fe5..0d8cc0397 100644 --- a/src/a5/platform/onboard/host/device_runner.h +++ b/src/a5/platform/onboard/host/device_runner.h @@ -180,27 +180,26 @@ class DeviceRunner { ~DeviceRunner(); /** - * Lay out and commit the per-Worker static device arena that backs the - * PTO2 GM heap, the PTO2 shared memory, and the trb prebuilt runtime - * arena in a single underlying allocation. Must be called before any - * acquire_pooled_*. Idempotent on identical sizes. `runtime_arena_size` - * is 0 for the hbg path (no prebuilt runtime arena). Returns 0 on - * success, -1 on failure. + * Commit the three per-Worker pooled regions (PTO2 GM heap, PTO2 shared + * memory, trb prebuilt runtime arena) as three independent device + * allocations. Must be called before any acquire_pooled_*. Idempotent + * on identical sizes. `runtime_arena_size` is 0 for the hbg path (no + * prebuilt runtime arena) — the corresponding arena stays uncommitted. + * Returns 0 on success, -1 on failure. 
*/ int setup_static_arena(size_t gm_heap_size, size_t gm_sm_size, size_t runtime_arena_size); /** * Return the pooled GM heap / PTO2 SM / runtime arena pointer. - * setup_static_arena must have been called earlier in this Worker; + * setup_static_arena must have already committed the relevant region; * otherwise these return nullptr. All pointers are stable for the - * Worker's lifetime; the single underlying device buffer is released in - * `finalize()`. + * Worker's lifetime; the three underlying device buffers are released + * in `finalize()`. * * acquire_pooled_runtime_arena() is trb-only — the runtime arena region - * is only reserved when setup_static_arena was called with - * runtime_arena_size > 0. hbg's runtime_maker never calls this; doing so - * after setup_static_arena(...,0) returns an unreserved-offset region_ptr - * (undefined). Keep the call site discipline at the runtime_maker layer. + * is only committed when setup_static_arena was called with + * runtime_arena_size > 0. Calling it on the hbg path + * (setup_static_arena(...,0)) returns nullptr (well-defined). */ void *acquire_pooled_gm_heap(); void *acquire_pooled_gm_sm(); @@ -528,7 +527,8 @@ class DeviceRunner { // region (PTO2 GM heap / PTO2 shared memory / trb prebuilt runtime // arena). Split out from a single backing allocation because the // combined size can exceed the device allocator's largest contiguous - // block. Released explicitly in finalize() before mem_alloc_.finalize() + // block — three separate device_malloc calls are friendlier than one + // big one. Released explicitly in finalize() before mem_alloc_.finalize() // so the underlying buffers do not get freed twice. 
// // `runtime_arena_pool_` stays unreserved when setup_static_arena was diff --git a/src/a5/platform/sim/host/device_runner.cpp b/src/a5/platform/sim/host/device_runner.cpp index a20b9d44d..fe3e938e1 100644 --- a/src/a5/platform/sim/host/device_runner.cpp +++ b/src/a5/platform/sim/host/device_runner.cpp @@ -118,8 +118,16 @@ int DeviceRunner::setup_static_arena(size_t gm_heap_size, size_t gm_sm_size, siz // combined size can exceed the device allocator's largest contiguous // block. Each arena commits exactly one region, so its base() is the // pooled pointer the caller wants. + // + // Idempotent for the production case (sizes do not change across a + // worker's lifetime). If a caller asks for a larger layout on any + // region, redo just that region — already-committed peers stay alive + // so their callers don't have to re-acquire. auto commit_region = [](DeviceArena &arena, size_t &cached_size, size_t requested_size) -> int { if (requested_size == 0) { + // hbg's runtime_arena path: caller passed 0 and never reserved + // a region. Leave the arena uncommitted; acquire_pooled_* will + // return nullptr. if (arena.is_committed() && cached_size != 0) { arena.release(); cached_size = 0; @@ -133,6 +141,9 @@ int DeviceRunner::setup_static_arena(size_t gm_heap_size, size_t gm_sm_size, siz cached_size = 0; arena.reserve(requested_size, DeviceArena::kDefaultBaseAlign); if (arena.commit(DeviceArena::kDefaultBaseAlign) == nullptr) { + // commit() failure leaves committed_=false, so the next entry's + // is_committed() guard skips the release branch. release() is + // idempotent on a never-committed arena (zeroes cursor_). 
arena.release(); return -1; } diff --git a/src/a5/platform/sim/host/device_runner.h b/src/a5/platform/sim/host/device_runner.h index 468fd6e44..59b685572 100644 --- a/src/a5/platform/sim/host/device_runner.h +++ b/src/a5/platform/sim/host/device_runner.h @@ -78,22 +78,23 @@ class DeviceRunner { ~DeviceRunner(); /** - * Lay out and commit the per-Worker static device arena that backs the - * PTO2 GM heap, the PTO2 shared memory, and the trb prebuilt runtime - * arena in a single underlying allocation. Must be called before any - * acquire_pooled_*. Idempotent on identical sizes. `runtime_arena_size` - * is 0 for the hbg path (no prebuilt runtime arena). Returns 0 on - * success, -1 on failure. + * Commit the three per-Worker pooled regions (PTO2 GM heap, PTO2 shared + * memory, trb prebuilt runtime arena) as three independent device + * allocations. Must be called before any acquire_pooled_*. Idempotent + * on identical sizes. `runtime_arena_size` is 0 for the hbg path + * (leaves that arena uncommitted). Returns 0 on success, -1 on + * failure. */ int setup_static_arena(size_t gm_heap_size, size_t gm_sm_size, size_t runtime_arena_size); /** * Return the pooled GM heap / PTO2 SM / runtime arena pointer. - * setup_static_arena must have been called earlier in this Worker; + * setup_static_arena must have already committed the relevant region; * otherwise these return nullptr. * - * acquire_pooled_runtime_arena() is trb-only — the region exists only - * when setup_static_arena was called with runtime_arena_size > 0. + * acquire_pooled_runtime_arena() is trb-only — the region is only + * committed when setup_static_arena was called with + * runtime_arena_size > 0. Calling it on the hbg path returns nullptr. */ void *acquire_pooled_gm_heap(); void *acquire_pooled_gm_sm(); @@ -292,7 +293,8 @@ class DeviceRunner { // region (PTO2 GM heap / PTO2 shared memory / trb prebuilt runtime // arena). 
Split out from a single backing allocation because the // combined size can exceed the device allocator's largest contiguous - // block. Released explicitly in finalize() before mem_alloc_.finalize() + // block — three separate device_malloc calls are friendlier than one + // big one. Released explicitly in finalize() before mem_alloc_.finalize() // so the underlying buffers do not get freed twice. // // `runtime_arena_pool_` stays unreserved when setup_static_arena was diff --git a/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_orchestrator.cpp b/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_orchestrator.cpp index 48368cf6a..c937fd986 100644 --- a/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_orchestrator.cpp +++ b/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_orchestrator.cpp @@ -507,9 +507,6 @@ static TaskOutputTensors submit_task_common( auto runtime_emit = [&](PTO2TaskId producer_task_id) -> bool { PTO2TaskSlotState *prod_state = &orch->sm_header->rings[producer_task_id.ring()].get_slot_state_by_task_id(producer_task_id.local()); - if (prod_state->task == nullptr || prod_state->task->task_id != producer_task_id) { - return true; // producer slot reused for a different task — dep is moot - } return append_fanin_or_fail(orch, prod_state, &fanin_builder, ring_id); }; diff --git a/src/a5/runtime/tensormap_and_ringbuffer/runtime/runtime.h b/src/a5/runtime/tensormap_and_ringbuffer/runtime/runtime.h index 4a7dce1bd..4a690e8ca 100644 --- a/src/a5/runtime/tensormap_and_ringbuffer/runtime/runtime.h +++ b/src/a5/runtime/tensormap_and_ringbuffer/runtime/runtime.h @@ -48,7 +48,7 @@ #define RUNTIME_MAX_ARGS 128 #define RUNTIME_MAX_WORKER 108 // 36 AIC + 72 AIV cores #define RUNTIME_MAX_FUNC_ID 1024 -#define RUNTIME_MAX_ORCH_SO_SIZE (4 * 1024 * 1024) // 1MB max for orchestration SO +#define RUNTIME_MAX_ORCH_SO_SIZE (4 * 1024 * 1024) // 4MB max for orchestration SO #define RUNTIME_MAX_ORCH_SYMBOL_NAME 64 // Default ready queue shards: one shard per 
worker thread (total minus orchestrator) @@ -127,23 +127,22 @@ struct HostApi { void (*device_free)(void *dev_ptr); int (*copy_to_device)(void *dev_ptr, const void *host_ptr, size_t size); int (*copy_from_device)(void *host_ptr, const void *dev_ptr, size_t size); - // Lay out three pooled regions in a single backing device allocation: - // GM heap, PTO2 shared memory, and the trb prebuilt runtime arena. - // `runtime_arena_size == 0` skips the last region (hbg path: hbg has no - // prebuilt runtime arena). Returns 0 on success, -1 on allocation - // failure. + // Commit the three per-Worker pooled regions (PTO2 GM heap, PTO2 shared + // memory, trb prebuilt runtime arena) as three independent device + // allocations. `runtime_arena_size == 0` skips the third region (hbg + // path: hbg has no prebuilt runtime arena). Idempotent on identical + // sizes; returns 0 on success, -1 on allocation failure. int (*setup_static_arena)(size_t gm_heap_size, size_t gm_sm_size, size_t runtime_arena_size); // Return the per-Worker pooled pointer for the PTO2 GM heap / shared - // memory / prebuilt runtime arena. The static arena must already be - // committed via setup_static_arena; the returned pointer is owned by - // the DeviceRunner and freed in `DeviceRunner::finalize()` — do NOT - // pass it to device_free or record it in `tensor_pairs_`. + // memory / prebuilt runtime arena. setup_static_arena must have already + // committed the relevant region; the returned pointer is owned by the + // DeviceRunner and freed in `DeviceRunner::finalize()` — do NOT pass it + // to device_free or record it in `tensor_pairs_`. // - // acquire_pooled_runtime_arena is trb-only — the host side reserves the - // runtime-arena region only when setup_static_arena is invoked with - // runtime_arena_size > 0. hbg's runtime_maker.cpp must not call it - // (setup_static_arena(...,0) leaves the offset unreserved, and the - // returned region_ptr would be undefined). 
+ // acquire_pooled_runtime_arena is trb-only — the runtime-arena region is + // only committed when setup_static_arena was invoked with + // runtime_arena_size > 0. Calling it on the hbg path + // (setup_static_arena(...,0)) returns nullptr (not undefined). void *(*acquire_pooled_gm_heap)(); void *(*acquire_pooled_gm_sm)(); void *(*acquire_pooled_runtime_arena)(); diff --git a/tests/ut/cpp/CMakeLists.txt b/tests/ut/cpp/CMakeLists.txt index 9922850d5..39cf5977b 100644 --- a/tests/ut/cpp/CMakeLists.txt +++ b/tests/ut/cpp/CMakeLists.txt @@ -194,6 +194,45 @@ function(add_a5_test name src) add_test(NAME ${name} COMMAND ${name}) endfunction() +# --------------------------------------------------------------------------- +# A5 runtime sources, mirroring a2a3_rt_objs. Bundled into an OBJECT library +# so the runtime .cpp files compile once and the resulting .o files are +# reused across every a5 runtime test executable. +# --------------------------------------------------------------------------- +set(A5_RUNTIME_DIR ${CMAKE_SOURCE_DIR}/../../../src/a5/runtime/tensormap_and_ringbuffer/runtime) + +add_library(a5_rt_objs OBJECT + ${A5_RUNTIME_DIR}/pto_ring_buffer.cpp + ${A5_RUNTIME_DIR}/shared/pto_shared_memory.cpp + ${A5_RUNTIME_DIR}/scheduler/pto_scheduler.cpp + ${A5_RUNTIME_DIR}/shared/pto_tensormap.cpp + ${A5_RUNTIME_DIR}/shared/pto_runtime2_init.cpp + ${CMAKE_SOURCE_DIR}/stubs/test_stubs.cpp +) +target_include_directories(a5_rt_objs PUBLIC + ${CMAKE_SOURCE_DIR}/../../../src/a5/runtime/tensormap_and_ringbuffer/orchestration + ${CMAKE_SOURCE_DIR}/../../../src/a5/runtime/tensormap_and_ringbuffer/runtime + ${CMAKE_SOURCE_DIR}/../../../src/a5/runtime/tensormap_and_ringbuffer/common + ${CMAKE_SOURCE_DIR}/../../../src/a5/platform/include + ${CMAKE_SOURCE_DIR}/../../../src/common/task_interface + ${CMAKE_SOURCE_DIR}/../../../src/common/log/include + ${CMAKE_SOURCE_DIR}/../../../src/common/device_comm +) +target_compile_options(a5_rt_objs PUBLIC -D_GLIBCXX_USE_CXX11_ABI=0) + 
+function(add_a5_runtime_test name src) + add_executable(${name} ${src}) + target_include_directories(${name} PRIVATE ${GTEST_INCLUDE_DIRS}) + target_link_libraries(${name} PRIVATE + a5_rt_objs + ${GTEST_MAIN_LIB} + ${GTEST_LIB} + pthread + ) + add_test(NAME ${name} COMMAND ${name}) + set_tests_properties(${name} PROPERTIES LABELS "no_hardware") +endfunction() + function(add_task_interface_test name src) add_executable(${name} ${src}) target_include_directories(${name} PRIVATE @@ -314,6 +353,21 @@ add_a2a3_runtime_test(test_wiring a2a3/test_wiring.cpp) # --------------------------------------------------------------------------- add_a5_test(test_a5_fatal a5/test_a5_fatal.cpp) +# A5 trb runtime UTs — mirror of a2a3 trb runtime UTs, link against a5_rt_objs. +# Target names carry the a5_ prefix because hierarchical/test_tensormap (and +# the unprefixed a2a3 runtime targets test_scheduler_state / test_ready_queue +# / ...) already own those bare names. +add_a5_runtime_test(test_a5_task_allocator a5/test_task_allocator.cpp) +add_a5_runtime_test(test_a5_dep_list_pool a5/test_dep_list_pool.cpp) +add_a5_runtime_test(test_a5_scheduler_state a5/test_scheduler_state.cpp) +add_a5_runtime_test(test_a5_task_state a5/test_task_state.cpp) +add_a5_runtime_test(test_a5_ready_queue a5/test_ready_queue.cpp) +add_a5_runtime_test(test_a5_shared_memory a5/test_shared_memory.cpp) +add_a5_runtime_test(test_a5_tensormap a5/test_tensormap.cpp) +add_a5_runtime_test(test_a5_fanin_pool a5/test_fanin_pool.cpp) +add_a5_runtime_test(test_a5_spsc_queue a5/test_spsc_queue.cpp) +add_a5_runtime_test(test_a5_wiring a5/test_wiring.cpp) + # Host logger silent/off behavior — no runtime deps, just compile host_log.cpp # alongside the test (faster than dlopen'ing libsimpler_log.so for a unit test). 
set(SIMPLER_LOG_DIR ${CMAKE_SOURCE_DIR}/../../../src/common/log) diff --git a/tests/ut/cpp/a5/test_ready_queue.cpp b/tests/ut/cpp/a5/test_ready_queue.cpp index 9dea3ae94..f12b1e7c7 100644 --- a/tests/ut/cpp/a5/test_ready_queue.cpp +++ b/tests/ut/cpp/a5/test_ready_queue.cpp @@ -44,6 +44,7 @@ #include #include +#include "device_arena.h" #include "scheduler/pto_scheduler.h" // ============================================================================= @@ -55,10 +56,19 @@ class ReadyQueueTest : public ::testing::Test { static constexpr uint64_t CAPACITY = 16; // Power of 2 PTO2ReadyQueue queue; + DeviceArena arena; - void SetUp() override { ASSERT_TRUE(ready_queue_init(&queue, CAPACITY)); } + void SetUp() override { + const size_t off = ready_queue_reserve_layout(arena, CAPACITY); + ASSERT_NE(arena.commit(), nullptr); + ASSERT_TRUE(ready_queue_init_data_from_layout(&queue, arena, off, CAPACITY)); + ready_queue_wire_arena_pointers(&queue, arena, off); + } - void TearDown() override { ready_queue_destroy(&queue); } + void TearDown() override { + ready_queue_destroy(&queue); + arena.release(); + } }; // ============================================================================= @@ -217,8 +227,18 @@ class ReadyQueueBoundaryTest : public ::testing::Test { PTO2ReadyQueue queue{}; PTO2TaskSlotState dummy[8]{}; - void SetUp() override { ASSERT_TRUE(ready_queue_init(&queue, QUEUE_CAP)); } - void TearDown() override { ready_queue_destroy(&queue); } + DeviceArena arena; + + void SetUp() override { + const size_t off = ready_queue_reserve_layout(arena, QUEUE_CAP); + ASSERT_NE(arena.commit(), nullptr); + ASSERT_TRUE(ready_queue_init_data_from_layout(&queue, arena, off, QUEUE_CAP)); + ready_queue_wire_arena_pointers(&queue, arena, off); + } + void TearDown() override { + ready_queue_destroy(&queue); + arena.release(); + } }; TEST_F(ReadyQueueBoundaryTest, ExactCapacityFillDrain) { @@ -307,8 +327,18 @@ class ReadyQueueMPMCTest : public ::testing::TestWithParam { static constexpr 
uint64_t CAPACITY = 1024; PTO2ReadyQueue queue; - void SetUp() override { ASSERT_TRUE(ready_queue_init(&queue, CAPACITY)); } - void TearDown() override { ready_queue_destroy(&queue); } + DeviceArena arena; + + void SetUp() override { + const size_t off = ready_queue_reserve_layout(arena, CAPACITY); + ASSERT_NE(arena.commit(), nullptr); + ASSERT_TRUE(ready_queue_init_data_from_layout(&queue, arena, off, CAPACITY)); + ready_queue_wire_arena_pointers(&queue, arena, off); + } + void TearDown() override { + ready_queue_destroy(&queue); + arena.release(); + } }; TEST_P(ReadyQueueMPMCTest, NoDuplicateNoLoss) { diff --git a/tests/ut/cpp/a5/test_spsc_queue.cpp b/tests/ut/cpp/a5/test_spsc_queue.cpp index a2c80ca05..5dce3ba4a 100644 --- a/tests/ut/cpp/a5/test_spsc_queue.cpp +++ b/tests/ut/cpp/a5/test_spsc_queue.cpp @@ -27,6 +27,7 @@ #include #include +#include "device_arena.h" #include "scheduler/pto_scheduler.h" // ============================================================================= @@ -38,15 +39,22 @@ class SpscQueueTest : public ::testing::Test { static constexpr uint64_t CAPACITY = 16; // must be power of 2 PTO2SpscQueue queue{}; + DeviceArena arena; // Dummy slot states used as push values alignas(64) PTO2TaskSlotState slots[64]{}; void SetUp() override { memset(&queue, 0, sizeof(queue)); - ASSERT_TRUE(queue.init(CAPACITY)); + const size_t off = PTO2SpscQueue::reserve_layout(arena, CAPACITY); + ASSERT_NE(arena.commit(), nullptr); + ASSERT_TRUE(queue.init_data_from_layout(arena, off, CAPACITY)); + queue.wire_arena_pointers(arena, off); } - void TearDown() override { queue.destroy(); } + void TearDown() override { + queue.destroy(); + arena.release(); + } }; // ============================================================================= @@ -60,17 +68,27 @@ TEST_F(SpscQueueTest, InitValidState) { } TEST_F(SpscQueueTest, InitRejectsNonPowerOfTwo) { + // init_from_layout rejects non-power-of-two capacities. 
Use a fresh arena + // each time since reserve runs before commit. PTO2SpscQueue bad{}; - EXPECT_FALSE(bad.init(3)); - EXPECT_FALSE(bad.init(7)); - EXPECT_FALSE(bad.init(0)); + DeviceArena local; + const size_t off = PTO2SpscQueue::reserve_layout(local, 1); // dummy reservation so commit succeeds + (void)off; + ASSERT_NE(local.commit(), nullptr); + EXPECT_FALSE(bad.init_data_from_layout(local, off, 3)); + EXPECT_FALSE(bad.init_data_from_layout(local, off, 7)); + EXPECT_FALSE(bad.init_data_from_layout(local, off, 0)); } TEST_F(SpscQueueTest, InitAcceptsPowerOfTwo) { PTO2SpscQueue q{}; - EXPECT_TRUE(q.init(4)); + DeviceArena local; + const size_t off4 = PTO2SpscQueue::reserve_layout(local, 4); + const size_t off1024 = PTO2SpscQueue::reserve_layout(local, 1024); + ASSERT_NE(local.commit(), nullptr); + EXPECT_TRUE(q.init_data_from_layout(local, off4, 4)); q.destroy(); - EXPECT_TRUE(q.init(1024)); + EXPECT_TRUE(q.init_data_from_layout(local, off1024, 1024)); q.destroy(); } diff --git a/tests/ut/cpp/a5/test_tensormap.cpp b/tests/ut/cpp/a5/test_tensormap.cpp index 10eef0317..805a9e079 100644 --- a/tests/ut/cpp/a5/test_tensormap.cpp +++ b/tests/ut/cpp/a5/test_tensormap.cpp @@ -28,6 +28,7 @@ #include #include +#include "device_arena.h" #include "pto_orchestration_api.h" #include "pto_tensormap.h" @@ -76,13 +77,20 @@ class TensorMapTest : public ::testing::Test { static constexpr int32_t WINDOW_SIZE = 32; PTO2TensorMap tmap{}; + DeviceArena arena; void SetUp() override { int32_t window_sizes[PTO2_MAX_RING_DEPTH] = {WINDOW_SIZE, WINDOW_SIZE, WINDOW_SIZE, WINDOW_SIZE}; - ASSERT_TRUE(tmap.init(NUM_BUCKETS, POOL_SIZE, window_sizes)); + auto layout = PTO2TensorMap::reserve_layout(arena, NUM_BUCKETS, POOL_SIZE, window_sizes); + ASSERT_NE(arena.commit(), nullptr); + ASSERT_TRUE(tmap.init_data_from_layout(layout, arena)); + tmap.wire_arena_pointers(layout, arena); } - void TearDown() override { tmap.destroy(); } + void TearDown() override { + tmap.destroy(); + arena.release(); + } 
}; // ============================================================================= @@ -98,11 +106,16 @@ TEST_F(TensorMapTest, InitValidState) { } TEST_F(TensorMapTest, InitRequiresPowerOfTwoBuckets) { + // Non-power-of-2 bucket counts trip an always_assert inside reserve_layout + // (asserting EXPECT_DEATH is impossible in release builds where + // always_assert may compile out). Smoke-test only the success path here. PTO2TensorMap bad{}; + DeviceArena bad_arena; int32_t ws[PTO2_MAX_RING_DEPTH] = {8, 8, 8, 8}; - EXPECT_FALSE(bad.init(3, 64, ws)) << "non-power-of-2 bucket count must fail"; - EXPECT_FALSE(bad.init(7, 64, ws)); - EXPECT_TRUE(bad.init(8, 64, ws)); + auto layout = PTO2TensorMap::reserve_layout(bad_arena, 8, 64, ws); + ASSERT_NE(bad_arena.commit(), nullptr); + EXPECT_TRUE(bad.init_data_from_layout(layout, bad_arena)); + bad.wire_arena_pointers(layout, bad_arena); bad.destroy(); } From 7a1036a081d352caadc1ecaac23bf9c27a37604e Mon Sep 17 00:00:00 2001 From: poursoul Date: Wed, 27 May 2026 17:27:55 +0800 Subject: [PATCH 7/7] Fix: address CodeRabbit review feedback on trb host-build arena PR - setup_static_arena (a2a3 onboard + a5 sim mirrors): drop the late-region failure paths that released already-committed peer arenas. Callers may hold pooled pointers from earlier successful regions; tearing the peers down on a later resize failure turns those pointers into dangling refs, contradicting the lambda's "already-committed peers stay alive" invariant. - DeviceRunner::finalize (a2a3 sim, a5 onboard): move the lazily-allocated device_wall_dev_ptr_ free above mem_alloc_.finalize() (and above rtDeviceReset on a5). free_tensor() routes through mem_alloc_.free(), so freeing after finalize was a use-after-finalize on the allocator state; on a5 it would also run after the device runtime had been reset. 
- bind_prepared_to_runtime_impl (a2a3 + a5 runtime_maker): reject env-derived PTO2_RING_DEP_POOL values above INT32_MAX before narrowing to int32_t, rather than silently truncating into a corrupt layout sizing. - test_a5_tensormap: rename InitRequiresPowerOfTwoBuckets to InitWithPowerOfTwoBucketsSucceeds and reword the comment. The earlier name was misleading because the body only exercises the success path (bucket count 8); the reject path is gated by always_assert and can't be reliably EXPECT_DEATH-tested in release builds. Tests - cpput: 35/35 pass (including renamed a5 tensormap test). --- .../platform/onboard/host/device_runner.cpp | 17 ++++---------- src/a2a3/platform/sim/host/device_runner.cpp | 13 +++++++---- .../host/runtime_maker.cpp | 12 ++++++++-- .../platform/onboard/host/device_runner.cpp | 14 +++++++---- src/a5/platform/sim/host/device_runner.cpp | 17 ++++---------- .../host/runtime_maker.cpp | 12 ++++++++-- tests/ut/cpp/a5/test_tensormap.cpp | 23 ++++++++++--------- 7 files changed, 59 insertions(+), 49 deletions(-) diff --git a/src/a2a3/platform/onboard/host/device_runner.cpp b/src/a2a3/platform/onboard/host/device_runner.cpp index 9b66e05ae..e3e1cfc2d 100644 --- a/src/a2a3/platform/onboard/host/device_runner.cpp +++ b/src/a2a3/platform/onboard/host/device_runner.cpp @@ -287,19 +287,12 @@ int DeviceRunner::setup_static_arena(size_t gm_heap_size, size_t gm_sm_size, siz cached_size = requested_size; return 0; }; + // Failure of a later region leaves earlier peers committed on purpose: + // pooled pointers previously returned to callers must stay valid even if + // this resize attempt aborts. 
if (commit_region(gm_heap_arena_, cached_gm_heap_size_, gm_heap_size) != 0) return -1; - if (commit_region(gm_sm_arena_, cached_gm_sm_size_, gm_sm_size) != 0) { - gm_heap_arena_.release(); - cached_gm_heap_size_ = 0; - return -1; - } - if (commit_region(runtime_arena_pool_, cached_runtime_arena_size_, runtime_arena_size) != 0) { - gm_heap_arena_.release(); - gm_sm_arena_.release(); - cached_gm_heap_size_ = 0; - cached_gm_sm_size_ = 0; - return -1; - } + if (commit_region(gm_sm_arena_, cached_gm_sm_size_, gm_sm_size) != 0) return -1; + if (commit_region(runtime_arena_pool_, cached_runtime_arena_size_, runtime_arena_size) != 0) return -1; return 0; } diff --git a/src/a2a3/platform/sim/host/device_runner.cpp b/src/a2a3/platform/sim/host/device_runner.cpp index 1651c4a89..9a9cbbabf 100644 --- a/src/a2a3/platform/sim/host/device_runner.cpp +++ b/src/a2a3/platform/sim/host/device_runner.cpp @@ -1071,15 +1071,18 @@ int DeviceRunner::finalize() { cached_gm_sm_size_ = 0; cached_runtime_arena_size_ = 0; - // Free all remaining allocations - mem_alloc_.finalize(); - clear_cpu_sim_shared_storage(); - - // Free the 8-byte device_wall buffer (allocated lazily in run()). + // Free the 8-byte device_wall buffer (allocated lazily in run()) before + // mem_alloc_.finalize(): free_tensor() routes back through mem_alloc_, + // so doing it after finalize would be a use-after-finalize. 
if (device_wall_dev_ptr_ != nullptr) { free_tensor(device_wall_dev_ptr_); device_wall_dev_ptr_ = nullptr; } + + // Free all remaining allocations + mem_alloc_.finalize(); + clear_cpu_sim_shared_storage(); + device_id_ = -1; worker_count_ = 0; last_runtime_ = nullptr; diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/host/runtime_maker.cpp b/src/a2a3/runtime/tensormap_and_ringbuffer/host/runtime_maker.cpp index 750374683..e40aa5ae7 100644 --- a/src/a2a3/runtime/tensormap_and_ringbuffer/host/runtime_maker.cpp +++ b/src/a2a3/runtime/tensormap_and_ringbuffer/host/runtime_maker.cpp @@ -281,8 +281,16 @@ extern "C" int bind_prepared_to_runtime_impl( // determined by replaying the reserve sequence on a host-side arena. uint64_t total_heap_size = eff_heap_size * PTO2_MAX_RING_DEPTH; uint64_t sm_size = PTO2SharedMemoryHandle::calculate_size(eff_task_window_size); - int32_t eff_dep_pool_capacity = - runtime->dep_pool_size ? static_cast<int32_t>(runtime->dep_pool_size) : PTO2_DEP_LIST_POOL_SIZE; + // dep_pool_size comes from a uint64 env var; reject values that don't fit + // the int32_t layout-sizing path rather than silently truncating. 
+ int32_t eff_dep_pool_capacity = PTO2_DEP_LIST_POOL_SIZE; + if (runtime->dep_pool_size != 0) { + if (runtime->dep_pool_size > static_cast<uint64_t>(INT32_MAX)) { + LOG_ERROR("PTO2_RING_DEP_POOL=%" PRIu64 " exceeds INT32_MAX", runtime->dep_pool_size); + return -1; + } + eff_dep_pool_capacity = static_cast<int32_t>(runtime->dep_pool_size); + } int64_t t_prebuilt_start = _now_ms(); DeviceArena host_arena; // libc malloc backend by default diff --git a/src/a5/platform/onboard/host/device_runner.cpp b/src/a5/platform/onboard/host/device_runner.cpp index b8dc9bb46..506613dcd 100644 --- a/src/a5/platform/onboard/host/device_runner.cpp +++ b/src/a5/platform/onboard/host/device_runner.cpp @@ -1085,6 +1085,15 @@ int DeviceRunner::finalize() { cached_gm_sm_size_ = 0; cached_runtime_arena_size_ = 0; + // Free the 8-byte device_wall buffer (allocated lazily in run()) while + // mem_alloc_ and the device context are still live. free_tensor() routes + // through mem_alloc_.free(), so it must run before finalize() and before + // rtDeviceReset() tears down the device runtime. + if (device_wall_dev_ptr_ != nullptr) { + free_tensor(device_wall_dev_ptr_); + device_wall_dev_ptr_ = nullptr; + } + // Free all remaining allocations (including handshake buffer and binGmAddr) mem_alloc_.finalize(); @@ -1094,11 +1103,6 @@ int DeviceRunner::finalize() { return rc; } - // Free the 8-byte device_wall buffer (allocated lazily in run()). 
- if (device_wall_dev_ptr_ != nullptr) { - free_tensor(device_wall_dev_ptr_); - device_wall_dev_ptr_ = nullptr; - } device_id_ = -1; block_dim_ = 0; worker_count_ = 0; diff --git a/src/a5/platform/sim/host/device_runner.cpp b/src/a5/platform/sim/host/device_runner.cpp index fe3e938e1..b3072919c 100644 --- a/src/a5/platform/sim/host/device_runner.cpp +++ b/src/a5/platform/sim/host/device_runner.cpp @@ -150,19 +150,12 @@ int DeviceRunner::setup_static_arena(size_t gm_heap_size, size_t gm_sm_size, siz cached_size = requested_size; return 0; }; + // Failure of a later region leaves earlier peers committed on purpose: + // pooled pointers previously returned to callers must stay valid even if + // this resize attempt aborts. if (commit_region(gm_heap_arena_, cached_gm_heap_size_, gm_heap_size) != 0) return -1; - if (commit_region(gm_sm_arena_, cached_gm_sm_size_, gm_sm_size) != 0) { - gm_heap_arena_.release(); - cached_gm_heap_size_ = 0; - return -1; - } - if (commit_region(runtime_arena_pool_, cached_runtime_arena_size_, runtime_arena_size) != 0) { - gm_heap_arena_.release(); - gm_sm_arena_.release(); - cached_gm_heap_size_ = 0; - cached_gm_sm_size_ = 0; - return -1; - } + if (commit_region(gm_sm_arena_, cached_gm_sm_size_, gm_sm_size) != 0) return -1; + if (commit_region(runtime_arena_pool_, cached_runtime_arena_size_, runtime_arena_size) != 0) return -1; return 0; } diff --git a/src/a5/runtime/tensormap_and_ringbuffer/host/runtime_maker.cpp b/src/a5/runtime/tensormap_and_ringbuffer/host/runtime_maker.cpp index 9e1d00841..037d3ab04 100644 --- a/src/a5/runtime/tensormap_and_ringbuffer/host/runtime_maker.cpp +++ b/src/a5/runtime/tensormap_and_ringbuffer/host/runtime_maker.cpp @@ -281,8 +281,16 @@ extern "C" int bind_prepared_to_runtime_impl( // determined by replaying the reserve sequence on a host-side arena. 
uint64_t total_heap_size = eff_heap_size * PTO2_MAX_RING_DEPTH; uint64_t sm_size = PTO2SharedMemoryHandle::calculate_size(eff_task_window_size); - int32_t eff_dep_pool_capacity = - runtime->dep_pool_size ? static_cast<int32_t>(runtime->dep_pool_size) : PTO2_DEP_LIST_POOL_SIZE; + // dep_pool_size comes from a uint64 env var; reject values that don't fit + // the int32_t layout-sizing path rather than silently truncating. + int32_t eff_dep_pool_capacity = PTO2_DEP_LIST_POOL_SIZE; + if (runtime->dep_pool_size != 0) { + if (runtime->dep_pool_size > static_cast<uint64_t>(INT32_MAX)) { + LOG_ERROR("PTO2_RING_DEP_POOL=%" PRIu64 " exceeds INT32_MAX", runtime->dep_pool_size); + return -1; + } + eff_dep_pool_capacity = static_cast<int32_t>(runtime->dep_pool_size); + } int64_t t_prebuilt_start = _now_ms(); DeviceArena host_arena; // libc malloc backend by default diff --git a/tests/ut/cpp/a5/test_tensormap.cpp b/tests/ut/cpp/a5/test_tensormap.cpp index 805a9e079..ec83a064d 100644 --- a/tests/ut/cpp/a5/test_tensormap.cpp +++ b/tests/ut/cpp/a5/test_tensormap.cpp @@ -105,18 +105,19 @@ TEST_F(TensorMapTest, InitValidState) { EXPECT_EQ(tmap.valid_count(), 0); } -TEST_F(TensorMapTest, InitRequiresPowerOfTwoBuckets) { - // Non-power-of-2 bucket counts trip an always_assert inside reserve_layout - // (asserting EXPECT_DEATH is impossible in release builds where - // always_assert may compile out). Smoke-test only the success path here. - PTO2TensorMap bad{}; - DeviceArena bad_arena; +TEST_F(TensorMapTest, InitWithPowerOfTwoBucketsSucceeds) { + // The reject path for non-power-of-2 bucket counts is enforced via an + // always_assert inside reserve_layout. It is not asserted here because + // EXPECT_DEATH cannot run reliably in release builds where always_assert + // may compile out. Cover only the accepted (power-of-2) shape. 
+ PTO2TensorMap ok{}; + DeviceArena ok_arena; int32_t ws[PTO2_MAX_RING_DEPTH] = {8, 8, 8, 8}; - auto layout = PTO2TensorMap::reserve_layout(bad_arena, 8, 64, ws); - ASSERT_NE(bad_arena.commit(), nullptr); - EXPECT_TRUE(bad.init_data_from_layout(layout, bad_arena)); - bad.wire_arena_pointers(layout, bad_arena); - bad.destroy(); + auto layout = PTO2TensorMap::reserve_layout(ok_arena, 8, 64, ws); + ASSERT_NE(ok_arena.commit(), nullptr); + EXPECT_TRUE(ok.init_data_from_layout(layout, ok_arena)); + ok.wire_arena_pointers(layout, ok_arena); + ok.destroy(); } // =============================================================================