hw-native-sys · poursoul · May 27, 2026 · May 22, 2026 · May 22, 2026 · May 25, 2026
diff --git a/src/a2a3/platform/onboard/host/device_runner.cpp b/src/a2a3/platform/onboard/host/device_runner.cpp
@@ -249,42 +249,68 @@ int AicpuSoInfo::finalize() {
 
 DeviceRunner::~DeviceRunner() { finalize(); }
 
-int DeviceRunner::setup_static_arena(size_t gm_heap_size, size_t gm_sm_size) {
-    if (static_arena_.is_committed()) {
-        // Idempotent for the production case (sizes do not change across a
-        // worker's lifetime). If a caller asks for a larger layout, redo it.
-        if (gm_heap_size <= cached_gm_heap_size_ && gm_sm_size <= cached_gm_sm_size_) return 0;
-        static_arena_.release();
-        gm_heap_region_off_ = SIZE_MAX;
-        gm_sm_region_off_ = SIZE_MAX;
-        cached_gm_heap_size_ = 0;
-        cached_gm_sm_size_ = 0;
-    }
-    gm_heap_region_off_ = static_arena_.reserve(gm_heap_size, DeviceArena::kDefaultBaseAlign);
-    gm_sm_region_off_ = static_arena_.reserve(gm_sm_size, DeviceArena::kDefaultBaseAlign);
-    if (static_arena_.commit(DeviceArena::kDefaultBaseAlign) == nullptr) {
-        // Roll back the two reserves: commit() failure leaves committed_=false,
-        // so the next entry would skip the release branch and stack new
-        // reserves on top of the stale cursor. release() is idempotent on a
-        // never-committed arena (just zeroes cursor_ / region_count_).
-        static_arena_.release();
-        gm_heap_region_off_ = SIZE_MAX;
-        gm_sm_region_off_ = SIZE_MAX;
-        return -1;
-    }
-    cached_gm_heap_size_ = gm_heap_size;
-    cached_gm_sm_size_ = gm_sm_size;
+int DeviceRunner::setup_static_arena(size_t gm_heap_size, size_t gm_sm_size, size_t runtime_arena_size) {
+    // Three independent device_malloc'd buffers: GM heap, PTO2 SM, prebuilt
+    // runtime arena. Split out from a single large allocation because the
+    // combined size can exceed the device allocator's largest contiguous
+    // block. Each arena commits exactly one region, so its base() is the
+    // pooled pointer the caller wants.
+    //
+    // Idempotent for the production case (sizes do not change across a
+    // worker's lifetime). If a caller asks for a larger layout on any
+    // region, redo just that region — already-committed peers stay alive
+    // so their callers don't have to re-acquire.
+    auto commit_region = [](DeviceArena &arena, size_t &cached_size, size_t requested_size) -> int {
+        if (requested_size == 0) {
+            // hbg's runtime_arena path: caller passed 0 and never reserved
+            // a region. Leave the arena uncommitted; acquire_pooled_* will
+            // return nullptr.
+            if (arena.is_committed() && cached_size != 0) {
+                arena.release();
+                cached_size = 0;
+            }
+            return 0;
+        }
+        if (arena.is_committed() && requested_size <= cached_size) {
+            return 0;
+        }
+        arena.release();
+        cached_size = 0;
+        arena.reserve(requested_size, DeviceArena::kDefaultBaseAlign);
+        if (arena.commit(DeviceArena::kDefaultBaseAlign) == nullptr) {
+            // commit() failure leaves committed_=false but keeps the reserve
+            // above on the cursor; roll it back here. release() is
+            // idempotent on a never-committed arena (zeroes cursor_).
+            arena.release();
+            return -1;
+        }
+        cached_size = requested_size;
+        return 0;
+    };
+    // Failure of a later region leaves earlier peers committed on purpose:
+    // pooled pointers previously returned to callers must stay valid even if
+    // this resize attempt aborts.
+    if (commit_region(gm_heap_arena_, cached_gm_heap_size_, gm_heap_size) != 0) return -1;
+    if (commit_region(gm_sm_arena_, cached_gm_sm_size_, gm_sm_size) != 0) return -1;
+    if (commit_region(runtime_arena_pool_, cached_runtime_arena_size_, runtime_arena_size) != 0) return -1;
     return 0;
 }
 
 void *DeviceRunner::acquire_pooled_gm_heap() {
-    if (!static_arena_.is_committed()) return nullptr;
-    return static_arena_.region_ptr(gm_heap_region_off_);
+    if (!gm_heap_arena_.is_committed()) return nullptr;
+    return gm_heap_arena_.base();
 }
 
 void *DeviceRunner::acquire_pooled_gm_sm() {
-    if (!static_arena_.is_committed()) return nullptr;
-    return static_arena_.region_ptr(gm_sm_region_off_);
+    if (!gm_sm_arena_.is_committed()) return nullptr;
+    return gm_sm_arena_.base();
+}
+
+void *DeviceRunner::acquire_pooled_runtime_arena() {
+    // hbg calls setup_static_arena(...,0) and leaves runtime_arena_pool_
+    // uncommitted — a caller that asks for it anyway gets nullptr.
+    if (!runtime_arena_pool_.is_committed()) return nullptr;
+    return runtime_arena_pool_.base();
 }
 
 std::thread DeviceRunner::create_thread(std::function<void()> fn) {
@@ -1222,14 +1248,16 @@ int DeviceRunner::finalize() {
     // perf_cleanup guard; this is the backstop for the no-run-since-init case.
     finalize_collectors();
 
-    // Release per-Worker static arena (GM heap + PTO2 SM in a single backing
-    // device allocation). Must precede mem_alloc_.finalize() so the arena
-    // frees through the still-live allocator, not after it.
-    static_arena_.release();
-    gm_heap_region_off_ = SIZE_MAX;
-    gm_sm_region_off_ = SIZE_MAX;
+    // Release the three per-Worker pooled arenas (GM heap, PTO2 SM, optional
+    // trb prebuilt runtime arena — each its own device_malloc). Must precede
+    // mem_alloc_.finalize() so the arenas free through the still-live
+    // allocator, not after it.
+    gm_heap_arena_.release();
+    gm_sm_arena_.release();
+    runtime_arena_pool_.release();
     cached_gm_heap_size_ = 0;
     cached_gm_sm_size_ = 0;
+    cached_runtime_arena_size_ = 0;
 
     // Free all remaining allocations (including handshake buffer and binGmAddr)
     mem_alloc_.finalize();

diff --git a/src/a2a3/platform/onboard/host/device_runner.h b/src/a2a3/platform/onboard/host/device_runner.h
@@ -185,25 +185,36 @@ struct KernelArgsHelper {
 class DeviceRunner {
 public:
     DeviceRunner() :
-        static_arena_(&arena_alloc_trampoline, &arena_free_trampoline, &mem_alloc_) {}
+        gm_heap_arena_(&arena_alloc_trampoline, &arena_free_trampoline, &mem_alloc_),
+        gm_sm_arena_(&arena_alloc_trampoline, &arena_free_trampoline, &mem_alloc_),
+        runtime_arena_pool_(&arena_alloc_trampoline, &arena_free_trampoline, &mem_alloc_) {}
     ~DeviceRunner();
 
     /**
-     * Lay out and commit the per-Worker static device arena that backs the
-     * PTO2 GM heap and PTO2 shared memory in a single underlying allocation.
-     * Must be called before acquire_pooled_gm_heap / acquire_pooled_gm_sm.
-     * Idempotent on identical sizes. Returns 0 on success, -1 on failure.
+     * Commit the three per-Worker pooled regions (PTO2 GM heap, PTO2 shared
+     * memory, trb prebuilt runtime arena) as three independent device
+     * allocations. Must be called before any acquire_pooled_*. Idempotent
+     * on identical sizes. `runtime_arena_size` is 0 for the hbg path (no
+     * prebuilt runtime arena) — the corresponding arena stays uncommitted.
+     * Returns 0 on success, -1 on failure.
      */
-    int setup_static_arena(size_t gm_heap_size, size_t gm_sm_size);
+    int setup_static_arena(size_t gm_heap_size, size_t gm_sm_size, size_t runtime_arena_size);
 
     /**
-     * Return the pooled GM heap / PTO2 SM pointer. setup_static_arena must
-     * have been called earlier in this Worker; otherwise these return
-     * nullptr. Both pointers are stable for the lifetime of the Worker and
-     * the single underlying device buffer is released in `finalize()`.
+     * Return the pooled GM heap / PTO2 SM / runtime arena pointer.
+     * setup_static_arena must have already committed the relevant region;
+     * otherwise these return nullptr. A pointer stays valid until
+     * setup_static_arena releases or re-commits that region, or until
+     * `finalize()` releases the three underlying device buffers.
+     *
+     * acquire_pooled_runtime_arena() is trb-only — the runtime arena region
+     * is only committed when setup_static_arena was called with
+     * runtime_arena_size > 0. Calling it on the hbg path
+     * (setup_static_arena(...,0)) returns nullptr (well-defined).
      */
     void *acquire_pooled_gm_heap();
     void *acquire_pooled_gm_sm();
+    void *acquire_pooled_runtime_arena();
 
     /**
      * Create a thread bound to this device.
@@ -602,22 +613,31 @@ class DeviceRunner {
     // Memory management
     MemoryAllocator mem_alloc_;
 
-    // Per-Worker arena backing the PTO2 GM heap + PTO2 shared memory in a
-    // single device allocation. Released explicitly in finalize() before
-    // mem_alloc_.finalize() so it does not free pointers a second time.
+    // Three independent per-Worker arenas, each backing a single pooled
+    // region (PTO2 GM heap / PTO2 shared memory / trb prebuilt runtime
+    // arena). Split out from a single backing allocation because the
+    // combined size can exceed the device allocator's largest contiguous
+    // block — three separate device_malloc calls are friendlier than one
+    // big one. Released explicitly in finalize() before mem_alloc_.finalize()
+    // so the underlying buffers do not get freed twice.
+    //
+    // `runtime_arena_pool_` stays unreserved when setup_static_arena was
+    // invoked with runtime_arena_size == 0 (hbg path).
     //
     // Trampolines forward DeviceArena's alloc/free calls to mem_alloc_.
     static void *arena_alloc_trampoline(void *ctx, size_t size) {
         return static_cast<MemoryAllocator *>(ctx)->alloc(size);
     }
     static void arena_free_trampoline(void *ctx, void *p) { static_cast<MemoryAllocator *>(ctx)->free(p); }
-    DeviceArena static_arena_;
-    size_t gm_heap_region_off_{SIZE_MAX};
-    size_t gm_sm_region_off_{SIZE_MAX};
-    // Cached sizes for setup_static_arena's "fits" check — avoids calling
-    // region_size() on the arena's public API for the two regions we own.
+    DeviceArena gm_heap_arena_;
+    DeviceArena gm_sm_arena_;
+    DeviceArena runtime_arena_pool_;
+    // Cached sizes for setup_static_arena's "fits" check — avoids re-allocating
+    // the same buffer when a later worker init asks for an equal-or-smaller
+    // layout on an already-committed arena.
     size_t cached_gm_heap_size_{0};
     size_t cached_gm_sm_size_{0};
+    size_t cached_runtime_arena_size_{0};
 
     // Device resources
     rtStream_t stream_aicpu_{nullptr};

diff --git a/src/a2a3/platform/onboard/host/pto_runtime_c_api.cpp b/src/a2a3/platform/onboard/host/pto_runtime_c_api.cpp
@@ -108,9 +108,9 @@ static uint64_t upload_chip_callable_buffer_wrapper(const void *callable) {
     }
 }
 
-static int setup_static_arena_wrapper(size_t gm_heap_size, size_t gm_sm_size) {
+static int setup_static_arena_wrapper(size_t gm_heap_size, size_t gm_sm_size, size_t runtime_arena_size) {
     try {
-        return current_runner()->setup_static_arena(gm_heap_size, gm_sm_size);
+        return current_runner()->setup_static_arena(gm_heap_size, gm_sm_size, runtime_arena_size);
     } catch (...) {
         return -1;
     }
@@ -132,6 +132,14 @@ static void *acquire_pooled_gm_sm_wrapper() {
     }
 }
 
+static void *acquire_pooled_runtime_arena_wrapper() {
+    try {
+        return current_runner()->acquire_pooled_runtime_arena();
+    } catch (...) {
+        return nullptr;
+    }
+}
+
 /* ===========================================================================
  * Public C API (resolved by ChipWorker via dlsym)
  * =========================================================================== */
@@ -370,6 +378,7 @@ int run_prepared(
         r->host_api.setup_static_arena = setup_static_arena_wrapper;
         r->host_api.acquire_pooled_gm_heap = acquire_pooled_gm_heap_wrapper;
         r->host_api.acquire_pooled_gm_sm = acquire_pooled_gm_sm_wrapper;
+        r->host_api.acquire_pooled_runtime_arena = acquire_pooled_runtime_arena_wrapper;
         r->host_api.upload_chip_callable_buffer = upload_chip_callable_buffer_wrapper;
 
         // Restore kernel addrs + orch symbol names + active_callable_id; the

diff --git a/src/a2a3/platform/sim/host/device_runner.cpp b/src/a2a3/platform/sim/host/device_runner.cpp
@@ -122,40 +122,68 @@ bool create_temp_so_file(const std::string &path_template, const uint8_t *data,
 
 DeviceRunner::~DeviceRunner() { finalize(); }
 
-int DeviceRunner::setup_static_arena(size_t gm_heap_size, size_t gm_sm_size) {
-    if (static_arena_.is_committed()) {
-        if (gm_heap_size <= cached_gm_heap_size_ && gm_sm_size <= cached_gm_sm_size_) return 0;
-        static_arena_.release();
-        gm_heap_region_off_ = SIZE_MAX;
-        gm_sm_region_off_ = SIZE_MAX;
+int DeviceRunner::setup_static_arena(size_t gm_heap_size, size_t gm_sm_size, size_t runtime_arena_size) {
+    // Three independent device_malloc'd buffers: GM heap, PTO2 SM, prebuilt
+    // runtime arena. Split out from a single large allocation because the
+    // combined size can exceed the device allocator's largest contiguous
+    // block. Each arena commits exactly one region, so its base() is the
+    // pooled pointer the caller wants.
+    //
+    // Idempotent for the production case (sizes do not change across a
+    // worker's lifetime). A larger request redoes just that region; if a
+    // later region fails to commit, the earlier peers are released below.
+    auto commit_region = [](DeviceArena &arena, size_t &cached_size, size_t requested_size) -> int {
+        if (requested_size == 0) {
+            if (arena.is_committed() && cached_size != 0) {
+                arena.release();
+                cached_size = 0;
+            }
+            return 0;
+        }
+        if (arena.is_committed() && requested_size <= cached_size) {
+            return 0;
+        }
+        arena.release();
+        cached_size = 0;
+        arena.reserve(requested_size, DeviceArena::kDefaultBaseAlign);
+        if (arena.commit(DeviceArena::kDefaultBaseAlign) == nullptr) {
+            arena.release();
+            return -1;
+        }
+        cached_size = requested_size;
+        return 0;
+    };
+    if (commit_region(gm_heap_arena_, cached_gm_heap_size_, gm_heap_size) != 0) return -1;
+    if (commit_region(gm_sm_arena_, cached_gm_sm_size_, gm_sm_size) != 0) {
+        gm_heap_arena_.release();
         cached_gm_heap_size_ = 0;
-        cached_gm_sm_size_ = 0;
+        return -1;
     }
-    gm_heap_region_off_ = static_arena_.reserve(gm_heap_size, DeviceArena::kDefaultBaseAlign);
-    gm_sm_region_off_ = static_arena_.reserve(gm_sm_size, DeviceArena::kDefaultBaseAlign);
-    if (static_arena_.commit(DeviceArena::kDefaultBaseAlign) == nullptr) {
-        // Roll back the two reserves: commit() failure leaves committed_=false,
-        // so the next entry would skip the release branch and stack new
-        // reserves on top of the stale cursor. release() is idempotent on a
-        // never-committed arena (just zeroes cursor_ / region_count_).
-        static_arena_.release();
-        gm_heap_region_off_ = SIZE_MAX;
-        gm_sm_region_off_ = SIZE_MAX;
+    if (commit_region(runtime_arena_pool_, cached_runtime_arena_size_, runtime_arena_size) != 0) {
+        gm_heap_arena_.release();
+        gm_sm_arena_.release();
+        cached_gm_heap_size_ = 0;
+        cached_gm_sm_size_ = 0;
         return -1;
     }
-    cached_gm_heap_size_ = gm_heap_size;
-    cached_gm_sm_size_ = gm_sm_size;
     return 0;
 }
 
 void *DeviceRunner::acquire_pooled_gm_heap() {
-    if (!static_arena_.is_committed()) return nullptr;
-    return static_arena_.region_ptr(gm_heap_region_off_);
+    if (!gm_heap_arena_.is_committed()) return nullptr;
+    return gm_heap_arena_.base();
 }
 
 void *DeviceRunner::acquire_pooled_gm_sm() {
-    if (!static_arena_.is_committed()) return nullptr;
-    return static_arena_.region_ptr(gm_sm_region_off_);
+    if (!gm_sm_arena_.is_committed()) return nullptr;
+    return gm_sm_arena_.base();
+}
+
+void *DeviceRunner::acquire_pooled_runtime_arena() {
+    // hbg calls setup_static_arena(...,0) and leaves runtime_arena_pool_
+    // uncommitted — a caller that asks for it anyway gets nullptr.
+    if (!runtime_arena_pool_.is_committed()) return nullptr;
+    return runtime_arena_pool_.base();
 }
 
 std::thread DeviceRunner::create_thread(std::function<void()> fn) {
@@ -1032,24 +1060,29 @@ int DeviceRunner::finalize() {
     // Close executor .so files (typically already closed by run(), this is a safety net)
     unload_executor_binaries();
 
-    // Release per-Worker static arena (GM heap + PTO2 SM in a single backing
-    // device allocation). Must precede mem_alloc_.finalize() so the arena
-    // frees through the still-live allocator, not after it.
-    static_arena_.release();
-    gm_heap_region_off_ = SIZE_MAX;
-    gm_sm_region_off_ = SIZE_MAX;
+    // Release the three per-Worker pooled arenas (GM heap, PTO2 SM, optional
+    // trb prebuilt runtime arena — each its own device_malloc). Must precede
+    // mem_alloc_.finalize() so the arenas free through the still-live
+    // allocator, not after it.
+    gm_heap_arena_.release();
+    gm_sm_arena_.release();
+    runtime_arena_pool_.release();
     cached_gm_heap_size_ = 0;
     cached_gm_sm_size_ = 0;
+    cached_runtime_arena_size_ = 0;
 
-    // Free all remaining allocations
-    mem_alloc_.finalize();
-    clear_cpu_sim_shared_storage();
-
-    // Free the 8-byte device_wall buffer (allocated lazily in run()).
+    // Free the 8-byte device_wall buffer (allocated lazily in run()) before
+    // mem_alloc_.finalize(): free_tensor() routes back through mem_alloc_,
+    // so doing it after finalize would be a use-after-finalize.
     if (device_wall_dev_ptr_ != nullptr) {
         free_tensor(device_wall_dev_ptr_);
         device_wall_dev_ptr_ = nullptr;
     }
+
+    // Free all remaining allocations
+    mem_alloc_.finalize();
+    clear_cpu_sim_shared_storage();
+
     device_id_ = -1;
     worker_count_ = 0;
     last_runtime_ = nullptr;