Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
98 changes: 63 additions & 35 deletions src/a2a3/platform/onboard/host/device_runner.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -249,42 +249,68 @@ int AicpuSoInfo::finalize() {

// Tear down through finalize() so a runner whose owner never called it
// explicitly still goes through the same cleanup path. finalize() returns an
// int status; it is intentionally discarded here.
DeviceRunner::~DeviceRunner() { finalize(); }

int DeviceRunner::setup_static_arena(size_t gm_heap_size, size_t gm_sm_size) {
if (static_arena_.is_committed()) {
// Idempotent for the production case (sizes do not change across a
// worker's lifetime). If a caller asks for a larger layout, redo it.
if (gm_heap_size <= cached_gm_heap_size_ && gm_sm_size <= cached_gm_sm_size_) return 0;
static_arena_.release();
gm_heap_region_off_ = SIZE_MAX;
gm_sm_region_off_ = SIZE_MAX;
cached_gm_heap_size_ = 0;
cached_gm_sm_size_ = 0;
}
gm_heap_region_off_ = static_arena_.reserve(gm_heap_size, DeviceArena::kDefaultBaseAlign);
gm_sm_region_off_ = static_arena_.reserve(gm_sm_size, DeviceArena::kDefaultBaseAlign);
if (static_arena_.commit(DeviceArena::kDefaultBaseAlign) == nullptr) {
// Roll back the two reserves: commit() failure leaves committed_=false,
// so the next entry would skip the release branch and stack new
// reserves on top of the stale cursor. release() is idempotent on a
// never-committed arena (just zeroes cursor_ / region_count_).
static_arena_.release();
gm_heap_region_off_ = SIZE_MAX;
gm_sm_region_off_ = SIZE_MAX;
return -1;
}
cached_gm_heap_size_ = gm_heap_size;
cached_gm_sm_size_ = gm_sm_size;
// Commits the per-Worker pooled regions, one DeviceArena per region.
//
// gm_heap_size / gm_sm_size / runtime_arena_size are the requested sizes of
// the GM heap, PTO2 SM and prebuilt runtime arena regions. A size of 0 means
// "no region": an arena committed by an earlier call is released and left
// uncommitted.
//
// Returns 0 on success, or -1 as soon as one region fails to commit (regions
// after the failing one are not attempted).
int DeviceRunner::setup_static_arena(size_t gm_heap_size, size_t gm_sm_size, size_t runtime_arena_size) {
// Three independent device_malloc'd buffers: GM heap, PTO2 SM, prebuilt
// runtime arena. Split out from a single large allocation because the
// combined size can exceed the device allocator's largest contiguous
// block. Each arena commits exactly one region, so its base() is the
// pooled pointer the caller wants.
//
// Idempotent for the production case (sizes do not change across a
// worker's lifetime). If a caller asks for a larger layout on any
// region, redo just that region — already-committed peers stay alive
// so their callers don't have to re-acquire.
//
// commit_region handles one arena: `cached_size` is the size recorded by the
// last successful commit (0 when nothing is committed) and is updated in
// place; the return value is 0 on success and -1 when commit() fails.
auto commit_region = [](DeviceArena &arena, size_t &cached_size, size_t requested_size) -> int {
if (requested_size == 0) {
// hbg's runtime_arena path: caller passed 0 and never reserved
// a region. Leave the arena uncommitted; acquire_pooled_* will
// return nullptr. If an earlier call did commit this region, drop it
// so the arena state matches the "no region" request.
if (arena.is_committed() && cached_size != 0) {
arena.release();
cached_size = 0;
}
return 0;
}
// The committed buffer is already at least as large as requested: keep it,
// so the pointer handed out earlier stays valid.
if (arena.is_committed() && requested_size <= cached_size) {
return 0;
}
// First commit, or a grow: the old buffer (if any) is released before the
// new one is requested, so from this point any pointer previously acquired
// for this region is stale whether or not the commit below succeeds.
arena.release();
cached_size = 0;
arena.reserve(requested_size, DeviceArena::kDefaultBaseAlign);
if (arena.commit(DeviceArena::kDefaultBaseAlign) == nullptr) {
// commit() failure leaves the arena uncommitted but with the reserve
// above still recorded. release() is idempotent on a never-committed
// arena (zeroes cursor_), so calling it here puts the arena back in
// its empty state for a later retry.
arena.release();
return -1;
}
cached_size = requested_size;
return 0;
};
// Failure of a later region leaves earlier peers committed on purpose:
// pooled pointers previously returned to callers must stay valid even if
// this resize attempt aborts. This only covers the peers: the region whose
// commit failed has already given up its previous buffer and stays
// uncommitted until a later call succeeds.
if (commit_region(gm_heap_arena_, cached_gm_heap_size_, gm_heap_size) != 0) return -1;
if (commit_region(gm_sm_arena_, cached_gm_sm_size_, gm_sm_size) != 0) return -1;
if (commit_region(runtime_arena_pool_, cached_runtime_arena_size_, runtime_arena_size) != 0) return -1;
return 0;
}

void *DeviceRunner::acquire_pooled_gm_heap() {
if (!static_arena_.is_committed()) return nullptr;
return static_arena_.region_ptr(gm_heap_region_off_);
if (!gm_heap_arena_.is_committed()) return nullptr;
return gm_heap_arena_.base();
}

void *DeviceRunner::acquire_pooled_gm_sm() {
if (!static_arena_.is_committed()) return nullptr;
return static_arena_.region_ptr(gm_sm_region_off_);
if (!gm_sm_arena_.is_committed()) return nullptr;
return gm_sm_arena_.base();
}

// Returns the pooled prebuilt runtime arena pointer, or nullptr when that
// region is not committed.
void *DeviceRunner::acquire_pooled_runtime_arena() {
// hbg calls setup_static_arena(...,0) and leaves runtime_arena_pool_
// uncommitted. There is no buffer to hand out in that case, so the miss is
// reported as nullptr (no log, no exception) — callers must check the
// result before using it.
if (!runtime_arena_pool_.is_committed()) return nullptr;
return runtime_arena_pool_.base();
}
Comment thread
poursoul marked this conversation as resolved.

std::thread DeviceRunner::create_thread(std::function<void()> fn) {
Expand Down Expand Up @@ -1222,14 +1248,16 @@ int DeviceRunner::finalize() {
// perf_cleanup guard; this is the backstop for the no-run-since-init case.
finalize_collectors();

// Release per-Worker static arena (GM heap + PTO2 SM in a single backing
// device allocation). Must precede mem_alloc_.finalize() so the arena
// frees through the still-live allocator, not after it.
static_arena_.release();
gm_heap_region_off_ = SIZE_MAX;
gm_sm_region_off_ = SIZE_MAX;
// Release the three per-Worker pooled arenas (GM heap, PTO2 SM, optional
// trb prebuilt runtime arena — each its own device_malloc). Must precede
// mem_alloc_.finalize() so the arenas free through the still-live
// allocator, not after it.
gm_heap_arena_.release();
gm_sm_arena_.release();
runtime_arena_pool_.release();
cached_gm_heap_size_ = 0;
cached_gm_sm_size_ = 0;
cached_runtime_arena_size_ = 0;

// Free all remaining allocations (including handshake buffer and binGmAddr)
mem_alloc_.finalize();
Expand Down
56 changes: 38 additions & 18 deletions src/a2a3/platform/onboard/host/device_runner.h
Original file line number Diff line number Diff line change
Expand Up @@ -185,25 +185,36 @@ struct KernelArgsHelper {
class DeviceRunner {
public:
DeviceRunner() :
static_arena_(&arena_alloc_trampoline, &arena_free_trampoline, &mem_alloc_) {}
gm_heap_arena_(&arena_alloc_trampoline, &arena_free_trampoline, &mem_alloc_),
gm_sm_arena_(&arena_alloc_trampoline, &arena_free_trampoline, &mem_alloc_),
runtime_arena_pool_(&arena_alloc_trampoline, &arena_free_trampoline, &mem_alloc_) {}
~DeviceRunner();

/**
* Lay out and commit the per-Worker static device arena that backs the
* PTO2 GM heap and PTO2 shared memory in a single underlying allocation.
* Must be called before acquire_pooled_gm_heap / acquire_pooled_gm_sm.
* Idempotent on identical sizes. Returns 0 on success, -1 on failure.
* Commit the three per-Worker pooled regions (PTO2 GM heap, PTO2 shared
* memory, trb prebuilt runtime arena) as three independent device
* allocations. Must be called before any acquire_pooled_*. A repeated
* call is a no-op for every region whose requested size is no larger
* than the size already committed; a larger request re-commits only
* that region. `runtime_arena_size` is 0 for the hbg path (no
* prebuilt runtime arena) — the corresponding arena stays uncommitted.
* Returns 0 on success, -1 on failure.
*/
int setup_static_arena(size_t gm_heap_size, size_t gm_sm_size);
int setup_static_arena(size_t gm_heap_size, size_t gm_sm_size, size_t runtime_arena_size);

/**
* Return the pooled GM heap / PTO2 SM pointer. setup_static_arena must
* have been called earlier in this Worker; otherwise these return
* nullptr. Both pointers are stable for the lifetime of the Worker and
* the single underlying device buffer is released in `finalize()`.
* Return the pooled GM heap / PTO2 SM / runtime arena pointer.
* setup_static_arena must have already committed the relevant region;
* otherwise these return nullptr. A pointer stays valid until
* `finalize()` releases the three underlying device buffers, or until
* a later setup_static_arena call re-commits that region with a larger
* size (or releases it by passing size 0) — re-acquire after any such
* call.
*
* acquire_pooled_runtime_arena() is trb-only — the runtime arena region
* is only committed when setup_static_arena was called with
* runtime_arena_size > 0. Calling it on the hbg path
* (setup_static_arena(...,0)) returns nullptr (well-defined).
*/
void *acquire_pooled_gm_heap();
void *acquire_pooled_gm_sm();
void *acquire_pooled_runtime_arena();

/**
* Create a thread bound to this device.
Expand Down Expand Up @@ -602,22 +613,31 @@ class DeviceRunner {
// Memory management
MemoryAllocator mem_alloc_;

// Per-Worker arena backing the PTO2 GM heap + PTO2 shared memory in a
// single device allocation. Released explicitly in finalize() before
// mem_alloc_.finalize() so it does not free pointers a second time.
// Three independent per-Worker arenas, each backing a single pooled
// region (PTO2 GM heap / PTO2 shared memory / trb prebuilt runtime
// arena). Split out from a single backing allocation because the
// combined size can exceed the device allocator's largest contiguous
// block — three separate device_malloc calls are friendlier than one
// big one. Released explicitly in finalize() before mem_alloc_.finalize()
// so the underlying buffers do not get freed twice.
//
// `runtime_arena_pool_` stays unreserved when setup_static_arena was
// invoked with runtime_arena_size == 0 (hbg path).
//
// Trampolines forward DeviceArena's alloc/free calls to mem_alloc_.
// Allocation hook given to each DeviceArena. `ctx` is the opaque context
// pointer registered alongside the hook (the constructor passes &mem_alloc_),
// so it is cast back to MemoryAllocator and the request is forwarded.
static void *arena_alloc_trampoline(void *ctx, size_t size) {
    auto *allocator = static_cast<MemoryAllocator *>(ctx);
    return allocator->alloc(size);
}
// Free hook given to each DeviceArena: recovers the MemoryAllocator from the
// opaque `ctx` pointer and hands `p` back to it. Any value returned by
// free() is ignored.
static void arena_free_trampoline(void *ctx, void *p) {
    auto *allocator = static_cast<MemoryAllocator *>(ctx);
    allocator->free(p);
}
DeviceArena static_arena_;
size_t gm_heap_region_off_{SIZE_MAX};
size_t gm_sm_region_off_{SIZE_MAX};
// Cached sizes for setup_static_arena's "fits" check — avoids calling
// region_size() on the arena's public API for the two regions we own.
DeviceArena gm_heap_arena_;
DeviceArena gm_sm_arena_;
DeviceArena runtime_arena_pool_;
// Cached sizes for setup_static_arena's "fits" check — avoids re-allocating
// the same buffer when a later worker init asks for an equal-or-smaller
// layout on an already-committed arena.
size_t cached_gm_heap_size_{0};
size_t cached_gm_sm_size_{0};
size_t cached_runtime_arena_size_{0};

// Device resources
rtStream_t stream_aicpu_{nullptr};
Expand Down
13 changes: 11 additions & 2 deletions src/a2a3/platform/onboard/host/pto_runtime_c_api.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -108,9 +108,9 @@ static uint64_t upload_chip_callable_buffer_wrapper(const void *callable) {
}
}

static int setup_static_arena_wrapper(size_t gm_heap_size, size_t gm_sm_size) {
static int setup_static_arena_wrapper(size_t gm_heap_size, size_t gm_sm_size, size_t runtime_arena_size) {
try {
return current_runner()->setup_static_arena(gm_heap_size, gm_sm_size);
return current_runner()->setup_static_arena(gm_heap_size, gm_sm_size, runtime_arena_size);
} catch (...) {
return -1;
}
Expand All @@ -132,6 +132,14 @@ static void *acquire_pooled_gm_sm_wrapper() {
}
}

// host_api entry for the pooled runtime arena: forwards to the current
// runner and converts any thrown exception into a nullptr result, so the
// function-pointer caller only ever sees a pointer or nullptr.
static void *acquire_pooled_runtime_arena_wrapper() {
    void *pooled = nullptr;
    try {
        pooled = current_runner()->acquire_pooled_runtime_arena();
    } catch (...) {
        pooled = nullptr;
    }
    return pooled;
}

/* ===========================================================================
* Public C API (resolved by ChipWorker via dlsym)
* =========================================================================== */
Expand Down Expand Up @@ -370,6 +378,7 @@ int run_prepared(
r->host_api.setup_static_arena = setup_static_arena_wrapper;
r->host_api.acquire_pooled_gm_heap = acquire_pooled_gm_heap_wrapper;
r->host_api.acquire_pooled_gm_sm = acquire_pooled_gm_sm_wrapper;
r->host_api.acquire_pooled_runtime_arena = acquire_pooled_runtime_arena_wrapper;
r->host_api.upload_chip_callable_buffer = upload_chip_callable_buffer_wrapper;

// Restore kernel addrs + orch symbol names + active_callable_id; the
Expand Down
101 changes: 67 additions & 34 deletions src/a2a3/platform/sim/host/device_runner.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -122,40 +122,68 @@ bool create_temp_so_file(const std::string &path_template, const uint8_t *data,

// Tear down through finalize() so a runner whose owner never called it
// explicitly still goes through the same cleanup path. finalize() returns an
// int status; it is intentionally discarded here.
DeviceRunner::~DeviceRunner() { finalize(); }

int DeviceRunner::setup_static_arena(size_t gm_heap_size, size_t gm_sm_size) {
if (static_arena_.is_committed()) {
if (gm_heap_size <= cached_gm_heap_size_ && gm_sm_size <= cached_gm_sm_size_) return 0;
static_arena_.release();
gm_heap_region_off_ = SIZE_MAX;
gm_sm_region_off_ = SIZE_MAX;
int DeviceRunner::setup_static_arena(size_t gm_heap_size, size_t gm_sm_size, size_t runtime_arena_size) {
// Three independent device_malloc'd buffers: GM heap, PTO2 SM, prebuilt
// runtime arena. Split out from a single large allocation because the
// combined size can exceed the device allocator's largest contiguous
// block. Each arena commits exactly one region, so its base() is the
// pooled pointer the caller wants.
//
// Idempotent for the production case (sizes do not change across a
// worker's lifetime). If a caller asks for a larger layout on any
// region, redo just that region.
auto commit_region = [](DeviceArena &arena, size_t &cached_size, size_t requested_size) -> int {
if (requested_size == 0) {
if (arena.is_committed() && cached_size != 0) {
arena.release();
cached_size = 0;
}
return 0;
}
if (arena.is_committed() && requested_size <= cached_size) {
return 0;
}
arena.release();
cached_size = 0;
arena.reserve(requested_size, DeviceArena::kDefaultBaseAlign);
if (arena.commit(DeviceArena::kDefaultBaseAlign) == nullptr) {
arena.release();
return -1;
}
cached_size = requested_size;
return 0;
};
if (commit_region(gm_heap_arena_, cached_gm_heap_size_, gm_heap_size) != 0) return -1;
if (commit_region(gm_sm_arena_, cached_gm_sm_size_, gm_sm_size) != 0) {
gm_heap_arena_.release();
cached_gm_heap_size_ = 0;
cached_gm_sm_size_ = 0;
return -1;
}
gm_heap_region_off_ = static_arena_.reserve(gm_heap_size, DeviceArena::kDefaultBaseAlign);
gm_sm_region_off_ = static_arena_.reserve(gm_sm_size, DeviceArena::kDefaultBaseAlign);
if (static_arena_.commit(DeviceArena::kDefaultBaseAlign) == nullptr) {
// Roll back the two reserves: commit() failure leaves committed_=false,
// so the next entry would skip the release branch and stack new
// reserves on top of the stale cursor. release() is idempotent on a
// never-committed arena (just zeroes cursor_ / region_count_).
static_arena_.release();
gm_heap_region_off_ = SIZE_MAX;
gm_sm_region_off_ = SIZE_MAX;
if (commit_region(runtime_arena_pool_, cached_runtime_arena_size_, runtime_arena_size) != 0) {
gm_heap_arena_.release();
gm_sm_arena_.release();
cached_gm_heap_size_ = 0;
cached_gm_sm_size_ = 0;
return -1;
}
cached_gm_heap_size_ = gm_heap_size;
cached_gm_sm_size_ = gm_sm_size;
return 0;
}

void *DeviceRunner::acquire_pooled_gm_heap() {
if (!static_arena_.is_committed()) return nullptr;
return static_arena_.region_ptr(gm_heap_region_off_);
if (!gm_heap_arena_.is_committed()) return nullptr;
return gm_heap_arena_.base();
}

void *DeviceRunner::acquire_pooled_gm_sm() {
if (!static_arena_.is_committed()) return nullptr;
return static_arena_.region_ptr(gm_sm_region_off_);
if (!gm_sm_arena_.is_committed()) return nullptr;
return gm_sm_arena_.base();
}

// Returns the pooled prebuilt runtime arena pointer, or nullptr when that
// region is not committed.
void *DeviceRunner::acquire_pooled_runtime_arena() {
// hbg calls setup_static_arena(...,0) and leaves runtime_arena_pool_
// uncommitted. There is no buffer to hand out in that case, so the miss is
// reported as nullptr (no log, no exception) — callers must check the
// result before using it.
if (!runtime_arena_pool_.is_committed()) return nullptr;
return runtime_arena_pool_.base();
}
Comment thread
poursoul marked this conversation as resolved.

std::thread DeviceRunner::create_thread(std::function<void()> fn) {
Expand Down Expand Up @@ -1032,24 +1060,29 @@ int DeviceRunner::finalize() {
// Close executor .so files (typically already closed by run(), this is a safety net)
unload_executor_binaries();

// Release per-Worker static arena (GM heap + PTO2 SM in a single backing
// device allocation). Must precede mem_alloc_.finalize() so the arena
// frees through the still-live allocator, not after it.
static_arena_.release();
gm_heap_region_off_ = SIZE_MAX;
gm_sm_region_off_ = SIZE_MAX;
// Release the three per-Worker pooled arenas (GM heap, PTO2 SM, optional
// trb prebuilt runtime arena — each its own device_malloc). Must precede
// mem_alloc_.finalize() so the arenas free through the still-live
// allocator, not after it.
gm_heap_arena_.release();
gm_sm_arena_.release();
runtime_arena_pool_.release();
cached_gm_heap_size_ = 0;
cached_gm_sm_size_ = 0;
cached_runtime_arena_size_ = 0;
Comment thread
coderabbitai[bot] marked this conversation as resolved.

// Free all remaining allocations
mem_alloc_.finalize();
clear_cpu_sim_shared_storage();

// Free the 8-byte device_wall buffer (allocated lazily in run()).
// Free the 8-byte device_wall buffer (allocated lazily in run()) before
// mem_alloc_.finalize(): free_tensor() routes back through mem_alloc_,
// so doing it after finalize would be a use-after-finalize.
if (device_wall_dev_ptr_ != nullptr) {
free_tensor(device_wall_dev_ptr_);
device_wall_dev_ptr_ = nullptr;
}

// Free all remaining allocations
mem_alloc_.finalize();
clear_cpu_sim_shared_storage();

device_id_ = -1;
worker_count_ = 0;
last_runtime_ = nullptr;
Expand Down
Loading
Loading