From 2fb95d9122622cba1716f5b3f115a96196997ae6 Mon Sep 17 00:00:00 2001
From: David Ndungu
Date: Mon, 20 Apr 2026 14:23:30 -0700
Subject: [PATCH] feat(compute): configurable GPU arena size via ZERFOO_ARENA_SIZE_GB

The arena was hardcoded at 2 GiB. Training workloads whose per-step
working set exceeds 2 GiB spill to the unbounded MemPool fallback, which
ResetPool/StepScope.Close() does NOT reset. On GB10 Grace Blackwell,
managed-memory allocations count against host RAM, so the spill
accumulates across thousands of steps until the 119 GiB host is
exhausted and the job silently thrashes under swap.

This change exposes the arena capacity via ZERFOO_ARENA_SIZE_GB
(default 2, range 1..128) so training callers can size the arena to
their actual per-step footprint and avoid the spill entirely. Invalid
or out-of-range values log a warning and fall back to the default.

Observed in Wolf's CrossAsset walk-forward training on GB10: every run
hung silently around epoch 3-4 with the 2 GiB default. Raising to
ZERFOO_ARENA_SIZE_GB=32 should keep all per-step intermediates in the
arena and let StepScope.Close() reclaim them between batches.
---
 compute/arena_size_test.go | 60 ++++++++++++++++++++++++++++++++++++++
 compute/gpu_engine.go      | 53 +++++++++++++++++++++++++++++----
 2 files changed, 108 insertions(+), 5 deletions(-)
 create mode 100644 compute/arena_size_test.go

diff --git a/compute/arena_size_test.go b/compute/arena_size_test.go
new file mode 100644
index 0000000..c21ca43
--- /dev/null
+++ b/compute/arena_size_test.go
@@ -0,0 +1,60 @@
+package compute
+
+import (
+	"testing"
+
+	"github.com/zerfoo/ztensor/log"
+)
+
+func TestArenaSizeBytes_Default(t *testing.T) {
+	t.Setenv("ZERFOO_ARENA_SIZE_GB", "")
+	got := arenaSizeBytes(log.Nop())
+	want := int64(defaultArenaSizeGB) * 1024 * 1024 * 1024
+	if got != want {
+		t.Fatalf("default arena size: got %d, want %d", got, want)
+	}
+}
+
+func TestArenaSizeBytes_EnvOverride(t *testing.T) {
+	tests := []struct {
+		name  string
+		env   string
+		wantG int64
+	}{
+		{"min", "1", 1},
+		{"training-typical", "32", 32},
+		{"max", "128", 128},
+	}
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			t.Setenv("ZERFOO_ARENA_SIZE_GB", tt.env)
+			got := arenaSizeBytes(log.Nop())
+			want := tt.wantG * 1024 * 1024 * 1024
+			if got != want {
+				t.Fatalf("env=%q: got %d, want %d", tt.env, got, want)
+			}
+		})
+	}
+}
+
+func TestArenaSizeBytes_InvalidFallsBackToDefault(t *testing.T) {
+	tests := []struct {
+		name string
+		env  string
+	}{
+		{"non-integer", "lots"},
+		{"below-min", "0"},
+		{"above-max", "256"},
+		{"negative", "-5"},
+	}
+	wantDefault := int64(defaultArenaSizeGB) * 1024 * 1024 * 1024
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			t.Setenv("ZERFOO_ARENA_SIZE_GB", tt.env)
+			got := arenaSizeBytes(log.Nop())
+			if got != wantDefault {
+				t.Fatalf("env=%q: got %d, want default %d", tt.env, got, wantDefault)
+			}
+		})
+	}
+}
diff --git a/compute/gpu_engine.go b/compute/gpu_engine.go
index b1d5e7d..68d8a19 100644
--- a/compute/gpu_engine.go
+++ b/compute/gpu_engine.go
@@ -4,6 +4,7 @@ import (
 	"context"
 	"fmt"
 	"os"
+	"strconv"
 	"sync/atomic"
 	"unsafe"
 
@@ -16,6 +17,45 @@ import (
 	"github.com/zerfoo/ztensor/tensor"
 )
 
+// defaultArenaSizeGB is the per-GPUEngine arena capacity when the
+// ZERFOO_ARENA_SIZE_GB env var is unset. Sized for single-pass inference
+// on typical 1-7B LLMs; larger training workloads (e.g. multi-scale
+// walk-forward with hundreds of batches) should raise this via env var
+// to keep per-step intermediates inside the arena and avoid spill to
+// the unbounded MemPool fallback.
+const defaultArenaSizeGB = 2
+
+// minArenaSizeGB and maxArenaSizeGB bound user-supplied values so a typo
+// can't request a TB-sized arena or a zero-size one.
+const (
+	minArenaSizeGB = 1
+	maxArenaSizeGB = 128
+)
+
+// arenaSizeBytes resolves the arena capacity in bytes. ZERFOO_ARENA_SIZE_GB,
+// if set to an integer in [minArenaSizeGB, maxArenaSizeGB], overrides the
+// default. Invalid / out-of-range values are logged and ignored.
+func arenaSizeBytes(l log.Logger) int64 {
+	gb := int64(defaultArenaSizeGB)
+	if raw := os.Getenv("ZERFOO_ARENA_SIZE_GB"); raw != "" {
+		parsed, err := strconv.ParseInt(raw, 10, 64)
+		switch {
+		case err != nil:
+			l.Warn("ZERFOO_ARENA_SIZE_GB is not an integer; using default",
+				"value", raw, "default", fmt.Sprintf("%d", defaultArenaSizeGB))
+		case parsed < minArenaSizeGB || parsed > maxArenaSizeGB:
+			l.Warn("ZERFOO_ARENA_SIZE_GB out of range; using default",
+				"value", fmt.Sprintf("%d", parsed),
+				"min", fmt.Sprintf("%d", minArenaSizeGB),
+				"max", fmt.Sprintf("%d", maxArenaSizeGB),
+				"default", fmt.Sprintf("%d", defaultArenaSizeGB))
+		default:
+			gb = parsed
+		}
+	}
+	return gb * 1024 * 1024 * 1024
+}
+
 // DType selects the compute precision for GPU operations.
 type DType int
 
@@ -169,11 +209,14 @@ func NewGPUEngine[T tensor.Numeric](ops numeric.Arithmetic[T], deviceID ...int)
 	fallbackPool := cuda.NewMemPool()
 	cuda.SetDefaultMemPool(fallbackPool)
 
-	// Arena pool: 2GB pre-allocated region for per-inference intermediates.
-	// On DGX Spark with 128GB unified memory, this is a small fraction.
-	// Falls back to MemPool if arena is exhausted.
-	const arenaSize = 2 * 1024 * 1024 * 1024 // 2 GB
-	arenaPool, err := gpuapi.NewCUDAArenaPool(dev, arenaSize, fallbackPool)
+	// Arena pool: pre-allocated region for per-inference / per-step
+	// intermediates. Defaults to 2GB (sized for 1-7B LLM inference).
+	// Override via ZERFOO_ARENA_SIZE_GB for larger training workloads
+	// whose per-step working set would otherwise spill to the unbounded
+	// MemPool fallback and leak through StepScope.Close(). On DGX Spark
+	// with 128GB unified memory, sizes up to 128GB are valid.
+	arenaSize := arenaSizeBytes(l)
+	arenaPool, err := gpuapi.NewCUDAArenaPool(dev, int(arenaSize), fallbackPool)
 	if err == nil {
 		cuda.SetDefaultArenaPool(arenaPool.Inner())
 	}
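
Usage note (reviewer aside, not part of the commit): the override is read
once, at engine construction, so it must be in the environment before
NewGPUEngine runs, for example ZERFOO_ARENA_SIZE_GB=32 ./trainer (the
trainer binary name is illustrative). A minimal Go sketch of the
programmatic form follows; it uses only the standard library, and the
engine-construction step is left as a comment because the NewGPUEngine
return signature is outside this diff.

package main

import "os"

func main() {
	// NewGPUEngine resolves ZERFOO_ARENA_SIZE_GB via arenaSizeBytes at
	// construction time, so the variable must be set before that call.
	if err := os.Setenv("ZERFOO_ARENA_SIZE_GB", "32"); err != nil {
		panic(err)
	}

	// ... construct the engine here (compute.NewGPUEngine[float32](...))
	// and run the training loop; per-step intermediates now fit in the
	// 32 GiB arena and StepScope.Close() reclaims them between batches.
}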
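
Sizing aside: the commit message suggests 32 for the CrossAsset run;
other trainers can derive a value from their own per-step footprint.
Below is a hypothetical helper, not part of this patch (the name
arenaGBFor and the working-set figure are illustrative), that rounds an
estimated per-step working set up to whole GiB and clamps it to the
same 1..128 range this change enforces.

package main

import "fmt"

// arenaGBFor rounds a per-step working-set estimate up to whole GiB and
// clamps it to the 1..128 bounds used by arenaSizeBytes in this patch.
func arenaGBFor(workingSetBytes int64) int64 {
	const gib = int64(1) << 30
	gb := (workingSetBytes + gib - 1) / gib // round up to whole GiB
	if gb < 1 {
		gb = 1
	}
	if gb > 128 {
		gb = 128
	}
	return gb
}

func main() {
	// Example: a 30 GiB per-step working set maps to ZERFOO_ARENA_SIZE_GB=30.
	fmt.Println(arenaGBFor(30 << 30))
}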