diff --git a/.gitignore b/.gitignore
index 122a68c2f..93f844c22 100644
--- a/.gitignore
+++ b/.gitignore
@@ -27,3 +27,4 @@
 
 # rust target
 /target
+PLAN.md
diff --git a/CMakeLists.txt b/CMakeLists.txt
index f49447a8a..0d04b5995 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -548,6 +548,11 @@ if(NOT SNMALLOC_HEADER_ONLY_LIBRARY)
   # These are mitigation-independent and can be compiled once, then linked
   # against both fast and check testlib variants.
   set(TESTLIB_ONLY_TESTS
+    aligned_dealloc
+    arena
+    arenabins
+    largearenarange
+    smallarenarange
     bits first_operation memory memory_usage multi_atexit multi_threadatexit
     redblack statistics teardown
     contention external_pointer large_alloc lotsofthreads post_teardown
diff --git a/claude.md b/claude.md
new file mode 100644
index 000000000..87dde8ea7
--- /dev/null
+++ b/claude.md
@@ -0,0 +1,172 @@
+# Claude AI Guidelines for snmalloc
+
+## Working Style
+
+**Complete the plan, then check in**: When a plan is approved, execute all
+steps to completion. Don't stop after each step for review. When you think
+you're done, recursively apply all relevant principles from this file — check
+each one, act on any that apply, then check again until no more principles
+are relevant. Only then report completion and wait for feedback.
+
+**Plans require discussion before implementation**: After devising a plan
+(whether in plan mode or not), run the review loop (see "Mandatory review
+checkpoints") before presenting it. Do NOT proceed to implementation until
+the plan has been seen and explicitly approved.
+
+**Store plans in PLAN.md**: Always write plans to `PLAN.md` in the repository
+root so that context survives session boundaries. Update (not append to) the
+file when the plan evolves. This is the single source of truth for what is
+planned and what has been completed.
+
+**Never commit without explicit approval**: Do not run `git commit`,
+`git commit --amend`, `git push`, `git reset`, `git rebase`, or any other
+history-mutating command until the user has explicitly approved the commit
+for the current change. "I'm happy with this phase now, please commit"
+counts as approval for that single commit; "begin the next phase" does not
+authorise committing later work. When you believe a change is ready to
+commit:
+  1. Show the user `git status` and `git diff --stat` (and the proposed
+     commit message) so they can see exactly what would be committed.
+  2. Ask for explicit approval — use the `ask_user` tool, do not infer
+     consent from earlier messages.
+  3. Only after the user has approved THIS commit, run `git commit`.
+
+If you have already committed without approval, offer to `git reset --soft HEAD~1`
+to undo it while keeping the changes staged. The same rule applies to opening pull
+requests: never run `gh pr create` (or equivalent) without explicit approval for that PR.
+
+**Baseline the checkout before starting work**: Before beginning implementation
+of any plan, verify that the current checkout builds and passes tests. Run the
+build and test suite (per `skills/building_and_testing.md`) and record the
+results. If the baseline is broken, report the failures and stop — do not start
+implementation on a broken base. Pre-existing failures that are not caused by
+your changes must be acknowledged upfront so they are not confused with
+regressions introduced by the plan. This establishes the ground truth against
+which your changes will be measured.
+
+**Every plan step must have a test gate**: Each step in a plan must produce
+a testable result — a test, a build check, or a verifiable property — that
+acts as the gate to the next step. Do not move to step N+1 until step N's
+gate passes. This catches integration issues incrementally rather than
+deferring all testing to the end. When writing a plan, structure it so that
+independently testable components are implemented and verified first, and
+later steps build on proven foundations.
+
+**Mandatory review checkpoints**: At each of these points, run the full
+review loop — spawn a fresh-context reviewer subagent, address findings,
+spawn another fresh reviewer, repeat until a reviewer finds no issues. When
+you disagree with a reviewer's finding, escalate — do not resolve disputes
+unilaterally. Do not proceed past a checkpoint without a clean review.
+1. **After devising a plan**, before presenting it for discussion. For plan
+   reviews, adapt the reviewer prompt: instead of reading changed files and
+   running tests, the reviewer should read the plan document, read existing
+   code the plan references, verify assumptions about the codebase, and check
+   for structural gaps (missing steps, naming conflicts, incorrect
+   dependencies).
+2. **After completing implementation and self-review**, before opening a PR.
+
+The only exception: if you believe a change is truly trivial (a typo fix, a
+one-line config change), ask for permission to skip the review. Do not decide
+on your own that something is trivial enough to skip. When in doubt, run the
+review.
+
+**Go slow to go fast**: Before starting implementation work, identify and state
+which principles from these instructions are most relevant to the current task.
+This surfaces the right guidelines before they're needed rather than
+rediscovering them after a mistake.
+
+**Challenge me when the evidence says I'm wrong**: If a reviewer flags something
+that contradicts what I said, or if you have concrete evidence that an
+instruction is incorrect, raise it — don't silently comply. Present the evidence
+and discuss it.
+
+**Research findings belong in the plan**: If research or exploration surfaces
+issues beyond the original task (inaccurate comments, dead code, related bugs),
+include them as explicit plan steps — don't just mention them in the analysis
+and move on. Anything worth noting is worth acting on or explicitly deferring.
+
+**Self-review is part of done**: The recursive principle check described in
+"Complete the plan, then check in" IS the self-review. It's not a separate
+step — it's what "done" means. Never report completion without having done it.
+
+**During reviewer loops**: At any point during the review loop — when fixing
+findings, when unsure about a reviewer's suggestion, when making tradeoff
+decisions — stop and ask. The automated review removes me as a gatekeeper, not
+as a collaborator.
+
+## Debugging Principles
+
+1. **Logging is essential** - When debugging issues in allocator code, add tracing to identify the exact point of failure. Use `write()` directly to stderr/file rather than `printf`/`message` to avoid recursion through the allocator.
+
+2. **New code is most likely at fault** - When tests fail after changes, assume the new code introduced the bug. Don't blame existing infrastructure that was working before.
+
+3. **Baseline against origin/main** - Before assuming a system-wide issue, verify the test passes on `origin/main`. This confirms whether the issue is a regression introduced by your changes.
+
+4. **Check the whole PR for patterns** - When fixing a bug of a specific shape (e.g., "one-armed `if constexpr` causes MSVC C4702"), immediately search all changed files in the PR for the same pattern. Fix all instances at once rather than waiting for CI to report each one individually.
+
+5. **Verify hypotheses before acting** - A hypothesis about a bug's cause is not knowledge — it's a guess. Before investing effort in workarounds or fixes, validate empirically that your suspected cause is actually the cause. Read the code more carefully, write a minimal reproducer, or examine the actual data. Verify first, then act.
+
+6. **CI is the source of truth for build status** - A local build failure does not mean the build is broken. Local toolchain versions, stale dependency caches, and environment differences can all cause local failures that don't reproduce in CI. Never declare a build "broken on main" based on local results — check CI first.
+
+## Code Quality
+
+- **Use cross-platform macros from `defines.h`** - Never use raw compiler attributes like `__attribute__((used))` or `__forceinline` directly. Instead use the corresponding `SNMALLOC_*` macros (e.g., `SNMALLOC_USED_FUNCTION`, `SNMALLOC_FAST_PATH`, `SNMALLOC_SLOW_PATH`, `SNMALLOC_PURE`, `SNMALLOC_COLD`, `SNMALLOC_UNUSED_FUNCTION`, `ALWAYSINLINE`, `NOINLINE`). These are defined in `ds_core/defines.h` with correct expansions for MSVC, GCC, and Clang.
+
+- **Don't encode platform assumptions** - Avoid hardcoding limits like "48-bit address space" or "256 TiB max allocation". These assumptions may not hold on future platforms (56-bit, 64-bit address spaces, CHERI, etc.).
+
+- **Trust the existing bounds checks** - snmalloc already has appropriate bounds checking at API boundaries. New internal code should defer to the backend for edge cases rather than adding redundant checks.
+
+- **Guard new data structures** - When adding caches or intermediate layers, ensure they handle all input ranges correctly, including sizes larger than what they cache. Return early/bypass for out-of-range inputs.
+
+- **Keep headers minimal** - Each header should only include what it directly needs. Avoid adding transitive includes "for convenience" — if a header's own declarations only need `<stdint.h>`, don't pull in heavier internal headers. Includers are responsible for their own dependencies. This keeps compile times low and dependency graphs clean.
+
+- **No C++ STL or C++ standard library headers** - snmalloc must be compilable as part of a libc implementation, so it cannot depend on an external C++ STL. Never use headers like `<cstdint>`, `<cstddef>`, `<type_traits>`, `<atomic>`, etc. directly. Instead use the C equivalents (`<stdint.h>`, `<stddef.h>`) or snmalloc's own STL wrappers in `src/snmalloc/stl/` (e.g., `snmalloc/stl/type_traits.h`, `snmalloc/stl/atomic.h`, `snmalloc/stl/array.h`). These wrappers have both a `gnu/` backend (no C++ STL dependency) and a `cxx/` backend, selected at build time.
+
+- **Prefer explicit over implicit** - Avoid relying on implicit conversions, convention-based wiring, or unnamed dependencies. A few extra characters of explicit code is almost always cheaper than someone later needing to reconstruct the hidden knowledge. This is especially relevant in C++ with its many implicit conversion paths and template magic.
+
+- **Document coupling at the point of breakage** - When code A depends on the internal behaviour of code B (read sequence, execution order, size assumptions), put the comment on B — that's where a future maintainer would make a breaking change. Commenting at A doesn't help because the person changing B won't be reading A.
+
+- **Design for changeability, not for predicted changes** - Make designs modular and replaceable so future needs can be accommodated, but don't add abstractions, extension points, or features for changes that haven't happened yet. The goal is a design that's easy to modify, not one that anticipates specific modifications.
+
+- **Comments earn their length by carrying correctness-relevant information** - A comment exists to convey something the reader cannot recover from the code — a non-obvious invariant, a subtle correctness argument, a coupling that breaks if edited. If you cannot name what the comment teaches that the code does not, cut it.
+
+- **Don't qualify `src/snmalloc/` code as "production"** - Everything under `src/snmalloc/` is the shipped library; calling it "production code" / "production header" / "production Rep" adds no information and implicitly suggests there's non-production code in the same tree. Use unqualified names (e.g. "the in-tree header", "the shipped `PagemapRep`", or just the name itself). The meaningful distinction is in-tree (`src/snmalloc/`) vs test (`src/test/`), and that distinction is already clear from the path. This applies equally to comments inside `src/snmalloc/`, to comments inside `src/test/` referring back to in-tree code, and to design documents like `PLAN.md`.
+
+- **Test scaffolding does not live in `src/snmalloc/` headers** - A header that needs a friend struct purely for testing carries only the forward declaration and the friend grant; the body lives in test code.
+
+- **Store data in the form the consumer uses** - If a derived value is only consumed pre-shifted, pre-negated, or pre-masked, store it that way at build time. The cost moves from every consumer call to one build-time loop.
+
+- **Algorithms must only touch state whose synchronisation they own** - Speculative reads of globally shared, concurrently modified data are races, not "fast probes". If you need information from outside your lock, design without it or pull it through the structure's own discipline.
+
+- **Verify language and library claims before stating them** - When a comment or rationale invokes a language rule, check it. Load-bearing claims that are wrong make the explanation actively misleading.
+
+## Code Change Discipline
+
+- **Read before modifying** - Do not propose changes to code you haven't read. Understand existing code before suggesting modifications.
+
+- **Prefer editing over creating** - Edit existing files rather than creating new ones. This prevents file bloat and builds on existing work.
+
+- **Avoid over-engineering** - Only make changes that are directly requested or clearly necessary. Don't add error handling for scenarios that can't happen. Don't add docstrings or comments to code you didn't change. Don't create helpers or abstractions for one-time operations. Three similar lines of code are better than a premature abstraction.
+
+- **Evaluate copied patterns, don't cargo-cult** - When reusing a pattern from existing snmalloc code, evaluate each choice (`constexpr` vs runtime, template vs function parameter, etc.) in the context of the new usage. The original may have had reasons that don't apply, or it may have been a mistake. Copy the intent, not the incidental choices. Conventions (legal headers, naming schemes, file organisation) should be followed for consistency; technical patterns should be evaluated on merit.
+
+- **Fix what your change makes stale** - When a change invalidates something elsewhere — a comment, a test description, documentation — fix it in the same PR. Stale artefacts left behind are bugs in the making, and "I didn't modify that line" isn't an excuse when your change is what made it wrong.
+
+- **Document the code, not the change** - Comments and documentation describe how the code IS, not how it was changed or why it differs from a previous version. Don't leave comments explaining "we removed X" or "this was changed from Y" — a reader shouldn't need the git history to understand the code. If code needs context about alternatives or design decisions, put that in design docs, not inline comments.
+
+- **Don't fragment atomic refactors** - If a change has a single commit-able outcome and a small footprint, do it as one step. Multi-step plans are for refactors whose intermediate states are independently testable.
+
+## Building, Testing, and Benchmarking
+
+All build, test, and benchmarking guidance lives in `skills/building_and_testing.md`.
+
+**Delegate testing to a subagent.** When it is time to build and run tests,
+spawn a subagent whose prompt includes the contents of
+`skills/building_and_testing.md` and tells it which tests to run (or "run the
+full suite"). Do NOT include implementation context — the subagent must not
+know what code changed. This prevents the tester from rationalising failures
+as related to the changes instead of reporting them objectively.
+
+The subagent will report back: which tests passed, which failed, exact
+commands, and full output of any failures. If failures are reported, treat
+them as actionable per the failure protocol in the skill file.
diff --git a/docs/AddressSpace.md b/docs/AddressSpace.md
index 1e28491ee..030023513 100644
--- a/docs/AddressSpace.md
+++ b/docs/AddressSpace.md
@@ -26,14 +26,14 @@ For simplicity, we gloss over much of the "lazy initialization" that would actua
    Because the two exercise similar bits of machinery, we now track them in parallel in prose despite their sequential nature.
 
 4. The `BackendAllocator` has a chain of "range" types that it uses to manage address space.
-   By default (and in the case we are considering), that chain begins with a per-thread "small buddy allocator range".
+   By default (and in the case we are considering), that chain begins with a per-thread *small arena range*.
 
    1. For the metadata allocation, the size is (well) below `MIN_CHUNK_SIZE` and so this allocator, which by supposition is empty, attempts to `refill` itself from its parent.
       This results in a request for a `MIN_CHUNK_SIZE` chunk from the parent allocator.
 
    2. For the chunk allocation, the size is `MIN_CHUNK_SIZE` or larger, so this allocator immediately forwards the request to its parent.
 
-5. The next range allocator in the chain is a per-thread *large* buddy allocator that refills in 2 MiB granules.
+5. The next range allocator in the chain is a per-thread `LargeArenaRange` that refills in 2 MiB granules.
    (2 MiB chosen because it is a typical superpage size.)
    At this point, both requests are for at least one and no more than a few times `MIN_CHUNK_SIZE` bytes.
 
@@ -48,7 +48,7 @@ For simplicity, we gloss over much of the "lazy initialization" that would actua
 8. The next entry in the chain is a `StatsRange` which serves to accumulate statistics.
    We ignore this stage and continue onwards.
 
-9. The next entry in the chain is another *large* buddy allocator which refills at 16 MiB but can hold regions
+9. The next entry in the chain is another `LargeArenaRange` which refills at 16 MiB but can hold regions
    of any size up to the entire address space.
    The first request triggers a `refill`, continuing along the chain as a 16 MiB request.
    (Recall that the second allocation will be handled at an earlier point on the chain.)
@@ -61,15 +61,15 @@ For simplicity, we gloss over much of the "lazy initialization" that would actua
 12. Having wound the chain onto our stack, we now unwind!
     The `PagemapRegisterRange` ensures that the Pagemap entries for allocations passing through it are mapped and returns the allocation unaltered.
 
-13. The global large buddy allocator splits the 16 MiB refill into 8, 4, and 2 MiB regions it retains as well as returning the remaining 2 MiB back along the chain.
+13. The global `LargeArenaRange` carves the request out of its 16 MiB refill and keeps the unused remainder as a single free block in its internal red-black trees of free ranges, returning the carved portion back along the chain.
 
 14. The `StatsRange` makes its observations, the `GlobalRange` now unlocks the global component of the chain, and the `CommitRange` ensures that the allocation is mapped.
     Aside from these side effects, these propagate the allocation along the chain unaltered.
 
-15. We now arrive back at the thread-local large buddy allocator, which takes its 2 MiB refill and breaks it down into powers of two down to the requested `MIN_CHUNK_SIZE`.
-    The second allocation (of the chunk), will either return or again break down one of these intermediate chunks.
+15. We now arrive back at the thread-local `LargeArenaRange`, which takes its 2 MiB refill and carves out the requested chunk(s); the unused remainder stays in its free-range trees.
+    The second allocation (of the chunk) will either be satisfied from this leftover or trigger another carve.
 
-16. For the first (metadata) allocation, the thread-local *small* allocator breaks the `MIN_CHUNK_SIZE` allocation down into powers of two down to `PAGEMAP_METADATA_STRUCT_SIZE` and returns one of that size.
+16. For the first (metadata) allocation, the thread-local *small arena range* takes its `MIN_CHUNK_SIZE` refill, hands back a sub-chunk fragment large enough for `PAGEMAP_METADATA_STRUCT_SIZE`, and tracks the remainder as free sub-chunk space using tree nodes stored inside the free fragments themselves.
     The second allocation will have been forwarded and so is not additionally handled here.
 
 Exciting, no?
@@ -98,26 +98,19 @@ For chunks owned by the *frontend* (`REMOTE_BACKEND_MARKER` not asserted),
 
    2. A bit (`META_BOUNDARY_BIT`) that serves to limit chunk coalescing on platforms where that may not be possible, such as CHERI.
 
-See `src/backend/metatypes.h` and `src/mem/metaslab.h`.
+See `src/snmalloc/mem/metadata.h`.
 
 For chunks owned by a *backend* (`REMOTE_BACKEND_MARKER` asserted), there are again multiple possibilities.
 
-For chunks owned by a *small buddy allocator*, the remainder of the `MetaEntry` is zero.
+For chunks owned by a *small arena range* (`SmallArenaRange`), the remainder of the `MetaEntry` is zero.
 That is, it appears to have small sizeclass 0 and an implausible `RemoteAllocator*`.
+The free-fragment tree itself is stored in-band, inside the free space of the chunk, rather than in the pagemap (see `InplaceRep` in `src/snmalloc/backend_helpers/inplacerep.h`).
 
-For chunks owned by a *large buddy allocator*, the `MetaEntry` is instead a node in a red-black tree of all such chunks.
-Its contents can be decoded as follows:
+For chunks owned by a `LargeArenaRange`, the `MetaEntry` is instead a node in the red-black trees of free ranges.
+A free block of *N* units consumes the `MetaEntry`s of its first *min(N, 3)* unit-aligned addresses; their words encode the bin-tree node (unit 0), the range-tree node (unit 1, for blocks of two or more units), and the large-chunk count (unit 2, for blocks of three or more units).
+The pagemap reserves the low `MetaEntryBase::BACKEND_LAYOUT_FIRST_FREE_BIT` bits of each word for the meta-entry layout itself; the tree-node encoding (left/right pointers, red bit, variant tag, large-size count) lives at or above that bit.
 
-1. The `meta` field's `META_BOUNDARY_BIT` is preserved, with the same meaning as in the frontend case, above.
-
-2. `meta` (resp. `remote_and_sizeclass`) includes a pointer to the left (resp. right) *chunk* of address space.
-   (The corresponding child *node* in this tree is found by taking the *address* of this chunk and looking up the `MetaEntry` in the Pagemap.
-   This trick of pointing at the child's chunk rather than at the child `MetaEntry` is particularly useful on CHERI:
-   it allows us to capture the authority to the chunk without needing another pointer and costs just a shift and add.)
-
-3. The `meta` field's `LargeBuddyRep::RED_BIT` is used to carry the red/black color of this node.
-
-See `src/backend/largebuddyrange.h`.
+See `PagemapRep` in `src/snmalloc/backend_helpers/largearenarange.h`.
 
 ### Encoding a MetaEntry
 
@@ -131,18 +124,20 @@ The following cases apply:
    * has "small" sizeclass 0, which has size 0.
    * has no associated metadata structure.
 
-2. The address is part of a free chunk in a backend's Large Buddy Allocator:
+2. The address is part of a free chunk in a backend `LargeArenaRange`:
    The `MetaEntry`...
    * has `REMOTE_BACKEND_MARKER` asserted in `remote_and_sizeclass`.
    * has "small" sizeclass 0, which has size 0.
-   * the remainder of its `MetaEntry` structure will be a Large Buddy Allocator rbtree node.
+   * the remainder of its `MetaEntry` structure (and those of the next one or two unit-aligned `MetaEntry`s if the free block spans them) carries the `Arena`'s red-black-tree node encoding.
    * has no associated metadata structure.
 
-3. The address is part of a free chunk inside a backend's Small Buddy Allocator:
+3. The address is part of a free fragment inside a backend `SmallArenaRange`:
    Here, the `MetaEntry` is zero aside from the asserted `REMOTE_BACKEND_MARKER` bit, and so it...
    * has "small" sizeclass 0, which has size 0.
    * has no associated metadata structure.
 
+   The tree of free sub-chunk fragments for this chunk is stored inside the free fragments themselves (`InplaceRep`), not in the pagemap.
+
 4. The address is part of a live large allocation (spanning one or more 16KiB chunks):
    Here, the `MetaEntry`...
    * has `REMOTE_BACKEND_MARKER` clear in `remote_and_sizeclass`.
diff --git a/docs/Arena.md b/docs/Arena.md
new file mode 100644
index 000000000..522ddf4c2
--- /dev/null
+++ b/docs/Arena.md
@@ -0,0 +1,156 @@
+# The Arena: A Bitmap-Indexed Coalescing Range
+
+`Arena` is snmalloc's address-space range that stores free blocks at their
+**natural** size — no power-of-two rounding — and serves any request from the
+full snmalloc size-class sequence. It sits in the per-thread range pipeline
+underneath the slab caches and replaces the historical buddy-based ranges.
+
+This document is the conceptual introduction. For where `Arena` plugs into
+the wider range chain, see [`AddressSpace.md`](AddressSpace.md).
+
+## The problem
+
+A buddy allocator only stores power-of-two blocks. A request for 5 chunks
+must be served from an 8-chunk buddy block, wasting 3 chunks. We wanted a
+range that
+
+* stores blocks at their actual size,
+* uses snmalloc's full `(exponent, mantissa)` size-class sequence at the
+  range level, and
+* still answers "find a block that can serve this request" in O(1).
+
+## The core idea: search upward, mask out exceptions
+
+Free blocks are binned by the *set of size classes they can serve* — the
+**servable set**. To allocate, you walk a per-arena non-empty-bins bitmap
+upward through the bins; any larger block can be carved down. This works
+almost perfectly. The exception is alignment: some bins hold blocks whose
+address alignment is too poor to serve certain smaller, *more* aligned size
+classes. Those bins must be excluded from the search for those requests.
+
+The implementation builds the per-request filter *positively* as a **serve
+mask** — bit `k` set means bin `k` can serve this request — and the lookup
+is `find_first_set(bitmap & serve_mask, start_word)`. The serve mask
+depends only on the requested size class, not on the block, so it is
+precomputed at compile time.
+
+(The original sketch of this design used the equivalent inverse framing of
+a "skip mask" with `bitmap & ~skip_mask`; see `arenabins.h` for the
+in-tree explanation of why positive is preferred.)
+
+## Why the exceptions exist
+
+snmalloc's size classes follow `S = 2^e + m · 2^(e−B)` with `0 ≤ m < 2^B`, where
+`B` is the mantissa-bit width (`INTERMEDIATE_BITS`, 2 as shipped). Each size
+class has a natural alignment `align(S) = S & -S`, the largest power of two dividing `S`.
+
+A size class with high alignment needs padding to reach an aligned address
+within a block. A block of a *larger* size class with *lower* alignment may
+not have room for that padding. Concretely: a block of size 5 at address 1
+can serve size 5 (alignment 1) but cannot serve size 4 (alignment 4) —
+there is not enough space after padding to the first 4-aligned address.
+
+Same size block, different address, different servable set. This is why
+distinct bins per servable-set are needed.
+
+## Bin count grows slowly in B
+
+At each exponent, the distinct servable sets are enumerated exhaustively
+by `prototype/skip_analysis.py`:
+
+| B | Mantissas/exponent | Bins/exponent | Max mask bits |
+|---|-------------------:|--------------:|--------------:|
+| 1 | 2                  | 2             | 0             |
+| 2 | 4                  | 5             | 1             |
+| 3 | 8                  | 13            | 4             |
+| 4 | 16                 | 34            | 11            |
+
+Most requests need no exceptions at all. Only size classes whose alignment
+exceeds the expected alignment for their position in the sequence have any
+bits to mask. The whole structure is constant-folded into a few small tables.
+
+## The two-tree structure
+
+A bitmap alone is not enough — when a bin is non-empty, the arena still has
+to *retrieve* and *coalesce* blocks. Each `Arena` therefore maintains:
+
+* **One red-black tree per non-empty bin** (the "bin trees"), keyed by
+  block address, giving O(log n) selection within a bin. The non-empty-bins
+  bitmap is the index over these trees.
+
+* **One red-black tree of all free blocks** (the "range tree"), keyed by
+  address, used to find a block's left/right neighbours for coalescing on
+  free.
+
+On allocation: bitmap lookup → choose the bin → pop a block from its
+bin tree → `carve` returns pre-pad / aligned request / post-pad → pre and
+post (if any) re-enter the arena via the bin and range trees.
+
+On free: range tree lookup → coalesce with neighbours if their tags allow
+→ insert the resulting (possibly merged) block.
+
+## Two variants over the same Arena
+
+`Arena` is parameterised by a **Rep** (representation) that decides where
+the per-block tree-node state lives. Two reps ship today:
+
+* **`PagemapRep`** — node state lives in the pagemap entry that already
+  covers the block. Used by **`LargeArenaRange`**, which manages whole
+  chunks and larger. Node access is a pagemap lookup; no in-band space is
+  consumed.
+
+* **`InplaceRep`** — node state lives *in the free block itself*, in the
+  first units. Used by **`SmallArenaRange`**, which manages sub-chunk
+  metadata fragments where no pagemap entry exists for the fragment. The
+  layout packs the bin tree pointers, the range tree pointers, and (for
+  blocks ≥ 3 units) a large-size word into the leading units of the free
+  block. Unit size is `next_pow2(2 · sizeof(CapPtr))` — 16 B without
+  CHERI, 32 B with pure-capability CHERI/Morello — large enough to hold
+  the two pointers a free block must store.
+
+Both reps drive the same bin / range tree logic in `arena.h`; the bin
+classifier and bitmap in `arenabins.h` are shared.
+
+## Why this matters for metadata
+
+Slab metadata typically wants a pow2 client structure (e.g. a 128 B
+bitmap) plus a fixed ~32 B header. A buddy-based small range rounds
+`160 B → 256 B` (96 B wasted per slab). `SmallArenaRange` rounds to a unit
+multiple (`MIN_META_ALIGN`), so the same allocation costs ~160 B. Across
+many slabs and large heaps this is real memory.
+
+## Concrete example (B = 2, as shipped)
+
+At exponent `e = 2` the size classes are 4, 5, 6, 7, and there are 5 bins,
+each labeled by the set of sizes it can serve at this exponent:
+
+    Bin 0: serves {4}
+    Bin 1: serves {5}
+    Bin 2: serves {4, 5}
+    Bin 3: serves {4, 5, 6}
+    Bin 4: serves {4, 5, 6, 7}
+
+The per-request serve masks (within this exponent — higher exponents
+always serve, so their bits are set):
+
+    Request for 7: serve bins {4}
+    Request for 6: serve bins {3, 4}
+    Request for 5: serve bins {1, 2, 3, 4}
+    Request for 4: serve bins {0, 2, 3, 4}   — bin 1 holds only {5} blocks
+
+Only the size-4 request has an exception: bin 1 must not be picked. All
+other requests get the simple "everything at or above" mask.
+
+## Where to look in the code
+
+* `src/snmalloc/backend_helpers/arenabins.h` — bin classification, serve
+  masks, the non-empty-bins bitmap, the `carve` primitive.
+* `src/snmalloc/backend_helpers/arena.h` — bin-tree-per-bin + range-tree
+  structure, allocation and free / coalesce paths.
+* `src/snmalloc/backend_helpers/largearenarange.h` — `Arena<PagemapRep>`
+  for whole-chunk allocations.
+* `src/snmalloc/backend_helpers/smallarenarange.h`,
+  `inplacerep.h` — `Arena<InplaceRep>` for sub-chunk metadata.
+
+`prototype/skip_analysis.py` and `prototype/servable_sets.py` enumerate
+the bin scheme and verify the serve-mask construction for B ∈ {1, 2, 3}.
diff --git a/prototype/servable_sets.py b/prototype/servable_sets.py
new file mode 100644
index 000000000..3acdcff5d
--- /dev/null
+++ b/prototype/servable_sets.py
@@ -0,0 +1,351 @@
+#!/usr/bin/env python3
+"""
+Exhaustive analysis of servable sets for snmalloc's size classes.
+
+For every possible free block (offset, size) in a 512-chunk arena,
+compute which snmalloc size classes can be allocated from that block,
+respecting the natural alignment constraint:
+  align(S) = S & ~(S-1)   (largest power of 2 dividing S)
+
+A block at offset `a` with size `n` can serve size class `S` iff
+there exists an address `x` within [a, a+n-S] such that x is a
+multiple of align(S):
+  first_aligned = ceil(a / align(S)) * align(S)
+  servable iff first_aligned + S <= a + n
+"""
+
+ARENA = 512
+B = 2  # INTERMEDIATE_BITS
+
+
+def gen_size_classes(max_size):
+    """Return the sorted snmalloc size classes that are <= max_size.
+
+    Sizes below 2^B are each their own class (1, 2, 3 for B = 2); from
+    exponent e = B upward the classes are S = 2^e + m * 2^{e-B}.
+    """
+    # Derive the sub-2^B range and the starting exponent from B instead
+    # of hard-coding the B = 2 values, and bound every class by max_size.
+    classes = {s for s in range(1, 1 << B) if s <= max_size}
+    e = B
+    while (1 << e) <= max_size:
+        base = 1 << e
+        step = 1 << (e - B)
+        for m in range(1 << B):
+            s = base + m * step
+            if s <= max_size:
+                classes.add(s)
+        e += 1
+    return sorted(classes)
+
+
+def natural_align(x):
+    """Largest power of 2 dividing x. For 0, return a large value."""
+    # Zero is divisible by every power of two, so it is capped at 2^30;
+    # for any other x, isolate its lowest set bit.
+    return (x & -x) if x else 1 << 30
+
+
+def can_serve(addr, block_size, sizeclass):
+    """True iff [addr, addr + block_size) holds an aligned `sizeclass` run."""
+    alignment = natural_align(sizeclass)
+    start = -(-addr // alignment) * alignment  # addr rounded up to alignment
+    return start + sizeclass <= addr + block_size
+
+
+def get_exponent_mantissa(s):
+    """Return (exponent, mantissa) for size class s with B=2."""
+    # Sizes below 2^B do not follow the general formula; tabulate them.
+    tiny = {1: (0, 0), 2: (1, 0), 3: (1, 1)}
+    if s in tiny:
+        return tiny[s]
+    # Scan exponents upward from e = 2, testing the four mantissa slots;
+    # return None once the exponent's base exceeds 2 * s (not a class).
+    e = 2
+    while True:
+        base = 1 << e
+        step = 1 << (e - B)
+        for m in range(4):
+            if base + m * step == s:
+                return (e, m)
+        if base > s * 2:
+            return None
+        e += 1
+
+
+def main():
+    """Enumerate the servable set of every (offset, size) block in the
+    arena, then print the containment, per-exponent and threshold reports."""
+    size_classes = gen_size_classes(ARENA)
+    print(f"Size classes: {size_classes}")
+    print(f"Count: {len(size_classes)}")
+    print()
+
+    # Show alignment for each size class
+    print("Size class alignments:")
+    for sc in size_classes:
+        em = get_exponent_mantissa(sc)
+        print(f"  S={sc:>4d}  align={natural_align(sc):>4d}  (e={em[0]}, m={em[1]})")
+    print()
+
+    # ================================================================
+    # Step 1: Compute ALL unique servable sets
+    # ================================================================
+    all_sets = {}  # frozenset -> list of (addr, size) examples
+    for a in range(ARENA):
+        for n in range(1, ARENA - a + 1):
+            servable = frozenset(
+                sc for sc in size_classes if can_serve(a, n, sc)
+            )
+            all_sets.setdefault(servable, []).append((a, n))
+
+    # Sort by (cardinality, max element)
+    sorted_sets = sorted(
+        all_sets.keys(), key=lambda s: (len(s), max(s) if s else 0)
+    )
+
+    print(f"Total unique servable sets: {len(sorted_sets)}")
+    print()
+
+    # ================================================================
+    # Step 2: Show each unique servable set and its structure
+    # ================================================================
+    print("=" * 80)
+    print("ALL UNIQUE SERVABLE SETS")
+    print("=" * 80)
+    for i, s in enumerate(sorted_sets):
+        examples = all_sets[s][:3]
+        ex_str = ", ".join(f"(a={a},n={n})" for a, n in examples)
+        print(f"  #{i:>3d}  |{len(s):>3d} classes|  {sorted(s)}")
+        print(f"         examples: {ex_str}")
+    print()
+
+    # ================================================================
+    # Step 3: Analyse containment / subset structure
+    # ================================================================
+    print("=" * 80)
+    print("CONTAINMENT ANALYSIS")
+    print("=" * 80)
+    print()
+    print("For each set, what's new compared to its largest strict subset?")
+    print("Incomparable pairs are sets where neither is a subset of the other.")
+    print()
+
+    for i, s in enumerate(sorted_sets):
+        # Find strict subsets
+        subsets = [other for other in sorted_sets if other < s]
+        if subsets:
+            biggest_subset = max(subsets, key=len)
+            new = sorted(s - biggest_subset)
+        else:
+            new = sorted(s)
+
+        # Find incomparable sets (same cardinality, neither subset)
+        incomparable = []
+        for j in range(len(sorted_sets)):
+            other = sorted_sets[j]
+            if other == s:
+                continue
+            if not (other < s) and not (other > s) and len(other) == len(s):
+                incomparable.append(j)
+
+        # `incomparable` holds indices into sorted_sets, printed as #[...].
+        inc_str = f"  ** INCOMPARABLE with #{incomparable}" if incomparable else ""
+        print(f"  #{i:>3d}: +{new}  {inc_str}")
+
+    print()
+
+    # ================================================================
+    # Step 4: Group by exponent — show the 5-state structure
+    # ================================================================
+    print("=" * 80)
+    print("PER-EXPONENT STRUCTURE")
+    print("=" * 80)
+    print()
+    print("Within each exponent level, how many distinct states are there?")
+    print("A 'state' is a distinct subset of {m=0, m=1, m=2, m=3} that")
+    print("appears as the set of servable mantissas at that exponent.")
+    print()
+
+    max_exp = max(get_exponent_mantissa(sc)[0] for sc in size_classes)
+
+    for e in range(2, max_exp + 1):
+        # Size classes at this exponent
+        sizes_at_e = []
+        for m in range(4):
+            step = 1 << (e - B)
+            s = (1 << e) + m * step
+            if s <= ARENA:
+                sizes_at_e.append((m, s))
+
+        if not sizes_at_e:
+            continue
+
+        # For each servable set, extract which mantissas at exponent e are present
+        mantissa_subsets = set()
+        for s_set in sorted_sets:
+            present = frozenset(
+                m for m, sc in sizes_at_e if sc in s_set
+            )
+            if present:  # at least one mantissa servable
+                mantissa_subsets.add(present)
+
+        print(f"  Exponent e={e}: sizes {[s for _, s in sizes_at_e]}")
+        print(f"    Distinct mantissa subsets: {len(mantissa_subsets)}")
+        for ms in sorted(mantissa_subsets, key=lambda x: (len(x), sorted(x))):
+            label = ""
+            ms_sorted = sorted(ms)
+            if ms_sorted == [0]:
+                label = "A-only"
+            elif ms_sorted == [1]:
+                label = "B-only"
+            elif ms_sorted == [0, 1]:
+                label = "both"
+            elif ms_sorted == [0, 1, 2]:
+                label = "+m2"
+            elif ms_sorted == [0, 1, 2, 3]:
+                label = "+m3"
+            else:
+                label = "???"
+            print(f"      mantissas {str(ms_sorted):20s}  ({label})")
+
+        # Check for incomparable pairs
+        for ms1 in mantissa_subsets:
+            for ms2 in mantissa_subsets:
+                if ms1 != ms2 and not ms1 < ms2 and not ms2 < ms1:
+                    print(f"    ** Incomparable: {sorted(ms1)} vs {sorted(ms2)}")
+        print()
+
+    # ================================================================
+    # Step 5: Show the threshold formula
+    # ================================================================
+    print("=" * 80)
+    print("THRESHOLD ANALYSIS")
+    print("=" * 80)
+    print()
+    print("T(S, alpha) = S + max(0, align(S) - alpha)")
+    print("= minimum block size to serve S at block alignment alpha")
+    print()
+
+    for e in range(2, min(max_exp + 1, 6)):
+        print(f"  Exponent e={e}:")
+        for m in range(4):
+            step = 1 << (e - B)
+            s = (1 << e) + m * step
+            if s > ARENA:
+                break
+            a = natural_align(s)
+            print(f"    m={m}: S={s:>4d}, align={a:>4d}", end="")
+            # Show threshold at various block alignments
+            alphas = [1, 2, 4, 1 << e]
+            vals = []
+            for alpha in alphas:
+                t = s + max(0, a - alpha)
+                vals.append(f"T(α={alpha})={t}")
+            print(f"  {', '.join(vals)}")
+        print()
+
+    # ================================================================
+    # Step 6: Verify the key property
+    # ================================================================
+    print("=" * 80)
+    print("KEY PROPERTY VERIFICATION")
+    print("=" * 80)
+    print()
+    print("Checking: servable sets are almost totally ordered.")
+    print("For each exponent, there should be exactly 5 states")
+    print("with exactly 1 incomparable pair ({m=0} vs {m=1}).")
+    print()
+
+    all_ok = True
+    for e in range(2, max_exp + 1):
+        sizes_at_e = []
+        for m in range(4):
+            step = 1 << (e - B)
+            s = (1 << e) + m * step
+            if s <= ARENA:
+                sizes_at_e.append((m, s))
+
+        if len(sizes_at_e) < 4:
+            continue
+
+        mantissa_subsets = set()
+        for s_set in sorted_sets:
+            present = frozenset(m for m, sc in sizes_at_e if sc in s_set)
+            if present:
+                mantissa_subsets.add(present)
+
+        n_states = len(mantissa_subsets)
+        n_incomparable = 0
+        for ms1 in mantissa_subsets:
+            for ms2 in mantissa_subsets:
+                if ms1 < ms2 or ms2 < ms1 or ms1 == ms2:
+                    continue
+                n_incomparable += 1
+        n_incomparable //= 2  # each pair counted twice
+
+        ok = (n_states == 5 and n_incomparable == 1)
+        status = "OK" if ok else "FAIL"
+        if not ok:
+            all_ok = False
+        print(f"  e={e}: {n_states} states, {n_incomparable} incomparable pairs  [{status}]")
+
+    print()
+    if all_ok:
+        print("  ALL EXPONENTS HAVE EXACTLY 5 STATES WITH 1 INCOMPARABLE PAIR.")
+    else:
+        print("  SOME EXPONENTS DIFFER — check output above.")
+
+    # ================================================================
+    # Step 7: Show the two-bin split for m=1 with concrete examples
+    # ================================================================
+    print()
+    print("=" * 80)
+    print("THE TWO-BIN SPLIT: blocks of the same size go to different bins")
+    print("=" * 80)
+    print()
+    print("For each exponent, m=1 blocks are split into two bins based on")
+    print("whether they can also serve m=0 (the power-of-two size).")
+    print()
+
+    for e in range(2, min(max_exp + 1, 6)):
+        s0 = 1 << e                       # m=0 size
+        s1 = 5 * (1 << (e - B))           # m=1 size
+        a0 = natural_align(s0)
+        a1 = natural_align(s1)
+
+        print(f"  Exponent e={e}: m=0 is size {s0} (align {a0}), "
+              f"m=1 is size {s1} (align {a1})")
+
+        # Find concrete blocks of size s1 that can/cannot serve s0
+        bin_a_examples = []  # can serve both s0 and s1
+        bin_b_examples = []  # can serve s1 but NOT s0
+
+        for a in range(min(ARENA, 64)):
+            if can_serve(a, s1, s1):
+                if can_serve(a, s1, s0):
+                    if len(bin_a_examples) < 3:
+                        bin_a_examples.append((a, s1))
+                else:
+                    if len(bin_b_examples) < 3:
+                        bin_b_examples.append((a, s1))
+
+        # Show what each bin can serve
+        if bin_a_examples:
+            a_ex = bin_a_examples[0]
+            servable = sorted(sc for sc in size_classes if can_serve(a_ex[0], a_ex[1], sc))
+            ex_strs = ", ".join(f"(a={a},n={n})" for a, n in bin_a_examples)
+            print(f"    Bin A (serves {s0} AND {s1}): e.g. {ex_strs}")
+            print(f"      serves: {servable}")
+
+        if bin_b_examples:
+            b_ex = bin_b_examples[0]
+            servable = sorted(sc for sc in size_classes if can_serve(b_ex[0], b_ex[1], sc))
+            ex_strs = ", ".join(f"(a={a},n={n})" for a, n in bin_b_examples)
+            print(f"    Bin B (serves {s1} but NOT {s0}): e.g. {ex_strs}")
+            print(f"      serves: {servable}")
+        print()
+
+
+if __name__ == "__main__":
+    main()
diff --git a/prototype/skip_analysis.py b/prototype/skip_analysis.py
new file mode 100644
index 000000000..fe510677f
--- /dev/null
+++ b/prototype/skip_analysis.py
@@ -0,0 +1,226 @@
+#!/usr/bin/env python3
+"""
+Analyse the "skip" structure for different INTERMEDIATE_BITS values.
+
+Core question: when searching upward through bins for a size class,
+how many bins do you need to skip (bins that serve a larger size but
+not the one you want, due to alignment)?
+"""
+
+ARENA = 1024
+
+
+def natural_align(x):
+    """Largest power of two dividing x; 2^30 stands in for x == 0."""
+    lowest_set_bit = x & -x
+    return lowest_set_bit if x else 1 << 30
+
+
+def can_serve(addr, block_size, sizeclass):
+    """True iff the block holds a naturally aligned run of `sizeclass`."""
+    align = natural_align(sizeclass)
+    return -(-addr // align) * align + sizeclass <= addr + block_size
+
+
+def gen_size_classes(B, max_size):
+    """Return the sorted size classes <= max_size for B intermediate bits.
+
+    Sizes below 2^B are each their own class; from exponent e = B upward
+    the classes are S = 2^e + m * 2^(e-B) for m in [0, 2^B).
+    """
+    # Every integer below 2^B is a class. Deriving the range from B keeps
+    # B = 4 complete: a hand-written 1..7 list leaves out 8..15.
+    # These tiny classes are bounded by max_size like all the others.
+    classes = {s for s in range(1, 1 << B) if s <= max_size}
+    # Walk the exponents until the power-of-two base itself is too big.
+    e = B
+    while True:
+        base = 1 << e
+        if base > max_size:
+            break
+        step = 1 << (e - B)
+        for m in range(1 << B):
+            s = base + m * step
+            if s <= max_size:
+                classes.add(s)
+        e += 1
+    return sorted(classes)
+
+
+def analyse(B):
+    """Print the per-exponent mantissa states and the bins that must be
+    skipped when searching upward, for INTERMEDIATE_BITS = B."""
+    M = 1 << B  # mantissas per exponent
+    size_classes = gen_size_classes(B, ARENA)
+    print(f"{'='*80}")
+    print(f"INTERMEDIATE_BITS = {B}  ({M} mantissas per exponent)")
+    print(f"{'='*80}")
+    print(f"Size classes: {size_classes[:30]}{'...' if len(size_classes)>30 else ''}")
+    print()
+
+    # Show alignment pattern for one exponent
+    e = max(B + 2, 4)  # pick an exponent where sizes aren't tiny
+    print(f"  Alignment pattern at exponent e={e}:")
+    for m in range(M):
+        step = 1 << (e - B)
+        s = (1 << e) + m * step
+        a = natural_align(s)
+        print(f"    m={m}: size={s:>4d}  align={a:>4d}  (coefficient {s >> (e-B)} = {s // (1 << (e-B))})")
+    print()
+
+    # Compute all unique servable sets
+    all_sets = set()
+    for a in range(ARENA):
+        for n in range(1, ARENA - a + 1):
+            servable = frozenset(
+                sc for sc in size_classes if can_serve(a, n, sc)
+            )
+            all_sets.add(servable)
+
+    sorted_sets = sorted(all_sets, key=lambda s: (len(s), max(s) if s else 0))
+
+    # Per-exponent analysis
+    max_exp = 1
+    for sc in size_classes:
+        ee = B
+        while (1 << ee) <= sc:
+            ee += 1
+        ee -= 1
+        if ee >= B:
+            max_exp = max(max_exp, ee)
+
+    print(f"  Per-exponent mantissa state analysis:")
+    print()
+
+    for e in range(B, max_exp + 1):
+        sizes_at_e = []
+        for m in range(M):
+            step = 1 << (e - B)
+            s = (1 << e) + m * step
+            if s <= ARENA:
+                sizes_at_e.append((m, s))
+
+        if len(sizes_at_e) < M:
+            continue
+
+        # For each servable set, extract which mantissas at this exponent are present
+        mantissa_subsets = set()
+        for s_set in sorted_sets:
+            present = frozenset(m for m, sc in sizes_at_e if sc in s_set)
+            if present:
+                mantissa_subsets.add(present)
+
+        # Count incomparable pairs
+        incomparable_pairs = []
+        ms_list = sorted(mantissa_subsets, key=lambda x: (len(x), sorted(x)))
+        for i, ms1 in enumerate(ms_list):
+            for ms2 in ms_list[i+1:]:
+                if not ms1 < ms2 and not ms2 < ms1:
+                    incomparable_pairs.append((sorted(ms1), sorted(ms2)))
+
+        print(f"  Exponent e={e}: sizes {[s for _, s in sizes_at_e]}")
+        print(f"    {len(mantissa_subsets)} distinct states, {len(incomparable_pairs)} incomparable pair(s)")
+
+        for ms in sorted(mantissa_subsets, key=lambda x: (len(x), sorted(x))):
+            print(f"      {sorted(ms)}")
+
+        if incomparable_pairs:
+            for p in incomparable_pairs:
+                print(f"    ** Incomparable: {p[0]} vs {p[1]}")
+        print()
+
+    # The key analysis: for each size class, which bins must be SKIPPED?
+    print(f"  SKIP ANALYSIS: when searching for size S, which larger-size bins")
+    print(f"  might contain blocks that can't serve S?")
+    print()
+
+    for e in range(B, min(max_exp + 1, B + 4)):
+        sizes_at_e = []
+        for m in range(M):
+            step = 1 << (e - B)
+            s = (1 << e) + m * step
+            if s <= ARENA:
+                sizes_at_e.append((m, s))
+
+        if len(sizes_at_e) < M:
+            continue
+
+        print(f"  Exponent e={e}:")
+
+        for m_req, s_req in sizes_at_e:
+            a_req = natural_align(s_req)
+
+            # For each larger size at same exponent, check if it can always serve s_req
+            skips = []
+            for m_other, s_other in sizes_at_e:
+                if s_other <= s_req:
+                    continue
+                # Can a block of size s_other sometimes NOT serve s_req?
+                # The block could have any alignment, so instead of using
+                # the worst-case threshold
+                #   T(s_req, alpha=1) = s_req + align(s_req) - 1
+                # try the first 64 start addresses directly; one address
+                # that fails to serve s_req is enough to record a skip.
+                can_sometimes_not = False
+                for addr in range(min(ARENA, 64)):
+                    if can_serve(addr, s_other, s_other):  # valid block
+                        if not can_serve(addr, s_other, s_req):
+                            can_sometimes_not = True
+                            break
+
+                if can_sometimes_not:
+                    skips.append((m_other, s_other))
+
+            if skips:
+                skip_str = ", ".join(f"m={m}(size {s})" for m, s in skips)
+                print(f"    Requesting m={m_req} (size {s_req}, align {a_req}): "
+                      f"must skip: {skip_str}")
+            else:
+                print(f"    Requesting m={m_req} (size {s_req}, align {a_req}): "
+                      f"no skips needed")
+        print()
+
+    # Summary: how many skips total per exponent?
+    print(f"  SUMMARY: skips needed per request at each exponent")
+    print()
+
+    for e in range(B, min(max_exp + 1, B + 4)):
+        sizes_at_e = []
+        for m in range(M):
+            step = 1 << (e - B)
+            s = (1 << e) + m * step
+            if s <= ARENA:
+                sizes_at_e.append((m, s))
+
+        if len(sizes_at_e) < M:
+            continue
+
+        # Largest number of same-exponent sizes any one request must skip.
+        max_skips_per_request = 0
+
+        for m_req, s_req in sizes_at_e:
+            skips = 0
+            for m_other, s_other in sizes_at_e:
+                if s_other <= s_req:
+                    continue
+                for addr in range(min(ARENA, 64)):
+                    if can_serve(addr, s_other, s_other):
+                        if not can_serve(addr, s_other, s_req):
+                            skips += 1
+                            break
+
+            # `skips` counts the larger sizes that can fail to serve s_req.
+            max_skips_per_request = max(max_skips_per_request, skips)
+
+        print(f"    e={e}: max skips for any single request = {max_skips_per_request}")
+
+
+def main():
+    """Run the skip analysis for INTERMEDIATE_BITS = 1 through 4."""
+    for bits in range(1, 5):
+        analyse(bits)
+        print("\n")  # two blank lines between consecutive reports
+
+
+if __name__ == "__main__":
+    main()
diff --git a/src/snmalloc/README.md b/src/snmalloc/README.md
index 2549320fb..f598f8171 100644
--- a/src/snmalloc/README.md
+++ b/src/snmalloc/README.md
@@ -20,7 +20,7 @@ These are arranged in a hierarchy such that each of the directories may include
  - `mem/` provides the core allocator abstractions.
    The code here is templated over a back-end, which defines a particular embedding of snmalloc.
  - `backend_helpers/` provides helper classes for use in defining a back end.
-   This includes data structures such as pagemap implementations (efficient maps from a chunk address to associated metadata) and buddy allocators for managing address-space ranges.
+   This includes data structures such as pagemap implementations (efficient maps from a chunk address to associated metadata) and range allocators for managing address-space ranges.
  - `backend/` provides some example implementations for snmalloc embeddings that provide a global memory allocator for an address space.
    Users may ignore this entirely and use the types in `mem/` with a custom back end to expose an snmalloc instance with specific behaviour.
    Layers above this can be used with a custom configuration by defining `SNMALLOC_PROVIDE_OWN_CONFIG` and exporting a type as `snmalloc::Config` that defines the configuration.
diff --git a/src/snmalloc/backend/backend.h b/src/snmalloc/backend/backend.h
index 2772cf319..5f8a0aca6 100644
--- a/src/snmalloc/backend/backend.h
+++ b/src/snmalloc/backend/backend.h
@@ -23,7 +23,27 @@ namespace snmalloc
     using Pal = PAL;
     using SlabMetadata = typename PagemapEntry::SlabMetadata;
 
-  public:
+    /**
+     * Size that a metadata request of `size` bytes is rounded to before
+     * it is handed to the meta range.
+     *
+     * - First pad to `LocalState::MIN_META_ALIGN`, the granularity the
+     *   in-band small meta range (`SmallArenaRange`) accepts.
+     * - A padded size of at least `MIN_CHUNK_SIZE` bypasses the small
+     *   range and goes to the parent `LargeArenaRange`, which requires
+     *   `MIN_CHUNK_SIZE` alignment, so round up again to satisfy that.
+     *
+     * Every alloc and dealloc site MUST go through this helper so that a
+     * chunk's metadata is freed at the same size it was allocated.
+     */
+    SNMALLOC_FAST_PATH static size_t meta_size_round(size_t size)
+    {
+      size_t padded = bits::align_up(size, LocalState::MIN_META_ALIGN);
+      if (padded < MIN_CHUNK_SIZE)
+        return padded;
+      return bits::align_up(padded, MIN_CHUNK_SIZE);
+    }
+
     /**
      * Provide a block of meta-data with size and align.
      *
@@ -46,7 +66,8 @@ namespace snmalloc
 
       if (local_state != nullptr)
       {
-        p = local_state->get_meta_range().alloc_range_with_leftover(size);
+        auto& meta_range = local_state->get_meta_range();
+        p = meta_range.alloc_range(meta_size_round(size));
       }
       else
       {
@@ -54,7 +75,7 @@ namespace snmalloc
           GlobalMetaRange::ConcurrencySafe,
           "Global meta data range needs to be concurrency safe.");
         GlobalMetaRange global_state;
-        p = global_state.alloc_range(bits::next_pow2(size));
+        p = global_state.alloc_range(meta_size_round(size));
       }
 
       if (p == nullptr)
@@ -92,13 +113,21 @@ namespace snmalloc
       uintptr_t ras,
       sizeclass_t sizeclass)
     {
-      SNMALLOC_ASSERT(bits::is_pow2(size));
+      // `size` must be a positive multiple of the sizeclass's slab
+      // tile size: the pagemap loop below writes one entry per
+      // `slab_size` stride and must terminate exactly at `size`.
+      // Front-end callers satisfy this by construction because they
+      // pass `sizeclass_full_to_size(sizeclass)`, whose largest pow2
+      // divisor is `sizeclass_full_to_slab_size(sizeclass)`.
+      const size_t slab_size = sizeclass_full_to_slab_size(sizeclass);
+      SNMALLOC_ASSERT(size >= slab_size);
+      SNMALLOC_ASSERT((size & (slab_size - 1)) == 0);
       SNMALLOC_ASSERT(size >= MIN_CHUNK_SIZE);
 
       // Calculate the extra bytes required to store the client meta-data.
       size_t extra_bytes = SlabMetadata::get_extra_bytes(sizeclass);
 
-      auto meta_size = bits::next_pow2(sizeof(SlabMetadata) + extra_bytes);
+      auto meta_size = meta_size_round(sizeof(SlabMetadata) + extra_bytes);
 
 #ifdef SNMALLOC_TRACING
       message<1024>(
@@ -128,8 +157,28 @@ namespace snmalloc
         return {nullptr, nullptr};
       }
 
-      typename Pagemap::Entry t(meta, ras);
-      Pagemap::set_metaentry(address_cast(p), size, t);
+      // `slab_size` was computed and asserted against `size` at the
+      // top of `alloc_chunk`. `size = k * slab_size` for some integer
+      // `k >= 1`; each slab tile gets the same
+      // `ras | (slab_index << SIZECLASS_BITS)` entry, written in one
+      // `set_metaentry` call.
+      // The OR below assumes the per-chunk-offset bits of `ras` are
+      // zero; `MetaEntryBase::encode` defaults offset to 0, and the
+      // backend is the only place per-chunk offsets are written.
+      SNMALLOC_ASSERT(
+        (ras & (((size_t{1} << OFFSET_BITS) - 1) << SIZECLASS_BITS)) == 0);
+      for (size_t chunk_offset = 0; chunk_offset < size;
+           chunk_offset += slab_size)
+      {
+        const size_t slab_index = chunk_offset / slab_size;
+        // `compute_max_large_slab_index() < (1 << OFFSET_BITS)` is
+        // static_asserted in sizeclasstable.h; this asserts the
+        // arithmetic that derives `slab_index` from `size`/`slab_size`.
+        SNMALLOC_ASSERT(slab_index < (size_t{1} << OFFSET_BITS));
+        const uintptr_t ras_i = ras | (slab_index << SIZECLASS_BITS);
+        typename Pagemap::Entry t_i(meta, ras_i);
+        Pagemap::set_metaentry(address_cast(p) + chunk_offset, slab_size, t_i);
+      }
 
       return {Aal::capptr_bound<void, capptr::bounds::Chunk>(p, size), meta};
     }
@@ -178,7 +227,7 @@ namespace snmalloc
       // Calculate the extra bytes required to store the client meta-data.
       size_t extra_bytes = SlabMetadata::get_extra_bytes(sizeclass);
 
-      auto meta_size = bits::next_pow2(sizeof(SlabMetadata) + extra_bytes);
+      auto meta_size = meta_size_round(sizeof(SlabMetadata) + extra_bytes);
       local_state.get_meta_range().dealloc_range(
         capptr::Arena<void>::unsafe_from(&slab_metadata), meta_size);
 
diff --git a/src/snmalloc/backend/fixedglobalconfig.h b/src/snmalloc/backend/fixedglobalconfig.h
index 5bd3b68b5..68b41f860 100644
--- a/src/snmalloc/backend/fixedglobalconfig.h
+++ b/src/snmalloc/backend/fixedglobalconfig.h
@@ -39,10 +39,17 @@ namespace snmalloc
       {
         return Aal::capptr_rebound(arena, c);
       }
+
+      template<bool potentially_out_of_range = false>
+      static SNMALLOC_FAST_PATH capptr::Arena<void>
+      amplify_from_address(address_t a)
+      {
+        return pointer_offset(arena, a - address_cast(arena));
+      }
     };
 
   public:
-    using LocalState = StandardLocalState<PAL, Pagemap>;
+    using LocalState = StandardLocalState<PAL, Pagemap, Authmap>;
 
     using GlobalPoolState = PoolState<Allocator<FixedRangeConfig>>;
 
@@ -86,7 +93,7 @@ namespace snmalloc
         Pagemap::concretePagemap.init(base, length);
 
       // Make this a alloc_config constant.
-      if (length < MIN_HEAP_SIZE_FOR_THREAD_LOCAL_BUDDY)
+      if (length < MIN_HEAP_SIZE_FOR_THREAD_LOCAL_CACHE)
       {
         LocalState::set_small_heap();
       }
diff --git a/src/snmalloc/backend/globalconfig.h b/src/snmalloc/backend/globalconfig.h
index 208210b65..9bdada06c 100644
--- a/src/snmalloc/backend/globalconfig.h
+++ b/src/snmalloc/backend/globalconfig.h
@@ -68,8 +68,8 @@ namespace snmalloc
      */
     using LocalState = stl::conditional_t<
       mitigations(metadata_protection),
-      MetaProtectedRangeLocalState<Pal, Pagemap, Base>,
-      StandardLocalState<Pal, Pagemap, Base>>;
+      MetaProtectedRangeLocalState<Pal, Pagemap, Authmap, Base>,
+      StandardLocalState<Pal, Pagemap, Authmap, Base>>;
 
     /**
      * Use the default backend.
diff --git a/src/snmalloc/backend/meta_protected_range.h b/src/snmalloc/backend/meta_protected_range.h
index 857e853d2..76916e82f 100644
--- a/src/snmalloc/backend/meta_protected_range.h
+++ b/src/snmalloc/backend/meta_protected_range.h
@@ -24,6 +24,7 @@ namespace snmalloc
   template<
     typename PAL,
     typename Pagemap,
+    typename Authmap,
     typename Base,
     size_t MinSizeBits = MinBaseSizeBits<PAL>()>
   struct MetaProtectedRangeLocalState : BaseLocalStateConstants
@@ -32,7 +33,7 @@ namespace snmalloc
     // Global range of memory
     using GlobalR = Pipe<
       Base,
-      LargeBuddyRange<
+      LargeArenaRange<
         GlobalCacheSizeBits,
         bits::BITS - 1,
         Pagemap,
@@ -51,7 +52,7 @@ namespace snmalloc
     // would be able to corrupt meta-data.
     using CentralObjectRange = Pipe<
       GlobalR,
-      LargeBuddyRange<GlobalCacheSizeBits, bits::BITS - 1, Pagemap>,
+      LargeArenaRange<GlobalCacheSizeBits, bits::BITS - 1, Pagemap>,
       LogRange<3>,
       GlobalRange,
       CommitRange<PAL>,
@@ -67,7 +68,7 @@ namespace snmalloc
       GlobalR,
       SubRange<PAL, SubRangeRatioBits>, // Use SubRange to introduce guard
                                         // pages.
-      LargeBuddyRange<
+      LargeArenaRange<
         GlobalCacheSizeBits,
         bits::BITS - 1,
         Pagemap,
@@ -77,7 +78,7 @@ namespace snmalloc
       // page, so commit in the global range.
       stl::conditional_t<
         (max_page_chunk_size_bits > MIN_CHUNK_BITS),
-        LargeBuddyRange<
+        LargeArenaRange<
           max_page_chunk_size_bits,
           max_page_chunk_size_bits,
           Pagemap,
@@ -90,7 +91,7 @@ namespace snmalloc
     // Local caching of object range
     using ObjectRange = Pipe<
       CentralObjectRange,
-      LargeBuddyRange<
+      LargeArenaRange<
         LocalCacheSizeBits,
         LocalCacheSizeBits,
         Pagemap,
@@ -100,17 +101,23 @@ namespace snmalloc
     // Local caching of meta-data range
     using MetaRange = Pipe<
       CentralMetaRange,
-      LargeBuddyRange<
+      LargeArenaRange<
         LocalCacheSizeBits - SubRangeRatioBits,
         bits::BITS - 1,
         Pagemap>,
-      SmallBuddyRange>;
+      SmallArenaRange<Authmap>>;
 
     ObjectRange object_range;
 
     MetaRange meta_range;
 
   public:
+    /// Granularity of the local meta range. Backend rounds metadata
+    /// allocation sizes up to this; replaces pow2 rounding.
+    static constexpr size_t MIN_META_ALIGN = MetaRange::UNIT_SIZE;
+    static_assert(
+      bits::is_pow2(MIN_META_ALIGN), "MIN_META_ALIGN must be a power of two");
+
     using Stats = StatsCombiner<CentralObjectRange, CentralMetaRange>;
 
     ObjectRange* get_object_range()
@@ -124,9 +131,9 @@ namespace snmalloc
     }
 
     // Create global range that can service small meta-data requests.
-    // Don't want to add the SmallBuddyRange to the CentralMetaRange as that
+    // Don't want to add the SmallArenaRange to the CentralMetaRange as that
     // would require committing memory inside the main global lock.
     using GlobalMetaRange =
-      Pipe<CentralMetaRange, SmallBuddyRange, GlobalRange>;
+      Pipe<CentralMetaRange, SmallArenaRange<Authmap>, GlobalRange>;
   };
 } // namespace snmalloc
diff --git a/src/snmalloc/backend/standard_range.h b/src/snmalloc/backend/standard_range.h
index 78609ed2d..f46e6085d 100644
--- a/src/snmalloc/backend/standard_range.h
+++ b/src/snmalloc/backend/standard_range.h
@@ -22,6 +22,7 @@ namespace snmalloc
   template<
     typename PAL,
     typename Pagemap,
+    typename Authmap,
     typename Base = EmptyRange<>,
     size_t MinSizeBits = MinBaseSizeBits<PAL>()>
   struct StandardLocalState : BaseLocalStateConstants
@@ -29,7 +30,7 @@ namespace snmalloc
     // Global range of memory, expose this so can be filled by init.
     using GlobalR = Pipe<
       Base,
-      LargeBuddyRange<
+      LargeArenaRange<
         GlobalCacheSizeBits,
         bits::BITS - 1,
         Pagemap,
@@ -45,29 +46,35 @@ namespace snmalloc
       bits::next_pow2_bits_const(PAL::page_size);
 
   public:
-    // Source for object allocations and metadata
-    // Use buddy allocators to cache locally.
+    // Source for object allocations and metadata; thread-local cache
+    // for chunk-sized ranges.
     using LargeObjectRange = Pipe<
       Stats,
-      StaticConditionalRange<LargeBuddyRange<
+      StaticConditionalRange<LargeArenaRange<
         LocalCacheSizeBits,
         LocalCacheSizeBits,
         Pagemap,
         page_size_bits>>>;
 
   private:
-    using ObjectRange = Pipe<LargeObjectRange, SmallBuddyRange>;
+    using ObjectRange = Pipe<LargeObjectRange, SmallArenaRange<Authmap>>;
 
     ObjectRange object_range;
 
   public:
+    /// Granularity of the local meta range. Backend rounds metadata
+    /// allocation sizes up to this; replaces pow2 rounding.
+    static constexpr size_t MIN_META_ALIGN = ObjectRange::UNIT_SIZE;
+    static_assert(
+      bits::is_pow2(MIN_META_ALIGN), "MIN_META_ALIGN must be a power of two");
+
     // Expose a global range for the initial allocation of meta-data.
     using GlobalMetaRange = Pipe<ObjectRange, GlobalRange>;
 
     /**
      * Where we turn for allocations of user chunks.
      *
-     * Reach over the SmallBuddyRange that's at the near end of the ObjectRange
+     * Reach over the SmallArenaRange that's at the near end of the ObjectRange
      * pipe, rather than having that range adapter dynamically branch to its
      * parent.
      */
diff --git a/src/snmalloc/backend_helpers/arena.h b/src/snmalloc/backend_helpers/arena.h
new file mode 100644
index 000000000..4330637a7
--- /dev/null
+++ b/src/snmalloc/backend_helpers/arena.h
@@ -0,0 +1,422 @@
+#pragma once
+
+#include "../ds_core/redblacktree.h"
+#include "../ds_core/sizeclassconfig.h"
+#include "../stl/array.h"
+#include "../stl/utility.h"
+#include "arenabins.h"
+
+#include <stddef.h>
+#include <stdint.h>
+
+namespace snmalloc
+{
+  struct ArenaTestAccess;
+
+  /**
+   * Size encoding for a free block's first pagemap entry, in Arena units.
+   * Min:     exactly 1 unit (no range-tree entry).
+   * EvenTwo: exactly 2 units, 2-unit aligned; can serve size-2 requests.
+   * OddTwo:  exactly 2 units, NOT 2-unit aligned; in range tree but
+   *          placed in a size-1 bin (cannot serve aligned size-2 requests).
+   * Large:   3+ units; precise size stored in a separate entry.
+   */
+  enum class ArenaVariant : uint8_t
+  {
+    Min = 0,
+    EvenTwo = 1,
+    OddTwo = 2,
+    Large = 3
+  };
+
+  /**
+   * Manages free ranges within a single bounded arena using a dual-tree
+   * scheme: a set of bin trees indexed by the floor-log2 size class
+   * (used for allocation lookup) and one range tree keyed by address
+   * (used for consolidation of adjacent free ranges).
+   *
+   * `Rep` is the representation. It owns *all* storage and bit-layout
+   * decisions for tree nodes and per-block metadata. `Rep` must provide:
+   *
+   *   - `using BinRep`  — full RBTree Rep for the bin trees, supplying
+   *     `Handle`, `Contents`, `null`, `root`, `ref`, `get`, `set`,
+   *     `is_red`, `set_red`, `compare`, `equal`, `printable`, `name`.
+   *     Owns its own red-bit packing privately.
+   *   - `using RangeRep` — full RBTree Rep for the range tree, same
+   *     shape as `BinRep`.
+   *   - `get_variant(addr)` / `set_variant(addr, v)` — the
+   *     `ArenaVariant` tag for the block starting at `addr`.
+   *   - `get_large_size(addr)` / `set_large_size(addr, size)` —
+   *     exact byte size for `Large` blocks (3+ units).
+   *   - `can_consolidate(higher_addr) -> bool` — whether the block at
+   *     `higher_addr` may be merged with the block immediately below
+   *     it. Returns false at allocation boundaries that must be
+   *     preserved.
+   *
+   * `MIN_SIZE_BITS`: log2 of the unit of allocation (= the minimum
+   * block size in bytes). All addresses and sizes managed by this
+   * arena are multiples of `1 << MIN_SIZE_BITS`.
+   *
+   * `MAX_SIZE_BITS`: log2 of the (exclusive) upper bound on managed
+   * block sizes. Blocks that reach this size overflow and are
+   * returned to the caller.
+   */
+  template<typename Rep, size_t MIN_SIZE_BITS, size_t MAX_SIZE_BITS>
+  class Arena
+  {
+    static_assert(MAX_SIZE_BITS > MIN_SIZE_BITS);
+    static_assert(MAX_SIZE_BITS < bits::BITS);
+    static_assert(MIN_SIZE_BITS < bits::BITS);
+
+    static constexpr size_t UNIT_SIZE = size_t(1) << MIN_SIZE_BITS;
+    static constexpr size_t TWO_UNITS = size_t(2) << MIN_SIZE_BITS;
+
+    static constexpr size_t B = 2;
+    using Bins = ArenaBins<B, MIN_SIZE_BITS>;
+
+    static_assert(
+      bits::one_at_bit(MAX_SIZE_BITS) - 1 <= Bins::max_supported_size());
+
+    using BinRep = typename Rep::BinRep;
+    using RangeRep = typename Rep::RangeRep;
+
+    using BinTree = RBTree<BinRep>;
+    using RangeTree = RBTree<RangeRep>;
+
+    stl::Array<BinTree, Bins::Bitmap::TOTAL_BINS> bin_trees{};
+    RangeTree range_tree{};
+    typename Bins::Bitmap bitmap{};
+
+    // ---- Metadata helpers ----
+
+    static ArenaVariant variant_of(size_t size, uintptr_t addr)
+    {
+      if (size == UNIT_SIZE)
+        return ArenaVariant::Min;
+      if (size != TWO_UNITS)
+        return ArenaVariant::Large;
+      const bool odd_unit = ((addr >> MIN_SIZE_BITS) & 1) != 0;
+      return odd_unit ? ArenaVariant::OddTwo : ArenaVariant::EvenTwo;
+    }
+
+    static stl::Pair<uintptr_t, size_t> range_from_addr(uintptr_t a)
+    {
+      // Address 0 stands for "no block" and decodes to the empty range.
+      if (a == 0)
+        return {0, 0};
+
+      const auto tag = Rep::get_variant(a);
+      if (tag == ArenaVariant::Min)
+        return {a, UNIT_SIZE};
+      if (tag == ArenaVariant::EvenTwo || tag == ArenaVariant::OddTwo)
+        return {a, TWO_UNITS};
+      if (tag == ArenaVariant::Large)
+      {
+        // Only Large blocks keep an explicit size in the representation.
+        const size_t stored = Rep::get_large_size(a);
+        SNMALLOC_ASSERT(
+          stored > TWO_UNITS && stored < bits::one_at_bit(MAX_SIZE_BITS) &&
+          bits::align_down(stored, UNIT_SIZE) == stored);
+        return {a, stored};
+      }
+      // No known tag matched.
+      SNMALLOC_ASSERT(false);
+      return {0, 0};
+    }
+
+    bool contains_min(uintptr_t a)
+    {
+      auto cursor = bin_trees[0].get_root_path();
+      const bool in_first_bin = bin_trees[0].find(cursor, a);
+      return in_first_bin && Rep::get_variant(a) == ArenaVariant::Min;
+    }
+
+    void insert_block(uintptr_t addr, size_t size)
+    {
+      // Stamp the metadata first; the exact size is stored for 3+ units only.
+      Rep::set_variant(addr, variant_of(size, addr));
+      if (size > TWO_UNITS)
+        Rep::set_large_size(addr, size);
+      // Index by bin always, and by address unless the block is one unit.
+      const size_t slot = bitmap.add(typename Bins::range_t{addr, size});
+      bin_trees[slot].insert_elem(addr);
+      if (size >= TWO_UNITS)
+        range_tree.insert_elem(addr);
+    }
+
+    void unlink_block(uintptr_t addr, size_t size)
+    {
+      // The bin is recomputed from (addr, size); it is not stored per block.
+      const size_t slot = Bins::bin_index(typename Bins::range_t{addr, size});
+      bin_trees[slot].remove_elem(addr);
+      if (size >= TWO_UNITS)
+        range_tree.remove_elem(addr);
+      if (bin_trees[slot].is_empty())
+        bitmap.clear(slot);
+    }
+
+    friend struct ArenaTestAccess;
+
+  public:
+    using addr_t = uintptr_t;
+
+    constexpr Arena() = default;
+
+    /**
+     * Add a free block at `addr` with `size` bytes. The block is
+     * consolidated with any adjacent free neighbours. Returns
+     * `{0, 0}` on success. If consolidation produces a block whose
+     * size reaches `2^MAX_SIZE_BITS` bytes (the exclusive upper bound
+     * on representable block sizes), the block is not inserted;
+     * returns `{consolidated_addr, consolidated_size}` so the caller
+     * can return it to a parent range.
+     */
+    stl::Pair<addr_t, size_t> add_block(addr_t addr, size_t size)
+    {
+      check_invariant();
+      SNMALLOC_ASSERT(addr != 0);
+      // Unit alignment is required: callers feeding parent ranges (e.g.
+      // mmap-backed PalRange returns page-aligned but not chunk-aligned
+      // memory) must trim their input to UNIT_SIZE before reaching here.
+      // LargeArenaRange::add_range does this trim.
+      SNMALLOC_ASSERT((addr & (UNIT_SIZE - 1)) == 0);
+      SNMALLOC_ASSERT(size > 0);
+      SNMALLOC_ASSERT((size & (UNIT_SIZE - 1)) == 0);
+      SNMALLOC_ASSERT(size < bits::one_at_bit(MAX_SIZE_BITS));
+
+      uintptr_t c_addr = addr;
+      size_t c_size = size;
+
+      auto merge = [&](uintptr_t n_addr, size_t n_size) {
+        unlink_block(n_addr, n_size);
+        if (n_addr < c_addr)
+          c_addr = n_addr;
+        c_size += n_size;
+      };
+
+      // Check range tree for non-min neighbours.
+      auto [p_key, s_key] = range_tree.neighbours(addr);
+
+      // Predecessor: check range tree, then fall back to min-size bin.
+      auto [pa, ps] = range_from_addr(p_key);
+      if (pa + ps == addr && Rep::can_consolidate(addr))
+        merge(pa, ps);
+      else if (
+        addr >= UNIT_SIZE && Rep::can_consolidate(addr) &&
+        contains_min(addr - UNIT_SIZE))
+        merge(addr - UNIT_SIZE, UNIT_SIZE);
+
+      // Successor: check range tree, then fall back to min-size bin.
+      // `ss != 0` rejects the {0, 0} "no successor" sentinel, which would
+      // otherwise equal a `succ_addr` that wrapped to 0. `can_consolidate`
+      // reads succ_addr's pagemap entry, which is only known to exist once
+      // a tree lookup confirms succ_addr is in our region (it can be one
+      // past the registered range), so the tree check gates that read.
+      auto [sa, ss] = range_from_addr(s_key);
+      uintptr_t succ_addr = addr + size;
+      if (ss != 0 && sa == succ_addr && Rep::can_consolidate(succ_addr))
+        merge(sa, ss);
+      else if (
+        succ_addr > addr && contains_min(succ_addr) &&
+        Rep::can_consolidate(succ_addr))
+        merge(succ_addr, UNIT_SIZE);
+
+      // Arena-scale overflow: consolidated block spans the full arena.
+      if (c_size >= bits::one_at_bit(MAX_SIZE_BITS))
+        return {c_addr, c_size};
+
+      // Insert consolidated block.
+      insert_block(c_addr, c_size);
+
+      check_invariant();
+      return {0, 0};
+    }
+
+    /**
+     * Remove exactly `size` bytes. Returns the address on success or
+     * 0 if nothing fits. SC rounding is internal: the arena may
+     * locate a larger free region but only the requested `size` is
+     * handed out — any pre/post remainders from the carve are
+     * re-inserted directly via `insert_block` (no consolidation pass).
+     */
+    addr_t remove_block(size_t size)
+    {
+      check_invariant();
+      if (size == 0)
+        return 0;
+
+      if (size > Bins::max_supported_size())
+        return 0;
+
+      SNMALLOC_ASSERT((size & (UNIT_SIZE - 1)) == 0);
+
+      size_t bin_id = bitmap.find_for_request(size);
+      if (bin_id == SIZE_MAX)
+        return 0;
+
+      // remove_min is expected to return the lowest-address entry (the
+      // Rep's compare orders by address). The size is decoded after the
+      // removal, which relies on removal leaving the node's metadata intact.
+      uintptr_t block_addr = bin_trees[bin_id].remove_min();
+      auto [_, block_size] = range_from_addr(block_addr);
+      (void)_;
+
+      if (block_size >= TWO_UNITS)
+        range_tree.remove_elem(block_addr);
+
+      if (bin_trees[bin_id].is_empty())
+        bitmap.clear(bin_id);
+
+      // Carve the requested size from the block.
+      auto carved = Bins::carve({block_addr, block_size}, size);
+
+      // Re-insert non-empty remainders with plain `insert_block`: by the
+      // maximally-consolidated invariant they have no free neighbours.
+      if (carved.pre.size != 0)
+      {
+        insert_block(carved.pre.base, carved.pre.size);
+      }
+
+      if (carved.post.size != 0)
+      {
+        insert_block(carved.post.base, carved.post.size);
+      }
+
+      check_invariant();
+      return carved.req.base;
+    }
+
+    /**
+     * Structural invariant. Runs when `enabled` is true; defaults to
+     * `Debug` so in-tree callers compile away in Release while tests
+     * can opt in by passing `true` explicitly. Uses `SNMALLOC_CHECK`
+     * rather than `SNMALLOC_ASSERT` so that test-driven invocations
+     * are checked even under NDEBUG.
+     *
+     * Five clauses are verified:
+     *  1. Maximally consolidated — no adjacent free blocks could be
+     *     merged: (a) no two non-min range-tree entries touch across
+     *     a consolidatable boundary, (b) no non-min entry touches a
+     *     min entry, (c) no two min entries are adjacent.
+     *  2. Cross-tree consistency — every range-tree entry is found in
+     *     the bin tree its size selects, every non-min bin-tree entry is
+     *     found in the range tree, and the two entry counts agree.
+     *  3. Bin classification — every bin-tree entry sits in the bin
+     *     its size selects.
+     *  4. Bitmap consistency — the non-empty bin bit is set iff the
+     *     corresponding bin tree has entries.
+     *  5. Variant-tag consistency — each entry's pagemap variant tag
+     *     matches the tag implied by its address and size, and Large
+     *     variant entries carry the correct stored size.
+     */
+    void check_invariant(bool enabled = Debug)
+    {
+      if (!enabled)
+        return;
+
+      // 1a. No two adjacent non-min blocks (unless boundary prevents merge).
+      {
+        uintptr_t prev_addr = 0;
+        size_t prev_size = 0;
+        bool prev_valid = false;
+        range_tree.for_each([&](uintptr_t node) {
+          auto [a, s] = range_from_addr(node);
+          if (prev_valid)
+          {
+            uintptr_t prev_end = prev_addr + prev_size;
+            SNMALLOC_CHECK(prev_end != a || !Rep::can_consolidate(a));
+          }
+          prev_addr = a;
+          prev_size = s;
+          prev_valid = true;
+        });
+      }
+
+      // 1b. No non-min block adjacent to a min block (unless boundary).
+      range_tree.for_each([&](uintptr_t node) {
+        auto [a, s] = range_from_addr(node);
+        if (a >= UNIT_SIZE)
+          SNMALLOC_CHECK(
+            !contains_min(a - UNIT_SIZE) || !Rep::can_consolidate(a));
+        uintptr_t end = a + s;
+        SNMALLOC_CHECK(!contains_min(end) || !Rep::can_consolidate(end));
+      });
+
+      // 1c. No two adjacent min blocks (unless boundary).
+      {
+        uintptr_t prev = 0;
+        bool prev_valid = false;
+        bin_trees[0].for_each([&](uintptr_t node) {
+          if (Rep::get_variant(node) != ArenaVariant::Min)
+            return;
+          if (prev_valid)
+            SNMALLOC_CHECK(
+              prev + UNIT_SIZE != node || !Rep::can_consolidate(node));
+          prev = node;
+          prev_valid = true;
+        });
+      }
+
+      // 2. Cross-tree consistency.
+      {
+        size_t range_tree_count = 0;
+        size_t bin_tree_nonmin_count = 0;
+
+        for (size_t bin = 0; bin < Bins::Bitmap::TOTAL_BINS; bin++)
+        {
+          bin_trees[bin].for_each([&](uintptr_t node) {
+            auto [a, s] = range_from_addr(node);
+            if (s >= TWO_UNITS)
+            {
+              auto path = range_tree.get_root_path();
+              SNMALLOC_CHECK(range_tree.find(path, node));
+              bin_tree_nonmin_count++;
+            }
+          });
+        }
+
+        range_tree.for_each([&](uintptr_t node) {
+          range_tree_count++;
+          auto [a, s] = range_from_addr(node);
+          auto range = typename Bins::range_t{a, s};
+          size_t expected_bin = Bins::bin_index(range);
+          auto path = bin_trees[expected_bin].get_root_path();
+          SNMALLOC_CHECK(bin_trees[expected_bin].find(path, node));
+        });
+
+        SNMALLOC_CHECK(bin_tree_nonmin_count == range_tree_count);
+      }
+
+      // 3. Bin classification correctness.
+      for (size_t bin = 0; bin < Bins::Bitmap::TOTAL_BINS; bin++)
+      {
+        bin_trees[bin].for_each([&](uintptr_t node) {
+          auto [a, s] = range_from_addr(node);
+          auto range = typename Bins::range_t{a, s};
+          size_t expected_bin = Bins::bin_index(range);
+          SNMALLOC_CHECK(expected_bin == bin);
+        });
+      }
+
+      // 4. Bitmap consistency.
+      for (size_t bin = 0; bin < Bins::Bitmap::TOTAL_BINS; bin++)
+      {
+        bool has_entries = !bin_trees[bin].is_empty();
+        bool bit_set = bitmap.test(bin);
+        SNMALLOC_CHECK(has_entries == bit_set);
+      }
+
+      // 5. Variant-tag consistency.
+      for (size_t bin = 0; bin < Bins::Bitmap::TOTAL_BINS; bin++)
+      {
+        bin_trees[bin].for_each([&](uintptr_t node) {
+          auto v = Rep::get_variant(node);
+          auto [a, s] = range_from_addr(node);
+          SNMALLOC_CHECK(v == variant_of(s, a));
+          if (v == ArenaVariant::Large)
+            SNMALLOC_CHECK(Rep::get_large_size(node) == s);
+        });
+      }
+    }
+  };
+} // namespace snmalloc
diff --git a/src/snmalloc/backend_helpers/arenabins.h b/src/snmalloc/backend_helpers/arenabins.h
new file mode 100644
index 000000000..ccfb23ca8
--- /dev/null
+++ b/src/snmalloc/backend_helpers/arenabins.h
@@ -0,0 +1,822 @@
+#pragma once
+
+#include "../ds_core/bits.h"
+#include "../ds_core/helpers.h"
+
+#include <stdint.h>
+
+namespace snmalloc
+{
+  template<size_t INTERMEDIATE_BITS, size_t MIN_SIZE_BITS>
+  struct ArenaBinsTestAccess;
+
+  /**
+   * Size class enumeration and bin classification used by the
+   * Arena.
+   *
+   * Template parameter `INTERMEDIATE_BITS` (`B` below: mantissa-bit width
+   * of snmalloc's non-power-of-two size class scheme) sets the number of
+   * RB-trees per exponent — the count of distinct servable subsets a
+   * free block can occupy at that exponent: B=1 -> 2; B=2 -> 5;
+   * B=3 -> 13. The canonical within-exponent bin numbering matches
+   * `prototype/skip_analysis.py`. All bin-scheme metadata derives
+   * constexpr from a single per-bin subsets table, `bin_subsets`.
+   *
+   * Template parameter `MIN_SIZE_BITS` is the log2 of the allocation
+   * unit: every byte size handled here is a multiple of
+   * `UNIT_SIZE = 1 << MIN_SIZE_BITS`, and the smallest representable
+   * size is `UNIT_SIZE`. With `MIN_SIZE_BITS == 0` the unit is a single
+   * byte and the classifier degenerates to the bare bin scheme;
+   * larger values scale the entire size axis (and the bin tables)
+   * by `UNIT_SIZE`.
+   *
+   * Public surface:
+   *  - `range_t`, `carve_t`: byte ranges and carve output.
+   *  - `carve(block, n)`: split a block into pre-pad / aligned
+   *    request / post-pad, where `n` is in bytes.
+   *  - `max_supported_size()`: upper bound on legal request sizes
+   *    (in bytes).
+   *  - nested `Bitmap`: per-arena non-empty-bins bitmap with
+   *    `add` / `find_for_request` / `clear`.
+   *
+   * Everything else is private; tests reach it via
+   * `ArenaBinsTestAccess<B, MIN_SIZE_BITS>`.
+   */
+  template<size_t INTERMEDIATE_BITS, size_t MIN_SIZE_BITS>
+  class ArenaBins
+  {
+    static_assert(
+      INTERMEDIATE_BITS >= 1 && INTERMEDIATE_BITS <= 3,
+      "ArenaBins supports B in {1, 2, 3}");
+    static_assert(
+      MIN_SIZE_BITS + INTERMEDIATE_BITS < bits::BITS,
+      "MIN_SIZE_BITS + INTERMEDIATE_BITS must leave room for at least one "
+      "exponent above the low regime so MAX_SC is non-trivial");
+
+  public:
+    /// (base, size) byte range. Both fields are multiples of
+    /// `UNIT_SIZE = 1 << MIN_SIZE_BITS`. `size == 0` means empty
+    /// (base is unspecified).
+    struct range_t
+    {
+      size_t base; // address of the first byte
+      size_t size; // length in bytes; 0 means empty
+    };
+
+    /// Output of `carve`: pre-pad / aligned request / post-pad.
+    /// Either or both of `pre`/`post` may be empty.
+    struct carve_t
+    {
+      range_t pre; // bytes below the aligned request
+      range_t req; // the bytes handed to the caller
+      range_t post; // bytes above the request, up to the block's end
+    };
+
+  private:
+    friend struct ArenaBinsTestAccess<INTERMEDIATE_BITS, MIN_SIZE_BITS>;
+
+    static constexpr size_t B = INTERMEDIATE_BITS;
+
+    /// Size of the allocation unit. Every byte size handled by the
+    /// classifier is a multiple of this value, and the smallest
+    /// representable size is `UNIT_SIZE`.
+    static constexpr size_t UNIT_SIZE = size_t(1) << MIN_SIZE_BITS;
+
+    /// Number of mantissa positions per regular exponent (= 2^B).
+    static constexpr size_t MANTISSAS_PER_EXP = size_t(1) << B;
+
+    /// Number of distinct servable-subset bins per exponent
+    /// (from prototype/skip_analysis.py).
+    static constexpr size_t BINS_PER_EXP = (B == 1) ? 2 :
+      (B == 2)                                      ? 5 :
+      (B == 3)                                      ? 13 :
+                                                      0;
+
+    /// Size of the per-sc info tables. One past the largest raw id from
+    /// `bits::to_exp_mant_const<B, MIN_SIZE_BITS>` whose decoded size
+    /// fits in `size_t` (the architectural max raw id would decode to
+    /// `2^bits::BITS`, which overflows).
+    static constexpr size_t MAX_SC =
+      ((bits::BITS - B - MIN_SIZE_BITS) << B) + ((size_t(1) << B) - 1);
+
+    /**
+     * Per-SC bitmap-scan record, read by `Bitmap::find_for_request`.
+     * Fields are pre-shifted into the bitmap's word layout so the
+     * search hot path is two ANDs.
+     *
+     *  - `start_word`: bitmap word containing this SC's start bin.
+     *  - `first_mask`: serve mask pre-shifted into `start_word`. Bit
+     *    `i` set iff `words_[start_word]` bit `i` serves this SC.
+     *  - `second_mask`: serve mask carried into `start_word + 1`. When
+     *    the start bin is word-aligned there is no within-exp carry
+     *    and bits there are all higher-exponent, so `second_mask == ~0`.
+     *
+     * `alignas(4 * sizeof(size_t))` rounds `sizeof(bitmap_info_t)` up
+     * to a power of two so `table_.bitmap_info[sc]` indexes with a
+     * single shift+add.
+     *
+     * A *bin* (single bit in `Bitmap`) has no size/alignment of its
+     * own; it may be set on behalf of any SC whose subset includes it.
+     */
+    struct alignas(4 * sizeof(size_t)) bitmap_info_t
+    {
+      size_t start_word; // index of the first bitmap word to scan
+      size_t first_mask; // ANDed with words_[start_word]
+      size_t second_mask; // ANDed with words_[start_word + 1]
+    };
+
+    static_assert(
+      sizeof(bitmap_info_t) == 4 * sizeof(size_t),
+      "bitmap_info_t must be 4*size_t so table_.bitmap_info[sc] indexes "
+      "with a single shift+add; revisit the alignas if fields change");
+
+    /**
+     * Per-SC carve record, read by `carve` and by `bin_offset_at`'s
+     * `fits` predicate (free-side cascade walk via `bin_index`).
+     *
+     *  - `size`: byte size this SC promises on allocation (multiple
+     *    of `UNIT_SIZE`).
+     *  - `align`: natural byte alignment (a power of two, derived
+     *    from `size`).
+     */
+    struct carve_info_t
+    {
+      size_t size; // bytes `carve` requires to fit after the pre-pad
+      size_t align; // `carve` rounds the request base up to this
+    };
+
+    static_assert(
+      sizeof(carve_info_t) == 2 * sizeof(size_t),
+      "carve_info_t must be 2*size_t so table_.carve_info[sc] indexes "
+      "with a single shift+add");
+
+    /**
+     * Map a request size to its bitmap-scan record.
+     *
+     * `n` must be in `[UNIT_SIZE, max_supported_size()]` and a
+     * multiple of `UNIT_SIZE`. Not `constexpr`: uses `bits::clz`
+     * intrinsic via `bits::to_exp_mant` to stay single-cycle on the
+     * fast path.
+     */
+    SNMALLOC_FAST_PATH static const bitmap_info_t&
+    bitmap_info_for_request(size_t n)
+    {
+      // Preconditions: unit multiple within [UNIT_SIZE, max_supported_size()].
+      SNMALLOC_ASSERT(n >= UNIT_SIZE && n <= max_supported_size());
+      SNMALLOC_ASSERT((n % UNIT_SIZE) == 0);
+      const size_t sc = bits::to_exp_mant<B, MIN_SIZE_BITS>(n);
+      SNMALLOC_ASSERT(sc < MAX_SC);
+      return table_.bitmap_info[sc];
+    }
+
+    /// Map a request size to its carve record. Preconditions and
+    /// properties as `bitmap_info_for_request`.
+    SNMALLOC_FAST_PATH static const carve_info_t&
+    carve_info_for_request(size_t n)
+    {
+      // Preconditions: unit multiple within [UNIT_SIZE, max_supported_size()].
+      SNMALLOC_ASSERT(n >= UNIT_SIZE && n <= max_supported_size());
+      SNMALLOC_ASSERT((n % UNIT_SIZE) == 0);
+      const size_t sc = bits::to_exp_mant<B, MIN_SIZE_BITS>(n);
+      SNMALLOC_ASSERT(sc < MAX_SC);
+      return table_.carve_info[sc];
+    }
+
+  public:
+    /**
+     * Bin id of `block`. Operates on arbitrary byte sizes that are
+     * multiples of `UNIT_SIZE`, not just exact size classes.
+     * `block.size` must be at least `UNIT_SIZE`.
+     *
+     * A bin id at exponent `e` identifies the *servable set*: the
+     * subset of SCs at `e` that `block` could serve. Two blocks with
+     * the same servable set at the same exponent share a bin id.
+     *
+     * The natural byte exponent is `prev_pow2_bits(block.size)`,
+     * which ranges over `[MIN_SIZE_BITS, bits::BITS)` once the
+     * size is a multiple of `UNIT_SIZE`. The internal exponent
+     * `e` is normalised by subtracting `MIN_SIZE_BITS`, so bin
+     * 0 always corresponds to the `UNIT_SIZE` block.
+     *
+     * If alignment padding eats every SC at the natural exponent we
+     * drop to `e - 1`, which is guaranteed to fit: its smallest SC
+     * has size and alignment `UNIT_SIZE << (e - 1)`, so worst-case
+     * `size + pad < UNIT_SIZE << e <= block.size`. One drop is
+     * always enough.
+     *
+     * Not `constexpr`: uses `bits::clz` via `bits::prev_pow2_bits`.
+     */
+    SNMALLOC_FAST_PATH static size_t bin_index(range_t block)
+    {
+      SNMALLOC_ASSERT(block.size >= UNIT_SIZE);
+      SNMALLOC_ASSERT((block.size % UNIT_SIZE) == 0);
+      SNMALLOC_ASSERT((block.base % UNIT_SIZE) == 0);
+
+      size_t level = bits::prev_pow2_bits(block.size) - MIN_SIZE_BITS;
+      size_t slot = bin_offset_at(block.base, block.size, level);
+      if (SNMALLOC_UNLIKELY(slot == BINS_PER_EXP))
+      {
+        // `BINS_PER_EXP` is the "nothing fits at this exponent" result:
+        // retry one exponent lower, which is asserted to succeed.
+        SNMALLOC_ASSERT(level > 0);
+        --level;
+        slot = bin_offset_at(block.base, block.size, level);
+        SNMALLOC_ASSERT(slot != BINS_PER_EXP);
+      }
+      return table_.exp_bin_base[level] + slot;
+    }
+
+    /// Largest byte size legal for `carve` / `Bitmap::find_for_request`:
+    static constexpr size_t max_supported_size()
+    {
+      return bits::from_exp_mant<B, MIN_SIZE_BITS>(MAX_SC - 1); // last raw id
+    }
+
+    /**
+     * Carve a free block into pre-pad / aligned request / post-pad,
+     * delivering exactly `n` bytes to the caller.
+     *
+     * The carve_info for `n` is used only to find a valid alignment
+     * and to verify that the block has room: `req.base` is aligned
+     * to `info.align` (the natural alignment of the SC that covers
+     * `n`), and the block must contain `info.size` bytes from that
+     * point. Only `n` bytes are handed out, and the leftover
+     * `info.size - n` bytes roll into `post`. This keeps SC rounding
+     * as an arena-internal detail: callers always receive exactly
+     * what they asked for.
+     *
+     * Preconditions (caller must have used `Bitmap::find_for_request`
+     * to locate a servable bin):
+     *  - `block.size > 0`, `n` in `[UNIT_SIZE, max_supported_size()]`
+     *    and a multiple of `UNIT_SIZE`, `block` large enough to fit
+     *    the SC after aligning up.
+     *  - `block.base + block.size` does not wrap.
+     *
+     * Pure: does not touch the bitmap or any tree. Either or both
+     * `pre` / `post` may have `size == 0`; their `base` is still set
+     * to the natural address so `pre.base + pre.size == req.base` and
+     * `req.base + req.size == post.base` (keeps caller adjacency
+     * checks simple).
+     */
+    SNMALLOC_FAST_PATH static carve_t carve(range_t block, size_t n)
+    {
+      SNMALLOC_ASSERT(n >= UNIT_SIZE);
+      SNMALLOC_ASSERT((n % UNIT_SIZE) == 0);
+      SNMALLOC_ASSERT(n <= max_supported_size());
+      SNMALLOC_ASSERT(block.size > 0);
+      SNMALLOC_ASSERT((block.size % UNIT_SIZE) == 0);
+      SNMALLOC_ASSERT((block.base % UNIT_SIZE) == 0);
+      // The block's end must not wrap; together with the servability
+      // check below this keeps the align-up from wrapping as well.
+      const size_t block_end = block.base + block.size;
+      SNMALLOC_ASSERT(block_end >= block.base);
+
+      const carve_info_t& sc = carve_info_for_request(n);
+
+      // Round the base up to the size class's natural alignment; the
+      // bytes skipped over form the `pre` remainder.
+      const size_t align_mask = sc.align - 1;
+      const size_t req_base = (block.base + align_mask) & ~align_mask;
+      const size_t pre_size = req_base - block.base;
+
+      // Servability: the whole size class (`sc.size >= n`) must fit
+      // after `pre`, although only `n` bytes are handed out.
+      SNMALLOC_ASSERT(pre_size <= block.size);
+      SNMALLOC_ASSERT(block.size - pre_size >= sc.size);
+
+      // Everything past the request, including `sc.size - n`, is `post`.
+      const size_t post_base = req_base + n;
+      const size_t post_size = block_end - post_base;
+      return carve_t{
+        {block.base, pre_size}, {req_base, n}, {post_base, post_size}};
+    }
+
+    /**
+     * Bitmap of non-empty per-arena bins. One bit per bin id
+     * (`bin_index`'s output); set iff the corresponding RB-tree is
+     * non-empty.
+     *
+     * Core API (a read-only `test(bin_id)` query is also provided):
+     *   - `add(range_t)`: classify a block and set its bin's bit
+     *     (idempotent on the bit; returns the bin id).
+     *   - `find_for_request(n)`: smallest set bin whose blocks
+     *     all serve `n`, or `SIZE_MAX` if none.
+     *   - `clear(bin_id)`: mark empty. Caller must ensure the bin's
+     *     tree is actually empty; the bitmap does not track contents.
+     *
+     * Not thread-safe: callers sharing an arena must serialise the
+     * add / find / clear sequence under an external mutex.
+     */
+    class Bitmap
+    {
+      friend struct ArenaBinsTestAccess<INTERMEDIATE_BITS, MIN_SIZE_BITS>;
+
+    public:
+      /// Strict upper bound on bin ids `bin_index` produces. Exposed
+      /// so callers can size parallel arrays (one RB-tree per bin id).
+      static constexpr size_t TOTAL_BINS = BINS_PER_EXP * bits::BITS;
+
+      constexpr Bitmap() : words_{} {} // value-initialised: no bin bit set
+
+      /**
+       * Classify `block`, set its bin's bit, return the bin id.
+       *
+       * Idempotent on bitmap state: if the bit is already set, this
+       * is a no-op (the bin id is still returned).
+       *
+       * The bitmap does NOT track which `(base, size)` ranges live in
+       * each bin's tree — the caller is responsible for inserting
+       * `block` into the appropriate tree.
+       */
+      SNMALLOC_FAST_PATH size_t add(range_t block)
+      {
+        SNMALLOC_ASSERT(block.size >= UNIT_SIZE);
+        SNMALLOC_ASSERT(block.size <= max_supported_size());
+        const size_t bin_id = bin_index(block);
+        SNMALLOC_ASSERT(bin_id < TOTAL_BINS);
+        const size_t bit = size_t(1) << (bin_id % bits::BITS);
+        words_[bin_id / bits::BITS] |= bit;
+        return bin_id;
+      }
+
+      /// Read-only test: is the bit for `bin_id` set?
+      /// Used by `Arena::check_invariant()`.
+      bool test(size_t bin_id) const
+      {
+        SNMALLOC_ASSERT(bin_id < TOTAL_BINS);
+        const size_t bit = size_t(1) << (bin_id % bits::BITS);
+        return (words_[bin_id / bits::BITS] & bit) != 0;
+      }
+
+      /// Mark bin `bin_id` empty. Caller must ensure the bin's tree
+      /// is actually empty; the bitmap does not consult the trees.
+      SNMALLOC_FAST_PATH void clear(size_t bin_id)
+      {
+        SNMALLOC_ASSERT(bin_id < TOTAL_BINS);
+        const size_t bit = size_t(1) << (bin_id % bits::BITS);
+        words_[bin_id / bits::BITS] &= ~bit;
+      }
+
+      /**
+       * Smallest bin id whose set blocks all serve `n`, or `SIZE_MAX`
+       * if none. `n` in `[UNIT_SIZE, max_supported_size()]` and a
+       * multiple of `UNIT_SIZE`.
+       *
+       * Invariant (static_assert below): `BINS_PER_EXP <= bits::BITS`,
+       * so the within-exponent range fits inside one word and the
+       * search straddles at most one word boundary. After the second
+       * word, every remaining word is purely higher-exponent.
+       */
+      SNMALLOC_FAST_PATH size_t find_for_request(size_t n) const
+      {
+        const bitmap_info_t& info = bitmap_info_for_request(n);
+        SNMALLOC_ASSERT(info.start_word < NUM_BITMAP_WORDS);
+        SNMALLOC_ASSUME(info.start_word < NUM_BITMAP_WORDS);
+
+        // First word: the start bin and the serving bins sharing its word.
+        size_t w = info.start_word;
+        size_t hits = words_[w] & info.first_mask;
+        if (hits != 0)
+          return w * bits::BITS + bits::ctz(hits);
+        if (++w == NUM_BITMAP_WORDS)
+          return SIZE_MAX;
+
+        // Second word: spill-over of the within-exponent mask plus the
+        // higher-exponent bins, all of which serve.
+        hits = words_[w] & info.second_mask;
+        if (hits != 0)
+          return w * bits::BITS + bits::ctz(hits);
+
+        // Later words: only higher-exponent bins, so any set bit serves.
+        for (++w; w < NUM_BITMAP_WORDS; ++w)
+          if (words_[w] != 0)
+            return w * bits::BITS + bits::ctz(words_[w]);
+        return SIZE_MAX;
+      }
+
+    private:
+      /// Number of size_t words backing the bitmap. Internal layout.
+      static constexpr size_t NUM_BITMAP_WORDS =
+        (TOTAL_BINS + bits::BITS - 1) / bits::BITS;
+
+      static_assert(
+        TOTAL_BINS == BINS_PER_EXP * bits::BITS,
+        "Bitmap layout: TOTAL_BINS must be BINS_PER_EXP * bits::BITS so it "
+        "divides evenly into bits::BITS-sized words.");
+      static_assert(
+        NUM_BITMAP_WORDS == BINS_PER_EXP,
+        "Bitmap layout: with the canonical TOTAL_BINS, the word count is "
+        "exactly BINS_PER_EXP.");
+      static_assert(
+        TOTAL_BINS < SIZE_MAX,
+        "find_for_request returns SIZE_MAX as the 'no match' sentinel; "
+        "TOTAL_BINS must be strictly less than SIZE_MAX so no valid bin "
+        "id can collide with the sentinel.");
+      static_assert(
+        BINS_PER_EXP <= bits::BITS,
+        "find_for_request assumes the within-exponent range (at most "
+        "BINS_PER_EXP bins) fits inside a single word, so the search "
+        "straddles at most one word boundary. If a future B pushes "
+        "BINS_PER_EXP above bits::BITS, the two-word body must be "
+        "generalised to handle a multi-word straddle.");
+
+      size_t words_[NUM_BITMAP_WORDS];
+    };
+
+  private:
+    // Vocabulary used in the rest of the private implementation:
+    //
+    //   exponent (e) : the bin-scheme exponent of a size; one axis of
+    //                  the size class grid.
+    //   mantissa (m) : the within-exponent position, in
+    //                  [0, MANTISSAS_PER_EXP). The other axis. When
+    //                  passed as a single argument it is named `m`
+    //                  (e.g. `start_bin_offset_for_m(m)`).
+    //   subset       : a bitmask of mantissas. `bin_subsets[b]` is the
+    //                  set of mantissas bin offset `b` can serve.
+    //   m_top        : when discussing a particular bin, the maximum
+    //                  element of its subset. Used as the bucketing
+    //                  axis for the cascade (see `bin_offset_at`).
+    //   m_test       : a single-mantissa probe in a cascade step;
+    //                  chosen so the probe's outcome disambiguates
+    //                  one candidate bin from the rest.
+
+    /**
+     * Single source of truth for the bin scheme.
+     *
+     * `bin_subsets[b]` is a bitmask of the mantissas bin offset `b`
+     * can serve: bit `m` set iff bin offset `b`'s servable subset
+     * contains mantissa `m`. The canonical bin numbering matches
+     * `prototype/skip_analysis.py`. Everything else in this file --
+     * `start_bin_offset_for_m`, `serve_mask_for_m`, the per-SC
+     * `start_word` / `first_mask` / `second_mask`, and the per-m_top
+     * decision lists in `BinTable::cascade_steps` -- is derived
+     * (constexpr) from this table.
+     *
+     * Required invariant (checked at constexpr build time in
+     * `BinTable::BinTable`; violating it fails the build): for every
+     * `m_top`, the bins whose subset has `m_top` as max element form a
+     * strict containment chain when sorted by subset size descending.
+     * That is, the largest such subset properly contains the next,
+     * which properly contains the one after, and so on. The chain
+     * property is what makes the single-mantissa-probe cascade in
+     * `bin_offset_at` sufficient to disambiguate among them.
+     *
+     * If you edit the literals below, re-run
+     * `prototype/skip_analysis.py` to verify they still match the
+     * canonical numbering and chain property.
+     */
+    static constexpr ModArray<BINS_PER_EXP, size_t> bin_subsets = []() {
+      ModArray<BINS_PER_EXP, size_t> r{};
+      if constexpr (B == 1)
+      {
+        // Encoding: bit m of r[b] set <=> bin b's subset contains m.
+        // bin 0: {0}      bin 1: {0,1}
+        r[0] = 0b01;
+        r[1] = 0b11;
+      }
+      else if constexpr (B == 2)
+      {
+        // bin 0: {0}      bin 3: {0,1,2}
+        // bin 1: {1}      bin 4: {0,1,2,3}
+        // bin 2: {0,1}
+        r[0] = 0b0001;
+        r[1] = 0b0010;
+        r[2] = 0b0011;
+        r[3] = 0b0111;
+        r[4] = 0b1111;
+      }
+      else /* B == 3; NOTE(review): any B other than 1 or 2 lands here */
+      {
+        // bin  0: {0}              bin  7: {1,2,3,5}
+        // bin  1: {1}              bin  8: {0,1,2,3,4}
+        // bin  2: {0,1}            bin  9: {0,1,2,3,5}
+        // bin  3: {1,2}            bin 10: {0,1,2,3,4,5}
+        // bin  4: {0,1,2}          bin 11: {0,1,2,3,4,5,6}
+        // bin  5: {1,2,3}          bin 12: {0,1,2,3,4,5,6,7}
+        // bin  6: {0,1,2,3}
+        r[0] = 0b00000001;
+        r[1] = 0b00000010;
+        r[2] = 0b00000011;
+        r[3] = 0b00000110;
+        r[4] = 0b00000111;
+        r[5] = 0b00001110;
+        r[6] = 0b00001111;
+        r[7] = 0b00101110;
+        r[8] = 0b00011111;
+        r[9] = 0b00101111;
+        r[10] = 0b00111111;
+        r[11] = 0b01111111;
+        r[12] = 0b11111111;
+      }
+      return r;
+    }();
+
+    /**
+     * First within-exponent bin offset whose subset contains mantissa
+     * `m`. Derived from `bin_subsets`.
+     *
+     * Combined with the per-exponent base, this is an SC's absolute
+     * start bin index: `start_bit = exp_bin_base[e] +
+     * start_bin_offset_for_m(m)`. `BinTable` splits it into a word
+     * index (`bitmap_info_t::start_word`) and a bit offset by which
+     * `first_mask` / `second_mask` are pre-shifted.
+     */
+    static constexpr size_t start_bin_offset_for_m(size_t m)
+    {
+      size_t offset = 0;
+      // Every m is in some subset, so BINS_PER_EXP is not expected here.
+      while (offset < BINS_PER_EXP && ((bin_subsets[offset] >> m) & 1) == 0)
+        offset++;
+      return offset;
+    }
+
+    /**
+     * Bitmask, relative to `start_bin_offset_for_m(m)`, of bins that
+     * serve `m`. Bit `k` is set iff bin offset
+     * `start_bin_offset_for_m(m) + k` serves a request whose
+     * within-exponent position is `m`. The start bin always serves
+     * (bit 0 set), within-exponent bins serve iff their subset
+     * contains `m`, and bins above the within-exponent range belong
+     * to higher exponents and always serve (high bits all 1).
+     *
+     * Built positively (set bit = "serve") rather than as a "skip"
+     * mask: the hot path in `Bitmap::find_for_request` AND's this
+     * mask (pre-shifted into `bitmap_info_t::first_mask` / `second_mask`)
+     * against the bitmap word without an intermediate NOT.
+     */
+    static constexpr size_t serve_mask_for_m(size_t m)
+    {
+      const size_t first = start_bin_offset_for_m(m);
+      // Collect the non-serving bins, then invert once (at compile time).
+      size_t skipped = 0; // bit k set <=> bin first + k lacks m
+      for (size_t k = 1; first + k < BINS_PER_EXP; k++)
+        if (((bin_subsets[first + k] >> m) & 1) == 0)
+          skipped |= size_t(1) << k;
+      return ~skipped;
+    }
+
+    /// Constexpr popcount: small loop, used only at BinTable build time.
+    static constexpr size_t popcount_const(size_t x)
+    {
+      size_t count = 0;
+      // Each round clears the lowest set bit, so rounds == set bits.
+      for (; x != 0; x &= x - 1)
+      {
+        count++;
+      }
+      return count;
+    }
+
+    /// One step of a per-m_top decision list used by `bin_offset_at`.
+    /// If `m_test == NO_TEST` (see below) or `fits(m_test)` is true,
+    /// return `bin`.
+    struct CascadeStep
+    {
+      size_t m_test; // mantissa to probe with `fits`, or NO_TEST
+      size_t bin; // within-exponent bin offset returned on a match
+    };
+
+    /// Sentinel for `CascadeStep::m_test` meaning "take this bin
+    /// unconditionally". Any value `>= MANTISSAS_PER_EXP` would do: it
+    /// can never be a real mantissa. The walker must test for it
+    /// explicitly, because `fits()` rejects such a value (its
+    /// `first + m >= past` guard returns false).
+    static constexpr size_t NO_TEST = MANTISSAS_PER_EXP;
+
+    /**
+     * Maximum decision-list length per `m_top`. Derived from
+     * `bin_subsets`: the largest number of bins sharing the same max
+     * subset element. Used to size `cascade_steps[m_top][]`; some
+     * `m_top` values have fewer candidates, leaving default-initialised
+     * slots at the end. Those slots are never reached because the
+     * preceding NO_TEST entry always returns.
+     */
+    static constexpr size_t MAX_CASCADE_STEPS = []() {
+      size_t longest = 0;
+      for (size_t m_top = 0; m_top < MANTISSAS_PER_EXP; m_top++)
+      {
+        // Tally the bins whose subset's maximum element is m_top:
+        // shifting out the bits below m_top leaves exactly 1 iff bit
+        // m_top is set and nothing above it is.
+        size_t sharing = 0;
+        for (size_t b = 0; b < BINS_PER_EXP; b++)
+        {
+          sharing += ((bin_subsets[b] >> m_top) == 1) ? 1 : 0;
+        }
+        longest = (sharing > longest) ? sharing : longest;
+      }
+      return longest;
+    }();
+
+    /**
+     * Within-exponent bin offset for a block at byte address `addr`
+     * of byte length `n` at internal exponent `e`. Returns
+     * `BINS_PER_EXP` (sentinel) if no mantissa at this exponent
+     * fits.
+     *
+     * Walks `m_top` from `MANTISSAS_PER_EXP - 1` down. The first
+     * fitting `m_top` is the largest mantissa this block can serve;
+     * it is also the natural bucketing axis, because the bins whose
+     * subset has `m_top` as max element are exactly the candidates we
+     * still need to disambiguate among. `table_.cascade_steps[m_top]`
+     * (a constexpr-built decision list, derived from `bin_subsets`)
+     * disambiguates among them with at most a couple of secondary
+     * `fits` checks.
+     *
+     * Worst case: `MANTISSAS_PER_EXP + MAX_CASCADE_STEPS - 1` fit
+     * checks — the inner loop's last entry is the NO_TEST default and
+     * returns without calling `fits`. Typical: 1-2 at the natural
+     * exponent and 1 at the fallback exponent.
+     */
+    SNMALLOC_FAST_PATH static size_t
+    bin_offset_at(size_t addr, size_t n, size_t e)
+    {
+      const size_t sc_begin = table_.exp_first_sc[e];
+      const size_t sc_end = table_.exp_first_sc[e + 1];
+
+      auto fits = [&](size_t m) SNMALLOC_FAST_PATH_LAMBDA -> bool {
+        // Guard: exponents in the low regime carry fewer than 2^B
+        // mantissas (the very first has just one), so mantissa m may
+        // be absent here. Looking it up anyway would read a
+        // carve_info entry outside [sc_begin, sc_end).
+        const size_t sc = sc_begin + m;
+        if (sc >= sc_end)
+          return false;
+        const carve_info_t& ci = table_.carve_info[sc];
+        // Size test before alignment: a size class larger than n can
+        // never fit, and rejecting it here both skips the align_up
+        // and keeps the unsigned `n - ci.size` below from wrapping.
+        if (n < ci.size)
+          return false;
+        const size_t pad = bits::align_up(addr, ci.align) - addr;
+        return pad <= n - ci.size;
+      };
+
+      for (size_t k = 0; k < MANTISSAS_PER_EXP; k++)
+      {
+        const size_t m_top = MANTISSAS_PER_EXP - 1 - k; // largest first
+        if (!fits(m_top))
+          continue;
+        // Walk m_top's decision list; its final live entry is NO_TEST,
+        // so one of the steps is expected to return.
+        for (size_t j = 0; j < MAX_CASCADE_STEPS; j++)
+        {
+          const CascadeStep& step = table_.cascade_steps[m_top][j];
+          if (step.m_test == NO_TEST || fits(step.m_test))
+            return step.bin;
+        }
+        SNMALLOC_ASSERT(false); // unreachable per the invariant above
+      }
+      return BINS_PER_EXP;
+    }
+
+    /**
+     * Constexpr-populated rodata tables.
+     *
+     * `bitmap_info[sc]` is the bitmap-scan record for each in-range
+     * sc (consumed by `Bitmap::find_for_request`).
+     * `carve_info[sc]` is the size/alignment record for each in-range
+     * sc (consumed by `carve` and by `bin_offset_at`'s `fits`
+     * predicate during free-side classification).
+     * `exp_first_sc[e]` is the first raw sc id at ArenaBins
+     * exponent e (with `exp_first_sc[bits::BITS] = MAX_SC` as a sentinel
+     * so `[exp_first_sc[e], exp_first_sc[e + 1])` is a valid raw range
+     * for every `e < bits::BITS`).
+     * `exp_bin_base[e]` is `e * BINS_PER_EXP`, precomputed so the
+     * `bin_index` fast path never performs a runtime multiply.
+     * `cascade_steps[m_top]` is the decision list `bin_offset_at` walks
+     * once it knows `m_top` is the largest fitting mantissa at the
+     * current exponent. The list always ends with a NO_TEST entry that
+     * acts as the default.
+     */
+    struct BinTable
+    {
+      ModArray<MAX_SC, bitmap_info_t> bitmap_info{};
+      ModArray<MAX_SC, carve_info_t> carve_info{};
+      ModArray<bits::BITS + 1, size_t> exp_first_sc{};
+      ModArray<bits::BITS + 1, size_t> exp_bin_base{};
+      ModArray<MANTISSAS_PER_EXP, ModArray<MAX_CASCADE_STEPS, CascadeStep>>
+        cascade_steps{};
+
+      constexpr BinTable()
+      {
+        // Boundary tables: keep all (e -> raw sc range) and (e -> bin id
+        // base) knowledge in two small ROM arrays. `to_exp_mant_const` is
+        // the only place that knows the size class encoding; once we've
+        // pinned down the raw boundaries, everything else is table lookup.
+        //
+        // `e` here is the internal (normalised) exponent: an SC's
+        // `e == 0` corresponds to byte size `UNIT_SIZE = 1 << MIN_SIZE_BITS`.
+        //
+        // Note: `exp_first_sc` does NOT have a uniform stride. At the
+        // bottom of the encoding the low regime (no leading-1 bit; the
+        // `b = (e == 0) ? 0 : 1` branch in `to_exp_mant_const`) squashes
+        // multiple internal exponents into encoded-exponent 0.
+        // For `B = 2` the counts are 1, 2, 4, 4, 4, ...
+        constexpr size_t MAX_E = bits::BITS - MIN_SIZE_BITS;
+        for (size_t e = 0; e < MAX_E; e++)
+        {
+          exp_first_sc[e] = bits::to_exp_mant_const<B, MIN_SIZE_BITS>(
+            size_t(1) << (e + MIN_SIZE_BITS));
+          exp_bin_base[e] = e * BINS_PER_EXP;
+        }
+        exp_first_sc[MAX_E] = MAX_SC;
+        exp_bin_base[MAX_E] = MAX_E * BINS_PER_EXP;
+
+        // Per-sc records. Size and alignment come straight from the
+        // size-class scheme (via from_exp_mant); start_word, first_mask,
+        // second_mask are derived from bin_subsets via the constexpr
+        // helpers above, pre-shifted into the bitmap's word layout so
+        // the search hot path is two ANDs.
+        for (size_t sc = 0; sc < MAX_SC; sc++)
+        {
+          size_t size = bits::from_exp_mant<B, MIN_SIZE_BITS>(sc);
+          size_t e = bits::prev_pow2_bits_const(size) - MIN_SIZE_BITS;
+          size_t m = sc - exp_first_sc[e];
+          size_t start_bit = exp_bin_base[e] + start_bin_offset_for_m(m);
+          size_t mask = serve_mask_for_m(m);
+          size_t shift = start_bit & (bits::BITS - 1);
+          carve_info[sc].size = size;
+          carve_info[sc].align = size & (~size + 1);
+          bitmap_info[sc].start_word = start_bit / bits::BITS;
+          bitmap_info[sc].first_mask = mask << shift;
+          // shift == 0: no within-exponent carry; the second word is
+          // entirely higher-exponent. shift > 0: the low `shift` bits
+          // receive the top of mask (within-exp carry plus its all-1s
+          // tail), and bits [shift, BITS) are higher-exp and always
+          // serve.
+          bitmap_info[sc].second_mask = (shift == 0) ?
+            ~size_t(0) :
+            ((mask >> (bits::BITS - shift)) | (~size_t(0) << shift));
+        }
+
+        // cascade_steps: for each m_top, build a decision list of
+        // (m_test, bin) pairs derived from bin_subsets. Candidates are
+        // bins whose subset has m_top as max element; sort descending
+        // by subset size. The strict-chain invariant on `bin_subsets`
+        // (see its doc comment) guarantees each non-default
+        // candidate's subset properly contains the next candidate's,
+        // so the discriminator for candidate `i` is one of the
+        // mantissas in `bin_subsets[b_i] & ~bin_subsets[b_{i+1}]`.
+        for (size_t m_top = 0; m_top < MANTISSAS_PER_EXP; m_top++)
+        {
+          ModArray<BINS_PER_EXP, size_t> candidates{};
+          size_t n_cand = 0;
+          for (size_t b = 0; b < BINS_PER_EXP; b++)
+          {
+            // bin_subsets[b] >> m_top == 1 <=> bit m_top set and no
+            // higher bit set <=> max element of subset is m_top.
+            if ((bin_subsets[b] >> m_top) == 1)
+            {
+              candidates[n_cand] = b;
+              n_cand++;
+            }
+          }
+          // Insertion sort, descending by popcount of subset.
+          for (size_t i = 1; i < n_cand; i++)
+          {
+            size_t b = candidates[i];
+            size_t pcb = popcount_const(bin_subsets[b]);
+            size_t j = i;
+            while (j > 0 &&
+                   popcount_const(bin_subsets[candidates[j - 1]]) < pcb)
+            {
+              candidates[j] = candidates[j - 1];
+              j--;
+            }
+            candidates[j] = b;
+          }
+          // Every m_top needs a candidate: with none, the list would
+          // have no NO_TEST entry and `n_cand - 1` below would wrap.
+          if (n_cand == 0)
+            SNMALLOC_CHECK_MSG(false, "bin_subsets has no bin for an m_top");
+          // Non-default candidates: probe a mantissa that candidate `b`
+          // has and `b_next` lacks.
+          for (size_t i = 0; i + 1 < n_cand; i++)
+          {
+            size_t b = candidates[i];
+            size_t b_next = candidates[i + 1];
+            size_t discrim_set = bin_subsets[b] & ~bin_subsets[b_next];
+            size_t stray_set = bin_subsets[b_next] & ~bin_subsets[b];
+            // Strict chain <=> `b` has a mantissa `b_next` lacks
+            // (discrim_set != 0) and `b_next` has none that `b` lacks
+            // (stray_set == 0). On violation, calling the
+            // non-constexpr `SNMALLOC_CHECK` makes the evaluation
+            // non-constant, surfacing it as a compile error.
+            if (discrim_set == 0 || stray_set != 0)
+              SNMALLOC_CHECK_MSG(
+                false, "bin_subsets violates strict-chain invariant");
+            cascade_steps[m_top][i].m_test = bits::ctz_const(discrim_set);
+            cascade_steps[m_top][i].bin = b;
+          }
+          // Default (last) candidate.
+          cascade_steps[m_top][n_cand - 1].m_test = NO_TEST;
+          cascade_steps[m_top][n_cand - 1].bin = candidates[n_cand - 1];
+        }
+      }
+    };
+
+    static constexpr BinTable table_{};
+  };
+} // namespace snmalloc
diff --git a/src/snmalloc/backend_helpers/authmap.h b/src/snmalloc/backend_helpers/authmap.h
index e2a00085b..c0ad74258 100644
--- a/src/snmalloc/backend_helpers/authmap.h
+++ b/src/snmalloc/backend_helpers/authmap.h
@@ -23,6 +23,19 @@ namespace snmalloc
     {
       return capptr::Arena<void>::unsafe_from(c.unsafe_ptr());
     }
+
+    /**
+     * Address-keyed sibling of `amplify`: returns a capability with
+     * address `a`. This non-StrictProvenance pass-through variant does
+     * no lookup and simply fabricates a pointer at `a`; the template
+     * argument `potentially_out_of_range` is unused in its body.
+     */
+    template<bool potentially_out_of_range = false>
+    static SNMALLOC_FAST_PATH capptr::Arena<void>
+    amplify_from_address(address_t a)
+    {
+      return capptr::Arena<void>::unsafe_from(reinterpret_cast<void*>(a));
+    }
   };
 
   /**
@@ -67,6 +80,23 @@ namespace snmalloc
         concreteAuthmap.template get<potentially_out_of_range>(address_cast(c)),
         c);
     }
+
+    /**
+     * Address-keyed sibling of `amplify` for callers holding only an
+     * integer address (e.g. in-band tree-node access in `InplaceRep`):
+     * returns the authmap entry covering `a`, re-offset to address `a`
+     * and so carrying the registered arena's permissions. The authmap
+     * is set once per arena registration and never mutated thereafter,
+     * so this lookup is safe under concurrent allocator activity.
+     */
+    template<bool potentially_out_of_range = false>
+    static SNMALLOC_FAST_PATH capptr::Arena<void>
+    amplify_from_address(address_t a)
+    {
+      auto root = concreteAuthmap.template get<potentially_out_of_range>(a);
+      auto delta = a - address_cast(root);
+      return pointer_offset(root, delta);
+    }
   };
 
   /**
diff --git a/src/snmalloc/backend_helpers/backend_helpers.h b/src/snmalloc/backend_helpers/backend_helpers.h
index ee339337b..baeeb1cfd 100644
--- a/src/snmalloc/backend_helpers/backend_helpers.h
+++ b/src/snmalloc/backend_helpers/backend_helpers.h
@@ -2,21 +2,20 @@
 
 #include "../mem/mem.h"
 #include "authmap.h"
-#include "buddy.h"
 #include "commitrange.h"
 #include "commonconfig.h"
 #include "defaultpagemapentry.h"
 #include "empty_range.h"
 #include "globalrange.h"
 #include "indirectrange.h"
-#include "largebuddyrange.h"
+#include "largearenarange.h"
 #include "logrange.h"
 #include "noprange.h"
 #include "pagemap.h"
 #include "pagemapregisterrange.h"
 #include "palrange.h"
 #include "range_helpers.h"
-#include "smallbuddyrange.h"
+#include "smallarenarange.h"
 #include "staticconditionalrange.h"
 #include "statsrange.h"
 #include "subrange.h"
diff --git a/src/snmalloc/backend_helpers/buddy.h b/src/snmalloc/backend_helpers/buddy.h
deleted file mode 100644
index 58cafacb1..000000000
--- a/src/snmalloc/backend_helpers/buddy.h
+++ /dev/null
@@ -1,199 +0,0 @@
-#pragma once
-
-#include "../ds/ds.h"
-
-namespace snmalloc
-{
-  /**
-   * Class representing a buddy allocator
-   *
-   * Underlying node `Rep` representation is passed in.
-   *
-   * The allocator can handle blocks between inclusive MIN_SIZE_BITS and
-   * exclusive MAX_SIZE_BITS.
-   */
-  template<typename Rep, size_t MIN_SIZE_BITS, size_t MAX_SIZE_BITS>
-  class Buddy
-  {
-    static_assert(MAX_SIZE_BITS > MIN_SIZE_BITS);
-
-    struct Entry
-    {
-      typename Rep::Contents cache[3];
-      RBTree<Rep> tree{};
-    };
-
-    stl::Array<Entry, MAX_SIZE_BITS - MIN_SIZE_BITS> entries{};
-    // All RBtrees at or above this index should be empty.
-    size_t empty_at_or_above{0};
-
-    size_t to_index(size_t size)
-    {
-      SNMALLOC_ASSERT(size != 0);
-      SNMALLOC_ASSERT(bits::is_pow2(size));
-      auto log = snmalloc::bits::next_pow2_bits(size);
-      SNMALLOC_ASSERT_MSG(
-        log >= MIN_SIZE_BITS, "Size too big: {} log {}.", size, log);
-      SNMALLOC_ASSERT_MSG(
-        log < MAX_SIZE_BITS, "Size too small: {} log {}.", size, log);
-
-      return log - MIN_SIZE_BITS;
-    }
-
-    void validate_block(typename Rep::Contents addr, size_t size)
-    {
-      SNMALLOC_ASSERT(bits::is_pow2(size));
-      SNMALLOC_ASSERT(addr == Rep::align_down(addr, size));
-      UNUSED(addr, size);
-    }
-
-    void invariant()
-    {
-#ifndef NDEBUG
-      for (size_t i = empty_at_or_above; i < entries.size(); i++)
-      {
-        SNMALLOC_ASSERT(entries[i].tree.is_empty());
-        // TODO check cache is empty
-      }
-#endif
-    }
-
-    bool remove_buddy(typename Rep::Contents addr, size_t size)
-    {
-      auto idx = to_index(size);
-
-      // Empty at this range.
-      if (idx >= empty_at_or_above)
-        return false;
-
-      auto buddy = Rep::buddy(addr, size);
-
-      // Check local cache first
-      for (auto& e : entries[idx].cache)
-      {
-        if (Rep::equal(buddy, e))
-        {
-          if (!Rep::can_consolidate(addr, size))
-            return false;
-
-          e = entries[idx].tree.remove_min();
-          return true;
-        }
-      }
-
-      auto path = entries[idx].tree.get_root_path();
-      bool contains_buddy = entries[idx].tree.find(path, buddy);
-
-      if (!contains_buddy)
-        return false;
-
-      // Only check if we can consolidate after we know the buddy is in
-      // the buddy allocator.  This is required to prevent possible segfaults
-      // from looking at the buddies meta-data, which we only know exists
-      // once we have found it in the red-black tree.
-      if (!Rep::can_consolidate(addr, size))
-        return false;
-
-      entries[idx].tree.remove_path(path);
-      return true;
-    }
-
-  public:
-    constexpr Buddy() = default;
-
-    /**
-     * Add a block to the buddy allocator.
-     *
-     * Blocks needs to be power of two size and aligned to the same power of
-     * two.
-     *
-     * Returns null, if the block is successfully added. Otherwise, returns the
-     * consolidated block that is MAX_SIZE_BITS big, and hence too large for
-     * this allocator.
-     */
-    typename Rep::Contents add_block(typename Rep::Contents addr, size_t size)
-    {
-      validate_block(addr, size);
-
-      if (remove_buddy(addr, size))
-      {
-        // Add to next level cache
-        size *= 2;
-        addr = Rep::align_down(addr, size);
-        if (size == bits::one_at_bit(MAX_SIZE_BITS))
-        {
-          // Invariant should be checked on all non-tail return paths.
-          // Holds trivially here with current design.
-          invariant();
-          // Too big for this buddy allocator.
-          return addr;
-        }
-        return add_block(addr, size);
-      }
-
-      auto idx = to_index(size);
-      empty_at_or_above = bits::max(empty_at_or_above, idx + 1);
-
-      for (auto& e : entries[idx].cache)
-      {
-        if (Rep::equal(Rep::null, e))
-        {
-          e = addr;
-          return Rep::null;
-        }
-      }
-
-      auto path = entries[idx].tree.get_root_path();
-      entries[idx].tree.find(path, addr);
-      entries[idx].tree.insert_path(path, addr);
-      invariant();
-      return Rep::null;
-    }
-
-    /**
-     * Removes a block of size from the buddy allocator.
-     *
-     * Return Rep::null if this cannot be satisfied.
-     */
-    typename Rep::Contents remove_block(size_t size)
-    {
-      invariant();
-      auto idx = to_index(size);
-      if (idx >= empty_at_or_above)
-        return Rep::null;
-
-      auto addr = entries[idx].tree.remove_min();
-      for (auto& e : entries[idx].cache)
-      {
-        if (Rep::equal(Rep::null, addr) || Rep::compare(e, addr))
-        {
-          addr = stl::exchange(e, addr);
-        }
-      }
-
-      if (addr != Rep::null)
-      {
-        validate_block(addr, size);
-        return addr;
-      }
-
-      if (size * 2 == bits::one_at_bit(MAX_SIZE_BITS))
-        // Too big for this buddy allocator
-        return Rep::null;
-
-      auto bigger = remove_block(size * 2);
-      if (bigger == Rep::null)
-      {
-        empty_at_or_above = idx;
-        invariant();
-        return Rep::null;
-      }
-
-      auto second = Rep::offset(bigger, size);
-
-      // Split large block
-      add_block(second, size);
-      return bigger;
-    }
-  };
-} // namespace snmalloc
diff --git a/src/snmalloc/backend_helpers/inplacerep.h b/src/snmalloc/backend_helpers/inplacerep.h
new file mode 100644
index 000000000..3aacfb410
--- /dev/null
+++ b/src/snmalloc/backend_helpers/inplacerep.h
@@ -0,0 +1,278 @@
+#pragma once
+
+#include "../ds_core/bits.h"
+#include "../ds_core/defines.h"
+#include "../ds_core/sizeclassconfig.h"
+#include "arena.h"
+
+#include <stdint.h>
+
+namespace snmalloc
+{
+  /**
+   * In-band tree node stored at the head of a free block managed by
+   * `Arena`: two pointer-sized words per unit, with the red bit and
+   * variant tag packed into the low bits of `word_one`. The words are
+   * integers rather than pointers so meta bits can be ORed in without
+   * UB; on CHERI, access goes through a capability re-derived from
+   * the `Authmap` (see `InplaceRep`). `bounds` does not alter layout.
+   */
+  template<SNMALLOC_CONCEPT(capptr::IsBound) bounds>
+  struct InplaceNode
+  {
+    uintptr_t word_one; // child link plus packed red/variant bits
+    uintptr_t word_two; // second child link, no packed meta
+  };
+
+  /**
+   * In-band `Rep` for `Arena`. Each free block carries its
+   * own tree-node and metadata storage in its first few units:
+   *
+   *   Unit 0 (addr):                bin-tree node + variant tag.
+   *   Unit 1 (addr + UNIT_SIZE):    range-tree node (size >= 2 units).
+   *   Unit 2 (addr + 2*UNIT_SIZE):  large-size word (size >= 3 units).
+   *
+   * Bit layout in `word_one` of each unit:
+   *   bit 0           : red bit (both trees)
+   *   bits 1..2       : variant tag (`ArenaVariant`, unit 0 only)
+   * `word_two` holds the second child pointer with no packed meta.
+   * Both child pointers are unit-aligned, so their low `MIN_BITS`
+   * bits are zero — the packed meta occupies bits below
+   * `1 << MIN_BITS` and never collides with a stored pointer value.
+   *
+   * `MIN_BITS = next_pow2_bits_const(sizeof(InplaceNode))`: the
+   * smallest free block must hold one tree node, so the unit IS the
+   * node footprint rounded up.
+   *
+   * CHERI: in-band storage is accessed via
+   * `Authmap::amplify_from_address(addr)`, which returns a
+   * capability at `addr` with the registered arena's permissions.
+   * The authmap is set once per arena registration and never
+   * mutated, so this lookup carries no concurrency hazard. On
+   * non-CHERI platforms the authmap is the pass-through
+   * `DummyAuthmap` and the cap collapses to a raw pointer.
+   */
+  template<typename Authmap, SNMALLOC_CONCEPT(capptr::IsBound) bounds>
+  class InplaceRep
+  {
+  public:
+    static constexpr size_t MIN_BITS =
+      bits::next_pow2_bits_const(sizeof(InplaceNode<bounds>));
+    static constexpr size_t UNIT_SIZE = size_t(1) << MIN_BITS;
+
+    // 3 meta bits (variant 2 + red 1) packed below the unit
+    // alignment boundary. Block addresses are UNIT_SIZE-aligned, so
+    // a value v with `(v & (UNIT_SIZE - 1)) == 0` writes the
+    // pointer cleanly without touching meta.
+    static_assert(MIN_BITS >= 3, "Need 3 low bits for red+variant packing");
+    static_assert(MIN_BITS < MIN_CHUNK_BITS, "Arena needs a non-trivial range");
+    static_assert(
+      MIN_ALLOC_SIZE >= (size_t(1) << MIN_BITS),
+      "Front-end minimum allocation must be >= in-band unit size; "
+      "otherwise a free block cannot hold the tree node.");
+
+    static constexpr uintptr_t RED_BIT = 1;
+    static constexpr unsigned VARIANT_SHIFT = 1;
+    static constexpr unsigned VARIANT_BITS = 2;
+    static constexpr uintptr_t VARIANT_MASK =
+      ((uintptr_t(1) << VARIANT_BITS) - 1) << VARIANT_SHIFT;
+    static constexpr uintptr_t BIN_META_MASK = RED_BIT | VARIANT_MASK;
+    static constexpr uintptr_t RANGE_META_MASK = RED_BIT;
+
+    static_assert(BIN_META_MASK < UNIT_SIZE);
+
+    /**
+     * Wraps a `uintptr_t*` storage slot plus the meta-bit mask that
+     * this slot owns. `get()` returns the slot value with meta bits
+     * cleared; assignment preserves them. Mirrors the role of
+     * `BackendStateWordRef` but with an inline mask field (we own
+     * the only mask here, unlike `BackendStateWordRef` which layers
+     * on top of the frontend-reserved mask).
+     */
+    class Handle
+    {
+      uintptr_t* val{nullptr}; // slot read and written by this handle
+      uintptr_t mask{0};       // bits of `*val` hidden by get(), kept by =
+
+    public:
+      constexpr Handle() = default; // no slot yet: do not read or assign
+
+      constexpr Handle(uintptr_t* v, uintptr_t m) : val(v), mask(m) {}
+
+      /**
+       * Single-pointer constructor required by the `RBRepMethods`
+       * concept (see `ds_core/redblacktree.h`) for sentinel
+       * construction from `&Rep::root`. The tree's root field
+       * carries no meta bits, so mask defaults to zero.
+       */
+      constexpr Handle(uintptr_t* v) : val(v) {}
+
+      [[nodiscard]] uintptr_t get() const
+      {
+        return *val & ~mask; // strip the meta bits this slot owns
+      }
+
+      Handle& operator=(uintptr_t v)
+      {
+        SNMALLOC_ASSERT((v & mask) == 0); // value must carry no meta bits
+        *val = v | (*val & mask);
+        return *this;
+      }
+
+      bool operator!=(const Handle& other) const
+      {
+        return val != other.val; // compares slot identity, not contents
+      }
+
+      uintptr_t printable_address() const
+      {
+        return reinterpret_cast<uintptr_t>(val); // address of the slot
+      }
+    };
+
+  private:
+    template<size_t UnitIdx> // unit index within the block (0..2 here)
+    static InplaceNode<bounds>* unit_at(uintptr_t addr)
+    {
+      auto cap = Authmap::amplify_from_address(addr + UnitIdx * UNIT_SIZE);
+      return static_cast<InplaceNode<bounds>*>(cap.unsafe_ptr());
+    }
+
+    /**
+     * Tree rep shared by `BinRep` and `RangeRep`. `UnitIdx` is the
+     * block-relative unit (0 or 1) that holds this rep's node;
+     * `MetaMask` covers the bits in that unit's `word_one` owned
+     * by this rep (red + variant for `BinRep`, red only for
+     * `RangeRep`) and is preserved across `set`. `Name` is the
+     * string returned by `name()`.
+     * Convention (mirrors `PagemapRep`): direction `true` selects
+     * `word_one` (the meta-bearing word); direction `false`
+     * selects `word_two`.
+     */
+    template<size_t UnitIdx, uintptr_t MetaMask, const char* Name>
+    struct TreeRep
+    {
+      using Handle = InplaceRep::Handle;
+      using Contents = uintptr_t;
+
+      static constexpr Contents null = 0; // null key as returned by get()
+      static constexpr Contents root = 0; // null key as stored in a slot
+
+      static Handle ref(bool direction, Contents k)
+      {
+        // Sentinel handle for the null key, in the same role as in
+        // `PagemapRep::TreeRep::ref`. Reads return 0. The tree's
+        // algorithm never writes through it, but the slot is real
+        // mutable storage, so an accidental write while debugging
+        // lands in `null_entry` rather than faulting.
+        static uintptr_t null_entry = 0;
+        if (SNMALLOC_UNLIKELY(k == 0))
+          return Handle{&null_entry, 0};
+        auto* node = unit_at<UnitIdx>(k);
+        return direction ? Handle{&node->word_one, MetaMask} :
+                           Handle{&node->word_two, 0};
+      }
+
+      static Contents get(Handle h)
+      {
+        return h.get();
+      }
+
+      static void set(Handle h, Contents v)
+      {
+        h = v; // Handle::operator= keeps the slot's meta bits
+      }
+
+      static bool is_red(Contents k)
+      {
+        if (k == 0)
+          return false; // the null key counts as black
+        return (unit_at<UnitIdx>(k)->word_one & RED_BIT) != 0;
+      }
+
+      static void set_red(Contents k, bool new_is_red)
+      {
+        auto* w = &unit_at<UnitIdx>(k)->word_one;
+        if (((*w & RED_BIT) != 0) != new_is_red)
+          *w ^= RED_BIT; // flip only when the colour changes
+        SNMALLOC_ASSERT(is_red(k) == new_is_red);
+      }
+
+      static bool compare(Contents k1, Contents k2)
+      {
+        return k1 > k2;
+      }
+
+      static bool equal(Contents k1, Contents k2)
+      {
+        return k1 == k2;
+      }
+
+      static uintptr_t printable(Contents k)
+      {
+        return k;
+      }
+
+      static uintptr_t printable(Handle h)
+      {
+        return h.printable_address(); // address of the slot, not its value
+      }
+
+      static const char* name()
+      {
+        return Name;
+      }
+    };
+
+    static constexpr char BIN_REP_NAME[] = "InplaceBinRep";
+    static constexpr char RANGE_REP_NAME[] = "InplaceRangeRep";
+
+  public:
+    using BinRep = TreeRep<0, BIN_META_MASK, BIN_REP_NAME>;
+    using RangeRep = TreeRep<1, RANGE_META_MASK, RANGE_REP_NAME>;
+
+    static ArenaVariant get_variant(uintptr_t addr)
+    {
+      auto w = unit_at<0>(addr)->word_one; // variant tag lives in unit 0
+      return static_cast<ArenaVariant>((w & VARIANT_MASK) >> VARIANT_SHIFT);
+    }
+
+    static void set_variant(uintptr_t addr, ArenaVariant v)
+    {
+      auto* w = &unit_at<0>(addr)->word_one; // variant tag lives in unit 0
+      *w = (*w & ~VARIANT_MASK) | (static_cast<uintptr_t>(v) << VARIANT_SHIFT);
+    }
+
+    /**
+     * Exact byte size for `Large` blocks, read from `word_one` of
+     * unit 2 (so the block must span at least 3 units). Stored
+     * uncompressed: unlike the pagemap word used by `PagemapRep`,
+     * the in-band word has no reserved low bits.
+     */
+    static size_t get_large_size(uintptr_t addr)
+    {
+      return static_cast<size_t>(unit_at<2>(addr)->word_one);
+    }
+
+    static void set_large_size(uintptr_t addr, size_t size)
+    {
+      SNMALLOC_ASSERT((size & (UNIT_SIZE - 1)) == 0); // whole units only
+      unit_at<2>(addr)->word_one = static_cast<uintptr_t>(size);
+    }
+
+    /**
+     * Refuse consolidation across `MIN_CHUNK_SIZE` boundaries.
+     * `SmallArenaRange::add_range_impl` splits incoming ranges at
+     * chunk boundaries, but does not eagerly merge across them on
+     * the wrapper side; this check is what stops `Arena` from
+     * later merging two adjacent intra-chunk fragments that meet
+     * at a chunk boundary, which would create a free block
+     * straddling chunks. A chunk-aligned `higher_addr` means the
+     * lower neighbour ends at a chunk boundary: return `false`.
+     * Any other `higher_addr` returns `true`.
+     */
+    static bool can_consolidate(uintptr_t higher_addr)
+    {
+      return (higher_addr & (MIN_CHUNK_SIZE - 1)) != 0;
+    }
+  };
+} // namespace snmalloc
diff --git a/src/snmalloc/backend_helpers/largearenarange.h b/src/snmalloc/backend_helpers/largearenarange.h
new file mode 100644
index 000000000..f53643bf4
--- /dev/null
+++ b/src/snmalloc/backend_helpers/largearenarange.h
@@ -0,0 +1,391 @@
+#pragma once
+
+#include "arena.h"
+#include "empty_range.h"
+#include "range_helpers.h"
+
+namespace snmalloc
+{
+  /**
+   * PagemapRep — Rep for `Arena` over a Pagemap.
+   *
+   * Each free block uses three pagemap entries at unit-aligned offsets:
+   *
+   *   Unit 0 (addr):                bin-tree node + variant tag.
+   *   Unit 1 (addr + UNIT_SIZE):    range-tree node (size >= 2 units).
+   *   Unit 2 (addr + 2*UNIT_SIZE):  large chunk count (size >= 3 units).
+   *
+   * Bit-layout decisions for tree nodes are private to this class. The
+   * pagemap reserves the low bits of each word for the meta-entry (see
+   * `MetaEntryBase::BACKEND_LAYOUT_FIRST_FREE_BIT`); the red bit, variant
+   * tag, and shifted large-chunk count all live at or above that bit:
+   * - Red bit (both trees) at `BACKEND_LAYOUT_FIRST_FREE_BIT`.
+   * - Variant tag (Word::One at unit 0) occupies 2 bits starting at
+   *   `BACKEND_LAYOUT_FIRST_FREE_BIT + 1`.
+   * - Large chunk count is stored in Word::One of unit 2 left-shifted by
+   *   `BACKEND_LAYOUT_FIRST_FREE_BIT`.
+   *
+   * `MIN_SIZE_BITS` is the log2 size of the allocation unit (= pagemap
+   * stride); the caller passes whatever unit it uses (snmalloc's global
+   * `MIN_CHUNK_BITS` in the in-tree pipeline).
+   * `MAX_SIZE_BITS` is the log2 of the (exclusive) upper bound on block
+   * size in bytes; used here only to verify that the largest chunk
+   * count fits in a shifted pagemap word.
+   */
+  template<
+    SNMALLOC_CONCEPT(IsWritablePagemap) Pagemap,
+    size_t MIN_SIZE_BITS,
+    size_t MAX_SIZE_BITS>
+  class PagemapRep
+  {
+    using Entry = typename Pagemap::Entry;
+
+    static constexpr uintptr_t UNIT_SIZE = uintptr_t(1) << MIN_SIZE_BITS;
+
+    // Bit positions inside a pagemap word. The reserved region (the
+    // sizeclass+offset bits on Word::Two, and META_BOUNDARY_BIT on
+    // Word::One) is owned by the meta-entry layout; tree-node and
+    // large-size encodings start at the first free bit above that
+    // reserved range — see
+    // `MetaEntryBase::BACKEND_LAYOUT_FIRST_FREE_BIT` in `mem/metadata.h`.
+    static constexpr unsigned RED_BIT_POS =
+      MetaEntryBase::BACKEND_LAYOUT_FIRST_FREE_BIT;
+    static constexpr unsigned VARIANT_SHIFT =
+      MetaEntryBase::BACKEND_LAYOUT_FIRST_FREE_BIT + 1;
+    static constexpr unsigned VARIANT_BITS = 2;
+
+    // Shift used to encode the large-size chunk count in Word::One of
+    // unit 2.
+    static constexpr size_t LARGE_SIZE_SHIFT =
+      MetaEntryBase::BACKEND_LAYOUT_FIRST_FREE_BIT;
+
+    static constexpr uintptr_t RED_BIT = uintptr_t(1) << RED_BIT_POS;
+    static constexpr uintptr_t VARIANT_MASK =
+      ((uintptr_t(1) << VARIANT_BITS) - 1) << VARIANT_SHIFT;
+    static constexpr uintptr_t BIN_META_MASK = RED_BIT | VARIANT_MASK;
+    static constexpr uintptr_t RANGE_META_MASK = RED_BIT;
+
+    static_assert(MAX_SIZE_BITS > MIN_SIZE_BITS);
+    static_assert(
+      (MAX_SIZE_BITS - MIN_SIZE_BITS) + LARGE_SIZE_SHIFT <= bits::BITS,
+      "Shifted large-size field must fit in a pagemap word.");
+    static_assert((RED_BIT & VARIANT_MASK) == 0);
+    static_assert(BIN_META_MASK < UNIT_SIZE);
+    static_assert(
+      Entry::is_backend_allowed_value(Entry::Word::One, BIN_META_MASK));
+    static_assert(
+      Entry::is_backend_allowed_value(
+        Entry::Word::Two, ~uintptr_t(UNIT_SIZE - 1)),
+      "RangeRep stores chunk-aligned addresses in Word::Two; the "
+      "markerless ownership discriminator requires their low "
+      "BACKEND_RESERVED_MASK_WORD_TWO bits to be zero. This asserts "
+      "that the reserved mask fits entirely below the chunk alignment, "
+      "so no chunk-aligned value (any bit set only at position "
+      ">= MIN_SIZE_BITS) can collide.");
+
+    using Word = typename Entry::Word;
+    using Handle = typename Entry::BackendStateWordRef;
+
+    template<size_t UnitIdx> // unit index within the block (0..2 here)
+    static Handle word_at(uintptr_t addr, Word w)
+    {
+      auto& entry = Pagemap::template get_metaentry_mut<false>(
+        address_cast(addr + UnitIdx * UNIT_SIZE));
+      return entry.get_backend_word(w); // handle to word `w` of that entry
+    }
+
+    /**
+     * Tree rep shared by `BinRep` and `RangeRep`. `UnitIdx` is the
+     * block-relative pagemap unit (0 or 1) holding this Rep's node;
+     * `MetaMask` covers the bits owned by this Rep (red + variant
+     * tag for `BinRep`, red only for `RangeRep`): `get` strips them,
+     * `set` preserves them. Direction `true` selects `Word::One`.
+     */
+    template<size_t UnitIdx, uintptr_t MetaMask, const char* Name>
+    struct TreeRep
+    {
+      using Handle = PagemapRep::Handle;
+      using Contents = uintptr_t;
+
+      static constexpr Contents null = 0; // null key as returned by get()
+      static constexpr Contents root = 0; // null key as stored in a slot
+
+      static Handle ref(bool direction, Contents k)
+      {
+        static const Contents null_entry = 0; // read-only null sentinel
+        if (SNMALLOC_UNLIKELY(k == 0))
+          return Handle{const_cast<Contents*>(&null_entry), 0};
+        return word_at<UnitIdx>(k, direction ? Word::One : Word::Two);
+      }
+
+      static Contents get(Handle h)
+      {
+        return h.get() & ~MetaMask; // hide this rep's meta bits
+      }
+
+      static void set(Handle h, Contents v)
+      {
+        h = v | (h.get() & MetaMask); // keep this rep's meta bits
+      }
+
+      static bool is_red(Contents k)
+      {
+        return (ref(true, k).get() & RED_BIT) == RED_BIT;
+      }
+
+      static void set_red(Contents k, bool new_is_red)
+      {
+        if (new_is_red != is_red(k))
+        {
+          auto h = ref(true, k);
+          h = h.get() ^ RED_BIT; // flip only when the colour changes
+        }
+        SNMALLOC_ASSERT(is_red(k) == new_is_red);
+      }
+
+      static bool compare(Contents k1, Contents k2)
+      {
+        return k1 > k2;
+      }
+
+      static bool equal(Contents k1, Contents k2)
+      {
+        return k1 == k2;
+      }
+
+      static uintptr_t printable(Contents k)
+      {
+        return k;
+      }
+
+      static uintptr_t printable(Handle h)
+      {
+        return h.printable_address();
+      }
+
+      static const char* name()
+      {
+        return Name;
+      }
+    };
+
+    static constexpr char BIN_REP_NAME[] = "PagemapBinRep";
+    static constexpr char RANGE_REP_NAME[] = "PagemapRangeRep";
+
+  public:
+    using BinRep = TreeRep<0, BIN_META_MASK, BIN_REP_NAME>;
+    using RangeRep = TreeRep<1, RANGE_META_MASK, RANGE_REP_NAME>;
+
+    static ArenaVariant get_variant(uintptr_t addr)
+    {
+      auto w = word_at<0>(addr, Word::One); // variant tag lives in unit 0
+      return static_cast<ArenaVariant>(
+        (w.get() & VARIANT_MASK) >> VARIANT_SHIFT);
+    }
+
+    static void set_variant(uintptr_t addr, ArenaVariant v)
+    {
+      auto w = word_at<0>(addr, Word::One); // variant tag lives in unit 0
+      w = (w.get() & ~VARIANT_MASK) |
+        (static_cast<uintptr_t>(v) << VARIANT_SHIFT);
+    }
+
+    static size_t get_large_size(uintptr_t addr)
+    {
+      // Stored as a count of 2^MIN_SIZE_BITS units shifted left by
+      // LARGE_SIZE_SHIFT so it fits a pagemap word (see the
+      // static_assert on LARGE_SIZE_SHIFT). Returns the byte size.
+      return (word_at<2>(addr, Word::One).get() >> LARGE_SIZE_SHIFT)
+        << MIN_SIZE_BITS;
+    }
+
+    static void set_large_size(uintptr_t addr, size_t size)
+    {
+      SNMALLOC_ASSERT((size & (UNIT_SIZE - 1)) == 0); // whole units only
+      word_at<2>(addr, Word::One) = (size >> MIN_SIZE_BITS) << LARGE_SIZE_SHIFT;
+    }
+
+    static bool can_consolidate(uintptr_t higher_addr)
+    {
+      auto& entry =
+        Pagemap::template get_metaentry_mut<false>(address_cast(higher_addr));
+      return !entry.is_boundary(); // no merge when the entry is a boundary
+    }
+  };
+
+  /**
+   * Range wrapper around Arena, presenting the standard
+   * Range interface for use in Pipe<...> compositions.
+   */
+  template<
+    size_t REFILL_SIZE_BITS,
+    size_t MAX_SIZE_BITS,
+    SNMALLOC_CONCEPT(IsWritablePagemap) Pagemap,
+    size_t MIN_REFILL_SIZE_BITS = 0>
+  class LargeArenaRange
+  {
+    static_assert(
+      REFILL_SIZE_BITS <= MAX_SIZE_BITS, "REFILL_SIZE_BITS > MAX_SIZE_BITS");
+    static_assert(
+      MIN_REFILL_SIZE_BITS <= REFILL_SIZE_BITS,
+      "MIN_REFILL_SIZE_BITS > REFILL_SIZE_BITS");
+
+    static constexpr size_t REFILL_SIZE = bits::one_at_bit(REFILL_SIZE_BITS);
+    static constexpr size_t MIN_REFILL_SIZE =
+      bits::one_at_bit(MIN_REFILL_SIZE_BITS);
+
+  public:
+    template<typename ParentRange = EmptyRange<>>
+    class Type : public ContainsParent<ParentRange>
+    {
+      using ContainsParent<ParentRange>::parent;
+
+      using PagemapRepT = PagemapRep<Pagemap, MIN_CHUNK_BITS, MAX_SIZE_BITS>;
+
+      Arena<PagemapRepT, MIN_CHUNK_BITS, MAX_SIZE_BITS> arena;
+      size_t requested_total = 0;
+
+      void parent_dealloc(uintptr_t addr, size_t size)
+      {
+        if constexpr (MAX_SIZE_BITS != (bits::BITS - 1))
+        {
+          auto base =
+            capptr::Arena<void>::unsafe_from(reinterpret_cast<void*>(addr));
+          parent.dealloc_range(base, size); // hand the range to the parent
+        }
+        else
+        {
+          SNMALLOC_CHECK_MSG(false, "Global range overflow should not happen");
+        }
+      }
+
+      void add_range(capptr::Arena<void> base, size_t length)
+      {
+        // Parent ranges (e.g. mmap-backed PalRange) may be page-aligned
+        // but not chunk-aligned: trim both ends inward to chunk
+        // boundaries. The trimmed edges are not tracked anywhere.
+        uintptr_t lo = bits::align_up(base.unsafe_uintptr(), MIN_CHUNK_SIZE);
+        uintptr_t hi =
+          bits::align_down(base.unsafe_uintptr() + length, MIN_CHUNK_SIZE);
+        if (lo >= hi)
+          return; // no whole chunk left after trimming
+        auto [ov_addr, ov_size] = arena.add_block(lo, hi - lo);
+        if (ov_addr != 0)
+          parent_dealloc(ov_addr, ov_size); // non-zero result goes upstream
+      }
+
+      capptr::Arena<void> refill(size_t size)
+      {
+        if (ParentRange::Aligned)
+        {
+          size_t refill_size = bits::min(REFILL_SIZE, requested_total);
+          refill_size = bits::max(refill_size, MIN_REFILL_SIZE);
+          refill_size = bits::max(refill_size, size);
+          refill_size = bits::next_pow2(refill_size); // parent sees a pow2
+
+          auto refill_range = parent.alloc_range(refill_size);
+          if (refill_range != nullptr)
+          {
+            requested_total += refill_size;
+            add_range(pointer_offset(refill_range, size), refill_size - size);
+          }
+          return refill_range; // first `size` bytes go to the caller
+        }
+
+        bool overflow = false;
+        size_t needed_size = bits::umul(size, 2, overflow); // 2 * size
+        if (overflow)
+        {
+          return nullptr;
+        }
+
+        auto refill_size = bits::max(needed_size, REFILL_SIZE);
+        while (needed_size <= refill_size)
+        {
+          auto refill_range = parent.alloc_range(refill_size);
+
+          if (refill_range != nullptr)
+          {
+            requested_total += refill_size;
+            add_range(refill_range, refill_size);
+
+            SNMALLOC_ASSERT(refill_size < bits::one_at_bit(MAX_SIZE_BITS));
+            static_assert(
+              (REFILL_SIZE < bits::one_at_bit(MAX_SIZE_BITS)) ||
+                ParentRange::Aligned,
+              "Required to prevent overflow.");
+
+            return alloc_range(size); // retry now the arena is stocked
+          }
+
+          refill_size >>= 1; // parent refused: retry with half as much
+        }
+
+        return nullptr;
+      }
+
+    public:
+      static constexpr bool Aligned = true;
+      static constexpr bool ConcurrencySafe = false;
+      using ChunkBounds = capptr::bounds::Arena;
+      static_assert(
+        stl::is_same_v<typename ParentRange::ChunkBounds, ChunkBounds>);
+
+      constexpr Type() = default;
+
+      /**
+       * True when `size` is at least `2^MAX_SIZE_BITS`, i.e. beyond
+       * what the arena can represent. Such requests bypass the arena:
+       * `alloc_range` forwards them to an `Aligned` parent (else
+       * returns null) and `dealloc_range` hands them to the parent.
+       * Same bound as `Arena::add_block`'s size precondition.
+       */
+      static constexpr bool is_too_large(size_t size)
+      {
+        return size >= bits::one_at_bit(MAX_SIZE_BITS);
+      }
+
+      capptr::Arena<void> alloc_range(size_t size)
+      {
+        SNMALLOC_ASSERT(size >= MIN_CHUNK_SIZE);
+        SNMALLOC_ASSERT((size & (MIN_CHUNK_SIZE - 1)) == 0); // whole chunks
+
+        if (is_too_large(size))
+        {
+          if (ParentRange::Aligned)
+            return parent.alloc_range(size);
+
+          return nullptr; // parent is not Aligned: refuse
+        }
+
+        uintptr_t addr = arena.remove_block(size);
+        if (addr != 0)
+        {
+          return capptr::Arena<void>::unsafe_from(
+            reinterpret_cast<void*>(addr));
+        }
+
+        return refill(size); // arena had nothing: pull from the parent
+      }
+
+      void dealloc_range(capptr::Arena<void> base, size_t size)
+      {
+        SNMALLOC_ASSERT(size >= MIN_CHUNK_SIZE);
+        SNMALLOC_ASSERT((size & (MIN_CHUNK_SIZE - 1)) == 0); // whole chunks
+
+        if constexpr (MAX_SIZE_BITS != (bits::BITS - 1))
+        {
+          if (is_too_large(size))
+          {
+            parent_dealloc(base.unsafe_uintptr(), size); // bypass the arena
+            return;
+          }
+        }
+
+        auto [ov_addr, ov_size] = arena.add_block(base.unsafe_uintptr(), size);
+        if (ov_addr != 0)
+          parent_dealloc(ov_addr, ov_size); // non-zero result goes upstream
+      }
+    };
+  };
+} // namespace snmalloc
diff --git a/src/snmalloc/backend_helpers/largebuddyrange.h b/src/snmalloc/backend_helpers/largebuddyrange.h
deleted file mode 100644
index 15324753f..000000000
--- a/src/snmalloc/backend_helpers/largebuddyrange.h
+++ /dev/null
@@ -1,393 +0,0 @@
-#pragma once
-
-#include "../ds/ds.h"
-#include "../mem/mem.h"
-#include "buddy.h"
-#include "empty_range.h"
-#include "range_helpers.h"
-
-namespace snmalloc
-{
-  /**
-   * Class for using the pagemap entries for the buddy allocator.
-   */
-  template<SNMALLOC_CONCEPT(IsWritablePagemap) Pagemap>
-  class BuddyChunkRep
-  {
-  public:
-    /*
-     * The values we store in our rbtree are the addresses of (combined spans
-     * of) chunks of the address space; as such, bits in (MIN_CHUNK_SIZE - 1)
-     * are unused and so the RED_BIT is packed therein.  However, in practice,
-     * these are not "just any" uintptr_t-s, but specifically the uintptr_t-s
-     * inside the Pagemap's BackendAllocator::Entry structures.
-     *
-     * The BackendAllocator::Entry provides us with helpers that guarantee that
-     * we use only the bits that we are allowed to.
-     * @{
-     */
-    using Handle = MetaEntryBase::BackendStateWordRef;
-    using Contents = uintptr_t;
-    ///@}
-
-    /**
-     * The bit that we will use to mark an entry as red.
-     * This has constraints in two directions, it must not be one of the
-     * reserved bits from the perspective of the meta entry and it must not be
-     * a bit that is a valid part of the address of a chunk.
-     * @{
-     */
-    static constexpr address_t RED_BIT = 1 << 8;
-
-    static_assert(RED_BIT < MIN_CHUNK_SIZE);
-    static_assert(MetaEntryBase::is_backend_allowed_value(
-      MetaEntryBase::Word::One, RED_BIT));
-    static_assert(MetaEntryBase::is_backend_allowed_value(
-      MetaEntryBase::Word::Two, RED_BIT));
-    ///@}
-
-    /// The value of a null node, as returned by `get`
-    static constexpr Contents null = 0;
-    /// The value of a null node, as stored in a `uintptr_t`.
-    static constexpr Contents root = 0;
-
-    /**
-     * Set the value.  Preserve the red/black colour.
-     */
-    static void set(Handle ptr, Contents r)
-    {
-      ptr = r | (static_cast<address_t>(ptr.get()) & RED_BIT);
-    }
-
-    /**
-     * Returns the value, stripping out the red/black colour.
-     */
-    static Contents get(const Handle ptr)
-    {
-      return ptr.get() & ~RED_BIT;
-    }
-
-    /**
-     * Returns a pointer to the tree node for the specified address.
-     */
-    static Handle ref(bool direction, Contents k)
-    {
-      // Special case for accessing the null entry.  We want to make sure
-      // that this is never modified by the back end, so we make it point to
-      // a constant entry and use the MMU to trap even in release modes.
-      static const Contents null_entry = 0;
-      if (SNMALLOC_UNLIKELY(address_cast(k) == 0))
-      {
-        return {const_cast<Contents*>(&null_entry)};
-      }
-      auto& entry = Pagemap::template get_metaentry_mut<false>(address_cast(k));
-      if (direction)
-        return entry.get_backend_word(Pagemap::Entry::Word::One);
-
-      return entry.get_backend_word(Pagemap::Entry::Word::Two);
-    }
-
-    static bool is_red(Contents k)
-    {
-      return (ref(true, k).get() & RED_BIT) == RED_BIT;
-    }
-
-    static void set_red(Contents k, bool new_is_red)
-    {
-      if (new_is_red != is_red(k))
-      {
-        auto v = ref(true, k);
-        v = v.get() ^ RED_BIT;
-      }
-      SNMALLOC_ASSERT(is_red(k) == new_is_red);
-    }
-
-    static Contents offset(Contents k, size_t size)
-    {
-      return k + size;
-    }
-
-    static Contents buddy(Contents k, size_t size)
-    {
-      return k ^ size;
-    }
-
-    static Contents align_down(Contents k, size_t size)
-    {
-      return k & ~(size - 1);
-    }
-
-    static bool compare(Contents k1, Contents k2)
-    {
-      return k1 > k2;
-    }
-
-    static bool equal(Contents k1, Contents k2)
-    {
-      return k1 == k2;
-    }
-
-    static uintptr_t printable(Contents k)
-    {
-      return k;
-    }
-
-    /**
-     * Convert the pointer wrapper into something that the snmalloc debug
-     * printing code can print.
-     */
-    static address_t printable(Handle k)
-    {
-      return k.printable_address();
-    }
-
-    /**
-     * Returns the name for use in debugging traces.  Not used in normal builds
-     * (release or debug), only when tracing is enabled.
-     */
-    static const char* name()
-    {
-      return "BuddyChunkRep";
-    }
-
-    static bool can_consolidate(Contents k, size_t size)
-    {
-      // Need to know both entries exist in the pagemap.
-      // This must only be called if that has already been
-      // ascertained.
-      // The buddy could be in a part of the pagemap that has
-      // not been registered and thus could segfault on access.
-      auto larger = bits::max(k, buddy(k, size));
-      auto& entry =
-        Pagemap::template get_metaentry_mut<false>(address_cast(larger));
-      return !entry.is_boundary();
-    }
-  };
-
-  /**
-   * Used to represent a consolidating range of memory.  Uses a buddy allocator
-   * to consolidate adjacent blocks.
-   *
-   * ParentRange - Represents the range to get memory from to fill this range.
-   *
-   * REFILL_SIZE_BITS - Maximum size of a refill, may ask for less during warm
-   * up phase.
-   *
-   * MAX_SIZE_BITS - Maximum size that this range will store.
-   *
-   * Pagemap - How to access the pagemap, which is used to store the red black
-   * tree nodes for the buddy allocators.
-   *
-   * MIN_REFILL_SIZE_BITS - The minimum size that the ParentRange can be asked
-   * for
-   */
-  template<
-    size_t REFILL_SIZE_BITS,
-    size_t MAX_SIZE_BITS,
-    SNMALLOC_CONCEPT(IsWritablePagemap) Pagemap,
-    size_t MIN_REFILL_SIZE_BITS = 0>
-  class LargeBuddyRange
-  {
-    static_assert(
-      REFILL_SIZE_BITS <= MAX_SIZE_BITS, "REFILL_SIZE_BITS > MAX_SIZE_BITS");
-    static_assert(
-      MIN_REFILL_SIZE_BITS <= REFILL_SIZE_BITS,
-      "MIN_REFILL_SIZE_BITS > REFILL_SIZE_BITS");
-
-    /**
-     * Maximum size of a refill
-     */
-    static constexpr size_t REFILL_SIZE = bits::one_at_bit(REFILL_SIZE_BITS);
-
-    /**
-     * Minimum size of a refill
-     */
-    static constexpr size_t MIN_REFILL_SIZE =
-      bits::one_at_bit(MIN_REFILL_SIZE_BITS);
-
-  public:
-    template<typename ParentRange = EmptyRange<>>
-    class Type : public ContainsParent<ParentRange>
-    {
-      using ContainsParent<ParentRange>::parent;
-
-      /**
-       * The size of memory requested so far.
-       *
-       * This is used to determine the refill size.
-       */
-      size_t requested_total = 0;
-
-      /**
-       * Buddy allocator used to represent this range of memory.
-       */
-      Buddy<BuddyChunkRep<Pagemap>, MIN_CHUNK_BITS, MAX_SIZE_BITS> buddy_large;
-
-      /**
-       * The parent might not support deallocation if this buddy allocator
-       * covers the whole range.  Uses template insanity to make this work.
-       */
-      template<bool exists = MAX_SIZE_BITS != (bits::BITS - 1)>
-      stl::enable_if_t<exists>
-      parent_dealloc_range(capptr::Arena<void> base, size_t size)
-      {
-        static_assert(
-          MAX_SIZE_BITS != (bits::BITS - 1), "Don't set SFINAE parameter");
-        parent.dealloc_range(base, size);
-      }
-
-      void dealloc_overflow(capptr::Arena<void> overflow)
-      {
-        if constexpr (MAX_SIZE_BITS != (bits::BITS - 1))
-        {
-          if (overflow != nullptr)
-          {
-            parent.dealloc_range(overflow, bits::one_at_bit(MAX_SIZE_BITS));
-          }
-        }
-        else
-        {
-          if (overflow != nullptr)
-            abort();
-        }
-      }
-
-      /**
-       * Add a range of memory to the address space.
-       * Divides blocks into power of two sizes with natural alignment
-       */
-      void add_range(capptr::Arena<void> base, size_t length)
-      {
-        range_to_pow_2_blocks<MIN_CHUNK_BITS>(
-          base, length, [this](capptr::Arena<void> base, size_t align, bool) {
-            auto overflow =
-              capptr::Arena<void>::unsafe_from(reinterpret_cast<void*>(
-                buddy_large.add_block(base.unsafe_uintptr(), align)));
-
-            dealloc_overflow(overflow);
-          });
-      }
-
-      capptr::Arena<void> refill(size_t size)
-      {
-        if (ParentRange::Aligned)
-        {
-          // Use amount currently requested to determine refill size.
-          // This will gradually increase the usage of the parent range.
-          // So small examples can grow local caches slowly, and larger
-          // examples will grow them by the refill size.
-          //
-          // The heuristic is designed to allocate the following sequence for
-          // 16KiB requests 16KiB, 16KiB, 32Kib, 64KiB, ..., REFILL_SIZE/2,
-          // REFILL_SIZE, REFILL_SIZE, ... Hence if this if they are coming from
-          // a contiguous aligned range, then they could be consolidated.  This
-          // depends on the ParentRange behaviour.
-          size_t refill_size = bits::min(REFILL_SIZE, requested_total);
-          refill_size = bits::max(refill_size, MIN_REFILL_SIZE);
-          refill_size = bits::max(refill_size, size);
-          refill_size = bits::next_pow2(refill_size);
-
-          auto refill_range = parent.alloc_range(refill_size);
-          if (refill_range != nullptr)
-          {
-            requested_total += refill_size;
-            add_range(pointer_offset(refill_range, size), refill_size - size);
-          }
-          return refill_range;
-        }
-
-        // Note the unaligned parent path does not use
-        // requested_total in the heuristic for the initial size
-        // this is because the request needs to introduce alignment.
-        // Currently the unaligned variant is not used as a local cache.
-        // So the gradual growing of refill_size is not needed.
-
-        // Need to overallocate to get the alignment right.
-        bool overflow = false;
-        size_t needed_size = bits::umul(size, 2, overflow);
-        if (overflow)
-        {
-          return nullptr;
-        }
-
-        auto refill_size = bits::max(needed_size, REFILL_SIZE);
-        while (needed_size <= refill_size)
-        {
-          auto refill = parent.alloc_range(refill_size);
-
-          if (refill != nullptr)
-          {
-            requested_total += refill_size;
-            add_range(refill, refill_size);
-
-            SNMALLOC_ASSERT(refill_size < bits::one_at_bit(MAX_SIZE_BITS));
-            static_assert(
-              (REFILL_SIZE < bits::one_at_bit(MAX_SIZE_BITS)) ||
-                ParentRange::Aligned,
-              "Required to prevent overflow.");
-
-            return alloc_range(size);
-          }
-
-          refill_size >>= 1;
-        }
-
-        return nullptr;
-      }
-
-    public:
-      static constexpr bool Aligned = true;
-
-      static constexpr bool ConcurrencySafe = false;
-
-      /* The large buddy allocator always deals in Arena-bounded pointers. */
-      using ChunkBounds = capptr::bounds::Arena;
-      static_assert(
-        stl::is_same_v<typename ParentRange::ChunkBounds, ChunkBounds>);
-
-      constexpr Type() = default;
-
-      capptr::Arena<void> alloc_range(size_t size)
-      {
-        SNMALLOC_ASSERT(size >= MIN_CHUNK_SIZE);
-        SNMALLOC_ASSERT(bits::is_pow2(size));
-
-        if (size >= bits::mask_bits(MAX_SIZE_BITS))
-        {
-          if (ParentRange::Aligned)
-            return parent.alloc_range(size);
-
-          return nullptr;
-        }
-
-        auto result = capptr::Arena<void>::unsafe_from(
-          reinterpret_cast<void*>(buddy_large.remove_block(size)));
-
-        if (result != nullptr)
-          return result;
-
-        return refill(size);
-      }
-
-      void dealloc_range(capptr::Arena<void> base, size_t size)
-      {
-        SNMALLOC_ASSERT(size >= MIN_CHUNK_SIZE);
-        SNMALLOC_ASSERT(bits::is_pow2(size));
-
-        if constexpr (MAX_SIZE_BITS != (bits::BITS - 1))
-        {
-          if (size >= bits::mask_bits(MAX_SIZE_BITS))
-          {
-            parent_dealloc_range(base, size);
-            return;
-          }
-        }
-
-        auto overflow =
-          capptr::Arena<void>::unsafe_from(reinterpret_cast<void*>(
-            buddy_large.add_block(base.unsafe_uintptr(), size)));
-        dealloc_overflow(overflow);
-      }
-    };
-  };
-} // namespace snmalloc
diff --git a/src/snmalloc/backend_helpers/smallarenarange.h b/src/snmalloc/backend_helpers/smallarenarange.h
new file mode 100644
index 000000000..f5820c8f0
--- /dev/null
+++ b/src/snmalloc/backend_helpers/smallarenarange.h
@@ -0,0 +1,166 @@
+#pragma once
+
+#include "../pal/pal.h"
+#include "arena.h"
+#include "empty_range.h"
+#include "inplacerep.h"
+#include "range_helpers.h"
+
+namespace snmalloc
+{
+  /**
+   * Small-grained range backed by `Arena` with in-band
+   * (`InplaceRep`) tree-node storage. Serves blocks of any
+   * unit-aligned size — not restricted to powers of two — for
+   * `SlabMetadata` allocations.
+   *
+   * Each arena instance covers exactly one chunk
+   * (`MAX_SIZE_BITS = MIN_CHUNK_BITS`): refill takes one chunk
+   * from the parent, sub-chunk fragments live in the arena,
+   * consolidated whole chunks flow back to the parent.
+   */
+  template<typename Authmap>
+  struct SmallArenaRange
+  {
+    template<typename ParentRange = EmptyRange<>>
+    class Type : public ContainsParent<ParentRange>
+    {
+    public:
+      using ChunkBounds = typename ParentRange::ChunkBounds;
+
+    private:
+      using ContainsParent<ParentRange>::parent;
+
+      using RepT = InplaceRep<Authmap, ChunkBounds>;
+      static constexpr size_t MIN_BITS = RepT::MIN_BITS;
+
+      Arena<RepT, MIN_BITS, MIN_CHUNK_BITS> arena;
+
+    public:
+      static constexpr size_t UNIT_SIZE = RepT::UNIT_SIZE;
+
+    private:
+      /**
+       * Split `[base, base+length)` at chunk boundaries.
+       * Intra-chunk fragments are unit-trimmed and submitted to
+       * the arena; segments that begin and end chunk-aligned go
+       * to the parent. Accepts arbitrary unaligned input —
+       * `dealloc_meta_data` forwards `make()`'s unaligned spare
+       * here; sub-unit edges are discarded by design.
+       */
+      void add_range_impl(CapPtr<void, ChunkBounds> base, size_t length)
+      {
+        uintptr_t lo = base.unsafe_uintptr();
+        uintptr_t hi = lo + length;
+
+        while (lo < hi)
+        {
+          uintptr_t chunk_end = bits::align_up(lo + 1, MIN_CHUNK_SIZE);
+          uintptr_t seg_end = bits::min(hi, chunk_end);
+
+          if (
+            lo == bits::align_down(lo, MIN_CHUNK_SIZE) && seg_end == chunk_end)
+          {
+            auto chunk_base = CapPtr<void, ChunkBounds>::unsafe_from(
+              reinterpret_cast<void*>(lo));
+            parent.dealloc_range(chunk_base, MIN_CHUNK_SIZE);
+          }
+          else
+          {
+            uintptr_t f_lo = bits::align_up(lo, UNIT_SIZE);
+            uintptr_t f_hi = bits::align_down(seg_end, UNIT_SIZE);
+            if (f_lo < f_hi)
+            {
+              auto [ov_a, ov_s] = arena.add_block(f_lo, f_hi - f_lo);
+              if (ov_a != 0)
+              {
+                // Arena consolidated up to MAX_SIZE_BITS = chunk:
+                // hand the whole-chunk piece back to the parent.
+                auto ov_base = CapPtr<void, ChunkBounds>::unsafe_from(
+                  reinterpret_cast<void*>(ov_a));
+                parent.dealloc_range(ov_base, ov_s);
+              }
+            }
+          }
+
+          lo = seg_end;
+        }
+      }
+
+      CapPtr<void, ChunkBounds> refill(size_t size)
+      {
+        auto refill_range = parent.alloc_range(MIN_CHUNK_SIZE);
+        if (refill_range == nullptr)
+          return nullptr;
+
+        add_range_impl(
+          pointer_offset(refill_range, size), MIN_CHUNK_SIZE - size);
+
+        return refill_range;
+      }
+
+    public:
+      static constexpr bool Aligned = true;
+      static_assert(ParentRange::Aligned, "ParentRange must be aligned");
+
+      static constexpr bool ConcurrencySafe = false;
+
+      constexpr Type() = default;
+
+      CapPtr<void, ChunkBounds> alloc_range(size_t size)
+      {
+        SNMALLOC_ASSERT((size & (UNIT_SIZE - 1)) == 0);
+
+        if (size >= MIN_CHUNK_SIZE)
+          return parent.alloc_range(size);
+
+        uintptr_t a = arena.remove_block(size);
+        if (a != 0)
+          return CapPtr<void, ChunkBounds>::unsafe_from(
+            reinterpret_cast<void*>(a));
+
+        return refill(size);
+      }
+
+      /**
+       * Allocate `align`-aligned space large enough for `size`,
+       * donating the unit-aligned tail back to the arena.
+       *
+       * Requests `requested = align_up(size, align)` bytes; because
+       * `align` is pow2 and `requested` is a multiple of `align`,
+       * `Arena`'s carve returns an `align`-aligned base
+       * without a caller-side over-allocate-and-trim. The tail
+       * `[align_up(size, UNIT_SIZE), requested)` is donated via
+       * `add_range_impl`. The sub-unit slice
+       * `[size, align_up(size, UNIT_SIZE))` cannot be represented
+       * and is leaked — pre-round `size` to `UNIT_SIZE` to avoid it.
+       */
+      CapPtr<void, ChunkBounds> alloc_size_with_align(size_t size, size_t align)
+      {
+        SNMALLOC_ASSERT(size > 0);
+        SNMALLOC_ASSERT(bits::is_pow2(align));
+        SNMALLOC_ASSERT(align >= UNIT_SIZE);
+        SNMALLOC_ASSERT(align <= MIN_CHUNK_SIZE);
+
+        size_t requested = bits::align_up(size, align);
+        auto p = alloc_range(requested);
+        if (p == nullptr)
+          return nullptr;
+
+        size_t used = bits::align_up(size, UNIT_SIZE);
+        if (used < requested)
+        {
+          add_range_impl(pointer_offset(p, used), requested - used);
+        }
+
+        return p;
+      }
+
+      // No precondition on `size`: `add_range_impl` drops sub-unit edges.
+      void dealloc_range(CapPtr<void, ChunkBounds> base, size_t size)
+      {
+        add_range_impl(base, size);
+      }
+    };
+  };
+} // namespace snmalloc
diff --git a/src/snmalloc/backend_helpers/smallbuddyrange.h b/src/snmalloc/backend_helpers/smallbuddyrange.h
deleted file mode 100644
index 6f8400e83..000000000
--- a/src/snmalloc/backend_helpers/smallbuddyrange.h
+++ /dev/null
@@ -1,252 +0,0 @@
-#pragma once
-
-#include "../pal/pal.h"
-#include "empty_range.h"
-#include "range_helpers.h"
-
-namespace snmalloc
-{
-  /**
-   * struct for representing the redblack nodes
-   * directly inside the meta data.
-   */
-  template<SNMALLOC_CONCEPT(capptr::IsBound) bounds>
-  struct FreeChunk
-  {
-    CapPtr<FreeChunk, bounds> left;
-    CapPtr<FreeChunk, bounds> right;
-  };
-
-  /**
-   * Class for using the allocations own space to store in the RBTree.
-   */
-  template<SNMALLOC_CONCEPT(capptr::IsBound) bounds>
-  class BuddyInplaceRep
-  {
-  public:
-    using Handle = CapPtr<FreeChunk<bounds>, bounds>*;
-    using Contents = CapPtr<FreeChunk<bounds>, bounds>;
-
-    static constexpr Contents null = nullptr;
-    static constexpr Contents root = nullptr;
-
-    static constexpr address_t MASK = 1;
-
-    static void set(Handle ptr, Contents r)
-    {
-      SNMALLOC_ASSERT((address_cast(r) & MASK) == 0);
-      if (r == nullptr)
-        *ptr = CapPtr<FreeChunk<bounds>, bounds>::unsafe_from(
-          reinterpret_cast<FreeChunk<bounds>*>((*ptr).unsafe_uintptr() & MASK));
-      else
-        // Preserve lower bit.
-        *ptr = pointer_offset(r, (address_cast(*ptr) & MASK))
-                 .template as_static<FreeChunk<bounds>>();
-    }
-
-    static Contents get(Handle ptr)
-    {
-      return pointer_align_down<2, FreeChunk<bounds>>((*ptr).as_void());
-    }
-
-    static Handle ref(bool direction, Contents r)
-    {
-      if (direction)
-        return &r->left;
-
-      return &r->right;
-    }
-
-    static bool is_red(Contents k)
-    {
-      if (k == nullptr)
-        return false;
-      return (address_cast(*ref(false, k)) & MASK) == MASK;
-    }
-
-    static void set_red(Contents k, bool new_is_red)
-    {
-      if (new_is_red != is_red(k))
-      {
-        auto r = ref(false, k);
-        auto old_addr = pointer_align_down<2, FreeChunk<bounds>>(r->as_void());
-
-        if (new_is_red)
-        {
-          if (old_addr == nullptr)
-            *r = CapPtr<FreeChunk<bounds>, bounds>::unsafe_from(
-              reinterpret_cast<FreeChunk<bounds>*>(MASK));
-          else
-            *r = pointer_offset(old_addr, MASK)
-                   .template as_static<FreeChunk<bounds>>();
-        }
-        else
-        {
-          *r = old_addr;
-        }
-        SNMALLOC_ASSERT(is_red(k) == new_is_red);
-      }
-    }
-
-    static Contents offset(Contents k, size_t size)
-    {
-      return pointer_offset(k, size).template as_static<FreeChunk<bounds>>();
-    }
-
-    static Contents buddy(Contents k, size_t size)
-    {
-      // This is just doing xor size, but with what API
-      // exists on capptr.
-      auto base = pointer_align_down<FreeChunk<bounds>>(k.as_void(), size * 2);
-      auto offset = (address_cast(k) & size) ^ size;
-      return pointer_offset(base, offset)
-        .template as_static<FreeChunk<bounds>>();
-    }
-
-    static Contents align_down(Contents k, size_t size)
-    {
-      return pointer_align_down<FreeChunk<bounds>>(k.as_void(), size);
-    }
-
-    static bool compare(Contents k1, Contents k2)
-    {
-      return address_cast(k1) > address_cast(k2);
-    }
-
-    static bool equal(Contents k1, Contents k2)
-    {
-      return address_cast(k1) == address_cast(k2);
-    }
-
-    static address_t printable(Contents k)
-    {
-      return address_cast(k);
-    }
-
-    /**
-     * Return the holder in some format suitable for printing by snmalloc's
-     * debug log mechanism.  Used only when used in tracing mode, not normal
-     * debug or release builds. Raw pointers are printable already, so this is
-     * the identity function.
-     */
-    static Handle printable(Handle k)
-    {
-      return k;
-    }
-
-    /**
-     * Return a name for use in tracing mode.  Unused in any other context.
-     */
-    static const char* name()
-    {
-      return "BuddyInplaceRep";
-    }
-
-    static bool can_consolidate(Contents k, size_t size)
-    {
-      UNUSED(k, size);
-      return true;
-    }
-  };
-
-  struct SmallBuddyRange
-  {
-    template<typename ParentRange = EmptyRange<>>
-    class Type : public ContainsParent<ParentRange>
-    {
-    public:
-      using ChunkBounds = typename ParentRange::ChunkBounds;
-
-    private:
-      using ContainsParent<ParentRange>::parent;
-
-      static constexpr size_t MIN_BITS =
-        bits::next_pow2_bits_const(sizeof(FreeChunk<ChunkBounds>));
-
-      Buddy<BuddyInplaceRep<ChunkBounds>, MIN_BITS, MIN_CHUNK_BITS> buddy_small;
-
-      /**
-       * Add a range of memory to the address space.
-       * Divides blocks into power of two sizes with natural alignment
-       */
-      void add_range(CapPtr<void, ChunkBounds> base, size_t length)
-      {
-        range_to_pow_2_blocks<MIN_BITS>(
-          base,
-          length,
-          [this](CapPtr<void, ChunkBounds> base, size_t align, bool) {
-            if (align < MIN_CHUNK_SIZE)
-            {
-              CapPtr<void, ChunkBounds> overflow =
-                buddy_small
-                  .add_block(
-                    base.template as_reinterpret<FreeChunk<ChunkBounds>>(),
-                    align)
-                  .template as_reinterpret<void>();
-              if (overflow != nullptr)
-                parent.dealloc_range(
-                  overflow, bits::one_at_bit(MIN_CHUNK_BITS));
-            }
-            else
-            {
-              parent.dealloc_range(base, align);
-            }
-          });
-      }
-
-      CapPtr<void, ChunkBounds> refill(size_t size)
-      {
-        auto refill = parent.alloc_range(MIN_CHUNK_SIZE);
-
-        if (refill != nullptr)
-          add_range(pointer_offset(refill, size), MIN_CHUNK_SIZE - size);
-
-        return refill;
-      }
-
-    public:
-      static constexpr bool Aligned = true;
-      static_assert(ParentRange::Aligned, "ParentRange must be aligned");
-
-      static constexpr bool ConcurrencySafe = false;
-
-      constexpr Type() = default;
-
-      CapPtr<void, ChunkBounds> alloc_range(size_t size)
-      {
-        if (size >= MIN_CHUNK_SIZE)
-          return parent.alloc_range(size);
-
-        auto result = buddy_small.remove_block(size);
-        if (result != nullptr)
-        {
-          result->left = nullptr;
-          result->right = nullptr;
-          return result.template as_reinterpret<void>();
-        }
-        return refill(size);
-      }
-
-      CapPtr<void, ChunkBounds> alloc_range_with_leftover(size_t size)
-      {
-        auto rsize = bits::next_pow2(size);
-
-        auto result = alloc_range(rsize);
-
-        if (result == nullptr)
-          return nullptr;
-
-        auto remnant = pointer_offset(result, size);
-
-        add_range(remnant, rsize - size);
-
-        return result.template as_reinterpret<void>();
-      }
-
-      void dealloc_range(CapPtr<void, ChunkBounds> base, size_t size)
-      {
-        add_range(base, size);
-      }
-    };
-  };
-} // namespace snmalloc
diff --git a/src/snmalloc/backend_helpers/staticconditionalrange.h b/src/snmalloc/backend_helpers/staticconditionalrange.h
index 682c2f1fb..f5d46441b 100644
--- a/src/snmalloc/backend_helpers/staticconditionalrange.h
+++ b/src/snmalloc/backend_helpers/staticconditionalrange.h
@@ -10,8 +10,8 @@ namespace snmalloc
   {
     // This is a range that can bypass the OptionalRange if it is disabled.
     // Disabling is global, and not local.
-    // This is used to allow disabling thread local buddy allocators when the
-    // initial fixed size heap is small.
+    // This is used to allow disabling the thread-local cache range when
+    // the initial fixed-size heap is small.
     //
     // The range builds a more complex parent
     //    Pipe<ParentRange, OptionalRange>
diff --git a/src/snmalloc/ds/pagemap.h b/src/snmalloc/ds/pagemap.h
index 2ee3cdd29..983b82e83 100644
--- a/src/snmalloc/ds/pagemap.h
+++ b/src/snmalloc/ds/pagemap.h
@@ -343,17 +343,24 @@ namespace snmalloc
             PAL::error("Internal error: Pagemap read access out of range.");
           }
         }
-        p = p - base;
       }
 
       //  If this is potentially_out_of_range, then the pages will not have
       //  been mapped. With Lazy commit they will at least be mapped read-only
       //  Note that: this means external pointer on Windows will be slow.
+      //  register_range takes an unadjusted address: it does its own
+      //  base-relative arithmetic when has_bounds, so it must be called
+      //  before the p = p - base adjustment below.
       if constexpr (potentially_out_of_range && !pal_supports<LazyCommit, PAL>)
       {
         register_range(p, 1);
       }
 
+      if constexpr (has_bounds)
+      {
+        p = p - base;
+      }
+
       if constexpr (potentially_out_of_range)
         return body_opt[p >> SHIFT];
       else
diff --git a/src/snmalloc/ds/sizeclasstable.h b/src/snmalloc/ds/sizeclasstable.h
index 5db3cb5fa..df173bef5 100644
--- a/src/snmalloc/ds/sizeclasstable.h
+++ b/src/snmalloc/ds/sizeclasstable.h
@@ -17,31 +17,64 @@ namespace snmalloc
 {
   using chunksizeclass_t = size_t;
 
-  // Large classes range from [MAX_SMALL_SIZECLASS_SIZE, ADDRESS_SPACE).
+  // Capped to `bits::BITS - 1` so `MAX_LARGE_SIZECLASS_SIZE` fits in
+  // `size_t` on 32-bit platforms where `DefaultPal::address_bits ==
+  // bits::BITS`.
+  constexpr size_t ENCODED_ADDRESS_BITS =
+    bits::min(DefaultPal::address_bits, bits::BITS - 1);
+
+  // Large classes follow on directly from small classes in the global
+  // exp+mantissa scheme: `(ENCODED_ADDRESS_BITS - MAX_SMALL_SIZECLASS_BITS)`
+  // mantissa cycles of `2^INTERMEDIATE_BITS` entries each.
   constexpr size_t NUM_LARGE_CLASSES =
-    DefaultPal::address_bits - MAX_SMALL_SIZECLASS_BITS;
+    (ENCODED_ADDRESS_BITS - MAX_SMALL_SIZECLASS_BITS) << INTERMEDIATE_BITS;
 
-  // How many bits are required to represent either a large or a small
-  // sizeclass.
-  constexpr size_t TAG_SIZECLASS_BITS = bits::max<size_t>(
-    bits::next_pow2_bits_const(NUM_SMALL_SIZECLASSES),
-    bits::next_pow2_bits_const(NUM_LARGE_CLASSES + 1));
+  // Slot 0 of the table is reserved as the unmapped sentinel, hence +1.
+  constexpr size_t SIZECLASS_BITS =
+    bits::next_pow2_bits_const(1 + NUM_SMALL_SIZECLASSES + NUM_LARGE_CLASSES);
 
-  // Number of bits required to represent a tagged sizeclass that can be
-  // either small or large.
-  constexpr size_t SIZECLASS_REP_SIZE =
-    bits::one_at_bit(TAG_SIZECLASS_BITS + 1);
+  constexpr size_t SIZECLASS_REP_SIZE = bits::one_at_bit(SIZECLASS_BITS);
+
+  // Width of the per-chunk slab-offset field packed immediately above the
+  // sizeclass in `ras`. The worst-case slab count for any non-pow2 large
+  // class with `INTERMEDIATE_BITS = M` is `2^(M+1)`; `M + 1` bits cover
+  // the maximum index. `compute_max_large_slab_index` static_asserts the
+  // bound against the actual table below.
+  constexpr size_t OFFSET_BITS = INTERMEDIATE_BITS + 1;
+
+  // `ras & COMBINED_MASK` directly indexes the `(sizeclass, offset)` table
+  // row, which already carries `offset_bytes = offset * slab_size`.
+  constexpr size_t COMBINED_BITS = SIZECLASS_BITS + OFFSET_BITS;
+  constexpr size_t COMBINED_REP_SIZE = bits::one_at_bit(COMBINED_BITS);
+
+  // Largest size representable by the uniform sizeclass encoding;
+  // requests larger than this must be failed before
+  // `size_to_sizeclass_full`.
+  constexpr size_t MAX_LARGE_SIZECLASS_SIZE =
+    bits::from_exp_mant<INTERMEDIATE_BITS, MIN_ALLOC_STEP_BITS>(
+      NUM_SMALL_SIZECLASSES + NUM_LARGE_CLASSES - 1);
+
+  static_assert(
+    MAX_LARGE_SIZECLASS_SIZE == bits::one_at_bit(ENCODED_ADDRESS_BITS),
+    "MAX_LARGE_SIZECLASS_SIZE must equal 2 ^ ENCODED_ADDRESS_BITS; if this "
+    "fails, the exp+mantissa math does not match NUM_LARGE_CLASSES.");
+  static_assert(
+    ENCODED_ADDRESS_BITS > MAX_SMALL_SIZECLASS_BITS,
+    "ENCODED_ADDRESS_BITS must exceed MAX_SMALL_SIZECLASS_BITS so the large "
+    "range is non-empty.");
 
   /**
-   * Encapsulates a tagged union of large and small sizeclasses.
+   * Sizeclass identifier shared by small and large allocations:
+   *
+   *   value == 0                              : sentinel (unmapped)
+   *   value ∈ [1, 1 + NUM_SMALL_SIZECLASSES)  : small, sc = value - 1
+   *   value ∈ [1 + NUM_SMALL_SIZECLASSES, ...): large
    *
-   * Used in various lookup tables to make efficient code that handles
-   * all objects allocated by snmalloc.
+   * Indexes `sizeclass_metadata` directly; slot 0 holds sentinel data so
+   * the sentinel flows through fast-path lookups without a branch.
    */
   class sizeclass_t
   {
-    static constexpr size_t TAG = bits::one_at_bit(TAG_SIZECLASS_BITS);
-
     size_t value{0};
 
     constexpr sizeclass_t(size_t value) : value(value) {}
@@ -51,20 +84,19 @@ namespace snmalloc
 
     static constexpr sizeclass_t from_small_class(smallsizeclass_t sc)
     {
-      SNMALLOC_ASSERT(sc < TAG);
-      // Note could use `+` or `|`.  Using `+` as will combine nicely with array
-      // offset.
-      return {TAG + sc};
+      SNMALLOC_ASSERT(sc < NUM_SMALL_SIZECLASSES);
+      return {sc + 1};
     }
 
     /**
-     * Takes the number of leading zero bits from the actual large size-1.
-     * See size_to_sizeclass_full
+     * Construct from a large class index `large_class` in
+     * [0, NUM_LARGE_CLASSES). Large classes are stored as a contiguous run
+     * immediately after the sentinel slot and the small range.
      */
     static constexpr sizeclass_t from_large_class(size_t large_class)
     {
-      SNMALLOC_ASSERT(large_class < TAG);
-      return {large_class};
+      SNMALLOC_ASSERT(large_class < NUM_LARGE_CLASSES);
+      return {1 + NUM_SMALL_SIZECLASSES + large_class};
     }
 
     static constexpr sizeclass_t from_raw(size_t raw)
@@ -72,21 +104,16 @@ namespace snmalloc
       return {raw};
     }
 
-    constexpr size_t index()
-    {
-      return value & (TAG - 1);
-    }
-
     constexpr smallsizeclass_t as_small()
     {
       SNMALLOC_ASSERT(is_small());
-      return smallsizeclass_t(value & (TAG - 1));
+      return smallsizeclass_t(value - 1);
     }
 
     constexpr chunksizeclass_t as_large()
     {
-      SNMALLOC_ASSERT(!is_small());
-      return bits::BITS - (value & (TAG - 1));
+      SNMALLOC_ASSERT(!is_small() && !is_default());
+      return value - 1 - NUM_SMALL_SIZECLASSES;
     }
 
     constexpr size_t raw()
@@ -96,7 +123,8 @@ namespace snmalloc
 
     constexpr bool is_small()
     {
-      return (value & TAG) != 0;
+      // Sentinel (value == 0) wraps to SIZE_MAX here, so it is never small.
+      return (value - 1) < NUM_SMALL_SIZECLASSES;
     }
 
     constexpr bool is_default()
@@ -108,14 +136,77 @@ namespace snmalloc
     {
       return value == other.value;
     }
+
+    constexpr bool operator!=(sizeclass_t other)
+    {
+      return value != other.value;
+    }
+  };
+
+  /**
+   * (sizeclass, per-chunk slab offset) packed into the low `COMBINED_BITS`
+   * of a pagemap entry's `remote_and_sizeclass`. Non-zero offsets occur
+   * only for interior chunks of non-pow2 large allocations; the offset
+   * lets `start_of_object` recover the allocation base.
+   *
+   * Distinct from `sizeclass_t` so `is_small()` / `as_small()` /
+   * `as_large()` cannot be called on a value carrying offset bits, and so
+   * the offset can never be synthesised: constructing a value requires
+   * supplying both components explicitly, or going through `from_raw`
+   * with bits read from storage.
+   */
+  class offset_and_sizeclass_t
+  {
+    size_t value{0};
+
+    constexpr offset_and_sizeclass_t(size_t value) : value(value) {}
+
+  public:
+    constexpr offset_and_sizeclass_t() = default;
+
+    constexpr offset_and_sizeclass_t(sizeclass_t sc, size_t offset)
+    : value(sc.raw() | (offset << SIZECLASS_BITS))
+    {
+      SNMALLOC_ASSERT(offset < (size_t{1} << OFFSET_BITS));
+    }
+
+    static constexpr offset_and_sizeclass_t from_raw(size_t raw)
+    {
+      return {raw};
+    }
+
+    constexpr size_t raw() const
+    {
+      return value;
+    }
+
+    constexpr sizeclass_t sizeclass() const
+    {
+      return sizeclass_t::from_raw(value & (SIZECLASS_REP_SIZE - 1));
+    }
+
+    constexpr size_t offset() const
+    {
+      return (value >> SIZECLASS_BITS) & ((size_t{1} << OFFSET_BITS) - 1);
+    }
+
+    constexpr bool operator==(offset_and_sizeclass_t other) const
+    {
+      return value == other.value;
+    }
   };
 
   using sizeclass_compress_t = uint8_t;
 
   /**
-   * This structure contains the fields required for fast paths for sizeclasses.
+   * Per-`offset_and_sizeclass_t` metadata for `start_of_object` —
+   * recovering the allocation base from an interior pointer.
+   *
+   * Sized to a power of two (4 × `size_t`; 32 bytes on 64-bit targets)
+   * so the table stride collapses to a single shift in the
+   * `__malloc_start_pointer` hot path.
    */
-  struct sizeclass_data_fast
+  struct sizeclass_data_start
   {
     size_t size;
     // We store the mask as it is used more on the fast path, and the size of
@@ -123,68 +214,128 @@ namespace snmalloc
     size_t slab_mask;
     // Table of constants for reciprocal division for each sizeclass.
     size_t div_mult;
-    // Table of constants for reciprocal modulus for each sizeclass.
+    // `offset * slab_size`, precomputed. Zero for `offset == 0` rows.
+    size_t offset_bytes;
+  };
+
+  static_assert(
+    sizeof(sizeclass_data_start) == 4 * sizeof(size_t),
+    "sizeclass_data_start must be a power-of-two stride for single-shift "
+    "indexing in start_of_object");
+
+  /**
+   * Per-`sizeclass_t` metadata for `is_start_of_object` — the
+   * Lemire-style alignment check used by check-build dealloc and
+   * debug asserts.
+   *
+   * `slab_mask` is duplicated here (also held in `sizeclass_data_start`)
+   * so the alignment check loads from a single row instead of straddling
+   * two tables.
+   */
+  struct sizeclass_data_align
+  {
+    size_t slab_mask;
     size_t mod_zero_mult;
   };
 
   /**
-   * This structure contains the remaining fields required for slow paths for
-   * sizeclasses.
+   * Per-`sizeclass_t` thresholds used when initialising a slab —
+   * cold-path data consumed at slab allocation/refill time.
    */
-  struct sizeclass_data_slow
+  struct sizeclass_data_slab
   {
     uint16_t capacity;
     uint16_t waking;
   };
 
-  static_assert(sizeof(sizeclass_data_slow::capacity) * 8 > MAX_CAPACITY_BITS);
+  static_assert(sizeof(sizeclass_data_slab::capacity) * 8 > MAX_CAPACITY_BITS);
 
   struct SizeClassTable
   {
-    ModArray<SIZECLASS_REP_SIZE, sizeclass_data_fast> fast_{};
-    ModArray<SIZECLASS_REP_SIZE, sizeclass_data_slow> slow_{};
+    // `start_` is indexed by an `offset_and_sizeclass_t` (Word::Two of
+    // the pagemap entry & COMBINED_MASK). The first SIZECLASS_REP_SIZE
+    // rows have offset == 0; subsequent rows carry the offset_bytes
+    // needed for `start_of_object` on non-pow2 large interior chunks.
+    ModArray<COMBINED_REP_SIZE, sizeclass_data_start> start_{};
+    ModArray<SIZECLASS_REP_SIZE, sizeclass_data_align> align_{};
+    ModArray<SIZECLASS_REP_SIZE, sizeclass_data_slab> slab_{};
 
     size_t DIV_MULT_SHIFT{0};
 
-    [[nodiscard]] constexpr sizeclass_data_fast& fast(sizeclass_t index)
+    [[nodiscard]] constexpr sizeclass_data_start& start(sizeclass_t index)
+    {
+      return start_[index.raw()];
+    }
+
+    [[nodiscard]] constexpr sizeclass_data_start start(sizeclass_t index) const
+    {
+      return start_[index.raw()];
+    }
+
+    [[nodiscard]] constexpr sizeclass_data_start&
+    start(offset_and_sizeclass_t osc)
     {
-      return fast_[index.raw()];
+      return start_[osc.raw()];
     }
 
-    [[nodiscard]] constexpr sizeclass_data_fast fast(sizeclass_t index) const
+    [[nodiscard]] constexpr sizeclass_data_start
+    start(offset_and_sizeclass_t osc) const
     {
-      return fast_[index.raw()];
+      return start_[osc.raw()];
     }
 
-    [[nodiscard]] constexpr sizeclass_data_fast& fast_small(smallsizeclass_t sc)
+    [[nodiscard]] constexpr sizeclass_data_start&
+    start_small(smallsizeclass_t sc)
     {
-      return fast_[sizeclass_t::from_small_class(sc).raw()];
+      return start_[sizeclass_t::from_small_class(sc).raw()];
     }
 
-    [[nodiscard]] constexpr sizeclass_data_fast
-    fast_small(smallsizeclass_t sc) const
+    [[nodiscard]] constexpr sizeclass_data_start
+    start_small(smallsizeclass_t sc) const
     {
-      return fast_[sizeclass_t::from_small_class(sc).raw()];
+      return start_[sizeclass_t::from_small_class(sc).raw()];
     }
 
-    [[nodiscard]] constexpr sizeclass_data_slow& slow(sizeclass_t index)
+    [[nodiscard]] constexpr sizeclass_data_align& align(sizeclass_t index)
     {
-      return slow_[index.raw()];
+      return align_[index.raw()];
     }
 
-    [[nodiscard]] constexpr sizeclass_data_slow slow(sizeclass_t index) const
+    [[nodiscard]] constexpr sizeclass_data_align align(sizeclass_t index) const
     {
-      return slow_[index.raw()];
+      return align_[index.raw()];
+    }
+
+    [[nodiscard]] constexpr sizeclass_data_slab& slab(sizeclass_t index)
+    {
+      return slab_[index.raw()];
+    }
+
+    [[nodiscard]] constexpr sizeclass_data_slab slab(sizeclass_t index) const
+    {
+      return slab_[index.raw()];
     }
 
     constexpr SizeClassTable()
     {
+      // Sentinel slot (sizeclass_t{} / raw 0) covers any address whose
+      // pagemap entry is unmapped or owned by the backend — including
+      // foreign (non-snmalloc) heap addresses reached via the
+      // bounds-checked memcpy shim before snmalloc has seen them.
+      // `slab_mask = ~size_t(0)` makes `start_of_object` collapse
+      // `addr & ~slab_mask` to 0 and `index_in_object` to `addr`, so
+      // `remaining_bytes = sentinel.size - addr` underflows to a very
+      // large value and any memcpy bound check trivially passes the
+      // sentinel through to the destination's native checks.
+      start_[0].slab_mask = ~size_t(0);
+
       size_t max_capacity = 0;
 
       for (smallsizeclass_t sizeclass(0); sizeclass < NUM_SMALL_SIZECLASSES;
            sizeclass++)
       {
-        auto& meta = fast_small(sizeclass);
+        auto& meta = start_small(sizeclass);
+        auto sc = sizeclass_t::from_small_class(sizeclass);
 
         size_t rsize =
           bits::from_exp_mant<INTERMEDIATE_BITS, MIN_ALLOC_STEP_BITS>(
@@ -194,18 +345,19 @@ namespace snmalloc
           bits::next_pow2_bits_const(MIN_OBJECT_COUNT * rsize), MIN_CHUNK_BITS);
 
         meta.slab_mask = bits::mask_bits(slab_bits);
+        align(sc).slab_mask = meta.slab_mask;
 
-        auto& meta_slow = slow(sizeclass_t::from_small_class(sizeclass));
-        meta_slow.capacity =
+        auto& meta_slab = slab(sc);
+        meta_slab.capacity =
           static_cast<uint16_t>((meta.slab_mask + 1) / rsize);
 
-        meta_slow.waking = mitigations(random_larger_thresholds) ?
-          static_cast<uint16_t>(meta_slow.capacity / 4) :
-          static_cast<uint16_t>(bits::min((meta_slow.capacity / 4), 32));
+        meta_slab.waking = mitigations(random_larger_thresholds) ?
+          static_cast<uint16_t>(meta_slab.capacity / 4) :
+          static_cast<uint16_t>(bits::min((meta_slab.capacity / 4), 32));
 
-        if (meta_slow.capacity > max_capacity)
+        if (meta_slab.capacity > max_capacity)
         {
-          max_capacity = meta_slow.capacity;
+          max_capacity = meta_slab.capacity;
         }
       }
 
@@ -216,54 +368,116 @@ namespace snmalloc
            sizeclass++)
       {
         // Calculate reciprocal division constant.
-        auto& meta = fast_small(sizeclass);
+        auto& meta = start_small(sizeclass);
         meta.div_mult = (bits::mask_bits(DIV_MULT_SHIFT) / meta.size) + 1;
 
         size_t zero = 0;
-        meta.mod_zero_mult = (~zero / meta.size) + 1;
+        align(sizeclass_t::from_small_class(sizeclass)).mod_zero_mult =
+          (~zero / meta.size) + 1;
       }
 
-      for (size_t sizeclass = 0; sizeclass < bits::BITS; sizeclass++)
+      for (size_t lc = 0; lc < NUM_LARGE_CLASSES; lc++)
       {
-        auto lsc = sizeclass_t::from_large_class(sizeclass);
-        auto& meta = fast(lsc);
-        meta.size = sizeclass == 0 ? 0 : bits::one_at_bit(lsc.as_large());
-        meta.slab_mask = meta.size - 1;
-        // The slab_mask will do all the necessary work, so
-        // perform identity multiplication for the test.
-        meta.mod_zero_mult = 1;
-        // The slab_mask will do all the necessary work for division
-        // so collapse the calculated offset.
+        auto lsc = sizeclass_t::from_large_class(lc);
+        auto& meta = start(lsc);
+        size_t size =
+          bits::from_exp_mant<INTERMEDIATE_BITS, MIN_ALLOC_STEP_BITS>(
+            NUM_SMALL_SIZECLASSES + lc);
+        meta.size = size;
+        // `slab_mask = (natural alignment of size) - 1`; for pow2 sizes
+        // this equals size - 1, for non-pow2 mantissa steps it is the
+        // slab granularity at which the allocation tiles.
+        size_t align_bytes = size & (~size + 1);
+        meta.slab_mask = align_bytes - 1;
+        align(lsc).slab_mask = meta.slab_mask;
+        // slab_mask handles the math; identity values neutralise the
+        // mod/div reciprocals.
+        align(lsc).mod_zero_mult = 1;
         meta.div_mult = 0;
       }
+
+      // Populate offset > 0 rows: same as the (sc, 0) row but with
+      // `offset_bytes = offset * slab_size` so that `start_of_object`
+      // collapses to `(addr & ~slab_mask) - offset_bytes`. Read when
+      // the backend writes per-chunk offsets for multi-slab-tile
+      // reservations.
+      for (size_t sc_raw = 0; sc_raw < SIZECLASS_REP_SIZE; sc_raw++)
+      {
+        const auto& base = start_[sc_raw];
+        const size_t slab_size = base.slab_mask + 1;
+        for (size_t offset = 1; offset < (size_t{1} << OFFSET_BITS); offset++)
+        {
+          auto& row = start_[sc_raw | (offset << SIZECLASS_BITS)];
+          row.size = base.size;
+          row.slab_mask = base.slab_mask;
+          row.div_mult = base.div_mult;
+          row.offset_bytes = offset * slab_size;
+        }
+      }
     }
   };
 
   constexpr SizeClassTable sizeclass_metadata = SizeClassTable();
 
+  // The sentinel's `size` must remain zero so fast-path lookups via
+  // `start(sc)` return zero size without a branch. Its `slab_mask` is
+  // `~size_t(0)` so foreign-pointer `remaining_bytes` underflows to a
+  // huge value (see `SizeClassTable::SizeClassTable`).
+  static_assert(
+    sizeclass_metadata.start(sizeclass_t{}).size == 0,
+    "sentinel slot must have size 0");
+  static_assert(
+    sizeclass_metadata.start(sizeclass_t{}).slab_mask == ~size_t(0),
+    "sentinel slot must have slab_mask ~0 for foreign-pointer "
+    "remaining_bytes underflow");
+
   static_assert(
     bits::BITS - sizeclass_metadata.DIV_MULT_SHIFT <= MAX_CAPACITY_BITS);
 
+  // Largest slab index for any large class: `OFFSET_BITS` must cover
+  // it. Each large allocation reserves exactly `meta.size` bytes (a
+  // positive multiple of `slab_size`), so the largest `slab_index`
+  // the pagemap loop in `Backend::alloc_chunk` writes is
+  // `meta.size / slab_size - 1`.
+  constexpr size_t compute_max_large_slab_index()
+  {
+    size_t max_idx = 0;
+    for (size_t lc = 0; lc < NUM_LARGE_CLASSES; lc++)
+    {
+      const auto& meta =
+        sizeclass_metadata.start(sizeclass_t::from_large_class(lc));
+      const size_t slab_size = meta.slab_mask + 1;
+      const size_t idx = (meta.size / slab_size) - 1;
+      if (idx > max_idx)
+        max_idx = idx;
+    }
+    return max_idx;
+  }
+
+  static_assert(
+    compute_max_large_slab_index() < (size_t{1} << OFFSET_BITS),
+    "OFFSET_BITS must cover the worst-case slab index for any large class");
+
   constexpr size_t DIV_MULT_SHIFT = sizeclass_metadata.DIV_MULT_SHIFT;
 
   constexpr size_t sizeclass_to_size(smallsizeclass_t sizeclass)
   {
-    return sizeclass_metadata.fast_small(sizeclass).size;
+    return sizeclass_metadata.start_small(sizeclass).size;
   }
 
   constexpr size_t sizeclass_full_to_size(sizeclass_t sizeclass)
   {
-    return sizeclass_metadata.fast(sizeclass).size;
+    return sizeclass_metadata.start(sizeclass).size;
   }
 
   constexpr size_t sizeclass_full_to_slab_size(sizeclass_t sizeclass)
   {
-    return sizeclass_metadata.fast(sizeclass).slab_mask + 1;
+    return sizeclass_metadata.start(sizeclass).slab_mask + 1;
   }
 
   constexpr size_t sizeclass_to_slab_size(smallsizeclass_t sizeclass)
   {
-    return sizeclass_metadata.fast_small(sizeclass).slab_mask + 1;
+    return sizeclass_metadata.start_small(sizeclass).slab_mask + 1;
   }
 
   /**
@@ -275,7 +489,7 @@ namespace snmalloc
    */
   constexpr uint16_t threshold_for_waking_slab(smallsizeclass_t sizeclass)
   {
-    return sizeclass_metadata.slow(sizeclass_t::from_small_class(sizeclass))
+    return sizeclass_metadata.slab(sizeclass_t::from_small_class(sizeclass))
       .waking;
   }
 
@@ -291,25 +505,16 @@ namespace snmalloc
     return bits::one_at_bit(MIN_CHUNK_BITS + sizeclass);
   }
 
-  /**
-   * For large allocations, the metaentry stores the raw log_2 of the size,
-   * which must be shifted into the index space of slab_sizeclass-es.
-   */
-  constexpr size_t
-  metaentry_chunk_sizeclass_to_slab_sizeclass(chunksizeclass_t sizeclass)
-  {
-    return sizeclass - MIN_CHUNK_BITS;
-  }
-
   constexpr uint16_t sizeclass_to_slab_object_count(smallsizeclass_t sizeclass)
   {
-    return sizeclass_metadata.slow(sizeclass_t::from_small_class(sizeclass))
+    return sizeclass_metadata.slab(sizeclass_t::from_small_class(sizeclass))
       .capacity;
   }
 
-  SNMALLOC_FAST_PATH constexpr size_t slab_index(sizeclass_t sc, address_t addr)
+  SNMALLOC_FAST_PATH constexpr size_t
+  slab_index(offset_and_sizeclass_t osc, address_t addr)
   {
-    auto meta = sizeclass_metadata.fast(sc);
+    auto meta = sizeclass_metadata.start(osc);
     size_t offset = addr & meta.slab_mask;
     if constexpr (sizeof(offset) >= 8)
     {
@@ -334,29 +539,54 @@ namespace snmalloc
     }
   }
 
+  /**
+   * Recover the start address of the allocation containing `addr`.
+   *
+   * Branch on `osc.offset() == 0` (testable from bits already loaded
+   * into `osc.raw()`, before any metadata-table access). The common
+   * case skips the `offset_bytes` field load and four extra arithmetic
+   * insns; the slow arm handles non-pow2 large interior chunks where
+   * the slab base must be shifted back to the allocation base.
+   */
   SNMALLOC_FAST_PATH constexpr address_t
-  start_of_object(sizeclass_t sc, address_t addr)
+  start_of_object(offset_and_sizeclass_t osc, address_t addr)
   {
-    auto meta = sizeclass_metadata.fast(sc);
-    address_t slab_start = addr & ~meta.slab_mask;
-    size_t index = slab_index(sc, addr);
-    return slab_start + (index * meta.size);
+    auto meta = sizeclass_metadata.start(osc);
+    if (SNMALLOC_LIKELY(osc.offset() == 0))
+    {
+      address_t slab_base = addr & ~meta.slab_mask;
+      size_t index = slab_index(osc, addr);
+      return slab_base + (index * meta.size);
+    }
+    address_t alloc_start = (addr & ~meta.slab_mask) - meta.offset_bytes;
+    size_t index = slab_index(osc, addr - alloc_start);
+    return alloc_start + (index * meta.size);
   }
 
-  constexpr size_t index_in_object(sizeclass_t sc, address_t addr)
+  SNMALLOC_FAST_PATH constexpr size_t
+  index_in_object(offset_and_sizeclass_t osc, address_t addr)
   {
-    return addr - start_of_object(sc, addr);
+    return addr - start_of_object(osc, addr);
   }
 
-  constexpr size_t remaining_bytes(sizeclass_t sc, address_t addr)
+  SNMALLOC_FAST_PATH constexpr size_t
+  remaining_bytes(offset_and_sizeclass_t osc, address_t addr)
   {
-    return sizeclass_metadata.fast(sc).size - index_in_object(sc, addr);
+    return sizeclass_metadata.start(osc).size - index_in_object(osc, addr);
   }
 
+  /**
+   * True iff `addr` is correctly aligned for an object of this
+   * sizeclass within its slab. Does NOT check whether `addr` lies in
+   * the first slab tile of a non-pow2 large allocation; callers that
+   * could be looking at an interior chunk must read the
+   * `offset_and_sizeclass_t` from the pagemap and use that overload
+   * instead.
+   */
   constexpr bool is_start_of_object(sizeclass_t sc, address_t addr)
   {
-    size_t offset = addr & (sizeclass_full_to_slab_size(sc) - 1);
-
+    auto meta = sizeclass_metadata.align(sc);
+    size_t offset = addr & meta.slab_mask;
     // Only works up to certain offsets, exhaustively tested by rounding.cc
     if constexpr (sizeof(offset) >= 8)
     {
@@ -364,8 +594,7 @@ namespace snmalloc
       // 32bit.
       // This is based on:
       //  https://lemire.me/blog/2019/02/20/more-fun-with-fast-remainders-when-the-divisor-is-a-constant/
-      auto mod_zero_mult = sizeclass_metadata.fast(sc).mod_zero_mult;
-      return (offset * mod_zero_mult) < mod_zero_mult;
+      return (offset * meta.mod_zero_mult) < meta.mod_zero_mult;
     }
     else
       // Use 32-bit division as considerably faster than 64-bit, and
@@ -373,14 +602,17 @@ namespace snmalloc
       return static_cast<uint32_t>(offset % sizeclass_full_to_size(sc)) == 0;
   }
 
-  inline static size_t large_size_to_chunk_size(size_t size)
-  {
-    return bits::next_pow2(size);
-  }
-
-  inline static size_t large_size_to_chunk_sizeclass(size_t size)
+  /**
+   * True iff `addr` is the start of an object. Interior chunks of
+   * non-pow2 large allocations carry `offset_bytes != 0`; only the
+   * first slab tile holds an allocation base, so a non-zero
+   * `offset_bytes` short-circuits to false.
+   */
+  constexpr bool is_start_of_object(offset_and_sizeclass_t osc, address_t addr)
   {
-    return bits::next_pow2_bits(size) - MIN_CHUNK_BITS;
+    if (sizeclass_metadata.start(osc).offset_bytes != 0)
+      return false;
+    return is_start_of_object(osc.sizeclass(), addr);
   }
 
   constexpr SNMALLOC_PURE size_t sizeclass_lookup_index(const size_t s)
@@ -416,7 +648,7 @@ namespace snmalloc
       for (; sizeclass < minimum_class; sizeclass++)
       {
         for (; curr <=
-             sizeclass_metadata.fast_small(smallsizeclass_t(sizeclass)).size;
+             sizeclass_metadata.start_small(smallsizeclass_t(sizeclass)).size;
              curr += MIN_ALLOC_STEP_SIZE)
         {
           table[sizeclass_lookup_index(curr)] = minimum_class;
@@ -426,7 +658,7 @@ namespace snmalloc
       for (; sizeclass < NUM_SMALL_SIZECLASSES; sizeclass++)
       {
         for (; curr <=
-             sizeclass_metadata.fast_small(smallsizeclass_t(sizeclass)).size;
+             sizeclass_metadata.start_small(smallsizeclass_t(sizeclass)).size;
              curr += MIN_ALLOC_STEP_SIZE)
         {
           auto i = sizeclass_lookup_index(curr);
@@ -456,13 +688,13 @@ namespace snmalloc
   }
 
   /**
-   * A compressed size representation,
-   *   either a small size class with the 7th bit set
-   *   or a large class with the 7th bit not set.
-   * Large classes are stored as a mask shift.
-   *    size = (~0 >> lc) + 1;
-   * Thus large size class 0, has size 0.
-   * And large size class 33, has size 2^31
+   * Map a requested size to its sizeclass.
+   *
+   * Small requests use the dense lookup table. Large requests are
+   * encoded with `to_exp_mant<INTERMEDIATE_BITS, MIN_ALLOC_STEP_BITS>`,
+   * whose ceiling semantics (`value = value - 1; ...`) select the smallest
+   * sizeclass whose size is `>= size`. The raw `size` is passed in
+   * directly — the encoding does the rounding.
    */
   static inline sizeclass_t size_to_sizeclass_full(size_t size)
   {
@@ -470,9 +702,11 @@ namespace snmalloc
     {
       return sizeclass_t::from_small_class(size_to_sizeclass(size));
     }
-    // bits::clz is undefined on 0, but we have size == 1 has already been
-    // handled here.  We conflate 0 and sizes larger than we can allocate.
-    return sizeclass_t::from_large_class(bits::clz(size - 1));
+    SNMALLOC_ASSERT(size != 0);
+    SNMALLOC_ASSERT(size <= MAX_LARGE_SIZECLASS_SIZE);
+    size_t global =
+      bits::to_exp_mant<INTERMEDIATE_BITS, MIN_ALLOC_STEP_BITS>(size);
+    return sizeclass_t::from_large_class(global - NUM_SMALL_SIZECLASSES);
   }
 
   inline SNMALLOC_FAST_PATH static size_t round_size(size_t size)
@@ -492,13 +726,20 @@ namespace snmalloc
       return sizeclass_to_size(size_to_sizeclass(1));
     }
 
-    if (size > bits::one_at_bit(bits::BITS - 1))
+    if (size > MAX_LARGE_SIZECLASS_SIZE)
     {
       // This size is too large, no rounding should occur as will result in a
       // failed allocation later.
       return size;
     }
-    return bits::next_pow2(size);
+    // Large branch: round to the smallest enclosing exp+mantissa
+    // sizeclass. Must agree with `round_size`'s small-class branch in
+    // semantics: every request rounds to the smallest enclosing
+    // class. `DefaultConts::success` (corealloc.h) uses `round_size`
+    // to compute the `calloc` zeroing range, so any drift between
+    // the actual reservation and `round_size` would over- or
+    // under-zero.
+    return sizeclass_full_to_size(size_to_sizeclass_full(size));
   }
 
   /// Returns the alignment that this size naturally has, that is
@@ -510,41 +751,4 @@ namespace snmalloc
       return 1;
     return bits::one_at_bit(bits::ctz(rsize));
   }
-
-  constexpr SNMALLOC_FAST_PATH static size_t
-  aligned_size(size_t alignment, size_t size)
-  {
-    // Client responsible for checking alignment is not zero
-    SNMALLOC_ASSERT(alignment != 0);
-    // Client responsible for checking alignment is a power of two
-    SNMALLOC_ASSERT(bits::is_pow2(alignment));
-
-    // There are a class of corner cases to consider
-    //    alignment = 0x8
-    //    size = 0xfff...fff7
-    // for this result will be 0.  This should fail an allocation, so we need to
-    // check for this overflow.
-    // However,
-    //    alignment = 0x8
-    //    size      = 0x0
-    // will also result in 0, but this should be allowed to allocate.
-    // So we need to check for overflow, and return SIZE_MAX in this first case,
-    // and 0 in the second.
-    size_t result = ((alignment - 1) | (size - 1)) + 1;
-    // The following code is designed to fuse well with a subsequent
-    // sizeclass calculation.  We use the same fast path constant to
-    // move the case where result==0 to the slow path, and then check for which
-    // case we are in.
-    if (is_small_sizeclass(result))
-      return result;
-
-    // We are in the slow path, so we need to check for overflow.
-    if (SNMALLOC_UNLIKELY(result == 0))
-    {
-      // Check for overflow and return the maximum size.
-      if (SNMALLOC_UNLIKELY(result < size))
-        return SIZE_MAX;
-    }
-    return result;
-  }
 } // namespace snmalloc
diff --git a/src/snmalloc/ds_core/bits.h b/src/snmalloc/ds_core/bits.h
index 3391e70f7..57a5a0e73 100644
--- a/src/snmalloc/ds_core/bits.h
+++ b/src/snmalloc/ds_core/bits.h
@@ -288,6 +288,21 @@ namespace snmalloc
       return BITS - clz_const(x - 1);
     }
 
+    /**
+     * Returns `floor(log2(x))`, i.e. the bit index of the highest set bit
+     * of `x`. Correct for `x >= 1`; calling with `x == 0` is UB (it would
+     * call `clz(0)`, whose precondition is `x != 0`).
+     */
+    inline SNMALLOC_FAST_PATH size_t prev_pow2_bits(size_t x)
+    {
+      return BITS - 1 - clz(x);
+    }
+
+    constexpr size_t prev_pow2_bits_const(size_t x)
+    {
+      return BITS - 1 - clz_const(x);
+    }
+
     constexpr SNMALLOC_FAST_PATH size_t
     align_down(size_t value, size_t alignment)
     {
@@ -352,6 +367,35 @@ namespace snmalloc
       return (e << MANTISSA_BITS) + m;
     }
 
+    /**
+     * Runtime counterpart of `to_exp_mant_const`. Identical semantics, but
+     * uses the `clz` intrinsic instead of the 64-iteration `clz_const`
+     * loop, which makes it suitable for the allocation fast path.
+     *
+     * Requires `MANTISSA_BITS + LOW_BITS > 0` so that `value | LEADING_BIT`
+     * is never zero, satisfying `clz`'s precondition.
+     */
+    template<size_t MANTISSA_BITS, size_t LOW_BITS = 0>
+    inline SNMALLOC_FAST_PATH size_t to_exp_mant(size_t value)
+    {
+      static_assert(
+        MANTISSA_BITS + LOW_BITS > 0,
+        "to_exp_mant requires MANTISSA_BITS + LOW_BITS > 0 so that "
+        "value | LEADING_BIT is non-zero (clz precondition)");
+
+      constexpr size_t LEADING_BIT = one_at_bit(MANTISSA_BITS + LOW_BITS) >> 1;
+      constexpr size_t MANTISSA_MASK = mask_bits(MANTISSA_BITS);
+
+      value = value - 1;
+
+      size_t e =
+        bits::BITS - MANTISSA_BITS - LOW_BITS - clz(value | LEADING_BIT);
+      size_t b = (e == 0) ? 0 : 1;
+      size_t m = (value >> (LOW_BITS + e - b)) & MANTISSA_MASK;
+
+      return (e << MANTISSA_BITS) + m;
+    }
+
     template<size_t MANTISSA_BITS, size_t LOW_BITS = 0>
     constexpr size_t from_exp_mant(size_t m_e)
     {
diff --git a/src/snmalloc/ds_core/redblacktree.h b/src/snmalloc/ds_core/redblacktree.h
index e6ce73c24..3fda3b0c9 100644
--- a/src/snmalloc/ds_core/redblacktree.h
+++ b/src/snmalloc/ds_core/redblacktree.h
@@ -1,6 +1,7 @@
 #pragma once
 
 #include "snmalloc/stl/array.h"
+#include "snmalloc/stl/utility.h"
 
 #include <stddef.h>
 #include <stdint.h>
@@ -456,7 +457,7 @@ namespace snmalloc
           // insufficient to accurately display the tree, but it will still be
           // memory safe as the search code is bounded by the string size.
           static constexpr size_t max_depth = 128;
-          char s_indent[max_depth];
+          char s_indent[max_depth] = {};
           size_t end = 0;
           for (; end < max_depth - 1; end++)
           {
@@ -787,9 +788,76 @@ namespace snmalloc
       return true;
     }
 
+    /**
+     * Return the strict neighbours of `value` in the tree:
+     * `(largest key < value, smallest key > value)`. Either component is
+     * `Rep::null` when no such neighbour exists.
+     *
+     * **Precondition**: `value` is not present in the tree. A single
+     * root-to-leaf descent then records both neighbours: every left
+     * turn (parent key > value) updates the successor candidate to the
+     * parent's key, every right turn updates the predecessor candidate.
+     * `SNMALLOC_CHECK` aborts in any build if a non-null `value` is
+     * encountered on the descent: a duplicate key would make
+     * `neighbours` return an arbitrary neighbour pair that the
+     * caller would consume as valid, corrupting dependent state. The
+     * check uses only one post-descent comparison because a duplicate
+     * key is always recorded into `pred` on the right-going branch
+     * (`compare(k, value)` is false when `k == value`). `Rep::null`
+     * can never be present in the tree, so probing with it is benign
+     * and exempt from the check.
+     */
+    stl::Pair<K, K> neighbours(K value)
+    {
+      K pred = Rep::null;
+      K succ = Rep::null;
+
+      ChildRef cur = get_root();
+      while (!cur.is_null())
+      {
+        K k = cur;
+        if (Rep::compare(k, value))
+        {
+          // k > value: go left; k is the tightest successor seen so far.
+          succ = k;
+          cur = get_dir(true, k);
+        }
+        else
+        {
+          pred = k;
+          cur = get_dir(false, k);
+        }
+      }
+
+      SNMALLOC_CHECK(Rep::equal(pred, Rep::null) || !Rep::equal(pred, value));
+
+      return {pred, succ};
+    }
+
     RBPath get_root_path()
     {
       return RBPath(H{&root});
     }
+
+    /**
+     * Call `fn(key)` for every key in ascending order.
+     */
+    template<typename Fn>
+    void for_each(Fn&& fn)
+    {
+      for_each_impl(get_root(), fn);
+    }
+
+  private:
+    template<typename Fn>
+    static void for_each_impl(ChildRef node, Fn& fn)
+    {
+      if (node.is_null())
+        return;
+      K k = node;
+      for_each_impl(get_dir(true, k), fn);
+      fn(k);
+      for_each_impl(get_dir(false, k), fn);
+    }
   };
 } // namespace snmalloc
diff --git a/src/snmalloc/ds_core/sizeclassstatic.h b/src/snmalloc/ds_core/sizeclassstatic.h
index 011f69830..cf66851bc 100644
--- a/src/snmalloc/ds_core/sizeclassstatic.h
+++ b/src/snmalloc/ds_core/sizeclassstatic.h
@@ -74,4 +74,51 @@ namespace snmalloc
     return (size - 1) <
       sizeclass_to_size_const(smallsizeclass_t(NUM_SMALL_SIZECLASSES - 1));
   }
+
+  /**
+   * @brief Round `size` up so the resulting allocation can satisfy
+   * the requested `alignment` (which must be a non-zero power of two).
+   * Returns 0 for `size == 0` and SIZE_MAX if the rounding overflows.
+   *
+   * Lives in sizeclassstatic.h (not sizeclasstable.h) so it is
+   * available to compile-time-only consumers — notably the test
+   * library header — without pulling in the full runtime sizeclass
+   * machinery.
+   */
+  constexpr SNMALLOC_FAST_PATH size_t
+  aligned_size(size_t alignment, size_t size)
+  {
+    // Client responsible for checking alignment is not zero
+    SNMALLOC_ASSERT(alignment != 0);
+    // Client responsible for checking alignment is a power of two
+    SNMALLOC_ASSERT(bits::is_pow2(alignment));
+
+    // There is a class of corner cases to consider, for example
+    //    alignment = 0x8
+    //    size = 0xfff...fff9
+    // for which result wraps to 0.  This should fail an allocation, so we
+    // need to check for this overflow.
+    // However,
+    //    alignment = 0x8
+    //    size      = 0x0
+    // will also result in 0, but this should be allowed to allocate.
+    // So we need to check for overflow, and return SIZE_MAX in this first case,
+    // and 0 in the second.
+    size_t result = ((alignment - 1) | (size - 1)) + 1;
+    // The following code is designed to fuse well with a subsequent
+    // sizeclass calculation.  We use the same fast path constant to
+    // move the case where result==0 to the slow path, and then check for which
+    // case we are in.
+    if (is_small_sizeclass(result))
+      return result;
+
+    // We are in the slow path, so we need to check for overflow.
+    if (SNMALLOC_UNLIKELY(result == 0))
+    {
+      // result is 0: this is overflow iff size was non-zero; report SIZE_MAX.
+      if (SNMALLOC_UNLIKELY(result < size))
+        return SIZE_MAX;
+    }
+    return result;
+  }
 } // namespace snmalloc
diff --git a/src/snmalloc/global/globalalloc.h b/src/snmalloc/global/globalalloc.h
index 7607e582a..1d7f05a18 100644
--- a/src/snmalloc/global/globalalloc.h
+++ b/src/snmalloc/global/globalalloc.h
@@ -138,9 +138,7 @@ namespace snmalloc
   size_t SNMALLOC_FAST_PATH_INLINE remaining_bytes(address_t p)
   {
     const auto& entry = Config_::Backend::template get_metaentry<true>(p);
-
-    auto sizeclass = entry.get_sizeclass();
-    return snmalloc::remaining_bytes(sizeclass, p);
+    return snmalloc::remaining_bytes(entry.get_offset_and_sizeclass(), p);
   }
 
   template<SNMALLOC_CONCEPT(IsConfig) Config_ = Config>
@@ -159,9 +157,7 @@ namespace snmalloc
   static inline size_t index_in_object(address_t p)
   {
     const auto& entry = Config_::Backend::template get_metaentry<true>(p);
-
-    auto sizeclass = entry.get_sizeclass();
-    return snmalloc::index_in_object(sizeclass, p);
+    return snmalloc::index_in_object(entry.get_offset_and_sizeclass(), p);
   }
 
   enum Boundary
@@ -230,7 +226,8 @@ namespace snmalloc
   {
     const auto& entry = Config_::Backend::get_metaentry(address_cast(p));
 
-    size_t index = slab_index(entry.get_sizeclass(), address_cast(p));
+    size_t index =
+      slab_index(entry.get_offset_and_sizeclass(), address_cast(p));
 
     auto* meta_slab = entry.get_slab_metadata();
 
@@ -259,7 +256,8 @@ namespace snmalloc
     const auto& entry =
       Config_::Backend::template get_metaentry<true>(address_cast(p));
 
-    size_t index = slab_index(entry.get_sizeclass(), address_cast(p));
+    size_t index =
+      slab_index(entry.get_offset_and_sizeclass(), address_cast(p));
 
     auto* meta_slab = entry.get_slab_metadata();
 
@@ -287,6 +285,19 @@ namespace snmalloc
       if (!entry.is_owned())
         return;
       size = size == 0 ? 1 : size;
+      // Any size beyond what the sizeclass encoding can represent is
+      // necessarily a mismatch with the pagemap's recorded sizeclass; report
+      // it directly rather than feeding the unrepresentable size into
+      // `size_to_sizeclass_full`.
+      if (size > MAX_LARGE_SIZECLASS_SIZE)
+      {
+        snmalloc_check_client(
+          mitigations(sanity_checks),
+          p == nullptr,
+          "Dealloc size exceeds encodable range: {}",
+          size);
+        return;
+      }
       auto sc = size_to_sizeclass_full(size);
       auto pm_sc = entry.get_sizeclass();
       auto rsize = sizeclass_full_to_size(sc);
@@ -380,10 +391,17 @@ namespace snmalloc
     ThreadAlloc::get().dealloc<ThreadAlloc::CheckInit>(p);
   }
 
-  template<size_t size>
+  /**
+   * Compile-time sized dealloc. The optional `align` parameter mirrors
+   * the `align` parameter on `alloc<size, Conts, align>` so the
+   * sized-dealloc sanity check sees the size that was actually
+   * reserved (post `aligned_size`), not the raw requested `size`.
+   */
+  template<size_t size, size_t align = 1>
   SNMALLOC_FAST_PATH_INLINE void dealloc(void* p)
   {
-    check_size(p, size);
+    constexpr size_t sz = aligned_size(align, size);
+    check_size(p, sz);
     ThreadAlloc::get().dealloc<ThreadAlloc::CheckInit>(p);
   }
 
diff --git a/src/snmalloc/mem/corealloc.h b/src/snmalloc/mem/corealloc.h
index 127abc76a..942b7f514 100644
--- a/src/snmalloc/mem/corealloc.h
+++ b/src/snmalloc/mem/corealloc.h
@@ -533,7 +533,7 @@ namespace snmalloc
 
       snmalloc_check_client(
         mitigations(sanity_checks),
-        is_start_of_object(entry.get_sizeclass(), address_cast(msg)),
+        is_start_of_object(entry.get_offset_and_sizeclass(), address_cast(msg)),
         "Not deallocating start of an object");
 
       size_t objsize = sizeclass_full_to_size(entry.get_sizeclass());
@@ -700,10 +700,11 @@ namespace snmalloc
         [](Allocator* self, size_t size) SNMALLOC_FAST_PATH_LAMBDA {
           return CheckInit::check_init(
             [self, size]() SNMALLOC_FAST_PATH_LAMBDA {
-              if (size > bits::one_at_bit(bits::BITS - 1))
+              if (size > MAX_LARGE_SIZECLASS_SIZE)
               {
-                // Cannot allocate something that is more that half the size of
-                // the address space
+                // Cannot allocate something the sizeclass encoding cannot
+                // represent (equals `2 ^ ENCODED_ADDRESS_BITS` in
+                // `sizeclasstable.h` — well above any plausible request).
                 return Conts::failure(size);
               }
 
@@ -719,12 +720,13 @@ namespace snmalloc
 
               // Grab slab of correct size
               // Set remote as large allocator remote.
+              const auto sc = size_to_sizeclass_full(size);
+              const size_t chunk_size = sizeclass_full_to_size(sc);
               auto [chunk, meta] = Config::Backend::alloc_chunk(
                 self->get_backend_local_state(),
-                large_size_to_chunk_size(size),
-                PagemapEntry::encode(
-                  self->public_state(), size_to_sizeclass_full(size)),
-                size_to_sizeclass_full(size));
+                chunk_size,
+                PagemapEntry::encode(self->public_state(), sc),
+                sc);
 
 #ifdef SNMALLOC_TRACING
               message<1024>(
@@ -1079,7 +1081,7 @@ namespace snmalloc
 
       snmalloc_check_client(
         mitigations(sanity_checks),
-        is_start_of_object(entry.get_sizeclass(), address_cast(p)),
+        is_start_of_object(entry.get_offset_and_sizeclass(), address_cast(p)),
         "Not deallocating start of an object");
 
       auto cp = p.as_static<freelist::Object::T<>>();
@@ -1117,8 +1119,7 @@ namespace snmalloc
         // XXX: because large objects have unique metadata associated with them,
         // the ring size here is one.  We should probably assert that.
 
-        size_t entry_sizeclass = entry.get_sizeclass().as_large();
-        size_t size = bits::one_at_bit(entry_sizeclass);
+        size_t size = sizeclass_full_to_size(entry.get_sizeclass());
 
 #ifdef SNMALLOC_TRACING
         message<1024>("Large deallocation: {}", size);
diff --git a/src/snmalloc/mem/metadata.h b/src/snmalloc/mem/metadata.h
index e753f125c..cfc13755e 100644
--- a/src/snmalloc/mem/metadata.h
+++ b/src/snmalloc/mem/metadata.h
@@ -9,12 +9,11 @@ namespace snmalloc
   struct RemoteAllocator;
 
   /**
-   * Remotes need to be aligned enough that the bottom bits have enough room for
-   * all the size classes, both large and small. An additional bit is required
-   * to separate backend uses.
+   * RemoteAllocator pointers must have their low `COMBINED_BITS` bits
+   * zero so the (sizeclass, offset) field can be OR-ed in by `encode`.
    */
   static constexpr size_t REMOTE_MIN_ALIGN =
-    bits::max<size_t>(CACHELINE_SIZE, SIZECLASS_REP_SIZE) << 1;
+    bits::max<size_t>(CACHELINE_SIZE, COMBINED_REP_SIZE);
 
   /**
    * Base class for the templated FrontendMetaEntry.  This exists to avoid
@@ -33,16 +32,18 @@ namespace snmalloc
   {
   protected:
     /**
-     * This bit is set in remote_and_sizeclass to discriminate between the case
-     * that it is in use by the frontend (0) or by the backend (1).  For the
-     * former case, see other methods on this and the subclass
-     * `FrontendMetaEntry`; for the latter, see backend/backend.h and
-     * backend/largebuddyrange.h.
-     *
-     * This value is statically checked by the frontend to ensure that its
-     * bit packing does not conflict; see mem/remoteallocator.h
+     * Low bits of `remote_and_sizeclass` holding the sizeclass alone.
+     */
+    static constexpr address_t SIZECLASS_MASK = SIZECLASS_REP_SIZE - 1;
+
+    /**
+     * Low bits of `remote_and_sizeclass` holding the (sizeclass, offset)
+     * pair. Also the markerless ownership discriminator:
+     * `(ras & COMBINED_MASK) == 0` iff the entry is NOT in active
+     * frontend use (frontend entries always have sizeclass != 0; slot 0
+     * is the unmapped sentinel).
      */
-    static constexpr address_t REMOTE_BACKEND_MARKER = 1 << 7;
+    static constexpr address_t COMBINED_MASK = COMBINED_REP_SIZE - 1;
 
     /**
      * Bit used to indicate this should not be considered part of the previous
@@ -56,14 +57,12 @@ namespace snmalloc
     static constexpr address_t META_BOUNDARY_BIT = 1 << 0;
 
     /**
-     * The bit above the sizeclass is always zero unless this is used
-     * by the backend to represent another datastructure such as the buddy
-     * allocator entries.
+     * Alignment used by `get_remote` to mask off the (sizeclass, offset)
+     * bits and recover the `RemoteAllocator*` payload.
      */
     static constexpr size_t REMOTE_WITH_BACKEND_MARKER_ALIGN =
-      MetaEntryBase::REMOTE_BACKEND_MARKER;
-    static_assert(
-      (REMOTE_MIN_ALIGN >> 1) == MetaEntryBase::REMOTE_BACKEND_MARKER);
+      COMBINED_REP_SIZE;
+    static_assert(REMOTE_MIN_ALIGN >= COMBINED_REP_SIZE);
 
     /**
      * In common cases, the pointer to the slab metadata.  See
@@ -95,32 +94,38 @@ namespace snmalloc
     constexpr MetaEntryBase() : MetaEntryBase(0, 0) {}
 
     /**
-     * When a meta entry is in use by the back end, it exposes two words of
-     * state.  The low bits in both are reserved.  Bits in this bitmask must
-     * not be set by the back end in either word.
-     *
-     * During a major release, this constraint may be weakened, allowing the
-     * back end to set more bits.  We don't currently use all of these bits in
-     * both words, but we reserve them all to make access uniform.  If more
-     * bits are required by a back end then we could make this asymmetric.
+     * Per-word frontend-reserved masks. Bits in these masks are owned by
+     * the frontend; the backend must preserve them on writes (enforced
+     * by `BackendStateWordRef::operator=`).
      *
-     * `REMOTE_BACKEND_MARKER` is the highest bit that we reserve, so this is
-     * currently every bit including that bit and all lower bits.
+     * - Word::One reserves `META_BOUNDARY_BIT` so PAL-allocation
+     *   boundaries survive ownership transitions.
+     * - Word::Two reserves `COMBINED_MASK`; the markerless ownership
+     *   discriminator requires these bits to be zero in backend mode,
+     *   and backend writes here are chunk-aligned so the requirement
+     *   is naturally satisfied.
      */
-    static constexpr address_t BACKEND_RESERVED_MASK =
-      (REMOTE_BACKEND_MARKER << 1) - 1;
+    static constexpr address_t BACKEND_RESERVED_MASK_WORD_ONE =
+      META_BOUNDARY_BIT;
+    static constexpr address_t BACKEND_RESERVED_MASK_WORD_TWO = COMBINED_MASK;
 
   public:
     /**
-     * Does the back end currently own this entry?  Note that freshly
-     * allocated entries are owned by the front end until explicitly
-     * claimed by the back end and so this will return `false` if neither
-     * the front nor back end owns this entry.
+     * First bit on Word::One available for backend layouts; the bits
+     * below are frontend-reserved. Backends in `largearenarange.h`
+     * derive `RED_BIT`, `VARIANT_SHIFT`, etc. from this.
+     */
+    static constexpr size_t BACKEND_LAYOUT_FIRST_FREE_BIT = 1;
+
+    /**
+     * True iff this entry is not in active frontend use (backend-claimed
+     * or untouched). Frontend entries always have `sizeclass != 0`
+     * (slot 0 is the unmapped sentinel), so the discriminator
+     * `(ras & COMBINED_MASK) == 0` distinguishes them.
      */
     [[nodiscard]] bool is_backend_owned() const
     {
-      return (REMOTE_BACKEND_MARKER & remote_and_sizeclass) ==
-        REMOTE_BACKEND_MARKER;
+      return (remote_and_sizeclass & COMBINED_MASK) == 0;
     }
 
     /**
@@ -134,14 +139,19 @@ namespace snmalloc
     }
 
     /**
-     * Encode the remote and the sizeclass.
+     * Pack `remote`, `sizeclass`, and the per-chunk slab offset into a
+     * `remote_and_sizeclass` word. `offset` defaults to 0; the backend's
+     * multi-slab-tile write loop in `alloc_chunk` overrides it with the
+     * chunk's slab index so `start_of_object` can recover the
+     * allocation base.
      */
     [[nodiscard]] static SNMALLOC_FAST_PATH uintptr_t
-    encode(RemoteAllocator* remote, sizeclass_t sizeclass)
+    encode(RemoteAllocator* remote, sizeclass_t sizeclass, size_t offset = 0)
     {
       /* remote might be nullptr; cast to uintptr_t before offsetting */
       return pointer_offset(
-        reinterpret_cast<uintptr_t>(remote), sizeclass.raw());
+        reinterpret_cast<uintptr_t>(remote),
+        offset_and_sizeclass_t(sizeclass, offset).raw());
     }
 
     /**
@@ -158,10 +168,15 @@ namespace snmalloc
     /**
      * Explicit assignment operator, copies the data preserving the boundary bit
      * in the target if it is set.
+     *
+     * Load-bearing: the pagemap writes back through this operator (its
+     * `set(p, t)` is `body[p >> SHIFT] = t`), so the boundary bit set
+     * once at OS-range registration survives every subsequent metadata
+     * mutation — including chunk reuse via `dealloc_chunk` — without
+     * any consolidation path having to touch it explicitly.
      */
     MetaEntryBase& operator=(const MetaEntryBase& other)
     {
-      // Don't overwrite the boundary bit with the other's
       meta = (other.meta & ~META_BOUNDARY_BIT) |
         address_cast(meta & META_BOUNDARY_BIT);
       remote_and_sizeclass = other.remote_and_sizeclass;
@@ -193,14 +208,14 @@ namespace snmalloc
     ///@}
 
     /**
-     * Returns the remote.
-     *
-     * If the meta entry is owned by the back end then this returns an
-     * undefined value and will abort in debug builds.
+     * Return the `RemoteAllocator*` payload by masking off the low
+     * `COMBINED_BITS`. Callable in any state: for unowned entries
+     * yields nullptr; for backend-owned entries yields a chunk address
+     * which compares unequal to any allocator's `public_state()`, so
+     * dispatch falls through to the slow path.
      */
     [[nodiscard]] SNMALLOC_FAST_PATH RemoteAllocator* get_remote() const
     {
-      SNMALLOC_ASSERT(!is_backend_owned());
       return reinterpret_cast<RemoteAllocator*>(
         pointer_align_down<REMOTE_WITH_BACKEND_MARKER_ALIGN>(
           get_remote_and_sizeclass()));
@@ -228,19 +243,31 @@ namespace snmalloc
       // TODO: perhaps remove static_cast with resolution of
       // https://github.com/CTSRD-CHERI/llvm-project/issues/588
       return sizeclass_t::from_raw(
-        static_cast<size_t>(get_remote_and_sizeclass()) &
-        (REMOTE_WITH_BACKEND_MARKER_ALIGN - 1));
+        static_cast<size_t>(get_remote_and_sizeclass()) & SIZECLASS_MASK);
+    }
+
+    /**
+     * Return the (sizeclass, slab offset) pair indexing
+     * `sizeclass_metadata.start_`. The selected row carries
+     * `offset_bytes = offset * slab_size` precomputed, so
+     * `start_of_object` recovers the allocation base with a single
+     * subtract.
+     */
+    [[nodiscard]] SNMALLOC_FAST_PATH offset_and_sizeclass_t
+    get_offset_and_sizeclass() const
+    {
+      return offset_and_sizeclass_t::from_raw(
+        static_cast<size_t>(get_remote_and_sizeclass()) & COMBINED_MASK);
     }
 
     /**
-     * Claim the meta entry for use by the back end.  This preserves the
-     * boundary bit, if it is set, but otherwise resets the meta entry to a
-     * pristine state.
+     * Claim the meta entry for the backend: preserves the boundary bit
+     * and zeros `remote_and_sizeclass` so `is_backend_owned()` holds.
      */
     void claim_for_backend()
     {
       meta = is_boundary() ? META_BOUNDARY_BIT : 0;
-      remote_and_sizeclass = REMOTE_BACKEND_MARKER;
+      remote_and_sizeclass = 0;
     }
 
     /**
@@ -261,9 +288,11 @@ namespace snmalloc
       Two
     };
 
-    static constexpr bool is_backend_allowed_value(Word, uintptr_t val)
+    static constexpr bool is_backend_allowed_value(Word w, uintptr_t val)
     {
-      return (val & BACKEND_RESERVED_MASK) == 0;
+      const address_t mask = (w == Word::One) ? BACKEND_RESERVED_MASK_WORD_ONE :
+                                                BACKEND_RESERVED_MASK_WORD_TWO;
+      return (val & mask) == 0;
     }
 
     /**
@@ -280,6 +309,14 @@ namespace snmalloc
        */
       uintptr_t* val;
 
+      /**
+       * The frontend-reserved mask for the word that `val` points at. Bits
+       * in this mask are owned by the frontend: `get()` clears them on
+       * read, and `operator=` preserves them on write (by OR-ing the
+       * current value's masked bits into the new value).
+       */
+      address_t reserved_mask{0};
+
     public:
       /**
        * Uninitialised constructor.
@@ -287,9 +324,24 @@ namespace snmalloc
       BackendStateWordRef() = default;
 
       /**
-       * Constructor, wraps a `uintptr_t`.  Note that this may be used outside
-       * of the meta entry by code wishing to provide uniform storage to things
-       * that are either in a meta entry or elsewhere.
+       * Constructor, wraps a `uintptr_t` and the frontend-reserved mask
+       * that applies to that word. Note that this may be used outside of
+       * the meta entry by code wishing to provide uniform storage to
+       * things that are either in a meta entry or elsewhere.
+       */
+      constexpr BackendStateWordRef(uintptr_t* v, address_t mask)
+      : val(v), reserved_mask(mask)
+      {}
+
+      /**
+       * Single-pointer constructor required by the `RBRepMethods`
+       * concept, which constructs a Handle from `&Rep::root` to
+       * verify sentinel constructibility (see
+       * `ds_core/redblacktree.h`). Reserved mask is zero, which is
+       * safe because `Rep::root` is a `static const` sentinel that
+       * the red-black tree never assigns through — any write would
+       * trap on the const data — and on read the underlying value is
+       * zero so `get()` returns zero regardless of the mask.
        */
       constexpr BackendStateWordRef(uintptr_t* v) : val(v) {}
 
@@ -307,7 +359,7 @@ namespace snmalloc
        */
       [[nodiscard]] uintptr_t get() const
       {
-        return (*val) & ~BACKEND_RESERVED_MASK;
+        return (*val) & ~reserved_mask;
       }
 
       /**
@@ -325,13 +377,13 @@ namespace snmalloc
       BackendStateWordRef& operator=(uintptr_t v)
       {
         SNMALLOC_ASSERT_MSG(
-          ((v & BACKEND_RESERVED_MASK) == 0),
-          "The back end is not permitted to use the low bits in the meta "
-          "entry. ({} & {}) == {}.",
+          ((v & reserved_mask) == 0),
+          "The back end is not permitted to use the reserved bits in the "
+          "meta entry. ({} & {}) == {}.",
           v,
-          BACKEND_RESERVED_MASK,
-          (v & BACKEND_RESERVED_MASK));
-        *val = v | (static_cast<address_t>(*val) & BACKEND_RESERVED_MASK);
+          reserved_mask,
+          (v & reserved_mask));
+        *val = v | (static_cast<address_t>(*val) & reserved_mask);
         return *this;
       }
 
@@ -371,7 +423,10 @@ namespace snmalloc
           remote_and_sizeclass);
         claim_for_backend();
       }
-      return {w == Word::One ? &meta : &remote_and_sizeclass};
+      return (w == Word::One) ?
+        BackendStateWordRef{&meta, BACKEND_RESERVED_MASK_WORD_ONE} :
+        BackendStateWordRef{
+          &remote_and_sizeclass, BACKEND_RESERVED_MASK_WORD_TWO};
     }
   };
 
@@ -738,14 +793,7 @@ namespace snmalloc
     SNMALLOC_FAST_PATH
     FrontendMetaEntry(SlabMetadata* meta, uintptr_t remote_and_sizeclass)
     : MetaEntryBase(unsafe_to_uintptr<SlabMetadata>(meta), remote_and_sizeclass)
-    {
-      SNMALLOC_ASSERT_MSG(
-        (REMOTE_BACKEND_MARKER & remote_and_sizeclass) == 0,
-        "Setting a backend-owned value ({}) via the front-end interface is not "
-        "allowed",
-        remote_and_sizeclass);
-      remote_and_sizeclass &= ~REMOTE_BACKEND_MARKER;
-    }
+    {}
 
     /**
      * Implicit copying of meta entries is almost certainly a bug and so the
@@ -764,13 +812,13 @@ namespace snmalloc
     }
 
     /**
-     * Return the FrontendSlabMetadata metadata associated with this chunk,
-     * guarded by an assert that this chunk is being used as a slab (i.e., has
-     * an associated owning allocator).
+     * Return the FrontendSlabMetadata pointer. Only meaningful when the
+     * entry is frontend-owned; in other states the underlying word
+     * holds tree-node fields. Callers must verify ownership first
+     * (the standard idiom is `entry.get_remote() == self->public_state()`).
      */
     [[nodiscard]] SNMALLOC_FAST_PATH SlabMetadata* get_slab_metadata() const
     {
-      SNMALLOC_ASSERT(!is_backend_owned());
       return unsafe_from_uintptr<SlabMetadata>(meta & ~META_BOUNDARY_BIT);
     }
   };
diff --git a/src/snmalloc/mitigations/allocconfig.h b/src/snmalloc/mitigations/allocconfig.h
index 3f326a570..3626e613a 100644
--- a/src/snmalloc/mitigations/allocconfig.h
+++ b/src/snmalloc/mitigations/allocconfig.h
@@ -94,9 +94,10 @@ namespace snmalloc
 #endif
     ;
 
-  // Used to configure when the backend should use thread local buddies.
-  // This only basically is used to disable some buddy allocators on small
-  // fixed heap scenarios like OpenEnclave.
-  static constexpr size_t MIN_HEAP_SIZE_FOR_THREAD_LOCAL_BUDDY =
+  // Used to configure when the backend should use the thread-local
+  // range cache. Disabled below this heap size for small fixed-heap
+  // scenarios like OpenEnclave, where the per-thread cache would
+  // dominate the heap.
+  static constexpr size_t MIN_HEAP_SIZE_FOR_THREAD_LOCAL_CACHE =
     bits::one_at_bit(27);
 } // namespace snmalloc
diff --git a/src/snmalloc/override/rust.cc b/src/snmalloc/override/rust.cc
index f07e51073..86ac6f5f8 100644
--- a/src/snmalloc/override/rust.cc
+++ b/src/snmalloc/override/rust.cc
@@ -39,8 +39,10 @@ extern "C" SNMALLOC_EXPORT void* SNMALLOC_NAME_MANGLE(rust_realloc)(
   size_t aligned_old_size = aligned_size(alignment, old_size),
          aligned_new_size = aligned_size(alignment, new_size);
   if (
-    size_to_sizeclass_full(aligned_old_size).raw() ==
-    size_to_sizeclass_full(aligned_new_size).raw())
+    aligned_old_size <= MAX_LARGE_SIZECLASS_SIZE &&
+    aligned_new_size <= MAX_LARGE_SIZECLASS_SIZE &&
+    size_to_sizeclass_full(aligned_old_size) ==
+      size_to_sizeclass_full(aligned_new_size))
     return ptr;
   void* p = alloc(aligned_new_size);
   if (p)
diff --git a/src/test/func/aligned_dealloc/aligned_dealloc.cc b/src/test/func/aligned_dealloc/aligned_dealloc.cc
new file mode 100644
index 000000000..51646e39c
--- /dev/null
+++ b/src/test/func/aligned_dealloc/aligned_dealloc.cc
@@ -0,0 +1,90 @@
+/**
+ * Regression test for the compile-time aligned alloc/dealloc API.
+ *
+ * `snmalloc::alloc<size, Conts, align>()` applies
+ * `aligned_size(align, size)` internally so the underlying reservation
+ * is large enough to satisfy `align`. The matching
+ * `snmalloc::dealloc<size, align>(p)` overload mirrors that: it applies
+ * the same `aligned_size` before `check_size`, so the size fed to the
+ * sized-dealloc sanity check is the size that was actually reserved.
+ *
+ * Without the aligned dealloc overload, callers either had to use the
+ * unsized `dealloc(p)` or manually pass `dealloc<aligned_size(align,
+ * size)>(p)`. Calling `dealloc<size>(p)` instead trips `check_size`
+ * under `mitigations(sanity_checks)` whenever the alignment upgrade
+ * pushes the reservation into a different sizeclass than `size`
+ * itself (e.g. `S = 33 KiB`, `A = 128 KiB`: the reservation lives in
+ * a 128 KiB sizeclass but `check_size` evaluates
+ * `size_to_sizeclass_full(33 KiB)`, a smaller class).
+ */
+
+#include "test/setup.h"
+#include "test/snmalloc_testlib.h"
+
+#include <iostream>
+
+using namespace snmalloc;
+
+namespace
+{
+  bool any_failures = false; // set by fail(); main() turns it into exit 1
+
+  void fail(const char* msg) // print "FAIL: <msg>" and latch any_failures
+  {
+    std::cout << "FAIL: " << msg << std::endl;
+    any_failures = true;
+  }
+
+  template<size_t size, size_t align>
+  void check_round_trip(const char* label) // label is printed on failure
+  {
+    void* p = snmalloc::alloc<size, ZeroMem::NoZero, align>();
+    if (p == nullptr)
+    {
+      fail(label);
+      return;
+    }
+    constexpr size_t reserved = aligned_size(align, size);
+    if (alloc_size(p) < reserved) // reservation must cover the aligned size
+    {
+      std::cout << "  reservation too small: alloc_size=" << alloc_size(p)
+                << " expected>=" << reserved << std::endl;
+      fail(label);
+      return; // NOTE(review): p is not released on this failure path
+    }
+    snmalloc::dealloc<size, align>(p); // sized + aligned dealloc under test
+  }
+} // namespace
+
+int main(int, char**)
+{
+  setup();
+
+  // The canonical pre-existing reproducer: today's pow2 rounding maps
+  // 33 KiB to one large sizeclass while the alignment-driven
+  // reservation lands in a strictly larger one.
+  check_round_trip<33 * 1024, 128 * 1024>("S=33KiB A=128KiB");
+
+  // Small-to-large alignment upgrade.
+  check_round_trip<48, 64 * 1024>("S=48B A=64KiB");
+
+  // Wider gap between requested size and required alignment.
+  check_round_trip<17 * 1024, 256 * 1024>("S=17KiB A=256KiB");
+
+  // align == size: alloc and dealloc see the same value pre- and
+  // post-aligned_size; serves as a baseline that the overload
+  // doesn't pessimise the simple case.
+  check_round_trip<64 * 1024, 64 * 1024>("S=64KiB A=64KiB");
+
+  // Small allocation, natural alignment.
+  check_round_trip<32, 32>("S=32B A=32B");
+
+  if (any_failures) // latched by fail() in any of the checks above
+  {
+    std::cout << "aligned_dealloc test FAILED" << std::endl;
+    return 1;
+  }
+
+  std::cout << "aligned_dealloc test passed" << std::endl;
+  return 0;
+}
diff --git a/src/test/func/arena/arena.cc b/src/test/func/arena/arena.cc
new file mode 100644
index 000000000..9ccb83099
--- /dev/null
+++ b/src/test/func/arena/arena.cc
@@ -0,0 +1,1571 @@
+/**
+ * Unit tests for Arena.
+ *
+ * Exercises the Rep adapters (BinRep, RangeRep), RBTree integration,
+ * add_block with consolidation, remove_block with carving, the
+ * five-clause invariant, and a randomised stress test with oracle.
+ */
+
+#include "test/setup.h"
+#include "test/xoroshiro.h"
+
+#include <algorithm>
+#include <cstdio>
+#include <cstdlib>
+#include <map>
+#include <set>
+#include <vector>
+
+#ifndef SNMALLOC_TRACING
+#  define SNMALLOC_TRACING
+#endif
+#include "test/snmalloc_testlib.h"
+
+#include <snmalloc/backend_helpers/arena.h>
+
+namespace snmalloc
+{
+  // ---- MockRep: array-backed storage for testing ----
+
+  /**
+   * Thin proxy around uintptr_t* mirroring BackendStateWordRef's interface
+   * (get, operator=, operator!=) but with no reserved-bit masking: reads
+   * and writes touch the whole word. Lets MockRep avoid a real pagemap.
+   */
+  struct ArenaWordRef
+  {
+    uintptr_t* val{nullptr};
+
+    constexpr ArenaWordRef() = default;
+
+    constexpr ArenaWordRef(uintptr_t* p) : val(p) {}
+
+    uintptr_t get() const // raw word, nothing masked off
+    {
+      return *val;
+    }
+
+    ArenaWordRef& operator=(uintptr_t v) // stores v; does not rebind val
+    {
+      *val = v;
+      return *this;
+    }
+
+    bool operator!=(const ArenaWordRef& other) const // pointer identity
+    {
+      return val != other.val;
+    }
+
+    uintptr_t printable_address() const // address of the wrapped word
+    {
+      return reinterpret_cast<uintptr_t>(val);
+    }
+  };
+
+  // Each chunk-aligned address maps to a mock_entry via its chunk index.
+  // word1/word2 hold bin-tree children and range_word1/range_word2 hold
+  // range-tree children; word1 and range_word1 also carry the red bit.
+  // variant and large_size hold metadata. boundary mirrors the real
+  // PagemapRep's entry.is_boundary(): set it to suppress consolidation.
+  struct mock_entry
+  {
+    uintptr_t word1{0};
+    uintptr_t word2{0};
+    uintptr_t range_word1{0};
+    uintptr_t range_word2{0};
+    ArenaVariant variant{ArenaVariant::Min};
+    size_t large_size{0};
+    bool boundary{false};
+  };
+
+  // Backing store indexed by chunk number; sized for the largest test arena.
+  static constexpr size_t MOCK_ARENA_CHUNKS = 1024;
+  static mock_entry mock_store[MOCK_ARENA_CHUNKS];
+
+  static void reset_mock_store() // restore every entry to its defaults
+  {
+    for (size_t i = 0; i < MOCK_ARENA_CHUNKS; i++)
+      mock_store[i] = mock_entry{};
+  }
+
+  static size_t mock_index(uintptr_t addr) // addr -> slot in mock_store
+  {
+    size_t idx = addr >> MIN_CHUNK_BITS; // chunk number of addr
+    SNMALLOC_ASSERT(idx < MOCK_ARENA_CHUNKS);
+    SNMALLOC_ASSUME(idx < MOCK_ARENA_CHUNKS); // presumably an optimiser hint
+    return idx;
+  }
+
+  // Inner RBTree Rep used by both MockRep::BinRep and MockRep::RangeRep.
+  // IsRange selects which pair of mock_entry fields holds the tree pointers.
+  // The red bit is packed into bit 8 of the stored word (matching the
+  // PagemapRep layout, but defined privately here).
+  template<bool IsRange>
+  struct MockTreeRep
+  {
+    using Handle = ArenaWordRef;
+    using Contents = uintptr_t;
+
+    static constexpr Contents null = 0;
+    static constexpr Contents root = 0;
+
+    static constexpr unsigned RED_BIT_POS = 8;
+    static constexpr uintptr_t RED_BIT = uintptr_t(1) << RED_BIT_POS;
+    static_assert(RED_BIT < MIN_CHUNK_SIZE); // below chunk alignment
+
+    static Handle ref(bool direction, Contents k)
+    {
+      static const Contents null_entry = 0; // shared target for k == 0
+      if (SNMALLOC_UNLIKELY(k == 0)) // NOTE(review): const object, read-only
+        return Handle{const_cast<Contents*>(&null_entry)};
+      auto& e = mock_store[mock_index(k)];
+      if constexpr (IsRange)
+        return Handle{direction ? &e.range_word1 : &e.range_word2};
+      else
+        return Handle{direction ? &e.word1 : &e.word2};
+    }
+
+    static Contents get(Handle h)
+    {
+      return h.get() & ~RED_BIT; // strip the colour bit
+    }
+
+    static void set(Handle h, Contents v)
+    {
+      h = v | (h.get() & RED_BIT); // keep the colour bit already stored
+    }
+
+    static bool is_red(Contents k) // colour lives in the ref(true, k) word
+    {
+      return (ref(true, k).get() & RED_BIT) == RED_BIT;
+    }
+
+    static void set_red(Contents k, bool new_is_red)
+    {
+      if (new_is_red != is_red(k))
+      {
+        auto h = ref(true, k);
+        h = h.get() ^ RED_BIT; // flip only when the colour changes
+      }
+    }
+
+    static bool compare(Contents k1, Contents k2)
+    {
+      return k1 > k2;
+    }
+
+    static bool equal(Contents k1, Contents k2)
+    {
+      return k1 == k2;
+    }
+
+    static uintptr_t printable(Contents k)
+    {
+      return k;
+    }
+
+    static uintptr_t printable(Handle h)
+    {
+      return h.printable_address();
+    }
+
+    static const char* name()
+    {
+      return IsRange ? "MockRangeRep" : "MockBinRep";
+    }
+  };
+
+  struct MockRep
+  {
+    using BinRep = MockTreeRep<false>; // children in word1/word2
+    using RangeRep = MockTreeRep<true>; // children in range_word1/range_word2
+
+    static ArenaVariant get_variant(uintptr_t addr)
+    {
+      return mock_store[mock_index(addr)].variant;
+    }
+
+    static void set_variant(uintptr_t addr, ArenaVariant v)
+    {
+      mock_store[mock_index(addr)].variant = v;
+    }
+
+    static size_t get_large_size(uintptr_t addr)
+    {
+      return mock_store[mock_index(addr)].large_size;
+    }
+
+    static void set_large_size(uintptr_t addr, size_t s)
+    {
+      mock_store[mock_index(addr)].large_size = s;
+    }
+
+    // Mirrors PagemapRep::can_consolidate, which reads
+    // entry.is_boundary() from the pagemap. The boundary flag lives
+    // per-chunk in mock_store. An out-of-region probe returns false
+    // (cannot consolidate) — both because that is the right semantic
+    // (no neighbour exists outside the arena) and because it gives
+    // GCC's release-mode `-Warray-bounds` analysis a visible guard
+    // covering the `mock_store[...]` read on this branch.
+    static bool can_consolidate(uintptr_t addr)
+    {
+      size_t idx = addr >> MIN_CHUNK_BITS; // not mock_index: no assert here
+      if (idx >= MOCK_ARENA_CHUNKS)
+        return false;
+      return !mock_store[idx].boundary;
+    }
+  };
+
+  // ---- Test access ----
+  struct ArenaTestAccess // exposes Arena's bin_trees, range_tree and bitmap
+  {
+    template<typename Arena>
+    static auto& get_bin_trees(Arena& a)
+    {
+      return a.bin_trees; // by reference, not a copy
+    }
+
+    template<typename Arena>
+    static auto& get_range_tree(Arena& a)
+    {
+      return a.range_tree; // by reference, not a copy
+    }
+
+    template<typename Arena>
+    static auto& get_bitmap(Arena& a)
+    {
+      return a.bitmap; // by reference, not a copy
+    }
+  };
+
+  // Byte address of chunk `chunk_idx` (the index shifted by MIN_CHUNK_BITS).
+  static uintptr_t chunk_addr(size_t chunk_idx)
+  {
+    return static_cast<uintptr_t>(chunk_idx) << MIN_CHUNK_BITS;
+  }
+
+  // Byte size of `n_chunks` chunks (the count shifted by MIN_CHUNK_BITS).
+  static constexpr size_t chunk_size(size_t n_chunks)
+  {
+    return n_chunks << MIN_CHUNK_BITS;
+  }
+
+  // ---- Test types ----
+  // K = number of address bits the arena covers above MIN_CHUNK_BITS.
+  // K=6 → arena of 64 chunks, K=8 → 256 chunks, K=10 → 1024 chunks.
+  template<size_t K>
+  using TestArena = Arena<MockRep, MIN_CHUNK_BITS, MIN_CHUNK_BITS + K>;
+
+  using Bins = ArenaBins<2, MIN_CHUNK_BITS>;
+
+  // ==================================================================
+  // (A) Accessor round-trips
+  // ==================================================================
+  static void test_variant_roundtrip() // each stored variant reads back
+  {
+    reset_mock_store();
+    uintptr_t a = chunk_addr(10); // any chunk inside mock_store
+
+    MockRep::set_variant(a, ArenaVariant::Min);
+    SNMALLOC_ASSERT(MockRep::get_variant(a) == ArenaVariant::Min);
+
+    MockRep::set_variant(a, ArenaVariant::EvenTwo);
+    SNMALLOC_ASSERT(MockRep::get_variant(a) == ArenaVariant::EvenTwo);
+
+    MockRep::set_variant(a, ArenaVariant::Large);
+    SNMALLOC_ASSERT(MockRep::get_variant(a) == ArenaVariant::Large);
+
+    printf("  Variant round-trip: OK\n");
+  }
+
+  static void test_large_size_roundtrip() // each stored size reads back
+  {
+    reset_mock_store();
+    uintptr_t a = chunk_addr(20); // any chunk inside mock_store
+
+    for (size_t s :
+         {size_t{3},
+          size_t{7},
+          size_t{15},
+          size_t{63},
+          size_t{255},
+          size_t{1000}})
+    {
+      MockRep::set_large_size(a, s);
+      SNMALLOC_ASSERT(MockRep::get_large_size(a) == s);
+    }
+
+    printf("  Large-size round-trip: OK\n");
+  }
+
+  static void test_word_roundtrip() // bin and range words are independent
+  {
+    reset_mock_store();
+    uintptr_t a = chunk_addr(5);
+
+    uintptr_t v1 = chunk_addr(10);
+    uintptr_t v2 = chunk_addr(20);
+
+    auto w1 = MockRep::BinRep::ref(true, a);
+    auto w2 = MockRep::BinRep::ref(false, a);
+    w1 = v1; // proxy assignment: writes the word, does not rebind w1
+    w2 = v2;
+    SNMALLOC_ASSERT(MockRep::BinRep::ref(true, a).get() == v1);
+    SNMALLOC_ASSERT(MockRep::BinRep::ref(false, a).get() == v2);
+
+    auto rw1 = MockRep::RangeRep::ref(true, a);
+    auto rw2 = MockRep::RangeRep::ref(false, a);
+    rw1 = v2; // swapped values, so aliasing with the bin words would show
+    rw2 = v1;
+    SNMALLOC_ASSERT(MockRep::RangeRep::ref(true, a).get() == v2);
+    SNMALLOC_ASSERT(MockRep::RangeRep::ref(false, a).get() == v1);
+
+    printf("  Word round-trip: OK\n");
+  }
+
+  // ==================================================================
+  // (B) RBTree<BinRep> / RBTree<RangeRep> smoke
+  // ==================================================================
+
+  // We can't directly instantiate BinRep/RangeRep outside Arena
+  // since they are private nested types. Instead, test them through
+  // Arena's add_block/remove_block which exercise both trees.
+  // For smoke testing of tree operations directly, we test through
+  // the Arena's own invariant and operation correctness.
+
+  static void test_rbtree_smoke_via_arena()
+  {
+    reset_mock_store();
+    TestArena<8> arena;
+    arena.check_invariant(true);
+
+    // Insert a few non-adjacent blocks (3, 5 and 1 chunks).
+    uintptr_t a1 = chunk_addr(10);
+    uintptr_t a2 = chunk_addr(20);
+    uintptr_t a3 = chunk_addr(30);
+
+    arena.add_block(a1, chunk_size(3));
+    arena.check_invariant(true);
+
+    arena.add_block(a2, chunk_size(5));
+    arena.check_invariant(true);
+
+    arena.add_block(a3, chunk_size(1));
+    arena.check_invariant(true);
+
+    // Remove by size (1, 3, 5 chunks); only non-zero results are asserted.
+    auto r1 = arena.remove_block(chunk_size(1));
+    SNMALLOC_ASSERT(r1 != 0);
+    UNUSED(r1);
+    arena.check_invariant(true);
+
+    auto r2 = arena.remove_block(chunk_size(3));
+    SNMALLOC_ASSERT(r2 != 0);
+    UNUSED(r2);
+    arena.check_invariant(true);
+
+    auto r3 = arena.remove_block(chunk_size(5));
+    SNMALLOC_ASSERT(r3 != 0);
+    UNUSED(r3);
+    arena.check_invariant(true);
+
+    printf("  RBTree smoke via arena: OK\n");
+  }
+
+  // ==================================================================
+  // (C) Empty-state invariant
+  // ==================================================================
+  template<size_t K>
+  static void test_empty_invariant() // a fresh arena passes check_invariant
+  {
+    reset_mock_store();
+    TestArena<K> arena;
+    arena.check_invariant(true);
+    printf("  Empty invariant (K=%zu): OK\n", K);
+  }
+
+  // ==================================================================
+  // (D) add_block without consolidation
+  // ==================================================================
+  static void test_add_no_consolidation()
+  {
+    reset_mock_store();
+    TestArena<8> arena;
+
+    // Insert several non-adjacent blocks of various sizes.
+    struct
+    {
+      size_t chunk_idx;
+      size_t size; // in chunks; converted with chunk_size() below
+    } blocks[] = {
+      {10, 1},
+      {20, 2},
+      {30, 3},
+      {40, 5},
+      {50, 9},
+    };
+
+    for (auto& b : blocks)
+    {
+      auto result =
+        arena.add_block(chunk_addr(b.chunk_idx), chunk_size(b.size));
+      SNMALLOC_ASSERT(result.first == 0 && result.second == 0);
+      UNUSED(result); // result is otherwise unused when the assert is off
+      arena.check_invariant(true);
+    }
+
+    printf("  add_block without consolidation: OK\n");
+  }
+
+  // ==================================================================
+  // (E) remove_block exact-class + carving
+  // ==================================================================
+  static void test_remove_exact()
+  {
+    reset_mock_store();
+    TestArena<8> arena;
+
+    // Insert 3 blocks of 5 chunks each at chunks 10, 20 and 30.
+    arena.add_block(chunk_addr(10), chunk_size(5));
+    arena.add_block(chunk_addr(20), chunk_size(5));
+    arena.add_block(chunk_addr(30), chunk_size(5));
+    arena.check_invariant(true);
+
+    // Remove 3 exact-size blocks.
+    for (int i = 0; i < 3; i++)
+    {
+      auto r = arena.remove_block(chunk_size(5));
+      SNMALLOC_ASSERT(r != 0);
+      UNUSED(r);
+      arena.check_invariant(true);
+    }
+
+    // Arena should be empty now: even a 1-chunk request must return 0.
+    auto r = arena.remove_block(chunk_size(1));
+    SNMALLOC_ASSERT(r == 0);
+    UNUSED(r);
+
+    printf("  remove_block exact: OK\n");
+  }
+
+  static void test_remove_carving()
+  {
+    reset_mock_store();
+    TestArena<8> arena;
+
+    // Insert one block of size 10.
+    arena.add_block(chunk_addr(10), chunk_size(10));
+    arena.check_invariant(true);
+
+    // Request size 3 chunks — should carve from the 10-chunk block.
+    auto r = arena.remove_block(chunk_size(3));
+    SNMALLOC_ASSERT(r != 0);
+    // NOTE(review): the address in `r` is never compared against `carved`.
+    auto carved = Bins::carve({chunk_addr(10), chunk_size(10)}, chunk_size(3));
+    UNUSED(r);
+    arena.check_invariant(true);
+
+    // Only the carved size is used: whatever was not handed out must
+    // still be removable from the arena one chunk at a time.
+    size_t remaining = chunk_size(10) - carved.req.size;
+    while (remaining > 0)
+    {
+      auto r2 = arena.remove_block(chunk_size(1));
+      SNMALLOC_ASSERT(r2 != 0);
+      UNUSED(r2);
+      arena.check_invariant(true);
+      remaining -= chunk_size(1);
+    }
+
+    // Should be empty.
+    auto r3 = arena.remove_block(chunk_size(1));
+    SNMALLOC_ASSERT(r3 == 0);
+    UNUSED(r3);
+
+    printf("  remove_block carving: OK\n");
+  }
+
+  // ==================================================================
+  // (F) Consolidation case matrix
+  // ==================================================================
+
+  // Helper: add a block, assert add_block returned {0, 0}, check invariant.
+  // `size_in_chunks` is a chunk count; converted to bytes internally.
+  template<typename ArenaT>
+  static void
+  add_and_check(ArenaT& arena, size_t chunk_idx, size_t size_in_chunks)
+  {
+    auto result =
+      arena.add_block(chunk_addr(chunk_idx), chunk_size(size_in_chunks));
+    SNMALLOC_ASSERT(result.first == 0 && result.second == 0);
+    UNUSED(result);
+    arena.check_invariant(true);
+  }
+
+  // Drain the arena by removing 1-chunk blocks until remove_block returns 0,
+  // checking the invariant after each removal. Returns the chunks removed.
+  template<typename ArenaT>
+  static size_t drain_arena(ArenaT& arena)
+  {
+    size_t total = 0;
+    while (true)
+    {
+      auto r = arena.remove_block(chunk_size(1));
+      if (r == 0) // nothing left to hand out
+        break;
+      total += 1;
+      arena.check_invariant(true);
+    }
+    return total;
+  }
+
+  // Case 12: P-only, P min (size 1): chunk 10 free, then add chunks 11-13.
+  static void test_consolidation_p_min()
+  {
+    reset_mock_store();
+    TestArena<8> arena;
+    add_and_check(arena, 10, 1);
+    add_and_check(arena, 11, 3);
+
+    // 1 + 3 chunks. NOTE(review): this total holds with or without a merge.
+    size_t total = drain_arena(arena);
+    SNMALLOC_ASSERT(total == 4);
+    UNUSED(total);
+
+    printf("  Consolidation P-only, P min: OK\n");
+  }
+
+  // Case 13: P-only, P non-min: chunks 10-12 free, then add chunks 13-14.
+  static void test_consolidation_p_nonmin()
+  {
+    reset_mock_store();
+    TestArena<8> arena;
+    add_and_check(arena, 10, 3);
+    add_and_check(arena, 13, 2);
+
+    size_t total = drain_arena(arena);
+    SNMALLOC_ASSERT(total == 5); // 3 + 2 chunks
+    UNUSED(total);
+
+    printf("  Consolidation P-only, P non-min: OK\n");
+  }
+
+  // Case 14: S-only, S min.
+  static void test_consolidation_s_min()
+  {
+    reset_mock_store();
+    TestArena<8> under_test;
+    // [11, 14) arrives directly before the single free chunk 14.
+    add_and_check(under_test, 14, 1);
+    add_and_check(under_test, 11, 3);
+
+    size_t drained = drain_arena(under_test);
+    SNMALLOC_ASSERT(drained == 4);
+    UNUSED(drained);
+    printf("  Consolidation S-only, S min: OK\n");
+  }
+
+  // Case 15: S-only, S non-min.
+  static void test_consolidation_s_nonmin()
+  {
+    reset_mock_store();
+    TestArena<8> under_test;
+    // [11, 14) arrives directly before the 4-chunk block [14, 18).
+    add_and_check(under_test, 14, 4);
+    add_and_check(under_test, 11, 3);
+
+    size_t drained = drain_arena(under_test);
+    SNMALLOC_ASSERT(drained == 7);
+    UNUSED(drained);
+    printf("  Consolidation S-only, S non-min: OK\n");
+  }
+
+  // Case 16: P+S, both min.
+  static void test_consolidation_ps_both_min()
+  {
+    reset_mock_store();
+    TestArena<8> under_test;
+    // Chunks 10 and 12 are free singles; chunk 11 then fills the gap.
+    add_and_check(under_test, 10, 1);
+    add_and_check(under_test, 12, 1);
+    add_and_check(under_test, 11, 1);
+
+    size_t drained = drain_arena(under_test);
+    SNMALLOC_ASSERT(drained == 3);
+    UNUSED(drained);
+    printf("  Consolidation P+S, both min: OK\n");
+  }
+
+  // Case 17: P+S, P min, S non-min.
+  static void test_consolidation_ps_p_min_s_nonmin()
+  {
+    reset_mock_store();
+    TestArena<8> under_test;
+    // Single chunk 10 below, [14, 17) above; [11, 14) then fills the gap.
+    add_and_check(under_test, 10, 1);
+    add_and_check(under_test, 14, 3);
+    add_and_check(under_test, 11, 3);
+
+    size_t drained = drain_arena(under_test);
+    SNMALLOC_ASSERT(drained == 7);
+    UNUSED(drained);
+    printf("  Consolidation P+S, P min, S non-min: OK\n");
+  }
+
+  // Case 18: P+S, P non-min, S min.
+  static void test_consolidation_ps_p_nonmin_s_min()
+  {
+    reset_mock_store();
+    TestArena<8> under_test;
+    // [10, 13) below, single chunk 16 above; [13, 16) then fills the gap.
+    add_and_check(under_test, 10, 3);
+    add_and_check(under_test, 16, 1);
+    add_and_check(under_test, 13, 3);
+
+    size_t drained = drain_arena(under_test);
+    SNMALLOC_ASSERT(drained == 7);
+    UNUSED(drained);
+    printf("  Consolidation P+S, P non-min, S min: OK\n");
+  }
+
+  // Case 19: P+S, both non-min.
+  static void test_consolidation_ps_both_nonmin()
+  {
+    reset_mock_store();
+    TestArena<8> under_test;
+    // [10, 14) below, [19, 24) above; [14, 19) then fills the gap.
+    add_and_check(under_test, 10, 4);
+    add_and_check(under_test, 19, 5);
+    add_and_check(under_test, 14, 5);
+
+    size_t drained = drain_arena(under_test);
+    SNMALLOC_ASSERT(drained == 14);
+    UNUSED(drained);
+    printf("  Consolidation P+S, both non-min: OK\n");
+  }
+
+  // ==================================================================
+  // (F2) OddTwo — unaligned size-2 blocks
+  // ==================================================================
+
+  static void test_oddtwo_variant()
+  {
+    // A two-chunk block is tagged by the parity of its first chunk index:
+    // odd start -> OddTwo, even start -> EvenTwo.
+    reset_mock_store();
+    TestArena<8> under_test;
+    uintptr_t odd_base = chunk_addr(11);
+    uintptr_t even_base = chunk_addr(20);
+
+    // Two chunks starting on an odd index.
+    under_test.add_block(odd_base, chunk_size(2));
+    SNMALLOC_ASSERT(MockRep::get_variant(odd_base) == ArenaVariant::OddTwo);
+    under_test.check_invariant(true);
+
+    // Two chunks starting on an even index.
+    under_test.add_block(even_base, chunk_size(2));
+    SNMALLOC_ASSERT(MockRep::get_variant(even_base) == ArenaVariant::EvenTwo);
+    under_test.check_invariant(true);
+
+    // The range tree must be able to find each block by its base address.
+    auto& by_range = ArenaTestAccess::get_range_tree(under_test);
+    auto odd_path = by_range.get_root_path();
+    SNMALLOC_ASSERT(by_range.find(odd_path, odd_base));
+    auto even_path = by_range.get_root_path();
+    SNMALLOC_ASSERT(by_range.find(even_path, even_base));
+
+    // OddTwo entries also live in bin tree 0 (the size-1 servable set).
+    auto& bin_zero = ArenaTestAccess::get_bin_trees(under_test)[0];
+    auto bin_path = bin_zero.get_root_path();
+    SNMALLOC_ASSERT(bin_zero.find(bin_path, odd_base));
+    UNUSED(odd_path, even_path, bin_path);
+
+    size_t drained = drain_arena(under_test);
+    SNMALLOC_ASSERT(drained == 4);
+    UNUSED(drained);
+    printf("  OddTwo variant tagging: OK\n");
+  }
+
+  static void test_oddtwo_contains_min_filter()
+  {
+    // contains_min must not match OddTwo entries.
+    reset_mock_store();
+    TestArena<8> under_test;
+
+    // Seed the arena with an OddTwo block covering chunks 11-12 ...
+    under_test.add_block(chunk_addr(11), chunk_size(2));
+    under_test.check_invariant(true);
+
+    // ... and a lone chunk at 14, which is not adjacent to it.
+    under_test.add_block(chunk_addr(14), chunk_size(1));
+    under_test.check_invariant(true);
+
+    // Freeing chunk 13 closes the gap between the two. The min-size
+    // lookup (contains_min) is expected to see only chunk 14 and never
+    // the OddTwo entry at chunk 11; that neighbour is the range tree's
+    // responsibility.
+    under_test.add_block(chunk_addr(13), chunk_size(1));
+    under_test.check_invariant(true);
+
+    // Whatever route consolidation took, exactly 2 + 1 + 1 chunks must
+    // come back out, and the invariant must hold after every removal.
+    size_t drained = drain_arena(under_test);
+    SNMALLOC_ASSERT(drained == 4);
+    UNUSED(drained);
+
+    printf("  OddTwo contains_min filter: OK\n");
+  }
+
+  static void test_oddtwo_consolidation()
+  {
+    // An OddTwo block must merge with a block freed directly after it.
+    reset_mock_store();
+    TestArena<8> under_test;
+    uintptr_t odd_base = chunk_addr(11);
+
+    // OddTwo block: two chunks from an odd index (chunks 11-12).
+    under_test.add_block(odd_base, chunk_size(2));
+    under_test.check_invariant(true);
+
+    // Chunk 13 starts exactly where that block ends, so the block at
+    // chunk 11 is its predecessor in the range tree and the two should
+    // fuse into one 3-chunk block based at chunk 11.
+    under_test.add_block(chunk_addr(13), chunk_size(1));
+    under_test.check_invariant(true);
+
+    // A 3-chunk request can only be met by the fused block.
+    auto served = under_test.remove_block(chunk_size(3));
+    SNMALLOC_ASSERT(served == odd_base);
+    UNUSED(served);
+    printf("  OddTwo consolidation (successor): OK\n");
+  }
+
+  static void test_oddtwo_consolidation_pred()
+  {
+    // An OddTwo block must merge with a block freed directly before it.
+    reset_mock_store();
+    TestArena<8> under_test;
+    uintptr_t low_base = chunk_addr(10);
+
+    // OddTwo block: two chunks from an odd index (chunks 11-12).
+    under_test.add_block(chunk_addr(11), chunk_size(2));
+    under_test.check_invariant(true);
+
+    // Chunk 10 ends where the OddTwo block begins, making that block its
+    // range-tree successor; the result should be 3 chunks based at 10.
+    under_test.add_block(low_base, chunk_size(1));
+    under_test.check_invariant(true);
+
+    auto served = under_test.remove_block(chunk_size(3));
+    SNMALLOC_ASSERT(served == low_base);
+    UNUSED(served);
+    printf("  OddTwo consolidation (predecessor): OK\n");
+  }
+
+  static void test_oddtwo_remove_carve()
+  {
+    // Serving a 1-chunk request out of an OddTwo block must split it.
+    reset_mock_store();
+    TestArena<8> under_test;
+
+    // The only free memory: an OddTwo block (odd chunk 11, two chunks).
+    under_test.add_block(chunk_addr(11), chunk_size(2));
+    under_test.check_invariant(true);
+
+    // First request: one chunk is carved off the OddTwo block.
+    auto first = under_test.remove_block(chunk_size(1));
+    SNMALLOC_ASSERT(first != 0);
+    under_test.check_invariant(true);
+
+    // Second request: the single leftover chunk is handed out.
+    auto second = under_test.remove_block(chunk_size(1));
+    SNMALLOC_ASSERT(second != 0);
+    UNUSED(first, second);
+
+    // Third request: nothing remains, so the arena reports failure (0).
+    auto third = under_test.remove_block(chunk_size(1));
+    SNMALLOC_ASSERT(third == 0);
+    UNUSED(third);
+
+    printf("  OddTwo remove + carve: OK\n");
+  }
+
+  // ==================================================================
+  // (G) Overflow — arena-scale consolidation
+  // ==================================================================
+  static void test_overflow()
+  {
+    // K=4 gives a 16-chunk arena; chunk indices start at FIRST so that
+    // no block ever sits at address 0.
+    reset_mock_store();
+    TestArena<4> under_test;
+    constexpr size_t FIRST = 16;
+    constexpr size_t LAST = FIRST + 15;
+
+    // Phase 1: free every other chunk (8 isolated single-chunk blocks).
+    for (size_t idx = FIRST; idx < LAST; idx += 2)
+    {
+      under_test.add_block(chunk_addr(idx), chunk_size(1));
+      under_test.check_invariant(true);
+    }
+
+    // Phase 2: free the chunks in between. Every add bridges two
+    // neighbours; the one at LAST makes all 16 chunks contiguous.
+    for (size_t idx = FIRST + 1; idx <= LAST; idx += 2)
+    {
+      under_test.add_block(chunk_addr(idx), chunk_size(1));
+      // The final add reports overflow, so skip the invariant there.
+      if (idx != LAST)
+        under_test.check_invariant(true);
+    }
+
+    // Overflow hands the whole 16-chunk (2^4) range back to the caller,
+    // so the arena must now be unable to serve even one chunk.
+    auto served = under_test.remove_block(chunk_size(1));
+    SNMALLOC_ASSERT(served == 0);
+    UNUSED(served);
+
+    printf("  Overflow (arena-scale consolidation): OK\n");
+  }
+
+  static void test_overflow_precise()
+  {
+    // K=4 gives a 16-chunk arena; start at chunk 16 to stay off address 0.
+    reset_mock_store();
+    TestArena<4> under_test;
+    uintptr_t low_half = chunk_addr(16);
+    uintptr_t high_half = chunk_addr(16 + 8);
+
+    // The lower 8 chunks alone fit inside the arena.
+    under_test.add_block(low_half, chunk_size(8));
+    under_test.check_invariant(true);
+
+    // The upper 8 chunks complete 16 = 2^4 chunks, so add_block overflows.
+    auto overflow = under_test.add_block(high_half, chunk_size(8));
+    SNMALLOC_ASSERT(overflow.first == low_half);
+    SNMALLOC_ASSERT(overflow.second == chunk_size(16));
+    UNUSED(overflow);
+
+    auto served = under_test.remove_block(chunk_size(1));
+    SNMALLOC_ASSERT(served == 0);
+    UNUSED(served);
+    printf("  Overflow precise: OK\n");
+  }
+
+  // ==================================================================
+  // (H) Randomised stress with oracle
+  // ==================================================================
+
+  // Oracle: std::set of (addr_chunks, size_chunks) representing
+  // maximally-consolidated free set.
+  struct OracleRange
+  {
+    size_t addr; // first chunk of the range, in chunk units
+    size_t size; // length of the range, in chunk units
+    // Ordering looks at the start chunk alone; the length plays no part.
+    bool operator<(const OracleRange& rhs) const
+    {
+      return rhs.addr > addr;
+    }
+    // Equality, unlike the ordering, also requires matching lengths.
+    bool operator==(const OracleRange& rhs) const
+    {
+      return !(addr != rhs.addr || size != rhs.size);
+    }
+  };
+
+  class Oracle
+  {
+    std::set<OracleRange> ranges; // free ranges, base-relative chunk units
+    size_t base_offset; // chunk offset to match arena addresses
+
+  public:
+    Oracle() : base_offset(0) {}
+
+    Oracle(size_t base) : base_offset(base) {}
+
+    // Record [addr_chunks, addr_chunks + size_chunks) as free, fusing it
+    // with a range that starts right after it and/or ends right before it.
+    void add(size_t addr_chunks, size_t size_chunks)
+    {
+      size_t merged_addr = addr_chunks;
+      size_t merged_size = size_chunks;
+      // First stored range that does not start below the new one.
+      auto next = ranges.lower_bound(OracleRange{addr_chunks, size_chunks});
+      if (next != ranges.end() && merged_addr + merged_size == next->addr)
+      {
+        merged_size += next->size;
+        next = ranges.erase(next);
+      }
+
+      if (next != ranges.begin())
+      {
+        auto before = std::prev(next);
+        if (merged_addr == before->addr + before->size)
+        {
+          merged_addr = before->addr;
+          merged_size += before->size;
+          ranges.erase(before);
+        }
+      }
+
+      ranges.insert({merged_addr, merged_size});
+    }
+
+    // Serve a request for n_chunks. Yields {addr_chunks, size_chunks} with
+    // addr_chunks base-relative, or {0, 0} when no free range can serve it.
+    std::pair<size_t, size_t> remove(size_t n_chunks)
+    {
+      size_t want_bytes = n_chunks << MIN_CHUNK_BITS;
+      if (want_bytes == 0 || want_bytes > Bins::max_supported_size())
+        return {0, 0};
+
+      // Classify every free range the way the arena does: feed the bitmap
+      // arena-offset byte ranges and note which ranges fall into each bin.
+      typename Bins::Bitmap bitmap{};
+      std::map<size_t, std::vector<std::set<OracleRange>::iterator>> members;
+      for (auto cur = ranges.begin(); cur != ranges.end(); ++cur)
+      {
+        typename Bins::range_t as_bytes{
+          (base_offset + cur->addr) << MIN_CHUNK_BITS,
+          cur->size << MIN_CHUNK_BITS};
+        size_t bin = bitmap.add(as_bytes);
+        members[bin].push_back(cur);
+      }
+
+      size_t chosen_bin = bitmap.find_for_request(want_bytes);
+      if (chosen_bin == SIZE_MAX)
+        return {0, 0};
+
+      // Among the ranges in the chosen bin, the lowest start address wins.
+      auto& candidates = members[chosen_bin];
+      auto winner = candidates[0];
+      for (auto candidate : candidates)
+        if (candidate->addr < winner->addr)
+          winner = candidate;
+
+      // Copy the range out before erasing invalidates the iterator.
+      OracleRange taken = *winner;
+      ranges.erase(winner);
+
+      // Split it into the part served plus whatever is left on either side.
+      typename Bins::range_t whole{
+        (base_offset + taken.addr) << MIN_CHUNK_BITS,
+        taken.size << MIN_CHUNK_BITS};
+      auto split = Bins::carve(whole, want_bytes);
+      if (split.pre.size != 0)
+        ranges.insert(
+          {(split.pre.base >> MIN_CHUNK_BITS) - base_offset,
+           split.pre.size >> MIN_CHUNK_BITS});
+      if (split.post.size != 0)
+        ranges.insert(
+          {(split.post.base >> MIN_CHUNK_BITS) - base_offset,
+           split.post.size >> MIN_CHUNK_BITS});
+
+      return {
+        (split.req.base >> MIN_CHUNK_BITS) - base_offset,
+        split.req.size >> MIN_CHUNK_BITS};
+    }
+
+    bool empty() const
+    {
+      return ranges.size() == 0;
+    }
+
+    size_t count() const
+    {
+      return ranges.size();
+    }
+  };
+
+  template<size_t K> // K = log2 of the arena size in chunks
+  static void test_stress_seed(size_t seed, size_t num_ops)
+  {
+    reset_mock_store();
+    TestArena<K> arena;
+
+    constexpr size_t ARENA_CHUNKS = bits::one_at_bit(K);
+    // Chunk j is placed at index BASE + j, keeping address 0 (tree null) free.
+    constexpr size_t BASE = ARENA_CHUNKS;
+    Oracle oracle(BASE);
+    // allocated[j]: chunk j is currently NOT in the free set. All start true.
+    std::vector<bool> allocated(ARENA_CHUNKS, true);
+
+    xoroshiro::p128r64 rng(seed);
+
+    for (size_t op = 0; op < num_ops; op++)
+    {
+      bool do_add = (rng.next() % 3) != 0; // Add unless the draw is 0 mod 3.
+
+      if (do_add)
+      {
+        // Draw a size in [1, max_size] and a starting chunk in the arena.
+        size_t max_size = ARENA_CHUNKS / 4;
+        if (max_size < 1)
+          max_size = 1;
+        size_t size = (rng.next() % max_size) + 1;
+        size_t start = rng.next() % ARENA_CHUNKS;
+
+        // Only chunks that are currently allocated may be added (freed).
+        // Scan upwards from `start` for the first allocated chunk and take
+        // the run of at most `size` allocated chunks that begins there.
+        bool found = false;
+        for (size_t try_start = start; try_start < ARENA_CHUNKS; try_start++)
+        {
+          // Length of the allocated run at try_start, capped at `size`.
+          size_t actual_size = 0;
+          for (size_t j = try_start; j < ARENA_CHUNKS && j < try_start + size;
+               j++)
+          {
+            if (!allocated[j])
+              break;
+            actual_size++;
+          }
+
+          if (actual_size >= 1)
+          {
+            size = actual_size;
+            start = try_start;
+            found = true;
+            break;
+          }
+        }
+
+        if (!found)
+          continue; // nothing allocated at or above `start`; skip this op
+
+        // Defensive clamps: below the full arena size, within its bounds.
+        if (size >= ARENA_CHUNKS)
+          size = ARENA_CHUNKS - 1;
+        if (start + size > ARENA_CHUNKS)
+          size = ARENA_CHUNKS - start;
+        if (size == 0)
+          continue;
+
+        // The run is about to enter the free set.
+        SNMALLOC_ASSERT(start + size <= ARENA_CHUNKS);
+        for (size_t j = start; j < start + size; j++)
+          allocated[j] = false;
+
+        auto result =
+          arena.add_block(chunk_addr(BASE + start), chunk_size(size));
+        oracle.add(start, size); // the oracle works in base-relative chunks
+
+        if (result.first != 0)
+        {
+          // Overflow: add_block handed a range back instead of storing it.
+          // Per this test's model the arena is then empty again, so start
+          // over: mark every chunk allocated and replace the oracle.
+          for (size_t j = 0; j < ARENA_CHUNKS; j++)
+            allocated[j] = true;
+          oracle = Oracle(BASE);
+          // Nothing from the overflow range is tracked by the arena any more.
+        }
+
+        arena.check_invariant(true);
+      }
+      else
+      {
+        // Remove: ask arena and oracle for the same number of chunks.
+        size_t max_req = ARENA_CHUNKS / 4;
+        if (max_req < 1)
+          max_req = 1;
+        size_t n = (rng.next() % max_req) + 1;
+
+        auto arena_result = arena.remove_block(chunk_size(n));
+        auto oracle_result = oracle.remove(n);
+        UNUSED(arena_result); // otherwise only read inside SNMALLOC_ASSERT
+
+        // The oracle reports failure as size 0, the arena as address 0.
+        if (oracle_result.second == 0)
+        {
+          SNMALLOC_ASSERT(arena_result == 0);
+        }
+        else
+        {
+          SNMALLOC_ASSERT(arena_result != 0);
+          SNMALLOC_ASSERT(
+            arena_result == chunk_addr(BASE + oracle_result.first));
+
+          // The served chunks leave the free set.
+          size_t start = oracle_result.first;
+          SNMALLOC_ASSERT(start + oracle_result.second <= ARENA_CHUNKS);
+          for (size_t j = start; j < start + oracle_result.second; j++)
+            allocated[j] = true;
+        }
+
+        arena.check_invariant(true);
+      }
+    }
+  }
+
+  static void test_stress()
+  {
+    // K = 6: a 64-chunk arena; 50 seeds of 500 operations each.
+    constexpr size_t K = 6;
+    constexpr size_t NUM_OPS = 500;
+    constexpr size_t NUM_SEEDS = 50;
+
+    // Seeds are 1-based: 1, 2, ..., NUM_SEEDS.
+    for (size_t seed = 0; seed < NUM_SEEDS; seed++)
+      test_stress_seed<K>(seed + 1, NUM_OPS);
+    printf(
+      "  Randomised stress (%zu seeds x %zu ops): OK\n", NUM_SEEDS, NUM_OPS);
+  }
+
+  // ==================================================================
+  // (I) Multi-instance: shared pagemap, blocks migrating between arenas
+  // ==================================================================
+
+  static void test_multi_instance_basic()
+  {
+    reset_mock_store();
+    TestArena<8> first;
+    TestArena<8> second;
+    constexpr size_t BASE = 256; // keeps every block away from address 0
+
+    // Give each arena one private 5-chunk block.
+    first.add_block(chunk_addr(BASE + 10), chunk_size(5));
+    second.add_block(chunk_addr(BASE + 30), chunk_size(5));
+    first.check_invariant(true);
+    second.check_invariant(true);
+
+    // first -> second: take 3 chunks out of one arena ...
+    uintptr_t to_second = first.remove_block(chunk_size(3));
+    SNMALLOC_ASSERT(to_second != 0);
+    first.check_invariant(true);
+
+    // ... and free them into the other; both must stay consistent.
+    second.add_block(to_second, chunk_size(3));
+    first.check_invariant(true);
+    second.check_invariant(true);
+
+    // second -> first: the same dance in the opposite direction.
+    uintptr_t to_first = second.remove_block(chunk_size(2));
+    SNMALLOC_ASSERT(to_first != 0);
+    second.check_invariant(true);
+    first.add_block(to_first, chunk_size(2));
+    first.check_invariant(true);
+    second.check_invariant(true);
+
+    printf("  Basic migration: OK\n");
+  }
+
+  static void test_multi_instance_consolidation()
+  {
+    reset_mock_store();
+    TestArena<8> donor;
+    TestArena<8> receiver;
+    constexpr size_t BASE = 256;
+
+    // The receiver owns [20..24) and [28..32), leaving a 4-chunk hole.
+    receiver.add_block(chunk_addr(BASE + 20), chunk_size(4));
+    receiver.add_block(chunk_addr(BASE + 28), chunk_size(4));
+    receiver.check_invariant(true);
+
+    // The donor owns exactly that hole, [24..28).
+    donor.add_block(chunk_addr(BASE + 24), chunk_size(4));
+    donor.check_invariant(true);
+
+    // Pull the hole out of the donor ...
+    uintptr_t moved = donor.remove_block(chunk_size(4));
+    SNMALLOC_ASSERT(moved == chunk_addr(BASE + 24));
+    donor.check_invariant(true);
+
+    // ... and free it into the receiver, which should now hold [20..32).
+    receiver.add_block(moved, chunk_size(4));
+    receiver.check_invariant(true);
+
+    // Only the fused 12-chunk block can satisfy a 12-chunk request.
+    uintptr_t served = receiver.remove_block(chunk_size(12));
+    SNMALLOC_ASSERT(served == chunk_addr(BASE + 20));
+    UNUSED(served);
+    receiver.check_invariant(true);
+    printf("  Consolidation after migration: OK\n");
+  }
+
+  template<size_t K> // K = log2 of each arena's size in chunks
+  static void test_multi_stress_seed(size_t seed, size_t num_ops)
+  {
+    reset_mock_store();
+    TestArena<K> arena_a;
+    TestArena<K> arena_b;
+
+    constexpr size_t ARENA_CHUNKS = bits::one_at_bit(K);
+    constexpr size_t BASE = ARENA_CHUNKS; // chunk j lives at index BASE + j
+    Oracle oracle_a(BASE);
+    Oracle oracle_b(BASE);
+
+    // owner[j]: 0 = in neither arena, 1 = held by arena A, 2 = by arena B.
+    std::vector<uint8_t> owner(ARENA_CHUNKS, 0);
+
+    xoroshiro::p128r64 rng(seed);
+
+    for (size_t op = 0; op < num_ops; op++)
+    {
+      // action 0/1: add to A/B; 2/3: remove from A/B; 4: migrate a block.
+      size_t action = rng.next() % 5;
+
+      bool target_a = (action & 1) == 0; // even action -> arena A
+      auto& arena = target_a ? arena_a : arena_b;
+      auto& oracle = target_a ? oracle_a : oracle_b;
+      uint8_t my_id = target_a ? 1 : 2;
+
+      if (action <= 1)
+      {
+        // Add: pick a run of unowned chunks and free it into this arena.
+        size_t max_size = ARENA_CHUNKS / 4;
+        if (max_size < 1)
+          max_size = 1;
+        size_t size = (rng.next() % max_size) + 1;
+        size_t start = rng.next() % ARENA_CHUNKS;
+
+        bool found = false;
+        for (size_t s = start; s < ARENA_CHUNKS; s++)
+        {
+          size_t actual = 0; // unowned run length at s, capped at `size`
+          for (size_t j = s; j < ARENA_CHUNKS && j < s + size; j++)
+          {
+            if (owner[j] != 0)
+              break;
+            actual++;
+          }
+          if (actual >= 1)
+          {
+            size = actual;
+            start = s;
+            found = true;
+            break;
+          }
+        }
+        if (!found)
+          continue; // no unowned chunk at or above `start`; skip this op
+
+        if (size >= ARENA_CHUNKS)
+          size = ARENA_CHUNKS - 1;
+        if (start + size > ARENA_CHUNKS)
+          size = ARENA_CHUNKS - start;
+        if (size == 0)
+          continue;
+
+        for (size_t j = start; j < start + size; j++)
+          owner[j] = my_id;
+
+        auto result =
+          arena.add_block(chunk_addr(BASE + start), chunk_size(size));
+        oracle.add(start, size); // the oracle works in base-relative chunks
+
+        if (result.first != 0) // add_block handed a range back: overflow
+        {
+          for (size_t j = 0; j < ARENA_CHUNKS; j++)
+            if (owner[j] == my_id)
+              owner[j] = 0;
+          oracle = Oracle(BASE); // this arena's model restarts empty
+        }
+
+        arena.check_invariant(true);
+      }
+      else if (action <= 3)
+      {
+        // Remove: ask this arena and its oracle for the same chunk count.
+        size_t max_req = ARENA_CHUNKS / 4;
+        if (max_req < 1)
+          max_req = 1;
+        size_t n = (rng.next() % max_req) + 1;
+
+        auto arena_r = arena.remove_block(chunk_size(n));
+        auto oracle_r = oracle.remove(n);
+        UNUSED(arena_r); // otherwise only read inside SNMALLOC_ASSERT
+
+        if (oracle_r.second == 0) // oracle found nothing to serve
+        {
+          SNMALLOC_ASSERT(arena_r == 0);
+        }
+        else
+        {
+          SNMALLOC_ASSERT(arena_r != 0);
+          SNMALLOC_ASSERT(arena_r == chunk_addr(BASE + oracle_r.first));
+
+          for (size_t j = oracle_r.first; j < oracle_r.first + oracle_r.second;
+               j++)
+          {
+            SNMALLOC_ASSERT(owner[j] == my_id);
+            owner[j] = 0;
+          }
+        }
+
+        arena.check_invariant(true);
+      }
+      else
+      {
+        // Migrate: take 1..3 chunks out of one arena, free them into the other.
+        bool from_a = (rng.next() & 1) == 0; // drawn separately from `action`
+        auto& src = from_a ? arena_a : arena_b;
+        auto& src_oracle = from_a ? oracle_a : oracle_b;
+        auto& dst = from_a ? arena_b : arena_a;
+        auto& dst_oracle = from_a ? oracle_b : oracle_a;
+        uint8_t src_id = from_a ? 1 : 2;
+        uint8_t dst_id = from_a ? 2 : 1;
+        UNUSED(src_id); // only read inside SNMALLOC_ASSERT
+
+        size_t n = (rng.next() % 3) + 1;
+        uintptr_t src_r = src.remove_block(chunk_size(n));
+        auto src_or = src_oracle.remove(n);
+
+        if (src_or.second == 0) // source had nothing to give; no migration
+        {
+          SNMALLOC_ASSERT(src_r == 0);
+        }
+        else
+        {
+          SNMALLOC_ASSERT(src_r != 0);
+          SNMALLOC_ASSERT(src_r == chunk_addr(BASE + src_or.first));
+
+          for (size_t j = src_or.first; j < src_or.first + src_or.second; j++)
+          {
+            SNMALLOC_ASSERT(owner[j] == src_id);
+            owner[j] = dst_id;
+          }
+
+          auto dst_r = dst.add_block(src_r, chunk_size(src_or.second));
+          dst_oracle.add(src_or.first, src_or.second);
+
+          if (dst_r.first != 0) // destination overflowed: reset its model
+          {
+            for (size_t j = 0; j < ARENA_CHUNKS; j++)
+              if (owner[j] == dst_id)
+                owner[j] = 0;
+            dst_oracle = Oracle(BASE);
+          }
+        }
+
+        src.check_invariant(true);
+        dst.check_invariant(true);
+      }
+    }
+  }
+
+  static void test_multi_stress()
+  {
+    // K = 6: 64-chunk arenas; 50 seeds (1..50) of 500 operations each.
+    constexpr size_t K = 6;
+    constexpr size_t NUM_OPS = 500;
+    constexpr size_t NUM_SEEDS = 50;
+
+    for (size_t seed = 0; seed < NUM_SEEDS; seed++)
+      test_multi_stress_seed<K>(seed + 1, NUM_OPS);
+    printf(
+      "  Multi-instance stress (%zu seeds x %zu ops): OK\n",
+      NUM_SEEDS,
+      NUM_OPS);
+  }
+
+  // ==================================================================
+  // (J) Boundary consolidation prevention
+  // ==================================================================
+  //
+  // The boundary field on mock_entry suppresses consolidation across
+  // that chunk; MockRep::can_consolidate reads it. This mirrors the
+  // real PagemapRep::can_consolidate reading entry.is_boundary().
+
+  // Test: predecessor merge blocked by boundary.
+  static void test_boundary_blocks_predecessor()
+  {
+    reset_mock_store();
+    TestArena<6> under_test;
+
+    // Two adjacent 2-chunk blocks: `lower` covers chunks 2-3 and `upper`
+    // covers chunks 4-5.
+    uintptr_t lower = chunk_addr(2);
+    uintptr_t upper = chunk_addr(4);
+
+    // Flag the first chunk of `upper` as a boundary before anything is
+    // added, so `upper` must not merge with what lies to its left.
+    mock_store[mock_index(upper)].boundary = true;
+    under_test.add_block(lower, chunk_size(2));
+    under_test.add_block(upper, chunk_size(2));
+
+    // Unmerged, two separate 2-chunk requests come back as `lower`, then
+    // `upper`.
+    auto first = under_test.remove_block(chunk_size(2));
+    SNMALLOC_ASSERT(first == lower);
+    auto second = under_test.remove_block(chunk_size(2));
+    SNMALLOC_ASSERT(second == upper);
+    UNUSED(first, second);
+    printf("  Boundary blocks predecessor merge: OK\n");
+  }
+
+  // Test: successor merge blocked by boundary.
+  static void test_boundary_blocks_successor()
+  {
+    reset_mock_store();
+    TestArena<6> under_test;
+
+    // `lower` covers chunks 2-3; `upper` is the 4-chunk block right after
+    // it (chunks 4-7).
+    uintptr_t lower = chunk_addr(2);
+    uintptr_t upper = chunk_addr(4);
+
+    // Flag the first chunk of `upper` as a boundary. This time `upper` is
+    // already free when `lower` arrives, so the blocked merge is rightward.
+    mock_store[mock_index(upper)].boundary = true;
+    under_test.add_block(upper, chunk_size(4));
+    under_test.add_block(lower, chunk_size(2));
+
+    // Each block must still be retrievable whole and on its own: a 2-chunk
+    // request yields `lower`, a 4-chunk request yields `upper`.
+    auto two_chunks = under_test.remove_block(chunk_size(2));
+    SNMALLOC_ASSERT(two_chunks == lower);
+    auto four_chunks = under_test.remove_block(chunk_size(4));
+    SNMALLOC_ASSERT(four_chunks == upper);
+    UNUSED(two_chunks, four_chunks);
+    printf("  Boundary blocks successor merge: OK\n");
+  }
+
+  // Test: boundary only blocks the specific merge; other merges proceed.
+  static void test_boundary_partial()
+  {
+    reset_mock_store();
+    TestArena<6> under_test;
+    uintptr_t fenced = chunk_addr(8);
+
+    // Layout: [4,6) [6,8) | [8,10), where `|` is a boundary flagged on
+    // chunk 8. The fence must stop [6,8) from joining [8,10) while still
+    // letting [4,6) and [6,8) fuse into one block based at chunk 4.
+    mock_store[mock_index(fenced)].boundary = true;
+
+    // The middle block goes in last so it has neighbours on both sides.
+    under_test.add_block(chunk_addr(4), chunk_size(2));
+    under_test.add_block(fenced, chunk_size(2));
+    under_test.add_block(chunk_addr(6), chunk_size(2));
+
+    // A 4-chunk request proves the left merge; the 2-chunk request that
+    // follows proves [8,10) stayed a block of its own.
+    auto merged = under_test.remove_block(chunk_size(4));
+    SNMALLOC_ASSERT(merged == chunk_addr(4));
+    auto lone = under_test.remove_block(chunk_size(2));
+    SNMALLOC_ASSERT(lone == fenced);
+    UNUSED(merged, lone);
+    printf("  Boundary partial (P merges, S blocked): OK\n");
+  }
+
+  // Regression test: a block whose successor address sits one past
+  // the arena's pagemap must not trigger a can_consolidate probe of
+  // that out-of-range chunk. The fix is in Arena::add_block —
+  // tree-membership tests gate the can_consolidate read. MockRep's
+  // can_consolidate now dereferences mock_store via mock_index, which
+  // asserts on out-of-range indices, so an unguarded probe in
+  // add_block trips here rather than only as a segfault in release
+  // builds.
+  static void test_block_at_arena_top_edge()
+  {
+    reset_mock_store();
+    constexpr size_t K = 10;
+    constexpr size_t LAST_FOUR = (size_t{1} << K) - 4;
+    TestArena<K> under_test;
+
+    // The block's final chunk is the arena's final chunk, so the address
+    // just past the block is chunk 2^K - one beyond mock_store.
+    uintptr_t top_base = chunk_addr(LAST_FOUR);
+    under_test.add_block(top_base, chunk_size(4));
+    under_test.check_invariant(true);
+
+    // The block must come back intact.
+    auto served = under_test.remove_block(chunk_size(4));
+    SNMALLOC_ASSERT(served == top_base);
+    UNUSED(served);
+    printf("  Block at arena top edge: OK\n");
+  }
+
+  // Test: min-size predecessor blocked by boundary.
+  static void test_boundary_blocks_min_predecessor()
+  {
+    reset_mock_store();
+    TestArena<6> under_test;
+
+    // Two neighbouring single-chunk (minimum-size) blocks.
+    uintptr_t lower = chunk_addr(4);
+    uintptr_t upper = chunk_addr(5);
+
+    // The boundary flag on `upper` must keep it apart from `lower`, even
+    // though both are min-size blocks.
+    mock_store[mock_index(upper)].boundary = true;
+    under_test.add_block(lower, chunk_size(1));
+    under_test.add_block(upper, chunk_size(1));
+
+    // Two 1-chunk requests return the two blocks; either order is fine.
+    auto first = under_test.remove_block(chunk_size(1));
+    auto second = under_test.remove_block(chunk_size(1));
+    bool lower_then_upper = first == lower && second == upper;
+    bool upper_then_lower = first == upper && second == lower;
+    SNMALLOC_ASSERT(lower_then_upper || upper_then_lower);
+    UNUSED(first, second, lower_then_upper, upper_then_lower);
+
+    printf("  Boundary blocks min predecessor merge: OK\n");
+  }
+
+} // namespace snmalloc
+
+int main()
+{
+  namespace sn = snmalloc; // every test below lives in this namespace
+  printf("--- Arena tests ---\n");
+  printf("(A) Accessor round-trips:\n");
+  sn::test_variant_roundtrip();
+  sn::test_large_size_roundtrip();
+  sn::test_word_roundtrip();
+
+  printf("(B) RBTree smoke via arena:\n");
+  sn::test_rbtree_smoke_via_arena();
+
+  printf("(C) Empty-state invariant:\n");
+  sn::test_empty_invariant<4>();
+  sn::test_empty_invariant<5>();
+  sn::test_empty_invariant<6>();
+
+  printf("(D) add_block without consolidation:\n");
+  sn::test_add_no_consolidation();
+
+  printf("(E) remove_block:\n");
+  sn::test_remove_exact();
+  sn::test_remove_carving();
+
+  printf("(F) Consolidation case matrix:\n");
+  sn::test_consolidation_p_min();
+  sn::test_consolidation_p_nonmin();
+  sn::test_consolidation_s_min();
+  sn::test_consolidation_s_nonmin();
+  sn::test_consolidation_ps_both_min();
+  sn::test_consolidation_ps_p_min_s_nonmin();
+  sn::test_consolidation_ps_p_nonmin_s_min();
+  sn::test_consolidation_ps_both_nonmin();
+
+  printf("(F2) OddTwo (unaligned size-2):\n");
+  sn::test_oddtwo_variant();
+  sn::test_oddtwo_contains_min_filter();
+  sn::test_oddtwo_consolidation();
+  sn::test_oddtwo_consolidation_pred();
+  sn::test_oddtwo_remove_carve();
+
+  printf("(G) Overflow:\n");
+  sn::test_overflow();
+  sn::test_overflow_precise();
+
+  printf("(H) Randomised stress:\n");
+  sn::test_stress();
+
+  printf("(I) Multi-instance:\n");
+  sn::test_multi_instance_basic();
+  sn::test_multi_instance_consolidation();
+  sn::test_multi_stress();
+
+  printf("(J) Boundary consolidation:\n");
+  sn::test_boundary_blocks_predecessor();
+  sn::test_boundary_blocks_successor();
+  sn::test_boundary_partial();
+  sn::test_block_at_arena_top_edge();
+  sn::test_boundary_blocks_min_predecessor();
+
+  printf("All Arena tests passed.\n");
+  return 0;
+}
diff --git a/src/test/func/arenabins/arenabins.cc b/src/test/func/arenabins/arenabins.cc
new file mode 100644
index 000000000..65e24ba37
--- /dev/null
+++ b/src/test/func/arenabins/arenabins.cc
@@ -0,0 +1,1375 @@
+/**
+ * Unit tests for ArenaBins.
+ *
+ * Exercises:
+ *  - the chunk size class encoding (via `ArenaBinsTestAccess`),
+ *  - the private bin classification (`bin_index`),
+ *  - the narrow public surface: `Bitmap::add` / `find_for_request` /
+ *    `clear`, and the pure `carve(range_t, n)` decomposition.
+ *
+ * Strategy: brute force. For each (addr_chunks, n_chunks) on a small grid
+ * we directly check whether a block can serve every candidate size class
+ * (by finding an aligned sub-range that fits via `can_serve`, and
+ * consulting the canonical `bin_subsets` table via `serves`), and
+ * compare against what `bin_index` predicts. Bitmap behaviour is
+ * cross-checked against a slow reference scanner that formulates
+ * "bin b serves request n" directly in terms of the canonical
+ * `bin_subsets` table; raw word access for tests goes through
+ * `ArenaBinsTestAccess::raw_*`.
+ */
+
+#include "test/setup.h"
+#include "test/snmalloc_testlib.h"
+
+#include <cstdio>
+#include <cstdlib>
+#include <snmalloc/backend_helpers/arenabins.h>
+#include <snmalloc/ds_core/helpers.h>
+#include <vector>
+
+namespace snmalloc
+{
+  /**
+   * Friend struct exposing private internals of
+   * `ArenaBins<B, MIN_SIZE_BITS>` (and its nested `Bitmap`)
+   * for unit tests. Forward-declared in `arenabins.h`;
+   * defined here to keep the test-access implementation out of the
+   * in-tree header. Every member is static; no instances are needed.
+   */
+  template<size_t INTERMEDIATE_BITS, size_t MIN_SIZE_BITS>
+  struct ArenaBinsTestAccess
+  {
+    using Bins = ArenaBins<INTERMEDIATE_BITS, MIN_SIZE_BITS>;
+
+    using Bitmap = typename Bins::Bitmap;
+    using range_t = typename Bins::range_t;
+    using carve_t = typename Bins::carve_t;
+    using bitmap_info_t = typename Bins::bitmap_info_t;
+    using carve_info_t = typename Bins::carve_info_t;
+
+    static constexpr size_t B = Bins::B;
+    static constexpr size_t MANTISSAS_PER_EXP = Bins::MANTISSAS_PER_EXP;
+    static constexpr size_t BINS_PER_EXP = Bins::BINS_PER_EXP;
+    static constexpr size_t MAX_SC = Bins::MAX_SC;
+
+    SNMALLOC_FAST_PATH static carve_t carve(range_t block, size_t n)
+    {
+      return Bins::carve(block, n); // plain forwarder
+    }
+
+    SNMALLOC_FAST_PATH static const bitmap_info_t&
+    bitmap_info_for_request(size_t n)
+    {
+      return Bins::bitmap_info_for_request(n); // plain forwarder
+    }
+
+    SNMALLOC_FAST_PATH static const carve_info_t&
+    carve_info_for_request(size_t n)
+    {
+      return Bins::carve_info_for_request(n); // plain forwarder
+    }
+
+    SNMALLOC_FAST_PATH static size_t bin_index(range_t block)
+    {
+      return Bins::bin_index(block); // plain forwarder
+    }
+
+    static constexpr size_t max_supported_size()
+    {
+      return Bins::max_supported_size(); // plain forwarder
+    }
+
+    // --- Raw size-class id access ---
+    //
+    // The bin scheme assigns a dense raw id in `[0, MAX_SC)` to each
+    // size class. In-tree callers never name these (the fast path
+    // goes straight from request size to the bitmap-scan / carve
+    // record). Tests cross-check the encoding via the helpers below;
+    // the alias `sc_t = size_t` preserves the existing test
+    // naming.
+
+    using sc_t = size_t;
+
+    /// Raw id of the smallest size class >= n. Asserts that n lies in
+    /// [1 << MIN_SIZE_BITS, max_supported_size()].
+    SNMALLOC_FAST_PATH static sc_t request(size_t n)
+    {
+      SNMALLOC_ASSERT(n >= (size_t(1) << MIN_SIZE_BITS));
+      SNMALLOC_ASSERT(n <= Bins::max_supported_size());
+      return bits::to_exp_mant<INTERMEDIATE_BITS, MIN_SIZE_BITS>(n);
+    }
+
+    static constexpr size_t sc_size(sc_t sc) // `size` field of class sc
+    {
+      return Bins::table_.carve_info[sc].size; // sc is not range-checked
+    }
+
+    static constexpr size_t sc_align(sc_t sc) // `align` field of class sc
+    {
+      return Bins::table_.carve_info[sc].align; // sc is not range-checked
+    }
+
+    SNMALLOC_FAST_PATH static const bitmap_info_t& bitmap_info(sc_t sc)
+    {
+      SNMALLOC_ASSERT(sc < Bins::MAX_SC);
+      return Bins::table_.bitmap_info[sc]; // reference into the table
+    }
+
+    SNMALLOC_FAST_PATH static const carve_info_t& carve_info(sc_t sc)
+    {
+      SNMALLOC_ASSERT(sc < Bins::MAX_SC);
+      return Bins::table_.carve_info[sc]; // reference into the table
+    }
+
+    /// `bitmap_info_for_request`, constexpr (uses `to_exp_mant_const`).
+    /// Only used in `static_assert`s.
+    static constexpr const bitmap_info_t&
+    bitmap_info_for_request_const(size_t n)
+    {
+      return Bins::table_
+        .bitmap_info[bits::to_exp_mant_const<INTERMEDIATE_BITS, MIN_SIZE_BITS>(
+          n)];
+    }
+
+    /// `carve_info_for_request`, constexpr (uses `to_exp_mant_const`).
+    /// Only used in `static_assert`s.
+    static constexpr const carve_info_t& carve_info_for_request_const(size_t n)
+    {
+      return Bins::table_
+        .carve_info[bits::to_exp_mant_const<INTERMEDIATE_BITS, MIN_SIZE_BITS>(
+          n)];
+    }
+
+    // The canonical source of truth for what each within-exponent bin
+    // offset can serve. Tests express the conceptual "bin b serves
+    // request n" predicate directly in terms of this table so they do
+    // not depend on the bitmap's pre-shifted layout.
+    static constexpr const auto& bin_subsets = Bins::bin_subsets;
+
+    // --- Bitmap raw-word access ---
+    //
+    // The public Bitmap API is narrow (add / find_for_request / clear).
+    // Tests need to:
+    //  - set up arbitrary bitmap states (single bit, exhaustive patterns)
+    //    without going through `add` (which classifies a (base, size)
+    //    range and so is constrained by what classifications exist).
+    //  - inspect bitmap state after operations (test "exactly this bit is
+    //    set" and "no other bit changed").
+    // These accessors expose the raw word storage to do that.
+
+    static constexpr size_t NUM_BITMAP_WORDS = Bitmap::NUM_BITMAP_WORDS;
+
+    /// Set bit `bin_id` directly in the bitmap, bypassing
+    /// classification. For exhaustive bit-pattern tests.
+    static void raw_set(Bitmap& b, size_t bin_id)
+    {
+      SNMALLOC_ASSERT(bin_id < Bitmap::TOTAL_BINS);
+      b.words_[bin_id / bits::BITS] |=
+        (size_t(1) << (bin_id & (bits::BITS - 1))); // bit within the word
+    }
+
+    /// Test whether bit `bin_id` is set in the bitmap.
+    static bool raw_has(const Bitmap& b, size_t bin_id)
+    {
+      SNMALLOC_ASSERT(bin_id < Bitmap::TOTAL_BINS);
+      return (b.words_[bin_id / bits::BITS] >> (bin_id & (bits::BITS - 1))) &
+        size_t(1); // 0 or 1, converted to bool on return
+    }
+
+    /// Whether the bitmap has no bits set.
+    static bool raw_empty(const Bitmap& b)
+    {
+      for (size_t i = 0; i < Bitmap::NUM_BITMAP_WORDS; i++)
+        if (b.words_[i] != 0)
+          return false;
+      return true;
+    }
+
+    /// Read a raw word of the bitmap; for assertions like "only this
+    /// word is non-zero" or "the words round-trip exactly".
+    static size_t raw_word(const Bitmap& b, size_t word_idx)
+    {
+      SNMALLOC_ASSERT(word_idx < Bitmap::NUM_BITMAP_WORDS);
+      return b.words_[word_idx];
+    }
+  };
+} // namespace snmalloc
+
+using snmalloc::ArenaBinsTestAccess;
+
+// Compile-time checks: a few size-class encoding properties that we want
+// to fail the build (not the runtime) if regressed.
+namespace static_checks
+{
+  using B1 = ArenaBinsTestAccess<1, 0>; // INTERMEDIATE_BITS = 1
+  using B2 = ArenaBinsTestAccess<2, 0>; // INTERMEDIATE_BITS = 2
+  using B3 = ArenaBinsTestAccess<3, 0>; // INTERMEDIATE_BITS = 3
+
+  static_assert(B1::BINS_PER_EXP == 2, "B=1 BINS_PER_EXP");
+  static_assert(B2::BINS_PER_EXP == 5, "B=2 BINS_PER_EXP");
+  static_assert(B3::BINS_PER_EXP == 13, "B=3 BINS_PER_EXP");
+
+  static_assert(
+    B1::MAX_SC == ((snmalloc::bits::BITS - 1) << 1) + ((1 << 1) - 1),
+    "B=1 MAX_SC");
+  static_assert(
+    B2::MAX_SC == ((snmalloc::bits::BITS - 2) << 2) + ((1 << 2) - 1),
+    "B=2 MAX_SC");
+  static_assert(
+    B3::MAX_SC == ((snmalloc::bits::BITS - 3) << 3) + ((1 << 3) - 1),
+    "B=3 MAX_SC");
+
+  // Sizes that are powers of two have align == size.
+  static_assert(B2::carve_info_for_request_const(4).align == 4, "size 4 align");
+  static_assert(B3::carve_info_for_request_const(8).align == 8, "size 8 align");
+
+  // Requests between class sizes round up to the next class (9->10, 17->18).
+  static_assert(B2::carve_info_for_request_const(9).size == 10, "B=2 round-up");
+  static_assert(
+    B3::carve_info_for_request_const(17).size == 18, "B=3 round-up");
+} // namespace static_checks
+
+namespace
+{
+  /// Reference predicate for "bin `bin` can serve a request of `n`",
+  /// phrased purely in terms of the canonical `bin_subsets` table (the
+  /// single source of truth for the bin scheme). The bin serves the
+  /// request when its exponent is strictly above the exponent of the
+  /// request's size class, or when both exponents match and the bin's
+  /// within-exponent subset has the bit for the class's mantissa set.
+  ///
+  /// `find_for_request` results and `bin_index` classifications are
+  /// both validated against this predicate.
+  template<size_t B>
+  constexpr bool serves(size_t bin, size_t n)
+  {
+    using Access = ArenaBinsTestAccess<B, 0>;
+    const size_t class_id = snmalloc::bits::to_exp_mant_const<B, 0>(n);
+    const size_t class_size = snmalloc::bits::from_exp_mant<B, 0>(class_id);
+    const size_t req_exp = snmalloc::bits::prev_pow2_bits_const(class_size);
+    const size_t bin_exp = bin / Access::BINS_PER_EXP;
+    if (bin_exp != req_exp)
+      return bin_exp > req_exp;
+
+    // Same exponent: consult the subset mask stored for the bin's offset.
+    const size_t first_id =
+      snmalloc::bits::to_exp_mant_const<B, 0>(size_t(1) << req_exp);
+    const size_t mantissa = class_id - first_id;
+    const size_t subset = Access::bin_subsets[bin % Access::BINS_PER_EXP];
+    return ((subset >> mantissa) & size_t(1)) != 0;
+  }
+
+  /// Brute-force oracle: can a block of `n` chunks at chunk address `addr`
+  /// hold a sub-range of `s` chunks whose start is a multiple of `a`?
+  /// `a` is used as a bit mask, so it is expected to be a power of two.
+  bool can_serve(size_t addr, size_t n, size_t s, size_t a)
+  {
+    if (s == 0 || s > n)
+      return false;
+    // Round `addr` up to the next multiple of `a` (no-op when aligned).
+    const size_t misalign = addr & (a - 1);
+    const size_t start = addr + ((misalign == 0) ? 0 : (a - misalign));
+    return start + s <= addr + n;
+  }
+
+  template<size_t B>
+  void check_chunk_sc_roundtrip()
+  {
+    using Access = ArenaBinsTestAccess<B, 0>;
+
+    // Three properties are checked for every s in [1, 4096]; taken
+    // together they pin request(s) to the smallest class of size >= s:
+    //   1. Coverage: sc_size(request(s)) >= s.
+    //   2. Idempotence: request(sc_size(sc)) == sc.
+    //   3. Monotonicity: s1 <= s2 implies request(s1) <= request(s2).
+    auto last_id = Access::request(1);
+    for (size_t s = 1; s <= 4096; s++)
+    {
+      const auto id = Access::request(s);
+      const size_t sz = Access::sc_size(id);
+      if (s > sz)
+      {
+        std::printf(
+          "B=%zu request(%zu) gave class with size %zu < %zu\n", B, s, sz, s);
+        std::abort();
+      }
+      if (Access::request(sz) != id)
+      {
+        std::printf("B=%zu request(sc_size(sc))!=sc for cs=%zu\n", B, sz);
+        std::abort();
+      }
+      if (id < last_id)
+      {
+        std::printf("B=%zu request not monotone at s=%zu\n", B, s);
+        std::abort();
+      }
+      last_id = id;
+    }
+  }
+
+  template<size_t B>
+  void check_sc_align()
+  {
+    using Access = ArenaBinsTestAccess<B, 0>;
+
+    for (size_t s = 1; s <= 4096; s++)
+    {
+      const auto id = Access::request(s);
+      const size_t size = Access::sc_size(id);
+      const size_t align = Access::sc_align(id);
+      // The alignment has to be a non-zero power of two ...
+      if ((align & (align - 1)) != 0 || align == 0)
+      {
+        std::printf("B=%zu size %zu: sc_align %zu not pow2\n", B, size, align);
+        std::abort();
+      }
+      // ... that divides the class size ...
+      if (size % align != 0)
+      {
+        std::printf(
+          "B=%zu size %zu: sc_align %zu does not divide size\n", B, size, align);
+        std::abort();
+      }
+      // ... and the next power of two up must NOT divide the class size.
+      if ((align << 1) != 0 && size % (align << 1) == 0)
+      {
+        std::printf(
+          "B=%zu size %zu: sc_align %zu not the largest pow2 divisor\n",
+          B,
+          size,
+          align);
+        std::abort();
+      }
+    }
+  }
+
+  /// Collect, in order of increasing size, the raw id of every size
+  /// class whose size is at most `max_size` (one entry per class).
+  template<size_t B>
+  std::vector<typename ArenaBinsTestAccess<B, 0>::sc_t>
+  collect_classes(size_t max_size)
+  {
+    using Access = ArenaBinsTestAccess<B, 0>;
+    using sc_t = typename Access::sc_t;
+
+    std::vector<sc_t> found;
+    for (size_t size = 1; size <= max_size; size++)
+    {
+      const sc_t id = Access::request(size);
+
+      // Only exact class sizes are of interest; anything else is a
+      // request that merely rounds up to some class.
+      if (Access::sc_size(id) != size)
+        continue;
+
+      // Guard against recording the same class twice in a row.
+      if (found.empty() || found.back() != id)
+        found.push_back(id);
+    }
+    return found;
+  }
+
+  /// Compare `bin_index` + `serves` with the brute-force `can_serve` oracle.
+  template<size_t B>
+  void check_bin_classification(size_t max_addr, size_t max_n)
+  {
+    using Access = ArenaBinsTestAccess<B, 0>;
+    const auto class_ids = collect_classes<B>(max_n);
+
+    for (size_t addr = 0; addr < max_addr; addr++)
+    {
+      for (size_t n = 1; n <= max_n; n++)
+      {
+        // The bin depends on the block only, not on the class being probed.
+        const size_t bin = Access::bin_index({addr, n});
+        for (const auto id : class_ids)
+        {
+          const size_t size = Access::sc_size(id);
+          const size_t align = Access::sc_align(id);
+          const bool oracle = can_serve(addr, n, size, align);
+          const bool table = serves<B>(bin, size);
+          if (table == oracle)
+            continue;
+
+          std::printf(
+            "B=%zu addr=%zu n=%zu bin=%zu sc.size=%zu sc.align=%zu: "
+            "predicted=%d actually=%d\n",
+            B,
+            addr,
+            n,
+            bin,
+            size,
+            align,
+            (int)table,
+            (int)oracle);
+          std::abort();
+        }
+      }
+    }
+  }
+
+  template<size_t B>
+  void check_bin_id_range()
+  {
+    using Bins = ArenaBinsTestAccess<B, 0>;
+
+    // bin_index must return an id that is addressable in the bitmap, i.e.
+    // in [0, TOTAL_BINS). (`bin % BINS_PER_EXP` alone can never be out of range.)
+    for (size_t addr = 0; addr < 32; addr++)
+    {
+      for (size_t n = 1; n <= 64; n++)
+      {
+        size_t bin = Bins::bin_index({addr, n});
+        size_t limit = Bins::Bitmap::TOTAL_BINS;
+        if (bin >= limit)
+        {
+          std::printf(
+            "B=%zu addr=%zu n=%zu bin=%zu: bin id (exponent %zu) >= "
+            "TOTAL_BINS %zu\n",
+            B,
+            addr,
+            n,
+            bin,
+            bin / Bins::BINS_PER_EXP,
+            limit);
+          std::abort();
+        }
+      }
+    }
+  }
+
+  /// Check that the `*_info_for_request(n)` lookups agree with the
+  /// per-sc accessors for every n in [1, 4096].
+  template<size_t B>
+  void check_info_consistency()
+  {
+    using Access = ArenaBinsTestAccess<B, 0>;
+
+    for (size_t n = 1; n <= 4096; n++)
+    {
+      const auto id = Access::request(n);
+
+      // The carve record found from the request size must carry the
+      // same size/align as the per-sc accessors, and must be the very
+      // same object as carve_info(request(n)) (same address, no copy).
+      const auto& carve = Access::carve_info_for_request(n);
+      if (carve.size != Access::sc_size(id))
+      {
+        std::printf("B=%zu carve_info_for_request(%zu).size mismatch\n", B, n);
+        std::abort();
+      }
+      if (carve.align != Access::sc_align(id))
+      {
+        std::printf("B=%zu carve_info_for_request(%zu).align mismatch\n", B, n);
+        std::abort();
+      }
+      if (&Access::carve_info(id) != &carve)
+      {
+        std::printf(
+          "B=%zu carve_info_for_request(%zu) and carve_info(request) "
+          "point at different records\n",
+          B,
+          n);
+        std::abort();
+      }
+
+      // Same identity requirement for the bitmap-scan record.
+      const auto& scan = Access::bitmap_info_for_request(n);
+      if (&Access::bitmap_info(id) != &scan)
+      {
+        std::printf(
+          "B=%zu bitmap_info_for_request(%zu) and bitmap_info(request) "
+          "point at different records\n",
+          B,
+          n);
+        std::abort();
+      }
+    }
+  }
+
+  /// to_exp_mant runtime / _const equivalence across a representative
+  /// range of values, including edges near max_supported_size. The
+  /// runtime variant uses the intrinsic; we cross-check against the
+  /// constexpr reference that's already exercised at compile time.
+  template<size_t B>
+  void check_to_exp_mant_equivalence()
+  {
+    using Bins = ArenaBinsTestAccess<B, 0>;
+    const size_t limit = Bins::max_supported_size();
+    auto check_one = [&](size_t n) {
+      size_t r = snmalloc::bits::to_exp_mant<B, 0>(n);
+      size_t c = snmalloc::bits::to_exp_mant_const<B, 0>(n);
+      if (r != c)
+      {
+        std::printf("B=%zu to_exp_mant(%zu) = %zu, _const = %zu\n", B, n, r, c);
+        std::abort();
+      }
+    };
+
+    // Small values.
+    for (size_t n = 1; n <= 4096; n++)
+      check_one(n);
+
+    // Powers of two and ±1, up to the largest representable.
+    for (size_t e = 0; e < snmalloc::bits::BITS; e++)
+    {
+      size_t pow = size_t(1) << e; // never 0: e is below the word width
+      if (pow <= limit)
+        check_one(pow);
+      if (pow + 1 <= limit)
+        check_one(pow + 1);
+      if (pow >= 2)
+        check_one(pow - 1);
+    }
+
+    // The upper boundary itself.
+    check_one(limit);
+    if (limit > 1)
+      check_one(limit - 1);
+
+    // A handful of stride values across the full range.
+    const size_t stride = (limit < 257 ? 1 : limit / 257) + 1;
+    for (size_t n = 1; n <= limit; n += stride)
+    {
+      check_one(n);
+      if (limit - n < stride)
+        break; // the next step would pass `limit`; stop before n can wrap
+    }
+  }
+
+  /// Slow reference for `find_for_request`: walk the bin ids in
+  /// increasing order and return the first one that is set in `bm` and
+  /// passes the canonical `serves` predicate, or SIZE_MAX if none does.
+  template<size_t B>
+  size_t reference_find(
+    size_t n_chunks, const typename ArenaBinsTestAccess<B, 0>::Bitmap& bm)
+  {
+    using Access = ArenaBinsTestAccess<B, 0>;
+    const size_t total = Access::Bitmap::TOTAL_BINS;
+    for (size_t id = 0; id < total; id++)
+    {
+      // Unset bins are skipped; the predicate is only evaluated for
+      // bins that are actually present in the bitmap.
+      if (Access::raw_has(bm, id) && serves<B>(id, n_chunks))
+        return id;
+    }
+    return SIZE_MAX;
+  }
+
+  template<size_t B>
+  void check_bitmap_smoke()
+  {
+    using Access = ArenaBinsTestAccess<B, 0>;
+    using Bitmap = typename Access::Bitmap;
+    auto require = [](bool ok) {
+      if (!ok)
+        std::abort();
+    };
+    // A default-constructed bitmap holds no bins.
+    Bitmap bm;
+    require(Access::raw_empty(bm));
+    // Setting the lowest and the highest bin is visible bit-for-bit.
+    Access::raw_set(bm, 0);
+    require(!Access::raw_empty(bm));
+    require(Access::raw_has(bm, 0));
+    require(!Access::raw_has(bm, 1));
+    Access::raw_set(bm, Bitmap::TOTAL_BINS - 1);
+    require(Access::raw_has(bm, Bitmap::TOTAL_BINS - 1));
+    // Clearing both again returns the bitmap to the empty state.
+    bm.clear(0);
+    require(!Access::raw_has(bm, 0));
+    bm.clear(Bitmap::TOTAL_BINS - 1);
+    require(Access::raw_empty(bm));
+  }
+
+  /// Visit every raw size-class id in `[0, MAX_SC)`. Each id is decoded
+  /// to its request size with `from_exp_mant`, and `body` is invoked as
+  /// `body(size, bitmap_info_for_request(size))`. Several raw ids may
+  /// map to the same `(start_word, first_mask, second_mask)` triple;
+  /// callers that need distinct deposits must deduplicate themselves.
+  template<size_t B, typename F>
+  void for_each_class_info(F body)
+  {
+    using Access = ArenaBinsTestAccess<B, 0>;
+    for (size_t id = 0; id < Access::MAX_SC; id++)
+    {
+      // Decode the id back to its class size and look up its scan record.
+      const size_t size = snmalloc::bits::from_exp_mant<B, 0>(id);
+      body(size, Access::bitmap_info_for_request(size));
+    }
+  }
+
+  template<size_t B>
+  void check_bitmap_find_empty()
+  {
+    using Bitmap = typename ArenaBinsTestAccess<B, 0>::Bitmap;
+    // Nothing was added, so no request size may report a usable bin.
+    Bitmap empty;
+    for_each_class_info<B>([&](size_t request, const auto& /*unused*/) {
+      if (empty.find_for_request(request) != SIZE_MAX)
+        std::abort();
+    });
+  }
+
+  /// Exhaustive single-bit sweep: for each bin id in [0, TOTAL_BINS),
+  /// build a bitmap holding only that bit and compare find_for_request
+  /// with the reference scanner for every distinct request record.
+  template<size_t B>
+  void check_bitmap_exhaustive_single_bit()
+  {
+    using Access = ArenaBinsTestAccess<B, 0>;
+    using Bitmap = typename Access::Bitmap;
+
+    // One probe per distinct bitmap deposit, i.e. per distinct
+    // (start_word, first_mask, second_mask) triple, remembered together
+    // with the first request size that produced it.
+    struct Probe
+    {
+      size_t size;
+      typename Access::bitmap_info_t scan;
+    };
+
+    std::vector<Probe> probes;
+    for_each_class_info<B>([&](size_t size, const auto& scan) {
+      for (const auto& seen : probes)
+      {
+        if (
+          seen.scan.start_word == scan.start_word &&
+          seen.scan.first_mask == scan.first_mask &&
+          seen.scan.second_mask == scan.second_mask)
+          return;
+      }
+      probes.push_back({size, scan});
+    });
+
+    for (size_t only = 0; only < Bitmap::TOTAL_BINS; only++)
+    {
+      Bitmap bm;
+      Access::raw_set(bm, only);
+      for (const auto& probe : probes)
+      {
+        const size_t got = bm.find_for_request(probe.size);
+        const size_t want = reference_find<B>(probe.size, bm);
+        if (got == want)
+          continue;
+
+        std::printf(
+          "B=%zu single-bit: bin=%zu n=%zu: got=%zu want=%zu\n",
+          B,
+          only,
+          probe.size,
+          got,
+          want);
+        std::abort();
+      }
+    }
+  }
+
+  /// Randomised multi-bit bitmap states, each cross-checked against the
+  /// reference scanner.
+  template<size_t B>
+  void check_bitmap_multi_bit_random()
+  {
+    using Access = ArenaBinsTestAccess<B, 0>;
+    using Bitmap = typename Access::Bitmap;
+
+    struct Probe
+    {
+      size_t size;
+      typename Access::bitmap_info_t scan;
+    };
+
+    std::vector<Probe> probes;
+    for_each_class_info<B>([&](size_t size, const auto& scan) {
+      for (const auto& seen : probes)
+      {
+        if (
+          seen.scan.start_word == scan.start_word &&
+          seen.scan.first_mask == scan.first_mask &&
+          seen.scan.second_mask == scan.second_mask)
+          return;
+      }
+      probes.push_back({size, scan});
+    });
+
+    // Fixed-seed xorshift64 generator, so any failure can be replayed.
+    auto next = [](uint64_t& state) -> uint64_t {
+      state ^= state << 13;
+      state ^= state >> 7;
+      state ^= state << 17;
+      return state;
+    };
+
+    uint64_t seed = 0x9E3779B97F4A7C15ull + B;
+    for (size_t trial = 0; trial < 2000; trial++)
+    {
+      Bitmap bm;
+      // The number of bits to set is itself drawn at random per trial.
+      const size_t count = (size_t)(next(seed) % (Bitmap::TOTAL_BINS + 1));
+      for (size_t k = 0; k < count; k++)
+      {
+        const size_t bin = (size_t)(next(seed) % Bitmap::TOTAL_BINS);
+        Access::raw_set(bm, bin);
+      }
+      for (const auto& probe : probes)
+      {
+        const size_t got = bm.find_for_request(probe.size);
+        const size_t want = reference_find<B>(probe.size, bm);
+        if (got == want)
+          continue;
+
+        std::printf(
+          "B=%zu trial=%zu n=%zu: got=%zu want=%zu\n",
+          B,
+          trial,
+          probe.size,
+          got,
+          want);
+        std::abort();
+      }
+    }
+  }
+
+  /// Targeted word-boundary cases: enumerate real table entries, keep
+  /// those whose within-exp range straddles a bitmap word or whose start
+  /// bit is word-aligned, and drive each through up to four sub-cases:
+  ///   (i) only the request's start bit set
+  ///   (ii) only the first bit of the following word set (straddle only)
+  ///   (iii) only the first bin of the next exponent set
+  ///   (iv) only the first bit of word `start_word + 2` set
+  template<size_t B>
+  void check_bitmap_word_boundary()
+  {
+    using Bins = ArenaBinsTestAccess<B, 0>;
+    using Bitmap = typename Bins::Bitmap;
+
+    auto check_predicted =
+      [&](const Bitmap& bm, size_t n_chunks, const char* label) {
+        size_t got = bm.find_for_request(n_chunks);
+        size_t want = reference_find<B>(n_chunks, bm);
+        if (got != want)
+        {
+          std::printf(
+            "B=%zu word-boundary [%s] n=%zu: got=%zu want=%zu\n",
+            B,
+            label,
+            n_chunks,
+            got,
+            want);
+          std::abort();
+        }
+      };
+
+    bool found_straddle = false;
+    bool found_aligned = false;
+    for (size_t raw = 0; raw < Bins::MAX_SC; raw++)
+    {
+      size_t s = snmalloc::bits::from_exp_mant<B, 0>(raw);
+      const auto& info = Bins::bitmap_info_for_request(s);
+      // Recover the absolute start bin from the precomputed layout:
+      // the start bin always serves, so bit 0 of the conceptual
+      // serve_mask is set, which means `first_mask`'s lowest set bit
+      // is at position `shift = start_bit & (BITS - 1)`.
+      size_t shift = snmalloc::bits::ctz(info.first_mask);
+      size_t start_bit = info.start_word * snmalloc::bits::BITS + shift;
+      size_t state = start_bit % Bins::BINS_PER_EXP; // offset within exponent
+      size_t r = Bins::BINS_PER_EXP - state; // bins left in this exponent
+      bool straddles = (shift + r) > snmalloc::bits::BITS;
+      bool aligned = (shift == 0);
+
+      if (straddles)
+        found_straddle = true;
+      if (aligned)
+        found_aligned = true;
+      if (!(straddles || aligned))
+        continue;
+
+      // (i) Single bit at the very start_bit.
+      {
+        Bitmap bm;
+        Bins::raw_set(bm, start_bit);
+        check_predicted(bm, s, "case-i-start_bit");
+      }
+
+      // (ii) Single bit at the first bit of the following word, i.e. the
+      // within-exp continuation (only meaningful for straddling cases).
+      if (straddles)
+      {
+        size_t carry_bin = start_bit + (snmalloc::bits::BITS - shift);
+        if (carry_bin < Bitmap::TOTAL_BINS)
+        {
+          Bitmap bm;
+          Bins::raw_set(bm, carry_bin);
+          check_predicted(bm, s, "case-ii-continuation");
+        }
+      }
+
+      // (iii) Single bit at the first bin of the next exponent.
+      {
+        size_t second_word = info.start_word + 1;
+        if (second_word < Bins::NUM_BITMAP_WORDS)
+        {
+          // start_bit + r is the next multiple of BINS_PER_EXP; it is not
+          // necessarily located in the second word.
+          size_t higher_bin = start_bit + r;
+          if (higher_bin < Bitmap::TOTAL_BINS)
+          {
+            Bitmap bm;
+            Bins::raw_set(bm, higher_bin);
+            check_predicted(bm, s, "case-iii-higher-exp");
+          }
+        }
+      }
+
+      // (iv) Single bit at the first bit of word `start_word + 2`.
+      {
+        size_t target_word = info.start_word + 2;
+        if (target_word < Bins::NUM_BITMAP_WORDS)
+        {
+          size_t target_bin = target_word * snmalloc::bits::BITS;
+          if (target_bin < Bitmap::TOTAL_BINS)
+          {
+            Bitmap bm;
+            Bins::raw_set(bm, target_bin);
+            check_predicted(bm, s, "case-iv-later-word");
+          }
+        }
+      }
+    }
+
+    // Sanity: at least one entry with a word-aligned start bit must have
+    // been exercised. Straddling entries are not required for every B
+    // (B=1's bins-per-exp = 2 might not straddle on 64-bit), so
+    // `found_straddle` is recorded but deliberately not asserted.
+    if (!found_aligned)
+    {
+      std::printf("B=%zu: no aligned start_bit found!\n", B);
+      std::abort();
+    }
+    (void)found_straddle;
+  }
+
+  /// Integration test: mark the bit chosen by `bin_index({addr, n})` and
+  /// probe with `find_for_request(req)`. The result must be that same
+  /// bin whenever `can_serve` says the block can hold the request, and
+  /// `SIZE_MAX` otherwise.
+  template<size_t B>
+  void check_bitmap_bin_index_integration()
+  {
+    using Access = ArenaBinsTestAccess<B, 0>;
+    using Bitmap = typename Access::Bitmap;
+
+    const auto class_ids = collect_classes<B>(64);
+    for (size_t addr = 0; addr < 32; addr++)
+    {
+      for (size_t n = 1; n <= 64; n++)
+      {
+        Bitmap bm;
+        const size_t bin = Access::bin_index({addr, n});
+        Access::raw_set(bm, bin);
+        for (const auto id : class_ids)
+        {
+          const size_t size = Access::sc_size(id);
+          const size_t align = Access::sc_align(id);
+          const bool fits = can_serve(addr, n, size, align);
+          const size_t got = bm.find_for_request(size);
+          const size_t want = fits ? bin : size_t(SIZE_MAX);
+          if (got == want)
+            continue;
+
+          std::printf(
+            "B=%zu integration: addr=%zu n=%zu bin=%zu sc.size=%zu "
+            "sc.align=%zu: got=%zu want=%zu actually=%d\n",
+            B,
+            addr,
+            n,
+            bin,
+            size,
+            align,
+            got,
+            want,
+            (int)fits);
+          std::abort();
+        }
+      }
+    }
+  }
+
+  /// Check that Bitmap::add classifies a (base, size) range to the bin
+  /// id reported by `bin_index`, sets that bin's bit, and that a
+  /// repeated add changes neither the returned id nor any bitmap word.
+  template<size_t B>
+  void check_bitmap_add()
+  {
+    using Access = ArenaBinsTestAccess<B, 0>;
+    using Bitmap = typename Access::Bitmap;
+    using range_t = typename Access::range_t;
+
+    for (size_t addr = 0; addr < 32; addr++)
+    {
+      for (size_t n = 1; n <= 64; n++)
+      {
+        Bitmap bm;
+        const size_t want = Access::bin_index({addr, n});
+        const size_t first = bm.add(range_t{addr, n});
+        if (first != want)
+        {
+          std::printf(
+            "B=%zu add: addr=%zu n=%zu got=%zu expected=%zu\n",
+            B,
+            addr,
+            n,
+            first,
+            want);
+          std::abort();
+        }
+        if (!Access::raw_has(bm, want))
+        {
+          std::printf(
+            "B=%zu add: addr=%zu n=%zu bin %zu not set after add\n",
+            B,
+            addr,
+            n,
+            want);
+          std::abort();
+        }
+
+        // Record all words, repeat the add, then require the same id
+        // and bit-identical words: a second add must be a no-op.
+        std::vector<size_t> before;
+        for (size_t i = 0; i < Access::NUM_BITMAP_WORDS; i++)
+          before.push_back(Access::raw_word(bm, i));
+        const size_t second = bm.add(range_t{addr, n});
+        if (second != want)
+        {
+          std::printf(
+            "B=%zu add idempotent: addr=%zu n=%zu second add returned "
+            "%zu (first returned %zu)\n",
+            B,
+            addr,
+            n,
+            second,
+            want);
+          std::abort();
+        }
+        for (size_t i = 0; i < Access::NUM_BITMAP_WORDS; i++)
+        {
+          if (Access::raw_word(bm, i) == before[i])
+            continue;
+
+          std::printf(
+            "B=%zu add idempotent: addr=%zu n=%zu word %zu changed\n",
+            B,
+            addr,
+            n,
+            i);
+          std::abort();
+        }
+      }
+    }
+  }
+
+  /// When several set bins can serve a request, `find_for_request`
+  /// must report the one with the *smallest* id, not merely some bin
+  /// that serves.
+  template<size_t B>
+  void check_bitmap_find_min()
+  {
+    using Access = ArenaBinsTestAccess<B, 0>;
+    using Bitmap = typename Access::Bitmap;
+
+    struct Probe
+    {
+      size_t size;
+      typename Access::bitmap_info_t scan;
+    };
+
+    std::vector<Probe> probes;
+    for_each_class_info<B>([&](size_t size, const auto& scan) {
+      for (const auto& seen : probes)
+      {
+        if (
+          seen.scan.start_word == scan.start_word &&
+          seen.scan.first_mask == scan.first_mask &&
+          seen.scan.second_mask == scan.second_mask)
+          return;
+      }
+      probes.push_back({size, scan});
+    });
+
+    // For every probe, set three bins that are all expected to serve
+    // it (its start bin, the first bin of the next exponent, and the
+    // topmost bin) and require that find_for_request picks the
+    // smallest of them, i.e. the start bin.
+    for (const auto& probe : probes)
+    {
+      // Recover the absolute start bin from the precomputed layout.
+      const size_t lowest = probe.scan.start_word * snmalloc::bits::BITS +
+        snmalloc::bits::ctz(probe.scan.first_mask);
+      const size_t offset = lowest % Access::BINS_PER_EXP;
+      const size_t next_exp = lowest + (Access::BINS_PER_EXP - offset);
+      const size_t top = Bitmap::TOTAL_BINS - 1;
+      if (lowest >= Bitmap::TOTAL_BINS)
+        continue;
+      if (next_exp >= Bitmap::TOTAL_BINS)
+        continue;
+      // lowest < next_exp always holds (the offset is below
+      // BINS_PER_EXP), but next_exp may coincide with the topmost bin.
+      // Three distinct bits are needed, so such probes are skipped.
+      if (lowest >= next_exp || next_exp >= top)
+        continue;
+
+      Bitmap bm;
+      Access::raw_set(bm, lowest);
+      Access::raw_set(bm, next_exp);
+      Access::raw_set(bm, top);
+      const size_t got = bm.find_for_request(probe.size);
+      if (got != lowest)
+      {
+        std::printf(
+          "B=%zu find_min: n=%zu bits set {%zu,%zu,%zu} "
+          "got=%zu (expected min %zu)\n",
+          B,
+          probe.size,
+          lowest,
+          next_exp,
+          top,
+          got,
+          lowest);
+        std::abort();
+      }
+    }
+  }
+
+  /// Exercise carve() on every servable (block, size class, request)
+  /// combination in a small window and check that the three returned
+  /// pieces tile the block: `pre` starts at the block base and ends where
+  /// `req` begins, `req` is aligned to the size class's alignment and is
+  /// exactly as large as requested, and `post` runs from the end of `req`
+  /// to the end of the block.
+  template<size_t B>
+  void check_carve()
+  {
+    using Bins = ArenaBinsTestAccess<B, 0>;
+    using range_t = typename Bins::range_t;
+
+    // Check one carve() result for block {addr, n} and request `r`, where
+    // `s` / `a` are the size and alignment of the size class serving `r`.
+    // Aborts with a diagnostic on the first violated property.
+    auto verify = [](size_t addr, size_t n, size_t r, size_t s, size_t a) {
+      const auto cv = Bins::carve(range_t{addr, n}, r);
+      const auto& pre = cv.pre;
+      const auto& req = cv.req;
+      const auto& post = cv.post;
+
+      // pre starts at the block's base.
+      if (pre.base != addr)
+      {
+        std::printf(
+          "B=%zu carve pre.base != addr (addr=%zu n=%zu r=%zu s=%zu)\n",
+          B,
+          addr,
+          n,
+          r,
+          s);
+        std::abort();
+      }
+      // pre.end == req.base.
+      if (pre.base + pre.size != req.base)
+      {
+        std::printf("B=%zu carve pre.end != req.base\n", B);
+        std::abort();
+      }
+      // req aligned to the SC's natural alignment.
+      if ((req.base & (a - 1)) != 0)
+      {
+        std::printf(
+          "B=%zu carve req.base %zu not aligned to %zu\n", B, req.base, a);
+        std::abort();
+      }
+      // req.size == requested n_chunks (carve-exact).
+      if (req.size != r)
+      {
+        std::printf("B=%zu carve req.size %zu != r %zu\n", B, req.size, r);
+        std::abort();
+      }
+      // req.end == post.base.
+      if (req.base + req.size != post.base)
+      {
+        std::printf("B=%zu carve req.end != post.base\n", B);
+        std::abort();
+      }
+      // post.end == block.end.
+      if (post.base + post.size != addr + n)
+      {
+        std::printf("B=%zu carve post.end != block.end\n", B);
+        std::abort();
+      }
+      // pre.size + req.size + post.size == block.size.
+      if (pre.size + req.size + post.size != n)
+      {
+        std::printf("B=%zu carve sizes don't sum to n\n", B);
+        std::abort();
+      }
+    };
+
+    auto classes = collect_classes<B>(64);
+    for (size_t addr = 0; addr < 32; addr++)
+    {
+      for (size_t n = 1; n <= 64; n++)
+      {
+        for (auto sc : classes)
+        {
+          const size_t s = Bins::sc_size(sc);
+          const size_t a = Bins::sc_align(sc);
+          if (!can_serve(addr, n, s, a))
+            continue;
+
+          // Cover both the trivial request (r == SC size) and every
+          // smaller request that still maps to this SC, which pushes the
+          // rounding remainder into `post`. Requests mapping to a
+          // different SC are skipped so that the alignment carve uses is
+          // the one `can_serve` checked.
+          for (size_t r = 1; r <= s; r++)
+          {
+            if (Bins::sc_size(Bins::request(r)) == s)
+              verify(addr, n, r, s, a);
+          }
+        }
+      }
+    }
+  }
+
+  /// Run the whole per-`B` suite for one value of the template parameter
+  /// `B`, printing a banner first and one "OK" line after each check
+  /// returns. The checks visible in this file (e.g. check_carve) report
+  /// failure by printing a diagnostic and calling std::abort(), so an
+  /// "OK" line is only reached when the preceding check passed.
+  template<size_t B>
+  void run_all()
+  {
+    std::printf("--- Running ArenaBinsTestAccess<%zu> tests ---\n", B);
+    check_chunk_sc_roundtrip<B>();
+    std::printf("  sc_t round-trip: OK\n");
+    check_sc_align<B>();
+    std::printf("  sc_align: OK\n");
+    check_to_exp_mant_equivalence<B>();
+    std::printf("  to_exp_mant runtime/_const equivalence: OK\n");
+    check_info_consistency<B>();
+    std::printf("  *_info_for_request consistency: OK\n");
+    check_bin_id_range<B>();
+    std::printf("  bin_index within-exp range: OK\n");
+    // Exhaustive over block addresses < 128 and sizes <= 64.
+    check_bin_classification<B>(/*max_addr=*/128, /*max_n=*/64);
+    std::printf("  bin classification vs bin_subsets predicate: OK\n");
+    check_bitmap_smoke<B>();
+    std::printf("  Bitmap smoke: OK\n");
+    check_bitmap_find_empty<B>();
+    std::printf("  Bitmap empty find returns SIZE_MAX: OK\n");
+    check_bitmap_exhaustive_single_bit<B>();
+    std::printf("  Bitmap exhaustive single-bit find: OK\n");
+    check_bitmap_multi_bit_random<B>();
+    std::printf("  Bitmap multi-bit random find: OK\n");
+    check_bitmap_word_boundary<B>();
+    std::printf("  Bitmap word-boundary cases: OK\n");
+    check_bitmap_bin_index_integration<B>();
+    std::printf("  Bitmap bin_index integration: OK\n");
+    check_bitmap_add<B>();
+    std::printf("  Bitmap add classify+set+idempotent: OK\n");
+    check_bitmap_find_min<B>();
+    std::printf("  Bitmap find_for_request returns minimum: OK\n");
+    check_carve<B>();
+    std::printf("  carve splits aligned/unaligned blocks: OK\n");
+  }
+
+  /// Pin a few concrete values of the encoding, derived from the
+  /// prototype's output, to catch silent breakage of the canonical
+  /// numbering: size-class rounding and alignment at B=2, and
+  /// `BINS_PER_EXP` for B = 1, 2, 3.
+  ///
+  /// The first expectation that does not hold is named on stdout before
+  /// the process aborts.
+  void check_known_values()
+  {
+    // Name the failed expectation: a bare abort() would not say which of
+    // the pinned values drifted.
+    auto expect = [](bool ok, const char* what) {
+      if (!ok)
+      {
+        std::printf("known values: expected %s\n", what);
+        std::abort();
+      }
+    };
+
+    using B2 = ArenaBinsTestAccess<2, 0>;
+
+    // Requests round up to the enclosing size class:
+    // 1 -> 1, 8 -> 8, 9 -> 10, 11 -> 12.
+    expect(B2::sc_size(B2::request(1)) == 1, "B=2 sc_size(request(1)) == 1");
+    expect(B2::sc_size(B2::request(8)) == 8, "B=2 sc_size(request(8)) == 8");
+    expect(
+      B2::sc_size(B2::request(9)) == 10, "B=2 sc_size(request(9)) == 10");
+    expect(
+      B2::sc_size(B2::request(11)) == 12, "B=2 sc_size(request(11)) == 12");
+
+    // sc_align: size 4 -> 4, size 5 -> 1, size 6 -> 2, size 8 -> 8,
+    // size 10 -> 2, size 12 -> 4, size 14 -> 2.
+    expect(
+      B2::sc_align(B2::request(4)) == 4, "B=2 sc_align(request(4)) == 4");
+    expect(
+      B2::sc_align(B2::request(5)) == 1, "B=2 sc_align(request(5)) == 1");
+    expect(
+      B2::sc_align(B2::request(6)) == 2, "B=2 sc_align(request(6)) == 2");
+    expect(
+      B2::sc_align(B2::request(8)) == 8, "B=2 sc_align(request(8)) == 8");
+    expect(
+      B2::sc_align(B2::request(10)) == 2, "B=2 sc_align(request(10)) == 2");
+    expect(
+      B2::sc_align(B2::request(12)) == 4, "B=2 sc_align(request(12)) == 4");
+    expect(
+      B2::sc_align(B2::request(14)) == 2, "B=2 sc_align(request(14)) == 2");
+
+    // BINS_PER_EXP must be 5 for B=2.
+    expect(B2::BINS_PER_EXP == 5, "B=2 BINS_PER_EXP == 5");
+
+    using B3 = ArenaBinsTestAccess<3, 0>;
+    expect(B3::BINS_PER_EXP == 13, "B=3 BINS_PER_EXP == 13");
+
+    using B1 = ArenaBinsTestAccess<1, 0>;
+    expect(B1::BINS_PER_EXP == 2, "B=1 BINS_PER_EXP == 2");
+  }
+
+  /**
+   * Verify that scaling the encoding by `UNIT_SIZE = 1 << MIN_SIZE_BITS`
+   * is a structural equivalence: every public observation about a
+   * `ArenaBins<B, MIN_SIZE_BITS>` instance equals the
+   * corresponding observation on `ArenaBins<B, 0>` when the
+   * input is scaled by `UNIT_SIZE` (and outputs, where they are sizes
+   * or addresses, are also scaled by `UNIT_SIZE`).
+   *
+   * This pins the new template parameter to act purely as a unit
+   * change, with no other semantic effect on the bin scheme.
+   *
+   * Observations compared: BINS_PER_EXP / MANTISSAS_PER_EXP, request /
+   * sc_size / sc_align, bin_index, carve, and Bitmap add /
+   * find_for_request. The first divergence is reported on stdout (with
+   * B, MIN_SIZE_BITS and the unscaled inputs of the failing case) and
+   * then aborts the process.
+   */
+  template<size_t B, size_t MIN_SIZE_BITS>
+  void check_min_size_bits_equivalence()
+  {
+    using Scaled = ArenaBinsTestAccess<B, MIN_SIZE_BITS>;
+    using Base = ArenaBinsTestAccess<B, 0>;
+    static_assert(MIN_SIZE_BITS > 0, "this check is for MIN_SIZE_BITS > 0");
+    constexpr size_t U = size_t(1) << MIN_SIZE_BITS;
+
+    // Abort with a diagnostic naming the diverging observation. `x` and
+    // `y` carry the unscaled inputs of the failing case (0 when the
+    // observation has no inputs).
+    auto require = [](bool ok, const char* what, size_t x = 0, size_t y = 0) {
+      if (ok)
+        return;
+      std::printf(
+        "B=%zu MIN_SIZE_BITS=%zu equivalence: %s differs (inputs %zu, %zu)\n",
+        B,
+        MIN_SIZE_BITS,
+        what,
+        x,
+        y);
+      std::abort();
+    };
+
+    // BINS_PER_EXP and MANTISSAS_PER_EXP are independent of MIN_SIZE_BITS.
+    require(Scaled::BINS_PER_EXP == Base::BINS_PER_EXP, "BINS_PER_EXP");
+    require(
+      Scaled::MANTISSAS_PER_EXP == Base::MANTISSAS_PER_EXP,
+      "MANTISSAS_PER_EXP");
+
+    // request(n*U) at MIN_SIZE_BITS==K returns the same raw id as
+    // request(n) at MIN_SIZE_BITS==0; sc_size(raw) at MIN_SIZE_BITS==K
+    // equals sc_size(raw) at MIN_SIZE_BITS==0 times U; sc_align
+    // likewise.
+    const size_t probe[] = {
+      1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 16, 17, 32, 65, 127, 1024};
+    for (size_t n : probe)
+    {
+      // Skip values that would overflow either instance's domain.
+      if (n > Base::max_supported_size())
+        continue;
+      if (n > Scaled::max_supported_size() / U)
+        continue;
+      auto sc_base = Base::request(n);
+      auto sc_scaled = Scaled::request(n * U);
+      require(!(sc_base != sc_scaled), "request(n) size class", n);
+      require(
+        Scaled::sc_size(sc_scaled) == Base::sc_size(sc_base) * U,
+        "sc_size(request(n))",
+        n);
+      require(
+        Scaled::sc_align(sc_scaled) == Base::sc_align(sc_base) * U,
+        "sc_align(request(n))",
+        n);
+    }
+
+    // bin_index({a*U, n*U}) at MIN_SIZE_BITS==K matches bin_index({a, n})
+    // at MIN_SIZE_BITS==0.
+    using ScaledR = typename Scaled::range_t;
+    using BaseR = typename Base::range_t;
+    for (size_t n = 1; n <= 64; n++)
+    {
+      for (size_t a = 0; a < 32; a++)
+      {
+        require(
+          Scaled::bin_index(ScaledR{a * U, n * U}) ==
+            Base::bin_index(BaseR{a, n}),
+          "bin_index({addr, size})",
+          a,
+          n);
+      }
+    }
+
+    // carve({0, blk*U}, n*U) returns the same partition as
+    // carve({0, blk}, n) at MIN_SIZE_BITS==0, scaled by U.
+    for (size_t blk = 1; blk <= 32; blk++)
+    {
+      for (size_t n = 1; n <= blk; n++)
+      {
+        // carve's precondition (servability) is that the SC for `n`
+        // fits inside `blk` after alignment. With base 0, pad is 0,
+        // so the condition reduces to `Base::sc_size(Base::request(n))
+        // <= blk`. Skip pairs that don't satisfy it.
+        if (Base::sc_size(Base::request(n)) > blk)
+          continue;
+        auto base_cv = Base::carve(BaseR{0, blk}, n);
+        auto scaled_cv = Scaled::carve(ScaledR{0, blk * U}, n * U);
+        require(
+          scaled_cv.pre.base == base_cv.pre.base * U &&
+            scaled_cv.pre.size == base_cv.pre.size * U,
+          "carve({0, blk}, n).pre",
+          blk,
+          n);
+        require(
+          scaled_cv.req.base == base_cv.req.base * U &&
+            scaled_cv.req.size == base_cv.req.size * U,
+          "carve({0, blk}, n).req",
+          blk,
+          n);
+        require(
+          scaled_cv.post.base == base_cv.post.base * U &&
+            scaled_cv.post.size == base_cv.post.size * U,
+          "carve({0, blk}, n).post",
+          blk,
+          n);
+      }
+    }
+
+    // Bitmap find_for_request scales: an arena populated by add
+    // returns the same bin id, and `find_for_request(n*U)` agrees
+    // with `find_for_request(n)` at MIN_SIZE_BITS==0.
+    typename Scaled::Bitmap bm_scaled{};
+    typename Base::Bitmap bm_base{};
+    // Populate with a handful of representative {address, size} ranges.
+    const size_t pop[][2] = {{0, 4}, {16, 1}, {17, 7}, {64, 9}, {128, 64}};
+    for (const auto& p : pop)
+    {
+      const size_t a = p[0];
+      const size_t s = p[1];
+      auto id_b = bm_base.add(BaseR{a, s});
+      auto id_s = bm_scaled.add(ScaledR{a * U, s * U});
+      require(id_b == id_s, "Bitmap::add({addr, size}) bin id", a, s);
+    }
+    for (size_t n = 1; n <= 32; n++)
+    {
+      auto f_b = bm_base.find_for_request(n);
+      auto f_s = bm_scaled.find_for_request(n * U);
+      require(f_b == f_s, "Bitmap::find_for_request(n)", n);
+    }
+  }
+
+  /// Concrete expected values at MIN_SIZE_BITS == 4 (UNIT_SIZE == 16) to
+  /// pin the interpretation of the unit: sizes and alignments come back
+  /// in bytes, i.e. as multiples of UNIT_SIZE, and a UNIT_SIZE block at
+  /// address 0 lands in bin 0.
+  ///
+  /// The first expectation that does not hold is named on stdout before
+  /// the process aborts.
+  void check_known_values_unit_16()
+  {
+    // Name the failed expectation: a bare abort() would not say which of
+    // the pinned values drifted.
+    auto expect = [](bool ok, const char* what) {
+      if (!ok)
+      {
+        std::printf("known values (MIN_SIZE_BITS=4): expected %s\n", what);
+        std::abort();
+      }
+    };
+
+    using BU = ArenaBinsTestAccess<2, 4>;
+    constexpr size_t U = size_t(1) << 4;
+
+    // Exact size classes decode to themselves: U -> U, 8U -> 8U.
+    expect(BU::sc_size(BU::request(U)) == U, "sc_size(request(U)) == U");
+    expect(
+      BU::sc_size(BU::request(8 * U)) == 8 * U,
+      "sc_size(request(8U)) == 8U");
+    // size 9U requires SC for 10U at B=2 (round up).
+    expect(
+      BU::sc_size(BU::request(9 * U)) == 10 * U,
+      "sc_size(request(9U)) == 10U");
+    expect(
+      BU::sc_align(BU::request(4 * U)) == 4 * U,
+      "sc_align(request(4U)) == 4U");
+    expect(
+      BU::sc_align(BU::request(8 * U)) == 8 * U,
+      "sc_align(request(8U)) == 8U");
+
+    // Bin 0 corresponds to a UNIT_SIZE block.
+    expect(BU::bin_index({0, U}) == 0, "bin_index({0, U}) == 0");
+  }
+} // namespace
+
+/// Test driver. Runs, in order: the pinned concrete values (unit 1 and
+/// unit 16), the MIN_SIZE_BITS equivalence check for a few
+/// (B, MIN_SIZE_BITS) pairs, and the full per-B suite for B = 1, 2, 3.
+/// The checks visible in this file abort on failure, so reaching the
+/// final message and returning 0 means every check passed.
+int main(int, char**)
+{
+  setup();
+
+  check_known_values();
+  std::printf("Known concrete values: OK\n");
+
+  check_known_values_unit_16();
+  std::printf("Known concrete values at MIN_SIZE_BITS=4: OK\n");
+
+  // <B, MIN_SIZE_BITS>: three values of B at unit 16, plus one larger
+  // unit (1 << 14) at B = 2.
+  check_min_size_bits_equivalence<1, 4>();
+  check_min_size_bits_equivalence<2, 4>();
+  check_min_size_bits_equivalence<3, 4>();
+  check_min_size_bits_equivalence<2, 14>();
+  std::printf("MIN_SIZE_BITS equivalence: OK\n");
+
+  run_all<1>();
+  run_all<2>();
+  run_all<3>();
+
+  std::printf("All ArenaBins tests passed.\n");
+  return 0;
+}
diff --git a/src/test/func/cheri/cheri.cc b/src/test/func/cheri/cheri.cc
index 1928dbbd5..424a2eae2 100644
--- a/src/test/func/cheri/cheri.cc
+++ b/src/test/func/cheri/cheri.cc
@@ -58,8 +58,8 @@ int main()
   }
 
   /*
-   * This large object is sized to end up in our alloc's local buddy allocators
-   * when it's released.
+   * This large object is sized to end up in our alloc's thread-local
+   * cache range when it's released.
    */
   message("Grab large object");
   ptraddr_t alarge;
@@ -266,7 +266,7 @@ int main()
       SNMALLOC_CHECK(sz == Aal::capptr_size_round(sz));
     }
 
-    for (size_t sc = 0; sc < bits::BITS; sc++)
+    for (size_t sc = 0; sc < NUM_LARGE_CLASSES; sc++)
     {
       size_t sz = sizeclass_full_to_size(sizeclass_t::from_large_class(sc));
       SNMALLOC_CHECK(sz == Aal::capptr_size_round(sz));
diff --git a/src/test/func/client_meta_nonpow2/client_meta_nonpow2.cc b/src/test/func/client_meta_nonpow2/client_meta_nonpow2.cc
new file mode 100644
index 000000000..913afff38
--- /dev/null
+++ b/src/test/func/client_meta_nonpow2/client_meta_nonpow2.cc
@@ -0,0 +1,94 @@
+/**
+ * Exercises the slab metadata allocation path with a ClientMetaDataProvider
+ * whose per-slab extra_bytes is non-power-of-two.
+ *
+ * Before Phase C the backend rounded slab metadata sizes up to the next
+ * power of two, hiding any non-pow2 storage cost. With Phase C the
+ * backend rounds to `MIN_META_ALIGN` (= meta range UNIT_SIZE), so a
+ * non-pow2 client meta size now actually occupies a non-pow2 slab
+ * metadata block. This test gates the alloc/dealloc round-trip on that
+ * path: if `meta_size_round` is wrong, an inconsistent alloc/dealloc
+ * size would either trip an assertion in the meta range or leak.
+ */
+
+#include "test/setup.h"
+
+#include <iostream>
+#include <snmalloc/backend/globalconfig.h>
+#include <snmalloc/snmalloc_core.h>
+#include <vector>
+
+namespace snmalloc
+{
+  /**
+   * Per-slab client meta: `max_count + 7` bytes of storage. With
+   * `StorageType = uint8_t`, the resulting extra_bytes
+   * (= (required_count - 1) * 1) is non-power-of-two for typical
+   * sizeclass slab object counts.
+   *
+   * The test below stores one byte-sized tag per allocation through
+   * `get_client_meta_data` and reads it back before freeing.
+   */
+  struct NonPow2ClientMetaDataProvider
+  {
+    // One byte of client metadata per slot.
+    using StorageType = uint8_t;
+    // Reference type handed back to the client for a single slot.
+    using DataRef = uint8_t&;
+
+    /// Number of `StorageType` slots to reserve for a slab holding
+    /// `max_count` objects. The constant 7 extra slots are what push the
+    /// total away from a power of two; they are never indexed by the
+    /// test itself.
+    static size_t required_count(size_t max_count)
+    {
+      return max_count + 7;
+    }
+
+    /// Slot `index` of the storage array starting at `base`.
+    /// No bounds checking is done here.
+    static DataRef get(StorageType* base, size_t index)
+    {
+      return base[index];
+    }
+  };
+
+  // Allocator configuration using the provider above. NOTE(review):
+  // presumably picked up as the global config by <snmalloc/snmalloc.h>
+  // because SNMALLOC_PROVIDE_OWN_CONFIG is defined before that include -
+  // confirm against the snmalloc headers.
+  using Config =
+    snmalloc::StandardConfigClientMeta<NonPow2ClientMetaDataProvider>;
+} // namespace snmalloc
+
+#define SNMALLOC_PROVIDE_OWN_CONFIG
+#include <snmalloc/snmalloc.h>
+
+/// Allocate many small objects across several sizeclasses, tag each one
+/// through its client metadata slot (and fill the object with the same
+/// tag), then check that every metadata tag survived before freeing.
+/// Returns 0 on success; aborts on an allocation failure or on the
+/// first metadata mismatch.
+int main()
+{
+#if defined(SNMALLOC_ENABLE_GWP_ASAN_INTEGRATION)
+  // This test does not make sense in GWP-ASan mode.
+  return 0;
+#else
+  // Spread allocations across several small sizeclasses to force a
+  // variety of slab metadata sizes; each combination of (slab object
+  // count, +7 bytes) produces a different non-pow2 extra_bytes.
+  constexpr size_t sizes[] = {16, 48, 96, 192, 512, 1024};
+  constexpr size_t num_sizes = sizeof(sizes) / sizeof(sizes[0]);
+  constexpr size_t rounds = 5;
+  constexpr size_t allocs_per_size = 200;
+
+  // The total is known up front, so size the vector once.
+  std::vector<std::pair<void*, uint8_t>> ptrs;
+  ptrs.reserve(rounds * num_sizes * allocs_per_size);
+
+  for (size_t round = 0; round < rounds; round++)
+  {
+    for (size_t s : sizes)
+    {
+      for (size_t i = 0; i < allocs_per_size; i++)
+      {
+        auto p = snmalloc::libc::malloc(s);
+        // Fail loudly rather than handing nullptr to the metadata
+        // lookup and to memset below.
+        if (p == nullptr)
+        {
+          std::cout << "malloc(" << s << ") returned nullptr" << std::endl;
+          abort();
+        }
+        auto& meta = snmalloc::get_client_meta_data(p);
+        uint8_t tag = static_cast<uint8_t>((round * 31 + s + i) & 0xff);
+        meta = tag;
+        // Scribble over the whole object as well, so that a metadata
+        // slot overlapping object memory would show up as a mismatch.
+        memset(p, tag, s);
+        ptrs.emplace_back(p, tag);
+      }
+    }
+  }
+
+  for (const auto& [p, tag] : ptrs)
+  {
+    auto& meta = snmalloc::get_client_meta_data(p);
+    if (meta != tag)
+    {
+      std::cout << "Meta mismatch: expected " << int(tag) << " got "
+                << int(meta) << std::endl;
+      abort();
+    }
+    snmalloc::libc::free(p);
+  }
+
+  return 0;
+#endif
+}
diff --git a/src/test/func/domestication/domestication.cc b/src/test/func/domestication/domestication.cc
index 1c2eb9fef..63b8b380d 100644
--- a/src/test/func/domestication/domestication.cc
+++ b/src/test/func/domestication/domestication.cc
@@ -39,7 +39,7 @@ namespace snmalloc
       PagemapRegisterRange<Pagemap>,
       PagemapRegisterRange<Authmap>>;
 
-    using LocalState = StandardLocalState<Pal, Pagemap, Base>;
+    using LocalState = StandardLocalState<Pal, Pagemap, Authmap, Base>;
 
     using GlobalPoolState = PoolState<Allocator<CustomConfig>>;
 
diff --git a/src/test/func/large_offset/large_offset.cc b/src/test/func/large_offset/large_offset.cc
new file mode 100644
index 000000000..d89bf03ba
--- /dev/null
+++ b/src/test/func/large_offset/large_offset.cc
@@ -0,0 +1,227 @@
+/**
+ * Backend-API counterpart of `large_offset_frontend` for the per-chunk
+ * pagemap offset write path in `BackendAllocator::alloc_chunk`.
+ *
+ * This test pins the contract at the *backend* boundary
+ * (`Config::Backend::alloc_chunk` / `dealloc_chunk`) so it holds
+ * independently of any front-end path: a non-pow2 large allocation
+ * spans multiple slab tiles, and `alloc_chunk` writes a per-chunk
+ * pagemap entry whose offset bits encode the slab index.
+ *
+ * Method:
+ *   - Pick a non-pow2 large sizeclass `sc` whose
+ *     `sizeclass_full_to_slab_size(sc) < sizeclass_full_to_size(sc)`,
+ *     so the multi-slab-tile branch triggers.
+ *   - Call `Config::Backend::alloc_chunk` directly with
+ *     `sizeclass_full_to_size(sc)` (the chunk-multiple reservation)
+ *     and the non-pow2 sc.
+ *   - For each chunk in the region verify the pagemap entry's
+ *     `get_offset_and_sizeclass()` decomposes into the expected
+ *     (sc, slab_index) pair.
+ *   - For sampled interior addresses verify that
+ *     `remaining_bytes` / `index_in_object` return positions within
+ *     the logical allocation.
+ *   - Verify `is_start_of_object` behaviour: true at the allocation
+ *     base, false elsewhere.
+ *   - `dealloc_chunk` and verify entries clear back to "not
+ *     frontend-owned" (low COMBINED_BITS == 0).
+ */
+
+#include "test/setup.h"
+
+#include <iostream>
+#include <snmalloc/backend/fixedglobalconfig.h>
+#include <snmalloc/snmalloc.h>
+
+#ifdef assert
+#  undef assert
+#endif
+#define assert please_use_SNMALLOC_ASSERT
+
+using namespace snmalloc;
+
+using CustomGlobals = FixedRangeConfig<PALNoAlloc<DefaultPal>>;
+using FixedAlloc = Allocator<CustomGlobals>;
+
+namespace
+{
+  bool any_failures = false;
+
+  /// Record a failed check: mark the run as failed and print `msg` with
+  /// a "FAIL: " prefix. Does not stop the test, so later checks still
+  /// run and report.
+  void fail(const char* msg)
+  {
+    any_failures = true;
+    std::cout << "FAIL: " << msg << std::endl;
+  }
+
+  /**
+   * Scan the large sizeclasses in increasing order and return the first
+   * one whose slab size is strictly smaller than its object size (so an
+   * allocation of it spans more than one slab tile). If no such class
+   * exists in this configuration, the default-constructed
+   * `sizeclass_t{}` (the unmapped sentinel) is returned instead.
+   */
+  sizeclass_t find_non_pow2_large_sc()
+  {
+    for (size_t lc = 0; lc != NUM_LARGE_CLASSES; ++lc)
+    {
+      const sizeclass_t candidate = sizeclass_t::from_large_class(lc);
+      const bool multi_tile = sizeclass_full_to_slab_size(candidate) <
+        sizeclass_full_to_size(candidate);
+      if (!multi_tile)
+        continue;
+      return candidate;
+    }
+    return sizeclass_t{};
+  }
+
+  /// Allocate one non-pow2 large sizeclass directly through
+  /// `Backend::alloc_chunk` on a private fixed-range allocator and check
+  /// the pagemap state chunk by chunk: the offset/sizeclass encoding,
+  /// `remaining_bytes` / `index_in_object`, `is_start_of_object`, and
+  /// that `dealloc_chunk` returns every entry to backend-owned.
+  ///
+  /// Mismatches are reported through fail() and do not stop the test.
+  /// The function returns early only when no suitable sizeclass exists
+  /// (treated as a skip) or when the allocation itself fails.
+  void test_per_chunk_offset()
+  {
+    auto sc = find_non_pow2_large_sc();
+    // raw() == 0 is the sentinel find_non_pow2_large_sc returns when no
+    // class qualifies.
+    if (sc.raw() == 0)
+    {
+      std::cout << "No non-pow2 large sizeclass available in this config; "
+                   "skipping per-chunk offset test."
+                << std::endl;
+      return;
+    }
+    const size_t size = sizeclass_full_to_size(sc);
+    const size_t slab_size = sizeclass_full_to_slab_size(sc);
+    // The chunk-multiple reservation: the backend precondition is
+    // that `size` is a positive multiple of `slab_size`, satisfied
+    // here by passing the exact sizeclass size.
+    const size_t reserve = size;
+
+    std::cout << "non-pow2 sc raw=" << sc.raw() << " size=" << size
+              << " slab_size=" << slab_size << " reserve=" << reserve
+              << std::endl;
+
+    // Set up an isolated FixedRangeConfig allocator. FixedRangeConfig
+    // owns its own pagemap and never reclaims `region_base`; the
+    // reservation is released when the process exits. For a multi-
+    // test harness, explicit teardown would be required here.
+    // The region is 2^28 bytes (256 MiB).
+    const size_t region = bits::one_at_bit(28);
+    auto region_base = DefaultPal::reserve(region);
+    DefaultPal::notify_using<NoZero>(region_base, region);
+    CustomGlobals::init(nullptr, region_base, region);
+
+    auto a = get_scoped_allocator<FixedAlloc>();
+
+    using Backend = typename CustomGlobals::Backend;
+    using Entry = typename CustomGlobals::PagemapEntry;
+
+    // Construct the encoded ras the way the front end does (offset=0).
+    const uintptr_t ras_in = Entry::encode(nullptr, sc);
+
+    auto [chunk, slab_meta] =
+      Backend::alloc_chunk(a->get_backend_local_state(), reserve, ras_in, sc);
+    if (chunk == nullptr)
+    {
+      fail("alloc_chunk returned null");
+      return;
+    }
+
+    const address_t base = address_cast(chunk);
+    std::cout << "Allocated chunk base=" << reinterpret_cast<void*>(base)
+              << " reserve=" << reserve << std::endl;
+
+    // Verify per-chunk pagemap entries: each MIN_CHUNK_SIZE chunk must
+    // carry `sc` together with the index of the slab tile it falls in.
+    for (size_t chunk_offset = 0; chunk_offset < reserve;
+         chunk_offset += MIN_CHUNK_SIZE)
+    {
+      const size_t expected_slab_index = chunk_offset / slab_size;
+      const auto& entry = Backend::get_metaentry(base + chunk_offset);
+      const offset_and_sizeclass_t osc = entry.get_offset_and_sizeclass();
+      const offset_and_sizeclass_t expected_osc =
+        offset_and_sizeclass_t(sc, expected_slab_index);
+      if (!(osc == expected_osc))
+      {
+        std::cout << "Chunk @+" << chunk_offset << " osc=" << osc.raw()
+                  << " expected=" << expected_osc.raw() << " (sc=" << sc.raw()
+                  << " idx=" << expected_slab_index << ")" << std::endl;
+        fail("offset_and_sizeclass mismatch");
+      }
+      // The pure sizeclass mask must still report `sc`.
+      if (!(entry.get_sizeclass() == sc))
+      {
+        std::cout << "Chunk @+" << chunk_offset << " get_sizeclass mismatch"
+                  << std::endl;
+        fail("get_sizeclass mismatch on offset>0 chunk");
+      }
+    }
+
+    // For the first address of each chunk of the allocation,
+    // remaining_bytes / index_in_object should report the position
+    // within the whole allocation. `reserve == size` in this test, so
+    // this walks the same chunks as the loop above.
+    for (size_t chunk_offset = 0; chunk_offset < size;
+         chunk_offset += MIN_CHUNK_SIZE)
+    {
+      const address_t addr = base + chunk_offset;
+      const size_t rem = snmalloc::remaining_bytes<CustomGlobals>(addr);
+      if (rem != size - chunk_offset)
+      {
+        std::cout << "remaining_bytes @+" << chunk_offset << " = " << rem
+                  << " expected " << (size - chunk_offset) << std::endl;
+        fail("remaining_bytes mismatch");
+      }
+      const size_t idx = snmalloc::index_in_object<CustomGlobals>(addr);
+      if (idx != chunk_offset)
+      {
+        std::cout << "index_in_object @+" << chunk_offset << " = " << idx
+                  << " expected " << chunk_offset << std::endl;
+        fail("index_in_object mismatch");
+      }
+    }
+
+    // Direct is_start_of_object checks: the allocation base address
+    // must be a start-of-object; an interior address inside the first
+    // slab tile (offset_bytes == 0 in pagemap) but not at the base
+    // must NOT; and an address in any non-first slab tile
+    // (offset_bytes != 0 in pagemap) must NOT.
+    {
+      const auto& base_entry = Backend::get_metaentry(base);
+      if (!is_start_of_object(base_entry.get_offset_and_sizeclass(), base))
+        fail("base address not reported as start-of-object");
+      if (is_start_of_object(base_entry.get_offset_and_sizeclass(), base + 1))
+        fail("base+1 incorrectly reported as start-of-object");
+    }
+    // Always true for a class returned by find_non_pow2_large_sc
+    // (it requires slab_size < size); kept as an explicit guard.
+    if (size > slab_size)
+    {
+      const address_t second_slab = base + slab_size;
+      const auto& second_entry = Backend::get_metaentry(second_slab);
+      if (is_start_of_object(
+            second_entry.get_offset_and_sizeclass(), second_slab))
+        fail("second slab tile base incorrectly reported as start-of-object");
+    }
+
+    // Tear down: dealloc the chunk and verify the per-chunk pagemap
+    // entries no longer report as frontend-owned.
+    auto alloc_cap =
+      capptr_chunk_is_alloc(capptr_to_user_address_control(chunk));
+    Backend::dealloc_chunk(
+      a->get_backend_local_state(), *slab_meta, alloc_cap, reserve, sc);
+
+    for (size_t chunk_offset = 0; chunk_offset < reserve;
+         chunk_offset += MIN_CHUNK_SIZE)
+    {
+      const auto& entry = Backend::get_metaentry(base + chunk_offset);
+      if (!entry.is_backend_owned())
+      {
+        std::cout << "Chunk @+" << chunk_offset
+                  << " not backend-owned after dealloc; osc="
+                  << entry.get_offset_and_sizeclass().raw() << std::endl;
+        fail("dealloc didn't reset per-chunk offset");
+      }
+    }
+  }
+} // namespace
+
+/// Run the backend per-chunk offset test and turn the sticky failure
+/// flag into the verdict line and the process exit status.
+int main()
+{
+  setup();
+  test_per_chunk_offset();
+
+  const bool passed = !any_failures;
+  std::cout << (passed ? "PASSED" : "FAILED") << std::endl;
+  return passed ? 0 : 1;
+}
diff --git a/src/test/func/large_offset_frontend/large_offset_frontend.cc b/src/test/func/large_offset_frontend/large_offset_frontend.cc
new file mode 100644
index 000000000..4b4fd7948
--- /dev/null
+++ b/src/test/func/large_offset_frontend/large_offset_frontend.cc
@@ -0,0 +1,192 @@
+/**
+ * Front-end counterpart to `src/test/func/large_offset/`.
+ *
+ * The front-end allocates non-pow2 large allocations directly:
+ * `malloc(80 KiB)` reserves exactly 80 KiB (a sizeclass boundary)
+ * rather than rounding up to the next power of two. This test
+ * exercises the resulting per-chunk pagemap state via the public
+ * recovery API (`external_pointer`, `remaining_bytes`).
+ *
+ * `large_offset.cc` covers the same ground at the backend boundary
+ * (`Config::Backend::alloc_chunk` / `dealloc_chunk`), so the
+ * per-chunk contract is gated independently of any front-end path.
+ * This test gates that the front-end actually produces such
+ * allocations.
+ *
+ * Two sets of checks:
+ *
+ *   1. Pure table-level round-tripping over every large sizeclass:
+ *      `size_to_sizeclass_full(sizeclass_full_to_size(sc)) == sc`.
+ *      No allocation. Cheap and exhaustive.
+ *
+ *   2. End-to-end on a bounded set of representative sizeclasses
+ *      (the smallest non-pow2 large class, plus a non-boundary
+ *      request whose smallest enclosing class is non-pow2): allocate
+ *      via the public front-end API, walk every chunk-aligned
+ *      interior pointer in the logical allocation, assert
+ *      `external_pointer<Start>` recovers the base and
+ *      `remaining_bytes` reports the expected residual.
+ */
+
+#include "test/setup.h"
+
+#include <iostream>
+#include <snmalloc/snmalloc.h>
+
+#ifdef assert
+#  undef assert
+#endif
+#define assert please_use_SNMALLOC_ASSERT
+
+using namespace snmalloc;
+
+namespace
+{
+  bool any_failures = false;
+
+  /// Note a failed check: remember that the run failed and emit `msg`
+  /// behind a "FAIL: " prefix. The test carries on afterwards so that
+  /// all failures are reported, not just the first.
+  void fail(const char* msg)
+  {
+    any_failures = true;
+    std::cout << "FAIL: " << msg << std::endl;
+  }
+
+  /**
+   * Table-only check over every large sizeclass: asking for exactly the
+   * size of class `sc` must map back to `sc`. Nothing is allocated, so
+   * a failure here points at the sizeclass encoding itself. Each
+   * mismatch is printed and recorded through fail(); the scan continues.
+   */
+  void test_roundtrip_all_large()
+  {
+    for (size_t lc = 0; lc != NUM_LARGE_CLASSES; ++lc)
+    {
+      const sizeclass_t original = sizeclass_t::from_large_class(lc);
+      const size_t bytes = sizeclass_full_to_size(original);
+      const sizeclass_t recovered = size_to_sizeclass_full(bytes);
+      if (recovered == original)
+        continue;
+
+      std::cout << "Round-trip fail: lc=" << lc << " S=" << bytes
+                << " sc.raw=" << original.raw()
+                << " sc2.raw=" << recovered.raw() << std::endl;
+      fail("round-trip");
+    }
+  }
+
+  /**
+   * Allocate `request` bytes through the public front-end and check the
+   * allocation from the outside:
+   *   - `alloc_size` must equal `expected_reserve` (the size of the
+   *     smallest enclosing sizeclass);
+   *   - for every `MIN_CHUNK_SIZE` step into the object,
+   *     `external_pointer<Start>` must recover the object's base and
+   *     `remaining_bytes` must report the distance to its end.
+   * Mismatches are reported through fail(); the object is freed at the
+   * end. A null allocation is reported and ends the test early.
+   */
+  void test_alloc_chunkwalk(size_t request, size_t expected_reserve)
+  {
+    void* const object = snmalloc::libc::malloc(request);
+    if (object == nullptr)
+    {
+      fail("malloc returned null");
+      return;
+    }
+
+    const size_t usable = snmalloc::alloc_size(object);
+    if (usable != expected_reserve)
+    {
+      std::cout << "alloc_size mismatch: request=" << request
+                << " usable=" << usable << " expected=" << expected_reserve
+                << std::endl;
+      fail("alloc_size != expected reserve");
+    }
+
+    // There is no `libc::is_start_of_object`; `external_pointer<Start>`
+    // returning the object itself (the off == 0 iteration below)
+    // establishes the same property.
+
+    size_t off = 0;
+    while (off < usable)
+    {
+      void* const interior = pointer_offset(object, off);
+
+      void* const recovered = snmalloc::external_pointer<Start>(interior);
+      if (recovered != object)
+      {
+        std::cout << "external_pointer<Start>(p + " << off
+                  << ") = " << recovered << " expected " << object
+                  << std::endl;
+        fail("external_pointer mismatch");
+      }
+
+      const size_t expected_rem = usable - off;
+      const size_t rem = snmalloc::remaining_bytes(interior);
+      if (rem != expected_rem)
+      {
+        std::cout << "remaining_bytes(p + " << off << ") = " << rem
+                  << " expected " << expected_rem << std::endl;
+        fail("remaining_bytes mismatch");
+      }
+
+      off += MIN_CHUNK_SIZE;
+    }
+
+    snmalloc::libc::free(object);
+  }
+
+  /**
+   * Return the first large sizeclass (in increasing class order) whose
+   * size is not a power of two. When every large class is a power of
+   * two (e.g. INTERMEDIATE_BITS == 0), the default-constructed
+   * `sizeclass_t{}` sentinel is returned instead.
+   */
+  sizeclass_t find_non_pow2_large_sc()
+  {
+    for (size_t lc = 0; lc != NUM_LARGE_CLASSES; ++lc)
+    {
+      const sizeclass_t candidate = sizeclass_t::from_large_class(lc);
+      if (bits::is_pow2(sizeclass_full_to_size(candidate)))
+        continue;
+      return candidate;
+    }
+    return sizeclass_t{};
+  }
+
+  /// End-to-end check on the first non-pow2 large sizeclass: once with a
+  /// request of exactly the class size, and once with a request one byte
+  /// above the previous class's size, which must round up into the same
+  /// class. Skipped (with a message) when no non-pow2 large class exists.
+  void test_end_to_end()
+  {
+    const sizeclass_t sc = find_non_pow2_large_sc();
+    if (sc.raw() == 0)
+    {
+      std::cout
+        << "No non-pow2 large sizeclass available (INTERMEDIATE_BITS == 0?); "
+           "skipping end-to-end test."
+        << std::endl;
+      return;
+    }
+
+    const size_t S = sizeclass_full_to_size(sc);
+
+    // Boundary request: ask for exactly the class size.
+    test_alloc_chunkwalk(S, S);
+
+    // Size of the class just below `sc`. For the very first large class
+    // there is no previous large class, so the largest small sizeclass
+    // size stands in for it.
+    const size_t S_prev = (sc.as_large() == 0) ?
+      MAX_SMALL_SIZECLASS_SIZE :
+      sizeclass_full_to_size(
+        sizeclass_t::from_large_class(sc.as_large() - 1));
+
+    // Non-boundary request: S_prev + 1 lands at S via the ceil encoding.
+    // Only meaningful when that request is strictly below S itself.
+    const size_t just_above_prev = S_prev + 1;
+    if (just_above_prev < S)
+      test_alloc_chunkwalk(just_above_prev, S);
+  }
+} // namespace
+
+/// Run the table round-trip check and the end-to-end allocation check,
+/// then turn the sticky failure flag into the verdict line and the
+/// process exit status.
+int main()
+{
+  setup();
+  test_roundtrip_all_large();
+  test_end_to_end();
+
+  const bool passed = !any_failures;
+  std::cout << (passed ? "PASSED" : "FAILED") << std::endl;
+  return passed ? 0 : 1;
+}
diff --git a/src/test/func/largearenarange/largearenarange.cc b/src/test/func/largearenarange/largearenarange.cc
new file mode 100644
index 000000000..94a6e360a
--- /dev/null
+++ b/src/test/func/largearenarange/largearenarange.cc
@@ -0,0 +1,316 @@
+/**
+ * Unit tests for LargeArenaRange and PagemapRep.
+ *
+ * Tests the Range wrapper around Arena using a real pagemap,
+ * exercising alloc_range, dealloc_range, refill, and overflow paths.
+ */
+
+#include "test/setup.h"
+
+#include <cstdio>
+
+#ifndef SNMALLOC_TRACING
+#  define SNMALLOC_TRACING
+#endif
+#include "test/snmalloc_testlib.h"
+
+#include <snmalloc/backend/globalconfig.h>
+#include <snmalloc/snmalloc_core.h>
+
+namespace
+{
+  using namespace snmalloc;
+
+  // --- Test pagemap and range types ---
+
+  using Pal = DefaultPal;
+  using PagemapEntry = DefaultPagemapEntry<NoClientMetaDataProvider>;
+  using ConcretePagemap = FlatPagemap<MIN_CHUNK_BITS, PagemapEntry, Pal, false>;
+  using TestPagemap = BasicPagemap<Pal, ConcretePagemap, PagemapEntry, false>;
+
+  // Initialise the pagemap once before tests.
+  static bool pagemap_initialised = false;
+
+  static void ensure_pagemap()
+  {
+    // Idempotent: only the first call performs the initialisation.
+    if (pagemap_initialised)
+      return;
+    TestPagemap::concretePagemap.template init<false>();
+    pagemap_initialised = true;
+  }
+
+  // Simple parent: PalRange + PagemapRegisterRange.
+  using ParentSource = Pipe<PalRange<Pal>, PagemapRegisterRange<TestPagemap>>;
+
+  // LargeArenaRange under test: global range (MAX_SIZE_BITS = BITS - 1).
+  // This means overflow dealloc never goes to parent (matches the global
+  // range configuration). MIN_REFILL_BITS = MinBaseSizeBits<Pal>() so
+  // the first parent allocation is at least the PAL's minimum reserve
+  // size — Windows VirtualAlloc cannot reserve below its allocation
+  // granularity (64 KiB) and PalRange returns nullptr in that case.
+  static constexpr size_t REFILL_BITS = 20;
+  static constexpr size_t MAX_BITS = bits::BITS - 1;
+  static constexpr size_t MIN_REFILL_BITS = MinBaseSizeBits<Pal>();
+
+  using ArenaRange = Pipe<
+    ParentSource,
+    LargeArenaRange<REFILL_BITS, MAX_BITS, TestPagemap, MIN_REFILL_BITS>>;
+
+  // --- Tests ---
+
+  // Alloc, free and re-alloc one chunk; SNMALLOC_CHECK also fires in release.
+  static void test_basic_alloc_dealloc()
+  {
+    ensure_pagemap();
+    ArenaRange range{};
+
+    // Allocate a single chunk.
+    auto p1 = range.alloc_range(MIN_CHUNK_SIZE);
+    SNMALLOC_CHECK(p1 != nullptr);
+    printf("  alloc %zu bytes at %p\n", MIN_CHUNK_SIZE, p1.unsafe_ptr());
+
+    // Deallocate and re-allocate — should succeed.
+    range.dealloc_range(p1, MIN_CHUNK_SIZE);
+    auto p2 = range.alloc_range(MIN_CHUNK_SIZE);
+    SNMALLOC_CHECK(p2 != nullptr);
+
+    // Clean up.
+    range.dealloc_range(p2, MIN_CHUNK_SIZE);
+    printf("  Basic alloc/dealloc: OK\n");
+  }
+
+  // Power-of-two sizes from 1 to 32 chunks; checked in release builds too.
+  static void test_multiple_sizes()
+  {
+    ensure_pagemap();
+    ArenaRange range{};
+
+    // Allocate various power-of-two sizes.
+    constexpr size_t NUM_SIZES = 6;
+    size_t sizes[NUM_SIZES] = {
+      MIN_CHUNK_SIZE,
+      MIN_CHUNK_SIZE * 2,
+      MIN_CHUNK_SIZE * 4,
+      MIN_CHUNK_SIZE * 8,
+      MIN_CHUNK_SIZE * 16,
+      MIN_CHUNK_SIZE * 32};
+    capptr::Arena<void> ptrs[NUM_SIZES] = {};
+
+    for (size_t i = 0; i < NUM_SIZES; i++)
+    {
+      ptrs[i] = range.alloc_range(sizes[i]);
+      SNMALLOC_CHECK(ptrs[i] != nullptr);
+    }
+
+    // Deallocate all.
+    for (size_t i = 0; i < NUM_SIZES; i++)
+    {
+      range.dealloc_range(ptrs[i], sizes[i]);
+    }
+    printf("  Multiple sizes: OK\n");
+  }
+
+  // Forces several parent refills; SNMALLOC_CHECK keeps release builds honest.
+  static void test_refill()
+  {
+    ensure_pagemap();
+    ArenaRange range{};
+
+    // Allocate more than one refill's worth of chunks.
+    // REFILL_SIZE is 2^20, MIN_CHUNK_SIZE is 2^14,
+    // so one refill is ~64 chunks.
+    constexpr size_t NUM_ALLOCS = 200;
+    capptr::Arena<void> ptrs[NUM_ALLOCS] = {};
+
+    for (size_t i = 0; i < NUM_ALLOCS; i++)
+    {
+      ptrs[i] = range.alloc_range(MIN_CHUNK_SIZE);
+      SNMALLOC_CHECK(ptrs[i] != nullptr);
+    }
+
+    // Deallocate all.
+    for (size_t i = 0; i < NUM_ALLOCS; i++)
+    {
+      range.dealloc_range(ptrs[i], MIN_CHUNK_SIZE);
+    }
+
+    // Re-allocate — should serve from freed blocks, no new refill needed
+    // for the first pass.
+    for (size_t i = 0; i < NUM_ALLOCS; i++)
+    {
+      ptrs[i] = range.alloc_range(MIN_CHUNK_SIZE);
+      SNMALLOC_CHECK(ptrs[i] != nullptr);
+    }
+
+    // Final cleanup.
+    for (size_t i = 0; i < NUM_ALLOCS; i++)
+    {
+      range.dealloc_range(ptrs[i], MIN_CHUNK_SIZE);
+    }
+    printf("  Refill (200 allocs): OK\n");
+  }
+
+  // 100 alloc/free rounds, then a 4-chunk request; checked in every build.
+  static void test_alloc_dealloc_cycle()
+  {
+    ensure_pagemap();
+    ArenaRange range{};
+
+    // Interleave alloc and dealloc to exercise consolidation.
+    constexpr size_t ROUNDS = 100;
+    for (size_t r = 0; r < ROUNDS; r++)
+    {
+      auto p = range.alloc_range(MIN_CHUNK_SIZE);
+      SNMALLOC_CHECK(p != nullptr);
+      range.dealloc_range(p, MIN_CHUNK_SIZE);
+    }
+
+    // Do a larger allocation after many cycles — it must still succeed
+    // (the intent is that freed chunks have merged back by now).
+    auto large = range.alloc_range(MIN_CHUNK_SIZE * 4);
+    SNMALLOC_CHECK(large != nullptr);
+    range.dealloc_range(large, MIN_CHUNK_SIZE * 4);
+    printf("  Alloc/dealloc cycle: OK\n");
+  }
+
+  // Each pow2-sized block must be aligned to its own size, in all builds.
+  static void test_alignment()
+  {
+    ensure_pagemap();
+    ArenaRange range{};
+
+    // Verify that returned pointers are properly aligned.
+    constexpr size_t NUM_TESTS = 5;
+    size_t sizes[NUM_TESTS] = {
+      MIN_CHUNK_SIZE,
+      MIN_CHUNK_SIZE * 2,
+      MIN_CHUNK_SIZE * 4,
+      MIN_CHUNK_SIZE * 8,
+      MIN_CHUNK_SIZE * 16};
+
+    for (size_t i = 0; i < NUM_TESTS; i++)
+    {
+      auto p = range.alloc_range(sizes[i]);
+      SNMALLOC_CHECK(p != nullptr);
+      uintptr_t addr = p.unsafe_uintptr();
+      SNMALLOC_CHECK(
+        (addr & (sizes[i] - 1)) == 0 && "Allocation not properly aligned");
+      range.dealloc_range(p, sizes[i]);
+    }
+
+    printf("  Alignment: OK\n");
+  }
+
+  // Free a 16-chunk block, then take 16 single chunks; checked in all builds.
+  static void test_large_then_small()
+  {
+    ensure_pagemap();
+    ArenaRange range{};
+
+    // Allocate a large block, dealloc, then allocate smaller blocks
+    // from the same space.
+    size_t large_size = MIN_CHUNK_SIZE * 16;
+    auto large = range.alloc_range(large_size);
+    SNMALLOC_CHECK(large != nullptr);
+    range.dealloc_range(large, large_size);
+
+    // Now allocate 16 individual chunks — should come from the freed
+    // large block's space.
+    constexpr size_t N = 16;
+    capptr::Arena<void> ptrs[N] = {};
+    for (size_t i = 0; i < N; i++)
+    {
+      ptrs[i] = range.alloc_range(MIN_CHUNK_SIZE);
+      SNMALLOC_CHECK(ptrs[i] != nullptr);
+    }
+
+    for (size_t i = 0; i < N; i++)
+    {
+      range.dealloc_range(ptrs[i], MIN_CHUNK_SIZE);
+    }
+    printf("  Large then small: OK\n");
+  }
+
+  // Non-pow2 chunk counts must not overlap; SNMALLOC_CHECK runs in release.
+  static void test_non_pow2_sizes()
+  {
+    ensure_pagemap();
+    ArenaRange range{};
+
+    // Non-power-of-two, chunk-multiple sizes. Some of these are not
+    // representable size-classes (e.g. 9, 11, 13 chunks); the arena
+    // carves exactly the requested chunk count and rolls the rounding
+    // remainder into the post fragment, so callers see no excess.
+    constexpr size_t NUM_SIZES = 8;
+    size_t sizes[NUM_SIZES] = {
+      MIN_CHUNK_SIZE * 3,
+      MIN_CHUNK_SIZE * 5,
+      MIN_CHUNK_SIZE * 6,
+      MIN_CHUNK_SIZE * 7,
+      MIN_CHUNK_SIZE * 9,
+      MIN_CHUNK_SIZE * 11,
+      MIN_CHUNK_SIZE * 13,
+      MIN_CHUNK_SIZE * 17};
+
+    capptr::Arena<void> ptrs[NUM_SIZES] = {};
+    for (size_t i = 0; i < NUM_SIZES; i++)
+    {
+      ptrs[i] = range.alloc_range(sizes[i]);
+      SNMALLOC_CHECK(ptrs[i] != nullptr);
+    }
+
+    // All pointers must be distinct and non-overlapping (within the size
+    // requested — over-allocation would break this because the rounding
+    // remainder would later be handed out a second time).
+    for (size_t i = 0; i < NUM_SIZES; i++)
+    {
+      uintptr_t lo_i = ptrs[i].unsafe_uintptr();
+      uintptr_t hi_i = lo_i + sizes[i];
+      for (size_t j = i + 1; j < NUM_SIZES; j++)
+      {
+        uintptr_t lo_j = ptrs[j].unsafe_uintptr();
+        uintptr_t hi_j = lo_j + sizes[j];
+        SNMALLOC_CHECK(hi_i <= lo_j || hi_j <= lo_i);
+      }
+    }
+
+    for (size_t i = 0; i < NUM_SIZES; i++)
+    {
+      range.dealloc_range(ptrs[i], sizes[i]);
+    }
+
+    // After deallocating all, repeat the exact same pattern to confirm
+    // the freed space is reusable (catches leaks from un-returned
+    // rounding remainder).
+    for (size_t i = 0; i < NUM_SIZES; i++)
+    {
+      ptrs[i] = range.alloc_range(sizes[i]);
+      SNMALLOC_CHECK(ptrs[i] != nullptr);
+    }
+    for (size_t i = 0; i < NUM_SIZES; i++)
+    {
+      range.dealloc_range(ptrs[i], sizes[i]);
+    }
+
+    printf("  Non-pow2 sizes: OK\n");
+  }
+} // anonymous namespace
+
+int main()
+{
+  setup();
+  // Each scenario constructs its own ArenaRange; the pagemap is shared.
+  printf("--- LargeArenaRange tests ---\n");
+
+  test_basic_alloc_dealloc();
+  test_multiple_sizes();
+  test_refill();
+  test_alloc_dealloc_cycle();
+  test_alignment();
+  test_large_then_small();
+  test_non_pow2_sizes();
+
+  printf("All LargeArenaRange tests passed.\n");
+  return 0;
+}
diff --git a/src/test/func/memory/memory.cc b/src/test/func/memory/memory.cc
index 253628282..6be2865a8 100644
--- a/src/test/func/memory/memory.cc
+++ b/src/test/func/memory/memory.cc
@@ -307,12 +307,19 @@ void check_offset(void* base, void* interior)
 
 void check_external_pointer_large(size_t* base)
 {
+  // Probe `__malloc_start_pointer` at both ends of each 16 MiB
+  // stride within the allocation. The allocation size is recorded in
+  // the first word of the allocation itself. The end-of-stride probe
+  // is clamped to the last byte of the allocation.
   size_t size = *base;
   char* curr = (char*)base;
   for (size_t offset = 0; offset < size; offset += 1 << 24)
   {
     check_offset(base, (void*)(curr + offset));
-    check_offset(base, (void*)(curr + offset + (1 << 24) - 1));
+    size_t end = offset + (1 << 24) - 1;
+    if (end >= size)
+      end = size - 1;
+    check_offset(base, (void*)(curr + end));
   }
 }
 
@@ -439,6 +446,49 @@ void test_calloc_large_bug()
   snmalloc::dealloc(p1);
 }
 
+/**
+ * `calloc` zeroing must cover exactly the reservation `round_size`
+ * reports — no more, no less. For a large request that lands in a
+ * non-pow2 sizeclass, the reservation is tighter than the next pow2,
+ * so a stray `next_pow2`-sized zeroing loop would overshoot into
+ * backend free range. This test allocates such a non-pow2 large
+ * request and verifies (a) the usable size is strictly less than the
+ * next pow2, and (b) every byte of the visible allocation is zero.
+ *
+ * Note: an overshoot may not fault — the deterministic gate for the
+ * `round_size` contract lives in the sizeclass test.
+ */
+void test_calloc_non_pow2_large()
+{
+  if constexpr (snmalloc::INTERMEDIATE_BITS == 0)
+  {
+    // With no intermediate bits every sizeclass is a power of two, so
+    // this configuration has no non-pow2 large request to exercise.
+    std::cout << "INTERMEDIATE_BITS == 0: all sizeclasses pow2; skipping."
+              << std::endl;
+  }
+  else
+  {
+    // Request 2.5x the largest small size: large, not a power of two,
+    // and (given at least one intermediate bit) covered by a sizeclass
+    // strictly below the next power of two.
+    const size_t small_max = size_t{1} << snmalloc::max_small_sizeclass_bits();
+    const size_t request = (2 * small_max) + (small_max / 2);
+    const size_t next_pow2 = snmalloc::bits::next_pow2(request);
+
+    void* p = snmalloc::alloc<snmalloc::ZeroMem::YesZero>(request);
+    SNMALLOC_CHECK(p != nullptr);
+    const size_t usable = snmalloc::alloc_size(p);
+    SNMALLOC_CHECK(usable >= request);
+    SNMALLOC_CHECK(usable < next_pow2);
+    // Every byte the allocator reports as usable must read back as zero.
+    const auto* bytes = static_cast<const unsigned char*>(p);
+    for (size_t i = 0; i != usable; ++i)
+      SNMALLOC_CHECK(bytes[i] == 0);
+    snmalloc::dealloc(p);
+  }
+}
+
 template<size_t asz, int dealloc = 2>
 void test_static_sized_alloc()
 {
@@ -589,6 +639,7 @@ int main(int, char**)
   TEST(test_external_pointer);
   TEST(test_alloc_16M);
   TEST(test_calloc_16M);
+  TEST(test_calloc_non_pow2_large);
   TEST(test_consolidaton_bug);
 
   std::cout << "Tests completeed successfully!" << std::endl;
diff --git a/src/test/func/pagemap/pagemap.cc b/src/test/func/pagemap/pagemap.cc
index 7a03fa1a7..f93f64840 100644
--- a/src/test/func/pagemap/pagemap.cc
+++ b/src/test/func/pagemap/pagemap.cc
@@ -14,6 +14,17 @@
 using namespace snmalloc;
 static constexpr size_t GRANULARITY_BITS = 20;
 
+/**
+ * Test PAL that wraps DefaultPal but strips LazyCommit from pal_features.
+ * Used to exercise the get<true> code path that calls register_range on
+ * a bounded pagemap — see the bounded + !LazyCommit regression in main().
+ */
+struct NoLazyCommitPal : public DefaultPal
+{
+  static constexpr uint64_t pal_features =
+    DefaultPal::pal_features & ~static_cast<uint64_t>(LazyCommit);
+};
+
 struct T
 {
   size_t v = 99;
@@ -27,6 +38,9 @@ FlatPagemap<GRANULARITY_BITS, T, DefaultPal, false> pagemap_test_unbound;
 
 FlatPagemap<GRANULARITY_BITS, T, DefaultPal, true> pagemap_test_bound;
 
+FlatPagemap<GRANULARITY_BITS, T, NoLazyCommitPal, true>
+  pagemap_test_bound_no_lazy;
+
 size_t failure_count = 0;
 
 void check_get(
@@ -158,6 +172,30 @@ int main(int argc, char** argv)
   test_pagemap(false);
   test_pagemap(true);
 
+  // Regression test for the bounded + !LazyCommit path of get<true>.
+  // Previously, get_mut<true> base-adjusted p before calling register_range,
+  // which double-subtracted base inside register_range and tripped the
+  // out-of-range guard for legitimate in-range addresses.
+  {
+    auto size = bits::one_at_bit(GRANULARITY_BITS + 4);
+    auto* base = NoLazyCommitPal::reserve(size);
+    NoLazyCommitPal::notify_using<NoZero>(base, size);
+    auto [heap_base, heap_size] = pagemap_test_bound_no_lazy.init(base, size);
+    auto low = address_cast(heap_base);
+
+    pagemap_test_bound_no_lazy.set(low, T(7));
+
+    // get<true> with has_bounds && !LazyCommit must not error on an in-range
+    // address: the underlying register_range call sees a fully-adjusted base.
+    T value = pagemap_test_bound_no_lazy.get<true>(low);
+    if (value.v != 7)
+    {
+      std::cout << "get<true> bounded !LazyCommit: read " << value.v
+                << " expected 7" << std::endl;
+      failure_count++;
+    }
+  }
+
   if (failure_count != 0)
   {
     std::cout << "Failure count: " << failure_count << std::endl;
diff --git a/src/test/func/redblack/redblack.cc b/src/test/func/redblack/redblack.cc
index 61fccb6d3..e47138be4 100644
--- a/src/test/func/redblack/redblack.cc
+++ b/src/test/func/redblack/redblack.cc
@@ -5,6 +5,7 @@
 
 #include <algorithm>
 #include <iostream>
+#include <set>
 #include <vector>
 
 #ifndef SNMALLOC_TRACING
@@ -207,6 +208,122 @@ void test(size_t size, unsigned int seed)
   }
 }
 
+// Cross-check RBTree::neighbours against a std::set oracle under random ops.
+template<bool TRACE>
+void test_neighbours(size_t size, unsigned int seed)
+{
+  xoroshiro::p64r32 rand(seed);
+  snmalloc::RBTree<Rep, true, TRACE> tree;
+  std::set<Rep::key> oracle;
+  // Parallel vector: O(1) random victim pick, no std::advance over the set.
+  std::vector<Rep::key> entries;
+
+  auto probe = [&](Rep::key k_probe) {
+    auto result = tree.neighbours(k_probe);
+    // For an absent key, lower_bound is the successor; --it the predecessor.
+    Rep::key expected_pred = Rep::null;
+    Rep::key expected_succ = Rep::null;
+    auto it = oracle.lower_bound(k_probe);
+    if (it != oracle.begin())
+    {
+      auto prev = it;
+      --prev;
+      expected_pred = *prev;
+    }
+    if (it != oracle.end())
+      expected_succ = *it;
+
+    if (result.first != expected_pred || result.second != expected_succ)
+    {
+      std::cout << "neighbours(" << k_probe << ") mismatch:"
+                << " got (" << result.first << ", " << result.second << ")"
+                << " expected (" << expected_pred << ", " << expected_succ
+                << ")" << std::endl;
+      abort();
+    }
+  };
+
+  auto do_probes = [&]() {
+    // Boundary probes. Key 0 is Rep::null and is never inserted (insert
+    // keys are 1 + rand % size), and size + 1 is one above the maximum
+    // possible insert; both are guaranteed not to be in the tree.
+    probe(Rep::key(0));
+    if (size + 1 <= 0xFFFF)
+      probe(Rep::key(size + 1));
+    // Two random probes, skipping any that collide with the tree.
+    for (size_t p = 0; p < 2; p++)
+    {
+      Rep::key k = Rep::key(rand.next() % (size + 2));
+      if (oracle.count(k) == 0)
+        probe(k);
+    }
+  };
+
+  // Empty tree: every probe must report (null, null).
+  do_probes();
+  // `first` forces the opening iteration down the insert branch.
+  bool first = true;
+  for (size_t i = 0; i < 20 * size; i++)
+  {
+    auto batch = 1 + rand.next() % (3 + (size / 2));
+    auto op = rand.next() % 4;
+    if (op < 2 || first)
+    {
+      first = false;
+      for (auto j = batch; j > 0; j--)
+      {
+        auto k = Rep::key(1 + rand.next() % size);
+        if (tree.insert_elem(k))
+        {
+          oracle.insert(k);
+          entries.push_back(k);
+        }
+      }
+    }
+    else if (op == 3)
+    {
+      for (auto j = batch; j > 0; j--)
+      {
+        if (entries.empty())
+          break;
+        auto index = rand.next() % entries.size();
+        Rep::key elem = entries[index];
+        if (!tree.remove_elem(elem))
+        {
+          std::cout << "Failed to remove element: " << elem << std::endl;
+          abort();
+        }
+        entries.erase(entries.begin() + static_cast<int>(index));
+        oracle.erase(elem);
+      }
+    }
+    else
+    {
+      for (auto j = batch; j > 0; j--)
+      {
+        if (entries.empty())
+          break;
+        auto min = tree.remove_min();
+        Rep::key expected = *oracle.begin();
+        if (min != expected)
+        {
+          std::cout << "remove_min mismatch: tree=" << min
+                    << " oracle=" << expected << std::endl;
+          abort();
+        }
+        oracle.erase(oracle.begin());
+        entries.erase(
+          std::remove(entries.begin(), entries.end(), min), entries.end());
+      }
+    }
+
+    do_probes();
+
+    if (entries.empty())
+      break;
+  }
+}
+
 int main(int argc, char** argv)
 {
   setup();
@@ -222,6 +339,11 @@ int main(int argc, char** argv)
       for (seed = 1; seed < 5 + (8 * size); seed++)
       {
         test<false>(size, seed);
+        // Run the neighbours oracle on a handful of seeds per size: the
+        // full size range gives good tree-shape coverage, the seed cap
+        // keeps the extra cost from blowing the per-test time budget.
+        if (seed < 5)
+          test_neighbours<false>(size, seed);
       }
 
     return 0;
@@ -235,5 +357,6 @@ int main(int argc, char** argv)
 
   // Trace particular example
   test<true>(size, seed);
+  test_neighbours<true>(size, seed);
   return 0;
 }
diff --git a/src/test/func/release-rounding/rounding.cc b/src/test/func/release-rounding/rounding.cc
index 4d11eaafb..490343dd4 100644
--- a/src/test/func/release-rounding/rounding.cc
+++ b/src/test/func/release-rounding/rounding.cc
@@ -18,18 +18,49 @@ int main(int argc, char** argv)
 
   bool failed = false;
 
+  // Layout invariant: osc(sc, off).raw() == sc.raw() | (off << SIZECLASS_BITS),
+  // and the accessors invert that layout. This is load-bearing because
+  // `SizeClassTable::start(sizeclass_t)` and `start(offset_and_sizeclass_t)`
+  // both index by `.raw()`, so an offset=0 osc must hit the same table
+  // row as the bare sizeclass_t; the offset>0 row-population loop in
+  // the SizeClassTable ctor relies on the same layout. If any of this
+  // drifts, `encode()` in metadata.h would silently produce wrong bits.
+  for (smallsizeclass_t sc_small; sc_small < NUM_SMALL_SIZECLASSES; sc_small++)
+  {
+    sizeclass_t sc = sizeclass_t::from_small_class(sc_small);
+    for (size_t off = 0; off < (size_t{1} << OFFSET_BITS); off++)
+    {
+      auto osc = offset_and_sizeclass_t(sc, off);
+      size_t expected_raw = sc.raw() | (off << SIZECLASS_BITS);
+      if (
+        osc.raw() != expected_raw || osc.sizeclass() != sc ||
+        osc.offset() != off)
+      {
+        std::cout << "osc layout mismatch: sc=" << sc.raw() << " off=" << off
+                  << " -> raw=" << osc.raw() << " expected_raw=" << expected_raw
+                  << " sc'=" << osc.sizeclass().raw()
+                  << " off'=" << osc.offset() << std::endl
+                  << std::flush;
+        failed = true;
+      }
+    }
+  }
+  if (failed)
+    abort();
+
   for (smallsizeclass_t size_class; size_class < NUM_SMALL_SIZECLASSES;
        size_class++)
   {
     size_t rsize = sizeclass_to_size(size_class);
     size_t max_offset = sizeclass_to_slab_size(size_class);
     sizeclass_t sc = sizeclass_t::from_small_class(size_class);
+    offset_and_sizeclass_t osc = offset_and_sizeclass_t(sc, 0);
     for (size_t offset = 0; offset < max_offset; offset++)
     {
       size_t mod = offset % rsize;
       bool mod_0 = (offset % rsize) == 0;
 
-      size_t opt_mod = index_in_object(sc, offset);
+      size_t opt_mod = index_in_object(osc, offset);
       if (mod != opt_mod)
       {
         std::cout << "rsize " << rsize << "  offset  " << offset << "  opt "
@@ -38,7 +69,7 @@ int main(int argc, char** argv)
         failed = true;
       }
 
-      bool opt_mod_0 = is_start_of_object(sc, offset);
+      bool opt_mod_0 = is_start_of_object(osc, offset);
       if (opt_mod_0 != mod_0)
       {
         std::cout << "rsize " << rsize << "  offset  " << offset
@@ -51,5 +82,48 @@ int main(int argc, char** argv)
     if (failed)
       abort();
   }
+
+  // Exercise, end-to-end, the pow2 large sizeclasses materialised in Phase
+  // 13. For each pow2 size S above the small range (lc values that are
+  // pow2-aligned in the global exp+mantissa scheme), verify
+  // index_in_object / is_start_of_object at a representative set of offsets:
+  // the start of an object, interior offsets (1, S / 2 and the last byte
+  // S - 1), and the start of the next object. Bound the loop by
+  // ENCODED_ADDRESS_BITS so `bits::one_at_bit(b)` never shifts by >= BITS.
+  for (size_t b = MAX_SMALL_SIZECLASS_BITS + 1; b <= ENCODED_ADDRESS_BITS; b++)
+  {
+    size_t S = bits::one_at_bit(b);
+    sizeclass_t sc = size_to_sizeclass_full(S);
+    offset_and_sizeclass_t osc = offset_and_sizeclass_t(sc, 0);
+
+    address_t base = address_t(0);
+    size_t offsets[] = {0, 1, S / 2, S - 1, S};
+    for (size_t off : offsets)
+    {
+      address_t addr = base + off;
+      size_t expected_mod = off % S;
+      bool expected_start = expected_mod == 0;
+
+      size_t opt_mod = index_in_object(osc, addr);
+      if (opt_mod != expected_mod)
+      {
+        std::cout << "Large S=" << S << " offset=" << off
+                  << " index_in_object=" << opt_mod
+                  << " expected=" << expected_mod << std::endl;
+        failed = true;
+      }
+
+      bool opt_start = is_start_of_object(osc, addr);
+      if (opt_start != expected_start)
+      {
+        std::cout << "Large S=" << S << " offset=" << off
+                  << " is_start_of_object=" << opt_start
+                  << " expected=" << expected_start << std::endl;
+        failed = true;
+      }
+    }
+    if (failed)
+      abort();
+  }
   return 0;
 }
diff --git a/src/test/func/sizeclass/sizeclass.cc b/src/test/func/sizeclass/sizeclass.cc
index ac7ec6bd8..0b0c73eb3 100644
--- a/src/test/func/sizeclass/sizeclass.cc
+++ b/src/test/func/sizeclass/sizeclass.cc
@@ -67,6 +67,172 @@ void test_align_size()
     abort();
 }
 
+// Checks sizeclass_t encoding and large-class rounding; aborts on failure.
+void test_uniform_large_sizeclasses()
+{
+  using namespace snmalloc;
+  bool failed = false;
+
+  // Sentinel sanity: default-constructed sizeclass_t is the unmapped sentinel
+  // and not classified as small.
+  if (sizeclass_t{}.raw() != 0)
+  {
+    std::cout << "Default sizeclass_t raw is " << sizeclass_t{}.raw()
+              << " expected 0" << std::endl;
+    failed = true;
+  }
+  if (sizeclass_t{}.is_default() != true)
+  {
+    std::cout << "Default sizeclass_t .is_default() is false" << std::endl;
+    failed = true;
+  }
+  if (sizeclass_t{}.is_small())
+  {
+    std::cout << "Default sizeclass_t.is_small() is true" << std::endl;
+    failed = true;
+  }
+
+  // Encoding sanity: small range and large range are disjoint and adjacent
+  // in the value space.
+  if (sizeclass_t::from_small_class(smallsizeclass_t(0)).raw() != 1)
+  {
+    std::cout << "from_small_class(0).raw() != 1" << std::endl;
+    failed = true;
+  }
+  if (
+    sizeclass_t::from_small_class(smallsizeclass_t(NUM_SMALL_SIZECLASSES - 1))
+        .raw() +
+      1 !=
+    sizeclass_t::from_large_class(0).raw())
+  {
+    std::cout << "Small/large ranges are not adjacent" << std::endl;
+    failed = true;
+  }
+  if (
+    sizeclass_t::from_large_class(NUM_LARGE_CLASSES - 1).raw() >=
+    SIZECLASS_REP_SIZE)
+  {
+    std::cout << "Largest large sizeclass overflows SIZECLASS_REP_SIZE"
+              << std::endl;
+    failed = true;
+  }
+  if (!sizeclass_t::from_small_class(smallsizeclass_t(0)).is_small())
+  {
+    std::cout << "from_small_class(0).is_small() is false" << std::endl;
+    failed = true;
+  }
+  if (sizeclass_t::from_large_class(0).is_small())
+  {
+    std::cout << "from_large_class(0).is_small() is true" << std::endl;
+    failed = true;
+  }
+
+  // Large sizeclasses are strictly increasing in size with lc.
+  size_t prev_size = 0;
+  for (size_t lc = 0; lc < NUM_LARGE_CLASSES; lc++)
+  {
+    size_t size = sizeclass_full_to_size(sizeclass_t::from_large_class(lc));
+    if (size <= prev_size)
+    {
+      std::cout << "Non-monotonic large sizeclass: lc=" << lc
+                << " size=" << size << " prev=" << prev_size << std::endl;
+      failed = true;
+    }
+    prev_size = size;
+  }
+
+  // Round-trip identity on pow2 large sizes: every pow2 size S in
+  // [MAX_SMALL_SIZECLASS_SIZE * 2, MAX_LARGE_SIZECLASS_SIZE] must
+  // satisfy sizeclass_full_to_size(size_to_sizeclass_full(S)) == S.
+  // Bound the loop by ENCODED_ADDRESS_BITS so `bits::one_at_bit(b)`
+  // never shifts by >= BITS (the bound check itself would fail on
+  // 32-bit otherwise).
+  for (size_t b = MAX_SMALL_SIZECLASS_BITS + 1; b <= ENCODED_ADDRESS_BITS; b++)
+  {
+    size_t S = bits::one_at_bit(b);
+    sizeclass_t sc = size_to_sizeclass_full(S);
+    size_t rs = sizeclass_full_to_size(sc);
+    if (rs != S)
+    {
+      std::cout << "Pow2 round-trip failed: S=" << S << " round=" << rs
+                << std::endl;
+      failed = true;
+    }
+
+    // Non-pow2 spot check: for the midpoint X = 1.5 * S of [S, 2S),
+    // `size_to_sizeclass_full(X)` must select the smallest sizeclass
+    // whose size is >= X. Compute the expected size independently
+    // by scanning all large classes. Only check when 2S is still
+    // representable.
+    if (b < ENCODED_ADDRESS_BITS)
+    {
+      size_t mid = S + (S >> 1);
+      sizeclass_t sc_mid = size_to_sizeclass_full(mid);
+      size_t rs_mid = sizeclass_full_to_size(sc_mid);
+
+      // Independent computation: smallest large class size >= mid.
+      size_t expect = 0;
+      for (size_t lc = 0; lc < NUM_LARGE_CLASSES; lc++)
+      {
+        size_t sz = sizeclass_full_to_size(sizeclass_t::from_large_class(lc));
+        if (sz >= mid)
+        {
+          expect = sz;
+          break;
+        }
+      }
+      if (expect == 0)
+      {
+        std::cout << "No large class >= mid=" << mid << std::endl;
+        failed = true;
+      }
+      else if (rs_mid != expect)
+      {
+        std::cout << "Non-pow2 should round to smallest enclosing class: X="
+                  << mid << " round=" << rs_mid << " expected=" << expect
+                  << std::endl;
+        failed = true;
+      }
+    }
+  }
+
+  // `round_size` contract: for every representable large class size S,
+  // `round_size(S) == S` and `round_size(S_prev + 1) == S` (the smallest
+  // enclosing class). `DefaultConts::success` (corealloc.h) uses
+  // `round_size` to size the `calloc` zeroing range, so any drift here
+  // would over- or under-zero. This is the deterministic gate for that
+  // contract; the `calloc` smoke test in `memory.cc` would not
+  // necessarily fault on an overshoot into backend free range.
+  {
+    size_t prev = 0;
+    for (size_t lc = 0; lc < NUM_LARGE_CLASSES; lc++)
+    {
+      size_t S = sizeclass_full_to_size(sizeclass_t::from_large_class(lc));
+      if (round_size(S) != S)
+      {
+        std::cout << "round_size identity failed at large class: S=" << S
+                  << " round_size=" << round_size(S) << std::endl;
+        failed = true;
+      }
+      if (prev != 0 && prev + 1 < S)
+      {
+        size_t probe = prev + 1;
+        if (round_size(probe) != S)
+        {
+          std::cout << "round_size(prev+1) blow-up: probe=" << probe
+                    << " round_size=" << round_size(probe) << " expected=" << S
+                    << std::endl;
+          failed = true;
+        }
+      }
+      prev = S;
+    }
+  }
+
+  if (failed)
+    abort();
+}
+
 int main(int, char**)
 {
   setup();
@@ -149,4 +315,5 @@ int main(int, char**)
     abort();
 
   test_align_size();
+  test_uniform_large_sizeclasses();
 }
diff --git a/src/test/func/smallarenarange/smallarenarange.cc b/src/test/func/smallarenarange/smallarenarange.cc
new file mode 100644
index 000000000..47d6b895c
--- /dev/null
+++ b/src/test/func/smallarenarange/smallarenarange.cc
@@ -0,0 +1,765 @@
+/**
+ * Unit tests for `InplaceRep` exercised through `Arena`, followed by
+ * `SmallArenaRange` tests whose sub-chunk requests use the same arena.
+ *
+ * Distinct from the `arena` test (which uses an array-backed
+ * MockRep): here the Rep is the in-band representation, and each free
+ * block's tree-node storage lives at the block's own head bytes. Block
+ * bases come from static buffers aligned up to a chunk at runtime.
+ */
+
+#include "test/setup.h"
+#include "test/snmalloc_testlib.h"
+#include "test/xoroshiro.h"
+
+#include <algorithm>
+#include <cstdio>
+#include <cstdlib>
+#include <map>
+#include <new>
+#include <set>
+#include <snmalloc/backend_helpers/arena.h>
+#include <snmalloc/backend_helpers/authmap.h>
+#include <snmalloc/backend_helpers/inplacerep.h>
+#include <snmalloc/backend_helpers/smallarenarange.h>
+#include <vector>
+
+namespace snmalloc
+{
+  using Rep = InplaceRep<DummyAuthmap, capptr::bounds::Arena>;
+  static constexpr size_t UNIT_SIZE = Rep::UNIT_SIZE;
+  static constexpr size_t MIN_BITS = Rep::MIN_BITS;
+
+  // Arena spans one chunk's worth of space (max block size =
+  // MIN_CHUNK_SIZE - UNIT_SIZE, since the arena's MAX is exclusive).
+  static constexpr size_t MAX_SIZE_BITS = MIN_CHUNK_BITS;
+  using TestArena = Arena<Rep, MIN_BITS, MAX_SIZE_BITS>;
+
+  // Backing buffer: must be UNIT_SIZE-aligned so block bases are
+  // unit-aligned and the in-band node fields land at the expected
+  // offsets. Sized to comfortably cover the arena's full range plus
+  // a small base offset that keeps block addresses non-zero (zero
+  // is the tree null sentinel). Oversized by MIN_CHUNK_SIZE so the
+  // base can be aligned up at runtime — MSVC rejects alignas values
+  // as large as MIN_CHUNK_SIZE on static storage.
+  static unsigned char backing[3 * MIN_CHUNK_SIZE];
+
+  static uintptr_t base_addr()
+  {
+    // Align the start of `backing` up to a chunk boundary, then step one
+    // more chunk in, keeping block addresses well clear of zero.
+    constexpr auto mask = MIN_CHUNK_SIZE - 1;
+    const auto start = reinterpret_cast<uintptr_t>(&backing[0]);
+    return ((start + mask) & ~mask) + MIN_CHUNK_SIZE;
+  }
+
+  static void reset_backing()
+  {
+    for (unsigned char& byte : backing)
+      byte = 0; // clears the whole buffer, alignment slack included
+  }
+
+  static uintptr_t unit_addr(size_t unit_idx)
+  {
+    return base_addr() + unit_idx * UNIT_SIZE; // unit 0 is at base_addr()
+  }
+
+  static constexpr size_t unit_size(size_t n_units)
+  {
+    return n_units * UNIT_SIZE; // byte length of `n_units` units
+  }
+
+  // ==================================================================
+  // (A) Round-trip: variant tag and large-size storage survive
+  // independent of bin/range pointer writes.
+  // ==================================================================
+
+  static void test_variant_roundtrip()
+  {
+    reset_backing();
+    uintptr_t node = unit_addr(0);
+
+    // Every tag written to the node must be the tag read back.
+    for (auto tag :
+         {ArenaVariant::Min,
+          ArenaVariant::EvenTwo,
+          ArenaVariant::OddTwo,
+          ArenaVariant::Large})
+    {
+      Rep::set_variant(node, tag);
+      SNMALLOC_CHECK(Rep::get_variant(node) == tag);
+    }
+
+    // Flipping the bin-tree red bit on and then off again must leave
+    // the variant tag untouched.
+    Rep::set_variant(node, ArenaVariant::OddTwo);
+    for (bool red : {true, false})
+    {
+      Rep::BinRep::set_red(node, red);
+      SNMALLOC_CHECK(Rep::BinRep::is_red(node) == red);
+      SNMALLOC_CHECK(Rep::get_variant(node) == ArenaVariant::OddTwo);
+    }
+    printf("  Variant + red roundtrip: OK\n");
+  }
+
+  static void test_large_size_roundtrip()
+  {
+    reset_backing();
+    uintptr_t node = unit_addr(0);
+    // Store several multi-unit sizes at one node; each must read back.
+    for (size_t n_units : {size_t(3), size_t(7), size_t(17), size_t(125)})
+    {
+      size_t bytes = unit_size(n_units);
+      Rep::set_large_size(node, bytes);
+      SNMALLOC_CHECK(Rep::get_large_size(node) == bytes);
+    }
+    printf("  Large-size roundtrip: OK\n");
+  }
+
+  // ==================================================================
+  // (B) Bin-tree and range-tree red bits live in different units and
+  // must not alias.
+  // ==================================================================
+
+  static void test_red_bits_independent()
+  {
+    reset_backing();
+    uintptr_t node = unit_addr(0);
+
+    // Drive the two red bits to opposite colours, first with the bin
+    // bit set and then with the range bit set; each must read back
+    // exactly what was last written to it.
+    for (bool bin_red : {true, false})
+    {
+      Rep::BinRep::set_red(node, bin_red);
+      Rep::RangeRep::set_red(node, !bin_red);
+      SNMALLOC_CHECK(Rep::BinRep::is_red(node) == bin_red);
+      SNMALLOC_CHECK(Rep::RangeRep::is_red(node) == !bin_red);
+    }
+    printf("  Bin/range red bits independent: OK\n");
+  }
+
+  // ==================================================================
+  // (B2) `can_consolidate` refuses chunk-boundary merges.
+  // SmallArenaRange splits incoming ranges at chunk boundaries, but
+  // adjacent intra-chunk fragments meeting at a boundary would
+  // otherwise be merged by Arena. The predicate is what
+  // prevents that.
+  // ==================================================================
+
+  static void test_can_consolidate_chunk_boundary()
+  {
+    constexpr size_t chunk = MIN_CHUNK_SIZE;
+
+    // The higher block starts exactly on a chunk boundary: must refuse.
+    SNMALLOC_CHECK(!Rep::can_consolidate(chunk));
+    SNMALLOC_CHECK(!Rep::can_consolidate(chunk * 2));
+    // The higher block starts inside a chunk: merging is permitted.
+    SNMALLOC_CHECK(Rep::can_consolidate(UNIT_SIZE));
+    SNMALLOC_CHECK(Rep::can_consolidate(chunk - UNIT_SIZE));
+    SNMALLOC_CHECK(Rep::can_consolidate(chunk + UNIT_SIZE));
+    printf("  can_consolidate chunk-boundary refuse: OK\n");
+  }
+
+  // ==================================================================
+  // (C) Through the arena: add a single block and remove it.
+  // ==================================================================
+
+  static void test_arena_add_remove_single()
+  {
+    reset_backing();
+    TestArena arena;
+    arena.check_invariant(true);
+
+    // Insert one 4-unit block; nothing may spill back to the caller.
+    auto block = unit_addr(0);
+    const auto [spill_base, spill_size] = arena.add_block(block, unit_size(4));
+    SNMALLOC_CHECK(spill_base == 0 && spill_size == 0);
+    arena.check_invariant(true);
+
+    // An exact-size request must hand the very same block back.
+    SNMALLOC_CHECK(arena.remove_block(unit_size(4)) == block);
+    arena.check_invariant(true);
+    printf("  Arena add/remove single: OK\n");
+  }
+
+  // ==================================================================
+  // (D) Consolidation across two adjacent free blocks.
+  // ==================================================================
+
+  static void test_arena_consolidation()
+  {
+    reset_backing();
+    TestArena arena;
+
+    // Add two adjacent 4-unit blocks, lower one first. Neither insert
+    // may spill an overflow block back to the caller.
+    const auto lower = unit_addr(0);
+    for (size_t unit_idx : {size_t(0), size_t(4)})
+    {
+      auto [ov_a, ov_s] = arena.add_block(unit_addr(unit_idx), unit_size(4));
+      SNMALLOC_CHECK(ov_a == 0 && ov_s == 0);
+      arena.check_invariant(true);
+    }
+
+    // The two neighbours must have merged: a single 8-unit request is
+    // satisfied, starting at the lower block's base.
+    SNMALLOC_CHECK(arena.remove_block(unit_size(8)) == lower);
+    arena.check_invariant(true);
+    printf("  Arena consolidation: OK\n");
+  }
+
+  // ==================================================================
+  // (E) Carving: request a smaller size than the free block has.
+  // ==================================================================
+
+  static void test_arena_carve()
+  {
+    reset_backing();
+    TestArena arena;
+
+    // Seed the arena with one 8-unit block.
+    arena.add_block(unit_addr(0), unit_size(8));
+    arena.check_invariant(true);
+
+    // Ask for less than the block holds; the arena has to split it.
+    const auto carved = arena.remove_block(unit_size(3));
+    SNMALLOC_CHECK(carved != 0);
+    arena.check_invariant(true);
+
+    // Drain what is left one unit at a time until the arena reports
+    // empty. Together with the 3 units above that must account for
+    // all 8 units originally added.
+    size_t units_out = 3;
+    for (auto unit = arena.remove_block(unit_size(1)); unit != 0;
+         unit = arena.remove_block(unit_size(1)))
+    {
+      units_out++;
+      arena.check_invariant(true);
+    }
+    SNMALLOC_CHECK(units_out == 8);
+
+    printf("  Arena carve + drain: OK\n");
+  }
+
+  // ==================================================================
+  // (F) Randomised stress: oracle-checked add/remove over a single
+  // chunk's worth of units. Equivalent to the MockRep stress test in
+  // shape but operates on real in-band storage.
+  // ==================================================================
+
+  static constexpr size_t STRESS_UNITS =
+    (size_t(1) << MAX_SIZE_BITS) / UNIT_SIZE - 1;
+
+  using Bins = ArenaBins<2, MIN_BITS>;
+
+  struct OracleRange
+  {
+    size_t addr_units; // start, in units relative to base_addr()
+    size_t size_units; // length, in units
+    // Orders by start only, so a std::set keys entries on `addr_units`.
+    bool operator<(const OracleRange& o) const
+    {
+      return addr_units < o.addr_units;
+    }
+  };
+
+  // Mirrors the arena's bin-based allocator: classify entries into
+  // bins, pick the bin via the bitmap's find_for_request, then
+  // pick the lowest-address entry within that bin and carve.
+  class Oracle
+  {
+    std::set<OracleRange> ranges; // free ranges, ordered by start unit
+
+  public:
+    void add(size_t addr_units, size_t size_units)
+    {
+      OracleRange key{addr_units, size_units};
+      auto it = ranges.lower_bound(key);
+      // Start from the donated range and grow it over adjacent neighbours.
+      size_t new_addr = addr_units;
+      size_t new_size = size_units;
+      // Absorb the successor if it begins exactly where this range ends.
+      if (it != ranges.end() && it->addr_units == new_addr + new_size)
+      {
+        new_size += it->size_units;
+        it = ranges.erase(it);
+      }
+      // Absorb the predecessor if it ends exactly where this range begins.
+      if (it != ranges.begin())
+      {
+        auto prev = std::prev(it);
+        if (prev->addr_units + prev->size_units == new_addr)
+        {
+          new_addr = prev->addr_units;
+          new_size += prev->size_units;
+          ranges.erase(prev);
+        }
+      }
+      // Store the (possibly merged) range as a single entry.
+      ranges.insert({new_addr, new_size});
+    }
+
+    // Returns {addr_units, len_units} or {0, 0} if nothing fits.
+    std::pair<size_t, size_t> remove(size_t n_units)
+    {
+      size_t n_bytes = n_units * UNIT_SIZE;
+      if (n_bytes == 0 || n_bytes > Bins::max_supported_size())
+        return {0, 0};
+      // Rebuild the bin bitmap from scratch and group entries by bin.
+      typename Bins::Bitmap bm{};
+      std::map<size_t, std::vector<std::set<OracleRange>::iterator>> by_bin;
+
+      for (auto it = ranges.begin(); it != ranges.end(); ++it)
+      {
+        typename Bins::range_t r{
+          unit_addr(it->addr_units), it->size_units * UNIT_SIZE};
+        size_t bin = bm.add(r);
+        by_bin[bin].push_back(it);
+      }
+      // Let the bitmap choose which bin serves a request of this size.
+      size_t bin_id = bm.find_for_request(n_bytes);
+      if (bin_id == SIZE_MAX)
+        return {0, 0};
+      // Within the chosen bin take the entry with the lowest address.
+      auto& entries = by_bin[bin_id];
+      auto best_it = entries[0];
+      for (size_t i = 1; i < entries.size(); i++)
+      {
+        if (entries[i]->addr_units < best_it->addr_units)
+          best_it = entries[i];
+      }
+
+      OracleRange block = *best_it;
+      ranges.erase(best_it);
+      // Split off the request; re-insert any non-empty pre/post remainder.
+      auto carved = Bins::carve(
+        {unit_addr(block.addr_units), block.size_units * UNIT_SIZE}, n_bytes);
+      auto base = base_addr();
+      if (carved.pre.size != 0)
+        ranges.insert(
+          {(carved.pre.base - base) / UNIT_SIZE, carved.pre.size / UNIT_SIZE});
+      if (carved.post.size != 0)
+        ranges.insert(
+          {(carved.post.base - base) / UNIT_SIZE,
+           carved.post.size / UNIT_SIZE});
+
+      return {
+        (carved.req.base - base) / UNIT_SIZE, carved.req.size / UNIT_SIZE};
+    }
+  };
+
+  static void test_stress_seed(size_t seed, size_t num_ops)
+  {
+    reset_backing();
+    TestArena arena;
+    Oracle oracle;
+    // Replay `num_ops` seeded random adds/removes, checking the arena
+    // against the oracle. Every unit starts allocated (outside the arena).
+    std::vector<bool> allocated(STRESS_UNITS, true);
+
+    xoroshiro::p128r64 rng(seed);
+    // `do_add` is true for two of the three possible remainders.
+    for (size_t op = 0; op < num_ops; op++)
+    {
+      bool do_add = (rng.next() % 3) != 0;
+
+      if (do_add)
+      {
+        size_t max_size = STRESS_UNITS / 4;
+        if (max_size < 1)
+          max_size = 1;
+        size_t size = (rng.next() % max_size) + 1;
+        size_t start = rng.next() % STRESS_UNITS;
+        // First run of allocated units at or after `start`, at most `size`.
+        bool found = false;
+        for (size_t try_start = start; try_start < STRESS_UNITS; try_start++)
+        {
+          size_t actual = 0;
+          for (size_t j = try_start; j < STRESS_UNITS && j < try_start + size;
+               j++)
+          {
+            if (!allocated[j])
+              break;
+            actual++;
+          }
+          if (actual >= 1)
+          {
+            size = actual;
+            start = try_start;
+            found = true;
+            break;
+          }
+        }
+        if (!found)
+          continue;
+        // Flip the run to free, then donate it to the arena.
+        for (size_t j = start; j < start + size; j++)
+          allocated[j] = false;
+
+        auto result = arena.add_block(unit_addr(start), unit_size(size));
+        if (result.first == 0)
+          oracle.add(start, size);
+        else
+        {
+          // Overflow: arena spilled the consolidated block back to
+          // the caller. Treat as if everything went back to
+          // "allocated"; clear the oracle.
+          for (size_t j = 0; j < STRESS_UNITS; j++)
+            allocated[j] = true;
+          oracle = Oracle{};
+        }
+        arena.check_invariant(true);
+      }
+      else
+      {
+        size_t max_req = STRESS_UNITS / 4;
+        if (max_req < 1)
+          max_req = 1;
+        size_t n = (rng.next() % max_req) + 1;
+        // Arena and oracle must agree: both fail, or both pick this address.
+        auto arena_addr = arena.remove_block(unit_size(n));
+        auto [o_start, o_len] = oracle.remove(n);
+
+        if (o_len == 0)
+        {
+          SNMALLOC_CHECK(arena_addr == 0);
+        }
+        else
+        {
+          SNMALLOC_CHECK(arena_addr != 0);
+          SNMALLOC_CHECK(arena_addr == unit_addr(o_start));
+          for (size_t j = o_start; j < o_start + o_len; j++)
+            allocated[j] = true;
+        }
+        arena.check_invariant(true);
+      }
+    }
+  }
+
+  static void test_stress()
+  {
+    constexpr size_t NUM_SEEDS = 30;
+    constexpr size_t NUM_OPS = 500;
+    for (size_t done = 0; done < NUM_SEEDS; done++)
+      test_stress_seed(done + 1, NUM_OPS); // seeds are 1..NUM_SEEDS
+    printf("  Stress (%zu seeds x %zu ops): OK\n", NUM_SEEDS, NUM_OPS);
+  }
+
+  // ==================================================================
+  // (G) SmallArenaRange — chunk-granularity parent + sub-chunk
+  // sub-allocations served by the in-band arena.
+  // ==================================================================
+
+  // Pool of chunk-aligned buffers, handed out as a chunk-granularity
+  // parent range to SmallArenaRange. Oversized by MIN_CHUNK_SIZE so
+  // `pool_base()` can align up at runtime — MSVC rejects alignas
+  // values as large as MIN_CHUNK_SIZE on static storage.
+  static constexpr size_t POOL_CHUNKS = 8;
+  static unsigned char pool_storage[(POOL_CHUNKS + 1) * MIN_CHUNK_SIZE];
+  static bool pool_in_use[POOL_CHUNKS];
+  // Track returns to detect leaks / double-frees.
+  static size_t pool_alloc_count;
+  static size_t pool_dealloc_count;
+
+  static unsigned char* pool_base()
+  {
+    constexpr auto mask = MIN_CHUNK_SIZE - 1; // low bits cleared by rounding
+    const auto start = reinterpret_cast<uintptr_t>(&pool_storage[0]);
+    return reinterpret_cast<unsigned char*>((start + mask) & ~mask);
+  }
+
+  static void reset_pool()
+  {
+    pool_alloc_count = 0;
+    pool_dealloc_count = 0;
+    for (bool& in_use : pool_in_use)
+      in_use = false;
+    for (unsigned char& byte : pool_storage)
+      byte = 0;
+  }
+
+  class MockParent
+  {
+  public:
+    static constexpr bool Aligned = true;
+    static constexpr bool ConcurrencySafe = true;
+    using ChunkBounds = capptr::bounds::Arena;
+
+    constexpr MockParent() = default;
+
+    CapPtr<void, ChunkBounds> alloc_range(size_t size)
+    {
+      SNMALLOC_CHECK(size == MIN_CHUNK_SIZE);
+      for (size_t i = 0; i < POOL_CHUNKS; i++)
+      {
+        if (pool_in_use[i])
+          continue;
+        pool_in_use[i] = true; // lowest-indexed free chunk wins
+        pool_alloc_count++;
+        return CapPtr<void, ChunkBounds>::unsafe_from(
+          pool_base() + i * MIN_CHUNK_SIZE);
+      }
+      return nullptr; // every pool chunk is already handed out
+    }
+
+    void dealloc_range(CapPtr<void, ChunkBounds> base, size_t size)
+    {
+      SNMALLOC_CHECK(size == MIN_CHUNK_SIZE);
+      auto p = static_cast<unsigned char*>(base.unsafe_ptr());
+      auto idx = static_cast<size_t>(p - pool_base()) / MIN_CHUNK_SIZE;
+      SNMALLOC_CHECK(idx < POOL_CHUNKS);
+      SNMALLOC_CHECK(p == pool_base() + idx * MIN_CHUNK_SIZE); // not mid-chunk
+      SNMALLOC_CHECK(pool_in_use[idx]);
+      pool_in_use[idx] = false;
+      pool_dealloc_count++;
+    }
+  };
+
+  using SmallArena = SmallArenaRange<DummyAuthmap>::Type<MockParent>;
+
+  static void test_small_arena_basic()
+  {
+    reset_pool();
+    SmallArena r;
+
+    // The very first request forces one chunk to be pulled from the
+    // parent; it is the only parent allocation this test may see.
+    auto one_unit = r.alloc_range(unit_size(1));
+    SNMALLOC_CHECK(one_unit != nullptr);
+    SNMALLOC_CHECK(pool_alloc_count == 1);
+
+    // Later sub-chunk requests are carved from that same chunk. Both
+    // 3 and 5 units are non-power-of-two sizes.
+    auto three_units = r.alloc_range(unit_size(3));
+    SNMALLOC_CHECK(three_units != nullptr);
+    SNMALLOC_CHECK(pool_alloc_count == 1);
+    auto five_units = r.alloc_range(unit_size(5));
+    SNMALLOC_CHECK(five_units != nullptr);
+    SNMALLOC_CHECK(pool_alloc_count == 1);
+
+    r.dealloc_range(one_unit, unit_size(1));
+    r.dealloc_range(three_units, unit_size(3));
+    r.dealloc_range(five_units, unit_size(5));
+
+    printf("  SmallArenaRange basic alloc/dealloc: OK\n");
+  }
+
+  static void test_small_arena_chunk_pass_through()
+  {
+    reset_pool();
+    SmallArena r;
+
+    // A request of exactly one chunk is expected to be forwarded to
+    // the parent: one parent alloc on the way out ...
+    auto chunk = r.alloc_range(MIN_CHUNK_SIZE);
+    SNMALLOC_CHECK(chunk != nullptr);
+    SNMALLOC_CHECK(pool_alloc_count == 1);
+
+    // ... and one parent dealloc when the chunk is handed back.
+    r.dealloc_range(chunk, MIN_CHUNK_SIZE);
+    SNMALLOC_CHECK(pool_dealloc_count == 1);
+    printf("  SmallArenaRange chunk pass-through: OK\n");
+  }
+
+  static void test_small_arena_unaligned_dealloc()
+  {
+    reset_pool();
+    SmallArena r;
+
+    // Take four units so there is an address range to hand back.
+    auto block = r.alloc_range(unit_size(4));
+    SNMALLOC_CHECK(block != nullptr);
+
+    // Donate an unaligned spare, as make()'s spare-seed donation does:
+    // base one byte past a unit boundary, length one byte short of
+    // four units. Sub-unit edges must be silently discarded.
+    auto spare = pointer_offset(block, 1);
+    r.dealloc_range(spare, unit_size(4) - 1);
+
+    // Sub-chunk fragments stay in the arena, so the parent must not
+    // have been given anything back.
+    SNMALLOC_CHECK(pool_dealloc_count == 0);
+
+    printf("  SmallArenaRange unaligned dealloc: OK\n");
+  }
+
+  static void test_small_arena_consolidation_returns_chunk()
+  {
+    reset_pool();
+    SmallArena r;
+
+    // Carve one chunk's worth of single-unit blocks, remembering every
+    // pointer so that the same units can all be handed back below.
+    constexpr size_t N = MIN_CHUNK_SIZE / UNIT_SIZE;
+    std::vector<CapPtr<void, capptr::bounds::Arena>> ps;
+    ps.reserve(N);
+    for (size_t i = 0; i < N; i++)
+    {
+      ps.push_back(r.alloc_range(UNIT_SIZE));
+      SNMALLOC_CHECK(ps.back() != nullptr);
+    }
+    // At least one refill must have happened to serve the requests.
+    // NOTE(review): N units equal one chunk, so a single refill looks
+    // likely, but only `>= 1` is asserted here - confirm before tightening.
+    SNMALLOC_CHECK(pool_alloc_count >= 1);
+
+    size_t deallocs_before = pool_dealloc_count;
+    for (auto p : ps)
+      r.dealloc_range(p, UNIT_SIZE);
+
+    // Once every unit is free again the fragments should coalesce
+    // into a whole chunk, which the range donates back to the parent.
+    SNMALLOC_CHECK(pool_dealloc_count > deallocs_before);
+
+    printf("  SmallArenaRange consolidation returns chunk: OK\n");
+  }
+
+  // alloc_size_with_align
+
+  static void test_alloc_size_with_align_exact()
+  {
+    reset_pool();
+    SmallArena r;
+
+    // Whole number of units, aligned to a single unit.
+    size_t align = UNIT_SIZE;
+    size_t size = unit_size(4);
+    auto block = r.alloc_size_with_align(size, align);
+    SNMALLOC_CHECK(block != nullptr);
+    SNMALLOC_CHECK((address_cast(block) & (align - 1)) == 0);
+    r.dealloc_range(block, size);
+    printf("  alloc_size_with_align exact (no leftover): OK\n");
+  }
+
+  static void test_alloc_size_with_align_pow2_align_over_size()
+  {
+    reset_pool();
+    SmallArena r;
+
+    size_t align = 256;
+    SNMALLOC_CHECK(bits::is_pow2(align));
+    SNMALLOC_CHECK(align >= UNIT_SIZE);
+    SNMALLOC_CHECK(align <= MIN_CHUNK_SIZE);
+
+    // A size that is not a whole number of units.
+    size_t size = unit_size(3) + 2;
+    auto p = r.alloc_size_with_align(size, align);
+    SNMALLOC_CHECK(p != nullptr);
+    SNMALLOC_CHECK((address_cast(p) & (align - 1)) == 0);
+
+    // `used` is what the caller keeps; the gap from there up to `size`
+    // rounded to `align` is the tail this test expects to be reusable.
+    size_t used = bits::align_up(size, UNIT_SIZE);
+    size_t tail_size = bits::align_up(size, align) - used;
+    SNMALLOC_CHECK(tail_size > 0);
+
+    // The tail request must be served without a second parent refill;
+    // which address it lands on is deliberately not checked.
+    auto tail = r.alloc_range(tail_size);
+    SNMALLOC_CHECK(tail != nullptr);
+    SNMALLOC_CHECK(pool_alloc_count == 1);
+    r.dealloc_range(p, used);
+    r.dealloc_range(tail, tail_size);
+    printf("  alloc_size_with_align pow2 align over non-pow2 size: OK\n");
+  }
+
+  static void test_alloc_size_with_align_align_larger_than_size()
+  {
+    reset_pool();
+    SmallArena r;
+
+    // Size just under one alignment quantum (the motivating use case).
+    size_t align = 4096;
+    size_t size = align - 254;
+    SNMALLOC_CHECK(align <= MIN_CHUNK_SIZE);
+    auto p = r.alloc_size_with_align(size, align);
+    SNMALLOC_CHECK(p != nullptr);
+    SNMALLOC_CHECK((address_cast(p) & (align - 1)) == 0);
+
+    // The slack up to `align` must be served without another refill.
+    size_t used = bits::align_up(size, UNIT_SIZE);
+    size_t slack = align - used;
+    auto tail = r.alloc_range(slack);
+    SNMALLOC_CHECK(tail != nullptr);
+    SNMALLOC_CHECK(pool_alloc_count == 1);
+    r.dealloc_range(p, used);
+    r.dealloc_range(tail, slack);
+    printf("  alloc_size_with_align align > size: OK\n");
+  }
+
+  static void test_alloc_size_with_align_chunk_bypass()
+  {
+    reset_pool();
+    SmallArena r;
+
+    // Chunk alignment, size just under a chunk: rounds to one whole chunk.
+    size_t size = MIN_CHUNK_SIZE - 100;
+    size_t align = MIN_CHUNK_SIZE;
+    auto p = r.alloc_size_with_align(size, align);
+    SNMALLOC_CHECK(p != nullptr);
+    SNMALLOC_CHECK((address_cast(p) & (align - 1)) == 0);
+    SNMALLOC_CHECK(pool_alloc_count == 1);
+
+    // A whole chunk comes straight from the parent, leaving nothing
+    // carved off in the arena; the donated tail is therefore the only
+    // free fragment and its address can be pinned exactly. The tail
+    // is smaller than a chunk, so nothing returns to the parent.
+    SNMALLOC_CHECK(pool_dealloc_count == 0);
+    size_t used = bits::align_up(size, UNIT_SIZE);
+    if (used < MIN_CHUNK_SIZE)
+    {
+      size_t tail_size = MIN_CHUNK_SIZE - used;
+      auto tail = r.alloc_range(tail_size);
+      SNMALLOC_CHECK(tail != nullptr);
+      SNMALLOC_CHECK(address_cast(tail) == address_cast(p) + used);
+      r.dealloc_range(tail, tail_size);
+    }
+    r.dealloc_range(p, used);
+    printf("  alloc_size_with_align chunk-sized bypass: OK\n");
+  }
+} // namespace snmalloc
+
+int main()
+{
+  // Call setup() first, as main() in the neighbouring func test does.
+  setup();
+  printf("--- InplaceRep tests ---\n");
+  printf(
+    "  UNIT_SIZE=%zu, MIN_BITS=%zu, MAX_SIZE_BITS=%zu, STRESS_UNITS=%zu\n",
+    snmalloc::UNIT_SIZE,
+    snmalloc::MIN_BITS,
+    snmalloc::MAX_SIZE_BITS,
+    snmalloc::STRESS_UNITS);
+  printf("(A) Accessor round-trips:\n");
+  snmalloc::test_variant_roundtrip();
+  snmalloc::test_large_size_roundtrip();
+
+  printf("(B) Red bits independent:\n");
+  snmalloc::test_red_bits_independent();
+  snmalloc::test_can_consolidate_chunk_boundary();
+
+  printf("(C) Arena add/remove:\n");
+  snmalloc::test_arena_add_remove_single();
+
+  printf("(D) Arena consolidation:\n");
+  snmalloc::test_arena_consolidation();
+
+  printf("(E) Arena carve:\n");
+  snmalloc::test_arena_carve();
+
+  printf("(F) Stress:\n");
+  snmalloc::test_stress();
+
+  printf("(G) SmallArenaRange:\n");
+  snmalloc::test_small_arena_basic();
+  snmalloc::test_small_arena_chunk_pass_through();
+  snmalloc::test_small_arena_unaligned_dealloc();
+  snmalloc::test_small_arena_consolidation_returns_chunk();
+  snmalloc::test_alloc_size_with_align_exact();
+  snmalloc::test_alloc_size_with_align_pow2_align_over_size();
+  snmalloc::test_alloc_size_with_align_align_larger_than_size();
+  snmalloc::test_alloc_size_with_align_chunk_bypass();
+  printf("All InplaceRep tests passed.\n");
+  return 0;
+}
diff --git a/src/test/snmalloc_testlib.h b/src/test/snmalloc_testlib.h
index 5b51ff7bd..00b0513e4 100644
--- a/src/test/snmalloc_testlib.h
+++ b/src/test/snmalloc_testlib.h
@@ -41,10 +41,18 @@ namespace snmalloc
   void dealloc(void* p, size_t size);
   void dealloc(void* p, size_t size, size_t align);
 
-  template<size_t size>
+  /**
+   * Compile-time sized dealloc with optional alignment.
+   *
+   * The `align` parameter mirrors the `align` parameter on the
+   * `alloc<size, ZeroMem, align>` overload below: it is applied via
+   * `aligned_size` so the size fed to the sized-dealloc sanity check
+   * matches the size that was actually reserved.
+   */
+  template<size_t size, size_t align = 1>
   inline void dealloc(void* p)
   {
-    dealloc(p, size);
+    dealloc(p, aligned_size(align, size));
   }
 
   void debug_teardown();
@@ -115,12 +123,13 @@ namespace snmalloc
    * goes straight to the sizeclass-based fast path.  Otherwise falls back
    * to the dynamic alloc.
    */
-  template<size_t size, ZeroMem zero_mem = ZeroMem::NoZero>
+  template<size_t size, ZeroMem zero_mem = ZeroMem::NoZero, size_t align = 1>
   inline void* alloc()
   {
-    if constexpr (is_small_sizeclass(size))
+    constexpr size_t sz = aligned_size(align, size);
+    if constexpr (is_small_sizeclass(sz))
     {
-      constexpr auto sc = size_to_sizeclass_const(size);
+      constexpr auto sc = size_to_sizeclass_const(sz);
       if constexpr (zero_mem == ZeroMem::YesZero)
       {
         return libc::malloc_small_zero(sc);
@@ -132,7 +141,7 @@ namespace snmalloc
     }
     else
     {
-      return alloc<zero_mem>(size);
+      return alloc<zero_mem>(sz);
     }
   }
 } // namespace snmalloc