From 7ac917064cc0ecbd2e15f6958ca0c43da2147132 Mon Sep 17 00:00:00 2001 From: Matthew Parkinson Date: Fri, 15 May 2026 11:48:10 +0100 Subject: [PATCH 01/31] Base idea Co-authored-by: Copilot --- IDEA.md | 125 +++++++++++++ prototype/servable_sets.py | 351 +++++++++++++++++++++++++++++++++++++ prototype/skip_analysis.py | 226 ++++++++++++++++++++++++ 3 files changed, 702 insertions(+) create mode 100644 IDEA.md create mode 100644 prototype/servable_sets.py create mode 100644 prototype/skip_analysis.py diff --git a/IDEA.md b/IDEA.md new file mode 100644 index 000000000..15d2c5282 --- /dev/null +++ b/IDEA.md @@ -0,0 +1,125 @@ +# Bitmap-Indexed Coalescing Range + +## The problem + +snmalloc's `LargeBuddyRange` only stores power-of-two blocks. A request for 5 +chunks must be served from an 8-chunk buddy block, wasting 3 chunks. We want +to store blocks at their actual size and use snmalloc's full size class +sequence at the range level. + +## The core idea: search upward, skip a mask + +Free blocks are binned by the set of size classes they can serve. To allocate, +search upward through bins — any larger block can be carved down. This almost +works perfectly, but some bins hold blocks whose alignment is too poor to +serve certain smaller, more-aligned sizes. Those bins must be masked out +during the search. + +The mechanism: `find_first_set(bitmap & ~skip_mask)`. The skip mask depends +only on the requested size class, not on the block. It's a small constant +that can be precomputed. + +## Why skips exist + +snmalloc's size classes follow `S = 2^e + m · 2^(e−B)`, where `B` is the +number of intermediate bits. Each size class has a natural alignment +`align(S) = S & ~(S−1)`. + +A size class with high alignment needs padding to reach an aligned address +within a block. A block of a *larger* size class with *lower* alignment may +not have room for that padding. 
Concretely: a block of size 5 at address 1 +can serve size 5 (alignment 1) but cannot serve size 4 (alignment 4) — there +aren't enough chunks left after padding to the first 4-aligned address. + +Same size block, different address, different capability. This is what creates +the need for separate bins and skip masks. + +## The general structure + +At each exponent level, the distinct "servable sets" (which size classes a +block can serve) form a structure with some incomparable pairs. Exhaustive +enumeration shows: + +| B | Mantissas/exponent | Bins/exponent | Max skip mask bits | +|---|-------------------:|--------------:|-------------------:| +| 1 | 2 | 2 | 0 | +| 2 | 4 | 5 | 1 | +| 3 | 8 | 13 | 4 | +| 4 | 16 | 34 | 11 | + +Each bin corresponds to a distinct servable set. The bins are ordered so that +upward search is almost always correct — the skip mask handles the exceptions. + +For any B, the structure is: +- **Most requests need no skips.** Only size classes with alignment higher + than expected for their position in the sequence need to mask anything. +- **The skip mask is a small constant** per size class, precomputable at + compile time. +- **The mechanism is identical** regardless of B: + `find_first_set(bitmap & ~skip_mask, start_bit)`. + +`prototype/skip_analysis.py` verifies this exhaustively for B = 1, 2, 3, 4. + +## The bitmap design + +Each free block gets one bin based on its size and alignment. Within each +exponent, there are as many bins as there are distinct servable sets (5 for +B=2, 13 for B=3). A flat bitmap tracks which bins are non-empty. + +To allocate size class `(e, m)`: + +1. Compute the **start bit** — the first bin that could serve this size class. +2. Compute the **skip mask** — bits for bins that can't serve this request. +3. `find_first_set(bitmap & ~skip_mask, start_bit)` → pop a block from that + bin. + +The returned block may not be exactly aligned for the requested size class. 
+The caller **carves** the aligned region and returns any prefix/suffix +remainders to the free pool. + +## Contrast with buddy allocators + +A buddy allocator guarantees alignment by construction — a 16-chunk buddy is +always 16-aligned — but wastes space by decomposing everything into +power-of-two pieces. + +This design stores blocks at their actual size (no decomposition, no waste) +and handles alignment at allocation time by carving. The skip mask makes +lookup O(1) despite blocks having arbitrary size and alignment. + +## Concrete example (B = 2) + +At exponent `e = 2`, the size classes are 4, 5, 6, 7. There are 5 bins, +each labeled by the set of size classes it can serve at this exponent: + + Bin 0: serves {4} + Bin 1: serves {5} + Bin 2: serves {4, 5} + Bin 3: serves {4, 5, 6} + Bin 4: serves {4, 5, 6, 7} + +Allocation searches upward from the smallest sufficient bin: + + Request for 7: can use bin 4 → search bits {4} + Request for 6: can use bins 3, 4 → search bits {3, 4} + Request for 5: can use bins 1, 2, 3, 4 → search bits {1, 2, 3, 4} + Request for 4: can use bins 0, 2, 3, 4 — skip 1 → search bits {0, 2, 3, 4} + +Only the request for size 4 needs to skip a bin: bin 1 holds blocks that can +serve 5 but not 4. The skip mask is just bit 1. + +## Concrete example (B = 3) + +At exponent `e = 4`, the size classes are 16, 18, 20, 22, 24, 26, 28, 30. +There are 13 bins. The skip analysis shows: + + Request for 16 (align 16): must skip bins for {18}, {20}, {22}, {26} + Request for 24 (align 8): must skip bin for {26} + All other requests: no skips needed + +The pattern: size 16 has high alignment and must skip 4 bins whose blocks +are large enough but too poorly aligned. Size 24 is a "sub-power-of-two" +(alignment 8) and must skip 1 bin. All odd-coefficient sizes have low +alignment and never need to skip anything. + +Same mechanism, wider mask, same `find_first_set(bitmap & ~mask)` operation. 
diff --git a/prototype/servable_sets.py b/prototype/servable_sets.py new file mode 100644 index 000000000..3acdcff5d --- /dev/null +++ b/prototype/servable_sets.py @@ -0,0 +1,351 @@ +#!/usr/bin/env python3 +""" +Exhaustive analysis of servable sets for snmalloc's size classes. + +For every possible free block (offset, size) in a 512-chunk arena, +compute which snmalloc size classes can be allocated from that block, +respecting the natural alignment constraint: + align(S) = S & ~(S-1) (largest power of 2 dividing S) + +A block at offset `a` with size `n` can serve size class `S` iff +there exists an address `x` within [a, a+n-S] such that x is a +multiple of align(S): + first_aligned = ceil(a / align(S)) * align(S) + servable iff first_aligned + S <= a + n +""" + +ARENA = 512 +B = 2 # INTERMEDIATE_BITS + + +def gen_size_classes(max_size): + """Generate snmalloc size classes: S = 2^e + m * 2^{e-B}.""" + classes = set() + classes.add(1) + classes.add(2) + classes.add(3) + e = 2 + while True: + base = 1 << e + step = 1 << (e - B) + for m in range(1 << B): + s = base + m * step + if s > max_size: + break + classes.add(s) + if base > max_size: + break + e += 1 + return sorted(classes) + + +def natural_align(x): + """Largest power of 2 dividing x. 
For 0, return a large value.""" + if x == 0: + return 1 << 30 + return x & (-x) + + +def can_serve(addr, block_size, sizeclass): + """Can a block at `addr` of `block_size` chunks serve `sizeclass`?""" + A = natural_align(sizeclass) + first_aligned = ((addr + A - 1) // A) * A + return first_aligned + sizeclass <= addr + block_size + + +def get_exponent_mantissa(s): + """Return (exponent, mantissa) for size class s with B=2.""" + if s == 1: + return (0, 0) + if s == 2: + return (1, 0) + if s == 3: + return (1, 1) + e = 2 + while True: + base = 1 << e + step = 1 << (e - B) + for m in range(4): + if base + m * step == s: + return (e, m) + e += 1 + if base > s * 2: + return None + + +def main(): + size_classes = gen_size_classes(ARENA) + print(f"Size classes: {size_classes}") + print(f"Count: {len(size_classes)}") + print() + + # Show alignment for each size class + print("Size class alignments:") + for sc in size_classes: + em = get_exponent_mantissa(sc) + print(f" S={sc:>4d} align={natural_align(sc):>4d} (e={em[0]}, m={em[1]})") + print() + + # ================================================================ + # Step 1: Compute ALL unique servable sets + # ================================================================ + all_sets = {} # frozenset -> list of (addr, size) examples + for a in range(ARENA): + for n in range(1, ARENA - a + 1): + servable = frozenset( + sc for sc in size_classes if can_serve(a, n, sc) + ) + if servable not in all_sets: + all_sets[servable] = [] + all_sets[servable].append((a, n)) + + # Sort by (cardinality, max element) + sorted_sets = sorted( + all_sets.keys(), key=lambda s: (len(s), max(s) if s else 0) + ) + + print(f"Total unique servable sets: {len(sorted_sets)}") + print() + + # ================================================================ + # Step 2: Show each unique servable set and its structure + # ================================================================ + print("=" * 80) + print("ALL UNIQUE SERVABLE SETS") + print("=" 
* 80) + for i, s in enumerate(sorted_sets): + examples = all_sets[s][:3] + ex_str = ", ".join(f"(a={a},n={n})" for a, n in examples) + print(f" #{i:>3d} |{len(s):>3d} classes| {sorted(s)}") + print(f" examples: {ex_str}") + print() + + # ================================================================ + # Step 3: Analyse containment / subset structure + # ================================================================ + print("=" * 80) + print("CONTAINMENT ANALYSIS") + print("=" * 80) + print() + print("For each set, what's new compared to its largest strict subset?") + print("Incomparable pairs are sets where neither is a subset of the other.") + print() + + for i, s in enumerate(sorted_sets): + # Find strict subsets + subsets = [sorted_sets[j] for j in range(len(sorted_sets)) if sorted_sets[j] < s] + if subsets: + biggest_subset = max(subsets, key=len) + new = sorted(s - biggest_subset) + else: + new = sorted(s) + + # Find incomparable sets (same cardinality, neither subset) + incomparable = [] + for j in range(len(sorted_sets)): + other = sorted_sets[j] + if other == s: + continue + if not (other < s) and not (other > s) and len(other) == len(s): + incomparable.append(j) + + new_em = [(sc, get_exponent_mantissa(sc)) for sc in new] + inc_str = f" ** INCOMPARABLE with #{incomparable}" if incomparable else "" + print(f" #{i:>3d}: +{new} {inc_str}") + + print() + + # ================================================================ + # Step 4: Group by exponent — show the 5-state structure + # ================================================================ + print("=" * 80) + print("PER-EXPONENT STRUCTURE") + print("=" * 80) + print() + print("Within each exponent level, how many distinct states are there?") + print("A 'state' is a distinct subset of {m=0, m=1, m=2, m=3} that") + print("appears as the set of servable mantissas at that exponent.") + print() + + max_exp = max(get_exponent_mantissa(sc)[0] for sc in size_classes) + + for e in range(2, max_exp + 1): + # 
Size classes at this exponent + sizes_at_e = [] + for m in range(4): + step = 1 << (e - B) + s = (1 << e) + m * step + if s <= ARENA: + sizes_at_e.append((m, s)) + + if not sizes_at_e: + continue + + # For each servable set, extract which mantissas at exponent e are present + mantissa_subsets = set() + for s_set in sorted_sets: + present = frozenset( + m for m, sc in sizes_at_e if sc in s_set + ) + if present: # at least one mantissa servable + mantissa_subsets.add(present) + + print(f" Exponent e={e}: sizes {[s for _, s in sizes_at_e]}") + print(f" Distinct mantissa subsets: {len(mantissa_subsets)}") + for ms in sorted(mantissa_subsets, key=lambda x: (len(x), sorted(x))): + label = "" + ms_sorted = sorted(ms) + if ms_sorted == [0]: + label = "A-only" + elif ms_sorted == [1]: + label = "B-only" + elif ms_sorted == [0, 1]: + label = "both" + elif ms_sorted == [0, 1, 2]: + label = "+m2" + elif ms_sorted == [0, 1, 2, 3]: + label = "+m3" + else: + label = "???" + print(f" mantissas {str(ms_sorted):20s} ({label})") + + # Check for incomparable pairs + for ms1 in mantissa_subsets: + for ms2 in mantissa_subsets: + if ms1 != ms2 and not ms1 < ms2 and not ms2 < ms1: + print(f" ** Incomparable: {sorted(ms1)} vs {sorted(ms2)}") + print() + + # ================================================================ + # Step 5: Show the threshold formula + # ================================================================ + print("=" * 80) + print("THRESHOLD ANALYSIS") + print("=" * 80) + print() + print("T(S, alpha) = S + max(0, align(S) - alpha)") + print("= minimum block size to serve S at block alignment alpha") + print() + + for e in range(2, min(max_exp + 1, 6)): + print(f" Exponent e={e}:") + for m in range(4): + step = 1 << (e - B) + s = (1 << e) + m * step + if s > ARENA: + break + a = natural_align(s) + print(f" m={m}: S={s:>4d}, align={a:>4d}", end="") + # Show threshold at various block alignments + alphas = [1, 2, 4, 1 << e] + vals = [] + for alpha in alphas: + t = s + 
max(0, a - alpha) + vals.append(f"T(α={alpha})={t}") + print(f" {', '.join(vals)}") + print() + + # ================================================================ + # Step 6: Verify the key property + # ================================================================ + print("=" * 80) + print("KEY PROPERTY VERIFICATION") + print("=" * 80) + print() + print("Checking: servable sets are almost totally ordered.") + print("For each exponent, there should be exactly 5 states") + print("with exactly 1 incomparable pair ({m=0} vs {m=1}).") + print() + + all_ok = True + for e in range(2, max_exp + 1): + sizes_at_e = [] + for m in range(4): + step = 1 << (e - B) + s = (1 << e) + m * step + if s <= ARENA: + sizes_at_e.append((m, s)) + + if len(sizes_at_e) < 4: + continue + + mantissa_subsets = set() + for s_set in sorted_sets: + present = frozenset(m for m, sc in sizes_at_e if sc in s_set) + if present: + mantissa_subsets.add(present) + + n_states = len(mantissa_subsets) + n_incomparable = 0 + for ms1 in mantissa_subsets: + for ms2 in mantissa_subsets: + if ms1 < ms2 or ms2 < ms1 or ms1 == ms2: + continue + n_incomparable += 1 + n_incomparable //= 2 # each pair counted twice + + ok = (n_states == 5 and n_incomparable == 1) + status = "OK" if ok else "FAIL" + if not ok: + all_ok = False + print(f" e={e}: {n_states} states, {n_incomparable} incomparable pairs [{status}]") + + print() + if all_ok: + print(" ALL EXPONENTS HAVE EXACTLY 5 STATES WITH 1 INCOMPARABLE PAIR.") + else: + print(" SOME EXPONENTS DIFFER — check output above.") + + # ================================================================ + # Step 7: Show the two-bin split for m=1 with concrete examples + # ================================================================ + print() + print("=" * 80) + print("THE TWO-BIN SPLIT: blocks of the same size go to different bins") + print("=" * 80) + print() + print("For each exponent, m=1 blocks are split into two bins based on") + print("whether they can also serve m=0 
(the power-of-two size).") + print() + + for e in range(2, min(max_exp + 1, 6)): + s0 = 1 << e # m=0 size + s1 = 5 * (1 << (e - B)) # m=1 size + a0 = natural_align(s0) + a1 = natural_align(s1) + + print(f" Exponent e={e}: m=0 is size {s0} (align {a0}), " + f"m=1 is size {s1} (align {a1})") + + # Find concrete blocks of size s1 that can/cannot serve s0 + bin_a_examples = [] # can serve both s0 and s1 + bin_b_examples = [] # can serve s1 but NOT s0 + + for a in range(min(ARENA, 64)): + if can_serve(a, s1, s1): + if can_serve(a, s1, s0): + if len(bin_a_examples) < 3: + bin_a_examples.append((a, s1)) + else: + if len(bin_b_examples) < 3: + bin_b_examples.append((a, s1)) + + # Show what each bin can serve + if bin_a_examples: + a_ex = bin_a_examples[0] + servable = sorted(sc for sc in size_classes if can_serve(a_ex[0], a_ex[1], sc)) + ex_strs = ", ".join(f"(a={a},n={n})" for a, n in bin_a_examples) + print(f" Bin A (serves {s0} AND {s1}): e.g. {ex_strs}") + print(f" serves: {servable}") + + if bin_b_examples: + b_ex = bin_b_examples[0] + servable = sorted(sc for sc in size_classes if can_serve(b_ex[0], b_ex[1], sc)) + ex_strs = ", ".join(f"(a={a},n={n})" for a, n in bin_b_examples) + print(f" Bin B (serves {s1} but NOT {s0}): e.g. {ex_strs}") + print(f" serves: {servable}") + print() + + +if __name__ == "__main__": + main() diff --git a/prototype/skip_analysis.py b/prototype/skip_analysis.py new file mode 100644 index 000000000..fe510677f --- /dev/null +++ b/prototype/skip_analysis.py @@ -0,0 +1,226 @@ +#!/usr/bin/env python3 +""" +Analyse the "skip" structure for different INTERMEDIATE_BITS values. + +Core question: when searching upward through bins for a size class, +how many bins do you need to skip (bins that serve a larger size but +not the one you want, due to alignment)? 
+""" + +ARENA = 1024 + + +def natural_align(x): + if x == 0: + return 1 << 30 + return x & (-x) + + +def can_serve(addr, block_size, sizeclass): + A = natural_align(sizeclass) + first_aligned = ((addr + A - 1) // A) * A + return first_aligned + sizeclass <= addr + block_size + + +def gen_size_classes(B, max_size): + classes = set() + classes.add(1) + if B >= 1: + classes.add(2) + if B >= 2: + classes.add(3) + if B >= 3: + for s in range(4, 8): + if s <= max_size: + classes.add(s) + e = B + while True: + base = 1 << e + step = 1 << (e - B) + for m in range(1 << B): + s = base + m * step + if s <= max_size: + classes.add(s) + if base > max_size: + break + e += 1 + return sorted(classes) + + +def analyse(B): + M = 1 << B # mantissas per exponent + size_classes = gen_size_classes(B, ARENA) + print(f"{'='*80}") + print(f"INTERMEDIATE_BITS = {B} ({M} mantissas per exponent)") + print(f"{'='*80}") + print(f"Size classes: {size_classes[:30]}{'...' if len(size_classes)>30 else ''}") + print() + + # Show alignment pattern for one exponent + e = max(B + 2, 4) # pick an exponent where sizes aren't tiny + print(f" Alignment pattern at exponent e={e}:") + sizes_at_e = [] + for m in range(M): + step = 1 << (e - B) + s = (1 << e) + m * step + a = natural_align(s) + sizes_at_e.append((m, s, a)) + print(f" m={m}: size={s:>4d} align={a:>4d} (coefficient {s >> (e-B)} = {s // (1 << (e-B))})") + print() + + # Compute all unique servable sets + all_sets = set() + for a in range(ARENA): + for n in range(1, ARENA - a + 1): + servable = frozenset( + sc for sc in size_classes if can_serve(a, n, sc) + ) + all_sets.add(servable) + + sorted_sets = sorted(all_sets, key=lambda s: (len(s), max(s) if s else 0)) + + # Per-exponent analysis + max_exp = 1 + for sc in size_classes: + ee = B + while (1 << ee) <= sc: + ee += 1 + ee -= 1 + if ee >= B: + max_exp = max(max_exp, ee) + + print(f" Per-exponent mantissa state analysis:") + print() + + for e in range(B, max_exp + 1): + sizes_at_e = [] + for m in 
range(M): + step = 1 << (e - B) + s = (1 << e) + m * step + if s <= ARENA: + sizes_at_e.append((m, s)) + + if len(sizes_at_e) < M: + continue + + # For each servable set, extract which mantissas at this exponent are present + mantissa_subsets = set() + for s_set in sorted_sets: + present = frozenset(m for m, sc in sizes_at_e if sc in s_set) + if present: + mantissa_subsets.add(present) + + # Count incomparable pairs + incomparable_pairs = [] + ms_list = sorted(mantissa_subsets, key=lambda x: (len(x), sorted(x))) + for i, ms1 in enumerate(ms_list): + for ms2 in ms_list[i+1:]: + if not ms1 < ms2 and not ms2 < ms1: + incomparable_pairs.append((sorted(ms1), sorted(ms2))) + + print(f" Exponent e={e}: sizes {[s for _, s in sizes_at_e]}") + print(f" {len(mantissa_subsets)} distinct states, {len(incomparable_pairs)} incomparable pair(s)") + + for ms in sorted(mantissa_subsets, key=lambda x: (len(x), sorted(x))): + print(f" {sorted(ms)}") + + if incomparable_pairs: + for p in incomparable_pairs: + print(f" ** Incomparable: {p[0]} vs {p[1]}") + print() + + # The key analysis: for each size class, which bins must be SKIPPED? + print(f" SKIP ANALYSIS: when searching for size S, which larger-size bins") + print(f" might contain blocks that can't serve S?") + print() + + for e in range(B, min(max_exp + 1, B + 4)): + sizes_at_e = [] + for m in range(M): + step = 1 << (e - B) + s = (1 << e) + m * step + if s <= ARENA: + sizes_at_e.append((m, s)) + + if len(sizes_at_e) < M: + continue + + print(f" Exponent e={e}:") + + for m_req, s_req in sizes_at_e: + a_req = natural_align(s_req) + + # For each larger size at same exponent, check if it can always serve s_req + skips = [] + for m_other, s_other in sizes_at_e: + if s_other <= s_req: + continue + # Can a block of size s_other sometimes NOT serve s_req? + # Check: at worst alignment for s_req, does s_other still suffice? 
+ # T(s_req, alpha=1) = s_req + align(s_req) - 1 + # The block serves s_req if block_size >= T(s_req, block_align) + # A block of size s_other could have any alignment + can_always = True + can_sometimes_not = False + for addr in range(min(ARENA, 64)): + if can_serve(addr, s_other, s_other): # valid block + if not can_serve(addr, s_other, s_req): + can_sometimes_not = True + break + + if can_sometimes_not: + skips.append((m_other, s_other)) + + if skips: + skip_str = ", ".join(f"m={m}(size {s})" for m, s in skips) + print(f" Requesting m={m_req} (size {s_req}, align {a_req}): " + f"must skip: {skip_str}") + else: + print(f" Requesting m={m_req} (size {s_req}, align {a_req}): " + f"no skips needed") + print() + + # Summary: how many skips total per exponent? + print(f" SUMMARY: skips needed per request at each exponent") + print() + + for e in range(B, min(max_exp + 1, B + 4)): + sizes_at_e = [] + for m in range(M): + step = 1 << (e - B) + s = (1 << e) + m * step + if s <= ARENA: + sizes_at_e.append((m, s)) + + if len(sizes_at_e) < M: + continue + + total_skips = 0 + max_skips_per_request = 0 + + for m_req, s_req in sizes_at_e: + skips = 0 + for m_other, s_other in sizes_at_e: + if s_other <= s_req: + continue + for addr in range(min(ARENA, 64)): + if can_serve(addr, s_other, s_other): + if not can_serve(addr, s_other, s_req): + skips += 1 + break + + total_skips += skips + max_skips_per_request = max(max_skips_per_request, skips) + + print(f" e={e}: max skips for any single request = {max_skips_per_request}") + + +def main(): + for B in [1, 2, 3, 4]: + analyse(B) + print() + print() + + +if __name__ == "__main__": + main() From 850a7a407f6f4c23c2484e09e286bb24b2676240 Mon Sep 17 00:00:00 2001 From: Matthew Parkinson Date: Wed, 20 May 2026 11:08:12 +0100 Subject: [PATCH 02/31] Add BackendArenaBins: bin scheme, per-sc tables, and bitmap Phase 1 of the BackendArena refactor (see PLAN.md). 
Introduces src/snmalloc/backend_helpers/backend_arena_bins.h, which owns the chunk-unit size-class scheme and the non-empty-bins bitmap that later phases will use to drive bin selection inside BackendArena. Public surface (the integration contract for future phases): * range_t, carve_t, carve(block, n_chunks), max_supported_chunks(). * Nested Bitmap with add(block), find_for_request(n_chunks), clear(bin_id), and TOTAL_BINS. Everything else (the size-class encoding, the per-SC tables, the free-side classifier bin_index) is private. The unit test reaches it via a friend struct BackendArenaBinsTestAccess that is only forward-declared in the header and defined in the test translation unit, so the production header carries no test-only surface. Implementation: * Two power-of-two-sized rodata tables indexed by raw sc id with shift+add. bitmap_info_t (4 words via alignas) feeds Bitmap::find_for_request; carve_info_t (2 words) feeds carve and the free-side cascade-fit predicate. * bitmap_info_t fields (start_word, first_mask, second_mask) are pre-shifted into the bitmap's word layout so find_for_request is two ANDs on the hot word + word-boundary fall-through. * Tables are populated at constexpr build time by BinTable() consuming the canonical bin_subsets table; the strict-chain invariant on bin_subsets is checked at compile time via throw in the constexpr constructor. * Fast path uses the runtime CLZ intrinsic via the new bits::to_exp_mant (paired with the existing to_exp_mant_const); the _const variant is restricted to constexpr table construction and test static_asserts. bits::prev_pow2_bits / prev_pow2_bits_const are added alongside for symmetric runtime / constexpr access. The new test cross-checks bin classification, carve, and find_for_request against a brute-force scanner derived directly from bin_subsets, for B in {1, 2, 3}. 
Exhaustive single-bit and multi-bit randomised bitmap states are covered, plus word-boundary straddle cases enumerated automatically from the table. No production code path is changed: BackendArenaBins is unused in the build until later phases compose it into BackendArena. Also lands PLAN.md (single forward-looking spec for the whole BackendArena refactor) and claude.md (development guidance for this branch). Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- CMakeLists.txt | 1 + PLAN.md | 1255 +++++++++++++++++ claude.md | 153 ++ .../backend_helpers/backend_arena_bins.h | 755 ++++++++++ src/snmalloc/ds_core/bits.h | 44 + .../backend_arena_bins/backend_arena_bins.cc | 1220 ++++++++++++++++ 6 files changed, 3428 insertions(+) create mode 100644 PLAN.md create mode 100644 claude.md create mode 100644 src/snmalloc/backend_helpers/backend_arena_bins.h create mode 100644 src/test/func/backend_arena_bins/backend_arena_bins.cc diff --git a/CMakeLists.txt b/CMakeLists.txt index f49447a8a..ec903a408 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -548,6 +548,7 @@ if(NOT SNMALLOC_HEADER_ONLY_LIBRARY) # These are mitigation-independent and can be compiled once, then linked # against both fast and check testlib variants. set(TESTLIB_ONLY_TESTS + backend_arena_bins bits first_operation memory memory_usage multi_atexit multi_threadatexit redblack statistics teardown contention external_pointer large_alloc lotsofthreads post_teardown diff --git a/PLAN.md b/PLAN.md new file mode 100644 index 000000000..7ac7fecf9 --- /dev/null +++ b/PLAN.md @@ -0,0 +1,1255 @@ +# User Plan + +We need to refactor the backend buddy allocator to use the more general concept in IDEA.md, which uses a more general concept of sizeclasses than powers of two to avoid internal fragmentation. + +The design will use the Red-Black tree that currently underlies the buddy allocator, but in a different shape: two parallel trees instead of one-per-exponent. 
+ +Each block will be part of two structures: + +* [Bin] A red-black tree of all blocks held by this BackendArena, in the same bin, ordered by address. +* [Range] A red-black tree of all blocks held by this BackendArena, ordered by address. + +Note that blocks of the minimum size will be handled specially, as there is insufficient space to have them participate in both structures, so they will only participate in the first. + +## Representation + +We use 2 bits to represent the mode of this block of memory + +00 - Minimum size, only in first red-black tree. Single pagemap entry for this block is used for the RB-tree +01 - 2 * minimum size, in both red-black trees. Two pagemap entries for this block are used for the RB-tree +10 - > 2 * minimum size, in both red-black trees. Three pagemap entries used for this block, first two red-black tree, third stores accurate size of block. + +This means it is possible to find the precise size of a block which can account for additional state that is lost by the binning. + +## Maximal consolidation. + +When a block, A, is added, we check if the predecessor, P, and successor, S, blocks are also in the "Range" red-black tree, if they are then we can combine this block A with P and/or S if they were in the RB-tree. We must also check the "Bin" for the minimum size RB-tree as the minimum sized blocks are not in the "Range" RB tree. + +When we combine a block, we remove the blocks from the appropriate "Bin", and then add the combined block to the appropriate "Bin" for the combined block, we remove one block from the "Range" RB-tree, as we can continue to reuse its entry and not need to mutate the RB-tree. + +## Allocation + +Follows the IDEA.md design, find the smallest bin that can serve the request. Then add back any things that are carved off the block to the free pool. + +## Multiple instances + +As all the state is looked up from the RB-tree, then we can have multiple instances of the data-structure. 
This allows us to have both thread-local and global RB-trees. + +## Implementation + +### Build BackendArena + +This should use two RB-trees. + +It should support adding and removing blocks. + +There should be unit tests that check that it is functioning correctly. + +There should be a runtime checked invariant that +* the system is maximally consolidated, and +* the system is consistent between the two RB-trees. + +### Build BackendArenaRange + +This should wrap the BackendArena using the snmalloc Range approach that is used in the current backend pipelines. + +### Update backend to use BackendArenaRange + +### Update front-end to request non-power of two size classes for the backend. + +### Generalise the large size classes to no longer be just power of two. + +### Fix memcpy protection + +Finding the start of a block will require the pagemap to additionally store an offset. + +Currently, finding the start of a block performs an alignment to find the start of the "slab", and uses reciprocal division to find the offset within the slab. For large allocations, we just used the start of the slab as everything was aligned to a power of two. We now need to do + +align(ptr, slab_size) - (offset(ptr) * slab_size) + +Here, offset is stored in the pagemap, and allows us to find the start of the block. We will need to store the offset in every entry of the pagemap for the block as we need to support requests in each offset within the block. + +## Extensions + +This is not to be done in this initial implementation, but we should consider this for possible future extensions, and should not be ruled out by any design. + +### Integrated Decay Range + +We can extend the system by effectively having multiple "Range" RB-trees, and then use multiple ranges to track how long a block has been in the backend. We would always add blocks to the "most recent" Range, and as time passes switch which RB-tree is considered the most recent. 
The oldest one, can then be passed back to the OS, or alter whether it is MADV_FREE or MADV_DONTNEED. + +--- + +# Implementation plan: BackendArena phase + +## Scope of this phase + +This plan covers **only** the `BackendArena` data structure and its standalone +unit tests. The following are explicitly deferred to follow-up plans, each of +which will become its own PLAN.md revision: + +- `BackendArenaRange` — wrapping `BackendArena` behind snmalloc's Range API. +- Backend integration — replacing `LargeBuddyRange` in + `backend/standard_range.h` and `backend/meta_protected_range.h`. +- Front-end requesting non-power-of-two chunk sizes from the backend. +- Generalising the large size classes to no longer be power-of-two only. +- Memcpy protection fix — storing per-chunk `offset` in the pagemap so the + start of a large allocation can be recovered from any address within it. + +The pagemap encoding chosen in this phase **must leave room** for the future +per-entry `offset` field, so the memcpy fix can land later without re-doing +the encoding work. See "Pagemap encoding" below. + +## Design notes + +### Bins: one Bin tree per IDEA servable-set bin + +Per `IDEA.md` and `prototype/skip_analysis.py`, free blocks are classified by +the *servable set* — the set of size classes they can serve, given their size +and alignment. + +For `INTERMEDIATE_BITS=B` the bin count per exponent is `B=1: 2, B=2: 5, +B=3: 13, B=4: 34`. The snmalloc default is `B=2`. The number of exponents in +range is `MAX_SIZE_BITS - MIN_CHUNK_BITS + 1`. + +Each `BackendArena` instance owns: + +- A flat array of RBTree roots, indexed by bin id (one bin id per + (exponent, servable-set) pair). +- A flat bitmap (`size_t words[NUM_BITMAP_WORDS]`) tracking which bin + RBTrees are non-empty. Word width tracks `bits::BITS` so 32-bit + builds work too. + +Allocation for a request of `n_chunks`: + +1. `bitmap.find_for_request(n_chunks)` returns the bin id of the + smallest serving bin (or `SIZE_MAX` if none). 
Internally this loads + the per-sc `(start_word, first_mask, second_mask)` triple for + `n_chunks` and applies one AND per word to locate the first set bit + in a serving position. +2. Pop a block from that bin's RBTree (smallest address — `remove_min`). + If the tree empties, `bitmap.clear(bin_id)`. +3. `carve(block, n_chunks)` splits into pre-pad / aligned request / + post-pad. Re-add any non-empty pre/post via `add_block` (which + classifies the remainder via `bitmap.add(remainder)`). + +The bin classification and per-sc search masks (`start_word`, +`first_mask`, `second_mask`) are precomputed at `constexpr` time +directly from the size-class structure (no runtime tables beyond the +bitmap of non-empty bins). + +### Range: single tree across all blocks (with min-size exception) + +A single RBTree per `BackendArena` orders all *non-min-size* free blocks by +address. This is the structure used for adjacency lookup during +consolidation: + +- `predecessor(A)`: largest Range-tree entry with address less than A. +- `successor(A)` : smallest Range-tree entry with address greater than A. + +Min-size blocks are **not** in the Range tree (see "Min-size special case" +below); their adjacency is detected via a `find` in the min-size Bin +RBTree. + +### Block size variants and pagemap encoding + +A free block occupies one or more `MIN_CHUNK_SIZE` chunks, with a pagemap +entry per chunk. The first pagemap entry of a free block carries a +**variant tag** that tells `BackendArena` how to interpret the other +entries in the block: + +| Variant | Block size | Pagemap entries used by BackendArena | +|-------------|----------------|------------------------------------------------------------------------| +| `Min` | exactly min | 1 entry — both words store the Bin RBTree node (left/right + colour). | +| `TwoMin` | exactly 2× min | 2 entries — first stores Bin node, second stores Range node. 
| +| `Large` | > 2× min | 3 entries — first Bin, second Range, third stores precise block size. | + +**Tree membership is the source of truth for "is this block free?".** +The variant tag is only meaningful for entries `BackendArena` reaches via +its own RBTrees; nothing outside the data structure probes the tag. The +tag is therefore not a state machine that needs an explicit +"BackendOwned" / "allocated" value: when a block is removed from its +trees, the tag's bits are simply not consulted again until the same +chunk(s) are re-added. + +This phase needs to allow **arbitrary chunk counts** for `Large` blocks, +not just exact size-class sizes. Carving will produce non-class +remainders (e.g. for `B=2`, a 9-chunk prefix), and those must round-trip +through `add_block` / `remove_block` without any silent rounding. The +`Large` block's precise chunk count is stored in the third pagemap entry; +`Min` and `TwoMin` sizes are implicit in the variant. + +**Bit positions** are an internal detail of the new Rep: + +- The variant tag needs 2 bits. They live in the first word of the first + pagemap entry of the block, in bits above `BACKEND_RESERVED_MASK` + (bits 0–7) and the existing `RED_BIT` (bit 8) — e.g. bits 9–10. The + chunk-aligned-address keys leave bits below `MIN_CHUNK_BITS` (=14) + free, so this is comfortably within budget. +- The Rep's `get`/`set` for the first word must preserve **both** + `RED_BIT` and the variant-tag bits, generalising `BuddyChunkRep`'s + current `RED_BIT`-only preservation. + +The exact bit positions are documented only inside the new Rep next to +the accessors. `BuddyChunkRep` and `largebuddyrange.h` are not modified +in this phase. + +This phase does **not** define any storage for the future per-entry +`offset` field that the memcpy fix will need. The plan only claims that +choosing 2 bits above `RED_BIT` does not preclude a sensible future +offset layout; the concrete offset design is the responsibility of the +memcpy-fix follow-up plan. 
+ +### Adjacency lookup + +All adjacency lookups are performed via RB-tree finds in this +`BackendArena`'s own trees. **No pagemap probing.** The pagemap is shared +across `BackendArena` instances (e.g. thread-local + global), and reading +entries owned by another instance would be unsafe under concurrent +modification. By restricting reads to RB-tree traversals — which only +follow pointers we wrote, into entries we own — adjacency detection is +race-free without any synchronisation at this layer. + +For an incoming block `A` of size `S` at address `addr_A`: + +- `(P_range, S_range) := Range.neighbours(addr_A)` — one walk yields both + non-min neighbours. + - If `P_range.addr + P_range.size == addr_A`, the non-min left + neighbour is `P_range`; merge. + - If `S_range.addr == addr_A + size_A`, the non-min right neighbour is + `S_range`; merge. +- If no non-min left neighbour was found and `A` is min-eligible at its + boundary: `MinSizeBin.find(addr_A - MIN_CHUNK_SIZE)`; if present, + merge. +- If no non-min right neighbour was found: `MinSizeBin.find(addr_A + + size_A)`; if present, merge. + +`MinSizeBin` is the single Bin RBTree that holds all min-size free blocks +(the bin whose servable set is `{1 chunk}`). Its `find` operation is a +standard RB-tree key lookup (O(log n)), and only traverses entries already +linked into the tree — i.e. entries owned by *this* `BackendArena`. + +Min-size adjacency therefore costs at most one Bin-tree `find` per side +per `add_block`. The Range-tree `neighbours(addr_A)` query yields both +non-min neighbours in a single `O(log n)` walk; no additional pagemap +touches are introduced. + +### Consolidation: reusing tree entries when possible + +The user plan calls out that when consolidating `A` with predecessor `P`, +the Range tree node belonging to `P` can be reused for the consolidated +block without any RB-tree mutation: the combined block has the same +starting address as `P`, so its Range tree key is unchanged. 
Only the Bin +tree is mutated (remove `P` from its bin, insert combined into its new +bin). + +This optimisation applies **only when `P` is non-min**, i.e. when `P` has a +Range tree entry to reuse. When `P` is min-size, `P` has no Range entry, +and the merged block (which is non-min) must be inserted into the Range +tree normally. The same applies to the `P+S` case: reuse `P`'s Range entry +only if `P` is non-min; otherwise insert the merged block into the Range +tree, then remove `S`'s entry. + +When consolidating `A` with successor `S` (and no `P`), the combined block +starts at `addr_A`, not `S.addr`. Two strategies: + +- **Simple (initial)**: remove `S` from the Range tree, insert combined at +  `addr_A`. Two RB-tree operations. +- **Optimised (deferred)**: walk to `S`'s parent via the path returned by +  `find`, redirect that parent's child pointer to the new node at `addr_A`, +  and copy `S`'s left/right/colour into the new node's pagemap entry. Zero +  rotations. + +This plan implements the simple strategy first and gates the optimised +strategy behind a follow-up step with an A/B test. + +After a merge, the combined block could in principle be adjacent to a further +block. However, if the system was maximally consolidated +before `A` was added, then `P` and `S` were not adjacent to anything else, +so a single merge step suffices. The invariant check verifies this. + +### Min-size special case + +A min-size free block has only one pagemap entry, which fits one RBTree +node. The Bin tree for min-size blocks uses that entry. The Range tree +excludes min-size blocks. Adjacency for min-size neighbours is found via +`MinSizeBin.find(addr)`, never by reading the pagemap directly. + +### Write ordering within add/remove + +Because adjacency lookups are RB-tree-only, a block is "visible to +adjacency" exactly when it is reachable from one of this `BackendArena`'s +RBTree roots.
The ordering rules collapse to two: + +- **add_block**: write the variant tag and any auxiliary data (precise + size for `Large`) into the block's pagemap entries *before* the final + RB-tree insertion that makes the block reachable. Anyone who finds the + new block via a tree walk must see a fully-initialised block. +- **remove_block / consolidation**: remove the block(s) from the + RB-tree(s) *before* reusing or overwriting their pagemap entries. + After unlinking, the block is no longer reachable, and the entries are + free for reuse by the next operation. + +No transient "BackendOwned" marker is needed: a chunk's free-ness is +synonymous with its membership in some RBTree owned by this +`BackendArena`. + +### Invariants (debug-only, runtime-checked) + +The `BackendArena::invariant()` method checks: + +1. **Maximally consolidated**: walking the Range tree in order, no two + adjacent entries have `prev.addr + prev.size == curr.addr`; no min-size + block in `MinSizeBin` is adjacent (at `addr ± MIN_CHUNK_SIZE`) to + anything else in either tree. +2. **Cross-tree consistency**: every non-min block in the Bin trees is in + the Range tree; every Range tree entry is in exactly one Bin tree; + sizes agree between the two views. +3. **Bin classification correctness**: each block is in the bin determined + by its `(addr, size)` servable set; arbitrary chunk counts (not just + exact size-class sizes) are classified correctly. +4. **Bitmap consistency**: the non-empty-bins bitmap is set iff the + corresponding RBTree is non-empty. +5. **Variant-tag consistency**: every block reachable from a tree root + has a variant tag (`Min` / `TwoMin` / `Large`) matching its actual + chunk count. (No "BackendOwned" tag is needed because tree membership + is the source of truth for freeness.) 
+ +### Backend chunk size classes + +The new bins are indexed in **chunk units** (1 chunk = `MIN_CHUNK_SIZE` +bytes), not bytes, and not the front-end `sizeclass_t` whose large variant +is currently power-of-two only. `backend_arena_bins.h` defines the +chunk-unit size-class scheme using the snmalloc size-class formula +`S = 2^e + m · 2^(e − B)` applied at **chunk-count exponents starting +from zero**. Low-exponent special cases (chunk counts 1, 2, 3, …) follow +the same pattern as `bits::from_exp_mant` in +`src/snmalloc/ds_core/sizeclassstatic.h`: at small exponents the mantissa +space is degenerate, handled by enumeration. + +The public API of `BackendArenaBins` — the integration contract +`BackendArena` builds on — is intentionally narrow: + +- `struct range_t { size_t base; size_t size; }` — a chunk-count range + used to describe free blocks and carved sub-ranges. +- `struct carve_t { range_t pre, req, post; }` — output of a carving + operation; either of `pre`/`post` may have `size == 0` (absent). +- `static carve_t carve(range_t block, size_t n_chunks)` — given a + free block and an allocation request, split into pre-pad / aligned + request / post-pad. Pure function; does not touch the bitmap. +- `max_supported_chunks() -> size_t` — upper bound on legal `n_chunks`; + used for assertions. +- nested `Bitmap` — the routing layer; see below. + +The `Bitmap` is a per-arena non-empty-bins bitmap that owns the +classification of `(base, size)` pairs to bin ids. Its public surface is +exactly three operations: + +- `add(range_t block) -> size_t` — classify `block` into a bin, ensure + the bit for that bin is set, return the `bin_id` so the caller + inserts the block into `bin_trees[bin_id]`. **Idempotent**: callable + on a block already represented in the trees; setting an already-set + bit is a no-op. This is the only public way to learn a bin id for a + given `(base, size)` block, including during consolidation lookups + for neighbours that are already present. 
+- `find_for_request(size_t n_chunks) -> size_t` — locate the first set + bin satisfying a request for `n_chunks`. Returns `SIZE_MAX` if no bin + in this arena fits. +- `clear(size_t bin_id)` — caller has popped the last element from + `bin_trees[bin_id]`; the bitmap bit is cleared. + +There are deliberately no general bitmap operations (`set`/`has`/ +`empty`/etc.) on the public surface — the bitmap is not a generic data +structure but a routing index whose only meaningful operations are the +three above. The `bitmap_info_t` / `carve_info_t` rodata layouts, the +`bin_index` classifier, and `bitmap_info_for_request` / +`carve_info_for_request` are private (the bitmap and `carve` consume +them internally). + +The size-class encoding details — the `bitmap_info_t` / `carve_info_t` +rodata records, the bin-scheme constants (`B`, `MANTISSAS_PER_EXP`, +`BINS_PER_EXP`, `MAX_SC`), `bitmap_info_for_request` / +`carve_info_for_request`, `bin_index`, and the constexpr per-sc +accessors — are private implementation details. They are reachable +only via the friend struct `BackendArenaBinsTestAccess` (defined in +the test translation unit, see Phase 1) so unit tests can exercise +them directly; production code outside this header does not depend on +them. + +**Free blocks may have arbitrary chunk counts**, including non-class +sizes that arise from carving (e.g. a 9-chunk prefix at `B=2`). The +private `bin_index` operates on arbitrary `(address, size)` pairs and +classifies into a bin by the block's servable set; the public `Bitmap` +exposes this only through `add(range_t)`, which returns the bin id. +Exact size classes appear only on the request side; free blocks store +their precise chunk count where needed (`Large` variant). + +### Exponent / bin-count bounds + +`BackendArena` takes +**chunk-count exponent** bounds. `MIN_CHUNKS_BITS = 0` (1 chunk). The +upper bound is **exclusive**, matching `Buddy<..., MIN, MAX>`'s +semantics. 
The total number of bins is +`(MAX_CHUNKS_BITS - MIN_CHUNKS_BITS) * BINS_PER_EXP` plus the +degenerate-low-exponent bins. Static assertions encode the exclusive +semantics; tests exercise minimum, just-below-max, and exact-max sizes +(the last triggers overflow back to the parent, mirroring `Buddy`). + +The chunk-unit bin scheme is independent of `sizeclass_t::as_large()` +for now. The "Generalise the large size classes" follow-up plan will +reconcile the front-end large size classes with this scheme. + +### Multiple instances + +All state lives in pagemap-backed nodes and in per-instance roots/bitmaps; +no global state. Multiple `BackendArena` instances can coexist (thread-local +and global) for the future Range wrapper. + +## Phases + +Each phase produces a test gate that must pass before the next phase begins. +A phase that touches the tree itself must also keep all existing tests +(including `redblack.cc`) green. + +### Reviewer protocol (applies to every phase below) + +Each phase ends with **two** gates, both of which must clear before +the next phase starts: + +- **Test gate** — the listed tests pass on a Debug build (per + `.github/skills/building_and_testing.md`). +- **Review gate** — spawn a fresh-context `code-review` subagent on + the diff added in that phase. The reviewer prompt includes: + 1. The plan section for the current phase (treat as spec). + 2. The diff produced by the phase (compared to the previous + phase's tip). + 3. A reminder that this phase's scope is *only* what the plan + section describes; cross-phase concerns are out of scope. + 4. A pointer to `claude.md` for codebase conventions (no raw + compiler attributes, no C++ STL in production, `SNMALLOC_*` + macros, etc.). + + Address findings, re-spawn a fresh-context reviewer, loop until + a reviewer reports no issues. Disputes with reviewer findings + escalate to the user, not resolved unilaterally. 
+ +Phases 0 and 6 are exempted from the review gate: Phase 0 adds no +code; Phase 6 is test-only over already-reviewed production code. +Phase 7 is the final mandatory review per `claude.md`. + +### Phase 0: Baseline + +Per `claude.md` "Baseline the checkout before starting work": run a clean +Debug build and the full test suite via the testing subagent protocol in +`.github/skills/building_and_testing.md`. Record the results. If the baseline is +broken, stop and report — do not start implementation on a broken base. + +**Test gate**: full ctest run completes; record pass/fail status of each +test for later comparison. + +### Phase 1: BackendArenaBins — bin scheme, per-sc tables, and bitmap + +Add `src/snmalloc/backend_helpers/backend_arena_bins.h` defining +`BackendArenaBins`: the chunk-unit size-class +scheme, two per-sc rodata tables, the free-block classifier, and the +nested non-empty-bins bitmap that the allocation fast path scans. + +#### Public surface — the integration contract + +- `struct range_t { size_t base; size_t size; }` — chunk-count range. +- `struct carve_t { range_t pre; range_t req; range_t post; }` — output + of a carving operation; `pre` and/or `post` may have `size == 0`. +- `static SNMALLOC_FAST_PATH carve_t carve(range_t block, size_t n_chunks)` + — split a free `block` into pre-pad, aligned request, post-pad. + Pure. **Preconditions** (asserted): + `n_chunks >= 1 && n_chunks <= max_supported_chunks()`, + `block.size > 0`, and `block` is servable for `n_chunks` (the caller + has already used `Bitmap::find_for_request`). +- `static constexpr size_t max_supported_chunks()` — upper bound on + legal `n_chunks`; used for assertions. +- nested `class Bitmap` — three methods, all that production code + calls into: + - `size_t add(range_t block)` — classify `block`, ensure the bit + for the resulting bin is set, return the bin id so the caller can + insert `block` into `bin_trees[bin_id]`. 
**Idempotent**: also the + way to obtain the bin id of an existing neighbour during + consolidation. **Precondition**: `block.size >= 1 && + block.size <= max_supported_chunks()`. + - `size_t find_for_request(size_t n_chunks) const` — smallest set + bin servable for `n_chunks`; `SIZE_MAX` if none. + - `void clear(size_t bin_id)` — caller has popped the last element + from `bin_trees[bin_id]`; clears the bit. + - `static constexpr size_t TOTAL_BINS` — strict upper bound on bin + ids; exposed so callers can size `bin_trees`. + +No general bitmap operations and no size-class handles are exposed. +All other members are private; the unit test reaches them through a +friend struct, defined in the test translation unit (see "Test surface" +below). + +#### Bin scheme + +Following `prototype/skip_analysis.py`: + +- `B = INTERMEDIATE_BITS` (mantissa bits, currently restricted to + `{1, 2, 3}`). +- `MANTISSAS_PER_EXP = 1 << B` (4 / 8 mantissa positions; 2 for B=1). +- `BINS_PER_EXP` = 2 / 5 / 13 for `B` = 1 / 2 / 3 — the count of + distinct *servable subsets* of mantissas at each exponent. Each bin + is a single bit in the bitmap and a single RB-tree at the + `BackendArena` layer; bins are not size classes (multiple size + classes share a bin) and not exponents (each exponent has multiple + bins). +- `MAX_SC = ((bits::BITS - B) << B) + ((1 << B) - 1)` — one past the + largest raw id that `bits::to_exp_mant_const` produces whose + decoded size fits in `size_t`. The architectural max raw id decodes + to `2^bits::BITS`, which overflows; the tables stop one entry short + to keep `from_exp_mant(MAX_SC - 1)` valid. Sizes for `B` = 1 / + 2 / 3 on 64-bit: 127 / 251 / 495. +- `max_supported_chunks() = bits::from_exp_mant(MAX_SC - 1)` — + enormous in practice (far beyond any real arena). 
+ +#### Per-sc rodata tables + +Two power-of-two-sized structs, each indexed by raw sc id with a +single shift+add: + +```cpp +struct alignas(4 * sizeof(size_t)) bitmap_info_t { + size_t start_word, first_mask, second_mask; +}; +struct carve_info_t { size_t size_chunks, align_chunks; }; +``` + +`alignas(4 * sizeof(size_t))` on `bitmap_info_t` rounds its `sizeof` +up to a power of two (C++ requires `sizeof(T)` to be a multiple of +`alignof(T)`), so the table indexes with a shift+add without needing +a named padding member. + +Split into two tables — rather than one combined record — because the +two consumers run at different phases of allocation/free: + +- `bitmap_info_t` is read by `Bitmap::find_for_request` (bin-selection + on allocate). +- `carve_info_t` is read by `carve` (post-pop split on allocate) and by + `bin_index`'s cascade-fit predicate (free-side classification). + +The fields of `bitmap_info_t` are **pre-shifted into the bitmap's word +layout** so the search is two ANDs: + +- `start_word`: the bitmap word containing the SC's lowest serving + bin. +- `first_mask`: serve mask pre-shifted into `start_word`. Bit `i` set + iff `words_[start_word]` bit `i` serves this SC. +- `second_mask`: serve mask carried into `start_word + 1`. When + `start_bit` is word-aligned (`shift == 0`) there is no within-exp + carry and every bit in that word is higher-exponent, so + `second_mask = ~size_t(0)`. + +`static_assert` pins both struct sizes (4 words and 2 words) so the +table index lowers to a shift+add. + +#### Tables and classifier — populated at constexpr build time + +A private `BinTable` struct holds (all `ModArray<...>`): + +- `bitmap_info[MAX_SC]`, `carve_info[MAX_SC]` — the per-sc tables + above. +- `exp_first_sc[bits::BITS + 1]` — first raw sc id at each + BackendArenaBins exponent (sentinel at index `bits::BITS` equals + `MAX_SC`). 
NOTE: this is not uniform stride — at the bottom of the + encoding the low regime squashes multiple BackendArenaBins exponents + into encoded-exponent 0. +- `exp_bin_base[bits::BITS + 1]` — `e * BINS_PER_EXP`, precomputed so + `bin_index` does no runtime multiply. +- `cascade_steps[MANTISSAS_PER_EXP][MAX_CASCADE_STEPS]` — per-`m_top` + decision lists for `bin_offset_at`. + +A `static constexpr BinTable table_{}` member of `BackendArenaBins` +holds the populated instance. Tables sit in `.rodata`; no static +initialiser runs at program start. Combined size at B=3 is on the +order of tens of KB (estimate: 16 B/sc × 495 + 32 B/sc × 495 + small +cascade table ≈ 24 KB). + +The constructor populates `bitmap_info[sc]` from the canonical +`bin_subsets` table (single source of truth, matches +`prototype/skip_analysis.py`): + +- `start_bin_offset_for_m(m)`: first within-exp bin offset whose + subset contains mantissa `m`. +- `serve_mask_for_m(m)`: bitmask, relative to `start_bin_offset_for_m`, + of bins that serve `m`. Built **positively** (bit set = "serves") + rather than as a "skip" mask: the hot path AND's this directly + against the bitmap word, no NOT. +- `start_bit = exp_bin_base[e] + start_bin_offset_for_m(m)`, then + `start_word = start_bit / bits::BITS`, + `first_mask = serve_mask << (start_bit & (bits::BITS - 1))`, + `second_mask = (shift == 0) ? ~size_t(0) : ((mask >> (bits::BITS - + shift)) | (~size_t(0) << shift))`. + +For `cascade_steps`: for each `m_top`, the bins whose subset has +`m_top` as max element must form a strict containment chain when +sorted descending by popcount. This invariant is **checked at +constexpr build time** (`throw "..."` in the constexpr ctor surfaces +the violation as a compile error). Given the invariant, each +non-default candidate's discriminator is a single mantissa probe; the +list ends with a `NO_TEST` default. 
+ +Two free-side primitives, both private (used internally by `add` and +by `carve`): + +- `bin_index(range_t block) -> size_t`: returns the bin id of `block`, + operating on arbitrary chunk counts (not just exact SCs). Walks + `m_top` from `MANTISSAS_PER_EXP - 1` down at the natural exponent + `e = prev_pow2_bits(block.size)`. If alignment padding eats every + fit at `e`, drops to `e - 1`; one drop is always sufficient (the + smallest SC at `e - 1` has size and alignment `2^(e-1)`, so worst- + case `size + pad < 2^e <= block.size`). +- `bitmap_info_for_request(n_chunks) -> const bitmap_info_t&`, + `carve_info_for_request(n_chunks) -> const carve_info_t&`: single + table read each. Both call `bits::to_exp_mant(n_chunks)` (the + runtime CLZ intrinsic variant) so the encode is fast and is expected + to be CSE'd when both calls appear in the same fast path. + +#### Runtime CLZ on the fast path + +Production calls on the fast path use the runtime intrinsic, not the +constexpr software fallback: + +- `src/snmalloc/ds_core/bits.h` provides + `template inline + SNMALLOC_FAST_PATH size_t to_exp_mant(size_t value)` — body + identical to `to_exp_mant_const` but using `bits::clz` instead of + `clz_const`. `static_assert(MANTISSA_BITS + LOW_BITS > 0, ...)` — + the runtime variant relies on `LEADING_BIT != 0` to guarantee + `clz`'s non-zero precondition. +- Header uses `bits::to_exp_mant(n_chunks)` on the + `bin_index` / `find_for_request` / `carve` paths; + `bits::to_exp_mant_const(...)` is used **only** at table + construction time inside the constexpr `BinTable` constructor (and + in test-only static_asserts — see "Test surface"). + +This is the existing snmalloc convention for paired runtime / +compile-time helpers (`clz` / `clz_const`, `next_pow2` / +`next_pow2_const`). 
+ +#### Nested `Bitmap` + +```cpp +class Bitmap +{ + friend struct BackendArenaBinsTestAccess; + +public: + static constexpr size_t TOTAL_BINS = BINS_PER_EXP * bits::BITS; + + Bitmap() : words_{} {} + SNMALLOC_FAST_PATH size_t add(range_t block); + SNMALLOC_FAST_PATH void clear(size_t bin_id); + SNMALLOC_FAST_PATH size_t find_for_request(size_t n_chunks) const; + +private: + static constexpr size_t NUM_BITMAP_WORDS = + (TOTAL_BINS + bits::BITS - 1) / bits::BITS; + + size_t words_[NUM_BITMAP_WORDS]; +}; +``` + +- `TOTAL_BINS = BINS_PER_EXP * bits::BITS` is the strict upper bound + on `bin_index` output: `bin_index` returns `e * BINS_PER_EXP + + offset` with `e <= bits::BITS - 1` and `offset < BINS_PER_EXP`, so + the maximum is `BINS_PER_EXP * bits::BITS - 1 < TOTAL_BINS`. Values: + 128 / 320 / 832 for `B` = 1 / 2 / 3 on 64-bit. +- `NUM_BITMAP_WORDS == BINS_PER_EXP` exactly (2 / 5 / 13 words); on + 32-bit each word is 4 B instead of 8 B, halving storage. +- `words_` is zero-initialised. Word width tracks `bits::BITS` so the + AND with the precomputed masks has no width mismatch; `bits::ctz` + on a `size_t` produces the bit index. + +Friend declarations: `BackendArenaBins` and its nested `Bitmap` +each carry their own `friend struct BackendArenaBinsTestAccess<...>;` +(C++ friendship does not transit to nested classes). + +Static asserts on bitmap layout: + +- `TOTAL_BINS == BINS_PER_EXP * bits::BITS`. +- `NUM_BITMAP_WORDS == BINS_PER_EXP`. +- `TOTAL_BINS < SIZE_MAX` — so the `SIZE_MAX` sentinel cannot collide + with a valid bin id. +- `BINS_PER_EXP <= bits::BITS` — `find_for_request` assumes the + within-exp range fits in a single word so the search straddles at + most one word boundary. Holds on 32-bit (W=32) and 64-bit (W=64) + for the current B values. If a future B pushes this above + `bits::BITS`, the two-word body must be generalised. 
+ +`find_for_request` body: + +```cpp +SNMALLOC_FAST_PATH size_t find_for_request(size_t n_chunks) const +{ + const bitmap_info_t& info = bitmap_info_for_request(n_chunks); + SNMALLOC_ASSERT(info.start_word < NUM_BITMAP_WORDS); + + // First word: start bin + any within-exp neighbours in same word. + size_t word = info.start_word; + size_t bits = words_[word] & info.first_mask; + if (bits != 0) return word * bits::BITS + bits::ctz(bits); + if (++word == NUM_BITMAP_WORDS) return SIZE_MAX; + + // Second word: within-exp carry plus any higher-exp bits. + bits = words_[word] & info.second_mask; + if (bits != 0) return word * bits::BITS + bits::ctz(bits); + + // Remaining words: purely higher-exponent, any bit serves. + while (++word < NUM_BITMAP_WORDS) + if (words_[word] != 0) return word * bits::BITS + bits::ctz(words_[word]); + return SIZE_MAX; +} +``` + +The two ANDs are the entire bin-selection cost; no shifts, no +`shift == 0` branches at runtime (folded in at construction). + +#### Test surface + +`BackendArenaBinsTestAccess` is **forward-declared** +in `backend_arena_bins.h` (so the friend declarations can refer to it) +and **defined in the test translation unit** +`src/test/func/backend_arena_bins/backend_arena_bins.cc` (inside +`namespace snmalloc`). The production header therefore carries no +test-only members. + +What the test access struct exposes (all delegating to private +internals through the friend grant): + +- Re-exports of the public types and methods, for convenience. +- The bin-scheme constants `B`, `MANTISSAS_PER_EXP`, `BINS_PER_EXP`, + `MAX_SC`. +- `using chunk_sc_t = size_t;` — raw sc id as plain `size_t`; the + production header does NOT define a `chunk_sc_t` handle type. +- `request(n) -> size_t` — `bits::to_exp_mant(n)` (runtime). +- `size_chunks(sc) -> size_t`, `align_chunks(sc) -> size_t` — direct + reads of `Bins::table_.carve_info[sc]`. 
+- `bitmap_info(sc) -> const bitmap_info_t&`, `carve_info(sc) -> const + carve_info_t&` — direct table reads. +- `bitmap_info_for_request_const(n)`, + `carve_info_for_request_const(n)` — constexpr variants that use + `bits::to_exp_mant_const(n)`; used only inside + `static_assert`s in the test file. +- `bin_index(block) -> size_t`, `bitmap_info_for_request(n)`, + `carve_info_for_request(n)`, `carve(block, n)`, + `max_supported_chunks()` — passthroughs to the private members. +- The canonical `bin_subsets` table. +- Raw-word access on `Bitmap`: `raw_set(b, bin_id)`, `raw_has(b, + bin_id)`, `raw_empty(b)`, `raw_word(b, i)` — for exhaustive + single-bit and "no other bit changed" tests. + +#### Test gate + +New test `src/test/func/backend_arena_bins/backend_arena_bins.cc` +(auto-discovered via `subdirlist` of `src/test/func/`; registered in +`TESTLIB_ONLY_TESTS`). For each `B ∈ {1, 2, 3}`: + +- Compile-time properties via `static_assert` (`BINS_PER_EXP`, + `MAX_SC`, sample sizes/alignments through the `_const` variants). +- Runtime/constexpr CLZ agreement: + `to_exp_mant(n) == to_exp_mant_const(n)` over a + representative range of `n` (1, every power of two and ±1, near + `max_supported_chunks()`, several thousand random values). +- `from_exp_mant` round-trip: + `from_exp_mant(to_exp_mant(n)) >= n` and minimality + (no smaller raw id satisfies the bound). +- Bin-scheme primitives: `size_chunks(sc) >= s` for + `sc = request(s)`; idempotence `request(size_chunks(sc)) == sc`; + monotonicity of `request`; `align_chunks(sc)` is a power of two, + divides `size_chunks(sc)`, and is the largest such. +- `bin_index`: enumerate `(addr_chunks, n_chunks)` over a small grid + (including arbitrary non-class sizes) and check that `bin_index` + matches a brute-force servable-set computation expressed via the + canonical `bin_subsets` table. +- `Bitmap` raw smoke (via friend-struct raw-word access): set / clear + round-trips on individual bin ids; multi-bit states; empty check. 
+- `find_for_request` on empty bitmap returns `SIZE_MAX` for all + representative request sizes (including `max_supported_chunks()`). +- **Exhaustive single-bit**: for each `bin_id < TOTAL_BINS`, set + exactly that bit (raw access) and verify + `find_for_request(n_chunks)` matches a reference brute-force + scanner over a representative set of request sizes. The reference + predicate "bin b serves request n" is expressed via a + `serves(bin, n)` helper that consults `bin_subsets` directly — + the canonical source from which the precomputed + `start_word`/`first_mask`/`second_mask` are themselves derived, so + any divergence in the derivation chain is caught. +- **Multi-bit randomised**: thousands of random arena states + (uniformly random subset of bin ids) cross-checked against the + reference scanner over representative requests. +- **Word-boundary targeted cases**: classify the table entries + `bitmap_info_for_request_const(...)` produces by `start_bit = + start_word * bits::BITS + bits::ctz(first_mask)` (the start bin is + always the lowest set bit of `first_mask` by construction) into + aligned / fits-in-one-word / boundary-straddling. For each + category, exercise: (i) a single set bit in the first word's + considered region; (ii) first word empty + set bit in the second + word's within-exp carry; (iii) first word empty + set bit in the + second word's higher-exp region; (iv) set bits only in word 3 or + beyond. +- **`add` / `find_for_request` single-block integration**: for + representative `(base, size)` blocks, `bin_id = bm.add({base, + size})`, then `find_for_request(n_chunks) == bin_id` iff + `can_serve(base, size, n_chunks)` (the brute-force predicate using + per-class `size_chunks` / `align_chunks` from the friend struct). +- **`add` / `find_for_request` multi-block integration**: insert + several blocks; for each request, the expected result is the + smallest bin id among the added blocks that can serve it (or + `SIZE_MAX`). 
Pins the "first serving bin" contract. +- **`add` idempotence**: calling `add(block)` twice returns the same + bin id both times and leaves the bitmap unchanged (verified via raw + word access before and after the second call). +- `carve`: for a representative grid of `(block, n_chunks)`, the + output triple has `pre.base = block.base`, + `pre.base + pre.size = req.base`, `req.size = size_chunks(sc)`, + `req.base` is `align_chunks(sc)`-aligned, + `req.base + req.size = post.base`, and + `post.base + post.size = block.base + block.size`. + +`MAX_SC`-related `static_assert`s use `snmalloc::bits::BITS` (not +hard-coded 64) so they hold on both 32-bit and 64-bit builds. + +#### Review gate + +Spec slice = the Phase 1 section above. Reviewer checks: + +- Tables match the canonical `bin_subsets` (single source of truth); + `prototype/skip_analysis.py` reproduces the same numbering. +- Production header carries no test-only surface (no `chunk_sc_t` + handle class, no `request`, no `_const` variants, no test-only + per-sc accessors — those live only in + `BackendArenaBinsTestAccess` in the test cc). +- Fast path uses runtime `bits::to_exp_mant` / `bits::clz` (not the + `_const` variants); the `_const` variants are reachable only from + the constexpr `BinTable` constructor and the test's + `static_assert`s. +- `Bitmap::find_for_request` matches the reference scanner; word- + boundary straddle is correctly handled. +- `SIZE_MAX` sentinel is unambiguous (`TOTAL_BINS << SIZE_MAX`). +- Tables sit in `.rodata` (no program-start initialiser). +- Comments earn their length: cut anything that justifies layout, + restates code, or doesn't carry correctness-relevant information. + +### Phase 2: RBTree neighbours-of-probe helper + +The current `RBTree` exposes `find`, `remove_min`, `remove_path` (taking +an `RBPath`). 
For Range-tree adjacency lookups we don't need predecessor +and successor as independent operations — we always want **both +neighbours of a probe value** when classifying an incoming block. A +single tree walk for `K` already records exactly that information: the +last "go-right" descent (the parent of the failed left-child step) +points at the largest entry strictly less than `K`; the last "go-left" +descent points at the smallest entry strictly greater than `K`. + +Add a single helper: + +- `neighbours(K) -> stl::Pair` — performs one walk for + `K` and returns `(largest entry < K, smallest entry > K)`. Either may + be null. If `K` is itself present in the tree, the result describes + the neighbours of the existing entry; the caller decides whether that + is a bug (overlap) for its problem domain — `BackendArena` will treat + an exact hit as an invariant violation, since two free blocks cannot + share a starting address. + +This replaces two separate `O(log n)` walks per `add_block` with one and +keeps the API surface small. Implement on top of the existing `RBPath` +walking primitives — no structural changes to `RBTree` required. + +**Test gate**: extend `src/test/func/redblack/redblack.cc` with a +randomised test of `neighbours(K)` against `std::set::lower_bound` / +`upper_bound` as oracle, over thousands of operations and probe values. +Existing tests must remain green. + +**Review gate**: spec slice = the Phase 2 section above. Reviewer +checks: walk correctly records both turn points; behaviour at empty +tree, single-node tree, `K` smaller than all keys, `K` larger than all +keys, `K` equal to existing key, and `K` between two consecutive keys +all match the oracle; no structural changes to `RBTree`'s existing +invariants. 
+ +### Phase 3: Rep concept + skeleton BackendArena + +Create `src/snmalloc/backend_helpers/backend_arena.h` with: + +- A `BackendArenaRep` concept describing the operations the data + structure needs from its backing pagemap, in **chunk-keyed** form + (callers pass `addr` aligned to `MIN_CHUNK_SIZE`): + - `get_variant(addr) -> {Min, TwoMin, Large}` and `set_variant`. + - First-entry word accessors: `get_word1/set_word1` and + `get_word2/set_word2`, preserving `RED_BIT` *and* the variant-tag + bits on every write. + - Second-entry word accessors (used for `TwoMin` and `Large`): same + `get_word1/set_word1/get_word2/set_word2` shape, applied to the + pagemap entry at `addr + MIN_CHUNK_SIZE`. (No variant-tag preservation + rule here; only `RED_BIT`.) + - Third-entry size accessor (used only for `Large`): + `get_large_size_chunks(addr)/set_large_size_chunks(addr, n_chunks)` + backed by the entry at `addr + 2·MIN_CHUNK_SIZE`. + - **No** pagemap-probing API. All adjacency is performed via the + BackendArena's own RBTree finds. +- Two internal RBRep adapters built **inside** `BackendArena` (not in + user code) on top of `BackendArenaRep`: + - **`BinRep`**: keys by chunk-aligned address; uses the first-entry + word accessors; encodes the red-black `colour` bit in `RED_BIT` of + word 1; left/right child pointers occupy chunk-aligned bits of the + two words. + - **`RangeRep`**: keys by chunk-aligned address; uses the second-entry + word accessors; same `RED_BIT` colour and left/right encoding. Only + consulted for `TwoMin` and `Large` blocks (min-size blocks have no + second entry and are not in the Range tree). + - Each adapter satisfies the existing `RBTree` Rep concept (the same + set of operations `BuddyChunkRep` satisfies for `largebuddyrange`'s + tree). 
+- `template class BackendArena` with the following API: + - `add_block(addr, size_chunks) -> stl::Pair` — + returns `{0, 0}` if the block was absorbed; returns + `{overflow_addr, overflow_size}` (non-zero) for the portion the + arena cannot index. Mirrors `Buddy::add_block`'s overflow-return + contract; the caller (a future `BackendArenaRange` wrapper) is + responsible for handling overflow. Overflow arises in two cases: + (i) the input is oversized — `size_chunks >= 2^MAX_CHUNKS_BITS` + is split and the excess returned; the absorbed prefix continues + through consolidation. (ii) **consolidation grew the block to + arena scale** — if neighbour coalescing produces a range of + `2^MAX_CHUNKS_BITS` chunks (the entire arena), that consolidated + range is returned as overflow because the bitmap / per-sc tables + are sized for `< 2^MAX_CHUNKS_BITS`. In case (ii) the merged + neighbours are already removed from the trees before the return. + - `remove_block(size_t n_chunks) -> stl::Pair` — + returns `{0, 0}` if nothing serves the request; otherwise the + `(base, size)` of the aligned request range returned by + `BackendArenaBins::carve(...).req`. Phase 3 leaves this + a stub; Phase 4 implements it (popping a larger block and carving + internally as needed). +- The invariant method (initially a no-op). +- **Static assertions** pinning the relationship between BackendArena's + bound and the bin-scheme's representable maximum: + - `static_assert(MAX_CHUNKS_BITS < bits::BITS, ...)` so that + `2^MAX_CHUNKS_BITS` and `bits::one_at_bit(MAX_CHUNKS_BITS)` are + representable in `size_t`. + - `static_assert(bits::one_at_bit(MAX_CHUNKS_BITS) <= + BackendArenaBins::max_supported_chunks() + 1, ...)` so that + every block size the arena can hold (strictly less than + `2^MAX_CHUNKS_BITS`) is classifiable by `bin_index` / + `bitmap_info_for_request` / `carve_info_for_request` without + hitting their upper-bound assertions. 
+ +Create a mock Rep in the test using a fixed-size pagemap array (in the +spirit of `redblack.cc`'s `array[2048]`). The mock Rep implements +`get_variant`/`set_variant` and the word/size accessors on its array; +no probing API to implement. + +This phase does **not** modify `BuddyChunkRep` or `largebuddyrange.h`. +All new encoding documentation lives in `backend_arena.h` next to the +new Rep concept. + +**Test gate**: "accessor smoke test" inside +`src/test/func/backend_arena/backend_arena.cc`: + +- Write each variant tag (`Min`, `TwoMin`, `Large`) at a chunk address, + read it back; assert each round-trip preserves the value and does + not corrupt `RED_BIT` or other reserved bits. +- **Cross-preservation**: in the first-entry word, interleave + `set_variant`, Bin-node `set` (writing left/right), and `set_red` in + every order, then verify all three values round-trip correctly. +- Write Bin node fields (left/right pointer, colour bit), read back + unchanged. +- Write Range node fields, read back unchanged. +- Write a precise chunk count in the third entry, read back unchanged. +- `BackendArena` instantiates for several `K`, and its + `invariant()` returns true on an empty arena. +- `add_block(addr, size_chunks)` for `size_chunks >= 2^K` returns the + overflow portion via its return value; `add_block` for + `size_chunks < 2^K` returns `{0, 0}`. + +**Review gate**: spec slice = "Block size variants and pagemap encoding", +"Write ordering within add/remove", and the Phase 3 section above. 
+Reviewer checks: `BackendArenaRep` concept is the minimum needed to +express the data structure (no leak of internal `RBTree` Rep shape into +user-facing `BackendArenaRep`); `BinRep`/`RangeRep` adapters preserve +`RED_BIT` and variant-tag bits on writes; chunk-keyed API used +consistently; bit-position choices documented in `backend_arena.h` (not +in `BuddyChunkRep`); no `` or `std::` types in +production headers (use `` and `snmalloc::stl::*`); +`SNMALLOC_*` macros used in place of raw compiler attributes. + +### Phase 4: Full add_block / remove_block with carving and consolidation + +Implement the full data-structure semantics in one step. Carving on +`remove_block` and consolidation on `add_block` are interdependent for +the maximally-consolidated invariant — carving without consolidation +produces adjacent free remainders that violate the invariant — so they +land together. The reuse-`P`'s-Range-entry optimisation is deferred to +Phase 5; this phase uses the simple "remove + reinsert" strategy for +every merge. + +**Ownership and serialisation**: `BackendArena` mutations are +serialised and owned at this layer (the upper layer that wraps an +arena holds exclusive access for the duration of a single +`add_block` / `remove_block`). Transient bitmap states during one +operation — e.g. a bin cleared just before its remainder is re-added +to the same bin — are never observable to a concurrent reader. The +Bin / Range trees and the `Bitmap` are per-arena and not concurrent +in this design (per the broader plan: the bitmap and trees lead +indexing; the pagemap is not probed for routing). + +**Tree mutation contracts**: Bin and Range trees are intrusive +red-black trees backed directly by the pagemap (per the +`backend_arena.h` Rep). `insert` and `remove` are allocation-free and +infallible for well-formed inputs; duplicate insertion and removal of +a non-present node are programmer errors and assert. 
The +`bitmap.add(range) -> bin_id` step followed by +`bin_trees[bin_id].insert(range)` cannot leave a set bitmap bit +without a corresponding tree entry, because `insert` cannot fail. + +- `add_block(addr, size_chunks)`: + - If `size_chunks >= 2^MAX_CHUNKS_BITS`, return the excess as overflow + per the Phase 3 contract; the absorbed prefix continues below. + - Find adjacencies per the "Adjacency lookup" rules (one + `Range.neighbours` call + at most two `MinSizeBin.find` calls). + - For each merge case (P-only, S-only, P+S, all combinations of + min/non-min P and S), update the trees in this order: + 1. For each merged neighbour `n_range`: + `size_t n_bin = bitmap.add(n_range);` — idempotent classify + (the bit is already set since `n_range` is in the tree); this + is the only public way to obtain the neighbour's bin id. + Then `bin_trees[n_bin].remove(n_range)` (and remove from the + Range tree if non-min). + **Then** `if (bin_trees[n_bin].empty()) bitmap.clear(n_bin);`. + The cleared bin id is the *neighbour's old bin*, not the + consolidated bin. + 2. Compute the consolidated range `c_range`. If + `c_range.size >= 2^MAX_CHUNKS_BITS` (this can only happen when + the entire arena has coalesced into one free block, giving + `c_range.size == 2^MAX_CHUNKS_BITS`), the consolidated range + exceeds what the arena can index — the bitmap bin space and + per-sc tables are sized for `< 2^MAX_CHUNKS_BITS`. Return + `c_range` as overflow to the caller (the merged neighbours + have already been removed in step 1; the arena is now empty + and the caller — typically a future `BackendArenaRange` + wrapper — decides whether to return the arena to its parent + pool). Otherwise: + `size_t c_bin = bitmap.add(c_range)` and + `bin_trees[c_bin].insert(c_range)`. + - Write variant tag and (for `Large`) precise chunk count before + inserting into the Bin tree, per "Write ordering within add/remove". 
+- `remove_block(size_t n_chunks)`: + - `size_t bin_id = bitmap.find_for_request(n_chunks);` — returns + `SIZE_MAX` if no bin in this arena serves the request. + - Pop the lowest-address block as `range_t block` from + `bin_trees[bin_id]` (`remove_min`); if the tree is now empty, + `bitmap.clear(bin_id)`. + - `auto c = BackendArenaBins::carve(block, n_chunks);` — splits + into `pre` / `req` / `post`. + - For each non-empty remainder (`c.pre`, `c.post`), call `add_block` + on it. Remainders may have arbitrary, non-class chunk counts; + `Bitmap::add` (called inside `add_block`) handles this via + `bin_index`. Consolidation cannot extend a remainder back into + the request range or its sibling remainder: the popped block is + gone from the trees, and `pre`, `req`, and `post` are mutually + contiguous (a remainder's neighbour on the request side is the + just-returned `req` range, which is *not* free). + - Return `c.req`. + +Full invariant enabled, including the **maximally consolidated** clause. + +**Test gate** — new test +`src/test/func/backend_arena/backend_arena.cc` (top-level test-glob +discovery, matching `src/test/func/redblack/`): + +- Unit tests for each consolidation case: P-only, S-only, P+S, with + min/min, min/non-min, non-min/min, non-min/non-min combinations of + P and S. +- Carving tests: request a size strictly smaller than any free block; + verify the returned block has the requested size class, that the + prefix/suffix remainders are correctly classified, and that bin / + Range tree / pagemap variant tags are consistent. +- Overflow tests: + - **Oversized input**: `add_block` for `size_chunks >= + 2^MAX_CHUNKS_BITS` returns the unabsorbed portion; smaller blocks + return `{0, 0}`. 
+ - **Consolidation-grows-to-arena-scale**: fill the arena from + multiple sub-arena pieces such that the last `add_block` makes + the running consolidation cover the whole arena + (`c_range.size == 2^MAX_CHUNKS_BITS`); assert the consolidated + range is returned as overflow, and that both trees and the + bitmap are empty after the call. +- Smoke test: insert N blocks, remove N blocks via exact size-class + requests; final state is empty. +- Randomised stress test: random `add_block(addr, size_chunks)` / + `remove_block(n_chunks)` sequence against an oracle that models the + **same** selection rule — "smallest serving bin via + `bitmap.find_for_request`, lowest-address block within that bin, + carve via `BackendArenaBins::carve`". The oracle is implemented as + a sorted map of free `(addr, size_chunks)` pairs (using whatever the + existing test files use; `redblack.cc` already uses `std::set`). + After each operation, both `invariant()` and a structural comparison + against the oracle must pass. + +**Review gate**: spec slice = "Adjacency lookup", "Consolidation: +reusing tree entries when possible" (note: this phase uses the simple +strategy only — reviewer should flag any premature optimisation), +"Min-size special case", "Write ordering within add/remove", +"Invariants", and the Phase 4 section above. Reviewer checks: all eight +P/S min×non-min consolidation cases handled; write ordering correct +(variant tag and precise size written before tree insertion; removal +from trees before pagemap reuse); maximally-consolidated invariant +holds after every operation; remove → carve → re-add path does not +infinitely recurse (a remainder block re-entering `add_block` cannot +itself consolidate into something larger than what was just popped, but +this should be argued explicitly). 
+ +### Phase 5: Consolidation — reuse predecessor's Range entry (optimisation) + +Switch the P-merge case to reuse `P`'s Range tree node (no RB mutation), +but **only when `P` is non-min** (a min-size `P` has no Range entry to +reuse). The S-only case continues to use remove+reinsert. The P+S case +reuses `P` (when non-min) and removes `S`. When `P` is min-size, the +merged block is inserted into the Range tree normally. + +**Test gate**: all Phase 4 tests still pass. Add debug-only counters at +the `BackendArena` layer (not inside `RBTree`) for "Range tree +`insert_path` calls" and "Range tree `remove_path` calls" during +`add_block` / `remove_block`. Assert that: + +- the non-min-P-only consolidation case records zero + Range-tree insert/remove calls (the existing node is reused in place), +- the min-P-only consolidation case records exactly one Range-tree insert + (no remove), +- the S-only consolidation case records one insert and one remove. + +This avoids any modification to `RBTree` itself — the counter increments +sit in the `BackendArena` wrappers around its Range-tree calls. + +**Review gate**: spec slice = "Consolidation: reusing tree entries when +possible" and the Phase 5 section above. Reviewer checks: reuse path +correctly leaves the Range-tree node in place (key unchanged, only the +back-reference from the new combined block); min-P case correctly falls +back to normal insert; counter assertions cover the cases that +distinguish the optimised path from the simple path; no regression of +Phase 4's full invariant + oracle randomised test. + +### Phase 6: Multi-instance test + +Instantiate two `BackendArena` over disjoint address ranges in +the same test process, drive workloads against both, verify each +invariant independently. + +**Test gate**: multi-instance test passes; total memory accounted for +via both instances matches expectations. 
+ +### Phase 7: Final review and self-review + +Per `claude.md` mandatory review checkpoints: + +- Run the recursive principle check (self-review). +- Spawn a fresh-context reviewer subagent. Address findings. Loop until + reviewer finds no issues. + +**Test gate**: full ctest run (Debug) passes; reviewer reports no issues. + +## Files added / changed (anticipated) + +- New: `src/snmalloc/backend_helpers/backend_arena_bins.h` — + `range_t`, `carve_t`, `carve`, `max_supported_chunks`, and nested + `Bitmap` with `add` / `find_for_request` / `clear` (public surface); + the size-class encoding (`bitmap_info_t`, `carve_info_t`, constexpr + `BinTable`, `bitmap_info_for_request` / `carve_info_for_request`, + `bin_index`) is private and reachable via + `BackendArenaBinsTestAccess` (forward-declared in the header, + defined in the test cc) for unit tests. Templated on + `INTERMEDIATE_BITS` for testability. +- New: `src/snmalloc/backend_helpers/backend_arena.h` — the data structure, + templated on a `BackendArenaRep` concept exposing variant-tag and + node/size accessors (no pagemap-probing API). +- New: `src/test/func/backend_arena_bins/backend_arena_bins.cc` — bin + classification tests and `find_for_request` tests for + `B ∈ {1, 2, 3}`, using `bin_subsets` as the canonical "serves" + predicate. +- New: `src/test/func/backend_arena/backend_arena.cc` — data-structure + tests with a mock Rep (array-backed pagemap, modelled on `redblack.cc`). +- Modified: `src/snmalloc/ds_core/redblacktree.h` — `neighbours(K)` + helper on `RBTree` returning `(largest < K, smallest > K)` in one walk. +- Modified: `src/test/func/redblack/redblack.cc` — randomised + `neighbours(K)` tests against `std::set::lower_bound` / + `upper_bound` as oracle. + +No production code path is changed in this phase: the existing +`LargeBuddyRange` continues to be the active large-block allocator. + +## Resolved during plan review + +- One Bin tree per IDEA servable-set bin (not per size class or per + exponent). 
+- Scope is the BackendArena data structure + tests only. +- The pagemap encoding carries a 2-bit **variant tag** + (`Min` / `TwoMin` / `Large`) on the first entry of each free block. + Tree membership — not the tag — is the source of truth for "is this + block free?". No transient `BackendOwned` / "claimed" tag is required. +- **No pagemap probing.** All adjacency lookups are restricted to this + `BackendArena`'s own RBTrees: non-min neighbours come from a single + `Range.neighbours(addr_A)` walk that returns both + `(largest < addr_A, smallest > addr_A)`; min-size neighbours come from + `MinSizeBin.find(addr_A ± MIN_CHUNK_SIZE)`. The pagemap is never read + at speculative addresses (concurrency hazard and no defined contract + for pagemap entries the BackendArena does not own). +- Free blocks may have **arbitrary chunk counts**, not just exact + size-class sizes — carving produces non-class remainders. `bin_index` + operates on `(addr_chunks, size_chunks)` pairs; `Large` blocks store + their precise chunk count in the third pagemap entry. +- Write-ordering rule: when adding a free block, the variant tag and any + auxiliary fields are written before the final RB-tree insertion that + makes the block reachable; when removing, the block is unlinked from + its trees before its pagemap entries are reused. +- Predecessor-Range-entry-reuse only applies when `P` is non-min. +- `add_block` returns `{0, 0}` on success; on overflow it returns the + unabsorbed range, mirroring `Buddy::add_block`'s overflow-return + contract. Overflow arises either when `size_chunks >= + 2^MAX_CHUNKS_BITS` on input (excess returned) or when consolidation + grows a coalesced block to exactly `2^MAX_CHUNKS_BITS` (the + consolidated range is returned, neighbours having been removed + first). The future `BackendArenaRange` wrapper is responsible for + handling overflow; the standalone `BackendArena` only exposes the + contract. 
+- `BackendArenaRep` is a chunk-keyed accessor concept (variant tag plus + word/size accessors for entries 1–3). `BackendArena` builds two + internal `RBTree`-Rep adapters (`BinRep`, `RangeRep`) over it; user + code never sees the adapter shape. +- Backend chunk size classes are a new chunk-unit size-class scheme in + `backend_arena_bins.h` (not bytes), independent of the + power-of-two-only large variant of front-end `sizeclass_t`, with + low-exponent special cases handled in the spirit of + `bits::from_exp_mant`. +- `BackendArena` uses chunk-count + exponent bounds with **exclusive max** semantics, matching the existing + `Buddy<..., MIN, MAX>`. +- Multi-`B` testing is via a templated bin-table generator in a single + test binary, not via separate CMake configurations. +- Phase 5 verifies the reuse optimisation via Range-tree insert/remove + *call counters* at the `BackendArena` layer (no `RBTree` modification). + +## Still open (resolve during implementation) + +- Exact bit positions in the first-word pagemap encoding for the + variant-tag field (Phase 3 decides; documented only in + `backend_arena.h`). +- Whether Bin tree roots are stored flat (`Array`) + or exponent-keyed (`Array, NUM_EXPS>`). + Decide in Phase 3 when `BackendArena` is built; this is an internal + detail of `BackendArena` (bin id is a flat `size_t` returned by + `Bitmap::add` / `Bitmap::find_for_request`, so the choice does not + leak into `BackendArenaBins` or the Rep concept). +- Whether the future memcpy `offset` field is best placed in the second + word of every pagemap entry, in dedicated entries, or in a side table. + Out of scope for this phase; flagged for the memcpy-fix plan to design. +- Whether `INTERMEDIATE_BITS=4` (34 bins/exp) needs to be tested in this + phase. Currently `B ∈ {1, 2, 3}` only. 
+ + diff --git a/claude.md b/claude.md new file mode 100644 index 000000000..05a6398e4 --- /dev/null +++ b/claude.md @@ -0,0 +1,153 @@ +# Claude AI Guidelines for snmalloc + +## Working Style + +**Complete the plan, then check in**: When a plan is approved, execute all +steps to completion. Don't stop after each step for review. When you think +you're done, recursively apply all relevant principles from this file — check +each one, act on any that apply, then check again until no more principles +are relevant. Only then report completion and wait for feedback. + +**Plans require discussion before implementation**: After devising a plan +(whether in plan mode or not), run the review loop (see "Mandatory review +checkpoints") before presenting it. Do NOT proceed to implementation until +the plan has been seen and explicitly approved. + +**Store plans in PLAN.md**: Always write plans to `PLAN.md` in the repository +root so that context survives session boundaries. Update (not append to) the +file when the plan evolves. This is the single source of truth for what is +planned and what has been completed. + +**Baseline the checkout before starting work**: Before beginning implementation +of any plan, verify that the current checkout builds and passes tests. Run the +build and test suite (per `skills/building_and_testing.md`) and record the +results. If the baseline is broken, report the failures and stop — do not start +implementation on a broken base. Pre-existing failures that are not caused by +your changes must be acknowledged upfront so they are not confused with +regressions introduced by the plan. This establishes the ground truth against +which your changes will be measured. + +**Every plan step must have a test gate**: Each step in a plan must produce +a testable result — a test, a build check, or a verifiable property — that +acts as the gate to the next step. Do not move to step N+1 until step N's +gate passes. 
This catches integration issues incrementally rather than +deferring all testing to the end. When writing a plan, structure it so that +independently testable components are implemented and verified first, and +later steps build on proven foundations. + +**Mandatory review checkpoints**: At each of these points, run the full +review loop — spawn a fresh-context reviewer subagent, address findings, +spawn another fresh reviewer, repeat until a reviewer finds no issues. When +you disagree with a reviewer's finding, escalate — do not resolve disputes +unilaterally. Do not proceed past a checkpoint without a clean review. +1. **After devising a plan**, before presenting it for discussion. For plan + reviews, adapt the reviewer prompt: instead of reading changed files and + running tests, the reviewer should read the plan document, read existing + code the plan references, verify assumptions about the codebase, and check + for structural gaps (missing steps, naming conflicts, incorrect + dependencies). +2. **After completing implementation and self-review**, before opening a PR. + +The only exception: if you believe a change is truly trivial (a typo fix, a +one-line config change), ask for permission to skip the review. Do not decide +on your own that something is trivial enough to skip. When in doubt, run the +review. + +**Go slow to go fast**: Before starting implementation work, identify and state +which principles from these instructions are most relevant to the current task. +This surfaces the right guidelines before they're needed rather than +rediscovering them after a mistake. + +**Challenge me when the evidence says I'm wrong**: If a reviewer flags something +that contradicts what I said, or if you have concrete evidence that an +instruction is incorrect, raise it — don't silently comply. Present the evidence +and discuss it. 
+ +**Research findings belong in the plan**: If research or exploration surfaces +issues beyond the original task (inaccurate comments, dead code, related bugs), +include them as explicit plan steps — don't just mention them in the analysis +and move on. Anything worth noting is worth acting on or explicitly deferring. + +**Self-review is part of done**: The recursive principle check described in +"Complete the plan, then check in" IS the self-review. It's not a separate +step — it's what "done" means. Never report completion without having done it. + +**During reviewer loops**: At any point during the review loop — when fixing +findings, when unsure about a reviewer's suggestion, when making tradeoff +decisions — stop and ask. The automated review removes me as a gatekeeper, not +as a collaborator. + +## Debugging Principles + +1. **Logging is essential** - When debugging issues in allocator code, add tracing to identify the exact point of failure. Use `write()` directly to stderr/file rather than `printf`/`message` to avoid recursion through the allocator. + +2. **New code is most likely at fault** - When tests fail after changes, assume the new code introduced the bug. Don't blame existing infrastructure that was working before. + +3. **Baseline against origin/main** - Before assuming a system-wide issue, verify the test passes on `origin/main`. This confirms whether the issue is a regression introduced by your changes. + +4. **Check the whole PR for patterns** - When fixing a bug of a specific shape (e.g., "one-armed `if constexpr` causes MSVC C4702"), immediately search all changed files in the PR for the same pattern. Fix all instances at once rather than waiting for CI to report each one individually. + +5. **Verify hypotheses before acting** - A hypothesis about a bug's cause is not knowledge — it's a guess. Before investing effort in workarounds or fixes, validate empirically that your suspected cause is actually the cause. 
Read the code more carefully, write a minimal reproducer, or examine the actual data. Verify first, then act. + +6. **CI is the source of truth for build status** - A local build failure does not mean the build is broken. Local toolchain versions, stale dependency caches, and environment differences can all cause local failures that don't reproduce in CI. Never declare a build "broken on main" based on local results — check CI first. + +## Code Quality + +- **Use cross-platform macros from `defines.h`** - Never use raw compiler attributes like `__attribute__((used))` or `__forceinline` directly. Instead use the corresponding `SNMALLOC_*` macros (e.g., `SNMALLOC_USED_FUNCTION`, `SNMALLOC_FAST_PATH`, `SNMALLOC_SLOW_PATH`, `SNMALLOC_PURE`, `SNMALLOC_COLD`, `SNMALLOC_UNUSED_FUNCTION`, `ALWAYSINLINE`, `NOINLINE`). These are defined in `ds_core/defines.h` with correct expansions for MSVC, GCC, and Clang. + +- **Don't encode platform assumptions** - Avoid hardcoding limits like "48-bit address space" or "256 TiB max allocation". These assumptions may not hold on future platforms (56-bit, 64-bit address spaces, CHERI, etc.). + +- **Trust the existing bounds checks** - snmalloc already has appropriate bounds checking at API boundaries. New internal code should defer to the backend for edge cases rather than adding redundant checks. + +- **Guard new data structures** - When adding caches or intermediate layers, ensure they handle all input ranges correctly, including sizes larger than what they cache. Return early/bypass for out-of-range inputs. + +- **Keep headers minimal** - Each header should only include what it directly needs. Avoid adding transitive includes "for convenience" — if a header's own declarations only need ``, don't pull in heavier internal headers. Includers are responsible for their own dependencies. This keeps compile times low and dependency graphs clean. 
+ +- **No C++ STL or C++ standard library headers** - snmalloc must be compilable as part of a libc implementation, so it cannot depend on an external C++ STL. Never use headers like ``, ``, ``, ``, etc. directly. Instead use the C equivalents (``, ``) or snmalloc's own STL wrappers in `src/snmalloc/stl/` (e.g., `snmalloc/stl/type_traits.h`, `snmalloc/stl/atomic.h`, `snmalloc/stl/array.h`). These wrappers have both a `gnu/` backend (no C++ STL dependency) and a `cxx/` backend, selected at build time. + +- **Prefer explicit over implicit** - Avoid relying on implicit conversions, convention-based wiring, or unnamed dependencies. A few extra characters of explicit code is almost always cheaper than someone later needing to reconstruct the hidden knowledge. This is especially relevant in C++ with its many implicit conversion paths and template magic. + +- **Document coupling at the point of breakage** - When code A depends on the internal behaviour of code B (read sequence, execution order, size assumptions), put the comment on B — that's where a future maintainer would make a breaking change. Commenting at A doesn't help because the person changing B won't be reading A. + +- **Design for changeability, not for predicted changes** - Make designs modular and replaceable so future needs can be accommodated, but don't add abstractions, extension points, or features for changes that haven't happened yet. The goal is a design that's easy to modify, not one that anticipates specific modifications. + +- **Comments earn their length by carrying correctness-relevant information** - A comment exists to convey something the reader cannot recover from the code — a non-obvious invariant, a subtle correctness argument, a coupling that breaks if edited. If you cannot name what the comment teaches that the code does not, cut it. 
+ +- **Test scaffolding does not live in production headers** - A header that needs a friend struct purely for testing carries only the forward declaration and the friend grant; the body lives in test code. + +- **Store data in the form the consumer uses** - If a derived value is only consumed pre-shifted, pre-negated, or pre-masked, store it that way at build time. The cost moves from every consumer call to one build-time loop. + +- **Algorithms must only touch state whose synchronisation they own** - Speculative reads of globally shared, concurrently modified data are races, not "fast probes". If you need information from outside your lock, design without it or pull it through the structure's own discipline. + +- **Verify language and library claims before stating them** - When a comment or rationale invokes a language rule, check it. Load-bearing claims that are wrong make the explanation actively misleading. + +## Code Change Discipline + +- **Read before modifying** - Do not propose changes to code you haven't read. Understand existing code before suggesting modifications. + +- **Prefer editing over creating** - Edit existing files rather than creating new ones. This prevents file bloat and builds on existing work. + +- **Avoid over-engineering** - Only make changes that are directly requested or clearly necessary. Don't add error handling for scenarios that can't happen. Don't add docstrings or comments to code you didn't change. Don't create helpers or abstractions for one-time operations. Three similar lines of code is better than a premature abstraction. + +- **Evaluate copied patterns, don't cargo-cult** - When reusing a pattern from existing snmalloc code, evaluate each choice (`constexpr` vs runtime, template vs function parameter, etc.) in the context of the new usage. The original may have had reasons that don't apply, or it may have been a mistake. Copy the intent, not the incidental choices. 
Conventions (legal headers, naming schemes, file organisation) should be followed for consistency; technical patterns should be evaluated on merit. + +- **Fix what your change makes stale** - When a change invalidates something elsewhere — a comment, a test description, documentation — fix it in the same PR. Stale artefacts left behind are bugs in the making, and "I didn't modify that line" isn't an excuse when your change is what made it wrong. + +- **Document the code, not the change** - Comments and documentation describe how the code IS, not how it was changed or why it differs from a previous version. Don't leave comments explaining "we removed X" or "this was changed from Y" — a reader shouldn't need the git history to understand the code. If code needs context about alternatives or design decisions, put that in design docs, not inline comments. + +- **Don't fragment atomic refactors** - If a change has a single commit-able outcome and a small footprint, do it as one step. Multi-step plans are for refactors whose intermediate states are independently testable. + +## Building, Testing, and Benchmarking + +All build, test, and benchmarking guidance lives in `skills/building_and_testing.md`. + +**Delegate testing to a subagent.** When it is time to build and run tests, +spawn a subagent whose prompt includes the contents of +`skills/building_and_testing.md` and tells it which tests to run (or "run the +full suite"). Do NOT include implementation context — the subagent must not +know what code changed. This prevents the tester from rationalising failures +as related to the changes instead of reporting them objectively. + +The subagent will report back: which tests passed, which failed, exact +commands, and full output of any failures. If failures are reported, treat +them as actionable per the failure protocol in the skill file. 
diff --git a/src/snmalloc/backend_helpers/backend_arena_bins.h b/src/snmalloc/backend_helpers/backend_arena_bins.h new file mode 100644 index 000000000..57ab27a03 --- /dev/null +++ b/src/snmalloc/backend_helpers/backend_arena_bins.h @@ -0,0 +1,755 @@ +#pragma once + +#include "../ds_core/bits.h" +#include "../ds_core/helpers.h" + +#include + +namespace snmalloc +{ + template + struct BackendArenaBinsTestAccess; + + /** + * Chunk size class enumeration and bin classification used by the + * BackendArena. + * + * Template parameter B (mantissa-bit width of snmalloc's + * non-power-of-two size class scheme) determines the number of + * RB-trees per exponent — the count of distinct servable subsets a + * free block can occupy at that exponent: B=1 -> 2; B=2 -> 5; + * B=3 -> 13. The canonical within-exponent bin numbering matches + * `prototype/skip_analysis.py`. All bin-scheme metadata derives + * constexpr from a single per-bin subsets table, `bin_subsets`. + * + * Public surface: + * - `range_t`, `carve_t`: chunk-count ranges and carve output. + * - `carve(block, n_chunks)`: split a block into pre-pad / aligned + * request / post-pad. + * - `max_supported_chunks()`: upper bound on legal request sizes. + * - nested `Bitmap`: per-arena non-empty-bins bitmap with + * `add` / `find_for_request` / `clear`. + * + * Everything else is private; tests reach it via + * `BackendArenaBinsTestAccess`. + */ + template + class BackendArenaBins + { + static_assert( + INTERMEDIATE_BITS >= 1 && INTERMEDIATE_BITS <= 3, + "BackendArenaBins currently supports B in {1, 2, 3}"); + + public: + /// (base, size) chunk-count range. `size == 0` means empty (base + /// is unspecified). + struct range_t + { + size_t base; + size_t size; + }; + + /// Output of `carve`: pre-pad / aligned request / post-pad. + /// Either or both of `pre`/`post` may be empty. 
+ struct carve_t + { + range_t pre; + range_t req; + range_t post; + }; + + private: + friend struct BackendArenaBinsTestAccess; + + static constexpr size_t B = INTERMEDIATE_BITS; + + /// Number of mantissa positions per regular exponent (= 2^B). + static constexpr size_t MANTISSAS_PER_EXP = size_t(1) << B; + + /// Number of distinct servable-subset bins per exponent + /// (from prototype/skip_analysis.py). + static constexpr size_t BINS_PER_EXP = (B == 1) ? 2 : + (B == 2) ? 5 : + (B == 3) ? 13 : + 0; + + /// Size of the per-sc info tables. One past the largest raw id from + /// `bits::to_exp_mant_const` whose decoded size fits in + /// `size_t` (the architectural max raw id decodes to `2^bits::BITS`, + /// which overflows). + static constexpr size_t MAX_SC = + ((bits::BITS - B) << B) + ((size_t(1) << B) - 1); + + /** + * Per-SC bitmap-scan record, read by `Bitmap::find_for_request`. + * Fields are pre-shifted into the bitmap's word layout so the + * search hot path is two ANDs. + * + * - `start_word`: bitmap word containing this SC's start bin. + * - `first_mask`: serve mask pre-shifted into `start_word`. Bit + * `i` set iff `words_[start_word]` bit `i` serves this SC. + * - `second_mask`: serve mask carried into `start_word + 1`. When + * the start bin is word-aligned there is no within-exp carry + * and bits there are all higher-exponent, so `second_mask == ~0`. + * + * `alignas(4 * sizeof(size_t))` rounds `sizeof(bitmap_info_t)` up + * to a power of two so `table_.bitmap_info[sc]` indexes with a + * single shift+add. + * + * A *bin* (single bit in `Bitmap`) has no size/alignment of its + * own; it may be set on behalf of any SC whose subset includes it. 
+ */ + struct alignas(4 * sizeof(size_t)) bitmap_info_t + { + size_t start_word; + size_t first_mask; + size_t second_mask; + }; + + static_assert( + sizeof(bitmap_info_t) == 4 * sizeof(size_t), + "bitmap_info_t must be 4*size_t so table_.bitmap_info[sc] indexes " + "with a single shift+add; revisit the alignas if fields change"); + + /** + * Per-SC carve record, read by `carve` and by `bin_offset_at`'s + * `fits` predicate (free-side cascade walk via `bin_index`). + * + * - `size_chunks`: size this SC promises on allocation. + * - `align_chunks`: natural alignment (a power of two, derived + * from `size_chunks`). + */ + struct carve_info_t + { + size_t size_chunks; + size_t align_chunks; + }; + + static_assert( + sizeof(carve_info_t) == 2 * sizeof(size_t), + "carve_info_t must be 2*size_t so table_.carve_info[sc] indexes " + "with a single shift+add"); + + /** + * Map a request size to its bitmap-scan record. + * + * `n_chunks` must be in `[1, max_supported_chunks()]`. + * Not `constexpr`: uses `bits::clz` intrinsic via `bits::to_exp_mant` + * to stay single-cycle on the fast path. + */ + SNMALLOC_FAST_PATH static const bitmap_info_t& + bitmap_info_for_request(size_t n_chunks) + { + SNMALLOC_ASSERT(n_chunks >= 1); + SNMALLOC_ASSERT(n_chunks <= max_supported_chunks()); + size_t raw = bits::to_exp_mant(n_chunks); + SNMALLOC_ASSERT(raw < MAX_SC); + return table_.bitmap_info[raw]; + } + + /// Map a request size to its carve record. Preconditions and + /// properties as `bitmap_info_for_request`. + SNMALLOC_FAST_PATH static const carve_info_t& + carve_info_for_request(size_t n_chunks) + { + SNMALLOC_ASSERT(n_chunks >= 1); + SNMALLOC_ASSERT(n_chunks <= max_supported_chunks()); + size_t raw = bits::to_exp_mant(n_chunks); + SNMALLOC_ASSERT(raw < MAX_SC); + return table_.carve_info[raw]; + } + + /** + * Bin id of `block`. Operates on arbitrary chunk counts, not just + * exact size classes. `block.size` must be >= 1. 
+ * + * A bin id at exponent `e` identifies the *servable set*: the + * subset of SCs at `e` that `block` could serve. Two blocks with + * the same servable set at the same exponent share a bin id. + * + * The natural exponent is `e = prev_pow2_bits(block.size)`. If + * alignment padding eats every SC there, we drop to `e - 1`, + * which is guaranteed to fit: its smallest SC has size and + * alignment `2^(e-1)`, so worst-case `size + pad < 2^e <= + * block.size`. One drop is always enough. + * + * Not `constexpr`: uses `bits::clz` via `bits::prev_pow2_bits`. + */ + SNMALLOC_FAST_PATH static size_t bin_index(range_t block) + { + SNMALLOC_ASSERT(block.size >= 1); + + size_t e = bits::prev_pow2_bits(block.size); + size_t offset = bin_offset_at(block.base, block.size, e); + if (SNMALLOC_UNLIKELY(offset == BINS_PER_EXP)) + { + // Padding ate the natural exponent. Drop one and retry. Proof + // of single-step termination is in the doc comment above. + SNMALLOC_ASSERT(e > 0); + e--; + offset = bin_offset_at(block.base, block.size, e); + SNMALLOC_ASSERT(offset != BINS_PER_EXP); + } + return table_.exp_bin_base[e] + offset; + } + + public: + /// Largest `n_chunks` legal for `carve` / `Bitmap::find_for_request`. + static constexpr size_t max_supported_chunks() + { + return bits::from_exp_mant(MAX_SC - 1); + } + + /** + * Carve a free block into pre-pad / aligned request / post-pad. + * + * Preconditions (caller must have used `Bitmap::find_for_request` + * to locate a servable bin): + * - `block.size > 0`, `n_chunks` in `[1, max_supported_chunks()]`, + * `block` large enough to fit the SC after aligning up. + * - `block.base + block.size` does not wrap. + * + * Pure: does not touch the bitmap or any tree. Either or both + * `pre` / `post` may have `size == 0`; their `base` is still set + * to the natural address so `pre.base + pre.size == req.base` and + * `req.base + req.size == post.base` (keeps caller adjacency + * checks simple). 
+ */ + SNMALLOC_FAST_PATH static carve_t carve(range_t block, size_t n_chunks) + { + SNMALLOC_ASSERT(n_chunks >= 1); + SNMALLOC_ASSERT(n_chunks <= max_supported_chunks()); + SNMALLOC_ASSERT(block.size > 0); + // Combined with the servability precondition, non-wrapping end + // ensures the alignment-up below does not wrap either. + SNMALLOC_ASSERT(block.base + block.size >= block.base); + + const carve_info_t& info = carve_info_for_request(n_chunks); + + size_t req_base = + (block.base + (info.align_chunks - 1)) & ~(info.align_chunks - 1); + size_t pre_size = req_base - block.base; + + SNMALLOC_ASSERT(pre_size <= block.size); + SNMALLOC_ASSERT(block.size - pre_size >= info.size_chunks); + + size_t post_base = req_base + info.size_chunks; + size_t post_size = (block.base + block.size) - post_base; + + carve_t result; + result.pre = {block.base, pre_size}; + result.req = {req_base, info.size_chunks}; + result.post = {post_base, post_size}; + return result; + } + + /** + * Bitmap of non-empty per-arena bins. One bit per bin id + * (`bin_index`'s output); set iff the corresponding RB-tree is + * non-empty. + * + * Three-method API: + * - `add(range_t)`: classify a block and set its bin's bit + * (idempotent on the bit; returns the bin id). + * - `find_for_request(n_chunks)`: smallest set bin whose blocks + * all serve `n_chunks`, or `SIZE_MAX` if none. + * - `clear(bin_id)`: mark empty. Caller must ensure the bin's + * tree is actually empty; the bitmap does not track contents. + * + * Not thread-safe: callers sharing an arena must serialise the + * add / find / clear sequence under an external mutex. + */ + class Bitmap + { + friend struct BackendArenaBinsTestAccess; + + public: + /// Strict upper bound on bin ids `bin_index` produces. Exposed + /// so callers can size parallel arrays (one RB-tree per bin id). + static constexpr size_t TOTAL_BINS = BINS_PER_EXP * bits::BITS; + + Bitmap() : words_{} {} + + /** + * Classify `block`, set its bin's bit, return the bin id. 
+ * + * Idempotent on bitmap state: if the bit is already set, this + * is a no-op (the bin id is still returned). + * + * The bitmap does NOT track which `(base, size)` ranges live in + * each bin's tree — the caller is responsible for inserting + * `block` into the appropriate tree. + */ + SNMALLOC_FAST_PATH size_t add(range_t block) + { + SNMALLOC_ASSERT(block.size >= 1); + SNMALLOC_ASSERT(block.size <= max_supported_chunks()); + size_t bin_id = bin_index(block); + SNMALLOC_ASSERT(bin_id < TOTAL_BINS); + words_[bin_id / bits::BITS] |= + (size_t(1) << (bin_id & (bits::BITS - 1))); + return bin_id; + } + + /// Mark bin `bin_id` empty. Caller must ensure the bin's tree + /// is actually empty; the bitmap does not consult the trees. + SNMALLOC_FAST_PATH void clear(size_t bin_id) + { + SNMALLOC_ASSERT(bin_id < TOTAL_BINS); + words_[bin_id / bits::BITS] &= + ~(size_t(1) << (bin_id & (bits::BITS - 1))); + } + + /** + * Smallest bin id whose set blocks all serve `n_chunks`, or + * `SIZE_MAX` if none. `n_chunks` in `[1, max_supported_chunks()]`. + * + * Invariant (static_assert below): `BINS_PER_EXP <= bits::BITS`, + * so the within-exponent range fits inside one word and the + * search straddles at most one word boundary. After the second + * word, every remaining word is purely higher-exponent. + */ + SNMALLOC_FAST_PATH size_t find_for_request(size_t n_chunks) const + { + const bitmap_info_t& info = bitmap_info_for_request(n_chunks); + SNMALLOC_ASSERT(info.start_word < NUM_BITMAP_WORDS); + + // First word: start bin + any within-exp neighbours in same word. + size_t word = info.start_word; + size_t bits = words_[word] & info.first_mask; + if (bits != 0) + return word * bits::BITS + bits::ctz(bits); + ++word; + if (word == NUM_BITMAP_WORDS) + return SIZE_MAX; + + // Second word: within-exp carry plus any higher-exp bits. 
+ bits = words_[word] & info.second_mask; + if (bits != 0) + return word * bits::BITS + bits::ctz(bits); + + // Remaining words: purely higher-exponent, any bit serves. + while (++word < NUM_BITMAP_WORDS) + if (words_[word] != 0) + return word * bits::BITS + bits::ctz(words_[word]); + return SIZE_MAX; + } + + private: + /// Number of size_t words backing the bitmap. Internal layout. + static constexpr size_t NUM_BITMAP_WORDS = + (TOTAL_BINS + bits::BITS - 1) / bits::BITS; + + static_assert( + TOTAL_BINS == BINS_PER_EXP * bits::BITS, + "Bitmap layout: TOTAL_BINS must be BINS_PER_EXP * bits::BITS so it " + "divides evenly into bits::BITS-sized words."); + static_assert( + NUM_BITMAP_WORDS == BINS_PER_EXP, + "Bitmap layout: with the canonical TOTAL_BINS, the word count is " + "exactly BINS_PER_EXP."); + static_assert( + TOTAL_BINS < SIZE_MAX, + "find_for_request returns SIZE_MAX as the 'no match' sentinel; " + "TOTAL_BINS must be strictly less than SIZE_MAX so no valid bin " + "id can collide with the sentinel."); + static_assert( + BINS_PER_EXP <= bits::BITS, + "find_for_request assumes the within-exponent range (at most " + "BINS_PER_EXP bins) fits inside a single word, so the search " + "straddles at most one word boundary. If a future B pushes " + "BINS_PER_EXP above bits::BITS, the two-word body must be " + "generalised to handle a multi-word straddle."); + + size_t words_[NUM_BITMAP_WORDS]; + }; + + private: + // Vocabulary used in the rest of the private implementation: + // + // exponent (e) : the bin-scheme exponent of a size; one axis of + // the size class grid. + // mantissa (m) : the within-exponent position, in + // [0, MANTISSAS_PER_EXP). The other axis. When + // passed as a single argument it is named `m` + // (e.g. `start_bin_offset_for_m(m)`). + // subset : a bitmask of mantissas. `bin_subsets[b]` is the + // set of mantissas bin offset `b` can serve. + // m_top : when discussing a particular bin, the maximum + // element of its subset. 
Used as the bucketing + // axis for the cascade (see `bin_offset_at`). + // m_test : a single-mantissa probe in a cascade step; + // chosen so the probe's outcome disambiguates + // one candidate bin from the rest. + + /** + * Single source of truth for the bin scheme. + * + * `bin_subsets[b]` is a bitmask of the mantissas bin offset `b` + * can serve: bit `m` set iff bin offset `b`'s servable subset + * contains mantissa `m`. The canonical bin numbering matches + * `prototype/skip_analysis.py`. Everything else in this file -- + * `start_bin_offset_for_m`, `serve_mask_for_m`, the per-SC + * `start_word` / `first_mask` / `second_mask`, and the per-m_top + * decision lists in `BinTable::cascade_steps` -- is derived + * (constexpr) from this table. + * + * Required invariant (checked at constexpr build time in + * `BinTable::BinTable`; violating it fails the build): for every + * `m_top`, the bins whose subset has `m_top` as max element form a + * strict containment chain when sorted by subset size descending. + * That is, the largest such subset properly contains the next, + * which properly contains the one after, and so on. The chain + * property is what makes the single-mantissa-probe cascade in + * `bin_offset_at` sufficient to disambiguate among them. + * + * If you edit the literals below, re-run + * `prototype/skip_analysis.py` to verify they still match the + * canonical numbering and chain property. 
+ */ + static constexpr ModArray bin_subsets = []() { + ModArray r{}; + if constexpr (B == 1) + { + // bin 0: {0} + // bin 1: {0,1} + r[0] = 0b01; + r[1] = 0b11; + } + else if constexpr (B == 2) + { + // bin 0: {0} bin 3: {0,1,2} + // bin 1: {1} bin 4: {0,1,2,3} + // bin 2: {0,1} + r[0] = 0b0001; + r[1] = 0b0010; + r[2] = 0b0011; + r[3] = 0b0111; + r[4] = 0b1111; + } + else /* B == 3 */ + { + // bin 0: {0} bin 7: {1,2,3,5} + // bin 1: {1} bin 8: {0,1,2,3,4} + // bin 2: {0,1} bin 9: {0,1,2,3,5} + // bin 3: {1,2} bin 10: {0,1,2,3,4,5} + // bin 4: {0,1,2} bin 11: {0,1,2,3,4,5,6} + // bin 5: {1,2,3} bin 12: {0,1,2,3,4,5,6,7} + // bin 6: {0,1,2,3} + r[0] = 0b00000001; + r[1] = 0b00000010; + r[2] = 0b00000011; + r[3] = 0b00000110; + r[4] = 0b00000111; + r[5] = 0b00001110; + r[6] = 0b00001111; + r[7] = 0b00101110; + r[8] = 0b00011111; + r[9] = 0b00101111; + r[10] = 0b00111111; + r[11] = 0b01111111; + r[12] = 0b11111111; + } + return r; + }(); + + /** + * First within-exponent bin offset whose subset contains mantissa + * `m`. Derived from `bin_subsets`. + * + * Combined with the per-exponent base, this is an SC's absolute + * start bin index: `start_bit = exp_bin_base[e] + + * start_bin_offset_for_m(m)`. The bitmap stores its low and high + * halves pre-shifted into the `bitmap_info_t::first_mask` / + * `second_mask` fields. + */ + static constexpr size_t start_bin_offset_for_m(size_t m) + { + size_t mask = size_t(1) << m; + for (size_t b = 0; b < BINS_PER_EXP; b++) + if (bin_subsets[b] & mask) + return b; + return BINS_PER_EXP; // unreachable: every m is in some subset + } + + /** + * Bitmask, relative to `start_bin_offset_for_m(m)`, of bins that + * serve `m`. Bit `k` is set iff bin offset + * `start_bin_offset_for_m(m) + k` serves a request whose + * within-exponent position is `m`. 
The start bin always serves + * (bit 0 set), within-exponent bins serve iff their subset + * contains `m`, and bins above the within-exponent range belong + * to higher exponents and always serve (high bits all 1). + * + * Built positively (set bit = "serve") rather than as a "skip" + * mask: the hot path in `Bitmap::find_for_request` AND's this + * mask (pre-shifted into `bitmap_info_t::first_mask` / `second_mask`) + * against the bitmap word without an intermediate NOT. + */ + static constexpr size_t serve_mask_for_m(size_t m) + { + size_t mask = size_t(1) << m; + size_t start = start_bin_offset_for_m(m); + size_t result = ~size_t(0); + for (size_t b = start + 1; b < BINS_PER_EXP; b++) + if (!(bin_subsets[b] & mask)) + result &= ~(size_t(1) << (b - start)); + return result; + } + + /// Constexpr popcount: small loop, used only at BinTable build time. + static constexpr size_t popcount_const(size_t x) + { + size_t n = 0; + while (x != 0) + { + n += (x & 1); + x >>= 1; + } + return n; + } + + /// One step of a per-m_top decision list used by `bin_offset_at`. + /// If `m_test == NO_TEST` (see below) or `fits(m_test)` is true, + /// return `bin`. + struct CascadeStep + { + size_t m_test; + size_t bin; + }; + + /// Sentinel for `CascadeStep::m_test` meaning "take this bin + /// unconditionally". Any value `>= MANTISSAS_PER_EXP` would do; the + /// fits() lambda would short-circuit it on `first + m >= past`, but + /// the explicit sentinel makes the walker's intent obvious and + /// avoids one unnecessary comparison. + static constexpr size_t NO_TEST = MANTISSAS_PER_EXP; + + /** + * Maximum decision-list length per `m_top`. Derived from + * `bin_subsets`: the largest number of bins sharing the same max + * subset element. Used to size `cascade_steps[m_top][]`; some + * `m_top` values have fewer candidates, leaving default-initialised + * slots at the end. Those slots are never reached because the + * preceding NO_TEST entry always returns. 
+ */ + static constexpr size_t MAX_CASCADE_STEPS = []() { + size_t mx = 0; + for (size_t m_top = 0; m_top < MANTISSAS_PER_EXP; m_top++) + { + size_t cnt = 0; + for (size_t b = 0; b < BINS_PER_EXP; b++) + { + // Bit m_top set and no higher bit set <=> max element is m_top. + if ((bin_subsets[b] >> m_top) == 1) + cnt++; + } + if (cnt > mx) + mx = cnt; + } + return mx; + }(); + + /** + * Within-exponent bin offset for a block at `addr_chunks` of length + * `n_chunks` at exponent `e`. Returns `BINS_PER_EXP` (sentinel) if + * no mantissa at this exponent fits. + * + * Walks `m_top` from `MANTISSAS_PER_EXP - 1` down. The first + * fitting `m_top` is the largest mantissa this block can serve; + * it is also the natural bucketing axis, because the bins whose + * subset has `m_top` as max element are exactly the candidates we + * still need to disambiguate among. `table_.cascade_steps[m_top]` + * (a constexpr-built decision list, derived from `bin_subsets`) + * disambiguates among them with at most a couple of secondary + * `fits` checks. + * + * Worst case: `MANTISSAS_PER_EXP + MAX_CASCADE_STEPS - 1` fit + * checks — the inner loop's last entry is the NO_TEST default and + * returns without calling `fits`. Typical: 1-2 at the natural + * exponent and 1 at the fallback exponent. + */ + SNMALLOC_FAST_PATH static size_t + bin_offset_at(size_t addr_chunks, size_t n_chunks, size_t e) + { + size_t first = table_.exp_first_sc[e]; + size_t past = table_.exp_first_sc[e + 1]; + + auto fits = [&](size_t m) SNMALLOC_FAST_PATH_LAMBDA -> bool { + // Safety: mantissa m may not exist at this exponent (low + // regime -- exponents 0..B-1 have fewer than 2^B mantissas; + // for any B the very first exponent has only 1). Without this + // check we would index past `past` into the carve_info table. 
+ if (first + m >= past) + return false; + const carve_info_t& ci = table_.carve_info[first + m]; + // Optimisation: near the bottom of n_chunks's exponent range + // the higher-mantissa sizes already exceed n_chunks and cannot + // fit regardless of alignment. Skips the align_up below. + if (n_chunks < ci.size_chunks) + return false; + size_t pad = bits::align_up(addr_chunks, ci.align_chunks) - addr_chunks; + return n_chunks - ci.size_chunks >= pad; + }; + + for (size_t m_top = MANTISSAS_PER_EXP; m_top-- > 0;) + { + if (fits(m_top)) + { + // Walk this m_top's decision list. The list always ends with + // a NO_TEST entry that acts as the default, so the loop is + // guaranteed to return. + for (size_t j = 0; j < MAX_CASCADE_STEPS; j++) + { + const CascadeStep& step = table_.cascade_steps[m_top][j]; + if (step.m_test == NO_TEST || fits(step.m_test)) + return step.bin; + } + SNMALLOC_ASSERT(false); // unreachable per the invariant above + } + } + return BINS_PER_EXP; + } + + /** + * Constexpr-populated rodata tables. + * + * `bitmap_info[sc]` is the bitmap-scan record for each in-range + * sc (consumed by `Bitmap::find_for_request`). + * `carve_info[sc]` is the size/alignment record for each in-range + * sc (consumed by `carve` and by `bin_offset_at`'s `fits` + * predicate during free-side classification). + * `exp_first_sc[e]` is the first raw sc id at BackendArenaBins + * exponent e (with `exp_first_sc[bits::BITS] = MAX_SC` as a sentinel + * so `[exp_first_sc[e], exp_first_sc[e + 1])` is a valid raw range + * for every `e < bits::BITS`). + * `exp_bin_base[e]` is `e * BINS_PER_EXP`, precomputed so the + * `bin_index` fast path never performs a runtime multiply. + * `cascade_steps[m_top]` is the decision list `bin_offset_at` walks + * once it knows `m_top` is the largest fitting mantissa at the + * current exponent. The list always ends with a NO_TEST entry that + * acts as the default. 
+ */ + struct BinTable + { + ModArray bitmap_info{}; + ModArray carve_info{}; + ModArray exp_first_sc{}; + ModArray exp_bin_base{}; + ModArray> + cascade_steps{}; + + constexpr BinTable() + { + // Boundary tables: keep all (e -> raw sc range) and (e -> bin id + // base) knowledge in two small ROM arrays. `to_exp_mant_const` is + // the only place that knows the size class encoding; once we've + // pinned down the raw boundaries, everything else is table lookup. + // + // Note: `exp_first_sc` does NOT have a uniform stride. At the + // bottom of the encoding the low regime (no leading-1 bit; the + // `b = (e == 0) ? 0 : 1` branch in `to_exp_mant_const`) squashes + // multiple BackendArenaBins exponents into encoded-exponent 0. + // For `B = 2` the counts are 1, 2, 4, 4, 4, ... + for (size_t e = 0; e < bits::BITS; e++) + { + exp_first_sc[e] = bits::to_exp_mant_const(size_t(1) << e); + exp_bin_base[e] = e * BINS_PER_EXP; + } + exp_first_sc[bits::BITS] = MAX_SC; + exp_bin_base[bits::BITS] = bits::BITS * BINS_PER_EXP; + + // Per-sc records. Size and alignment come straight from the + // size-class scheme (via from_exp_mant); start_word, first_mask, + // second_mask are derived from bin_subsets via the constexpr + // helpers above, pre-shifted into the bitmap's word layout so + // the search hot path is two ANDs. + for (size_t sc = 0; sc < MAX_SC; sc++) + { + size_t size = bits::from_exp_mant(sc); + size_t e = bits::prev_pow2_bits_const(size); + size_t m = sc - exp_first_sc[e]; + size_t start_bit = exp_bin_base[e] + start_bin_offset_for_m(m); + size_t mask = serve_mask_for_m(m); + size_t shift = start_bit & (bits::BITS - 1); + carve_info[sc].size_chunks = size; + carve_info[sc].align_chunks = size & (~size + 1); + bitmap_info[sc].start_word = start_bit / bits::BITS; + bitmap_info[sc].first_mask = mask << shift; + // shift == 0: no within-exponent carry; the second word is + // entirely higher-exponent. 
shift > 0: the low `shift` bits + // receive the top of mask (within-exp carry plus its all-1s + // tail), and bits [shift, BITS) are higher-exp and always + // serve. + bitmap_info[sc].second_mask = (shift == 0) ? + ~size_t(0) : + ((mask >> (bits::BITS - shift)) | (~size_t(0) << shift)); + } + + // cascade_steps: for each m_top, build a decision list of + // (m_test, bin) pairs derived from bin_subsets. Candidates are + // bins whose subset has m_top as max element; sort descending + // by subset size. The strict-chain invariant on `bin_subsets` + // (see its doc comment) guarantees each non-default + // candidate's subset properly contains the next candidate's, + // so the discriminator for candidate `i` is one of the + // mantissas in `bin_subsets[b_i] & ~bin_subsets[b_{i+1}]`. + for (size_t m_top = 0; m_top < MANTISSAS_PER_EXP; m_top++) + { + ModArray candidates{}; + size_t n_cand = 0; + for (size_t b = 0; b < BINS_PER_EXP; b++) + { + // bin_subsets[b] >> m_top == 1 <=> bit m_top set and no + // higher bit set <=> max element of subset is m_top. + if ((bin_subsets[b] >> m_top) == 1) + { + candidates[n_cand] = b; + n_cand++; + } + } + // Insertion sort, descending by popcount of subset. + for (size_t i = 1; i < n_cand; i++) + { + size_t b = candidates[i]; + size_t pcb = popcount_const(bin_subsets[b]); + size_t j = i; + while (j > 0 && + popcount_const(bin_subsets[candidates[j - 1]]) < pcb) + { + candidates[j] = candidates[j - 1]; + j--; + } + candidates[j] = b; + } + // Non-default candidates: pick a discriminating mantissa. + // Under the strict-chain invariant on `bin_subsets`, each + // candidate's subset properly contains the next candidate's, + // so `bin_subsets[b] & ~bin_subsets[b_next]` is the + // (non-empty) set of mantissas unique to this candidate. 
+ for (size_t i = 0; i + 1 < n_cand; i++) + { + size_t b = candidates[i]; + size_t b_next = candidates[i + 1]; + size_t discrim_set = bin_subsets[b] & ~bin_subsets[b_next]; + // If this fires, `bin_subsets` violates the strict-chain + // invariant: candidate `b`'s subset does not properly + // contain candidate `b_next`'s, so the cascade can't be + // expressed as single-mantissa probes. `throw` makes the + // constexpr evaluation non-constant and surfaces the + // violation as a compile error. + if (discrim_set == 0) + throw "bin_subsets violates strict-chain invariant"; + cascade_steps[m_top][i].m_test = bits::ctz_const(discrim_set); + cascade_steps[m_top][i].bin = b; + } + // Default (last) candidate. + cascade_steps[m_top][n_cand - 1].m_test = NO_TEST; + cascade_steps[m_top][n_cand - 1].bin = candidates[n_cand - 1]; + } + } + }; + + static constexpr BinTable table_{}; + }; +} // namespace snmalloc diff --git a/src/snmalloc/ds_core/bits.h b/src/snmalloc/ds_core/bits.h index 3391e70f7..57a5a0e73 100644 --- a/src/snmalloc/ds_core/bits.h +++ b/src/snmalloc/ds_core/bits.h @@ -288,6 +288,21 @@ namespace snmalloc return BITS - clz_const(x - 1); } + /** + * Returns `floor(log2(x))`, i.e. the bit index of the highest set bit + * of `x`. Correct for `x >= 1`; calling with `x == 0` is UB (it would + * call `clz(0)`, whose precondition is `x != 0`). + */ + inline SNMALLOC_FAST_PATH size_t prev_pow2_bits(size_t x) + { + return BITS - 1 - clz(x); + } + + constexpr size_t prev_pow2_bits_const(size_t x) + { + return BITS - 1 - clz_const(x); + } + constexpr SNMALLOC_FAST_PATH size_t align_down(size_t value, size_t alignment) { @@ -352,6 +367,35 @@ namespace snmalloc return (e << MANTISSA_BITS) + m; } + /** + * Runtime counterpart of `to_exp_mant_const`. Identical semantics, but + * uses the `clz` intrinsic instead of the 64-iteration `clz_const` + * loop, which makes it suitable for the allocation fast path. 
+ * + * Requires `MANTISSA_BITS + LOW_BITS > 0` so that `value | LEADING_BIT` + * is never zero, satisfying `clz`'s precondition. + */ + template + inline SNMALLOC_FAST_PATH size_t to_exp_mant(size_t value) + { + static_assert( + MANTISSA_BITS + LOW_BITS > 0, + "to_exp_mant requires MANTISSA_BITS + LOW_BITS > 0 so that " + "value | LEADING_BIT is non-zero (clz precondition)"); + + constexpr size_t LEADING_BIT = one_at_bit(MANTISSA_BITS + LOW_BITS) >> 1; + constexpr size_t MANTISSA_MASK = mask_bits(MANTISSA_BITS); + + value = value - 1; + + size_t e = + bits::BITS - MANTISSA_BITS - LOW_BITS - clz(value | LEADING_BIT); + size_t b = (e == 0) ? 0 : 1; + size_t m = (value >> (LOW_BITS + e - b)) & MANTISSA_MASK; + + return (e << MANTISSA_BITS) + m; + } + template constexpr size_t from_exp_mant(size_t m_e) { diff --git a/src/test/func/backend_arena_bins/backend_arena_bins.cc b/src/test/func/backend_arena_bins/backend_arena_bins.cc new file mode 100644 index 000000000..6597ba0ed --- /dev/null +++ b/src/test/func/backend_arena_bins/backend_arena_bins.cc @@ -0,0 +1,1220 @@ +/** + * Unit tests for BackendArenaBins. + * + * Exercises: + * - the chunk size class encoding (via `BackendArenaBinsTestAccess`), + * - the private bin classification (`bin_index`), + * - the narrow public surface: `Bitmap::add` / `find_for_request` / + * `clear`, and the pure `carve(range_t, n)` decomposition. + * + * Strategy: brute force. For each (addr_chunks, n_chunks) on a small grid + * we directly check whether a block can serve every candidate size class + * (by finding an aligned sub-range that fits via `can_serve`, and + * consulting the canonical `bin_subsets` table via `serves`), and + * compare against what `bin_index` predicts. Bitmap behaviour is + * cross-checked against a slow reference scanner that formulates + * "bin b serves request n" directly in terms of the canonical + * `bin_subsets` table; raw word access for tests goes through + * `BackendArenaBinsTestAccess::raw_*`. 
+ */ + +#include "test/setup.h" +#include "test/snmalloc_testlib.h" + +#include +#include +#include +#include +#include + +namespace snmalloc +{ + /** + * Friend struct exposing private internals of `BackendArenaBins` + * (and its nested `Bitmap`) for unit tests. Forward-declared in + * `backend_arena_bins.h`; defined here so the production header + * carries no test-only surface. + */ + template + struct BackendArenaBinsTestAccess + { + using Bins = BackendArenaBins; + + using Bitmap = typename Bins::Bitmap; + using range_t = typename Bins::range_t; + using carve_t = typename Bins::carve_t; + using bitmap_info_t = typename Bins::bitmap_info_t; + using carve_info_t = typename Bins::carve_info_t; + + static constexpr size_t B = Bins::B; + static constexpr size_t MANTISSAS_PER_EXP = Bins::MANTISSAS_PER_EXP; + static constexpr size_t BINS_PER_EXP = Bins::BINS_PER_EXP; + static constexpr size_t MAX_SC = Bins::MAX_SC; + + SNMALLOC_FAST_PATH static carve_t carve(range_t block, size_t n) + { + return Bins::carve(block, n); + } + + SNMALLOC_FAST_PATH static const bitmap_info_t& + bitmap_info_for_request(size_t n) + { + return Bins::bitmap_info_for_request(n); + } + + SNMALLOC_FAST_PATH static const carve_info_t& + carve_info_for_request(size_t n) + { + return Bins::carve_info_for_request(n); + } + + SNMALLOC_FAST_PATH static size_t bin_index(range_t block) + { + return Bins::bin_index(block); + } + + static constexpr size_t max_supported_chunks() + { + return Bins::max_supported_chunks(); + } + + // --- Raw size-class id access --- + // + // The bin scheme assigns a dense raw id in `[0, MAX_SC)` to each + // size class. Production code never names these (the fast path + // goes straight from request size to the bitmap-scan / carve + // record). Tests cross-check the encoding via the helpers below; + // the alias `chunk_sc_t = size_t` preserves the existing test + // naming. + + using chunk_sc_t = size_t; + + /// Raw id of the smallest size class >= n_chunks. 
+ SNMALLOC_FAST_PATH static chunk_sc_t request(size_t n) + { + SNMALLOC_ASSERT(n >= 1); + SNMALLOC_ASSERT(n <= Bins::max_supported_chunks()); + return bits::to_exp_mant(n); + } + + static constexpr size_t size_chunks(chunk_sc_t sc) + { + return Bins::table_.carve_info[sc].size_chunks; + } + + static constexpr size_t align_chunks(chunk_sc_t sc) + { + return Bins::table_.carve_info[sc].align_chunks; + } + + SNMALLOC_FAST_PATH static const bitmap_info_t& bitmap_info(chunk_sc_t sc) + { + SNMALLOC_ASSERT(sc < Bins::MAX_SC); + return Bins::table_.bitmap_info[sc]; + } + + SNMALLOC_FAST_PATH static const carve_info_t& carve_info(chunk_sc_t sc) + { + SNMALLOC_ASSERT(sc < Bins::MAX_SC); + return Bins::table_.carve_info[sc]; + } + + /// `bitmap_info_for_request`, constexpr (uses `to_exp_mant_const`). + /// Only used in `static_assert`s. + static constexpr const bitmap_info_t& + bitmap_info_for_request_const(size_t n) + { + return Bins::table_ + .bitmap_info[bits::to_exp_mant_const(n)]; + } + + /// `carve_info_for_request`, constexpr (uses `to_exp_mant_const`). + /// Only used in `static_assert`s. + static constexpr const carve_info_t& carve_info_for_request_const(size_t n) + { + return Bins::table_ + .carve_info[bits::to_exp_mant_const(n)]; + } + + // The canonical source of truth for what each within-exponent bin + // offset can serve. Tests express the conceptual "bin b serves + // request n" predicate directly in terms of this table so they do + // not depend on the bitmap's pre-shifted layout. + static constexpr const auto& bin_subsets = Bins::bin_subsets; + + // --- Bitmap raw-word access --- + // + // The public Bitmap API is narrow (add / find_for_request / clear). + // Tests need to: + // - set up arbitrary bitmap states (single bit, exhaustive patterns) + // without going through `add` (which classifies a (base, size) + // range and so is constrained by what classifications exist). 
+ // - inspect bitmap state after operations (test "exactly this bit is + // set" and "no other bit changed"). + // These accessors expose the raw word storage to do that. + + static constexpr size_t NUM_BITMAP_WORDS = Bitmap::NUM_BITMAP_WORDS; + + /// Set bit `bin_id` directly in the bitmap, bypassing + /// classification. For exhaustive bit-pattern tests. + static void raw_set(Bitmap& b, size_t bin_id) + { + SNMALLOC_ASSERT(bin_id < Bitmap::TOTAL_BINS); + b.words_[bin_id / bits::BITS] |= + (size_t(1) << (bin_id & (bits::BITS - 1))); + } + + /// Test whether bit `bin_id` is set in the bitmap. + static bool raw_has(const Bitmap& b, size_t bin_id) + { + SNMALLOC_ASSERT(bin_id < Bitmap::TOTAL_BINS); + return (b.words_[bin_id / bits::BITS] >> (bin_id & (bits::BITS - 1))) & + size_t(1); + } + + /// Whether the bitmap has no bits set. + static bool raw_empty(const Bitmap& b) + { + for (size_t i = 0; i < Bitmap::NUM_BITMAP_WORDS; i++) + if (b.words_[i] != 0) + return false; + return true; + } + + /// Read a raw word of the bitmap; for assertions like "only this + /// word is non-zero" or "the words round-trip exactly". + static size_t raw_word(const Bitmap& b, size_t word_idx) + { + SNMALLOC_ASSERT(word_idx < Bitmap::NUM_BITMAP_WORDS); + return b.words_[word_idx]; + } + }; +} // namespace snmalloc + +using snmalloc::BackendArenaBinsTestAccess; + +// Compile-time checks: a few size-class encoding properties that we want +// to fail the build (not the runtime) if regressed. 
+namespace static_checks
+{
+  // Compile-time pins for the three mantissa widths (B = 1, 2, 3) that
+  // `main` also runs the runtime suite for.
+  using B1 = BackendArenaBinsTestAccess<1>;
+  using B2 = BackendArenaBinsTestAccess<2>;
+  using B3 = BackendArenaBinsTestAccess<3>;
+
+  // Bins per exponent for each B: 2, 5 and 13.
+  static_assert(B1::BINS_PER_EXP == 2, "B=1 BINS_PER_EXP");
+  static_assert(B2::BINS_PER_EXP == 5, "B=2 BINS_PER_EXP");
+  static_assert(B3::BINS_PER_EXP == 13, "B=3 BINS_PER_EXP");
+
+  // MAX_SC == ((BITS - B) << B) + ((1 << B) - 1) for each B.
+  static_assert(
+    B1::MAX_SC == ((snmalloc::bits::BITS - 1) << 1) + ((1 << 1) - 1),
+    "B=1 MAX_SC");
+  static_assert(
+    B2::MAX_SC == ((snmalloc::bits::BITS - 2) << 2) + ((1 << 2) - 1),
+    "B=2 MAX_SC");
+  static_assert(
+    B3::MAX_SC == ((snmalloc::bits::BITS - 3) << 3) + ((1 << 3) - 1),
+    "B=3 MAX_SC");
+
+  // Sizes that are powers of two have align == size.
+  static_assert(
+    B2::carve_info_for_request_const(4).align_chunks == 4, "size 4 align");
+  static_assert(
+    B3::carve_info_for_request_const(8).align_chunks == 8, "size 8 align");
+
+  // A request that is not itself a class size rounds up to the next
+  // class size: 9 -> 10 for B=2, 17 -> 18 for B=3.
+  static_assert(
+    B2::carve_info_for_request_const(9).size_chunks == 10, "B=2 round-up");
+  static_assert(
+    B3::carve_info_for_request_const(17).size_chunks == 18, "B=3 round-up");
+} // namespace static_checks
+
+namespace
+{
+  /// Conceptual predicate, expressed directly in terms of the canonical
+  /// `bin_subsets` table (the single source of truth for the bin
+  /// scheme). Bin `b` serves a request of size `n` iff `b`'s exponent
+  /// strictly exceeds `n`'s (any higher-exponent block is big enough),
+  /// or they share an exponent and `b`'s within-exponent subset
+  /// includes `n`'s mantissa.
+  ///
+  /// This is the reference both for what `find_for_request` must
+  /// return and for what `bin_index` must classify into.
+ template + constexpr bool serves(size_t bin, size_t n) + { + using Bins = BackendArenaBinsTestAccess; + size_t e_b = bin / Bins::BINS_PER_EXP; + size_t o_b = bin % Bins::BINS_PER_EXP; + size_t raw = snmalloc::bits::to_exp_mant_const(n); + size_t size_n = snmalloc::bits::from_exp_mant(raw); + size_t e_n = snmalloc::bits::prev_pow2_bits_const(size_n); + if (e_b < e_n) + return false; + if (e_b > e_n) + return true; + size_t exp_first = + snmalloc::bits::to_exp_mant_const(size_t(1) << e_n); + size_t m_n = raw - exp_first; + return ((Bins::bin_subsets[o_b] >> m_n) & size_t(1)) != 0; + } + + /// Return true iff a block of `n` chunks starting at chunk-aligned address + /// `addr` can serve a size class of size `s` chunks with natural alignment + /// `a` chunks. Brute-force search for an aligned sub-range that fits. + bool can_serve(size_t addr, size_t n, size_t s, size_t a) + { + if (s == 0 || s > n) + return false; + // Find first a-aligned address in [addr, addr + n - s]. + size_t mod = addr & (a - 1); + size_t first = (mod == 0) ? addr : (addr + (a - mod)); + return first + s <= addr + n; + } + + template + void check_chunk_sc_roundtrip() + { + using Bins = BackendArenaBinsTestAccess; + + // Properties (together these imply request is the smallest size class + // with size >= s): + // 1. size_chunks(request(s)) >= s for all s >= 1. + // 2. Idempotence: request(size_chunks(sc)) == sc. + // 3. Monotonicity: s1 <= s2 implies request(s1) <= request(s2). 
+ auto prev_sc = Bins::request(1); + for (size_t s = 1; s <= 4096; s++) + { + auto sc = Bins::request(s); + size_t cs = Bins::size_chunks(sc); + if (cs < s) + { + std::printf( + "B=%zu request(%zu) gave class with size %zu < %zu\n", B, s, cs, s); + std::abort(); + } + if (Bins::request(cs) != sc) + { + std::printf("B=%zu request(size_chunks(sc))!=sc for cs=%zu\n", B, cs); + std::abort(); + } + if (sc < prev_sc) + { + std::printf("B=%zu request not monotone at s=%zu\n", B, s); + std::abort(); + } + prev_sc = sc; + } + } + + template + void check_align_chunks() + { + using Bins = BackendArenaBinsTestAccess; + + for (size_t s = 1; s <= 4096; s++) + { + auto sc = Bins::request(s); + size_t cs = Bins::size_chunks(sc); + size_t a = Bins::align_chunks(sc); + // a must be a power of two. + if (a == 0 || (a & (a - 1)) != 0) + { + std::printf("B=%zu size %zu: align_chunks %zu not pow2\n", B, cs, a); + std::abort(); + } + // a must divide cs. + if (cs % a != 0) + { + std::printf( + "B=%zu size %zu: align_chunks %zu does not divide size\n", B, cs, a); + std::abort(); + } + // a should be the LARGEST power of two dividing cs. + if ((a << 1) != 0 && cs % (a << 1) == 0) + { + std::printf( + "B=%zu size %zu: align_chunks %zu not the largest pow2 divisor\n", + B, + cs, + a); + std::abort(); + } + } + } + + /// Collect all chunk_sc_t classes whose size fits in the test grid. 
+  template<size_t B>
+  std::vector<typename BackendArenaBinsTestAccess<B>::chunk_sc_t>
+  collect_classes(size_t max_size)
+  {
+    using Bins = BackendArenaBinsTestAccess<B>;
+    using sc_t = typename Bins::chunk_sc_t;
+
+    std::vector<sc_t> v;
+    sc_t prev{};
+    bool have_prev = false;
+    for (size_t s = 1; s <= max_size; s++)
+    {
+      sc_t sc = Bins::request(s);
+      // Keep only sizes that are exactly a class size; every other `s`
+      // rounds up to a class that is picked up when `s` reaches it.
+      if (Bins::size_chunks(sc) != s)
+        continue; // s is not a class size
+      if (!have_prev || sc != prev)
+      {
+        v.push_back(sc);
+        prev = sc;
+        have_prev = true;
+      }
+    }
+    return v;
+  }
+
+  /// For every block `(addr, n)` on the grid `[0, max_addr) x [1, max_n]`
+  /// and every size class of size <= `max_n`, check that the bin chosen
+  /// by `bin_index` agrees with brute force: `serves<B>(bin, s)` must
+  /// equal `can_serve(addr, n, s, align)`. Aborts on the first mismatch.
+  template<size_t B>
+  void check_bin_classification(size_t max_addr, size_t max_n)
+  {
+    using Bins = BackendArenaBinsTestAccess<B>;
+    auto classes = collect_classes<B>(max_n);
+
+    for (size_t addr = 0; addr < max_addr; addr++)
+    {
+      for (size_t n = 1; n <= max_n; n++)
+      {
+        size_t bin = Bins::bin_index({addr, n});
+
+        for (auto sc : classes)
+        {
+          size_t s = Bins::size_chunks(sc);
+          size_t a = Bins::align_chunks(sc);
+          bool actually = can_serve(addr, n, s, a);
+          bool predicted = serves<B>(bin, s);
+
+          if (predicted != actually)
+          {
+            std::printf(
+              "B=%zu addr=%zu n=%zu bin=%zu sc.size=%zu sc.align=%zu: "
+              "predicted=%d actually=%d\n",
+              B,
+              addr,
+              n,
+              bin,
+              s,
+              a,
+              (int)predicted,
+              (int)actually);
+            std::abort();
+          }
+        }
+      }
+    }
+  }
+
+  /// Check the exponent bound on `bin_index`: a block of `n` chunks has
+  /// natural exponent `e = floor(log2(n))`, and its bin id must lie in
+  /// `[0, BINS_PER_EXP * (e + 1))`, i.e. in exponent `e` or below.
+  template<size_t B>
+  void check_bin_id_range()
+  {
+    using Bins = BackendArenaBinsTestAccess<B>;
+
+    for (size_t addr = 0; addr < 32; addr++)
+    {
+      for (size_t n = 1; n <= 64; n++)
+      {
+        size_t bin = Bins::bin_index({addr, n});
+        // Compare the whole bin id against the exponent bound. Reducing
+        // it modulo BINS_PER_EXP first would make the comparison
+        // trivially true and the check could never fire.
+        size_t e = snmalloc::bits::prev_pow2_bits_const(n);
+        size_t limit = Bins::BINS_PER_EXP * (e + 1);
+        if (bin >= limit)
+        {
+          std::printf(
+            "B=%zu addr=%zu n=%zu bin=%zu: bin id >= BINS_PER_EXP * (e + 1) "
+            "= %zu (e=%zu)\n",
+            B,
+            addr,
+            n,
+            bin,
+            limit,
+            e);
+          std::abort();
+        }
+      }
+    }
+  }
+
+  /// Verify that `*_info_for_request(n)` agrees with the per-sc
+  /// accessors for every n in a range.
+ template + void check_info_consistency() + { + using Bins = BackendArenaBinsTestAccess; + + for (size_t s = 1; s <= 4096; s++) + { + auto sc = Bins::request(s); + + // carve_info_for_request(s) must match the per-sc accessors and + // must alias the carve_info(request(s)) record (single table + // indirection, no copy). + const auto& ci = Bins::carve_info_for_request(s); + if (ci.size_chunks != Bins::size_chunks(sc)) + { + std::printf( + "B=%zu carve_info_for_request(%zu).size_chunks mismatch\n", B, s); + std::abort(); + } + if (ci.align_chunks != Bins::align_chunks(sc)) + { + std::printf( + "B=%zu carve_info_for_request(%zu).align_chunks mismatch\n", B, s); + std::abort(); + } + if (&ci != &Bins::carve_info(sc)) + { + std::printf( + "B=%zu carve_info_for_request(%zu) and carve_info(request) " + "point at different records\n", + B, + s); + std::abort(); + } + + // bitmap_info_for_request(s) must alias bitmap_info(request(s)). + const auto& bi = Bins::bitmap_info_for_request(s); + if (&bi != &Bins::bitmap_info(sc)) + { + std::printf( + "B=%zu bitmap_info_for_request(%zu) and bitmap_info(request) " + "point at different records\n", + B, + s); + std::abort(); + } + } + } + + /// to_exp_mant runtime / _const equivalence across a representative + /// range of values, including edges near max_supported_chunks. The + /// runtime variant uses the intrinsic; we cross-check against the + /// constexpr reference that's already exercised at compile time. + template + void check_to_exp_mant_equivalence() + { + using Bins = BackendArenaBinsTestAccess; + + auto check_one = [&](size_t n) { + size_t r = snmalloc::bits::to_exp_mant(n); + size_t c = snmalloc::bits::to_exp_mant_const(n); + if (r != c) + { + std::printf("B=%zu to_exp_mant(%zu) = %zu, _const = %zu\n", B, n, r, c); + std::abort(); + } + }; + + // Small values. + for (size_t n = 1; n <= 4096; n++) + check_one(n); + + // Powers of two and ±1, up to the largest representable. 
+ for (size_t e = 0; e < snmalloc::bits::BITS; e++) + { + size_t pow = size_t(1) << e; + if (pow == 0) + continue; + if (pow >= 1 && pow <= Bins::max_supported_chunks()) + check_one(pow); + if (pow + 1 <= Bins::max_supported_chunks()) + check_one(pow + 1); + if (pow >= 2) + check_one(pow - 1); + } + + // The upper boundary itself. + check_one(Bins::max_supported_chunks()); + if (Bins::max_supported_chunks() > 1) + check_one(Bins::max_supported_chunks() - 1); + + // A handful of stride values across the full range. + size_t step = Bins::max_supported_chunks() / 257; + if (step == 0) + step = 1; + for (size_t n = 1; n <= Bins::max_supported_chunks() && n > 0; + n += step + 1) + check_one(n); + } + + /// Reference implementation of find_for_request: brute-force scan + /// over every bin id, applying the canonical `serves` predicate + /// (defined directly in terms of `bin_subsets`). + template + size_t reference_find( + size_t n_chunks, const typename BackendArenaBinsTestAccess::Bitmap& bm) + { + using Bins = BackendArenaBinsTestAccess; + using Bitmap = typename Bins::Bitmap; + for (size_t b = 0; b < Bitmap::TOTAL_BINS; b++) + { + if (!Bins::raw_has(bm, b)) + continue; + if (serves(b, n_chunks)) + return b; + } + return SIZE_MAX; + } + + template + void check_bitmap_smoke() + { + using Bins = BackendArenaBinsTestAccess; + using Bitmap = typename Bins::Bitmap; + Bitmap bm; + if (!Bins::raw_empty(bm)) + std::abort(); + Bins::raw_set(bm, 0); + if (Bins::raw_empty(bm)) + std::abort(); + if (!Bins::raw_has(bm, 0)) + std::abort(); + if (Bins::raw_has(bm, 1)) + std::abort(); + Bins::raw_set(bm, Bitmap::TOTAL_BINS - 1); + if (!Bins::raw_has(bm, Bitmap::TOTAL_BINS - 1)) + std::abort(); + bm.clear(0); + if (Bins::raw_has(bm, 0)) + std::abort(); + bm.clear(Bitmap::TOTAL_BINS - 1); + if (!Bins::raw_empty(bm)) + std::abort(); + } + + /// Iterate over every `chunk_sc_t` raw id in `[0, MAX_SC)`. 
For each + /// one, decode its request size, look up its `bitmap_info_t`, and + /// run `body(n_chunks, bitmap_info)`. Multiple raw ids can share the + /// same `(start_word, first_mask, second_mask)` triple; callers that + /// want a unique-deposit view are responsible for deduplicating. + template + void for_each_class_info(F body) + { + using Bins = BackendArenaBinsTestAccess; + for (size_t raw = 0; raw < Bins::MAX_SC; raw++) + { + size_t s = snmalloc::bits::from_exp_mant(raw); + const auto& info = Bins::bitmap_info_for_request(s); + body(s, info); + } + } + + template + void check_bitmap_find_empty() + { + using Bins = BackendArenaBinsTestAccess; + using Bitmap = typename Bins::Bitmap; + Bitmap bm; + for_each_class_info([&](size_t n, const auto& /*info*/) { + if (bm.find_for_request(n) != SIZE_MAX) + std::abort(); + }); + } + + /// For each B and each bin id in [0, TOTAL_BINS): set exactly that + /// bit, then for every distinct request info cross-check + /// find_for_request against the reference scanner. + template + void check_bitmap_exhaustive_single_bit() + { + using Bins = BackendArenaBinsTestAccess; + using Bitmap = typename Bins::Bitmap; + + // Gather a representative set of entries (one per distinct bitmap + // deposit, i.e. distinct (start_word, first_mask, second_mask) + // triple, with a request size that maps to it). 
+ struct Entry + { + size_t n_chunks; + typename Bins::bitmap_info_t info; + }; + + std::vector entries; + for_each_class_info([&](size_t n, const auto& info) { + for (const auto& e : entries) + { + if ( + e.info.start_word == info.start_word && + e.info.first_mask == info.first_mask && + e.info.second_mask == info.second_mask) + return; + } + entries.push_back({n, info}); + }); + + for (size_t bin_id = 0; bin_id < Bitmap::TOTAL_BINS; bin_id++) + { + Bitmap bm; + Bins::raw_set(bm, bin_id); + for (const auto& e : entries) + { + size_t got = bm.find_for_request(e.n_chunks); + size_t want = reference_find(e.n_chunks, bm); + if (got != want) + { + std::printf( + "B=%zu single-bit: bin=%zu n=%zu: got=%zu want=%zu\n", + B, + bin_id, + e.n_chunks, + got, + want); + std::abort(); + } + } + } + } + + /// Randomised multi-bit arena states cross-checked against the + /// reference scanner. + template + void check_bitmap_multi_bit_random() + { + using Bins = BackendArenaBinsTestAccess; + using Bitmap = typename Bins::Bitmap; + + struct Entry + { + size_t n_chunks; + typename Bins::bitmap_info_t info; + }; + + std::vector entries; + for_each_class_info([&](size_t n, const auto& info) { + for (const auto& e : entries) + { + if ( + e.info.start_word == info.start_word && + e.info.first_mask == info.first_mask && + e.info.second_mask == info.second_mask) + return; + } + entries.push_back({n, info}); + }); + + // Deterministic xorshift64 PRNG so failures are reproducible. + auto xorshift = [](uint64_t& s) -> uint64_t { + s ^= s << 13; + s ^= s >> 7; + s ^= s << 17; + return s; + }; + + uint64_t rng_state = 0x9E3779B97F4A7C15ull + B; + for (size_t trial = 0; trial < 2000; trial++) + { + Bitmap bm; + // Density varies per trial: choose how many bits to set. 
+ size_t target = (size_t)(xorshift(rng_state) % (Bitmap::TOTAL_BINS + 1)); + for (size_t i = 0; i < target; i++) + { + size_t b = (size_t)(xorshift(rng_state) % Bitmap::TOTAL_BINS); + Bins::raw_set(bm, b); + } + for (const auto& e : entries) + { + size_t got = bm.find_for_request(e.n_chunks); + size_t want = reference_find(e.n_chunks, bm); + if (got != want) + { + std::printf( + "B=%zu trial=%zu n=%zu: got=%zu want=%zu\n", + B, + trial, + e.n_chunks, + got, + want); + std::abort(); + } + } + } + } + + /// Targeted word-boundary cases: enumerate real table entries, pick + /// out those whose within-exp range straddles a bitmap word, and + /// drive each through a four-way sub-case grid: + /// (i) bit set in first word's considered region only + /// (ii) bit set as within-exp continuation in second word + /// (iii) bit set as higher-exp candidate in second word + /// (iv) bit set only in word 3 or beyond + template + void check_bitmap_word_boundary() + { + using Bins = BackendArenaBinsTestAccess; + using Bitmap = typename Bins::Bitmap; + + auto check_predicted = + [&](const Bitmap& bm, size_t n_chunks, const char* label) { + size_t got = bm.find_for_request(n_chunks); + size_t want = reference_find(n_chunks, bm); + if (got != want) + { + std::printf( + "B=%zu word-boundary [%s] n=%zu: got=%zu want=%zu\n", + B, + label, + n_chunks, + got, + want); + std::abort(); + } + }; + + bool found_straddle = false; + bool found_aligned = false; + for (size_t raw = 0; raw < Bins::MAX_SC; raw++) + { + size_t s = snmalloc::bits::from_exp_mant(raw); + const auto& info = Bins::bitmap_info_for_request(s); + // Recover the absolute start bin from the precomputed layout: + // the start bin always serves, so bit 0 of the conceptual + // serve_mask is set, which means `first_mask`'s lowest set bit + // is at position `shift = start_bit & (BITS - 1)`. 
+ size_t shift = snmalloc::bits::ctz(info.first_mask); + size_t start_bit = info.start_word * snmalloc::bits::BITS + shift; + size_t state = start_bit % Bins::BINS_PER_EXP; + size_t r = Bins::BINS_PER_EXP - state; + bool straddles = (shift + r) > snmalloc::bits::BITS; + bool aligned = (shift == 0); + + if (straddles) + found_straddle = true; + if (aligned) + found_aligned = true; + if (!(straddles || aligned)) + continue; + + // (i) Single bit at the very start_bit. + { + Bitmap bm; + Bins::raw_set(bm, start_bit); + check_predicted(bm, s, "case-i-start_bit"); + } + + // (ii) Single bit in the second word's within-exp continuation + // (only meaningful for straddling cases). + if (straddles) + { + size_t carry_bin = start_bit + (snmalloc::bits::BITS - shift); + if (carry_bin < Bitmap::TOTAL_BINS) + { + Bitmap bm; + Bins::raw_set(bm, carry_bin); + check_predicted(bm, s, "case-ii-continuation"); + } + } + + // (iii) Bit in second word's higher-exp region. + { + size_t second_word = info.start_word + 1; + if (second_word < Bins::NUM_BITMAP_WORDS) + { + // Pick a bin that is higher-exponent: at least + // start_bit + BINS_PER_EXP - state (i.e. into next exponent). + size_t higher_bin = start_bit + r; + if (higher_bin < Bitmap::TOTAL_BINS) + { + Bitmap bm; + Bins::raw_set(bm, higher_bin); + check_predicted(bm, s, "case-iii-higher-exp"); + } + } + } + + // (iv) Bit only in word 3 or beyond. + { + size_t target_word = info.start_word + 2; + if (target_word < Bins::NUM_BITMAP_WORDS) + { + size_t target_bin = target_word * snmalloc::bits::BITS; + if (target_bin < Bitmap::TOTAL_BINS) + { + Bitmap bm; + Bins::raw_set(bm, target_bin); + check_predicted(bm, s, "case-iv-later-word"); + } + } + } + } + + // Sanity: for B that actually places entries near word boundaries, + // at least one straddling case must exist on 64-bit. We don't assert + // straddle exists for all B (B=1's bins-per-exp = 2 might not + // straddle on 64-bit), but aligned cases must. 
+ if (!found_aligned) + { + std::printf("B=%zu: no aligned start_bit found!\n", B); + std::abort(); + } + (void)found_straddle; + } + + /// Integration test: set bits by `bin_index(addr, n)`, then probe via + /// `find_for_request(req)`. The bitmap result must equal + /// `bin_index(addr, n)` whenever `can_serve` says the block satisfies + /// the request, and `SIZE_MAX` otherwise. + template + void check_bitmap_bin_index_integration() + { + using Bins = BackendArenaBinsTestAccess; + using Bitmap = typename Bins::Bitmap; + + auto classes = collect_classes(64); + for (size_t addr = 0; addr < 32; addr++) + { + for (size_t n = 1; n <= 64; n++) + { + Bitmap bm; + size_t bin = Bins::bin_index({addr, n}); + Bins::raw_set(bm, bin); + for (auto sc : classes) + { + size_t s = Bins::size_chunks(sc); + size_t a = Bins::align_chunks(sc); + bool actually = can_serve(addr, n, s, a); + size_t got = bm.find_for_request(s); + size_t want = actually ? bin : size_t(SIZE_MAX); + if (got != want) + { + std::printf( + "B=%zu integration: addr=%zu n=%zu bin=%zu sc.size=%zu " + "sc.align=%zu: got=%zu want=%zu actually=%d\n", + B, + addr, + n, + bin, + s, + a, + got, + want, + (int)actually); + std::abort(); + } + } + } + } + } + + /// Verify that Bitmap::add classifies (base, size) ranges to the same + /// bin id as `bin_index`, sets the corresponding bit, and is + /// idempotent on both the returned id and the underlying word state. 
+  template<size_t B>
+  void check_bitmap_add()
+  {
+    using Bins = BackendArenaBinsTestAccess<B>;
+    using Bitmap = typename Bins::Bitmap;
+    using range_t = typename Bins::range_t;
+
+    for (size_t addr = 0; addr < 32; addr++)
+    {
+      for (size_t n = 1; n <= 64; n++)
+      {
+        Bitmap bm;
+        size_t expected = Bins::bin_index({addr, n});
+
+        // First add: must report the same bin id as `bin_index` ...
+        size_t got = bm.add(range_t{addr, n});
+        if (got != expected)
+        {
+          std::printf(
+            "B=%zu add: addr=%zu n=%zu got=%zu expected=%zu\n",
+            B,
+            addr,
+            n,
+            got,
+            expected);
+          std::abort();
+        }
+        // ... and must have set that bin's bit.
+        if (!Bins::raw_has(bm, expected))
+        {
+          std::printf(
+            "B=%zu add: addr=%zu n=%zu bin %zu not set after add\n",
+            B,
+            addr,
+            n,
+            expected);
+          std::abort();
+        }
+
+        // Snapshot every word, call add again, verify nothing changed
+        // and we get the same id back. Idempotence on state.
+        std::vector<size_t> snapshot;
+        snapshot.reserve(Bins::NUM_BITMAP_WORDS);
+        for (size_t w = 0; w < Bins::NUM_BITMAP_WORDS; w++)
+          snapshot.push_back(Bins::raw_word(bm, w));
+
+        size_t got2 = bm.add(range_t{addr, n});
+        if (got2 != expected)
+        {
+          std::printf(
+            "B=%zu add idempotent: addr=%zu n=%zu second add returned "
+            "%zu (first returned %zu)\n",
+            B,
+            addr,
+            n,
+            got2,
+            expected);
+          std::abort();
+        }
+        for (size_t w = 0; w < Bins::NUM_BITMAP_WORDS; w++)
+        {
+          if (Bins::raw_word(bm, w) != snapshot[w])
+          {
+            std::printf(
+              "B=%zu add idempotent: addr=%zu n=%zu word %zu changed\n",
+              B,
+              addr,
+              n,
+              w);
+            std::abort();
+          }
+        }
+      }
+    }
+  }
+
+  /// With multiple blocks added, `find_for_request` must return the
+  /// *minimum* bin id whose blocks all serve the request, not just any
+  /// such bin id.
+  template<size_t B>
+  void check_bitmap_find_min()
+  {
+    using Bins = BackendArenaBinsTestAccess<B>;
+    using Bitmap = typename Bins::Bitmap;
+
+    struct Entry
+    {
+      size_t n_chunks;
+      typename Bins::bitmap_info_t info;
+    };
+
+    // One representative request size per distinct
+    // (start_word, first_mask, second_mask) triple.
+    std::vector<Entry> entries;
+    for_each_class_info<B>([&](size_t n, const auto& info) {
+      for (const auto& e : entries)
+      {
+        if (
+          e.info.start_word == info.start_word &&
+          e.info.first_mask == info.first_mask &&
+          e.info.second_mask == info.second_mask)
+          return;
+      }
+      entries.push_back({n, info});
+    });
+
+    // For each request entry: pick three bin ids that all serve this
+    // request (the start_bit itself; a higher-exp bin; the topmost
+    // bin), set all three, and verify find_for_request returns the
+    // smallest of the three.
+    for (const auto& e : entries)
+    {
+      // Recover the absolute start bin from the precomputed layout.
+      // NOTE(review): `ctz` presumes `first_mask` is non-zero, i.e. that
+      // the start bin's own bit is always in `first_mask` - confirm
+      // against the table construction.
+      size_t start_bit = e.info.start_word * snmalloc::bits::BITS +
+        snmalloc::bits::ctz(e.info.first_mask);
+      // a: the start bin. b: the first bin of the next exponent.
+      // c: the topmost bin of the bitmap.
+      size_t a = start_bit;
+      size_t b =
+        start_bit + (Bins::BINS_PER_EXP - (start_bit % Bins::BINS_PER_EXP));
+      size_t c = Bitmap::TOTAL_BINS - 1;
+      // `a < b` always holds, because `b` adds a strictly positive
+      // offset. Near the top of the bitmap `b` can run past the last
+      // bin or coincide with `c`; skip entries for which the three ids
+      // are not in range and strictly increasing.
+      if (
+        a >= Bitmap::TOTAL_BINS || b >= Bitmap::TOTAL_BINS ||
+        !(a < b && b < c))
+        continue;
+
+      Bitmap bm;
+      Bins::raw_set(bm, a);
+      Bins::raw_set(bm, b);
+      Bins::raw_set(bm, c);
+      size_t got = bm.find_for_request(e.n_chunks);
+      if (got != a)
+      {
+        std::printf(
+          "B=%zu find_min: n=%zu bits set {%zu,%zu,%zu} "
+          "got=%zu (expected min %zu)\n",
+          B,
+          e.n_chunks,
+          a,
+          b,
+          c,
+          got,
+          a);
+        std::abort();
+      }
+    }
+  }
+
+  /// Verify carve(): pre.base+pre.size == req.base; req.base aligned;
+  /// req.size == sc.size_chunks; post.base == req.end; spans equal.
+ template + void check_carve() + { + using Bins = BackendArenaBinsTestAccess; + using range_t = typename Bins::range_t; + + auto classes = collect_classes(64); + for (size_t addr = 0; addr < 32; addr++) + { + for (size_t n = 1; n <= 64; n++) + { + for (auto sc : classes) + { + size_t s = Bins::size_chunks(sc); + size_t a = Bins::align_chunks(sc); + if (!can_serve(addr, n, s, a)) + continue; + + auto cv = Bins::carve(range_t{addr, n}, s); + + // pre starts at the block's base. + if (cv.pre.base != addr) + { + std::printf( + "B=%zu carve pre.base != addr (addr=%zu n=%zu s=%zu)\n", + B, + addr, + n, + s); + std::abort(); + } + // pre.end == req.base. + if (cv.pre.base + cv.pre.size != cv.req.base) + { + std::printf("B=%zu carve pre.end != req.base\n", B); + std::abort(); + } + // req aligned. + if ((cv.req.base & (a - 1)) != 0) + { + std::printf( + "B=%zu carve req.base %zu not aligned to %zu\n", + B, + cv.req.base, + a); + std::abort(); + } + // req.size == sc.size_chunks. + if (cv.req.size != s) + { + std::printf( + "B=%zu carve req.size %zu != s %zu\n", B, cv.req.size, s); + std::abort(); + } + // req.end == post.base. + if (cv.req.base + cv.req.size != cv.post.base) + { + std::printf("B=%zu carve req.end != post.base\n", B); + std::abort(); + } + // post.end == block.end. + if (cv.post.base + cv.post.size != addr + n) + { + std::printf("B=%zu carve post.end != block.end\n", B); + std::abort(); + } + // pre.size + req.size + post.size == block.size. 
+ if (cv.pre.size + cv.req.size + cv.post.size != n) + { + std::printf("B=%zu carve sizes don't sum to n\n", B); + std::abort(); + } + } + } + } + } + + template + void run_all() + { + std::printf("--- Running BackendArenaBinsTestAccess<%zu> tests ---\n", B); + check_chunk_sc_roundtrip(); + std::printf(" chunk_sc_t round-trip: OK\n"); + check_align_chunks(); + std::printf(" align_chunks: OK\n"); + check_to_exp_mant_equivalence(); + std::printf(" to_exp_mant runtime/_const equivalence: OK\n"); + check_info_consistency(); + std::printf(" *_info_for_request consistency: OK\n"); + check_bin_id_range(); + std::printf(" bin_index within-exp range: OK\n"); + check_bin_classification(/*max_addr=*/128, /*max_n=*/64); + std::printf(" bin classification vs bin_subsets predicate: OK\n"); + check_bitmap_smoke(); + std::printf(" Bitmap smoke: OK\n"); + check_bitmap_find_empty(); + std::printf(" Bitmap empty find returns SIZE_MAX: OK\n"); + check_bitmap_exhaustive_single_bit(); + std::printf(" Bitmap exhaustive single-bit find: OK\n"); + check_bitmap_multi_bit_random(); + std::printf(" Bitmap multi-bit random find: OK\n"); + check_bitmap_word_boundary(); + std::printf(" Bitmap word-boundary cases: OK\n"); + check_bitmap_bin_index_integration(); + std::printf(" Bitmap bin_index integration: OK\n"); + check_bitmap_add(); + std::printf(" Bitmap add classify+set+idempotent: OK\n"); + check_bitmap_find_min(); + std::printf(" Bitmap find_for_request returns minimum: OK\n"); + check_carve(); + std::printf(" carve splits aligned/unaligned blocks: OK\n"); + } + + /// A few concrete expected values, derived from the prototype's output, to + /// catch silent breakage of the canonical numbering. + void check_known_values() + { + using B2 = BackendArenaBinsTestAccess<2>; + + // size 1 -> raw 0, size 2 -> raw 1, size 3 -> raw 2, size 4 -> raw 3, + // size 5 -> raw 4, ..., size 8 -> raw 7, size 10 -> raw 8. 
+ if (B2::size_chunks(B2::request(1)) != 1) + std::abort(); + if (B2::size_chunks(B2::request(8)) != 8) + std::abort(); + if (B2::size_chunks(B2::request(9)) != 10) + std::abort(); + if (B2::size_chunks(B2::request(11)) != 12) + std::abort(); + + // align_chunks: size 4 -> 4, size 5 -> 1, size 6 -> 2, size 8 -> 8, + // size 10 -> 2, size 12 -> 4, size 14 -> 2. + if (B2::align_chunks(B2::request(4)) != 4) + std::abort(); + if (B2::align_chunks(B2::request(5)) != 1) + std::abort(); + if (B2::align_chunks(B2::request(6)) != 2) + std::abort(); + if (B2::align_chunks(B2::request(8)) != 8) + std::abort(); + if (B2::align_chunks(B2::request(10)) != 2) + std::abort(); + + // BINS_PER_EXP must be 5 for B=2. + if (B2::BINS_PER_EXP != 5) + std::abort(); + + using B3 = BackendArenaBinsTestAccess<3>; + + if (B3::BINS_PER_EXP != 13) + std::abort(); + + using B1 = BackendArenaBinsTestAccess<1>; + if (B1::BINS_PER_EXP != 2) + std::abort(); + } +} // namespace + +int main(int, char**) +{ + setup(); + + check_known_values(); + std::printf("Known concrete values: OK\n"); + + run_all<1>(); + run_all<2>(); + run_all<3>(); + + std::printf("All BackendArenaBins tests passed.\n"); + return 0; +} From aac5dbda5650f5e97fc1f29d24d9d13e4cb4b9a8 Mon Sep 17 00:00:00 2001 From: Matthew Parkinson Date: Wed, 20 May 2026 13:20:28 +0100 Subject: [PATCH 03/31] Add RBTree::neighbours probe helper (Phase 2) Adds a public RBTree method that returns the strict neighbours of a probe value K in a single root-to-leaf descent: - every left turn (parent > K) records the parent as the current successor candidate - every right turn (parent < K) records the parent as the current predecessor candidate At loop exit the tightest neighbours are returned as `stl::Pair{pred, succ}`; either component is `Rep::null` when no such neighbour exists. The "K not in tree" precondition is asserted via SNMALLOC_ASSERT and expands to nothing in Release. 
BackendArena, the planned caller, relies on the invariant that two free blocks cannot share a starting address. test_neighbours exercises the algorithm against std::set::lower_bound / upper_bound as oracle. Boundary probes (K=0, K=size+1) plus random probes that skip oracle hits keep every call within the precondition. The sweep reuses the existing test()'s size range but caps to the first few seeds per size to keep the per-test time budget in check. PLAN.md Phase 2 spec records the K-not-in-tree precondition. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- PLAN.md | 40 +++++---- src/snmalloc/ds_core/redblacktree.h | 38 +++++++++ src/test/func/redblack/redblack.cc | 123 ++++++++++++++++++++++++++++ 3 files changed, 183 insertions(+), 18 deletions(-) diff --git a/PLAN.md b/PLAN.md index 7ac7fecf9..b590b3592 100644 --- a/PLAN.md +++ b/PLAN.md @@ -842,36 +842,40 @@ The current `RBTree` exposes `find`, `remove_min`, `remove_path` (taking an `RBPath`). For Range-tree adjacency lookups we don't need predecessor and successor as independent operations — we always want **both neighbours of a probe value** when classifying an incoming block. A -single tree walk for `K` already records exactly that information: the -last "go-right" descent (the parent of the failed left-child step) -points at the largest entry strictly less than `K`; the last "go-left" -descent points at the smallest entry strictly greater than `K`. +single tree walk for `K` already records exactly that information: every +"go-right" descent passes through a node with key strictly less than `K` +(predecessor candidate); every "go-left" descent passes through a node +with key strictly greater than `K` (successor candidate). The last turn +of each kind is the tight answer. Add a single helper: -- `neighbours(K) -> stl::Pair<K, K>` — performs one walk for - `K` and returns `(largest entry < K, smallest entry > K)`. Either may - be null.
If `K` is itself present in the tree, the result describes - the neighbours of the existing entry; the caller decides whether that - is a bug (overlap) for its problem domain — `BackendArena` will treat - an exact hit as an invariant violation, since two free blocks cannot - share a starting address. +- `neighbours(K) -> stl::Pair<K, K>` — performs one walk for `K` and + returns `(largest entry < K, smallest entry > K)`. Either component + is `Rep::null` when no such neighbour exists. + **Precondition**: `K` is not present in the tree. This matches the + `BackendArena` use case (two free blocks cannot share a starting + address, so `add_block` only calls `neighbours` on addresses not + already in the tree); in Debug an assert fires if `K` is encountered + on the descent. This replaces two separate `O(log n)` walks per `add_block` with one and -keeps the API surface small. Implement on top of the existing `RBPath` -walking primitives — no structural changes to `RBTree` required. +keeps the API surface small. Implement on top of the existing tree +walking primitives (`get_root`, `get_dir`) — no structural changes to +`RBTree` required. **Test gate**: extend `src/test/func/redblack/redblack.cc` with a randomised test of `neighbours(K)` against `std::set::lower_bound` / -`upper_bound` as oracle, over thousands of operations and probe values. -Existing tests must remain green. +`upper_bound` as oracle, over thousands of operations and probe values +drawn from `K` values **not** present in the tree. Existing tests must +remain green. **Review gate**: spec slice = the Phase 2 section above. Reviewer checks: walk correctly records both turn points; behaviour at empty tree, single-node tree, `K` smaller than all keys, `K` larger than all -keys, `K` equal to existing key, and `K` between two consecutive keys -all match the oracle; no structural changes to `RBTree`'s existing -invariants.
+keys, and `K` between two consecutive keys all match the oracle; the +"K not in tree" precondition is asserted in Debug; no structural +changes to `RBTree`'s existing invariants. ### Phase 3: Rep concept + skeleton BackendArena diff --git a/src/snmalloc/ds_core/redblacktree.h b/src/snmalloc/ds_core/redblacktree.h index e6ce73c24..1e9ba5009 100644 --- a/src/snmalloc/ds_core/redblacktree.h +++ b/src/snmalloc/ds_core/redblacktree.h @@ -1,6 +1,7 @@ #pragma once #include "snmalloc/stl/array.h" +#include "snmalloc/stl/utility.h" #include #include @@ -787,6 +788,43 @@ namespace snmalloc return true; } + /** + * Return the strict neighbours of `value` in the tree: + * `(largest key < value, smallest key > value)`. Either component is + * `Rep::null` when no such neighbour exists. + * + * **Precondition**: `value` is not present in the tree. A single + * root-to-leaf descent then records both neighbours: every left + * turn (parent key > value) updates the successor candidate to the + * parent's key, every right turn updates the predecessor candidate. + * In Debug an assert fires if `value` is encountered on the descent. + */ + stl::Pair neighbours(K value) + { + K pred = Rep::null; + K succ = Rep::null; + + ChildRef cur = get_root(); + while (!cur.is_null()) + { + K k = cur; + SNMALLOC_ASSERT(!Rep::equal(k, value)); + if (Rep::compare(k, value)) + { + // k > value: go left; k is the tightest successor seen so far. 
+ succ = k; + cur = get_dir(true, k); + } + else + { + pred = k; + cur = get_dir(false, k); + } + } + + return {pred, succ}; + } + RBPath get_root_path() { return RBPath(H{&root}); diff --git a/src/test/func/redblack/redblack.cc b/src/test/func/redblack/redblack.cc index 61fccb6d3..e47138be4 100644 --- a/src/test/func/redblack/redblack.cc +++ b/src/test/func/redblack/redblack.cc @@ -5,6 +5,7 @@ #include #include +#include #include #ifndef SNMALLOC_TRACING @@ -207,6 +208,122 @@ void test(size_t size, unsigned int seed) } } +template +void test_neighbours(size_t size, unsigned int seed) +{ + xoroshiro::p64r32 rand(seed); + snmalloc::RBTree tree; + std::set oracle; + // Parallel vector keeps random-pick on remove O(1) instead of paying + // O(n) for std::advance over a std::set iterator. + std::vector entries; + + auto probe = [&](Rep::key k_probe) { + auto result = tree.neighbours(k_probe); + + Rep::key expected_pred = Rep::null; + Rep::key expected_succ = Rep::null; + auto it = oracle.lower_bound(k_probe); + if (it != oracle.begin()) + { + auto prev = it; + --prev; + expected_pred = *prev; + } + if (it != oracle.end()) + expected_succ = *it; + + if (result.first != expected_pred || result.second != expected_succ) + { + std::cout << "neighbours(" << k_probe << ") mismatch:" + << " got (" << result.first << ", " << result.second << ")" + << " expected (" << expected_pred << ", " << expected_succ + << ")" << std::endl; + abort(); + } + }; + + auto do_probes = [&]() { + // Boundary probes. Key 0 is Rep::null and is never inserted (insert + // keys are 1 + rand % size), and size + 1 is one above the maximum + // possible insert; both are guaranteed not to be in the tree. + probe(Rep::key(0)); + if (size + 1 <= 0xFFFF) + probe(Rep::key(size + 1)); + // Two random probes, skipping any that collide with the tree. 
+ for (size_t p = 0; p < 2; p++) + { + Rep::key k = Rep::key(rand.next() % (size + 2)); + if (oracle.count(k) == 0) + probe(k); + } + }; + + // Empty tree: every probe must report (null, null). + do_probes(); + + bool first = true; + for (size_t i = 0; i < 20 * size; i++) + { + auto batch = 1 + rand.next() % (3 + (size / 2)); + auto op = rand.next() % 4; + if (op < 2 || first) + { + first = false; + for (auto j = batch; j > 0; j--) + { + auto k = Rep::key(1 + rand.next() % size); + if (tree.insert_elem(k)) + { + oracle.insert(k); + entries.push_back(k); + } + } + } + else if (op == 3) + { + for (auto j = batch; j > 0; j--) + { + if (entries.empty()) + break; + auto index = rand.next() % entries.size(); + Rep::key elem = entries[index]; + if (!tree.remove_elem(elem)) + { + std::cout << "Failed to remove element: " << elem << std::endl; + abort(); + } + entries.erase(entries.begin() + static_cast(index)); + oracle.erase(elem); + } + } + else + { + for (auto j = batch; j > 0; j--) + { + if (entries.empty()) + break; + auto min = tree.remove_min(); + Rep::key expected = *oracle.begin(); + if (min != expected) + { + std::cout << "remove_min mismatch: tree=" << min + << " oracle=" << expected << std::endl; + abort(); + } + oracle.erase(oracle.begin()); + entries.erase( + std::remove(entries.begin(), entries.end(), min), entries.end()); + } + } + + do_probes(); + + if (entries.empty()) + break; + } +} + int main(int argc, char** argv) { setup(); @@ -222,6 +339,11 @@ int main(int argc, char** argv) for (seed = 1; seed < 5 + (8 * size); seed++) { test(size, seed); + // Run the neighbours oracle on a handful of seeds per size: the + // full size range gives good tree-shape coverage, the seed cap + // keeps the extra cost from blowing the per-test time budget. 
+ if (seed < 5) + test_neighbours(size, seed); } return 0; @@ -235,5 +357,6 @@ int main(int argc, char** argv) // Trace particular example test(size, seed); + test_neighbours(size, seed); return 0; } From ce6187d1afbe864dc648264c27dff8660ac50c01 Mon Sep 17 00:00:00 2001 From: Matthew Parkinson Date: Thu, 21 May 2026 15:02:02 +0100 Subject: [PATCH 04/31] Add BackendArena: dual-tree free-range manager with consolidation Introduce BackendArena, a free-range allocator that manages chunks within a bounded arena using a dual red-black-tree scheme: - Bin trees: one per size-class bin, for best-fit allocation lookups driven by a non-empty-bins bitmap. - Range tree: keyed by address, for O(log n) neighbour lookup during consolidation of adjacent free blocks. Key design decisions: - Single-chunk (min-size) blocks live only in bin tree 0, not the range tree, keeping range-tree overhead proportional to multi-chunk blocks. The min-size bin is probed as a fallback during consolidation. - Three-variant encoding (Min/TwoMin/Large) in pagemap metadata bits avoids a range-tree lookup for the common 1-chunk and 2-chunk cases. - WordRef handle and TreeRep template follow the existing BackendStateWordRef / BuddyChunkRep patterns from largebuddyrange.h. - Consolidation in add_block checks predecessor then successor, merging adjacent blocks and re-inserting the result. - remove_block uses Bins::carve to split oversized blocks, re-inserting remainders. Also: - Add neighbours() to RBTree: single-descent strict-neighbour query. - Add for_each() to RBTree: in-order traversal for invariant checking. - Make BackendArenaBins::bin_index public (sole consumer is BackendArena). - Add BackendArenaBins::Bitmap::test() for invariant verification. - Five-clause structural invariant gated on bool parameter (defaults to Debug), checked at entry/exit of add_block and remove_block. 
- Comprehensive test suite: word-level round-trips, tree operations, empty-state invariant, add/remove without consolidation, consolidation case matrix (8 pred/succ combinations), overflow detection, and randomised stress test with oracle validation (50 seeds x 500 ops). Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- CMakeLists.txt | 1 + PLAN.md | 441 +++---- src/snmalloc/backend_helpers/backend_arena.h | 491 ++++++++ .../backend_helpers/backend_arena_bins.h | 11 +- src/snmalloc/ds_core/redblacktree.h | 21 + src/test/func/backend_arena/backend_arena.cc | 1021 +++++++++++++++++ 6 files changed, 1721 insertions(+), 265 deletions(-) create mode 100644 src/snmalloc/backend_helpers/backend_arena.h create mode 100644 src/test/func/backend_arena/backend_arena.cc diff --git a/CMakeLists.txt b/CMakeLists.txt index ec903a408..b0457bdab 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -548,6 +548,7 @@ if(NOT SNMALLOC_HEADER_ONLY_LIBRARY) # These are mitigation-independent and can be compiled once, then linked # against both fast and check testlib variants. set(TESTLIB_ONLY_TESTS + backend_arena backend_arena_bins bits first_operation memory memory_usage multi_atexit multi_threadatexit redblack statistics teardown diff --git a/PLAN.md b/PLAN.md index b590b3592..e8023f096 100644 --- a/PLAN.md +++ b/PLAN.md @@ -16,8 +16,9 @@ Note that for block of the minimum size will be handled specially as there is in We use 2 bits to represent the mode of this block of memory 00 - Minimum size, only in first red-black tree. Single pagemap entry for this block is used for the RB-tree -01 - 2 * minimum size, in both red-black trees. Two pagemap entries for this block are used for the RB-tree +01 - 2 * minimum size (2-aligned), in both red-black trees. Two pagemap entries for this block are used for the RB-tree 10 - > 2 * minimum size, in both red-black trees. Three pagemap entries used for this block, first two redblack tree, third stores accurate size of block. 
+11 - 2 * minimum size (NOT 2-aligned), in both red-black trees. Two pagemap entries for this block are used for the RB-tree. Goes into a size-1 bin since it cannot serve aligned size-2 requests. This means it is possible to find the precise size of a block which can account for additional state that is lost by the binning. @@ -157,11 +158,38 @@ entry per chunk. The first pagemap entry of a free block carries a **variant tag** that tells `BackendArena` how to interpret the other entries in the block: -| Variant | Block size | Pagemap entries used by BackendArena | -|-------------|----------------|------------------------------------------------------------------------| -| `Min` | exactly min | 1 entry — both words store the Bin RBTree node (left/right + colour). | -| `TwoMin` | exactly 2× min | 2 entries — first stores Bin node, second stores Range node. | -| `Large` | > 2× min | 3 entries — first Bin, second Range, third stores precise block size. | +| Variant | Value | Block size | Alignment | Pagemap entries used by BackendArena | +|-------------|-------|----------------|----------------|------------------------------------------------------------------------| +| `Min` | 0 | exactly min | any | 1 entry — both words store the Bin RBTree node (left/right + colour). | +| `TwoMin` | 1 | exactly 2× min | 2-aligned | 2 entries — first stores Bin node, second stores Range node. | +| `Large` | 2 | > 2× min | any | 3 entries — first Bin, second Range, third stores precise block size. | +| `OddTwo` | 3 | exactly 2× min | **not** 2-aligned | 2 entries — first stores Bin node, second stores Range node. | + +#### Unaligned size-2 blocks (`OddTwo`) + +A size-2 block at an odd chunk address (e.g. chunk 3) cannot serve any +size-2 allocation request because all size-2 SCs require 2-chunk +alignment. `bin_index({odd, 2})` correctly places such blocks into a +size-1 bin. 
However, the `Min` variant can only store one pagemap entry, +and a size-2 block occupies two entries and participates in the range +tree. + +The `OddTwo` variant resolves this: it marks a size-2 block that is not +2-aligned. Like `TwoMin`, it uses two pagemap entries and lives in the +range tree. Unlike `TwoMin`, it goes into a size-1 bin (since it can't +serve aligned size-2 requests). + +The consolidation code's `contains_min` check probes bin 0 for +single-chunk neighbours. Since `OddTwo` blocks also land in bin 0 +(both `Min` and `OddTwo` have a size-1 servable set at exponent 0), +`contains_min` must filter by variant: after finding an address in +bin 0, it checks `get_variant(addr) == Min` to confirm the block is +truly single-chunk. `OddTwo` blocks are found via range-tree neighbour +lookup instead, which correctly returns their size as 2. + +Note: only blocks at even chunk addresses can be `TwoMin`. The +`variant_of` function must take both size and chunk address to +distinguish `TwoMin` from `OddTwo`. **Tree membership is the source of truth for "is this block free?".** The variant tag is only meaningful for entries `BackendArena` reaches via @@ -223,10 +251,12 @@ For an incoming block `A` of size `S` at address `addr_A`: - If no non-min right neighbour was found: `MinSizeBin.find(addr_A + size_A)`; if present, merge. -`MinSizeBin` is the single Bin RBTree that holds all min-size free blocks -(the bin whose servable set is `{1 chunk}`). Its `find` operation is a -standard RB-tree key lookup (O(log n)), and only traverses entries already -linked into the tree — i.e. entries owned by *this* `BackendArena`. +`MinSizeBin` is the single Bin RBTree that holds all blocks whose +servable set is `{1 chunk}` (bin 0). This includes both `Min` (size-1) +and `OddTwo` (unaligned size-2) blocks. 
The `contains_min` helper +performs a `find` in bin 0, then checks `get_variant(addr) == Min` to +confirm the block is truly single-chunk — `OddTwo` entries are skipped +so they are handled by the range-tree neighbour lookup instead. Min-size adjacency therefore costs at most one Bin-tree `find` per side per `add_block`. The Range-tree `neighbours(addr_A)` query yields both @@ -877,240 +907,124 @@ keys, and `K` between two consecutive keys all match the oracle; the "K not in tree" precondition is asserted in Debug; no structural changes to `RBTree`'s existing invariants. -### Phase 3: Rep concept + skeleton BackendArena +### Phase 3+4: Full BackendArena data structure (atomic) Create `src/snmalloc/backend_helpers/backend_arena.h` with: -- A `BackendArenaRep` concept describing the operations the data - structure needs from its backing pagemap, in **chunk-keyed** form - (callers pass `addr` aligned to `MIN_CHUNK_SIZE`): - - `get_variant(addr) -> {Min, TwoMin, Large}` and `set_variant`. - - First-entry word accessors: `get_word1/set_word1` and - `get_word2/set_word2`, preserving `RED_BIT` *and* the variant-tag - bits on every write. - - Second-entry word accessors (used for `TwoMin` and `Large`): same - `get_word1/set_word1/get_word2/set_word2` shape, applied to the - pagemap entry at `addr + MIN_CHUNK_SIZE`. (No variant-tag preservation - rule here; only `RED_BIT`.) - - Third-entry size accessor (used only for `Large`): - `get_large_size_chunks(addr)/set_large_size_chunks(addr, n_chunks)` - backed by the entry at `addr + 2·MIN_CHUNK_SIZE`. - - **No** pagemap-probing API. All adjacency is performed via the - BackendArena's own RBTree finds. 
-- Two internal RBRep adapters built **inside** `BackendArena` (not in - user code) on top of `BackendArenaRep`: - - **`BinRep`**: keys by chunk-aligned address; uses the first-entry - word accessors; encodes the red-black `colour` bit in `RED_BIT` of - word 1; left/right child pointers occupy chunk-aligned bits of the - two words. - - **`RangeRep`**: keys by chunk-aligned address; uses the second-entry - word accessors; same `RED_BIT` colour and left/right encoding. Only - consulted for `TwoMin` and `Large` blocks (min-size blocks have no - second entry and are not in the Range tree). - - Each adapter satisfies the existing `RBTree` Rep concept (the same - set of operations `BuddyChunkRep` satisfies for `largebuddyrange`'s - tree). -- `template class BackendArena` with the following API: - - `add_block(addr, size_chunks) -> stl::Pair` — - returns `{0, 0}` if the block was absorbed; returns - `{overflow_addr, overflow_size}` (non-zero) for the portion the - arena cannot index. Mirrors `Buddy::add_block`'s overflow-return - contract; the caller (a future `BackendArenaRange` wrapper) is - responsible for handling overflow. Overflow arises in two cases: - (i) the input is oversized — `size_chunks >= 2^MAX_CHUNKS_BITS` - is split and the excess returned; the absorbed prefix continues - through consolidation. (ii) **consolidation grew the block to - arena scale** — if neighbour coalescing produces a range of - `2^MAX_CHUNKS_BITS` chunks (the entire arena), that consolidated - range is returned as overflow because the bitmap / per-sc tables - are sized for `< 2^MAX_CHUNKS_BITS`. In case (ii) the merged - neighbours are already removed from the trees before the return. - - `remove_block(size_t n_chunks) -> stl::Pair` — - returns `{0, 0}` if nothing serves the request; otherwise the - `(base, size)` of the aligned request range returned by - `BackendArenaBins::carve(...).req`. 
Phase 3 leaves this - a stub; Phase 4 implements it (popping a larger block and carving - internally as needed). -- The invariant method (initially a no-op). -- **Static assertions** pinning the relationship between BackendArena's - bound and the bin-scheme's representable maximum: - - `static_assert(MAX_CHUNKS_BITS < bits::BITS, ...)` so that - `2^MAX_CHUNKS_BITS` and `bits::one_at_bit(MAX_CHUNKS_BITS)` are - representable in `size_t`. - - `static_assert(bits::one_at_bit(MAX_CHUNKS_BITS) <= - BackendArenaBins::max_supported_chunks() + 1, ...)` so that - every block size the arena can hold (strictly less than - `2^MAX_CHUNKS_BITS`) is classifiable by `bin_index` / - `bitmap_info_for_request` / `carve_info_for_request` without - hitting their upper-bound assertions. - -Create a mock Rep in the test using a fixed-size pagemap array (in the -spirit of `redblack.cc`'s `array[2048]`). The mock Rep implements -`get_variant`/`set_variant` and the word/size accessors on its array; -no probing API to implement. - -This phase does **not** modify `BuddyChunkRep` or `largebuddyrange.h`. -All new encoding documentation lives in `backend_arena.h` next to the -new Rep concept. - -**Test gate**: "accessor smoke test" inside -`src/test/func/backend_arena/backend_arena.cc`: - -- Write each variant tag (`Min`, `TwoMin`, `Large`) at a chunk address, - read it back; assert each round-trip preserves the value and does - not corrupt `RED_BIT` or other reserved bits. -- **Cross-preservation**: in the first-entry word, interleave - `set_variant`, Bin-node `set` (writing left/right), and `set_red` in - every order, then verify all three values round-trip correctly. -- Write Bin node fields (left/right pointer, colour bit), read back - unchanged. -- Write Range node fields, read back unchanged. -- Write a precise chunk count in the third entry, read back unchanged. -- `BackendArena` instantiates for several `K`, and its - `invariant()` returns true on an empty arena. 
-- `add_block(addr, size_chunks)` for `size_chunks >= 2^K` returns the - overflow portion via its return value; `add_block` for - `size_chunks < 2^K` returns `{0, 0}`. - -**Review gate**: spec slice = "Block size variants and pagemap encoding", -"Write ordering within add/remove", and the Phase 3 section above. -Reviewer checks: `BackendArenaRep` concept is the minimum needed to -express the data structure (no leak of internal `RBTree` Rep shape into -user-facing `BackendArenaRep`); `BinRep`/`RangeRep` adapters preserve -`RED_BIT` and variant-tag bits on writes; chunk-keyed API used -consistently; bit-position choices documented in `backend_arena.h` (not -in `BuddyChunkRep`); no `` or `std::` types in -production headers (use `` and `snmalloc::stl::*`); -`SNMALLOC_*` macros used in place of raw compiler attributes. - -### Phase 4: Full add_block / remove_block with carving and consolidation - -Implement the full data-structure semantics in one step. Carving on -`remove_block` and consolidation on `add_block` are interdependent for -the maximally-consolidated invariant — carving without consolidation -produces adjacent free remainders that violate the invariant — so they -land together. The reuse-`P`'s-Range-entry optimisation is deferred to -Phase 5; this phase uses the simple "remove + reinsert" strategy for -every merge. - -**Ownership and serialisation**: `BackendArena` mutations are -serialised and owned at this layer (the upper layer that wraps an -arena holds exclusive access for the duration of a single -`add_block` / `remove_block`). Transient bitmap states during one -operation — e.g. a bin cleared just before its remainder is re-added -to the same bin — are never observable to a concurrent reader. The -Bin / Range trees and the `Bitmap` are per-arena and not concurrent -in this design (per the broader plan: the bitmap and trees lead -indexing; the pagemap is not probed for routing). 
- -**Tree mutation contracts**: Bin and Range trees are intrusive -red-black trees backed directly by the pagemap (per the -`backend_arena.h` Rep). `insert` and `remove` are allocation-free and -infallible for well-formed inputs; duplicate insertion and removal of -a non-present node are programmer errors and assert. The -`bitmap.add(range) -> bin_id` step followed by -`bin_trees[bin_id].insert(range)` cannot leave a set bitmap bit -without a corresponding tree entry, because `insert` cannot fail. - -- `add_block(addr, size_chunks)`: - - If `size_chunks >= 2^MAX_CHUNKS_BITS`, return the excess as overflow - per the Phase 3 contract; the absorbed prefix continues below. - - Find adjacencies per the "Adjacency lookup" rules (one - `Range.neighbours` call + at most two `MinSizeBin.find` calls). - - For each merge case (P-only, S-only, P+S, all combinations of - min/non-min P and S), update the trees in this order: - 1. For each merged neighbour `n_range`: - `size_t n_bin = bitmap.add(n_range);` — idempotent classify - (the bit is already set since `n_range` is in the tree); this - is the only public way to obtain the neighbour's bin id. - Then `bin_trees[n_bin].remove(n_range)` (and remove from the - Range tree if non-min). - **Then** `if (bin_trees[n_bin].empty()) bitmap.clear(n_bin);`. - The cleared bin id is the *neighbour's old bin*, not the - consolidated bin. - 2. Compute the consolidated range `c_range`. If - `c_range.size >= 2^MAX_CHUNKS_BITS` (this can only happen when - the entire arena has coalesced into one free block, giving - `c_range.size == 2^MAX_CHUNKS_BITS`), the consolidated range - exceeds what the arena can index — the bitmap bin space and - per-sc tables are sized for `< 2^MAX_CHUNKS_BITS`. Return - `c_range` as overflow to the caller (the merged neighbours - have already been removed in step 1; the arena is now empty - and the caller — typically a future `BackendArenaRange` - wrapper — decides whether to return the arena to its parent - pool). 
Otherwise: - `size_t c_bin = bitmap.add(c_range)` and - `bin_trees[c_bin].insert(c_range)`. - - Write variant tag and (for `Large`) precise chunk count before - inserting into the Bin tree, per "Write ordering within add/remove". -- `remove_block(size_t n_chunks)`: - - `size_t bin_id = bitmap.find_for_request(n_chunks);` — returns - `SIZE_MAX` if no bin in this arena serves the request. - - Pop the lowest-address block as `range_t block` from - `bin_trees[bin_id]` (`remove_min`); if the tree is now empty, - `bitmap.clear(bin_id)`. - - `auto c = BackendArenaBins::carve(block, n_chunks);` — splits - into `pre` / `req` / `post`. - - For each non-empty remainder (`c.pre`, `c.post`), call `add_block` - on it. Remainders may have arbitrary, non-class chunk counts; - `Bitmap::add` (called inside `add_block`) handles this via - `bin_index`. Consolidation cannot extend a remainder back into - the request range or its sibling remainder: the popped block is - gone from the trees, and `pre`, `req`, and `post` are mutually - contiguous (a remainder's neighbour on the request side is the - just-returned `req` range, which is *not* free). - - Return `c.req`. - -Full invariant enabled, including the **maximally consolidated** clause. - -**Test gate** — new test -`src/test/func/backend_arena/backend_arena.cc` (top-level test-glob -discovery, matching `src/test/func/redblack/`): - -- Unit tests for each consolidation case: P-only, S-only, P+S, with - min/min, min/non-min, non-min/min, non-min/non-min combinations of - P and S. -- Carving tests: request a size strictly smaller than any free block; - verify the returned block has the requested size class, that the - prefix/suffix remainders are correctly classified, and that bin / - Range tree / pagemap variant tags are consistent. -- Overflow tests: - - **Oversized input**: `add_block` for `size_chunks >= - 2^MAX_CHUNKS_BITS` returns the unabsorbed portion; smaller blocks - return `{0, 0}`. 
- - **Consolidation-grows-to-arena-scale**: fill the arena from - multiple sub-arena pieces such that the last `add_block` makes - the running consolidation cover the whole arena - (`c_range.size == 2^MAX_CHUNKS_BITS`); assert the consolidated - range is returned as overflow, and that both trees and the - bitmap are empty after the call. -- Smoke test: insert N blocks, remove N blocks via exact size-class - requests; final state is empty. -- Randomised stress test: random `add_block(addr, size_chunks)` / - `remove_block(n_chunks)` sequence against an oracle that models the - **same** selection rule — "smallest serving bin via - `bitmap.find_for_request`, lowest-address block within that bin, - carve via `BackendArenaBins::carve`". The oracle is implemented as - a sorted map of free `(addr, size_chunks)` pairs (using whatever the - existing test files use; `redblack.cc` already uses `std::set`). - After each operation, both `invariant()` and a structural comparison - against the oracle must pass. - -**Review gate**: spec slice = "Adjacency lookup", "Consolidation: -reusing tree entries when possible" (note: this phase uses the simple -strategy only — reviewer should flag any premature optimisation), -"Min-size special case", "Write ordering within add/remove", -"Invariants", and the Phase 4 section above. Reviewer checks: all eight -P/S min×non-min consolidation cases handled; write ordering correct -(variant tag and precise size written before tree insertion; removal -from trees before pagemap reuse); maximally-consolidated invariant -holds after every operation; remove → carve → re-add path does not -infinitely recurse (a remainder block re-entering `add_block` cannot -itself consolidate into something larger than what was just popped, but -this should be argued explicitly). 
- -### Phase 5: Consolidation — reuse predecessor's Range entry (optimisation) +- A `BackendArenaRep` concept describing word-level accessors over the + three pagemap entries, the variant tag, and the large-size accessor: + - `get_variant(addr) -> BackendArenaVariant` / `set_variant` + - `get_word1(addr)` / `set_word1`, `get_word2(addr)` / `set_word2` + (first entry, used by BinRep) + - `get_range_word1(addr)` / `set_range_word1`, + `get_range_word2(addr)` / `set_range_word2` (second entry, used + by RangeRep) + - `get_large_size_chunks(addr)` / `set_large_size_chunks` (third + entry) + - Rep word setters preserve only `BACKEND_RESERVED_MASK` (bits 0–7). + RED_BIT and VARIANT_MASK preservation is handled by the adapters + via read-modify-write. + +- Two internal RBRep adapters: + - **BinRep**: tagged `BinHandle` (root-pointer mode or child-slot + mode dispatching to Rep word1/word2). `META_MASK = RED_BIT | + VARIANT_MASK` preserved on `set`. + - **RangeRep**: tagged `RangeHandle` dispatching to Rep + range_word1/range_word2. Same `META_MASK` (paranoid masking + defends against stale variant bits from pagemap reuse). + - Both: `compare(k1, k2) = k1 > k2` so `remove_min` returns the + lowest address. `null = root = 0`. + +- `BackendArena`: + - `B = 2` hardcoded; `INTERMEDIATE_BITS` wiring deferred. + - `MIN_CHUNKS_BITS == 0` only; larger min values deferred. + - `stl::Array bin_trees` + - `RangeTree range_tree` + - `Bins::Bitmap bitmap` + +- Full `add_block(addr, size_chunks)` with consolidation: + - Uses `range_tree.neighbours(addr)` + `contains_min()` for + adjacency. + - Unlinks merged neighbours from both trees and bitmap. + - Returns overflow `{c_addr, c_size}` when consolidation grows to + arena scale (case (ii)); returns `{0, 0}` on success. + - Asserts `addr != 0`, alignment, and size bounds. 
+ +- Full `remove_block(n_chunks)` with carving: + - `bitmap.find_for_request(n_chunks)` → peek min via Rep → + remove from trees → `Bins::carve` → recursive `add_block` for + remainders. + +- Five-clause `invariant()`: + 1. Maximally consolidated (range-tree adjacency + min-block adjacency) + 2. Cross-tree consistency (forward and reverse membership checks) + 3. Bin classification correctness + 4. Bitmap consistency + 5. Variant-tag consistency + +- `get_root_key()` added to `RBTree` (public method, returns root key + or `Rep::null` when empty). + +- `Bitmap::test(size_t bin_id)` added to `BackendArenaBins` (read-only + accessor used by `invariant()`). + +Modifications to existing files: +- `src/snmalloc/backend_helpers/backend_arena_bins.h`: added + `Bitmap::test()` and made `bin_index` public. +- `src/snmalloc/ds_core/redblacktree.h`: added `get_root_key()`. +- `CMakeLists.txt`: added `backend_arena` to `TESTLIB_ONLY_TESTS`. + +**Test gate**: `src/test/func/backend_arena/backend_arena.cc` with +MockRep and 8 test stages (A–H): +- (A) Accessor round-trips +- (B) RBTree smoke via arena +- (C) Empty-state invariant for K ∈ {4, 5, 6} +- (D) add_block without consolidation +- (E) remove_block exact + carving +- (F) Consolidation case matrix (8 cases: all P/S × min/non-min) +- (G) Overflow (interleaved + precise) +- (H) Randomised stress (50 seeds × 500 ops) with Oracle using + `Bins::Bitmap` for exact bin-classification matching + +### Phase 5: `OddTwo` variant for unaligned size-2 blocks + +A size-2 block at an odd chunk address cannot serve size-2 requests +(which require 2-chunk alignment). `bin_index({odd, 2})` correctly +places it in bin 0 (size-1 servable set). But: + +1. `Min` variant uses only 1 pagemap entry; a size-2 block needs 2. +2. `contains_min` probes bin 0 for single-chunk neighbours — finding + a size-2 block there and treating it as size 1 corrupts metadata. + +All changes are in `backend_arena.h` and the test file. + +1. 
**Add `OddTwo = 2`** to `BackendArenaVariant` enum (`Large` moves to 3). +2. **Change `variant_of`** to take `(size_chunks, chunk_index)`: + - size 1 → `Min` + - size 2, even chunk → `EvenTwo` + - size 2, odd chunk → `OddTwo` + - size 3+ → `Large` +3. **Update `range_from_addr`**: `OddTwo` returns `{addr, 2}` (same as + `EvenTwo`). +4. **Update `insert_block`**: pass `addr_to_chunk(addr)` to `variant_of`. + The `if (size_chunks >= 2)` range-tree checks already cover `OddTwo`. +5. **Update `contains_min`**: after finding addr in bin 0, check + `Rep::get_variant(addr) == BackendArenaVariant::Min`. Return false + for `OddTwo` entries. +6. **Update invariant clause 5**: pass chunk address to `variant_of`. +7. **Update invariant clause 1c** ("no two adjacent min blocks"): + skip non-`Min` entries in bin 0 (i.e., `OddTwo` blocks). +8. **Add test cases**: + - Odd-address size-2 block: verify variant is `OddTwo`, goes in + correct bin, lives in range tree. + - Consolidation with `OddTwo` predecessor/successor. + - `contains_min` does not match `OddTwo` addresses. + - `remove_block(1)` from an `OddTwo` block: verify carving works + and the remainder becomes `Min`. + +**Test gate**: all existing tests pass + new `OddTwo`-specific tests pass. + +### Phase 6: Consolidation — reuse predecessor's Range entry (optimisation) Switch the P-merge case to reuse `P`'s Range tree node (no RB mutation), but **only when `P` is non-min** (a min-size `P` has no Range entry to @@ -1118,7 +1032,7 @@ reuse). The S-only case continues to use remove+reinsert. The P+S case reuses `P` (when non-min) and removes `S`. When `P` is min-size, the merged block is inserted into the Range tree normally. -**Test gate**: all Phase 4 tests still pass. Add debug-only counters at +**Test gate**: all Phase 3+4 tests still pass. Add debug-only counters at the `BackendArena` layer (not inside `RBTree`) for "Range tree `insert_path` calls" and "Range tree `remove_path` calls" during `add_block` / `remove_block`. 
Assert that: @@ -1133,14 +1047,14 @@ This avoids any modification to `RBTree` itself — the counter increments sit in the `BackendArena` wrappers around its Range-tree calls. **Review gate**: spec slice = "Consolidation: reusing tree entries when -possible" and the Phase 5 section above. Reviewer checks: reuse path +possible" and the Phase 6 section above. Reviewer checks: reuse path correctly leaves the Range-tree node in place (key unchanged, only the back-reference from the new combined block); min-P case correctly falls back to normal insert; counter assertions cover the cases that distinguish the optimised path from the simple path; no regression of -Phase 4's full invariant + oracle randomised test. +Phase 3+4's full invariant + oracle randomised test. -### Phase 6: Multi-instance test +### Phase 7: Multi-instance test Instantiate two `BackendArena` over disjoint address ranges in the same test process, drive workloads against both, verify each @@ -1149,7 +1063,7 @@ invariant independently. **Test gate**: multi-instance test passes; total memory accounted for via both instances matches expectations. -### Phase 7: Final review and self-review +### Phase 8: Final review and self-review Per `claude.md` mandatory review checkpoints: @@ -1215,13 +1129,15 @@ No production code path is changed in this phase: the existing - Predecessor-Range-entry-reuse only applies when `P` is non-min. - `add_block` returns `{0, 0}` on success; on overflow it returns the unabsorbed range, mirroring `Buddy::add_block`'s overflow-return - contract. Overflow arises either when `size_chunks >= - 2^MAX_CHUNKS_BITS` on input (excess returned) or when consolidation - grows a coalesced block to exactly `2^MAX_CHUNKS_BITS` (the - consolidated range is returned, neighbours having been removed - first). The future `BackendArenaRange` wrapper is responsible for - handling overflow; the standalone `BackendArena` only exposes the - contract. + contract. 
Oversize inputs (`size_chunks >= 2^MAX_CHUNKS_BITS`) bypass + `BackendArena` entirely — the wrapping `BackendArenaRange` layer + handles them before calling `add_block`, and `add_block` asserts + `size_chunks < 2^MAX_CHUNKS_BITS`. The only overflow case is + consolidation growing a coalesced block to exactly + `2^MAX_CHUNKS_BITS` (the consolidated range is returned, neighbours + having been removed first). The future `BackendArenaRange` wrapper is + responsible for handling overflow; the standalone `BackendArena` only + exposes the contract. - `BackendArenaRep` is a chunk-keyed accessor concept (variant tag plus word/size accessors for entries 1–3). `BackendArena` builds two internal `RBTree`-Rep adapters (`BinRep`, `RangeRep`) over it; user @@ -1241,19 +1157,16 @@ No production code path is changed in this phase: the existing ## Still open (resolve during implementation) -- Exact bit positions in the first-word pagemap encoding for the - variant-tag field (Phase 3 decides; documented only in - `backend_arena.h`). -- Whether Bin tree roots are stored flat (`Array`) - or exponent-keyed (`Array, NUM_EXPS>`). - Decide in Phase 3 when `BackendArena` is built; this is an internal - detail of `BackendArena` (bin id is a flat `size_t` returned by - `Bitmap::add` / `Bitmap::find_for_request`, so the choice does not - leak into `BackendArenaBins` or the Rep concept). +- ~~Exact bit positions in the first-word pagemap encoding for the + variant-tag field.~~ **Resolved** (Phase 3+4): bits 9–10 encode + `BackendArenaVariant` (`VARIANT_MASK = 0x600`); bit 8 is `RED_BIT`; + bits 0–7 are `BACKEND_RESERVED_MASK`. Documented in + `backend_arena.h`. +- ~~Whether Bin tree roots are stored flat + (`Array`) or exponent-keyed.~~ **Resolved** + (Phase 3+4): flat `stl::Array`. - Whether the future memcpy `offset` field is best placed in the second word of every pagemap entry, in dedicated entries, or in a side table. Out of scope for this phase; flagged for the memcpy-fix plan to design. 
- Whether `INTERMEDIATE_BITS=4` (34 bins/exp) needs to be tested in this phase. Currently `B ∈ {1, 2, 3}` only. - - diff --git a/src/snmalloc/backend_helpers/backend_arena.h b/src/snmalloc/backend_helpers/backend_arena.h new file mode 100644 index 000000000..89f25521a --- /dev/null +++ b/src/snmalloc/backend_helpers/backend_arena.h @@ -0,0 +1,491 @@ +#pragma once + +#include "../ds_core/redblacktree.h" +#include "../ds_core/sizeclassconfig.h" +#include "../stl/array.h" +#include "../stl/utility.h" +#include "backend_arena_bins.h" + +#include +#include + +namespace snmalloc +{ + struct BackendArenaTestAccess; + + /** + * Size encoding for a free block's first pagemap entry. + * Min: exactly 1 chunk (no range-tree entry). + * EvenTwo: exactly 2 chunks, 2-aligned; can serve size-2 requests. + * OddTwo: exactly 2 chunks, NOT 2-aligned; in range tree but + * placed in a size-1 bin (cannot serve aligned size-2 requests). + * Large: 3+ chunks; precise size stored in a separate entry. + */ + enum class BackendArenaVariant : uint8_t + { + Min = 0, + EvenTwo = 1, + OddTwo = 2, + Large = 3 + }; + + /** + * Manages free ranges within a single bounded arena using a dual-tree + * scheme (bin trees for allocation, range tree for consolidation). + * + * `Rep` provides word-level pagemap access: + * - `ref_word(direction, addr) -> uintptr_t*`: bin-tree child slot + * (left/right pointer in the first pagemap entry). + * - `ref_range_word(direction, addr) -> uintptr_t*`: range-tree + * child slot (left/right pointer in the second pagemap entry). + * - `get_variant(addr)` / `set_variant(addr, v)` + * - `get_large_size_chunks(addr)` / `set_large_size_chunks(addr, n)` + * + * `MIN_CHUNKS_BITS`: log2 of minimum allocation unit in chunks (0 for + * this phase — 1-chunk minimum). + * + * `MAX_CHUNKS_BITS`: log2 of the arena size in chunks. Blocks that + * reach this size overflow and are returned to the caller. 
+ */ + template + class BackendArena + { + static_assert(MIN_CHUNKS_BITS == 0, "Only MIN_CHUNKS_BITS == 0 supported"); + static_assert(MAX_CHUNKS_BITS > MIN_CHUNKS_BITS); + static_assert(MAX_CHUNKS_BITS < bits::BITS); + + static constexpr size_t B = 2; + using Bins = BackendArenaBins; + + static_assert( + bits::one_at_bit(MAX_CHUNKS_BITS) - 1 <= Bins::max_supported_chunks()); + + // Bit layout constants. + static constexpr uintptr_t RED_BIT = uintptr_t(1) << 8; + static constexpr uintptr_t VARIANT_MASK = uintptr_t(0x3) << 9; + static constexpr uintptr_t META_MASK = RED_BIT | VARIANT_MASK; + static constexpr uintptr_t BACKEND_RESERVED_MASK = 0xFF; + + static_assert((META_MASK & BACKEND_RESERVED_MASK) == 0); + static_assert(META_MASK < MIN_CHUNK_SIZE); + + // ---- Handle: thin proxy around uintptr_t* ---- + // + // Matches BackendStateWordRef's interface: wraps a pointer to a + // word slot (tree root field or pagemap word). Constructed from + // &root or from Rep::ref_word / Rep::ref_range_word. + struct WordRef + { + uintptr_t* val{nullptr}; + + constexpr WordRef() = default; + + constexpr WordRef(uintptr_t* p) : val(p) {} + + uintptr_t get() const + { + return *val; + } + + WordRef& operator=(uintptr_t v) + { + *val = v; + return *this; + } + + bool operator!=(const WordRef& other) const + { + return val != other.val; + } + + uintptr_t printable_address() const + { + return reinterpret_cast(val); + } + }; + + // ---- TreeRep: RBTree Rep parameterised on which word accessor to use ---- + // + // `RefFn` selects the pagemap entry: ref_word for the bin tree, + // ref_range_word for the range tree. 
+ template + struct TreeRep + { + using Handle = WordRef; + using Contents = uintptr_t; + + static constexpr Contents null = 0; + static constexpr Contents root = 0; + + static Contents get(Handle h) + { + return h.get() & ~META_MASK; + } + + static void set(Handle h, Contents v) + { + h = v | (h.get() & META_MASK); + } + + static Handle ref(bool direction, Contents k) + { + static const Contents null_entry = 0; + if (SNMALLOC_UNLIKELY(k == 0)) + return Handle{const_cast(&null_entry)}; + return Handle{RefFn(direction, k)}; + } + + static bool is_red(Contents k) + { + return (ref(true, k).get() & RED_BIT) == RED_BIT; + } + + static void set_red(Contents k, bool new_is_red) + { + if (new_is_red != is_red(k)) + { + auto h = ref(true, k); + h = h.get() ^ RED_BIT; + } + } + + static bool compare(Contents k1, Contents k2) + { + return k1 > k2; + } + + static bool equal(Contents k1, Contents k2) + { + return k1 == k2; + } + + static uintptr_t printable(Contents k) + { + return k; + } + + static uintptr_t printable(Handle h) + { + return h.printable_address(); + } + + static const char* name() + { + return "TreeRep"; + } + }; + + using BinRep = TreeRep; + using RangeRep = TreeRep; + + using BinTree = RBTree; + using RangeTree = RBTree; + + stl::Array bin_trees{}; + RangeTree range_tree{}; + typename Bins::Bitmap bitmap{}; + + // ---- Address-unit helpers ---- + + static size_t addr_to_chunk(uintptr_t a) + { + return a >> MIN_CHUNK_BITS; + } + + static uintptr_t chunk_to_addr(size_t c) + { + return static_cast(c) << MIN_CHUNK_BITS; + } + + // ---- Metadata helpers ---- + + static BackendArenaVariant + variant_of(size_t size_chunks, size_t chunk_index) + { + if (size_chunks == 1) + return BackendArenaVariant::Min; + if (size_chunks == 2) + return (chunk_index & 1) == 0 ? 
BackendArenaVariant::EvenTwo : + BackendArenaVariant::OddTwo; + return BackendArenaVariant::Large; + } + + static stl::Pair range_from_addr(uintptr_t a) + { + if (a == 0) + return {0, 0}; + auto v = Rep::get_variant(a); + switch (v) + { + case BackendArenaVariant::Min: + return {a, 1}; + case BackendArenaVariant::EvenTwo: + case BackendArenaVariant::OddTwo: + return {a, 2}; + case BackendArenaVariant::Large: + return {a, Rep::get_large_size_chunks(a)}; + } + SNMALLOC_ASSERT(false); + return {0, 0}; + } + + bool contains_min(uintptr_t a) const + { + auto& self = const_cast(*this); + auto path = self.bin_trees[0].get_root_path(); + return self.bin_trees[0].find(path, a) && + Rep::get_variant(a) == BackendArenaVariant::Min; + } + + void insert_block(uintptr_t addr, size_t size_chunks) + { + Rep::set_variant(addr, variant_of(size_chunks, addr_to_chunk(addr))); + if (size_chunks >= 3) + Rep::set_large_size_chunks(addr, size_chunks); + + auto chunk_range = + typename Bins::range_t{addr_to_chunk(addr), size_chunks}; + size_t bin = bitmap.add(chunk_range); + bin_trees[bin].insert_elem(addr); + if (size_chunks >= 2) + range_tree.insert_elem(addr); + } + + void unlink_block(uintptr_t addr, size_t size_chunks) + { + auto chunk_range = + typename Bins::range_t{addr_to_chunk(addr), size_chunks}; + size_t bin = bitmap.add(chunk_range); + bin_trees[bin].remove_elem(addr); + if (size_chunks >= 2) + range_tree.remove_elem(addr); + if (bin_trees[bin].is_empty()) + bitmap.clear(bin); + } + + friend struct BackendArenaTestAccess; + + public: + using addr_t = uintptr_t; + + constexpr BackendArena() = default; + + /** + * Add a free block at `addr` with `size_chunks` chunks. The block + * is consolidated with any adjacent free neighbours. Returns + * `{0, 0}` on success. If consolidation produces a block spanning + * the entire arena (`>= 2^MAX_CHUNKS_BITS` chunks), returns + * `{consolidated_addr, consolidated_size}` and the arena is empty. 
+ */ + stl::Pair add_block(addr_t addr, size_t size_chunks) + { + check_invariant(); + SNMALLOC_ASSERT(addr != 0); + SNMALLOC_ASSERT((addr & (MIN_CHUNK_SIZE - 1)) == 0); + SNMALLOC_ASSERT(size_chunks > 0); + SNMALLOC_ASSERT(size_chunks < bits::one_at_bit(MAX_CHUNKS_BITS)); + + uintptr_t c_addr = addr; + size_t c_size = size_chunks; + + auto merge = [&](uintptr_t n_addr, size_t n_size) { + unlink_block(n_addr, n_size); + if (n_addr < c_addr) + c_addr = n_addr; + c_size += n_size; + }; + + // Check range tree for non-min neighbours. + auto [p_key, s_key] = range_tree.neighbours(addr); + + // Predecessor: check range tree, then fall back to min-size bin. + auto [pa, ps] = range_from_addr(p_key); + if (pa + ps * MIN_CHUNK_SIZE == addr) + merge(pa, ps); + else if (addr >= MIN_CHUNK_SIZE && contains_min(addr - MIN_CHUNK_SIZE)) + merge(addr - MIN_CHUNK_SIZE, 1); + + // Successor: check range tree, then fall back to min-size bin. + auto [sa, ss] = range_from_addr(s_key); + uintptr_t succ_addr = addr + size_chunks * MIN_CHUNK_SIZE; + if (sa == succ_addr) + merge(sa, ss); + else if (succ_addr > addr && contains_min(succ_addr)) + merge(succ_addr, 1); + + // Arena-scale overflow: consolidated block spans the full arena. + if (c_size >= bits::one_at_bit(MAX_CHUNKS_BITS)) + return {c_addr, c_size}; + + // Insert consolidated block. + insert_block(c_addr, c_size); + + check_invariant(); + return {0, 0}; + } + + /** + * Remove a block of at least `n_chunks` chunks. Returns + * `{addr, actual_size}` on success, `{0, 0}` if nothing fits. + * Any leftover from carving is re-inserted directly via `insert_block`. + */ + stl::Pair remove_block(size_t n_chunks) + { + check_invariant(); + if (n_chunks == 0) + return {0, 0}; + + if (n_chunks > Bins::max_supported_chunks()) + return {0, 0}; + + size_t bin_id = bitmap.find_for_request(n_chunks); + if (bin_id == SIZE_MAX) + return {0, 0}; + + // remove_min returns the lowest-address entry (since compare + // is k1 > k2). 
Read metadata after removal — remove_elem + // does not clear node contents (redblacktree.h:535). + uintptr_t block_addr = bin_trees[bin_id].remove_min(); + auto [_, block_size] = range_from_addr(block_addr); + (void)_; + + if (block_size >= 2) + range_tree.remove_elem(block_addr); + + if (bin_trees[bin_id].is_empty()) + bitmap.clear(bin_id); + + // Carve the requested chunk count from the block. + auto carved = + Bins::carve({addr_to_chunk(block_addr), block_size}, n_chunks); + + // Re-insert non-empty remainders. By the maximally-consolidated + // invariant, these remainders have no adjacent free neighbours. + if (carved.pre.size != 0) + { + insert_block(chunk_to_addr(carved.pre.base), carved.pre.size); + } + + if (carved.post.size != 0) + { + insert_block(chunk_to_addr(carved.post.base), carved.post.size); + } + + check_invariant(); + return {chunk_to_addr(carved.req.base), carved.req.size}; + } + + /** + * Five-clause structural invariant. Runs when `enabled` is true; + * defaults to `Debug` so release tests can pass `true` explicitly. + */ + void check_invariant(bool enabled = Debug) const + { + if (!enabled) + return; + auto& self = const_cast(*this); + + // Clause 1: Maximally consolidated. + // 1a. No two adjacent non-min blocks. + { + uintptr_t prev_addr = 0; + size_t prev_size = 0; + bool prev_valid = false; + self.range_tree.for_each([&](uintptr_t node) { + auto [a, s] = range_from_addr(node); + if (prev_valid) + SNMALLOC_ASSERT(prev_addr + prev_size * MIN_CHUNK_SIZE != a); + prev_addr = a; + prev_size = s; + prev_valid = true; + }); + } + + // 1b. No non-min block adjacent to a min block. + self.range_tree.for_each([&](uintptr_t node) { + auto [a, s] = range_from_addr(node); + if (a >= MIN_CHUNK_SIZE) + SNMALLOC_ASSERT(!contains_min(a - MIN_CHUNK_SIZE)); + SNMALLOC_ASSERT(!contains_min(a + s * MIN_CHUNK_SIZE)); + }); + + // 1c. No two adjacent min blocks. 
+ { + uintptr_t prev = 0; + bool prev_valid = false; + self.bin_trees[0].for_each([&](uintptr_t node) { + if (Rep::get_variant(node) != BackendArenaVariant::Min) + return; + if (prev_valid) + SNMALLOC_ASSERT(prev + MIN_CHUNK_SIZE != node); + prev = node; + prev_valid = true; + }); + } + + // Clause 2: Cross-tree consistency. + // Every non-min bin-tree entry must be in the range tree; + // every range-tree entry must be in exactly one bin tree. + { + size_t range_tree_count = 0; + size_t bin_tree_nonmin_count = 0; + + for (size_t bin = 0; bin < Bins::Bitmap::TOTAL_BINS; bin++) + { + self.bin_trees[bin].for_each([&](uintptr_t node) { + auto [a, s] = range_from_addr(node); + if (s >= 2) + { + auto path = self.range_tree.get_root_path(); + SNMALLOC_ASSERT(self.range_tree.find(path, node)); + bin_tree_nonmin_count++; + } + }); + } + + // Reverse: every range-tree entry must be in its expected bin tree. + self.range_tree.for_each([&](uintptr_t node) { + range_tree_count++; + auto [a, s] = range_from_addr(node); + auto chunk_range = typename Bins::range_t{addr_to_chunk(a), s}; + size_t expected_bin = Bins::bin_index(chunk_range); + auto path = self.bin_trees[expected_bin].get_root_path(); + SNMALLOC_ASSERT(self.bin_trees[expected_bin].find(path, node)); + }); + + SNMALLOC_ASSERT(bin_tree_nonmin_count == range_tree_count); + } + + // Clause 3: Bin classification correctness. + for (size_t bin = 0; bin < Bins::Bitmap::TOTAL_BINS; bin++) + { + self.bin_trees[bin].for_each([&](uintptr_t node) { + auto [a, s] = range_from_addr(node); + auto chunk_range = typename Bins::range_t{addr_to_chunk(a), s}; + size_t expected_bin = Bins::bin_index(chunk_range); + SNMALLOC_ASSERT(expected_bin == bin); + }); + } + + // Clause 4: Bitmap consistency. + for (size_t bin = 0; bin < Bins::Bitmap::TOTAL_BINS; bin++) + { + bool has_entries = !self.bin_trees[bin].is_empty(); + bool bit_set = bitmap.test(bin); + SNMALLOC_ASSERT(has_entries == bit_set); + } + + // Clause 5: Variant-tag consistency. 
+ for (size_t bin = 0; bin < Bins::Bitmap::TOTAL_BINS; bin++) + { + self.bin_trees[bin].for_each([&](uintptr_t node) { + auto v = Rep::get_variant(node); + auto [a, s] = range_from_addr(node); + SNMALLOC_ASSERT(v == variant_of(s, addr_to_chunk(a))); + if (v == BackendArenaVariant::Large) + SNMALLOC_ASSERT(Rep::get_large_size_chunks(node) == s); + }); + } + } + }; +} // namespace snmalloc diff --git a/src/snmalloc/backend_helpers/backend_arena_bins.h b/src/snmalloc/backend_helpers/backend_arena_bins.h index 57ab27a03..2d1feb245 100644 --- a/src/snmalloc/backend_helpers/backend_arena_bins.h +++ b/src/snmalloc/backend_helpers/backend_arena_bins.h @@ -159,6 +159,7 @@ namespace snmalloc return table_.carve_info[raw]; } + public: /** * Bin id of `block`. Operates on arbitrary chunk counts, not just * exact size classes. `block.size` must be >= 1. @@ -193,7 +194,6 @@ namespace snmalloc return table_.exp_bin_base[e] + offset; } - public: /// Largest `n_chunks` legal for `carve` / `Bitmap::find_for_request`. static constexpr size_t max_supported_chunks() { @@ -291,6 +291,15 @@ namespace snmalloc return bin_id; } + /// Read-only test: is the bit for `bin_id` set? + /// Used by `BackendArena::invariant()`. + bool test(size_t bin_id) const + { + SNMALLOC_ASSERT(bin_id < TOTAL_BINS); + return (words_[bin_id / bits::BITS] & + (size_t(1) << (bin_id & (bits::BITS - 1)))) != 0; + } + /// Mark bin `bin_id` empty. Caller must ensure the bin's tree /// is actually empty; the bitmap does not consult the trees. SNMALLOC_FAST_PATH void clear(size_t bin_id) diff --git a/src/snmalloc/ds_core/redblacktree.h b/src/snmalloc/ds_core/redblacktree.h index 1e9ba5009..6f86c8523 100644 --- a/src/snmalloc/ds_core/redblacktree.h +++ b/src/snmalloc/ds_core/redblacktree.h @@ -829,5 +829,26 @@ namespace snmalloc { return RBPath(H{&root}); } + + /** + * Call `fn(key)` for every key in ascending order. 
+ */ + template + void for_each(Fn&& fn) + { + for_each_impl(get_root(), fn); + } + + private: + template + static void for_each_impl(ChildRef node, Fn& fn) + { + if (node.is_null()) + return; + K k = node; + for_each_impl(get_dir(true, k), fn); + fn(k); + for_each_impl(get_dir(false, k), fn); + } }; } // namespace snmalloc diff --git a/src/test/func/backend_arena/backend_arena.cc b/src/test/func/backend_arena/backend_arena.cc new file mode 100644 index 000000000..c0da85692 --- /dev/null +++ b/src/test/func/backend_arena/backend_arena.cc @@ -0,0 +1,1021 @@ +/** + * Unit tests for BackendArena. + * + * Exercises the Rep adapters (BinRep, RangeRep), RBTree integration, + * add_block with consolidation, remove_block with carving, the + * five-clause invariant, and a randomised stress test with oracle. + */ + +#include "test/setup.h" +#include "test/xoroshiro.h" + +#include +#include +#include +#include +#include +#include + +#ifndef SNMALLOC_TRACING +# define SNMALLOC_TRACING +#endif +#include "test/snmalloc_testlib.h" + +#include + +namespace snmalloc +{ + // ---- MockRep: array-backed storage for testing ---- + + // Each chunk-aligned address maps to a mock_entry via its chunk index. + // word1/word2 hold bin-tree children; range_word1/range_word2 hold + // range-tree children. variant and large_size_chunks hold metadata. + struct mock_entry + { + uintptr_t word1{0}; + uintptr_t word2{0}; + uintptr_t range_word1{0}; + uintptr_t range_word2{0}; + BackendArenaVariant variant{BackendArenaVariant::Min}; + size_t large_size_chunks{0}; + }; + + // Size the array for the largest test arena + trailing room. 
+ static constexpr size_t MOCK_ARENA_CHUNKS = 1024; + static mock_entry mock_store[MOCK_ARENA_CHUNKS]; + + static void reset_mock_store() + { + for (size_t i = 0; i < MOCK_ARENA_CHUNKS; i++) + mock_store[i] = mock_entry{}; + } + + static size_t mock_index(uintptr_t addr) + { + size_t idx = addr >> MIN_CHUNK_BITS; + SNMALLOC_ASSERT(idx < MOCK_ARENA_CHUNKS); + return idx; + } + + struct MockRep + { + static BackendArenaVariant get_variant(uintptr_t addr) + { + return mock_store[mock_index(addr)].variant; + } + + static void set_variant(uintptr_t addr, BackendArenaVariant v) + { + mock_store[mock_index(addr)].variant = v; + } + + static uintptr_t* ref_word(bool direction, uintptr_t addr) + { + auto& e = mock_store[mock_index(addr)]; + return direction ? &e.word1 : &e.word2; + } + + static uintptr_t* ref_range_word(bool direction, uintptr_t addr) + { + auto& e = mock_store[mock_index(addr)]; + return direction ? &e.range_word1 : &e.range_word2; + } + + static size_t get_large_size_chunks(uintptr_t addr) + { + return mock_store[mock_index(addr)].large_size_chunks; + } + + static void set_large_size_chunks(uintptr_t addr, size_t s) + { + mock_store[mock_index(addr)].large_size_chunks = s; + } + }; + + // ---- Test access ---- + struct BackendArenaTestAccess + { + template + static auto& get_bin_trees(Arena& a) + { + return a.bin_trees; + } + + template + static auto& get_range_tree(Arena& a) + { + return a.range_tree; + } + + template + static auto& get_bitmap(Arena& a) + { + return a.bitmap; + } + }; + + // Convenience: chunk-aligned address from chunk index. + static uintptr_t chunk_addr(size_t chunk_idx) + { + return static_cast(chunk_idx) << MIN_CHUNK_BITS; + } + + // ---- Test types ---- + // K=6 → arena of 64 chunks, K=8 → 256 chunks, K=10 → 1024 chunks. 
+ template + using Arena = BackendArena; + + using Bins = BackendArenaBins<2>; + + // ================================================================== + // (A) Accessor round-trips + // ================================================================== + static void test_variant_roundtrip() + { + reset_mock_store(); + uintptr_t a = chunk_addr(10); + + MockRep::set_variant(a, BackendArenaVariant::Min); + SNMALLOC_ASSERT(MockRep::get_variant(a) == BackendArenaVariant::Min); + + MockRep::set_variant(a, BackendArenaVariant::EvenTwo); + SNMALLOC_ASSERT(MockRep::get_variant(a) == BackendArenaVariant::EvenTwo); + + MockRep::set_variant(a, BackendArenaVariant::Large); + SNMALLOC_ASSERT(MockRep::get_variant(a) == BackendArenaVariant::Large); + + printf(" Variant round-trip: OK\n"); + } + + static void test_large_size_roundtrip() + { + reset_mock_store(); + uintptr_t a = chunk_addr(20); + + for (size_t s : {3, 7, 15, 63, 255, 1000}) + { + MockRep::set_large_size_chunks(a, s); + SNMALLOC_ASSERT(MockRep::get_large_size_chunks(a) == s); + } + + printf(" Large-size round-trip: OK\n"); + } + + static void test_word_roundtrip() + { + reset_mock_store(); + uintptr_t a = chunk_addr(5); + + uintptr_t v1 = chunk_addr(10); + uintptr_t v2 = chunk_addr(20); + + *MockRep::ref_word(true, a) = v1; + *MockRep::ref_word(false, a) = v2; + SNMALLOC_ASSERT(*MockRep::ref_word(true, a) == v1); + SNMALLOC_ASSERT(*MockRep::ref_word(false, a) == v2); + + *MockRep::ref_range_word(true, a) = v2; + *MockRep::ref_range_word(false, a) = v1; + SNMALLOC_ASSERT(*MockRep::ref_range_word(true, a) == v2); + SNMALLOC_ASSERT(*MockRep::ref_range_word(false, a) == v1); + + printf(" Word round-trip: OK\n"); + } + + // ================================================================== + // (B) RBTree / RBTree smoke + // ================================================================== + + // We can't directly instantiate BinRep/RangeRep outside BackendArena + // since they are private nested types. 
Instead, test them through + // BackendArena's add_block/remove_block which exercise both trees. + // For smoke testing of tree operations directly, we test through + // the BackendArena's own invariant and operation correctness. + + static void test_rbtree_smoke_via_arena() + { + reset_mock_store(); + Arena<8> arena; + arena.check_invariant(true); + + // Insert a few non-adjacent blocks. + uintptr_t a1 = chunk_addr(10); + uintptr_t a2 = chunk_addr(20); + uintptr_t a3 = chunk_addr(30); + + arena.add_block(a1, 3); + arena.check_invariant(true); + + arena.add_block(a2, 5); + arena.check_invariant(true); + + arena.add_block(a3, 1); + arena.check_invariant(true); + + // Remove them. + auto r1 = arena.remove_block(1); + SNMALLOC_ASSERT(r1.first != 0); + UNUSED(r1); + arena.check_invariant(true); + + auto r2 = arena.remove_block(3); + SNMALLOC_ASSERT(r2.first != 0); + UNUSED(r2); + arena.check_invariant(true); + + auto r3 = arena.remove_block(5); + SNMALLOC_ASSERT(r3.first != 0); + UNUSED(r3); + arena.check_invariant(true); + + printf(" RBTree smoke via arena: OK\n"); + } + + // ================================================================== + // (C) Empty-state invariant + // ================================================================== + template + static void test_empty_invariant() + { + reset_mock_store(); + Arena arena; + arena.check_invariant(true); + printf(" Empty invariant (K=%zu): OK\n", K); + } + + // ================================================================== + // (D) add_block without consolidation + // ================================================================== + static void test_add_no_consolidation() + { + reset_mock_store(); + Arena<8> arena; + + // Insert several non-adjacent blocks of various sizes. 
+ struct + { + size_t chunk_idx; + size_t size; + } blocks[] = { + {10, 1}, + {20, 2}, + {30, 3}, + {40, 5}, + {50, 9}, + }; + + for (auto& b : blocks) + { + auto result = arena.add_block(chunk_addr(b.chunk_idx), b.size); + SNMALLOC_ASSERT(result.first == 0 && result.second == 0); + UNUSED(result); + arena.check_invariant(true); + } + + printf(" add_block without consolidation: OK\n"); + } + + // ================================================================== + // (E) remove_block exact-class + carving + // ================================================================== + static void test_remove_exact() + { + reset_mock_store(); + Arena<8> arena; + + // Insert 3 blocks of size 5 at non-adjacent locations. + arena.add_block(chunk_addr(10), 5); + arena.add_block(chunk_addr(20), 5); + arena.add_block(chunk_addr(30), 5); + arena.check_invariant(true); + + // Remove 3 exact-size blocks. + for (int i = 0; i < 3; i++) + { + auto r = arena.remove_block(5); + SNMALLOC_ASSERT(r.first != 0); + SNMALLOC_ASSERT(r.second == 5); + UNUSED(r); + arena.check_invariant(true); + } + + // Arena should be empty now. + auto r = arena.remove_block(1); + SNMALLOC_ASSERT(r.first == 0); + UNUSED(r); + + printf(" remove_block exact: OK\n"); + } + + static void test_remove_carving() + { + reset_mock_store(); + Arena<8> arena; + + // Insert one block of size 10. + arena.add_block(chunk_addr(10), 10); + arena.check_invariant(true); + + // Request size 3 — should carve from the 10-chunk block. + auto r = arena.remove_block(3); + SNMALLOC_ASSERT(r.first != 0); + // The carved piece should be exactly what Bins::carve produces. + auto carved = Bins::carve({10, 10}, 3); + SNMALLOC_ASSERT(r.second == carved.req.size); + UNUSED(r); + arena.check_invariant(true); + + // The remainders should still be in the arena. + // We can try to remove everything that's left. 
+ size_t remaining = 10 - carved.req.size; + while (remaining > 0) + { + auto r2 = arena.remove_block(1); + SNMALLOC_ASSERT(r2.first != 0); + arena.check_invariant(true); + remaining -= r2.second; + } + + // Should be empty. + auto r3 = arena.remove_block(1); + SNMALLOC_ASSERT(r3.first == 0); + UNUSED(r3); + + printf(" remove_block carving: OK\n"); + } + + // ================================================================== + // (F) Consolidation case matrix + // ================================================================== + + // Helper: insert a block, verify invariant, return nothing. + template + static void + add_and_check(Arena& arena, size_t chunk_idx, size_t size_chunks) + { + auto result = arena.add_block(chunk_addr(chunk_idx), size_chunks); + SNMALLOC_ASSERT(result.first == 0 && result.second == 0); + UNUSED(result); + arena.check_invariant(true); + } + + // Drain the arena by removing 1-chunk blocks until empty. + // Returns the total chunks removed. + template + static size_t drain_arena(Arena& arena) + { + size_t total = 0; + while (true) + { + auto r = arena.remove_block(1); + if (r.first == 0) + break; + total += r.second; + arena.check_invariant(true); + } + return total; + } + + // Case 12: P-only, P min (size 1). + static void test_consolidation_p_min() + { + reset_mock_store(); + Arena<8> arena; + add_and_check(arena, 10, 1); + add_and_check(arena, 11, 3); + + // Should have consolidated into a single 4-chunk block. + size_t total = drain_arena(arena); + SNMALLOC_ASSERT(total == 4); + UNUSED(total); + + printf(" Consolidation P-only, P min: OK\n"); + } + + // Case 13: P-only, P non-min. + static void test_consolidation_p_nonmin() + { + reset_mock_store(); + Arena<8> arena; + add_and_check(arena, 10, 3); + add_and_check(arena, 13, 2); + + size_t total = drain_arena(arena); + SNMALLOC_ASSERT(total == 5); + UNUSED(total); + + printf(" Consolidation P-only, P non-min: OK\n"); + } + + // Case 14: S-only, S min. 
+ static void test_consolidation_s_min() + { + reset_mock_store(); + Arena<8> arena; + add_and_check(arena, 14, 1); + add_and_check(arena, 11, 3); + + size_t total = drain_arena(arena); + SNMALLOC_ASSERT(total == 4); + UNUSED(total); + + printf(" Consolidation S-only, S min: OK\n"); + } + + // Case 15: S-only, S non-min. + static void test_consolidation_s_nonmin() + { + reset_mock_store(); + Arena<8> arena; + add_and_check(arena, 14, 4); + add_and_check(arena, 11, 3); + + size_t total = drain_arena(arena); + SNMALLOC_ASSERT(total == 7); + UNUSED(total); + + printf(" Consolidation S-only, S non-min: OK\n"); + } + + // Case 16: P+S, both min. + static void test_consolidation_ps_both_min() + { + reset_mock_store(); + Arena<8> arena; + add_and_check(arena, 10, 1); + add_and_check(arena, 12, 1); + add_and_check(arena, 11, 1); + + size_t total = drain_arena(arena); + SNMALLOC_ASSERT(total == 3); + UNUSED(total); + + printf(" Consolidation P+S, both min: OK\n"); + } + + // Case 17: P+S, P min, S non-min. + static void test_consolidation_ps_p_min_s_nonmin() + { + reset_mock_store(); + Arena<8> arena; + add_and_check(arena, 10, 1); + add_and_check(arena, 14, 3); + add_and_check(arena, 11, 3); + + size_t total = drain_arena(arena); + SNMALLOC_ASSERT(total == 7); + UNUSED(total); + + printf(" Consolidation P+S, P min, S non-min: OK\n"); + } + + // Case 18: P+S, P non-min, S min. + static void test_consolidation_ps_p_nonmin_s_min() + { + reset_mock_store(); + Arena<8> arena; + add_and_check(arena, 10, 3); + add_and_check(arena, 16, 1); + add_and_check(arena, 13, 3); + + size_t total = drain_arena(arena); + SNMALLOC_ASSERT(total == 7); + UNUSED(total); + + printf(" Consolidation P+S, P non-min, S min: OK\n"); + } + + // Case 19: P+S, both non-min. 
+ static void test_consolidation_ps_both_nonmin() + { + reset_mock_store(); + Arena<8> arena; + add_and_check(arena, 10, 4); + add_and_check(arena, 19, 5); + add_and_check(arena, 14, 5); + + size_t total = drain_arena(arena); + SNMALLOC_ASSERT(total == 14); + UNUSED(total); + + printf(" Consolidation P+S, both non-min: OK\n"); + } + + // ================================================================== + // (F2) OddTwo — unaligned size-2 blocks + // ================================================================== + + static void test_oddtwo_variant() + { + // Odd chunk index → OddTwo, even → EvenTwo. + reset_mock_store(); + Arena<8> arena; + + // Odd address: chunk 11, size 2 + arena.add_block(chunk_addr(11), 2); + SNMALLOC_ASSERT( + MockRep::get_variant(chunk_addr(11)) == BackendArenaVariant::OddTwo); + arena.check_invariant(true); + + // Even address: chunk 20, size 2 + arena.add_block(chunk_addr(20), 2); + SNMALLOC_ASSERT( + MockRep::get_variant(chunk_addr(20)) == BackendArenaVariant::EvenTwo); + arena.check_invariant(true); + + // Both should be in the range tree. + auto& rt = BackendArenaTestAccess::get_range_tree(arena); + auto p1 = rt.get_root_path(); + SNMALLOC_ASSERT(rt.find(p1, chunk_addr(11))); + auto p2 = rt.get_root_path(); + SNMALLOC_ASSERT(rt.find(p2, chunk_addr(20))); + + // OddTwo (chunk 11) should be in bin 0 (size-1 servable set). + auto& bt0 = BackendArenaTestAccess::get_bin_trees(arena)[0]; + auto p3 = bt0.get_root_path(); + SNMALLOC_ASSERT(bt0.find(p3, chunk_addr(11))); + + size_t total = drain_arena(arena); + SNMALLOC_ASSERT(total == 4); + UNUSED(total); + + printf(" OddTwo variant tagging: OK\n"); + } + + static void test_oddtwo_contains_min_filter() + { + // contains_min must not match OddTwo entries. + reset_mock_store(); + Arena<8> arena; + + // Add OddTwo block at chunk 11 (odd, size 2). + arena.add_block(chunk_addr(11), 2); + arena.check_invariant(true); + + // Add a size-1 block at chunk 14, non-adjacent. 
+ arena.add_block(chunk_addr(14), 1); + arena.check_invariant(true); + + // Now add chunk 13 (size 1). Its successor check should NOT + // pick up chunk 11's OddTwo entry via contains_min. It should + // just insert as size 1. + arena.add_block(chunk_addr(13), 1); + arena.check_invariant(true); + + // Chunk 13 should consolidate with chunk 14 (min successor), + // but NOT with chunk 11's OddTwo (range tree handles that). + // Drain to verify total. + size_t total = drain_arena(arena); + SNMALLOC_ASSERT(total == 4); + UNUSED(total); + + printf(" OddTwo contains_min filter: OK\n"); + } + + static void test_oddtwo_consolidation() + { + // OddTwo block should consolidate via the range tree. + reset_mock_store(); + Arena<8> arena; + + // Add OddTwo at chunk 11 (odd, size 2 → chunks 11-12). + arena.add_block(chunk_addr(11), 2); + arena.check_invariant(true); + + // Add adjacent block at chunk 13 (size 1). + // Range tree finds OddTwo at 11 as predecessor? No — chunk 13's + // predecessor in range tree is chunk 11 (size 2, ends at 13). + // So they should consolidate into size 3 at chunk 11. + arena.add_block(chunk_addr(13), 1); + arena.check_invariant(true); + + auto r = arena.remove_block(3); + SNMALLOC_ASSERT(r.first == chunk_addr(11)); + SNMALLOC_ASSERT(r.second == 3); + UNUSED(r); + + printf(" OddTwo consolidation (successor): OK\n"); + } + + static void test_oddtwo_consolidation_pred() + { + // Consolidation where the new block is a predecessor of OddTwo. + reset_mock_store(); + Arena<8> arena; + + // Add OddTwo at chunk 11 (odd, size 2 → chunks 11-12). + arena.add_block(chunk_addr(11), 2); + arena.check_invariant(true); + + // Add block at chunk 10 (size 1). OddTwo at 11 is the successor + // in the range tree → consolidate into size 3 at chunk 10. 
+ arena.add_block(chunk_addr(10), 1); + arena.check_invariant(true); + + auto r = arena.remove_block(3); + SNMALLOC_ASSERT(r.first == chunk_addr(10)); + SNMALLOC_ASSERT(r.second == 3); + UNUSED(r); + + printf(" OddTwo consolidation (predecessor): OK\n"); + } + + static void test_oddtwo_remove_carve() + { + // remove_block(1) from an OddTwo block should carve correctly. + reset_mock_store(); + Arena<8> arena; + + // Add OddTwo at chunk 11 (odd, size 2). + arena.add_block(chunk_addr(11), 2); + arena.check_invariant(true); + + // Remove 1 chunk. Should carve from the OddTwo block. + auto r = arena.remove_block(1); + SNMALLOC_ASSERT(r.first != 0); + SNMALLOC_ASSERT(r.second == 1); + arena.check_invariant(true); + + // The remainder (1 chunk) should be Min variant. + auto r2 = arena.remove_block(1); + SNMALLOC_ASSERT(r2.first != 0); + SNMALLOC_ASSERT(r2.second == 1); + UNUSED(r, r2); + + // Arena should be empty now. + auto r3 = arena.remove_block(1); + SNMALLOC_ASSERT(r3.first == 0); + UNUSED(r3); + + printf(" OddTwo remove + carve: OK\n"); + } + + // ================================================================== + // (G) Overflow — arena-scale consolidation + // ================================================================== + static void test_overflow() + { + // K=4 → 16-chunk arena. Use base offset 16 to avoid address 0. + reset_mock_store(); + Arena<4> arena; + + constexpr size_t BASE = 16; + + // Step 1: add even-indexed chunks as individual blocks (8 blocks). + for (size_t i = 0; i < 16; i += 2) + { + arena.add_block(chunk_addr(BASE + i), 1); + arena.check_invariant(true); + } + + // Step 2: fill odd-indexed gaps. Each add consolidates with its + // even-indexed neighbours. The last add completes the arena. + for (size_t i = 1; i < 16; i += 2) + { + arena.add_block(chunk_addr(BASE + i), 1); + // Don't check invariant on the last add — it returns overflow. 
+ if (i < 15) + { + arena.check_invariant(true); + } + } + + // The last add should have triggered overflow (16 chunks = 2^4). + auto r = arena.remove_block(1); + SNMALLOC_ASSERT(r.first == 0); + UNUSED(r); + + printf(" Overflow (arena-scale consolidation): OK\n"); + } + + static void test_overflow_precise() + { + // K=4 → 16-chunk arena. Use base offset 16 to avoid address 0. + reset_mock_store(); + Arena<4> arena; + + constexpr size_t BASE = 16; + + arena.add_block(chunk_addr(BASE), 8); + arena.check_invariant(true); + + // Adding [BASE+8, BASE+16) consolidates to 16 chunks = 2^4 → overflow. + auto r = arena.add_block(chunk_addr(BASE + 8), 8); + SNMALLOC_ASSERT(r.first == chunk_addr(BASE)); + SNMALLOC_ASSERT(r.second == 16); + UNUSED(r); + + auto r2 = arena.remove_block(1); + SNMALLOC_ASSERT(r2.first == 0); + UNUSED(r2); + + printf(" Overflow precise: OK\n"); + } + + // ================================================================== + // (H) Randomised stress with oracle + // ================================================================== + + // Oracle: std::set of (addr_chunks, size_chunks) representing + // maximally-consolidated free set. 
+ struct OracleRange + { + size_t addr; // in chunk units + size_t size; // in chunk units + + bool operator<(const OracleRange& o) const + { + return addr < o.addr; + } + + bool operator==(const OracleRange& o) const + { + return addr == o.addr && size == o.size; + } + }; + + class Oracle + { + std::set ranges; + size_t base_offset; // chunk offset to match arena addresses + + public: + Oracle() : base_offset(0) {} + + Oracle(size_t base) : base_offset(base) {} + + void add(size_t addr_chunks, size_t size_chunks) + { + OracleRange key{addr_chunks, size_chunks}; + auto it = ranges.lower_bound(key); + + size_t new_addr = addr_chunks; + size_t new_size = size_chunks; + + if (it != ranges.end() && it->addr == new_addr + new_size) + { + new_size += it->size; + it = ranges.erase(it); + } + + if (it != ranges.begin()) + { + auto prev = std::prev(it); + if (prev->addr + prev->size == new_addr) + { + new_addr = prev->addr; + new_size += prev->size; + ranges.erase(prev); + } + } + + ranges.insert({new_addr, new_size}); + } + + // Returns {addr_chunks, size_chunks} or {0, 0} if nothing fits. + // addr_chunks is oracle-relative (without base offset). + std::pair remove(size_t n_chunks) + { + if (n_chunks == 0 || n_chunks > Bins::max_supported_chunks()) + return {0, 0}; + + // Mirror the arena exactly: build a bitmap using arena-offset + // addresses (so bin classification matches), then find_for_request. + typename Bins::Bitmap bm{}; + std::map::iterator>> by_bin; + + for (auto it = ranges.begin(); it != ranges.end(); ++it) + { + // Use base-offset address for bin classification. 
+ Bins::range_t r{base_offset + it->addr, it->size}; + size_t bin = bm.add(r); + by_bin[bin].push_back(it); + } + + size_t bin_id = bm.find_for_request(n_chunks); + if (bin_id == SIZE_MAX) + return {0, 0}; + + auto& entries = by_bin[bin_id]; + auto best_it = entries[0]; + for (size_t i = 1; i < entries.size(); i++) + { + if (entries[i]->addr < best_it->addr) + best_it = entries[i]; + } + + OracleRange block = *best_it; + ranges.erase(best_it); + + // Carve using base-offset address. + auto carved = + Bins::carve({base_offset + block.addr, block.size}, n_chunks); + if (carved.pre.size != 0) + ranges.insert({carved.pre.base - base_offset, carved.pre.size}); + if (carved.post.size != 0) + ranges.insert({carved.post.base - base_offset, carved.post.size}); + + return {carved.req.base - base_offset, carved.req.size}; + } + + bool empty() const + { + return ranges.empty(); + } + + size_t count() const + { + return ranges.size(); + } + }; + + template + static void test_stress_seed(size_t seed, size_t num_ops) + { + reset_mock_store(); + Arena arena; + + constexpr size_t ARENA_CHUNKS = bits::one_at_bit(K); + // Offset all chunk addresses to avoid address 0 (tree null). + constexpr size_t BASE = ARENA_CHUNKS; + Oracle oracle(BASE); + // Track which chunks are allocated (not free). + std::vector allocated(ARENA_CHUNKS, true); + + xoroshiro::p128r64 rng(seed); + + for (size_t op = 0; op < num_ops; op++) + { + bool do_add = (rng.next() % 3) != 0; // Bias towards adding. + + if (do_add) + { + // Find a free address range of random size within the arena. + size_t max_size = ARENA_CHUNKS / 4; + if (max_size < 1) + max_size = 1; + size_t size = (rng.next() % max_size) + 1; + size_t start = rng.next() % ARENA_CHUNKS; + + // Adjust: find a contiguous allocated (not free) region. + // We need a region that's currently allocated (not in the + // free set) to add back. 
+ bool found = false; + for (size_t try_start = start; try_start < ARENA_CHUNKS; try_start++) + { + // Check if [try_start, try_start + size) is all allocated. + size_t actual_size = 0; + for (size_t j = try_start; j < ARENA_CHUNKS && j < try_start + size; + j++) + { + if (!allocated[j]) + break; + actual_size++; + } + + if (actual_size >= 1) + { + size = actual_size; + start = try_start; + found = true; + break; + } + } + + if (!found) + continue; + + // Clamp to arena size limit. + if (size >= ARENA_CHUNKS) + size = ARENA_CHUNKS - 1; + if (start + size > ARENA_CHUNKS) + size = ARENA_CHUNKS - start; + if (size == 0) + continue; + + // Mark as free. + SNMALLOC_ASSERT(start + size <= ARENA_CHUNKS); + for (size_t j = start; j < start + size; j++) + allocated[j] = false; + + auto result = arena.add_block(chunk_addr(BASE + start), size); + oracle.add(start, size); + + if (result.first != 0) + { + // Overflow — all chunks are now free and returned to caller. + // Oracle should be empty after we remove the overflow range. + // Reset: mark everything as allocated again, clear oracle. + for (size_t j = 0; j < ARENA_CHUNKS; j++) + allocated[j] = true; + oracle = Oracle(BASE); + // The overflow range isn't tracked by the arena anymore. + } + + arena.check_invariant(true); + } + else + { + // Remove. + size_t max_req = ARENA_CHUNKS / 4; + if (max_req < 1) + max_req = 1; + size_t n = (rng.next() % max_req) + 1; + + auto arena_result = arena.remove_block(n); + auto oracle_result = oracle.remove(n); + UNUSED(arena_result); + + // Both should agree on success/failure. + // Use size == 0 to detect failure, since oracle address 0 is valid. + if (oracle_result.second == 0) + { + SNMALLOC_ASSERT(arena_result.second == 0); + } + else + { + SNMALLOC_ASSERT(arena_result.second != 0); + // Both should return the same address and size. 
+ SNMALLOC_ASSERT( + arena_result.first == chunk_addr(BASE + oracle_result.first)); + SNMALLOC_ASSERT(arena_result.second == oracle_result.second); + + // Mark as allocated. + size_t start = oracle_result.first; + SNMALLOC_ASSERT(start + oracle_result.second <= ARENA_CHUNKS); + for (size_t j = start; j < start + oracle_result.second; j++) + allocated[j] = true; + } + + arena.check_invariant(true); + } + } + } + + static void test_stress() + { + constexpr size_t K = 6; // 64-chunk arena + constexpr size_t NUM_OPS = 500; + constexpr size_t NUM_SEEDS = 50; + + for (size_t seed = 1; seed <= NUM_SEEDS; seed++) + { + test_stress_seed(seed, NUM_OPS); + } + printf( + " Randomised stress (%zu seeds x %zu ops): OK\n", NUM_SEEDS, NUM_OPS); + } + +} // namespace snmalloc + +int main() +{ + printf("--- BackendArena tests ---\n"); + + printf("(A) Accessor round-trips:\n"); + snmalloc::test_variant_roundtrip(); + snmalloc::test_large_size_roundtrip(); + snmalloc::test_word_roundtrip(); + + printf("(B) RBTree smoke via arena:\n"); + snmalloc::test_rbtree_smoke_via_arena(); + + printf("(C) Empty-state invariant:\n"); + snmalloc::test_empty_invariant<4>(); + snmalloc::test_empty_invariant<5>(); + snmalloc::test_empty_invariant<6>(); + + printf("(D) add_block without consolidation:\n"); + snmalloc::test_add_no_consolidation(); + + printf("(E) remove_block:\n"); + snmalloc::test_remove_exact(); + snmalloc::test_remove_carving(); + + printf("(F) Consolidation case matrix:\n"); + snmalloc::test_consolidation_p_min(); + snmalloc::test_consolidation_p_nonmin(); + snmalloc::test_consolidation_s_min(); + snmalloc::test_consolidation_s_nonmin(); + snmalloc::test_consolidation_ps_both_min(); + snmalloc::test_consolidation_ps_p_min_s_nonmin(); + snmalloc::test_consolidation_ps_p_nonmin_s_min(); + snmalloc::test_consolidation_ps_both_nonmin(); + + printf("(F2) OddTwo (unaligned size-2):\n"); + snmalloc::test_oddtwo_variant(); + snmalloc::test_oddtwo_contains_min_filter(); + 
snmalloc::test_oddtwo_consolidation(); + snmalloc::test_oddtwo_consolidation_pred(); + snmalloc::test_oddtwo_remove_carve(); + + printf("(G) Overflow:\n"); + snmalloc::test_overflow(); + snmalloc::test_overflow_precise(); + + printf("(H) Randomised stress:\n"); + snmalloc::test_stress(); + + printf("All BackendArena tests passed.\n"); + return 0; +} From 7adf7bad258aace5f1116f7374535b9bac468879 Mon Sep 17 00:00:00 2001 From: Matthew Parkinson Date: Thu, 21 May 2026 17:22:04 +0100 Subject: [PATCH 05/31] Add multi-instance BackendArena tests with shared pagemap Tests two BackendArena instances sharing a single MockRep pagemap: - Basic migration: blocks move between arenas - Consolidation after migration: gap block consolidates with neighbours - Randomised stress: 50 seeds x 500 ops with add/remove/migrate Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- PLAN.md | 31 +-- src/test/func/backend_arena/backend_arena.cc | 252 +++++++++++++++++++ 2 files changed, 256 insertions(+), 27 deletions(-) diff --git a/PLAN.md b/PLAN.md index e8023f096..ed0666c60 100644 --- a/PLAN.md +++ b/PLAN.md @@ -1026,33 +1026,10 @@ All changes are in `backend_arena.h` and the test file. ### Phase 6: Consolidation — reuse predecessor's Range entry (optimisation) -Switch the P-merge case to reuse `P`'s Range tree node (no RB mutation), -but **only when `P` is non-min** (a min-size `P` has no Range entry to -reuse). The S-only case continues to use remove+reinsert. The P+S case -reuses `P` (when non-min) and removes `S`. When `P` is min-size, the -merged block is inserted into the Range tree normally. - -**Test gate**: all Phase 3+4 tests still pass. Add debug-only counters at -the `BackendArena` layer (not inside `RBTree`) for "Range tree -`insert_path` calls" and "Range tree `remove_path` calls" during -`add_block` / `remove_block`. 
Assert that: - -- the non-min-P-only consolidation case records zero - Range-tree insert/remove calls (the existing node is reused in place), -- the min-P-only consolidation case records exactly one Range-tree insert - (no remove), -- the S-only consolidation case records one insert and one remove. - -This avoids any modification to `RBTree` itself — the counter increments -sit in the `BackendArena` wrappers around its Range-tree calls. - -**Review gate**: spec slice = "Consolidation: reusing tree entries when -possible" and the Phase 6 section above. Reviewer checks: reuse path -correctly leaves the Range-tree node in place (key unchanged, only the -back-reference from the new combined block); min-P case correctly falls -back to normal insert; counter assertions cover the cases that -distinguish the optimised path from the simple path; no regression of -Phase 3+4's full invariant + oracle randomised test. +**Deferred.** Self-contained optimisation that saves two RB-tree operations +per predecessor consolidation. Can be added later if profiling shows it +matters. The design is recorded in the "Consolidation: reusing tree +entries when possible" section above. ### Phase 7: Multi-instance test diff --git a/src/test/func/backend_arena/backend_arena.cc b/src/test/func/backend_arena/backend_arena.cc index c0da85692..ec5bcd232 100644 --- a/src/test/func/backend_arena/backend_arena.cc +++ b/src/test/func/backend_arena/backend_arena.cc @@ -966,6 +966,253 @@ namespace snmalloc " Randomised stress (%zu seeds x %zu ops): OK\n", NUM_SEEDS, NUM_OPS); } + // ================================================================== + // (I) Multi-instance: shared pagemap, blocks migrating between arenas + // ================================================================== + + static void test_multi_instance_basic() + { + reset_mock_store(); + Arena<8> arena_a; + Arena<8> arena_b; + constexpr size_t BASE = 256; // avoid address 0 + + // Add distinct blocks to each arena. 
+ arena_a.add_block(chunk_addr(BASE + 10), 5); + arena_b.add_block(chunk_addr(BASE + 30), 5); + arena_a.check_invariant(true); + arena_b.check_invariant(true); + + // Migrate a block from A to B. + auto [a_addr, a_size] = arena_a.remove_block(3); + SNMALLOC_ASSERT(a_addr != 0 && a_size != 0); + arena_a.check_invariant(true); + + arena_b.add_block(a_addr, a_size); + arena_a.check_invariant(true); + arena_b.check_invariant(true); + + // Migrate from B back to A. + auto [b_addr, b_size] = arena_b.remove_block(2); + SNMALLOC_ASSERT(b_addr != 0 && b_size != 0); + arena_b.check_invariant(true); + + arena_a.add_block(b_addr, b_size); + arena_a.check_invariant(true); + arena_b.check_invariant(true); + + printf(" Basic migration: OK\n"); + } + + static void test_multi_instance_consolidation() + { + reset_mock_store(); + Arena<8> arena_a; + Arena<8> arena_b; + constexpr size_t BASE = 256; + + // Arena B holds two blocks with a gap: [20..24) and [28..32). + arena_b.add_block(chunk_addr(BASE + 20), 4); + arena_b.add_block(chunk_addr(BASE + 28), 4); + arena_b.check_invariant(true); + + // Arena A holds the gap: [24..28). + arena_a.add_block(chunk_addr(BASE + 24), 4); + arena_a.check_invariant(true); + + // Migrate the gap from A to B → should consolidate into [20..32). + auto [addr, size] = arena_a.remove_block(4); + SNMALLOC_ASSERT(addr == chunk_addr(BASE + 24)); + SNMALLOC_ASSERT(size == 4); + arena_a.check_invariant(true); + + arena_b.add_block(addr, size); + arena_b.check_invariant(true); + + // B should now serve a size-12 request from the consolidated block. 
+ auto [r_addr, r_size] = arena_b.remove_block(12); + SNMALLOC_ASSERT(r_addr == chunk_addr(BASE + 20)); + SNMALLOC_ASSERT(r_size == 12); + arena_b.check_invariant(true); + + printf(" Consolidation after migration: OK\n"); + } + + template + static void test_multi_stress_seed(size_t seed, size_t num_ops) + { + reset_mock_store(); + Arena arena_a; + Arena arena_b; + + constexpr size_t ARENA_CHUNKS = bits::one_at_bit(K); + constexpr size_t BASE = ARENA_CHUNKS; + Oracle oracle_a(BASE); + Oracle oracle_b(BASE); + + // 0 = not in any arena, 1 = in arena A, 2 = in arena B. + std::vector owner(ARENA_CHUNKS, 0); + + xoroshiro::p128r64 rng(seed); + + for (size_t op = 0; op < num_ops; op++) + { + // 0,1 = add to A or B; 2,3 = remove from A or B; 4 = migrate. + size_t action = rng.next() % 5; + + bool target_a = (action & 1) == 0; + auto& arena = target_a ? arena_a : arena_b; + auto& oracle = target_a ? oracle_a : oracle_b; + uint8_t my_id = target_a ? 1 : 2; + + if (action <= 1) + { + // Add: find a contiguous unowned region to free into this arena. 
+ size_t max_size = ARENA_CHUNKS / 4; + if (max_size < 1) + max_size = 1; + size_t size = (rng.next() % max_size) + 1; + size_t start = rng.next() % ARENA_CHUNKS; + + bool found = false; + for (size_t s = start; s < ARENA_CHUNKS; s++) + { + size_t actual = 0; + for (size_t j = s; j < ARENA_CHUNKS && j < s + size; j++) + { + if (owner[j] != 0) + break; + actual++; + } + if (actual >= 1) + { + size = actual; + start = s; + found = true; + break; + } + } + if (!found) + continue; + + if (size >= ARENA_CHUNKS) + size = ARENA_CHUNKS - 1; + if (start + size > ARENA_CHUNKS) + size = ARENA_CHUNKS - start; + if (size == 0) + continue; + + for (size_t j = start; j < start + size; j++) + owner[j] = my_id; + + auto result = arena.add_block(chunk_addr(BASE + start), size); + oracle.add(start, size); + + if (result.first != 0) + { + for (size_t j = 0; j < ARENA_CHUNKS; j++) + if (owner[j] == my_id) + owner[j] = 0; + oracle = Oracle(BASE); + } + + arena.check_invariant(true); + } + else if (action <= 3) + { + // Remove from this arena. + size_t max_req = ARENA_CHUNKS / 4; + if (max_req < 1) + max_req = 1; + size_t n = (rng.next() % max_req) + 1; + + auto arena_r = arena.remove_block(n); + auto oracle_r = oracle.remove(n); + + if (oracle_r.second == 0) + { + SNMALLOC_ASSERT(arena_r.second == 0); + } + else + { + SNMALLOC_ASSERT(arena_r.second != 0); + SNMALLOC_ASSERT(arena_r.first == chunk_addr(BASE + oracle_r.first)); + SNMALLOC_ASSERT(arena_r.second == oracle_r.second); + + for (size_t j = oracle_r.first; j < oracle_r.first + oracle_r.second; + j++) + { + SNMALLOC_ASSERT(owner[j] == my_id); + owner[j] = 0; + } + } + + arena.check_invariant(true); + } + else + { + // Migrate: remove from one arena, add to the other. + bool from_a = (rng.next() & 1) == 0; + auto& src = from_a ? arena_a : arena_b; + auto& src_oracle = from_a ? oracle_a : oracle_b; + auto& dst = from_a ? arena_b : arena_a; + auto& dst_oracle = from_a ? oracle_b : oracle_a; + uint8_t src_id = from_a ? 
1 : 2; + uint8_t dst_id = from_a ? 2 : 1; + + size_t n = (rng.next() % 3) + 1; + auto src_r = src.remove_block(n); + auto src_or = src_oracle.remove(n); + + if (src_or.second == 0) + { + SNMALLOC_ASSERT(src_r.second == 0); + } + else + { + SNMALLOC_ASSERT(src_r.second != 0); + SNMALLOC_ASSERT(src_r.first == chunk_addr(BASE + src_or.first)); + SNMALLOC_ASSERT(src_r.second == src_or.second); + + for (size_t j = src_or.first; j < src_or.first + src_or.second; j++) + { + SNMALLOC_ASSERT(owner[j] == src_id); + owner[j] = dst_id; + } + + auto dst_r = dst.add_block(src_r.first, src_r.second); + dst_oracle.add(src_or.first, src_or.second); + + if (dst_r.first != 0) + { + for (size_t j = 0; j < ARENA_CHUNKS; j++) + if (owner[j] == dst_id) + owner[j] = 0; + dst_oracle = Oracle(BASE); + } + } + + src.check_invariant(true); + dst.check_invariant(true); + } + } + } + + static void test_multi_stress() + { + constexpr size_t K = 6; // 64-chunk arena + constexpr size_t NUM_OPS = 500; + constexpr size_t NUM_SEEDS = 50; + + for (size_t seed = 1; seed <= NUM_SEEDS; seed++) + test_multi_stress_seed(seed, NUM_OPS); + + printf( + " Multi-instance stress (%zu seeds x %zu ops): OK\n", + NUM_SEEDS, + NUM_OPS); + } + } // namespace snmalloc int main() @@ -1016,6 +1263,11 @@ int main() printf("(H) Randomised stress:\n"); snmalloc::test_stress(); + printf("(I) Multi-instance:\n"); + snmalloc::test_multi_instance_basic(); + snmalloc::test_multi_instance_consolidation(); + snmalloc::test_multi_stress(); + printf("All BackendArena tests passed.\n"); return 0; } From 42663316a306aee2af50a30bac19b2152f07aaa1 Mon Sep 17 00:00:00 2001 From: Matthew Parkinson Date: Fri, 22 May 2026 09:11:48 +0100 Subject: [PATCH 06/31] BackendArena: representation-agnostic Rep concept + boundary support Phase 9 of the BackendArena refactor: - Make BackendArena fully generic over its Rep, mirroring the Buddy/Rep layering. 
The class no longer holds any bit-layout constants; Rep supplies the full RBTree Rep for both the bin trees and the range tree, owning red-bit (and any tag-bit) packing privately. - Rep concept now requires: using BinRep -- full RBTree Rep for the bin trees using RangeRep -- full RBTree Rep for the range tree get_variant / set_variant get_large_size_chunks / set_large_size_chunks can_consolidate(higher_addr) -> bool - Add can_consolidate checks in add_block before each (predecessor and successor) merge, and update the invariants to tolerate boundary-blocked adjacency. - MockRep grows inner BinRep / RangeRep structs that each provide the full RBTree Rep interface over the mock-entry array, with a private red-bit at bit 8. - New tests verify that can_consolidate returning false at a specific address prevents predecessor- and successor-side merges independently, including at min-block boundaries. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- PLAN.md | 518 ++++++++++++++++++- src/snmalloc/backend_helpers/backend_arena.h | 181 ++----- src/test/func/backend_arena/backend_arena.cc | 306 ++++++++++- 3 files changed, 848 insertions(+), 157 deletions(-) diff --git a/PLAN.md b/PLAN.md index ed0666c60..2c058f6a3 100644 --- a/PLAN.md +++ b/PLAN.md @@ -1050,7 +1050,523 @@ Per `claude.md` mandatory review checkpoints: **Test gate**: full ctest run (Debug) passes; reviewer reports no issues. -## Files added / changed (anticipated) +--- + +# Implementation plan: BackendArenaRange phase + +## Scope + +Build `BackendArenaRange` — a Range pipeline component that wraps +`BackendArena` behind snmalloc's Range API, suitable for replacing +`LargeBuddyRange`. This plan covers: + +- Generalising BackendArena's Rep interface for pagemap compatibility. +- `PagemapRep` — adapting pagemap entries to BackendArena's Rep concept. +- `BackendArenaRange` — the Range wrapper with refill and overflow handling. 
+- Boundary-bit support for safe consolidation across PAL allocations. +- Unit tests for all of the above. + +The pipeline integration (replacing `LargeBuddyRange` in `standard_range.h` +and `meta_protected_range.h`) is a separate step ("Update backend to use +BackendArenaRange") that follows once this plan is complete. + +## Design + +### Rep generalisation: representation-agnostic data structure + +`BackendArena` must be representation-agnostic, mirroring how +`Buddy<>` is generic over its node `Rep` (see `buddy.h`). The +existing buddy ecosystem demonstrates the layering: + +- `buddy.h` — pure data structure, no representation. +- `largebuddyrange.h` defines `BuddyChunkRep` — a pagemap-backed Rep + (red bit at bit 8, layout chosen to coexist with the pagemap's + reserved low bits). +- `smallbuddyrange.h` defines `BuddyInplaceRep` — an inline Rep that + stores tree pointers in the free chunk itself (red bit at bit 0). + +`BackendArena` must support the same two representation paths so it +can eventually replace both `LargeBuddyRange` (pagemap) and +`SmallBuddyRange` (inline) in the standard pipeline. + +#### Rep concept + +`Rep` provides: + +- `using BinRep` — full RBTree Rep for the bin trees. +- `using RangeRep` — full RBTree Rep for the range tree. +- `get_variant(addr)` / `set_variant(addr, v)` — block variant tag. +- `get_large_size_chunks(addr)` / `set_large_size_chunks(addr, n)` — + precise chunk count for `Large` blocks. +- `can_consolidate(higher_addr)` — false at PAL allocation boundaries. + +Each inner `BinRep` / `RangeRep` is a complete RBTree Rep (same shape +as `BuddyChunkRep` / `BuddyInplaceRep`): provides `Handle`, +`Contents`, `null`, `root`, `ref`, `get`, `set`, `is_red`, `set_red`, +`compare`, `equal`, `printable`, `name`. **All bit-packing decisions +(red bit position, mask layout) are private to the Rep** — +`BackendArena` carries no `RED_BIT` / `VARIANT_MASK` / `META_MASK` +constants of its own. 
+ +`BackendArena` instantiates `RBTree<typename Rep::BinRep>` and +`RBTree<typename Rep::RangeRep>` directly. It never inspects the bit +layout used by the Rep. + +#### PagemapRep (production) + +Lives in `backend_arena_range.h`. Privately owns its bit layout: + +- Bin tree node in pagemap entry at `addr`, Word::One/Two. `BinRep` + packs the red bit at bit 8 and the variant tag at bits 9–10 of + Word::One; bits 0–7 are reserved by the pagemap. +- Range tree node in pagemap entry at `addr + MIN_CHUNK_SIZE`, with + the same layout for `RangeRep`. +- Large-size chunk count stored as `count << 8` in Word::One of the + entry at `addr + 2*MIN_CHUNK_SIZE`. + +#### MockRep (test only) + +Lives in `src/test/func/backend_arena/backend_arena.cc`. Backs its +storage with an array of `mock_entry` and uses the same `RED_BIT = +1 << 8` layout (so it exercises the same code paths the pagemap Rep +will). The mock's `BinRep` and `RangeRep` are inner structs that own +their own ref/get/set/is_red/set_red implementations. + +#### Future inline Rep (not in this phase) + +Mirroring `BuddyInplaceRep`: tree pointers live inside the free +memory itself. `BinRep` / `RangeRep` would use pointer-low-bits for +red and variant tags. This is what enables a future +`BackendArena`-based replacement for `SmallBuddyRange`. + +### Boundary-bit consolidation check + +On platforms where `CONSOLIDATE_PAL_ALLOCS` is false (CHERI, Windows), +the pagemap sets a boundary bit on the first chunk of each PAL allocation +to prevent consolidation across allocation boundaries +(`BuddyChunkRep::can_consolidate` checks this). + +BackendArena's `add_block` consolidation must respect the same contract. +A new method on the Rep concept: + +``` +static bool can_consolidate(uintptr_t higher_addr); +``` + +Returns `true` if the block at `higher_addr` may be consolidated with the +block immediately below it. `add_block` checks this before each merge: + +- P ↔ A merge: `Rep::can_consolidate(addr)` (A is the higher address).
+- A ↔ S merge: `Rep::can_consolidate(succ_addr)` (S is the higher address). + +MockRep: always returns `true` (no boundaries). +PagemapRep: returns `!get_metaentry_mut(higher_addr).is_boundary()`. + +### PagemapRep + +Templated on `Pagemap` and `MAX_CHUNKS_BITS`. The second parameter is +needed for the large-size-shift static assertion: + +``` +template<typename Pagemap, size_t MAX_CHUNKS_BITS> +struct PagemapRep { ... }; +``` + +Each free block uses pagemap entries at three offsets from its base +address: + +- **Chunk 0** (`addr`): Word::One / Word::Two → bin-tree node. + Bits 9–10 of Word::One → variant tag. Bit 8 → RED_BIT. All coexist + because `TreeRep::set` preserves `META_MASK` on writes. +- **Chunk 1** (`addr + MIN_CHUNK_SIZE`): Word::One / Word::Two → + range-tree node (only for blocks ≥ 2 chunks). +- **Chunk 2** (`addr + 2 * MIN_CHUNK_SIZE`): Word::One → large chunk + count (only for blocks ≥ 3 chunks). Stored as `count << 8` to avoid + the 8 reserved low bits; recovered via `word.get() >> 8`. + +**Static assertions in PagemapRep** (catch configuration errors early): + +- `static_assert((VARIANT_MASK | RED_BIT) < MIN_CHUNK_SIZE)` — metadata + bits don't collide with address bits. +- `static_assert(MetaEntryBase::is_backend_allowed_value(Word::One, + VARIANT_MASK | RED_BIT))` — all metadata bits are in the backend- + allowed range. +- `static_assert(MAX_CHUNKS_BITS + 8 <= bits::BITS)` — shifted large + size fits in a pagemap word. + +Method mapping: + +| Method | Implementation | +|--------|---------------| +| `ref_word(dir, addr)` | `get_metaentry_mut(addr).get_backend_word(dir ? One : Two)` | +| `ref_range_word(dir, addr)` | `get_metaentry_mut(addr + MCS).get_backend_word(dir ?
One : Two)` | +| `get_variant(addr)` | `(ref_word(true, addr).get() & VARIANT_MASK) >> 9` | +| `set_variant(addr, v)` | RMW on `ref_word(true, addr)`: clear VARIANT_MASK, OR new value | +| `get_large_size_chunks(addr)` | `get_metaentry_mut(addr + 2*MCS).get_backend_word(One).get() >> 8` | +| `set_large_size_chunks(addr, s)` | `...get_backend_word(One) = s << 8` | +| `can_consolidate(addr)` | `!get_metaentry_mut(addr).is_boundary()` | + +(`MCS = MIN_CHUNK_SIZE`) + +`get_backend_word` auto-calls `claim_for_backend()` on first access to +an unowned entry, so pagemap ownership transitions happen implicitly. +The boundary bit (bit 0 of `meta`) is in the reserved-mask zone and is +preserved by both `claim_for_backend()` and `BackendStateWordRef::operator=`. + +### BackendArenaRange + +Outer template matches `LargeBuddyRange`'s shape so it is a drop-in +replacement in `Pipe<...>` compositions: + +``` +template< + size_t REFILL_SIZE_BITS, + size_t MAX_SIZE_BITS, + SNMALLOC_CONCEPT(IsWritablePagemap) Pagemap, + size_t MIN_REFILL_SIZE_BITS = 0> +class BackendArenaRange +{ +public: + template> + class Type : public ContainsParent + { + using ContainsParent::parent; + + static constexpr size_t MAX_CHUNKS_BITS = MAX_SIZE_BITS - MIN_CHUNK_BITS; + using PagemapRepT = PagemapRep; + BackendArena arena; + size_t requested_total = 0; + + public: + static constexpr bool Aligned = true; + static constexpr bool ConcurrencySafe = false; + using ChunkBounds = capptr::bounds::Arena; + + capptr::Arena alloc_range(size_t size); + void dealloc_range(capptr::Arena base, size_t size); + }; +}; +``` + +**`alloc_range(size)`**: + +1. `SNMALLOC_ASSERT(size >= MIN_CHUNK_SIZE)`. +2. `SNMALLOC_ASSERT(bits::is_pow2(size))` — same assertion as + `LargeBuddyRange`. Non-power-of-two support is deferred to + "Update front-end" step. +3. `n_chunks = size >> MIN_CHUNK_BITS`. +4. 
Oversize bypass: if `n_chunks >= bits::one_at_bit(MAX_CHUNKS_BITS)`, + delegate to `parent.alloc_range(size)` (if `ParentRange::Aligned`), + else return `nullptr`. Same as `LargeBuddyRange`. +5. `auto [addr, actual] = arena.remove_block(n_chunks)`. + (`actual == n_chunks` for power-of-two requests — the `is_pow2` + assertion above guarantees this for this phase.) +6. If `addr != 0`, return + `capptr::Arena::unsafe_from(reinterpret_cast(addr))`. +7. If `addr == 0`, call `refill(size)`. + +**`dealloc_range(base, size)`**: + +1. `addr = base.unsafe_uintptr()`, `n_chunks = size >> MIN_CHUNK_BITS`. +2. Oversize bypass: if `n_chunks >= bits::one_at_bit(MAX_CHUNKS_BITS)`, + delegate to `parent.dealloc_range(base, size)`. Same condition and + SFINAE guard as `LargeBuddyRange::parent_dealloc_range`. +3. `auto [ov_addr, ov_size] = arena.add_block(addr, n_chunks)`. +4. If overflow (`ov_addr != 0`): call `dealloc_overflow(ov_addr, + ov_size)`. + +**`dealloc_overflow(addr, size_chunks)`** — matches +`LargeBuddyRange::dealloc_overflow` pattern: + +Overflow from `add_block` can produce non-power-of-two sizes (e.g., +consolidated blocks that span multiple non-aligned PAL allocations). +The parent may require power-of-two aligned inputs. So overflow is +decomposed using `range_to_pow_2_blocks`: + +``` +void dealloc_overflow(uintptr_t addr, size_t size_chunks) +{ + auto base = capptr::Arena::unsafe_from( + reinterpret_cast(addr)); + size_t size_bytes = size_chunks << MIN_CHUNK_BITS; + if constexpr (MAX_SIZE_BITS != (bits::BITS - 1)) + { + range_to_pow_2_blocks( + base, size_bytes, + [this](capptr::Arena b, size_t s, bool) { + parent.dealloc_range(b, s); + }); + } + else + { + // Global range: no parent to return to. + SNMALLOC_CHECK(false && "Global range overflow should not happen"); + } +} +``` + +When `MAX_SIZE_BITS == BITS - 1` (global range), the arena covers the +entire address space. 
Overflow would mean all managed memory has +coalesced — this should not happen in normal operation. If it does, +abort (matching `LargeBuddyRange`'s behaviour for the unreachable +case). + +**`refill(size)`** — closely follows `LargeBuddyRange::refill`: + +For `ParentRange::Aligned` (the standard path): + +1. Compute `refill_size = min(REFILL_SIZE, requested_total)`, clamped to + `max(MIN_REFILL_SIZE, size)`, rounded up to next power of two. +2. `auto refill_range = parent.alloc_range(refill_size)`. +3. If `refill_range != nullptr`: + - `requested_total += refill_size`. + - `remainder_size = refill_size - size`. + - If `remainder_size > 0`: + `arena.add_block(refill_range.unsafe_uintptr() + size, + remainder_size >> MIN_CHUNK_BITS)`. + Handle overflow (send to parent). + - Return `refill_range`. +4. If `nullptr`, return `nullptr`. + +The returned portion (`refill_range` to `refill_range + size`) bypasses +the arena entirely — it is not inserted or tracked. The remainder is +added to the arena for future allocations. Since the remainder comes +from a fresh refill and has no neighbours in the arena, `add_block` +performs a simple insertion with no consolidation (boundary bit on the +refill base may prevent consolidation with any pre-existing blocks +below it, which is correct). + +For the unaligned parent path: over-allocate `2 * size` (with overflow +check), add everything to the arena via `add_range` (which decomposes +into power-of-two blocks using `range_to_pow_2_blocks`), then call +`alloc_range(size)` recursively. Same logic as `LargeBuddyRange`. + +Safety guards (both from `LargeBuddyRange`): +- `static_assert((REFILL_SIZE < bits::one_at_bit(MAX_SIZE_BITS)) || + ParentRange::Aligned)` — prevents the unaligned path from adding a + block that violates `add_block`'s `size_chunks < 2^MAX_CHUNKS_BITS` + precondition. 
+- Runtime: `SNMALLOC_ASSERT(refill_size < bits::one_at_bit(MAX_SIZE_BITS))` + — catches the computed `refill_size` (which may be larger than + `REFILL_SIZE` when `needed_size = 2 * size` dominates). + +### Static properties + +- `Aligned = true`: BackendArena's carving ensures that a request of + size `n` (power-of-two, chunk-aligned) is placed at an `n`-aligned + address within the source block. For non-power-of-two requests + (future: step 4), the bin scheme's alignment rules still hold. +- `ConcurrencySafe = false`: same as `LargeBuddyRange`. +- `ChunkBounds = capptr::bounds::Arena`: same as `LargeBuddyRange`. + +### MAX_SIZE_BITS = BITS - 1 (global range) + +The global `LargeBuddyRange` uses `MAX_SIZE_BITS = BITS - 1`, meaning +the buddy can hold up to half the address space. For BackendArenaRange: +`MAX_CHUNKS_BITS = (BITS - 1) - MIN_CHUNK_BITS`. On 64-bit with +`MIN_CHUNK_BITS = 14`, this is 49 — the arena can hold up to 2^49 +chunks. The arena's overflow path returns consolidated blocks that +reach this size, handled by `dealloc_overflow` (see above). + +The `large_size_chunks` field (stored shifted by 8 in a pagemap word) +needs at most 49 bits, which fits in the 56 backend-usable bits of a +64-bit pagemap word. A `static_assert(MAX_CHUNKS_BITS + 8 <= BITS)` +in `PagemapRep` catches configurations where this would overflow. + +## Phases + +### Phase 9: Rep generalisation + boundary support + +**Status**: implemented; staged (not committed); awaiting review. + +Changes to `backend_arena.h`: + +1. Delete the private `WordRef` nested struct, the `TreeRep` + template, and all bit-layout constants + (`RED_BIT`/`VARIANT_MASK`/`META_MASK` and `BACKEND_RESERVED_MASK`). + `BackendArena` is now representation-agnostic, mirroring how + `buddy.h` is generic over its node `Rep`. +2. 
Replace the internal `using BinRep = TreeRep` / + `RangeRep = TreeRep` aliases with direct use + of `typename Rep::BinRep` and `typename Rep::RangeRep` — full + RBTree Reps supplied by the user, owning their own bit packing. +3. Update the Rep concept doc to require `BinRep`, `RangeRep`, + `get_variant`/`set_variant`, + `get_large_size_chunks`/`set_large_size_chunks`, and + `can_consolidate`. +4. Add `can_consolidate` calls in `add_block` before each merge + (predecessor and successor) and update the invariant clauses to + tolerate boundary-blocked adjacency. + +Changes to `backend_arena.cc` (test file): + +5. Define `BackendArenaWordRef` (test-only proxy) at the top of the + test file. +6. MockRep grows inner `BinRep` and `RangeRep` structs that each + provide the full RBTree Rep interface (ref/get/set/is_red/etc.) + over the mock-entry array. Each owns its own private bit layout + (red bit at bit 8 to match production layout). +7. MockRep keeps top-level `get_variant`/`set_variant`/large-size + accessors and adds `can_consolidate(uintptr_t) → true`. +8. New test: verify that a MockRep variant with `can_consolidate` + returning false at a specific address prevents consolidation across + that boundary. Test both predecessor and successor merges being + independently blocked. + +**Test gate**: all existing BackendArena tests pass unchanged; new +boundary test passes. + +### Phase 10: PagemapRep + BackendArenaRange + tests + +New file: `src/snmalloc/backend_helpers/backend_arena_range.h` + +1. `PagemapRep` — full Rep implementation using pagemap entries + as described above, with all static assertions. +2. `BackendArenaRange` — the Range wrapper with `alloc_range`, + `dealloc_range`, `refill`, and `dealloc_overflow`. + +Modified: `src/snmalloc/backend_helpers/backend_helpers.h` + +3. Add `#include "backend_arena_range.h"` so the new header is + available through the standard include path. + +New file: `src/test/func/backend_arena_range/backend_arena_range.cc` + +4. 
Test with snmalloc's `BasicPagemap` (or a test-appropriate pagemap): + - PagemapRep word round-trips (variant, tree words, large size). + - BackendArenaRange `alloc_range` / `dealloc_range` smoke test with + a simple parent range. + - Refill: verify that allocating when the arena is empty triggers a + parent refill and returns memory. + - Overflow: verify that deallocating a block that triggers arena-scale + consolidation passes the decomposed overflow to the parent via + `range_to_pow_2_blocks`. + - Overflow with non-power-of-two consolidated size: verify + decomposition produces valid power-of-two blocks. + - Boundary: verify that a boundary bit in the pagemap prevents + consolidation of adjacent blocks from different refills (when + `CONSOLIDATE_PAL_ALLOCS` is false). + - Test at largest configured `MAX_SIZE_BITS` values, especially + `MAX_SIZE_BITS == bits::BITS - 1` if feasible. + +Modified: `CMakeLists.txt` + +5. Register `backend_arena_range` in `TESTLIB_ONLY_TESTS`. + +**Test gate**: BackendArenaRange tests pass; existing tests unaffected. + +### Phase 11: Final review + +Per `claude.md` mandatory review checkpoints: + +- Spawn a fresh-context reviewer on the full diff (Phases 9–10). +- Address findings, loop until clean. + +**Test gate**: full ctest run passes; reviewer reports no issues. + +*Pipeline integration (replacing `LargeBuddyRange` in `standard_range.h` +and `meta_protected_range.h`) is a separate follow-up plan: "Update +backend to use BackendArenaRange."* + +## Files added / changed (anticipated, this phase) + +- Modified: `src/snmalloc/backend_helpers/backend_arena.h` — + representation-agnostic: delete private `WordRef`, `TreeRep`, and + all bit-layout constants (`RED_BIT`/`VARIANT_MASK`/`META_MASK`/ + reserved); use `Rep::BinRep` and `Rep::RangeRep` directly; + `can_consolidate` check in `add_block`; invariant clauses updated. +- New: `src/snmalloc/backend_helpers/backend_arena_range.h` — + `PagemapRep` + `BackendArenaRange`. 
+- Modified: `src/snmalloc/backend_helpers/backend_helpers.h` — include + `backend_arena_range.h`. +- Modified: `src/test/func/backend_arena/backend_arena.cc` — define + `BackendArenaWordRef` test helper at top of file; MockRep updated + (`BackendArenaWordRef` returns, `can_consolidate`); boundary tests. +- New: `src/test/func/backend_arena_range/backend_arena_range.cc` — + Range wrapper tests. +- Modified: `CMakeLists.txt` — register `backend_arena_range` test. + +## Key design decisions + +1. **Representation-agnostic data structure** — `BackendArena` + carries no bit-layout constants. All red/variant packing decisions + live in the user-supplied `Rep::BinRep` / `Rep::RangeRep`, matching + how `BuddyChunkRep` and `BuddyInplaceRep` each own their own + layouts. This is what makes a future inline Rep (to replace + `SmallBuddyRange`) possible. + +2. **PagemapRep variant in bin-tree Word::One** — PagemapRep packs + the variant tag at bits 9–10 of Word::One alongside the red bit + (bit 8) and child pointer (bits ≥ MIN_CHUNK_BITS). These are + private constants inside PagemapRep, not exposed by BackendArena. + +3. **Large size stored shifted** — PagemapRep stores the chunk count + as `count << 8` to avoid the pagemap's reserved low byte; recovered + via `>> 8`. Guarded by `static_assert(MAX_CHUNKS_BITS + 8 <= bits::BITS)`. + +4. **Boundary checks in BackendArena** — not in BackendArenaRange. + Consolidation decisions happen inside `add_block`, so the boundary + check must be there. The Rep concept cleanly abstracts this via + `can_consolidate`. + +5. **Refill returns prefix directly** — like LargeBuddyRange, the + first `size` bytes of a refill bypass the arena. Only the remainder + enters the arena. This avoids unnecessary tree operations on the + hot path. + +6. **PagemapRep auto-claims entries** — `get_backend_word` calls + `claim_for_backend()` on first access. No explicit ownership + management needed in BackendArena or BackendArenaRange. + +7. 
**Overflow decomposition** — `add_block` overflow may produce non- + power-of-two sizes (consolidated blocks from multiple PAL allocs). + `dealloc_overflow` uses `range_to_pow_2_blocks` to decompose before + passing to parent, matching the existing pattern in + `LargeBuddyRange::add_range`. + +8. **`BackendArenaWordRef` lives in the test file** — production + `PagemapRep` returns `BackendStateWordRef` directly (mirroring + `BuddyChunkRep` in `largebuddyrange.h`). The test-only + `BackendArenaWordRef` proxy is defined in + `src/test/func/backend_arena/backend_arena.cc` and used only by + MockRep, so production headers carry no test scaffolding. + +9. **Power-of-two assertion retained** — `alloc_range` keeps + `is_pow2(size)` for this phase. Non-power-of-two support is + deferred to "Update front-end" step, when the bin scheme's + alignment guarantees are verified end-to-end. + +## Resolved during plan review + +- Overflow handling: `add_block` can return non-power-of-two sizes when + blocks from multiple PAL allocations consolidate. `dealloc_overflow` + decomposes via `range_to_pow_2_blocks`. (Rubber-duck finding #2.) +- Handle visibility / layering: original plan promoted bit-layout + constants and a `BackendArenaWordRef` proxy to namespace scope so + production and test code could share them. Subsequent review + observed that this broke the Buddy/`BuddyChunkRep`/`BuddyInplaceRep` + layering: the data structure should be representation-agnostic. + Resolved by making `BackendArena` carry no bit-layout state and + requiring `Rep::BinRep` / `Rep::RangeRep` to own all packing + decisions. Production `PagemapRep` keeps its layout private; the + test `BackendArenaWordRef` lives in the test file alongside MockRep. + (Rubber-duck finding #1, then revised after layering review.) +- Size shift overflow: `static_assert(MAX_CHUNKS_BITS + 8 <= BITS)` in + `PagemapRep` prevents shift overflow. (Rubber-duck finding #4.) 
+- Unaligned refill guard: both static assert AND runtime assert copied + from `LargeBuddyRange` to prevent `add_block` precondition violation. + (Rubber-duck finding #6, strengthened in second review.) +- Pipeline integration (Phase 11) removed from this plan's scope — + separate follow-up plan. (Rubber-duck finding #8.) +- `PagemapRep` templated on `MAX_CHUNKS_BITS` so the size-shift + static_assert is in scope. (Second review finding #1.) +- `remove_block` exact-size guarantee is scoped to power-of-two + requests only. (Second review finding #4.) + +--- + +## Files added / changed (BackendArena phase, completed) - New: `src/snmalloc/backend_helpers/backend_arena_bins.h` — `range_t`, `carve_t`, `carve`, `max_supported_chunks`, and nested diff --git a/src/snmalloc/backend_helpers/backend_arena.h b/src/snmalloc/backend_helpers/backend_arena.h index 89f25521a..17b581de9 100644 --- a/src/snmalloc/backend_helpers/backend_arena.h +++ b/src/snmalloc/backend_helpers/backend_arena.h @@ -31,18 +31,30 @@ namespace snmalloc /** * Manages free ranges within a single bounded arena using a dual-tree - * scheme (bin trees for allocation, range tree for consolidation). + * scheme: a set of bin trees indexed by the floor-log2 size class + * (used for allocation lookup) and one range tree keyed by address + * (used for consolidation of adjacent free ranges). * - * `Rep` provides word-level pagemap access: - * - `ref_word(direction, addr) -> uintptr_t*`: bin-tree child slot - * (left/right pointer in the first pagemap entry). - * - `ref_range_word(direction, addr) -> uintptr_t*`: range-tree - * child slot (left/right pointer in the second pagemap entry). - * - `get_variant(addr)` / `set_variant(addr, v)` + * `Rep` is the representation. It owns *all* storage and bit-layout + * decisions for tree nodes and per-block metadata. 
`Rep` must provide: + * + * - `using BinRep` — full RBTree Rep for the bin trees, supplying + * `Handle`, `Contents`, `null`, `root`, `ref`, `get`, `set`, + * `is_red`, `set_red`, `compare`, `equal`, `printable`, `name`. + * Owns its own red-bit packing privately. + * - `using RangeRep` — full RBTree Rep for the range tree, same + * shape as `BinRep`. + * - `get_variant(addr)` / `set_variant(addr, v)` — the + * `BackendArenaVariant` tag for the block starting at `addr`. * - `get_large_size_chunks(addr)` / `set_large_size_chunks(addr, n)` + * — exact chunk count for `Large` blocks (3+ chunks). + * - `can_consolidate(higher_addr) -> bool` — whether the block at + * `higher_addr` may be merged with the block immediately below + * it. Returns false at allocation boundaries that must be + * preserved. * - * `MIN_CHUNKS_BITS`: log2 of minimum allocation unit in chunks (0 for - * this phase — 1-chunk minimum). + * `MIN_CHUNKS_BITS`: log2 of the minimum allocation unit in chunks + * (currently only 0 is supported — 1-chunk minimum). * * `MAX_CHUNKS_BITS`: log2 of the arena size in chunks. Blocks that * reach this size overflow and are returned to the caller. @@ -60,123 +72,8 @@ namespace snmalloc static_assert( bits::one_at_bit(MAX_CHUNKS_BITS) - 1 <= Bins::max_supported_chunks()); - // Bit layout constants. - static constexpr uintptr_t RED_BIT = uintptr_t(1) << 8; - static constexpr uintptr_t VARIANT_MASK = uintptr_t(0x3) << 9; - static constexpr uintptr_t META_MASK = RED_BIT | VARIANT_MASK; - static constexpr uintptr_t BACKEND_RESERVED_MASK = 0xFF; - - static_assert((META_MASK & BACKEND_RESERVED_MASK) == 0); - static_assert(META_MASK < MIN_CHUNK_SIZE); - - // ---- Handle: thin proxy around uintptr_t* ---- - // - // Matches BackendStateWordRef's interface: wraps a pointer to a - // word slot (tree root field or pagemap word). Constructed from - // &root or from Rep::ref_word / Rep::ref_range_word. 
- struct WordRef - { - uintptr_t* val{nullptr}; - - constexpr WordRef() = default; - - constexpr WordRef(uintptr_t* p) : val(p) {} - - uintptr_t get() const - { - return *val; - } - - WordRef& operator=(uintptr_t v) - { - *val = v; - return *this; - } - - bool operator!=(const WordRef& other) const - { - return val != other.val; - } - - uintptr_t printable_address() const - { - return reinterpret_cast(val); - } - }; - - // ---- TreeRep: RBTree Rep parameterised on which word accessor to use ---- - // - // `RefFn` selects the pagemap entry: ref_word for the bin tree, - // ref_range_word for the range tree. - template - struct TreeRep - { - using Handle = WordRef; - using Contents = uintptr_t; - - static constexpr Contents null = 0; - static constexpr Contents root = 0; - - static Contents get(Handle h) - { - return h.get() & ~META_MASK; - } - - static void set(Handle h, Contents v) - { - h = v | (h.get() & META_MASK); - } - - static Handle ref(bool direction, Contents k) - { - static const Contents null_entry = 0; - if (SNMALLOC_UNLIKELY(k == 0)) - return Handle{const_cast(&null_entry)}; - return Handle{RefFn(direction, k)}; - } - - static bool is_red(Contents k) - { - return (ref(true, k).get() & RED_BIT) == RED_BIT; - } - - static void set_red(Contents k, bool new_is_red) - { - if (new_is_red != is_red(k)) - { - auto h = ref(true, k); - h = h.get() ^ RED_BIT; - } - } - - static bool compare(Contents k1, Contents k2) - { - return k1 > k2; - } - - static bool equal(Contents k1, Contents k2) - { - return k1 == k2; - } - - static uintptr_t printable(Contents k) - { - return k; - } - - static uintptr_t printable(Handle h) - { - return h.printable_address(); - } - - static const char* name() - { - return "TreeRep"; - } - }; - - using BinRep = TreeRep; - using RangeRep = TreeRep; + using BinRep = typename Rep::BinRep; + using RangeRep = typename Rep::RangeRep; using BinTree = RBTree; using RangeTree = RBTree; @@ -300,17 +197,21 @@ namespace snmalloc // Predecessor: check 
range tree, then fall back to min-size bin. auto [pa, ps] = range_from_addr(p_key); - if (pa + ps * MIN_CHUNK_SIZE == addr) + if (pa + ps * MIN_CHUNK_SIZE == addr && Rep::can_consolidate(addr)) merge(pa, ps); - else if (addr >= MIN_CHUNK_SIZE && contains_min(addr - MIN_CHUNK_SIZE)) + else if ( + addr >= MIN_CHUNK_SIZE && Rep::can_consolidate(addr) && + contains_min(addr - MIN_CHUNK_SIZE)) merge(addr - MIN_CHUNK_SIZE, 1); // Successor: check range tree, then fall back to min-size bin. auto [sa, ss] = range_from_addr(s_key); uintptr_t succ_addr = addr + size_chunks * MIN_CHUNK_SIZE; - if (sa == succ_addr) + if (sa == succ_addr && Rep::can_consolidate(succ_addr)) merge(sa, ss); - else if (succ_addr > addr && contains_min(succ_addr)) + else if ( + succ_addr > addr && Rep::can_consolidate(succ_addr) && + contains_min(succ_addr)) merge(succ_addr, 1); // Arena-scale overflow: consolidated block spans the full arena. @@ -386,7 +287,7 @@ namespace snmalloc auto& self = const_cast(*this); // Clause 1: Maximally consolidated. - // 1a. No two adjacent non-min blocks. + // 1a. No two adjacent non-min blocks (unless boundary prevents merge). { uintptr_t prev_addr = 0; size_t prev_size = 0; @@ -394,22 +295,27 @@ namespace snmalloc self.range_tree.for_each([&](uintptr_t node) { auto [a, s] = range_from_addr(node); if (prev_valid) - SNMALLOC_ASSERT(prev_addr + prev_size * MIN_CHUNK_SIZE != a); + { + uintptr_t prev_end = prev_addr + prev_size * MIN_CHUNK_SIZE; + SNMALLOC_ASSERT(prev_end != a || !Rep::can_consolidate(a)); + } prev_addr = a; prev_size = s; prev_valid = true; }); } - // 1b. No non-min block adjacent to a min block. + // 1b. No non-min block adjacent to a min block (unless boundary). 
self.range_tree.for_each([&](uintptr_t node) { auto [a, s] = range_from_addr(node); if (a >= MIN_CHUNK_SIZE) - SNMALLOC_ASSERT(!contains_min(a - MIN_CHUNK_SIZE)); - SNMALLOC_ASSERT(!contains_min(a + s * MIN_CHUNK_SIZE)); + SNMALLOC_ASSERT( + !contains_min(a - MIN_CHUNK_SIZE) || !Rep::can_consolidate(a)); + uintptr_t end = a + s * MIN_CHUNK_SIZE; + SNMALLOC_ASSERT(!contains_min(end) || !Rep::can_consolidate(end)); }); - // 1c. No two adjacent min blocks. + // 1c. No two adjacent min blocks (unless boundary). { uintptr_t prev = 0; bool prev_valid = false; @@ -417,7 +323,8 @@ namespace snmalloc if (Rep::get_variant(node) != BackendArenaVariant::Min) return; if (prev_valid) - SNMALLOC_ASSERT(prev + MIN_CHUNK_SIZE != node); + SNMALLOC_ASSERT( + prev + MIN_CHUNK_SIZE != node || !Rep::can_consolidate(node)); prev = node; prev_valid = true; }); diff --git a/src/test/func/backend_arena/backend_arena.cc b/src/test/func/backend_arena/backend_arena.cc index ec5bcd232..fc605bbdf 100644 --- a/src/test/func/backend_arena/backend_arena.cc +++ b/src/test/func/backend_arena/backend_arena.cc @@ -27,6 +27,41 @@ namespace snmalloc { // ---- MockRep: array-backed storage for testing ---- + /** + * Thin proxy around uintptr_t* with the same interface as + * BackendStateWordRef (get, operator=, operator!=). Used by MockRep + * to avoid requiring a real pagemap in unit tests. + */ + struct BackendArenaWordRef + { + uintptr_t* val{nullptr}; + + constexpr BackendArenaWordRef() = default; + + constexpr BackendArenaWordRef(uintptr_t* p) : val(p) {} + + uintptr_t get() const + { + return *val; + } + + BackendArenaWordRef& operator=(uintptr_t v) + { + *val = v; + return *this; + } + + bool operator!=(const BackendArenaWordRef& other) const + { + return val != other.val; + } + + uintptr_t printable_address() const + { + return reinterpret_cast(val); + } + }; + // Each chunk-aligned address maps to a mock_entry via its chunk index. 
// word1/word2 hold bin-tree children; range_word1/range_word2 hold // range-tree children. variant and large_size_chunks hold metadata. @@ -57,28 +92,97 @@ namespace snmalloc return idx; } - struct MockRep + // Inner RBTree Rep used by both MockRep::BinRep and MockRep::RangeRep. + // Tag selects which pair of fields in mock_entry holds the tree pointers. + // The red bit is packed into bit 8 of the stored word (matching the + // production PagemapRep layout, but defined privately here). + template + struct MockTreeRep { - static BackendArenaVariant get_variant(uintptr_t addr) + using Handle = BackendArenaWordRef; + using Contents = uintptr_t; + + static constexpr Contents null = 0; + static constexpr Contents root = 0; + + static constexpr uintptr_t RED_BIT = uintptr_t(1) << 8; + static_assert(RED_BIT < MIN_CHUNK_SIZE); + + static Handle ref(bool direction, Contents k) { - return mock_store[mock_index(addr)].variant; + static const Contents null_entry = 0; + if (SNMALLOC_UNLIKELY(k == 0)) + return Handle{const_cast(&null_entry)}; + auto& e = mock_store[mock_index(k)]; + if constexpr (IsRange) + return Handle{direction ? &e.range_word1 : &e.range_word2}; + else + return Handle{direction ? 
&e.word1 : &e.word2}; } - static void set_variant(uintptr_t addr, BackendArenaVariant v) + static Contents get(Handle h) { - mock_store[mock_index(addr)].variant = v; + return h.get() & ~RED_BIT; + } + + static void set(Handle h, Contents v) + { + h = v | (h.get() & RED_BIT); + } + + static bool is_red(Contents k) + { + return (ref(true, k).get() & RED_BIT) == RED_BIT; + } + + static void set_red(Contents k, bool new_is_red) + { + if (new_is_red != is_red(k)) + { + auto h = ref(true, k); + h = h.get() ^ RED_BIT; + } + } + + static bool compare(Contents k1, Contents k2) + { + return k1 > k2; + } + + static bool equal(Contents k1, Contents k2) + { + return k1 == k2; } - static uintptr_t* ref_word(bool direction, uintptr_t addr) + static uintptr_t printable(Contents k) { - auto& e = mock_store[mock_index(addr)]; - return direction ? &e.word1 : &e.word2; + return k; } - static uintptr_t* ref_range_word(bool direction, uintptr_t addr) + static uintptr_t printable(Handle h) { - auto& e = mock_store[mock_index(addr)]; - return direction ? &e.range_word1 : &e.range_word2; + return h.printable_address(); + } + + static const char* name() + { + return IsRange ? 
"MockRangeRep" : "MockBinRep"; + } + }; + + struct MockRep + { + using BinRep = MockTreeRep; + using RangeRep = MockTreeRep; + + static BackendArenaVariant get_variant(uintptr_t addr) + { + return mock_store[mock_index(addr)].variant; + } + + static void set_variant(uintptr_t addr, BackendArenaVariant v) + { + mock_store[mock_index(addr)].variant = v; } static size_t get_large_size_chunks(uintptr_t addr) @@ -90,6 +194,11 @@ namespace snmalloc { mock_store[mock_index(addr)].large_size_chunks = s; } + + static bool can_consolidate(uintptr_t) + { + return true; + } }; // ---- Test access ---- @@ -169,15 +278,19 @@ namespace snmalloc uintptr_t v1 = chunk_addr(10); uintptr_t v2 = chunk_addr(20); - *MockRep::ref_word(true, a) = v1; - *MockRep::ref_word(false, a) = v2; - SNMALLOC_ASSERT(*MockRep::ref_word(true, a) == v1); - SNMALLOC_ASSERT(*MockRep::ref_word(false, a) == v2); + auto w1 = MockRep::BinRep::ref(true, a); + auto w2 = MockRep::BinRep::ref(false, a); + w1 = v1; + w2 = v2; + SNMALLOC_ASSERT(MockRep::BinRep::ref(true, a).get() == v1); + SNMALLOC_ASSERT(MockRep::BinRep::ref(false, a).get() == v2); - *MockRep::ref_range_word(true, a) = v2; - *MockRep::ref_range_word(false, a) = v1; - SNMALLOC_ASSERT(*MockRep::ref_range_word(true, a) == v2); - SNMALLOC_ASSERT(*MockRep::ref_range_word(false, a) == v1); + auto rw1 = MockRep::RangeRep::ref(true, a); + auto rw2 = MockRep::RangeRep::ref(false, a); + rw1 = v2; + rw2 = v1; + SNMALLOC_ASSERT(MockRep::RangeRep::ref(true, a).get() == v2); + SNMALLOC_ASSERT(MockRep::RangeRep::ref(false, a).get() == v1); printf(" Word round-trip: OK\n"); } @@ -1213,6 +1326,155 @@ namespace snmalloc NUM_OPS); } + // ================================================================== + // (J) Boundary consolidation prevention + // ================================================================== + + // A Rep variant that blocks consolidation at specific addresses. 
+ static std::set boundary_addrs; + + struct BoundaryMockRep + { + using BinRep = MockRep::BinRep; + using RangeRep = MockRep::RangeRep; + + static BackendArenaVariant get_variant(uintptr_t addr) + { + return MockRep::get_variant(addr); + } + + static void set_variant(uintptr_t addr, BackendArenaVariant v) + { + MockRep::set_variant(addr, v); + } + + static size_t get_large_size_chunks(uintptr_t addr) + { + return MockRep::get_large_size_chunks(addr); + } + + static void set_large_size_chunks(uintptr_t addr, size_t s) + { + MockRep::set_large_size_chunks(addr, s); + } + + static bool can_consolidate(uintptr_t higher_addr) + { + return boundary_addrs.find(higher_addr) == boundary_addrs.end(); + } + }; + + template + using BoundaryArena = BackendArena; + + // Test: predecessor merge blocked by boundary. + static void test_boundary_blocks_predecessor() + { + reset_mock_store(); + boundary_addrs.clear(); + constexpr size_t K = 6; + BoundaryArena arena; + + uintptr_t p_addr = chunk_addr(2); + uintptr_t a_addr = chunk_addr(4); + + // Place a boundary at a_addr — blocks should not consolidate leftward. + boundary_addrs.insert(a_addr); + + arena.add_block(p_addr, 2); + arena.add_block(a_addr, 2); + + // P (chunks 2-3) and A (chunks 4-5) are adjacent but the boundary + // at a_addr prevents merging. Both should remain separate. + auto [r1_addr, r1_size] = arena.remove_block(2); + SNMALLOC_ASSERT(r1_addr == p_addr && r1_size == 2); + auto [r2_addr, r2_size] = arena.remove_block(2); + SNMALLOC_ASSERT(r2_addr == a_addr && r2_size == 2); + + printf(" Boundary blocks predecessor merge: OK\n"); + } + + // Test: successor merge blocked by boundary. + static void test_boundary_blocks_successor() + { + reset_mock_store(); + boundary_addrs.clear(); + constexpr size_t K = 6; + BoundaryArena arena; + + uintptr_t a_addr = chunk_addr(2); + uintptr_t s_addr = chunk_addr(4); + + // Place a boundary at s_addr — blocks should not consolidate rightward. 
+ boundary_addrs.insert(s_addr); + + arena.add_block(s_addr, 4); + arena.add_block(a_addr, 2); + + // A (chunks 2-3) and S (chunks 4-7) are adjacent but the boundary + // at s_addr prevents merging. Both should remain separate. + auto [r1_addr, r1_size] = arena.remove_block(2); + SNMALLOC_ASSERT(r1_addr == a_addr && r1_size == 2); + auto [r2_addr, r2_size] = arena.remove_block(4); + SNMALLOC_ASSERT(r2_addr == s_addr && r2_size == 4); + + printf(" Boundary blocks successor merge: OK\n"); + } + + // Test: boundary only blocks the specific merge; other merges proceed. + static void test_boundary_partial() + { + reset_mock_store(); + boundary_addrs.clear(); + constexpr size_t K = 6; + BoundaryArena arena; + + // Three adjacent blocks: chunks [4,6), [6,8), [8,10). + // Boundary at chunk 8 blocks [6,8) ↔ [8,10) merge but allows + // [4,6) ↔ [6,8) merge into a 4-aligned block at chunk 4. + boundary_addrs.insert(chunk_addr(8)); + + arena.add_block(chunk_addr(4), 2); + arena.add_block(chunk_addr(8), 2); + arena.add_block(chunk_addr(6), 2); + + // [4,6) and [6,8) should consolidate to [4,8). + // [8,10) should remain separate due to boundary. + auto [r1_addr, r1_size] = arena.remove_block(4); + SNMALLOC_ASSERT(r1_addr == chunk_addr(4) && r1_size == 4); + auto [r2_addr, r2_size] = arena.remove_block(2); + SNMALLOC_ASSERT(r2_addr == chunk_addr(8) && r2_size == 2); + + printf(" Boundary partial (P merges, S blocked): OK\n"); + } + + // Test: min-size predecessor blocked by boundary. 
+ static void test_boundary_blocks_min_predecessor() + { + reset_mock_store(); + boundary_addrs.clear(); + constexpr size_t K = 6; + BoundaryArena arena; + + uintptr_t p_addr = chunk_addr(4); + uintptr_t a_addr = chunk_addr(5); + + boundary_addrs.insert(a_addr); + + arena.add_block(p_addr, 1); // min-size block + arena.add_block(a_addr, 1); // adjacent, but boundary prevents merge + + auto [r1_addr, r1_size] = arena.remove_block(1); + auto [r2_addr, r2_size] = arena.remove_block(1); + // Both should be separate min-size blocks. + SNMALLOC_ASSERT(r1_size == 1 && r2_size == 1); + SNMALLOC_ASSERT( + (r1_addr == p_addr && r2_addr == a_addr) || + (r1_addr == a_addr && r2_addr == p_addr)); + + printf(" Boundary blocks min predecessor merge: OK\n"); + } + } // namespace snmalloc int main() @@ -1268,6 +1530,12 @@ int main() snmalloc::test_multi_instance_consolidation(); snmalloc::test_multi_stress(); + printf("(J) Boundary consolidation:\n"); + snmalloc::test_boundary_blocks_predecessor(); + snmalloc::test_boundary_blocks_successor(); + snmalloc::test_boundary_partial(); + snmalloc::test_boundary_blocks_min_predecessor(); + printf("All BackendArena tests passed.\n"); return 0; } From 9c1ca745691bb78a6d2be09bc776b3a0d5c69c81 Mon Sep 17 00:00:00 2001 From: Matthew Parkinson Date: Fri, 22 May 2026 15:56:17 +0100 Subject: [PATCH 07/31] Add BackendArenaRange and generalise BackendArena to byte units Phase 10 of the BackendArena work. Adds the BackendArenaRange wrapper that drops into the LargeBuddyRange slot, generalises BackendArena and BackendArenaBins on MIN_SIZE_BITS, and converts the arena/range API boundary to bytes throughout. * BackendArenaRange with a PagemapRep that packs variant tag, RB red bit and the consolidated large-block size into the first pagemap word, and uses the second word for in-tree links. Provides alloc_range / dealloc_range / add_range over the bin-tree arena. 
* parent_dealloc unifies the old parent_dealloc_range and dealloc_overflow paths; add_range uses bits::align_up / bits::align_down for parent-input trimming. * BackendArenaBins generalises the bin scheme so its range_t, carve and find_for_request all speak bytes (multiples of UNIT_SIZE = 1 << MIN_SIZE_BITS). Tests cover MIN_SIZE_BITS in {0, 4, 14}. * BackendArena: add_block / remove_block / variant_of / insert_block / range_from_addr / invariants all work in bytes. remove_block returns a scalar address (0 = failure); the size half of the old pair was tautological. CHUNKS_BITS / addr_to_chunk / chunk_to_addr removed. * PagemapRep::get_large_size / set_large_size are bytes-in / bytes-out; storage still scales by MIN_SIZE_BITS so the shifted field fits a pagemap word. * Tests: func-backend_arena_range exercises alloc/dealloc/refill/large paths against a mock parent; func-backend_arena and func-backend_arena_bins updated for the bytes-throughout convention (chunk_size(N) helper at the test boundary). 
Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- CMakeLists.txt | 1 + PLAN.md | 557 ++++++++++++++---- claude.md | 4 +- src/snmalloc/backend_helpers/backend_arena.h | 185 +++--- .../backend_helpers/backend_arena_bins.h | 222 ++++--- .../backend_helpers/backend_arena_range.h | 368 ++++++++++++ .../backend_helpers/backend_helpers.h | 1 + src/test/func/backend_arena/backend_arena.cc | 327 +++++----- .../backend_arena_bins/backend_arena_bins.cc | 460 ++++++++++----- .../backend_arena_range.cc | 309 ++++++++++ 10 files changed, 1840 insertions(+), 594 deletions(-) create mode 100644 src/snmalloc/backend_helpers/backend_arena_range.h create mode 100644 src/test/func/backend_arena_range/backend_arena_range.cc diff --git a/CMakeLists.txt b/CMakeLists.txt index b0457bdab..be1fca26a 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -550,6 +550,7 @@ if(NOT SNMALLOC_HEADER_ONLY_LIBRARY) set(TESTLIB_ONLY_TESTS backend_arena backend_arena_bins + backend_arena_range bits first_operation memory memory_usage multi_atexit multi_threadatexit redblack statistics teardown contention external_pointer large_alloc lotsofthreads post_teardown diff --git a/PLAN.md b/PLAN.md index 2c058f6a3..7520631d3 100644 --- a/PLAN.md +++ b/PLAN.md @@ -129,9 +129,12 @@ Allocation for a request of `n_chunks`: in a serving position. 2. Pop a block from that bin's RBTree (smallest address — `remove_min`). If the tree empties, `bitmap.clear(bin_id)`. -3. `carve(block, n_chunks)` splits into pre-pad / aligned request / - post-pad. Re-add any non-empty pre/post via `add_block` (which - classifies the remainder via `bitmap.add(remainder)`). +3. `carve(block, n_chunks)` splits into pre-pad / aligned request of + exactly `n_chunks` chunks / post-pad. SC rounding stays internal: + the SC for `n_chunks` only fixes the alignment of the request and + the minimum block size required; any remainder beyond `n_chunks` + rolls into `post`. 
Re-add any non-empty pre/post via `add_block` + (which classifies the remainder via `bitmap.add(remainder)`). The bin classification and per-sc search masks (`start_word`, `first_mask`, `second_mask`) are precomputed at `constexpr` time @@ -365,7 +368,9 @@ The public API of `BackendArenaBins` — the integration contract operation; either of `pre`/`post` may have `size == 0` (absent). - `static carve_t carve(range_t block, size_t n_chunks)` — given a free block and an allocation request, split into pre-pad / aligned - request / post-pad. Pure function; does not touch the bitmap. + request of exactly `n_chunks` chunks / post-pad. SC rounding stays + inside the carve: the SC for `n_chunks` only fixes alignment and the + servability precondition. Pure function; does not touch the bitmap. - `max_supported_chunks() -> size_t` — upper bound on legal `n_chunks`; used for assertions. - nested `Bitmap` — the routing layer; see below. @@ -402,7 +407,7 @@ rodata records, the bin-scheme constants (`B`, `MANTISSAS_PER_EXP`, accessors — are private implementation details. They are reachable only via the friend struct `BackendArenaBinsTestAccess` (defined in the test translation unit, see Phase 1) so unit tests can exercise -them directly; production code outside this header does not depend on +them directly; code outside this header does not depend on them. **Free blocks may have arbitrary chunk counts**, including non-class @@ -415,11 +420,12 @@ their precise chunk count where needed (`Large` variant). ### Exponent / bin-count bounds -`BackendArena` takes -**chunk-count exponent** bounds. `MIN_CHUNKS_BITS = 0` (1 chunk). The -upper bound is **exclusive**, matching `Buddy<..., MIN, MAX>`'s -semantics. The total number of bins is -`(MAX_CHUNKS_BITS - MIN_CHUNKS_BITS) * BINS_PER_EXP` plus the +`BackendArena` takes **byte-size +exponent** bounds (mirroring `Buddy`). 
+`MIN_SIZE_BITS` is the log2 of the unit of allocation; everything inside +the arena is in multiples of `1 << MIN_SIZE_BITS`. The upper bound is +**exclusive**. The total number of bins is +`(MAX_SIZE_BITS - MIN_SIZE_BITS) * BINS_PER_EXP` plus the degenerate-low-exponent bins. Static assertions encode the exclusive semantics; tests exercise minimum, just-below-max, and exact-max sizes (the last triggers overflow back to the parent, mirroring `Buddy`). @@ -455,7 +461,7 @@ the next phase starts: 3. A reminder that this phase's scope is *only* what the plan section describes; cross-phase concerns are out of scope. 4. A pointer to `claude.md` for codebase conventions (no raw - compiler attributes, no C++ STL in production, `SNMALLOC_*` + compiler attributes, no C++ STL, `SNMALLOC_*` macros, etc.). Address findings, re-spawn a fresh-context reviewer, loop until @@ -463,7 +469,7 @@ the next phase starts: escalate to the user, not resolved unilaterally. Phases 0 and 6 are exempted from the review gate: Phase 0 adds no -code; Phase 6 is test-only over already-reviewed production code. +code; Phase 6 is test-only over already-reviewed code. Phase 7 is the final mandatory review per `claude.md`. ### Phase 0: Baseline @@ -489,14 +495,17 @@ nested non-empty-bins bitmap that the allocation fast path scans. - `struct carve_t { range_t pre; range_t req; range_t post; }` — output of a carving operation; `pre` and/or `post` may have `size == 0`. - `static SNMALLOC_FAST_PATH carve_t carve(range_t block, size_t n_chunks)` - — split a free `block` into pre-pad, aligned request, post-pad. - Pure. **Preconditions** (asserted): + — split a free `block` into pre-pad, aligned request of exactly + `n_chunks` chunks, and post-pad. SC rounding stays internal: the SC + for `n_chunks` only fixes alignment and the servability precondition; + any rounding remainder absorbs into `post`. Pure. 
**Preconditions** + (asserted): `n_chunks >= 1 && n_chunks <= max_supported_chunks()`, `block.size > 0`, and `block` is servable for `n_chunks` (the caller has already used `Bitmap::find_for_request`). - `static constexpr size_t max_supported_chunks()` — upper bound on legal `n_chunks`; used for assertions. -- nested `class Bitmap` — three methods, all that production code +- nested `class Bitmap` — three methods, all that other code calls into: - `size_t add(range_t block)` — classify `block`, ensure the bit for the resulting bin is set, return the bin id so the caller can @@ -642,7 +651,7 @@ by `carve`): #### Runtime CLZ on the fast path -Production calls on the fast path use the runtime intrinsic, not the +Fast-path calls use the runtime intrinsic, not the constexpr software fallback: - `src/snmalloc/ds_core/bits.h` provides @@ -746,7 +755,7 @@ The two ANDs are the entire bin-selection cost; no shifts, no in `backend_arena_bins.h` (so the friend declarations can refer to it) and **defined in the test translation unit** `src/test/func/backend_arena_bins/backend_arena_bins.cc` (inside -`namespace snmalloc`). The production header therefore carries no +`namespace snmalloc`). The header therefore carries no test-only members. What the test access struct exposes (all delegating to private @@ -756,7 +765,7 @@ internals through the friend grant): - The bin-scheme constants `B`, `MANTISSAS_PER_EXP`, `BINS_PER_EXP`, `MAX_SC`. - `using chunk_sc_t = size_t;` — raw sc id as plain `size_t`; the - production header does NOT define a `chunk_sc_t` handle type. + header does NOT define a `chunk_sc_t` handle type. - `request(n) -> size_t` — `bits::to_exp_mant(n)` (runtime). - `size_chunks(sc) -> size_t`, `align_chunks(sc) -> size_t` — direct reads of `Bins::table_.carve_info[sc]`. @@ -851,7 +860,7 @@ Spec slice = the Phase 1 section above. 
Reviewer checks: - Tables match the canonical `bin_subsets` (single source of truth); `prototype/skip_analysis.py` reproduces the same numbering. -- Production header carries no test-only surface (no `chunk_sc_t` +- The in-tree header carries no test-only surface (no `chunk_sc_t` handle class, no `request`, no `_const` variants, no test-only per-sc accessors — those live only in `BackendArenaBinsTestAccess` in the test cc). @@ -935,9 +944,10 @@ Create `src/snmalloc/backend_helpers/backend_arena.h` with: - Both: `compare(k1, k2) = k1 > k2` so `remove_min` returns the lowest address. `null = root = 0`. -- `BackendArena`: +- `BackendArena`: - `B = 2` hardcoded; `INTERMEDIATE_BITS` wiring deferred. - - `MIN_CHUNKS_BITS == 0` only; larger min values deferred. + - `MIN_SIZE_BITS` selects the unit of allocation (= pagemap stride + when used with `PagemapRep`). - `stl::Array bin_trees` - `RangeTree range_tree` - `Bins::Bitmap bitmap` @@ -1112,7 +1122,7 @@ constants of its own. `RBTree` directly. It never inspects the bit layout used by the Rep. -#### PagemapRep (production) +#### PagemapRep Lives in `backend_arena_range.h`. Privately owns its bit layout: @@ -1164,35 +1174,40 @@ PagemapRep: returns `!get_metaentry_mut(higher_addr).is_boundary()`. ### PagemapRep -Templated on `Pagemap` and `MAX_CHUNKS_BITS`. The second parameter is -needed for the large-size-shift static assertion: +Templated on `Pagemap`, `MIN_SIZE_BITS`, and `MAX_SIZE_BITS` (mirroring +`Buddy`'s shape). `MIN_SIZE_BITS` is the log2 of the pagemap stride +(snmalloc's `MIN_CHUNK_BITS` when wired through `BackendArenaRange`); +`MAX_SIZE_BITS` is needed for the large-size-shift static assertion: ``` -template +template< + SNMALLOC_CONCEPT(IsWritablePagemap) Pagemap, + size_t MIN_SIZE_BITS, + size_t MAX_SIZE_BITS> struct PagemapRep { ... 
}; ``` Each free block uses pagemap entries at three offsets from its base -address: +address (where `UNIT_SIZE = 1 << MIN_SIZE_BITS`): -- **Chunk 0** (`addr`): Word::One / Word::Two → bin-tree node. +- **Unit 0** (`addr`): Word::One / Word::Two → bin-tree node. Bits 9–10 of Word::One → variant tag. Bit 8 → RED_BIT. All coexist because `TreeRep::set` preserves `META_MASK` on writes. -- **Chunk 1** (`addr + MIN_CHUNK_SIZE`): Word::One / Word::Two → - range-tree node (only for blocks ≥ 2 chunks). -- **Chunk 2** (`addr + 2 * MIN_CHUNK_SIZE`): Word::One → large chunk - count (only for blocks ≥ 3 chunks). Stored as `count << 8` to avoid +- **Unit 1** (`addr + UNIT_SIZE`): Word::One / Word::Two → + range-tree node (only for blocks ≥ 2 units). +- **Unit 2** (`addr + 2 * UNIT_SIZE`): Word::One → large chunk + count (only for blocks ≥ 3 units). Stored as `count << 8` to avoid the 8 reserved low bits; recovered via `word.get() >> 8`. **Static assertions in PagemapRep** (catch configuration errors early): -- `static_assert((VARIANT_MASK | RED_BIT) < MIN_CHUNK_SIZE)` — metadata +- `static_assert((VARIANT_MASK | RED_BIT) < UNIT_SIZE)` — metadata bits don't collide with address bits. - `static_assert(MetaEntryBase::is_backend_allowed_value(Word::One, VARIANT_MASK | RED_BIT))` — all metadata bits are in the backend- allowed range. -- `static_assert(MAX_CHUNKS_BITS + 8 <= bits::BITS)` — shifted large - size fits in a pagemap word. +- `static_assert((MAX_SIZE_BITS - MIN_SIZE_BITS) + LARGE_SIZE_SHIFT <= + bits::BITS)` — shifted large size fits in a pagemap word. Method mapping: @@ -1232,9 +1247,8 @@ public: { using ContainsParent::parent; - static constexpr size_t MAX_CHUNKS_BITS = MAX_SIZE_BITS - MIN_CHUNK_BITS; - using PagemapRepT = PagemapRep; - BackendArena arena; + using PagemapRepT = PagemapRep; + BackendArena arena; size_t requested_total = 0; public: @@ -1251,51 +1265,49 @@ public: **`alloc_range(size)`**: 1. `SNMALLOC_ASSERT(size >= MIN_CHUNK_SIZE)`. -2. 
`SNMALLOC_ASSERT(bits::is_pow2(size))` — same assertion as - `LargeBuddyRange`. Non-power-of-two support is deferred to - "Update front-end" step. +2. `SNMALLOC_ASSERT((size & (MIN_CHUNK_SIZE - 1)) == 0)` — size must be + a chunk multiple, but no power-of-two restriction. The arena handles + any size in `[MIN_CHUNK_SIZE, 2^MAX_SIZE_BITS)`. 3. `n_chunks = size >> MIN_CHUNK_BITS`. -4. Oversize bypass: if `n_chunks >= bits::one_at_bit(MAX_CHUNKS_BITS)`, +4. Oversize bypass: if `n_chunks >= bits::one_at_bit(MAX_SIZE_BITS - MIN_CHUNK_BITS)`, delegate to `parent.alloc_range(size)` (if `ParentRange::Aligned`), else return `nullptr`. Same as `LargeBuddyRange`. -5. `auto [addr, actual] = arena.remove_block(n_chunks)`. - (`actual == n_chunks` for power-of-two requests — the `is_pow2` - assertion above guarantees this for this phase.) +5. `auto [addr, actual] = arena.remove_block(n_chunks)`. The arena + carves exactly `n_chunks` chunks via `Bins::carve`; `actual` is + always `n_chunks` on success and is asserted as such. 6. If `addr != 0`, return `capptr::Arena::unsafe_from(reinterpret_cast(addr))`. 7. If `addr == 0`, call `refill(size)`. **`dealloc_range(base, size)`**: -1. `addr = base.unsafe_uintptr()`, `n_chunks = size >> MIN_CHUNK_BITS`. -2. Oversize bypass: if `n_chunks >= bits::one_at_bit(MAX_CHUNKS_BITS)`, - delegate to `parent.dealloc_range(base, size)`. Same condition and - SFINAE guard as `LargeBuddyRange::parent_dealloc_range`. -3. `auto [ov_addr, ov_size] = arena.add_block(addr, n_chunks)`. +1. `SNMALLOC_ASSERT(size >= MIN_CHUNK_SIZE)`, + `SNMALLOC_ASSERT((size & (MIN_CHUNK_SIZE - 1)) == 0)` — chunk multiple + only; no power-of-two restriction. +2. Oversize bypass: if `size >= 2^MAX_SIZE_BITS`, delegate to + `parent.dealloc_range(base, size)`. Same SFINAE guard as + `LargeBuddyRange::parent_dealloc_range`. +3. `n_chunks = size >> MIN_CHUNK_BITS`, + `auto [ov_addr, ov_size] = arena.add_block(base.unsafe_uintptr(), n_chunks)`. 4. 
If overflow (`ov_addr != 0`): call `dealloc_overflow(ov_addr, ov_size)`. -**`dealloc_overflow(addr, size_chunks)`** — matches -`LargeBuddyRange::dealloc_overflow` pattern: +**`dealloc_overflow(addr, size_chunks)`**: -Overflow from `add_block` can produce non-power-of-two sizes (e.g., -consolidated blocks that span multiple non-aligned PAL allocations). -The parent may require power-of-two aligned inputs. So overflow is -decomposed using `range_to_pow_2_blocks`: +Overflow from `add_block` is forwarded directly to the parent's +`dealloc_range`. The parent does not require power-of-two input — all +non-Buddy ranges accept any chunk-aligned size, and `BackendArenaRange` +itself accepts any chunk-multiple size — so no decomposition is needed. ``` void dealloc_overflow(uintptr_t addr, size_t size_chunks) { - auto base = capptr::Arena::unsafe_from( - reinterpret_cast(addr)); - size_t size_bytes = size_chunks << MIN_CHUNK_BITS; if constexpr (MAX_SIZE_BITS != (bits::BITS - 1)) { - range_to_pow_2_blocks( - base, size_bytes, - [this](capptr::Arena b, size_t s, bool) { - parent.dealloc_range(b, s); - }); + auto base = capptr::Arena::unsafe_from( + reinterpret_cast(addr)); + size_t size_bytes = size_chunks << MIN_CHUNK_BITS; + parent.dealloc_range(base, size_bytes); } else { @@ -1337,14 +1349,20 @@ refill base may prevent consolidation with any pre-existing blocks below it, which is correct). For the unaligned parent path: over-allocate `2 * size` (with overflow -check), add everything to the arena via `add_range` (which decomposes -into power-of-two blocks using `range_to_pow_2_blocks`), then call -`alloc_range(size)` recursively. Same logic as `LargeBuddyRange`. +check), add everything to the arena via `add_range`, then call +`alloc_range(size)` recursively. 
+ +**`add_range(base, length)`** trims `(base, length)` to chunk boundaries +on both ends (PalRange returns page-aligned but not chunk-aligned +addresses) and inserts a single block via `add_block` — no power-of-two +decomposition is needed because `add_block` accepts any size in +`[1, 2^CHUNKS_BITS)` chunks. Any overflow from `add_block` is forwarded +to `dealloc_overflow`. Safety guards (both from `LargeBuddyRange`): - `static_assert((REFILL_SIZE < bits::one_at_bit(MAX_SIZE_BITS)) || ParentRange::Aligned)` — prevents the unaligned path from adding a - block that violates `add_block`'s `size_chunks < 2^MAX_CHUNKS_BITS` + block that violates `add_block`'s `size_chunks < 2^(MAX_SIZE_BITS - MIN_CHUNK_BITS)` precondition. - Runtime: `SNMALLOC_ASSERT(refill_size < bits::one_at_bit(MAX_SIZE_BITS))` — catches the computed `refill_size` (which may be larger than @@ -1354,8 +1372,9 @@ Safety guards (both from `LargeBuddyRange`): - `Aligned = true`: BackendArena's carving ensures that a request of size `n` (power-of-two, chunk-aligned) is placed at an `n`-aligned - address within the source block. For non-power-of-two requests - (future: step 4), the bin scheme's alignment rules still hold. + address within the source block. For non-power-of-two requests, the + bin scheme's alignment rules still hold (alignment matches the + lowest set bit of the size class). - `ConcurrencySafe = false`: same as `LargeBuddyRange`. - `ChunkBounds = capptr::bounds::Arena`: same as `LargeBuddyRange`. @@ -1363,15 +1382,17 @@ Safety guards (both from `LargeBuddyRange`): The global `LargeBuddyRange` uses `MAX_SIZE_BITS = BITS - 1`, meaning the buddy can hold up to half the address space. For BackendArenaRange: -`MAX_CHUNKS_BITS = (BITS - 1) - MIN_CHUNK_BITS`. On 64-bit with -`MIN_CHUNK_BITS = 14`, this is 49 — the arena can hold up to 2^49 -chunks. The arena's overflow path returns consolidated blocks that -reach this size, handled by `dealloc_overflow` (see above). 
+the maximum block size in chunks is `2^(MAX_SIZE_BITS - MIN_CHUNK_BITS)`. +On 64-bit with `MIN_CHUNK_BITS = 14`, this gives a chunk-bit width of +49 — the arena can hold up to 2^49 chunks. The arena's overflow path +returns consolidated blocks that reach this size, handled by +`dealloc_overflow` (see above). The `large_size_chunks` field (stored shifted by 8 in a pagemap word) needs at most 49 bits, which fits in the 56 backend-usable bits of a -64-bit pagemap word. A `static_assert(MAX_CHUNKS_BITS + 8 <= BITS)` -in `PagemapRep` catches configurations where this would overflow. +64-bit pagemap word. A `static_assert((MAX_SIZE_BITS - MIN_SIZE_BITS) + +LARGE_SIZE_SHIFT <= bits::BITS)` in `PagemapRep` catches configurations +where this would overflow. ## Phases @@ -1405,7 +1426,7 @@ Changes to `backend_arena.cc` (test file): 6. MockRep grows inner `BinRep` and `RangeRep` structs that each provide the full RBTree Rep interface (ref/get/set/is_red/etc.) over the mock-entry array. Each owns its own private bit layout - (red bit at bit 8 to match production layout). + (red bit at bit 8 to match the PagemapRep layout). 7. MockRep keeps top-level `get_variant`/`set_variant`/large-size accessors and adds `can_consolidate(uintptr_t) → true`. 8. New test: verify that a MockRep variant with `can_consolidate` @@ -1418,10 +1439,33 @@ boundary test passes. ### Phase 10: PagemapRep + BackendArenaRange + tests +**Status**: implemented and tested. + +**Phase 10b refactor (also implemented):** `BackendArena` and `PagemapRep` +were both retemplated to mirror `Buddy`'s 3-parameter shape: + +- `template class BackendArena` + — the always-zero `MIN_CHUNKS_BITS` placeholder is gone, and the unit + of allocation is named explicitly via `MIN_SIZE_BITS` instead of being + implicitly tied to snmalloc's global `MIN_CHUNK_BITS`. Internally, + `UNIT_SIZE = 1 << MIN_SIZE_BITS` and `CHUNKS_BITS = MAX_SIZE_BITS - + MIN_SIZE_BITS` replace the old `MIN_CHUNK_SIZE` / `MAX_CHUNKS_BITS` + usages. 
+- `template class PagemapRep` + — owns the large-size-shift capacity static_assert + `(MAX_SIZE_BITS - MIN_SIZE_BITS) + LARGE_SIZE_SHIFT <= bits::BITS`; + `LARGE_SIZE_SHIFT` is private. The Rep's pagemap stride is + `UNIT_SIZE = 1 << MIN_SIZE_BITS`. +- `BackendArenaRange::Type` wires snmalloc's `MIN_CHUNK_BITS` as + `MIN_SIZE_BITS` for both PagemapRep and BackendArena: + `PagemapRep` and + `BackendArena`. + New file: `src/snmalloc/backend_helpers/backend_arena_range.h` -1. `PagemapRep` — full Rep implementation using pagemap entries - as described above, with all static assertions. +1. `PagemapRep` — full Rep + implementation using pagemap entries as described above, with all + static assertions. 2. `BackendArenaRange` — the Range wrapper with `alloc_range`, `dealloc_range`, `refill`, and `dealloc_overflow`. @@ -1440,10 +1484,12 @@ New file: `src/test/func/backend_arena_range/backend_arena_range.cc` - Refill: verify that allocating when the arena is empty triggers a parent refill and returns memory. - Overflow: verify that deallocating a block that triggers arena-scale - consolidation passes the decomposed overflow to the parent via - `range_to_pow_2_blocks`. - - Overflow with non-power-of-two consolidated size: verify - decomposition produces valid power-of-two blocks. + consolidation forwards the overflow to the parent via + `dealloc_overflow`. + - Non-power-of-two sizes: verify `alloc_range` / `dealloc_range` work + for chunk-multiple but non-power-of-two sizes, including sizes that + are not representable size classes. The arena carves exactly the + requested chunk count internally, so callers see no excess. - Boundary: verify that a boundary bit in the pagemap prevents consolidation of adjacent blocks from different refills (when `CONSOLIDATE_PAL_ALLOCS` is false). @@ -1465,6 +1511,48 @@ Per `claude.md` mandatory review checkpoints: **Test gate**: full ctest run passes; reviewer reports no issues. 
+### Phase 10d: Bytes throughout (replace chunk-count internal API) + +**Goal**: drop the `size_chunks` / chunk-count internal convention from +`BackendArena` and `PagemapRep` so byte sizes (multiples of UNIT_SIZE) +flow end-to-end, removing the `<< MIN_CHUNK_BITS` conversion dance at +the BackendArenaRange ↔ BackendArena boundary and the matching reverse +shifts inside the range wrapper. + +**Substep 1 (DONE)**: generalise `BackendArenaBins` on a new +`MIN_SIZE_BITS` template parameter so its `range_t.size`, carve +arguments, and `max_supported_size()` are byte sizes (multiples of +`UNIT_SIZE = 1 << MIN_SIZE_BITS`). Renames inside Bins: +`size_chunks → size`, `align_chunks → align`, `max_supported_chunks +→ max_supported_size`. Tests cover `MIN_SIZE_BITS ∈ {0, 4, 14}`. + +**Substep 2 (DONE)**: flip `BackendArena`, `PagemapRep`, and +`BackendArenaRange` to bytes throughout: +- `BackendArena` now uses + `BackendArenaBins`; `add_block` / `remove_block` + take/return bytes; `addr_to_chunk` / `chunk_to_addr` / `CHUNKS_BITS` + deleted; `variant_of(size, addr)` works in byte units with + parity from `(addr >> MIN_SIZE_BITS) & 1`. +- `remove_block(size)` returns a scalar `addr_t` (0 = failure). The + size in the returned pair was tautological (always equal to the + requested `size` on success). +- `PagemapRep::get_large_size` / `set_large_size` (renamed from + `*_chunks`) take and return bytes; internal storage still scales + by `MIN_SIZE_BITS` so the shifted field fits a pagemap word. +- `BackendArenaRange::add_range` / `dealloc_range` / + `parent_dealloc` (unified from `parent_dealloc_range` and + `dealloc_overflow`) drop chunk-count conversions; `add_range` + uses `bits::align_up` / `bits::align_down`. +- Test scaffolding (`MockRep`, `BoundaryMockRep`, `Oracle`) + updated; tests introduce `chunk_size(N) = N << MIN_CHUNK_BITS` + helper. 
+ +**Test gate**: `func-backend_arena-check`, `func-backend_arena_bins-check`, +`func-backend_arena_range-check` all pass; full `ninja` build clean. + +**Remaining**: code-review checkpoint for Phase 10d combined diff +before opening a PR; then proceed to Phase 12 (pipeline integration). + *Pipeline integration (replacing `LargeBuddyRange` in `standard_range.h` and `meta_protected_range.h`) is a separate follow-up plan: "Update backend to use BackendArenaRange."* @@ -1503,7 +1591,7 @@ backend to use BackendArenaRange."* 3. **Large size stored shifted** — PagemapRep stores the chunk count as `count << 8` to avoid the pagemap's reserved low byte; recovered - via `>> 8`. Guarded by `static_assert(MAX_CHUNKS_BITS + 8 <= bits::BITS)`. + via `>> 8`. Guarded by `static_assert((MAX_SIZE_BITS - MIN_SIZE_BITS) + 8 <= bits::BITS)`. 4. **Boundary checks in BackendArena** — not in BackendArenaRange. Consolidation decisions happen inside `add_block`, so the boundary @@ -1519,48 +1607,56 @@ backend to use BackendArenaRange."* `claim_for_backend()` on first access. No explicit ownership management needed in BackendArena or BackendArenaRange. -7. **Overflow decomposition** — `add_block` overflow may produce non- +7. **Overflow forwarding** — `add_block` overflow may produce non- power-of-two sizes (consolidated blocks from multiple PAL allocs). - `dealloc_overflow` uses `range_to_pow_2_blocks` to decompose before - passing to parent, matching the existing pattern in - `LargeBuddyRange::add_range`. + `dealloc_overflow` forwards the overflow directly to the parent's + `dealloc_range`; no power-of-two decomposition is needed because + `BackendArenaRange` (which is what replaces `LargeBuddyRange` in + the pipeline) accepts any chunk-multiple size. -8. **`BackendArenaWordRef` lives in the test file** — production +8. **`BackendArenaWordRef` lives in the test file** — the in-tree `PagemapRep` returns `BackendStateWordRef` directly (mirroring `BuddyChunkRep` in `largebuddyrange.h`).
The test-only `BackendArenaWordRef` proxy is defined in `src/test/func/backend_arena/backend_arena.cc` and used only by - MockRep, so production headers carry no test scaffolding. + MockRep, so the in-tree headers carry no test scaffolding. -9. **Power-of-two assertion retained** — `alloc_range` keeps - `is_pow2(size)` for this phase. Non-power-of-two support is - deferred to "Update front-end" step, when the bin scheme's - alignment guarantees are verified end-to-end. +9. **No power-of-two restriction on the public API** — `alloc_range` + and `dealloc_range` accept any chunk-multiple size; the only + restriction is `size >= MIN_CHUNK_SIZE` and `size < 2^MAX_SIZE_BITS`. + The arena's `Bins::carve` delivers exactly the requested chunk + count, rolling any size-class rounding remainder into the post + fragment that is re-inserted internally. SC rounding therefore + stays a private arena detail. This lifts a restriction inherited + from `LargeBuddyRange`. ## Resolved during plan review - Overflow handling: `add_block` can return non-power-of-two sizes when blocks from multiple PAL allocations consolidate. `dealloc_overflow` - decomposes via `range_to_pow_2_blocks`. (Rubber-duck finding #2.) + forwards the overflow directly to the parent — no decomposition is + required because `BackendArenaRange` itself accepts arbitrary + chunk-multiple sizes and replaces `LargeBuddyRange` in the pipeline. + (Rubber-duck finding #2 superseded by Option B refactor.) - Handle visibility / layering: original plan promoted bit-layout constants and a `BackendArenaWordRef` proxy to namespace scope so - production and test code could share them. Subsequent review + the in-tree header and tests could share them. Subsequent review observed that this broke the Buddy/`BuddyChunkRep`/`BuddyInplaceRep` layering: the data structure should be representation-agnostic. 
Resolved by making `BackendArena` carry no bit-layout state and requiring `Rep::BinRep` / `Rep::RangeRep` to own all packing - decisions. Production `PagemapRep` keeps its layout private; the + decisions. `PagemapRep` keeps its layout private; the test `BackendArenaWordRef` lives in the test file alongside MockRep. (Rubber-duck finding #1, then revised after layering review.) -- Size shift overflow: `static_assert(MAX_CHUNKS_BITS + 8 <= BITS)` in +- Size shift overflow: `static_assert((MAX_SIZE_BITS - MIN_SIZE_BITS) + 8 <= BITS)` in `PagemapRep` prevents shift overflow. (Rubber-duck finding #4.) - Unaligned refill guard: both static assert AND runtime assert copied from `LargeBuddyRange` to prevent `add_block` precondition violation. (Rubber-duck finding #6, strengthened in second review.) - Pipeline integration (Phase 11) removed from this plan's scope — separate follow-up plan. (Rubber-duck finding #8.) -- `PagemapRep` templated on `MAX_CHUNKS_BITS` so the size-shift - static_assert is in scope. (Second review finding #1.) +- `PagemapRep` templated on `MIN_SIZE_BITS` and `MAX_SIZE_BITS` so the + size-shift static_assert is in scope. (Second review finding #1.) - `remove_block` exact-size guarantee is scoped to power-of-two requests only. (Second review finding #4.) @@ -1592,7 +1688,7 @@ backend to use BackendArenaRange."* `neighbours(K)` tests against `std::set::lower_bound` / `upper_bound` as oracle. -No production code path is changed in this phase: the existing +No in-tree code path is changed in this phase: the existing `LargeBuddyRange` continues to be the active large-block allocator. ## Resolved during plan review @@ -1622,12 +1718,12 @@ No production code path is changed in this phase: the existing - Predecessor-Range-entry-reuse only applies when `P` is non-min. - `add_block` returns `{0, 0}` on success; on overflow it returns the unabsorbed range, mirroring `Buddy::add_block`'s overflow-return - contract.
Oversize inputs (`size_chunks >= 2^MAX_CHUNKS_BITS`) bypass + contract. Oversize inputs (`size_chunks >= 2^(MAX_SIZE_BITS - MIN_CHUNK_BITS)`) bypass `BackendArena` entirely — the wrapping `BackendArenaRange` layer handles them before calling `add_block`, and `add_block` asserts - `size_chunks < 2^MAX_CHUNKS_BITS`. The only overflow case is + `size_chunks < 2^(MAX_SIZE_BITS - MIN_CHUNK_BITS)`. The only overflow case is consolidation growing a coalesced block to exactly - `2^MAX_CHUNKS_BITS` (the consolidated range is returned, neighbours + `2^(MAX_SIZE_BITS - MIN_CHUNK_BITS)` (the consolidated range is returned, neighbours having been removed first). The future `BackendArenaRange` wrapper is responsible for handling overflow; the standalone `BackendArena` only exposes the contract. @@ -1640,7 +1736,7 @@ No production code path is changed in this phase: the existing power-of-two-only large variant of front-end `sizeclass_t`, with low-exponent special cases handled in the spirit of `bits::from_exp_mant`. -- `BackendArena` uses chunk-count +- `BackendArena` uses byte-size exponent bounds with **exclusive max** semantics, matching the existing `Buddy<..., MIN, MAX>`. - Multi-`B` testing is via a templated bin-table generator in a single @@ -1663,3 +1759,256 @@ No production code path is changed in this phase: the existing Out of scope for this phase; flagged for the memcpy-fix plan to design. - Whether `INTERMEDIATE_BITS=4` (34 bins/exp) needs to be tested in this phase. Currently `B ∈ {1, 2, 3}` only. + +--- + +# Phase 12: Update backend to use BackendArenaRange + +## Goal + +Replace every `LargeBuddyRange` instantiation in the range +pipelines with `BackendArenaRange`. After this phase, snmalloc uses +the BackendArena bin-tree allocator instead of the power-of-two buddy +for all large-range management. The `LargeBuddyRange` and +`BuddyChunkRep` classes are **not deleted** — they remain available +for alternative configurations and external embedders. 
Only the +default pipeline wiring changes. + +## Scope + +- Modify `standard_range.h` — replace all `LargeBuddyRange` with + `BackendArenaRange` (same template parameters). +- Modify `meta_protected_range.h` — replace all `LargeBuddyRange` + with `BackendArenaRange` (same template parameters). +- **No other source files change.** `BackendArenaRange` is already a + drop-in replacement: same template signature, same `Type` + shape, same `alloc_range`/`dealloc_range` API, same `Aligned`, + `ConcurrencySafe`, and `ChunkBounds` constants. + +## Pre-conditions + +- Phase 10 (BackendArenaRange) is committed and all its tests pass. +- Phase 11 (final review of Phases 9–10) is complete. +- Baseline: the checkout builds and all tests pass before this change. + +## Analysis of every LargeBuddyRange instantiation + +### `standard_range.h` + +**1. GlobalR** +```cpp +LargeBuddyRange +``` +→ `BackendArenaRange` + +- `MAX_SIZE_BITS = bits::BITS - 1` → global-range mode (no parent + dealloc). `BackendArenaRange` handles this identically. +- `MIN_REFILL_SIZE_BITS = MinSizeBits` (Windows: 16, otherwise PAL- + dependent). `BackendArenaRange` passes this through. +- Parent is `Base` (PalRange + PagemapRegisterRange chain). Parent is + **unaligned** on PALs without `AlignedAllocation` (e.g. Linux mmap) + and aligned otherwise. `BackendArenaRange::refill` currently still + carries the aligned/unaligned dual path inherited from + `LargeBuddyRange`; collapsing this into a single path is in scope for + Step 4 below. + +**2. LargeObjectRange (local cache)** +```cpp +LargeBuddyRange +``` +→ `BackendArenaRange` + +- `MAX_SIZE_BITS = LocalCacheSizeBits = 21` (2 MiB). Non-global mode. + Overflow goes to parent. +- `BackendArenaRange::dealloc_overflow` forwards directly to parent + without decomposition. 
Since the chunk-bit width + (`MAX_SIZE_BITS - MIN_CHUNK_BITS = 7` on 64-bit) is small, the arena + has at most 128 chunk slots — overflow can only produce one block of + exactly `1 << MAX_SIZE_BITS`. +- Wrapped in `StaticConditionalRange` — no impact on the substitution. + +### `meta_protected_range.h` + +**3. GlobalR** — identical to standard_range.h #1. + +**4. CentralObjectRange** +```cpp +LargeBuddyRange +``` +→ `BackendArenaRange` + +- `MIN_REFILL_SIZE_BITS = 0` (default). Global-range mode. + +**5. CentralMetaRange** +```cpp +LargeBuddyRange +``` +→ `BackendArenaRange` + +- Global-range mode. + +**6. CentralMetaRange conditional huge-page buddy** +```cpp +stl::conditional_t< + (max_page_chunk_size_bits > MIN_CHUNK_BITS), + LargeBuddyRange< + max_page_chunk_size_bits, max_page_chunk_size_bits, + Pagemap, page_size_bits>, + NopRange> +``` +→ Replace `LargeBuddyRange` with `BackendArenaRange` inside the + `conditional_t`. + +- This is a small local cache for huge-page consolidation. + `MAX_SIZE_BITS = max_page_chunk_size_bits` (typically + `page_size_bits` when page_size_bits > MIN_CHUNK_BITS, e.g. + huge pages at 21 bits). +- Non-global mode. Overflow is forwarded to the parent without decomposition. + +**7. ObjectRange (local)** +```cpp +LargeBuddyRange +``` +→ `BackendArenaRange` + +- Same shape as standard_range.h #2. + +**8. MetaRange (local)** +```cpp +LargeBuddyRange +``` +→ `BackendArenaRange` + +- `REFILL_SIZE_BITS = 21 - 6 = 15`. Global-range mode. + `MIN_REFILL_SIZE_BITS = 0`. + +## Implementation + +The change is a mechanical text substitution — replace the string +`LargeBuddyRange` with `BackendArenaRange` in both files. No +template parameters, no API calls, no structural changes.
+ +### Step 1: Replace LargeBuddyRange → BackendArenaRange + +In `src/snmalloc/backend/standard_range.h`: +- Line 32: `LargeBuddyRange<` → `BackendArenaRange<` +- Line 52: `LargeBuddyRange<` → `BackendArenaRange<` + +In `src/snmalloc/backend/meta_protected_range.h`: +- Line 35: `LargeBuddyRange<` → `BackendArenaRange<` +- Line 54: `LargeBuddyRange<` → `BackendArenaRange<` +- Line 71: `LargeBuddyRange<` → `BackendArenaRange<` +- Line 82: `LargeBuddyRange<` → `BackendArenaRange<` +- Line 93: `LargeBuddyRange<` → `BackendArenaRange<` +- Line 103: `LargeBuddyRange<` → `BackendArenaRange<` + +### Step 2: Verify include paths + +Both files include `"../backend/backend.h"` which includes +`"../backend_helpers/backend_helpers.h"` which already includes +`"backend_arena_range.h"`. **No new includes needed.** + +### Step 3: Build and test + +- Full `ctest` suite must pass. This is the primary validation: + hundreds of functional tests exercise the full allocator pipeline. +- Specific tests to watch: + - `func-memory-fast` — core malloc/free workloads + - `func-pool-fast` — pool allocator + - `func-domestication-fast` — boundary/domestication + - `func-fixed_region-fast` — fixed-region (uses `FixedRangeConfig` + which uses `StandardLocalState`) + - `perf-*` — performance tests (functional correctness only) + +**Test gate**: full `ctest` passes. No new tests needed — the existing +test suite exercises the pipeline end-to-end. + +### Step 4: Retire the `ParentRange::Aligned` concept + +Once `BackendArenaRange` is the only large-range layer, the +`Aligned` template property loses most of its remaining use: + +- `BackendArenaRange::Aligned` is always `true` (the bin scheme + guarantees size-aligned output for in-arena allocations). +- `BackendArenaRange::add_range` already trims arbitrary + (page-aligned-but-not-chunk-aligned) parent input to chunk + boundaries, so an unaligned parent no longer requires a separate + refill path. + +Plan: + +1. 
**Collapse `BackendArenaRange::refill` to a single path.** Drop the + `if (ParentRange::Aligned)` branch. The unified path allocates + `refill_size` from the parent, places the chunk-aligned remainder + into the arena via `add_range` (which already trims for unaligned + parents), then recursively calls `alloc_range(size)` to obtain the + size-aligned chunk for the caller. The refill-size accounting must + keep `refill_size < 2^MAX_SIZE_BITS` so the assertion in + `add_block` holds for the whole-refill `add_range` call; configs + where `REFILL_SIZE_BITS == MAX_SIZE_BITS` (the local-cache + configurations) need either a one-bit refill-size reduction or a + `MAX_SIZE_BITS` bump. +2. **Drop the oversize-fallback alignment check.** In `alloc_range`, + the `if (ParentRange::Aligned) return parent.alloc_range(size);` + branch is dead once the property goes away; replace with an + unconditional delegate (sizes ≥ `2^MAX_SIZE_BITS` are always + forwarded to the parent — alignment is no longer differentiated). +3. **Remove `Aligned` from the Range concept.** Once + `BackendArenaRange` and `SmallBuddyRange` no longer reference it, + drop the `static constexpr bool Aligned` field from every + pass-through range (`StatsRange`, `CommitRange`, `LockRange`, + `IndirectRange`, `StaticRange`, `StaticConditionalRange`, + `SubRange`, `LogRange`, `NopRange`, `PagemapRegisterRange`, + `PalRange`). The `pal_supports` query + itself remains for PALs that want to advertise the capability, + but the range stack no longer threads it through. +4. **Update `SmallBuddyRange`.** Drop the + `static_assert(ParentRange::Aligned)` (its parent is always + `BackendArenaRange` after Phase 12, which always provides aligned + output by construction). + +This is a structural simplification, not a behavioural change — the +test suite is the gate. + +## Risks + +1. 
**BackendArenaRange behaviour differences.** The bin-tree allocator + returns blocks with different internal fragmentation characteristics + than the power-of-two buddy. Functionally, the caller always gets + exactly the requested (power-of-two) size, so correctness is + maintained. The arena may produce different carving patterns, but + the size `alloc_range` hands back does not change. + +2. **Overflow behaviour.** `LargeBuddyRange::dealloc_overflow` returns + a single block of exactly `1 << MAX_SIZE_BITS`. + `BackendArenaRange::dealloc_overflow` forwards a single block of the + consolidated size directly to the parent. The size is a chunk + multiple of at least `2^MAX_SIZE_BITS`, not necessarily a power of two, but + the parent (now itself a `BackendArenaRange` or pass-through layer) + accepts arbitrary chunk-multiple sizes. + +3. **`FixedRangeConfig` uses `StandardLocalState`.** The fixed-region + configuration pushes memory directly into `GlobalR.dealloc_range`. + This works with `BackendArenaRange` because `dealloc_range` has the + same signature and contract. + +## Resolved during plan review + +- `backend_arena_range.h` was missing `#include "empty_range.h"` for + its `EmptyRange<>` default template parameter. Fixed pre-commit. + (Rubber-duck finding #2.) +- The `conditional_t` huge-page path in `meta_protected_range.h` may + not be instantiated on default builds. CI tests multiple PAL + configurations. Risk acknowledged but no custom build added — the + conditional branch is structurally identical to other + `BackendArenaRange` uses and shares the same template. (Rubber-duck + finding #1.) + +## Out of scope + +- Deleting `LargeBuddyRange` / `BuddyChunkRep` (keep for embedders). +- Modifying `Buddy<>` or `redblacktree.h`. +- Non-power-of-two `alloc_range` requests (deferred to front-end + generalisation phase). +- Performance benchmarking (separate task). +- Any front-end changes.
diff --git a/claude.md b/claude.md index 05a6398e4..9cb11e281 100644 --- a/claude.md +++ b/claude.md @@ -113,7 +113,9 @@ as a collaborator. - **Comments earn their length by carrying correctness-relevant information** - A comment exists to convey something the reader cannot recover from the code — a non-obvious invariant, a subtle correctness argument, a coupling that breaks if edited. If you cannot name what the comment teaches that the code does not, cut it. -- **Test scaffolding does not live in production headers** - A header that needs a friend struct purely for testing carries only the forward declaration and the friend grant; the body lives in test code. +- **Don't qualify `src/snmalloc/` code as "production"** - Everything under `src/snmalloc/` is the shipped library; calling it "production code" / "production header" / "production Rep" adds no information and implicitly suggests there's non-production code in the same tree. Use unqualified names (e.g. "the in-tree header", "the shipped `PagemapRep`", or just the name itself). The meaningful distinction is in-tree (`src/snmalloc/`) vs test (`src/test/`), and that distinction is already clear from the path. This applies equally to comments inside `src/snmalloc/`, to comments inside `src/test/` referring back to in-tree code, and to design documents like `PLAN.md`. + +- **Test scaffolding does not live in `src/snmalloc/` headers** - A header that needs a friend struct purely for testing carries only the forward declaration and the friend grant; the body lives in test code. - **Store data in the form the consumer uses** - If a derived value is only consumed pre-shifted, pre-negated, or pre-masked, store it that way at build time. The cost moves from every consumer call to one build-time loop. 
diff --git a/src/snmalloc/backend_helpers/backend_arena.h b/src/snmalloc/backend_helpers/backend_arena.h index 17b581de9..ad6835ecc 100644 --- a/src/snmalloc/backend_helpers/backend_arena.h +++ b/src/snmalloc/backend_helpers/backend_arena.h @@ -46,31 +46,36 @@ namespace snmalloc * shape as `BinRep`. * - `get_variant(addr)` / `set_variant(addr, v)` — the * `BackendArenaVariant` tag for the block starting at `addr`. - * - `get_large_size_chunks(addr)` / `set_large_size_chunks(addr, n)` - * — exact chunk count for `Large` blocks (3+ chunks). + * - `get_large_size(addr)` / `set_large_size(addr, size)` — + * exact byte size for `Large` blocks (3+ units). * - `can_consolidate(higher_addr) -> bool` — whether the block at * `higher_addr` may be merged with the block immediately below * it. Returns false at allocation boundaries that must be * preserved. * - * `MIN_CHUNKS_BITS`: log2 of the minimum allocation unit in chunks - * (currently only 0 is supported — 1-chunk minimum). + * `MIN_SIZE_BITS`: log2 of the unit of allocation (= the minimum + * block size in bytes). All addresses and sizes managed by this + * arena are multiples of `1 << MIN_SIZE_BITS`. * - * `MAX_CHUNKS_BITS`: log2 of the arena size in chunks. Blocks that - * reach this size overflow and are returned to the caller. + * `MAX_SIZE_BITS`: log2 of the (exclusive) upper bound on managed + * block sizes. Blocks that reach this size overflow and are + * returned to the caller. 
*/ - template + template class BackendArena { - static_assert(MIN_CHUNKS_BITS == 0, "Only MIN_CHUNKS_BITS == 0 supported"); - static_assert(MAX_CHUNKS_BITS > MIN_CHUNKS_BITS); - static_assert(MAX_CHUNKS_BITS < bits::BITS); + static_assert(MAX_SIZE_BITS > MIN_SIZE_BITS); + static_assert(MAX_SIZE_BITS < bits::BITS); + static_assert(MIN_SIZE_BITS < bits::BITS); + + static constexpr size_t UNIT_SIZE = size_t(1) << MIN_SIZE_BITS; + static constexpr size_t TWO_UNITS = size_t(2) << MIN_SIZE_BITS; static constexpr size_t B = 2; - using Bins = BackendArenaBins; + using Bins = BackendArenaBins; static_assert( - bits::one_at_bit(MAX_CHUNKS_BITS) - 1 <= Bins::max_supported_chunks()); + bits::one_at_bit(MAX_SIZE_BITS) - 1 <= Bins::max_supported_size()); using BinRep = typename Rep::BinRep; using RangeRep = typename Rep::RangeRep; @@ -82,28 +87,16 @@ namespace snmalloc RangeTree range_tree{}; typename Bins::Bitmap bitmap{}; - // ---- Address-unit helpers ---- - - static size_t addr_to_chunk(uintptr_t a) - { - return a >> MIN_CHUNK_BITS; - } - - static uintptr_t chunk_to_addr(size_t c) - { - return static_cast(c) << MIN_CHUNK_BITS; - } - // ---- Metadata helpers ---- - static BackendArenaVariant - variant_of(size_t size_chunks, size_t chunk_index) + static BackendArenaVariant variant_of(size_t size, uintptr_t addr) { - if (size_chunks == 1) + if (size == UNIT_SIZE) return BackendArenaVariant::Min; - if (size_chunks == 2) - return (chunk_index & 1) == 0 ? BackendArenaVariant::EvenTwo : - BackendArenaVariant::OddTwo; + if (size == TWO_UNITS) + return ((addr >> MIN_SIZE_BITS) & 1) == 0 ? 
+ BackendArenaVariant::EvenTwo : + BackendArenaVariant::OddTwo; return BackendArenaVariant::Large; } @@ -115,12 +108,12 @@ namespace snmalloc switch (v) { case BackendArenaVariant::Min: - return {a, 1}; + return {a, UNIT_SIZE}; case BackendArenaVariant::EvenTwo: case BackendArenaVariant::OddTwo: - return {a, 2}; + return {a, TWO_UNITS}; case BackendArenaVariant::Large: - return {a, Rep::get_large_size_chunks(a)}; + return {a, Rep::get_large_size(a)}; } SNMALLOC_ASSERT(false); return {0, 0}; @@ -134,27 +127,25 @@ namespace snmalloc Rep::get_variant(a) == BackendArenaVariant::Min; } - void insert_block(uintptr_t addr, size_t size_chunks) + void insert_block(uintptr_t addr, size_t size) { - Rep::set_variant(addr, variant_of(size_chunks, addr_to_chunk(addr))); - if (size_chunks >= 3) - Rep::set_large_size_chunks(addr, size_chunks); + Rep::set_variant(addr, variant_of(size, addr)); + if (size > TWO_UNITS) + Rep::set_large_size(addr, size); - auto chunk_range = - typename Bins::range_t{addr_to_chunk(addr), size_chunks}; - size_t bin = bitmap.add(chunk_range); + auto range = typename Bins::range_t{addr, size}; + size_t bin = bitmap.add(range); bin_trees[bin].insert_elem(addr); - if (size_chunks >= 2) + if (size >= TWO_UNITS) range_tree.insert_elem(addr); } - void unlink_block(uintptr_t addr, size_t size_chunks) + void unlink_block(uintptr_t addr, size_t size) { - auto chunk_range = - typename Bins::range_t{addr_to_chunk(addr), size_chunks}; - size_t bin = bitmap.add(chunk_range); + auto range = typename Bins::range_t{addr, size}; + size_t bin = bitmap.add(range); bin_trees[bin].remove_elem(addr); - if (size_chunks >= 2) + if (size >= TWO_UNITS) range_tree.remove_elem(addr); if (bin_trees[bin].is_empty()) bitmap.clear(bin); @@ -168,22 +159,29 @@ namespace snmalloc constexpr BackendArena() = default; /** - * Add a free block at `addr` with `size_chunks` chunks. The block - * is consolidated with any adjacent free neighbours. Returns - * `{0, 0}` on success. 
If consolidation produces a block spanning - * the entire arena (`>= 2^MAX_CHUNKS_BITS` chunks), returns - * `{consolidated_addr, consolidated_size}` and the arena is empty. + * Add a free block at `addr` with `size` bytes. The block is + * consolidated with any adjacent free neighbours. Returns + * `{0, 0}` on success. If consolidation produces a block whose + * size reaches `2^MAX_SIZE_BITS` bytes (the exclusive upper bound + * on representable block sizes), the block is not inserted; + * returns `{consolidated_addr, consolidated_size}` so the caller + * can return it to a parent range. */ - stl::Pair add_block(addr_t addr, size_t size_chunks) + stl::Pair add_block(addr_t addr, size_t size) { check_invariant(); SNMALLOC_ASSERT(addr != 0); - SNMALLOC_ASSERT((addr & (MIN_CHUNK_SIZE - 1)) == 0); - SNMALLOC_ASSERT(size_chunks > 0); - SNMALLOC_ASSERT(size_chunks < bits::one_at_bit(MAX_CHUNKS_BITS)); + // Unit alignment is required: callers feeding parent ranges (e.g. + // mmap-backed PalRange returns page-aligned but not chunk-aligned + // memory) must trim their input to UNIT_SIZE before reaching here. + // BackendArenaRange::add_range does this trim. + SNMALLOC_ASSERT((addr & (UNIT_SIZE - 1)) == 0); + SNMALLOC_ASSERT(size > 0); + SNMALLOC_ASSERT((size & (UNIT_SIZE - 1)) == 0); + SNMALLOC_ASSERT(size < bits::one_at_bit(MAX_SIZE_BITS)); uintptr_t c_addr = addr; - size_t c_size = size_chunks; + size_t c_size = size; auto merge = [&](uintptr_t n_addr, size_t n_size) { unlink_block(n_addr, n_size); @@ -197,25 +195,25 @@ namespace snmalloc // Predecessor: check range tree, then fall back to min-size bin. 
auto [pa, ps] = range_from_addr(p_key); - if (pa + ps * MIN_CHUNK_SIZE == addr && Rep::can_consolidate(addr)) + if (pa + ps == addr && Rep::can_consolidate(addr)) merge(pa, ps); else if ( - addr >= MIN_CHUNK_SIZE && Rep::can_consolidate(addr) && - contains_min(addr - MIN_CHUNK_SIZE)) - merge(addr - MIN_CHUNK_SIZE, 1); + addr >= UNIT_SIZE && Rep::can_consolidate(addr) && + contains_min(addr - UNIT_SIZE)) + merge(addr - UNIT_SIZE, UNIT_SIZE); // Successor: check range tree, then fall back to min-size bin. auto [sa, ss] = range_from_addr(s_key); - uintptr_t succ_addr = addr + size_chunks * MIN_CHUNK_SIZE; + uintptr_t succ_addr = addr + size; if (sa == succ_addr && Rep::can_consolidate(succ_addr)) merge(sa, ss); else if ( succ_addr > addr && Rep::can_consolidate(succ_addr) && contains_min(succ_addr)) - merge(succ_addr, 1); + merge(succ_addr, UNIT_SIZE); // Arena-scale overflow: consolidated block spans the full arena. - if (c_size >= bits::one_at_bit(MAX_CHUNKS_BITS)) + if (c_size >= bits::one_at_bit(MAX_SIZE_BITS)) return {c_addr, c_size}; // Insert consolidated block. @@ -226,22 +224,26 @@ namespace snmalloc } /** - * Remove a block of at least `n_chunks` chunks. Returns - * `{addr, actual_size}` on success, `{0, 0}` if nothing fits. - * Any leftover from carving is re-inserted via `add_block`. + * Remove exactly `size` bytes. Returns the address on success or + * 0 if nothing fits. SC rounding is internal: the arena may + * locate a larger free region but only the requested `size` is + * handed out — the remainder rolls into the carve remainders + * which are re-inserted via `insert_block`.
*/ - stl::Pair remove_block(size_t n_chunks) + addr_t remove_block(size_t size) { check_invariant(); - if (n_chunks == 0) - return {0, 0}; + if (size == 0) + return 0; - if (n_chunks > Bins::max_supported_chunks()) - return {0, 0}; + if (size > Bins::max_supported_size()) + return 0; + + SNMALLOC_ASSERT((size & (UNIT_SIZE - 1)) == 0); - size_t bin_id = bitmap.find_for_request(n_chunks); + size_t bin_id = bitmap.find_for_request(size); if (bin_id == SIZE_MAX) - return {0, 0}; + return 0; // remove_min returns the lowest-address entry (since compare // is k1 > k2). Read metadata after removal — remove_elem @@ -250,30 +252,29 @@ namespace snmalloc auto [_, block_size] = range_from_addr(block_addr); (void)_; - if (block_size >= 2) + if (block_size >= TWO_UNITS) range_tree.remove_elem(block_addr); if (bin_trees[bin_id].is_empty()) bitmap.clear(bin_id); - // Carve the requested chunk count from the block. - auto carved = - Bins::carve({addr_to_chunk(block_addr), block_size}, n_chunks); + // Carve the requested size from the block. + auto carved = Bins::carve({block_addr, block_size}, size); // Re-insert non-empty remainders. By the maximally-consolidated // invariant, these remainders have no adjacent free neighbours. if (carved.pre.size != 0) { - insert_block(chunk_to_addr(carved.pre.base), carved.pre.size); + insert_block(carved.pre.base, carved.pre.size); } if (carved.post.size != 0) { - insert_block(chunk_to_addr(carved.post.base), carved.post.size); + insert_block(carved.post.base, carved.post.size); } check_invariant(); - return {chunk_to_addr(carved.req.base), carved.req.size}; + return carved.req.base; } /** @@ -296,7 +297,7 @@ namespace snmalloc auto [a, s] = range_from_addr(node); if (prev_valid) { - uintptr_t prev_end = prev_addr + prev_size * MIN_CHUNK_SIZE; + uintptr_t prev_end = prev_addr + prev_size; SNMALLOC_ASSERT(prev_end != a || !Rep::can_consolidate(a)); } prev_addr = a; @@ -308,10 +309,10 @@ namespace snmalloc // 1b. 
No non-min block adjacent to a min block (unless boundary). self.range_tree.for_each([&](uintptr_t node) { auto [a, s] = range_from_addr(node); - if (a >= MIN_CHUNK_SIZE) + if (a >= UNIT_SIZE) SNMALLOC_ASSERT( - !contains_min(a - MIN_CHUNK_SIZE) || !Rep::can_consolidate(a)); - uintptr_t end = a + s * MIN_CHUNK_SIZE; + !contains_min(a - UNIT_SIZE) || !Rep::can_consolidate(a)); + uintptr_t end = a + s; SNMALLOC_ASSERT(!contains_min(end) || !Rep::can_consolidate(end)); }); @@ -324,7 +325,7 @@ namespace snmalloc return; if (prev_valid) SNMALLOC_ASSERT( - prev + MIN_CHUNK_SIZE != node || !Rep::can_consolidate(node)); + prev + UNIT_SIZE != node || !Rep::can_consolidate(node)); prev = node; prev_valid = true; }); @@ -341,7 +342,7 @@ namespace snmalloc { self.bin_trees[bin].for_each([&](uintptr_t node) { auto [a, s] = range_from_addr(node); - if (s >= 2) + if (s >= TWO_UNITS) { auto path = self.range_tree.get_root_path(); SNMALLOC_ASSERT(self.range_tree.find(path, node)); @@ -354,8 +355,8 @@ namespace snmalloc self.range_tree.for_each([&](uintptr_t node) { range_tree_count++; auto [a, s] = range_from_addr(node); - auto chunk_range = typename Bins::range_t{addr_to_chunk(a), s}; - size_t expected_bin = Bins::bin_index(chunk_range); + auto range = typename Bins::range_t{a, s}; + size_t expected_bin = Bins::bin_index(range); auto path = self.bin_trees[expected_bin].get_root_path(); SNMALLOC_ASSERT(self.bin_trees[expected_bin].find(path, node)); }); @@ -368,8 +369,8 @@ namespace snmalloc { self.bin_trees[bin].for_each([&](uintptr_t node) { auto [a, s] = range_from_addr(node); - auto chunk_range = typename Bins::range_t{addr_to_chunk(a), s}; - size_t expected_bin = Bins::bin_index(chunk_range); + auto range = typename Bins::range_t{a, s}; + size_t expected_bin = Bins::bin_index(range); SNMALLOC_ASSERT(expected_bin == bin); }); } @@ -388,9 +389,9 @@ namespace snmalloc self.bin_trees[bin].for_each([&](uintptr_t node) { auto v = Rep::get_variant(node); auto [a, s] = 
range_from_addr(node); - SNMALLOC_ASSERT(v == variant_of(s, addr_to_chunk(a))); + SNMALLOC_ASSERT(v == variant_of(s, a)); if (v == BackendArenaVariant::Large) - SNMALLOC_ASSERT(Rep::get_large_size_chunks(node) == s); + SNMALLOC_ASSERT(Rep::get_large_size(node) == s); }); } } diff --git a/src/snmalloc/backend_helpers/backend_arena_bins.h b/src/snmalloc/backend_helpers/backend_arena_bins.h index 2d1feb245..b6d98e233 100644 --- a/src/snmalloc/backend_helpers/backend_arena_bins.h +++ b/src/snmalloc/backend_helpers/backend_arena_bins.h @@ -7,14 +7,14 @@ namespace snmalloc { - template + template struct BackendArenaBinsTestAccess; /** - * Chunk size class enumeration and bin classification used by the + * Size class enumeration and bin classification used by the * BackendArena. * - * Template parameter B (mantissa-bit width of snmalloc's + * Template parameter `B` (mantissa-bit width of snmalloc's * non-power-of-two size class scheme) determines the number of * RB-trees per exponent — the count of distinct servable subsets a * free block can occupy at that exponent: B=1 -> 2; B=2 -> 5; @@ -22,27 +22,41 @@ namespace snmalloc * `prototype/skip_analysis.py`. All bin-scheme metadata derives * constexpr from a single per-bin subsets table, `bin_subsets`. * + * Template parameter `MIN_SIZE_BITS` is the log2 of the allocation + * unit: every byte size handled here is a multiple of + * `UNIT_SIZE = 1 << MIN_SIZE_BITS`, and the smallest representable + * size is `UNIT_SIZE`. With `MIN_SIZE_BITS == 0` the unit is a single + * byte and the classifier degenerates to the bare bin scheme; + * larger values scale the entire size axis (and the bin tables) + * by `UNIT_SIZE`. + * * Public surface: - * - `range_t`, `carve_t`: chunk-count ranges and carve output. - * - `carve(block, n_chunks)`: split a block into pre-pad / aligned - * request / post-pad. - * - `max_supported_chunks()`: upper bound on legal request sizes. + * - `range_t`, `carve_t`: byte ranges and carve output. 
+ * - `carve(block, n)`: split a block into pre-pad / aligned + * request / post-pad, where `n` is in bytes. + * - `max_supported_size()`: upper bound on legal request sizes + * (in bytes). * - nested `Bitmap`: per-arena non-empty-bins bitmap with * `add` / `find_for_request` / `clear`. * * Everything else is private; tests reach it via - * `BackendArenaBinsTestAccess`. + * `BackendArenaBinsTestAccess`. */ - template + template class BackendArenaBins { static_assert( INTERMEDIATE_BITS >= 1 && INTERMEDIATE_BITS <= 3, "BackendArenaBins currently supports B in {1, 2, 3}"); + static_assert( + MIN_SIZE_BITS + INTERMEDIATE_BITS < bits::BITS, + "MIN_SIZE_BITS + INTERMEDIATE_BITS must leave room for at least one " + "exponent above the low regime so MAX_SC is non-trivial"); public: - /// (base, size) chunk-count range. `size == 0` means empty (base - /// is unspecified). + /// (base, size) byte range. Both fields are multiples of + /// `UNIT_SIZE = 1 << MIN_SIZE_BITS`. `size == 0` means empty + /// (base is unspecified). struct range_t { size_t base; @@ -59,10 +73,15 @@ namespace snmalloc }; private: - friend struct BackendArenaBinsTestAccess; + friend struct BackendArenaBinsTestAccess; static constexpr size_t B = INTERMEDIATE_BITS; + /// Size of the allocation unit. Every byte size handled by the + /// classifier is a multiple of this value, and the smallest + /// representable size is `UNIT_SIZE`. + static constexpr size_t UNIT_SIZE = size_t(1) << MIN_SIZE_BITS; + /// Number of mantissa positions per regular exponent (= 2^B). static constexpr size_t MANTISSAS_PER_EXP = size_t(1) << B; @@ -74,11 +93,11 @@ namespace snmalloc 0; /// Size of the per-sc info tables. One past the largest raw id from - /// `bits::to_exp_mant_const` whose decoded size fits in - /// `size_t` (the architectural max raw id decodes to `2^bits::BITS`, - /// which overflows). 
+ /// `bits::to_exp_mant_const` whose decoded size + /// fits in `size_t` (the architectural max raw id would decode to + /// `2^bits::BITS`, which overflows). static constexpr size_t MAX_SC = - ((bits::BITS - B) << B) + ((size_t(1) << B) - 1); + ((bits::BITS - B - MIN_SIZE_BITS) << B) + ((size_t(1) << B) - 1); /** * Per-SC bitmap-scan record, read by `Bitmap::find_for_request`. @@ -115,14 +134,15 @@ namespace snmalloc * Per-SC carve record, read by `carve` and by `bin_offset_at`'s * `fits` predicate (free-side cascade walk via `bin_index`). * - * - `size_chunks`: size this SC promises on allocation. - * - `align_chunks`: natural alignment (a power of two, derived - * from `size_chunks`). + * - `size`: byte size this SC promises on allocation (multiple + * of `UNIT_SIZE`). + * - `align`: natural byte alignment (a power of two, derived + * from `size`). */ struct carve_info_t { - size_t size_chunks; - size_t align_chunks; + size_t size; + size_t align; }; static_assert( @@ -133,16 +153,18 @@ namespace snmalloc /** * Map a request size to its bitmap-scan record. * - * `n_chunks` must be in `[1, max_supported_chunks()]`. - * Not `constexpr`: uses `bits::clz` intrinsic via `bits::to_exp_mant` - * to stay single-cycle on the fast path. + * `n` must be in `[UNIT_SIZE, max_supported_size()]` and a + * multiple of `UNIT_SIZE`. Not `constexpr`: uses `bits::clz` + * intrinsic via `bits::to_exp_mant` to stay single-cycle on the + * fast path. 
*/ SNMALLOC_FAST_PATH static const bitmap_info_t& - bitmap_info_for_request(size_t n_chunks) + bitmap_info_for_request(size_t n) { - SNMALLOC_ASSERT(n_chunks >= 1); - SNMALLOC_ASSERT(n_chunks <= max_supported_chunks()); - size_t raw = bits::to_exp_mant(n_chunks); + SNMALLOC_ASSERT(n >= UNIT_SIZE); + SNMALLOC_ASSERT((n & (UNIT_SIZE - 1)) == 0); + SNMALLOC_ASSERT(n <= max_supported_size()); + size_t raw = bits::to_exp_mant(n); SNMALLOC_ASSERT(raw < MAX_SC); return table_.bitmap_info[raw]; } @@ -150,37 +172,47 @@ namespace snmalloc /// Map a request size to its carve record. Preconditions and /// properties as `bitmap_info_for_request`. SNMALLOC_FAST_PATH static const carve_info_t& - carve_info_for_request(size_t n_chunks) + carve_info_for_request(size_t n) { - SNMALLOC_ASSERT(n_chunks >= 1); - SNMALLOC_ASSERT(n_chunks <= max_supported_chunks()); - size_t raw = bits::to_exp_mant(n_chunks); + SNMALLOC_ASSERT(n >= UNIT_SIZE); + SNMALLOC_ASSERT((n & (UNIT_SIZE - 1)) == 0); + SNMALLOC_ASSERT(n <= max_supported_size()); + size_t raw = bits::to_exp_mant(n); SNMALLOC_ASSERT(raw < MAX_SC); return table_.carve_info[raw]; } public: /** - * Bin id of `block`. Operates on arbitrary chunk counts, not just - * exact size classes. `block.size` must be >= 1. + * Bin id of `block`. Operates on arbitrary byte sizes that are + * multiples of `UNIT_SIZE`, not just exact size classes. + * `block.size` must be at least `UNIT_SIZE`. * * A bin id at exponent `e` identifies the *servable set*: the * subset of SCs at `e` that `block` could serve. Two blocks with * the same servable set at the same exponent share a bin id. * - * The natural exponent is `e = prev_pow2_bits(block.size)`. If - * alignment padding eats every SC there, we drop to `e - 1`, - * which is guaranteed to fit: its smallest SC has size and - * alignment `2^(e-1)`, so worst-case `size + pad < 2^e <= - * block.size`. One drop is always enough. 
+ * The natural byte exponent is `prev_pow2_bits(block.size)`, + * which ranges over `[MIN_SIZE_BITS, bits::BITS)` once the + * size is a multiple of `UNIT_SIZE`. The internal exponent + * `e` is normalised by subtracting `MIN_SIZE_BITS`, so bin + * 0 always corresponds to the `UNIT_SIZE` block. + * + * If alignment padding eats every SC at the natural exponent we + * drop to `e - 1`, which is guaranteed to fit: its smallest SC + * has size and alignment `UNIT_SIZE << (e - 1)`, so worst-case + * `size + pad < UNIT_SIZE << e <= block.size`. One drop is + * always enough. * * Not `constexpr`: uses `bits::clz` via `bits::prev_pow2_bits`. */ SNMALLOC_FAST_PATH static size_t bin_index(range_t block) { - SNMALLOC_ASSERT(block.size >= 1); + SNMALLOC_ASSERT(block.size >= UNIT_SIZE); + SNMALLOC_ASSERT((block.size & (UNIT_SIZE - 1)) == 0); + SNMALLOC_ASSERT((block.base & (UNIT_SIZE - 1)) == 0); - size_t e = bits::prev_pow2_bits(block.size); + size_t e = bits::prev_pow2_bits(block.size) - MIN_SIZE_BITS; size_t offset = bin_offset_at(block.base, block.size, e); if (SNMALLOC_UNLIKELY(offset == BINS_PER_EXP)) { @@ -194,19 +226,30 @@ namespace snmalloc return table_.exp_bin_base[e] + offset; } - /// Largest `n_chunks` legal for `carve` / `Bitmap::find_for_request`. - static constexpr size_t max_supported_chunks() + /// Largest byte size legal for `carve` / `Bitmap::find_for_request`. + static constexpr size_t max_supported_size() { - return bits::from_exp_mant(MAX_SC - 1); + return bits::from_exp_mant(MAX_SC - 1); } /** - * Carve a free block into pre-pad / aligned request / post-pad. + * Carve a free block into pre-pad / aligned request / post-pad, + * delivering exactly `n` bytes to the caller. + * + * The carve_info for `n` is used only to find a valid alignment + * and to verify that the block has room: `req.base` is aligned + * to `info.align` (the natural alignment of the SC that covers + * `n`), and the block must contain `info.size` bytes from that + * point. 
Only `n` bytes are handed out, and the leftover + * `info.size - n` bytes roll into `post`. This keeps SC rounding + * as an arena-internal detail: callers always receive exactly + * what they asked for. * * Preconditions (caller must have used `Bitmap::find_for_request` * to locate a servable bin): - * - `block.size > 0`, `n_chunks` in `[1, max_supported_chunks()]`, - * `block` large enough to fit the SC after aligning up. + * - `block.size > 0`, `n` in `[UNIT_SIZE, max_supported_size()]` + * and a multiple of `UNIT_SIZE`, `block` large enough to fit + * the SC after aligning up. * - `block.base + block.size` does not wrap. * * Pure: does not touch the bitmap or any tree. Either or both @@ -215,30 +258,36 @@ namespace snmalloc * `req.base + req.size == post.base` (keeps caller adjacency * checks simple). */ - SNMALLOC_FAST_PATH static carve_t carve(range_t block, size_t n_chunks) + SNMALLOC_FAST_PATH static carve_t carve(range_t block, size_t n) { - SNMALLOC_ASSERT(n_chunks >= 1); - SNMALLOC_ASSERT(n_chunks <= max_supported_chunks()); + SNMALLOC_ASSERT(n >= UNIT_SIZE); + SNMALLOC_ASSERT((n & (UNIT_SIZE - 1)) == 0); + SNMALLOC_ASSERT(n <= max_supported_size()); SNMALLOC_ASSERT(block.size > 0); + SNMALLOC_ASSERT((block.size & (UNIT_SIZE - 1)) == 0); + SNMALLOC_ASSERT((block.base & (UNIT_SIZE - 1)) == 0); // Combined with the servability precondition, non-wrapping end // ensures the alignment-up below does not wrap either. SNMALLOC_ASSERT(block.base + block.size >= block.base); - const carve_info_t& info = carve_info_for_request(n_chunks); + const carve_info_t& info = carve_info_for_request(n); size_t req_base = - (block.base + (info.align_chunks - 1)) & ~(info.align_chunks - 1); + (block.base + (info.align - 1)) & ~(info.align - 1); size_t pre_size = req_base - block.base; + // Servability precondition: `info.size >= n` bytes fit after + // `pre`. We only hand out `n`; the remainder (`info.size - n`) + // joins `post`. 
SNMALLOC_ASSERT(pre_size <= block.size); - SNMALLOC_ASSERT(block.size - pre_size >= info.size_chunks); + SNMALLOC_ASSERT(block.size - pre_size >= info.size); - size_t post_base = req_base + info.size_chunks; + size_t post_base = req_base + n; size_t post_size = (block.base + block.size) - post_base; carve_t result; result.pre = {block.base, pre_size}; - result.req = {req_base, info.size_chunks}; + result.req = {req_base, n}; result.post = {post_base, post_size}; return result; } @@ -251,8 +300,8 @@ namespace snmalloc * Three-method API: * - `add(range_t)`: classify a block and set its bin's bit * (idempotent on the bit; returns the bin id). - * - `find_for_request(n_chunks)`: smallest set bin whose blocks - * all serve `n_chunks`, or `SIZE_MAX` if none. + * - `find_for_request(n)`: smallest set bin whose blocks + * all serve `n`, or `SIZE_MAX` if none. * - `clear(bin_id)`: mark empty. Caller must ensure the bin's * tree is actually empty; the bitmap does not track contents. * @@ -261,7 +310,7 @@ namespace snmalloc */ class Bitmap { - friend struct BackendArenaBinsTestAccess; + friend struct BackendArenaBinsTestAccess; public: /// Strict upper bound on bin ids `bin_index` produces. Exposed @@ -282,8 +331,8 @@ namespace snmalloc */ SNMALLOC_FAST_PATH size_t add(range_t block) { - SNMALLOC_ASSERT(block.size >= 1); - SNMALLOC_ASSERT(block.size <= max_supported_chunks()); + SNMALLOC_ASSERT(block.size >= UNIT_SIZE); + SNMALLOC_ASSERT(block.size <= max_supported_size()); size_t bin_id = bin_index(block); SNMALLOC_ASSERT(bin_id < TOTAL_BINS); words_[bin_id / bits::BITS] |= @@ -310,17 +359,18 @@ namespace snmalloc } /** - * Smallest bin id whose set blocks all serve `n_chunks`, or - * `SIZE_MAX` if none. `n_chunks` in `[1, max_supported_chunks()]`. + * Smallest bin id whose set blocks all serve `n`, or `SIZE_MAX` + * if none. `n` in `[UNIT_SIZE, max_supported_size()]` and a + * multiple of `UNIT_SIZE`. 
* * Invariant (static_assert below): `BINS_PER_EXP <= bits::BITS`, * so the within-exponent range fits inside one word and the * search straddles at most one word boundary. After the second * word, every remaining word is purely higher-exponent. */ - SNMALLOC_FAST_PATH size_t find_for_request(size_t n_chunks) const + SNMALLOC_FAST_PATH size_t find_for_request(size_t n) const { - const bitmap_info_t& info = bitmap_info_for_request(n_chunks); + const bitmap_info_t& info = bitmap_info_for_request(n); SNMALLOC_ASSERT(info.start_word < NUM_BITMAP_WORDS); // First word: start bin + any within-exp neighbours in same word. @@ -560,9 +610,10 @@ namespace snmalloc }(); /** - * Within-exponent bin offset for a block at `addr_chunks` of length - * `n_chunks` at exponent `e`. Returns `BINS_PER_EXP` (sentinel) if - * no mantissa at this exponent fits. + * Within-exponent bin offset for a block at byte address `addr` + * of byte length `n` at internal exponent `e`. Returns + * `BINS_PER_EXP` (sentinel) if no mantissa at this exponent + * fits. * * Walks `m_top` from `MANTISSAS_PER_EXP - 1` down. The first * fitting `m_top` is the largest mantissa this block can serve; @@ -579,7 +630,7 @@ namespace snmalloc * exponent and 1 at the fallback exponent. */ SNMALLOC_FAST_PATH static size_t - bin_offset_at(size_t addr_chunks, size_t n_chunks, size_t e) + bin_offset_at(size_t addr, size_t n, size_t e) { size_t first = table_.exp_first_sc[e]; size_t past = table_.exp_first_sc[e + 1]; @@ -592,13 +643,13 @@ namespace snmalloc if (first + m >= past) return false; const carve_info_t& ci = table_.carve_info[first + m]; - // Optimisation: near the bottom of n_chunks's exponent range - // the higher-mantissa sizes already exceed n_chunks and cannot - // fit regardless of alignment. Skips the align_up below. - if (n_chunks < ci.size_chunks) + // Optimisation: near the bottom of n's exponent range the + // higher-mantissa sizes already exceed n and cannot fit + // regardless of alignment. 
Skips the align_up below. + if (n < ci.size) return false; - size_t pad = bits::align_up(addr_chunks, ci.align_chunks) - addr_chunks; - return n_chunks - ci.size_chunks >= pad; + size_t pad = bits::align_up(addr, ci.align) - addr; + return n - ci.size >= pad; }; for (size_t m_top = MANTISSAS_PER_EXP; m_top-- > 0;) @@ -655,18 +706,23 @@ namespace snmalloc // the only place that knows the size class encoding; once we've // pinned down the raw boundaries, everything else is table lookup. // + // `e` here is the internal (normalised) exponent: an SC's + // `e == 0` corresponds to byte size `UNIT_SIZE = 1 << MIN_SIZE_BITS`. + // // Note: `exp_first_sc` does NOT have a uniform stride. At the // bottom of the encoding the low regime (no leading-1 bit; the // `b = (e == 0) ? 0 : 1` branch in `to_exp_mant_const`) squashes - // multiple BackendArenaBins exponents into encoded-exponent 0. + // multiple internal exponents into encoded-exponent 0. // For `B = 2` the counts are 1, 2, 4, 4, 4, ... - for (size_t e = 0; e < bits::BITS; e++) + constexpr size_t MAX_E = bits::BITS - MIN_SIZE_BITS; + for (size_t e = 0; e < MAX_E; e++) { - exp_first_sc[e] = bits::to_exp_mant_const(size_t(1) << e); + exp_first_sc[e] = + bits::to_exp_mant_const(size_t(1) << (e + MIN_SIZE_BITS)); exp_bin_base[e] = e * BINS_PER_EXP; } - exp_first_sc[bits::BITS] = MAX_SC; - exp_bin_base[bits::BITS] = bits::BITS * BINS_PER_EXP; + exp_first_sc[MAX_E] = MAX_SC; + exp_bin_base[MAX_E] = MAX_E * BINS_PER_EXP; // Per-sc records. Size and alignment come straight from the // size-class scheme (via from_exp_mant); start_word, first_mask, @@ -675,14 +731,14 @@ namespace snmalloc // the search hot path is two ANDs. 
for (size_t sc = 0; sc < MAX_SC; sc++) { - size_t size = bits::from_exp_mant(sc); - size_t e = bits::prev_pow2_bits_const(size); + size_t size = bits::from_exp_mant(sc); + size_t e = bits::prev_pow2_bits_const(size) - MIN_SIZE_BITS; size_t m = sc - exp_first_sc[e]; size_t start_bit = exp_bin_base[e] + start_bin_offset_for_m(m); size_t mask = serve_mask_for_m(m); size_t shift = start_bit & (bits::BITS - 1); - carve_info[sc].size_chunks = size; - carve_info[sc].align_chunks = size & (~size + 1); + carve_info[sc].size = size; + carve_info[sc].align = size & (~size + 1); bitmap_info[sc].start_word = start_bit / bits::BITS; bitmap_info[sc].first_mask = mask << shift; // shift == 0: no within-exponent carry; the second word is diff --git a/src/snmalloc/backend_helpers/backend_arena_range.h b/src/snmalloc/backend_helpers/backend_arena_range.h new file mode 100644 index 000000000..8c50b6c2a --- /dev/null +++ b/src/snmalloc/backend_helpers/backend_arena_range.h @@ -0,0 +1,368 @@ +#pragma once + +#include "backend_arena.h" +#include "empty_range.h" +#include "range_helpers.h" + +namespace snmalloc +{ + /** + * PagemapRep — Rep for `BackendArena` over a Pagemap. + * + * Each free block uses three pagemap entries at unit-aligned offsets: + * + * Unit 0 (addr): bin-tree node + variant tag. + * Unit 1 (addr + UNIT_SIZE): range-tree node (size ≥ 2 units). + * Unit 2 (addr + 2*UNIT_SIZE): large chunk count (size ≥ 3 units). + * + * Bit-layout decisions for tree nodes are private to this class: + * - Bits 0–7 of each pagemap word are reserved by the pagemap. + * - Bit 8 is the red bit (both trees). + * - Bits 9–10 of Word::One at unit 0 hold the variant tag. + * - Large chunk count is stored shifted left by 8 in Word::One of + * unit 2. + * + * `MIN_SIZE_BITS` is the log2 size of the allocation unit (= pagemap + * stride); the caller passes whatever unit it uses (snmalloc's global + * `MIN_CHUNK_BITS` in the in-tree pipeline). 
+ * `MAX_SIZE_BITS` is the log2 of the (exclusive) upper bound on block + * size in bytes; used here only to verify that the largest chunk + * count fits in a shifted pagemap word. + */ + template< + SNMALLOC_CONCEPT(IsWritablePagemap) Pagemap, + size_t MIN_SIZE_BITS, + size_t MAX_SIZE_BITS> + class PagemapRep + { + using Entry = typename Pagemap::Entry; + + static constexpr uintptr_t UNIT_SIZE = uintptr_t(1) << MIN_SIZE_BITS; + + // Bit positions inside a pagemap word. Bits 0–7 are reserved by the + // pagemap; tree-node and large-size encodings start at bit 8. + static constexpr unsigned RED_BIT_POS = 8; + static constexpr unsigned VARIANT_SHIFT = 9; + static constexpr unsigned VARIANT_BITS = 2; + + // Shift used to encode the large-size chunk count in Word::One of + // unit 2. + static constexpr size_t LARGE_SIZE_SHIFT = 8; + + static constexpr uintptr_t RED_BIT = uintptr_t(1) << RED_BIT_POS; + static constexpr uintptr_t VARIANT_MASK = + ((uintptr_t(1) << VARIANT_BITS) - 1) << VARIANT_SHIFT; + static constexpr uintptr_t BIN_META_MASK = RED_BIT | VARIANT_MASK; + static constexpr uintptr_t RANGE_META_MASK = RED_BIT; + + static_assert(MAX_SIZE_BITS > MIN_SIZE_BITS); + static_assert( + (MAX_SIZE_BITS - MIN_SIZE_BITS) + LARGE_SIZE_SHIFT <= bits::BITS, + "Shifted large-size field must fit in a pagemap word."); + static_assert((RED_BIT & VARIANT_MASK) == 0); + static_assert(BIN_META_MASK < UNIT_SIZE); + static_assert( + Entry::is_backend_allowed_value(Entry::Word::One, BIN_META_MASK)); + static_assert(Entry::is_backend_allowed_value(Entry::Word::Two, RED_BIT)); + + using Word = typename Entry::Word; + using Handle = typename Entry::BackendStateWordRef; + + /** + * Pagemap word for the `UnitIdx`-th unit of the block at `addr`. + * Centralises the layout decision "which pagemap entry encodes + * data for unit i". Used by `TreeRep::ref` and by the variant / + * large-size accessors below. 
+ */ + template + static Handle word_at(uintptr_t addr, Word w) + { + auto& entry = Pagemap::template get_metaentry_mut( + address_cast(addr + UnitIdx * UNIT_SIZE)); + return entry.get_backend_word(w); + } + + /** + * RBTree Rep shared by `BinRep` and `RangeRep`. `UnitIdx` selects + * which unit (0 or 1) of the block holds this Rep's tree node; the + * Rep's pagemap words live at `addr + UnitIdx * UNIT_SIZE`. + * `MetaMask` covers the bits in that node's words that are owned by + * this Rep (red + any tag bits) and must be preserved by get/set. + */ + template + struct TreeRep + { + using Handle = PagemapRep::Handle; + using Contents = uintptr_t; + + static constexpr Contents null = 0; + static constexpr Contents root = 0; + + static Handle ref(bool direction, Contents k) + { + static const Contents null_entry = 0; + if (SNMALLOC_UNLIKELY(k == 0)) + return Handle{const_cast(&null_entry)}; + return word_at(k, direction ? Word::One : Word::Two); + } + + static Contents get(Handle h) + { + return h.get() & ~MetaMask; + } + + static void set(Handle h, Contents v) + { + h = v | (h.get() & MetaMask); + } + + static bool is_red(Contents k) + { + return (ref(true, k).get() & RED_BIT) == RED_BIT; + } + + static void set_red(Contents k, bool new_is_red) + { + if (new_is_red != is_red(k)) + { + auto h = ref(true, k); + h = h.get() ^ RED_BIT; + } + SNMALLOC_ASSERT(is_red(k) == new_is_red); + } + + static bool compare(Contents k1, Contents k2) + { + return k1 > k2; + } + + static bool equal(Contents k1, Contents k2) + { + return k1 == k2; + } + + static uintptr_t printable(Contents k) + { + return k; + } + + static uintptr_t printable(Handle h) + { + return h.printable_address(); + } + + static const char* name() + { + return Name; + } + }; + + static constexpr char BIN_REP_NAME[] = "PagemapBinRep"; + static constexpr char RANGE_REP_NAME[] = "PagemapRangeRep"; + + public: + using BinRep = TreeRep<0, BIN_META_MASK, BIN_REP_NAME>; + using RangeRep = TreeRep<1, RANGE_META_MASK, 
RANGE_REP_NAME>; + + static BackendArenaVariant get_variant(uintptr_t addr) + { + auto w = word_at<0>(addr, Word::One); + return static_cast( + (w.get() & VARIANT_MASK) >> VARIANT_SHIFT); + } + + static void set_variant(uintptr_t addr, BackendArenaVariant v) + { + auto w = word_at<0>(addr, Word::One); + w = (w.get() & ~VARIANT_MASK) | + (static_cast(v) << VARIANT_SHIFT); + } + + static size_t get_large_size(uintptr_t addr) + { + // Stored as chunk count to keep the shifted field within a + // pagemap word (see LARGE_SIZE_SHIFT static_assert). Returns + // the byte size. + return (word_at<2>(addr, Word::One).get() >> LARGE_SIZE_SHIFT) + << MIN_SIZE_BITS; + } + + static void set_large_size(uintptr_t addr, size_t size) + { + SNMALLOC_ASSERT((size & (UNIT_SIZE - 1)) == 0); + word_at<2>(addr, Word::One) = (size >> MIN_SIZE_BITS) << LARGE_SIZE_SHIFT; + } + + static bool can_consolidate(uintptr_t higher_addr) + { + auto& entry = + Pagemap::template get_metaentry_mut(address_cast(higher_addr)); + return !entry.is_boundary(); + } + }; + + /** + * Range wrapper around BackendArena. Drop-in replacement for + * LargeBuddyRange in Pipe<...> compositions. 
+ */ + template< + size_t REFILL_SIZE_BITS, + size_t MAX_SIZE_BITS, + SNMALLOC_CONCEPT(IsWritablePagemap) Pagemap, + size_t MIN_REFILL_SIZE_BITS = 0> + class BackendArenaRange + { + static_assert( + REFILL_SIZE_BITS <= MAX_SIZE_BITS, "REFILL_SIZE_BITS > MAX_SIZE_BITS"); + static_assert( + MIN_REFILL_SIZE_BITS <= REFILL_SIZE_BITS, + "MIN_REFILL_SIZE_BITS > REFILL_SIZE_BITS"); + + static constexpr size_t REFILL_SIZE = bits::one_at_bit(REFILL_SIZE_BITS); + static constexpr size_t MIN_REFILL_SIZE = + bits::one_at_bit(MIN_REFILL_SIZE_BITS); + + public: + template> + class Type : public ContainsParent + { + using ContainsParent::parent; + + using PagemapRepT = PagemapRep; + + BackendArena arena; + size_t requested_total = 0; + + void parent_dealloc(uintptr_t addr, size_t size) + { + if constexpr (MAX_SIZE_BITS != (bits::BITS - 1)) + { + auto base = + capptr::Arena::unsafe_from(reinterpret_cast(addr)); + parent.dealloc_range(base, size); + } + else + { + SNMALLOC_CHECK(false && "Global range overflow should not happen"); + } + } + + void add_range(capptr::Arena base, size_t length) + { + // Parent ranges (e.g. mmap-backed PalRange) may return regions + // that are page-aligned but not chunk-aligned; trim to chunk + // boundaries on both ends before handing to the arena. 
+ uintptr_t lo = bits::align_up(base.unsafe_uintptr(), MIN_CHUNK_SIZE); + uintptr_t hi = + bits::align_down(base.unsafe_uintptr() + length, MIN_CHUNK_SIZE); + if (lo >= hi) + return; + auto [ov_addr, ov_size] = arena.add_block(lo, hi - lo); + if (ov_addr != 0) + parent_dealloc(ov_addr, ov_size); + } + + capptr::Arena refill(size_t size) + { + if (ParentRange::Aligned) + { + size_t refill_size = bits::min(REFILL_SIZE, requested_total); + refill_size = bits::max(refill_size, MIN_REFILL_SIZE); + refill_size = bits::max(refill_size, size); + refill_size = bits::next_pow2(refill_size); + + auto refill_range = parent.alloc_range(refill_size); + if (refill_range != nullptr) + { + requested_total += refill_size; + add_range(pointer_offset(refill_range, size), refill_size - size); + } + return refill_range; + } + + bool overflow = false; + size_t needed_size = bits::umul(size, 2, overflow); + if (overflow) + { + return nullptr; + } + + auto refill_size = bits::max(needed_size, REFILL_SIZE); + while (needed_size <= refill_size) + { + auto refill = parent.alloc_range(refill_size); + + if (refill != nullptr) + { + requested_total += refill_size; + add_range(refill, refill_size); + + SNMALLOC_ASSERT(refill_size < bits::one_at_bit(MAX_SIZE_BITS)); + static_assert( + (REFILL_SIZE < bits::one_at_bit(MAX_SIZE_BITS)) || + ParentRange::Aligned, + "Required to prevent overflow."); + + return alloc_range(size); + } + + refill_size >>= 1; + } + + return nullptr; + } + + public: + static constexpr bool Aligned = true; + static constexpr bool ConcurrencySafe = false; + using ChunkBounds = capptr::bounds::Arena; + static_assert( + stl::is_same_v); + + constexpr Type() = default; + + capptr::Arena alloc_range(size_t size) + { + SNMALLOC_ASSERT(size >= MIN_CHUNK_SIZE); + SNMALLOC_ASSERT((size & (MIN_CHUNK_SIZE - 1)) == 0); + + if (size >= bits::mask_bits(MAX_SIZE_BITS)) + { + if (ParentRange::Aligned) + return parent.alloc_range(size); + + return nullptr; + } + + uintptr_t addr = 
arena.remove_block(size); + if (addr != 0) + { + return capptr::Arena::unsafe_from( + reinterpret_cast(addr)); + } + + return refill(size); + } + + void dealloc_range(capptr::Arena base, size_t size) + { + SNMALLOC_ASSERT(size >= MIN_CHUNK_SIZE); + SNMALLOC_ASSERT((size & (MIN_CHUNK_SIZE - 1)) == 0); + + if constexpr (MAX_SIZE_BITS != (bits::BITS - 1)) + { + if (size >= bits::mask_bits(MAX_SIZE_BITS)) + { + parent_dealloc(base.unsafe_uintptr(), size); + return; + } + } + + auto [ov_addr, ov_size] = + arena.add_block(base.unsafe_uintptr(), size); + if (ov_addr != 0) + parent_dealloc(ov_addr, ov_size); + } + }; + }; +} // namespace snmalloc diff --git a/src/snmalloc/backend_helpers/backend_helpers.h b/src/snmalloc/backend_helpers/backend_helpers.h index ee339337b..10382c611 100644 --- a/src/snmalloc/backend_helpers/backend_helpers.h +++ b/src/snmalloc/backend_helpers/backend_helpers.h @@ -2,6 +2,7 @@ #include "../mem/mem.h" #include "authmap.h" +#include "backend_arena_range.h" #include "buddy.h" #include "commitrange.h" #include "commonconfig.h" diff --git a/src/test/func/backend_arena/backend_arena.cc b/src/test/func/backend_arena/backend_arena.cc index fc605bbdf..242b984f7 100644 --- a/src/test/func/backend_arena/backend_arena.cc +++ b/src/test/func/backend_arena/backend_arena.cc @@ -64,7 +64,7 @@ namespace snmalloc // Each chunk-aligned address maps to a mock_entry via its chunk index. // word1/word2 hold bin-tree children; range_word1/range_word2 hold - // range-tree children. variant and large_size_chunks hold metadata. + // range-tree children. variant and large_size hold metadata. struct mock_entry { uintptr_t word1{0}; @@ -72,7 +72,7 @@ namespace snmalloc uintptr_t range_word1{0}; uintptr_t range_word2{0}; BackendArenaVariant variant{BackendArenaVariant::Min}; - size_t large_size_chunks{0}; + size_t large_size{0}; }; // Size the array for the largest test arena + trailing room. 
@@ -95,7 +95,7 @@ namespace snmalloc // Inner RBTree Rep used by both MockRep::BinRep and MockRep::RangeRep. // Tag selects which pair of fields in mock_entry holds the tree pointers. // The red bit is packed into bit 8 of the stored word (matching the - // production PagemapRep layout, but defined privately here). + // PagemapRep layout, but defined privately here). template struct MockTreeRep { @@ -105,7 +105,8 @@ namespace snmalloc static constexpr Contents null = 0; static constexpr Contents root = 0; - static constexpr uintptr_t RED_BIT = uintptr_t(1) << 8; + static constexpr unsigned RED_BIT_POS = 8; + static constexpr uintptr_t RED_BIT = uintptr_t(1) << RED_BIT_POS; static_assert(RED_BIT < MIN_CHUNK_SIZE); static Handle ref(bool direction, Contents k) @@ -185,14 +186,14 @@ namespace snmalloc mock_store[mock_index(addr)].variant = v; } - static size_t get_large_size_chunks(uintptr_t addr) + static size_t get_large_size(uintptr_t addr) { - return mock_store[mock_index(addr)].large_size_chunks; + return mock_store[mock_index(addr)].large_size; } - static void set_large_size_chunks(uintptr_t addr, size_t s) + static void set_large_size(uintptr_t addr, size_t s) { - mock_store[mock_index(addr)].large_size_chunks = s; + mock_store[mock_index(addr)].large_size = s; } static bool can_consolidate(uintptr_t) @@ -229,12 +230,19 @@ namespace snmalloc return static_cast(chunk_idx) << MIN_CHUNK_BITS; } + // Convenience: byte size from chunk count. + static constexpr size_t chunk_size(size_t n_chunks) + { + return n_chunks << MIN_CHUNK_BITS; + } + // ---- Test types ---- + // K = number of address bits the arena covers above MIN_CHUNK_BITS. // K=6 → arena of 64 chunks, K=8 → 256 chunks, K=10 → 1024 chunks. 
template - using Arena = BackendArena; + using Arena = BackendArena; - using Bins = BackendArenaBins<2>; + using Bins = BackendArenaBins<2, MIN_CHUNK_BITS>; // ================================================================== // (A) Accessor round-trips @@ -263,8 +271,8 @@ namespace snmalloc for (size_t s : {3, 7, 15, 63, 255, 1000}) { - MockRep::set_large_size_chunks(a, s); - SNMALLOC_ASSERT(MockRep::get_large_size_chunks(a) == s); + MockRep::set_large_size(a, s); + SNMALLOC_ASSERT(MockRep::get_large_size(a) == s); } printf(" Large-size round-trip: OK\n"); @@ -316,28 +324,28 @@ namespace snmalloc uintptr_t a2 = chunk_addr(20); uintptr_t a3 = chunk_addr(30); - arena.add_block(a1, 3); + arena.add_block(a1, chunk_size(3)); arena.check_invariant(true); - arena.add_block(a2, 5); + arena.add_block(a2, chunk_size(5)); arena.check_invariant(true); - arena.add_block(a3, 1); + arena.add_block(a3, chunk_size(1)); arena.check_invariant(true); // Remove them. - auto r1 = arena.remove_block(1); - SNMALLOC_ASSERT(r1.first != 0); + auto r1 = arena.remove_block(chunk_size(1)); + SNMALLOC_ASSERT(r1 != 0); UNUSED(r1); arena.check_invariant(true); - auto r2 = arena.remove_block(3); - SNMALLOC_ASSERT(r2.first != 0); + auto r2 = arena.remove_block(chunk_size(3)); + SNMALLOC_ASSERT(r2 != 0); UNUSED(r2); arena.check_invariant(true); - auto r3 = arena.remove_block(5); - SNMALLOC_ASSERT(r3.first != 0); + auto r3 = arena.remove_block(chunk_size(5)); + SNMALLOC_ASSERT(r3 != 0); UNUSED(r3); arena.check_invariant(true); @@ -379,7 +387,7 @@ namespace snmalloc for (auto& b : blocks) { - auto result = arena.add_block(chunk_addr(b.chunk_idx), b.size); + auto result = arena.add_block(chunk_addr(b.chunk_idx), chunk_size(b.size)); SNMALLOC_ASSERT(result.first == 0 && result.second == 0); UNUSED(result); arena.check_invariant(true); @@ -397,24 +405,23 @@ namespace snmalloc Arena<8> arena; // Insert 3 blocks of size 5 at non-adjacent locations. 
- arena.add_block(chunk_addr(10), 5); - arena.add_block(chunk_addr(20), 5); - arena.add_block(chunk_addr(30), 5); + arena.add_block(chunk_addr(10), chunk_size(5)); + arena.add_block(chunk_addr(20), chunk_size(5)); + arena.add_block(chunk_addr(30), chunk_size(5)); arena.check_invariant(true); // Remove 3 exact-size blocks. for (int i = 0; i < 3; i++) { - auto r = arena.remove_block(5); - SNMALLOC_ASSERT(r.first != 0); - SNMALLOC_ASSERT(r.second == 5); + auto r = arena.remove_block(chunk_size(5)); + SNMALLOC_ASSERT(r != 0); UNUSED(r); arena.check_invariant(true); } // Arena should be empty now. - auto r = arena.remove_block(1); - SNMALLOC_ASSERT(r.first == 0); + auto r = arena.remove_block(chunk_size(1)); + SNMALLOC_ASSERT(r == 0); UNUSED(r); printf(" remove_block exact: OK\n"); @@ -426,32 +433,31 @@ namespace snmalloc Arena<8> arena; // Insert one block of size 10. - arena.add_block(chunk_addr(10), 10); + arena.add_block(chunk_addr(10), chunk_size(10)); arena.check_invariant(true); - // Request size 3 — should carve from the 10-chunk block. - auto r = arena.remove_block(3); - SNMALLOC_ASSERT(r.first != 0); - // The carved piece should be exactly what Bins::carve produces. - auto carved = Bins::carve({10, 10}, 3); - SNMALLOC_ASSERT(r.second == carved.req.size); + // Request size 3 chunks — should carve from the 10-chunk block. + auto r = arena.remove_block(chunk_size(3)); + SNMALLOC_ASSERT(r != 0); + // The carved piece's address should match what Bins::carve produces. + auto carved = Bins::carve({chunk_addr(10), chunk_size(10)}, chunk_size(3)); UNUSED(r); arena.check_invariant(true); // The remainders should still be in the arena. // We can try to remove everything that's left. 
- size_t remaining = 10 - carved.req.size; + size_t remaining = chunk_size(10) - carved.req.size; while (remaining > 0) { - auto r2 = arena.remove_block(1); - SNMALLOC_ASSERT(r2.first != 0); + auto r2 = arena.remove_block(chunk_size(1)); + SNMALLOC_ASSERT(r2 != 0); arena.check_invariant(true); - remaining -= r2.second; + remaining -= chunk_size(1); } // Should be empty. - auto r3 = arena.remove_block(1); - SNMALLOC_ASSERT(r3.first == 0); + auto r3 = arena.remove_block(chunk_size(1)); + SNMALLOC_ASSERT(r3 == 0); UNUSED(r3); printf(" remove_block carving: OK\n"); @@ -462,11 +468,12 @@ namespace snmalloc // ================================================================== // Helper: insert a block, verify invariant, return nothing. - template + // `size_in_chunks` is a chunk count; converted to bytes internally. + template static void - add_and_check(Arena& arena, size_t chunk_idx, size_t size_chunks) + add_and_check(ArenaT& arena, size_t chunk_idx, size_t size_in_chunks) { - auto result = arena.add_block(chunk_addr(chunk_idx), size_chunks); + auto result = arena.add_block(chunk_addr(chunk_idx), chunk_size(size_in_chunks)); SNMALLOC_ASSERT(result.first == 0 && result.second == 0); UNUSED(result); arena.check_invariant(true); @@ -474,16 +481,16 @@ namespace snmalloc // Drain the arena by removing 1-chunk blocks until empty. // Returns the total chunks removed. 
- template - static size_t drain_arena(Arena& arena) + template + static size_t drain_arena(ArenaT& arena) { size_t total = 0; while (true) { - auto r = arena.remove_block(1); - if (r.first == 0) + auto r = arena.remove_block(chunk_size(1)); + if (r == 0) break; - total += r.second; + total += 1; arena.check_invariant(true); } return total; @@ -625,13 +632,13 @@ namespace snmalloc Arena<8> arena; // Odd address: chunk 11, size 2 - arena.add_block(chunk_addr(11), 2); + arena.add_block(chunk_addr(11), chunk_size(2)); SNMALLOC_ASSERT( MockRep::get_variant(chunk_addr(11)) == BackendArenaVariant::OddTwo); arena.check_invariant(true); // Even address: chunk 20, size 2 - arena.add_block(chunk_addr(20), 2); + arena.add_block(chunk_addr(20), chunk_size(2)); SNMALLOC_ASSERT( MockRep::get_variant(chunk_addr(20)) == BackendArenaVariant::EvenTwo); arena.check_invariant(true); @@ -662,17 +669,17 @@ namespace snmalloc Arena<8> arena; // Add OddTwo block at chunk 11 (odd, size 2). - arena.add_block(chunk_addr(11), 2); + arena.add_block(chunk_addr(11), chunk_size(2)); arena.check_invariant(true); // Add a size-1 block at chunk 14, non-adjacent. - arena.add_block(chunk_addr(14), 1); + arena.add_block(chunk_addr(14), chunk_size(1)); arena.check_invariant(true); // Now add chunk 13 (size 1). Its successor check should NOT // pick up chunk 11's OddTwo entry via contains_min. It should // just insert as size 1. - arena.add_block(chunk_addr(13), 1); + arena.add_block(chunk_addr(13), chunk_size(1)); arena.check_invariant(true); // Chunk 13 should consolidate with chunk 14 (min successor), @@ -692,19 +699,18 @@ namespace snmalloc Arena<8> arena; // Add OddTwo at chunk 11 (odd, size 2 → chunks 11-12). - arena.add_block(chunk_addr(11), 2); + arena.add_block(chunk_addr(11), chunk_size(2)); arena.check_invariant(true); // Add adjacent block at chunk 13 (size 1). // Range tree finds OddTwo at 11 as predecessor? No — chunk 13's // predecessor in range tree is chunk 11 (size 2, ends at 13). 
// So they should consolidate into size 3 at chunk 11. - arena.add_block(chunk_addr(13), 1); + arena.add_block(chunk_addr(13), chunk_size(1)); arena.check_invariant(true); - auto r = arena.remove_block(3); - SNMALLOC_ASSERT(r.first == chunk_addr(11)); - SNMALLOC_ASSERT(r.second == 3); + auto r = arena.remove_block(chunk_size(3)); + SNMALLOC_ASSERT(r == chunk_addr(11)); UNUSED(r); printf(" OddTwo consolidation (successor): OK\n"); @@ -717,17 +723,16 @@ namespace snmalloc Arena<8> arena; // Add OddTwo at chunk 11 (odd, size 2 → chunks 11-12). - arena.add_block(chunk_addr(11), 2); + arena.add_block(chunk_addr(11), chunk_size(2)); arena.check_invariant(true); // Add block at chunk 10 (size 1). OddTwo at 11 is the successor // in the range tree → consolidate into size 3 at chunk 10. - arena.add_block(chunk_addr(10), 1); + arena.add_block(chunk_addr(10), chunk_size(1)); arena.check_invariant(true); - auto r = arena.remove_block(3); - SNMALLOC_ASSERT(r.first == chunk_addr(10)); - SNMALLOC_ASSERT(r.second == 3); + auto r = arena.remove_block(chunk_size(3)); + SNMALLOC_ASSERT(r == chunk_addr(10)); UNUSED(r); printf(" OddTwo consolidation (predecessor): OK\n"); @@ -740,24 +745,22 @@ namespace snmalloc Arena<8> arena; // Add OddTwo at chunk 11 (odd, size 2). - arena.add_block(chunk_addr(11), 2); + arena.add_block(chunk_addr(11), chunk_size(2)); arena.check_invariant(true); // Remove 1 chunk. Should carve from the OddTwo block. - auto r = arena.remove_block(1); - SNMALLOC_ASSERT(r.first != 0); - SNMALLOC_ASSERT(r.second == 1); + auto r = arena.remove_block(chunk_size(1)); + SNMALLOC_ASSERT(r != 0); arena.check_invariant(true); // The remainder (1 chunk) should be Min variant. - auto r2 = arena.remove_block(1); - SNMALLOC_ASSERT(r2.first != 0); - SNMALLOC_ASSERT(r2.second == 1); + auto r2 = arena.remove_block(chunk_size(1)); + SNMALLOC_ASSERT(r2 != 0); UNUSED(r, r2); // Arena should be empty now. 
- auto r3 = arena.remove_block(1); - SNMALLOC_ASSERT(r3.first == 0); + auto r3 = arena.remove_block(chunk_size(1)); + SNMALLOC_ASSERT(r3 == 0); UNUSED(r3); printf(" OddTwo remove + carve: OK\n"); @@ -777,7 +780,7 @@ namespace snmalloc // Step 1: add even-indexed chunks as individual blocks (8 blocks). for (size_t i = 0; i < 16; i += 2) { - arena.add_block(chunk_addr(BASE + i), 1); + arena.add_block(chunk_addr(BASE + i), chunk_size(1)); arena.check_invariant(true); } @@ -785,7 +788,7 @@ namespace snmalloc // even-indexed neighbours. The last add completes the arena. for (size_t i = 1; i < 16; i += 2) { - arena.add_block(chunk_addr(BASE + i), 1); + arena.add_block(chunk_addr(BASE + i), chunk_size(1)); // Don't check invariant on the last add — it returns overflow. if (i < 15) { @@ -794,8 +797,8 @@ namespace snmalloc } // The last add should have triggered overflow (16 chunks = 2^4). - auto r = arena.remove_block(1); - SNMALLOC_ASSERT(r.first == 0); + auto r = arena.remove_block(chunk_size(1)); + SNMALLOC_ASSERT(r == 0); UNUSED(r); printf(" Overflow (arena-scale consolidation): OK\n"); @@ -809,17 +812,17 @@ namespace snmalloc constexpr size_t BASE = 16; - arena.add_block(chunk_addr(BASE), 8); + arena.add_block(chunk_addr(BASE), chunk_size(8)); arena.check_invariant(true); // Adding [BASE+8, BASE+16) consolidates to 16 chunks = 2^4 → overflow. - auto r = arena.add_block(chunk_addr(BASE + 8), 8); + auto r = arena.add_block(chunk_addr(BASE + 8), chunk_size(8)); SNMALLOC_ASSERT(r.first == chunk_addr(BASE)); - SNMALLOC_ASSERT(r.second == 16); + SNMALLOC_ASSERT(r.second == chunk_size(16)); UNUSED(r); - auto r2 = arena.remove_block(1); - SNMALLOC_ASSERT(r2.first == 0); + auto r2 = arena.remove_block(chunk_size(1)); + SNMALLOC_ASSERT(r2 == 0); UNUSED(r2); printf(" Overflow precise: OK\n"); @@ -889,23 +892,25 @@ namespace snmalloc // addr_chunks is oracle-relative (without base offset). 
std::pair remove(size_t n_chunks) { - if (n_chunks == 0 || n_chunks > Bins::max_supported_chunks()) + size_t n_bytes = n_chunks << MIN_CHUNK_BITS; + if (n_bytes == 0 || n_bytes > Bins::max_supported_size()) return {0, 0}; // Mirror the arena exactly: build a bitmap using arena-offset - // addresses (so bin classification matches), then find_for_request. + // byte addresses (so bin classification matches), then find_for_request. typename Bins::Bitmap bm{}; std::map::iterator>> by_bin; for (auto it = ranges.begin(); it != ranges.end(); ++it) { - // Use base-offset address for bin classification. - Bins::range_t r{base_offset + it->addr, it->size}; + typename Bins::range_t r{ + (base_offset + it->addr) << MIN_CHUNK_BITS, + it->size << MIN_CHUNK_BITS}; size_t bin = bm.add(r); by_bin[bin].push_back(it); } - size_t bin_id = bm.find_for_request(n_chunks); + size_t bin_id = bm.find_for_request(n_bytes); if (bin_id == SIZE_MAX) return {0, 0}; @@ -920,15 +925,22 @@ namespace snmalloc OracleRange block = *best_it; ranges.erase(best_it); - // Carve using base-offset address. 
- auto carved = - Bins::carve({base_offset + block.addr, block.size}, n_chunks); + auto carved = Bins::carve( + {(base_offset + block.addr) << MIN_CHUNK_BITS, + block.size << MIN_CHUNK_BITS}, + n_bytes); if (carved.pre.size != 0) - ranges.insert({carved.pre.base - base_offset, carved.pre.size}); + ranges.insert( + {(carved.pre.base >> MIN_CHUNK_BITS) - base_offset, + carved.pre.size >> MIN_CHUNK_BITS}); if (carved.post.size != 0) - ranges.insert({carved.post.base - base_offset, carved.post.size}); + ranges.insert( + {(carved.post.base >> MIN_CHUNK_BITS) - base_offset, + carved.post.size >> MIN_CHUNK_BITS}); - return {carved.req.base - base_offset, carved.req.size}; + return { + (carved.req.base >> MIN_CHUNK_BITS) - base_offset, + carved.req.size >> MIN_CHUNK_BITS}; } bool empty() const @@ -1011,7 +1023,7 @@ namespace snmalloc for (size_t j = start; j < start + size; j++) allocated[j] = false; - auto result = arena.add_block(chunk_addr(BASE + start), size); + auto result = arena.add_block(chunk_addr(BASE + start), chunk_size(size)); oracle.add(start, size); if (result.first != 0) @@ -1035,23 +1047,20 @@ namespace snmalloc max_req = 1; size_t n = (rng.next() % max_req) + 1; - auto arena_result = arena.remove_block(n); + auto arena_result = arena.remove_block(chunk_size(n)); auto oracle_result = oracle.remove(n); - UNUSED(arena_result); // Both should agree on success/failure. - // Use size == 0 to detect failure, since oracle address 0 is valid. if (oracle_result.second == 0) { - SNMALLOC_ASSERT(arena_result.second == 0); + SNMALLOC_ASSERT(arena_result == 0); } else { - SNMALLOC_ASSERT(arena_result.second != 0); - // Both should return the same address and size. + SNMALLOC_ASSERT(arena_result != 0); + // Arena should return the address oracle predicts. 
SNMALLOC_ASSERT( - arena_result.first == chunk_addr(BASE + oracle_result.first)); - SNMALLOC_ASSERT(arena_result.second == oracle_result.second); + arena_result == chunk_addr(BASE + oracle_result.first)); // Mark as allocated. size_t start = oracle_result.first; @@ -1091,26 +1100,26 @@ namespace snmalloc constexpr size_t BASE = 256; // avoid address 0 // Add distinct blocks to each arena. - arena_a.add_block(chunk_addr(BASE + 10), 5); - arena_b.add_block(chunk_addr(BASE + 30), 5); + arena_a.add_block(chunk_addr(BASE + 10), chunk_size(5)); + arena_b.add_block(chunk_addr(BASE + 30), chunk_size(5)); arena_a.check_invariant(true); arena_b.check_invariant(true); // Migrate a block from A to B. - auto [a_addr, a_size] = arena_a.remove_block(3); - SNMALLOC_ASSERT(a_addr != 0 && a_size != 0); + uintptr_t a_addr = arena_a.remove_block(chunk_size(3)); + SNMALLOC_ASSERT(a_addr != 0); arena_a.check_invariant(true); - arena_b.add_block(a_addr, a_size); + arena_b.add_block(a_addr, chunk_size(3)); arena_a.check_invariant(true); arena_b.check_invariant(true); // Migrate from B back to A. - auto [b_addr, b_size] = arena_b.remove_block(2); - SNMALLOC_ASSERT(b_addr != 0 && b_size != 0); + uintptr_t b_addr = arena_b.remove_block(chunk_size(2)); + SNMALLOC_ASSERT(b_addr != 0); arena_b.check_invariant(true); - arena_a.add_block(b_addr, b_size); + arena_a.add_block(b_addr, chunk_size(2)); arena_a.check_invariant(true); arena_b.check_invariant(true); @@ -1125,27 +1134,25 @@ namespace snmalloc constexpr size_t BASE = 256; // Arena B holds two blocks with a gap: [20..24) and [28..32). - arena_b.add_block(chunk_addr(BASE + 20), 4); - arena_b.add_block(chunk_addr(BASE + 28), 4); + arena_b.add_block(chunk_addr(BASE + 20), chunk_size(4)); + arena_b.add_block(chunk_addr(BASE + 28), chunk_size(4)); arena_b.check_invariant(true); // Arena A holds the gap: [24..28). 
- arena_a.add_block(chunk_addr(BASE + 24), 4); + arena_a.add_block(chunk_addr(BASE + 24), chunk_size(4)); arena_a.check_invariant(true); // Migrate the gap from A to B → should consolidate into [20..32). - auto [addr, size] = arena_a.remove_block(4); + uintptr_t addr = arena_a.remove_block(chunk_size(4)); SNMALLOC_ASSERT(addr == chunk_addr(BASE + 24)); - SNMALLOC_ASSERT(size == 4); arena_a.check_invariant(true); - arena_b.add_block(addr, size); + arena_b.add_block(addr, chunk_size(4)); arena_b.check_invariant(true); // B should now serve a size-12 request from the consolidated block. - auto [r_addr, r_size] = arena_b.remove_block(12); + uintptr_t r_addr = arena_b.remove_block(chunk_size(12)); SNMALLOC_ASSERT(r_addr == chunk_addr(BASE + 20)); - SNMALLOC_ASSERT(r_size == 12); arena_b.check_invariant(true); printf(" Consolidation after migration: OK\n"); @@ -1218,7 +1225,7 @@ namespace snmalloc for (size_t j = start; j < start + size; j++) owner[j] = my_id; - auto result = arena.add_block(chunk_addr(BASE + start), size); + auto result = arena.add_block(chunk_addr(BASE + start), chunk_size(size)); oracle.add(start, size); if (result.first != 0) @@ -1239,18 +1246,17 @@ namespace snmalloc max_req = 1; size_t n = (rng.next() % max_req) + 1; - auto arena_r = arena.remove_block(n); + auto arena_r = arena.remove_block(chunk_size(n)); auto oracle_r = oracle.remove(n); if (oracle_r.second == 0) { - SNMALLOC_ASSERT(arena_r.second == 0); + SNMALLOC_ASSERT(arena_r == 0); } else { - SNMALLOC_ASSERT(arena_r.second != 0); - SNMALLOC_ASSERT(arena_r.first == chunk_addr(BASE + oracle_r.first)); - SNMALLOC_ASSERT(arena_r.second == oracle_r.second); + SNMALLOC_ASSERT(arena_r != 0); + SNMALLOC_ASSERT(arena_r == chunk_addr(BASE + oracle_r.first)); for (size_t j = oracle_r.first; j < oracle_r.first + oracle_r.second; j++) @@ -1274,18 +1280,17 @@ namespace snmalloc uint8_t dst_id = from_a ? 
2 : 1; size_t n = (rng.next() % 3) + 1; - auto src_r = src.remove_block(n); + uintptr_t src_r = src.remove_block(chunk_size(n)); auto src_or = src_oracle.remove(n); if (src_or.second == 0) { - SNMALLOC_ASSERT(src_r.second == 0); + SNMALLOC_ASSERT(src_r == 0); } else { - SNMALLOC_ASSERT(src_r.second != 0); - SNMALLOC_ASSERT(src_r.first == chunk_addr(BASE + src_or.first)); - SNMALLOC_ASSERT(src_r.second == src_or.second); + SNMALLOC_ASSERT(src_r != 0); + SNMALLOC_ASSERT(src_r == chunk_addr(BASE + src_or.first)); for (size_t j = src_or.first; j < src_or.first + src_or.second; j++) { @@ -1293,7 +1298,7 @@ namespace snmalloc owner[j] = dst_id; } - auto dst_r = dst.add_block(src_r.first, src_r.second); + auto dst_r = dst.add_block(src_r, chunk_size(src_or.second)); dst_oracle.add(src_or.first, src_or.second); if (dst_r.first != 0) @@ -1348,14 +1353,14 @@ namespace snmalloc MockRep::set_variant(addr, v); } - static size_t get_large_size_chunks(uintptr_t addr) + static size_t get_large_size(uintptr_t addr) { - return MockRep::get_large_size_chunks(addr); + return MockRep::get_large_size(addr); } - static void set_large_size_chunks(uintptr_t addr, size_t s) + static void set_large_size(uintptr_t addr, size_t s) { - MockRep::set_large_size_chunks(addr, s); + MockRep::set_large_size(addr, s); } static bool can_consolidate(uintptr_t higher_addr) @@ -1365,7 +1370,8 @@ namespace snmalloc }; template - using BoundaryArena = BackendArena; + using BoundaryArena = + BackendArena; // Test: predecessor merge blocked by boundary. static void test_boundary_blocks_predecessor() @@ -1381,15 +1387,15 @@ namespace snmalloc // Place a boundary at a_addr — blocks should not consolidate leftward. boundary_addrs.insert(a_addr); - arena.add_block(p_addr, 2); - arena.add_block(a_addr, 2); + arena.add_block(p_addr, chunk_size(2)); + arena.add_block(a_addr, chunk_size(2)); // P (chunks 2-3) and A (chunks 4-5) are adjacent but the boundary // at a_addr prevents merging. Both should remain separate. 
- auto [r1_addr, r1_size] = arena.remove_block(2); - SNMALLOC_ASSERT(r1_addr == p_addr && r1_size == 2); - auto [r2_addr, r2_size] = arena.remove_block(2); - SNMALLOC_ASSERT(r2_addr == a_addr && r2_size == 2); + auto r1_addr = arena.remove_block(chunk_size(2)); + SNMALLOC_ASSERT(r1_addr == p_addr); + auto r2_addr = arena.remove_block(chunk_size(2)); + SNMALLOC_ASSERT(r2_addr == a_addr); printf(" Boundary blocks predecessor merge: OK\n"); } @@ -1408,15 +1414,15 @@ namespace snmalloc // Place a boundary at s_addr — blocks should not consolidate rightward. boundary_addrs.insert(s_addr); - arena.add_block(s_addr, 4); - arena.add_block(a_addr, 2); + arena.add_block(s_addr, chunk_size(4)); + arena.add_block(a_addr, chunk_size(2)); // A (chunks 2-3) and S (chunks 4-7) are adjacent but the boundary // at s_addr prevents merging. Both should remain separate. - auto [r1_addr, r1_size] = arena.remove_block(2); - SNMALLOC_ASSERT(r1_addr == a_addr && r1_size == 2); - auto [r2_addr, r2_size] = arena.remove_block(4); - SNMALLOC_ASSERT(r2_addr == s_addr && r2_size == 4); + auto r1_addr = arena.remove_block(chunk_size(2)); + SNMALLOC_ASSERT(r1_addr == a_addr); + auto r2_addr = arena.remove_block(chunk_size(4)); + SNMALLOC_ASSERT(r2_addr == s_addr); printf(" Boundary blocks successor merge: OK\n"); } @@ -1434,16 +1440,16 @@ namespace snmalloc // [4,6) ↔ [6,8) merge into a 4-aligned block at chunk 4. boundary_addrs.insert(chunk_addr(8)); - arena.add_block(chunk_addr(4), 2); - arena.add_block(chunk_addr(8), 2); - arena.add_block(chunk_addr(6), 2); + arena.add_block(chunk_addr(4), chunk_size(2)); + arena.add_block(chunk_addr(8), chunk_size(2)); + arena.add_block(chunk_addr(6), chunk_size(2)); // [4,6) and [6,8) should consolidate to [4,8). // [8,10) should remain separate due to boundary. 
- auto [r1_addr, r1_size] = arena.remove_block(4); - SNMALLOC_ASSERT(r1_addr == chunk_addr(4) && r1_size == 4); - auto [r2_addr, r2_size] = arena.remove_block(2); - SNMALLOC_ASSERT(r2_addr == chunk_addr(8) && r2_size == 2); + auto r1_addr = arena.remove_block(chunk_size(4)); + SNMALLOC_ASSERT(r1_addr == chunk_addr(4)); + auto r2_addr = arena.remove_block(chunk_size(2)); + SNMALLOC_ASSERT(r2_addr == chunk_addr(8)); printf(" Boundary partial (P merges, S blocked): OK\n"); } @@ -1461,13 +1467,12 @@ namespace snmalloc boundary_addrs.insert(a_addr); - arena.add_block(p_addr, 1); // min-size block - arena.add_block(a_addr, 1); // adjacent, but boundary prevents merge + arena.add_block(p_addr, chunk_size(1)); // min-size block + arena.add_block(a_addr, chunk_size(1)); // adjacent, but boundary prevents merge - auto [r1_addr, r1_size] = arena.remove_block(1); - auto [r2_addr, r2_size] = arena.remove_block(1); + auto r1_addr = arena.remove_block(chunk_size(1)); + auto r2_addr = arena.remove_block(chunk_size(1)); // Both should be separate min-size blocks. - SNMALLOC_ASSERT(r1_size == 1 && r2_size == 1); SNMALLOC_ASSERT( (r1_addr == p_addr && r2_addr == a_addr) || (r1_addr == a_addr && r2_addr == p_addr)); diff --git a/src/test/func/backend_arena_bins/backend_arena_bins.cc b/src/test/func/backend_arena_bins/backend_arena_bins.cc index 6597ba0ed..235ceb690 100644 --- a/src/test/func/backend_arena_bins/backend_arena_bins.cc +++ b/src/test/func/backend_arena_bins/backend_arena_bins.cc @@ -30,15 +30,16 @@ namespace snmalloc { /** - * Friend struct exposing private internals of `BackendArenaBins` - * (and its nested `Bitmap`) for unit tests. Forward-declared in - * `backend_arena_bins.h`; defined here so the production header - * carries no test-only surface. + * Friend struct exposing private internals of + * `BackendArenaBins` (and its nested `Bitmap`) + * for unit tests. 
Forward-declared in `backend_arena_bins.h`; + * defined here so the production header carries no test-only + * surface. */ - template + template struct BackendArenaBinsTestAccess { - using Bins = BackendArenaBins; + using Bins = BackendArenaBins; using Bitmap = typename Bins::Bitmap; using range_t = typename Bins::range_t; @@ -73,9 +74,9 @@ namespace snmalloc return Bins::bin_index(block); } - static constexpr size_t max_supported_chunks() + static constexpr size_t max_supported_size() { - return Bins::max_supported_chunks(); + return Bins::max_supported_size(); } // --- Raw size-class id access --- @@ -84,36 +85,37 @@ namespace snmalloc // size class. Production code never names these (the fast path // goes straight from request size to the bitmap-scan / carve // record). Tests cross-check the encoding via the helpers below; - // the alias `chunk_sc_t = size_t` preserves the existing test + // the alias `sc_t = size_t` preserves the existing test // naming. - using chunk_sc_t = size_t; + using sc_t = size_t; - /// Raw id of the smallest size class >= n_chunks. - SNMALLOC_FAST_PATH static chunk_sc_t request(size_t n) + /// Raw id of the smallest size class >= n (n in bytes, + /// multiple of UNIT_SIZE). 
+ SNMALLOC_FAST_PATH static sc_t request(size_t n) { - SNMALLOC_ASSERT(n >= 1); - SNMALLOC_ASSERT(n <= Bins::max_supported_chunks()); - return bits::to_exp_mant(n); + SNMALLOC_ASSERT(n >= (size_t(1) << MIN_SIZE_BITS)); + SNMALLOC_ASSERT(n <= Bins::max_supported_size()); + return bits::to_exp_mant(n); } - static constexpr size_t size_chunks(chunk_sc_t sc) + static constexpr size_t sc_size(sc_t sc) { - return Bins::table_.carve_info[sc].size_chunks; + return Bins::table_.carve_info[sc].size; } - static constexpr size_t align_chunks(chunk_sc_t sc) + static constexpr size_t sc_align(sc_t sc) { - return Bins::table_.carve_info[sc].align_chunks; + return Bins::table_.carve_info[sc].align; } - SNMALLOC_FAST_PATH static const bitmap_info_t& bitmap_info(chunk_sc_t sc) + SNMALLOC_FAST_PATH static const bitmap_info_t& bitmap_info(sc_t sc) { SNMALLOC_ASSERT(sc < Bins::MAX_SC); return Bins::table_.bitmap_info[sc]; } - SNMALLOC_FAST_PATH static const carve_info_t& carve_info(chunk_sc_t sc) + SNMALLOC_FAST_PATH static const carve_info_t& carve_info(sc_t sc) { SNMALLOC_ASSERT(sc < Bins::MAX_SC); return Bins::table_.carve_info[sc]; @@ -125,7 +127,7 @@ namespace snmalloc bitmap_info_for_request_const(size_t n) { return Bins::table_ - .bitmap_info[bits::to_exp_mant_const(n)]; + .bitmap_info[bits::to_exp_mant_const(n)]; } /// `carve_info_for_request`, constexpr (uses `to_exp_mant_const`). @@ -133,7 +135,7 @@ namespace snmalloc static constexpr const carve_info_t& carve_info_for_request_const(size_t n) { return Bins::table_ - .carve_info[bits::to_exp_mant_const(n)]; + .carve_info[bits::to_exp_mant_const(n)]; } // The canonical source of truth for what each within-exponent bin @@ -197,9 +199,9 @@ using snmalloc::BackendArenaBinsTestAccess; // to fail the build (not the runtime) if regressed. 
namespace static_checks { - using B1 = BackendArenaBinsTestAccess<1>; - using B2 = BackendArenaBinsTestAccess<2>; - using B3 = BackendArenaBinsTestAccess<3>; + using B1 = BackendArenaBinsTestAccess<1, 0>; + using B2 = BackendArenaBinsTestAccess<2, 0>; + using B3 = BackendArenaBinsTestAccess<3, 0>; static_assert(B1::BINS_PER_EXP == 2, "B=1 BINS_PER_EXP"); static_assert(B2::BINS_PER_EXP == 5, "B=2 BINS_PER_EXP"); @@ -217,15 +219,15 @@ namespace static_checks // Sizes that are powers of two have align == size. static_assert( - B2::carve_info_for_request_const(4).align_chunks == 4, "size 4 align"); + B2::carve_info_for_request_const(4).align == 4, "size 4 align"); static_assert( - B3::carve_info_for_request_const(8).align_chunks == 8, "size 8 align"); + B3::carve_info_for_request_const(8).align == 8, "size 8 align"); - // size_chunks at request(s) must be >= s. + // sc_size at request(s) must be >= s. static_assert( - B2::carve_info_for_request_const(9).size_chunks == 10, "B=2 round-up"); + B2::carve_info_for_request_const(9).size == 10, "B=2 round-up"); static_assert( - B3::carve_info_for_request_const(17).size_chunks == 18, "B=3 round-up"); + B3::carve_info_for_request_const(17).size == 18, "B=3 round-up"); } // namespace static_checks namespace @@ -242,7 +244,7 @@ namespace template constexpr bool serves(size_t bin, size_t n) { - using Bins = BackendArenaBinsTestAccess; + using Bins = BackendArenaBinsTestAccess; size_t e_b = bin / Bins::BINS_PER_EXP; size_t o_b = bin % Bins::BINS_PER_EXP; size_t raw = snmalloc::bits::to_exp_mant_const(n); @@ -274,18 +276,18 @@ namespace template void check_chunk_sc_roundtrip() { - using Bins = BackendArenaBinsTestAccess; + using Bins = BackendArenaBinsTestAccess; // Properties (together these imply request is the smallest size class // with size >= s): - // 1. size_chunks(request(s)) >= s for all s >= 1. - // 2. Idempotence: request(size_chunks(sc)) == sc. + // 1. sc_size(request(s)) >= s for all s >= 1. + // 2. 
Idempotence: request(sc_size(sc)) == sc. // 3. Monotonicity: s1 <= s2 implies request(s1) <= request(s2). auto prev_sc = Bins::request(1); for (size_t s = 1; s <= 4096; s++) { auto sc = Bins::request(s); - size_t cs = Bins::size_chunks(sc); + size_t cs = Bins::sc_size(sc); if (cs < s) { std::printf( @@ -294,7 +296,7 @@ namespace } if (Bins::request(cs) != sc) { - std::printf("B=%zu request(size_chunks(sc))!=sc for cs=%zu\n", B, cs); + std::printf("B=%zu request(sc_size(sc))!=sc for cs=%zu\n", B, cs); std::abort(); } if (sc < prev_sc) @@ -307,33 +309,33 @@ namespace } template - void check_align_chunks() + void check_sc_align() { - using Bins = BackendArenaBinsTestAccess; + using Bins = BackendArenaBinsTestAccess; for (size_t s = 1; s <= 4096; s++) { auto sc = Bins::request(s); - size_t cs = Bins::size_chunks(sc); - size_t a = Bins::align_chunks(sc); + size_t cs = Bins::sc_size(sc); + size_t a = Bins::sc_align(sc); // a must be a power of two. if (a == 0 || (a & (a - 1)) != 0) { - std::printf("B=%zu size %zu: align_chunks %zu not pow2\n", B, cs, a); + std::printf("B=%zu size %zu: sc_align %zu not pow2\n", B, cs, a); std::abort(); } // a must divide cs. if (cs % a != 0) { std::printf( - "B=%zu size %zu: align_chunks %zu does not divide size\n", B, cs, a); + "B=%zu size %zu: sc_align %zu does not divide size\n", B, cs, a); std::abort(); } // a should be the LARGEST power of two dividing cs. if ((a << 1) != 0 && cs % (a << 1) == 0) { std::printf( - "B=%zu size %zu: align_chunks %zu not the largest pow2 divisor\n", + "B=%zu size %zu: sc_align %zu not the largest pow2 divisor\n", B, cs, a); @@ -342,13 +344,13 @@ namespace } } - /// Collect all chunk_sc_t classes whose size fits in the test grid. + /// Collect all sc_t classes whose size fits in the test grid. 
template - std::vector::chunk_sc_t> + std::vector::sc_t> collect_classes(size_t max_size) { - using Bins = BackendArenaBinsTestAccess; - using sc_t = typename Bins::chunk_sc_t; + using Bins = BackendArenaBinsTestAccess; + using sc_t = typename Bins::sc_t; std::vector v; sc_t prev{}; @@ -356,7 +358,7 @@ namespace for (size_t s = 1; s <= max_size; s++) { sc_t sc = Bins::request(s); - if (Bins::size_chunks(sc) != s) + if (Bins::sc_size(sc) != s) continue; // s is not a class size if (!have_prev || sc != prev) { @@ -371,7 +373,7 @@ namespace template void check_bin_classification(size_t max_addr, size_t max_n) { - using Bins = BackendArenaBinsTestAccess; + using Bins = BackendArenaBinsTestAccess; auto classes = collect_classes(max_n); for (size_t addr = 0; addr < max_addr; addr++) @@ -382,8 +384,8 @@ namespace for (auto sc : classes) { - size_t s = Bins::size_chunks(sc); - size_t a = Bins::align_chunks(sc); + size_t s = Bins::sc_size(sc); + size_t a = Bins::sc_align(sc); bool actually = can_serve(addr, n, s, a); bool predicted = serves(bin, s); @@ -410,7 +412,7 @@ namespace template void check_bin_id_range() { - using Bins = BackendArenaBinsTestAccess; + using Bins = BackendArenaBinsTestAccess; // bin_index always returns a value in [0, BINS_PER_EXP * (e+1)) for the // block's natural exponent e. @@ -442,7 +444,7 @@ namespace template void check_info_consistency() { - using Bins = BackendArenaBinsTestAccess; + using Bins = BackendArenaBinsTestAccess; for (size_t s = 1; s <= 4096; s++) { @@ -452,16 +454,16 @@ namespace // must alias the carve_info(request(s)) record (single table // indirection, no copy). 
const auto& ci = Bins::carve_info_for_request(s); - if (ci.size_chunks != Bins::size_chunks(sc)) + if (ci.size != Bins::sc_size(sc)) { std::printf( - "B=%zu carve_info_for_request(%zu).size_chunks mismatch\n", B, s); + "B=%zu carve_info_for_request(%zu).size mismatch\n", B, s); std::abort(); } - if (ci.align_chunks != Bins::align_chunks(sc)) + if (ci.align != Bins::sc_align(sc)) { std::printf( - "B=%zu carve_info_for_request(%zu).align_chunks mismatch\n", B, s); + "B=%zu carve_info_for_request(%zu).align mismatch\n", B, s); std::abort(); } if (&ci != &Bins::carve_info(sc)) @@ -489,13 +491,13 @@ namespace } /// to_exp_mant runtime / _const equivalence across a representative - /// range of values, including edges near max_supported_chunks. The + /// range of values, including edges near max_supported_size. The /// runtime variant uses the intrinsic; we cross-check against the /// constexpr reference that's already exercised at compile time. template void check_to_exp_mant_equivalence() { - using Bins = BackendArenaBinsTestAccess; + using Bins = BackendArenaBinsTestAccess; auto check_one = [&](size_t n) { size_t r = snmalloc::bits::to_exp_mant(n); @@ -517,24 +519,24 @@ namespace size_t pow = size_t(1) << e; if (pow == 0) continue; - if (pow >= 1 && pow <= Bins::max_supported_chunks()) + if (pow >= 1 && pow <= Bins::max_supported_size()) check_one(pow); - if (pow + 1 <= Bins::max_supported_chunks()) + if (pow + 1 <= Bins::max_supported_size()) check_one(pow + 1); if (pow >= 2) check_one(pow - 1); } // The upper boundary itself. - check_one(Bins::max_supported_chunks()); - if (Bins::max_supported_chunks() > 1) - check_one(Bins::max_supported_chunks() - 1); + check_one(Bins::max_supported_size()); + if (Bins::max_supported_size() > 1) + check_one(Bins::max_supported_size() - 1); // A handful of stride values across the full range. 
- size_t step = Bins::max_supported_chunks() / 257; + size_t step = Bins::max_supported_size() / 257; if (step == 0) step = 1; - for (size_t n = 1; n <= Bins::max_supported_chunks() && n > 0; + for (size_t n = 1; n <= Bins::max_supported_size() && n > 0; n += step + 1) check_one(n); } @@ -544,9 +546,9 @@ namespace /// (defined directly in terms of `bin_subsets`). template size_t reference_find( - size_t n_chunks, const typename BackendArenaBinsTestAccess::Bitmap& bm) + size_t n_chunks, const typename BackendArenaBinsTestAccess::Bitmap& bm) { - using Bins = BackendArenaBinsTestAccess; + using Bins = BackendArenaBinsTestAccess; using Bitmap = typename Bins::Bitmap; for (size_t b = 0; b < Bitmap::TOTAL_BINS; b++) { @@ -561,7 +563,7 @@ namespace template void check_bitmap_smoke() { - using Bins = BackendArenaBinsTestAccess; + using Bins = BackendArenaBinsTestAccess; using Bitmap = typename Bins::Bitmap; Bitmap bm; if (!Bins::raw_empty(bm)) @@ -584,7 +586,7 @@ namespace std::abort(); } - /// Iterate over every `chunk_sc_t` raw id in `[0, MAX_SC)`. For each + /// Iterate over every `sc_t` raw id in `[0, MAX_SC)`. For each /// one, decode its request size, look up its `bitmap_info_t`, and /// run `body(n_chunks, bitmap_info)`. 
Multiple raw ids can share the /// same `(start_word, first_mask, second_mask)` triple; callers that @@ -592,7 +594,7 @@ namespace template void for_each_class_info(F body) { - using Bins = BackendArenaBinsTestAccess; + using Bins = BackendArenaBinsTestAccess; for (size_t raw = 0; raw < Bins::MAX_SC; raw++) { size_t s = snmalloc::bits::from_exp_mant(raw); @@ -604,7 +606,7 @@ namespace template void check_bitmap_find_empty() { - using Bins = BackendArenaBinsTestAccess; + using Bins = BackendArenaBinsTestAccess; using Bitmap = typename Bins::Bitmap; Bitmap bm; for_each_class_info([&](size_t n, const auto& /*info*/) { @@ -619,7 +621,7 @@ namespace template void check_bitmap_exhaustive_single_bit() { - using Bins = BackendArenaBinsTestAccess; + using Bins = BackendArenaBinsTestAccess; using Bitmap = typename Bins::Bitmap; // Gather a representative set of entries (one per distinct bitmap @@ -672,7 +674,7 @@ namespace template void check_bitmap_multi_bit_random() { - using Bins = BackendArenaBinsTestAccess; + using Bins = BackendArenaBinsTestAccess; using Bitmap = typename Bins::Bitmap; struct Entry @@ -742,7 +744,7 @@ namespace template void check_bitmap_word_boundary() { - using Bins = BackendArenaBinsTestAccess; + using Bins = BackendArenaBinsTestAccess; using Bitmap = typename Bins::Bitmap; auto check_predicted = @@ -858,7 +860,7 @@ namespace template void check_bitmap_bin_index_integration() { - using Bins = BackendArenaBinsTestAccess; + using Bins = BackendArenaBinsTestAccess; using Bitmap = typename Bins::Bitmap; auto classes = collect_classes(64); @@ -871,8 +873,8 @@ namespace Bins::raw_set(bm, bin); for (auto sc : classes) { - size_t s = Bins::size_chunks(sc); - size_t a = Bins::align_chunks(sc); + size_t s = Bins::sc_size(sc); + size_t a = Bins::sc_align(sc); bool actually = can_serve(addr, n, s, a); size_t got = bm.find_for_request(s); size_t want = actually ? 
bin : size_t(SIZE_MAX); @@ -903,7 +905,7 @@ namespace template void check_bitmap_add() { - using Bins = BackendArenaBinsTestAccess; + using Bins = BackendArenaBinsTestAccess; using Bitmap = typename Bins::Bitmap; using range_t = typename Bins::range_t; @@ -977,7 +979,7 @@ namespace template void check_bitmap_find_min() { - using Bins = BackendArenaBinsTestAccess; + using Bins = BackendArenaBinsTestAccess; using Bitmap = typename Bins::Bitmap; struct Entry @@ -1045,11 +1047,11 @@ namespace } /// Verify carve(): pre.base+pre.size == req.base; req.base aligned; - /// req.size == sc.size_chunks; post.base == req.end; spans equal. + /// req.size == n; post.base == req.end; spans equal. template void check_carve() { - using Bins = BackendArenaBinsTestAccess; + using Bins = BackendArenaBinsTestAccess; using range_t = typename Bins::range_t; auto classes = collect_classes(64); @@ -1059,64 +1061,76 @@ namespace { for (auto sc : classes) { - size_t s = Bins::size_chunks(sc); - size_t a = Bins::align_chunks(sc); + size_t s = Bins::sc_size(sc); + size_t a = Bins::sc_align(sc); if (!can_serve(addr, n, s, a)) continue; - auto cv = Bins::carve(range_t{addr, n}, s); - - // pre starts at the block's base. - if (cv.pre.base != addr) - { - std::printf( - "B=%zu carve pre.base != addr (addr=%zu n=%zu s=%zu)\n", - B, - addr, - n, - s); - std::abort(); - } - // pre.end == req.base. - if (cv.pre.base + cv.pre.size != cv.req.base) - { - std::printf("B=%zu carve pre.end != req.base\n", B); - std::abort(); - } - // req aligned. - if ((cv.req.base & (a - 1)) != 0) - { - std::printf( - "B=%zu carve req.base %zu not aligned to %zu\n", - B, - cv.req.base, - a); - std::abort(); - } - // req.size == sc.size_chunks. - if (cv.req.size != s) - { - std::printf( - "B=%zu carve req.size %zu != s %zu\n", B, cv.req.size, s); - std::abort(); - } - // req.end == post.base. 
- if (cv.req.base + cv.req.size != cv.post.base) - { - std::printf("B=%zu carve req.end != post.base\n", B); - std::abort(); - } - // post.end == block.end. - if (cv.post.base + cv.post.size != addr + n) + // Exercise both the trivial case (request == SC size) and + // the non-trivial case (request strictly less than SC size, + // which forces the rounding remainder into `post`). The SC + // for `r` must be `sc` itself so the alignment used by carve + // matches what `can_serve` checked. + for (size_t r = 1; r <= s; r++) { - std::printf("B=%zu carve post.end != block.end\n", B); - std::abort(); - } - // pre.size + req.size + post.size == block.size. - if (cv.pre.size + cv.req.size + cv.post.size != n) - { - std::printf("B=%zu carve sizes don't sum to n\n", B); - std::abort(); + if (Bins::sc_size(Bins::request(r)) != s) + continue; + + auto cv = Bins::carve(range_t{addr, n}, r); + + // pre starts at the block's base. + if (cv.pre.base != addr) + { + std::printf( + "B=%zu carve pre.base != addr (addr=%zu n=%zu r=%zu s=%zu)\n", + B, + addr, + n, + r, + s); + std::abort(); + } + // pre.end == req.base. + if (cv.pre.base + cv.pre.size != cv.req.base) + { + std::printf("B=%zu carve pre.end != req.base\n", B); + std::abort(); + } + // req aligned to the SC's natural alignment. + if ((cv.req.base & (a - 1)) != 0) + { + std::printf( + "B=%zu carve req.base %zu not aligned to %zu\n", + B, + cv.req.base, + a); + std::abort(); + } + // req.size == requested n_chunks (carve-exact). + if (cv.req.size != r) + { + std::printf( + "B=%zu carve req.size %zu != r %zu\n", B, cv.req.size, r); + std::abort(); + } + // req.end == post.base. + if (cv.req.base + cv.req.size != cv.post.base) + { + std::printf("B=%zu carve req.end != post.base\n", B); + std::abort(); + } + // post.end == block.end. + if (cv.post.base + cv.post.size != addr + n) + { + std::printf("B=%zu carve post.end != block.end\n", B); + std::abort(); + } + // pre.size + req.size + post.size == block.size. 
+ if (cv.pre.size + cv.req.size + cv.post.size != n) + { + std::printf("B=%zu carve sizes don't sum to n\n", B); + std::abort(); + } } } } @@ -1128,9 +1142,9 @@ namespace { std::printf("--- Running BackendArenaBinsTestAccess<%zu> tests ---\n", B); check_chunk_sc_roundtrip(); - std::printf(" chunk_sc_t round-trip: OK\n"); - check_align_chunks(); - std::printf(" align_chunks: OK\n"); + std::printf(" sc_t round-trip: OK\n"); + check_sc_align(); + std::printf(" sc_align: OK\n"); check_to_exp_mant_equivalence(); std::printf(" to_exp_mant runtime/_const equivalence: OK\n"); check_info_consistency(); @@ -1163,45 +1177,176 @@ namespace /// catch silent breakage of the canonical numbering. void check_known_values() { - using B2 = BackendArenaBinsTestAccess<2>; + using B2 = BackendArenaBinsTestAccess<2, 0>; // size 1 -> raw 0, size 2 -> raw 1, size 3 -> raw 2, size 4 -> raw 3, // size 5 -> raw 4, ..., size 8 -> raw 7, size 10 -> raw 8. - if (B2::size_chunks(B2::request(1)) != 1) + if (B2::sc_size(B2::request(1)) != 1) std::abort(); - if (B2::size_chunks(B2::request(8)) != 8) + if (B2::sc_size(B2::request(8)) != 8) std::abort(); - if (B2::size_chunks(B2::request(9)) != 10) + if (B2::sc_size(B2::request(9)) != 10) std::abort(); - if (B2::size_chunks(B2::request(11)) != 12) + if (B2::sc_size(B2::request(11)) != 12) std::abort(); - // align_chunks: size 4 -> 4, size 5 -> 1, size 6 -> 2, size 8 -> 8, + // sc_align: size 4 -> 4, size 5 -> 1, size 6 -> 2, size 8 -> 8, // size 10 -> 2, size 12 -> 4, size 14 -> 2. 
- if (B2::align_chunks(B2::request(4)) != 4) + if (B2::sc_align(B2::request(4)) != 4) std::abort(); - if (B2::align_chunks(B2::request(5)) != 1) + if (B2::sc_align(B2::request(5)) != 1) std::abort(); - if (B2::align_chunks(B2::request(6)) != 2) + if (B2::sc_align(B2::request(6)) != 2) std::abort(); - if (B2::align_chunks(B2::request(8)) != 8) + if (B2::sc_align(B2::request(8)) != 8) std::abort(); - if (B2::align_chunks(B2::request(10)) != 2) + if (B2::sc_align(B2::request(10)) != 2) std::abort(); // BINS_PER_EXP must be 5 for B=2. if (B2::BINS_PER_EXP != 5) std::abort(); - using B3 = BackendArenaBinsTestAccess<3>; + using B3 = BackendArenaBinsTestAccess<3, 0>; if (B3::BINS_PER_EXP != 13) std::abort(); - using B1 = BackendArenaBinsTestAccess<1>; + using B1 = BackendArenaBinsTestAccess<1, 0>; if (B1::BINS_PER_EXP != 2) std::abort(); } + + /** + * Verify that scaling the encoding by `UNIT_SIZE = 1 << MIN_SIZE_BITS` + * is a structural equivalence: every public observation about a + * `BackendArenaBins` instance equals the + * corresponding observation on `BackendArenaBins` when the + * input is scaled by `UNIT_SIZE` (and outputs, where they are sizes + * or addresses, are also scaled by `UNIT_SIZE`). + * + * This pins the new template parameter to act purely as a unit + * change, with no other semantic effect on the bin scheme. + */ + template + void check_min_size_bits_equivalence() + { + using Scaled = BackendArenaBinsTestAccess; + using Base = BackendArenaBinsTestAccess; + static_assert(MIN_SIZE_BITS > 0, "this check is for MIN_SIZE_BITS > 0"); + constexpr size_t U = size_t(1) << MIN_SIZE_BITS; + + // BINS_PER_EXP is independent of MIN_SIZE_BITS. 
+ if (Scaled::BINS_PER_EXP != Base::BINS_PER_EXP) + std::abort(); + if (Scaled::MANTISSAS_PER_EXP != Base::MANTISSAS_PER_EXP) + std::abort(); + + // request(n*U) at MIN_SIZE_BITS==K returns the same raw id as + // request(n) at MIN_SIZE_BITS==0; sc_size(raw) at MIN_SIZE_BITS==K + // equals sc_size(raw) at MIN_SIZE_BITS==0 times U; sc_align + // likewise. + size_t probe[] = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 16, 17, 32, 65, 127, 1024}; + for (size_t n : probe) + { + // Skip values that would overflow either instance's domain. + if (n > Base::max_supported_size()) + continue; + if (n > Scaled::max_supported_size() / U) + continue; + auto sc_base = Base::request(n); + auto sc_scaled = Scaled::request(n * U); + if (sc_base != sc_scaled) + std::abort(); + if (Scaled::sc_size(sc_scaled) != Base::sc_size(sc_base) * U) + std::abort(); + if (Scaled::sc_align(sc_scaled) != Base::sc_align(sc_base) * U) + std::abort(); + } + + // bin_index({a*U, n*U}) at MIN_SIZE_BITS==K matches bin_index({a, n}) + // at MIN_SIZE_BITS==0. + using ScaledR = typename Scaled::range_t; + using BaseR = typename Base::range_t; + for (size_t n = 1; n <= 64; n++) + for (size_t a = 0; a < 32; a++) + if (Scaled::bin_index(ScaledR{a * U, n * U}) != + Base::bin_index(BaseR{a, n})) + std::abort(); + + // carve({0, blk*U}, n*U) returns the same partition as + // carve({0, blk}, n) at MIN_SIZE_BITS==0, scaled by U. + for (size_t blk = 1; blk <= 32; blk++) + for (size_t n = 1; n <= blk; n++) + { + // carve's precondition (servability) is that the SC for `n` + // fits inside `blk` after alignment. With base 0, pad is 0, + // so the condition reduces to `Base::sc_size(Base::request(n)) + // <= blk`. Skip pairs that don't satisfy it. 
+ if (Base::sc_size(Base::request(n)) > blk) + continue; + auto base_cv = Base::carve(BaseR{0, blk}, n); + auto scaled_cv = Scaled::carve(ScaledR{0, blk * U}, n * U); + if (scaled_cv.pre.base != base_cv.pre.base * U || + scaled_cv.pre.size != base_cv.pre.size * U) + std::abort(); + if (scaled_cv.req.base != base_cv.req.base * U || + scaled_cv.req.size != base_cv.req.size * U) + std::abort(); + if (scaled_cv.post.base != base_cv.post.base * U || + scaled_cv.post.size != base_cv.post.size * U) + std::abort(); + } + + // Bitmap find_for_request scales: an arena populated by add + // returns the same bin id, and `find_for_request(n*U)` agrees + // with `find_for_request(n)` at MIN_SIZE_BITS==0. + typename Scaled::Bitmap bm_scaled{}; + typename Base::Bitmap bm_base{}; + // Populate with a handful of representative ranges. + size_t pop[][2] = {{0, 4}, {16, 1}, {17, 7}, {64, 9}, {128, 64}}; + for (auto& p : pop) + { + size_t a = p[0], s = p[1]; + auto id_b = bm_base.add(BaseR{a, s}); + auto id_s = bm_scaled.add(ScaledR{a * U, s * U}); + if (id_b != id_s) + std::abort(); + } + for (size_t n = 1; n <= 32; n++) + { + auto f_b = bm_base.find_for_request(n); + auto f_s = bm_scaled.find_for_request(n * U); + if (f_b != f_s) + std::abort(); + } + } + + /// Concrete expected values at MIN_SIZE_BITS == 4 to pin the + /// interpretation: bin 0 corresponds to the unit-size block, + /// raw 0 decodes to UNIT_SIZE bytes, etc. + void check_known_values_unit_16() + { + using BU = BackendArenaBinsTestAccess<2, 4>; + constexpr size_t U = size_t(1) << 4; + + // size U (UNIT_SIZE) -> raw 0; size 2U -> raw 1; ... + if (BU::sc_size(BU::request(U)) != U) + std::abort(); + if (BU::sc_size(BU::request(8 * U)) != 8 * U) + std::abort(); + // size 9U requires SC for 10U at B=2 (round up). 
+ if (BU::sc_size(BU::request(9 * U)) != 10 * U) + std::abort(); + if (BU::sc_align(BU::request(4 * U)) != 4 * U) + std::abort(); + if (BU::sc_align(BU::request(8 * U)) != 8 * U) + std::abort(); + + // Bin 0 corresponds to a UNIT_SIZE block. + if (BU::bin_index({0, U}) != 0) + std::abort(); + } } // namespace int main(int, char**) @@ -1211,6 +1356,15 @@ int main(int, char**) check_known_values(); std::printf("Known concrete values: OK\n"); + check_known_values_unit_16(); + std::printf("Known concrete values at MIN_SIZE_BITS=4: OK\n"); + + check_min_size_bits_equivalence<1, 4>(); + check_min_size_bits_equivalence<2, 4>(); + check_min_size_bits_equivalence<3, 4>(); + check_min_size_bits_equivalence<2, 14>(); + std::printf("MIN_SIZE_BITS equivalence: OK\n"); + run_all<1>(); run_all<2>(); run_all<3>(); diff --git a/src/test/func/backend_arena_range/backend_arena_range.cc b/src/test/func/backend_arena_range/backend_arena_range.cc new file mode 100644 index 000000000..342253c4b --- /dev/null +++ b/src/test/func/backend_arena_range/backend_arena_range.cc @@ -0,0 +1,309 @@ +/** + * Unit tests for BackendArenaRange and PagemapRep. + * + * Tests the Range wrapper around BackendArena using a real pagemap, + * exercising alloc_range, dealloc_range, refill, and overflow paths. + */ + +#include "test/setup.h" + +#include + +#ifndef SNMALLOC_TRACING +# define SNMALLOC_TRACING +#endif +#include "test/snmalloc_testlib.h" + +#include +#include + +namespace +{ + using namespace snmalloc; + + // --- Test pagemap and range types --- + + using Pal = DefaultPal; + using PagemapEntry = DefaultPagemapEntry; + using ConcretePagemap = FlatPagemap; + using TestPagemap = BasicPagemap; + + // Initialise the pagemap once before tests. + static bool pagemap_initialised = false; + + static void ensure_pagemap() + { + if (!pagemap_initialised) + { + TestPagemap::concretePagemap.template init(); + pagemap_initialised = true; + } + } + + // Simple parent: PalRange + PagemapRegisterRange. 
+ using ParentSource = Pipe, PagemapRegisterRange>; + + // BackendArenaRange under test: global range (MAX_SIZE_BITS = BITS - 1). + // This means overflow dealloc never goes to parent (matches the global + // range configuration). + static constexpr size_t REFILL_BITS = 20; + static constexpr size_t MAX_BITS = bits::BITS - 1; + + using ArenaRange = + Pipe>; + + // --- Tests --- + + static void test_basic_alloc_dealloc() + { + ensure_pagemap(); + ArenaRange range{}; + + // Allocate a single chunk. + auto p1 = range.alloc_range(MIN_CHUNK_SIZE); + SNMALLOC_ASSERT(p1 != nullptr); + printf(" alloc %zu bytes at %p\n", MIN_CHUNK_SIZE, p1.unsafe_ptr()); + + // Deallocate and re-allocate — should succeed. + range.dealloc_range(p1, MIN_CHUNK_SIZE); + auto p2 = range.alloc_range(MIN_CHUNK_SIZE); + SNMALLOC_ASSERT(p2 != nullptr); + + // Clean up. + range.dealloc_range(p2, MIN_CHUNK_SIZE); + + printf(" Basic alloc/dealloc: OK\n"); + } + + static void test_multiple_sizes() + { + ensure_pagemap(); + ArenaRange range{}; + + // Allocate various power-of-two sizes. + constexpr size_t NUM_SIZES = 6; + size_t sizes[NUM_SIZES] = { + MIN_CHUNK_SIZE, + MIN_CHUNK_SIZE * 2, + MIN_CHUNK_SIZE * 4, + MIN_CHUNK_SIZE * 8, + MIN_CHUNK_SIZE * 16, + MIN_CHUNK_SIZE * 32}; + capptr::Arena ptrs[NUM_SIZES] = {}; + + for (size_t i = 0; i < NUM_SIZES; i++) + { + ptrs[i] = range.alloc_range(sizes[i]); + SNMALLOC_ASSERT(ptrs[i] != nullptr); + } + + // Deallocate all. + for (size_t i = 0; i < NUM_SIZES; i++) + { + range.dealloc_range(ptrs[i], sizes[i]); + } + + printf(" Multiple sizes: OK\n"); + } + + static void test_refill() + { + ensure_pagemap(); + ArenaRange range{}; + + // Allocate more than one refill's worth of chunks. + // REFILL_SIZE is 2^20, MIN_CHUNK_SIZE is 2^14, + // so one refill is ~64 chunks. 
+ constexpr size_t NUM_ALLOCS = 200; + capptr::Arena ptrs[NUM_ALLOCS] = {}; + + for (size_t i = 0; i < NUM_ALLOCS; i++) + { + ptrs[i] = range.alloc_range(MIN_CHUNK_SIZE); + SNMALLOC_ASSERT(ptrs[i] != nullptr); + } + + // Deallocate all. + for (size_t i = 0; i < NUM_ALLOCS; i++) + { + range.dealloc_range(ptrs[i], MIN_CHUNK_SIZE); + } + + // Re-allocate — should serve from freed blocks, no new refill needed + // for the first pass. + for (size_t i = 0; i < NUM_ALLOCS; i++) + { + ptrs[i] = range.alloc_range(MIN_CHUNK_SIZE); + SNMALLOC_ASSERT(ptrs[i] != nullptr); + } + + // Final cleanup. + for (size_t i = 0; i < NUM_ALLOCS; i++) + { + range.dealloc_range(ptrs[i], MIN_CHUNK_SIZE); + } + + printf(" Refill (200 allocs): OK\n"); + } + + static void test_alloc_dealloc_cycle() + { + ensure_pagemap(); + ArenaRange range{}; + + // Interleave alloc and dealloc to exercise consolidation. + constexpr size_t ROUNDS = 100; + for (size_t r = 0; r < ROUNDS; r++) + { + auto p = range.alloc_range(MIN_CHUNK_SIZE); + SNMALLOC_ASSERT(p != nullptr); + range.dealloc_range(p, MIN_CHUNK_SIZE); + } + + // Do a larger allocation after many cycles — verifies + // that consolidation is working (freed chunks merge back). + auto large = range.alloc_range(MIN_CHUNK_SIZE * 4); + SNMALLOC_ASSERT(large != nullptr); + range.dealloc_range(large, MIN_CHUNK_SIZE * 4); + + printf(" Alloc/dealloc cycle: OK\n"); + } + + static void test_alignment() + { + ensure_pagemap(); + ArenaRange range{}; + + // Verify that returned pointers are properly aligned. 
+ constexpr size_t NUM_TESTS = 5; + size_t sizes[NUM_TESTS] = { + MIN_CHUNK_SIZE, + MIN_CHUNK_SIZE * 2, + MIN_CHUNK_SIZE * 4, + MIN_CHUNK_SIZE * 8, + MIN_CHUNK_SIZE * 16}; + + for (size_t i = 0; i < NUM_TESTS; i++) + { + auto p = range.alloc_range(sizes[i]); + SNMALLOC_ASSERT(p != nullptr); + uintptr_t addr = p.unsafe_uintptr(); + SNMALLOC_ASSERT( + (addr & (sizes[i] - 1)) == 0 && "Allocation not properly aligned"); + range.dealloc_range(p, sizes[i]); + } + + printf(" Alignment: OK\n"); + } + + static void test_large_then_small() + { + ensure_pagemap(); + ArenaRange range{}; + + // Allocate a large block, dealloc, then allocate smaller blocks + // from the same space. + size_t large_size = MIN_CHUNK_SIZE * 16; + auto large = range.alloc_range(large_size); + SNMALLOC_ASSERT(large != nullptr); + range.dealloc_range(large, large_size); + + // Now allocate 16 individual chunks — should come from the freed + // large block's space. + constexpr size_t N = 16; + capptr::Arena ptrs[N] = {}; + for (size_t i = 0; i < N; i++) + { + ptrs[i] = range.alloc_range(MIN_CHUNK_SIZE); + SNMALLOC_ASSERT(ptrs[i] != nullptr); + } + + for (size_t i = 0; i < N; i++) + { + range.dealloc_range(ptrs[i], MIN_CHUNK_SIZE); + } + + printf(" Large then small: OK\n"); + } + + static void test_non_pow2_sizes() + { + ensure_pagemap(); + ArenaRange range{}; + + // Non-power-of-two, chunk-multiple sizes. Some of these are not + // representable size-classes (e.g. 9, 11, 13 chunks); the arena + // carves exactly the requested chunk count and rolls the rounding + // remainder into the post fragment, so callers see no excess. 
+ constexpr size_t NUM_SIZES = 8; + size_t sizes[NUM_SIZES] = { + MIN_CHUNK_SIZE * 3, + MIN_CHUNK_SIZE * 5, + MIN_CHUNK_SIZE * 6, + MIN_CHUNK_SIZE * 7, + MIN_CHUNK_SIZE * 9, + MIN_CHUNK_SIZE * 11, + MIN_CHUNK_SIZE * 13, + MIN_CHUNK_SIZE * 17}; + + capptr::Arena ptrs[NUM_SIZES] = {}; + for (size_t i = 0; i < NUM_SIZES; i++) + { + ptrs[i] = range.alloc_range(sizes[i]); + SNMALLOC_ASSERT(ptrs[i] != nullptr); + } + + // All pointers must be distinct and non-overlapping (within the size + // requested — over-allocation would break this because the rounding + // remainder would later be handed out a second time). + for (size_t i = 0; i < NUM_SIZES; i++) + { + uintptr_t lo_i = ptrs[i].unsafe_uintptr(); + uintptr_t hi_i = lo_i + sizes[i]; + for (size_t j = i + 1; j < NUM_SIZES; j++) + { + uintptr_t lo_j = ptrs[j].unsafe_uintptr(); + uintptr_t hi_j = lo_j + sizes[j]; + SNMALLOC_ASSERT(hi_i <= lo_j || hi_j <= lo_i); + } + } + + for (size_t i = 0; i < NUM_SIZES; i++) + { + range.dealloc_range(ptrs[i], sizes[i]); + } + + // After deallocating all, repeat the exact same pattern to confirm + // the freed space is reusable (catches leaks from un-returned + // rounding remainder). 
+ for (size_t i = 0; i < NUM_SIZES; i++) + { + ptrs[i] = range.alloc_range(sizes[i]); + SNMALLOC_ASSERT(ptrs[i] != nullptr); + } + for (size_t i = 0; i < NUM_SIZES; i++) + { + range.dealloc_range(ptrs[i], sizes[i]); + } + + printf(" Non-pow2 sizes: OK\n"); + } +} // anonymous namespace + +int main() +{ + setup(); + + printf("--- BackendArenaRange tests ---\n"); + + test_basic_alloc_dealloc(); + test_multiple_sizes(); + test_refill(); + test_alloc_dealloc_cycle(); + test_alignment(); + test_large_then_small(); + test_non_pow2_sizes(); + + printf("All BackendArenaRange tests passed.\n"); + return 0; +} From d4ef917f913d4a1cdc51c36e038e7f40923fd4e8 Mon Sep 17 00:00:00 2001 From: Matthew Parkinson Date: Sat, 23 May 2026 10:42:29 +0100 Subject: [PATCH 08/31] claude.md: require explicit approval before committing MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Every `git commit`, `--amend`, `push`, `reset`, `rebase`, or `gh pr create` must be preceded by an explicit ask_user approval for that specific commit/PR. "Begin the next phase" does not authorise committing later work — only "commit this" for the change in hand counts as approval. If a commit has already been made without approval, offer `git reset --soft HEAD~1` to undo it while preserving the staged changes. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- claude.md | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/claude.md b/claude.md index 9cb11e281..87dde8ea7 100644 --- a/claude.md +++ b/claude.md @@ -18,6 +18,23 @@ root so that context survives session boundaries. Update (not append to) the file when the plan evolves. This is the single source of truth for what is planned and what has been completed. 
+**Never commit without explicit approval**: Do not run `git commit`, +`git commit --amend`, `git push`, `git reset`, `git rebase`, or any other +history-mutating command until the user has explicitly approved the commit +for the current change. "I'm happy with this phase now, please commit" +counts as approval for that single commit; "begin the next phase" does not +authorise committing later work. When you believe a change is ready to +commit: + 1. Show the user `git status` and `git diff --stat` (and the proposed + commit message) so they can see exactly what would be committed. + 2. Ask for explicit approval — use the `ask_user` tool, do not infer + consent from earlier messages. + 3. Only after the user has approved THIS commit, run `git commit`. +If you have already committed without approval, offer to `git reset --soft +HEAD~1` to undo it while keeping the changes staged. The same rule applies +to opening pull requests: never run `gh pr create` (or equivalent) without +explicit approval for that PR. + **Baseline the checkout before starting work**: Before beginning implementation of any plan, verify that the current checkout builds and passes tests. Run the build and test suite (per `skills/building_and_testing.md`) and record the From 8e65a55c53d68eb3f3bce3d51592e2c1da00053b Mon Sep 17 00:00:00 2001 From: Matthew Parkinson Date: Sat, 23 May 2026 10:42:52 +0100 Subject: [PATCH 09/31] Phase 12: substitute LargeBuddyRange with BackendArenaRange MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Mechanical substitution of every LargeBuddyRange instantiation in the default in-tree range pipelines: - src/snmalloc/backend/standard_range.h (GlobalR, LargeObjectRange). - src/snmalloc/backend/meta_protected_range.h (GlobalR, CentralObjectRange, CentralMetaRange, the conditional_t huge-page cache, ObjectRange, MetaRange). 
After this change snmalloc uses the BackendArena bin-tree allocator instead of the power-of-two buddy for all large-range management in the default pipelines. LargeBuddyRange and BuddyChunkRep remain in the tree, available for alternative configurations. Two issues uncovered during Phase 12 testing and fixed here: 1. backend_arena.h: BackendArena::add_block's successor-min branch called Rep::can_consolidate(succ_addr) before contains_min(succ_addr) confirmed succ_addr is in our region. For a block added at the very top of a registered region (e.g. last 8 MiB of a 256 MiB fixed region), succ_addr = addr + size sits one chunk past the pagemap's mapped backing, and the can_consolidate probe segfaults. The fix reorders the checks so the tree-membership test gates the pagemap read, matching the documented pattern in buddy.h:90-93. Regression coverage: MockRep gains a per-chunk `boundary` field on `mock_entry`. `MockRep::can_consolidate(addr)` now returns `!mock_store[mock_index(addr)].boundary` — faithful to the real `PagemapRep::can_consolidate` reading `entry.is_boundary()`. The `mock_index` bounds assertion fires on any out-of-range probe, so the unsafe pattern trips in unit tests rather than only as a segfault in production. A new test_block_at_arena_top_edge adds a block whose succ_addr would address chunk MOCK_ARENA_CHUNKS; without the reorder this reproduces the original failure. This unification also subsumed the previous BoundaryMockRep and its boundary_addrs global std::set: the four boundary tests now run on Arena and set mock_store[mock_index(addr)].boundary = true instead. Net -35 lines in backend_arena.cc. 2. backend_arena_bins.h: the BinTable constexpr constructor used throw "..." as a constexpr-eval-fails trick to surface invariant violations as compile errors. throw requires exception support, which is disabled in the main allocator (-fno-exceptions), so this broke Phase 12 builds. 
Replaced with SNMALLOC_CHECK(false && "..."), which calls a non-constexpr error path and achieves the same compile-time failure without runtime exception machinery. Full ctest suite passes (86/86, --timeout 120 -j 4). Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- PLAN.md | 167 ++++++++++++++---- src/snmalloc/backend/meta_protected_range.h | 12 +- src/snmalloc/backend/standard_range.h | 4 +- src/snmalloc/backend_helpers/backend_arena.h | 10 +- .../backend_helpers/backend_arena_bins.h | 10 +- src/test/func/backend_arena/backend_arena.cc | 122 +++++++------ 6 files changed, 223 insertions(+), 102 deletions(-) diff --git a/PLAN.md b/PLAN.md index 7520631d3..e5df08956 100644 --- a/PLAN.md +++ b/PLAN.md @@ -1439,7 +1439,16 @@ boundary test passes. ### Phase 10: PagemapRep + BackendArenaRange + tests -**Status**: implemented and tested. +**Status**: implemented and tested. Committed in `9c1ca745`. + +> **Note**: the design notes below were written before Phase 10d +> (bytes-throughout). The as-built code uses byte sizes everywhere +> at the arena/range API and a unified `parent_dealloc(uintptr_t, +> size_t)` helper in place of the old `dealloc_overflow` / +> `parent_dealloc_range` pair. See the Phase 10d section for the +> current shape. Where the notes below say `size_chunks`, the +> implementation uses bytes; where they say `dealloc_overflow`, the +> implementation uses `parent_dealloc`. **Phase 10b refactor (also implemented):** `BackendArena` and `PagemapRep` were both retemplated to mirror `Buddy`'s 3-parameter shape: @@ -1764,6 +1773,18 @@ No in-tree code path is changed in this phase: the existing # Phase 12: Update backend to use BackendArenaRange +## Status: implementation complete, awaiting commit approval + +Substitution implemented and tested in the working tree (uncommitted on +top of `9c1ca745`). 
`BackendArena::add_block` had a latent +out-of-region pagemap-probe bug in its successor-min branch that +became reachable once `BackendArenaRange` started serving fixed-region +allocations; fixed in this phase (see "Issue found during Phase 12 +test run" below). Full ctest suite passes (86/86). + +Diff: 6 files, 183/45 +/- (PLAN.md, both pipeline range headers, +`backend_arena.h`, `backend_arena_bins.h`, `backend_arena.cc`). + ## Goal Replace every `LargeBuddyRange` instantiation in the range @@ -1787,9 +1808,12 @@ default pipeline wiring changes. ## Pre-conditions -- Phase 10 (BackendArenaRange) is committed and all its tests pass. -- Phase 11 (final review of Phases 9–10) is complete. +- Phase 10 (BackendArenaRange) is committed and all its tests pass + (commit `9c1ca745`). +- Phase 11 (final review of Phases 9–10) was waived by the user; + Phase 12 proceeds without it. - Baseline: the checkout builds and all tests pass before this change. + Recorded after 9c1ca745: 86/86 ctest passed, no warnings. ## Analysis of every LargeBuddyRange instantiation @@ -1809,8 +1833,8 @@ LargeBuddyRange **unaligned** on PALs without `AlignedAllocation` (e.g. Linux mmap) and aligned otherwise. `BackendArenaRange::refill` currently still carries the aligned/unaligned dual path inherited from - `LargeBuddyRange`; collapsing this into a single path is in scope for - Step 4 below. + `LargeBuddyRange`; collapsing this into a single path is deferred to + Phase 13. **2. LargeObjectRange (local cache)** ```cpp @@ -1820,11 +1844,12 @@ LargeBuddyRange - `MAX_SIZE_BITS = LocalCacheSizeBits = 21` (2 MiB). Non-global mode. Overflow goes to parent. -- `BackendArenaRange::dealloc_overflow` forwards directly to parent - without decomposition. Since the chunk-bit width - (`MAX_SIZE_BITS - MIN_CHUNK_BITS = 7` on 64-bit) is small, the arena - has at most 128 chunk slots — overflow can only produce one block of - exactly `1 << MAX_SIZE_BITS`. 
+- `BackendArenaRange::parent_dealloc` forwards directly to parent + without decomposition (single block returned by + `BackendArena::add_block` when consolidation reaches the arena-scale + upper bound). The size is a chunk multiple up to `2^MAX_SIZE_BITS`, + not necessarily power-of-two — the parent must accept arbitrary + chunk-multiple sizes. - Wrapped in `StaticConditionalRange` — no impact on the substitution. ### `meta_protected_range.h` @@ -1863,7 +1888,8 @@ stl::conditional_t< `MAX_SIZE_BITS = max_page_chunk_size_bits` (typically `page_size_bits` when page_size_bits > MIN_CHUNK_BITS, e.g. huge pages at 21 bits). -- Non-global mode. Overflow decomposed and passed to parent. +- Non-global mode. Overflow forwarded to parent as one consolidated + chunk-multiple block via `parent_dealloc`. **7. ObjectRange (local)** ```cpp @@ -1891,16 +1917,12 @@ template parameters, no API calls, no structural changes. ### Step 1: Replace LargeBuddyRange → BackendArenaRange In `src/snmalloc/backend/standard_range.h`: -- Line 32: `LargeBuddyRange<` → `BackendArenaRange<` -- Line 52: `LargeBuddyRange<` → `BackendArenaRange<` +- 2 instantiations of `LargeBuddyRange<` (GlobalR, LargeObjectRange). In `src/snmalloc/backend/meta_protected_range.h`: -- Line 35: `LargeBuddyRange<` → `BackendArenaRange<` -- Line 54: `LargeBuddyRange<` → `BackendArenaRange<` -- Line 71: `LargeBuddyRange<` → `BackendArenaRange<` -- Line 82: `LargeBuddyRange<` → `BackendArenaRange<` -- Line 93: `LargeBuddyRange<` → `BackendArenaRange<` -- Line 103: `LargeBuddyRange<` → `BackendArenaRange<` +- 6 instantiations of `LargeBuddyRange<` (GlobalR, CentralObjectRange, + CentralMetaRange, the `conditional_t` huge-page cache, + ObjectRange, MetaRange). ### Step 2: Verify include paths @@ -1923,10 +1945,66 @@ Both files include `"../backend/backend.h"` which includes **Test gate**: full `ctest` passes. No new tests needed — the existing test suite exercises the pipeline end-to-end. 
+### Issue found during Phase 12 test run: out-of-region pagemap probe + +`func-fixed_region_alloc-check` segfaulted in `PagemapRep::can_consolidate` +when `BackendArena::add_block` was called with a block whose +`succ_addr = addr + size` sat one chunk past the registered pagemap +range (the last 8 MiB of a 256 MiB FixedRange). The bug shape matches +the `buddy.h:90-93` comment exactly: `can_consolidate` reads the +pagemap entry at `succ_addr`, and that read is only safe once a +tree-membership test has confirmed the address is in our region. + +**Fix.** In `BackendArena::add_block`, the successor-min branch was +reordered so the tree-membership check (`contains_min(succ_addr)`) +short-circuits before the pagemap probe (`Rep::can_consolidate`). +All other can_consolidate call sites already had their preconditions +established (either `addr` is the input block, or the address was +returned from `range_tree.neighbours()` and is in the tree). + +**Regression coverage.** `MockRep` was extended with a per-chunk +`boundary` flag stored on `mock_entry`. `MockRep::can_consolidate(addr)` +now returns `!mock_store[mock_index(addr)].boundary` — faithful to the +real `PagemapRep::can_consolidate` reading `entry.is_boundary()`. The +`mock_index` bounds assertion fires on any out-of-range probe, so the +unsafe pattern trips in unit tests rather than only as a segfault in +production. A new test `test_block_at_arena_top_edge` adds a block +whose `succ_addr` sits one past the arena's pagemap; without the +reorder this test reproduces the original failure. + +This unification also subsumed the previous `BoundaryMockRep` and its +`boundary_addrs` global `std::set`: the four boundary tests +(`test_boundary_blocks_predecessor`, `test_boundary_blocks_successor`, +`test_boundary_partial`, `test_boundary_blocks_min_predecessor`) now +run on `Arena` and set `mock_store[mock_index(addr)].boundary = true` +instead. Net −35 lines in `backend_arena.cc`. 
+ +A leftover `throw "..."` in `backend_arena_bins.h:807` (used as a +constexpr-failure trick in the `BinTable` constructor) caused a build +failure in `-fno-exceptions` configurations during Phase 12. Replaced +with `SNMALLOC_CHECK(false && "...")`, which is non-constexpr and +fails compile-time evaluation the same way without requiring +exception support. + ### Step 4: Retire the `ParentRange::Aligned` concept -Once `BackendArenaRange` is the only large-range layer, the -`Aligned` template property loses most of its remaining use: +**Deferred to Phase 13.** Originally listed here but moved out for the +following reasons (rubber-duck review): +- It touches `LargeBuddyRange`, which Phase 12 explicitly keeps + available for alternative configurations / embedders. +- It changes the public range concept (every pass-through range loses + a static field) — a structural change, not a wiring change. +- It would split Phase 12 across an atomic-substitution commit and a + separate concept-cleanup commit anyway; better to make that split + explicit in the plan. + +Phase 12 ends after Step 3 with the test suite green. + +## Phase 13: Retire `ParentRange::Aligned` + +Once `BackendArenaRange` is the only large-range layer in the default +pipelines, the `Aligned` template property loses most of its remaining +use: - `BackendArenaRange::Aligned` is always `true` (the bin scheme guarantees size-aligned output for in-arena allocations). @@ -1935,7 +2013,7 @@ Once `BackendArenaRange` is the only large-range layer, the boundaries, so an unaligned parent no longer requires a separate refill path. -Plan: +Plan (deferred until after Phase 12 lands): 1. **Collapse `BackendArenaRange::refill` to a single path.** Drop the `if (ParentRange::Aligned)` branch. 
The unified path allocates @@ -1953,16 +2031,35 @@ Plan: branch is dead once the property goes away; replace with an unconditional delegate (sizes ≥ `2^MAX_SIZE_BITS` are always forwarded to the parent — alignment is no longer differentiated). -3. **Remove `Aligned` from the Range concept.** Once - `BackendArenaRange` and `SmallBuddyRange` no longer reference it, - drop the `static constexpr bool Aligned` field from every + **Hazard:** for a `BackendArenaRange` directly above an unaligned + parent (`PalRange` on PALs without `AlignedAllocation`), the + oversize delegation can return a non-size-aligned block while + `BackendArenaRange::Aligned` is `true`. Resolve at Phase 13 start + by either (i) routing oversize allocations through the same + over-allocate-and-trim path `refill` uses for unaligned parents, + or (ii) keeping an explicit alignment-preserving fallback for the + unaligned-parent case until in-tree pipelines no longer expose + that combination. +3. **Decide what to do with `LargeBuddyRange`'s use of `Aligned`.** + `LargeBuddyRange::refill` and oversize-fallback still consume + `ParentRange::Aligned` (`largebuddyrange.h:273`, `:357`). Either: + (a) leave `LargeBuddyRange` alone and keep the `Aligned` field on + pass-through ranges (Phase 13 then only collapses + `BackendArenaRange::refill` and oversize-fallback — minimal + surface change), or + (b) update `LargeBuddyRange` in the same phase to also stop + consulting `Aligned`. Option (a) is the smaller change and + preserves the embedder contract. Decide at the start of Phase 13. +4. **(Conditional on 3b.) Remove `Aligned` from the Range concept.** + Once neither `BackendArenaRange` nor `LargeBuddyRange` references + it, drop the `static constexpr bool Aligned` field from every pass-through range (`StatsRange`, `CommitRange`, `LockRange`, `IndirectRange`, `StaticRange`, `StaticConditionalRange`, `SubRange`, `LogRange`, `NopRange`, `PagemapRegisterRange`, - `PalRange`). 
The `pal_supports` query - itself remains for PALs that want to advertise the capability, - but the range stack no longer threads it through. -4. **Update `SmallBuddyRange`.** Drop the + `PalRange`, `EmptyRange`). The `pal_supports` + query itself remains for PALs that want to advertise the + capability, but the range stack no longer threads it through. +5. **Update `SmallBuddyRange`.** Drop the `static_assert(ParentRange::Aligned)` (its parent is always `BackendArenaRange` after Phase 12, which always provides aligned output by construction). @@ -1981,7 +2078,7 @@ test suite is the gate. 2. **Overflow behaviour.** `LargeBuddyRange::dealloc_overflow` returns a single block of exactly `1 << MAX_SIZE_BITS`. - `BackendArenaRange::dealloc_overflow` forwards a single block of the + `BackendArenaRange::parent_dealloc` forwards a single block of the consolidated size directly to the parent. The size can be any chunk multiple up to `2^MAX_SIZE_BITS`, not just power-of-two, but the parent (now itself a `BackendArenaRange` or pass-through layer) @@ -1992,6 +2089,16 @@ test suite is the gate. This works with `BackendArenaRange` because `dealloc_range` has the same signature and contract. +4. **Pagemap metadata footprint.** `BackendArenaRange` uses up to + three pagemap entries per free block (`backend_arena_range.h:12-17`) + — one at the base, one at `base + UNIT_SIZE`, one at + `base + 2*UNIT_SIZE`. `LargeBuddyRange`'s `BuddyChunkRep` only + touched the base entry. Pagemap registration covers every + `MIN_CHUNK_SIZE` stride for the full reserved address range + (`pagemap.h:60-65`), so this is safe in the in-tree pipeline, but + external embedders with custom Pagemap implementations should + verify their pagemap entries cover the per-unit stride. 
+ ## Resolved during plan review - `backend_arena_range.h` was missing `#include "empty_range.h"` for diff --git a/src/snmalloc/backend/meta_protected_range.h b/src/snmalloc/backend/meta_protected_range.h index 857e853d2..df0245beb 100644 --- a/src/snmalloc/backend/meta_protected_range.h +++ b/src/snmalloc/backend/meta_protected_range.h @@ -32,7 +32,7 @@ namespace snmalloc // Global range of memory using GlobalR = Pipe< Base, - LargeBuddyRange< + BackendArenaRange< GlobalCacheSizeBits, bits::BITS - 1, Pagemap, @@ -51,7 +51,7 @@ namespace snmalloc // would be able to corrupt meta-data. using CentralObjectRange = Pipe< GlobalR, - LargeBuddyRange, + BackendArenaRange, LogRange<3>, GlobalRange, CommitRange, @@ -67,7 +67,7 @@ namespace snmalloc GlobalR, SubRange, // Use SubRange to introduce guard // pages. - LargeBuddyRange< + BackendArenaRange< GlobalCacheSizeBits, bits::BITS - 1, Pagemap, @@ -77,7 +77,7 @@ namespace snmalloc // page, so commit in the global range. stl::conditional_t< (max_page_chunk_size_bits > MIN_CHUNK_BITS), - LargeBuddyRange< + BackendArenaRange< max_page_chunk_size_bits, max_page_chunk_size_bits, Pagemap, @@ -90,7 +90,7 @@ namespace snmalloc // Local caching of object range using ObjectRange = Pipe< CentralObjectRange, - LargeBuddyRange< + BackendArenaRange< LocalCacheSizeBits, LocalCacheSizeBits, Pagemap, @@ -100,7 +100,7 @@ namespace snmalloc // Local caching of meta-data range using MetaRange = Pipe< CentralMetaRange, - LargeBuddyRange< + BackendArenaRange< LocalCacheSizeBits - SubRangeRatioBits, bits::BITS - 1, Pagemap>, diff --git a/src/snmalloc/backend/standard_range.h b/src/snmalloc/backend/standard_range.h index 78609ed2d..2d9d5e961 100644 --- a/src/snmalloc/backend/standard_range.h +++ b/src/snmalloc/backend/standard_range.h @@ -29,7 +29,7 @@ namespace snmalloc // Global range of memory, expose this so can be filled by init. 
using GlobalR = Pipe< Base, - LargeBuddyRange< + BackendArenaRange< GlobalCacheSizeBits, bits::BITS - 1, Pagemap, @@ -49,7 +49,7 @@ namespace snmalloc // Use buddy allocators to cache locally. using LargeObjectRange = Pipe< Stats, - StaticConditionalRange addr && Rep::can_consolidate(succ_addr) && - contains_min(succ_addr)) + succ_addr > addr && contains_min(succ_addr) && + Rep::can_consolidate(succ_addr)) merge(succ_addr, UNIT_SIZE); // Arena-scale overflow: consolidated block spans the full arena. diff --git a/src/snmalloc/backend_helpers/backend_arena_bins.h b/src/snmalloc/backend_helpers/backend_arena_bins.h index b6d98e233..88445e372 100644 --- a/src/snmalloc/backend_helpers/backend_arena_bins.h +++ b/src/snmalloc/backend_helpers/backend_arena_bins.h @@ -800,11 +800,13 @@ namespace snmalloc // If this fires, `bin_subsets` violates the strict-chain // invariant: candidate `b`'s subset does not properly // contain candidate `b_next`'s, so the cascade can't be - // expressed as single-mantissa probes. `throw` makes the - // constexpr evaluation non-constant and surfaces the - // violation as a compile error. + // expressed as single-mantissa probes. Calling the + // non-constexpr `SNMALLOC_CHECK` makes the constexpr + // evaluation non-constant and surfaces the violation as + // a compile error. if (discrim_set == 0) - throw "bin_subsets violates strict-chain invariant"; + SNMALLOC_CHECK( + false && "bin_subsets violates strict-chain invariant"); cascade_steps[m_top][i].m_test = bits::ctz_const(discrim_set); cascade_steps[m_top][i].bin = b; } diff --git a/src/test/func/backend_arena/backend_arena.cc b/src/test/func/backend_arena/backend_arena.cc index 242b984f7..684783a7c 100644 --- a/src/test/func/backend_arena/backend_arena.cc +++ b/src/test/func/backend_arena/backend_arena.cc @@ -64,7 +64,9 @@ namespace snmalloc // Each chunk-aligned address maps to a mock_entry via its chunk index. 
// word1/word2 hold bin-tree children; range_word1/range_word2 hold - // range-tree children. variant and large_size hold metadata. + // range-tree children. variant and large_size hold metadata. boundary + // mirrors the real PagemapRep's entry.is_boundary() — set it on a + // chunk to suppress consolidation across that chunk. struct mock_entry { uintptr_t word1{0}; @@ -73,6 +75,7 @@ namespace snmalloc uintptr_t range_word2{0}; BackendArenaVariant variant{BackendArenaVariant::Min}; size_t large_size{0}; + bool boundary{false}; }; // Size the array for the largest test arena + trailing room. @@ -196,9 +199,17 @@ namespace snmalloc mock_store[mock_index(addr)].large_size = s; } - static bool can_consolidate(uintptr_t) + // Mirrors PagemapRep::can_consolidate, which reads + // entry.is_boundary() from the pagemap. The boundary flag lives + // per-chunk in mock_store; mock_index asserts the index is in + // range, so any caller that probes outside the arena trips the + // assertion — this catches the buddy.h:90-93 unsafe-probe pattern + // (calling can_consolidate before confirming the address is in + // our region) in BackendArena unit tests rather than as a runtime + // segfault in production builds. 
+ static bool can_consolidate(uintptr_t addr) { - return true; + return !mock_store[mock_index(addr)].boundary; } }; @@ -387,7 +398,8 @@ namespace snmalloc for (auto& b : blocks) { - auto result = arena.add_block(chunk_addr(b.chunk_idx), chunk_size(b.size)); + auto result = + arena.add_block(chunk_addr(b.chunk_idx), chunk_size(b.size)); SNMALLOC_ASSERT(result.first == 0 && result.second == 0); UNUSED(result); arena.check_invariant(true); @@ -473,7 +485,8 @@ namespace snmalloc static void add_and_check(ArenaT& arena, size_t chunk_idx, size_t size_in_chunks) { - auto result = arena.add_block(chunk_addr(chunk_idx), chunk_size(size_in_chunks)); + auto result = + arena.add_block(chunk_addr(chunk_idx), chunk_size(size_in_chunks)); SNMALLOC_ASSERT(result.first == 0 && result.second == 0); UNUSED(result); arena.check_invariant(true); @@ -1023,7 +1036,8 @@ namespace snmalloc for (size_t j = start; j < start + size; j++) allocated[j] = false; - auto result = arena.add_block(chunk_addr(BASE + start), chunk_size(size)); + auto result = + arena.add_block(chunk_addr(BASE + start), chunk_size(size)); oracle.add(start, size); if (result.first != 0) @@ -1225,7 +1239,8 @@ namespace snmalloc for (size_t j = start; j < start + size; j++) owner[j] = my_id; - auto result = arena.add_block(chunk_addr(BASE + start), chunk_size(size)); + auto result = + arena.add_block(chunk_addr(BASE + start), chunk_size(size)); oracle.add(start, size); if (result.first != 0) @@ -1334,58 +1349,23 @@ namespace snmalloc // ================================================================== // (J) Boundary consolidation prevention // ================================================================== - - // A Rep variant that blocks consolidation at specific addresses. 
- static std::set boundary_addrs; - - struct BoundaryMockRep - { - using BinRep = MockRep::BinRep; - using RangeRep = MockRep::RangeRep; - - static BackendArenaVariant get_variant(uintptr_t addr) - { - return MockRep::get_variant(addr); - } - - static void set_variant(uintptr_t addr, BackendArenaVariant v) - { - MockRep::set_variant(addr, v); - } - - static size_t get_large_size(uintptr_t addr) - { - return MockRep::get_large_size(addr); - } - - static void set_large_size(uintptr_t addr, size_t s) - { - MockRep::set_large_size(addr, s); - } - - static bool can_consolidate(uintptr_t higher_addr) - { - return boundary_addrs.find(higher_addr) == boundary_addrs.end(); - } - }; - - template - using BoundaryArena = - BackendArena; + // + // The boundary field on mock_entry suppresses consolidation across + // that chunk; MockRep::can_consolidate reads it. This mirrors the + // real PagemapRep::can_consolidate reading entry.is_boundary(). // Test: predecessor merge blocked by boundary. static void test_boundary_blocks_predecessor() { reset_mock_store(); - boundary_addrs.clear(); constexpr size_t K = 6; - BoundaryArena arena; + Arena arena; uintptr_t p_addr = chunk_addr(2); uintptr_t a_addr = chunk_addr(4); // Place a boundary at a_addr — blocks should not consolidate leftward. - boundary_addrs.insert(a_addr); + mock_store[mock_index(a_addr)].boundary = true; arena.add_block(p_addr, chunk_size(2)); arena.add_block(a_addr, chunk_size(2)); @@ -1404,15 +1384,14 @@ namespace snmalloc static void test_boundary_blocks_successor() { reset_mock_store(); - boundary_addrs.clear(); constexpr size_t K = 6; - BoundaryArena arena; + Arena arena; uintptr_t a_addr = chunk_addr(2); uintptr_t s_addr = chunk_addr(4); // Place a boundary at s_addr — blocks should not consolidate rightward. 
- boundary_addrs.insert(s_addr); + mock_store[mock_index(s_addr)].boundary = true; arena.add_block(s_addr, chunk_size(4)); arena.add_block(a_addr, chunk_size(2)); @@ -1431,14 +1410,13 @@ namespace snmalloc static void test_boundary_partial() { reset_mock_store(); - boundary_addrs.clear(); constexpr size_t K = 6; - BoundaryArena arena; + Arena arena; // Three adjacent blocks: chunks [4,6), [6,8), [8,10). // Boundary at chunk 8 blocks [6,8) ↔ [8,10) merge but allows // [4,6) ↔ [6,8) merge into a 4-aligned block at chunk 4. - boundary_addrs.insert(chunk_addr(8)); + mock_store[mock_index(chunk_addr(8))].boundary = true; arena.add_block(chunk_addr(4), chunk_size(2)); arena.add_block(chunk_addr(8), chunk_size(2)); @@ -1454,21 +1432,48 @@ namespace snmalloc printf(" Boundary partial (P merges, S blocked): OK\n"); } + // Regression test: a block whose successor address sits one past + // the arena's pagemap must not trigger a can_consolidate probe of + // that out-of-range chunk. The fix is in BackendArena::add_block — + // tree-membership tests gate the can_consolidate read. MockRep's + // can_consolidate now dereferences mock_store via mock_index, which + // asserts on out-of-range indices, so an unguarded probe in + // add_block trips here rather than only as a segfault in production + // builds. + static void test_block_at_arena_top_edge() + { + reset_mock_store(); + constexpr size_t K = 10; + Arena arena; + constexpr size_t ARENA_CHUNKS = size_t{1} << K; + + // Block ending at the very top of the arena (succ_addr would + // address chunk ARENA_CHUNKS, one past mock_store). + uintptr_t top_addr = chunk_addr(ARENA_CHUNKS - 4); + arena.add_block(top_addr, chunk_size(4)); + arena.check_invariant(true); + + auto r1 = arena.remove_block(chunk_size(4)); + SNMALLOC_ASSERT(r1 == top_addr); + + printf(" Block at arena top edge: OK\n"); + } + // Test: min-size predecessor blocked by boundary. 
static void test_boundary_blocks_min_predecessor() { reset_mock_store(); - boundary_addrs.clear(); constexpr size_t K = 6; - BoundaryArena arena; + Arena arena; uintptr_t p_addr = chunk_addr(4); uintptr_t a_addr = chunk_addr(5); - boundary_addrs.insert(a_addr); + mock_store[mock_index(a_addr)].boundary = true; arena.add_block(p_addr, chunk_size(1)); // min-size block - arena.add_block(a_addr, chunk_size(1)); // adjacent, but boundary prevents merge + arena.add_block( + a_addr, chunk_size(1)); // adjacent, but boundary prevents merge auto r1_addr = arena.remove_block(chunk_size(1)); auto r2_addr = arena.remove_block(chunk_size(1)); @@ -1539,6 +1544,7 @@ int main() snmalloc::test_boundary_blocks_predecessor(); snmalloc::test_boundary_blocks_successor(); snmalloc::test_boundary_partial(); + snmalloc::test_block_at_arena_top_edge(); snmalloc::test_boundary_blocks_min_predecessor(); printf("All BackendArena tests passed.\n"); From 63a2be03a82a61aace660d3e963294a45fa05b7e Mon Sep 17 00:00:00 2001 From: Matthew Parkinson Date: Sat, 23 May 2026 11:10:13 +0100 Subject: [PATCH 10/31] PLAN.md: drop Phase 13 (Retire ParentRange::Aligned) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit On closer inspection at Phase 13 start, the two-path conditional in BackendArenaRange::refill turns out to be load-bearing rather than vestigial: - Aligned-parent path serves caller sizes up to (1 << MAX_SIZE_BITS) - 1; unaligned-parent path caps at ~REFILL_SIZE / 2 because of its while (needed_size <= refill_size) guard. Unifying on the unaligned strategy reduces capability for aligned-parent configs. - The aligned-parent carve shortcut is precise, not a perf optimisation: it hands the caller's `size` bytes back directly and calls add_range with refill_size - size, which is strictly less than refill_size and so satisfies add_block's size < 2^MAX_SIZE_BITS precondition even when REFILL_SIZE_BITS == MAX_SIZE_BITS (the LargeObjectRange config). 
A unified "add the whole refill then recurse" path violates that precondition for the same config, and the workarounds (cut LocalCacheSizeBits by 1, or bump MAX_SIZE_BITS by 1) carry real cost for no behavioural win. - LargeBuddyRange would still consume Aligned under the agreed-minimal (a)+(ii) scope, so the Aligned field's footprint in pass-through ranges doesn't shrink — defeating the only structural-cleanup motivation. The BackendArena refactor (Phases 1-12) ends with Phase 12. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- PLAN.md | 102 ++++++++++++++++++++------------------------------------ 1 file changed, 36 insertions(+), 66 deletions(-) diff --git a/PLAN.md b/PLAN.md index e5df08956..570ecd6ea 100644 --- a/PLAN.md +++ b/PLAN.md @@ -2000,72 +2000,42 @@ following reasons (rubber-duck review): Phase 12 ends after Step 3 with the test suite green. -## Phase 13: Retire `ParentRange::Aligned` - -Once `BackendArenaRange` is the only large-range layer in the default -pipelines, the `Aligned` template property loses most of its remaining -use: - -- `BackendArenaRange::Aligned` is always `true` (the bin scheme - guarantees size-aligned output for in-arena allocations). -- `BackendArenaRange::add_range` already trims arbitrary - (page-aligned-but-not-chunk-aligned) parent input to chunk - boundaries, so an unaligned parent no longer requires a separate - refill path. - -Plan (deferred until after Phase 12 lands): - -1. **Collapse `BackendArenaRange::refill` to a single path.** Drop the - `if (ParentRange::Aligned)` branch. The unified path allocates - `refill_size` from the parent, places the chunk-aligned remainder - into the arena via `add_range` (which already trims for unaligned - parents), then recursively calls `alloc_range(size)` to obtain the - size-aligned chunk for the caller. 
The refill-size accounting must - keep `refill_size < 2^MAX_SIZE_BITS` so the assertion in - `add_block` holds for the whole-refill `add_range` call; configs - where `REFILL_SIZE_BITS == MAX_SIZE_BITS` (the local-cache - configurations) need either a one-bit refill-size reduction or a - `MAX_SIZE_BITS` bump. -2. **Drop the oversize-fallback alignment check.** In `alloc_range`, - the `if (ParentRange::Aligned) return parent.alloc_range(size);` - branch is dead once the property goes away; replace with an - unconditional delegate (sizes ≥ `2^MAX_SIZE_BITS` are always - forwarded to the parent — alignment is no longer differentiated). - **Hazard:** for a `BackendArenaRange` directly above an unaligned - parent (`PalRange` on PALs without `AlignedAllocation`), the - oversize delegation can return a non-size-aligned block while - `BackendArenaRange::Aligned` is `true`. Resolve at Phase 13 start - by either (i) routing oversize allocations through the same - over-allocate-and-trim path `refill` uses for unaligned parents, - or (ii) keeping an explicit alignment-preserving fallback for the - unaligned-parent case until in-tree pipelines no longer expose - that combination. -3. **Decide what to do with `LargeBuddyRange`'s use of `Aligned`.** - `LargeBuddyRange::refill` and oversize-fallback still consume - `ParentRange::Aligned` (`largebuddyrange.h:273`, `:357`). Either: - (a) leave `LargeBuddyRange` alone and keep the `Aligned` field on - pass-through ranges (Phase 13 then only collapses - `BackendArenaRange::refill` and oversize-fallback — minimal - surface change), or - (b) update `LargeBuddyRange` in the same phase to also stop - consulting `Aligned`. Option (a) is the smaller change and - preserves the embedder contract. Decide at the start of Phase 13. -4. **(Conditional on 3b.) 
Remove `Aligned` from the Range concept.** - Once neither `BackendArenaRange` nor `LargeBuddyRange` references - it, drop the `static constexpr bool Aligned` field from every - pass-through range (`StatsRange`, `CommitRange`, `LockRange`, - `IndirectRange`, `StaticRange`, `StaticConditionalRange`, - `SubRange`, `LogRange`, `NopRange`, `PagemapRegisterRange`, - `PalRange`, `EmptyRange`). The `pal_supports` - query itself remains for PALs that want to advertise the - capability, but the range stack no longer threads it through. -5. **Update `SmallBuddyRange`.** Drop the - `static_assert(ParentRange::Aligned)` (its parent is always - `BackendArenaRange` after Phase 12, which always provides aligned - output by construction). - -This is a structural simplification, not a behavioural change — the -test suite is the gate. +## Phase 13: Retire `ParentRange::Aligned` — DROPPED + +**Status: dropped on review.** Phase 13 was deferred from Phase 12 with +the intent of collapsing `BackendArenaRange::refill`'s two-path +conditional and (optionally) removing `ParentRange::Aligned` from the +range concept. Closer inspection of the existing code found the +conditional is load-bearing, not vestigial: + +- **The two paths give different capabilities, not just different + efficiencies.** The aligned-parent path serves caller sizes up to + `(1 << MAX_SIZE_BITS) - 1`. The unaligned-parent path's + `while (needed_size <= refill_size)` guard caps caller size at + ~`REFILL_SIZE / 2`. Unifying on the unaligned strategy reduces + capability for aligned-parent configs. 
+ +- **The aligned-parent path's carve shortcut is precise, not a perf + optimisation.** It hands the caller's `size` bytes back directly + and calls `add_range(refill + size, refill_size - size)` — + passing `refill_size - size` (strictly less than `refill_size`) + to `add_block`, which satisfies `add_block`'s + `size < 2^MAX_SIZE_BITS` precondition even when + `REFILL_SIZE_BITS == MAX_SIZE_BITS` (the `LargeObjectRange` config + in `standard_range.h:52-56`). A unified "add the whole refill then + recurse" path violates that precondition for the same config. + +- **The proposed "fix" for the precondition has real cost.** Either + cut `LocalCacheSizeBits` by 1 (half the per-local cache) or bump + `MAX_SIZE_BITS` by 1 (double the local arena's internal state), + for no behavioural win. + +- **`LargeBuddyRange` would still consume `Aligned`** under the + agreed-minimal (a)+(ii) scope, so the field's footprint in + pass-through ranges doesn't shrink — defeating the only + structural-cleanup motivation. + +The BackendArena refactor (Phases 1–12) ends with Phase 12. No Phase 13. ## Risks From b7abca0a1f3b20e875a59e949084c1ddefb498d6 Mon Sep 17 00:00:00 2001 From: Matthew Parkinson Date: Mon, 25 May 2026 14:43:34 +0100 Subject: [PATCH 11/31] Phase 13: uniform sizeclass encoding (small + large) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Replaces the tagged small/large encoding and the leading-zero-count large-class indexing with a single uniform exp+mantissa scheme: value == 0 : unmapped sentinel value in [1, 1 + NUM_SMALL_SIZECLASSES) : small (sc = value - 1) value in [1 + NUM_SMALL_SIZECLASSES, 1 + NUM_SMALL_SIZECLASSES + NUM_LARGE_CLASSES) : large (lc = ...) Small classes use `from_exp_mant(sc)` (unchanged). Large classes continue the same exp+mantissa namespace as `from_exp_mant(NUM_SMALL_SIZECLASSES + lc)`. 
The discriminator tag bit is gone — small and large share one contiguous index space — and the sentinel slot 0 lets the size-lookup fast path return 0 / 0 for unmapped pointers without a branch. The `SIZECLASS_REP_SIZE` / `REMOTE_BACKEND_MARKER` / `REMOTE_MIN_ALIGN` chain is re-derived from the new `SIZECLASS_BITS` (renamed from `TAG_SIZECLASS_BITS`); RED_BIT / VARIANT_SHIFT / LARGE_SIZE_SHIFT in `backend_arena_range.h` and RED_BIT in `largebuddyrange.h` derive from the new public `MetaEntryBase::BACKEND_LAYOUT_FIRST_FREE_BIT` so future widenings propagate automatically. A new `MAX_LARGE_SIZECLASS_SIZE` constant gates user-supplied sizes at the API boundary (`alloc_not_small`, `round_size`, `check_size`, `rust_realloc`) — replacing the loose `> 2^63` bound. `ENCODED_ADDRESS_BITS` caps the encoding at `BITS - 1` so the constant survives 32-bit platforms where `DefaultPal::address_bits == BITS`. The pre-Phase-13 `large_size_to_chunk_sizeclass` helper is removed — its `+NUM_SMALL_SIZECLASSES` / `-NUM_SMALL_SIZECLASSES` round-trip through an `lc` index cancels in the uniform scheme, so `size_to_sizeclass_full`'s large branch inlines the `to_exp_mant` directly. Front-end semantics are unchanged: `large_size_to_chunk_size` still returns `next_pow2(size)` and the front end still reserves pow2 chunk sizes. The non-pow2 large sizeclasses exist in `sizeclass_metadata` (with `slab_mask = info.align - 1`) but are unreachable from `size_to_sizeclass_full` until Phase 15 drops the `next_pow2` rounding. Tests: - `sizeclass.cc`: sentinel sanity, raw-value adjacency, range disjoint, large monotonicity, pow2 round-trip, non-pow2 rounds up. - `rounding.cc`: extends to pow2 large sizeclasses, verifying `index_in_object` / `is_start_of_object` at representative offsets. - `cheri.cc`: large-class verification loop bound updated to `NUM_LARGE_CLASSES`. - Loop bounds in tests use `ENCODED_ADDRESS_BITS` to avoid `bits::one_at_bit(BITS)` UB on 32-bit. ctest: 86/86 passing. 
Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- PLAN.md | 1045 ++++++++++++++++- .../backend_helpers/backend_arena_bins.h | 11 +- .../backend_helpers/backend_arena_range.h | 34 +- .../backend_helpers/largebuddyrange.h | 3 +- src/snmalloc/ds/sizeclasstable.h | 177 ++- src/snmalloc/global/globalalloc.h | 13 + src/snmalloc/mem/corealloc.h | 10 +- src/snmalloc/mem/metadata.h | 17 +- src/snmalloc/override/rust.cc | 4 +- .../backend_arena_bins/backend_arena_bins.cc | 50 +- src/test/func/cheri/cheri.cc | 2 +- src/test/func/release-rounding/rounding.cc | 42 + src/test/func/sizeclass/sizeclass.cc | 113 ++ 13 files changed, 1405 insertions(+), 116 deletions(-) diff --git a/PLAN.md b/PLAN.md index 570ecd6ea..b3174911f 100644 --- a/PLAN.md +++ b/PLAN.md @@ -2000,7 +2000,7 @@ following reasons (rubber-duck review): Phase 12 ends after Step 3 with the test suite green. -## Phase 13: Retire `ParentRange::Aligned` — DROPPED +## Investigated and dropped: Retire `ParentRange::Aligned` **Status: dropped on review.** Phase 13 was deferred from Phase 12 with the intent of collapsing `BackendArenaRange::refill`'s two-path @@ -2089,3 +2089,1046 @@ The BackendArena refactor (Phases 1–12) ends with Phase 12. No Phase 13. generalisation phase). - Performance benchmarking (separate task). - Any front-end changes. + +# Phase 13: Uniform exp+mantissa sizeclass encoding + +## Goal + +Replace the large-sizeclass encoding `from_large_class(clz(size - 1))` +(which can only represent powers of two) with the same exp+mantissa +scheme small classes already use. After this phase, **every** size +class — small or large — is represented as +`bits::from_exp_mant<INTERMEDIATE_BITS, MIN_ALLOC_STEP_BITS>(global_index)`, +where `global_index` is a single continuous index that runs across +small AND large. So a single uniform table accessor works across the +whole range, and the large table is the natural continuation of the +small one.
+ +Specifically: +- Small class `sc ∈ [0, NUM_SMALL_SIZECLASSES)` corresponds to + `from_exp_mant<INTERMEDIATE_BITS, MIN_ALLOC_STEP_BITS>(sc)` + (unchanged from today's small encoding in `sizeclassstatic.h:62-64`). +- Large class `lc ∈ [0, NUM_LARGE_CLASSES)` corresponds to + `from_exp_mant<INTERMEDIATE_BITS, MIN_ALLOC_STEP_BITS>(NUM_SMALL_SIZECLASSES + lc)`. + +This is the "continuation of the small exp+mantissa", NOT a separate +exp+mantissa space starting at `MAX_SMALL_SIZECLASS_BITS`. Adjacent +classes step by `2^(E - INTERMEDIATE_BITS)` continuously, with no +jump at the small/large boundary. + +No production behaviour changes yet: the front-end still calls +`large_size_to_chunk_size(size) = next_pow2(size)` and writes the +pagemap with the corresponding pow2-rounded sizeclass. The non-pow2 +large sizeclasses are **populated in the table** (so the size / +slab_mask metadata is correct should any code path query them) but +are **unreachable** from `size_to_sizeclass_full` and +`large_size_to_chunk_size` until Phase 15. + +That means: +- `size_to_sizeclass_full(non_pow2_large_size)` must continue to + return the pow2-rounded sizeclass (the one whose + `sizeclass_full_to_size` equals `next_pow2(size)`). Phase 15 + changes this to return the exp+mantissa-rounded sizeclass. +- `large_size_to_chunk_size(size)` continues to return + `next_pow2(size)`. Phase 15 changes this to return + `sizeclass_full_to_size(size_to_sizeclass_full(size))`. + +The two functions stay in lock-step: the front-end's reservation +size and the pagemap-recorded sizeclass must agree, or `dealloc_chunk` +gets the wrong size. Phase 15 changes both together. + +Phase 13 lays the encoding ground; Phase 14 adds the per-chunk +offset; Phase 15 flips the front-end. + +## Why now + +- The large-class table is currently indexed by leading-zero count, + which has exactly one entry per power-of-two size — fundamentally + pow2-only.
+- Switching to exp+mantissa multiplies the large-class count by + `1 << INTERMEDIATE_BITS = 4` (default), taking the default + (`MAX_SMALL_SIZECLASS_BITS=16`, `address_bits=48`) from 32 large + entries to 128. +- Once small and large share the same exp+mantissa scheme, the + small/large tag bit in `sizeclass_t` becomes redundant: the two + ranges can live in a single contiguous index space, and + `is_small()` becomes `value < 1 + NUM_SMALL_SIZECLASSES` instead + of `(value & TAG) != 0`. This drops one bit from + `SIZECLASS_REP_SIZE`, undoing roughly half of the alignment + cascade widening that Phase 13 would otherwise cause. + +(For the default config, `NUM_SMALL_SIZECLASSES = 44`, defined as +`size_to_sizeclass_const(MAX_SMALL_SIZECLASS_SIZE) + 1` in +`sizeclassstatic.h:53-54`. All numeric examples below use the +symbol `NUM_SMALL_SIZECLASSES` for the count, with `44` as the +default-config concrete value.) + +## Uniform (untagged) sizeclass encoding + +Today `sizeclass_t::value` packs `[small: TAG | sc]` and +`[large: large_class]`, with a discriminator bit at position +`TAG_SIZECLASS_BITS`. Width consumed in the pagemap word = +`TAG_SIZECLASS_BITS + 1`. + +After Phase 13 the discriminator is unnecessary because the small +and large ranges are both exp+mantissa-indexed and can sit in a +single contiguous index space. + +### Index 0 is reserved as the unmapped sentinel + +`value == 0` MUST remain the "default / unmapped" sentinel. An +unmapped pagemap entry reads as all-zero, and the size-lookup +machinery relies on `sizeclass == 0 ⇒ size == 0` to safely answer +`malloc_usable_size` / `remaining_bytes` queries on +not-an-allocation pointers without branching on validity. 
+ +So the uniform layout reserves index 0 and shifts everything up by +one: + +``` +0 -> unmapped sentinel +[1, 1 + NUM_SMALL_SIZECLASSES) -> small (sc index = value - 1) +[1 + NUM_SMALL_SIZECLASSES, + 1 + NUM_SMALL_SIZECLASSES + NUM_LARGE_CLASSES) -> large (lc index = value - 1 - NUM_SMALL) +``` + +Width consumed = `next_pow2_bits_const(1 + NUM_SMALL_SIZECLASSES + +NUM_LARGE_CLASSES)`. For the default config: +`next_pow2_bits_const(1 + 44 + 128) = next_pow2_bits_const(173) = 8`. +The tagged scheme today (without the same uniform shift) would need +`max(6, 8) + 1 = 9`. **One bit saved** +(`SIZECLASS_REP_SIZE = 256`, `REMOTE_MIN_ALIGN = 512`). + +### Table padding to avoid the subtract + +The shift introduces a `- 1` on the size/metadata-lookup hot path +(`sizeclass_metadata[value - 1]`). We pay that on every dealloc. + +Cheaper option: pad the table by one slot at index 0 (a dummy +"default" entry whose `size` is 0 and whose `slab_mask` is 0). Then +the lookup is `sizeclass_metadata[value]` with no subtract — and +querying the sentinel returns "size 0 / slab_mask 0" naturally, +which is the answer the existing API wants for unmapped pointers. + +Cost: one wasted slot per table indexed by `sizeclass_t::raw()`. +Inspect each such table in `sizeclasstable.h`: +- `sizeclass_metadata` ModArray (`sizeclass_data_fast` / + `sizeclass_data_slow`): pad slot 0 with zeros. Worth it (hot + path). +- `sizeclass_compress_t` reverse lookups, if any: pad similarly. +- `ChunkSizeMetadata` (if indexed by raw value): inspect first. + +The wasted-slot cost is `sizeof(slot) * num_tables` ≈ tens to +hundreds of bytes — negligible compared to the hot-path subtract +saved. + +### `sizeclass_t` accessors + +- `from_small_class(sc) { return {sc + 1}; }` +- `from_large_class(lc) { return {1 + NUM_SMALL_SIZECLASSES + lc}; }` +- `as_small() { return value - 1; }` (asserts `is_small()`). 
+- `as_large() { return value - 1 - NUM_SMALL_SIZECLASSES; }` +- `is_small() { return value < 1 + NUM_SMALL_SIZECLASSES && + value != 0; }` — but in practice + `is_small` is only meaningful for non-sentinel values; the + default-sentinel case is filtered upstream. Simplification: + `is_small() { return value - 1 < NUM_SMALL_SIZECLASSES; }` + (works because for `value == 0`, `value - 1` underflows to + SIZE_MAX which is ≥ NUM_SMALL_SIZECLASSES, so returns false — + matching today's semantics where `sizeclass_t{}` is not "small"). +- `is_default() { return value == 0; }` — unchanged. +- `raw()` returns the new shifted value — audit all callers (see + Risks). + +## RemoteAllocator alignment chain + +Verified by inspection of `sizeclasstable.h:18-33` and +`metadata.h:16-45`: + +``` +SIZECLASS_BITS = next_pow2_bits_const( + 1 + + NUM_SMALL_SIZECLASSES + + NUM_LARGE_CLASSES) + (uniform encoding; no separate tag bit; + index 0 reserved as the unmapped sentinel) + (= 8 after Phase 13 with + INTERMEDIATE_BITS=2 large-classes) + -> SIZECLASS_REP_SIZE = 1 << SIZECLASS_BITS + (= 256 after Phase 13) + (used as ModArray length for sizeclass_metadata; the + encoding occupies bits 0..SIZECLASS_BITS-1) + -> REMOTE_MIN_ALIGN = max(CACHELINE_SIZE, SIZECLASS_REP_SIZE) << 1 + (= 512 after Phase 13) + -> REMOTE_BACKEND_MARKER currently hard-coded `1 << 7`. After + Phase 13, MUST be derived: + `static constexpr address_t REMOTE_BACKEND_MARKER = + SIZECLASS_REP_SIZE;` + (= bit 8 after Phase 13). + -> BACKEND_RESERVED_MASK = (REMOTE_BACKEND_MARKER << 1) - 1 + (= 0x1FF after Phase 13 — low 9 bits reserved for backend.) +``` + +NOTE: the `+ 1` previously in `SIZECLASS_REP_SIZE = 1 << +(TAG_SIZECLASS_BITS + 1)` is dropped — that `+ 1` was for the +small/large tag bit which uniform encoding eliminates. (Phase 13 also +renames the constant to `SIZECLASS_BITS` since the encoding no longer +carries a separate tag.) 
The static-assert in `metadata.h:64-67` that +enforces `REMOTE_BACKEND_MARKER == SIZECLASS_REP_SIZE` continues to +hold by construction. + +**Concrete instances of `RemoteAllocator`** (all places where the +alignment cost lands): + +- Inline `RemoteAllocator` inside `CoreAllocator` — one per + allocator (`corealloc.h:155-159`), allocated from the meta range. +- `unused_remote` BSS global (`commonconfig.h:120`) — once, static. +- Stack-local instances in tests `sandbox.cc:162`, + `domestication.cc` — test-only. + +No other code constrains `RemoteAllocator` alignment. + +**Cost of widening `REMOTE_MIN_ALIGN` (256 → 512 in this phase):** at +most ~256 bytes additional padding per CoreAllocator / +`unused_remote` — under 1 KB total, paid once, not per allocation. +Cheap. + +## Cascading bit-layout changes elsewhere + +`BACKEND_RESERVED_MASK` widens from `0xFF` (8 bits) to `0x1FF` (9 +bits). All backend-side `PagemapRep` layouts that sit immediately +above the reserved range must shift up by +`new_marker_pos - old_marker_pos = log2(REMOTE_BACKEND_MARKER) - 7`. + +Verified consumers of `BACKEND_RESERVED_MASK` / bits immediately +above bit 7: + +- `backend_arena_range.h:42-50`: `RED_BIT_POS = 8`, + `VARIANT_SHIFT = 9`, `LARGE_SIZE_SHIFT = 8`. Today these sit at + bits 8/9-10/8. After Phase 13 they shift to bits 9/10-11/9. The + `static_assert(Entry::is_backend_allowed_value(...))` at + `backend_arena_range.h:64-66` catches any miss at compile time. +- `backend_helpers/largebuddyrange.h:40-46`: `BuddyChunkRep` + `RED_BIT = 1 << 8`. Same shift required. (The plan previously + said "`backend_helpers/buddy.h`" — corrected. Grep `RED_BIT` to + confirm no other site.) + +The plan does the shift in terms of an existing or new constant +(e.g. `BACKEND_LAYOUT_FIRST_FREE_BIT = log2(REMOTE_BACKEND_MARKER) ++ 1`) rather than hard-coding new bit numbers — so a future widening +auto-propagates. 
+ +## Changes + +### `src/snmalloc/ds/sizeclasstable.h` + +- `NUM_LARGE_CLASSES`: redefine as + `(address_bits - MAX_SMALL_SIZECLASS_BITS) << INTERMEDIATE_BITS` + so it tracks the exp+mantissa scheme. Update the comment. +- `SIZECLASS_BITS` (renamed from `TAG_SIZECLASS_BITS` — the encoding + no longer carries a separate tag): redefine as + `next_pow2_bits_const(1 + NUM_SMALL_SIZECLASSES + NUM_LARGE_CLASSES)`. + The `+ 1` reserves index 0 as the unmapped sentinel. +- `SIZECLASS_REP_SIZE`: redefine as `1 << SIZECLASS_BITS` (drop + the `+ 1` that came from the tag bit). +- `sizeclass_t`: rewrite the encoding per "Uniform (untagged) + sizeclass encoding" (small at `value = sc + 1`, large at + `value = 1 + NUM_SMALL_SIZECLASSES + lc`, index 0 is the + default sentinel). Drop the `TAG` constant. Convert all accessors + (`from_small_class`, `from_large_class`, `as_small`, `as_large`, + `is_small`, `index`). Default-construction (`sizeclass_t{}`) and + `is_default()` keep their `value == 0` semantics. +- Audit callers of `sizeclass_t::raw()` — the raw value's meaning + has changed (no tag bit, shifted by 1). Most callers use it as a + `ModArray` index into `sizeclass_metadata`, which still works + with the size-0 padding slot. +- `sizeclass_metadata` ModArray (`sizeclass_data_fast` and + `sizeclass_data_slow`): pad slot 0 with zero-initialised entries + (`size = 0`, `slab_mask = 0`, all other fields zero). A + `static_assert` after construction enforces this. +- `sizeclass_metadata` constructor: rewrite as a single contiguous + loop over `global_index ∈ [0, NUM_SMALL_SIZECLASSES + + NUM_LARGE_CLASSES)`, writing slot `global_index + 1` with the + size derived from + `bits::from_exp_mant<INTERMEDIATE_BITS, MIN_ALLOC_STEP_BITS>(global_index)`. + Small-specific fields populate only for indices < NUM_SMALL; + large-specific fields populate only for indices ≥ NUM_SMALL.
The + current split between small (lines ~190-225) and large + (lines 226-238) loops collapses into one, eliminating the + pow2-vs-exp+mantissa mismatch the old large loop had. +- `large_size_to_chunk_size(size)`: **semantics unchanged in this + phase** — continues to return `bits::next_pow2(size)`. (Phase 15 + changes the body to + `sizeclass_full_to_size(size_to_sizeclass_full(size))`.) +- `size_to_sizeclass_full(size)`: **semantics unchanged in this + phase** for both branches — still pow2-rounded for large. + *Implementation* of the large branch must change because the + old body assumed `from_large_class` was a leading-zero-count + mapping. Phase 13 redefines `from_large_class(lc)` to mean + `from_exp_mant(NUM_SMALL_SIZECLASSES + lc)`, + so the new body uses `bits::to_exp_mant` (the literal inverse of + `from_exp_mant`) directly: + ``` + size_t pow2 = bits::next_pow2(size); + size_t global = + bits::to_exp_mant(pow2); + return sizeclass_t::from_large_class(global - NUM_SMALL_SIZECLASSES); + ``` + (Pre-Phase-13 there was a separate helper + `large_size_to_chunk_sizeclass(size)` returning an `lc` index. + Post-Phase-13, small and large share a single global exp+mantissa + index space, so the `+NUM_SMALL_SIZECLASSES` / `-NUM_SMALL_SIZECLASSES` + round-trip via `lc` cancels out and the helper is removed. The + large branch of `size_to_sizeclass_full` inlines the + `to_exp_mant` call directly.) + (An earlier draft tried a manual + `(next_pow2_bits - MIN_ALLOC_STEP_BITS) << INTERMEDIATE_BITS` + formula. That is wrong — `to_exp_mant` does not simply place the + exponent at MANTISSA_BITS; it uses a `b` offset that makes + consecutive pow2 inputs differ by exactly `2^INTERMEDIATE_BITS`. + Always use `to_exp_mant`, which is the literal inverse of the + table-build helper.) + Phase 15 replaces `pow2` with `size` (no `next_pow2`); + `to_exp_mant` rounds non-pow2 sizes up to the next exp+mantissa + step. 
+- The non-pow2 large slots in `sizeclass_metadata` are populated + with correct size/slab_mask values in Phase 13 but are + unreachable from `size_to_sizeclass_full` / + `large_size_to_chunk_size` until Phase 15. This keeps Phase 13's + end-to-end behaviour identical to today's. +- `slab_index`, `start_of_object`, `is_start_of_object`: continue + to use `meta.slab_mask`. The metadata table builder sets + `slab_mask = info.align - 1` for large (where + `info.align = size & (~size + 1)`, the natural alignment from + `backend_arena_bins.h:741`). For pow2 sizes, `info.align == size`, + so `slab_mask = size - 1` — matching today's value. For + non-pow2 sizes (table-populated but unreachable in Phase 13), + `slab_mask = info.align - 1 < size - 1`. Phase 14 adds the + per-chunk offset that lets recovery work for non-pow2 once + Phase 15 lights them up. +- `round_size(size)` (lines 478-501): large branch left unchanged + in this phase — still rounds to next pow2. Phase 15 updates it + to match the new `large_size_to_chunk_size`. + +### `src/snmalloc/mem/metadata.h` + +- Line 45: change + `static constexpr address_t REMOTE_BACKEND_MARKER = 1 << 7;` to + `static constexpr address_t REMOTE_BACKEND_MARKER = + SIZECLASS_REP_SIZE;` + so the marker tracks the (now-untagged) sizeclass field width. + Adjust the comment to point at `sizeclasstable.h` for the + derivation. +- The existing static-asserts at `metadata.h:64-67` already + enforce the invariant; verify they still pass. +- **Add a public layout constant** on `MetaEntryBase` so backend + code can derive shift positions without violating the protected + access of `REMOTE_BACKEND_MARKER`. Insert into the `public:` + section (after line 113): + ```cpp + /** + * Bit position of the first bit available to backend metadata + * layouts above the reserved region. Used by + * `backend_arena_range.h` and `largebuddyrange.h` to derive + * RED_BIT_POS, VARIANT_SHIFT, and LARGE_SIZE_SHIFT. 
+ */ + static constexpr size_t BACKEND_LAYOUT_FIRST_FREE_BIT = + bits::next_pow2_bits_const(REMOTE_BACKEND_MARKER) + 1; + ``` + The `+1` reserves `REMOTE_BACKEND_MARKER`'s own bit (it lives at + `next_pow2_bits_const(REMOTE_BACKEND_MARKER)`). + +### `src/snmalloc/backend_helpers/backend_arena_range.h` and `src/snmalloc/backend_helpers/largebuddyrange.h` + +- Replace hard-coded `RED_BIT_POS = 8`, `VARIANT_SHIFT = 9`, + `LARGE_SIZE_SHIFT = 8` in `backend_arena_range.h` with + derivations from the new public + `MetaEntryBase::BACKEND_LAYOUT_FIRST_FREE_BIT`: + `RED_BIT_POS = MetaEntryBase::BACKEND_LAYOUT_FIRST_FREE_BIT;` + `LARGE_SIZE_SHIFT = MetaEntryBase::BACKEND_LAYOUT_FIRST_FREE_BIT;` + `VARIANT_SHIFT = MetaEntryBase::BACKEND_LAYOUT_FIRST_FREE_BIT + 1;` + (the `+1` reserves the RED bit). +- `backend_arena_range.h:64-66` `static_assert` continues to enforce + no clash with reserved bits. +- `largebuddyrange.h:40-46`: `BuddyChunkRep::RED_BIT = 1 << 8`. + Replace with `1 << MetaEntryBase::BACKEND_LAYOUT_FIRST_FREE_BIT`. + (The plan previously cited the wrong filename `buddy.h` — + corrected.) +- Grep `RED_BIT` and `1 << 8` / `<< 8` across `backend_helpers/` + to confirm no other site needs the same shift. + +### `src/snmalloc/mem/corealloc.h` + +- Line 1120-1121: replace + `size_t size = bits::one_at_bit(entry_sizeclass);` + with + `size_t size = sizeclass_full_to_size(entry.get_sizeclass());` + so the dealloc-large path reads the precise sizeclass-encoded + size instead of reconstructing it from a leading-zero count. This + is a no-op today (pow2 only); it makes Phase 15's behaviour change + land at a single accessor. +- Grep `corealloc.h` for other `one_at_bit(` calls that derive + large-allocation size from an `as_large()` value and convert all + of them. (Known candidate: `corealloc.h:1576` — verify scope.) 
+ +### `src/snmalloc/global/globalalloc.h` and other consumers + +- Grep for any other consumer of `as_large()` that interprets the + value as a leading-zero count. Convert each to + `sizeclass_full_to_size` or to the exp+mantissa accessor. +- Audit any code that uses `sizeclass_t::raw()` directly assuming + the tag-bit-set-means-small invariant. The uniform encoding + changes the meaning of `raw()`. +- Verified candidates from inspection: `globalalloc.h:145-220` + (`remaining_bytes`, `index_in_object`, `external_pointer`) — these + go through `start_of_object`/`slab_index`, so they pick up the + change automatically via `slab_mask`. + +## User-input size bounds (`MAX_LARGE_SIZECLASS_SIZE`) + +Before Phase 13, the largest representable large allocation was +`1 << (address_bits - 1)` (half the address space, derived from +`from_large_class` being a leading-zero-count mapping). The +pre-existing bound check used `size > (size_t(1) << 63)` as a sloppy +upper limit and let anything below through. With the exp+mantissa +encoding, the largest representable size is the exact value of the +top large class — sizes between that and `2^address_bits` no longer +map to any valid sizeclass and must be rejected at the API boundary. + +Define a derived constant alongside the encoding: + +- `ENCODED_ADDRESS_BITS = bits::min(DefaultPal::address_bits, bits::BITS - 1)`. + Caps the encoding range one bit below the native word width so that + `from_exp_mant(NUM_SMALL + NUM_LARGE - 1) = 1 << ENCODED_ADDRESS_BITS` + does not overflow `size_t` on 32-bit (`address_bits == BITS == 32`). + On x86_64 (`address_bits = 48`) this is unchanged. +- `NUM_LARGE_CLASSES = (ENCODED_ADDRESS_BITS - MAX_SMALL_SIZECLASS_BITS) + << INTERMEDIATE_BITS` (use `ENCODED_ADDRESS_BITS`, not `address_bits`). +- `MAX_LARGE_SIZECLASS_SIZE = from_exp_mant(NUM_SMALL_SIZECLASSES + NUM_LARGE_CLASSES - 1)`. 
+- Add `static_assert(MAX_LARGE_SIZECLASS_SIZE == bits::one_at_bit( + ENCODED_ADDRESS_BITS))` to pin the encoding invariant (a strict + nonzero check would not catch a wrong table-build mantissa offset). +- Add `static_assert(ENCODED_ADDRESS_BITS > MAX_SMALL_SIZECLASS_BITS)` + so `NUM_LARGE_CLASSES > 0` is structural, not coincidental. + +Fan out the new bound to every site that accepts a user-supplied size +and feeds it into the size→sizeclass lookup (these were either +unguarded or had the loose `> 2^63` check): + +- `src/snmalloc/mem/corealloc.h` `alloc_not_small`: replace the + `1 << 63` bound with `MAX_LARGE_SIZECLASS_SIZE`. +- `src/snmalloc/ds/sizeclasstable.h` `round_size`: same. +- `src/snmalloc/global/globalalloc.h` `check_size`: early-return via + `snmalloc_check_client` when `size > MAX_LARGE_SIZECLASS_SIZE`. +- `src/snmalloc/override/rust.cc` `rust_realloc`: gate the + equality-fast-path on both aligned sizes being + `<= MAX_LARGE_SIZECLASS_SIZE`. + +Add defensive `SNMALLOC_ASSERT`s in the large branch of +`size_to_sizeclass_full`: `size != 0`, `size <= +MAX_LARGE_SIZECLASS_SIZE`. These document the preconditions at the +function whose behaviour is constrained by them ("document coupling at +the point of breakage") and turn into noisy debug failures if a future +caller path skips the bound check. + +## Test gates + +1. **Build**: clean build of the default config passes. The + `static_assert(Entry::is_backend_allowed_value(...))` checks at + `backend_arena_range.h:64-66` catch any bit-layout mismatch. +2. **Full ctest suite**: all existing tests pass (no behaviour + regression — front-end still issues pow2 large requests, so + non-pow2 large sizeclasses exist in tables but are unreachable + from the API). +3. **BackendArena unit tests** (`test_backend_arena`) continue to + pass — they exercise the shifted RED/variant bits in the pagemap + encoding. +4. 
**Extend `src/test/func/sizeclass/sizeclass.cc`** with a + `uniform_large_sizeclasses` test case: + - For every large `sizeclass_t` index `lc ∈ [0, NUM_LARGE_CLASSES)`, + assert `sizeclass_full_to_size(from_large_class(lc))` is + strictly increasing in `lc`. + - For every pow2 size `S` in + `[MAX_SMALL_SIZECLASS_SIZE * 2, 2^(address_bits - 1)]`, assert + `sizeclass_full_to_size(size_to_sizeclass_full(S)) == S` + (round-trip identity on pow2 — still holds in Phase 13 + because `size_to_sizeclass_full` for large still rounds to + next pow2). + - For every non-pow2 size `X` strictly between adjacent pow2 + `(P, 2P)`, assert + `sizeclass_full_to_size(size_to_sizeclass_full(X)) == 2P` + (still pow2-rounded in Phase 13 — Phase 15 changes this). + - Sentinel sanity: `sizeclass_t{}.raw() == 0`; + `sizeclass_t{}.is_default()` is true; + `sizeclass_data_fast[0].size == 0`; + `sizeclass_data_fast[0].slab_mask == 0`; + `is_small(sizeclass_t{})` is false. + - Encoding sanity: `is_small(from_small_class(0))` is true; + `is_small(from_large_class(0))` is false; small range and + large range are disjoint and adjacent in the value space. +5. **Extend `src/test/func/release-rounding/rounding.cc`** to + exercise non-trivial pow2 large sizeclasses. Today this test + covers small only. Add cases that exercise + `start_of_object` / `is_start_of_object` for the pow2 large + sizeclasses materialised end-to-end in Phase 13. (Phase 14 + extends to the per-chunk offset; Phase 15 to non-pow2.) + (The plan previously cited `test/func/sizeclass/rounding.cc`, + which does not exist — corrected.) + +## Risks + +1. **SIZECLASS_BITS widening cascades.** Caught by the existing + `static_assert`s in `metadata.h:64-67` and + `backend_arena_range.h:64-66`. +2. **Some embedder set REMOTE_MIN_ALIGN tighter than chain allows.** + Would surface as a compile-error on the cacheline-vs-REP_SIZE + max. Address only if it actually fires. +3. 
**Stale `as_large()` callers.** Mitigation: grep + convert ALL + uses of `as_large()` in this phase. Phase 13 is not done until + the leading-zero-count semantics are retired. +4. **Stale `raw()` callers assuming tag-bit semantics.** The + uniform encoding changes `raw()`'s meaning (no tag bit, shifted + by 1). Grep all callers and convert each to the appropriate + accessor (`as_small`, `as_large`, `is_small`, or — if it's a + `ModArray` index — leave alone, relying on the size-0 padding + slot at index 0 to make the no-subtract lookup return the right + sentinel values). +5. **Adding the size-0 padding slot at index 0.** The padding slot + in `sizeclass_metadata` must have `size = 0` and `slab_mask = 0` + (and any other fields zero-initialised) so that + `sizeclass_full_to_size(sizeclass_t{}) == 0` and any accidental + slab-mask arithmetic on the sentinel returns 0 / a no-op. Verify + by reading every field in `sizeclass_data_fast` / + `sizeclass_data_slow`. Add a static-assert that index 0 has + `size == 0` after table init. + +## Out of scope + +- Per-chunk pagemap offset (Phase 14). +- Non-pow2 reservations (Phase 15). +- Changes to the small sizeclass encoding (other than dropping + the tag bit). +- `round_size(size)` for large: still pow2 here; Phase 15 fixes. + +# Phase 14: Per-chunk pagemap offset (slab-granular) + +## Goal + +Add a per-chunk "slab offset within allocation" field to +`FrontendMetaEntry`, written by a new `set_metaentry_large` path, +and use it in `start_of_object` / `is_start_of_object` so that the +start of a large allocation can be recovered from any address +within it — independent of the allocation's alignment. This unlocks +Phase 15. + +After Phase 14, the start-finding code uses the per-chunk offset +for large allocations and continues to use `slab_mask` for small. 
+With the front-end still issuing pow2 large requests (Phase 15 +changes that), every materialised large allocation has +`info.align == size` so `slab_mask = size - 1` covers the whole +allocation with offset always 0 — exactly today's behaviour. + +## Why now + +- Phase 15 introduces non-pow2 reservations. The existing + `addr & ~slab_mask` answer is wrong for non-pow2 sizes/alignments. +- Per-chunk offset is the mechanism PLAN.md (lines 65-71) already + identified. Phase 14 implements that mechanism with offset = 0 + semantics matching the existing pow2 path — so it can land + without changing observable behaviour. + +## Design + +### Slab granularity, not chunk granularity + +The offset records "which slab within the allocation does this +chunk belong to", in units of the per-sizeclass `slab_size`. The +recovery formula (matching PLAN.md lines 65-71) is: + +``` +start = (addr & ~slab_mask) - offset * slab_size +``` + +where `slab_size = info.align` (the natural alignment from +`backend_arena_bins.h:741`, `info.align = size & (~size + 1)`, +i.e. the lowest set bit of `size`), and `slab_mask = slab_size - 1`. +Both are per-sizeclass, stored in `sizeclass_data_fast` as +`slab_mask` (already there, value changes per Phase 13). + +**Offset width.** With `INTERMEDIATE_BITS = 2`, a large sizeclass +of size `S = (4+M) * 2^(E-2)` for `M ∈ {0,1,2,3}` has: + +| M | size factor | info.align (lowest set bit) | slabs (size/align) | +|---|-------------|------------------------------|---------------------| +| 0 | 4 | 2^E (= size) | 1 | +| 1 | 5 | 2^(E-2) | 5 | +| 2 | 6 | 2^(E-1) | 3 | +| 3 | 7 | 2^(E-2) | 7 | + +Worst case `2^(B+1) - 1` slabs (`B = INTERMEDIATE_BITS`): for `B = 2`, +the table above shows 7 slabs (the `M = 3` row). Generalising: `OFFSET_BITS = B + 1` +gives the needed `2^(B+1)` distinct values. A `static_assert` in +`metadata.h` guards the bound: +`static_assert((1 << OFFSET_BITS) > max_slabs_in_largest_class)`. 
+ +(With natural alignment, the allocation incurs no address-space +waste beyond what alignment already implies. With computed +`OFFSET_BITS = INTERMEDIATE_BITS + 1`, we accept the extra +`meta`-word bit consumption to keep allocations at natural +alignment.) + +### Offset is a frontend concept (layering) + +Per user clarification: the offset is owned by the frontend (used +to recover start-of-object from an interior pointer); the boundary +bit is owned by the backend (used to mark PAL-allocation boundaries +for the buddy allocator). + +Both bits happen to live in the `meta` word of the pagemap entry, +but they are conceptually disjoint: + +- Offset accessors live on `FrontendMetaEntry`, not on + `MetaEntryBase`. The boundary bit machinery + (`MetaEntryBase::set_boundary`, `clear_boundary_bit`, `is_boundary`) + must not clobber offset bits — and currently doesn't, because + it only `|=` / `&= ~` the single boundary bit at position 0. +- The frontend's `set_metaentry_large` packs offset into `meta` + and must preserve the boundary bit. **Key observation**: + `MetaEntryBase::operator=` at `metadata.h:162-169` *already* + preserves the target's boundary bit on assignment. So writing a + freshly-constructed `Entry t_i(meta, ras)` (with offset and + boundary both zero), calling `set_offset(slab_index)` on it (RMW + that touches only OFFSET bits — boundary on `t_i` is still 0), + then `concretePagemap.set(addr, t_i)` (which assigns via + `operator=`) leaves the pagemap entry's pre-existing boundary + bit intact. No manual boundary-preservation logic is needed. +- `FrontendMetaEntry::get_slab_metadata()` (currently at + `metadata.h:739-740` masks `meta & ~META_BOUNDARY_BIT`) must + also mask the offset bits. The simplest way: extend the existing + mask constant. Define `META_FRONTEND_RESERVED_MASK = + META_BOUNDARY_BIT | (((1 << OFFSET_BITS) - 1) << OFFSET_SHIFT)` + and mask with that everywhere `get_slab_metadata` needs the + pointer. 
+ +### Where the offset lives in the `meta` word + +Bits `1..OFFSET_BITS` of `meta` (with `OFFSET_SHIFT = 1`): + +- Bit 0: `META_BOUNDARY_BIT` (backend-owned). +- Bits `1..OFFSET_BITS`: offset (frontend-owned, large-only). +- Bits `(1 + OFFSET_BITS)..`: `SlabMetadata*` payload (natural + pointer alignment). + +This requires `alignof(SlabMetadata) >= (1 << (1 + OFFSET_BITS))`. +For default `INTERMEDIATE_BITS=2`, `OFFSET_BITS=3`, the requirement +is `alignof(SlabMetadata) >= 16`. Inspect `SlabMetadata` at the top +of Phase 14; if alignment is insufficient, add +`alignas(1 << (1 + OFFSET_BITS))` (= `alignas(16)` for default) to +`SlabMetadata`. Cost: a few bytes of padding per slab metadata +record — negligible. + +### Accessors + +Add to `FrontendMetaEntry`: + +- `static constexpr size_t OFFSET_BITS = INTERMEDIATE_BITS + 1;` + (derives from `INTERMEDIATE_BITS` because the worst-case slab + count for a non-pow2 large class with M mantissa bits is + `2^M + (2^M - 1) = 2^(M+1) - 1`. For default `INTERMEDIATE_BITS=2` + this gives `OFFSET_BITS = 3` (max offset 7, matching a worst case + of 7 slabs). For `INTERMEDIATE_BITS=3` (config option) it gives + `OFFSET_BITS = 4` (max offset 15, matching a worst case of 15 + slabs).) +- `static constexpr size_t OFFSET_SHIFT = 1;` (immediately above + the boundary bit) +- `static constexpr address_t OFFSET_MASK = + ((1 << OFFSET_BITS) - 1) << OFFSET_SHIFT;` +- `void set_offset(size_t slab_offset)`: read-modify-write of + `meta`, preserving boundary bit and `SlabMetadata*` payload. + Asserts `slab_offset < (1 << OFFSET_BITS)`. +- `size_t get_offset() const`: reads `(meta & OFFSET_MASK) >> + OFFSET_SHIFT`. + +Update the existing pointer mask: define +`META_FRONTEND_RESERVED_MASK = META_BOUNDARY_BIT | OFFSET_MASK`, +update `get_slab_metadata()` to mask `meta & ~META_FRONTEND_RESERVED_MASK`. + +A `static_assert(alignof(SlabMetadata) >= (1 << (OFFSET_BITS + +OFFSET_SHIFT)))` enforces the pointer-alignment requirement at +compile time. 
For the default config this requires +`alignof(SlabMetadata) >= 16`. Verify the current value and add +`alignas(16)` (or computed `alignas(1 << (OFFSET_BITS+OFFSET_SHIFT))`) +to `FrontendSlabMetadata` if needed. + +### `Pagemap::set_metaentry` (split into small vs large) + +The existing `set_metaentry` (writes uniform entries per chunk in +a range) is a static member of `BasicPagemap` in +`backend_helpers/pagemap.h:56-66`, which uses +`concretePagemap.set(...)` to reach the underlying `FlatPagemap`. +The new `set_metaentry_large` is added as a static member alongside +it. + +`FrontendMetaEntry` deletes its copy constructor (`metadata.h:754`), +so we cannot use `Entry t_i = t;` and modify per chunk. Instead, +reconstruct each per-chunk entry from its components: + +```cpp +// In BasicPagemap, alongside set_metaentry: +static void set_metaentry_large( + address_t p, + size_t size, + size_t slab_size, + SlabMetadata* meta, + uintptr_t remote_and_sizeclass) +{ + // slab_size = info.align of this sizeclass. + // size = total allocation size (== sizeclass-encoded size). + for (size_t chunk_offset = 0; chunk_offset < size; + chunk_offset += MIN_CHUNK_SIZE) + { + size_t slab_index = chunk_offset / slab_size; + Entry t_i(meta, remote_and_sizeclass); // meta low bits = 0 + t_i.set_offset(slab_index); // RMW; touches only OFFSET bits + concretePagemap.set(p + chunk_offset, t_i); + } +} +``` + +**Boundary-bit preservation**: `MetaEntryBase::operator=` at +`metadata.h:162-169` already preserves the *target's* boundary bit +when copy-assigning from `other`. `FlatPagemap::set` uses `=` to +write entries. Therefore: the freshly-constructed `t_i` carries +`boundary = 0`, but when it is assigned into the pagemap slot, the +slot's pre-existing boundary bit (set earlier by the backend's +`register_range`) is preserved by `operator=`. No manual +boundary-preservation logic is needed in this loop. + +**Backend call site** (`backend.h:131-132`): dispatch on +`sizeclass.is_small()`. 
Small path keeps existing +`Pagemap::set_metaentry(p, size, t)`. Large path: +`Pagemap::set_metaentry_large(p, size, + sizeclass_data_fast(sc).slab_mask + 1, + meta, ras);` +where `meta` and `ras` are the `SlabMetadata*` and +`remote_and_sizeclass` values currently passed to the `Entry t(meta, +ras)` construction at `backend.h:131`. + +### `start_of_object` / `is_start_of_object` + +The current `start_of_object` lives in `sizeclasstable.h` with no +Pagemap access. After Phase 14, the large case needs the per-chunk +offset — which lives in the pagemap. + +Split the function: keep `sizeclasstable.h`'s `start_of_object` as +the small-case implementation (rename internally to +`start_of_object_small` if helpful), and add a Config-aware wrapper +in `globalalloc.h` (or `mem/start_of_object.h`): + +```cpp +template<typename Config> +inline address_t start_of_object(address_t addr) { + // Use the existing public BackendAllocator accessor (see + // backend.h:197) instead of reaching for `Config::Backend::Pagemap` + // directly — `Pagemap` is a template parameter of `BackendAllocator`, + // not a publicly exposed nested type. The public + // `get_metaentry(addr)` static + // wraps the Pagemap access. + auto& entry = Config::Backend::template get_metaentry(addr); + auto sc = entry.get_sizeclass(); + if (sc.is_small()) { + auto info = sizeclass_data_fast(sc); + return start_of_object_small(info, addr); + } + // Large: PLAN.md (65-71) recovery. 
+ auto info = sizeclass_data_fast(sc); + size_t slab_size = info.slab_mask + 1; + return (addr & ~info.slab_mask) - entry.get_offset() * slab_size; +} +``` + +### Consumers that MUST be rewritten in Phase 14 + +Phase 14 is incomplete until every caller of the +sizeclass-table-only `start_of_object` / `is_start_of_object` / +`remaining_bytes` on a potentially-large pointer is moved to the +Config-aware wrapper: + +- `globalalloc.h:137-144` (`remaining_bytes`): currently calls + `snmalloc::remaining_bytes(sizeclass, p)` which has no pagemap + offset access. Replace with the Config-aware path that consults + the pagemap entry for offset and computes + `start + sizeclass_full_to_size(sc) - addr`. +- `globalalloc.h:145-220` (`index_in_object`, `external_pointer`): + similarly rewrite to consult the pagemap. +- `corealloc.h` deallocation-sanity checks: `is_start_of_object` + is used in dealloc paths to assert the caller is passing a + valid base pointer. Grep `is_start_of_object` and `start_of_object` + across `corealloc.h` (verified candidates at + `corealloc.h:534-537` and `corealloc.h:1080-1083` per + rubber-duck review). Each call site that may receive a large + allocation's pointer must use the Config-aware variant. Without + this update, after Phase 15 a `dealloc` of a non-pow2 large + allocation could miss the start-of-object check entirely (every + natural-alignment slab boundary inside the allocation would + satisfy the old `slab_mask`-only check). +- `bounds_checks.h` memcpy gate (line 99-103): calls + `remaining_bytes(...)`. Moves to the Config-aware version + transitively via the `globalalloc.h::remaining_bytes` rewrite. + +`is_start_of_object` analogue: for small, today's formula; for +large, `(addr & info.slab_mask) == 0 && entry.get_offset() == 0`. + +`slab_index` for large: irrelevant — large allocations are a single +"object" of size `sizeclass_full_to_size(sc)`, not a slab of +multiple. 
Existing callers gated by `sc.is_small()` already avoid +calling `slab_index` for large. + +### Backend changes + +- `backend.h:131-132`: at the `set_metaentry` call site after a + large `alloc_chunk`, dispatch on small vs large as above. Phase + 14 keeps `alloc_chunk`'s `bits::is_pow2(size)` assertion (Phase + 15 relaxes it). This is fine: today only pow2 large allocations + reach this site, so `slab_size == size` and offset is always 0. +- `backend.h:169` (dealloc): writes backend-claim entries via the + backend Rep's word setters; those don't touch frontend bits. + No change. + +## Test gates + +1. **Build**: clean build passes. +2. **Full ctest suite**: all existing tests pass. Front-end still + issues pow2 large requests, so for every materialised large + allocation `info.align == size` and offset is always 0 — the + new `set_metaentry_large` path produces the same `get_slab_metadata()` + answer as before. Existing `start_of_object` answers (via + `slab_mask`) match the new offset-based answers for pow2-aligned + allocations. +3. **`src/test/func/release-rounding/rounding.cc`** continues to + pass — small path unchanged; large path uses offset = 0 always. +4. **Extend `src/test/func/memory/memory.cc`** with a + `large_alloc_pointer_recovery` test (public-API path): + - Allocate several large sizes via the public API. For each + allocation `p` of requested size `S_req`, the actual reservation + in Phase 14 is `S_res = bits::next_pow2(S_req)` (front-end is + still pow2-only). For each: + - For every chunk offset `k * MIN_CHUNK_SIZE` for + `k = 0..S_res/MIN_CHUNK_SIZE - 1`, assert + `Pagemap::get_metaentry(p + k * MIN_CHUNK_SIZE).get_offset() + == 0` (since the reservation is pow2 and `slab_size == + reservation_size` for pow2 large classes, all chunks live in + the single slab and have offset 0). + - For every interior address `q = p + j` with `j ∈ {0, 1, + S_res/2, S_res-1}`, assert `start_of_object(q) == p`. +5. 
**New test or extension** to exercise the non-zero offset write + path directly (Phase 14 is otherwise un-tested with non-zero + offsets, because the front-end is still pow2-only). Two options: + - (a) Add an internal-API test in + `src/test/func/large_offset/large_offset.cc` (or extend + `memory.cc`) that calls `BasicPagemap::set_metaentry_large` + directly on a freshly-allocated chunk-multiple range with a + synthetic non-pow2 sizeclass (one already populated in the + table in Phase 13). Then verify: + - `get_metaentry(p + k * MIN_CHUNK_SIZE).get_offset() == k * + MIN_CHUNK_SIZE / slab_size` for each chunk. + - `start_of_object(p + interior_addr) == p` for a + sample of interior addresses across all slabs. + - (b) Defer non-zero offset coverage to Phase 15 explicitly and + accept that Phase 14's gate is "no regressions on + pow2-allocation paths". + The plan picks (a) — Phase 14 must be independently testable. +6. **Boundary-bit-preservation test**: in an existing test that + exercises PAL-allocation boundaries (or a new minimal one), set + the boundary bit on a chunk via the backend path, then call + `set_offset(3)` on the frontend side, then read both — both + round-trip without clobbering each other. + +## Risks + +1. **`alignof(SlabMetadata)` insufficient.** Required alignment is + `1 << (1 + OFFSET_BITS)` — 16 bytes for default config. If + inspection shows alignment is smaller (likely 8 today), add + `alignas(1 << (1 + OFFSET_BITS))`. Caught at compile time by the + new `static_assert`. +2. **`get_slab_metadata` mask update missed somewhere.** Grep for + `META_BOUNDARY_BIT` and `meta &` to find every site that + masks the meta word for a pointer. Convert each to the new + `META_FRONTEND_RESERVED_MASK`. +3. **Offset-bit positions overlap with backend bits when the entry + is backend-claimed.** Not a real risk: when the backend writes + its claim, the entry's `meta` is owned by the backend Rep + (different layout). 
Frontend reads `get_offset()` only on + frontend-claimed entries. +4. **Boundary bit not preserved during `set_offset`.** Mitigation: + implement `set_offset` as RMW preserving all bits except the + offset field. Test case: set boundary, set offset, read offset, + read boundary — both round-trip. + +## Out of scope + +- Front-end requesting non-pow2 large sizes (Phase 15). +- Per-chunk offset for small allocations (small uses slab_mask + recovery, no per-chunk offset needed). +- Multi-byte offset (`OFFSET_BITS = INTERMEDIATE_BITS + 1` bits, + fits cleanly in `meta` low bits). + +# Phase 15: Front-end requests non-pow2 large allocations + +## Goal + +Flip the front-end so that large allocations request exactly the +sizeclass-encoded size (chunk-multiple, exp+mantissa-rounded), +instead of always the next power of two. This is the long-running +goal of the refactor: the backend (`BackendArenaRange`) has +supported arbitrary chunk-multiple sizes since Phase 10–12, the +sizeclass encoding has supported non-pow2 large since Phase 13, +and the per-chunk offset machinery has supported pointer recovery +since Phase 14. + +## Changes + +### `src/snmalloc/ds/sizeclasstable.h` + +- `large_size_to_chunk_size(size)`: replace + `bits::next_pow2(size)` with the rounded sizeclass-derived size: + `sizeclass_full_to_size(size_to_sizeclass_full(size))`. Now + rounds to exp+mantissa boundaries (matching Phase 13 encoding). +- `round_size(size)` for large (lines 478-501): currently returns + `bits::next_pow2(size)`. Update to match `large_size_to_chunk_size`: + `return sizeclass_full_to_size(size_to_sizeclass_full(size));` + This is critical because `DefaultConts::success` in + `corealloc.h:34-47` uses `round_size` to determine the zeroing + range for `calloc`. Without this update, `calloc` would zero + beyond the actual reservation. The two functions converge to + the same value now that the front-end's chunk-size request + matches the round-size. 
+- Update the comments on both functions to describe the new + rounding behaviour (no "next pow2"; "exp+mantissa rounded"). + +### `src/snmalloc/backend/backend.h` + +- `alloc_chunk` (line 89-95): + `SNMALLOC_ASSERT(bits::is_pow2(size))` → relaxed to + `SNMALLOC_ASSERT((size & (MIN_CHUNK_SIZE - 1)) == 0)`. + (Phase 14 deliberately kept the pow2 assertion; Phase 15 is + where it is relaxed.) +- `meta_size = bits::next_pow2(sizeof(SlabMetadata) + extra_bytes);` + unchanged — that's metadata-array size, not allocation size. + +### `src/snmalloc/mem/corealloc.h` + +- Verify line 1576 (and any other `next_pow2(round_sizeof)` site) + — read context and update to match the new rounding scheme if + it's on the large-allocation path. +- The dealloc-large path was already migrated in Phase 13 to + `sizeclass_full_to_size(entry.get_sizeclass())` — no further + change needed. +- The front-end large-alloc path (corealloc.h:703-727) uses + `large_size_to_chunk_size` — automatically picks up the new + behaviour. + +### `src/snmalloc/mem/smallbuddyrange.h:232` + +- `auto rsize = bits::next_pow2(size);` inside + `alloc_range_with_leftover` is used only by the meta-data range + (and arguably the small object path). Read context to determine + scope. Likely no change in Phase 15; Phase 15 only touches large + object allocations. If a change is required, include it here; + if not, document the decision. + +## Test gates + +1. **Build**: clean build passes. +2. **Full ctest suite**: all existing tests pass. Existing tests + that exercise large allocations now allocate chunk-multiples, + not pow2 sizes. Reservation footprint shrinks; functional + results are unchanged. +3. **Extend `src/test/func/memcpy/func-memcpy.cc`** with a + non-pow2 large case: + - For sizes `S` strictly between adjacent pow2 (e.g. `S = + 1.5 * MAX_SMALL_SIZECLASS_SIZE`), call `malloc(S)`. Verify: + - `memcpy(p + sizeclass_full_to_size(sc) - 1, src, 1)` succeeds. 
+ - `memcpy(p + sizeclass_full_to_size(sc), src, 1)` traps (in + the bounds-checking variant). + - **Prerequisite**: Phase 14 must have already replaced + `globalalloc.h::remaining_bytes` with the Config-aware + pagemap-offset path. Without that prerequisite, this test + does not exercise the offset path. (Verify by inspection: + confirm the new `remaining_bytes` consults + `entry.get_offset()`, not just `start_of_object_small`.) +4. **Extend existing `src/test/func/memory/memory.cc`** with a + non-pow2 pointer-recovery case (mirroring the Phase 14 test but + on front-end-issued non-pow2 allocations): + - For sizes `S` strictly between adjacent pow2 in the large + range, call `malloc(S)`, save `p`. Compute + `S_rounded = sizeclass_full_to_size(size_to_sizeclass_full(S))`. + - For every interior address `q = p + j` with `j ∈ {0, 1, + MIN_CHUNK_SIZE, S_rounded / 2, S_rounded - 1}`, assert + `external_pointer(q) == p`. + - Assert `is_start_of_object(p)` is true; `is_start_of_object(p + + 1)` is false; `is_start_of_object(p + MIN_CHUNK_SIZE)` is + false (every interior chunk has offset != 0). + - Assert reservation footprint matches `S_rounded / + MIN_CHUNK_SIZE` chunks (NOT `next_pow2(S) / MIN_CHUNK_SIZE`). +5. **Extend `src/test/func/release-rounding/rounding.cc`** to cover + non-pow2 large sizeclasses now that they're materialised + end-to-end. +6. **Existing memory-stress tests** (e.g. `external_pointer.cc`) + continue to pass. + +## Risks + +1. **Existing tests assume pow2 reservation footprint.** Grep tests + for `next_pow2`, `pow2`, and any size-arithmetic over allocations + returned from `malloc`. Likely small handful; convert each to + `sizeclass_full_to_size` or to a less assumption-laden check. +2. **`calloc` zeroing range.** Mitigated by updating `round_size` + for large (item above). Verify by inspecting + `corealloc.h:34-47` (`DefaultConts::success`) — it should now + zero exactly the reservation size. +3.
**`SlabMetadata` reuse boundary.** The current + `slab_metadata == &slab_metadata` assertion in `dealloc_chunk` + relies on every chunk in the allocation pointing to the same + `SlabMetadata`. Phase 14's per-chunk-offset path keeps the + `meta` field's pointer bits unchanged across chunks (only + offset differs), so the assertion continues to hold after the + Phase 14 mask update. Verify by re-reading the assertion site. +4. **`remaining_bytes` overflow for very large allocations.** + After Phase 15, the rounded size can be just below the next + pow2, which is still bounded by `2^MAX_address_bits` — no + arithmetic overflow. Verify with a max-size allocation test. +5. **Performance.** Front-end alloc path: `next_pow2` is replaced + by an exp+mantissa table lookup. Dealloc-large already moved + to a table lookup in Phase 13. Net neutral. + +## Out of scope + +- Reducing `INTERMEDIATE_BITS` to gain bits in the sizeclass tag + (Phase 13 already chose 2 = the existing value). +- Generalising small allocations (already exp+mantissa). +- Any change to `alloc_range` / `dealloc_range` of arbitrary + byte-multiples — front-end always rounds via the sizeclass + encoding. + +# Review plan for Phases 13–15 + +Per claude.md "Mandatory review checkpoints": + +1. After this plan is written (now), run the rubber-duck review + pass on Phases 13–15 — read the plan + existing + `sizeclasstable.h`, `metadata.h`, `corealloc.h`, + `backend.h`, and confirm: + - Assumptions about bit availability in `FrontendMetaEntry` + (especially `alignof(SlabMetadata)`) are correct. + - No phase has a hidden cross-phase dependency that breaks the + "each phase ends with passing tests" invariant. + - The SIZECLASS_BITS widening doesn't break MetaEntry encoding. + - Tests proposed for each phase actually gate the right + invariants. +2. Address findings; present revised plan for explicit user + approval before any code changes. +3. 
After implementation of each phase, run the build/test subagent + per `.github/skills/building_and_testing.md`. +4. After all three phases land, pre-PR review (mandatory checkpoint). diff --git a/src/snmalloc/backend_helpers/backend_arena_bins.h b/src/snmalloc/backend_helpers/backend_arena_bins.h index 88445e372..26abf3665 100644 --- a/src/snmalloc/backend_helpers/backend_arena_bins.h +++ b/src/snmalloc/backend_helpers/backend_arena_bins.h @@ -272,8 +272,7 @@ namespace snmalloc const carve_info_t& info = carve_info_for_request(n); - size_t req_base = - (block.base + (info.align - 1)) & ~(info.align - 1); + size_t req_base = (block.base + (info.align - 1)) & ~(info.align - 1); size_t pre_size = req_base - block.base; // Servability precondition: `info.size >= n` bytes fit after @@ -310,7 +309,9 @@ namespace snmalloc */ class Bitmap { - friend struct BackendArenaBinsTestAccess; + friend struct BackendArenaBinsTestAccess< + INTERMEDIATE_BITS, + MIN_SIZE_BITS>; public: /// Strict upper bound on bin ids `bin_index` produces. Exposed @@ -717,8 +718,8 @@ namespace snmalloc constexpr size_t MAX_E = bits::BITS - MIN_SIZE_BITS; for (size_t e = 0; e < MAX_E; e++) { - exp_first_sc[e] = - bits::to_exp_mant_const(size_t(1) << (e + MIN_SIZE_BITS)); + exp_first_sc[e] = bits::to_exp_mant_const( + size_t(1) << (e + MIN_SIZE_BITS)); exp_bin_base[e] = e * BINS_PER_EXP; } exp_first_sc[MAX_E] = MAX_SC; diff --git a/src/snmalloc/backend_helpers/backend_arena_range.h b/src/snmalloc/backend_helpers/backend_arena_range.h index 8c50b6c2a..ff3e8dda4 100644 --- a/src/snmalloc/backend_helpers/backend_arena_range.h +++ b/src/snmalloc/backend_helpers/backend_arena_range.h @@ -15,12 +15,15 @@ namespace snmalloc * Unit 1 (addr + UNIT_SIZE): range-tree node (size ≥ 2 units). * Unit 2 (addr + 2*UNIT_SIZE): large chunk count (size ≥ 3 units). * - * Bit-layout decisions for tree nodes are private to this class: - * - Bits 0–7 of each pagemap word are reserved by the pagemap. 
- * - Bit 8 is the red bit (both trees). - * - Bits 9–10 of Word::One at unit 0 hold the variant tag. - * - Large chunk count is stored shifted left by 8 in Word::One of - * unit 2. + * Bit-layout decisions for tree nodes are private to this class. The + * pagemap reserves the low bits of each word for the meta-entry (see + * `MetaEntryBase::BACKEND_LAYOUT_FIRST_FREE_BIT`); the red bit, variant + * tag, and shifted large-chunk count all live at or above that bit: + * - Red bit (both trees) at `BACKEND_LAYOUT_FIRST_FREE_BIT`. + * - Variant tag (Word::One at unit 0) occupies 2 bits starting at + * `BACKEND_LAYOUT_FIRST_FREE_BIT + 1`. + * - Large chunk count is stored in Word::One of unit 2 left-shifted by + * `BACKEND_LAYOUT_FIRST_FREE_BIT`. * * `MIN_SIZE_BITS` is the log2 size of the allocation unit (= pagemap * stride); the caller passes whatever unit it uses (snmalloc's global @@ -39,15 +42,21 @@ namespace snmalloc static constexpr uintptr_t UNIT_SIZE = uintptr_t(1) << MIN_SIZE_BITS; - // Bit positions inside a pagemap word. Bits 0–7 are reserved by the - // pagemap; tree-node and large-size encodings start at bit 8. - static constexpr unsigned RED_BIT_POS = 8; - static constexpr unsigned VARIANT_SHIFT = 9; + // Bit positions inside a pagemap word. Bits in the reserved region + // (sizeclass + REMOTE_BACKEND_MARKER) are owned by the meta-entry + // layout; tree-node and large-size encodings start at the first free + // bit above that reserved range — see + // `MetaEntryBase::BACKEND_LAYOUT_FIRST_FREE_BIT` in `mem/metadata.h`. + static constexpr unsigned RED_BIT_POS = + MetaEntryBase::BACKEND_LAYOUT_FIRST_FREE_BIT; + static constexpr unsigned VARIANT_SHIFT = + MetaEntryBase::BACKEND_LAYOUT_FIRST_FREE_BIT + 1; static constexpr unsigned VARIANT_BITS = 2; // Shift used to encode the large-size chunk count in Word::One of // unit 2. 
- static constexpr size_t LARGE_SIZE_SHIFT = 8; + static constexpr size_t LARGE_SIZE_SHIFT = + MetaEntryBase::BACKEND_LAYOUT_FIRST_FREE_BIT; static constexpr uintptr_t RED_BIT = uintptr_t(1) << RED_BIT_POS; static constexpr uintptr_t VARIANT_MASK = @@ -358,8 +367,7 @@ namespace snmalloc } } - auto [ov_addr, ov_size] = - arena.add_block(base.unsafe_uintptr(), size); + auto [ov_addr, ov_size] = arena.add_block(base.unsafe_uintptr(), size); if (ov_addr != 0) parent_dealloc(ov_addr, ov_size); } diff --git a/src/snmalloc/backend_helpers/largebuddyrange.h b/src/snmalloc/backend_helpers/largebuddyrange.h index 15324753f..bf217bc06 100644 --- a/src/snmalloc/backend_helpers/largebuddyrange.h +++ b/src/snmalloc/backend_helpers/largebuddyrange.h @@ -37,7 +37,8 @@ namespace snmalloc * a bit that is a valid part of the address of a chunk. * @{ */ - static constexpr address_t RED_BIT = 1 << 8; + static constexpr address_t RED_BIT = address_t(1) + << MetaEntryBase::BACKEND_LAYOUT_FIRST_FREE_BIT; static_assert(RED_BIT < MIN_CHUNK_SIZE); static_assert(MetaEntryBase::is_backend_allowed_value( diff --git a/src/snmalloc/ds/sizeclasstable.h b/src/snmalloc/ds/sizeclasstable.h index 5db3cb5fa..c42e4643b 100644 --- a/src/snmalloc/ds/sizeclasstable.h +++ b/src/snmalloc/ds/sizeclasstable.h @@ -17,31 +17,72 @@ namespace snmalloc { using chunksizeclass_t = size_t; - // Large classes range from [MAX_SMALL_SIZECLASS_SIZE, ADDRESS_SPACE). + // Cap the address bits the encoding tries to represent so that + // `MAX_LARGE_SIZECLASS_SIZE` (= 2 ^ ENCODED_ADDRESS_BITS) always fits in + // `size_t`. On 64-bit platforms `DefaultPal::address_bits` is already 48, + // but on 32-bit platforms it equals `bits::BITS` and would otherwise + // overflow the encoded maximum to 0. + constexpr size_t ENCODED_ADDRESS_BITS = + bits::min(DefaultPal::address_bits, bits::BITS - 1); + + // Number of large sizeclasses. 
Large classes follow on directly from small + // classes in the global exp+mantissa scheme used by + // `bits::from_exp_mant`. The total + // span of representable sizes is from MIN_ALLOC_SIZE up to and including + // 2^ENCODED_ADDRESS_BITS, so the count of large entries beyond the small + // range is (ENCODED_ADDRESS_BITS - MAX_SMALL_SIZECLASS_BITS) mantissa + // cycles, each with 2^INTERMEDIATE_BITS entries. constexpr size_t NUM_LARGE_CLASSES = - DefaultPal::address_bits - MAX_SMALL_SIZECLASS_BITS; + (ENCODED_ADDRESS_BITS - MAX_SMALL_SIZECLASS_BITS) << INTERMEDIATE_BITS; + + // Bits required to encode any sizeclass value. Slot 0 is reserved as the + // unmapped/default sentinel, so the count includes a leading +1. + constexpr size_t SIZECLASS_BITS = + bits::next_pow2_bits_const(1 + NUM_SMALL_SIZECLASSES + NUM_LARGE_CLASSES); + + // Size of the sizeclass-keyed lookup tables and the alignment that the + // REMOTE_BACKEND_MARKER constraint requires of RemoteAllocator. There is no + // separate tag bit: all valid sizeclass raw values are in + // [0, 1 + NUM_SMALL_SIZECLASSES + NUM_LARGE_CLASSES) and live in the low + // SIZECLASS_BITS bits of a pagemap word. + constexpr size_t SIZECLASS_REP_SIZE = bits::one_at_bit(SIZECLASS_BITS); + + // Largest allocation size representable by the uniform sizeclass encoding. + // Equals `from_exp_mant(NUM_SMALL_SIZECLASSES + NUM_LARGE_CLASSES - 1)`, + // which for the default config is `2 ^ ENCODED_ADDRESS_BITS`. Requests + // strictly larger than this cannot be encoded and must be failed before + // any call to `size_to_sizeclass_full`. + constexpr size_t MAX_LARGE_SIZECLASS_SIZE = + bits::from_exp_mant( + NUM_SMALL_SIZECLASSES + NUM_LARGE_CLASSES - 1); - // How many bits are required to represent either a large or a small - // sizeclass. 
- constexpr size_t TAG_SIZECLASS_BITS = bits::max( - bits::next_pow2_bits_const(NUM_SMALL_SIZECLASSES), - bits::next_pow2_bits_const(NUM_LARGE_CLASSES + 1)); - - // Number of bits required to represent a tagged sizeclass that can be - // either small or large. - constexpr size_t SIZECLASS_REP_SIZE = - bits::one_at_bit(TAG_SIZECLASS_BITS + 1); + static_assert( + MAX_LARGE_SIZECLASS_SIZE == bits::one_at_bit(ENCODED_ADDRESS_BITS), + "MAX_LARGE_SIZECLASS_SIZE must equal 2 ^ ENCODED_ADDRESS_BITS; if this " + "fails, the exp+mantissa math no longer matches NUM_LARGE_CLASSES."); + static_assert( + ENCODED_ADDRESS_BITS > MAX_SMALL_SIZECLASS_BITS, + "ENCODED_ADDRESS_BITS must exceed MAX_SMALL_SIZECLASS_BITS so the large " + "range is non-empty."); /** - * Encapsulates a tagged union of large and small sizeclasses. + * Represents a sizeclass identifier shared by small and large allocations + * using a single uniform encoding: * - * Used in various lookup tables to make efficient code that handles - * all objects allocated by snmalloc. + * value == 0 : unmapped / default sentinel + * value ∈ [1, 1 + NUM_SMALL_SIZECLASSES) : small sizeclass sc = value - 1 + * value ∈ [1 + NUM_SMALL_SIZECLASSES, + * 1 + NUM_SMALL_SIZECLASSES + NUM_LARGE_CLASSES) + * : large class lc = + * value - 1 - + * NUM_SMALL_SIZECLASSES + * + * Used directly as an index into `sizeclass_metadata`. Slot 0 of that table + * is zero-padded so the sentinel can flow through the fast-path table + * lookups without a subtract on the hot path. */ class sizeclass_t { - static constexpr size_t TAG = bits::one_at_bit(TAG_SIZECLASS_BITS); - size_t value{0}; constexpr sizeclass_t(size_t value) : value(value) {} @@ -51,20 +92,19 @@ namespace snmalloc static constexpr sizeclass_t from_small_class(smallsizeclass_t sc) { - SNMALLOC_ASSERT(sc < TAG); - // Note could use `+` or `|`. Using `+` as will combine nicely with array - // offset. 
- return {TAG + sc}; + SNMALLOC_ASSERT(sc < NUM_SMALL_SIZECLASSES); + return {sc + 1}; } /** - * Takes the number of leading zero bits from the actual large size-1. - * See size_to_sizeclass_full + * Construct from a large class index `lc` in [0, NUM_LARGE_CLASSES). + * Large classes are stored as a contiguous run immediately after the + * small range and the sentinel slot. */ static constexpr sizeclass_t from_large_class(size_t large_class) { - SNMALLOC_ASSERT(large_class < TAG); - return {large_class}; + SNMALLOC_ASSERT(large_class < NUM_LARGE_CLASSES); + return {1 + NUM_SMALL_SIZECLASSES + large_class}; } static constexpr sizeclass_t from_raw(size_t raw) @@ -72,21 +112,16 @@ namespace snmalloc return {raw}; } - constexpr size_t index() - { - return value & (TAG - 1); - } - constexpr smallsizeclass_t as_small() { SNMALLOC_ASSERT(is_small()); - return smallsizeclass_t(value & (TAG - 1)); + return smallsizeclass_t(value - 1); } constexpr chunksizeclass_t as_large() { - SNMALLOC_ASSERT(!is_small()); - return bits::BITS - (value & (TAG - 1)); + SNMALLOC_ASSERT(!is_small() && !is_default()); + return value - 1 - NUM_SMALL_SIZECLASSES; } constexpr size_t raw() @@ -96,7 +131,9 @@ namespace snmalloc constexpr bool is_small() { - return (value & TAG) != 0; + // Sentinel (value == 0) underflows to a large positive value, which + // also fails the comparison — the sentinel is therefore not small. + return (value - 1) < NUM_SMALL_SIZECLASSES; } constexpr bool is_default() @@ -223,12 +260,23 @@ namespace snmalloc meta.mod_zero_mult = (~zero / meta.size) + 1; } - for (size_t sizeclass = 0; sizeclass < bits::BITS; sizeclass++) + for (size_t lc = 0; lc < NUM_LARGE_CLASSES; lc++) { - auto lsc = sizeclass_t::from_large_class(sizeclass); + auto lsc = sizeclass_t::from_large_class(lc); auto& meta = fast(lsc); - meta.size = sizeclass == 0 ? 
0 : bits::one_at_bit(lsc.as_large()); - meta.slab_mask = meta.size - 1; + // Continuous global exp+mantissa scheme: small classes occupy + // global indices [0, NUM_SMALL_SIZECLASSES); large classes occupy + // [NUM_SMALL_SIZECLASSES, NUM_SMALL_SIZECLASSES + NUM_LARGE_CLASSES). + size_t size = + bits::from_exp_mant( + NUM_SMALL_SIZECLASSES + lc); + meta.size = size; + // Natural alignment of the size: the largest power of two that + // divides `size`. For pow2 sizes, this equals `size`; for non-pow2 + // mantissa steps it is the slab granularity at which the allocation + // tiles. `slab_mask = align - 1`. + size_t align = size & (~size + 1); + meta.slab_mask = align - 1; // The slab_mask will do all the necessary work, so // perform identity multiplication for the test. meta.mod_zero_mult = 1; @@ -241,6 +289,16 @@ namespace snmalloc constexpr SizeClassTable sizeclass_metadata = SizeClassTable(); + // Slot 0 of `sizeclass_metadata` is the unmapped sentinel; it must remain + // zero-initialised so fast-path lookups via `fast(sc)` return zero size + // and slab_mask without needing a sentinel check before indexing. + static_assert( + sizeclass_metadata.fast(sizeclass_t{}).size == 0, + "sentinel slot must have size 0"); + static_assert( + sizeclass_metadata.fast(sizeclass_t{}).slab_mask == 0, + "sentinel slot must have slab_mask 0"); + static_assert( bits::BITS - sizeclass_metadata.DIV_MULT_SHIFT <= MAX_CAPACITY_BITS); @@ -291,16 +349,6 @@ namespace snmalloc return bits::one_at_bit(MIN_CHUNK_BITS + sizeclass); } - /** - * For large allocations, the metaentry stores the raw log_2 of the size, - * which must be shifted into the index space of slab_sizeclass-es. 
- */ - constexpr size_t - metaentry_chunk_sizeclass_to_slab_sizeclass(chunksizeclass_t sizeclass) - { - return sizeclass - MIN_CHUNK_BITS; - } - constexpr uint16_t sizeclass_to_slab_object_count(smallsizeclass_t sizeclass) { return sizeclass_metadata.slow(sizeclass_t::from_small_class(sizeclass)) @@ -378,10 +426,6 @@ namespace snmalloc return bits::next_pow2(size); } - inline static size_t large_size_to_chunk_sizeclass(size_t size) - { - return bits::next_pow2_bits(size) - MIN_CHUNK_BITS; - } constexpr SNMALLOC_PURE size_t sizeclass_lookup_index(const size_t s) { @@ -456,13 +500,17 @@ namespace snmalloc } /** - * A compressed size representation, - * either a small size class with the 7th bit set - * or a large class with the 7th bit not set. - * Large classes are stored as a mask shift. - * size = (~0 >> lc) + 1; - * Thus large size class 0, has size 0. - * And large size class 33, has size 2^31 + * Maps a requested size to its sizeclass. The result uses the unified + * encoding documented on `sizeclass_t`. + * + * For small sizes, this delegates to `size_to_sizeclass`. For large + * sizes in Phase 13, this rounds up to the next power of two (the + * front end still requests pow2-rounded reservations); Phase 15 + * removes the `next_pow2` call to enable non-pow2 large reservations. + * + * `to_exp_mant` is the literal inverse of the `from_exp_mant` used + * when populating `sizeclass_metadata`, so this never indexes the + * wrong slot. */ static inline sizeclass_t size_to_sizeclass_full(size_t size) { @@ -470,9 +518,12 @@ namespace snmalloc { return sizeclass_t::from_small_class(size_to_sizeclass(size)); } - // bits::clz is undefined on 0, but we have size == 1 has already been - // handled here. We conflate 0 and sizes larger than we can allocate. 
- return sizeclass_t::from_large_class(bits::clz(size - 1)); + SNMALLOC_ASSERT(size != 0); + SNMALLOC_ASSERT(size <= MAX_LARGE_SIZECLASS_SIZE); + size_t pow2 = bits::next_pow2(size); + size_t global = + bits::to_exp_mant(pow2); + return sizeclass_t::from_large_class(global - NUM_SMALL_SIZECLASSES); } inline SNMALLOC_FAST_PATH static size_t round_size(size_t size) @@ -492,7 +543,7 @@ namespace snmalloc return sizeclass_to_size(size_to_sizeclass(1)); } - if (size > bits::one_at_bit(bits::BITS - 1)) + if (size > MAX_LARGE_SIZECLASS_SIZE) { // This size is too large, no rounding should occur as will result in a // failed allocation later. diff --git a/src/snmalloc/global/globalalloc.h b/src/snmalloc/global/globalalloc.h index 7607e582a..8b4e42e2d 100644 --- a/src/snmalloc/global/globalalloc.h +++ b/src/snmalloc/global/globalalloc.h @@ -287,6 +287,19 @@ namespace snmalloc if (!entry.is_owned()) return; size = size == 0 ? 1 : size; + // Any size beyond what the sizeclass encoding can represent is + // necessarily a mismatch with the pagemap's recorded sizeclass; report + // it directly rather than feeding the unrepresentable size into + // `size_to_sizeclass_full`. 
+ if (size > MAX_LARGE_SIZECLASS_SIZE) + { + snmalloc_check_client( + mitigations(sanity_checks), + p == nullptr, + "Dealloc size exceeds encodable range: {}", + size); + return; + } auto sc = size_to_sizeclass_full(size); auto pm_sc = entry.get_sizeclass(); auto rsize = sizeclass_full_to_size(sc); diff --git a/src/snmalloc/mem/corealloc.h b/src/snmalloc/mem/corealloc.h index 127abc76a..10482b6b7 100644 --- a/src/snmalloc/mem/corealloc.h +++ b/src/snmalloc/mem/corealloc.h @@ -700,10 +700,11 @@ namespace snmalloc [](Allocator* self, size_t size) SNMALLOC_FAST_PATH_LAMBDA { return CheckInit::check_init( [self, size]() SNMALLOC_FAST_PATH_LAMBDA { - if (size > bits::one_at_bit(bits::BITS - 1)) + if (size > MAX_LARGE_SIZECLASS_SIZE) { - // Cannot allocate something that is more that half the size of - // the address space + // Cannot allocate something the sizeclass encoding cannot + // represent (equals `2 ^ ENCODED_ADDRESS_BITS` in + // `sizeclasstable.h` — well above any plausible request). return Conts::failure(size); } @@ -1117,8 +1118,7 @@ namespace snmalloc // XXX: because large objects have unique metadata associated with them, // the ring size here is one. We should probably assert that. - size_t entry_sizeclass = entry.get_sizeclass().as_large(); - size_t size = bits::one_at_bit(entry_sizeclass); + size_t size = sizeclass_full_to_size(entry.get_sizeclass()); #ifdef SNMALLOC_TRACING message<1024>("Large deallocation: {}", size); diff --git a/src/snmalloc/mem/metadata.h b/src/snmalloc/mem/metadata.h index e753f125c..dc4ff0948 100644 --- a/src/snmalloc/mem/metadata.h +++ b/src/snmalloc/mem/metadata.h @@ -40,9 +40,12 @@ namespace snmalloc * backend/largebuddyrange.h. * * This value is statically checked by the frontend to ensure that its - * bit packing does not conflict; see mem/remoteallocator.h + * bit packing does not conflict; see mem/remoteallocator.h. 
The marker + * tracks the sizeclass-encoding width (see `SIZECLASS_REP_SIZE` in + * ds/sizeclasstable.h): it must sit immediately above the highest bit + * used by a sizeclass raw value. */ - static constexpr address_t REMOTE_BACKEND_MARKER = 1 << 7; + static constexpr address_t REMOTE_BACKEND_MARKER = SIZECLASS_REP_SIZE; /** * Bit used to indicate this should not be considered part of the previous @@ -111,6 +114,16 @@ namespace snmalloc (REMOTE_BACKEND_MARKER << 1) - 1; public: + /** + * Bit position of the first bit available to backend metadata layouts + * above the reserved region. The reserved region runs from bit 0 up to + * and including the `REMOTE_BACKEND_MARKER` bit; layouts in + * `backend_arena_range.h` and `largebuddyrange.h` derive their bit + * positions (RED_BIT, VARIANT_SHIFT, LARGE_SIZE_SHIFT, ...) from this. + */ + static constexpr size_t BACKEND_LAYOUT_FIRST_FREE_BIT = + bits::next_pow2_bits_const(REMOTE_BACKEND_MARKER) + 1; + /** * Does the back end currently own this entry? 
Note that freshly * allocated entries are owned by the front end until explicitly diff --git a/src/snmalloc/override/rust.cc b/src/snmalloc/override/rust.cc index f07e51073..d2e7e2e08 100644 --- a/src/snmalloc/override/rust.cc +++ b/src/snmalloc/override/rust.cc @@ -39,8 +39,10 @@ extern "C" SNMALLOC_EXPORT void* SNMALLOC_NAME_MANGLE(rust_realloc)( size_t aligned_old_size = aligned_size(alignment, old_size), aligned_new_size = aligned_size(alignment, new_size); if ( + aligned_old_size <= MAX_LARGE_SIZECLASS_SIZE && + aligned_new_size <= MAX_LARGE_SIZECLASS_SIZE && size_to_sizeclass_full(aligned_old_size).raw() == - size_to_sizeclass_full(aligned_new_size).raw()) + size_to_sizeclass_full(aligned_new_size).raw()) return ptr; void* p = alloc(aligned_new_size); if (p) diff --git a/src/test/func/backend_arena_bins/backend_arena_bins.cc b/src/test/func/backend_arena_bins/backend_arena_bins.cc index 235ceb690..7dc126931 100644 --- a/src/test/func/backend_arena_bins/backend_arena_bins.cc +++ b/src/test/func/backend_arena_bins/backend_arena_bins.cc @@ -127,7 +127,8 @@ namespace snmalloc bitmap_info_for_request_const(size_t n) { return Bins::table_ - .bitmap_info[bits::to_exp_mant_const(n)]; + .bitmap_info[bits::to_exp_mant_const( + n)]; } /// `carve_info_for_request`, constexpr (uses `to_exp_mant_const`). @@ -135,7 +136,8 @@ namespace snmalloc static constexpr const carve_info_t& carve_info_for_request_const(size_t n) { return Bins::table_ - .carve_info[bits::to_exp_mant_const(n)]; + .carve_info[bits::to_exp_mant_const( + n)]; } // The canonical source of truth for what each within-exponent bin @@ -218,14 +220,11 @@ namespace static_checks "B=3 MAX_SC"); // Sizes that are powers of two have align == size. 
- static_assert( - B2::carve_info_for_request_const(4).align == 4, "size 4 align"); - static_assert( - B3::carve_info_for_request_const(8).align == 8, "size 8 align"); + static_assert(B2::carve_info_for_request_const(4).align == 4, "size 4 align"); + static_assert(B3::carve_info_for_request_const(8).align == 8, "size 8 align"); // sc_size at request(s) must be >= s. - static_assert( - B2::carve_info_for_request_const(9).size == 10, "B=2 round-up"); + static_assert(B2::carve_info_for_request_const(9).size == 10, "B=2 round-up"); static_assert( B3::carve_info_for_request_const(17).size == 18, "B=3 round-up"); } // namespace static_checks @@ -456,14 +455,12 @@ namespace const auto& ci = Bins::carve_info_for_request(s); if (ci.size != Bins::sc_size(sc)) { - std::printf( - "B=%zu carve_info_for_request(%zu).size mismatch\n", B, s); + std::printf("B=%zu carve_info_for_request(%zu).size mismatch\n", B, s); std::abort(); } if (ci.align != Bins::sc_align(sc)) { - std::printf( - "B=%zu carve_info_for_request(%zu).align mismatch\n", B, s); + std::printf("B=%zu carve_info_for_request(%zu).align mismatch\n", B, s); std::abort(); } if (&ci != &Bins::carve_info(sc)) @@ -536,8 +533,7 @@ namespace size_t step = Bins::max_supported_size() / 257; if (step == 0) step = 1; - for (size_t n = 1; n <= Bins::max_supported_size() && n > 0; - n += step + 1) + for (size_t n = 1; n <= Bins::max_supported_size() && n > 0; n += step + 1) check_one(n); } @@ -546,7 +542,8 @@ namespace /// (defined directly in terms of `bin_subsets`). template size_t reference_find( - size_t n_chunks, const typename BackendArenaBinsTestAccess::Bitmap& bm) + size_t n_chunks, + const typename BackendArenaBinsTestAccess::Bitmap& bm) { using Bins = BackendArenaBinsTestAccess; using Bitmap = typename Bins::Bitmap; @@ -1246,7 +1243,8 @@ namespace // request(n) at MIN_SIZE_BITS==0; sc_size(raw) at MIN_SIZE_BITS==K // equals sc_size(raw) at MIN_SIZE_BITS==0 times U; sc_align // likewise. 
- size_t probe[] = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 16, 17, 32, 65, 127, 1024}; + size_t probe[] = { + 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 16, 17, 32, 65, 127, 1024}; for (size_t n : probe) { // Skip values that would overflow either instance's domain. @@ -1270,8 +1268,9 @@ namespace using BaseR = typename Base::range_t; for (size_t n = 1; n <= 64; n++) for (size_t a = 0; a < 32; a++) - if (Scaled::bin_index(ScaledR{a * U, n * U}) != - Base::bin_index(BaseR{a, n})) + if ( + Scaled::bin_index(ScaledR{a * U, n * U}) != + Base::bin_index(BaseR{a, n})) std::abort(); // carve({0, blk*U}, n*U) returns the same partition as @@ -1287,14 +1286,17 @@ namespace continue; auto base_cv = Base::carve(BaseR{0, blk}, n); auto scaled_cv = Scaled::carve(ScaledR{0, blk * U}, n * U); - if (scaled_cv.pre.base != base_cv.pre.base * U || - scaled_cv.pre.size != base_cv.pre.size * U) + if ( + scaled_cv.pre.base != base_cv.pre.base * U || + scaled_cv.pre.size != base_cv.pre.size * U) std::abort(); - if (scaled_cv.req.base != base_cv.req.base * U || - scaled_cv.req.size != base_cv.req.size * U) + if ( + scaled_cv.req.base != base_cv.req.base * U || + scaled_cv.req.size != base_cv.req.size * U) std::abort(); - if (scaled_cv.post.base != base_cv.post.base * U || - scaled_cv.post.size != base_cv.post.size * U) + if ( + scaled_cv.post.base != base_cv.post.base * U || + scaled_cv.post.size != base_cv.post.size * U) std::abort(); } diff --git a/src/test/func/cheri/cheri.cc b/src/test/func/cheri/cheri.cc index 1928dbbd5..7e2318e11 100644 --- a/src/test/func/cheri/cheri.cc +++ b/src/test/func/cheri/cheri.cc @@ -266,7 +266,7 @@ int main() SNMALLOC_CHECK(sz == Aal::capptr_size_round(sz)); } - for (size_t sc = 0; sc < bits::BITS; sc++) + for (size_t sc = 0; sc < NUM_LARGE_CLASSES; sc++) { size_t sz = sizeclass_full_to_size(sizeclass_t::from_large_class(sc)); SNMALLOC_CHECK(sz == Aal::capptr_size_round(sz)); diff --git a/src/test/func/release-rounding/rounding.cc 
b/src/test/func/release-rounding/rounding.cc index 4d11eaafb..d03cfe772 100644 --- a/src/test/func/release-rounding/rounding.cc +++ b/src/test/func/release-rounding/rounding.cc @@ -51,5 +51,47 @@ int main(int argc, char** argv) if (failed) abort(); } + + // Exercise pow2 large sizeclasses end-to-end materialised in Phase 13. + // For each pow2 size S that the front end actually reaches (lc values that + // are pow2-aligned in the global exp+mantissa scheme), verify + // index_in_object / is_start_of_object at a representative set of offsets: + // the start of an object, an arbitrary interior offset, and the start of + // the next object. Bound the loop by ENCODED_ADDRESS_BITS so + // `bits::one_at_bit(b)` never shifts by >= BITS. + for (size_t b = MAX_SMALL_SIZECLASS_BITS + 1; b <= ENCODED_ADDRESS_BITS; b++) + { + size_t S = bits::one_at_bit(b); + sizeclass_t sc = size_to_sizeclass_full(S); + + address_t base = address_t(0); + size_t offsets[] = {0, 1, S / 2, S - 1, S}; + for (size_t off : offsets) + { + address_t addr = base + off; + size_t expected_mod = off % S; + bool expected_start = expected_mod == 0; + + size_t opt_mod = index_in_object(sc, addr); + if (opt_mod != expected_mod) + { + std::cout << "Large S=" << S << " offset=" << off + << " index_in_object=" << opt_mod + << " expected=" << expected_mod << std::endl; + failed = true; + } + + bool opt_start = is_start_of_object(sc, addr); + if (opt_start != expected_start) + { + std::cout << "Large S=" << S << " offset=" << off + << " is_start_of_object=" << opt_start + << " expected=" << expected_start << std::endl; + failed = true; + } + } + if (failed) + abort(); + } return 0; } diff --git a/src/test/func/sizeclass/sizeclass.cc b/src/test/func/sizeclass/sizeclass.cc index ac7ec6bd8..093b17424 100644 --- a/src/test/func/sizeclass/sizeclass.cc +++ b/src/test/func/sizeclass/sizeclass.cc @@ -67,6 +67,118 @@ void test_align_size() abort(); } +void test_uniform_large_sizeclasses() +{ + using namespace snmalloc; + 
bool failed = false; + + // Sentinel sanity: default-constructed sizeclass_t is the unmapped sentinel + // and not classified as small. + if (sizeclass_t{}.raw() != 0) + { + std::cout << "Default sizeclass_t raw is " << sizeclass_t{}.raw() + << " expected 0" << std::endl; + failed = true; + } + if (sizeclass_t{}.is_default() != true) + { + std::cout << "Default sizeclass_t .is_default() is false" << std::endl; + failed = true; + } + if (sizeclass_t{}.is_small()) + { + std::cout << "Default sizeclass_t.is_small() is true" << std::endl; + failed = true; + } + + // Encoding sanity: small range and large range are disjoint and adjacent + // in the value space. + if (sizeclass_t::from_small_class(smallsizeclass_t(0)).raw() != 1) + { + std::cout << "from_small_class(0).raw() != 1" << std::endl; + failed = true; + } + if ( + sizeclass_t::from_small_class(smallsizeclass_t(NUM_SMALL_SIZECLASSES - 1)) + .raw() + + 1 != + sizeclass_t::from_large_class(0).raw()) + { + std::cout << "Small/large ranges are not adjacent" << std::endl; + failed = true; + } + if ( + sizeclass_t::from_large_class(NUM_LARGE_CLASSES - 1).raw() >= + SIZECLASS_REP_SIZE) + { + std::cout << "Largest large sizeclass overflows SIZECLASS_REP_SIZE" + << std::endl; + failed = true; + } + if (!sizeclass_t::from_small_class(smallsizeclass_t(0)).is_small()) + { + std::cout << "from_small_class(0).is_small() is false" << std::endl; + failed = true; + } + if (sizeclass_t::from_large_class(0).is_small()) + { + std::cout << "from_large_class(0).is_small() is true" << std::endl; + failed = true; + } + + // Large sizeclasses are strictly increasing in size with lc. 
+ size_t prev_size = 0; + for (size_t lc = 0; lc < NUM_LARGE_CLASSES; lc++) + { + size_t size = sizeclass_full_to_size(sizeclass_t::from_large_class(lc)); + if (size <= prev_size) + { + std::cout << "Non-monotonic large sizeclass: lc=" << lc + << " size=" << size << " prev=" << prev_size << std::endl; + failed = true; + } + prev_size = size; + } + + // Round-trip identity on pow2 large sizes in Phase 13: every pow2 size + // S in [MAX_SMALL_SIZECLASS_SIZE * 2, MAX_LARGE_SIZECLASS_SIZE] must satisfy + // sizeclass_full_to_size(size_to_sizeclass_full(S)) == S. Bound the loop by + // ENCODED_ADDRESS_BITS so `bits::one_at_bit(b)` never shifts by >= BITS + // (the bound check itself would fail on 32-bit otherwise). + for (size_t b = MAX_SMALL_SIZECLASS_BITS + 1; b <= ENCODED_ADDRESS_BITS; b++) + { + size_t S = bits::one_at_bit(b); + sizeclass_t sc = size_to_sizeclass_full(S); + size_t rs = sizeclass_full_to_size(sc); + if (rs != S) + { + std::cout << "Pow2 round-trip failed: S=" << S << " round=" << rs + << std::endl; + failed = true; + } + + // For every non-pow2 size X strictly between adjacent pow2 [P, 2P), the + // result must round up to 2P (pow2 rounding still in force in Phase 13). + // Only check when 2P is still representable. 
+ if (b < ENCODED_ADDRESS_BITS) + { + size_t mid = S + (S >> 1); + sizeclass_t sc_mid = size_to_sizeclass_full(mid); + size_t rs_mid = sizeclass_full_to_size(sc_mid); + size_t expect = bits::one_at_bit(b + 1); + if (rs_mid != expect) + { + std::cout << "Non-pow2 should round to next pow2: X=" << mid + << " round=" << rs_mid << " expected=" << expect << std::endl; + failed = true; + } + } + } + + if (failed) + abort(); +} + int main(int, char**) { setup(); @@ -149,4 +261,5 @@ int main(int, char**) abort(); test_align_size(); + test_uniform_large_sizeclasses(); } From 1144eab4f31f176d1eca7adc5f0187b8fa03eb73 Mon Sep 17 00:00:00 2001 From: Matthew Parkinson Date: Mon, 25 May 2026 16:02:06 +0100 Subject: [PATCH 12/31] Fix double base-adjust in FlatPagemap::get_mut get_mut base-adjusted p before calling register_range, which then re-applied the base subtraction internally and tripped its out-of-range guard for legitimate in-range addresses. The path is reachable on PALs without LazyCommit (e.g. PALNoAlloc) when get/get_mut is called on an in-range address of a bounded pagemap. Move the register_range call before the p = p - base adjust so it sees the un-adjusted address that its bounds check expects. Add a regression test in func-pagemap that wraps DefaultPal with a stub stripping LazyCommit; this exercises the previously-broken path. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- src/snmalloc/ds/pagemap.h | 9 +++++++- src/test/func/pagemap/pagemap.cc | 39 ++++++++++++++++++++++++++++++++ 2 files changed, 47 insertions(+), 1 deletion(-) diff --git a/src/snmalloc/ds/pagemap.h b/src/snmalloc/ds/pagemap.h index 2ee3cdd29..983b82e83 100644 --- a/src/snmalloc/ds/pagemap.h +++ b/src/snmalloc/ds/pagemap.h @@ -343,17 +343,24 @@ namespace snmalloc PAL::error("Internal error: Pagemap read access out of range."); } } - p = p - base; } // If this is potentially_out_of_range, then the pages will not have // been mapped. 
With Lazy commit they will at least be mapped read-only // Note that: this means external pointer on Windows will be slow. + // register_range takes an unadjusted address: it does its own + // base-relative arithmetic when has_bounds, so it must be called + // before the p = p - base adjustment below. if constexpr (potentially_out_of_range && !pal_supports) { register_range(p, 1); } + if constexpr (has_bounds) + { + p = p - base; + } + if constexpr (potentially_out_of_range) return body_opt[p >> SHIFT]; else diff --git a/src/test/func/pagemap/pagemap.cc b/src/test/func/pagemap/pagemap.cc index 7a03fa1a7..8e024bf6b 100644 --- a/src/test/func/pagemap/pagemap.cc +++ b/src/test/func/pagemap/pagemap.cc @@ -14,6 +14,17 @@ using namespace snmalloc; static constexpr size_t GRANULARITY_BITS = 20; +/** + * Test PAL that wraps DefaultPal but strips LazyCommit from pal_features. + * Used to exercise the get code path that calls register_range on + * a bounded pagemap — see test_get_potentially_out_of_range_bounded below. + */ +struct NoLazyCommitPal : public DefaultPal +{ + static constexpr uint64_t pal_features = + DefaultPal::pal_features & ~static_cast(LazyCommit); +}; + struct T { size_t v = 99; @@ -27,6 +38,9 @@ FlatPagemap pagemap_test_unbound; FlatPagemap pagemap_test_bound; +FlatPagemap + pagemap_test_bound_no_lazy; + size_t failure_count = 0; void check_get( @@ -158,6 +172,31 @@ int main(int argc, char** argv) test_pagemap(false); test_pagemap(true); + // Regression test for the bounded + !LazyCommit path of get. + // Previously, get_mut base-adjusted p before calling register_range, + // which double-subtracted base inside register_range and tripped the + // out-of-range guard for legitimate in-range addresses. 
+ { + auto size = bits::one_at_bit(GRANULARITY_BITS + 4); + auto* base = NoLazyCommitPal::reserve(size); + NoLazyCommitPal::notify_using(base, size); + auto [heap_base, heap_size] = + pagemap_test_bound_no_lazy.init(base, size); + auto low = address_cast(heap_base); + + pagemap_test_bound_no_lazy.set(low, T(7)); + + // get with has_bounds && !LazyCommit must not error on an in-range + // address: the underlying register_range call must see the un-adjusted + // address, because it applies the base adjustment itself. + T value = pagemap_test_bound_no_lazy.get(low); + if (value.v != 7) + { + std::cout << "get bounded !LazyCommit: read " << value.v + << " expected 7" << std::endl; + failure_count++; + } + } + if (failure_count != 0) { std::cout << "Failure count: " << failure_count << std::endl; From 8a3da45c3b2a4418e4b6fce3455abbcba9eaf976 Mon Sep 17 00:00:00 2001 From: Matthew Parkinson Date: Wed, 3 Jun 2026 17:03:57 +0100 Subject: [PATCH 13/31] Phase 14: per-chunk pagemap offset encoding Encode (sizeclass, slab-offset) jointly in the pagemap entry so the front end can recover the allocation start for an arbitrary interior chunk of a multi-slab-tile large allocation. The front end still only issues pow2 large requests, so every materialised entry today has offset=0; this lays the groundwork for Phase 15+ non-pow2 large support without front-end changes. Key pieces: - offset_and_sizeclass_t packs sizeclass into the low SIZECLASS_BITS and per-chunk offset into the next OFFSET_BITS of one word. - Backend::alloc_chunk loops over slab tiles, writing each tile's slab_index into the offset bits of its pagemap entry. - SizeClassTable is split into three by purpose: * start_ (sizeclass_data_start, 32B/row, indexed by osc): hot path for start_of_object on every dealloc. * align_ (sizeclass_data_align, 16B/row, indexed by sc): used by is_start_of_object alignment check in -check builds. * slab_ (sizeclass_data_slab, 4B/row, indexed by sc): cold; slab init thresholds. 
- start_of_object branches on osc.offset() == 0 (testable from bits already loaded in osc.raw()), so the offset=0 hot path skips the offset_bytes load and offset-shift arithmetic. Combined with the table split, perf-external_pointer-fast matches the baseline (~290 ms median) with no regression; perf-singlethread-check is within noise. - New src/test/func/large_offset targeted test reaches the multi-slab-tile branch via the public backend API. - check_invariant in BackendArena now uses SNMALLOC_CHECK rather than SNMALLOC_ASSERT, so callers that opt in via enabled=true get the invariant checks even in Release builds (which is what the tests want); the #ifndef NDEBUG wrapper is no longer needed. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- PLAN.md | 1109 ++++++++++++----- src/snmalloc/backend/backend.h | 26 +- src/snmalloc/backend_helpers/backend_arena.h | 43 +- .../backend_helpers/backend_arena_bins.h | 2 +- .../backend_helpers/backend_arena_range.h | 28 +- .../backend_helpers/largebuddyrange.h | 5 +- src/snmalloc/ds/sizeclasstable.h | 416 +++++-- src/snmalloc/global/globalalloc.h | 14 +- src/snmalloc/mem/corealloc.h | 4 +- src/snmalloc/mem/metadata.h | 199 +-- src/snmalloc/override/new.cc | 3 +- src/snmalloc/override/rust.cc | 4 +- src/test/func/backend_arena/backend_arena.cc | 4 +- .../backend_arena_bins/backend_arena_bins.cc | 6 +- src/test/func/large_offset/large_offset.cc | 225 ++++ src/test/func/release-rounding/rounding.cc | 40 +- 16 files changed, 1549 insertions(+), 579 deletions(-) create mode 100644 src/test/func/large_offset/large_offset.cc diff --git a/PLAN.md b/PLAN.md index b3174911f..cb99ba20c 100644 --- a/PLAN.md +++ b/PLAN.md @@ -1968,7 +1968,7 @@ now returns `!mock_store[mock_index(addr)].boundary` — faithful to the real `PagemapRep::can_consolidate` reading `entry.is_boundary()`. 
The `mock_index` bounds assertion fires on any out-of-range probe, so the unsafe pattern trips in unit tests rather than only as a segfault in -production. A new test `test_block_at_arena_top_edge` adds a block +release builds. A new test `test_block_at_arena_top_edge` adds a block whose `succ_addr` sits one past the arena's pagemap; without the reorder this test reproduces the original failure. @@ -2116,7 +2116,7 @@ exp+mantissa space starting at `MAX_SMALL_SIZECLASS_BITS`. Adjacent classes step by `2^(E - INTERMEDIATE_BITS)` continuously, with no jump at the small/large boundary. -No production behaviour changes yet: the front-end still calls +No front-end behaviour changes yet: the front-end still calls `large_size_to_chunk_size(size) = next_pow2(size)` and writes the pagemap with the corresponding pow2-rounded sizeclass. The non-pow2 large sizeclasses are **populated in the table** (so the size / @@ -2611,301 +2611,706 @@ caller path skips the bound check. the tag bit). - `round_size(size)` for large: still pow2 here; Phase 15 fixes. -# Phase 14: Per-chunk pagemap offset (slab-granular) +# Phase 14: Per-chunk offset in `ras` + combined-indexed metadata ## Goal -Add a per-chunk "slab offset within allocation" field to -`FrontendMetaEntry`, written by a new `set_metaentry_large` path, -and use it in `start_of_object` / `is_start_of_object` so that the -start of a large allocation can be recovered from any address -within it — independent of the allocation's alignment. This unlocks -Phase 15. - -After Phase 14, the start-finding code uses the per-chunk offset -for large allocations and continues to use `slab_mask` for small. -With the front-end still issuing pow2 large requests (Phase 15 -changes that), every materialised large allocation has -`info.align == size` so `slab_mask = size - 1` covers the whole -allocation with offset always 0 — exactly today's behaviour. 
+Recover the start address of a large allocation from any interior +address, independent of allocation alignment. Stored as a per-chunk +slab-offset in the pagemap entry, packed alongside the sizeclass in +the `ras` (`remote_and_sizeclass`) word so that the same pagemap +word loaded for the sizeclass directly yields the index into the +metadata table that already has the offset-recovery delta +pre-baked. This unlocks Phase 15. + +## Design summary + +- **Layout**: offset bits sit in `ras` directly above the sizeclass + bits and directly below the `REMOTE_BACKEND_MARKER`. Reading the + same `ras` word the sizeclass-extract path already loads, masking + with `COMBINED_MASK` yields the combined sizeclass+offset value + ready to use as a table index — no extra load, no shift, no OR, + no multiply. (Default config: 11 bits of combined index; the mask + widens from `SIZECLASS_REP_SIZE - 1` to `COMBINED_REP_SIZE - 1` + but is still a single `and`-with-imm.) +- **Metadata table**: `sizeclass_metadata.fast_` is widened from + `SIZECLASS_REP_SIZE` rows to `COMBINED_REP_SIZE` rows + (= `SIZECLASS_REP_SIZE << OFFSET_BITS`). Each row gains a + pre-computed `offset_bytes` field equal to `offset * slab_size` + for that sizeclass. Recovery is + `alloc_start = (addr & ~slab_mask) - offset_bytes`. +- **Code**: `start_of_object` and friends take a *combined* index + (`size_t`); the wrapper in `globalalloc.h` passes + `entry.get_offset_and_sizeclass()`. No branches, no extra word loads + on the fast path. +- **Backend**: in `alloc_chunk`, the small-and-pow2-large fast path + (`slab_size >= size`) uses the existing `set_metaentry`. The + non-pow2-large (multi-slab-tile) path writes a per-chunk + `ras = encode(remote, sc, slab_index)` via `concretePagemap.set`. ## Why now - Phase 15 introduces non-pow2 reservations. The existing `addr & ~slab_mask` answer is wrong for non-pow2 sizes/alignments. -- Per-chunk offset is the mechanism PLAN.md (lines 65-71) already - identified. 
Phase 14 implements that mechanism with offset = 0 - semantics matching the existing pow2 path — so it can land - without changing observable behaviour. +- A per-chunk offset is the long-identified mechanism (PLAN.md + intro). Phase 14 implements that mechanism with offset = 0 + semantics matching the existing pow2 path — so it lands without + changing observable behaviour for today's allocations. +- Packing the offset into `ras` (not `meta`) at the time we land + the field avoids a second `meta`-word load on + `__malloc_start_pointer` and avoids a runtime multiply on every + external_pointer query. ## Design -### Slab granularity, not chunk granularity +### Bit layout of `ras` + +``` +ras = [ RemoteAllocator* | BACKEND_MARKER | offset_bits | sizeclass_bits ] + ↑ + low bits +``` + +Bit positions (low to high): +- bits `[0, SIZECLASS_BITS)`: sizeclass — **unchanged** position. +- bits `[SIZECLASS_BITS, SIZECLASS_BITS + OFFSET_BITS)`: offset + (frontend-owned, non-zero only for non-pow2 large in Phase 15+). +- bit `[SIZECLASS_BITS + OFFSET_BITS]`: `REMOTE_BACKEND_MARKER` + (moves up by `OFFSET_BITS` positions). +- bits above: `RemoteAllocator*` payload. + +Constants: + +```cpp +// in sizeclasstable.h (alongside existing SIZECLASS_BITS): +constexpr size_t OFFSET_BITS = INTERMEDIATE_BITS + 1; +constexpr size_t COMBINED_BITS = SIZECLASS_BITS + OFFSET_BITS; +constexpr size_t COMBINED_REP_SIZE = bits::one_at_bit(COMBINED_BITS); +``` + +`REMOTE_BACKEND_MARKER` in `metadata.h` redefines from +`SIZECLASS_REP_SIZE` to `COMBINED_REP_SIZE`. `REMOTE_MIN_ALIGN` +follows: `max(CACHELINE_SIZE, COMBINED_REP_SIZE) << 1`. For the +default config (SIZECLASS_BITS=8, OFFSET_BITS=3): the marker moves +from bit 8 to bit 11, and `REMOTE_MIN_ALIGN` from 512 B to 4096 B. + +Existing `MetaEntryBase::get_sizeclass()` must continue to return +pure sizeclass; with the marker moving up, masking by +`REMOTE_WITH_BACKEND_MARKER_ALIGN - 1` would now include the offset +bits. 
Define a dedicated `SIZECLASS_MASK = SIZECLASS_REP_SIZE - 1` +(unchanged in value from today's effective mask) and use it +explicitly in `get_sizeclass()`. The new `COMBINED_MASK = +COMBINED_REP_SIZE - 1` is what `get_offset_and_sizeclass()` uses. + +### `OFFSET_BITS` derivation + +With `INTERMEDIATE_BITS = M`, the worst-case non-pow2 large +sizeclass tiles into `2^(M+1)` slabs (e.g., a 7×slab_size class +with M=2: reserve rounds up to 8 slabs, max slab index = 7). So +`OFFSET_BITS = M + 1` gives `2^(M+1)` distinct values, exactly +enough for `[0, 2^(M+1))`. A `static_assert` on +`max_large_slab_index() < (1 << OFFSET_BITS)` (existing helper at +`sizeclasstable.h:273-285`) guards against any sizeclass-table +change. + +### `meta` word stays simple -The offset records "which slab within the allocation does this -chunk belong to", in units of the per-sizeclass `slab_size`. The -recovery formula (matching PLAN.md lines 65-71) is: +The `meta` word goes back to its pre-Phase-14 layout: ``` -start = (addr & ~slab_mask) - offset * slab_size +meta = [ SlabMetadata* | META_BOUNDARY_BIT ] ``` -where `slab_size = info.align` (the natural alignment from -`backend_arena_bins.h:741`, `info.align = size & (~size + 1)`, -i.e. the lowest set bit of `size`), and `slab_mask = slab_size - 1`. -Both are per-sizeclass, stored in `sizeclass_data_fast` as -`slab_mask` (already there, value changes per Phase 13). - -**Offset width.** With `INTERMEDIATE_BITS = 2`, a large sizeclass -of size `S = (4+M) * 2^(E-2)` for `M ∈ {0,1,2,3}` has: - -| M | size factor | info.align (lowest set bit) | slabs (size/align) | -|---|-------------|------------------------------|---------------------| -| 0 | 4 | 2^E (= size) | 1 | -| 1 | 5 | 2^(E-2) | 5 | -| 2 | 6 | 2^(E-1) | 3 | -| 3 | 7 | 2^(E-2) | 7 | - -Worst case `2^(M+1) - 1` slabs (M = `INTERMEDIATE_BITS`): for M=2, -the table above shows 7 slabs. Generalising: `OFFSET_BITS = M + 1` -gives the needed `2^(M+1)` distinct values. 
A `static_assert` in -`metadata.h` guards the bound: -`static_assert((1 << OFFSET_BITS) > max_slabs_in_largest_class)`. - -(With natural alignment, the allocation incurs no address-space -waste beyond what alignment already implies. With computed -`OFFSET_BITS = INTERMEDIATE_BITS + 1`, we accept the extra -`meta`-word bit consumption to keep allocations at natural -alignment.) - -### Offset is a frontend concept (layering) - -Per user clarification: the offset is owned by the frontend (used -to recover start-of-object from an interior pointer); the boundary -bit is owned by the backend (used to mark PAL-allocation boundaries -for the buddy allocator). - -Both bits happen to live in the `meta` word of the pagemap entry, -but they are conceptually disjoint: - -- Offset accessors live on `FrontendMetaEntry`, not on - `MetaEntryBase`. The boundary bit machinery - (`MetaEntryBase::set_boundary`, `clear_boundary_bit`, `is_boundary`) - must not clobber offset bits — and currently doesn't, because - it only `|=` / `&= ~` the single boundary bit at position 0. -- The frontend's `set_metaentry_large` packs offset into `meta` - and must preserve the boundary bit. **Key observation**: - `MetaEntryBase::operator=` at `metadata.h:162-169` *already* - preserves the target's boundary bit on assignment. So writing a - freshly-constructed `Entry t_i(meta, ras)` (with offset and - boundary both zero), calling `set_offset(slab_index)` on it (RMW - that touches only OFFSET bits — boundary on `t_i` is still 0), - then `concretePagemap.set(addr, t_i)` (which assigns via - `operator=`) leaves the pagemap entry's pre-existing boundary - bit intact. No manual boundary-preservation logic is needed. -- `FrontendMetaEntry::get_slab_metadata()` (currently at - `metadata.h:739-740` masks `meta & ~META_BOUNDARY_BIT`) must - also mask the offset bits. The simplest way: extend the existing - mask constant. 
Define `META_FRONTEND_RESERVED_MASK = - META_BOUNDARY_BIT | (((1 << OFFSET_BITS) - 1) << OFFSET_SHIFT)` - and mask with that everywhere `get_slab_metadata` needs the - pointer. - -### Where the offset lives in the `meta` word - -Bits `1..OFFSET_BITS` of `meta` (with `OFFSET_SHIFT = 1`): - -- Bit 0: `META_BOUNDARY_BIT` (backend-owned). -- Bits `1..OFFSET_BITS`: offset (frontend-owned, large-only). -- Bits `(1 + OFFSET_BITS)..`: `SlabMetadata*` payload (natural - pointer alignment). - -This requires `alignof(SlabMetadata) >= (1 << (1 + OFFSET_BITS))`. -For default `INTERMEDIATE_BITS=2`, `OFFSET_BITS=3`, the requirement -is `alignof(SlabMetadata) >= 16`. Inspect `SlabMetadata` at the top -of Phase 14; if alignment is insufficient, add -`alignas(1 << (1 + OFFSET_BITS))` (= `alignas(16)` for default) to -`SlabMetadata`. Cost: a few bytes of padding per slab metadata -record — negligible. - -### Accessors - -Add to `FrontendMetaEntry`: - -- `static constexpr size_t OFFSET_BITS = INTERMEDIATE_BITS + 1;` - (derives from `INTERMEDIATE_BITS` because the worst-case slab - count for a non-pow2 large class with M mantissa bits is - `2^M + (2^M - 1) = 2^(M+1) - 1`. For default `INTERMEDIATE_BITS=2` - this gives `OFFSET_BITS = 3` (max offset 7, matching a worst case - of 7 slabs). For `INTERMEDIATE_BITS=3` (config option) it gives - `OFFSET_BITS = 4` (max offset 15, matching a worst case of 15 - slabs).) -- `static constexpr size_t OFFSET_SHIFT = 1;` (immediately above - the boundary bit) -- `static constexpr address_t OFFSET_MASK = - ((1 << OFFSET_BITS) - 1) << OFFSET_SHIFT;` -- `void set_offset(size_t slab_offset)`: read-modify-write of - `meta`, preserving boundary bit and `SlabMetadata*` payload. - Asserts `slab_offset < (1 << OFFSET_BITS)`. -- `size_t get_offset() const`: reads `(meta & OFFSET_MASK) >> - OFFSET_SHIFT`. 
- -Update the existing pointer mask: define -`META_FRONTEND_RESERVED_MASK = META_BOUNDARY_BIT | OFFSET_MASK`, -update `get_slab_metadata()` to mask `meta & ~META_FRONTEND_RESERVED_MASK`. - -A `static_assert(alignof(SlabMetadata) >= (1 << (OFFSET_BITS + -OFFSET_SHIFT)))` enforces the pointer-alignment requirement at -compile time. For the default config this requires -`alignof(SlabMetadata) >= 16`. Verify the current value and add -`alignas(16)` (or computed `alignas(1 << (OFFSET_BITS+OFFSET_SHIFT))`) -to `FrontendSlabMetadata` if needed. - -### `Pagemap::set_metaentry` (split into small vs large) - -The existing `set_metaentry` (writes uniform entries per chunk in -a range) is a static member of `BasicPagemap` in -`backend_helpers/pagemap.h:56-66`, which uses -`concretePagemap.set(...)` to reach the underlying `FlatPagemap`. -The new `set_metaentry_large` is added as a static member alongside -it. - -`FrontendMetaEntry` deletes its copy constructor (`metadata.h:754`), -so we cannot use `Entry t_i = t;` and modify per chunk. Instead, -reconstruct each per-chunk entry from its components: +No offset bits. No `META_FRONTEND_RESERVED_MASK`. No alignas on +`FrontendSlabMetadata`. `get_slab_metadata()` masks just +`META_BOUNDARY_BIT`. This removes a load on the pointer-recovery +hot path (no `mov (%rdx),%rcx` to fish offset out of `meta`). + +### Combined-indexed metadata table + +`SizeClassTable::fast_` (`sizeclasstable.h:181`) widens: ```cpp -// In BasicPagemap, alongside set_metaentry: -static void set_metaentry_large( - address_t p, - size_t size, - size_t slab_size, - SlabMetadata* meta, - uintptr_t remote_and_sizeclass) -{ - // slab_size = info.align of this sizeclass. - // size = total allocation size (== sizeclass-encoded size). 
- for (size_t chunk_offset = 0; chunk_offset < size; - chunk_offset += MIN_CHUNK_SIZE) - { - size_t slab_index = chunk_offset / slab_size; - Entry t_i(meta, remote_and_sizeclass); // meta low bits = 0 - t_i.set_offset(slab_index); // RMW; touches only OFFSET bits - concretePagemap.set(p + chunk_offset, t_i); - } +struct sizeclass_data_fast { + size_t size; + size_t slab_mask; + size_t div_mult; + size_t mod_zero_mult; + size_t offset_bytes; // NEW: precomputed (combined >> SIZECLASS_BITS) * slab_size +}; + +ModArray fast_{}; +``` + +Memory: `COMBINED_REP_SIZE × sizeof(sizeclass_data_fast)`. With +SIZECLASS_BITS=8, OFFSET_BITS=3, sizeof=40: ~80 KB. Fits L2. +(`fast_small`'s today-1KB working set still fits L1 for the +small-only paths because those index `sc.raw()` directly, which +lands in the first `SIZECLASS_REP_SIZE` rows.) + +`slow_` stays sc-indexed at `SIZECLASS_REP_SIZE` rows: it is only +read by slow paths that don't care about offset. + +Table initialization fills every `(sc, offset)` cell: +- Other fields duplicate the `(sc, 0)` row. +- `offset_bytes = offset * sizeclass_full_to_slab_size(sc)`. + +For `offset == 0` rows: `offset_bytes = 0`. The first +`SIZECLASS_REP_SIZE` rows of the new `fast_` are byte-identical to +today's table plus a trailing `offset_bytes = 0`. + +**`fast()` overloads.** Keep the existing +`fast(sizeclass_t sc)` overload (`sizeclasstable.h:186-193`) +unchanged — it forwards to `fast_[sc.raw()]`, which hits the +offset = 0 row, identical to today's behaviour. Add a new +overload `fast(size_t combined)` that does `fast_[combined]`. +Call sites that have a sizeclass_t (most existing code) keep +calling `fast(sc)`; sites that have a combined index from the +pagemap call `fast(combined)`. No source change for the majority +of existing call sites. 
+ +### Accessors on `MetaEntryBase` / `FrontendMetaEntry` + +Add to `MetaEntryBase`: + +```cpp +// returns the value to use as an index into sizeclass_metadata.fast_ +[[nodiscard]] SNMALLOC_FAST_PATH size_t get_offset_and_sizeclass() const { + return static_cast(remote_and_sizeclass) & COMBINED_MASK; } ``` -**Boundary-bit preservation**: `MetaEntryBase::operator=` at -`metadata.h:162-169` already preserves the *target's* boundary bit -when copy-assigning from `other`. `FlatPagemap::set` uses `=` to -write entries. Therefore: the freshly-constructed `t_i` carries -`boundary = 0`, but when it is assigned into the pagemap slot, the -slot's pre-existing boundary bit (set earlier by the backend's -`register_range`) is preserved by `operator=`. No manual -boundary-preservation logic is needed in this loop. - -**Backend call site** (`backend.h:131-132`): dispatch on -`sizeclass.is_small()`. Small path keeps existing -`Pagemap::set_metaentry(p, size, t)`. Large path: -`Pagemap::set_metaentry_large(p, size, - sizeclass_data_fast(sc).slab_mask + 1, - meta, ras);` -where `meta` and `ras` are the `SlabMetadata*` and -`remote_and_sizeclass` values currently passed to the `Entry t(meta, -ras)` construction at `backend.h:131`. - -### `start_of_object` / `is_start_of_object` - -The current `start_of_object` lives in `sizeclasstable.h` with no -Pagemap access. After Phase 14, the large case needs the per-chunk -offset — which lives in the pagemap. - -Split the function: keep `sizeclasstable.h`'s `start_of_object` as -the small-case implementation (rename internally to -`start_of_object_small` if helpful), and add a Config-aware wrapper -in `globalalloc.h` (or `mem/start_of_object.h`): +Keep `get_sizeclass()` returning a `sizeclass_t` (pure sizeclass, +low SIZECLASS_BITS only). 
Add an offset accessor for tests / +diagnostics: ```cpp -template -inline address_t start_of_object(address_t addr) { - // Use the existing public BackendAllocator accessor (see - // backend.h:197) instead of reaching for `Config::Backend::Pagemap` - // directly — `Pagemap` is a template parameter of `BackendAllocator`, - // not a publicly exposed nested type. The public - // `get_metaentry(addr)` static - // wraps the Pagemap access. - auto& entry = Config::Backend::template get_metaentry(addr); - auto sc = entry.get_sizeclass(); - if (sc.is_small()) { - auto info = sizeclass_data_fast(sc); - return start_of_object_small(info, addr); - } - // Large: PLAN.md (65-71) recovery. - auto info = sizeclass_data_fast(sc); - size_t slab_size = info.slab_mask + 1; - return (addr & ~info.slab_mask) - entry.get_offset() * slab_size; +[[nodiscard]] SNMALLOC_FAST_PATH size_t get_offset() const { + return (static_cast(remote_and_sizeclass) >> SIZECLASS_BITS) + & ((1 << OFFSET_BITS) - 1); +} +``` + +`encode(RemoteAllocator*, sizeclass_t)` gains an optional `offset` +parameter (defaults to 0 so existing callers compile): + +```cpp +[[nodiscard]] static SNMALLOC_FAST_PATH uintptr_t +encode(RemoteAllocator* remote, sizeclass_t sizeclass, size_t offset = 0) { + return pointer_offset( + reinterpret_cast(remote), + sizeclass.raw() | (offset << SIZECLASS_BITS)); } ``` -### Consumers that MUST be rewritten in Phase 14 +Compile-time check: `offset < (1 << OFFSET_BITS)` (assert). + +### `start_of_object` and friends + +Refactor signatures to take a combined index (`size_t`) instead of +`(sizeclass_t, slab_offset)`. 
The recovery formula collapses to a +single subtract because `offset_bytes` is precomputed: + +```cpp +SNMALLOC_FAST_PATH constexpr address_t +start_of_object(size_t combined, address_t addr) { + auto meta = sizeclass_metadata.fast(combined); + address_t alloc_start = (addr & ~meta.slab_mask) - meta.offset_bytes; + size_t index = slab_index_via(meta, addr - alloc_start); + return alloc_start + (index * meta.size); +} +``` + +`slab_index_via(meta, addr)` is the existing `slab_index` body +(`sizeclasstable.h:358-383`) refactored to take an already-loaded +`sizeclass_data_fast` instead of doing its own +`sizeclass_metadata.fast(sc)` lookup. All current behaviour is +preserved: the `offset = addr & meta.slab_mask` mask, the 64-bit +reciprocal-division (`(offset * meta.div_mult) >> DIV_MULT_SHIFT`), +and the 32-bit `offset / size` fallback for `sizeof(size_t) < 8` +platforms with the `size == 0` short-circuit. The original +`slab_index(sizeclass_t sc, address_t addr)` is kept as a +one-line wrapper that resolves `sc` to a row and forwards to +`slab_index_via` so call sites that don't already have the row +(e.g., `globalalloc.h:231,260` — which today pass +`entry.get_sizeclass()`) keep compiling unchanged. + +`index_in_object`, `remaining_bytes`, `is_start_of_object` follow +the same shape, all taking `size_t combined`. Where callers have +only a `sizeclass_t` (e.g., for self-allocations they did +themselves), they pass `sc.raw()` directly — that selects the +offset=0 row, equivalent to today. + +### Backend write in `alloc_chunk` + +For the small / pow2-large (single-slab-tile) case (`slab_size >= +size`), keep `set_metaentry(addr, size, t)` where +`t = Entry(meta, encode(remote, sc))` — encoded with offset=0 +implicitly. 
+ +For multi-slab-tile (Phase 15+, currently dormant): + +```cpp +size_t slab_size = sizeclass_full_to_slab_size(sizeclass); +for (size_t chunk_offset = 0; chunk_offset < size; + chunk_offset += MIN_CHUNK_SIZE) +{ + size_t slab_index = chunk_offset / slab_size; + uintptr_t ras_i = Pagemap::Entry::encode(remote, sizeclass, slab_index); + typename Pagemap::Entry t_i(meta, ras_i); + Pagemap::concretePagemap.set(address_cast(p) + chunk_offset, t_i); +} +``` + +Only the `META_BOUNDARY_BIT` in `meta` is preserved across this +write: `MetaEntryBase::operator=` (`metadata.h:235-242`) +explicitly preserves the target's boundary bit and otherwise +overwrites both `meta` (modulo that bit) and `remote_and_sizeclass` +in full. Any prior backend-owned state in the old `ras` is gone +once the frontend claims the chunk (in `claim_for_backend`, +`metadata.h:313-317`, which resets `ras` to +`REMOTE_BACKEND_MARKER`), so the frontend's per-chunk write +overwriting `ras` from that pristine `REMOTE_BACKEND_MARKER`-only +state to the encoded `(remote, sc, offset)` is exactly the +expected ownership transition. + +### Backend bits relocate automatically + +`MetaEntryBase::BACKEND_LAYOUT_FIRST_FREE_BIT` is derived from +`REMOTE_BACKEND_MARKER`; since the marker moves up by `OFFSET_BITS`, +the backend's `RED_BIT`, `VARIANT_SHIFT`, `LARGE_SIZE_SHIFT` +(`backend_arena_range.h:50-67`) auto-shift up by the same amount. +Verify the existing +`static_assert((MAX_SIZE_BITS - MIN_SIZE_BITS) + LARGE_SIZE_SHIFT +<= bits::BITS, ...)` still holds. For the default config: +- `MAX_SIZE_BITS = bits::BITS - 1 = 63` +- `MIN_CHUNK_BITS = 14`, so the large size field needs + `MAX_SIZE_BITS - MIN_CHUNK_BITS = 49` bits. +- Pre-Phase-14: `BACKEND_LAYOUT_FIRST_FREE_BIT = SIZECLASS_BITS = 8`, + so `LARGE_SIZE_SHIFT ≈ 9` → `49 + 9 = 58 ≤ 64`. ✓ +- Phase 14: `BACKEND_LAYOUT_FIRST_FREE_BIT = SIZECLASS_BITS + + OFFSET_BITS = 11`, so `LARGE_SIZE_SHIFT ≈ 12` → `49 + 12 = 61 ≤ + 64`. 
✓ (Three bits of headroom remain; OFFSET_BITS = 4 — the + `INTERMEDIATE_BITS = 3` config — would still pass.) + +### Pre-existing pagemap bug (still fixed in prep commit `1144eab4`) + +Same as before: `FlatPagemap::get_mut` double-base-adjust on +`PALNoAlloc`. Fix unrelated to Phase 14 layout choice. + +### Consumers that MUST be updated in Phase 14 Phase 14 is incomplete until every caller of the sizeclass-table-only `start_of_object` / `is_start_of_object` / -`remaining_bytes` on a potentially-large pointer is moved to the -Config-aware wrapper: - -- `globalalloc.h:137-144` (`remaining_bytes`): currently calls - `snmalloc::remaining_bytes(sizeclass, p)` which has no pagemap - offset access. Replace with the Config-aware path that consults - the pagemap entry for offset and computes - `start + sizeclass_full_to_size(sc) - addr`. -- `globalalloc.h:145-220` (`index_in_object`, `external_pointer`): - similarly rewrite to consult the pagemap. -- `corealloc.h` deallocation-sanity checks: `is_start_of_object` - is used in dealloc paths to assert the caller is passing a - valid base pointer. Grep `is_start_of_object` and `start_of_object` - across `corealloc.h` (verified candidates at - `corealloc.h:534-537` and `corealloc.h:1080-1083` per - rubber-duck review). Each call site that may receive a large - allocation's pointer must use the Config-aware variant. Without - this update, after Phase 15 a `dealloc` of a non-pow2 large - allocation could miss the start-of-object check entirely (every - natural-alignment slab boundary inside the allocation would - satisfy the old `slab_mask`-only check). -- `bounds_checks.h` memcpy gate (line 99-103): calls - `remaining_bytes(...)`. Moves to the Config-aware version - transitively via the `globalalloc.h::remaining_bytes` rewrite. - -`is_start_of_object` analogue: for small, today's formula; for -large, `(addr & info.slab_mask) == 0 && entry.get_offset() == 0`. 
+`remaining_bytes` on a **user-supplied** potentially-large pointer +is offset-aware. + +The offset support is pushed into the inner helpers in +`sizeclasstable.h` themselves: `start_of_object`, `index_in_object`, +`remaining_bytes`, and `is_start_of_object` take a mandatory +*combined* `size_t` index parameter (sizeclass + offset packed into +the low `COMBINED_BITS` of `ras`). Callers must explicitly pass +either `sc.raw()` (when local context proves the address is in the +allocation's first slab — offset implicitly 0) or +`entry.get_offset_and_sizeclass()` (from the address's pagemap entry). +Removing default arguments forces every call site to make a +deliberate choice and prevents a future Phase 15 caller from +accidentally inheriting offset = 0 when it should consult the +pagemap. + +Inside each helper, the formula uses a single +`sizeclass_metadata.start(combined)` lookup — the `start_` table is +indexed by `COMBINED_REP_SIZE` rows so the combined index lands +directly in a precomputed row. `offset_bytes` collapses to 0 for the +offset = 0 rows, which today are the only rows reached from +front-end allocation paths. This keeps `globalalloc.h` and +`corealloc.h` branch-free at the call site and avoids duplicating +the slab-mask / slab-size arithmetic across files. + +- `globalalloc.h:138-144` (`remaining_bytes`): reads the metaentry, + then unconditionally calls + `snmalloc::remaining_bytes(entry.get_offset_and_sizeclass(), p)`. + No small/large dispatch. +- `globalalloc.h:158-167` (`index_in_object`): same pattern. +- `bounds_checks.h:101` memcpy gate: calls `remaining_bytes(...)`. + Moves to the offset-aware version transitively via the inner-helper + rewrite — no source change here, and no extra branch on the + bounds-check fast path. 
+ +Audit of all `is_start_of_object` call sites (verified against the +post-Phase-13 tree via `grep -rn is_start_of_object src/snmalloc`): + +| File:line | Sizeclass source | Pointer source | Action | +|---|---|---|---| +| `corealloc.h:41` (`DefaultConts::success`) | requested-size→sc | allocator-output base | **Keep** — slab-mask check on the allocator's own freshly-returned base is tight enough; pass `sc.raw()`. | +| `override/new.cc:40` (`handler::Base::success`) | requested-size→sc | allocator-output base | **Keep** — same rationale as above; pass `sc.raw()`. | +| `corealloc.h:536` (`dealloc_local_object_meta`) | `entry.get_sizeclass()` | **user input** | **Update** — pass `entry.get_offset_and_sizeclass()`; the helper folds the offset check internally. | +| `corealloc.h:1084` (`dealloc_local_object`) | `entry.get_sizeclass()` | **user input** | **Update** — same: pass `entry.get_offset_and_sizeclass()`. | +| `corealloc.h:1258` | `from_small_class(...)` | small allocation | **Keep** — small-only path; pass `sc.raw()`. | +| `corealloc.h:1438` | `from_small_class(...)` | small allocation | **Keep** — small-only path; pass `sc.raw()`. | + +Additionally, `slab_index` itself has two call sites outside the +`start_of_object` family: + +- `globalalloc.h:231` (`remaining_bytes` wrapper, large-class + arm): calls `slab_index(entry.get_sizeclass(), address_cast(p))`. +- `globalalloc.h:260` (`index_in_object` wrapper, large-class + arm): same shape. + +After the helper-signature refactor these two wrappers fold into +the new `start_of_object(combined, addr)` path entirely (the +combined-index version of `remaining_bytes`/`index_in_object` +calls `start_of_object` internally, which itself dispatches to +`slab_index_via`). Neither wrapper calls `slab_index` directly +post-refactor. 
+ +The "Keep" rows on allocator-output base pointers are safe because +the allocator itself always returns the allocation base, which by +construction is slab-aligned (`(addr & info.slab_mask) == 0`) *and* +allocation-start (offset == 0 in pagemap, so combined == +`sc.raw()`). The old `is_start_of_object(sc, addr)` test reduces to +`(addr & info.slab_mask) == 0`, which holds for all such bases +both today and after Phase 15. + +The dealloc-API consumers (rows 3 and 4) get the offset folded +inside the combined index because for non-pow2 large in Phase 15 +every natural-alignment slab boundary *inside* the allocation would +satisfy the old `slab_mask`-only check; the precomputed +`offset_bytes` in the combined row distinguishes the actual +allocation base. These call sites remain gated by +`snmalloc_check_client(mitigations(sanity_checks), ...)`, so the +additional comparison is dead in release/non-checked builds. `slab_index` for large: irrelevant — large allocations are a single "object" of size `sizeclass_full_to_size(sc)`, not a slab of -multiple. Existing callers gated by `sc.is_small()` already avoid -calling `slab_index` for large. +multiple. The refactored `start_of_object` uses +`addr - alloc_start` (offset within the *allocation*, not the slab) +as the dividend, so the resulting index is 0 for any in-range large pointer. ### Backend changes -- `backend.h:131-132`: at the `set_metaentry` call site after a - large `alloc_chunk`, dispatch on small vs large as above. Phase - 14 keeps `alloc_chunk`'s `bits::is_pow2(size)` assertion (Phase - 15 relaxes it). This is fine: today only pow2 large allocations - reach this site, so `slab_size == size` and offset is always 0. -- `backend.h:169` (dealloc): writes backend-claim entries via the - backend Rep's word setters; those don't touch frontend bits. - No change. - -## Test gates - -1. **Build**: clean build passes.
+- `backend.h:131-156` (alloc_chunk small/large dispatch): replace + the single `set_metaentry(p, size, t)` call with the small/large + dispatch described in "Backend write in `alloc_chunk`" above. + Phase 14 keeps `alloc_chunk`'s `bits::is_pow2(size)` assertion + (Phase 15 relaxes it). This is fine: today only pow2 large + allocations reach this site, so `slab_size == size` and offset + is always 0; the entries written by the new large path are + bit-identical to the entries written by the old uniform path. +- `backend.h:172-196` (dealloc_chunk): constructs + `Entry t(nullptr, 0)`, calls `claim_for_backend()`, then + `set_metaentry(p, size, t)`. The `Entry(nullptr, 0)` + constructor's `ras = 0` clears both the sizeclass and offset + fields. `claim_for_backend()` (`metadata.h:313-317`) sets `ras` + to `REMOTE_BACKEND_MARKER` and only the boundary bit on `meta` is + preserved. The subsequent `set_metaentry` writes the + cleared-ras `Entry` to every pagemap cell in the range. No + further change is needed: the offset is meaningful only while + the chunk is owned by the frontend. + +### `RemoteAllocator` alignment + +`REMOTE_MIN_ALIGN` bumps from 512 B to 4096 B (default config: +`COMBINED_REP_SIZE = 2048`, doubled for the marker, so +`max(CACHELINE, 2048) << 1 = 4096`). + +`RemoteAllocator` (`remoteallocator.h:292-310`) gets its alignment +from its `FreeListMPSCQ` member (`freelist_queue.h`), +which is declared `alignas(REMOTE_MIN_ALIGN)`. So bumping +`REMOTE_MIN_ALIGN` automatically widens `alignof(RemoteAllocator)` +to 4096 with no source change to `RemoteAllocator` itself. + +Verifications (do during step 2): + +1. `sizeof(RemoteAllocator)` does not blow up. The structure is a + small fixed-size queue head plus padding; rounding up to a + 4096-B alignment unit only consumes extra padding in + surrounding containers (allocators, pool slots), not inside + `RemoteAllocator`. +2. 
`CommonConfig::unused_remote` (`commonconfig.h:119-120`) — a + static `RemoteAllocator` — inherits the new alignment from + `RemoteAllocator`'s natural alignof. Confirm it still compiles + and the linker honours the alignment (compilers do; some older + linkers cap `.bss` alignment, but 4096 is the page size, so it + is universally supported). +3. Per-allocator-pool storage: the pool allocates `Allocator` + instances; each `Allocator` contains a `RemoteAllocator` + (transitively), and the pool's metadata-allocation path is + already aligned to `alignof(Allocator)` via the backend's + metadata allocator. Confirm via inspection that + `Pool::acquire` honours `alignof(Allocator)` after + the bump. +4. `unused_remote_address`-style runtime checks (any assertion that + `((uintptr_t)remote & (REMOTE_MIN_ALIGN - 1)) == 0`) — grep for + `REMOTE_MIN_ALIGN` to find them and confirm they pass with the + bumped value. + +## Implementation steps + +Each step must produce a testable result before moving to the next. +Steps are ordered so that earlier steps' tests don't depend on +later steps' code. + +### Step 0: Revert the current (meta-based) Phase-14 implementation + +The current working tree carries a partial, meta-word-based +Phase 14 (`META_OFFSET_BITS`, `META_OFFSET_SHIFT`, +`META_OFFSET_MASK`, `META_FRONTEND_RESERVED_MASK`, `set_offset` / +`get_offset` on `FrontendMetaEntry`, `alignas(...)` on +`FrontendSlabMetadata`, branchless three-parameter +`start_of_object(sc, addr, slab_offset)` / `index_in_object` / +`remaining_bytes` / `is_start_of_object`, three-parameter wrapper +calls in `globalalloc.h` / `corealloc.h` / `override/new.cc` / +`test/func/release-rounding/rounding.cc`, and the small/large +dispatch in `backend.h::alloc_chunk`). The new design replaces all +of this.
Revert these files to the pre-Phase-14 head (commit +`1144eab4`), keeping only: +- The new test scaffolding in `src/test/func/memory/memory.cc` + (`test_large_alloc_pointer_recovery`) and + `src/test/func/large_offset/large_offset.cc` — to be updated for + the combined-index API in steps 4 and 6. + +**Gate**: clean build, full ctest suite passes (this is the +pre-Phase-14 head with two test additions that will be updated +later — the additions either compile and pass or are temporarily +gated out until step 4). + +### Step 1: Constants + table widening (no behaviour change) + +> **Implementation note**: the as-shipped design splits the metadata +> table into `start_` / `align_` / `slab_` rather than widening the +> single `fast_` table described below. See Step 7 Outcome for the +> rationale (perf gate). The constants and `(sc, offset)` +> initialisation described here apply to `start_`. + +Changes: +- `sizeclasstable.h`: add `OFFSET_BITS`, `COMBINED_BITS`, + `COMBINED_REP_SIZE`. Add `offset_bytes` column to + `sizeclass_data_fast`. Widen `fast_` to `COMBINED_REP_SIZE`. + Initialise every `(sc, offset)` cell — non-zero rows duplicate + the `(sc, 0)` row's fields except `offset_bytes = offset * + slab_size`. Add new overload `fast(size_t combined)`. Keep + `fast(sizeclass_t)` unchanged. +- Add `static_assert(max_large_slab_index() < (1 << + OFFSET_BITS))`. + +**Gate**: clean build. All existing tests still pass — nothing +reads `fast(combined)` yet, and the offset = 0 rows of the widened +table are byte-identical to today's rows for callers that index +via `sc.raw()` (whose value lies in `[0, SIZECLASS_REP_SIZE)`). + +### Step 1.5: Per-word backend-reserved mask + lower BIN/RANGE bit positions + +Motivation: today's `BACKEND_RESERVED_MASK = (REMOTE_BACKEND_MARKER +<< 1) - 1` applies symmetrically to both `meta` (Word::One) and +`ras` (Word::Two). 
That is overly conservative: in backend mode, +the only invariants are +- `meta` must preserve `META_BOUNDARY_BIT` (bit 0) across the + ownership transition (frontend reads it to detect PAL + boundaries), and +- `ras` must keep `REMOTE_BACKEND_MARKER` set while backend-owns + (frontend reads bit MARKER to detect ownership). + +Everything else on both words is free for the backend. Today's +unified mask forces `RED + VARIANT` (which live on `meta`) up to +`BACKEND_LAYOUT_FIRST_FREE_BIT`, i.e., just above the marker +position. After Step 2 moves the marker from bit 8 to bit 11, +those positions become bits 12, 13, 14 — and bit 14 collides with +the `MIN_CHUNK_BITS = 14` unit-address packing in the backend's +buddy-tree pointer storage, tripping the +`BIN_META_MASK < UNIT_SIZE` assertion in +`backend_arena_range.h:72`. + +Changes: + +- `metadata.h`: + - Replace `BACKEND_RESERVED_MASK` with two per-word constants: + - `BACKEND_RESERVED_MASK_WORD_ONE = META_BOUNDARY_BIT` + - `BACKEND_RESERVED_MASK_WORD_TWO = (REMOTE_BACKEND_MARKER << + 1) - 1` (the old value — unchanged in behaviour for `ras`). + - Make `is_backend_allowed_value(Word w, uintptr_t v)` use the + right mask per `w`. + - Change `BackendStateWordRef` to carry the relevant mask (or + its `Word` identity) so its `get()` and `operator=` use the + correct per-word mask. The simplest mechanical change is to + pass the mask into the `BackendStateWordRef` constructor and + store it as a member; `get_backend_word(Word w)` selects the + right mask at the call site. +- `backend_arena_range.h`: + - Move `RED_BIT_POS` and `VARIANT_SHIFT` down to start at bit 1 + (just above `META_BOUNDARY_BIT`). `RED_BIT_POS = 1`, + `VARIANT_SHIFT = 2`. `BIN_META_MASK = (1<<1) | (3<<2) = 14`. + - Move `LARGE_SIZE_SHIFT` to bit 1 too (it stores the large + chunk count on `Word::One` of unit 2 — same word, same + relaxed reservation). 
+ - The `is_backend_allowed_value(Word::Two, RED_BIT)` assert at + line 75 — RANGE_META_MASK applied to Word::Two of unit 1 + stores bit 1 in the left-child mask region. Bit 1 ≠ bit + MARKER (= 11 in new layout or 8 today), so the marker bit + is not disturbed. Verify the per-word mask check passes for + Word::Two with bit 1 (it should: the new Word::Two mask still + forbids the backend from writing the marker bit, but bit 1 is + not the marker). + - **Note**: After this step, `Word::Two`'s relaxed mask still + requires the backend not to disturb the marker. Today's + Word::Two mask was bits 0..MARKER, which forbade *any* bits + in that range. The relaxed mask forbids only the marker bit + itself. So the backend can now write low bits of `ras` + (sizeclass/offset positions) — those are zero in backend mode + (cleared by `claim_for_backend()`) and overwritten on + ownership transition, so no real change. + +**Gate**: clean build. Full ctest suite passes. The marker has +NOT moved yet (still at SIZECLASS_REP_SIZE), so the layout +change is invisible to allocation behaviour; only the +relaxation of asserts and the lowered bit positions for +RED/VARIANT/LARGE_SIZE_SHIFT differ. Run a focused build to +re-trigger the static_asserts in `backend_arena_range.h` and +confirm they all pass. + +### Step 2: Marker move + ras encoding (no offset writers yet) + +Changes: +- `metadata.h`: change `REMOTE_BACKEND_MARKER` from + `SIZECLASS_REP_SIZE` to `COMBINED_REP_SIZE`. Define + `SIZECLASS_MASK = SIZECLASS_REP_SIZE - 1` and + `COMBINED_MASK = COMBINED_REP_SIZE - 1`. Update + `get_sizeclass()` to mask with `SIZECLASS_MASK` explicitly. + Add `get_offset_and_sizeclass()`. Extend `encode(remote, sc)` to + `encode(remote, sc, size_t offset = 0)`; assert + `offset < (1 << OFFSET_BITS)`. +- Verify alignment chain (RemoteAllocator alignment section + above). If any check fails, fix before continuing. 
+ +**Gate**: clean build (the size-budget `static_assert` in +`backend_arena_range.h` is the compile-time guard for the marker +shift). All existing tests still pass — every `ras` write still +encodes with `offset = 0` (the new default arg), so every +combined value still equals `sc.raw()`. + +### Step 3: Refactor `slab_index` into `slab_index_via` + +Changes: +- `sizeclasstable.h`: introduce + `slab_index_via(sizeclass_data_fast const& meta, address_t + addr)` carrying the existing body (mask, 64-bit reciprocal-mul, + 32-bit fallback). Make `slab_index(sizeclass_t, addr)` a + one-line wrapper over `slab_index_via`. + +**Gate**: clean build. All existing tests still pass — pure +refactor. + +### Step 4: Switch helpers to combined index + +Changes: +- `sizeclasstable.h`: change `start_of_object`, `index_in_object`, + `remaining_bytes`, `is_start_of_object` to take a single + `size_t combined` parameter. Body uses `fast(combined)` and + reads `meta.offset_bytes`; recovery is + `(addr & ~slab_mask) - offset_bytes`. Mark `index_in_object` + and `remaining_bytes` `SNMALLOC_FAST_PATH`. +- Update all call sites per the audit table above: + - `globalalloc.h:138-144`, `:158-167`: pass + `entry.get_offset_and_sizeclass()`. + - `globalalloc.h:231,260` (`slab_index` direct callers): fold + into the new `start_of_object`-based path (large arm of + `remaining_bytes` / `index_in_object` now goes through + `start_of_object(combined, addr)` and no longer calls + `slab_index` directly). + - `corealloc.h:41`, `override/new.cc:40`, + `corealloc.h:1258`, `corealloc.h:1438`: pass + `sc.raw()`. + - `corealloc.h:536`, `corealloc.h:1084`: pass + `entry.get_offset_and_sizeclass()`. + - `src/test/func/release-rounding/rounding.cc`: pass + `sc.raw()`. + +**Gate**: clean build. Full ctest suite passes. All combined +values are still `sc.raw()` because no offset writer exists yet +(step 5). 
+ +### Step 5: Backend `alloc_chunk` writes per-chunk offset + +Changes: +- `backend.h::alloc_chunk` (~lines 131-156 today): keep the + `slab_size >= size` fast path using `set_metaentry` (offset = 0 + for every chunk). Add the multi-slab-tile branch (currently + dormant — only reached after Phase 15) that loops over chunks + and writes `ras_i = encode(remote, sc, slab_index)` via + `concretePagemap.set`. + +**Gate**: clean build. Full ctest suite passes — every Phase-14 +allocation today is single-slab-tile, so the new branch is +dormant. + +### Step 6: Targeted test for the per-chunk offset write + +Add `src/test/func/large_offset/large_offset.cc` per the +"Targeted test" subsection of "Final acceptance gates" below; +this exercises the multi-slab-tile write path by calling +`Config::Backend::alloc_chunk` directly with a synthetic +non-pow2 sizeclass. + +**Gate**: the new test passes; full suite still passes. + +### Step 7: Performance gate + +Run `perf-external_pointer` and `perf-large_alloc` on +`build-rel-base` vs `build-rel-p14`, 10× medians, per +`.github/skills/building_and_testing.md`. Compare against the +baseline-noise band measured pre-Phase-14. + +**Gate**: `perf-external_pointer` and `perf-large_alloc` within +noise of baseline (no statistically significant regression). +Disassemble `__malloc_start_pointer` to confirm: one `ras`-word +load, mask + table lookup with `offset_bytes`, no `meta`-word +load on the recovery path, no `imul`. + +**Outcome**: gate met after splitting the sizeclass metadata table +into three by purpose, plus an offset-aware branch in +`start_of_object`. + +1. Three tables, replacing the previous `fast_`/`slow_` pair: + - `start_` (4 × size_t = 32 B/row, indexed by + `offset_and_sizeclass_t`): `size`, `slab_mask`, `div_mult`, + `offset_bytes`. Power-of-two stride keeps the + `__malloc_start_pointer` index calc to a single `ubfiz #5`, + matching the baseline shape. 
+ - `align_` (2 × size_t = 16 B/row, indexed by `sizeclass_t`): + `slab_mask` (duplicated), `mod_zero_mult`. + `is_start_of_object` reads both fields from one row instead of + straddling two tables; cold in `-fast` builds. + - `slab_` (2 × uint16 = 4 B/row, indexed by `sizeclass_t`): + `capacity`, `waking`. Slab init thresholds; cold. +2. `start_of_object` branches on `osc.offset() == 0` (testable from + bits already loaded in the `ras` word, before any metadata-table + access). The common arm skips the `offset_bytes` field load and + the offset-shift arithmetic; the slow arm handles non-pow2 large + interior chunks. Branch fully predicted on small-allocation + workloads. + +Without these refinements `perf-external_pointer-fast` regressed by +~24% (median ~360 ms vs baseline ~290 ms). With them, median +~290 ms — within noise of baseline. `perf-singlethread-check` +(exercises `is_start_of_object` on every dealloc) is also within +noise: identical 9-instruction codegen, now reading from the +narrower `align_` rows (4-per-cache-line vs the baseline's +2-per-cache-line). + +## Final acceptance gates + +1. **Build**: clean build passes. The new `static_assert` in + `sizeclasstable.h` (max large slab index < `1 << OFFSET_BITS`) + guards the OFFSET_BITS choice. The size-budget assert in + `backend_arena_range.h` (`(MAX_SIZE_BITS - MIN_SIZE_BITS) + + LARGE_SIZE_SHIFT <= bits::BITS`) guards the upward shift of + backend bits. 2. **Full ctest suite**: all existing tests pass. Front-end still issues pow2 large requests, so for every materialised large allocation `info.align == size` and offset is always 0 — the - new `set_metaentry_large` path produces the same `get_slab_metadata()` - answer as before. Existing `start_of_object` answers (via - `slab_mask`) match the new offset-based answers for pow2-aligned - allocations. + combined index for every entry equals `sc.raw()`, indexing the + offset = 0 row of `start_`, which is bit-identical to the + pre-split row layout. 3. 
**`src/test/func/release-rounding/rounding.cc`** continues to pass — small path unchanged; large path uses offset = 0 always. 4. **Extend `src/test/func/memory/memory.cc`** with a @@ -2916,63 +3321,159 @@ calling `slab_index` for large. still pow2-only). For each: - For every chunk offset `k * MIN_CHUNK_SIZE` for `k = 0..S_res/MIN_CHUNK_SIZE - 1`, assert - `Pagemap::get_metaentry(p + k * MIN_CHUNK_SIZE).get_offset() - == 0` (since the reservation is pow2 and `slab_size == - reservation_size` for pow2 large classes, all chunks live in - the single slab and have offset 0). + `remaining_bytes(p + k * MIN_CHUNK_SIZE) == S_res - k * + MIN_CHUNK_SIZE`. The public `remaining_bytes` routes through + `index_in_object` and therefore consumes the + combined index from the pagemap entry; any miscalculation + in `offset_bytes` would produce a wrong residual. - For every interior address `q = p + j` with `j ∈ {0, 1, - S_res/2, S_res-1}`, assert `start_of_object(q) == p`. + S_res/2, S_res-1}`, assert + `address_cast(snmalloc::external_pointer( + reinterpret_cast(q))) == p` (offset-aware public API, + which uses `index_in_object` → pagemap entry → combined + index → `offset_bytes` subtraction). 5. **New test or extension** to exercise the non-zero offset write path directly (Phase 14 is otherwise un-tested with non-zero - offsets, because the front-end is still pow2-only). Two options: - - (a) Add an internal-API test in - `src/test/func/large_offset/large_offset.cc` (or extend - `memory.cc`) that calls `BasicPagemap::set_metaentry_large` - directly on a freshly-allocated chunk-multiple range with a - synthetic non-pow2 sizeclass (one already populated in the - table in Phase 13). Then verify: - - `get_metaentry(p + k * MIN_CHUNK_SIZE).get_offset() == k * - MIN_CHUNK_SIZE / slab_size` for each chunk. - - `start_of_object(p + interior_addr) == p` for a - sample of interior addresses across all slabs. 
- - (b) Defer non-zero offset coverage to Phase 15 explicitly and - accept that Phase 14's gate is "no regressions on - pow2-allocation paths". - The plan picks (a) — Phase 14 must be independently testable. -6. **Boundary-bit-preservation test**: in an existing test that - exercises PAL-allocation boundaries (or a new minimal one), set - the boundary bit on a chunk via the backend path, then call - `set_offset(3)` on the frontend side, then read both — both - round-trip without clobbering each other. + offsets, because the front-end is still pow2-only). Path: + - Add a test in `src/test/func/large_offset/large_offset.cc` + that calls `Config::Backend::alloc_chunk` directly. The test + obtains a `LocalState&` from a constructed `snmalloc::Allocator` + via its public `get_backend_local_state()` accessor + (`corealloc.h:378`). + - Sizeclass selection: pick a non-pow2 large `sc` via + `sizeclass_t::from_raw(raw)` for a raw index whose + `sizeclass_metadata` entry has non-pow2 `size` but a smaller + `slab_mask` (= `info.align - 1`). These entries are + table-populated in Phase 13 and unreachable from the public + allocation API, but they are usable here because + `alloc_chunk`'s sizeclass argument is only consulted in the + pagemap write loop (which is what we want to exercise). + - Size argument: `alloc_chunk` asserts `bits::is_pow2(size)` + (`backend.h:95`). Pass `bits::next_pow2(sizeclass_full_to_size(sc))` + so the assert holds. This is *larger* than the sizeclass's + `size`, but the pagemap write loop iterates over the + passed-in pow2 region, computing per-chunk offsets via + `chunk_offset / slab_size` (where `slab_size = + sizeclass_full_to_slab_size(sc) < size`). Non-zero offsets + are therefore written for all chunks past the first slab. + - `ras` argument: construct via + `Config::PagemapEntry::encode(nullptr, sc)` (see + `metadata.h:211-219`), which matches how the front end builds + `ras` in `corealloc.h:723-728`. 
Avoids hard-coding the bit + layout in the test. The per-chunk `alloc_chunk` loop re-encodes + `ras` per chunk with the appropriate offset. + - Capability handling: `alloc_chunk` returns + `capptr::Chunk` (`backend.h:89-93`). Use + `address_cast(chunk)` for pagemap/start-of-object checks. + Before calling `dealloc_chunk`, convert via + `capptr_chunk_is_alloc(capptr_to_user_address_control(chunk))` + to get the `capptr::Alloc` it expects. + - Verify: + - For each chunk in the pow2 region: + - `Config::Backend::get_metaentry(address_cast(chunk) + + k * MIN_CHUNK_SIZE).get_offset_and_sizeclass()` decomposes + as `sc.raw() | (expected_slab_idx << SIZECLASS_BITS)` + where `expected_slab_idx = (k * MIN_CHUNK_SIZE) / + sizeclass_full_to_slab_size(sc)`. + - The same entry's `get_sizeclass()` (low-bits-only mask) + still returns `sc`. + - `address_cast(snmalloc::external_pointer(reinterpret_cast(address_cast(chunk) + + interior_offset))) == address_cast(chunk)` for a sample of + interior addresses spanning multiple slabs (one address + per slab boundary, plus mid-slab). `external_pointer` + routes through `index_in_object` which consults the + pagemap entry's combined index and the precomputed + `offset_bytes`. + - Then `Config::Backend::dealloc_chunk` with the *same* pow2 + size, and verify all chunks' offsets are cleared + (`get_offset_and_sizeclass() == 0`) — the dealloc path + constructs `Entry(nullptr, 0)`, whose `ras = 0` clears the + combined-index field entirely. +6. **Backend-bit-preservation test**: with the synthetic-sizeclass + test from (5) in place, allocate a region whose pow2 size spans + a PAL-allocation boundary so the backend has set bits in `meta` + and (after the move) the upper bits of `ras`. Verify the + boundary bit and other backend-owned bits survive the per-chunk + frontend write loop. 
(This is implicitly already covered by the + existing ctest suite — every multi-PAL-chunk allocation today + already does this, just without per-chunk offset writes — but + the explicit large_offset test makes the guarantee local.) ## Risks -1. **`alignof(SlabMetadata)` insufficient.** Required alignment is - `1 << (1 + OFFSET_BITS)` — 16 bytes for default config. If - inspection shows alignment is smaller (likely 8 today), add - `alignas(1 << (1 + OFFSET_BITS))`. Caught at compile time by the - new `static_assert`. -2. **`get_slab_metadata` mask update missed somewhere.** Grep for - `META_BOUNDARY_BIT` and `meta &` to find every site that - masks the meta word for a pointer. Convert each to the new - `META_FRONTEND_RESERVED_MASK`. -3. **Offset-bit positions overlap with backend bits when the entry - is backend-claimed.** Not a real risk: when the backend writes - its claim, the entry's `meta` is owned by the backend Rep - (different layout). Frontend reads `get_offset()` only on - frontend-claimed entries. -4. **Boundary bit not preserved during `set_offset`.** Mitigation: - implement `set_offset` as RMW preserving all bits except the - offset field. Test case: set boundary, set offset, read offset, - read boundary — both round-trip. +1. **`RemoteAllocator` alignment bump.** `REMOTE_MIN_ALIGN` rises + from 512 B to 4096 B. Mitigation: verify the structure size and + pool-storage alignment annotations before changing the + constant; bump pool alignment if needed. Caught at runtime by + the existing `snmalloc_check_client` assertions on `ras` + pointer-bit-extraction, and by misaligned-pointer crashes in + message-passing. +2. **Backend bit budget.** `MAX_SIZE_BITS - MIN_SIZE_BITS + + LARGE_SIZE_SHIFT <= bits::BITS` (the assert in + `backend_arena_range.h:68-70`). With `LARGE_SIZE_SHIFT` + auto-shifted up by `OFFSET_BITS`, default config goes from ~58 + to ~61 bits used, still ≤ 64. The assert is the gate. +3.
**Combined-index table size.** The combined-index `start_` table + holds `1 << OFFSET_BITS` × `sizeof(sizeclass_data_start)` more + rows than the original sizeclass-indexed table. Default: 8 × 32 B + × `SIZECLASS_REP_SIZE` ≈ 64 KB. Acceptable for an L2-resident + metadata table; if `INTERMEDIATE_BITS` is raised to 3 + (`OFFSET_BITS = 4`) the table grows to ~128 KB — also + acceptable. +4. **Encode-time offset overflow.** `encode(remote, sc, offset)` + asserts `offset < (1 << OFFSET_BITS)`. The `alloc_chunk` loop + bounds `slab_index` to `size / slab_size`, which is bounded by + the worst-case slab count for the chosen sizeclass — the same + bound the `static_assert` on `OFFSET_BITS` enforces. Caught at + build time by `static_assert`, at runtime by the encode assert. +5. **Combined-index masks elsewhere.** Anywhere that previously + masked `ras` by `SIZECLASS_REP_SIZE - 1` (or equivalent) to + extract a sizeclass needs an audit: does it want pure sizeclass + (`SIZECLASS_MASK`) or combined (`COMBINED_MASK`)? Grep for + `SIZECLASS_REP_SIZE`, `(0xff)` style masks on `ras`, and + `get_sizeclass()` callers. Convert each deliberately. The + primary risk site is the backend's claim/release flow, which is + already gated on the marker bit and so unaffected. ## Out of scope - Front-end requesting non-pow2 large sizes (Phase 15). - Per-chunk offset for small allocations (small uses slab_mask recovery, no per-chunk offset needed). -- Multi-byte offset (`OFFSET_BITS = INTERMEDIATE_BITS + 1` bits, - fits cleanly in `meta` low bits). +- Configs where `slab_size < MIN_CHUNK_SIZE` (multiple logical + slabs per pagemap entry). The default `INTERMEDIATE_BITS = 2` + config does not hit this. Deferred to a future phase if needed. + +## Performance characterisation + +Goal: the layout-aware design should bring `perf-external_pointer` +back to baseline (or within noise). 
The two costs the previous, +`meta`-word-based, Phase 14 design carried — + +- one extra 8-byte load of the `meta` word per `external_pointer` + query, just to extract the offset; and +- one `imul` for `offset * slab_size` on the critical path — + +are both eliminated: + +- The combined index is the same `ras` word already loaded for the + sizeclass; masking with `COMBINED_MASK` is a single `and`-with-imm. +- `offset_bytes` is a table column; the subtraction is a load + a + sub, with no multiplication. + +`perf-large_alloc` is unchanged from the prior fix (single-slab-tile +fast path keeps `set_metaentry` as before; the per-chunk loop is +dormant until Phase 15). `perf-singlethread` and `perf-memcpy` were +within noise before and should remain so. + +Measure with five-run medians on `build-rel-base` (commit +`1144eab4`) vs `build-rel-p14` (head + Phase 14 layout-aware), per +the perf workflow in `.github/skills/building_and_testing.md`. If +`perf-external_pointer` is not within noise of baseline, +disassemble the new `__malloc_start_pointer` to confirm the load +count matches baseline (one 8-byte load of the pagemap `ras` word, no +`meta` word load, no `imul`). # Phase 15: Front-end requests non-pow2 large allocations diff --git a/src/snmalloc/backend/backend.h b/src/snmalloc/backend/backend.h index 2772cf319..80ff58da8 100644 --- a/src/snmalloc/backend/backend.h +++ b/src/snmalloc/backend/backend.h @@ -128,8 +128,30 @@ namespace snmalloc return {nullptr, nullptr}; } - typename Pagemap::Entry t(meta, ras); - Pagemap::set_metaentry(address_cast(p), size, t); + const size_t slab_size = sizeclass_full_to_slab_size(sizeclass); + // `size` and `slab_size` are powers of two with `size >= slab_size`, + // so `size = k * slab_size` for some integer `k >= 1`. Each slab + // tile gets the same `ras_in | (slab_index << SIZECLASS_BITS)` + // entry, written in one `set_metaentry` call.
+ SNMALLOC_ASSERT(size >= slab_size); + // The OR below assumes the per-chunk-offset bits of `ras` are + // zero; `MetaEntryBase::encode` defaults offset to 0, and the + // backend is the only place per-chunk offsets are written. + SNMALLOC_ASSERT( + (ras & (((size_t{1} << OFFSET_BITS) - 1) << SIZECLASS_BITS)) == 0); + for (size_t chunk_offset = 0; chunk_offset < size; + chunk_offset += slab_size) + { + const size_t slab_index = chunk_offset / slab_size; + // `compute_max_large_slab_index() < (1 << OFFSET_BITS)` is + // static_asserted in sizeclasstable.h; this asserts the + // arithmetic that derives `slab_index` from `size`/`slab_size`. + SNMALLOC_ASSERT(slab_index < (size_t{1} << OFFSET_BITS)); + const uintptr_t ras_i = ras | (slab_index << SIZECLASS_BITS); + typename Pagemap::Entry t_i(meta, ras_i); + Pagemap::set_metaentry( + address_cast(p) + chunk_offset, slab_size, t_i); + } return {Aal::capptr_bound(p, size), meta}; } diff --git a/src/snmalloc/backend_helpers/backend_arena.h b/src/snmalloc/backend_helpers/backend_arena.h index 3353c5417..149c2f567 100644 --- a/src/snmalloc/backend_helpers/backend_arena.h +++ b/src/snmalloc/backend_helpers/backend_arena.h @@ -285,7 +285,10 @@ namespace snmalloc /** * Five-clause structural invariant. Runs when `enabled` is true; - * defaults to `Debug` so release tests can pass `true` explicitly. + * defaults to `Debug` so in-tree callers compile away in Release + * while tests can opt in by passing `true` explicitly. Uses + * `SNMALLOC_CHECK` rather than `SNMALLOC_ASSERT` so that + * test-driven invocations are checked even under NDEBUG. 
*/ void check_invariant(bool enabled = Debug) const { @@ -304,7 +307,7 @@ namespace snmalloc if (prev_valid) { uintptr_t prev_end = prev_addr + prev_size; - SNMALLOC_ASSERT(prev_end != a || !Rep::can_consolidate(a)); + SNMALLOC_CHECK(prev_end != a || !Rep::can_consolidate(a)); } prev_addr = a; prev_size = s; @@ -316,10 +319,10 @@ namespace snmalloc self.range_tree.for_each([&](uintptr_t node) { auto [a, s] = range_from_addr(node); if (a >= UNIT_SIZE) - SNMALLOC_ASSERT( + SNMALLOC_CHECK( !contains_min(a - UNIT_SIZE) || !Rep::can_consolidate(a)); uintptr_t end = a + s; - SNMALLOC_ASSERT(!contains_min(end) || !Rep::can_consolidate(end)); + SNMALLOC_CHECK(!contains_min(end) || !Rep::can_consolidate(end)); }); // 1c. No two adjacent min blocks (unless boundary). @@ -330,7 +333,7 @@ namespace snmalloc if (Rep::get_variant(node) != BackendArenaVariant::Min) return; if (prev_valid) - SNMALLOC_ASSERT( + SNMALLOC_CHECK( prev + UNIT_SIZE != node || !Rep::can_consolidate(node)); prev = node; prev_valid = true; @@ -346,15 +349,15 @@ namespace snmalloc for (size_t bin = 0; bin < Bins::Bitmap::TOTAL_BINS; bin++) { - self.bin_trees[bin].for_each([&](uintptr_t node) { - auto [a, s] = range_from_addr(node); - if (s >= TWO_UNITS) - { - auto path = self.range_tree.get_root_path(); - SNMALLOC_ASSERT(self.range_tree.find(path, node)); - bin_tree_nonmin_count++; - } - }); + self.bin_trees[bin].for_each([&](uintptr_t node) { + auto [a, s] = range_from_addr(node); + if (s >= TWO_UNITS) + { + auto path = self.range_tree.get_root_path(); + SNMALLOC_CHECK(self.range_tree.find(path, node)); + bin_tree_nonmin_count++; + } + }); } // Reverse: every range-tree entry must be in its expected bin tree. 
@@ -364,10 +367,10 @@ namespace snmalloc auto range = typename Bins::range_t{a, s}; size_t expected_bin = Bins::bin_index(range); auto path = self.bin_trees[expected_bin].get_root_path(); - SNMALLOC_ASSERT(self.bin_trees[expected_bin].find(path, node)); + SNMALLOC_CHECK(self.bin_trees[expected_bin].find(path, node)); }); - SNMALLOC_ASSERT(bin_tree_nonmin_count == range_tree_count); + SNMALLOC_CHECK(bin_tree_nonmin_count == range_tree_count); } // Clause 3: Bin classification correctness. @@ -377,7 +380,7 @@ namespace snmalloc auto [a, s] = range_from_addr(node); auto range = typename Bins::range_t{a, s}; size_t expected_bin = Bins::bin_index(range); - SNMALLOC_ASSERT(expected_bin == bin); + SNMALLOC_CHECK(expected_bin == bin); }); } @@ -386,7 +389,7 @@ namespace snmalloc { bool has_entries = !self.bin_trees[bin].is_empty(); bool bit_set = bitmap.test(bin); - SNMALLOC_ASSERT(has_entries == bit_set); + SNMALLOC_CHECK(has_entries == bit_set); } // Clause 5: Variant-tag consistency. @@ -395,9 +398,9 @@ namespace snmalloc self.bin_trees[bin].for_each([&](uintptr_t node) { auto v = Rep::get_variant(node); auto [a, s] = range_from_addr(node); - SNMALLOC_ASSERT(v == variant_of(s, a)); + SNMALLOC_CHECK(v == variant_of(s, a)); if (v == BackendArenaVariant::Large) - SNMALLOC_ASSERT(Rep::get_large_size(node) == s); + SNMALLOC_CHECK(Rep::get_large_size(node) == s); }); } } diff --git a/src/snmalloc/backend_helpers/backend_arena_bins.h b/src/snmalloc/backend_helpers/backend_arena_bins.h index 26abf3665..0d045983e 100644 --- a/src/snmalloc/backend_helpers/backend_arena_bins.h +++ b/src/snmalloc/backend_helpers/backend_arena_bins.h @@ -47,7 +47,7 @@ namespace snmalloc { static_assert( INTERMEDIATE_BITS >= 1 && INTERMEDIATE_BITS <= 3, - "BackendArenaBins currently supports B in {1, 2, 3}"); + "BackendArenaBins supports B in {1, 2, 3}"); static_assert( MIN_SIZE_BITS + INTERMEDIATE_BITS < bits::BITS, "MIN_SIZE_BITS + INTERMEDIATE_BITS must leave room for at least one " diff --git 
a/src/snmalloc/backend_helpers/backend_arena_range.h b/src/snmalloc/backend_helpers/backend_arena_range.h index ff3e8dda4..8fc3cbd68 100644 --- a/src/snmalloc/backend_helpers/backend_arena_range.h +++ b/src/snmalloc/backend_helpers/backend_arena_range.h @@ -42,10 +42,11 @@ namespace snmalloc static constexpr uintptr_t UNIT_SIZE = uintptr_t(1) << MIN_SIZE_BITS; - // Bit positions inside a pagemap word. Bits in the reserved region - // (sizeclass + REMOTE_BACKEND_MARKER) are owned by the meta-entry - // layout; tree-node and large-size encodings start at the first free - // bit above that reserved range — see + // Bit positions inside a pagemap word. The reserved region (the + // sizeclass+offset bits on Word::Two, and META_BOUNDARY_BIT on + // Word::One) is owned by the meta-entry layout; tree-node and + // large-size encodings start at the first free bit above that + // reserved range — see // `MetaEntryBase::BACKEND_LAYOUT_FIRST_FREE_BIT` in `mem/metadata.h`. static constexpr unsigned RED_BIT_POS = MetaEntryBase::BACKEND_LAYOUT_FIRST_FREE_BIT; @@ -72,17 +73,10 @@ namespace snmalloc static_assert(BIN_META_MASK < UNIT_SIZE); static_assert( Entry::is_backend_allowed_value(Entry::Word::One, BIN_META_MASK)); - static_assert(Entry::is_backend_allowed_value(Entry::Word::Two, RED_BIT)); using Word = typename Entry::Word; using Handle = typename Entry::BackendStateWordRef; - /** - * Pagemap word for the `UnitIdx`-th unit of the block at `addr`. - * Centralises the layout decision "which pagemap entry encodes - * data for unit i". Used by `TreeRep::ref` and by the variant / - * large-size accessors below. - */ template static Handle word_at(uintptr_t addr, Word w) { @@ -92,11 +86,11 @@ namespace snmalloc } /** - * RBTree Rep shared by `BinRep` and `RangeRep`. `UnitIdx` selects - * which unit (0 or 1) of the block holds this Rep's tree node; the - * Rep's pagemap words live at `addr + UnitIdx * UNIT_SIZE`. 
- * `MetaMask` covers the bits in that node's words that are owned by - * this Rep (red + any tag bits) and must be preserved by get/set. + * Tree rep shared by `BinRep` and `RangeRep`. `UnitIdx` is the + * block-relative pagemap unit (0 or 1) that holds this Rep's + * node; `MetaMask` covers bits in that unit's words owned by + * this Rep (red + variant tag for `BinRep`, red only for + * `RangeRep`) and must be preserved across get/set. */ template struct TreeRep @@ -111,7 +105,7 @@ namespace snmalloc { static const Contents null_entry = 0; if (SNMALLOC_UNLIKELY(k == 0)) - return Handle{const_cast(&null_entry)}; + return Handle{const_cast(&null_entry), 0}; return word_at(k, direction ? Word::One : Word::Two); } diff --git a/src/snmalloc/backend_helpers/largebuddyrange.h b/src/snmalloc/backend_helpers/largebuddyrange.h index bf217bc06..3eb5f5c21 100644 --- a/src/snmalloc/backend_helpers/largebuddyrange.h +++ b/src/snmalloc/backend_helpers/largebuddyrange.h @@ -76,10 +76,13 @@ namespace snmalloc // Special case for accessing the null entry. We want to make sure // that this is never modified by the back end, so we make it point to // a constant entry and use the MMU to trap even in release modes. + // The mask passed to the handle is irrelevant: the null entry is + // never written (any attempt would trap), and on read its underlying + // value is zero so `get()` returns zero regardless of the mask. 
static const Contents null_entry = 0; if (SNMALLOC_UNLIKELY(address_cast(k) == 0)) { - return {const_cast(&null_entry)}; + return {const_cast(&null_entry), 0}; } auto& entry = Pagemap::template get_metaentry_mut(address_cast(k)); if (direction) diff --git a/src/snmalloc/ds/sizeclasstable.h b/src/snmalloc/ds/sizeclasstable.h index c42e4643b..66b68f86a 100644 --- a/src/snmalloc/ds/sizeclasstable.h +++ b/src/snmalloc/ds/sizeclasstable.h @@ -17,41 +17,39 @@ namespace snmalloc { using chunksizeclass_t = size_t; - // Cap the address bits the encoding tries to represent so that - // `MAX_LARGE_SIZECLASS_SIZE` (= 2 ^ ENCODED_ADDRESS_BITS) always fits in - // `size_t`. On 64-bit platforms `DefaultPal::address_bits` is already 48, - // but on 32-bit platforms it equals `bits::BITS` and would otherwise - // overflow the encoded maximum to 0. + // Capped to `bits::BITS - 1` so `MAX_LARGE_SIZECLASS_SIZE` fits in + // `size_t` on 32-bit platforms where `DefaultPal::address_bits == + // bits::BITS`. constexpr size_t ENCODED_ADDRESS_BITS = bits::min(DefaultPal::address_bits, bits::BITS - 1); - // Number of large sizeclasses. Large classes follow on directly from small - // classes in the global exp+mantissa scheme used by - // `bits::from_exp_mant`. The total - // span of representable sizes is from MIN_ALLOC_SIZE up to and including - // 2^ENCODED_ADDRESS_BITS, so the count of large entries beyond the small - // range is (ENCODED_ADDRESS_BITS - MAX_SMALL_SIZECLASS_BITS) mantissa - // cycles, each with 2^INTERMEDIATE_BITS entries. + // Large classes follow on directly from small classes in the global + // exp+mantissa scheme: `(ENCODED_ADDRESS_BITS - MAX_SMALL_SIZECLASS_BITS)` + // mantissa cycles of `2^INTERMEDIATE_BITS` entries each. constexpr size_t NUM_LARGE_CLASSES = (ENCODED_ADDRESS_BITS - MAX_SMALL_SIZECLASS_BITS) << INTERMEDIATE_BITS; - // Bits required to encode any sizeclass value. 
Slot 0 is reserved as the - // unmapped/default sentinel, so the count includes a leading +1. + // Slot 0 of the table is reserved as the unmapped sentinel, hence +1. constexpr size_t SIZECLASS_BITS = bits::next_pow2_bits_const(1 + NUM_SMALL_SIZECLASSES + NUM_LARGE_CLASSES); - // Size of the sizeclass-keyed lookup tables and the alignment that the - // REMOTE_BACKEND_MARKER constraint requires of RemoteAllocator. There is no - // separate tag bit: all valid sizeclass raw values are in - // [0, 1 + NUM_SMALL_SIZECLASSES + NUM_LARGE_CLASSES) and live in the low - // SIZECLASS_BITS bits of a pagemap word. constexpr size_t SIZECLASS_REP_SIZE = bits::one_at_bit(SIZECLASS_BITS); - // Largest allocation size representable by the uniform sizeclass encoding. - // Equals `from_exp_mant(NUM_SMALL_SIZECLASSES + NUM_LARGE_CLASSES - 1)`, - // which for the default config is `2 ^ ENCODED_ADDRESS_BITS`. Requests - // strictly larger than this cannot be encoded and must be failed before - // any call to `size_to_sizeclass_full`. + // Width of the per-chunk slab-offset field packed immediately above the + // sizeclass in `ras`. The worst-case slab count for any non-pow2 large + // class with `INTERMEDIATE_BITS = M` is `2^(M+1)`; `M + 1` bits cover + // the maximum index. `compute_max_large_slab_index` static_asserts the + // bound against the actual table below. + constexpr size_t OFFSET_BITS = INTERMEDIATE_BITS + 1; + + // `ras & COMBINED_MASK` directly indexes the `(sizeclass, offset)` table + // row, which already carries `offset_bytes = offset * slab_size`. + constexpr size_t COMBINED_BITS = SIZECLASS_BITS + OFFSET_BITS; + constexpr size_t COMBINED_REP_SIZE = bits::one_at_bit(COMBINED_BITS); + + // Largest size representable by the uniform sizeclass encoding; + // requests larger than this must be failed before + // `size_to_sizeclass_full`. 
constexpr size_t MAX_LARGE_SIZECLASS_SIZE = bits::from_exp_mant( NUM_SMALL_SIZECLASSES + NUM_LARGE_CLASSES - 1); @@ -59,27 +57,21 @@ namespace snmalloc static_assert( MAX_LARGE_SIZECLASS_SIZE == bits::one_at_bit(ENCODED_ADDRESS_BITS), "MAX_LARGE_SIZECLASS_SIZE must equal 2 ^ ENCODED_ADDRESS_BITS; if this " - "fails, the exp+mantissa math no longer matches NUM_LARGE_CLASSES."); + "fails, the exp+mantissa math does not match NUM_LARGE_CLASSES."); static_assert( ENCODED_ADDRESS_BITS > MAX_SMALL_SIZECLASS_BITS, "ENCODED_ADDRESS_BITS must exceed MAX_SMALL_SIZECLASS_BITS so the large " "range is non-empty."); /** - * Represents a sizeclass identifier shared by small and large allocations - * using a single uniform encoding: + * Sizeclass identifier shared by small and large allocations: * - * value == 0 : unmapped / default sentinel - * value ∈ [1, 1 + NUM_SMALL_SIZECLASSES) : small sizeclass sc = value - 1 - * value ∈ [1 + NUM_SMALL_SIZECLASSES, - * 1 + NUM_SMALL_SIZECLASSES + NUM_LARGE_CLASSES) - * : large class lc = - * value - 1 - - * NUM_SMALL_SIZECLASSES + * value == 0 : sentinel (unmapped) + * value ∈ [1, 1 + NUM_SMALL_SIZECLASSES) : small, sc = value - 1 + * value ∈ [1 + NUM_SMALL_SIZECLASSES, ...): large * - * Used directly as an index into `sizeclass_metadata`. Slot 0 of that table - * is zero-padded so the sentinel can flow through the fast-path table - * lookups without a subtract on the hot path. + * Indexes `sizeclass_metadata` directly; slot 0 is zero-padded so the + * sentinel flows through fast-path lookups without a branch. */ class sizeclass_t { @@ -131,8 +123,7 @@ namespace snmalloc constexpr bool is_small() { - // Sentinel (value == 0) underflows to a large positive value, which - // also fails the comparison — the sentinel is therefore not small. + // Sentinel (value == 0) underflows past NUM_SMALL_SIZECLASSES. 
return (value - 1) < NUM_SMALL_SIZECLASSES; } @@ -147,12 +138,70 @@ namespace snmalloc } }; + /** + * (sizeclass, per-chunk slab offset) packed into the low `COMBINED_BITS` + * of a pagemap entry's `remote_and_sizeclass`. Non-zero offsets occur + * only for interior chunks of non-pow2 large allocations; the offset + * lets `start_of_object` recover the allocation base. + * + * Distinct from `sizeclass_t` so `is_small()` / `as_small()` / + * `as_large()` cannot be called on a value carrying offset bits, and so + * the offset can never be synthesised: constructing a value requires + * supplying both components explicitly, or going through `from_raw` + * with bits read from storage. + */ + class offset_and_sizeclass_t + { + size_t value{0}; + + constexpr offset_and_sizeclass_t(size_t value) : value(value) {} + + public: + constexpr offset_and_sizeclass_t() = default; + + constexpr offset_and_sizeclass_t(sizeclass_t sc, size_t offset) + : value(sc.raw() | (offset << SIZECLASS_BITS)) + { + SNMALLOC_ASSERT(offset < (size_t{1} << OFFSET_BITS)); + } + + static constexpr offset_and_sizeclass_t from_raw(size_t raw) + { + return {raw}; + } + + constexpr size_t raw() const + { + return value; + } + + constexpr sizeclass_t sizeclass() const + { + return sizeclass_t::from_raw(value & (SIZECLASS_REP_SIZE - 1)); + } + + constexpr size_t offset() const + { + return (value >> SIZECLASS_BITS) & ((size_t{1} << OFFSET_BITS) - 1); + } + + constexpr bool operator==(offset_and_sizeclass_t other) const + { + return value == other.value; + } + }; + using sizeclass_compress_t = uint8_t; /** - * This structure contains the fields required for fast paths for sizeclasses. + * Per-`offset_and_sizeclass_t` metadata for `start_of_object` — + * recovering the allocation base from an interior pointer. + * + * Sized to a power of two (4 × `size_t` = 32 bytes) so the table + * stride collapses to a single shift in the + * `__malloc_start_pointer` hot path. 
*/ - struct sizeclass_data_fast + struct sizeclass_data_start { size_t size; // We store the mask as it is used more on the fast path, and the size of @@ -160,58 +209,106 @@ namespace snmalloc size_t slab_mask; // Table of constants for reciprocal division for each sizeclass. size_t div_mult; - // Table of constants for reciprocal modulus for each sizeclass. + // `offset * slab_size`, precomputed. Zero for `offset == 0` rows. + size_t offset_bytes; + }; + + static_assert( + sizeof(sizeclass_data_start) == 4 * sizeof(size_t), + "sizeclass_data_start must be a power-of-two stride for single-shift " + "indexing in start_of_object"); + + /** + * Per-`sizeclass_t` metadata for `is_start_of_object` — the + * Lemire-style alignment check used by check-build dealloc and + * debug asserts. + * + * `slab_mask` is duplicated here (also held in `sizeclass_data_start`) + * so the alignment check loads from a single row instead of straddling + * two tables. + */ + struct sizeclass_data_align + { + size_t slab_mask; size_t mod_zero_mult; }; /** - * This structure contains the remaining fields required for slow paths for - * sizeclasses. + * Per-`sizeclass_t` thresholds used when initialising a slab — + * cold-path data consumed at slab allocation/refill time. */ - struct sizeclass_data_slow + struct sizeclass_data_slab { uint16_t capacity; uint16_t waking; }; - static_assert(sizeof(sizeclass_data_slow::capacity) * 8 > MAX_CAPACITY_BITS); + static_assert(sizeof(sizeclass_data_slab::capacity) * 8 > MAX_CAPACITY_BITS); struct SizeClassTable { - ModArray fast_{}; - ModArray slow_{}; + // `start_` is indexed by an `offset_and_sizeclass_t` (Word::Two of + // the pagemap entry & COMBINED_MASK). The first SIZECLASS_REP_SIZE + // rows have offset == 0; subsequent rows carry the offset_bytes + // needed for `start_of_object` on non-pow2 large interior chunks. 
+ ModArray start_{}; + ModArray align_{}; + ModArray slab_{}; size_t DIV_MULT_SHIFT{0}; - [[nodiscard]] constexpr sizeclass_data_fast& fast(sizeclass_t index) + [[nodiscard]] constexpr sizeclass_data_start& start(sizeclass_t index) { - return fast_[index.raw()]; + return start_[index.raw()]; } - [[nodiscard]] constexpr sizeclass_data_fast fast(sizeclass_t index) const + [[nodiscard]] constexpr sizeclass_data_start start(sizeclass_t index) const { - return fast_[index.raw()]; + return start_[index.raw()]; } - [[nodiscard]] constexpr sizeclass_data_fast& fast_small(smallsizeclass_t sc) + [[nodiscard]] constexpr sizeclass_data_start& + start(offset_and_sizeclass_t osc) { - return fast_[sizeclass_t::from_small_class(sc).raw()]; + return start_[osc.raw()]; } - [[nodiscard]] constexpr sizeclass_data_fast - fast_small(smallsizeclass_t sc) const + [[nodiscard]] constexpr sizeclass_data_start + start(offset_and_sizeclass_t osc) const { - return fast_[sizeclass_t::from_small_class(sc).raw()]; + return start_[osc.raw()]; } - [[nodiscard]] constexpr sizeclass_data_slow& slow(sizeclass_t index) + [[nodiscard]] constexpr sizeclass_data_start& + start_small(smallsizeclass_t sc) { - return slow_[index.raw()]; + return start_[sizeclass_t::from_small_class(sc).raw()]; } - [[nodiscard]] constexpr sizeclass_data_slow slow(sizeclass_t index) const + [[nodiscard]] constexpr sizeclass_data_start + start_small(smallsizeclass_t sc) const { - return slow_[index.raw()]; + return start_[sizeclass_t::from_small_class(sc).raw()]; + } + + [[nodiscard]] constexpr sizeclass_data_align& align(sizeclass_t index) + { + return align_[index.raw()]; + } + + [[nodiscard]] constexpr sizeclass_data_align align(sizeclass_t index) const + { + return align_[index.raw()]; + } + + [[nodiscard]] constexpr sizeclass_data_slab& slab(sizeclass_t index) + { + return slab_[index.raw()]; + } + + [[nodiscard]] constexpr sizeclass_data_slab slab(sizeclass_t index) const + { + return slab_[index.raw()]; } constexpr 
SizeClassTable() @@ -221,7 +318,8 @@ namespace snmalloc for (smallsizeclass_t sizeclass(0); sizeclass < NUM_SMALL_SIZECLASSES; sizeclass++) { - auto& meta = fast_small(sizeclass); + auto& meta = start_small(sizeclass); + auto sc = sizeclass_t::from_small_class(sizeclass); size_t rsize = bits::from_exp_mant( @@ -231,18 +329,19 @@ namespace snmalloc bits::next_pow2_bits_const(MIN_OBJECT_COUNT * rsize), MIN_CHUNK_BITS); meta.slab_mask = bits::mask_bits(slab_bits); + align(sc).slab_mask = meta.slab_mask; - auto& meta_slow = slow(sizeclass_t::from_small_class(sizeclass)); - meta_slow.capacity = + auto& meta_slab = slab(sc); + meta_slab.capacity = static_cast((meta.slab_mask + 1) / rsize); - meta_slow.waking = mitigations(random_larger_thresholds) ? - static_cast(meta_slow.capacity / 4) : - static_cast(bits::min((meta_slow.capacity / 4), 32)); + meta_slab.waking = mitigations(random_larger_thresholds) ? + static_cast(meta_slab.capacity / 4) : + static_cast(bits::min((meta_slab.capacity / 4), 32)); - if (meta_slow.capacity > max_capacity) + if (meta_slab.capacity > max_capacity) { - max_capacity = meta_slow.capacity; + max_capacity = meta_slab.capacity; } } @@ -253,75 +352,110 @@ namespace snmalloc sizeclass++) { // Calculate reciprocal division constant. - auto& meta = fast_small(sizeclass); + auto& meta = start_small(sizeclass); meta.div_mult = (bits::mask_bits(DIV_MULT_SHIFT) / meta.size) + 1; size_t zero = 0; - meta.mod_zero_mult = (~zero / meta.size) + 1; + align(sizeclass_t::from_small_class(sizeclass)).mod_zero_mult = + (~zero / meta.size) + 1; } for (size_t lc = 0; lc < NUM_LARGE_CLASSES; lc++) { auto lsc = sizeclass_t::from_large_class(lc); - auto& meta = fast(lsc); - // Continuous global exp+mantissa scheme: small classes occupy - // global indices [0, NUM_SMALL_SIZECLASSES); large classes occupy - // [NUM_SMALL_SIZECLASSES, NUM_SMALL_SIZECLASSES + NUM_LARGE_CLASSES). 
+ auto& meta = start(lsc); size_t size = bits::from_exp_mant( NUM_SMALL_SIZECLASSES + lc); meta.size = size; - // Natural alignment of the size: the largest power of two that - // divides `size`. For pow2 sizes, this equals `size`; for non-pow2 - // mantissa steps it is the slab granularity at which the allocation - // tiles. `slab_mask = align - 1`. - size_t align = size & (~size + 1); - meta.slab_mask = align - 1; - // The slab_mask will do all the necessary work, so - // perform identity multiplication for the test. - meta.mod_zero_mult = 1; - // The slab_mask will do all the necessary work for division - // so collapse the calculated offset. + // `slab_mask = (natural alignment of size) - 1`; for pow2 sizes + // this equals size - 1, for non-pow2 mantissa steps it is the + // slab granularity at which the allocation tiles. + size_t align_bytes = size & (~size + 1); + meta.slab_mask = align_bytes - 1; + align(lsc).slab_mask = meta.slab_mask; + // slab_mask handles the math; identity values neutralise the + // mod/div reciprocals. + align(lsc).mod_zero_mult = 1; meta.div_mult = 0; } + + // Populate offset > 0 rows: same as the (sc, 0) row but with + // `offset_bytes = offset * slab_size` so that `start_of_object` + // collapses to `(addr & ~slab_mask) - offset_bytes`. Read when + // the backend writes per-chunk offsets for multi-slab-tile + // reservations. 
+ for (size_t sc_raw = 0; sc_raw < SIZECLASS_REP_SIZE; sc_raw++) + { + const auto& base = start_[sc_raw]; + const size_t slab_size = base.slab_mask + 1; + for (size_t offset = 1; offset < (size_t{1} << OFFSET_BITS); offset++) + { + auto& row = start_[sc_raw | (offset << SIZECLASS_BITS)]; + row.size = base.size; + row.slab_mask = base.slab_mask; + row.div_mult = base.div_mult; + row.offset_bytes = offset * slab_size; + } + } } }; constexpr SizeClassTable sizeclass_metadata = SizeClassTable(); - // Slot 0 of `sizeclass_metadata` is the unmapped sentinel; it must remain - // zero-initialised so fast-path lookups via `fast(sc)` return zero size - // and slab_mask without needing a sentinel check before indexing. + // Sentinel must remain zero-initialised so fast-path lookups via + // `start(sc)` return zero size and slab_mask without a branch. static_assert( - sizeclass_metadata.fast(sizeclass_t{}).size == 0, + sizeclass_metadata.start(sizeclass_t{}).size == 0, "sentinel slot must have size 0"); static_assert( - sizeclass_metadata.fast(sizeclass_t{}).slab_mask == 0, + sizeclass_metadata.start(sizeclass_t{}).slab_mask == 0, "sentinel slot must have slab_mask 0"); static_assert( bits::BITS - sizeclass_metadata.DIV_MULT_SHIFT <= MAX_CAPACITY_BITS); + // Largest slab index for any large class: `OFFSET_BITS` must cover it. 
+ constexpr size_t compute_max_large_slab_index() + { + size_t max_idx = 0; + for (size_t lc = 0; lc < NUM_LARGE_CLASSES; lc++) + { + const auto& meta = + sizeclass_metadata.start(sizeclass_t::from_large_class(lc)); + const size_t slab_size = meta.slab_mask + 1; + const size_t reserve = bits::next_pow2_const(meta.size); + const size_t idx = (reserve / slab_size) - 1; + if (idx > max_idx) + max_idx = idx; + } + return max_idx; + } + + static_assert( + compute_max_large_slab_index() < (size_t{1} << OFFSET_BITS), + "OFFSET_BITS must cover the worst-case slab index for any large class"); + constexpr size_t DIV_MULT_SHIFT = sizeclass_metadata.DIV_MULT_SHIFT; constexpr size_t sizeclass_to_size(smallsizeclass_t sizeclass) { - return sizeclass_metadata.fast_small(sizeclass).size; + return sizeclass_metadata.start_small(sizeclass).size; } constexpr size_t sizeclass_full_to_size(sizeclass_t sizeclass) { - return sizeclass_metadata.fast(sizeclass).size; + return sizeclass_metadata.start(sizeclass).size; } constexpr size_t sizeclass_full_to_slab_size(sizeclass_t sizeclass) { - return sizeclass_metadata.fast(sizeclass).slab_mask + 1; + return sizeclass_metadata.start(sizeclass).slab_mask + 1; } constexpr size_t sizeclass_to_slab_size(smallsizeclass_t sizeclass) { - return sizeclass_metadata.fast_small(sizeclass).slab_mask + 1; + return sizeclass_metadata.start_small(sizeclass).slab_mask + 1; } /** @@ -333,7 +467,7 @@ namespace snmalloc */ constexpr uint16_t threshold_for_waking_slab(smallsizeclass_t sizeclass) { - return sizeclass_metadata.slow(sizeclass_t::from_small_class(sizeclass)) + return sizeclass_metadata.slab(sizeclass_t::from_small_class(sizeclass)) .waking; } @@ -351,13 +485,14 @@ namespace snmalloc constexpr uint16_t sizeclass_to_slab_object_count(smallsizeclass_t sizeclass) { - return sizeclass_metadata.slow(sizeclass_t::from_small_class(sizeclass)) + return sizeclass_metadata.slab(sizeclass_t::from_small_class(sizeclass)) .capacity; } - SNMALLOC_FAST_PATH 
constexpr size_t slab_index(sizeclass_t sc, address_t addr) + SNMALLOC_FAST_PATH constexpr size_t + slab_index(offset_and_sizeclass_t osc, address_t addr) { - auto meta = sizeclass_metadata.fast(sc); + auto meta = sizeclass_metadata.start(osc); size_t offset = addr & meta.slab_mask; if constexpr (sizeof(offset) >= 8) { @@ -382,29 +517,55 @@ namespace snmalloc } } + /** + * Recover the start address of the allocation containing `addr`. + * + * Branch on `osc.offset() == 0` (testable from bits already loaded + * into `osc.raw()`, before any metadata-table access). The common + * case skips the `offset_bytes` field load and four extra arithmetic + * insns; the slow arm handles non-pow2 large interior chunks where + * the slab base must be shifted back to the allocation base. + */ SNMALLOC_FAST_PATH constexpr address_t - start_of_object(sizeclass_t sc, address_t addr) + start_of_object(offset_and_sizeclass_t osc, address_t addr) { - auto meta = sizeclass_metadata.fast(sc); - address_t slab_start = addr & ~meta.slab_mask; - size_t index = slab_index(sc, addr); - return slab_start + (index * meta.size); + auto meta = sizeclass_metadata.start(osc); + if (SNMALLOC_LIKELY(osc.offset() == 0)) + { + address_t slab_base = addr & ~meta.slab_mask; + size_t in_slab = addr - slab_base; + size_t index = (in_slab * meta.div_mult) >> DIV_MULT_SHIFT; + return slab_base + (index * meta.size); + } + address_t alloc_start = (addr & ~meta.slab_mask) - meta.offset_bytes; + size_t index = slab_index(osc, addr - alloc_start); + return alloc_start + (index * meta.size); } - constexpr size_t index_in_object(sizeclass_t sc, address_t addr) + SNMALLOC_FAST_PATH constexpr size_t + index_in_object(offset_and_sizeclass_t osc, address_t addr) { - return addr - start_of_object(sc, addr); + return addr - start_of_object(osc, addr); } - constexpr size_t remaining_bytes(sizeclass_t sc, address_t addr) + SNMALLOC_FAST_PATH constexpr size_t + remaining_bytes(offset_and_sizeclass_t osc, address_t addr) { - 
return sizeclass_metadata.fast(sc).size - index_in_object(sc, addr); + return sizeclass_metadata.start(osc).size - index_in_object(osc, addr); } + /** + * True iff `addr` is correctly aligned for an object of this + * sizeclass within its slab. Does NOT check whether `addr` lies in + * the first slab tile of a non-pow2 large allocation; callers that + * could be looking at an interior chunk must read the + * `offset_and_sizeclass_t` from the pagemap and use that overload + * instead. + */ constexpr bool is_start_of_object(sizeclass_t sc, address_t addr) { - size_t offset = addr & (sizeclass_full_to_slab_size(sc) - 1); - + auto meta = sizeclass_metadata.align(sc); + size_t offset = addr & meta.slab_mask; // Only works up to certain offsets, exhaustively tested by rounding.cc if constexpr (sizeof(offset) >= 8) { @@ -412,8 +573,7 @@ namespace snmalloc // 32bit. // This is based on: // https://lemire.me/blog/2019/02/20/more-fun-with-fast-remainders-when-the-divisor-is-a-constant/ - auto mod_zero_mult = sizeclass_metadata.fast(sc).mod_zero_mult; - return (offset * mod_zero_mult) < mod_zero_mult; + return (offset * meta.mod_zero_mult) < meta.mod_zero_mult; } else // Use 32-bit division as considerably faster than 64-bit, and @@ -421,12 +581,25 @@ namespace snmalloc return static_cast(offset % sizeclass_full_to_size(sc)) == 0; } + /** + * True iff `addr` is the start of an object. Interior chunks of + * non-pow2 large allocations carry `offset_bytes != 0`; only the + * first slab tile holds an allocation base, so a non-zero + * `offset_bytes` short-circuits to false. 
+ */ + constexpr bool + is_start_of_object(offset_and_sizeclass_t osc, address_t addr) + { + if (sizeclass_metadata.start(osc).offset_bytes != 0) + return false; + return is_start_of_object(osc.sizeclass(), addr); + } + inline static size_t large_size_to_chunk_size(size_t size) { return bits::next_pow2(size); } - constexpr SNMALLOC_PURE size_t sizeclass_lookup_index(const size_t s) { // We subtract and shift to reduce the size of the table, i.e. we don't have @@ -460,7 +633,7 @@ namespace snmalloc for (; sizeclass < minimum_class; sizeclass++) { for (; curr <= - sizeclass_metadata.fast_small(smallsizeclass_t(sizeclass)).size; + sizeclass_metadata.start_small(smallsizeclass_t(sizeclass)).size; curr += MIN_ALLOC_STEP_SIZE) { table[sizeclass_lookup_index(curr)] = minimum_class; @@ -470,7 +643,7 @@ namespace snmalloc for (; sizeclass < NUM_SMALL_SIZECLASSES; sizeclass++) { for (; curr <= - sizeclass_metadata.fast_small(smallsizeclass_t(sizeclass)).size; + sizeclass_metadata.start_small(smallsizeclass_t(sizeclass)).size; curr += MIN_ALLOC_STEP_SIZE) { auto i = sizeclass_lookup_index(curr); @@ -500,17 +673,8 @@ namespace snmalloc } /** - * Maps a requested size to its sizeclass. The result uses the unified - * encoding documented on `sizeclass_t`. - * - * For small sizes, this delegates to `size_to_sizeclass`. For large - * sizes in Phase 13, this rounds up to the next power of two (the - * front end still requests pow2-rounded reservations); Phase 15 - * removes the `next_pow2` call to enable non-pow2 large reservations. - * - * `to_exp_mant` is the literal inverse of the `from_exp_mant` used - * when populating `sizeclass_metadata`, so this never indexes the - * wrong slot. + * Map a requested size to its sizeclass. Large requests are rounded up + * to the next power of two. 
*/ static inline sizeclass_t size_to_sizeclass_full(size_t size) { diff --git a/src/snmalloc/global/globalalloc.h b/src/snmalloc/global/globalalloc.h index 8b4e42e2d..8bc22e87c 100644 --- a/src/snmalloc/global/globalalloc.h +++ b/src/snmalloc/global/globalalloc.h @@ -138,9 +138,7 @@ namespace snmalloc size_t SNMALLOC_FAST_PATH_INLINE remaining_bytes(address_t p) { const auto& entry = Config_::Backend::template get_metaentry(p); - - auto sizeclass = entry.get_sizeclass(); - return snmalloc::remaining_bytes(sizeclass, p); + return snmalloc::remaining_bytes(entry.get_offset_and_sizeclass(), p); } template @@ -159,9 +157,7 @@ namespace snmalloc static inline size_t index_in_object(address_t p) { const auto& entry = Config_::Backend::template get_metaentry(p); - - auto sizeclass = entry.get_sizeclass(); - return snmalloc::index_in_object(sizeclass, p); + return snmalloc::index_in_object(entry.get_offset_and_sizeclass(), p); } enum Boundary @@ -230,7 +226,8 @@ namespace snmalloc { const auto& entry = Config_::Backend::get_metaentry(address_cast(p)); - size_t index = slab_index(entry.get_sizeclass(), address_cast(p)); + size_t index = + slab_index(entry.get_offset_and_sizeclass(), address_cast(p)); auto* meta_slab = entry.get_slab_metadata(); @@ -259,7 +256,8 @@ namespace snmalloc const auto& entry = Config_::Backend::template get_metaentry(address_cast(p)); - size_t index = slab_index(entry.get_sizeclass(), address_cast(p)); + size_t index = + slab_index(entry.get_offset_and_sizeclass(), address_cast(p)); auto* meta_slab = entry.get_slab_metadata(); diff --git a/src/snmalloc/mem/corealloc.h b/src/snmalloc/mem/corealloc.h index 10482b6b7..fa5f1389c 100644 --- a/src/snmalloc/mem/corealloc.h +++ b/src/snmalloc/mem/corealloc.h @@ -533,7 +533,7 @@ namespace snmalloc snmalloc_check_client( mitigations(sanity_checks), - is_start_of_object(entry.get_sizeclass(), address_cast(msg)), + is_start_of_object(entry.get_offset_and_sizeclass(), address_cast(msg)), "Not deallocating start 
of an object"); size_t objsize = sizeclass_full_to_size(entry.get_sizeclass()); @@ -1080,7 +1080,7 @@ namespace snmalloc snmalloc_check_client( mitigations(sanity_checks), - is_start_of_object(entry.get_sizeclass(), address_cast(p)), + is_start_of_object(entry.get_offset_and_sizeclass(), address_cast(p)), "Not deallocating start of an object"); auto cp = p.as_static>(); diff --git a/src/snmalloc/mem/metadata.h b/src/snmalloc/mem/metadata.h index dc4ff0948..423bd8772 100644 --- a/src/snmalloc/mem/metadata.h +++ b/src/snmalloc/mem/metadata.h @@ -9,12 +9,11 @@ namespace snmalloc struct RemoteAllocator; /** - * Remotes need to be aligned enough that the bottom bits have enough room for - * all the size classes, both large and small. An additional bit is required - * to separate backend uses. + * RemoteAllocator pointers must have their low `COMBINED_BITS` zero + * so the (sizeclass, offset) field can be OR-ed in by `encode`. */ static constexpr size_t REMOTE_MIN_ALIGN = - bits::max(CACHELINE_SIZE, SIZECLASS_REP_SIZE) << 1; + bits::max(CACHELINE_SIZE, COMBINED_REP_SIZE); /** * Base class for the templated FrontendMetaEntry. This exists to avoid @@ -33,19 +32,18 @@ namespace snmalloc { protected: /** - * This bit is set in remote_and_sizeclass to discriminate between the case - * that it is in use by the frontend (0) or by the backend (1). For the - * former case, see other methods on this and the subclass - * `FrontendMetaEntry`; for the latter, see backend/backend.h and - * backend/largebuddyrange.h. - * - * This value is statically checked by the frontend to ensure that its - * bit packing does not conflict; see mem/remoteallocator.h. The marker - * tracks the sizeclass-encoding width (see `SIZECLASS_REP_SIZE` in - * ds/sizeclasstable.h): it must sit immediately above the highest bit - * used by a sizeclass raw value. + * Low bits of `remote_and_sizeclass` holding the sizeclass alone. 
+ */ + static constexpr address_t SIZECLASS_MASK = SIZECLASS_REP_SIZE - 1; + + /** + * Low bits of `remote_and_sizeclass` holding the (sizeclass, offset) + * pair. Also the markerless ownership discriminator: + * `(ras & COMBINED_MASK) == 0` iff the entry is NOT in active + * frontend use (frontend entries always have sizeclass != 0; slot 0 + * is the unmapped sentinel). */ - static constexpr address_t REMOTE_BACKEND_MARKER = SIZECLASS_REP_SIZE; + static constexpr address_t COMBINED_MASK = COMBINED_REP_SIZE - 1; /** * Bit used to indicate this should not be considered part of the previous @@ -59,14 +57,12 @@ namespace snmalloc static constexpr address_t META_BOUNDARY_BIT = 1 << 0; /** - * The bit above the sizeclass is always zero unless this is used - * by the backend to represent another datastructure such as the buddy - * allocator entries. + * Alignment used by `get_remote` to mask off the (sizeclass, offset) + * bits and recover the `RemoteAllocator*` payload. */ static constexpr size_t REMOTE_WITH_BACKEND_MARKER_ALIGN = - MetaEntryBase::REMOTE_BACKEND_MARKER; - static_assert( - (REMOTE_MIN_ALIGN >> 1) == MetaEntryBase::REMOTE_BACKEND_MARKER); + COMBINED_REP_SIZE; + static_assert(REMOTE_MIN_ALIGN >= COMBINED_REP_SIZE); /** * In common cases, the pointer to the slab metadata. See @@ -98,42 +94,38 @@ namespace snmalloc constexpr MetaEntryBase() : MetaEntryBase(0, 0) {} /** - * When a meta entry is in use by the back end, it exposes two words of - * state. The low bits in both are reserved. Bits in this bitmask must - * not be set by the back end in either word. - * - * During a major release, this constraint may be weakened, allowing the - * back end to set more bits. We don't currently use all of these bits in - * both words, but we reserve them all to make access uniform. If more - * bits are required by a back end then we could make this asymmetric. + * Per-word frontend-reserved masks. 
Bits in these masks are owned by + * the frontend; the backend must preserve them on writes (enforced + * by `BackendStateWordRef::operator=`). * - * `REMOTE_BACKEND_MARKER` is the highest bit that we reserve, so this is - * currently every bit including that bit and all lower bits. + * - Word::One reserves `META_BOUNDARY_BIT` so PAL-allocation + * boundaries survive ownership transitions. + * - Word::Two reserves `COMBINED_MASK`; the markerless ownership + * discriminator requires these bits to be zero in backend mode, + * and backend writes here are chunk-aligned so the requirement + * is naturally satisfied. */ - static constexpr address_t BACKEND_RESERVED_MASK = - (REMOTE_BACKEND_MARKER << 1) - 1; + static constexpr address_t BACKEND_RESERVED_MASK_WORD_ONE = + META_BOUNDARY_BIT; + static constexpr address_t BACKEND_RESERVED_MASK_WORD_TWO = COMBINED_MASK; public: /** - * Bit position of the first bit available to backend metadata layouts - * above the reserved region. The reserved region runs from bit 0 up to - * and including the `REMOTE_BACKEND_MARKER` bit; layouts in - * `backend_arena_range.h` and `largebuddyrange.h` derive their bit - * positions (RED_BIT, VARIANT_SHIFT, LARGE_SIZE_SHIFT, ...) from this. + * First bit on Word::One available for backend layouts; the bits + * below are frontend-reserved. Backends in `backend_arena_range.h` + * derive `RED_BIT`, `VARIANT_SHIFT`, etc. from this. */ - static constexpr size_t BACKEND_LAYOUT_FIRST_FREE_BIT = - bits::next_pow2_bits_const(REMOTE_BACKEND_MARKER) + 1; + static constexpr size_t BACKEND_LAYOUT_FIRST_FREE_BIT = 1; /** - * Does the back end currently own this entry? Note that freshly - * allocated entries are owned by the front end until explicitly - * claimed by the back end and so this will return `false` if neither - * the front nor back end owns this entry. + * True iff this entry is not in active frontend use (backend-claimed + * or untouched). 
Frontend entries always have `sizeclass != 0` + * (slot 0 is the unmapped sentinel), so the discriminator + * `(ras & COMBINED_MASK) == 0` distinguishes them. */ [[nodiscard]] bool is_backend_owned() const { - return (REMOTE_BACKEND_MARKER & remote_and_sizeclass) == - REMOTE_BACKEND_MARKER; + return (remote_and_sizeclass & COMBINED_MASK) == 0; } /** @@ -147,14 +139,19 @@ namespace snmalloc } /** - * Encode the remote and the sizeclass. + * Pack `remote`, `sizeclass`, and the per-chunk slab offset into a + * `remote_and_sizeclass` word. `offset` defaults to 0; the backend's + * multi-slab-tile write loop in `alloc_chunk` overrides it with the + * chunk's slab index so `start_of_object` can recover the + * allocation base. */ [[nodiscard]] static SNMALLOC_FAST_PATH uintptr_t - encode(RemoteAllocator* remote, sizeclass_t sizeclass) + encode(RemoteAllocator* remote, sizeclass_t sizeclass, size_t offset = 0) { /* remote might be nullptr; cast to uintptr_t before offsetting */ return pointer_offset( - reinterpret_cast(remote), sizeclass.raw()); + reinterpret_cast(remote), + offset_and_sizeclass_t(sizeclass, offset).raw()); } /** @@ -206,14 +203,14 @@ namespace snmalloc ///@} /** - * Returns the remote. - * - * If the meta entry is owned by the back end then this returns an - * undefined value and will abort in debug builds. + * Return the `RemoteAllocator*` payload by masking off the low + * `COMBINED_BITS`. Callable in any state: for unowned entries + * yields nullptr; for backend-owned entries yields a chunk address + * which compares unequal to any allocator's `public_state()`, so + * dispatch falls through to the slow path. 
*/ [[nodiscard]] SNMALLOC_FAST_PATH RemoteAllocator* get_remote() const { - SNMALLOC_ASSERT(!is_backend_owned()); return reinterpret_cast( pointer_align_down( get_remote_and_sizeclass())); @@ -241,19 +238,31 @@ namespace snmalloc // TODO: perhaps remove static_cast with resolution of // https://github.com/CTSRD-CHERI/llvm-project/issues/588 return sizeclass_t::from_raw( - static_cast(get_remote_and_sizeclass()) & - (REMOTE_WITH_BACKEND_MARKER_ALIGN - 1)); + static_cast(get_remote_and_sizeclass()) & SIZECLASS_MASK); + } + + /** + * Return the (sizeclass, slab offset) pair indexing + * `sizeclass_metadata.start_`. The selected row carries + * `offset_bytes = offset * slab_size` precomputed, so + * `start_of_object` recovers the allocation base with a single + * subtract. + */ + [[nodiscard]] SNMALLOC_FAST_PATH offset_and_sizeclass_t + get_offset_and_sizeclass() const + { + return offset_and_sizeclass_t::from_raw( + static_cast(get_remote_and_sizeclass()) & COMBINED_MASK); } /** - * Claim the meta entry for use by the back end. This preserves the - * boundary bit, if it is set, but otherwise resets the meta entry to a - * pristine state. + * Claim the meta entry for the backend: preserves the boundary bit + * and zeros `remote_and_sizeclass` so `is_backend_owned()` holds. */ void claim_for_backend() { meta = is_boundary() ? META_BOUNDARY_BIT : 0; - remote_and_sizeclass = REMOTE_BACKEND_MARKER; + remote_and_sizeclass = 0; } /** @@ -274,9 +283,11 @@ namespace snmalloc Two }; - static constexpr bool is_backend_allowed_value(Word, uintptr_t val) + static constexpr bool is_backend_allowed_value(Word w, uintptr_t val) { - return (val & BACKEND_RESERVED_MASK) == 0; + const address_t mask = (w == Word::One) ? BACKEND_RESERVED_MASK_WORD_ONE : + BACKEND_RESERVED_MASK_WORD_TWO; + return (val & mask) == 0; } /** @@ -293,6 +304,14 @@ namespace snmalloc */ uintptr_t* val; + /** + * The frontend-reserved mask for the word that `val` points at. 
Bits + * in this mask are owned by the frontend: `get()` clears them on + * read, and `operator=` preserves them on write (by OR-ing the + * current value's masked bits into the new value). + */ + address_t reserved_mask{0}; + public: /** * Uninitialised constructor. @@ -300,9 +319,21 @@ namespace snmalloc BackendStateWordRef() = default; /** - * Constructor, wraps a `uintptr_t`. Note that this may be used outside - * of the meta entry by code wishing to provide uniform storage to things - * that are either in a meta entry or elsewhere. + * Constructor, wraps a `uintptr_t` and the frontend-reserved mask + * that applies to that word. Note that this may be used outside of + * the meta entry by code wishing to provide uniform storage to + * things that are either in a meta entry or elsewhere. + */ + constexpr BackendStateWordRef(uintptr_t* v, address_t mask) + : val(v), reserved_mask(mask) + {} + + /** + * Single-pointer constructor for sentinel storage that the back + * end never writes through (e.g. red-black tree concept-check + * null/root nodes — see `largebuddyrange.h`). Reserved mask is + * 0, so the `operator=` assertion is vacuous; safety relies on + * the sentinels being `static const`, making any write UB. */ constexpr BackendStateWordRef(uintptr_t* v) : val(v) {} @@ -320,7 +351,7 @@ namespace snmalloc */ [[nodiscard]] uintptr_t get() const { - return (*val) & ~BACKEND_RESERVED_MASK; + return (*val) & ~reserved_mask; } /** @@ -338,13 +369,13 @@ namespace snmalloc BackendStateWordRef& operator=(uintptr_t v) { SNMALLOC_ASSERT_MSG( - ((v & BACKEND_RESERVED_MASK) == 0), - "The back end is not permitted to use the low bits in the meta " - "entry. ({} & {}) == {}.", + ((v & reserved_mask) == 0), + "The back end is not permitted to use the reserved bits in the " + "meta entry. 
({} & {}) == {}.", v, - BACKEND_RESERVED_MASK, - (v & BACKEND_RESERVED_MASK)); - *val = v | (static_cast(*val) & BACKEND_RESERVED_MASK); + reserved_mask, + (v & reserved_mask)); + *val = v | (static_cast(*val) & reserved_mask); return *this; } @@ -384,7 +415,10 @@ namespace snmalloc remote_and_sizeclass); claim_for_backend(); } - return {w == Word::One ? &meta : &remote_and_sizeclass}; + return (w == Word::One) ? + BackendStateWordRef{&meta, BACKEND_RESERVED_MASK_WORD_ONE} : + BackendStateWordRef{ + &remote_and_sizeclass, BACKEND_RESERVED_MASK_WORD_TWO}; } }; @@ -751,14 +785,7 @@ namespace snmalloc SNMALLOC_FAST_PATH FrontendMetaEntry(SlabMetadata* meta, uintptr_t remote_and_sizeclass) : MetaEntryBase(unsafe_to_uintptr(meta), remote_and_sizeclass) - { - SNMALLOC_ASSERT_MSG( - (REMOTE_BACKEND_MARKER & remote_and_sizeclass) == 0, - "Setting a backend-owned value ({}) via the front-end interface is not " - "allowed", - remote_and_sizeclass); - remote_and_sizeclass &= ~REMOTE_BACKEND_MARKER; - } + {} /** * Implicit copying of meta entries is almost certainly a bug and so the @@ -777,13 +804,13 @@ namespace snmalloc } /** - * Return the FrontendSlabMetadata metadata associated with this chunk, - * guarded by an assert that this chunk is being used as a slab (i.e., has - * an associated owning allocator). + * Return the FrontendSlabMetadata pointer. Only meaningful when the + * entry is frontend-owned; in other states the underlying word + * holds tree-node fields. Callers must verify ownership first + * (the standard idiom is `entry.get_remote() == self->public_state()`). 
*/ [[nodiscard]] SNMALLOC_FAST_PATH SlabMetadata* get_slab_metadata() const { - SNMALLOC_ASSERT(!is_backend_owned()); return unsafe_from_uintptr(meta & ~META_BOUNDARY_BIT); } }; diff --git a/src/snmalloc/override/new.cc b/src/snmalloc/override/new.cc index 667ca9c45..a3f8fc6ea 100644 --- a/src/snmalloc/override/new.cc +++ b/src/snmalloc/override/new.cc @@ -37,7 +37,8 @@ namespace snmalloc SNMALLOC_ASSERT( secondary_allocator || - is_start_of_object(size_to_sizeclass_full(size), address_cast(p))); + is_start_of_object( + size_to_sizeclass_full(size), address_cast(p))); return p; } diff --git a/src/snmalloc/override/rust.cc b/src/snmalloc/override/rust.cc index d2e7e2e08..86ac6f5f8 100644 --- a/src/snmalloc/override/rust.cc +++ b/src/snmalloc/override/rust.cc @@ -41,8 +41,8 @@ extern "C" SNMALLOC_EXPORT void* SNMALLOC_NAME_MANGLE(rust_realloc)( if ( aligned_old_size <= MAX_LARGE_SIZECLASS_SIZE && aligned_new_size <= MAX_LARGE_SIZECLASS_SIZE && - size_to_sizeclass_full(aligned_old_size).raw() == - size_to_sizeclass_full(aligned_new_size).raw()) + size_to_sizeclass_full(aligned_old_size) == + size_to_sizeclass_full(aligned_new_size)) return ptr; void* p = alloc(aligned_new_size); if (p) diff --git a/src/test/func/backend_arena/backend_arena.cc b/src/test/func/backend_arena/backend_arena.cc index 684783a7c..e94ca990d 100644 --- a/src/test/func/backend_arena/backend_arena.cc +++ b/src/test/func/backend_arena/backend_arena.cc @@ -206,7 +206,7 @@ namespace snmalloc // assertion — this catches the buddy.h:90-93 unsafe-probe pattern // (calling can_consolidate before confirming the address is in // our region) in BackendArena unit tests rather than as a runtime - // segfault in production builds. + // segfault in release builds. static bool can_consolidate(uintptr_t addr) { return !mock_store[mock_index(addr)].boundary; @@ -1438,7 +1438,7 @@ namespace snmalloc // tree-membership tests gate the can_consolidate read. 
MockRep's // can_consolidate now dereferences mock_store via mock_index, which // asserts on out-of-range indices, so an unguarded probe in - // add_block trips here rather than only as a segfault in production + // add_block trips here rather than only as a segfault in release // builds. static void test_block_at_arena_top_edge() { diff --git a/src/test/func/backend_arena_bins/backend_arena_bins.cc b/src/test/func/backend_arena_bins/backend_arena_bins.cc index 7dc126931..8190c3909 100644 --- a/src/test/func/backend_arena_bins/backend_arena_bins.cc +++ b/src/test/func/backend_arena_bins/backend_arena_bins.cc @@ -33,8 +33,8 @@ namespace snmalloc * Friend struct exposing private internals of * `BackendArenaBins` (and its nested `Bitmap`) * for unit tests. Forward-declared in `backend_arena_bins.h`; - * defined here so the production header carries no test-only - * surface. + * defined here to keep the test-access implementation out of the + * in-tree header. */ template struct BackendArenaBinsTestAccess @@ -82,7 +82,7 @@ namespace snmalloc // --- Raw size-class id access --- // // The bin scheme assigns a dense raw id in `[0, MAX_SC)` to each - // size class. Production code never names these (the fast path + // size class. In-tree callers never name these (the fast path // goes straight from request size to the bitmap-scan / carve // record). Tests cross-check the encoding via the helpers below; // the alias `sc_t = size_t` preserves the existing test diff --git a/src/test/func/large_offset/large_offset.cc b/src/test/func/large_offset/large_offset.cc new file mode 100644 index 000000000..e7c45c246 --- /dev/null +++ b/src/test/func/large_offset/large_offset.cc @@ -0,0 +1,225 @@ +/** + * Targeted test for the per-chunk pagemap offset write path in + * `BackendAllocator::alloc_chunk`. 
+ * + * The front end currently only issues pow2 large requests (the + * `slab_size >= size` fast path), so the multi-slab-tile branch in + * `alloc_chunk` writing per-chunk offsets is otherwise unreachable + * from the in-tree allocation paths. This test reaches it via the + * public backend API. + * + * Method: + * - Pick a non-pow2 large sizeclass `sc` whose + * `sizeclass_full_to_slab_size(sc) < sizeclass_full_to_size(sc)`, + * so the multi-slab-tile branch triggers. + * - Compute the pow2 reservation `next_pow2(size)` (the size + * `alloc_chunk` asserts). + * - Call `Config::Backend::alloc_chunk` directly with that pow2 size + * and the non-pow2 sc. + * - For each chunk in the pow2 region verify the pagemap entry's + * `get_offset_and_sizeclass()` decomposes into the expected + * (sc, slab_index) pair. + * - For sampled interior addresses verify that + * `remaining_bytes` / `index_in_object` return positions within + * the logical allocation. + * - Verify `is_start_of_object` behaviour: true at the allocation + * base, false elsewhere. + * - `dealloc_chunk` and verify entries clear back to "not + * frontend-owned" (low COMBINED_BITS == 0). + */ + +#include "test/setup.h" + +#include +#include +#include + +#ifdef assert +# undef assert +#endif +#define assert please_use_SNMALLOC_ASSERT + +using namespace snmalloc; + +using CustomGlobals = FixedRangeConfig>; +using FixedAlloc = Allocator; + +namespace +{ + bool any_failures = false; + + void fail(const char* msg) + { + std::cout << "FAIL: " << msg << std::endl; + any_failures = true; + } + + /** + * Find the smallest non-pow2 large sizeclass: one where slab_size < + * size. Returns sizeclass_t{} (the unmapped sentinel) if none exists + * in this configuration. 
+ */ + sizeclass_t find_non_pow2_large_sc() + { + for (size_t lc = 0; lc < NUM_LARGE_CLASSES; lc++) + { + auto sc = sizeclass_t::from_large_class(lc); + const size_t size = sizeclass_full_to_size(sc); + const size_t slab_size = sizeclass_full_to_slab_size(sc); + if (slab_size < size) + return sc; + } + return sizeclass_t{}; + } + + void test_per_chunk_offset() + { + auto sc = find_non_pow2_large_sc(); + if (sc.raw() == 0) + { + std::cout << "No non-pow2 large sizeclass available in this config; " + "skipping per-chunk offset test." + << std::endl; + return; + } + const size_t size = sizeclass_full_to_size(sc); + const size_t slab_size = sizeclass_full_to_slab_size(sc); + const size_t reserve = bits::next_pow2(size); + + std::cout << "non-pow2 sc raw=" << sc.raw() << " size=" << size + << " slab_size=" << slab_size << " reserve=" << reserve + << std::endl; + + // Set up an isolated FixedRangeConfig allocator. FixedRangeConfig + // owns its own pagemap and never reclaims `region_base`; the + // reservation is released when the process exits. For a multi- + // test harness, explicit teardown would be required here. + const size_t region = bits::one_at_bit(28); + auto region_base = DefaultPal::reserve(region); + DefaultPal::notify_using(region_base, region); + CustomGlobals::init(nullptr, region_base, region); + + auto a = get_scoped_allocator(); + + using Backend = typename CustomGlobals::Backend; + using Entry = typename CustomGlobals::PagemapEntry; + + // Construct the encoded ras the way the front end does (offset=0). + const uintptr_t ras_in = Entry::encode(nullptr, sc); + + auto [chunk, slab_meta] = + Backend::alloc_chunk(a->get_backend_local_state(), reserve, ras_in, sc); + if (chunk == nullptr) + { + fail("alloc_chunk returned null"); + return; + } + + const address_t base = address_cast(chunk); + std::cout << "Allocated chunk base=" << reinterpret_cast(base) + << " reserve=" << reserve << std::endl; + + // Verify per-chunk pagemap entries. 
+ for (size_t chunk_offset = 0; chunk_offset < reserve; + chunk_offset += MIN_CHUNK_SIZE) + { + const size_t expected_slab_index = chunk_offset / slab_size; + const auto& entry = Backend::get_metaentry(base + chunk_offset); + const offset_and_sizeclass_t osc = entry.get_offset_and_sizeclass(); + const offset_and_sizeclass_t expected_osc = + offset_and_sizeclass_t(sc, expected_slab_index); + if (!(osc == expected_osc)) + { + std::cout << "Chunk @+" << chunk_offset << " osc=" << osc.raw() + << " expected=" << expected_osc.raw() << " (sc=" << sc.raw() + << " idx=" << expected_slab_index << ")" << std::endl; + fail("offset_and_sizeclass mismatch"); + } + // The pure sizeclass mask must still report `sc`. + if (!(entry.get_sizeclass() == sc)) + { + std::cout << "Chunk @+" << chunk_offset << " get_sizeclass mismatch" + << std::endl; + fail("get_sizeclass mismatch on offset>0 chunk"); + } + } + + // For an interior address in each chunk that lies within the + // *logical* allocation (size, not the pow2 reservation), + // remaining_bytes / index_in_object should report position within + // the allocation. 
+ for (size_t chunk_offset = 0; chunk_offset < size; + chunk_offset += MIN_CHUNK_SIZE) + { + const address_t addr = base + chunk_offset; + const size_t rem = snmalloc::remaining_bytes(addr); + if (rem != size - chunk_offset) + { + std::cout << "remaining_bytes @+" << chunk_offset << " = " << rem + << " expected " << (size - chunk_offset) << std::endl; + fail("remaining_bytes mismatch"); + } + const size_t idx = snmalloc::index_in_object(addr); + if (idx != chunk_offset) + { + std::cout << "index_in_object @+" << chunk_offset << " = " << idx + << " expected " << chunk_offset << std::endl; + fail("index_in_object mismatch"); + } + } + + // Direct is_start_of_object checks: the allocation base address + // must be a start-of-object; an interior address inside the first + // slab tile (offset_bytes == 0 in pagemap) but not at the base + // must NOT; and an address in any non-first slab tile + // (offset_bytes != 0 in pagemap) must NOT. + { + const auto& base_entry = Backend::get_metaentry(base); + if (!is_start_of_object(base_entry.get_offset_and_sizeclass(), base)) + fail("base address not reported as start-of-object"); + if (is_start_of_object(base_entry.get_offset_and_sizeclass(), base + 1)) + fail("base+1 incorrectly reported as start-of-object"); + } + if (size > slab_size) + { + const address_t second_slab = base + slab_size; + const auto& second_entry = Backend::get_metaentry(second_slab); + if (is_start_of_object( + second_entry.get_offset_and_sizeclass(), second_slab)) + fail("second slab tile base incorrectly reported as start-of-object"); + } + + // Tear down: dealloc the chunk and verify the per-chunk pagemap + // entries no longer report as frontend-owned. 
+ auto alloc_cap = + capptr_chunk_is_alloc(capptr_to_user_address_control(chunk)); + Backend::dealloc_chunk( + a->get_backend_local_state(), *slab_meta, alloc_cap, reserve, sc); + + for (size_t chunk_offset = 0; chunk_offset < reserve; + chunk_offset += MIN_CHUNK_SIZE) + { + const auto& entry = Backend::get_metaentry(base + chunk_offset); + if (!entry.is_backend_owned()) + { + std::cout << "Chunk @+" << chunk_offset + << " not backend-owned after dealloc; osc=" + << entry.get_offset_and_sizeclass().raw() << std::endl; + fail("dealloc didn't reset per-chunk offset"); + } + } + } +} // namespace + +int main() +{ + setup(); + test_per_chunk_offset(); + if (any_failures) + { + std::cout << "FAILED" << std::endl; + return 1; + } + std::cout << "PASSED" << std::endl; + return 0; +} diff --git a/src/test/func/release-rounding/rounding.cc b/src/test/func/release-rounding/rounding.cc index d03cfe772..490343dd4 100644 --- a/src/test/func/release-rounding/rounding.cc +++ b/src/test/func/release-rounding/rounding.cc @@ -18,18 +18,49 @@ int main(int argc, char** argv) bool failed = false; + // Layout invariant: osc(sc, off).raw() == sc.raw() | (off << SIZECLASS_BITS), + // and the accessors invert that layout. This is load-bearing because + // `SizeClassTable::start(sizeclass_t)` and `start(offset_and_sizeclass_t)` + // both index by `.raw()`, so an offset=0 osc must hit the same table + // row as the bare sizeclass_t; the offset>0 row-population loop in + // the SizeClassTable ctor relies on the same layout. If any of this + // drifts, `encode()` in metadata.h would silently produce wrong bits. 
+ for (smallsizeclass_t sc_small; sc_small < NUM_SMALL_SIZECLASSES; sc_small++) + { + sizeclass_t sc = sizeclass_t::from_small_class(sc_small); + for (size_t off = 0; off < (size_t{1} << OFFSET_BITS); off++) + { + auto osc = offset_and_sizeclass_t(sc, off); + size_t expected_raw = sc.raw() | (off << SIZECLASS_BITS); + if ( + osc.raw() != expected_raw || osc.sizeclass() != sc || + osc.offset() != off) + { + std::cout << "osc layout mismatch: sc=" << sc.raw() << " off=" << off + << " -> raw=" << osc.raw() << " expected_raw=" << expected_raw + << " sc'=" << osc.sizeclass().raw() + << " off'=" << osc.offset() << std::endl + << std::flush; + failed = true; + } + } + } + if (failed) + abort(); + for (smallsizeclass_t size_class; size_class < NUM_SMALL_SIZECLASSES; size_class++) { size_t rsize = sizeclass_to_size(size_class); size_t max_offset = sizeclass_to_slab_size(size_class); sizeclass_t sc = sizeclass_t::from_small_class(size_class); + offset_and_sizeclass_t osc = offset_and_sizeclass_t(sc, 0); for (size_t offset = 0; offset < max_offset; offset++) { size_t mod = offset % rsize; bool mod_0 = (offset % rsize) == 0; - size_t opt_mod = index_in_object(sc, offset); + size_t opt_mod = index_in_object(osc, offset); if (mod != opt_mod) { std::cout << "rsize " << rsize << " offset " << offset << " opt " @@ -38,7 +69,7 @@ int main(int argc, char** argv) failed = true; } - bool opt_mod_0 = is_start_of_object(sc, offset); + bool opt_mod_0 = is_start_of_object(osc, offset); if (opt_mod_0 != mod_0) { std::cout << "rsize " << rsize << " offset " << offset @@ -63,6 +94,7 @@ int main(int argc, char** argv) { size_t S = bits::one_at_bit(b); sizeclass_t sc = size_to_sizeclass_full(S); + offset_and_sizeclass_t osc = offset_and_sizeclass_t(sc, 0); address_t base = address_t(0); size_t offsets[] = {0, 1, S / 2, S - 1, S}; @@ -72,7 +104,7 @@ int main(int argc, char** argv) size_t expected_mod = off % S; bool expected_start = expected_mod == 0; - size_t opt_mod = index_in_object(sc, 
addr); + size_t opt_mod = index_in_object(osc, addr); if (opt_mod != expected_mod) { std::cout << "Large S=" << S << " offset=" << off @@ -81,7 +113,7 @@ int main(int argc, char** argv) failed = true; } - bool opt_start = is_start_of_object(sc, addr); + bool opt_start = is_start_of_object(osc, addr); if (opt_start != expected_start) { std::cout << "Large S=" << S << " offset=" << off From 3e75889823f0d8397d9b36411479e903288e1f03 Mon Sep 17 00:00:00 2001 From: Matthew Parkinson Date: Thu, 4 Jun 2026 11:02:13 +0100 Subject: [PATCH 14/31] Pre-Phase-15: fix compile-time aligned alloc/dealloc asymmetry snmalloc::alloc() applies aligned_size(align, size) internally; snmalloc::dealloc(p) did not. When the alignment upgrade pushed the reservation into a different sizeclass than `size`, check_size fired under the check flavour. Reproducer: alloc<33*1024, _, 128*1024>(); dealloc<33*1024>(p) => "Dealloc rounded size mismatch: 0xa000 != 0x20000". Merge dealloc into a single template `dealloc` applying aligned_size(align, size) before check_size. The default align=1 preserves existing one-argument-template behaviour because aligned_size(1, size) == size. Move aligned_size from sizeclasstable.h to sizeclassstatic.h so the test library header can use it without pulling in the full runtime sizeclass machinery. Existing consumers still get it transitively via the pal.h -> ds_core.h -> sizeclassstatic.h include chain. Mirror the merge in the test library header: dealloc and alloc. Add aligned_dealloc to TESTLIB_ONLY_TESTS. Includes src/test/func/aligned_dealloc/ with the canonical reproducer and additional (S, A) pairs. Also captures the planning context in PLAN.md (pre-Phase-15 + Phase 15 sections). 
Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- CMakeLists.txt | 1 + PLAN.md | 539 ++++++++++++++---- src/snmalloc/ds/sizeclasstable.h | 40 +- src/snmalloc/ds_core/sizeclassstatic.h | 47 ++ src/snmalloc/global/globalalloc.h | 11 +- .../func/aligned_dealloc/aligned_dealloc.cc | 90 +++ src/test/snmalloc_testlib.h | 21 +- 7 files changed, 604 insertions(+), 145 deletions(-) create mode 100644 src/test/func/aligned_dealloc/aligned_dealloc.cc diff --git a/CMakeLists.txt b/CMakeLists.txt index be1fca26a..d9c01b0ab 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -548,6 +548,7 @@ if(NOT SNMALLOC_HEADER_ONLY_LIBRARY) # These are mitigation-independent and can be compiled once, then linked # against both fast and check testlib variants. set(TESTLIB_ONLY_TESTS + aligned_dealloc backend_arena backend_arena_bins backend_arena_range diff --git a/PLAN.md b/PLAN.md index cb99ba20c..2bdcde19e 100644 --- a/PLAN.md +++ b/PLAN.md @@ -3475,6 +3475,112 @@ disassemble the new `__malloc_start_pointer` to confirm the load count matches baseline (one 8-byte load of the pagemap byte, no `meta` word load, no `imul`). +# Pre-Phase-15: compile-time aligned dealloc overload + +## Goal + +Fix a pre-existing latent bug in the compile-time templated alloc / +dealloc API. This is independent of Phase 15 and is committed as a +sibling commit before Phase 15 begins. + +## The bug + +`globalalloc.h:341-356` `alloc` applies +`aligned_size(align, size)` internally: +``` +constexpr size_t sz = aligned_size(align, size); +… alloc(sz); +``` + +`globalalloc.h:394-399` `dealloc(p)` does not — it passes the +raw `size` to `check_size`: +``` +template +SNMALLOC_FAST_PATH_INLINE void dealloc(void* p) +{ + check_size(p, size); + … +} +``` + +When the alignment-driven upgrade pushes the alloc into a different +sizeclass than `size` itself, `check_size` fires. 
Concretely today +(pre-Phase-15), with `S = 33 KiB`, `A = 128 KiB`: + +- `alloc<33 KiB, Uninit, 128 KiB>()` → `aligned_size(128 KiB, 33 KiB) + = 128 KiB` → pagemap `sc(128 KiB)`. +- `dealloc<33 KiB>(p)` → `check_size(p, 33 KiB)` → + `size_to_sizeclass_full(33 KiB) = sc(40 KiB)` (sc(64 KiB) once + Phase 15 lands). +- Mismatch — `check_size` fires under `mitigations(sanity_checks)`. + Verified on `main` with a manual reproducer: + `Dealloc rounded size mismatch: 0xa000 != 0x20000`. + +The bug exists in `main` today; it does not require Phase 15. Phase +15 lowers the threshold (more (A, S) pairs cross a sizeclass +boundary) but does not introduce the asymmetry. + +## Fix + +Merge `dealloc` into a single template with `align` defaulted +to 1, so the same body handles both calling forms: +``` +template +SNMALLOC_FAST_PATH_INLINE void dealloc(void* p) +{ + constexpr size_t sz = aligned_size(align, size); + check_size(p, sz); + ThreadAlloc::get().dealloc(p); +} +``` +`aligned_size(1, size) == size` for all `size`, so existing +single-argument `dealloc(p)` callers are bit-equivalent to +their previous behaviour. + +To make `aligned_size` reachable from the test library header (which +deliberately avoids pulling in the full runtime sizeclass tables), +move its definition from `sizeclasstable.h` to `sizeclassstatic.h`. +The function is a pure compile-time-friendly utility — it depends +only on `is_small_sizeclass`, `bits::is_pow2`, and the SNMALLOC_* +macros, all of which are already available in `sizeclassstatic.h`. +Consumers of `aligned_size` previously included via `sizeclasstable.h` +still pick it up transitively through the existing include chain +(`pal.h` → `ds_core.h` → `sizeclassstatic.h`). + +Apply the same merge in the test library: +- `template void dealloc(void* p)` + replaces the previous `template` testlib overload. +- `template void* alloc()` + replaces the previous two-parameter testlib `alloc`. 
The body + computes `sz = aligned_size(align, size)` and routes to the + small/large path based on `sz`. + +## Test + +`src/test/func/aligned_dealloc/aligned_dealloc.cc`, listed in +`TESTLIB_ONLY_TESTS` so it is compiled once and linked against both +testlib flavours. + +- Includes `test/snmalloc_testlib.h` only — exercises the public + templated `alloc` / `dealloc` + surface through the testlib layering. +- The canonical reproducer `(S = 33 KiB, A = 128 KiB)` fires the bug + on `main` under the `check` flavour. Confirmed by hand before the + fix. +- Additional `(S, A)` pairs cover a small-to-large alignment upgrade, + a wider gap, the `align == size` baseline, and a small natural + alignment case. + +## Gate + +1. Build clean. +2. New test passes under both `fast` and `check`. +3. Full ctest suite green. +4. Pre-commit review loop. +5. Commit approval. + +After this commit lands, Phase 15 begins on top of it. + # Phase 15: Front-end requests non-pow2 large allocations ## Goal @@ -3488,130 +3594,367 @@ sizeclass encoding has supported non-pow2 large since Phase 13, and the per-chunk offset machinery has supported pointer recovery since Phase 14. +Effect: a request for e.g. 70 KiB on the default config +(`INTERMEDIATE_BITS = 2`) currently reserves 128 KiB (next pow2); +after Phase 15 it reserves 80 KiB (the next exp+mantissa class, +saving ~37.5%). A request for 96 KiB + 1 byte currently reserves +128 KiB; after Phase 15 it reserves 112 KiB. Sizes that already +land on a class boundary (e.g. 80 KiB, 96 KiB) reserve exactly +their requested size where today they reserve the next pow2. Net +effect across workloads is a reduction of large-allocation +footprint up to ~37.5% for sizes that fall mid-exponent. + +## Why now + +Phase 14 added the per-chunk offset write in `Backend::alloc_chunk`, +the three-table sizeclass metadata split (`start_` / `align_` / +`slab_`), and the offset==0 fast-path branch in `start_of_object`.
+All of this is dormant on the front-end today because +`large_size_to_chunk_size(size) = next_pow2(size)` means every +materialised large allocation has `offset = 0` in every chunk. The +Phase 14 `large_offset` test reaches the per-chunk path via the +public *backend* API to confirm the dormant code is correct; Phase +15 is what makes the front-end actually exercise it. + +## Pre-flight verification + +Before implementing, confirm these Phase 14 facts (all true today — +listed so reviewers can re-check): + +- `bits::to_exp_mant(v)` + ceil-encodes (`v = v - 1; …`), so passing the raw size (not + `next_pow2(size)`) maps to the smallest enclosing sizeclass. +- `Backend::alloc_chunk` currently asserts + `bits::is_pow2(size)`. The Phase 14 pagemap loop advances by + `slab_size = sizeclass_full_to_slab_size(sizeclass)`, so the + correct precondition is `size >= slab_size` *and* + `(size & (slab_size - 1)) == 0`. Both already hold by + construction for front-end calls because + `size = sizeclass_full_to_size(sc)` and `slab_size = size & -size` + is the largest pow2 divisor of `size`; the loop terminates + exactly at `size`. We will tighten/relax the assert to match. +- The Phase 14 assert that `ras`'s offset bits are zero on entry + to `alloc_chunk` continues to hold: front-end calls + `PagemapEntry::encode(remote, sc)` with default `offset = 0`. +- `BackendArenaBins::carve` returns a base aligned to + `info.align = size & -size` (the largest pow2 divisor of size, + set in the bin-table ctor at `backend_arena_bins.h:742`). For a + 96 KiB request that is 32 KiB = `slab_size` = + `sizeclass_full_to_slab_size(sc)` — exactly what + `start_of_object`'s `addr & ~slab_mask` requires. +- `globalalloc::remaining_bytes` / `index_in_object` already route + through `entry.get_offset_and_sizeclass()` (committed in Phase + 14's API cleanup), so they will pick up non-zero offsets + automatically once the front-end produces them. 
+ ## Changes ### `src/snmalloc/ds/sizeclasstable.h` -- `large_size_to_chunk_size(size)`: replace +- `size_to_sizeclass_full(size)` (line 679): large branch currently + does `to_exp_mant(next_pow2(size))`. Drop the `next_pow2` step + and call `to_exp_mant(size)` directly. The encoding's ceil + semantic means a request of `S` lands in the smallest sizeclass + whose size is `>= S`. +- `large_size_to_chunk_size(size)` (line 598): replace `bits::next_pow2(size)` with the rounded sizeclass-derived size: - `sizeclass_full_to_size(size_to_sizeclass_full(size))`. Now - rounds to exp+mantissa boundaries (matching Phase 13 encoding). -- `round_size(size)` for large (lines 478-501): currently returns - `bits::next_pow2(size)`. Update to match `large_size_to_chunk_size`: - `return sizeclass_full_to_size(size_to_sizeclass_full(size));` - This is critical because `DefaultConts::success` in + `sizeclass_full_to_size(size_to_sizeclass_full(size))`. With the + change above this collapses to one table lookup. +- `round_size(size)` (line 693): the large branch currently returns + `bits::next_pow2(size)`. Update to match + `large_size_to_chunk_size`: + `return sizeclass_full_to_size(size_to_sizeclass_full(size));`. + This is correctness-critical because `DefaultConts::success` in `corealloc.h:34-47` uses `round_size` to determine the zeroing range for `calloc`. Without this update, `calloc` would zero - beyond the actual reservation. The two functions converge to - the same value now that the front-end's chunk-size request - matches the round-size. -- Update the comments on both functions to describe the new - rounding behaviour (no "next pow2"; "exp+mantissa rounded"). + beyond the actual reservation. +- Update the doc-comments on `size_to_sizeclass_full` and + `round_size` to drop the "rounded up to the next power of two" + language; describe the exp+mantissa rounding instead. 
### `src/snmalloc/backend/backend.h` -- `alloc_chunk` (line 89-95): - `SNMALLOC_ASSERT(bits::is_pow2(size))` → relaxed to - `SNMALLOC_ASSERT((size & (MIN_CHUNK_SIZE - 1)) == 0)`. - (Already permissible per Phase 14; tighten only if Phase 14 - did not relax it.) -- `meta_size = bits::next_pow2(sizeof(SlabMetadata) + extra_bytes);` - unchanged — that's metadata-array size, not allocation size. +- `alloc_chunk` precondition (line 95): currently + `SNMALLOC_ASSERT(bits::is_pow2(size))`. Replace with the + slab-tile invariant: + ``` + const size_t slab_size = sizeclass_full_to_slab_size(sizeclass); + SNMALLOC_ASSERT(size >= slab_size); + SNMALLOC_ASSERT((size & (slab_size - 1)) == 0); + ``` + These match the pagemap loop's stride exactly and are the + minimum required for the per-chunk write to terminate at `size`. + The existing `size >= slab_size` assert on line 136 becomes + redundant once the precondition asserts it; consolidate. +- The Phase 14 offset-bits-zero assert on `ras` (lines 140-141) + stays — front-end still uses `encode(remote, sc)` with default + offset. +- Comment on lines 132-135 ("`size` and `slab_size` are powers of + two") is invalidated by Phase 15; rewrite to "`size` is a + multiple of `slab_size` with `size >= slab_size`". + +### `src/snmalloc/global/globalalloc.h` + +No change in Phase 15. The runtime sized-dealloc check is correct +after Phase 15 because every legitimate caller pre-applies +`aligned_size`: + +- Unaligned `sized_dealloc(p, S)`: alloc was `malloc(S)`, which goes + through `size_to_sizeclass_full(S)`; the dealloc check evaluates + the same function on the same `S`. Same sizeclass. +- Aligned `sized_dealloc(p, S, A)` (line 401): computes + `aligned_size(A, S)` *before* calling `check_size`. +- `rust.cc:33` and `rust.cc:51`: both apply `aligned_size` before + the 2-arg `dealloc(ptr, size)` path. +- `jemalloc_compat::sdallocx`: ignores the size argument. 
+ +A 2-arg `sized_dealloc(p, S)` after `aligned_alloc(A, S)` with +`aligned_size(A, S) > S` would mismatch — but that is a client bug: +the client should use the 3-arg form for aligned allocations. + +The compile-time `alloc` / `dealloc` +asymmetry is being fixed in the **pre-Phase-15 sibling commit** +(see the "Pre-Phase-15: compile-time aligned dealloc overload" +section above). Phase 15 does not touch `globalalloc.h`. ### `src/snmalloc/mem/corealloc.h` -- Verify line 1576 (and any other `next_pow2(round_sizeof)` site) - — read context and update to match the new rounding scheme if - it's on the large-allocation path. -- The dealloc-large path was already migrated in Phase 13 to - `sizeclass_full_to_size(entry.get_sizeclass())` — no further - change needed. -- The front-end large-alloc path (corealloc.h:703-727) uses - `large_size_to_chunk_size` — automatically picks up the new - behaviour. - -### `src/snmalloc/mem/smallbuddyrange.h:232` - -- `auto rsize = bits::next_pow2(size);` inside - `alloc_range_with_leftover` is used only by the meta-data range - (and arguably the small object path). Read context to determine - scope. Likely no change in Phase 15; Phase 15 only touches large - object allocations. If a change is required, include it here; - if not, document the decision. +- Large-alloc handler at lines 723-728 currently invokes + `size_to_sizeclass_full(size)` three times and + `large_size_to_chunk_size(size)` once. Hoist into locals so the + table lookups happen once: + ``` + const auto sc = size_to_sizeclass_full(size); + const size_t chunk_sz = sizeclass_full_to_size(sc); + auto [chunk, meta] = Config::Backend::alloc_chunk( + self->get_backend_local_state(), + chunk_sz, + PagemapEntry::encode(self->public_state(), sc), + sc); + ``` + - Phase 15 still leaves the large path through the same handler; + the hoist removes duplicated work on the large-allocation path + rather than changing any small-allocation hot loop.
+ +### `src/snmalloc/backend_helpers/smallbuddyrange.h:232` and similar + +- `alloc_range_with_leftover` uses `bits::next_pow2(size)` to size + its parent request. This range serves the *meta-data* allocator, + not the user object range — meta_size is always pow2 (line 203 + of `backend.h` already calls `next_pow2(sizeof(SlabMetadata) + + extra_bytes)`). No change needed; verify by inspection that the + call site is not on the user-large path and note the conclusion + in the commit. + +### Tests + +- The existing `src/test/func/large_offset/large_offset.cc` test + exercises the per-chunk path via the *backend* API. Phase 15 + flips the *front-end* to do the same. The test's header + comment (lines 5-9) currently says "currently only issues pow2 + large requests" and that `alloc_chunk` "asserts pow2"; both + become false after Phase 15. Update the comment to describe + this test as the *low-level* / *backend-API* counterpart of the + new front-end test. + +- Add a sibling test `src/test/func/large_offset_frontend/` that + exercises a *bounded* set of representative large sizeclasses + (smallest non-pow2 large class, two mid-range classes spanning + different exponents, one near `MAX_LARGE_SIZECLASS_SIZE` only if + the total allocation is well under the available test-time + address budget — cap at a few MiB per allocation). For each + selected sizeclass `sc` where + `sizeclass_full_to_size(sc) != sizeclass_full_to_slab_size(sc)`: + - Call `malloc(sizeclass_full_to_size(sc))`, save `p`. Assert + `is_start_of_object(p)`. + - For every chunk offset `j * MIN_CHUNK_SIZE` with + `j ∈ [1, size_full / MIN_CHUNK_SIZE)`, assert + `external_pointer(p + j * MIN_CHUNK_SIZE) == p` and + `remaining_bytes(p + j * MIN_CHUNK_SIZE) == size_full - j * + MIN_CHUNK_SIZE`. + - Assert `malloc_usable_size(p) == size_full` (the new actual + reservation, not `next_pow2(size_full)`). + - Free, then re-allocate and confirm address re-use behaves + sanely. 
+ - Also allocate a *non-boundary* request between adjacent class + sizes (e.g. `malloc(size_full - 1)` for a non-pow2 class, + `malloc(prev_class + 1)`) and assert `malloc_usable_size(p)` + equals `size_full` — this is what proves the raw request maps + to the smallest enclosing class. + - Pure table-level properties (every large sizeclass round-trips + through `size_to_sizeclass_full` ∘ `sizeclass_full_to_size`) + can be checked without allocating; loop over the full large + range there. + +- `src/test/func/sizeclass/sizeclass.cc` lines 160-175 currently + assert that a non-pow2 large size strictly between adjacent + pow2 rounds to the next pow2. Phase 15 changes this: a non-pow2 + size now rounds to the next exp+mantissa class. Compute the + expected value independently of the function under test — scan + the representable large classes (e.g. iterate sizeclasses 0 .. + `NUM_LARGE_CLASSES`) and pick the smallest `sizeclass_full_to_size(sc) >= mid`. + Then assert `size_to_sizeclass_full(mid)` equals that sizeclass + and `sizeclass_full_to_size(size_to_sizeclass_full(mid))` equals + the independently-computed class size. Update the comment + ("pow2 rounding still in force") accordingly. The surrounding + `b == ENCODED_ADDRESS_BITS` bound logic stays. + + **Add a deterministic `round_size` regression gate alongside.** + For each representable large sizeclass `sc` with size `S = + sizeclass_full_to_size(sc)`, and `S_prev` the previous class + size, assert: + - `round_size(S) == S` + - `round_size(S_prev + 1) == S` (i.e. the request is rounded + to the smallest enclosing class, not blown up to the next + pow2). + - `large_size_to_chunk_size(S_prev + 1) == round_size(S_prev + 1)` + (the chunk-size and round-size views agree). + + This is the primary `round_size` gate. If `round_size` is left + as `next_pow2`, these assertions fail deterministically — unlike + the calloc zeroing smoke test below, which may not fault when + `memset` overruns into backend free range. 
+ +- `src/test/func/release-rounding/rounding.cc` lines 86-127 + exercise pow2 large sizes end-to-end via + `index_in_object`/`is_start_of_object`. Phase 15 does not + change behaviour for pow2 sizes (they still round to themselves), + so this loop continues to pass unchanged. Optionally extend + the loop with a non-pow2 case (e.g. `mid = S + (S >> 2)`) to + exercise the new front-end-materialised non-pow2 classes. + +- `src/test/func/malloc/malloc.cc:82-87` uses + `natural_alignment(size)` symbolically. Because + `natural_alignment` derives from `round_size`, the test + auto-tracks Phase 15: a 96 KiB alloc now reports 32 KiB + alignment (today: 128 KiB). No code change in the test, but + cross-check that no test elsewhere hard-codes "pow2 large + alignment". + +- `src/test/func/statistics/` (and any other test asserting + per-sizeclass alloc counts): verify the assertion model does + not assume pow2 large counts. Inspection-only first; update + only if tests fail. + +- **Calloc zeroing correctness smoke test.** The existing calloc + tests (`memory.cc::test_calloc_16M`, `test_calloc` loop in + `malloc.cc`) mostly use sizes that round to a pow2 reservation + even today, so they would not catch `round_size` being left as + `next_pow2` after Phase 15. Add a test in + `src/test/func/memory/memory.cc` that calls `calloc(1, S)` for + a non-pow2 large class size `S` and asserts + `malloc_usable_size(p) == S` and that every byte in `[p, p + S)` + is zero. This is a smoke test only — the deterministic gate for + the `round_size` regression lives in `sizeclass.cc` (above) + because a `memset` overshoot into backend free range may not + fault and would not be caught by zeroing the visible range. ## Test gates -1. **Build**: clean build passes. -2. **Full ctest suite**: all existing tests pass. Existing tests - that exercise large allocations now allocate chunk-multiples, - not pow2 sizes. Reservation footprint shrinks; functional - results are unchanged. -3. 
**Extend `src/test/func/memcpy/func-memcpy.cc`** with a - non-pow2 large case: - - For sizes `S` strictly between adjacent pow2 (e.g. `S = - 1.5 * MAX_SMALL_SIZECLASS_SIZE`), call `malloc(S)`. Verify: - - `memcpy(p + sizeclass_full_to_size(sc) - 1, src, 1)` succeeds. - - `memcpy(p + sizeclass_full_to_size(sc), src, 1)` traps (in - the bounds-checking variant). - - **Prerequisite**: Phase 14 must have already replaced - `globalalloc.h::remaining_bytes` with the Config-aware - pagemap-offset path. Without that prerequisite, this test - does not exercise the offset path. (Verify by inspection: - confirm the new `remaining_bytes` consults - `entry.get_offset()`, not just `start_of_object_small`.) -4. **Extend existing `test/func/memory/memory.cc`** with a - non-pow2 pointer-recovery case (mirroring the Phase 14 test but - on front-end-issued non-pow2 allocations): - - For sizes `S` strictly between adjacent pow2 in the large - range, call `malloc(S)`, save `p`. Compute - `S_rounded = sizeclass_full_to_size(size_to_sizeclass_full(S))`. - - For every interior address `q = p + j` with `j ∈ {0, 1, - MIN_CHUNK_SIZE, S_rounded / 2, S_rounded - 1}`, assert - `external_pointer(q) == p`. - - Assert `is_start_of_object(p)` is true; `is_start_of_object(p - + 1)` is false; `is_start_of_object(p + MIN_CHUNK_SIZE)` is - false (every interior chunk has offset != 0). - - Assert reservation footprint matches `S_rounded / - MIN_CHUNK_SIZE` chunks (NOT `next_pow2(S) / MIN_CHUNK_SIZE`). -5. **Extend `src/test/func/release-rounding/rounding.cc`** to cover - non-pow2 large sizeclasses now that they're materialised - end-to-end. -6. **Existing memory-stress tests** (e.g. `external_pointer.cc`) - continue to pass. +1. **Build**: clean build passes. 
The `static_assert` chain from + Phase 14 is unchanged — `compute_max_large_slab_index` in + `sizeclasstable.h:419-437` still uses + `bits::next_pow2_const(meta.size)`, which is *conservative* + under Phase 15 (the front-end now reserves at most that much, + often less), so the budget bound continues to hold. +2. **Full ctest suite**: all 88 existing tests pass after + expectation updates in `sizeclass.cc`. Tests exercising large + allocations now allocate exp+mantissa-rounded chunk sizes; + reservation footprint shrinks; functional results unchanged. +3. **New `large_offset_frontend` test** passes — per-chunk offsets + are now produced by the front-end and recovered by + `external_pointer` / `remaining_bytes`. +4. **`perf-external_pointer-fast`**: median within noise of the + Phase 14 baseline (~290 ms on the dev machine). The hot path + for small allocations is unchanged; the only change in + instruction count comes from `__malloc_start_pointer` for + non-pow2 large allocations, which now exercises the slow arm of + the `offset == 0` branch added in Phase 14 — but only for + genuinely non-pow2 allocations, of which the benchmark has + none. +5. **`perf-singlethread-check`**: within noise. +6. **Memory footprint**: a synthetic benchmark allocating + `malloc(96 KiB)` × N reports peak RSS lower by ~25% vs the + pre-Phase-15 baseline. (Optional diagnostic; not a gate.) ## Risks -1. **Existing tests assume pow2 reservation footprint.** Grep tests - for `next_pow2`, `pow2`, and any size-arithmetic over allocations - returned from `malloc`. Likely small handful; convert each to - `sizeclass_full_to_size` or to a less assumption-laden check. -2. **`calloc` zeroing range.** Mitigated by updating `round_size` - for large (item above). Verify by inspecting - `corealloc.h:34-47` (`DefaultConts::success`) — it should now - zero exactly the reservation size. -3. 
**`SlabMetadata` reuse boundary.** The current - `slab_metadata == &slab_metadata` assertion in `dealloc_chunk` - relies on every chunk in the allocation pointing to the same - `SlabMetadata`. Phase 14's per-chunk-offset path keeps the - `meta` field's pointer bits unchanged across chunks (only - offset differs), so the assertion continues to hold after the - Phase 14 mask update. Verify by re-reading the assertion site. -4. **`remaining_bytes` overflow for very large allocations.** - After Phase 15, the rounded size can be just below the next - pow2, which is still bounded by `2^MAX_address_bits` — no - arithmetic overflow. Verify with a max-size allocation test. -5. **Performance.** Front-end alloc path: `next_pow2` is replaced - by an exp+mantissa table lookup. Dealloc-large already moved - to a table lookup in Phase 13. Net neutral. +1. **`calloc` zeroing range overshoot**. Mitigated by updating + `round_size` for large. Verify by inspecting + `corealloc.h:34-47` (`DefaultConts::success`) — must zero + exactly the reservation size returned by `round_size`. The `round_size` assertions in `sizeclass.cc` are the deterministic regression gate; + the new non-pow2 calloc test in `memory.cc` is only a smoke test. +2. **External clients assuming pow2-aligned large allocations.** + `natural_alignment` automatically reports the reduced + alignment, but any external code that hard-codes "large allocs + are pow2-aligned" silently breaks. Document in the commit + message; consider a release note if there is a CHANGELOG. +3. **`aligned_alloc` overflow at extreme sizes.** `aligned_size` + already handles SIZE_MAX overflow; behaviour unchanged. +4. **Performance regression on the front-end alloc path.** + `next_pow2(size)` is replaced by `to_exp_mant(size)` plus a + table lookup. Both are constant-time and small; perf gate + confirms no regression. +5. **External pagemap / fixed-region builds.** The fixed-region + tests (`src/test/func/fixed_region/`, + `src/test/func/external_pagemap/`) construct allocations via + different paths.
Re-run them in the full suite. +6. **Statistics counters.** `func-statistics` checks per-sizeclass + counts. Verify the test doesn't hard-code "every large is + pow2". ## Out of scope - Reducing `INTERMEDIATE_BITS` to gain bits in the sizeclass tag - (Phase 13 already chose 2 = the existing value). + (Phase 13 chose the existing value). - Generalising small allocations (already exp+mantissa). - Any change to `alloc_range` / `dealloc_range` of arbitrary byte-multiples — front-end always rounds via the sizeclass - encoding. + encoding before reaching the backend. +- Removing the offset==0 fast-path branch in `start_of_object`. + After Phase 15 the slow arm is reachable from the front-end, but + the branch is fully predicted on small-allocation workloads + (which dominate the benchmark) and the slow arm's cost is small. + +## Implementation order (every step has a test gate) + +1. **Front-end flip + `alloc_chunk` precondition + frontend test + in a single commit.** This is one atomic refactor: the + precondition cannot be relaxed safely until the front-end has + reasons to call with non-pow2 sizes, and the front-end flip + cannot be exercised end-to-end without the precondition + relaxation. Files touched in this commit: + - `src/snmalloc/ds/sizeclasstable.h`: drop `next_pow2` from + `size_to_sizeclass_full`; rewrite `large_size_to_chunk_size` + and `round_size` per the "Changes" section; update doc + comments. + - `src/snmalloc/backend/backend.h`: replace + `alloc_chunk`'s `is_pow2(size)` precondition with the + slab-tile invariant; rewrite the surrounding comment. + - `src/snmalloc/mem/corealloc.h`: hoist the duplicated + `size_to_sizeclass_full(size)` / `large_size_to_chunk_size` + calls in the large-alloc path (lines 723-728) into locals. + - `src/test/func/large_offset_frontend/`: new test (the + gate). Covers per-chunk pagemap recovery and non-boundary + requests. 
+ - `src/test/func/large_offset/large_offset.cc`: update header + comment now that the backend-API and front-end exercise the + same path. + - `src/test/func/sizeclass/sizeclass.cc`: update the + non-pow2-rounds-to-next-pow2 expectation at lines 160-175. + - `src/test/func/memory/memory.cc`: add non-pow2-large calloc + test (the `round_size` regression gate). + Gate: full ctest suite passes including + `large_offset_frontend` and the new calloc test. + +2. **Perf gate** (per the perf-gate protocol from Phase 14): + measure `perf-external_pointer-fast` and + `perf-singlethread-check` against the Phase-14 baseline (~290 + ms / ~580 ms median); 5 runs × 3 reps each; report median + + range. If a regression is found, root-cause via perf annotate + before committing — do not paper over with workarounds. + +3. **Mandatory pre-commit review loop** before the commit. # Review plan for Phases 13–15 diff --git a/src/snmalloc/ds/sizeclasstable.h b/src/snmalloc/ds/sizeclasstable.h index 66b68f86a..ba880d243 100644 --- a/src/snmalloc/ds/sizeclasstable.h +++ b/src/snmalloc/ds/sizeclasstable.h @@ -587,8 +587,7 @@ namespace snmalloc * first slab tile holds an allocation base, so a non-zero * `offset_bytes` short-circuits to false. */ - constexpr bool - is_start_of_object(offset_and_sizeclass_t osc, address_t addr) + constexpr bool is_start_of_object(offset_and_sizeclass_t osc, address_t addr) { if (sizeclass_metadata.start(osc).offset_bytes != 0) return false; @@ -725,41 +724,4 @@ namespace snmalloc return 1; return bits::one_at_bit(bits::ctz(rsize)); } - - constexpr SNMALLOC_FAST_PATH static size_t - aligned_size(size_t alignment, size_t size) - { - // Client responsible for checking alignment is not zero - SNMALLOC_ASSERT(alignment != 0); - // Client responsible for checking alignment is a power of two - SNMALLOC_ASSERT(bits::is_pow2(alignment)); - - // There are a class of corner cases to consider - // alignment = 0x8 - // size = 0xfff...fff7 - // for this result will be 0. 
This should fail an allocation, so we need to - // check for this overflow. - // However, - // alignment = 0x8 - // size = 0x0 - // will also result in 0, but this should be allowed to allocate. - // So we need to check for overflow, and return SIZE_MAX in this first case, - // and 0 in the second. - size_t result = ((alignment - 1) | (size - 1)) + 1; - // The following code is designed to fuse well with a subsequent - // sizeclass calculation. We use the same fast path constant to - // move the case where result==0 to the slow path, and then check for which - // case we are in. - if (is_small_sizeclass(result)) - return result; - - // We are in the slow path, so we need to check for overflow. - if (SNMALLOC_UNLIKELY(result == 0)) - { - // Check for overflow and return the maximum size. - if (SNMALLOC_UNLIKELY(result < size)) - return SIZE_MAX; - } - return result; - } } // namespace snmalloc diff --git a/src/snmalloc/ds_core/sizeclassstatic.h b/src/snmalloc/ds_core/sizeclassstatic.h index 011f69830..cf66851bc 100644 --- a/src/snmalloc/ds_core/sizeclassstatic.h +++ b/src/snmalloc/ds_core/sizeclassstatic.h @@ -74,4 +74,51 @@ namespace snmalloc return (size - 1) < sizeclass_to_size_const(smallsizeclass_t(NUM_SMALL_SIZECLASSES - 1)); } + + /** + * @brief Round `size` up so the resulting allocation can satisfy + * the requested `alignment`. `alignment` must be a non-zero power + * of two. + * + * Lives in sizeclassstatic.h (not sizeclasstable.h) so it is + * available to compile-time-only consumers — notably the test + * library header — without pulling in the full runtime sizeclass + * machinery. 
+ */ + constexpr SNMALLOC_FAST_PATH size_t + aligned_size(size_t alignment, size_t size) + { + // Client responsible for checking alignment is not zero + SNMALLOC_ASSERT(alignment != 0); + // Client responsible for checking alignment is a power of two + SNMALLOC_ASSERT(bits::is_pow2(alignment)); + + // There are a class of corner cases to consider + // alignment = 0x8 + // size = 0xfff...fff7 + // for this result will be 0. This should fail an allocation, so we need to + // check for this overflow. + // However, + // alignment = 0x8 + // size = 0x0 + // will also result in 0, but this should be allowed to allocate. + // So we need to check for overflow, and return SIZE_MAX in this first case, + // and 0 in the second. + size_t result = ((alignment - 1) | (size - 1)) + 1; + // The following code is designed to fuse well with a subsequent + // sizeclass calculation. We use the same fast path constant to + // move the case where result==0 to the slow path, and then check for which + // case we are in. + if (is_small_sizeclass(result)) + return result; + + // We are in the slow path, so we need to check for overflow. + if (SNMALLOC_UNLIKELY(result == 0)) + { + // Check for overflow and return the maximum size. + if (SNMALLOC_UNLIKELY(result < size)) + return SIZE_MAX; + } + return result; + } } // namespace snmalloc diff --git a/src/snmalloc/global/globalalloc.h b/src/snmalloc/global/globalalloc.h index 8bc22e87c..1d7f05a18 100644 --- a/src/snmalloc/global/globalalloc.h +++ b/src/snmalloc/global/globalalloc.h @@ -391,10 +391,17 @@ namespace snmalloc ThreadAlloc::get().dealloc(p); } - template + /** + * Compile-time sized dealloc. The optional `align` parameter mirrors + * the `align` parameter on `alloc` so the + * sized-dealloc sanity check sees the size that was actually + * reserved (post `aligned_size`), not the raw requested `size`. 
+ */ + template SNMALLOC_FAST_PATH_INLINE void dealloc(void* p) { - check_size(p, size); + constexpr size_t sz = aligned_size(align, size); + check_size(p, sz); ThreadAlloc::get().dealloc(p); } diff --git a/src/test/func/aligned_dealloc/aligned_dealloc.cc b/src/test/func/aligned_dealloc/aligned_dealloc.cc new file mode 100644 index 000000000..0ce8f5d92 --- /dev/null +++ b/src/test/func/aligned_dealloc/aligned_dealloc.cc @@ -0,0 +1,90 @@ +/** + * Regression test for the compile-time aligned alloc/dealloc API. + * + * `snmalloc::alloc()` applies + * `aligned_size(align, size)` internally so the underlying reservation + * is large enough to satisfy `align`. The matching + * `snmalloc::dealloc(p)` overload mirrors that: it applies + * the same `aligned_size` before `check_size`, so the size fed to the + * sized-dealloc sanity check is the size that was actually reserved. + * + * Without the aligned dealloc overload, callers either had to use the + * unsized `dealloc(p)` or manually pass `dealloc(p)`. Calling `dealloc(p)` instead trips `check_size` + * under `mitigations(sanity_checks)` whenever the alignment upgrade + * pushes the reservation into a different sizeclass than `size` + * itself (e.g. `S = 33 KiB`, `A = 128 KiB`: the reservation lives in + * a 128 KiB sizeclass but `check_size` evaluates + * `size_to_sizeclass_full(33 KiB)`, a smaller class). 
+ */ + +#include "test/setup.h" + +#include +#include + +using namespace snmalloc; + +namespace +{ + bool any_failures = false; + + void fail(const char* msg) + { + std::cout << "FAIL: " << msg << std::endl; + any_failures = true; + } + + template + void check_round_trip(const char* label) + { + void* p = snmalloc::alloc(); + if (p == nullptr) + { + fail(label); + return; + } + constexpr size_t reserved = aligned_size(align, size); + if (alloc_size(p) < reserved) + { + std::cout << " reservation too small: alloc_size=" << alloc_size(p) + << " expected>=" << reserved << std::endl; + fail(label); + return; + } + snmalloc::dealloc(p); + } +} // namespace + +int main(int, char**) +{ + setup(); + + // The canonical pre-existing reproducer: today's pow2 rounding maps + // 33 KiB to one large sizeclass while the alignment-driven + // reservation lands in a strictly larger one. + check_round_trip<33 * 1024, 128 * 1024>("S=33KiB A=128KiB"); + + // Small-to-large alignment upgrade. + check_round_trip<48, 64 * 1024>("S=48B A=64KiB"); + + // Wider gap between requested size and required alignment. + check_round_trip<17 * 1024, 256 * 1024>("S=17KiB A=256KiB"); + + // align == size: alloc and dealloc sees the same value pre- and + // post-aligned_size; serves as a baseline that the overload + // doesn't pessimise the simple case. + check_round_trip<64 * 1024, 64 * 1024>("S=64KiB A=64KiB"); + + // Small allocation, natural alignment. 
+ check_round_trip<32, 32>("S=32B A=32B"); + + if (any_failures) + { + std::cout << "aligned_dealloc test FAILED" << std::endl; + return 1; + } + + std::cout << "aligned_dealloc test passed" << std::endl; + return 0; +} diff --git a/src/test/snmalloc_testlib.h b/src/test/snmalloc_testlib.h index 5b51ff7bd..00b0513e4 100644 --- a/src/test/snmalloc_testlib.h +++ b/src/test/snmalloc_testlib.h @@ -41,10 +41,18 @@ namespace snmalloc void dealloc(void* p, size_t size); void dealloc(void* p, size_t size, size_t align); - template + /** + * Compile-time sized dealloc with optional alignment. + * + * The `align` parameter mirrors the `align` parameter on the + * `alloc` overload below: it is applied via + * `aligned_size` so the size fed to the sized-dealloc sanity check + * matches the size that was actually reserved. + */ + template inline void dealloc(void* p) { - dealloc(p, size); + dealloc(p, aligned_size(align, size)); } void debug_teardown(); @@ -115,12 +123,13 @@ namespace snmalloc * goes straight to the sizeclass-based fast path. Otherwise falls back * to the dynamic alloc. */ - template + template inline void* alloc() { - if constexpr (is_small_sizeclass(size)) + constexpr size_t sz = aligned_size(align, size); + if constexpr (is_small_sizeclass(sz)) { - constexpr auto sc = size_to_sizeclass_const(size); + constexpr auto sc = size_to_sizeclass_const(sz); if constexpr (zero_mem == ZeroMem::YesZero) { return libc::malloc_small_zero(sc); @@ -132,7 +141,7 @@ namespace snmalloc } else { - return alloc(size); + return alloc(sz); } } } // namespace snmalloc From 01e65c2ffa9708e3a02642e3a0c545d71aaf4876 Mon Sep 17 00:00:00 2001 From: Matthew Parkinson Date: Thu, 4 Jun 2026 13:57:56 +0100 Subject: [PATCH 15/31] Phase 15: front-end requests non-pow2 large allocations A request like malloc(70 KiB) at the default INTERMEDIATE_BITS = 2 now reserves the smallest enclosing exp+mantissa sizeclass (80 KiB) rather than next_pow2(size) (128 KiB). 
Sizes that already land on a class boundary reserve exactly that size; mid-exponent sizes shrink by up to ~33%. Mechanics: sizeclasstable.h - size_to_sizeclass_full drops next_pow2(size); to_exp_mant ceils directly to the smallest enclosing class. - round_size's large branch matches the reservation (sizeclass_full_to_size of the chosen class), so DefaultConts::success zeroes exactly the reservation for calloc. - large_size_to_chunk_size removed (the one caller in corealloc uses sizeclass_full_to_size(sc) directly with a hoisted sc). - compute_max_large_slab_index tightened to meta.size / slab_size - 1 (the actual worst case the runtime pagemap loop writes). backend.h - alloc_chunk's pow2 precondition relaxed to the slab-tile invariant: size is a positive multiple of slab_size. corealloc.h - large alloc path hoists size_to_sizeclass_full / chunk size into locals so each table lookup happens once. Tests: - large_offset_frontend/: new front-end counterpart to large_offset/. Exhaustively round-trips every large sizeclass and walks every chunk-aligned interior pointer for a boundary and a non-boundary request. - memory/: adds test_calloc_non_pow2_large as a calloc zeroing smoke test; clamps the end-of-stride probe in check_external_pointer_large since non-pow2 reservations are tighter than the next pow2. - sizeclass/: deterministic round_size gate over every large class (S maps to itself; S_prev+1 ceils to S). - large_offset/: backend test now passes the chunk-multiple reserve (= sizeclass_full_to_size(sc)) instead of next_pow2(size). 
Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- PLAN.md | 67 +++--- src/snmalloc/backend/backend.h | 21 +- src/snmalloc/ds/sizeclasstable.h | 35 ++-- src/snmalloc/mem/corealloc.h | 9 +- src/test/func/large_offset/large_offset.cc | 26 +-- .../large_offset_frontend.cc | 191 ++++++++++++++++++ src/test/func/memory/memory.cc | 54 ++++- src/test/func/sizeclass/sizeclass.cc | 78 +++++-- 8 files changed, 397 insertions(+), 84 deletions(-) create mode 100644 src/test/func/large_offset_frontend/large_offset_frontend.cc diff --git a/PLAN.md b/PLAN.md index 2bdcde19e..5343f06dd 100644 --- a/PLAN.md +++ b/PLAN.md @@ -3651,53 +3651,50 @@ listed so reviewers can re-check): ### `src/snmalloc/ds/sizeclasstable.h` -- `size_to_sizeclass_full(size)` (line 679): large branch currently - does `to_exp_mant(next_pow2(size))`. Drop the `next_pow2` step - and call `to_exp_mant(size)` directly. The encoding's ceil - semantic means a request of `S` lands in the smallest sizeclass - whose size is `>= S`. -- `large_size_to_chunk_size(size)` (line 598): replace - `bits::next_pow2(size)` with the rounded sizeclass-derived size: - `sizeclass_full_to_size(size_to_sizeclass_full(size))`. With the - change above this collapses to one table lookup. -- `round_size(size)` (line 693): the large branch currently returns - `bits::next_pow2(size)`. Update to match - `large_size_to_chunk_size`: - `return sizeclass_full_to_size(size_to_sizeclass_full(size));`. - This is correctness-critical because `DefaultConts::success` in +- `size_to_sizeclass_full(size)`: large branch calls + `to_exp_mant(size)` + directly. The encoding's ceil semantic selects the smallest + sizeclass whose size is `>= size`. +- `large_size_to_chunk_size` is removed. 
After the change above it + would just be `sizeclass_full_to_size(size_to_sizeclass_full(size))`, + which is exactly what `round_size` returns on the large branch; the + one in-tree caller (`corealloc.h` large path) is hoisted to use + `sizeclass_full_to_size(sc)` directly with a single `sc` lookup, so + the wrapper carries no remaining work. +- `round_size(size)`: large branch returns + `sizeclass_full_to_size(size_to_sizeclass_full(size))`. This is + correctness-critical because `DefaultConts::success` in `corealloc.h:34-47` uses `round_size` to determine the zeroing - range for `calloc`. Without this update, `calloc` would zero - beyond the actual reservation. -- Update the doc-comments on `size_to_sizeclass_full` and - `round_size` to drop the "rounded up to the next power of two" - language; describe the exp+mantissa rounding instead. + range for `calloc`. Without it `calloc` would zero beyond the + actual reservation. +- `compute_max_large_slab_index` tightens its bound to + `meta.size / slab_size - 1` (the actual worst case the runtime + loop writes). The previous `next_pow2(meta.size) / slab_size - 1` + overestimates now that no caller reserves `next_pow2(size)`. +- Doc-comments on `size_to_sizeclass_full` and `round_size` describe + the exp+mantissa rounding. ### `src/snmalloc/backend/backend.h` -- `alloc_chunk` precondition (line 95): currently - `SNMALLOC_ASSERT(bits::is_pow2(size))`. Replace with the - slab-tile invariant: +- `alloc_chunk` precondition: the slab-tile invariant ``` const size_t slab_size = sizeclass_full_to_slab_size(sizeclass); SNMALLOC_ASSERT(size >= slab_size); SNMALLOC_ASSERT((size & (slab_size - 1)) == 0); ``` - These match the pagemap loop's stride exactly and are the - minimum required for the per-chunk write to terminate at `size`. - The existing `size >= slab_size` assert on line 136 becomes - redundant once the precondition asserts it; consolidate. 
-- The Phase 14 offset-bits-zero assert on `ras` (lines 140-141) - stays — front-end still uses `encode(remote, sc)` with default - offset. -- Comment on lines 132-135 ("`size` and `slab_size` are powers of - two") is invalidated by Phase 15; rewrite to "`size` is a - multiple of `slab_size` with `size >= slab_size`". + matches the pagemap loop's stride exactly and is the minimum + required for the per-chunk write to terminate at `size`. The + previous duplicate `size >= slab_size` assert inside the loop is + consolidated. +- The offset-bits-zero assert on `ras` stays — the front-end uses + `encode(remote, sc)` with default offset 0. +- Loop comment describes `size` as a multiple of `slab_size` with + `size >= slab_size`. ### `src/snmalloc/global/globalalloc.h` -No change in Phase 15. The runtime sized-dealloc check is correct -after Phase 15 because every legitimate caller pre-applies -`aligned_size`: +No change. The runtime sized-dealloc check is correct because every +legitimate caller pre-applies `aligned_size`: - Unaligned `sized_dealloc(p, S)`: alloc was `malloc(S)`, which goes through `size_to_sizeclass_full(S)`; the dealloc check evaluates diff --git a/src/snmalloc/backend/backend.h b/src/snmalloc/backend/backend.h index 80ff58da8..2fcdf2a57 100644 --- a/src/snmalloc/backend/backend.h +++ b/src/snmalloc/backend/backend.h @@ -92,7 +92,15 @@ namespace snmalloc uintptr_t ras, sizeclass_t sizeclass) { - SNMALLOC_ASSERT(bits::is_pow2(size)); + // `size` must be a positive multiple of the sizeclass's slab + // tile size: the pagemap loop below writes one entry per + // `slab_size` stride and must terminate exactly at `size`. + // Front-end callers satisfy this by construction because they + // pass `sizeclass_full_to_size(sizeclass)`, whose largest pow2 + // divisor is `sizeclass_full_to_slab_size(sizeclass)`. 
+ const size_t slab_size = sizeclass_full_to_slab_size(sizeclass); + SNMALLOC_ASSERT(size >= slab_size); + SNMALLOC_ASSERT((size & (slab_size - 1)) == 0); SNMALLOC_ASSERT(size >= MIN_CHUNK_SIZE); // Calculate the extra bytes required to store the client meta-data. @@ -128,12 +136,11 @@ namespace snmalloc return {nullptr, nullptr}; } - const size_t slab_size = sizeclass_full_to_slab_size(sizeclass); - // `size` and `slab_size` are powers of two with `size >= slab_size`, - // so `size = k * slab_size` for some integer `k >= 1`. Each slab - // tile gets the same `ras_in | (slab_index << SIZECLASS_BITS)` - // entry, written in one `set_metaentry` call. - SNMALLOC_ASSERT(size >= slab_size); + // `slab_size` was computed and asserted against `size` at the + // top of `alloc_chunk`. `size = k * slab_size` for some integer + // `k >= 1`; each slab tile gets the same + // `ras | (slab_index << SIZECLASS_BITS)` entry, written in one + // `set_metaentry` call. // The OR below assumes the per-chunk-offset bits of `ras` are // zero; `MetaEntryBase::encode` defaults offset to 0, and the // backend is the only place per-chunk offsets are written. diff --git a/src/snmalloc/ds/sizeclasstable.h b/src/snmalloc/ds/sizeclasstable.h index ba880d243..635ff7a45 100644 --- a/src/snmalloc/ds/sizeclasstable.h +++ b/src/snmalloc/ds/sizeclasstable.h @@ -415,7 +415,11 @@ namespace snmalloc static_assert( bits::BITS - sizeclass_metadata.DIV_MULT_SHIFT <= MAX_CAPACITY_BITS); - // Largest slab index for any large class: `OFFSET_BITS` must cover it. + // Largest slab index for any large class: `OFFSET_BITS` must cover + // it. Each large allocation reserves exactly `meta.size` bytes (a + // positive multiple of `slab_size`), so the largest `slab_index` + // the pagemap loop in `Backend::alloc_chunk` writes is + // `meta.size / slab_size - 1`. 
constexpr size_t compute_max_large_slab_index() { size_t max_idx = 0; @@ -424,8 +428,7 @@ namespace snmalloc const auto& meta = sizeclass_metadata.start(sizeclass_t::from_large_class(lc)); const size_t slab_size = meta.slab_mask + 1; - const size_t reserve = bits::next_pow2_const(meta.size); - const size_t idx = (reserve / slab_size) - 1; + const size_t idx = (meta.size / slab_size) - 1; if (idx > max_idx) max_idx = idx; } @@ -594,11 +597,6 @@ namespace snmalloc return is_start_of_object(osc.sizeclass(), addr); } - inline static size_t large_size_to_chunk_size(size_t size) - { - return bits::next_pow2(size); - } - constexpr SNMALLOC_PURE size_t sizeclass_lookup_index(const size_t s) { // We subtract and shift to reduce the size of the table, i.e. we don't have @@ -672,8 +670,13 @@ namespace snmalloc } /** - * Map a requested size to its sizeclass. Large requests are rounded up - * to the next power of two. + * Map a requested size to its sizeclass. + * + * Small requests use the dense lookup table. Large requests are + * encoded with `to_exp_mant`, + * whose ceil semantic (`v = v - 1; ...`) selects the smallest + * sizeclass whose size is `>= size`. The raw `size` is passed in + * directly — the encoding does the rounding. */ static inline sizeclass_t size_to_sizeclass_full(size_t size) { @@ -683,9 +686,8 @@ namespace snmalloc } SNMALLOC_ASSERT(size != 0); SNMALLOC_ASSERT(size <= MAX_LARGE_SIZECLASS_SIZE); - size_t pow2 = bits::next_pow2(size); size_t global = - bits::to_exp_mant(pow2); + bits::to_exp_mant(size); return sizeclass_t::from_large_class(global - NUM_SMALL_SIZECLASSES); } @@ -712,7 +714,14 @@ namespace snmalloc // failed allocation later. return size; } - return bits::next_pow2(size); + // Large branch: round to the smallest enclosing exp+mantissa + // sizeclass. Must agree with `round_size`'s small-class branch in + // semantics: every request rounds to the smallest enclosing + // class. 
`DefaultConts::success` (corealloc.h) uses `round_size` + // to compute the `calloc` zeroing range, so any drift between + // the actual reservation and `round_size` would over- or + // under-zero. + return sizeclass_full_to_size(size_to_sizeclass_full(size)); } /// Returns the alignment that this size naturally has, that is diff --git a/src/snmalloc/mem/corealloc.h b/src/snmalloc/mem/corealloc.h index fa5f1389c..942b7f514 100644 --- a/src/snmalloc/mem/corealloc.h +++ b/src/snmalloc/mem/corealloc.h @@ -720,12 +720,13 @@ namespace snmalloc // Grab slab of correct size // Set remote as large allocator remote. + const auto sc = size_to_sizeclass_full(size); + const size_t chunk_size = sizeclass_full_to_size(sc); auto [chunk, meta] = Config::Backend::alloc_chunk( self->get_backend_local_state(), - large_size_to_chunk_size(size), - PagemapEntry::encode( - self->public_state(), size_to_sizeclass_full(size)), - size_to_sizeclass_full(size)); + chunk_size, + PagemapEntry::encode(self->public_state(), sc), + sc); #ifdef SNMALLOC_TRACING message<1024>( diff --git a/src/test/func/large_offset/large_offset.cc b/src/test/func/large_offset/large_offset.cc index e7c45c246..d89bf03ba 100644 --- a/src/test/func/large_offset/large_offset.cc +++ b/src/test/func/large_offset/large_offset.cc @@ -1,22 +1,21 @@ /** - * Targeted test for the per-chunk pagemap offset write path in - * `BackendAllocator::alloc_chunk`. + * Backend-API counterpart of `large_offset_frontend` for the per-chunk + * pagemap offset write path in `BackendAllocator::alloc_chunk`. * - * The front end currently only issues pow2 large requests (the - * `slab_size >= size` fast path), so the multi-slab-tile branch in - * `alloc_chunk` writing per-chunk offsets is otherwise unreachable - * from the in-tree allocation paths. This test reaches it via the - * public backend API. 
+ * This test pins the contract at the *backend* boundary + * (`Config::Backend::alloc_chunk` / `dealloc_chunk`) so it holds + * independently of any front-end path: a non-pow2 large allocation + * spans multiple slab tiles, and `alloc_chunk` writes a per-chunk + * pagemap entry whose offset bits encode the slab index. * * Method: * - Pick a non-pow2 large sizeclass `sc` whose * `sizeclass_full_to_slab_size(sc) < sizeclass_full_to_size(sc)`, * so the multi-slab-tile branch triggers. - * - Compute the pow2 reservation `next_pow2(size)` (the size - * `alloc_chunk` asserts). - * - Call `Config::Backend::alloc_chunk` directly with that pow2 size + * - Call `Config::Backend::alloc_chunk` directly with + * `sizeclass_full_to_size(sc)` (the chunk-multiple reservation) * and the non-pow2 sc. - * - For each chunk in the pow2 region verify the pagemap entry's + * - For each chunk in the region verify the pagemap entry's * `get_offset_and_sizeclass()` decomposes into the expected * (sc, slab_index) pair. * - For sampled interior addresses verify that @@ -84,7 +83,10 @@ namespace } const size_t size = sizeclass_full_to_size(sc); const size_t slab_size = sizeclass_full_to_slab_size(sc); - const size_t reserve = bits::next_pow2(size); + // The chunk-multiple reservation: the backend precondition is + // that `size` is a positive multiple of `slab_size`, satisfied + // here by passing the exact sizeclass size. + const size_t reserve = size; std::cout << "non-pow2 sc raw=" << sc.raw() << " size=" << size << " slab_size=" << slab_size << " reserve=" << reserve diff --git a/src/test/func/large_offset_frontend/large_offset_frontend.cc b/src/test/func/large_offset_frontend/large_offset_frontend.cc new file mode 100644 index 000000000..207abb4d7 --- /dev/null +++ b/src/test/func/large_offset_frontend/large_offset_frontend.cc @@ -0,0 +1,191 @@ +/** + * Front-end counterpart to `src/test/func/large_offset/`. 
+ * + * The front-end allocates non-pow2 large allocations directly: + * `malloc(80 KiB)` reserves exactly 80 KiB (a sizeclass boundary) + * rather than rounding up to the next power of two. This test + * exercises the resulting per-chunk pagemap state via the public + * recovery API (`external_pointer`, `remaining_bytes`). + * + * `large_offset.cc` covers the same ground at the backend boundary + * (`Config::Backend::alloc_chunk` / `dealloc_chunk`), so the + * per-chunk contract is gated independently of any front-end path. + * This test gates that the front-end actually produces such + * allocations. + * + * Two sets of checks: + * + * 1. Pure table-level round-tripping over every large sizeclass: + * `size_to_sizeclass_full(sizeclass_full_to_size(sc)) == sc`. + * No allocation. Cheap and exhaustive. + * + * 2. End-to-end on a bounded set of representative sizeclasses + * (the smallest non-pow2 large class, plus a non-boundary + * request whose smallest enclosing class is non-pow2): allocate + * via the public front-end API, walk every chunk-aligned + * interior pointer in the logical allocation, assert + * `external_pointer` recovers the base and + * `remaining_bytes` reports the expected residual. + */ + +#include +#include +#include + +#ifdef assert +# undef assert +#endif +#define assert please_use_SNMALLOC_ASSERT + +using namespace snmalloc; + +namespace +{ + bool any_failures = false; + + void fail(const char* msg) + { + std::cout << "FAIL: " << msg << std::endl; + any_failures = true; + } + + /** + * For every representable large sizeclass `sc`, check that the + * sizeclass encoding round-trips: a request of exactly + * `sizeclass_full_to_size(sc)` maps back to `sc`. Failure here is + * a pure table-encoding bug and is independent of any allocation. 
+ */ + void test_roundtrip_all_large() + { + for (size_t lc = 0; lc < NUM_LARGE_CLASSES; lc++) + { + sizeclass_t sc = sizeclass_t::from_large_class(lc); + size_t S = sizeclass_full_to_size(sc); + sizeclass_t sc2 = size_to_sizeclass_full(S); + if (!(sc2 == sc)) + { + std::cout << "Round-trip fail: lc=" << lc << " S=" << S + << " sc.raw=" << sc.raw() << " sc2.raw=" << sc2.raw() + << std::endl; + fail("round-trip"); + } + } + } + + /** + * Allocate `request` via the public front-end, then walk every + * `MIN_CHUNK_SIZE`-aligned interior address and verify pointer + * recovery. `expected_reserve` is the reservation the allocator + * should produce (the smallest enclosing sizeclass size). + */ + void test_alloc_chunkwalk(size_t request, size_t expected_reserve) + { + void* p = snmalloc::libc::malloc(request); + if (p == nullptr) + { + fail("malloc returned null"); + return; + } + + const size_t usable = snmalloc::alloc_size(p); + if (usable != expected_reserve) + { + std::cout << "alloc_size mismatch: request=" << request + << " usable=" << usable << " expected=" << expected_reserve + << std::endl; + fail("alloc_size != expected reserve"); + } + + // Use the `Start` pointer recovery as the start-of-object check + // (no `libc::is_start_of_object`): `external_pointer(p)` + // returning `p` itself is the same property. + + for (size_t off = 0; off < usable; off += MIN_CHUNK_SIZE) + { + void* interior = pointer_offset(p, off); + void* base = snmalloc::external_pointer(interior); + if (base != p) + { + std::cout << "external_pointer(p + " << off << ") = " << base + << " expected " << p << std::endl; + fail("external_pointer mismatch"); + } + size_t rem = snmalloc::remaining_bytes(interior); + if (rem != usable - off) + { + std::cout << "remaining_bytes(p + " << off << ") = " << rem + << " expected " << usable - off << std::endl; + fail("remaining_bytes mismatch"); + } + } + + snmalloc::libc::free(p); + } + + /** + * Find a non-pow2 large sizeclass to exercise. 
Returns the + * sentinel `sizeclass_t{}` if none exists (e.g. INTERMEDIATE_BITS + * == 0, all classes are pow2). + */ + sizeclass_t find_non_pow2_large_sc() + { + for (size_t lc = 0; lc < NUM_LARGE_CLASSES; lc++) + { + sizeclass_t sc = sizeclass_t::from_large_class(lc); + size_t S = sizeclass_full_to_size(sc); + if (!bits::is_pow2(S)) + return sc; + } + return sizeclass_t{}; + } + + void test_end_to_end() + { + sizeclass_t sc = find_non_pow2_large_sc(); + if (sc.raw() == 0) + { + std::cout + << "No non-pow2 large sizeclass available (INTERMEDIATE_BITS == 0?); " + "skipping end-to-end test." + << std::endl; + return; + } + + const size_t S = sizeclass_full_to_size(sc); + + // Boundary request: ask for exactly the class size. + test_alloc_chunkwalk(S, S); + + // Non-boundary request: ask for (S_prev + 1) to land at S via + // the ceil encoding. S_prev is the previous class's size; if sc + // is the very first large class, fall back to MAX_SMALL+1. + size_t S_prev; + if (sc.as_large() == 0) + { + S_prev = MAX_SMALL_SIZECLASS_SIZE; + } + else + { + S_prev = sizeclass_full_to_size( + sizeclass_t::from_large_class(sc.as_large() - 1)); + } + if (S_prev + 1 < S) + { + test_alloc_chunkwalk(S_prev + 1, S); + } + } +} // namespace + +int main() +{ + setup(); + test_roundtrip_all_large(); + test_end_to_end(); + if (any_failures) + { + std::cout << "FAILED" << std::endl; + return 1; + } + std::cout << "PASSED" << std::endl; + return 0; +} diff --git a/src/test/func/memory/memory.cc b/src/test/func/memory/memory.cc index 253628282..2d5e6d287 100644 --- a/src/test/func/memory/memory.cc +++ b/src/test/func/memory/memory.cc @@ -307,12 +307,19 @@ void check_offset(void* base, void* interior) void check_external_pointer_large(size_t* base) { + // Probe `__malloc_start_pointer` at both ends of each 16 MiB + // stride within the allocation. The allocation size is recorded in + // the first word of the allocation itself. 
The end-of-stride probe + // is clamped to the last byte of the allocation. size_t size = *base; char* curr = (char*)base; for (size_t offset = 0; offset < size; offset += 1 << 24) { check_offset(base, (void*)(curr + offset)); - check_offset(base, (void*)(curr + offset + (1 << 24) - 1)); + size_t end = offset + (1 << 24) - 1; + if (end >= size) + end = size - 1; + check_offset(base, (void*)(curr + end)); } } @@ -439,6 +446,50 @@ void test_calloc_large_bug() snmalloc::dealloc(p1); } +/** + * `calloc` zeroing must cover exactly the reservation `round_size` + * reports — no more, no less. For a large request that lands in a + * non-pow2 sizeclass, the reservation is tighter than the next pow2, + * so a stray `next_pow2`-sized zeroing loop would overshoot into + * backend free range. This test allocates such a non-pow2 large + * request and verifies (a) the usable size is strictly less than the + * next pow2, and (b) every byte of the visible allocation is zero. + * + * Note: an overshoot may not fault — the deterministic gate for the + * `round_size` contract lives in the sizeclass test. + */ +void test_calloc_non_pow2_large() +{ + if constexpr (snmalloc::INTERMEDIATE_BITS == 0) + { + // All sizeclasses are powers of two in this configuration, so + // there is no non-pow2 large request to test. + std::cout + << "INTERMEDIATE_BITS == 0: all sizeclasses pow2; skipping." + << std::endl; + return; + } + + // 2.5 * MAX_SMALL_SIZECLASS_SIZE: definitely large, definitely not + // a power of two, and (with INTERMEDIATE_BITS >= 1) the smallest + // enclosing sizeclass is strictly less than the next pow2 above. 
+ const size_t mss = size_t{1} << snmalloc::max_small_sizeclass_bits(); + const size_t request = (mss << 1) + (mss >> 1); + const size_t next_pow2 = snmalloc::bits::next_pow2(request); + + void* p = snmalloc::alloc(request); + SNMALLOC_CHECK(p != nullptr); + const size_t usable = snmalloc::alloc_size(p); + SNMALLOC_CHECK(usable >= request); + SNMALLOC_CHECK(usable < next_pow2); + auto* bytes = static_cast(p); + for (size_t i = 0; i < usable; i++) + { + SNMALLOC_CHECK(bytes[i] == 0); + } + snmalloc::dealloc(p); +} + template void test_static_sized_alloc() { @@ -589,6 +640,7 @@ int main(int, char**) TEST(test_external_pointer); TEST(test_alloc_16M); TEST(test_calloc_16M); + TEST(test_calloc_non_pow2_large); TEST(test_consolidaton_bug); std::cout << "Tests completeed successfully!" << std::endl; diff --git a/src/test/func/sizeclass/sizeclass.cc b/src/test/func/sizeclass/sizeclass.cc index 093b17424..0b0c73eb3 100644 --- a/src/test/func/sizeclass/sizeclass.cc +++ b/src/test/func/sizeclass/sizeclass.cc @@ -140,11 +140,12 @@ void test_uniform_large_sizeclasses() prev_size = size; } - // Round-trip identity on pow2 large sizes in Phase 13: every pow2 size - // S in [MAX_SMALL_SIZECLASS_SIZE * 2, MAX_LARGE_SIZECLASS_SIZE] must satisfy - // sizeclass_full_to_size(size_to_sizeclass_full(S)) == S. Bound the loop by - // ENCODED_ADDRESS_BITS so `bits::one_at_bit(bits)` never shifts by >= BITS - // (the bound check itself would fail on 32-bit otherwise). + // Round-trip identity on pow2 large sizes: every pow2 size S in + // [MAX_SMALL_SIZECLASS_SIZE * 2, MAX_LARGE_SIZECLASS_SIZE] must + // satisfy sizeclass_full_to_size(size_to_sizeclass_full(S)) == S. + // Bound the loop by ENCODED_ADDRESS_BITS so `bits::one_at_bit(b)` + // never shifts by >= BITS (the bound check itself would fail on + // 32-bit otherwise). 
for (size_t b = MAX_SMALL_SIZECLASS_BITS + 1; b <= ENCODED_ADDRESS_BITS; b++) { size_t S = bits::one_at_bit(b); @@ -157,21 +158,74 @@ void test_uniform_large_sizeclasses() failed = true; } - // For every non-pow2 size X strictly between adjacent pow2 [P, 2P), the - // result must round up to 2P (pow2 rounding still in force in Phase 13). - // Only check when 2P is still representable. + // For every non-pow2 size X strictly between adjacent pow2 [P, 2P), + // `size_to_sizeclass_full(X)` must select the smallest sizeclass + // whose size is >= X. Compute the expected sizeclass independently + // by scanning all large classes. Only check when 2P is still + // representable. if (b < ENCODED_ADDRESS_BITS) { size_t mid = S + (S >> 1); sizeclass_t sc_mid = size_to_sizeclass_full(mid); size_t rs_mid = sizeclass_full_to_size(sc_mid); - size_t expect = bits::one_at_bit(b + 1); - if (rs_mid != expect) + + // Independent computation: smallest large class size >= mid. + size_t expect = 0; + for (size_t lc = 0; lc < NUM_LARGE_CLASSES; lc++) + { + size_t sz = sizeclass_full_to_size(sizeclass_t::from_large_class(lc)); + if (sz >= mid) + { + expect = sz; + break; + } + } + if (expect == 0) { - std::cout << "Non-pow2 should round to next pow2: X=" << mid - << " round=" << rs_mid << " expected=" << expect << std::endl; + std::cout << "No large class >= mid=" << mid << std::endl; failed = true; } + else if (rs_mid != expect) + { + std::cout << "Non-pow2 should round to smallest enclosing class: X=" + << mid << " round=" << rs_mid << " expected=" << expect + << std::endl; + failed = true; + } + } + } + + // `round_size` contract: for every representable large class size + // S, `round_size(S) == S` and `round_size(S_prev + 1) == S` (the + // smallest enclosing class). `DefaultConts::success` (corealloc.h) + // uses `round_size` to size the `calloc` zeroing range, so any + // drift here would over- or under-zero. 
This is the deterministic + // gate for that contract; the `calloc` smoke test in `memory.cc` + // would not necessarily fault on an overshoot into backend free + // range. + { + size_t prev = 0; + for (size_t lc = 0; lc < NUM_LARGE_CLASSES; lc++) + { + size_t S = sizeclass_full_to_size(sizeclass_t::from_large_class(lc)); + if (round_size(S) != S) + { + std::cout << "round_size identity failed at large class: S=" << S + << " round_size=" << round_size(S) << std::endl; + failed = true; + } + if (prev != 0 && prev + 1 < S) + { + size_t probe = prev + 1; + if (round_size(probe) != S) + { + std::cout << "round_size(prev+1) blow-up: probe=" << probe + << " round_size=" << round_size(probe) << " expected=" << S + << std::endl; + failed = true; + } + } + prev = S; } } From c6fe5db60a07d1efdf0f0bf9fb9582a43ec8e4d3 Mon Sep 17 00:00:00 2001 From: Matthew Parkinson Date: Thu, 4 Jun 2026 15:32:30 +0100 Subject: [PATCH 16/31] BackendArena slow-path hardening - RBTree::neighbours's "value absent" precondition is now release-checked with a single post-descent comparison. A duplicate key would otherwise return an arbitrary neighbour pair that callers (e.g. BackendArena::add_block) would consume as valid, corrupting dual-tree consolidation. Equivalent to the prior per-node debug assert for BST-ordered trees, with no per-node cost. - BackendArena::range_from_addr's Large branch asserts the structural invariants on the size returned by Rep::get_large_size (> TWO_UNITS, unit-aligned, below the arena-size cap). Debug-only: this is an internal Rep invariant, not defense against external corruption. 
Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- src/snmalloc/backend_helpers/backend_arena.h | 8 +++++++- src/snmalloc/ds_core/redblacktree.h | 11 +++++++++-- 2 files changed, 16 insertions(+), 3 deletions(-) diff --git a/src/snmalloc/backend_helpers/backend_arena.h b/src/snmalloc/backend_helpers/backend_arena.h index 149c2f567..f577b4ce0 100644 --- a/src/snmalloc/backend_helpers/backend_arena.h +++ b/src/snmalloc/backend_helpers/backend_arena.h @@ -113,7 +113,13 @@ namespace snmalloc case BackendArenaVariant::OddTwo: return {a, TWO_UNITS}; case BackendArenaVariant::Large: - return {a, Rep::get_large_size(a)}; + { + size_t s = Rep::get_large_size(a); + SNMALLOC_ASSERT( + s > TWO_UNITS && s < bits::one_at_bit(MAX_SIZE_BITS) && + bits::align_down(s, UNIT_SIZE) == s); + return {a, s}; + } } SNMALLOC_ASSERT(false); return {0, 0}; diff --git a/src/snmalloc/ds_core/redblacktree.h b/src/snmalloc/ds_core/redblacktree.h index 6f86c8523..f38754624 100644 --- a/src/snmalloc/ds_core/redblacktree.h +++ b/src/snmalloc/ds_core/redblacktree.h @@ -797,7 +797,13 @@ namespace snmalloc * root-to-leaf descent then records both neighbours: every left * turn (parent key > value) updates the successor candidate to the * parent's key, every right turn updates the predecessor candidate. - * In Debug an assert fires if `value` is encountered on the descent. + * `SNMALLOC_CHECK` aborts in any build if `value` is encountered + * on the descent: a duplicate key would make `neighbours` return + * an arbitrary neighbour pair that the caller would consume as + * valid, corrupting dependent state. The check uses only one + * post-descent comparison because a duplicate key is always + * recorded into `pred` on the right-going branch (`compare(k, + * value)` is false when `k == value`). 
*/ stl::Pair neighbours(K value) { @@ -808,7 +814,6 @@ namespace snmalloc while (!cur.is_null()) { K k = cur; - SNMALLOC_ASSERT(!Rep::equal(k, value)); if (Rep::compare(k, value)) { // k > value: go left; k is the tightest successor seen so far. @@ -822,6 +827,8 @@ namespace snmalloc } } + SNMALLOC_CHECK(!Rep::equal(pred, value)); + return {pred, succ}; } From 243b8363b4b369052620fa43fd80f31c87a5e408 Mon Sep 17 00:00:00 2001 From: Matthew Parkinson Date: Thu, 4 Jun 2026 15:46:28 +0100 Subject: [PATCH 17/31] PagemapRep: static_assert Word::Two markerless discriminator invariant MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The Range tree stores chunk-aligned addresses in Word::Two of the pagemap entry. The markerless ownership discriminator (is_backend_owned == (remote_and_sizeclass & COMBINED_MASK) == 0) requires those addresses to have zero in the BACKEND_RESERVED_MASK_WORD_TWO (= COMBINED_MASK) bits — i.e., the reserved mask must fit entirely below the chunk alignment. The invariant held silently for default configs; a future config change shrinking the chunk alignment or growing INTERMEDIATE_BITS would have turned backend-owned writes into spurious frontend entries with no compile-time guard. This assertion mirrors the existing Word::One BIN_META_MASK assert. 
Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- src/snmalloc/backend_helpers/backend_arena_range.h | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/src/snmalloc/backend_helpers/backend_arena_range.h b/src/snmalloc/backend_helpers/backend_arena_range.h index 8fc3cbd68..fc3eae55c 100644 --- a/src/snmalloc/backend_helpers/backend_arena_range.h +++ b/src/snmalloc/backend_helpers/backend_arena_range.h @@ -73,6 +73,15 @@ namespace snmalloc static_assert(BIN_META_MASK < UNIT_SIZE); static_assert( Entry::is_backend_allowed_value(Entry::Word::One, BIN_META_MASK)); + static_assert( + Entry::is_backend_allowed_value( + Entry::Word::Two, ~uintptr_t(UNIT_SIZE - 1)), + "RangeRep stores chunk-aligned addresses in Word::Two; the " + "markerless ownership discriminator requires their low " + "BACKEND_RESERVED_MASK_WORD_TWO bits to be zero. This asserts " + "that the reserved mask fits entirely below the chunk alignment, " + "so no chunk-aligned value (any bit set only at position " + ">= MIN_SIZE_BITS) can collide."); using Word = typename Entry::Word; using Handle = typename Entry::BackendStateWordRef; From bf7ecab83c4e75228278373343d2cf2981993fee Mon Sep 17 00:00:00 2001 From: Matthew Parkinson Date: Thu, 4 Jun 2026 16:08:55 +0100 Subject: [PATCH 18/31] BackendArena::unlink_block: use Bins::bin_index for classification unlink_block called bitmap.add(range) purely to compute the bin id, relying on the idempotent set-bit side effect being harmless on the unlink path. Bins::bin_index is the pure classifier with no side effect and a name that matches the intent. The bitmap-tree consistency invariant is unchanged (still verified by check_invariant Clause 4). 
Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- src/snmalloc/backend_helpers/backend_arena.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/snmalloc/backend_helpers/backend_arena.h b/src/snmalloc/backend_helpers/backend_arena.h index f577b4ce0..4ad42d6f0 100644 --- a/src/snmalloc/backend_helpers/backend_arena.h +++ b/src/snmalloc/backend_helpers/backend_arena.h @@ -149,7 +149,7 @@ namespace snmalloc void unlink_block(uintptr_t addr, size_t size) { auto range = typename Bins::range_t{addr, size}; - size_t bin = bitmap.add(range); + size_t bin = Bins::bin_index(range); bin_trees[bin].remove_elem(addr); if (size >= TWO_UNITS) range_tree.remove_elem(addr); From a5c224a02eb45d55454fc48d798aed97a144736e Mon Sep 17 00:00:00 2001 From: Matthew Parkinson Date: Thu, 4 Jun 2026 16:22:12 +0100 Subject: [PATCH 19/31] Fix RBTree::neighbours check for Rep::null probes A null probe value can never collide with a tree entry (null is not insertable), so it must be exempt from the post-descent duplicate check. The redblack functional test deliberately probes with key 0 to exercise the boundary case. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- src/snmalloc/ds_core/redblacktree.h | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/src/snmalloc/ds_core/redblacktree.h b/src/snmalloc/ds_core/redblacktree.h index f38754624..6009260f1 100644 --- a/src/snmalloc/ds_core/redblacktree.h +++ b/src/snmalloc/ds_core/redblacktree.h @@ -797,13 +797,15 @@ namespace snmalloc * root-to-leaf descent then records both neighbours: every left * turn (parent key > value) updates the successor candidate to the * parent's key, every right turn updates the predecessor candidate. 
- * `SNMALLOC_CHECK` aborts in any build if `value` is encountered - * on the descent: a duplicate key would make `neighbours` return - * an arbitrary neighbour pair that the caller would consume as - * valid, corrupting dependent state. The check uses only one - * post-descent comparison because a duplicate key is always - * recorded into `pred` on the right-going branch (`compare(k, - * value)` is false when `k == value`). + * `SNMALLOC_CHECK` aborts in any build if a non-null `value` is + * encountered on the descent: a duplicate key would make + * `neighbours` return an arbitrary neighbour pair that the + * caller would consume as valid, corrupting dependent state. The + * check uses only one post-descent comparison because a duplicate + * key is always recorded into `pred` on the right-going branch + * (`compare(k, value)` is false when `k == value`). `Rep::null` + * can never be present in the tree, so probing with it is benign + * and exempt from the check. */ stl::Pair neighbours(K value) { @@ -827,7 +829,7 @@ namespace snmalloc } } - SNMALLOC_CHECK(!Rep::equal(pred, value)); + SNMALLOC_CHECK(Rep::equal(pred, Rep::null) || !Rep::equal(pred, value)); return {pred, succ}; } From 242ac0f9945c7b86c76ff8a078b156f69c008055 Mon Sep 17 00:00:00 2001 From: Matthew Parkinson Date: Thu, 4 Jun 2026 16:22:16 +0100 Subject: [PATCH 20/31] Unify backend_arena_range too-large boundary with add_block alloc_range and dealloc_range both bypassed to the parent when the request equalled or exceeded the bin range, but used mask_bits(N) = (1<<N) - 1 as the threshold, whereas BackendArena::add_block states its precondition as size < one_at_bit(N). Both paths now share an is_too_large(size) helper that tests size >= bits::one_at_bit(MAX_SIZE_BITS), so alloc and dealloc bypass on exactly the add_block boundary. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- .../backend_helpers/backend_arena_range.h | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/src/snmalloc/backend_helpers/backend_arena_range.h b/src/snmalloc/backend_helpers/backend_arena_range.h index fc3eae55c..6c84faea9 100644 --- a/src/snmalloc/backend_helpers/backend_arena_range.h +++ b/src/snmalloc/backend_helpers/backend_arena_range.h @@ -333,12 +333,24 @@ namespace snmalloc constexpr Type() =
default; + /** + * `size` exceeds the arena's representable range and must be + * routed to the parent (or refused if no parent exists). Matches + * `BackendArena::add_block`'s `size < bits::one_at_bit(MAX_SIZE_BITS)` + * precondition exactly, so alloc and dealloc bypass on the same + * boundary. + */ + static constexpr bool is_too_large(size_t size) + { + return size >= bits::one_at_bit(MAX_SIZE_BITS); + } + capptr::Arena alloc_range(size_t size) { SNMALLOC_ASSERT(size >= MIN_CHUNK_SIZE); SNMALLOC_ASSERT((size & (MIN_CHUNK_SIZE - 1)) == 0); - if (size >= bits::mask_bits(MAX_SIZE_BITS)) + if (is_too_large(size)) { if (ParentRange::Aligned) return parent.alloc_range(size); @@ -363,7 +375,7 @@ namespace snmalloc if constexpr (MAX_SIZE_BITS != (bits::BITS - 1)) { - if (size >= bits::mask_bits(MAX_SIZE_BITS)) + if (is_too_large(size)) { parent_dealloc(base.unsafe_uintptr(), size); return; From 60b0fddc0b4f85372bf91ed738a9ca9d5710b045 Mon Sep 17 00:00:00 2001 From: Matthew Parkinson Date: Thu, 4 Jun 2026 16:36:29 +0100 Subject: [PATCH 21/31] Style and documentation cleanup on BackendArena - Note that MetaEntryBase::operator= is load-bearing: the pagemap writes back through it, so META_BOUNDARY_BIT survives every metadata mutation without explicit preservation by callers. - Correct the BackendStateWordRef single-pointer ctor comment: it is required by RBRepMethods for sentinel construction from &Rep::root, not a legacy convenience. - BackendArena: drop spurious const on contains_min and check_invariant (no const callers exist), removing the const_cast laundering. - BackendArena::check_invariant: lift the five clause titles into the docblock; trim the inline labels to single-line markers. - BackendArena::add_block: drop cross-file line-number reference to buddy.h. - backend_arena_range.h / backend_arena_bins.h: replace SNMALLOC_CHECK(false && "msg") with SNMALLOC_CHECK_MSG. 
- backend_arena_range.h: rename `auto refill` to `refill_range` to avoid shadowing the enclosing function. - Tests: use "test/..." quoted include style for consistency. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- src/snmalloc/backend_helpers/backend_arena.h | 75 +++++++++++-------- .../backend_helpers/backend_arena_bins.h | 4 +- .../backend_helpers/backend_arena_range.h | 8 +- src/snmalloc/mem/metadata.h | 20 +++-- .../func/aligned_dealloc/aligned_dealloc.cc | 2 +- .../large_offset_frontend.cc | 2 +- 6 files changed, 64 insertions(+), 47 deletions(-) diff --git a/src/snmalloc/backend_helpers/backend_arena.h b/src/snmalloc/backend_helpers/backend_arena.h index 4ad42d6f0..5b1168df9 100644 --- a/src/snmalloc/backend_helpers/backend_arena.h +++ b/src/snmalloc/backend_helpers/backend_arena.h @@ -125,11 +125,10 @@ namespace snmalloc return {0, 0}; } - bool contains_min(uintptr_t a) const + bool contains_min(uintptr_t a) { - auto& self = const_cast(*this); - auto path = self.bin_trees[0].get_root_path(); - return self.bin_trees[0].find(path, a) && + auto path = bin_trees[0].get_root_path(); + return bin_trees[0].find(path, a) && Rep::get_variant(a) == BackendArenaVariant::Min; } @@ -213,8 +212,7 @@ namespace snmalloc // only known to exist after a tree lookup confirms succ_addr is in // our region — succ_addr can be one past the registered range when // the input block ends at the high edge of the arena. Order the - // checks so the tree check gates the pagemap read (matching the - // pattern in buddy.h:90-93). + // checks so the tree check gates the pagemap read. auto [sa, ss] = range_from_addr(s_key); uintptr_t succ_addr = addr + size; if (sa == succ_addr && Rep::can_consolidate(succ_addr)) @@ -290,25 +288,39 @@ namespace snmalloc } /** - * Five-clause structural invariant. Runs when `enabled` is true; - * defaults to `Debug` so in-tree callers compile away in Release - * while tests can opt in by passing `true` explicitly. 
Uses - * `SNMALLOC_CHECK` rather than `SNMALLOC_ASSERT` so that - * test-driven invocations are checked even under NDEBUG. + * Structural invariant. Runs when `enabled` is true; defaults to + * `Debug` so in-tree callers compile away in Release while tests + * can opt in by passing `true` explicitly. Uses `SNMALLOC_CHECK` + * rather than `SNMALLOC_ASSERT` so that test-driven invocations + * are checked even under NDEBUG. + * + * Five clauses are verified: + * 1. Maximally consolidated — no adjacent free blocks could be + * merged: (a) no two non-min range-tree entries touch across + * a consolidatable boundary, (b) no non-min entry touches a + * min entry, (c) no two min entries are adjacent. + * 2. Cross-tree consistency — every range-tree entry appears in + * exactly one bin tree, and every non-min bin-tree entry + * appears in the range tree. + * 3. Bin classification — every bin-tree entry sits in the bin + * its size selects. + * 4. Bitmap consistency — the non-empty bin bit is set iff the + * corresponding bin tree has entries. + * 5. Variant-tag consistency — each entry's pagemap variant tag + * matches the tag implied by its address and size, and Large + * variant entries carry the correct stored size. */ - void check_invariant(bool enabled = Debug) const + void check_invariant(bool enabled = Debug) { if (!enabled) return; - auto& self = const_cast(*this); - // Clause 1: Maximally consolidated. // 1a. No two adjacent non-min blocks (unless boundary prevents merge). { uintptr_t prev_addr = 0; size_t prev_size = 0; bool prev_valid = false; - self.range_tree.for_each([&](uintptr_t node) { + range_tree.for_each([&](uintptr_t node) { auto [a, s] = range_from_addr(node); if (prev_valid) { @@ -322,7 +334,7 @@ namespace snmalloc } // 1b. No non-min block adjacent to a min block (unless boundary). 
- self.range_tree.for_each([&](uintptr_t node) { + range_tree.for_each([&](uintptr_t node) { auto [a, s] = range_from_addr(node); if (a >= UNIT_SIZE) SNMALLOC_CHECK( @@ -335,7 +347,7 @@ namespace snmalloc { uintptr_t prev = 0; bool prev_valid = false; - self.bin_trees[0].for_each([&](uintptr_t node) { + bin_trees[0].for_each([&](uintptr_t node) { if (Rep::get_variant(node) != BackendArenaVariant::Min) return; if (prev_valid) @@ -346,43 +358,40 @@ namespace snmalloc }); } - // Clause 2: Cross-tree consistency. - // Every non-min bin-tree entry must be in the range tree; - // every range-tree entry must be in exactly one bin tree. + // 2. Cross-tree consistency. { size_t range_tree_count = 0; size_t bin_tree_nonmin_count = 0; for (size_t bin = 0; bin < Bins::Bitmap::TOTAL_BINS; bin++) { - self.bin_trees[bin].for_each([&](uintptr_t node) { + bin_trees[bin].for_each([&](uintptr_t node) { auto [a, s] = range_from_addr(node); if (s >= TWO_UNITS) { - auto path = self.range_tree.get_root_path(); - SNMALLOC_CHECK(self.range_tree.find(path, node)); + auto path = range_tree.get_root_path(); + SNMALLOC_CHECK(range_tree.find(path, node)); bin_tree_nonmin_count++; } }); } - // Reverse: every range-tree entry must be in its expected bin tree. - self.range_tree.for_each([&](uintptr_t node) { + range_tree.for_each([&](uintptr_t node) { range_tree_count++; auto [a, s] = range_from_addr(node); auto range = typename Bins::range_t{a, s}; size_t expected_bin = Bins::bin_index(range); - auto path = self.bin_trees[expected_bin].get_root_path(); - SNMALLOC_CHECK(self.bin_trees[expected_bin].find(path, node)); + auto path = bin_trees[expected_bin].get_root_path(); + SNMALLOC_CHECK(bin_trees[expected_bin].find(path, node)); }); SNMALLOC_CHECK(bin_tree_nonmin_count == range_tree_count); } - // Clause 3: Bin classification correctness. + // 3. Bin classification correctness. 
for (size_t bin = 0; bin < Bins::Bitmap::TOTAL_BINS; bin++) { - self.bin_trees[bin].for_each([&](uintptr_t node) { + bin_trees[bin].for_each([&](uintptr_t node) { auto [a, s] = range_from_addr(node); auto range = typename Bins::range_t{a, s}; size_t expected_bin = Bins::bin_index(range); @@ -390,18 +399,18 @@ namespace snmalloc }); } - // Clause 4: Bitmap consistency. + // 4. Bitmap consistency. for (size_t bin = 0; bin < Bins::Bitmap::TOTAL_BINS; bin++) { - bool has_entries = !self.bin_trees[bin].is_empty(); + bool has_entries = !bin_trees[bin].is_empty(); bool bit_set = bitmap.test(bin); SNMALLOC_CHECK(has_entries == bit_set); } - // Clause 5: Variant-tag consistency. + // 5. Variant-tag consistency. for (size_t bin = 0; bin < Bins::Bitmap::TOTAL_BINS; bin++) { - self.bin_trees[bin].for_each([&](uintptr_t node) { + bin_trees[bin].for_each([&](uintptr_t node) { auto v = Rep::get_variant(node); auto [a, s] = range_from_addr(node); SNMALLOC_CHECK(v == variant_of(s, a)); diff --git a/src/snmalloc/backend_helpers/backend_arena_bins.h b/src/snmalloc/backend_helpers/backend_arena_bins.h index 0d045983e..ae8e62d3e 100644 --- a/src/snmalloc/backend_helpers/backend_arena_bins.h +++ b/src/snmalloc/backend_helpers/backend_arena_bins.h @@ -806,8 +806,8 @@ namespace snmalloc // evaluation non-constant and surfaces the violation as // a compile error. 
if (discrim_set == 0) - SNMALLOC_CHECK( - false && "bin_subsets violates strict-chain invariant"); + SNMALLOC_CHECK_MSG( + false, "bin_subsets violates strict-chain invariant"); cascade_steps[m_top][i].m_test = bits::ctz_const(discrim_set); cascade_steps[m_top][i].bin = b; } diff --git a/src/snmalloc/backend_helpers/backend_arena_range.h b/src/snmalloc/backend_helpers/backend_arena_range.h index 6c84faea9..89e6f2c60 100644 --- a/src/snmalloc/backend_helpers/backend_arena_range.h +++ b/src/snmalloc/backend_helpers/backend_arena_range.h @@ -255,7 +255,7 @@ namespace snmalloc } else { - SNMALLOC_CHECK(false && "Global range overflow should not happen"); + SNMALLOC_CHECK_MSG(false, "Global range overflow should not happen"); } } @@ -302,12 +302,12 @@ namespace snmalloc auto refill_size = bits::max(needed_size, REFILL_SIZE); while (needed_size <= refill_size) { - auto refill = parent.alloc_range(refill_size); + auto refill_range = parent.alloc_range(refill_size); - if (refill != nullptr) + if (refill_range != nullptr) { requested_total += refill_size; - add_range(refill, refill_size); + add_range(refill_range, refill_size); SNMALLOC_ASSERT(refill_size < bits::one_at_bit(MAX_SIZE_BITS)); static_assert( diff --git a/src/snmalloc/mem/metadata.h b/src/snmalloc/mem/metadata.h index 423bd8772..7992f5ba7 100644 --- a/src/snmalloc/mem/metadata.h +++ b/src/snmalloc/mem/metadata.h @@ -168,10 +168,15 @@ namespace snmalloc /** * Explicit assignment operator, copies the data preserving the boundary bit * in the target if it is set. + * + * Load-bearing: the pagemap writes back through this operator (its + * `set(p, t)` is `body[p >> SHIFT] = t`), so the boundary bit set + * once at OS-range registration survives every subsequent metadata + * mutation — including chunk reuse via `dealloc_chunk` — without + * any consolidation path having to touch it explicitly. 
*/ MetaEntryBase& operator=(const MetaEntryBase& other) { - // Don't overwrite the boundary bit with the other's meta = (other.meta & ~META_BOUNDARY_BIT) | address_cast(meta & META_BOUNDARY_BIT); remote_and_sizeclass = other.remote_and_sizeclass; @@ -329,11 +334,14 @@ namespace snmalloc {} /** - * Single-pointer constructor for sentinel storage that the back - * end never writes through (e.g. red-black tree concept-check - * null/root nodes — see `largebuddyrange.h`). Reserved mask is - * 0, so the `operator=` assertion is vacuous; safety relies on - * the sentinels being `static const`, making any write UB. + * Single-pointer constructor required by the `RBRepMethods` + * concept, which constructs a Handle from `&Rep::root` to + * verify sentinel constructibility (see + * `ds_core/redblacktree.h`). Reserved mask is zero, which is + * safe because `Rep::root` is a `static const` sentinel that + * the red-black tree never assigns through — any write would + * trap on the const data — and on read the underlying value is + * zero so `get()` returns zero regardless of the mask. 
*/ constexpr BackendStateWordRef(uintptr_t* v) : val(v) {} diff --git a/src/test/func/aligned_dealloc/aligned_dealloc.cc b/src/test/func/aligned_dealloc/aligned_dealloc.cc index 0ce8f5d92..6b1deb80c 100644 --- a/src/test/func/aligned_dealloc/aligned_dealloc.cc +++ b/src/test/func/aligned_dealloc/aligned_dealloc.cc @@ -21,7 +21,7 @@ #include "test/setup.h" #include -#include +#include "test/snmalloc_testlib.h" using namespace snmalloc; diff --git a/src/test/func/large_offset_frontend/large_offset_frontend.cc b/src/test/func/large_offset_frontend/large_offset_frontend.cc index 207abb4d7..f45d6f7e7 100644 --- a/src/test/func/large_offset_frontend/large_offset_frontend.cc +++ b/src/test/func/large_offset_frontend/large_offset_frontend.cc @@ -30,7 +30,7 @@ #include #include -#include +#include "test/setup.h" #ifdef assert # undef assert From 26eed8a69d4e467fb7152702ce049700aa8c2049 Mon Sep 17 00:00:00 2001 From: Matthew Parkinson Date: Mon, 8 Jun 2026 21:12:31 +0100 Subject: [PATCH 22/31] Add non-pow2 metadata sub-allocator: InplaceRep + SmallArenaRange MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Introduces the building blocks for Phase A of the SmallBuddyRange -> SmallArenaRange migration. Nothing is wired into the production pipeline yet (the existing SmallBuddyRange remains the LocalMetaRange) — this commit only adds the new components and their gate test. * InplaceRep: in-band red-black-tree node Rep for BackendArena that stores the tree pointers inside the free block itself. Supports CHERI provenance via the Authmap mechanism (the same write-once cap table used by dealloc_meta_data); node accesses go through Authmap::amplify_from_address. can_consolidate refuses merging across MIN_CHUNK_SIZE boundaries to keep BackendArena's MAX_SIZE_BITS == MIN_CHUNK_BITS invariant intact. * SmallArenaRange::Type: a wrapper around BackendArena, MIN_BITS, MIN_CHUNK_BITS> presenting the standard Range interface. 
Serves arbitrarily-unit-aligned sizes (not just powers of two). Replaces the historical alloc_range_with_leftover with alloc_size_with_align(size, align), which makes alignment an explicit parameter and donates the unit-aligned tail back to the arena. * amplify_from_address(address_t) on DummyAuthmap (pass-through reinterpret_cast) and BasicAuthmap (lookup + pointer_offset). Lets InplaceRep recover an arena cap for an address it knows only as an integer. * New test target backend_arena_inplace covering the rep accessor round-trips, arena add/remove/consolidation/carve, a 30-seed x 500-op stress, the can_consolidate chunk-boundary refusal, and four alloc_size_with_align scenarios (exact fit, pow2 align over non-pow2 size, align larger than size, MIN_CHUNK_SIZE bypass). Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- CMakeLists.txt | 1 + src/snmalloc/backend_helpers/authmap.h | 30 + .../backend_helpers/backend_helpers.h | 1 + src/snmalloc/backend_helpers/inplacerep.h | 279 +++++++ .../backend_helpers/smallarenarange.h | 166 ++++ .../backend_arena_inplace.cc | 752 ++++++++++++++++++ 6 files changed, 1229 insertions(+) create mode 100644 src/snmalloc/backend_helpers/inplacerep.h create mode 100644 src/snmalloc/backend_helpers/smallarenarange.h create mode 100644 src/test/func/backend_arena_inplace/backend_arena_inplace.cc diff --git a/CMakeLists.txt b/CMakeLists.txt index d9c01b0ab..f9e9b3aab 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -551,6 +551,7 @@ if(NOT SNMALLOC_HEADER_ONLY_LIBRARY) aligned_dealloc backend_arena backend_arena_bins + backend_arena_inplace backend_arena_range bits first_operation memory memory_usage multi_atexit multi_threadatexit redblack statistics teardown diff --git a/src/snmalloc/backend_helpers/authmap.h b/src/snmalloc/backend_helpers/authmap.h index e2a00085b..c0ad74258 100644 --- a/src/snmalloc/backend_helpers/authmap.h +++ b/src/snmalloc/backend_helpers/authmap.h @@ -23,6 +23,19 @@ namespace snmalloc { 
return capptr::Arena::unsafe_from(c.unsafe_ptr()); } + + /** + * Address-keyed sibling of `amplify`: returns a capability with + * address `a` and (on real capability hardware) the registered + * arena's permissions. The non-StrictProvenance pass-through + * variant simply fabricates a pointer at `a`. + */ + template + static SNMALLOC_FAST_PATH capptr::Arena + amplify_from_address(address_t a) + { + return capptr::Arena::unsafe_from(reinterpret_cast(a)); + } }; /** @@ -67,6 +80,23 @@ namespace snmalloc concreteAuthmap.template get(address_cast(c)), c); } + + /** + * Address-keyed sibling of `amplify`: returns a capability at + * address `a` with the registered arena's permissions, suitable + * for cases where the caller holds only an integer address (for + * example, in-band tree-node access in `InplaceRep`). The + * authmap is set once per arena registration and never mutated + * thereafter, so this lookup is safe under concurrent allocator + * activity. + */ + template + static SNMALLOC_FAST_PATH capptr::Arena + amplify_from_address(address_t a) + { + auto arena = concreteAuthmap.template get(a); + return pointer_offset(arena, a - address_cast(arena)); + } }; /** diff --git a/src/snmalloc/backend_helpers/backend_helpers.h b/src/snmalloc/backend_helpers/backend_helpers.h index 10382c611..10740b8ce 100644 --- a/src/snmalloc/backend_helpers/backend_helpers.h +++ b/src/snmalloc/backend_helpers/backend_helpers.h @@ -17,6 +17,7 @@ #include "pagemapregisterrange.h" #include "palrange.h" #include "range_helpers.h" +#include "smallarenarange.h" #include "smallbuddyrange.h" #include "staticconditionalrange.h" #include "statsrange.h" diff --git a/src/snmalloc/backend_helpers/inplacerep.h b/src/snmalloc/backend_helpers/inplacerep.h new file mode 100644 index 000000000..95275fb72 --- /dev/null +++ b/src/snmalloc/backend_helpers/inplacerep.h @@ -0,0 +1,279 @@ +#pragma once + +#include "../ds_core/bits.h" +#include "../ds_core/defines.h" +#include "../ds_core/sizeclassconfig.h" 
+#include "backend_arena.h" + +#include + +namespace snmalloc +{ + /** + * In-band tree node stored at the head of a free block managed by + * `BackendArena`. Two pointer-sized words per unit; bit-packing of + * red and variant tags lives in `word_one`. Stored as `uintptr_t` + * so we can OR meta bits into the pointer slot without UB on + * non-capability platforms (on CHERI, capabilities to access these + * words are re-derived from the `Authmap` — see `InplaceRep`). + */ + template + struct InplaceNode + { + uintptr_t word_one; + uintptr_t word_two; + }; + + /** + * In-band `Rep` for `BackendArena`. Each free block carries its + * own tree-node and metadata storage in its first few units: + * + * Unit 0 (addr): bin-tree node + variant tag. + * Unit 1 (addr + UNIT_SIZE): range-tree node (size >= 2 units). + * Unit 2 (addr + 2*UNIT_SIZE): large-size word (size >= 3 units). + * + * Bit layout in `word_one` of each unit: + * bit 0 : red bit (both trees) + * bits 1..2 : variant tag (`BackendArenaVariant`, unit 0 only) + * `word_two` holds the second child pointer with no packed meta. + * Both child pointers are unit-aligned, so their low `MIN_BITS` + * bits are zero — the packed meta occupies bits below + * `1 << MIN_BITS` and never collides with a stored pointer value. + * + * `MIN_BITS = next_pow2_bits_const(sizeof(InplaceNode))`: the + * smallest free block must hold one tree node, so the unit IS the + * node footprint rounded up. + * + * CHERI: in-band storage is accessed via + * `Authmap::amplify_from_address(addr)`, which returns a + * capability at `addr` with the registered arena's permissions. + * The authmap is set once per arena registration and never + * mutated, so this lookup carries no concurrency hazard. On + * non-CHERI platforms the authmap is the pass-through + * `DummyAuthmap` and the cap collapses to a raw pointer. 
+ */ + template + class InplaceRep + { + public: + static constexpr size_t MIN_BITS = + bits::next_pow2_bits_const(sizeof(InplaceNode)); + static constexpr size_t UNIT_SIZE = size_t(1) << MIN_BITS; + + // 3 meta bits (variant 2 + red 1) packed below the unit + // alignment boundary. Block addresses are UNIT_SIZE-aligned, so + // a value v with `(v & (UNIT_SIZE - 1)) == 0` writes the + // pointer cleanly without touching meta. + static_assert(MIN_BITS >= 3, "Need 3 low bits for red+variant packing"); + static_assert(MIN_BITS < MIN_CHUNK_BITS, "Arena needs a non-trivial range"); + static_assert( + MIN_ALLOC_SIZE >= (size_t(1) << MIN_BITS), + "Front-end minimum allocation must be >= in-band unit size; " + "otherwise a free block cannot hold the tree node."); + + static constexpr uintptr_t RED_BIT = 1; + static constexpr unsigned VARIANT_SHIFT = 1; + static constexpr unsigned VARIANT_BITS = 2; + static constexpr uintptr_t VARIANT_MASK = + ((uintptr_t(1) << VARIANT_BITS) - 1) << VARIANT_SHIFT; + static constexpr uintptr_t BIN_META_MASK = RED_BIT | VARIANT_MASK; + static constexpr uintptr_t RANGE_META_MASK = RED_BIT; + + static_assert(BIN_META_MASK < UNIT_SIZE); + + /** + * Wraps a `uintptr_t*` storage slot plus the meta-bit mask that + * this slot owns. `get()` returns the slot value with meta bits + * cleared; assignment preserves them. Mirrors the role of + * `BackendStateWordRef` but with an inline mask field (we own + * the only mask here, unlike `BackendStateWordRef` which layers + * on top of the frontend-reserved mask). + */ + class Handle + { + uintptr_t* val{nullptr}; + uintptr_t mask{0}; + + public: + constexpr Handle() = default; + + constexpr Handle(uintptr_t* v, uintptr_t m) : val(v), mask(m) {} + + /** + * Single-pointer constructor required by the `RBRepMethods` + * concept (`ds_core/redblacktree.h:64-67`) for sentinel + * construction from `&Rep::root`. The tree's root field + * carries no meta bits, so mask defaults to zero. 
+ */ + constexpr Handle(uintptr_t* v) : val(v) {} + + [[nodiscard]] uintptr_t get() const + { + return *val & ~mask; + } + + Handle& operator=(uintptr_t v) + { + SNMALLOC_ASSERT((v & mask) == 0); + *val = v | (*val & mask); + return *this; + } + + bool operator!=(const Handle& other) const + { + return val != other.val; + } + + uintptr_t printable_address() const + { + return reinterpret_cast(val); + } + }; + + private: + template + static InplaceNode* unit_at(uintptr_t addr) + { + auto cap = Authmap::amplify_from_address(addr + UnitIdx * UNIT_SIZE); + return static_cast*>(cap.unsafe_ptr()); + } + + /** + * Tree rep shared by `BinRep` and `RangeRep`. `UnitIdx` is the + * block-relative unit (0 or 1) that holds this rep's node; + * `MetaMask` covers the bits in that unit's `word_one` owned + * by this rep (red + variant for `BinRep`, red only for + * `RangeRep`) and is preserved across `set`. + * + * Convention (mirrors `PagemapRep`): direction `true` selects + * `word_one` (the meta-bearing word); direction `false` + * selects `word_two`. + */ + template + struct TreeRep + { + using Handle = InplaceRep::Handle; + using Contents = uintptr_t; + + static constexpr Contents null = 0; + static constexpr Contents root = 0; + + static Handle ref(bool direction, Contents k) + { + // Sentinel handle for the null key, mirroring + // `PagemapRep::TreeRep::ref`. Reads return 0; writes are + // disallowed by the tree's algorithm, but the slot is still + // backed by real storage in case of accidental writes during + // debugging. + static uintptr_t null_entry = 0; + if (SNMALLOC_UNLIKELY(k == 0)) + return Handle{&null_entry, 0}; + auto* node = unit_at(k); + return direction ?
Handle{&node->word_one, MetaMask} : + Handle{&node->word_two, 0}; + } + + static Contents get(Handle h) + { + return h.get(); + } + + static void set(Handle h, Contents v) + { + h = v; + } + + static bool is_red(Contents k) + { + if (k == 0) + return false; + return (unit_at(k)->word_one & RED_BIT) != 0; + } + + static void set_red(Contents k, bool new_is_red) + { + auto* w = &unit_at(k)->word_one; + if (((*w & RED_BIT) != 0) != new_is_red) + *w ^= RED_BIT; + SNMALLOC_ASSERT(is_red(k) == new_is_red); + } + + static bool compare(Contents k1, Contents k2) + { + return k1 > k2; + } + + static bool equal(Contents k1, Contents k2) + { + return k1 == k2; + } + + static uintptr_t printable(Contents k) + { + return k; + } + + static uintptr_t printable(Handle h) + { + return h.printable_address(); + } + + static const char* name() + { + return Name; + } + }; + + static constexpr char BIN_REP_NAME[] = "InplaceBinRep"; + static constexpr char RANGE_REP_NAME[] = "InplaceRangeRep"; + + public: + using BinRep = TreeRep<0, BIN_META_MASK, BIN_REP_NAME>; + using RangeRep = TreeRep<1, RANGE_META_MASK, RANGE_REP_NAME>; + + static BackendArenaVariant get_variant(uintptr_t addr) + { + auto w = unit_at<0>(addr)->word_one; + return static_cast( + (w & VARIANT_MASK) >> VARIANT_SHIFT); + } + + static void set_variant(uintptr_t addr, BackendArenaVariant v) + { + auto* w = &unit_at<0>(addr)->word_one; + *w = (*w & ~VARIANT_MASK) | (static_cast(v) << VARIANT_SHIFT); + } + + /** + * Exact byte size for `Large` blocks. Stored as a plain + * `uintptr_t` in unit 2's `word_one`; unlike `PagemapRep` we + * do not need to compress (the pagemap word has reserved low + * bits but our in-band word has the full width). 
+ */ + static size_t get_large_size(uintptr_t addr) + { + return static_cast(unit_at<2>(addr)->word_one); + } + + static void set_large_size(uintptr_t addr, size_t size) + { + SNMALLOC_ASSERT((size & (UNIT_SIZE - 1)) == 0); + unit_at<2>(addr)->word_one = static_cast(size); + } + + /** + * Refuse consolidation across `MIN_CHUNK_SIZE` boundaries. + * `SmallArenaRange::add_range_impl` splits incoming ranges at + * chunk boundaries, but does not eagerly merge across them on + * the wrapper side; this check is what stops `BackendArena` + * from later merging two adjacent intra-chunk fragments that + * happen to abut the same chunk boundary, which would create a + * free block straddling chunks. Chunk-aligned `higher_addr` + * means the lower neighbour ends at a chunk boundary — refuse. + */ + static bool can_consolidate(uintptr_t higher_addr) + { + return (higher_addr & (MIN_CHUNK_SIZE - 1)) != 0; + } + }; +} // namespace snmalloc diff --git a/src/snmalloc/backend_helpers/smallarenarange.h b/src/snmalloc/backend_helpers/smallarenarange.h new file mode 100644 index 000000000..5253c3af5 --- /dev/null +++ b/src/snmalloc/backend_helpers/smallarenarange.h @@ -0,0 +1,166 @@ +#pragma once + +#include "../pal/pal.h" +#include "backend_arena.h" +#include "empty_range.h" +#include "inplacerep.h" +#include "range_helpers.h" + +namespace snmalloc +{ + /** + * Small-grained range backed by `BackendArena` with in-band + * (`InplaceRep`) tree-node storage. Serves blocks of any + * unit-aligned size — not restricted to powers of two — for + * `SlabMetadata` allocations. + * + * Each arena instance covers exactly one chunk + * (`MAX_SIZE_BITS = MIN_CHUNK_BITS`): refill takes one chunk + * from the parent, sub-chunk fragments live in the arena, + * consolidated whole chunks flow back to the parent. 
+ */ + template + struct SmallArenaRange + { + template> + class Type : public ContainsParent + { + public: + using ChunkBounds = typename ParentRange::ChunkBounds; + + private: + using ContainsParent::parent; + + using RepT = InplaceRep; + static constexpr size_t MIN_BITS = RepT::MIN_BITS; + + BackendArena arena; + + public: + static constexpr size_t UNIT_SIZE = RepT::UNIT_SIZE; + + private: + /** + * Split `[base, base+length)` at chunk boundaries. + * Intra-chunk fragments are unit-trimmed and submitted to + * the arena; segments that begin and end chunk-aligned go + * to the parent. Accepts arbitrary unaligned input — + * `dealloc_meta_data` forwards `make()`'s unaligned spare + * here; sub-unit edges are discarded by design. + */ + void add_range_impl(CapPtr base, size_t length) + { + uintptr_t lo = base.unsafe_uintptr(); + uintptr_t hi = lo + length; + + while (lo < hi) + { + uintptr_t chunk_end = bits::align_up(lo + 1, MIN_CHUNK_SIZE); + uintptr_t seg_end = bits::min(hi, chunk_end); + + if ( + lo == bits::align_down(lo, MIN_CHUNK_SIZE) && seg_end == chunk_end) + { + auto chunk_base = CapPtr::unsafe_from( + reinterpret_cast(lo)); + parent.dealloc_range(chunk_base, MIN_CHUNK_SIZE); + } + else + { + uintptr_t f_lo = bits::align_up(lo, UNIT_SIZE); + uintptr_t f_hi = bits::align_down(seg_end, UNIT_SIZE); + if (f_lo < f_hi) + { + auto [ov_a, ov_s] = arena.add_block(f_lo, f_hi - f_lo); + if (ov_a != 0) + { + // Arena consolidated up to MAX_SIZE_BITS = chunk: + // hand the whole-chunk piece back to the parent. 
+ auto ov_base = CapPtr::unsafe_from( + reinterpret_cast(ov_a)); + parent.dealloc_range(ov_base, ov_s); + } + } + } + + lo = seg_end; + } + } + + CapPtr refill(size_t size) + { + auto refill_range = parent.alloc_range(MIN_CHUNK_SIZE); + if (refill_range == nullptr) + return nullptr; + + add_range_impl( + pointer_offset(refill_range, size), MIN_CHUNK_SIZE - size); + + return refill_range; + } + + public: + static constexpr bool Aligned = true; + static_assert(ParentRange::Aligned, "ParentRange must be aligned"); + + static constexpr bool ConcurrencySafe = false; + + constexpr Type() = default; + + CapPtr alloc_range(size_t size) + { + SNMALLOC_ASSERT((size & (UNIT_SIZE - 1)) == 0); + + if (size >= MIN_CHUNK_SIZE) + return parent.alloc_range(size); + + uintptr_t a = arena.remove_block(size); + if (a != 0) + return CapPtr::unsafe_from( + reinterpret_cast(a)); + + return refill(size); + } + + /** + * Allocate `align`-aligned space large enough for `size`, + * donating the unit-aligned tail back to the arena. + * + * Requests `requested = align_up(size, align)` bytes; because + * `align` is pow2 and `requested` is a multiple of `align`, + * `BackendArena`'s carve returns an `align`-aligned base + * without a caller-side over-allocate-and-trim. The tail + * `[align_up(size, UNIT_SIZE), requested)` is donated via + * `add_range_impl`. The sub-unit slice + * `[size, align_up(size, UNIT_SIZE))` cannot be represented + * and is leaked — pre-round `size` to `UNIT_SIZE` to avoid it. 
+ */ + CapPtr alloc_size_with_align(size_t size, size_t align) + { + SNMALLOC_ASSERT(size > 0); + SNMALLOC_ASSERT(bits::is_pow2(align)); + SNMALLOC_ASSERT(align >= UNIT_SIZE); + SNMALLOC_ASSERT(align <= MIN_CHUNK_SIZE); + + size_t requested = bits::align_up(size, align); + auto p = alloc_range(requested); + if (p == nullptr) + return nullptr; + + size_t used = bits::align_up(size, UNIT_SIZE); + if (used < requested) + { + add_range_impl(pointer_offset(p, used), requested - used); + } + + return p; + } + + // No precondition on `size`: sub-unit edges discarded. + void dealloc_range(CapPtr base, size_t size) + { + add_range_impl(base, size); + } + }; + }; +} // namespace snmalloc diff --git a/src/test/func/backend_arena_inplace/backend_arena_inplace.cc b/src/test/func/backend_arena_inplace/backend_arena_inplace.cc new file mode 100644 index 000000000..c7ee62723 --- /dev/null +++ b/src/test/func/backend_arena_inplace/backend_arena_inplace.cc @@ -0,0 +1,752 @@ +/** + * Unit tests for `InplaceRep` exercised through `BackendArena`. + * + * Distinct from the `backend_arena` test (which uses an array-backed + * MockRep): here the Rep is the production in-band representation, + * and each free block's tree-node storage lives at the block's own + * head bytes. The test allocates a single chunk-aligned backing + * buffer and treats addresses within it as block bases. + */ + +#include "test/setup.h" +#include "test/snmalloc_testlib.h" +#include "test/xoroshiro.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace snmalloc +{ + using Rep = InplaceRep; + static constexpr size_t UNIT_SIZE = Rep::UNIT_SIZE; + static constexpr size_t MIN_BITS = Rep::MIN_BITS; + + // Arena spans one chunk's worth of space (max block size = + // MIN_CHUNK_SIZE - UNIT_SIZE, since the arena's MAX is exclusive). 
+ static constexpr size_t MAX_SIZE_BITS = MIN_CHUNK_BITS; + using Arena = BackendArena; + + // Backing buffer: must be UNIT_SIZE-aligned so block bases are + // unit-aligned and the in-band node fields land at the expected + // offsets. Sized to comfortably cover the arena's full range plus + // a small base offset that keeps block addresses non-zero (zero + // is the tree null sentinel). + alignas(MIN_CHUNK_SIZE) static unsigned char backing[2 * MIN_CHUNK_SIZE]; + + static uintptr_t base_addr() + { + // Offset by MIN_CHUNK_SIZE to keep addresses well clear of zero. + return reinterpret_cast(&backing[MIN_CHUNK_SIZE]); + } + + static void reset_backing() + { + for (size_t i = 0; i < sizeof(backing); i++) + backing[i] = 0; + } + + static uintptr_t unit_addr(size_t unit_idx) + { + return base_addr() + unit_idx * UNIT_SIZE; + } + + static constexpr size_t unit_size(size_t n_units) + { + return n_units * UNIT_SIZE; + } + + // ================================================================== + // (A) Round-trip: variant tag and large-size storage survive + // independent of bin/range pointer writes. + // ================================================================== + + static void test_variant_roundtrip() + { + reset_backing(); + uintptr_t a = unit_addr(0); + + for (auto v : + {BackendArenaVariant::Min, + BackendArenaVariant::EvenTwo, + BackendArenaVariant::OddTwo, + BackendArenaVariant::Large}) + { + Rep::set_variant(a, v); + SNMALLOC_CHECK(Rep::get_variant(a) == v); + } + + // Variant tag must not interfere with the red bit at bit 0. 
+ Rep::set_variant(a, BackendArenaVariant::OddTwo); + Rep::BinRep::set_red(a, true); + SNMALLOC_CHECK(Rep::BinRep::is_red(a)); + SNMALLOC_CHECK(Rep::get_variant(a) == BackendArenaVariant::OddTwo); + + Rep::BinRep::set_red(a, false); + SNMALLOC_CHECK(!Rep::BinRep::is_red(a)); + SNMALLOC_CHECK(Rep::get_variant(a) == BackendArenaVariant::OddTwo); + + printf(" Variant + red roundtrip: OK\n"); + } + + static void test_large_size_roundtrip() + { + reset_backing(); + uintptr_t a = unit_addr(0); + + for (size_t s : {unit_size(3), unit_size(7), unit_size(17), unit_size(125)}) + { + Rep::set_large_size(a, s); + SNMALLOC_CHECK(Rep::get_large_size(a) == s); + } + + printf(" Large-size roundtrip: OK\n"); + } + + // ================================================================== + // (B) Bin-tree and range-tree red bits live in different units and + // must not alias. + // ================================================================== + + static void test_red_bits_independent() + { + reset_backing(); + uintptr_t a = unit_addr(0); + + Rep::BinRep::set_red(a, true); + Rep::RangeRep::set_red(a, false); + SNMALLOC_CHECK(Rep::BinRep::is_red(a)); + SNMALLOC_CHECK(!Rep::RangeRep::is_red(a)); + + Rep::BinRep::set_red(a, false); + Rep::RangeRep::set_red(a, true); + SNMALLOC_CHECK(!Rep::BinRep::is_red(a)); + SNMALLOC_CHECK(Rep::RangeRep::is_red(a)); + + printf(" Bin/range red bits independent: OK\n"); + } + + // ================================================================== + // (B2) `can_consolidate` refuses chunk-boundary merges. + // SmallArenaRange splits incoming ranges at chunk boundaries, but + // adjacent intra-chunk fragments meeting at a boundary would + // otherwise be merged by BackendArena. The predicate is what + // prevents that. + // ================================================================== + + static void test_can_consolidate_chunk_boundary() + { + // Chunk-aligned higher_addr means the lower neighbour ends at + // a chunk boundary — refuse. 
+ SNMALLOC_CHECK(!Rep::can_consolidate(MIN_CHUNK_SIZE)); + SNMALLOC_CHECK(!Rep::can_consolidate(2 * MIN_CHUNK_SIZE)); + // Non-chunk-aligned higher_addr is fine to merge. + SNMALLOC_CHECK(Rep::can_consolidate(MIN_CHUNK_SIZE + UNIT_SIZE)); + SNMALLOC_CHECK(Rep::can_consolidate(MIN_CHUNK_SIZE - UNIT_SIZE)); + SNMALLOC_CHECK(Rep::can_consolidate(UNIT_SIZE)); + + printf(" can_consolidate chunk-boundary refuse: OK\n"); + } + + // ================================================================== + // (C) Through the arena: add a single block and remove it. + // ================================================================== + + static void test_arena_add_remove_single() + { + reset_backing(); + Arena arena; + arena.check_invariant(true); + + auto a = unit_addr(0); + auto [ov_a, ov_s] = arena.add_block(a, unit_size(4)); + SNMALLOC_CHECK(ov_a == 0 && ov_s == 0); + arena.check_invariant(true); + + auto got = arena.remove_block(unit_size(4)); + SNMALLOC_CHECK(got == a); + arena.check_invariant(true); + + printf(" Arena add/remove single: OK\n"); + } + + // ================================================================== + // (D) Consolidation across two adjacent free blocks. + // ================================================================== + + static void test_arena_consolidation() + { + reset_backing(); + Arena arena; + + auto a = unit_addr(0); + auto b = unit_addr(4); + arena.add_block(a, unit_size(4)); + arena.check_invariant(true); + auto [ov_a, ov_s] = arena.add_block(b, unit_size(4)); + SNMALLOC_CHECK(ov_a == 0 && ov_s == 0); + arena.check_invariant(true); + + // A single 8-unit block should now be removable from the + // consolidated region. + auto got = arena.remove_block(unit_size(8)); + SNMALLOC_CHECK(got == a); + arena.check_invariant(true); + + printf(" Arena consolidation: OK\n"); + } + + // ================================================================== + // (E) Carving: request a smaller size than the free block has. 
+ // ================================================================== + + static void test_arena_carve() + { + reset_backing(); + Arena arena; + + auto a = unit_addr(0); + arena.add_block(a, unit_size(8)); + arena.check_invariant(true); + + auto got = arena.remove_block(unit_size(3)); + SNMALLOC_CHECK(got != 0); + arena.check_invariant(true); + + // The remainder is still available; total removed should sum to + // 8 units across this and subsequent removes. + size_t total_removed = 3; + while (true) + { + auto r = arena.remove_block(unit_size(1)); + if (r == 0) + break; + total_removed += 1; + arena.check_invariant(true); + } + SNMALLOC_CHECK(total_removed == 8); + + printf(" Arena carve + drain: OK\n"); + } + + // ================================================================== + // (F) Randomised stress: oracle-checked add/remove over a single + // chunk's worth of units. Equivalent to the MockRep stress test in + // shape but operates on real in-band storage. + // ================================================================== + + static constexpr size_t STRESS_UNITS = + (size_t(1) << MAX_SIZE_BITS) / UNIT_SIZE - 1; + + using Bins = BackendArenaBins<2, MIN_BITS>; + + struct OracleRange + { + size_t addr_units; + size_t size_units; + + bool operator<(const OracleRange& o) const + { + return addr_units < o.addr_units; + } + }; + + // Mirrors the arena's bin-based allocator: classify entries into + // bins, pick the bin via the bitmap's find_for_request, then + // pick the lowest-address entry within that bin and carve. 
+ class Oracle + { + std::set ranges; + + public: + void add(size_t addr_units, size_t size_units) + { + OracleRange key{addr_units, size_units}; + auto it = ranges.lower_bound(key); + + size_t new_addr = addr_units; + size_t new_size = size_units; + + if (it != ranges.end() && it->addr_units == new_addr + new_size) + { + new_size += it->size_units; + it = ranges.erase(it); + } + + if (it != ranges.begin()) + { + auto prev = std::prev(it); + if (prev->addr_units + prev->size_units == new_addr) + { + new_addr = prev->addr_units; + new_size += prev->size_units; + ranges.erase(prev); + } + } + + ranges.insert({new_addr, new_size}); + } + + // Returns {addr_units, len_units} or {0, 0} if nothing fits. + std::pair remove(size_t n_units) + { + size_t n_bytes = n_units * UNIT_SIZE; + if (n_bytes == 0 || n_bytes > Bins::max_supported_size()) + return {0, 0}; + + typename Bins::Bitmap bm{}; + std::map::iterator>> by_bin; + + for (auto it = ranges.begin(); it != ranges.end(); ++it) + { + typename Bins::range_t r{ + unit_addr(it->addr_units), it->size_units * UNIT_SIZE}; + size_t bin = bm.add(r); + by_bin[bin].push_back(it); + } + + size_t bin_id = bm.find_for_request(n_bytes); + if (bin_id == SIZE_MAX) + return {0, 0}; + + auto& entries = by_bin[bin_id]; + auto best_it = entries[0]; + for (size_t i = 1; i < entries.size(); i++) + { + if (entries[i]->addr_units < best_it->addr_units) + best_it = entries[i]; + } + + OracleRange block = *best_it; + ranges.erase(best_it); + + auto carved = Bins::carve( + {unit_addr(block.addr_units), block.size_units * UNIT_SIZE}, n_bytes); + auto base = base_addr(); + if (carved.pre.size != 0) + ranges.insert( + {(carved.pre.base - base) / UNIT_SIZE, carved.pre.size / UNIT_SIZE}); + if (carved.post.size != 0) + ranges.insert( + {(carved.post.base - base) / UNIT_SIZE, + carved.post.size / UNIT_SIZE}); + + return { + (carved.req.base - base) / UNIT_SIZE, carved.req.size / UNIT_SIZE}; + } + }; + + static void test_stress_seed(size_t seed, size_t 
num_ops) + { + reset_backing(); + Arena arena; + Oracle oracle; + + // All units initially allocated (i.e., not in the arena). + std::vector allocated(STRESS_UNITS, true); + + xoroshiro::p128r64 rng(seed); + + for (size_t op = 0; op < num_ops; op++) + { + bool do_add = (rng.next() % 3) != 0; + + if (do_add) + { + size_t max_size = STRESS_UNITS / 4; + if (max_size < 1) + max_size = 1; + size_t size = (rng.next() % max_size) + 1; + size_t start = rng.next() % STRESS_UNITS; + + bool found = false; + for (size_t try_start = start; try_start < STRESS_UNITS; try_start++) + { + size_t actual = 0; + for (size_t j = try_start; j < STRESS_UNITS && j < try_start + size; + j++) + { + if (!allocated[j]) + break; + actual++; + } + if (actual >= 1) + { + size = actual; + start = try_start; + found = true; + break; + } + } + if (!found) + continue; + + for (size_t j = start; j < start + size; j++) + allocated[j] = false; + + auto result = arena.add_block(unit_addr(start), unit_size(size)); + if (result.first == 0) + oracle.add(start, size); + else + { + // Overflow: arena spilled the consolidated block back to + // the caller. Treat as if everything went back to + // "allocated"; clear the oracle. 
+ for (size_t j = 0; j < STRESS_UNITS; j++) + allocated[j] = true; + oracle = Oracle{}; + } + arena.check_invariant(true); + } + else + { + size_t max_req = STRESS_UNITS / 4; + if (max_req < 1) + max_req = 1; + size_t n = (rng.next() % max_req) + 1; + + auto arena_addr = arena.remove_block(unit_size(n)); + auto [o_start, o_len] = oracle.remove(n); + + if (o_len == 0) + { + SNMALLOC_CHECK(arena_addr == 0); + } + else + { + SNMALLOC_CHECK(arena_addr != 0); + SNMALLOC_CHECK(arena_addr == unit_addr(o_start)); + for (size_t j = o_start; j < o_start + o_len; j++) + allocated[j] = true; + } + arena.check_invariant(true); + } + } + } + + static void test_stress() + { + constexpr size_t NUM_OPS = 500; + constexpr size_t NUM_SEEDS = 30; + for (size_t s = 1; s <= NUM_SEEDS; s++) + test_stress_seed(s, NUM_OPS); + printf(" Stress (%zu seeds x %zu ops): OK\n", NUM_SEEDS, NUM_OPS); + } + + // ================================================================== + // (G) SmallArenaRange — chunk-granularity parent + sub-chunk + // sub-allocations served by the in-band arena. + // ================================================================== + + // Pool of chunk-aligned buffers, handed out as a chunk-granularity + // parent range to SmallArenaRange. + static constexpr size_t POOL_CHUNKS = 8; + alignas(MIN_CHUNK_SIZE) static unsigned char pool_storage + [POOL_CHUNKS * MIN_CHUNK_SIZE]; + static bool pool_in_use[POOL_CHUNKS]; + // Track returns to detect leaks / double-frees. 
+ static size_t pool_alloc_count; + static size_t pool_dealloc_count; + + static void reset_pool() + { + for (size_t i = 0; i < POOL_CHUNKS; i++) + pool_in_use[i] = false; + for (size_t i = 0; i < sizeof(pool_storage); i++) + pool_storage[i] = 0; + pool_alloc_count = 0; + pool_dealloc_count = 0; + } + + class MockParent + { + public: + static constexpr bool Aligned = true; + static constexpr bool ConcurrencySafe = true; + using ChunkBounds = capptr::bounds::Arena; + + constexpr MockParent() = default; + + CapPtr alloc_range(size_t size) + { + SNMALLOC_CHECK(size == MIN_CHUNK_SIZE); + for (size_t i = 0; i < POOL_CHUNKS; i++) + { + if (!pool_in_use[i]) + { + pool_in_use[i] = true; + pool_alloc_count++; + return CapPtr::unsafe_from( + &pool_storage[i * MIN_CHUNK_SIZE]); + } + } + return nullptr; + } + + void dealloc_range(CapPtr base, size_t size) + { + SNMALLOC_CHECK(size == MIN_CHUNK_SIZE); + auto p = static_cast(base.unsafe_ptr()); + auto idx = static_cast((p - pool_storage) / MIN_CHUNK_SIZE); + SNMALLOC_CHECK(idx < POOL_CHUNKS); + SNMALLOC_CHECK(pool_in_use[idx]); + pool_in_use[idx] = false; + pool_dealloc_count++; + } + }; + + using SmallArena = SmallArenaRange::Type; + + static void test_small_arena_basic() + { + reset_pool(); + SmallArena r; + + // First alloc triggers a refill of one chunk; the rest of the + // chunk is internally available for further sub-allocations. + auto a = r.alloc_range(UNIT_SIZE); + SNMALLOC_CHECK(a != nullptr); + SNMALLOC_CHECK(pool_alloc_count == 1); + + auto b = r.alloc_range(unit_size(3)); + SNMALLOC_CHECK(b != nullptr); + SNMALLOC_CHECK(pool_alloc_count == 1); + + // Non-pow2 size — the whole point of SmallArenaRange. 
+ auto c = r.alloc_range(unit_size(5)); + SNMALLOC_CHECK(c != nullptr); + SNMALLOC_CHECK(pool_alloc_count == 1); + + r.dealloc_range(a, UNIT_SIZE); + r.dealloc_range(b, unit_size(3)); + r.dealloc_range(c, unit_size(5)); + + printf(" SmallArenaRange basic alloc/dealloc: OK\n"); + } + + static void test_small_arena_chunk_pass_through() + { + reset_pool(); + SmallArena r; + + // A chunk-or-larger alloc should pass through to the parent + // without touching the arena. + auto a = r.alloc_range(MIN_CHUNK_SIZE); + SNMALLOC_CHECK(a != nullptr); + SNMALLOC_CHECK(pool_alloc_count == 1); + + r.dealloc_range(a, MIN_CHUNK_SIZE); + SNMALLOC_CHECK(pool_dealloc_count == 1); + + printf(" SmallArenaRange chunk pass-through: OK\n"); + } + + static void test_small_arena_unaligned_dealloc() + { + reset_pool(); + SmallArena r; + + // Get some sub-chunk space populated. + auto a = r.alloc_range(unit_size(4)); + SNMALLOC_CHECK(a != nullptr); + + // Donate an unaligned spare (mirrors make()'s spare-seed + // donation). Length is not unit-aligned; sub-unit edges must + // be silently discarded. + auto unaligned_base = pointer_offset(a, 1); + r.dealloc_range(unaligned_base, unit_size(4) - 1); + + // Should not have leaked chunks to the parent (sub-chunk + // fragments stay in the arena). + SNMALLOC_CHECK(pool_dealloc_count == 0); + + printf(" SmallArenaRange unaligned dealloc: OK\n"); + } + + static void test_small_arena_consolidation_returns_chunk() + { + reset_pool(); + SmallArena r; + + // Fully consume one chunk via small allocs; record the chunk + // base so we can rebuild the full chunk via deallocs. + constexpr size_t N = MIN_CHUNK_SIZE / UNIT_SIZE; + std::vector> ps; + for (size_t i = 0; i < N; i++) + { + auto p = r.alloc_range(UNIT_SIZE); + SNMALLOC_CHECK(p != nullptr); + ps.push_back(p); + } + // We expect at least one refill happened (likely just one, + // since N units == one chunk; but in either case all + // sub-allocs come from the same backing chunk). 
+ SNMALLOC_CHECK(pool_alloc_count >= 1); + + size_t deallocs_before = pool_dealloc_count; + for (auto p : ps) + r.dealloc_range(p, UNIT_SIZE); + + // Consolidation should reassemble the whole chunk and donate + // it back to the parent. + SNMALLOC_CHECK(pool_dealloc_count > deallocs_before); + + printf(" SmallArenaRange consolidation returns chunk: OK\n"); + } + + // alloc_size_with_align + + static void test_alloc_size_with_align_exact() + { + reset_pool(); + SmallArena r; + + size_t size = unit_size(4); + size_t align = UNIT_SIZE; + auto p = r.alloc_size_with_align(size, align); + SNMALLOC_CHECK(p != nullptr); + SNMALLOC_CHECK((address_cast(p) & (align - 1)) == 0); + + r.dealloc_range(p, size); + printf(" alloc_size_with_align exact (no leftover): OK\n"); + } + + static void test_alloc_size_with_align_pow2_align_over_size() + { + reset_pool(); + SmallArena r; + + size_t size = unit_size(3) + 2; + size_t align = 256; + SNMALLOC_CHECK(align <= MIN_CHUNK_SIZE); + SNMALLOC_CHECK(align >= UNIT_SIZE); + SNMALLOC_CHECK(bits::is_pow2(align)); + + auto p = r.alloc_size_with_align(size, align); + SNMALLOC_CHECK(p != nullptr); + SNMALLOC_CHECK((address_cast(p) & (align - 1)) == 0); + + size_t used = bits::align_up(size, UNIT_SIZE); + size_t requested = bits::align_up(size, align); + SNMALLOC_CHECK(requested - used > 0); + + // Donated tail and the carved-but-unused chunk remainder both + // sit in the arena, so the follow-up alloc must succeed + // without a second parent refill — exact address is not + // pinned down. + auto tail = r.alloc_range(requested - used); + SNMALLOC_CHECK(tail != nullptr); + SNMALLOC_CHECK(pool_alloc_count == 1); + + r.dealloc_range(p, used); + r.dealloc_range(tail, requested - used); + printf(" alloc_size_with_align pow2 align over non-pow2 size: OK\n"); + } + + static void test_alloc_size_with_align_align_larger_than_size() + { + reset_pool(); + SmallArena r; + + // User's motivating example, scaled into the test arena. 
+ size_t align = 4096; + SNMALLOC_CHECK(align <= MIN_CHUNK_SIZE); + size_t size = align - 254; + + auto p = r.alloc_size_with_align(size, align); + SNMALLOC_CHECK(p != nullptr); + SNMALLOC_CHECK((address_cast(p) & (align - 1)) == 0); + + size_t used = bits::align_up(size, UNIT_SIZE); + auto tail = r.alloc_range(align - used); + SNMALLOC_CHECK(tail != nullptr); + SNMALLOC_CHECK(pool_alloc_count == 1); + + r.dealloc_range(p, used); + r.dealloc_range(tail, align - used); + printf(" alloc_size_with_align align > size: OK\n"); + } + + static void test_alloc_size_with_align_chunk_bypass() + { + reset_pool(); + SmallArena r; + + size_t size = MIN_CHUNK_SIZE - 100; + size_t align = MIN_CHUNK_SIZE; + auto p = r.alloc_size_with_align(size, align); + SNMALLOC_CHECK(p != nullptr); + SNMALLOC_CHECK((address_cast(p) & (align - 1)) == 0); + SNMALLOC_CHECK(pool_alloc_count == 1); + + // requested == MIN_CHUNK_SIZE bypasses to parent (whole chunk, + // no carve-time leftover), so the only free arena fragment is + // the donated tail — pin its exact address. Tail stays + // intra-chunk, so no dealloc to parent. 
+ SNMALLOC_CHECK(pool_dealloc_count == 0); + + size_t used = bits::align_up(size, UNIT_SIZE); + if (used < MIN_CHUNK_SIZE) + { + auto tail = r.alloc_range(MIN_CHUNK_SIZE - used); + SNMALLOC_CHECK(tail != nullptr); + SNMALLOC_CHECK(address_cast(tail) == address_cast(p) + used); + r.dealloc_range(tail, MIN_CHUNK_SIZE - used); + } + r.dealloc_range(p, used); + + printf(" alloc_size_with_align chunk-sized bypass: OK\n"); + } +} // namespace snmalloc + +int main() +{ + printf("--- InplaceRep tests ---\n"); + printf( + " UNIT_SIZE=%zu, MIN_BITS=%zu, MAX_SIZE_BITS=%zu, STRESS_UNITS=%zu\n", + snmalloc::UNIT_SIZE, + snmalloc::MIN_BITS, + snmalloc::MAX_SIZE_BITS, + snmalloc::STRESS_UNITS); + + printf("(A) Accessor round-trips:\n"); + snmalloc::test_variant_roundtrip(); + snmalloc::test_large_size_roundtrip(); + + printf("(B) Red bits independent:\n"); + snmalloc::test_red_bits_independent(); + snmalloc::test_can_consolidate_chunk_boundary(); + + printf("(C) Arena add/remove:\n"); + snmalloc::test_arena_add_remove_single(); + + printf("(D) Arena consolidation:\n"); + snmalloc::test_arena_consolidation(); + + printf("(E) Arena carve:\n"); + snmalloc::test_arena_carve(); + + printf("(F) Stress:\n"); + snmalloc::test_stress(); + + printf("(G) SmallArenaRange:\n"); + snmalloc::test_small_arena_basic(); + snmalloc::test_small_arena_chunk_pass_through(); + snmalloc::test_small_arena_unaligned_dealloc(); + snmalloc::test_small_arena_consolidation_returns_chunk(); + snmalloc::test_alloc_size_with_align_exact(); + snmalloc::test_alloc_size_with_align_pow2_align_over_size(); + snmalloc::test_alloc_size_with_align_align_larger_than_size(); + snmalloc::test_alloc_size_with_align_chunk_bypass(); + + printf("All InplaceRep tests passed.\n"); + return 0; +} From ad486943ede616391afa24c0c321ce6856b6013d Mon Sep 17 00:00:00 2001 From: Matthew Parkinson Date: Tue, 9 Jun 2026 09:25:03 +0100 Subject: [PATCH 23/31] Wire SmallArenaRange in as the LocalMetaRange Phase B of the SmallBuddyRange -> 
SmallArenaRange migration. * StandardLocalState and MetaProtectedRangeLocalState gain an Authmap template parameter, plumbed through alongside Pagemap. Both configs and the domestication test pass their Authmap into the LocalState instantiation. * The three SmallBuddyRange uses in the meta-range pipes are replaced with SmallArenaRange. * BackendAllocator::alloc_meta_data calls the new alloc_size_with_align(size, alignment) primitive, with alignment = max(next_pow2(size), MetaRangeT::UNIT_SIZE). The next_pow2 keeps Phase B behaviour identical to the previous buddy-rounded path; the max floors the alignment at the meta range's UNIT_SIZE so alloc_size_with_align's precondition holds for any positive size. * FixedRangeConfig's inline Authmap gains amplify_from_address (the new SmallArenaRange path needs it). SmallBuddyRange.h is now orphaned but stays in tree until Phase D removes it. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- src/snmalloc/backend/backend.h | 9 ++++++--- src/snmalloc/backend/fixedglobalconfig.h | 9 ++++++++- src/snmalloc/backend/globalconfig.h | 4 ++-- src/snmalloc/backend/meta_protected_range.h | 7 ++++--- src/snmalloc/backend/standard_range.h | 5 +++-- src/test/func/domestication/domestication.cc | 2 +- 6 files changed, 24 insertions(+), 12 deletions(-) diff --git a/src/snmalloc/backend/backend.h b/src/snmalloc/backend/backend.h index 2fcdf2a57..adaccc128 100644 --- a/src/snmalloc/backend/backend.h +++ b/src/snmalloc/backend/backend.h @@ -46,7 +46,11 @@ namespace snmalloc if (local_state != nullptr) { - p = local_state->get_meta_range().alloc_range_with_leftover(size); + auto& meta_range = local_state->get_meta_range(); + using MetaRangeT = stl::remove_reference_t; + size_t alignment = + bits::max(bits::next_pow2(size), MetaRangeT::UNIT_SIZE); + p = meta_range.alloc_size_with_align(size, alignment); } else { @@ -156,8 +160,7 @@ namespace snmalloc SNMALLOC_ASSERT(slab_index < (size_t{1} << OFFSET_BITS)); const uintptr_t 
ras_i = ras | (slab_index << SIZECLASS_BITS); typename Pagemap::Entry t_i(meta, ras_i); - Pagemap::set_metaentry( - address_cast(p) + chunk_offset, slab_size, t_i); + Pagemap::set_metaentry(address_cast(p) + chunk_offset, slab_size, t_i); } return {Aal::capptr_bound(p, size), meta}; diff --git a/src/snmalloc/backend/fixedglobalconfig.h b/src/snmalloc/backend/fixedglobalconfig.h index 5bd3b68b5..94c3c67f1 100644 --- a/src/snmalloc/backend/fixedglobalconfig.h +++ b/src/snmalloc/backend/fixedglobalconfig.h @@ -39,10 +39,17 @@ namespace snmalloc { return Aal::capptr_rebound(arena, c); } + + template + static SNMALLOC_FAST_PATH capptr::Arena + amplify_from_address(address_t a) + { + return pointer_offset(arena, a - address_cast(arena)); + } }; public: - using LocalState = StandardLocalState; + using LocalState = StandardLocalState; using GlobalPoolState = PoolState>; diff --git a/src/snmalloc/backend/globalconfig.h b/src/snmalloc/backend/globalconfig.h index 208210b65..9bdada06c 100644 --- a/src/snmalloc/backend/globalconfig.h +++ b/src/snmalloc/backend/globalconfig.h @@ -68,8 +68,8 @@ namespace snmalloc */ using LocalState = stl::conditional_t< mitigations(metadata_protection), - MetaProtectedRangeLocalState, - StandardLocalState>; + MetaProtectedRangeLocalState, + StandardLocalState>; /** * Use the default backend. 
diff --git a/src/snmalloc/backend/meta_protected_range.h b/src/snmalloc/backend/meta_protected_range.h index df0245beb..021f4750b 100644 --- a/src/snmalloc/backend/meta_protected_range.h +++ b/src/snmalloc/backend/meta_protected_range.h @@ -24,6 +24,7 @@ namespace snmalloc template< typename PAL, typename Pagemap, + typename Authmap, typename Base, size_t MinSizeBits = MinBaseSizeBits()> struct MetaProtectedRangeLocalState : BaseLocalStateConstants @@ -104,7 +105,7 @@ namespace snmalloc LocalCacheSizeBits - SubRangeRatioBits, bits::BITS - 1, Pagemap>, - SmallBuddyRange>; + SmallArenaRange>; ObjectRange object_range; @@ -124,9 +125,9 @@ namespace snmalloc } // Create global range that can service small meta-data requests. - // Don't want to add the SmallBuddyRange to the CentralMetaRange as that + // Don't want to add the SmallArenaRange to the CentralMetaRange as that // would require committing memory inside the main global lock. using GlobalMetaRange = - Pipe; + Pipe, GlobalRange>; }; } // namespace snmalloc diff --git a/src/snmalloc/backend/standard_range.h b/src/snmalloc/backend/standard_range.h index 2d9d5e961..7387ca3c9 100644 --- a/src/snmalloc/backend/standard_range.h +++ b/src/snmalloc/backend/standard_range.h @@ -22,6 +22,7 @@ namespace snmalloc template< typename PAL, typename Pagemap, + typename Authmap, typename Base = EmptyRange<>, size_t MinSizeBits = MinBaseSizeBits()> struct StandardLocalState : BaseLocalStateConstants @@ -56,7 +57,7 @@ namespace snmalloc page_size_bits>>>; private: - using ObjectRange = Pipe; + using ObjectRange = Pipe>; ObjectRange object_range; @@ -67,7 +68,7 @@ namespace snmalloc /** * Where we turn for allocations of user chunks. * - * Reach over the SmallBuddyRange that's at the near end of the ObjectRange + * Reach over the SmallArenaRange that's at the near end of the ObjectRange * pipe, rather than having that range adapter dynamically branch to its * parent. 
*/ diff --git a/src/test/func/domestication/domestication.cc b/src/test/func/domestication/domestication.cc index 1c2eb9fef..63b8b380d 100644 --- a/src/test/func/domestication/domestication.cc +++ b/src/test/func/domestication/domestication.cc @@ -39,7 +39,7 @@ namespace snmalloc PagemapRegisterRange, PagemapRegisterRange>; - using LocalState = StandardLocalState; + using LocalState = StandardLocalState; using GlobalPoolState = PoolState>; From 7e005cd51a58f38d75b1472cae2baf8a11ba78ae Mon Sep 17 00:00:00 2001 From: Matthew Parkinson Date: Tue, 9 Jun 2026 12:10:54 +0100 Subject: [PATCH 24/31] =?UTF-8?q?Remove=20buddy=20allocators=20and=20renam?= =?UTF-8?q?e=20BackendArena=20=E2=86=92=20Arena?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit SmallBuddyRange was orphaned by the previous commit; LargeBuddyRange, SmallBuddyRange and their shared buddy.h are now all dead. Delete them (-848 lines) and clean up stale references in comments, README, AddressSpace.md, and the MIN_HEAP_SIZE_FOR_THREAD_LOCAL_BUDDY constant (renamed ..._CACHE). Now that there is only one Arena type and the Small/Large pair of range adapters built on it, rename for symmetry and to drop the redundant 'Backend' prefix: BackendArena -> Arena BackendArenaBins -> ArenaBins BackendArenaRange -> LargeArenaRange (pairs with SmallArenaRange) Files and test directories renamed to match. The test-internal 'using Arena = ...<...>;' aliases become 'TestArena' to avoid colliding with the renamed class template. 
Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- CMakeLists.txt | 8 +- PLAN.md | 356 ++++++++-------- docs/AddressSpace.md | 43 +- src/snmalloc/README.md | 2 +- src/snmalloc/backend/fixedglobalconfig.h | 2 +- src/snmalloc/backend/meta_protected_range.h | 12 +- src/snmalloc/backend/standard_range.h | 8 +- .../{backend_arena.h => arena.h} | 42 +- .../{backend_arena_bins.h => arenabins.h} | 18 +- .../backend_helpers/backend_helpers.h | 5 +- src/snmalloc/backend_helpers/buddy.h | 199 --------- src/snmalloc/backend_helpers/inplacerep.h | 16 +- ...ackend_arena_range.h => largearenarange.h} | 20 +- .../backend_helpers/largebuddyrange.h | 397 ------------------ .../backend_helpers/smallarenarange.h | 8 +- .../backend_helpers/smallbuddyrange.h | 252 ----------- .../backend_helpers/staticconditionalrange.h | 4 +- src/snmalloc/mem/metadata.h | 2 +- src/snmalloc/mitigations/allocconfig.h | 9 +- .../backend_arena.cc => arena/arena.cc} | 131 +++--- .../arenabins.cc} | 86 ++-- src/test/func/cheri/cheri.cc | 4 +- .../largearenarange.cc} | 12 +- .../smallarenarange.cc} | 36 +- 24 files changed, 408 insertions(+), 1264 deletions(-) rename src/snmalloc/backend_helpers/{backend_arena.h => arena.h} (93%) rename src/snmalloc/backend_helpers/{backend_arena_bins.h => arenabins.h} (98%) delete mode 100644 src/snmalloc/backend_helpers/buddy.h rename src/snmalloc/backend_helpers/{backend_arena_range.h => largearenarange.h} (95%) delete mode 100644 src/snmalloc/backend_helpers/largebuddyrange.h delete mode 100644 src/snmalloc/backend_helpers/smallbuddyrange.h rename src/test/func/{backend_arena/backend_arena.cc => arena/arena.cc} (94%) rename src/test/func/{backend_arena_bins/backend_arena_bins.cc => arenabins/arenabins.cc} (94%) rename src/test/func/{backend_arena_range/backend_arena_range.cc => largearenarange/largearenarange.cc} (95%) rename src/test/func/{backend_arena_inplace/backend_arena_inplace.cc => smallarenarange/smallarenarange.cc} (96%) diff --git 
a/CMakeLists.txt b/CMakeLists.txt index f9e9b3aab..0d04b5995 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -549,10 +549,10 @@ if(NOT SNMALLOC_HEADER_ONLY_LIBRARY) # against both fast and check testlib variants. set(TESTLIB_ONLY_TESTS aligned_dealloc - backend_arena - backend_arena_bins - backend_arena_inplace - backend_arena_range + arena + arenabins + largearenarange + smallarenarange bits first_operation memory memory_usage multi_atexit multi_threadatexit redblack statistics teardown contention external_pointer large_alloc lotsofthreads post_teardown diff --git a/PLAN.md b/PLAN.md index 5343f06dd..3679ed2e9 100644 --- a/PLAN.md +++ b/PLAN.md @@ -6,8 +6,8 @@ The design will use the Red-Black tree that currently underlies the buddy alloca Each block will be part of two structures: -* [Bin] A red-black tree of all blocks held by this BackendArena, in the same bin, ordered by address. -* [Range] A red-black tree of all blocks held by this BackendArena, ordered by address. +* [Bin] A red-black tree of all blocks held by this Arena, in the same bin, ordered by address. +* [Range] A red-black tree of all blocks held by this Arena, ordered by address. Note that for block of the minimum size will be handled specially as there is insufficient space to have them particpate in both structures, so they will only particpate in the first. @@ -38,7 +38,7 @@ As all the state is looked up from the RB-tree, then we can have multiple instan ## Implementation -### Build BackendArena +### Build Arena This should use two RB-trees. @@ -50,11 +50,11 @@ There should be a runtime checked invariant that * the system is maximally consolidated, and * the system is consistent between the two RB-trees. -### Build BackendArenaRange +### Build LargeArenaRange -This should wrap the BackendArena using the snmalloc Range approach that is used in the current backend pipelines. +This should wrap the Arena using the snmalloc Range approach that is used in the current backend pipelines. 
-### Update backend to use BackendArenaRange +### Update backend to use LargeArenaRange ### Update front-end to request non-power of two size classes for the backend. @@ -80,15 +80,15 @@ We can extend the system by effectively having multiple "Range" RB-trees, and th --- -# Implementation plan: BackendArena phase +# Implementation plan: Arena phase ## Scope of this phase -This plan covers **only** the `BackendArena` data structure and its standalone +This plan covers **only** the `Arena` data structure and its standalone unit tests. The following are explicitly deferred to follow-up plans, each of which will become its own PLAN.md revision: -- `BackendArenaRange` — wrapping `BackendArena` behind snmalloc's Range API. +- `LargeArenaRange` — wrapping `Arena` behind snmalloc's Range API. - Backend integration — replacing `LargeBuddyRange` in `backend/standard_range.h` and `backend/meta_protected_range.h`. - Front-end requesting non-power-of-two chunk sizes from the backend. @@ -112,7 +112,7 @@ For `INTERMEDIATE_BITS=B` the bin count per exponent is `B=1: 2, B=2: 5, B=3: 13, B=4: 34`. The snmalloc default is `B=2`. The number of exponents in range is `MAX_SIZE_BITS - MIN_CHUNK_BITS + 1`. -Each `BackendArena` instance owns: +Each `Arena` instance owns: - A flat array of RBTree roots, indexed by bin id (one bin id per (exponent, servable-set) pair). @@ -143,7 +143,7 @@ bitmap of non-empty bins). ### Range: single tree across all blocks (with min-size exception) -A single RBTree per `BackendArena` orders all *non-min-size* free blocks by +A single RBTree per `Arena` orders all *non-min-size* free blocks by address. This is the structure used for adjacency lookup during consolidation: @@ -158,10 +158,10 @@ RBTree. A free block occupies one or more `MIN_CHUNK_SIZE` chunks, with a pagemap entry per chunk. 
The first pagemap entry of a free block carries a -**variant tag** that tells `BackendArena` how to interpret the other +**variant tag** that tells `Arena` how to interpret the other entries in the block: -| Variant | Value | Block size | Alignment | Pagemap entries used by BackendArena | +| Variant | Value | Block size | Alignment | Pagemap entries used by Arena | |-------------|-------|----------------|----------------|------------------------------------------------------------------------| | `Min` | 0 | exactly min | any | 1 entry — both words store the Bin RBTree node (left/right + colour). | | `TwoMin` | 1 | exactly 2× min | 2-aligned | 2 entries — first stores Bin node, second stores Range node. | @@ -195,7 +195,7 @@ Note: only blocks at even chunk addresses can be `TwoMin`. The distinguish `TwoMin` from `OddTwo`. **Tree membership is the source of truth for "is this block free?".** -The variant tag is only meaningful for entries `BackendArena` reaches via +The variant tag is only meaningful for entries `Arena` reaches via its own RBTrees; nothing outside the data structure probes the tag. The tag is therefore not a state machine that needs an explicit "BackendOwned" / "allocated" value: when a block is removed from its @@ -233,8 +233,8 @@ memcpy-fix follow-up plan. ### Adjacency lookup All adjacency lookups are performed via RB-tree finds in this -`BackendArena`'s own trees. **No pagemap probing.** The pagemap is shared -across `BackendArena` instances (e.g. thread-local + global), and reading +`Arena`'s own trees. **No pagemap probing.** The pagemap is shared +across `Arena` instances (e.g. thread-local + global), and reading entries owned by another instance would be unsafe under concurrent modification. By restricting reads to RB-tree traversals — which only follow pointers we wrote, into entries we own — adjacency detection is @@ -310,7 +310,7 @@ excludes min-size blocks. 
Adjacency for min-size neighbours is found via ### Write ordering within add/remove Because adjacency lookups are RB-tree-only, a block is "visible to -adjacency" exactly when it is reachable from one of this `BackendArena`'s +adjacency" exactly when it is reachable from one of this `Arena`'s RBTree roots. The ordering rules collapse to two: - **add_block**: write the variant tag and any auxiliary data (precise @@ -324,11 +324,11 @@ RBTree roots. The ordering rules collapse to two: No transient "BackendOwned" marker is needed: a chunk's free-ness is synonymous with its membership in some RBTree owned by this -`BackendArena`. +`Arena`. ### Invariants (debug-only, runtime-checked) -The `BackendArena::invariant()` method checks: +The `Arena::invariant()` method checks: 1. **Maximally consolidated**: walking the Range tree in order, no two adjacent entries have `prev.addr + prev.size == curr.addr`; no min-size @@ -351,7 +351,7 @@ The `BackendArena::invariant()` method checks: The new bins are indexed in **chunk units** (1 chunk = `MIN_CHUNK_SIZE` bytes), not bytes, and not the front-end `sizeclass_t` whose large variant -is currently power-of-two only. `backend_arena_bins.h` defines the +is currently power-of-two only. `arenabins.h` defines the chunk-unit size-class scheme using the snmalloc size-class formula `S = 2^e + m · 2^(e − B)` applied at **chunk-count exponents starting from zero**. Low-exponent special cases (chunk counts 1, 2, 3, …) follow @@ -359,8 +359,8 @@ the same pattern as `bits::from_exp_mant` in `src/snmalloc/ds_core/sizeclassstatic.h`: at small exponents the mantissa space is degenerate, handled by enumeration. -The public API of `BackendArenaBins` — the integration contract -`BackendArena` builds on — is intentionally narrow: +The public API of `ArenaBins` — the integration contract +`Arena` builds on — is intentionally narrow: - `struct range_t { size_t base; size_t size; }` — a chunk-count range used to describe free blocks and carved sub-ranges. 
@@ -405,7 +405,7 @@ rodata records, the bin-scheme constants (`B`, `MANTISSAS_PER_EXP`, `BINS_PER_EXP`, `MAX_SC`), `bitmap_info_for_request` / `carve_info_for_request`, `bin_index`, and the constexpr per-sc accessors — are private implementation details. They are reachable -only via the friend struct `BackendArenaBinsTestAccess` (defined in +only via the friend struct `ArenaBinsTestAccess` (defined in the test translation unit, see Phase 1) so unit tests can exercise them directly; code outside this header does not depend on them. @@ -420,7 +420,7 @@ their precise chunk count where needed (`Large` variant). ### Exponent / bin-count bounds -`BackendArena` takes **byte-size +`Arena` takes **byte-size exponent** bounds (mirroring `Buddy`). `MIN_SIZE_BITS` is the log2 of the unit of allocation; everything inside the arena is in multiples of `1 << MIN_SIZE_BITS`. The upper bound is @@ -437,7 +437,7 @@ reconcile the front-end large size classes with this scheme. ### Multiple instances All state lives in pagemap-backed nodes and in per-instance roots/bitmaps; -no global state. Multiple `BackendArena` instances can coexist (thread-local +no global state. Multiple `Arena` instances can coexist (thread-local and global) for the future Range wrapper. ## Phases @@ -482,10 +482,10 @@ broken, stop and report — do not start implementation on a broken base. **Test gate**: full ctest run completes; record pass/fail status of each test for later comparison. -### Phase 1: BackendArenaBins — bin scheme, per-sc tables, and bitmap +### Phase 1: ArenaBins — bin scheme, per-sc tables, and bitmap -Add `src/snmalloc/backend_helpers/backend_arena_bins.h` defining -`BackendArenaBins`: the chunk-unit size-class +Add `src/snmalloc/backend_helpers/arenabins.h` defining +`ArenaBins`: the chunk-unit size-class scheme, two per-sc rodata tables, the free-block classifier, and the nested non-empty-bins bitmap that the allocation fast path scans. 
@@ -535,7 +535,7 @@ Following `prototype/skip_analysis.py`: - `BINS_PER_EXP` = 2 / 5 / 13 for `B` = 1 / 2 / 3 — the count of distinct *servable subsets* of mantissas at each exponent. Each bin is a single bit in the bitmap and a single RB-tree at the - `BackendArena` layer; bins are not size classes (multiple size + `Arena` layer; bins are not size classes (multiple size classes share a bin) and not exponents (each exponent has multiple bins). - `MAX_SC = ((bits::BITS - B) << B) + ((1 << B) - 1)` — one past the @@ -594,16 +594,16 @@ A private `BinTable` struct holds (all `ModArray<...>`): - `bitmap_info[MAX_SC]`, `carve_info[MAX_SC]` — the per-sc tables above. - `exp_first_sc[bits::BITS + 1]` — first raw sc id at each - BackendArenaBins exponent (sentinel at index `bits::BITS` equals + ArenaBins exponent (sentinel at index `bits::BITS` equals `MAX_SC`). NOTE: this is not uniform stride — at the bottom of the - encoding the low regime squashes multiple BackendArenaBins exponents + encoding the low regime squashes multiple ArenaBins exponents into encoded-exponent 0. - `exp_bin_base[bits::BITS + 1]` — `e * BINS_PER_EXP`, precomputed so `bin_index` does no runtime multiply. - `cascade_steps[MANTISSAS_PER_EXP][MAX_CASCADE_STEPS]` — per-`m_top` decision lists for `bin_offset_at`. -A `static constexpr BinTable table_{}` member of `BackendArenaBins` +A `static constexpr BinTable table_{}` member of `ArenaBins` holds the populated instance. Tables sit in `.rodata`; no static initialiser runs at program start. 
Combined size at B=3 is on the order of tens of KB (estimate: 16 B/sc × 495 + 32 B/sc × 495 + small @@ -676,7 +676,7 @@ compile-time helpers (`clz` / `clz_const`, `next_pow2` / ```cpp class Bitmap { - friend struct BackendArenaBinsTestAccess; + friend struct ArenaBinsTestAccess; public: static constexpr size_t TOTAL_BINS = BINS_PER_EXP * bits::BITS; @@ -705,8 +705,8 @@ private: AND with the precomputed masks has no width mismatch; `bits::ctz` on a `size_t` produces the bit index. -Friend declarations: `BackendArenaBins` and its nested `Bitmap` -each carry their own `friend struct BackendArenaBinsTestAccess<...>;` +Friend declarations: `ArenaBins` and its nested `Bitmap` +each carry their own `friend struct ArenaBinsTestAccess<...>;` (C++ friendship does not transit to nested classes). Static asserts on bitmap layout: @@ -751,8 +751,8 @@ The two ANDs are the entire bin-selection cost; no shifts, no #### Test surface -`BackendArenaBinsTestAccess` is **forward-declared** -in `backend_arena_bins.h` (so the friend declarations can refer to it) +`ArenaBinsTestAccess` is **forward-declared** +in `arenabins.h` (so the friend declarations can refer to it) and **defined in the test translation unit** `src/test/func/backend_arena_bins/backend_arena_bins.cc` (inside `namespace snmalloc`). The header therefore carries no @@ -863,7 +863,7 @@ Spec slice = the Phase 1 section above. Reviewer checks: - The in-tree header carries no test-only surface (no `chunk_sc_t` handle class, no `request`, no `_const` variants, no test-only per-sc accessors — those live only in - `BackendArenaBinsTestAccess` in the test cc). + `ArenaBinsTestAccess` in the test cc). - Fast path uses runtime `bits::to_exp_mant` / `bits::clz` (not the `_const` variants); the `_const` variants are reachable only from the constexpr `BinTable` constructor and the test's @@ -893,7 +893,7 @@ Add a single helper: returns `(largest entry < K, smallest entry > K)`. 
Either component is `Rep::null` when no such neighbour exists. **Precondition**: `K` is not present in the tree. This matches the - `BackendArena` use case (two free blocks cannot share a starting + `Arena` use case (two free blocks cannot share a starting address, so `add_block` only calls `neighbours` on addresses not already in the tree); in Debug an assert fires if `K` is encountered on the descent. @@ -916,13 +916,13 @@ keys, and `K` between two consecutive keys all match the oracle; the "K not in tree" precondition is asserted in Debug; no structural changes to `RBTree`'s existing invariants. -### Phase 3+4: Full BackendArena data structure (atomic) +### Phase 3+4: Full Arena data structure (atomic) -Create `src/snmalloc/backend_helpers/backend_arena.h` with: +Create `src/snmalloc/backend_helpers/arena.h` with: - A `BackendArenaRep` concept describing word-level accessors over the three pagemap entries, the variant tag, and the large-size accessor: - - `get_variant(addr) -> BackendArenaVariant` / `set_variant` + - `get_variant(addr) -> ArenaVariant` / `set_variant` - `get_word1(addr)` / `set_word1`, `get_word2(addr)` / `set_word2` (first entry, used by BinRep) - `get_range_word1(addr)` / `set_range_word1`, @@ -944,7 +944,7 @@ Create `src/snmalloc/backend_helpers/backend_arena.h` with: - Both: `compare(k1, k2) = k1 > k2` so `remove_min` returns the lowest address. `null = root = 0`. -- `BackendArena`: +- `Arena`: - `B = 2` hardcoded; `INTERMEDIATE_BITS` wiring deferred. - `MIN_SIZE_BITS` selects the unit of allocation (= pagemap stride when used with `PagemapRep`). @@ -975,11 +975,11 @@ Create `src/snmalloc/backend_helpers/backend_arena.h` with: - `get_root_key()` added to `RBTree` (public method, returns root key or `Rep::null` when empty). -- `Bitmap::test(size_t bin_id)` added to `BackendArenaBins` (read-only +- `Bitmap::test(size_t bin_id)` added to `ArenaBins` (read-only accessor used by `invariant()`). 
Modifications to existing files: -- `src/snmalloc/backend_helpers/backend_arena_bins.h`: added +- `src/snmalloc/backend_helpers/arenabins.h`: added `Bitmap::test()` and made `bin_index` public. - `src/snmalloc/ds_core/redblacktree.h`: added `get_root_key()`. - `CMakeLists.txt`: added `backend_arena` to `TESTLIB_ONLY_TESTS`. @@ -1006,9 +1006,9 @@ places it in bin 0 (size-1 servable set). But: 2. `contains_min` probes bin 0 for single-chunk neighbours — finding a size-2 block there and treating it as size 1 corrupts metadata. -All changes are in `backend_arena.h` and the test file. +All changes are in `arena.h` and the test file. -1. **Add `OddTwo = 3`** to `BackendArenaVariant` enum. +1. **Add `OddTwo = 3`** to `ArenaVariant` enum. 2. **Change `variant_of`** to take `(size_chunks, chunk_index)`: - size 1 → `Min` - size 2, even chunk → `TwoMin` @@ -1019,7 +1019,7 @@ All changes are in `backend_arena.h` and the test file. 4. **Update `insert_block`**: pass `addr_to_chunk(addr)` to `variant_of`. The `if (size_chunks >= 2)` range-tree checks already cover `OddTwo`. 5. **Update `contains_min`**: after finding addr in bin 0, check - `Rep::get_variant(addr) == BackendArenaVariant::Min`. Return false + `Rep::get_variant(addr) == ArenaVariant::Min`. Return false for `OddTwo` entries. 6. **Update invariant clause 5**: pass chunk address to `variant_of`. 7. **Update invariant clause 1c** ("no two adjacent min blocks"): @@ -1043,7 +1043,7 @@ entries when possible" section above. ### Phase 7: Multi-instance test -Instantiate two `BackendArena` over disjoint address ranges in +Instantiate two `Arena` over disjoint address ranges in the same test process, drive workloads against both, verify each invariant independently. 
@@ -1062,29 +1062,29 @@ Per `claude.md` mandatory review checkpoints: --- -# Implementation plan: BackendArenaRange phase +# Implementation plan: LargeArenaRange phase ## Scope -Build `BackendArenaRange` — a Range pipeline component that wraps -`BackendArena` behind snmalloc's Range API, suitable for replacing +Build `LargeArenaRange` — a Range pipeline component that wraps +`Arena` behind snmalloc's Range API, suitable for replacing `LargeBuddyRange`. This plan covers: -- Generalising BackendArena's Rep interface for pagemap compatibility. -- `PagemapRep` — adapting pagemap entries to BackendArena's Rep concept. -- `BackendArenaRange` — the Range wrapper with refill and overflow handling. +- Generalising Arena's Rep interface for pagemap compatibility. +- `PagemapRep` — adapting pagemap entries to Arena's Rep concept. +- `LargeArenaRange` — the Range wrapper with refill and overflow handling. - Boundary-bit support for safe consolidation across PAL allocations. - Unit tests for all of the above. The pipeline integration (replacing `LargeBuddyRange` in `standard_range.h` and `meta_protected_range.h`) is a separate step ("Update backend to use -BackendArenaRange") that follows once this plan is complete. +LargeArenaRange") that follows once this plan is complete. ## Design ### Rep generalisation: representation-agnostic data structure -`BackendArena` must be representation-agnostic, mirroring how +`Arena` must be representation-agnostic, mirroring how `Buddy<>` is generic over its node `Rep` (see `buddy.h`). The existing buddy ecosystem demonstrates the layering: @@ -1095,7 +1095,7 @@ existing buddy ecosystem demonstrates the layering: - `smallbuddyrange.h` defines `BuddyInplaceRep` — an inline Rep that stores tree pointers in the free chunk itself (red bit at bit 0). 
-`BackendArena` must support the same two representation paths so it +`Arena` must support the same two representation paths so it can eventually replace both `LargeBuddyRange` (pagemap) and `SmallBuddyRange` (inline) in the standard pipeline. @@ -1115,16 +1115,16 @@ as `BuddyChunkRep` / `BuddyInplaceRep`): provides `Handle`, `Contents`, `null`, `root`, `ref`, `get`, `set`, `is_red`, `set_red`, `compare`, `equal`, `printable`, `name`. **All bit-packing decisions (red bit position, mask layout) are private to the Rep** — -`BackendArena` carries no `RED_BIT` / `VARIANT_MASK` / `META_MASK` +`Arena` carries no `RED_BIT` / `VARIANT_MASK` / `META_MASK` constants of its own. -`BackendArena` instantiates `RBTree` and +`Arena` instantiates `RBTree` and `RBTree` directly. It never inspects the bit layout used by the Rep. #### PagemapRep -Lives in `backend_arena_range.h`. Privately owns its bit layout: +Lives in `largearenarange.h`. Privately owns its bit layout: - Bin tree node in pagemap entry at `addr`, Word::One/Two. `BinRep` packs the red bit at bit 8 and the variant tag at bits 9–10 of @@ -1147,7 +1147,7 @@ their own ref/get/set/is_red/set_red implementations. Mirroring `BuddyInplaceRep`: tree pointers live inside the free memory itself. `BinRep` / `RangeRep` would use pointer-low-bits for red and variant tags. This is what enables a future -`BackendArena`-based replacement for `SmallBuddyRange`. +`Arena`-based replacement for `SmallBuddyRange`. ### Boundary-bit consolidation check @@ -1156,7 +1156,7 @@ the pagemap sets a boundary bit on the first chunk of each PAL allocation to prevent consolidation across allocation boundaries (`BuddyChunkRep::can_consolidate` checks this). -BackendArena's `add_block` consolidation must respect the same contract. +Arena's `add_block` consolidation must respect the same contract. A new method on the Rep concept: ``` @@ -1176,7 +1176,7 @@ PagemapRep: returns `!get_metaentry_mut(higher_addr).is_boundary()`. 
Templated on `Pagemap`, `MIN_SIZE_BITS`, and `MAX_SIZE_BITS` (mirroring `Buddy`'s shape). `MIN_SIZE_BITS` is the log2 of the pagemap stride -(snmalloc's `MIN_CHUNK_BITS` when wired through `BackendArenaRange`); +(snmalloc's `MIN_CHUNK_BITS` when wired through `LargeArenaRange`); `MAX_SIZE_BITS` is needed for the large-size-shift static assertion: ``` @@ -1228,7 +1228,7 @@ an unowned entry, so pagemap ownership transitions happen implicitly. The boundary bit (bit 0 of `meta`) is in the reserved-mask zone and is preserved by both `claim_for_backend()` and `BackendStateWordRef::operator=`. -### BackendArenaRange +### LargeArenaRange Outer template matches `LargeBuddyRange`'s shape so it is a drop-in replacement in `Pipe<...>` compositions: @@ -1239,7 +1239,7 @@ template< size_t MAX_SIZE_BITS, SNMALLOC_CONCEPT(IsWritablePagemap) Pagemap, size_t MIN_REFILL_SIZE_BITS = 0> -class BackendArenaRange +class LargeArenaRange { public: template> @@ -1248,7 +1248,7 @@ public: using ContainsParent::parent; using PagemapRepT = PagemapRep; - BackendArena arena; + Arena arena; size_t requested_total = 0; public: @@ -1296,7 +1296,7 @@ public: Overflow from `add_block` is forwarded directly to the parent's `dealloc_range`. The parent does not require power-of-two input — all -non-Buddy ranges accept any chunk-aligned size, and `BackendArenaRange` +non-Buddy ranges accept any chunk-aligned size, and `LargeArenaRange` itself accepts any chunk-multiple size — so no decomposition is needed. ``` @@ -1370,7 +1370,7 @@ Safety guards (both from `LargeBuddyRange`): ### Static properties -- `Aligned = true`: BackendArena's carving ensures that a request of +- `Aligned = true`: Arena's carving ensures that a request of size `n` (power-of-two, chunk-aligned) is placed at an `n`-aligned address within the source block. 
For non-power-of-two requests, the bin scheme's alignment rules still hold (alignment matches the @@ -1381,7 +1381,7 @@ Safety guards (both from `LargeBuddyRange`): ### MAX_SIZE_BITS = BITS - 1 (global range) The global `LargeBuddyRange` uses `MAX_SIZE_BITS = BITS - 1`, meaning -the buddy can hold up to half the address space. For BackendArenaRange: +the buddy can hold up to half the address space. For LargeArenaRange: the maximum block size in chunks is `2^(MAX_SIZE_BITS - MIN_CHUNK_BITS)`. On 64-bit with `MIN_CHUNK_BITS = 14`, this gives a chunk-bit width of 49 — the arena can hold up to 2^49 chunks. The arena's overflow path @@ -1400,12 +1400,12 @@ where this would overflow. **Status**: implemented; staged (not committed); awaiting review. -Changes to `backend_arena.h`: +Changes to `arena.h`: 1. Delete the private `WordRef` nested struct, the `TreeRep` template, and all bit-layout constants (`RED_BIT`/`VARIANT_MASK`/`META_MASK` and `BACKEND_RESERVED_MASK`). - `BackendArena` is now representation-agnostic, mirroring how + `Arena` is now representation-agnostic, mirroring how `buddy.h` is generic over its node `Rep`. 2. Replace the internal `using BinRep = TreeRep` / `RangeRep = TreeRep` aliases with direct use @@ -1434,10 +1434,10 @@ Changes to `backend_arena.cc` (test file): that boundary. Test both predecessor and successor merges being independently blocked. -**Test gate**: all existing BackendArena tests pass unchanged; new +**Test gate**: all existing Arena tests pass unchanged; new boundary test passes. -### Phase 10: PagemapRep + BackendArenaRange + tests +### Phase 10: PagemapRep + LargeArenaRange + tests **Status**: implemented and tested. Committed in `9c1ca745`. @@ -1450,10 +1450,10 @@ boundary test passes. > implementation uses bytes; where they say `dealloc_overflow`, the > implementation uses `parent_dealloc`. 
-**Phase 10b refactor (also implemented):** `BackendArena` and `PagemapRep` +**Phase 10b refactor (also implemented):** `Arena` and `PagemapRep` were both retemplated to mirror `Buddy`'s 3-parameter shape: -- `template class BackendArena` +- `template class Arena` — the always-zero `MIN_CHUNKS_BITS` placeholder is gone, and the unit of allocation is named explicitly via `MIN_SIZE_BITS` instead of being implicitly tied to snmalloc's global `MIN_CHUNK_BITS`. Internally, @@ -1465,30 +1465,30 @@ were both retemplated to mirror `Buddy`'s 3-parameter shape: `(MAX_SIZE_BITS - MIN_SIZE_BITS) + LARGE_SIZE_SHIFT <= bits::BITS`; `LARGE_SIZE_SHIFT` is private. The Rep's pagemap stride is `UNIT_SIZE = 1 << MIN_SIZE_BITS`. -- `BackendArenaRange::Type` wires snmalloc's `MIN_CHUNK_BITS` as - `MIN_SIZE_BITS` for both PagemapRep and BackendArena: +- `LargeArenaRange::Type` wires snmalloc's `MIN_CHUNK_BITS` as + `MIN_SIZE_BITS` for both PagemapRep and Arena: `PagemapRep` and - `BackendArena`. + `Arena`. -New file: `src/snmalloc/backend_helpers/backend_arena_range.h` +New file: `src/snmalloc/backend_helpers/largearenarange.h` 1. `PagemapRep` — full Rep implementation using pagemap entries as described above, with all static assertions. -2. `BackendArenaRange` — the Range wrapper with `alloc_range`, `dealloc_range`, `refill`, and `dealloc_overflow`. Modified: `src/snmalloc/backend_helpers/backend_helpers.h` -3. Add `#include "backend_arena_range.h"` so the new header is +3. Add `#include "largearenarange.h"` so the new header is available through the standard include path. New file: `src/test/func/backend_arena_range/backend_arena_range.cc` 4. Test with snmalloc's `BasicPagemap` (or a test-appropriate pagemap): - PagemapRep word round-trips (variant, tree words, large size). - - BackendArenaRange `alloc_range` / `dealloc_range` smoke test with + - LargeArenaRange `alloc_range` / `dealloc_range` smoke test with a simple parent range. 
- Refill: verify that allocating when the arena is empty triggers a parent refill and returns memory. @@ -1509,7 +1509,7 @@ Modified: `CMakeLists.txt` 5. Register `backend_arena_range` in `TESTLIB_ONLY_TESTS`. -**Test gate**: BackendArenaRange tests pass; existing tests unaffected. +**Test gate**: LargeArenaRange tests pass; existing tests unaffected. ### Phase 11: Final review @@ -1523,22 +1523,22 @@ Per `claude.md` mandatory review checkpoints: ### Phase 10d: Bytes throughout (replace chunk-count internal API) **Goal**: drop the `size_chunks` / chunk-count internal convention from -`BackendArena` and `PagemapRep` so byte sizes (multiples of UNIT_SIZE) +`Arena` and `PagemapRep` so byte sizes (multiples of UNIT_SIZE) flow end-to-end, removing the `<< MIN_CHUNK_BITS` conversion dance at -the BackendArenaRange ↔ BackendArena boundary and the matching reverse +the LargeArenaRange ↔ Arena boundary and the matching reverse shifts inside the range wrapper. -**Substep 1 (DONE)**: generalise `BackendArenaBins` on a new +**Substep 1 (DONE)**: generalise `ArenaBins` on a new `MIN_SIZE_BITS` template parameter so its `range_t.size`, carve arguments, and `max_supported_size()` are byte sizes (multiples of `UNIT_SIZE = 1 << MIN_SIZE_BITS`). Renames inside Bins: `size_chunks → size`, `align_chunks → align`, `max_supported_chunks → max_supported_size`. Tests cover `MIN_SIZE_BITS ∈ {0, 4, 14}`. -**Substep 2 (DONE)**: flip `BackendArena`, `PagemapRep`, and -`BackendArenaRange` to bytes throughout: -- `BackendArena` now uses - `BackendArenaBins`; `add_block` / `remove_block` +**Substep 2 (DONE)**: flip `Arena`, `PagemapRep`, and +`LargeArenaRange` to bytes throughout: +- `Arena` now uses + `ArenaBins`; `add_block` / `remove_block` take/return bytes; `addr_to_chunk` / `chunk_to_addr` / `CHUNKS_BITS` deleted; `variant_of(size, addr)` works in byte units with parity from `(addr >> MIN_SIZE_BITS) & 1`. 
@@ -1548,7 +1548,7 @@ arguments, and `max_supported_size()` are byte sizes (multiples of - `PagemapRep::get_large_size` / `set_large_size` (renamed from `*_chunks`) take and return bytes; internal storage still scales by `MIN_SIZE_BITS` so the shifted field fits a pagemap word. -- `BackendArenaRange::add_range` / `dealloc_range` / +- `LargeArenaRange::add_range` / `dealloc_range` / `parent_dealloc` (unified from `parent_dealloc_range` and `dealloc_overflow`) drop chunk-count conversions; `add_range` uses `bits::align_up` / `bits::align_down`. @@ -1564,19 +1564,19 @@ before opening a PR; then proceed to Phase 12 (pipeline integration). *Pipeline integration (replacing `LargeBuddyRange` in `standard_range.h` and `meta_protected_range.h`) is a separate follow-up plan: "Update -backend to use BackendArenaRange."* +backend to use LargeArenaRange."* ## Files added / changed (anticipated, this phase) -- Modified: `src/snmalloc/backend_helpers/backend_arena.h` — +- Modified: `src/snmalloc/backend_helpers/arena.h` — representation-agnostic: delete private `WordRef`, `TreeRep`, and all bit-layout constants (`RED_BIT`/`VARIANT_MASK`/`META_MASK`/ reserved); use `Rep::BinRep` and `Rep::RangeRep` directly; `can_consolidate` check in `add_block`; invariant clauses updated. -- New: `src/snmalloc/backend_helpers/backend_arena_range.h` — - `PagemapRep` + `BackendArenaRange`. +- New: `src/snmalloc/backend_helpers/largearenarange.h` — + `PagemapRep` + `LargeArenaRange`. - Modified: `src/snmalloc/backend_helpers/backend_helpers.h` — include - `backend_arena_range.h`. + `largearenarange.h`. - Modified: `src/test/func/backend_arena/backend_arena.cc` — define `BackendArenaWordRef` test helper at top of file; MockRep updated (`BackendArenaWordRef` returns, `can_consolidate`); boundary tests. @@ -1586,7 +1586,7 @@ backend to use BackendArenaRange."* ## Key design decisions -1. **Representation-agnostic data structure** — `BackendArena` +1. 
**Representation-agnostic data structure** — `Arena` carries no bit-layout constants. All red/variant packing decisions live in the user-supplied `Rep::BinRep` / `Rep::RangeRep`, matching how `BuddyChunkRep` and `BuddyInplaceRep` each own their own @@ -1596,13 +1596,13 @@ backend to use BackendArenaRange."* 2. **PagemapRep variant in bin-tree Word::One** — PagemapRep packs the variant tag at bits 9–10 of Word::One alongside the red bit (bit 8) and child pointer (bits ≥ MIN_CHUNK_BITS). These are - private constants inside PagemapRep, not exposed by BackendArena. + private constants inside PagemapRep, not exposed by Arena. 3. **Large size stored shifted** — PagemapRep stores the chunk count as `count << 8` to avoid the pagemap's reserved low byte; recovered via `>> 8`. Guarded by `static_assert((MAX_SIZE_BITS - MIN_CHUNK_BITS) + 8 <= bits::BITS)`. -4. **Boundary checks in BackendArena** — not in BackendArenaRange. +4. **Boundary checks in Arena** — not in LargeArenaRange. Consolidation decisions happen inside `add_block`, so the boundary check must be there. The Rep concept cleanly abstracts this via `can_consolidate`. @@ -1614,13 +1614,13 @@ backend to use BackendArenaRange."* 6. **PagemapRep auto-claims entries** — `get_backend_word` calls `claim_for_backend()` on first access. No explicit ownership - management needed in BackendArena or BackendArenaRange. + management needed in Arena or LargeArenaRange. 7. **Overflow forwarding** — `add_block` overflow may produce non- power-of-two sizes (consolidated blocks from multiple PAL allocs). `dealloc_overflow` forwards the overflow directly to the parent's `dealloc_range`; no power-of-two decomposition is needed because - `BackendArenaRange` (which is what replaces `LargeBuddyRange` in + `LargeArenaRange` (which is what replaces `LargeBuddyRange` in the pipeline) accepts any chunk-multiple size. 8. 
**`BackendArenaWordRef` lives in the test file** — the in-tree @@ -1644,7 +1644,7 @@ backend to use BackendArenaRange."* - Overflow handling: `add_block` can return non-power-of-two sizes when blocks from multiple PAL allocations consolidate. `dealloc_overflow` forwards the overflow directly to the parent — no decomposition is - required because `BackendArenaRange` itself accepts arbitrary + required because `LargeArenaRange` itself accepts arbitrary chunk-multiple sizes and replaces `LargeBuddyRange` in the pipeline. (Rubber-duck finding #2 superseded by Option B refactor.) - Handle visibility / layering: original plan promoted bit-layout @@ -1652,7 +1652,7 @@ backend to use BackendArenaRange."* the in-tree header and tests could share them. Subsequent review observed that this broke the Buddy/`BuddyChunkRep`/`BuddyInplaceRep` layering: the data structure should be representation-agnostic. - Resolved by making `BackendArena` carry no bit-layout state and + Resolved by making `Arena` carry no bit-layout state and requiring `Rep::BinRep` / `Rep::RangeRep` to own all packing decisions. `PagemapRep` keeps its layout private; the test `BackendArenaWordRef` lives in the test file alongside MockRep. @@ -1671,18 +1671,18 @@ backend to use BackendArenaRange."* --- -## Files added / changed (BackendArena phase, completed) +## Files added / changed (Arena phase, completed) -- New: `src/snmalloc/backend_helpers/backend_arena_bins.h` — +- New: `src/snmalloc/backend_helpers/arenabins.h` — `range_t`, `carve_t`, `carve`, `max_supported_chunks`, and nested `Bitmap` with `add` / `find_for_request` / `clear` (public surface); the size-class encoding (`bitmap_info_t`, `carve_info_t`, constexpr `BinTable`, `bitmap_info_for_request` / `carve_info_for_request`, `bin_index`) is private and reachable via - `BackendArenaBinsTestAccess` (forward-declared in the header, + `ArenaBinsTestAccess` (forward-declared in the header, defined in the test cc) for unit tests. 
Templated on `INTERMEDIATE_BITS` for testability. -- New: `src/snmalloc/backend_helpers/backend_arena.h` — the data structure, +- New: `src/snmalloc/backend_helpers/arena.h` — the data structure, templated on a `BackendArenaRep` concept exposing variant-tag and node/size accessors (no pagemap-probing API). - New: `src/test/func/backend_arena_bins/backend_arena_bins.cc` — bin @@ -1704,18 +1704,18 @@ No in-tree code path is changed in this phase: the existing - One Bin tree per IDEA servable-set bin (not per size class or per exponent). -- Scope is the BackendArena data structure + tests only. +- Scope is the Arena data structure + tests only. - The pagemap encoding carries a 2-bit **variant tag** (`Min` / `TwoMin` / `Large`) on the first entry of each free block. Tree membership — not the tag — is the source of truth for "is this block free?". No transient `BackendOwned` / "claimed" tag is required. - **No pagemap probing.** All adjacency lookups are restricted to this - `BackendArena`'s own RBTrees: non-min neighbours come from a single + `Arena`'s own RBTrees: non-min neighbours come from a single `Range.neighbours(addr_A)` walk that returns both `(largest < addr_A, smallest > addr_A)`; min-size neighbours come from `MinSizeBin.find(addr_A ± MIN_CHUNK_SIZE)`. The pagemap is never read at speculative addresses (concurrency hazard and no defined contract - for pagemap entries the BackendArena does not own). + for pagemap entries the Arena does not own). - Free blocks may have **arbitrary chunk counts**, not just exact size-class sizes — carving produces non-class remainders. `bin_index` operates on `(addr_chunks, size_chunks)` pairs; `Large` blocks store @@ -1728,38 +1728,38 @@ No in-tree code path is changed in this phase: the existing - `add_block` returns `{0, 0}` on success; on overflow it returns the unabsorbed range, mirroring `Buddy::add_block`'s overflow-return contract. 
Oversize inputs (`size_chunks >= 2^(MAX_SIZE_BITS - MIN_CHUNK_BITS)`) bypass - `BackendArena` entirely — the wrapping `BackendArenaRange` layer + `Arena` entirely — the wrapping `LargeArenaRange` layer handles them before calling `add_block`, and `add_block` asserts `size_chunks < 2^(MAX_SIZE_BITS - MIN_CHUNK_BITS)`. The only overflow case is consolidation growing a coalesced block to exactly `2^(MAX_SIZE_BITS - MIN_CHUNK_BITS)` (the consolidated range is returned, neighbours - having been removed first). The future `BackendArenaRange` wrapper is - responsible for handling overflow; the standalone `BackendArena` only + having been removed first). The future `LargeArenaRange` wrapper is + responsible for handling overflow; the standalone `Arena` only exposes the contract. - `BackendArenaRep` is a chunk-keyed accessor concept (variant tag plus - word/size accessors for entries 1–3). `BackendArena` builds two + word/size accessors for entries 1–3). `Arena` builds two internal `RBTree`-Rep adapters (`BinRep`, `RangeRep`) over it; user code never sees the adapter shape. - Backend chunk size classes are a new chunk-unit size-class scheme in - `backend_arena_bins.h` (not bytes), independent of the + `arenabins.h` (not bytes), independent of the power-of-two-only large variant of front-end `sizeclass_t`, with low-exponent special cases handled in the spirit of `bits::from_exp_mant`. -- `BackendArena` uses byte-size +- `Arena` uses byte-size exponent bounds with **exclusive max** semantics, matching the existing `Buddy<..., MIN, MAX>`. - Multi-`B` testing is via a templated bin-table generator in a single test binary, not via separate CMake configurations. - Phase 5 verifies the reuse optimisation via Range-tree insert/remove - *call counters* at the `BackendArena` layer (no `RBTree` modification). + *call counters* at the `Arena` layer (no `RBTree` modification). 
## Still open (resolve during implementation) - ~~Exact bit positions in the first-word pagemap encoding for the variant-tag field.~~ **Resolved** (Phase 3+4): bits 9–10 encode - `BackendArenaVariant` (`VARIANT_MASK = 0x600`); bit 8 is `RED_BIT`; + `ArenaVariant` (`VARIANT_MASK = 0x600`); bit 8 is `RED_BIT`; bits 0–7 are `BACKEND_RESERVED_MASK`. Documented in - `backend_arena.h`. + `arena.h`. - ~~Whether Bin tree roots are stored flat (`Array`) or exponent-keyed.~~ **Resolved** (Phase 3+4): flat `stl::Array`. @@ -1771,25 +1771,25 @@ No in-tree code path is changed in this phase: the existing --- -# Phase 12: Update backend to use BackendArenaRange +# Phase 12: Update backend to use LargeArenaRange ## Status: implementation complete, awaiting commit approval Substitution implemented and tested in the working tree (uncommitted on -top of `9c1ca745`). `BackendArena::add_block` had a latent +top of `9c1ca745`). `Arena::add_block` had a latent out-of-region pagemap-probe bug in its successor-min branch that -became reachable once `BackendArenaRange` started serving fixed-region +became reachable once `LargeArenaRange` started serving fixed-region allocations; fixed in this phase (see "Issue found during Phase 12 test run" below). Full ctest suite passes (86/86). Diff: 6 files, 183/45 +/- (PLAN.md, both pipeline range headers, -`backend_arena.h`, `backend_arena_bins.h`, `backend_arena.cc`). +`arena.h`, `arenabins.h`, `backend_arena.cc`). ## Goal Replace every `LargeBuddyRange` instantiation in the range -pipelines with `BackendArenaRange`. After this phase, snmalloc uses -the BackendArena bin-tree allocator instead of the power-of-two buddy +pipelines with `LargeArenaRange`. After this phase, snmalloc uses +the Arena bin-tree allocator instead of the power-of-two buddy for all large-range management. The `LargeBuddyRange` and `BuddyChunkRep` classes are **not deleted** — they remain available for alternative configurations and external embedders. 
Only the @@ -1798,17 +1798,17 @@ default pipeline wiring changes. ## Scope - Modify `standard_range.h` — replace all `LargeBuddyRange` with - `BackendArenaRange` (same template parameters). + `LargeArenaRange` (same template parameters). - Modify `meta_protected_range.h` — replace all `LargeBuddyRange` - with `BackendArenaRange` (same template parameters). -- **No other source files change.** `BackendArenaRange` is already a + with `LargeArenaRange` (same template parameters). +- **No other source files change.** `LargeArenaRange` is already a drop-in replacement: same template signature, same `Type` shape, same `alloc_range`/`dealloc_range` API, same `Aligned`, `ConcurrencySafe`, and `ChunkBounds` constants. ## Pre-conditions -- Phase 10 (BackendArenaRange) is committed and all its tests pass +- Phase 10 (LargeArenaRange) is committed and all its tests pass (commit `9c1ca745`). - Phase 11 (final review of Phases 9–10) was waived by the user; Phase 12 proceeds without it. @@ -1823,15 +1823,15 @@ default pipeline wiring changes. ```cpp LargeBuddyRange ``` -→ `BackendArenaRange` +→ `LargeArenaRange` - `MAX_SIZE_BITS = bits::BITS - 1` → global-range mode (no parent - dealloc). `BackendArenaRange` handles this identically. + dealloc). `LargeArenaRange` handles this identically. - `MIN_REFILL_SIZE_BITS = MinSizeBits` (Windows: 16, otherwise PAL- - dependent). `BackendArenaRange` passes this through. + dependent). `LargeArenaRange` passes this through. - Parent is `Base` (PalRange + PagemapRegisterRange chain). Parent is **unaligned** on PALs without `AlignedAllocation` (e.g. Linux mmap) - and aligned otherwise. `BackendArenaRange::refill` currently still + and aligned otherwise. `LargeArenaRange::refill` currently still carries the aligned/unaligned dual path inherited from `LargeBuddyRange`; collapsing this into a single path is deferred to Phase 13. 
@@ -1840,13 +1840,13 @@ LargeBuddyRange ```cpp LargeBuddyRange ``` -→ `BackendArenaRange` +→ `LargeArenaRange` - `MAX_SIZE_BITS = LocalCacheSizeBits = 21` (2 MiB). Non-global mode. Overflow goes to parent. -- `BackendArenaRange::parent_dealloc` forwards directly to parent +- `LargeArenaRange::parent_dealloc` forwards directly to parent without decomposition (single block returned by - `BackendArena::add_block` when consolidation reaches the arena-scale + `Arena::add_block` when consolidation reaches the arena-scale upper bound). The size is a chunk multiple up to `2^MAX_SIZE_BITS`, not necessarily power-of-two — the parent must accept arbitrary chunk-multiple sizes. @@ -1860,7 +1860,7 @@ LargeBuddyRange ```cpp LargeBuddyRange ``` -→ `BackendArenaRange` +→ `LargeArenaRange` - `MIN_REFILL_SIZE_BITS = 0` (default). Global-range mode. @@ -1868,7 +1868,7 @@ LargeBuddyRange ```cpp LargeBuddyRange ``` -→ `BackendArenaRange` +→ `LargeArenaRange` - Global-range mode. @@ -1881,7 +1881,7 @@ stl::conditional_t< Pagemap, page_size_bits>, NopRange> ``` -→ Replace `LargeBuddyRange` with `BackendArenaRange` inside the +→ Replace `LargeBuddyRange` with `LargeArenaRange` inside the `conditional_t`. - This is a small local cache for huge-page consolidation. @@ -1895,7 +1895,7 @@ stl::conditional_t< ```cpp LargeBuddyRange ``` -→ `BackendArenaRange` +→ `LargeArenaRange` - Same shape as standard_range.h #2. @@ -1903,7 +1903,7 @@ LargeBuddyRange ```cpp LargeBuddyRange ``` -→ `BackendArenaRange` +→ `LargeArenaRange` - `REFILL_SIZE_BITS = 21 - 6 = 15`. Global-range mode. `MIN_REFILL_SIZE_BITS = 0`. @@ -1911,10 +1911,10 @@ LargeBuddyRange ## Implementation The change is a mechanical text substitution — replace the string -`LargeBuddyRange` with `BackendArenaRange` in both files. No +`LargeBuddyRange` with `LargeArenaRange` in both files. No template parameters, no API calls, no structural changes. 
-### Step 1: Replace LargeBuddyRange → BackendArenaRange +### Step 1: Replace LargeBuddyRange → LargeArenaRange In `src/snmalloc/backend/standard_range.h`: - 2 instantiations of `LargeBuddyRange<` (GlobalR, LargeObjectRange). @@ -1928,7 +1928,7 @@ In `src/snmalloc/backend/meta_protected_range.h`: Both files include `"../backend/backend.h"` which includes `"../backend_helpers/backend_helpers.h"` which already includes -`"backend_arena_range.h"`. **No new includes needed.** +`"largearenarange.h"`. **No new includes needed.** ### Step 3: Build and test @@ -1948,14 +1948,14 @@ test suite exercises the pipeline end-to-end. ### Issue found during Phase 12 test run: out-of-region pagemap probe `func-fixed_region_alloc-check` segfaulted in `PagemapRep::can_consolidate` -when `BackendArena::add_block` was called with a block whose +when `Arena::add_block` was called with a block whose `succ_addr = addr + size` sat one chunk past the registered pagemap range (the last 8 MiB of a 256 MiB FixedRange). The bug shape matches the `buddy.h:90-93` comment exactly: `can_consolidate` reads the pagemap entry at `succ_addr`, and that read is only safe once a tree-membership test has confirmed the address is in our region. -**Fix.** In `BackendArena::add_block`, the successor-min branch was +**Fix.** In `Arena::add_block`, the successor-min branch was reordered so the tree-membership check (`contains_min(succ_addr)`) short-circuits before the pagemap probe (`Rep::can_consolidate`). All other can_consolidate call sites already had their preconditions @@ -1979,7 +1979,7 @@ This unification also subsumed the previous `BoundaryMockRep` and its run on `Arena` and set `mock_store[mock_index(addr)].boundary = true` instead. Net −35 lines in `backend_arena.cc`. 
-A leftover `throw "..."` in `backend_arena_bins.h:807` (used as a +A leftover `throw "..."` in `arenabins.h:807` (used as a constexpr-failure trick in the `BinTable` constructor) caused a build failure in `-fno-exceptions` configurations during Phase 12. Replaced with `SNMALLOC_CHECK(false && "...")`, which is non-constexpr and @@ -2003,7 +2003,7 @@ Phase 12 ends after Step 3 with the test suite green. ## Investigated and dropped: Retire `ParentRange::Aligned` **Status: dropped on review.** Phase 13 was deferred from Phase 12 with -the intent of collapsing `BackendArenaRange::refill`'s two-path +the intent of collapsing `LargeArenaRange::refill`'s two-path conditional and (optionally) removing `ParentRange::Aligned` from the range concept. Closer inspection of the existing code found the conditional is load-bearing, not vestigial: @@ -2035,11 +2035,11 @@ conditional is load-bearing, not vestigial: pass-through ranges doesn't shrink — defeating the only structural-cleanup motivation. -The BackendArena refactor (Phases 1–12) ends with Phase 12. No Phase 13. +The Arena refactor (Phases 1–12) ends with Phase 12. No Phase 13. ## Risks -1. **BackendArenaRange behaviour differences.** The bin-tree allocator +1. **LargeArenaRange behaviour differences.** The bin-tree allocator returns blocks with different internal fragmentation characteristics than the power-of-two buddy. Functionally, the caller always gets at least the requested size (power-of-two), so correctness is @@ -2048,19 +2048,19 @@ The BackendArena refactor (Phases 1–12) ends with Phase 12. No Phase 13. 2. **Overflow behaviour.** `LargeBuddyRange::dealloc_overflow` returns a single block of exactly `1 << MAX_SIZE_BITS`. - `BackendArenaRange::parent_dealloc` forwards a single block of the + `LargeArenaRange::parent_dealloc` forwards a single block of the consolidated size directly to the parent. 
The size can be any chunk multiple up to `2^MAX_SIZE_BITS`, not just power-of-two, but - the parent (now itself a `BackendArenaRange` or pass-through layer) + the parent (now itself a `LargeArenaRange` or pass-through layer) accepts arbitrary chunk-multiple sizes. 3. **`FixedRangeConfig` uses `StandardLocalState`.** The fixed-region configuration pushes memory directly into `GlobalR.dealloc_range`. - This works with `BackendArenaRange` because `dealloc_range` has the + This works with `LargeArenaRange` because `dealloc_range` has the same signature and contract. -4. **Pagemap metadata footprint.** `BackendArenaRange` uses up to - three pagemap entries per free block (`backend_arena_range.h:12-17`) +4. **Pagemap metadata footprint.** `LargeArenaRange` uses up to + three pagemap entries per free block (`largearenarange.h:12-17`) — one at the base, one at `base + UNIT_SIZE`, one at `base + 2*UNIT_SIZE`. `LargeBuddyRange`'s `BuddyChunkRep` only touched the base entry. Pagemap registration covers every @@ -2071,14 +2071,14 @@ The BackendArena refactor (Phases 1–12) ends with Phase 12. No Phase 13. ## Resolved during plan review -- `backend_arena_range.h` was missing `#include "empty_range.h"` for +- `largearenarange.h` was missing `#include "empty_range.h"` for its `EmptyRange<>` default template parameter. Fixed pre-commit. (Rubber-duck finding #2.) - The `conditional_t` huge-page path in `meta_protected_range.h` may not be instantiated on default builds. CI tests multiple PAL configurations. Risk acknowledged but no custom build added — the conditional branch is structurally identical to other - `BackendArenaRange` uses and shares the same template. (Rubber-duck + `LargeArenaRange` uses and shares the same template. (Rubber-duck finding #1.) 
## Out of scope @@ -2303,11 +2303,11 @@ above the reserved range must shift up by Verified consumers of `BACKEND_RESERVED_MASK` / bits immediately above bit 7: -- `backend_arena_range.h:42-50`: `RED_BIT_POS = 8`, +- `largearenarange.h:42-50`: `RED_BIT_POS = 8`, `VARIANT_SHIFT = 9`, `LARGE_SIZE_SHIFT = 8`. Today these sit at bits 8/9-10/8. After Phase 13 they shift to bits 9/10-11/9. The `static_assert(Entry::is_backend_allowed_value(...))` at - `backend_arena_range.h:64-66` catches any miss at compile time. + `largearenarange.h:64-66` catches any miss at compile time. - `backend_helpers/largebuddyrange.h:40-46`: `BuddyChunkRep` `RED_BIT = 1 << 8`. Same shift required. (The plan previously said "`backend_helpers/buddy.h`" — corrected. Grep `RED_BIT` to @@ -2400,7 +2400,7 @@ auto-propagates. to use `meta.slab_mask`. The metadata table builder sets `slab_mask = info.align - 1` for large (where `info.align = size & (~size + 1)`, the natural alignment from - `backend_arena_bins.h:741`). For pow2 sizes, `info.align == size`, + `arenabins.h:741`). For pow2 sizes, `info.align == size`, so `slab_mask = size - 1` — matching today's value. For non-pow2 sizes (table-populated but unreachable in Phase 13), `slab_mask = info.align - 1 < size - 1`. Phase 14 adds the @@ -2429,7 +2429,7 @@ auto-propagates. /** * Bit position of the first bit available to backend metadata * layouts above the reserved region. Used by - * `backend_arena_range.h` and `largebuddyrange.h` to derive + * `largearenarange.h` and `largebuddyrange.h` to derive * RED_BIT_POS, VARIANT_SHIFT, and LARGE_SIZE_SHIFT. */ static constexpr size_t BACKEND_LAYOUT_FIRST_FREE_BIT = @@ -2438,17 +2438,17 @@ auto-propagates. The `+1` reserves `REMOTE_BACKEND_MARKER`'s own bit (it lives at `next_pow2_bits_const(REMOTE_BACKEND_MARKER)`). 
-### `src/snmalloc/backend_helpers/backend_arena_range.h` and `src/snmalloc/backend_helpers/largebuddyrange.h` +### `src/snmalloc/backend_helpers/largearenarange.h` and `src/snmalloc/backend_helpers/largebuddyrange.h` - Replace hard-coded `RED_BIT_POS = 8`, `VARIANT_SHIFT = 9`, - `LARGE_SIZE_SHIFT = 8` in `backend_arena_range.h` with + `LARGE_SIZE_SHIFT = 8` in `largearenarange.h` with derivations from the new public `MetaEntryBase::BACKEND_LAYOUT_FIRST_FREE_BIT`: `RED_BIT_POS = MetaEntryBase::BACKEND_LAYOUT_FIRST_FREE_BIT;` `LARGE_SIZE_SHIFT = MetaEntryBase::BACKEND_LAYOUT_FIRST_FREE_BIT;` `VARIANT_SHIFT = MetaEntryBase::BACKEND_LAYOUT_FIRST_FREE_BIT + 1;` (the `+1` reserves the RED bit). -- `backend_arena_range.h:64-66` `static_assert` continues to enforce +- `largearenarange.h:64-66` `static_assert` continues to enforce no clash with reserved bits. - `largebuddyrange.h:40-46`: `BuddyChunkRep::RED_BIT = 1 << 8`. Replace with `1 << MetaEntryBase::BACKEND_LAYOUT_FIRST_FREE_BIT`. @@ -2536,12 +2536,12 @@ caller path skips the bound check. 1. **Build**: clean build of the default config passes. The `static_assert(Entry::is_backend_allowed_value(...))` checks at - `backend_arena_range.h:64-66` catch any bit-layout mismatch. + `largearenarange.h:64-66` catch any bit-layout mismatch. 2. **Full ctest suite**: all existing tests pass (no behaviour regression — front-end still issues pow2 large requests, so non-pow2 large sizeclasses exist in tables but are unreachable from the API). -3. **BackendArena unit tests** (`test_backend_arena`) continue to +3. **Arena unit tests** (`test_backend_arena`) continue to pass — they exercise the shifted RED/variant bits in the pagemap encoding. 4. **Extend `src/test/func/sizeclass/sizeclass.cc`** with a @@ -2580,7 +2580,7 @@ caller path skips the bound check. 1. **SIZECLASS_BITS widening cascades.** Caught by the existing `static_assert`s in `metadata.h:64-67` and - `backend_arena_range.h:64-66`. + `largearenarange.h:64-66`. 2. 
**Some embedder set REMOTE_MIN_ALIGN tighter than chain allows.** Would surface as a compile-error on the cacheline-vs-REP_SIZE max. Address only if it actually fires. @@ -2879,7 +2879,7 @@ expected ownership transition. `MetaEntryBase::BACKEND_LAYOUT_FIRST_FREE_BIT` is derived from `REMOTE_BACKEND_MARKER`; since the marker moves up by `OFFSET_BITS`, the backend's `RED_BIT`, `VARIANT_SHIFT`, `LARGE_SIZE_SHIFT` -(`backend_arena_range.h:50-67`) auto-shift up by the same amount. +(`largearenarange.h:50-67`) auto-shift up by the same amount. Verify the existing `static_assert((MAX_SIZE_BITS - MIN_SIZE_BITS) + LARGE_SIZE_SHIFT <= bits::BITS, ...)` still holds. For the default config: @@ -3119,7 +3119,7 @@ those positions become bits 12, 13, 14 — and bit 14 collides with the `MIN_CHUNK_BITS = 14` unit-address packing in the backend's buddy-tree pointer storage, tripping the `BIN_META_MASK < UNIT_SIZE` assertion in -`backend_arena_range.h:72`. +`largearenarange.h:72`. Changes: @@ -3136,7 +3136,7 @@ Changes: pass the mask into the `BackendStateWordRef` constructor and store it as a member; `get_backend_word(Word w)` selects the right mask at the call site. -- `backend_arena_range.h`: +- `largearenarange.h`: - Move `RED_BIT_POS` and `VARIANT_SHIFT` down to start at bit 1 (just above `META_BOUNDARY_BIT`). `RED_BIT_POS = 1`, `VARIANT_SHIFT = 2`. `BIN_META_MASK = (1<<1) | (3<<2) = 14`. @@ -3165,7 +3165,7 @@ NOT moved yet (still at SIZECLASS_REP_SIZE), so the layout change is invisible to allocation behaviour; only the relaxation of asserts and the lowered bit positions for RED/VARIANT/LARGE_SIZE_SHIFT differ. Run a focused build to -re-trigger the static_asserts in `backend_arena_range.h` and +re-trigger the static_asserts in `largearenarange.h` and confirm they all pass. ### Step 2: Marker move + ras encoding (no offset writers yet) @@ -3183,7 +3183,7 @@ Changes: above). If any check fails, fix before continuing. 
**Gate**: clean build (the size-budget `static_assert` in -`backend_arena_range.h` is the compile-time guard for the marker +`largearenarange.h` is the compile-time guard for the marker shift). All existing tests still pass — every `ras` write still encodes with `offset = 0` (the new default arg), so every combined value still equals `sc.raw()`. @@ -3302,7 +3302,7 @@ narrower `align_` rows (4-per-cache-line vs the baseline's 1. **Build**: clean build passes. The new `static_assert` in `sizeclasstable.h` (max large slab index < `1 << OFFSET_BITS`) guards the OFFSET_BITS choice. The size-budget assert in - `backend_arena_range.h` (`(MAX_SIZE_BITS - MIN_SIZE_BITS) + + `largearenarange.h` (`(MAX_SIZE_BITS - MIN_SIZE_BITS) + LARGE_SIZE_SHIFT <= bits::BITS`) guards the upward shift of backend bits. 2. **Full ctest suite**: all existing tests pass. Front-end still @@ -3411,7 +3411,7 @@ narrower `align_` rows (4-per-cache-line vs the baseline's message-passing. 2. **Backend bit budget.** `MAX_SIZE_BITS - MIN_SIZE_BITS + LARGE_SIZE_SHIFT <= bits::BITS` (the assert in - `backend_arena_range.h:68-70`). With `LARGE_SIZE_SHIFT` + `largearenarange.h:68-70`). With `LARGE_SIZE_SHIFT` auto-shifted up by `OFFSET_BITS`, default config goes from ~44 to ~47 bits used, still ≤ 64. The assert is the gate. 3. **Combined-index table size.** The combined-index `start_` table @@ -3588,7 +3588,7 @@ After this commit lands, Phase 15 begins on top of it. Flip the front-end so that large allocations request exactly the sizeclass-encoded size (chunk-multiple, exp+mantissa-rounded), instead of always the next power of two. 
This is the long-running -goal of the refactor: the backend (`BackendArenaRange`) has +goal of the refactor: the backend (`LargeArenaRange`) has supported arbitrary chunk-multiple sizes since Phase 10–12, the sizeclass encoding has supported non-pow2 large since Phase 13, and the per-chunk offset machinery has supported pointer recovery @@ -3636,9 +3636,9 @@ listed so reviewers can re-check): - The Phase 14 assert that `ras`'s offset bits are zero on entry to `alloc_chunk` continues to hold: front-end calls `PagemapEntry::encode(remote, sc)` with default `offset = 0`. -- `BackendArenaBins::carve` returns a base aligned to +- `ArenaBins::carve` returns a base aligned to `info.align = size & -size` (the largest pow2 divisor of size, - set in the bin-table ctor at `backend_arena_bins.h:742`). For a + set in the bin-table ctor at `arenabins.h:742`). For a 96 KiB request that is 32 KiB = `slab_size` = `sizeclass_full_to_slab_size(sc)` — exactly what `start_of_object`'s `addr & ~slab_mask` requires. diff --git a/docs/AddressSpace.md b/docs/AddressSpace.md index 1e28491ee..030023513 100644 --- a/docs/AddressSpace.md +++ b/docs/AddressSpace.md @@ -26,14 +26,14 @@ For simplicity, we gloss over much of the "lazy initialization" that would actua Because the two exercise similar bits of machinery, we now track them in parallel in prose despite their sequential nature. 4. The `BackendAllocator` has a chain of "range" types that it uses to manage address space. - By default (and in the case we are considering), that chain begins with a per-thread "small buddy allocator range". + By default (and in the case we are considering), that chain begins with a per-thread *small arena range*. 1. For the metadata allocation, the size is (well) below `MIN_CHUNK_SIZE` and so this allocator, which by supposition is empty, attempts to `refill` itself from its parent. This results in a request for a `MIN_CHUNK_SIZE` chunk from the parent allocator. 2. 
For the chunk allocation, the size is `MIN_CHUNK_SIZE` or larger, so this allocator immediately forwards the request to its parent. -5. The next range allocator in the chain is a per-thread *large* buddy allocator that refills in 2 MiB granules. +5. The next range allocator in the chain is a per-thread `LargeArenaRange` that refills in 2 MiB granules. (2 MiB chosen because it is a typical superpage size.) At this point, both requests are for at least one and no more than a few times `MIN_CHUNK_SIZE` bytes. @@ -48,7 +48,7 @@ For simplicity, we gloss over much of the "lazy initialization" that would actua 8. The next entry in the chain is a `StatsRange` which serves to accumulate statistics. We ignore this stage and continue onwards. -9. The next entry in the chain is another *large* buddy allocator which refills at 16 MiB but can hold regions +9. The next entry in the chain is another `LargeArenaRange` which refills at 16 MiB but can hold regions of any size up to the entire address space. The first request triggers a `refill`, continuing along the chain as a 16 MiB request. (Recall that the second allocation will be handled at an earlier point on the chain.) @@ -61,15 +61,15 @@ For simplicity, we gloss over much of the "lazy initialization" that would actua 12. Having wound the chain onto our stack, we now unwind! The `PagemapRegisterRange` ensures that the Pagemap entries for allocations passing through it are mapped and returns the allocation unaltered. -13. The global large buddy allocator splits the 16 MiB refill into 8, 4, and 2 MiB regions it retains as well as returning the remaining 2 MiB back along the chain. +13. The global `LargeArenaRange` carves the request out of its 16 MiB refill and keeps the unused remainder as a single free block in its internal red-black trees of free ranges, returning the carved portion back along the chain. 14. 
The `StatsRange` makes its observations, the `GlobalRange` now unlocks the global component of the chain, and the `CommitRange` ensures that the allocation is mapped. Aside from these side effects, these propagate the allocation along the chain unaltered. -15. We now arrive back at the thread-local large buddy allocator, which takes its 2 MiB refill and breaks it down into powers of two down to the requested `MIN_CHUNK_SIZE`. - The second allocation (of the chunk), will either return or again break down one of these intermediate chunks. +15. We now arrive back at the thread-local `LargeArenaRange`, which takes its 2 MiB refill and carves out the requested chunk(s); the unused remainder stays in its free-range trees. + The second allocation (of the chunk) will either be satisfied from this leftover or trigger another carve. -16. For the first (metadata) allocation, the thread-local *small* allocator breaks the `MIN_CHUNK_SIZE` allocation down into powers of two down to `PAGEMAP_METADATA_STRUCT_SIZE` and returns one of that size. +16. For the first (metadata) allocation, the thread-local *small arena range* takes its `MIN_CHUNK_SIZE` refill, hands back a sub-chunk fragment large enough for `PAGEMAP_METADATA_STRUCT_SIZE`, and tracks the remainder as free sub-chunk space using tree nodes stored inside the free fragments themselves. The second allocation will have been forwarded and so is not additionally handled here. Exciting, no? @@ -98,26 +98,19 @@ For chunks owned by the *frontend* (`REMOTE_BACKEND_MARKER` not asserted), 2. A bit (`META_BOUNDARY_BIT`) that serves to limit chunk coalescing on platforms where that may not be possible, such as CHERI. -See `src/backend/metatypes.h` and `src/mem/metaslab.h`. +See `src/snmalloc/mem/metadata.h`. For chunks owned by a *backend* (`REMOTE_BACKEND_MARKER` asserted), there are again multiple possibilities. -For chunks owned by a *small buddy allocator*, the remainder of the `MetaEntry` is zero. 
+For chunks owned by a *small arena range* (`SmallArenaRange`), the remainder of the `MetaEntry` is zero. That is, it appears to have small sizeclass 0 and an implausible `RemoteAllocator*`. +The free-fragment tree itself is stored in-band, inside the free space of the chunk, rather than in the pagemap (see `InplaceRep` in `src/snmalloc/backend_helpers/inplacerep.h`). -For chunks owned by a *large buddy allocator*, the `MetaEntry` is instead a node in a red-black tree of all such chunks. -Its contents can be decoded as follows: +For chunks owned by a `LargeArenaRange`, the `MetaEntry` is instead a node in the red-black trees of free ranges. +A free block of *N* units consumes the `MetaEntry`s of its first *min(N, 3)* unit-aligned addresses; their words encode the bin-tree node (unit 0), the range-tree node (unit 1, for blocks of two or more units), and the large-chunk count (unit 2, for blocks of three or more units). +The pagemap reserves the low `MetaEntryBase::BACKEND_LAYOUT_FIRST_FREE_BIT` bits of each word for the meta-entry layout itself; the tree-node encoding (left/right pointers, red bit, variant tag, large-size count) lives at or above that bit. -1. The `meta` field's `META_BOUNDARY_BIT` is preserved, with the same meaning as in the frontend case, above. - -2. `meta` (resp. `remote_and_sizeclass`) includes a pointer to the left (resp. right) *chunk* of address space. - (The corresponding child *node* in this tree is found by taking the *address* of this chunk and looking up the `MetaEntry` in the Pagemap. - This trick of pointing at the child's chunk rather than at the child `MetaEntry` is particularly useful on CHERI: - it allows us to capture the authority to the chunk without needing another pointer and costs just a shift and add.) - -3. The `meta` field's `LargeBuddyRep::RED_BIT` is used to carry the red/black color of this node. - -See `src/backend/largebuddyrange.h`. +See `PagemapRep` in `src/snmalloc/backend_helpers/largearenarange.h`. 
### Encoding a MetaEntry @@ -131,18 +124,20 @@ The following cases apply: * has "small" sizeclass 0, which has size 0. * has no associated metadata structure. -2. The address is part of a free chunk in a backend's Large Buddy Allocator: +2. The address is part of a free chunk in a backend `LargeArenaRange`: The `MetaEntry`... * has `REMOTE_BACKEND_MARKER` asserted in `remote_and_sizeclass`. * has "small" sizeclass 0, which has size 0. - * the remainder of its `MetaEntry` structure will be a Large Buddy Allocator rbtree node. + * the remainder of its `MetaEntry` structure (and those of the next one or two unit-aligned `MetaEntry`s if the free block spans them) carries the `Arena`'s red-black-tree node encoding. * has no associated metadata structure. -3. The address is part of a free chunk inside a backend's Small Buddy Allocator: +3. The address is part of a free fragment inside a backend `SmallArenaRange`: Here, the `MetaEntry` is zero aside from the asserted `REMOTE_BACKEND_MARKER` bit, and so it... * has "small" sizeclass 0, which has size 0. * has no associated metadata structure. + The tree of free sub-chunk fragments for this chunk is stored inside the free fragments themselves (`InplaceRep`), not in the pagemap. + 4. The address is part of a live large allocation (spanning one or more 16KiB chunks): Here, the `MetaEntry`... * has `REMOTE_BACKEND_MARKER` clear in `remote_and_sizeclass`. diff --git a/src/snmalloc/README.md b/src/snmalloc/README.md index 2549320fb..f598f8171 100644 --- a/src/snmalloc/README.md +++ b/src/snmalloc/README.md @@ -20,7 +20,7 @@ These are arranged in a hierarchy such that each of the directories may include - `mem/` provides the core allocator abstractions. The code here is templated over a back-end, which defines a particular embedding of snmalloc. - `backend_helpers/` provides helper classes for use in defining a back end. 
- This includes data structures such as pagemap implementations (efficient maps from a chunk address to associated metadata) and buddy allocators for managing address-space ranges. + This includes data structures such as pagemap implementations (efficient maps from a chunk address to associated metadata) and range allocators for managing address-space ranges. - `backend/` provides some example implementations for snmalloc embeddings that provide a global memory allocator for an address space. Users may ignore this entirely and use the types in `mem/` with a custom back end to expose an snmalloc instance with specific behaviour. Layers above this can be used with a custom configuration by defining `SNMALLOC_PROVIDE_OWN_CONFIG` and exporting a type as `snmalloc::Config` that defines the configuration. diff --git a/src/snmalloc/backend/fixedglobalconfig.h b/src/snmalloc/backend/fixedglobalconfig.h index 94c3c67f1..68b41f860 100644 --- a/src/snmalloc/backend/fixedglobalconfig.h +++ b/src/snmalloc/backend/fixedglobalconfig.h @@ -93,7 +93,7 @@ namespace snmalloc Pagemap::concretePagemap.init(base, length); // Make this a alloc_config constant. - if (length < MIN_HEAP_SIZE_FOR_THREAD_LOCAL_BUDDY) + if (length < MIN_HEAP_SIZE_FOR_THREAD_LOCAL_CACHE) { LocalState::set_small_heap(); } diff --git a/src/snmalloc/backend/meta_protected_range.h b/src/snmalloc/backend/meta_protected_range.h index 021f4750b..e11e0d3e4 100644 --- a/src/snmalloc/backend/meta_protected_range.h +++ b/src/snmalloc/backend/meta_protected_range.h @@ -33,7 +33,7 @@ namespace snmalloc // Global range of memory using GlobalR = Pipe< Base, - BackendArenaRange< + LargeArenaRange< GlobalCacheSizeBits, bits::BITS - 1, Pagemap, @@ -52,7 +52,7 @@ namespace snmalloc // would be able to corrupt meta-data. 
using CentralObjectRange = Pipe< GlobalR, - BackendArenaRange, + LargeArenaRange, LogRange<3>, GlobalRange, CommitRange, @@ -68,7 +68,7 @@ namespace snmalloc GlobalR, SubRange, // Use SubRange to introduce guard // pages. - BackendArenaRange< + LargeArenaRange< GlobalCacheSizeBits, bits::BITS - 1, Pagemap, @@ -78,7 +78,7 @@ namespace snmalloc // page, so commit in the global range. stl::conditional_t< (max_page_chunk_size_bits > MIN_CHUNK_BITS), - BackendArenaRange< + LargeArenaRange< max_page_chunk_size_bits, max_page_chunk_size_bits, Pagemap, @@ -91,7 +91,7 @@ namespace snmalloc // Local caching of object range using ObjectRange = Pipe< CentralObjectRange, - BackendArenaRange< + LargeArenaRange< LocalCacheSizeBits, LocalCacheSizeBits, Pagemap, @@ -101,7 +101,7 @@ namespace snmalloc // Local caching of meta-data range using MetaRange = Pipe< CentralMetaRange, - BackendArenaRange< + LargeArenaRange< LocalCacheSizeBits - SubRangeRatioBits, bits::BITS - 1, Pagemap>, diff --git a/src/snmalloc/backend/standard_range.h b/src/snmalloc/backend/standard_range.h index 7387ca3c9..706f6ab1e 100644 --- a/src/snmalloc/backend/standard_range.h +++ b/src/snmalloc/backend/standard_range.h @@ -30,7 +30,7 @@ namespace snmalloc // Global range of memory, expose this so can be filled by init. using GlobalR = Pipe< Base, - BackendArenaRange< + LargeArenaRange< GlobalCacheSizeBits, bits::BITS - 1, Pagemap, @@ -46,11 +46,11 @@ namespace snmalloc bits::next_pow2_bits_const(PAL::page_size); public: - // Source for object allocations and metadata - // Use buddy allocators to cache locally. + // Source for object allocations and metadata; thread-local cache + // for chunk-sized ranges. using LargeObjectRange = Pipe< Stats, - StaticConditionalRange #include namespace snmalloc { - struct BackendArenaTestAccess; + struct ArenaTestAccess; /** * Size encoding for a free block's first pagemap entry. 
@@ -21,7 +21,7 @@ namespace snmalloc * placed in a size-1 bin (cannot serve aligned size-2 requests). * Large: 3+ chunks; precise size stored in a separate entry. */ - enum class BackendArenaVariant : uint8_t + enum class ArenaVariant : uint8_t { Min = 0, EvenTwo = 1, @@ -45,7 +45,7 @@ namespace snmalloc * - `using RangeRep` — full RBTree Rep for the range tree, same * shape as `BinRep`. * - `get_variant(addr)` / `set_variant(addr, v)` — the - * `BackendArenaVariant` tag for the block starting at `addr`. + * `ArenaVariant` tag for the block starting at `addr`. * - `get_large_size(addr)` / `set_large_size(addr, size)` — * exact byte size for `Large` blocks (3+ units). * - `can_consolidate(higher_addr) -> bool` — whether the block at @@ -62,7 +62,7 @@ namespace snmalloc * returned to the caller. */ template - class BackendArena + class Arena { static_assert(MAX_SIZE_BITS > MIN_SIZE_BITS); static_assert(MAX_SIZE_BITS < bits::BITS); @@ -72,7 +72,7 @@ namespace snmalloc static constexpr size_t TWO_UNITS = size_t(2) << MIN_SIZE_BITS; static constexpr size_t B = 2; - using Bins = BackendArenaBins; + using Bins = ArenaBins; static_assert( bits::one_at_bit(MAX_SIZE_BITS) - 1 <= Bins::max_supported_size()); @@ -89,15 +89,15 @@ namespace snmalloc // ---- Metadata helpers ---- - static BackendArenaVariant variant_of(size_t size, uintptr_t addr) + static ArenaVariant variant_of(size_t size, uintptr_t addr) { if (size == UNIT_SIZE) - return BackendArenaVariant::Min; + return ArenaVariant::Min; if (size == TWO_UNITS) return ((addr >> MIN_SIZE_BITS) & 1) == 0 ? 
- BackendArenaVariant::EvenTwo : - BackendArenaVariant::OddTwo; - return BackendArenaVariant::Large; + ArenaVariant::EvenTwo : + ArenaVariant::OddTwo; + return ArenaVariant::Large; } static stl::Pair range_from_addr(uintptr_t a) @@ -107,12 +107,12 @@ namespace snmalloc auto v = Rep::get_variant(a); switch (v) { - case BackendArenaVariant::Min: + case ArenaVariant::Min: return {a, UNIT_SIZE}; - case BackendArenaVariant::EvenTwo: - case BackendArenaVariant::OddTwo: + case ArenaVariant::EvenTwo: + case ArenaVariant::OddTwo: return {a, TWO_UNITS}; - case BackendArenaVariant::Large: + case ArenaVariant::Large: { size_t s = Rep::get_large_size(a); SNMALLOC_ASSERT( @@ -129,7 +129,7 @@ namespace snmalloc { auto path = bin_trees[0].get_root_path(); return bin_trees[0].find(path, a) && - Rep::get_variant(a) == BackendArenaVariant::Min; + Rep::get_variant(a) == ArenaVariant::Min; } void insert_block(uintptr_t addr, size_t size) @@ -156,12 +156,12 @@ namespace snmalloc bitmap.clear(bin); } - friend struct BackendArenaTestAccess; + friend struct ArenaTestAccess; public: using addr_t = uintptr_t; - constexpr BackendArena() = default; + constexpr Arena() = default; /** * Add a free block at `addr` with `size` bytes. The block is @@ -179,7 +179,7 @@ namespace snmalloc // Unit alignment is required: callers feeding parent ranges (e.g. // mmap-backed PalRange returns page-aligned but not chunk-aligned // memory) must trim their input to UNIT_SIZE before reaching here. - // BackendArenaRange::add_range does this trim. + // LargeArenaRange::add_range does this trim. 
SNMALLOC_ASSERT((addr & (UNIT_SIZE - 1)) == 0); SNMALLOC_ASSERT(size > 0); SNMALLOC_ASSERT((size & (UNIT_SIZE - 1)) == 0); @@ -348,7 +348,7 @@ namespace snmalloc uintptr_t prev = 0; bool prev_valid = false; bin_trees[0].for_each([&](uintptr_t node) { - if (Rep::get_variant(node) != BackendArenaVariant::Min) + if (Rep::get_variant(node) != ArenaVariant::Min) return; if (prev_valid) SNMALLOC_CHECK( @@ -414,7 +414,7 @@ namespace snmalloc auto v = Rep::get_variant(node); auto [a, s] = range_from_addr(node); SNMALLOC_CHECK(v == variant_of(s, a)); - if (v == BackendArenaVariant::Large) + if (v == ArenaVariant::Large) SNMALLOC_CHECK(Rep::get_large_size(node) == s); }); } diff --git a/src/snmalloc/backend_helpers/backend_arena_bins.h b/src/snmalloc/backend_helpers/arenabins.h similarity index 98% rename from src/snmalloc/backend_helpers/backend_arena_bins.h rename to src/snmalloc/backend_helpers/arenabins.h index ae8e62d3e..07a572b45 100644 --- a/src/snmalloc/backend_helpers/backend_arena_bins.h +++ b/src/snmalloc/backend_helpers/arenabins.h @@ -8,11 +8,11 @@ namespace snmalloc { template - struct BackendArenaBinsTestAccess; + struct ArenaBinsTestAccess; /** * Size class enumeration and bin classification used by the - * BackendArena. + * Arena. * * Template parameter `B` (mantissa-bit width of snmalloc's * non-power-of-two size class scheme) determines the number of @@ -40,14 +40,14 @@ namespace snmalloc * `add` / `find_for_request` / `clear`. * * Everything else is private; tests reach it via - * `BackendArenaBinsTestAccess`. + * `ArenaBinsTestAccess`. 
*/ template - class BackendArenaBins + class ArenaBins { static_assert( INTERMEDIATE_BITS >= 1 && INTERMEDIATE_BITS <= 3, - "BackendArenaBins supports B in {1, 2, 3}"); + "ArenaBins supports B in {1, 2, 3}"); static_assert( MIN_SIZE_BITS + INTERMEDIATE_BITS < bits::BITS, "MIN_SIZE_BITS + INTERMEDIATE_BITS must leave room for at least one " @@ -73,7 +73,7 @@ namespace snmalloc }; private: - friend struct BackendArenaBinsTestAccess; + friend struct ArenaBinsTestAccess; static constexpr size_t B = INTERMEDIATE_BITS; @@ -309,7 +309,7 @@ namespace snmalloc */ class Bitmap { - friend struct BackendArenaBinsTestAccess< + friend struct ArenaBinsTestAccess< INTERMEDIATE_BITS, MIN_SIZE_BITS>; @@ -342,7 +342,7 @@ namespace snmalloc } /// Read-only test: is the bit for `bin_id` set? - /// Used by `BackendArena::invariant()`. + /// Used by `Arena::invariant()`. bool test(size_t bin_id) const { SNMALLOC_ASSERT(bin_id < TOTAL_BINS); @@ -680,7 +680,7 @@ namespace snmalloc * `carve_info[sc]` is the size/alignment record for each in-range * sc (consumed by `carve` and by `bin_offset_at`'s `fits` * predicate during free-side classification). - * `exp_first_sc[e]` is the first raw sc id at BackendArenaBins + * `exp_first_sc[e]` is the first raw sc id at ArenaBins * exponent e (with `exp_first_sc[bits::BITS] = MAX_SC` as a sentinel * so `[exp_first_sc[e], exp_first_sc[e + 1])` is a valid raw range * for every `e < bits::BITS`). 
diff --git a/src/snmalloc/backend_helpers/backend_helpers.h b/src/snmalloc/backend_helpers/backend_helpers.h index 10740b8ce..8a388171c 100644 --- a/src/snmalloc/backend_helpers/backend_helpers.h +++ b/src/snmalloc/backend_helpers/backend_helpers.h @@ -2,15 +2,13 @@ #include "../mem/mem.h" #include "authmap.h" -#include "backend_arena_range.h" -#include "buddy.h" +#include "largearenarange.h" #include "commitrange.h" #include "commonconfig.h" #include "defaultpagemapentry.h" #include "empty_range.h" #include "globalrange.h" #include "indirectrange.h" -#include "largebuddyrange.h" #include "logrange.h" #include "noprange.h" #include "pagemap.h" @@ -18,7 +16,6 @@ #include "palrange.h" #include "range_helpers.h" #include "smallarenarange.h" -#include "smallbuddyrange.h" #include "staticconditionalrange.h" #include "statsrange.h" #include "subrange.h" diff --git a/src/snmalloc/backend_helpers/buddy.h b/src/snmalloc/backend_helpers/buddy.h deleted file mode 100644 index 58cafacb1..000000000 --- a/src/snmalloc/backend_helpers/buddy.h +++ /dev/null @@ -1,199 +0,0 @@ -#pragma once - -#include "../ds/ds.h" - -namespace snmalloc -{ - /** - * Class representing a buddy allocator - * - * Underlying node `Rep` representation is passed in. - * - * The allocator can handle blocks between inclusive MIN_SIZE_BITS and - * exclusive MAX_SIZE_BITS. - */ - template - class Buddy - { - static_assert(MAX_SIZE_BITS > MIN_SIZE_BITS); - - struct Entry - { - typename Rep::Contents cache[3]; - RBTree tree{}; - }; - - stl::Array entries{}; - // All RBtrees at or above this index should be empty. 
- size_t empty_at_or_above{0}; - - size_t to_index(size_t size) - { - SNMALLOC_ASSERT(size != 0); - SNMALLOC_ASSERT(bits::is_pow2(size)); - auto log = snmalloc::bits::next_pow2_bits(size); - SNMALLOC_ASSERT_MSG( - log >= MIN_SIZE_BITS, "Size too big: {} log {}.", size, log); - SNMALLOC_ASSERT_MSG( - log < MAX_SIZE_BITS, "Size too small: {} log {}.", size, log); - - return log - MIN_SIZE_BITS; - } - - void validate_block(typename Rep::Contents addr, size_t size) - { - SNMALLOC_ASSERT(bits::is_pow2(size)); - SNMALLOC_ASSERT(addr == Rep::align_down(addr, size)); - UNUSED(addr, size); - } - - void invariant() - { -#ifndef NDEBUG - for (size_t i = empty_at_or_above; i < entries.size(); i++) - { - SNMALLOC_ASSERT(entries[i].tree.is_empty()); - // TODO check cache is empty - } -#endif - } - - bool remove_buddy(typename Rep::Contents addr, size_t size) - { - auto idx = to_index(size); - - // Empty at this range. - if (idx >= empty_at_or_above) - return false; - - auto buddy = Rep::buddy(addr, size); - - // Check local cache first - for (auto& e : entries[idx].cache) - { - if (Rep::equal(buddy, e)) - { - if (!Rep::can_consolidate(addr, size)) - return false; - - e = entries[idx].tree.remove_min(); - return true; - } - } - - auto path = entries[idx].tree.get_root_path(); - bool contains_buddy = entries[idx].tree.find(path, buddy); - - if (!contains_buddy) - return false; - - // Only check if we can consolidate after we know the buddy is in - // the buddy allocator. This is required to prevent possible segfaults - // from looking at the buddies meta-data, which we only know exists - // once we have found it in the red-black tree. - if (!Rep::can_consolidate(addr, size)) - return false; - - entries[idx].tree.remove_path(path); - return true; - } - - public: - constexpr Buddy() = default; - - /** - * Add a block to the buddy allocator. - * - * Blocks needs to be power of two size and aligned to the same power of - * two. - * - * Returns null, if the block is successfully added. 
Otherwise, returns the - * consolidated block that is MAX_SIZE_BITS big, and hence too large for - * this allocator. - */ - typename Rep::Contents add_block(typename Rep::Contents addr, size_t size) - { - validate_block(addr, size); - - if (remove_buddy(addr, size)) - { - // Add to next level cache - size *= 2; - addr = Rep::align_down(addr, size); - if (size == bits::one_at_bit(MAX_SIZE_BITS)) - { - // Invariant should be checked on all non-tail return paths. - // Holds trivially here with current design. - invariant(); - // Too big for this buddy allocator. - return addr; - } - return add_block(addr, size); - } - - auto idx = to_index(size); - empty_at_or_above = bits::max(empty_at_or_above, idx + 1); - - for (auto& e : entries[idx].cache) - { - if (Rep::equal(Rep::null, e)) - { - e = addr; - return Rep::null; - } - } - - auto path = entries[idx].tree.get_root_path(); - entries[idx].tree.find(path, addr); - entries[idx].tree.insert_path(path, addr); - invariant(); - return Rep::null; - } - - /** - * Removes a block of size from the buddy allocator. - * - * Return Rep::null if this cannot be satisfied. 
- */ - typename Rep::Contents remove_block(size_t size) - { - invariant(); - auto idx = to_index(size); - if (idx >= empty_at_or_above) - return Rep::null; - - auto addr = entries[idx].tree.remove_min(); - for (auto& e : entries[idx].cache) - { - if (Rep::equal(Rep::null, addr) || Rep::compare(e, addr)) - { - addr = stl::exchange(e, addr); - } - } - - if (addr != Rep::null) - { - validate_block(addr, size); - return addr; - } - - if (size * 2 == bits::one_at_bit(MAX_SIZE_BITS)) - // Too big for this buddy allocator - return Rep::null; - - auto bigger = remove_block(size * 2); - if (bigger == Rep::null) - { - empty_at_or_above = idx; - invariant(); - return Rep::null; - } - - auto second = Rep::offset(bigger, size); - - // Split large block - add_block(second, size); - return bigger; - } - }; -} // namespace snmalloc diff --git a/src/snmalloc/backend_helpers/inplacerep.h b/src/snmalloc/backend_helpers/inplacerep.h index 95275fb72..ab0ee9709 100644 --- a/src/snmalloc/backend_helpers/inplacerep.h +++ b/src/snmalloc/backend_helpers/inplacerep.h @@ -3,7 +3,7 @@ #include "../ds_core/bits.h" #include "../ds_core/defines.h" #include "../ds_core/sizeclassconfig.h" -#include "backend_arena.h" +#include "arena.h" #include @@ -11,7 +11,7 @@ namespace snmalloc { /** * In-band tree node stored at the head of a free block managed by - * `BackendArena`. Two pointer-sized words per unit; bit-packing of + * `Arena`. Two pointer-sized words per unit; bit-packing of * red and variant tags lives in `word_one`. Stored as `uintptr_t` * so we can OR meta bits into the pointer slot without UB on * non-capability platforms (on CHERI, capabilities to access these @@ -25,7 +25,7 @@ namespace snmalloc }; /** - * In-band `Rep` for `BackendArena`. Each free block carries its + * In-band `Rep` for `Arena`. Each free block carries its * own tree-node and metadata storage in its first few units: * * Unit 0 (addr): bin-tree node + variant tag. 
@@ -34,7 +34,7 @@ namespace snmalloc * * Bit layout in `word_one` of each unit: * bit 0 : red bit (both trees) - * bits 1..2 : variant tag (`BackendArenaVariant`, unit 0 only) + * bits 1..2 : variant tag (`ArenaVariant`, unit 0 only) * `word_two` holds the second child pointer with no packed meta. * Both child pointers are unit-aligned, so their low `MIN_BITS` * bits are zero — the packed meta occupies bits below @@ -231,14 +231,14 @@ namespace snmalloc using BinRep = TreeRep<0, BIN_META_MASK, BIN_REP_NAME>; using RangeRep = TreeRep<1, RANGE_META_MASK, RANGE_REP_NAME>; - static BackendArenaVariant get_variant(uintptr_t addr) + static ArenaVariant get_variant(uintptr_t addr) { auto w = unit_at<0>(addr)->word_one; - return static_cast( + return static_cast( (w & VARIANT_MASK) >> VARIANT_SHIFT); } - static void set_variant(uintptr_t addr, BackendArenaVariant v) + static void set_variant(uintptr_t addr, ArenaVariant v) { auto* w = &unit_at<0>(addr)->word_one; *w = (*w & ~VARIANT_MASK) | (static_cast(v) << VARIANT_SHIFT); @@ -265,7 +265,7 @@ namespace snmalloc * Refuse consolidation across `MIN_CHUNK_SIZE` boundaries. * `SmallArenaRange::add_range_impl` splits incoming ranges at * chunk boundaries, but does not eagerly merge across them on - * the wrapper side; this check is what stops `BackendArena` + * the wrapper side; this check is what stops `Arena` * from later merging two adjacent intra-chunk fragments that * happen to abut the same chunk boundary, which would create a * free block straddling chunks. 
Chunk-aligned `higher_addr` diff --git a/src/snmalloc/backend_helpers/backend_arena_range.h b/src/snmalloc/backend_helpers/largearenarange.h similarity index 95% rename from src/snmalloc/backend_helpers/backend_arena_range.h rename to src/snmalloc/backend_helpers/largearenarange.h index 89e6f2c60..f53643bf4 100644 --- a/src/snmalloc/backend_helpers/backend_arena_range.h +++ b/src/snmalloc/backend_helpers/largearenarange.h @@ -1,13 +1,13 @@ #pragma once -#include "backend_arena.h" +#include "arena.h" #include "empty_range.h" #include "range_helpers.h" namespace snmalloc { /** - * PagemapRep — Rep for `BackendArena` over a Pagemap. + * PagemapRep — Rep for `Arena` over a Pagemap. * * Each free block uses three pagemap entries at unit-aligned offsets: * @@ -176,14 +176,14 @@ namespace snmalloc using BinRep = TreeRep<0, BIN_META_MASK, BIN_REP_NAME>; using RangeRep = TreeRep<1, RANGE_META_MASK, RANGE_REP_NAME>; - static BackendArenaVariant get_variant(uintptr_t addr) + static ArenaVariant get_variant(uintptr_t addr) { auto w = word_at<0>(addr, Word::One); - return static_cast( + return static_cast( (w.get() & VARIANT_MASK) >> VARIANT_SHIFT); } - static void set_variant(uintptr_t addr, BackendArenaVariant v) + static void set_variant(uintptr_t addr, ArenaVariant v) { auto w = word_at<0>(addr, Word::One); w = (w.get() & ~VARIANT_MASK) | @@ -214,15 +214,15 @@ namespace snmalloc }; /** - * Range wrapper around BackendArena. Drop-in replacement for - * LargeBuddyRange in Pipe<...> compositions. + * Range wrapper around Arena, presenting the standard + * Range interface for use in Pipe<...> compositions. 
*/ template< size_t REFILL_SIZE_BITS, size_t MAX_SIZE_BITS, SNMALLOC_CONCEPT(IsWritablePagemap) Pagemap, size_t MIN_REFILL_SIZE_BITS = 0> - class BackendArenaRange + class LargeArenaRange { static_assert( REFILL_SIZE_BITS <= MAX_SIZE_BITS, "REFILL_SIZE_BITS > MAX_SIZE_BITS"); @@ -242,7 +242,7 @@ namespace snmalloc using PagemapRepT = PagemapRep; - BackendArena arena; + Arena arena; size_t requested_total = 0; void parent_dealloc(uintptr_t addr, size_t size) @@ -336,7 +336,7 @@ namespace snmalloc /** * `size` exceeds the arena's representable range and must be * routed to the parent (or refused if no parent exists). Matches - * `BackendArena::add_block`'s `size < bits::one_at_bit(MAX_SIZE_BITS)` + * `Arena::add_block`'s `size < bits::one_at_bit(MAX_SIZE_BITS)` * precondition exactly, so alloc and dealloc bypass on the same * boundary. */ diff --git a/src/snmalloc/backend_helpers/largebuddyrange.h b/src/snmalloc/backend_helpers/largebuddyrange.h deleted file mode 100644 index 3eb5f5c21..000000000 --- a/src/snmalloc/backend_helpers/largebuddyrange.h +++ /dev/null @@ -1,397 +0,0 @@ -#pragma once - -#include "../ds/ds.h" -#include "../mem/mem.h" -#include "buddy.h" -#include "empty_range.h" -#include "range_helpers.h" - -namespace snmalloc -{ - /** - * Class for using the pagemap entries for the buddy allocator. - */ - template - class BuddyChunkRep - { - public: - /* - * The values we store in our rbtree are the addresses of (combined spans - * of) chunks of the address space; as such, bits in (MIN_CHUNK_SIZE - 1) - * are unused and so the RED_BIT is packed therein. However, in practice, - * these are not "just any" uintptr_t-s, but specifically the uintptr_t-s - * inside the Pagemap's BackendAllocator::Entry structures. - * - * The BackendAllocator::Entry provides us with helpers that guarantee that - * we use only the bits that we are allowed to. 
- * @{ - */ - using Handle = MetaEntryBase::BackendStateWordRef; - using Contents = uintptr_t; - ///@} - - /** - * The bit that we will use to mark an entry as red. - * This has constraints in two directions, it must not be one of the - * reserved bits from the perspective of the meta entry and it must not be - * a bit that is a valid part of the address of a chunk. - * @{ - */ - static constexpr address_t RED_BIT = address_t(1) - << MetaEntryBase::BACKEND_LAYOUT_FIRST_FREE_BIT; - - static_assert(RED_BIT < MIN_CHUNK_SIZE); - static_assert(MetaEntryBase::is_backend_allowed_value( - MetaEntryBase::Word::One, RED_BIT)); - static_assert(MetaEntryBase::is_backend_allowed_value( - MetaEntryBase::Word::Two, RED_BIT)); - ///@} - - /// The value of a null node, as returned by `get` - static constexpr Contents null = 0; - /// The value of a null node, as stored in a `uintptr_t`. - static constexpr Contents root = 0; - - /** - * Set the value. Preserve the red/black colour. - */ - static void set(Handle ptr, Contents r) - { - ptr = r | (static_cast(ptr.get()) & RED_BIT); - } - - /** - * Returns the value, stripping out the red/black colour. - */ - static Contents get(const Handle ptr) - { - return ptr.get() & ~RED_BIT; - } - - /** - * Returns a pointer to the tree node for the specified address. - */ - static Handle ref(bool direction, Contents k) - { - // Special case for accessing the null entry. We want to make sure - // that this is never modified by the back end, so we make it point to - // a constant entry and use the MMU to trap even in release modes. - // The mask passed to the handle is irrelevant: the null entry is - // never written (any attempt would trap), and on read its underlying - // value is zero so `get()` returns zero regardless of the mask. 
- static const Contents null_entry = 0; - if (SNMALLOC_UNLIKELY(address_cast(k) == 0)) - { - return {const_cast(&null_entry), 0}; - } - auto& entry = Pagemap::template get_metaentry_mut(address_cast(k)); - if (direction) - return entry.get_backend_word(Pagemap::Entry::Word::One); - - return entry.get_backend_word(Pagemap::Entry::Word::Two); - } - - static bool is_red(Contents k) - { - return (ref(true, k).get() & RED_BIT) == RED_BIT; - } - - static void set_red(Contents k, bool new_is_red) - { - if (new_is_red != is_red(k)) - { - auto v = ref(true, k); - v = v.get() ^ RED_BIT; - } - SNMALLOC_ASSERT(is_red(k) == new_is_red); - } - - static Contents offset(Contents k, size_t size) - { - return k + size; - } - - static Contents buddy(Contents k, size_t size) - { - return k ^ size; - } - - static Contents align_down(Contents k, size_t size) - { - return k & ~(size - 1); - } - - static bool compare(Contents k1, Contents k2) - { - return k1 > k2; - } - - static bool equal(Contents k1, Contents k2) - { - return k1 == k2; - } - - static uintptr_t printable(Contents k) - { - return k; - } - - /** - * Convert the pointer wrapper into something that the snmalloc debug - * printing code can print. - */ - static address_t printable(Handle k) - { - return k.printable_address(); - } - - /** - * Returns the name for use in debugging traces. Not used in normal builds - * (release or debug), only when tracing is enabled. - */ - static const char* name() - { - return "BuddyChunkRep"; - } - - static bool can_consolidate(Contents k, size_t size) - { - // Need to know both entries exist in the pagemap. - // This must only be called if that has already been - // ascertained. - // The buddy could be in a part of the pagemap that has - // not been registered and thus could segfault on access. 
- auto larger = bits::max(k, buddy(k, size)); - auto& entry = - Pagemap::template get_metaentry_mut(address_cast(larger)); - return !entry.is_boundary(); - } - }; - - /** - * Used to represent a consolidating range of memory. Uses a buddy allocator - * to consolidate adjacent blocks. - * - * ParentRange - Represents the range to get memory from to fill this range. - * - * REFILL_SIZE_BITS - Maximum size of a refill, may ask for less during warm - * up phase. - * - * MAX_SIZE_BITS - Maximum size that this range will store. - * - * Pagemap - How to access the pagemap, which is used to store the red black - * tree nodes for the buddy allocators. - * - * MIN_REFILL_SIZE_BITS - The minimum size that the ParentRange can be asked - * for - */ - template< - size_t REFILL_SIZE_BITS, - size_t MAX_SIZE_BITS, - SNMALLOC_CONCEPT(IsWritablePagemap) Pagemap, - size_t MIN_REFILL_SIZE_BITS = 0> - class LargeBuddyRange - { - static_assert( - REFILL_SIZE_BITS <= MAX_SIZE_BITS, "REFILL_SIZE_BITS > MAX_SIZE_BITS"); - static_assert( - MIN_REFILL_SIZE_BITS <= REFILL_SIZE_BITS, - "MIN_REFILL_SIZE_BITS > REFILL_SIZE_BITS"); - - /** - * Maximum size of a refill - */ - static constexpr size_t REFILL_SIZE = bits::one_at_bit(REFILL_SIZE_BITS); - - /** - * Minimum size of a refill - */ - static constexpr size_t MIN_REFILL_SIZE = - bits::one_at_bit(MIN_REFILL_SIZE_BITS); - - public: - template> - class Type : public ContainsParent - { - using ContainsParent::parent; - - /** - * The size of memory requested so far. - * - * This is used to determine the refill size. - */ - size_t requested_total = 0; - - /** - * Buddy allocator used to represent this range of memory. - */ - Buddy, MIN_CHUNK_BITS, MAX_SIZE_BITS> buddy_large; - - /** - * The parent might not support deallocation if this buddy allocator - * covers the whole range. Uses template insanity to make this work. 
- */ - template - stl::enable_if_t - parent_dealloc_range(capptr::Arena base, size_t size) - { - static_assert( - MAX_SIZE_BITS != (bits::BITS - 1), "Don't set SFINAE parameter"); - parent.dealloc_range(base, size); - } - - void dealloc_overflow(capptr::Arena overflow) - { - if constexpr (MAX_SIZE_BITS != (bits::BITS - 1)) - { - if (overflow != nullptr) - { - parent.dealloc_range(overflow, bits::one_at_bit(MAX_SIZE_BITS)); - } - } - else - { - if (overflow != nullptr) - abort(); - } - } - - /** - * Add a range of memory to the address space. - * Divides blocks into power of two sizes with natural alignment - */ - void add_range(capptr::Arena base, size_t length) - { - range_to_pow_2_blocks( - base, length, [this](capptr::Arena base, size_t align, bool) { - auto overflow = - capptr::Arena::unsafe_from(reinterpret_cast( - buddy_large.add_block(base.unsafe_uintptr(), align))); - - dealloc_overflow(overflow); - }); - } - - capptr::Arena refill(size_t size) - { - if (ParentRange::Aligned) - { - // Use amount currently requested to determine refill size. - // This will gradually increase the usage of the parent range. - // So small examples can grow local caches slowly, and larger - // examples will grow them by the refill size. - // - // The heuristic is designed to allocate the following sequence for - // 16KiB requests 16KiB, 16KiB, 32Kib, 64KiB, ..., REFILL_SIZE/2, - // REFILL_SIZE, REFILL_SIZE, ... Hence if this if they are coming from - // a contiguous aligned range, then they could be consolidated. This - // depends on the ParentRange behaviour. 
- size_t refill_size = bits::min(REFILL_SIZE, requested_total); - refill_size = bits::max(refill_size, MIN_REFILL_SIZE); - refill_size = bits::max(refill_size, size); - refill_size = bits::next_pow2(refill_size); - - auto refill_range = parent.alloc_range(refill_size); - if (refill_range != nullptr) - { - requested_total += refill_size; - add_range(pointer_offset(refill_range, size), refill_size - size); - } - return refill_range; - } - - // Note the unaligned parent path does not use - // requested_total in the heuristic for the initial size - // this is because the request needs to introduce alignment. - // Currently the unaligned variant is not used as a local cache. - // So the gradual growing of refill_size is not needed. - - // Need to overallocate to get the alignment right. - bool overflow = false; - size_t needed_size = bits::umul(size, 2, overflow); - if (overflow) - { - return nullptr; - } - - auto refill_size = bits::max(needed_size, REFILL_SIZE); - while (needed_size <= refill_size) - { - auto refill = parent.alloc_range(refill_size); - - if (refill != nullptr) - { - requested_total += refill_size; - add_range(refill, refill_size); - - SNMALLOC_ASSERT(refill_size < bits::one_at_bit(MAX_SIZE_BITS)); - static_assert( - (REFILL_SIZE < bits::one_at_bit(MAX_SIZE_BITS)) || - ParentRange::Aligned, - "Required to prevent overflow."); - - return alloc_range(size); - } - - refill_size >>= 1; - } - - return nullptr; - } - - public: - static constexpr bool Aligned = true; - - static constexpr bool ConcurrencySafe = false; - - /* The large buddy allocator always deals in Arena-bounded pointers. 
*/ - using ChunkBounds = capptr::bounds::Arena; - static_assert( - stl::is_same_v); - - constexpr Type() = default; - - capptr::Arena alloc_range(size_t size) - { - SNMALLOC_ASSERT(size >= MIN_CHUNK_SIZE); - SNMALLOC_ASSERT(bits::is_pow2(size)); - - if (size >= bits::mask_bits(MAX_SIZE_BITS)) - { - if (ParentRange::Aligned) - return parent.alloc_range(size); - - return nullptr; - } - - auto result = capptr::Arena::unsafe_from( - reinterpret_cast(buddy_large.remove_block(size))); - - if (result != nullptr) - return result; - - return refill(size); - } - - void dealloc_range(capptr::Arena base, size_t size) - { - SNMALLOC_ASSERT(size >= MIN_CHUNK_SIZE); - SNMALLOC_ASSERT(bits::is_pow2(size)); - - if constexpr (MAX_SIZE_BITS != (bits::BITS - 1)) - { - if (size >= bits::mask_bits(MAX_SIZE_BITS)) - { - parent_dealloc_range(base, size); - return; - } - } - - auto overflow = - capptr::Arena::unsafe_from(reinterpret_cast( - buddy_large.add_block(base.unsafe_uintptr(), size))); - dealloc_overflow(overflow); - } - }; - }; -} // namespace snmalloc diff --git a/src/snmalloc/backend_helpers/smallarenarange.h b/src/snmalloc/backend_helpers/smallarenarange.h index 5253c3af5..f5820c8f0 100644 --- a/src/snmalloc/backend_helpers/smallarenarange.h +++ b/src/snmalloc/backend_helpers/smallarenarange.h @@ -1,7 +1,7 @@ #pragma once #include "../pal/pal.h" -#include "backend_arena.h" +#include "arena.h" #include "empty_range.h" #include "inplacerep.h" #include "range_helpers.h" @@ -9,7 +9,7 @@ namespace snmalloc { /** - * Small-grained range backed by `BackendArena` with in-band + * Small-grained range backed by `Arena` with in-band * (`InplaceRep`) tree-node storage. Serves blocks of any * unit-aligned size — not restricted to powers of two — for * `SlabMetadata` allocations. 
@@ -34,7 +34,7 @@ namespace snmalloc using RepT = InplaceRep; static constexpr size_t MIN_BITS = RepT::MIN_BITS; - BackendArena arena; + Arena arena; public: static constexpr size_t UNIT_SIZE = RepT::UNIT_SIZE; @@ -128,7 +128,7 @@ namespace snmalloc * * Requests `requested = align_up(size, align)` bytes; because * `align` is pow2 and `requested` is a multiple of `align`, - * `BackendArena`'s carve returns an `align`-aligned base + * `Arena`'s carve returns an `align`-aligned base * without a caller-side over-allocate-and-trim. The tail * `[align_up(size, UNIT_SIZE), requested)` is donated via * `add_range_impl`. The sub-unit slice diff --git a/src/snmalloc/backend_helpers/smallbuddyrange.h b/src/snmalloc/backend_helpers/smallbuddyrange.h deleted file mode 100644 index 6f8400e83..000000000 --- a/src/snmalloc/backend_helpers/smallbuddyrange.h +++ /dev/null @@ -1,252 +0,0 @@ -#pragma once - -#include "../pal/pal.h" -#include "empty_range.h" -#include "range_helpers.h" - -namespace snmalloc -{ - /** - * struct for representing the redblack nodes - * directly inside the meta data. - */ - template - struct FreeChunk - { - CapPtr left; - CapPtr right; - }; - - /** - * Class for using the allocations own space to store in the RBTree. - */ - template - class BuddyInplaceRep - { - public: - using Handle = CapPtr, bounds>*; - using Contents = CapPtr, bounds>; - - static constexpr Contents null = nullptr; - static constexpr Contents root = nullptr; - - static constexpr address_t MASK = 1; - - static void set(Handle ptr, Contents r) - { - SNMALLOC_ASSERT((address_cast(r) & MASK) == 0); - if (r == nullptr) - *ptr = CapPtr, bounds>::unsafe_from( - reinterpret_cast*>((*ptr).unsafe_uintptr() & MASK)); - else - // Preserve lower bit. 
- *ptr = pointer_offset(r, (address_cast(*ptr) & MASK)) - .template as_static>(); - } - - static Contents get(Handle ptr) - { - return pointer_align_down<2, FreeChunk>((*ptr).as_void()); - } - - static Handle ref(bool direction, Contents r) - { - if (direction) - return &r->left; - - return &r->right; - } - - static bool is_red(Contents k) - { - if (k == nullptr) - return false; - return (address_cast(*ref(false, k)) & MASK) == MASK; - } - - static void set_red(Contents k, bool new_is_red) - { - if (new_is_red != is_red(k)) - { - auto r = ref(false, k); - auto old_addr = pointer_align_down<2, FreeChunk>(r->as_void()); - - if (new_is_red) - { - if (old_addr == nullptr) - *r = CapPtr, bounds>::unsafe_from( - reinterpret_cast*>(MASK)); - else - *r = pointer_offset(old_addr, MASK) - .template as_static>(); - } - else - { - *r = old_addr; - } - SNMALLOC_ASSERT(is_red(k) == new_is_red); - } - } - - static Contents offset(Contents k, size_t size) - { - return pointer_offset(k, size).template as_static>(); - } - - static Contents buddy(Contents k, size_t size) - { - // This is just doing xor size, but with what API - // exists on capptr. - auto base = pointer_align_down>(k.as_void(), size * 2); - auto offset = (address_cast(k) & size) ^ size; - return pointer_offset(base, offset) - .template as_static>(); - } - - static Contents align_down(Contents k, size_t size) - { - return pointer_align_down>(k.as_void(), size); - } - - static bool compare(Contents k1, Contents k2) - { - return address_cast(k1) > address_cast(k2); - } - - static bool equal(Contents k1, Contents k2) - { - return address_cast(k1) == address_cast(k2); - } - - static address_t printable(Contents k) - { - return address_cast(k); - } - - /** - * Return the holder in some format suitable for printing by snmalloc's - * debug log mechanism. Used only when used in tracing mode, not normal - * debug or release builds. Raw pointers are printable already, so this is - * the identity function. 
- */ - static Handle printable(Handle k) - { - return k; - } - - /** - * Return a name for use in tracing mode. Unused in any other context. - */ - static const char* name() - { - return "BuddyInplaceRep"; - } - - static bool can_consolidate(Contents k, size_t size) - { - UNUSED(k, size); - return true; - } - }; - - struct SmallBuddyRange - { - template> - class Type : public ContainsParent - { - public: - using ChunkBounds = typename ParentRange::ChunkBounds; - - private: - using ContainsParent::parent; - - static constexpr size_t MIN_BITS = - bits::next_pow2_bits_const(sizeof(FreeChunk)); - - Buddy, MIN_BITS, MIN_CHUNK_BITS> buddy_small; - - /** - * Add a range of memory to the address space. - * Divides blocks into power of two sizes with natural alignment - */ - void add_range(CapPtr base, size_t length) - { - range_to_pow_2_blocks( - base, - length, - [this](CapPtr base, size_t align, bool) { - if (align < MIN_CHUNK_SIZE) - { - CapPtr overflow = - buddy_small - .add_block( - base.template as_reinterpret>(), - align) - .template as_reinterpret(); - if (overflow != nullptr) - parent.dealloc_range( - overflow, bits::one_at_bit(MIN_CHUNK_BITS)); - } - else - { - parent.dealloc_range(base, align); - } - }); - } - - CapPtr refill(size_t size) - { - auto refill = parent.alloc_range(MIN_CHUNK_SIZE); - - if (refill != nullptr) - add_range(pointer_offset(refill, size), MIN_CHUNK_SIZE - size); - - return refill; - } - - public: - static constexpr bool Aligned = true; - static_assert(ParentRange::Aligned, "ParentRange must be aligned"); - - static constexpr bool ConcurrencySafe = false; - - constexpr Type() = default; - - CapPtr alloc_range(size_t size) - { - if (size >= MIN_CHUNK_SIZE) - return parent.alloc_range(size); - - auto result = buddy_small.remove_block(size); - if (result != nullptr) - { - result->left = nullptr; - result->right = nullptr; - return result.template as_reinterpret(); - } - return refill(size); - } - - CapPtr alloc_range_with_leftover(size_t size) 
- { - auto rsize = bits::next_pow2(size); - - auto result = alloc_range(rsize); - - if (result == nullptr) - return nullptr; - - auto remnant = pointer_offset(result, size); - - add_range(remnant, rsize - size); - - return result.template as_reinterpret(); - } - - void dealloc_range(CapPtr base, size_t size) - { - add_range(base, size); - } - }; - }; -} // namespace snmalloc diff --git a/src/snmalloc/backend_helpers/staticconditionalrange.h b/src/snmalloc/backend_helpers/staticconditionalrange.h index 682c2f1fb..f5d46441b 100644 --- a/src/snmalloc/backend_helpers/staticconditionalrange.h +++ b/src/snmalloc/backend_helpers/staticconditionalrange.h @@ -10,8 +10,8 @@ namespace snmalloc { // This is a range that can bypass the OptionalRange if it is disabled. // Disabling is global, and not local. - // This is used to allow disabling thread local buddy allocators when the - // initial fixed size heap is small. + // This is used to allow disabling the thread-local cache range when + // the initial fixed-size heap is small. // // The range builds a more complex parent // Pipe diff --git a/src/snmalloc/mem/metadata.h b/src/snmalloc/mem/metadata.h index 7992f5ba7..cfc13755e 100644 --- a/src/snmalloc/mem/metadata.h +++ b/src/snmalloc/mem/metadata.h @@ -112,7 +112,7 @@ namespace snmalloc public: /** * First bit on Word::One available for backend layouts; the bits - * below are frontend-reserved. Backends in `backend_arena_range.h` + * below are frontend-reserved. Backends in `largearenarange.h` * derive `RED_BIT`, `VARIANT_SHIFT`, etc. from this. */ static constexpr size_t BACKEND_LAYOUT_FIRST_FREE_BIT = 1; diff --git a/src/snmalloc/mitigations/allocconfig.h b/src/snmalloc/mitigations/allocconfig.h index 3f326a570..3626e613a 100644 --- a/src/snmalloc/mitigations/allocconfig.h +++ b/src/snmalloc/mitigations/allocconfig.h @@ -94,9 +94,10 @@ namespace snmalloc #endif ; - // Used to configure when the backend should use thread local buddies. 
- // This only basically is used to disable some buddy allocators on small - // fixed heap scenarios like OpenEnclave. - static constexpr size_t MIN_HEAP_SIZE_FOR_THREAD_LOCAL_BUDDY = + // Used to configure when the backend should use the thread-local + // range cache. Disabled below this heap size for small fixed-heap + // scenarios like OpenEnclave, where the per-thread cache would + // dominate the heap. + static constexpr size_t MIN_HEAP_SIZE_FOR_THREAD_LOCAL_CACHE = bits::one_at_bit(27); } // namespace snmalloc diff --git a/src/test/func/backend_arena/backend_arena.cc b/src/test/func/arena/arena.cc similarity index 94% rename from src/test/func/backend_arena/backend_arena.cc rename to src/test/func/arena/arena.cc index e94ca990d..7a88e3492 100644 --- a/src/test/func/backend_arena/backend_arena.cc +++ b/src/test/func/arena/arena.cc @@ -1,5 +1,5 @@ /** - * Unit tests for BackendArena. + * Unit tests for Arena. * * Exercises the Rep adapters (BinRep, RangeRep), RBTree integration, * add_block with consolidation, remove_block with carving, the @@ -21,7 +21,7 @@ #endif #include "test/snmalloc_testlib.h" -#include +#include namespace snmalloc { @@ -32,26 +32,26 @@ namespace snmalloc * BackendStateWordRef (get, operator=, operator!=). Used by MockRep * to avoid requiring a real pagemap in unit tests. 
*/ - struct BackendArenaWordRef + struct ArenaWordRef { uintptr_t* val{nullptr}; - constexpr BackendArenaWordRef() = default; + constexpr ArenaWordRef() = default; - constexpr BackendArenaWordRef(uintptr_t* p) : val(p) {} + constexpr ArenaWordRef(uintptr_t* p) : val(p) {} uintptr_t get() const { return *val; } - BackendArenaWordRef& operator=(uintptr_t v) + ArenaWordRef& operator=(uintptr_t v) { *val = v; return *this; } - bool operator!=(const BackendArenaWordRef& other) const + bool operator!=(const ArenaWordRef& other) const { return val != other.val; } @@ -73,7 +73,7 @@ namespace snmalloc uintptr_t word2{0}; uintptr_t range_word1{0}; uintptr_t range_word2{0}; - BackendArenaVariant variant{BackendArenaVariant::Min}; + ArenaVariant variant{ArenaVariant::Min}; size_t large_size{0}; bool boundary{false}; }; @@ -102,7 +102,7 @@ namespace snmalloc template struct MockTreeRep { - using Handle = BackendArenaWordRef; + using Handle = ArenaWordRef; using Contents = uintptr_t; static constexpr Contents null = 0; @@ -179,12 +179,12 @@ namespace snmalloc using BinRep = MockTreeRep; using RangeRep = MockTreeRep; - static BackendArenaVariant get_variant(uintptr_t addr) + static ArenaVariant get_variant(uintptr_t addr) { return mock_store[mock_index(addr)].variant; } - static void set_variant(uintptr_t addr, BackendArenaVariant v) + static void set_variant(uintptr_t addr, ArenaVariant v) { mock_store[mock_index(addr)].variant = v; } @@ -203,10 +203,9 @@ namespace snmalloc // entry.is_boundary() from the pagemap. The boundary flag lives // per-chunk in mock_store; mock_index asserts the index is in // range, so any caller that probes outside the arena trips the - // assertion — this catches the buddy.h:90-93 unsafe-probe pattern - // (calling can_consolidate before confirming the address is in - // our region) in BackendArena unit tests rather than as a runtime - // segfault in release builds. 
+ // assertion — this catches accidental out-of-region probes in + // Arena unit tests rather than as a release-build + // segfault. static bool can_consolidate(uintptr_t addr) { return !mock_store[mock_index(addr)].boundary; @@ -214,7 +213,7 @@ namespace snmalloc }; // ---- Test access ---- - struct BackendArenaTestAccess + struct ArenaTestAccess { template static auto& get_bin_trees(Arena& a) @@ -251,9 +250,9 @@ namespace snmalloc // K = number of address bits the arena covers above MIN_CHUNK_BITS. // K=6 → arena of 64 chunks, K=8 → 256 chunks, K=10 → 1024 chunks. template - using Arena = BackendArena; + using TestArena = Arena; - using Bins = BackendArenaBins<2, MIN_CHUNK_BITS>; + using Bins = ArenaBins<2, MIN_CHUNK_BITS>; // ================================================================== // (A) Accessor round-trips @@ -263,14 +262,14 @@ namespace snmalloc reset_mock_store(); uintptr_t a = chunk_addr(10); - MockRep::set_variant(a, BackendArenaVariant::Min); - SNMALLOC_ASSERT(MockRep::get_variant(a) == BackendArenaVariant::Min); + MockRep::set_variant(a, ArenaVariant::Min); + SNMALLOC_ASSERT(MockRep::get_variant(a) == ArenaVariant::Min); - MockRep::set_variant(a, BackendArenaVariant::EvenTwo); - SNMALLOC_ASSERT(MockRep::get_variant(a) == BackendArenaVariant::EvenTwo); + MockRep::set_variant(a, ArenaVariant::EvenTwo); + SNMALLOC_ASSERT(MockRep::get_variant(a) == ArenaVariant::EvenTwo); - MockRep::set_variant(a, BackendArenaVariant::Large); - SNMALLOC_ASSERT(MockRep::get_variant(a) == BackendArenaVariant::Large); + MockRep::set_variant(a, ArenaVariant::Large); + SNMALLOC_ASSERT(MockRep::get_variant(a) == ArenaVariant::Large); printf(" Variant round-trip: OK\n"); } @@ -318,16 +317,16 @@ namespace snmalloc // (B) RBTree / RBTree smoke // ================================================================== - // We can't directly instantiate BinRep/RangeRep outside BackendArena + // We can't directly instantiate BinRep/RangeRep outside Arena // since they are private 
nested types. Instead, test them through - // BackendArena's add_block/remove_block which exercise both trees. + // Arena's add_block/remove_block which exercise both trees. // For smoke testing of tree operations directly, we test through - // the BackendArena's own invariant and operation correctness. + // the Arena's own invariant and operation correctness. static void test_rbtree_smoke_via_arena() { reset_mock_store(); - Arena<8> arena; + TestArena<8> arena; arena.check_invariant(true); // Insert a few non-adjacent blocks. @@ -370,7 +369,7 @@ namespace snmalloc static void test_empty_invariant() { reset_mock_store(); - Arena arena; + TestArena arena; arena.check_invariant(true); printf(" Empty invariant (K=%zu): OK\n", K); } @@ -381,7 +380,7 @@ namespace snmalloc static void test_add_no_consolidation() { reset_mock_store(); - Arena<8> arena; + TestArena<8> arena; // Insert several non-adjacent blocks of various sizes. struct @@ -414,7 +413,7 @@ namespace snmalloc static void test_remove_exact() { reset_mock_store(); - Arena<8> arena; + TestArena<8> arena; // Insert 3 blocks of size 5 at non-adjacent locations. arena.add_block(chunk_addr(10), chunk_size(5)); @@ -442,7 +441,7 @@ namespace snmalloc static void test_remove_carving() { reset_mock_store(); - Arena<8> arena; + TestArena<8> arena; // Insert one block of size 10. 
arena.add_block(chunk_addr(10), chunk_size(10)); @@ -513,7 +512,7 @@ namespace snmalloc static void test_consolidation_p_min() { reset_mock_store(); - Arena<8> arena; + TestArena<8> arena; add_and_check(arena, 10, 1); add_and_check(arena, 11, 3); @@ -529,7 +528,7 @@ namespace snmalloc static void test_consolidation_p_nonmin() { reset_mock_store(); - Arena<8> arena; + TestArena<8> arena; add_and_check(arena, 10, 3); add_and_check(arena, 13, 2); @@ -544,7 +543,7 @@ namespace snmalloc static void test_consolidation_s_min() { reset_mock_store(); - Arena<8> arena; + TestArena<8> arena; add_and_check(arena, 14, 1); add_and_check(arena, 11, 3); @@ -559,7 +558,7 @@ namespace snmalloc static void test_consolidation_s_nonmin() { reset_mock_store(); - Arena<8> arena; + TestArena<8> arena; add_and_check(arena, 14, 4); add_and_check(arena, 11, 3); @@ -574,7 +573,7 @@ namespace snmalloc static void test_consolidation_ps_both_min() { reset_mock_store(); - Arena<8> arena; + TestArena<8> arena; add_and_check(arena, 10, 1); add_and_check(arena, 12, 1); add_and_check(arena, 11, 1); @@ -590,7 +589,7 @@ namespace snmalloc static void test_consolidation_ps_p_min_s_nonmin() { reset_mock_store(); - Arena<8> arena; + TestArena<8> arena; add_and_check(arena, 10, 1); add_and_check(arena, 14, 3); add_and_check(arena, 11, 3); @@ -606,7 +605,7 @@ namespace snmalloc static void test_consolidation_ps_p_nonmin_s_min() { reset_mock_store(); - Arena<8> arena; + TestArena<8> arena; add_and_check(arena, 10, 3); add_and_check(arena, 16, 1); add_and_check(arena, 13, 3); @@ -622,7 +621,7 @@ namespace snmalloc static void test_consolidation_ps_both_nonmin() { reset_mock_store(); - Arena<8> arena; + TestArena<8> arena; add_and_check(arena, 10, 4); add_and_check(arena, 19, 5); add_and_check(arena, 14, 5); @@ -642,29 +641,29 @@ namespace snmalloc { // Odd chunk index → OddTwo, even → EvenTwo. 
reset_mock_store(); - Arena<8> arena; + TestArena<8> arena; // Odd address: chunk 11, size 2 arena.add_block(chunk_addr(11), chunk_size(2)); SNMALLOC_ASSERT( - MockRep::get_variant(chunk_addr(11)) == BackendArenaVariant::OddTwo); + MockRep::get_variant(chunk_addr(11)) == ArenaVariant::OddTwo); arena.check_invariant(true); // Even address: chunk 20, size 2 arena.add_block(chunk_addr(20), chunk_size(2)); SNMALLOC_ASSERT( - MockRep::get_variant(chunk_addr(20)) == BackendArenaVariant::EvenTwo); + MockRep::get_variant(chunk_addr(20)) == ArenaVariant::EvenTwo); arena.check_invariant(true); // Both should be in the range tree. - auto& rt = BackendArenaTestAccess::get_range_tree(arena); + auto& rt = ArenaTestAccess::get_range_tree(arena); auto p1 = rt.get_root_path(); SNMALLOC_ASSERT(rt.find(p1, chunk_addr(11))); auto p2 = rt.get_root_path(); SNMALLOC_ASSERT(rt.find(p2, chunk_addr(20))); // OddTwo (chunk 11) should be in bin 0 (size-1 servable set). - auto& bt0 = BackendArenaTestAccess::get_bin_trees(arena)[0]; + auto& bt0 = ArenaTestAccess::get_bin_trees(arena)[0]; auto p3 = bt0.get_root_path(); SNMALLOC_ASSERT(bt0.find(p3, chunk_addr(11))); @@ -679,7 +678,7 @@ namespace snmalloc { // contains_min must not match OddTwo entries. reset_mock_store(); - Arena<8> arena; + TestArena<8> arena; // Add OddTwo block at chunk 11 (odd, size 2). arena.add_block(chunk_addr(11), chunk_size(2)); @@ -709,7 +708,7 @@ namespace snmalloc { // OddTwo block should consolidate via the range tree. reset_mock_store(); - Arena<8> arena; + TestArena<8> arena; // Add OddTwo at chunk 11 (odd, size 2 → chunks 11-12). arena.add_block(chunk_addr(11), chunk_size(2)); @@ -733,7 +732,7 @@ namespace snmalloc { // Consolidation where the new block is a predecessor of OddTwo. reset_mock_store(); - Arena<8> arena; + TestArena<8> arena; // Add OddTwo at chunk 11 (odd, size 2 → chunks 11-12). 
arena.add_block(chunk_addr(11), chunk_size(2)); @@ -755,7 +754,7 @@ namespace snmalloc { // remove_block(1) from an OddTwo block should carve correctly. reset_mock_store(); - Arena<8> arena; + TestArena<8> arena; // Add OddTwo at chunk 11 (odd, size 2). arena.add_block(chunk_addr(11), chunk_size(2)); @@ -786,7 +785,7 @@ namespace snmalloc { // K=4 → 16-chunk arena. Use base offset 16 to avoid address 0. reset_mock_store(); - Arena<4> arena; + TestArena<4> arena; constexpr size_t BASE = 16; @@ -821,7 +820,7 @@ namespace snmalloc { // K=4 → 16-chunk arena. Use base offset 16 to avoid address 0. reset_mock_store(); - Arena<4> arena; + TestArena<4> arena; constexpr size_t BASE = 16; @@ -971,7 +970,7 @@ namespace snmalloc static void test_stress_seed(size_t seed, size_t num_ops) { reset_mock_store(); - Arena arena; + TestArena arena; constexpr size_t ARENA_CHUNKS = bits::one_at_bit(K); // Offset all chunk addresses to avoid address 0 (tree null). @@ -1109,8 +1108,8 @@ namespace snmalloc static void test_multi_instance_basic() { reset_mock_store(); - Arena<8> arena_a; - Arena<8> arena_b; + TestArena<8> arena_a; + TestArena<8> arena_b; constexpr size_t BASE = 256; // avoid address 0 // Add distinct blocks to each arena. @@ -1143,8 +1142,8 @@ namespace snmalloc static void test_multi_instance_consolidation() { reset_mock_store(); - Arena<8> arena_a; - Arena<8> arena_b; + TestArena<8> arena_a; + TestArena<8> arena_b; constexpr size_t BASE = 256; // Arena B holds two blocks with a gap: [20..24) and [28..32). 
@@ -1176,8 +1175,8 @@ namespace snmalloc static void test_multi_stress_seed(size_t seed, size_t num_ops) { reset_mock_store(); - Arena arena_a; - Arena arena_b; + TestArena arena_a; + TestArena arena_b; constexpr size_t ARENA_CHUNKS = bits::one_at_bit(K); constexpr size_t BASE = ARENA_CHUNKS; @@ -1359,7 +1358,7 @@ namespace snmalloc { reset_mock_store(); constexpr size_t K = 6; - Arena arena; + TestArena arena; uintptr_t p_addr = chunk_addr(2); uintptr_t a_addr = chunk_addr(4); @@ -1385,7 +1384,7 @@ namespace snmalloc { reset_mock_store(); constexpr size_t K = 6; - Arena arena; + TestArena arena; uintptr_t a_addr = chunk_addr(2); uintptr_t s_addr = chunk_addr(4); @@ -1411,7 +1410,7 @@ namespace snmalloc { reset_mock_store(); constexpr size_t K = 6; - Arena arena; + TestArena arena; // Three adjacent blocks: chunks [4,6), [6,8), [8,10). // Boundary at chunk 8 blocks [6,8) ↔ [8,10) merge but allows @@ -1434,7 +1433,7 @@ namespace snmalloc // Regression test: a block whose successor address sits one past // the arena's pagemap must not trigger a can_consolidate probe of - // that out-of-range chunk. The fix is in BackendArena::add_block — + // that out-of-range chunk. The fix is in Arena::add_block — // tree-membership tests gate the can_consolidate read. 
MockRep's // can_consolidate now dereferences mock_store via mock_index, which // asserts on out-of-range indices, so an unguarded probe in @@ -1444,7 +1443,7 @@ namespace snmalloc { reset_mock_store(); constexpr size_t K = 10; - Arena arena; + TestArena arena; constexpr size_t ARENA_CHUNKS = size_t{1} << K; // Block ending at the very top of the arena (succ_addr would @@ -1464,7 +1463,7 @@ namespace snmalloc { reset_mock_store(); constexpr size_t K = 6; - Arena arena; + TestArena arena; uintptr_t p_addr = chunk_addr(4); uintptr_t a_addr = chunk_addr(5); @@ -1489,7 +1488,7 @@ namespace snmalloc int main() { - printf("--- BackendArena tests ---\n"); + printf("--- Arena tests ---\n"); printf("(A) Accessor round-trips:\n"); snmalloc::test_variant_roundtrip(); @@ -1547,6 +1546,6 @@ int main() snmalloc::test_block_at_arena_top_edge(); snmalloc::test_boundary_blocks_min_predecessor(); - printf("All BackendArena tests passed.\n"); + printf("All Arena tests passed.\n"); return 0; } diff --git a/src/test/func/backend_arena_bins/backend_arena_bins.cc b/src/test/func/arenabins/arenabins.cc similarity index 94% rename from src/test/func/backend_arena_bins/backend_arena_bins.cc rename to src/test/func/arenabins/arenabins.cc index 8190c3909..8ef495c6b 100644 --- a/src/test/func/backend_arena_bins/backend_arena_bins.cc +++ b/src/test/func/arenabins/arenabins.cc @@ -1,8 +1,8 @@ /** - * Unit tests for BackendArenaBins. + * Unit tests for ArenaBins. * * Exercises: - * - the chunk size class encoding (via `BackendArenaBinsTestAccess`), + * - the chunk size class encoding (via `ArenaBinsTestAccess`), * - the private bin classification (`bin_index`), * - the narrow public surface: `Bitmap::add` / `find_for_request` / * `clear`, and the pure `carve(range_t, n)` decomposition. 
@@ -15,7 +15,7 @@ * cross-checked against a slow reference scanner that formulates * "bin b serves request n" directly in terms of the canonical * `bin_subsets` table; raw word access for tests goes through - * `BackendArenaBinsTestAccess::raw_*`. + * `ArenaBinsTestAccess::raw_*`. */ #include "test/setup.h" @@ -23,7 +23,7 @@ #include #include -#include +#include #include #include @@ -31,15 +31,15 @@ namespace snmalloc { /** * Friend struct exposing private internals of - * `BackendArenaBins` (and its nested `Bitmap`) - * for unit tests. Forward-declared in `backend_arena_bins.h`; + * `ArenaBins` (and its nested `Bitmap`) + * for unit tests. Forward-declared in `arenabins.h`; * defined here to keep the test-access implementation out of the * in-tree header. */ template - struct BackendArenaBinsTestAccess + struct ArenaBinsTestAccess { - using Bins = BackendArenaBins; + using Bins = ArenaBins; using Bitmap = typename Bins::Bitmap; using range_t = typename Bins::range_t; @@ -195,15 +195,15 @@ namespace snmalloc }; } // namespace snmalloc -using snmalloc::BackendArenaBinsTestAccess; +using snmalloc::ArenaBinsTestAccess; // Compile-time checks: a few size-class encoding properties that we want // to fail the build (not the runtime) if regressed. 
namespace static_checks { - using B1 = BackendArenaBinsTestAccess<1, 0>; - using B2 = BackendArenaBinsTestAccess<2, 0>; - using B3 = BackendArenaBinsTestAccess<3, 0>; + using B1 = ArenaBinsTestAccess<1, 0>; + using B2 = ArenaBinsTestAccess<2, 0>; + using B3 = ArenaBinsTestAccess<3, 0>; static_assert(B1::BINS_PER_EXP == 2, "B=1 BINS_PER_EXP"); static_assert(B2::BINS_PER_EXP == 5, "B=2 BINS_PER_EXP"); @@ -243,7 +243,7 @@ namespace template constexpr bool serves(size_t bin, size_t n) { - using Bins = BackendArenaBinsTestAccess; + using Bins = ArenaBinsTestAccess; size_t e_b = bin / Bins::BINS_PER_EXP; size_t o_b = bin % Bins::BINS_PER_EXP; size_t raw = snmalloc::bits::to_exp_mant_const(n); @@ -275,7 +275,7 @@ namespace template void check_chunk_sc_roundtrip() { - using Bins = BackendArenaBinsTestAccess; + using Bins = ArenaBinsTestAccess; // Properties (together these imply request is the smallest size class // with size >= s): @@ -310,7 +310,7 @@ namespace template void check_sc_align() { - using Bins = BackendArenaBinsTestAccess; + using Bins = ArenaBinsTestAccess; for (size_t s = 1; s <= 4096; s++) { @@ -345,10 +345,10 @@ namespace /// Collect all sc_t classes whose size fits in the test grid. template - std::vector::sc_t> + std::vector::sc_t> collect_classes(size_t max_size) { - using Bins = BackendArenaBinsTestAccess; + using Bins = ArenaBinsTestAccess; using sc_t = typename Bins::sc_t; std::vector v; @@ -372,7 +372,7 @@ namespace template void check_bin_classification(size_t max_addr, size_t max_n) { - using Bins = BackendArenaBinsTestAccess; + using Bins = ArenaBinsTestAccess; auto classes = collect_classes(max_n); for (size_t addr = 0; addr < max_addr; addr++) @@ -411,7 +411,7 @@ namespace template void check_bin_id_range() { - using Bins = BackendArenaBinsTestAccess; + using Bins = ArenaBinsTestAccess; // bin_index always returns a value in [0, BINS_PER_EXP * (e+1)) for the // block's natural exponent e. 
@@ -443,7 +443,7 @@ namespace template void check_info_consistency() { - using Bins = BackendArenaBinsTestAccess; + using Bins = ArenaBinsTestAccess; for (size_t s = 1; s <= 4096; s++) { @@ -494,7 +494,7 @@ namespace template void check_to_exp_mant_equivalence() { - using Bins = BackendArenaBinsTestAccess; + using Bins = ArenaBinsTestAccess; auto check_one = [&](size_t n) { size_t r = snmalloc::bits::to_exp_mant(n); @@ -543,9 +543,9 @@ namespace template size_t reference_find( size_t n_chunks, - const typename BackendArenaBinsTestAccess::Bitmap& bm) + const typename ArenaBinsTestAccess::Bitmap& bm) { - using Bins = BackendArenaBinsTestAccess; + using Bins = ArenaBinsTestAccess; using Bitmap = typename Bins::Bitmap; for (size_t b = 0; b < Bitmap::TOTAL_BINS; b++) { @@ -560,7 +560,7 @@ namespace template void check_bitmap_smoke() { - using Bins = BackendArenaBinsTestAccess; + using Bins = ArenaBinsTestAccess; using Bitmap = typename Bins::Bitmap; Bitmap bm; if (!Bins::raw_empty(bm)) @@ -591,7 +591,7 @@ namespace template void for_each_class_info(F body) { - using Bins = BackendArenaBinsTestAccess; + using Bins = ArenaBinsTestAccess; for (size_t raw = 0; raw < Bins::MAX_SC; raw++) { size_t s = snmalloc::bits::from_exp_mant(raw); @@ -603,7 +603,7 @@ namespace template void check_bitmap_find_empty() { - using Bins = BackendArenaBinsTestAccess; + using Bins = ArenaBinsTestAccess; using Bitmap = typename Bins::Bitmap; Bitmap bm; for_each_class_info([&](size_t n, const auto& /*info*/) { @@ -618,7 +618,7 @@ namespace template void check_bitmap_exhaustive_single_bit() { - using Bins = BackendArenaBinsTestAccess; + using Bins = ArenaBinsTestAccess; using Bitmap = typename Bins::Bitmap; // Gather a representative set of entries (one per distinct bitmap @@ -671,7 +671,7 @@ namespace template void check_bitmap_multi_bit_random() { - using Bins = BackendArenaBinsTestAccess; + using Bins = ArenaBinsTestAccess; using Bitmap = typename Bins::Bitmap; struct Entry @@ -741,7 +741,7 @@ 
namespace template void check_bitmap_word_boundary() { - using Bins = BackendArenaBinsTestAccess; + using Bins = ArenaBinsTestAccess; using Bitmap = typename Bins::Bitmap; auto check_predicted = @@ -857,7 +857,7 @@ namespace template void check_bitmap_bin_index_integration() { - using Bins = BackendArenaBinsTestAccess; + using Bins = ArenaBinsTestAccess; using Bitmap = typename Bins::Bitmap; auto classes = collect_classes(64); @@ -902,7 +902,7 @@ namespace template void check_bitmap_add() { - using Bins = BackendArenaBinsTestAccess; + using Bins = ArenaBinsTestAccess; using Bitmap = typename Bins::Bitmap; using range_t = typename Bins::range_t; @@ -976,7 +976,7 @@ namespace template void check_bitmap_find_min() { - using Bins = BackendArenaBinsTestAccess; + using Bins = ArenaBinsTestAccess; using Bitmap = typename Bins::Bitmap; struct Entry @@ -1048,7 +1048,7 @@ namespace template void check_carve() { - using Bins = BackendArenaBinsTestAccess; + using Bins = ArenaBinsTestAccess; using range_t = typename Bins::range_t; auto classes = collect_classes(64); @@ -1137,7 +1137,7 @@ namespace template void run_all() { - std::printf("--- Running BackendArenaBinsTestAccess<%zu> tests ---\n", B); + std::printf("--- Running ArenaBinsTestAccess<%zu> tests ---\n", B); check_chunk_sc_roundtrip(); std::printf(" sc_t round-trip: OK\n"); check_sc_align(); @@ -1174,7 +1174,7 @@ namespace /// catch silent breakage of the canonical numbering. void check_known_values() { - using B2 = BackendArenaBinsTestAccess<2, 0>; + using B2 = ArenaBinsTestAccess<2, 0>; // size 1 -> raw 0, size 2 -> raw 1, size 3 -> raw 2, size 4 -> raw 3, // size 5 -> raw 4, ..., size 8 -> raw 7, size 10 -> raw 8. 
@@ -1204,12 +1204,12 @@ namespace if (B2::BINS_PER_EXP != 5) std::abort(); - using B3 = BackendArenaBinsTestAccess<3, 0>; + using B3 = ArenaBinsTestAccess<3, 0>; if (B3::BINS_PER_EXP != 13) std::abort(); - using B1 = BackendArenaBinsTestAccess<1, 0>; + using B1 = ArenaBinsTestAccess<1, 0>; if (B1::BINS_PER_EXP != 2) std::abort(); } @@ -1217,8 +1217,8 @@ namespace /** * Verify that scaling the encoding by `UNIT_SIZE = 1 << MIN_SIZE_BITS` * is a structural equivalence: every public observation about a - * `BackendArenaBins` instance equals the - * corresponding observation on `BackendArenaBins` when the + * `ArenaBins` instance equals the + * corresponding observation on `ArenaBins` when the * input is scaled by `UNIT_SIZE` (and outputs, where they are sizes * or addresses, are also scaled by `UNIT_SIZE`). * @@ -1228,8 +1228,8 @@ namespace template void check_min_size_bits_equivalence() { - using Scaled = BackendArenaBinsTestAccess; - using Base = BackendArenaBinsTestAccess; + using Scaled = ArenaBinsTestAccess; + using Base = ArenaBinsTestAccess; static_assert(MIN_SIZE_BITS > 0, "this check is for MIN_SIZE_BITS > 0"); constexpr size_t U = size_t(1) << MIN_SIZE_BITS; @@ -1329,7 +1329,7 @@ namespace /// raw 0 decodes to UNIT_SIZE bytes, etc. void check_known_values_unit_16() { - using BU = BackendArenaBinsTestAccess<2, 4>; + using BU = ArenaBinsTestAccess<2, 4>; constexpr size_t U = size_t(1) << 4; // size U (UNIT_SIZE) -> raw 0; size 2U -> raw 1; ... @@ -1371,6 +1371,6 @@ int main(int, char**) run_all<2>(); run_all<3>(); - std::printf("All BackendArenaBins tests passed.\n"); + std::printf("All ArenaBins tests passed.\n"); return 0; } diff --git a/src/test/func/cheri/cheri.cc b/src/test/func/cheri/cheri.cc index 7e2318e11..424a2eae2 100644 --- a/src/test/func/cheri/cheri.cc +++ b/src/test/func/cheri/cheri.cc @@ -58,8 +58,8 @@ int main() } /* - * This large object is sized to end up in our alloc's local buddy allocators - * when it's released. 
+ * This large object is sized to end up in our alloc's thread-local + * cache range when it's released. */ message("Grab large object"); ptraddr_t alarge; diff --git a/src/test/func/backend_arena_range/backend_arena_range.cc b/src/test/func/largearenarange/largearenarange.cc similarity index 95% rename from src/test/func/backend_arena_range/backend_arena_range.cc rename to src/test/func/largearenarange/largearenarange.cc index 342253c4b..9ce736e65 100644 --- a/src/test/func/backend_arena_range/backend_arena_range.cc +++ b/src/test/func/largearenarange/largearenarange.cc @@ -1,7 +1,7 @@ /** - * Unit tests for BackendArenaRange and PagemapRep. + * Unit tests for LargeArenaRange and PagemapRep. * - * Tests the Range wrapper around BackendArena using a real pagemap, + * Tests the Range wrapper around Arena using a real pagemap, * exercising alloc_range, dealloc_range, refill, and overflow paths. */ @@ -43,14 +43,14 @@ namespace // Simple parent: PalRange + PagemapRegisterRange. using ParentSource = Pipe, PagemapRegisterRange>; - // BackendArenaRange under test: global range (MAX_SIZE_BITS = BITS - 1). + // LargeArenaRange under test: global range (MAX_SIZE_BITS = BITS - 1). // This means overflow dealloc never goes to parent (matches the global // range configuration). 
static constexpr size_t REFILL_BITS = 20; static constexpr size_t MAX_BITS = bits::BITS - 1; using ArenaRange = - Pipe>; + Pipe>; // --- Tests --- @@ -294,7 +294,7 @@ int main() { setup(); - printf("--- BackendArenaRange tests ---\n"); + printf("--- LargeArenaRange tests ---\n"); test_basic_alloc_dealloc(); test_multiple_sizes(); @@ -304,6 +304,6 @@ int main() test_large_then_small(); test_non_pow2_sizes(); - printf("All BackendArenaRange tests passed.\n"); + printf("All LargeArenaRange tests passed.\n"); return 0; } diff --git a/src/test/func/backend_arena_inplace/backend_arena_inplace.cc b/src/test/func/smallarenarange/smallarenarange.cc similarity index 96% rename from src/test/func/backend_arena_inplace/backend_arena_inplace.cc rename to src/test/func/smallarenarange/smallarenarange.cc index c7ee62723..596bf20b8 100644 --- a/src/test/func/backend_arena_inplace/backend_arena_inplace.cc +++ b/src/test/func/smallarenarange/smallarenarange.cc @@ -1,8 +1,8 @@ /** - * Unit tests for `InplaceRep` exercised through `BackendArena`. + * Unit tests for `InplaceRep` exercised through `Arena`. * - * Distinct from the `backend_arena` test (which uses an array-backed - * MockRep): here the Rep is the production in-band representation, + * Distinct from the `arena` test (which uses an array-backed + * MockRep): here the Rep is the in-band representation, * and each free block's tree-node storage lives at the block's own * head bytes. The test allocates a single chunk-aligned backing * buffer and treats addresses within it as block bases. @@ -19,7 +19,7 @@ #include #include #include -#include +#include #include #include #include @@ -33,7 +33,7 @@ namespace snmalloc // Arena spans one chunk's worth of space (max block size = // MIN_CHUNK_SIZE - UNIT_SIZE, since the arena's MAX is exclusive). 
static constexpr size_t MAX_SIZE_BITS = MIN_CHUNK_BITS; - using Arena = BackendArena; + using TestArena = Arena; // Backing buffer: must be UNIT_SIZE-aligned so block bases are // unit-aligned and the in-band node fields land at the expected @@ -75,24 +75,24 @@ namespace snmalloc uintptr_t a = unit_addr(0); for (auto v : - {BackendArenaVariant::Min, - BackendArenaVariant::EvenTwo, - BackendArenaVariant::OddTwo, - BackendArenaVariant::Large}) + {ArenaVariant::Min, + ArenaVariant::EvenTwo, + ArenaVariant::OddTwo, + ArenaVariant::Large}) { Rep::set_variant(a, v); SNMALLOC_CHECK(Rep::get_variant(a) == v); } // Variant tag must not interfere with the red bit at bit 0. - Rep::set_variant(a, BackendArenaVariant::OddTwo); + Rep::set_variant(a, ArenaVariant::OddTwo); Rep::BinRep::set_red(a, true); SNMALLOC_CHECK(Rep::BinRep::is_red(a)); - SNMALLOC_CHECK(Rep::get_variant(a) == BackendArenaVariant::OddTwo); + SNMALLOC_CHECK(Rep::get_variant(a) == ArenaVariant::OddTwo); Rep::BinRep::set_red(a, false); SNMALLOC_CHECK(!Rep::BinRep::is_red(a)); - SNMALLOC_CHECK(Rep::get_variant(a) == BackendArenaVariant::OddTwo); + SNMALLOC_CHECK(Rep::get_variant(a) == ArenaVariant::OddTwo); printf(" Variant + red roundtrip: OK\n"); } @@ -138,7 +138,7 @@ namespace snmalloc // (B2) `can_consolidate` refuses chunk-boundary merges. // SmallArenaRange splits incoming ranges at chunk boundaries, but // adjacent intra-chunk fragments meeting at a boundary would - // otherwise be merged by BackendArena. The predicate is what + // otherwise be merged by Arena. The predicate is what // prevents that. 
// ================================================================== @@ -163,7 +163,7 @@ namespace snmalloc static void test_arena_add_remove_single() { reset_backing(); - Arena arena; + TestArena arena; arena.check_invariant(true); auto a = unit_addr(0); @@ -185,7 +185,7 @@ namespace snmalloc static void test_arena_consolidation() { reset_backing(); - Arena arena; + TestArena arena; auto a = unit_addr(0); auto b = unit_addr(4); @@ -211,7 +211,7 @@ namespace snmalloc static void test_arena_carve() { reset_backing(); - Arena arena; + TestArena arena; auto a = unit_addr(0); arena.add_block(a, unit_size(8)); @@ -246,7 +246,7 @@ namespace snmalloc static constexpr size_t STRESS_UNITS = (size_t(1) << MAX_SIZE_BITS) / UNIT_SIZE - 1; - using Bins = BackendArenaBins<2, MIN_BITS>; + using Bins = ArenaBins<2, MIN_BITS>; struct OracleRange { @@ -347,7 +347,7 @@ namespace snmalloc static void test_stress_seed(size_t seed, size_t num_ops) { reset_backing(); - Arena arena; + TestArena arena; Oracle oracle; // All units initially allocated (i.e., not in the arena). From 4f2e6ecc3bf9da794bc43c9d4850a8b72d3798bc Mon Sep 17 00:00:00 2001 From: Matthew Parkinson Date: Tue, 9 Jun 2026 12:58:35 +0100 Subject: [PATCH 25/31] Round metadata to MIN_META_ALIGN, not next_pow2 (Phase C) LargeArenaRange / SmallArenaRange accept any UNIT_SIZE-aligned request; the pow2 rounding the backend was applying to metadata sizes was a leftover from the buddy era and inflated every slab's metadata block to the next power of two. With a ClientMeta provider whose per-slab storage is non-pow2 (e.g. allocation bitmap + small fixed header), this rounding doubled the metadata overhead. Publish MIN_META_ALIGN on each LocalState (= MetaRange::UNIT_SIZE). Add BackendAllocator::meta_size_round, which pads to MIN_META_ALIGN and steps up to MIN_CHUNK_SIZE for requests that would bypass the small range to the parent. Replace all four next_pow2-rounded metadata sites in backend.h with this helper.
A new test func/client_meta_nonpow2 installs a ClientMetaDataProvider whose per-slab storage is non-pow2 and exercises alloc/dealloc round-tripping across several sizeclasses; any disagreement between alloc-side and dealloc-side rounding would trip the meta range's dealloc_range assertions. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- src/snmalloc/backend/backend.h | 33 +++++-- src/snmalloc/backend/meta_protected_range.h | 6 ++ src/snmalloc/backend/standard_range.h | 6 ++ .../client_meta_nonpow2.cc | 93 +++++++++++++++++++ 4 files changed, 130 insertions(+), 8 deletions(-) create mode 100644 src/test/func/client_meta_nonpow2/client_meta_nonpow2.cc diff --git a/src/snmalloc/backend/backend.h b/src/snmalloc/backend/backend.h index adaccc128..5f8a0aca6 100644 --- a/src/snmalloc/backend/backend.h +++ b/src/snmalloc/backend/backend.h @@ -23,7 +23,27 @@ namespace snmalloc using Pal = PAL; using SlabMetadata = typename PagemapEntry::SlabMetadata; - public: + /** + * Round a metadata allocation size to a value the meta range can + * service. + * + * - Pads to `LocalState::MIN_META_ALIGN` so that the in-band small + * meta range (`SmallArenaRange`) accepts it. + * - If the result reaches `MIN_CHUNK_SIZE`, the request will bypass + * the small range to the parent `LargeArenaRange`, which requires + * `MIN_CHUNK_SIZE` alignment; step up to satisfy that. + * + * Alloc and dealloc sites MUST share this helper so a chunk's + * metadata is freed at the same size it was allocated. + */ + SNMALLOC_FAST_PATH static size_t meta_size_round(size_t size) + { + size_t r = bits::align_up(size, LocalState::MIN_META_ALIGN); + if (r >= MIN_CHUNK_SIZE) + r = bits::align_up(r, MIN_CHUNK_SIZE); + return r; + } + /** * Provide a block of meta-data with size and align. 
* @@ -47,10 +67,7 @@ namespace snmalloc if (local_state != nullptr) { auto& meta_range = local_state->get_meta_range(); - using MetaRangeT = stl::remove_reference_t; - size_t alignment = - bits::max(bits::next_pow2(size), MetaRangeT::UNIT_SIZE); - p = meta_range.alloc_size_with_align(size, alignment); + p = meta_range.alloc_range(meta_size_round(size)); } else { @@ -58,7 +75,7 @@ namespace snmalloc GlobalMetaRange::ConcurrencySafe, "Global meta data range needs to be concurrency safe."); GlobalMetaRange global_state; - p = global_state.alloc_range(bits::next_pow2(size)); + p = global_state.alloc_range(meta_size_round(size)); } if (p == nullptr) @@ -110,7 +127,7 @@ namespace snmalloc // Calculate the extra bytes required to store the client meta-data. size_t extra_bytes = SlabMetadata::get_extra_bytes(sizeclass); - auto meta_size = bits::next_pow2(sizeof(SlabMetadata) + extra_bytes); + auto meta_size = meta_size_round(sizeof(SlabMetadata) + extra_bytes); #ifdef SNMALLOC_TRACING message<1024>( @@ -210,7 +227,7 @@ namespace snmalloc // Calculate the extra bytes required to store the client meta-data. size_t extra_bytes = SlabMetadata::get_extra_bytes(sizeclass); - auto meta_size = bits::next_pow2(sizeof(SlabMetadata) + extra_bytes); + auto meta_size = meta_size_round(sizeof(SlabMetadata) + extra_bytes); local_state.get_meta_range().dealloc_range( capptr::Arena::unsafe_from(&slab_metadata), meta_size); diff --git a/src/snmalloc/backend/meta_protected_range.h b/src/snmalloc/backend/meta_protected_range.h index e11e0d3e4..76916e82f 100644 --- a/src/snmalloc/backend/meta_protected_range.h +++ b/src/snmalloc/backend/meta_protected_range.h @@ -112,6 +112,12 @@ namespace snmalloc MetaRange meta_range; public: + /// Granularity of the local meta range. Backend rounds metadata + /// allocation sizes up to this; replaces pow2 rounding. 
+ static constexpr size_t MIN_META_ALIGN = MetaRange::UNIT_SIZE; + static_assert( + bits::is_pow2(MIN_META_ALIGN), "MIN_META_ALIGN must be a power of two"); + using Stats = StatsCombiner; ObjectRange* get_object_range() diff --git a/src/snmalloc/backend/standard_range.h b/src/snmalloc/backend/standard_range.h index 706f6ab1e..f46e6085d 100644 --- a/src/snmalloc/backend/standard_range.h +++ b/src/snmalloc/backend/standard_range.h @@ -62,6 +62,12 @@ namespace snmalloc ObjectRange object_range; public: + /// Granularity of the local meta range. Backend rounds metadata + /// allocation sizes up to this; replaces pow2 rounding. + static constexpr size_t MIN_META_ALIGN = ObjectRange::UNIT_SIZE; + static_assert( + bits::is_pow2(MIN_META_ALIGN), "MIN_META_ALIGN must be a power of two"); + // Expose a global range for the initial allocation of meta-data. using GlobalMetaRange = Pipe; diff --git a/src/test/func/client_meta_nonpow2/client_meta_nonpow2.cc b/src/test/func/client_meta_nonpow2/client_meta_nonpow2.cc new file mode 100644 index 000000000..31cb84ee0 --- /dev/null +++ b/src/test/func/client_meta_nonpow2/client_meta_nonpow2.cc @@ -0,0 +1,93 @@ +/** + * Exercises the slab metadata allocation path with a ClientMetaDataProvider + * whose per-slab extra_bytes is non-power-of-two. + * + * Before Phase C the backend rounded slab metadata sizes up to the next + * power of two, hiding any non-pow2 storage cost. With Phase C the + * backend rounds to `MIN_META_ALIGN` (= meta range UNIT_SIZE), so a + * non-pow2 client meta size now actually occupies a non-pow2 slab + * metadata block. This test gates the alloc/dealloc round-trip on that + * path: if `meta_size_round` is wrong, an inconsistent alloc/dealloc + * size would either trip an assertion in the meta range or leak. + */ + +#include "test/setup.h" + +#include +#include +#include +#include + +namespace snmalloc +{ + /** + * Per-slab client meta: `max_count + 7` bytes of storage. 
With + * `StorageType = uint8_t`, the resulting extra_bytes + * (= (required_count - 1) * 1) is non-power-of-two for typical + * sizeclass slab object counts. + */ + struct NonPow2ClientMetaDataProvider + { + using StorageType = uint8_t; + using DataRef = uint8_t&; + + static size_t required_count(size_t max_count) + { + return max_count + 7; + } + + static DataRef get(StorageType* base, size_t index) + { + return base[index]; + } + }; + + using Config = snmalloc::StandardConfigClientMeta; +} // namespace snmalloc + +#define SNMALLOC_PROVIDE_OWN_CONFIG +#include + +int main() +{ +#if defined(SNMALLOC_ENABLE_GWP_ASAN_INTEGRATION) + // This test does not make sense in GWP-ASan mode. + return 0; +#else + // Spread allocations across several small sizeclasses to force a + // variety of slab metadata sizes; each combination of (slab object + // count, +7 bytes) produces a different non-pow2 extra_bytes. + constexpr size_t sizes[] = {16, 48, 96, 192, 512, 1024}; + std::vector> ptrs; + + for (size_t round = 0; round < 5; round++) + { + for (size_t s : sizes) + { + for (size_t i = 0; i < 200; i++) + { + auto p = snmalloc::libc::malloc(s); + auto& meta = snmalloc::get_client_meta_data(p); + uint8_t tag = static_cast((round * 31 + s + i) & 0xff); + meta = tag; + memset(p, tag, s); + ptrs.emplace_back(p, tag); + } + } + } + + for (auto [p, tag] : ptrs) + { + auto& meta = snmalloc::get_client_meta_data(p); + if (meta != tag) + { + std::cout << "Meta mismatch: expected " << int(tag) << " got " + << int(meta) << std::endl; + abort(); + } + snmalloc::libc::free(p); + } + + return 0; +#endif +} From 4a8cca9374e36694d01f827dd83c2500b6784f27 Mon Sep 17 00:00:00 2001 From: Matthew Parkinson Date: Tue, 9 Jun 2026 15:12:06 +0100 Subject: [PATCH 26/31] Move IDEA.md to docs/Arena.md; untrack PLAN.md Rewrite the design intro as a docs/ companion to AddressSpace.md: * Drop the LargeBuddyRange framing (that range no longer exists). 
* Align the mechanism description with the in-tree code, which builds positive serve masks rather than the inverse skip masks the original sketch used. * Add brief sections on the two-tree structure (one bin tree per non-empty bin + one range tree for coalescing) and on the two reps Arena ships with: PagemapRep behind LargeArenaRange for whole-chunk allocations, InplaceRep behind SmallArenaRange for sub-chunk metadata. * Link out to AddressSpace.md and the prototype scripts. PLAN.md is the working planning document; untrack it and add to .gitignore so it stays local. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- .gitignore | 1 + IDEA.md | 125 -- PLAN.md | 3975 ------------------------------------------------- docs/Arena.md | 156 ++ 4 files changed, 157 insertions(+), 4100 deletions(-) delete mode 100644 IDEA.md delete mode 100644 PLAN.md create mode 100644 docs/Arena.md diff --git a/.gitignore b/.gitignore index 122a68c2f..93f844c22 100644 --- a/.gitignore +++ b/.gitignore @@ -27,3 +27,4 @@ # rust target /target +PLAN.md diff --git a/IDEA.md b/IDEA.md deleted file mode 100644 index 15d2c5282..000000000 --- a/IDEA.md +++ /dev/null @@ -1,125 +0,0 @@ -# Bitmap-Indexed Coalescing Range - -## The problem - -snmalloc's `LargeBuddyRange` only stores power-of-two blocks. A request for 5 -chunks must be served from an 8-chunk buddy block, wasting 3 chunks. We want -to store blocks at their actual size and use snmalloc's full size class -sequence at the range level. - -## The core idea: search upward, skip a mask - -Free blocks are binned by the set of size classes they can serve. To allocate, -search upward through bins — any larger block can be carved down. This almost -works perfectly, but some bins hold blocks whose alignment is too poor to -serve certain smaller, more-aligned sizes. Those bins must be masked out -during the search. - -The mechanism: `find_first_set(bitmap & ~skip_mask)`. 
The skip mask depends -only on the requested size class, not on the block. It's a small constant -that can be precomputed. - -## Why skips exist - -snmalloc's size classes follow `S = 2^e + m · 2^(e−B)`, where `B` is the -number of intermediate bits. Each size class has a natural alignment -`align(S) = S & ~(S−1)`. - -A size class with high alignment needs padding to reach an aligned address -within a block. A block of a *larger* size class with *lower* alignment may -not have room for that padding. Concretely: a block of size 5 at address 1 -can serve size 5 (alignment 1) but cannot serve size 4 (alignment 4) — there -aren't enough chunks left after padding to the first 4-aligned address. - -Same size block, different address, different capability. This is what creates -the need for separate bins and skip masks. - -## The general structure - -At each exponent level, the distinct "servable sets" (which size classes a -block can serve) form a structure with some incomparable pairs. Exhaustive -enumeration shows: - -| B | Mantissas/exponent | Bins/exponent | Max skip mask bits | -|---|-------------------:|--------------:|-------------------:| -| 1 | 2 | 2 | 0 | -| 2 | 4 | 5 | 1 | -| 3 | 8 | 13 | 4 | -| 4 | 16 | 34 | 11 | - -Each bin corresponds to a distinct servable set. The bins are ordered so that -upward search is almost always correct — the skip mask handles the exceptions. - -For any B, the structure is: -- **Most requests need no skips.** Only size classes with alignment higher - than expected for their position in the sequence need to mask anything. -- **The skip mask is a small constant** per size class, precomputable at - compile time. -- **The mechanism is identical** regardless of B: - `find_first_set(bitmap & ~skip_mask, start_bit)`. - -`prototype/skip_analysis.py` verifies this exhaustively for B = 1, 2, 3. - -## The bitmap design - -Each free block gets one bin based on its size and alignment. 
Within each -exponent, there are as many bins as there are distinct servable sets (5 for -B=2, 13 for B=3). A flat bitmap tracks which bins are non-empty. - -To allocate size class `(e, m)`: - -1. Compute the **start bit** — the first bin that could serve this size class. -2. Compute the **skip mask** — bits for bins that can't serve this request. -3. `find_first_set(bitmap & ~skip_mask, start_bit)` → pop a block from that - bin. - -The returned block may not be exactly aligned for the requested size class. -The caller **carves** the aligned region and returns any prefix/suffix -remainders to the free pool. - -## Contrast with buddy allocators - -A buddy allocator guarantees alignment by construction — a 16-chunk buddy is -always 16-aligned — but wastes space by decomposing everything into -power-of-two pieces. - -This design stores blocks at their actual size (no decomposition, no waste) -and handles alignment at allocation time by carving. The skip mask makes -lookup O(1) despite blocks having arbitrary size and alignment. - -## Concrete example (B = 2) - -At exponent `e = 2`, the size classes are 4, 5, 6, 7. There are 5 bins, -each labeled by the set of size classes it can serve at this exponent: - - Bin 0: serves {4} - Bin 1: serves {5} - Bin 2: serves {4, 5} - Bin 3: serves {4, 5, 6} - Bin 4: serves {4, 5, 6, 7} - -Allocation searches upward from the smallest sufficient bin: - - Request for 7: can use bin 4 → search bits {4} - Request for 6: can use bins 3, 4 → search bits {3, 4} - Request for 5: can use bins 1, 2, 3, 4 → search bits {1, 2, 3, 4} - Request for 4: can use bins 0, 2, 3, 4 — skip 1 → search bits {0, 2, 3, 4} - -Only the request for size 4 needs to skip a bin: bin 1 holds blocks that can -serve 5 but not 4. The skip mask is just bit 1. - -## Concrete example (B = 3) - -At exponent `e = 4`, the size classes are 16, 18, 20, 22, 24, 26, 28, 30. -There are 13 bins. 
The skip analysis shows: - - Request for 16 (align 16): must skip bins for {18}, {20}, {22}, {26} - Request for 24 (align 8): must skip bin for {26} - All other requests: no skips needed - -The pattern: size 16 has high alignment and must skip 4 bins whose blocks -are large enough but too poorly aligned. Size 24 is a "sub-power-of-two" -(alignment 8) and must skip 1 bin. All odd-coefficient sizes have low -alignment and never need to skip anything. - -Same mechanism, wider mask, same `find_first_set(bitmap & ~mask)` operation. diff --git a/PLAN.md b/PLAN.md deleted file mode 100644 index 3679ed2e9..000000000 --- a/PLAN.md +++ /dev/null @@ -1,3975 +0,0 @@ -# User Plan - -We need to refactor the backend buddy allocator to use the more general concept in IDEA.md, which uses a more general concept of sizeclasses than powers of two to avoid internal fragmentation. - -The design will use the Red-Black tree that currently underlies the buddy allocator, but in a different shape: two parallel trees instead of one-per-exponent. - -Each block will be part of two structures: - -* [Bin] A red-black tree of all blocks held by this Arena, in the same bin, ordered by address. -* [Range] A red-black tree of all blocks held by this Arena, ordered by address. - -Note that for block of the minimum size will be handled specially as there is insufficient space to have them particpate in both structures, so they will only particpate in the first. - -## Representation - -We use 2 bits to represent the mode of this block of memory - -00 - Minimum size, only in first red-black tree. Single pagemap entry for this block is used for the RB-tree -01 - 2 * minimum size (2-aligned), in both red-black trees. Two pagemap entries for this block are used for the RB-tree -10 - > 2 * minimum size, in both red-black trees. Three pagemap entries used for this block, first two redblack tree, third stores accurate size of block. -11 - 2 * minimum size (NOT 2-aligned), in both red-black trees. 
Two pagemap entries for this block are used for the RB-tree. Goes into a size-1 bin since it cannot serve aligned size-2 requests. - -This means it is possible to find the precise size of a block which can account for additional state that is lost by the binning. - -## Maximal consolidation. - -When a block, A, is added, we check if the predeccessor, P, and successor, S, blocks are also in the "Range" red-black tree, if they are then we can combine this block A with P and/or S if they were in the RB-tree. We must also check the "Bin" for the minimum size RB-tree as the minimum sized blocks are not in the "Range" RB tree. - -When we combine a block, we remove the blocks from the appropiate "Bin", and then add the combined block to the appropriate "Bin" for the combined block, we remove one block from the "Range" RB-tree, as we can continue to reuse its entry and not need to mutate the RB-tree. - -## Allocation - -Follows the IDEA.md design, find the smallest bin that can serve the request. Then add back any things that are carved off the block to the free pool. - -## Multiple instances - -As all the state is looked up from the RB-tree, then we can have multiple instances of the data-structure. This allows us to have both thread-local and global RB-trees. - -## Implementation - -### Build Arena - -This should use two RB-trees. - -It should support adding and removing blocks. - -There should be unit tests that check that it is functioning correctly. - -There should be a runtime checked invariant that -* the system is maximally consolidated, and -* the system is consistent between the two RB-trees. - -### Build LargeArenaRange - -This should wrap the Arena using the snmalloc Range approach that is used in the current backend pipelines. - -### Update backend to use LargeArenaRange - -### Update front-end to request non-power of two size classes for the backend. - -### Generalise the large size classes to no longer be just power of two. 
- -### Fix memcpy protection - -To find the start of a block will require the pagemap to additionally store an offset. - -Currently, the find the start of a block. Performs an alignment to find the start of the "slab", and uses reciprocal division to find the offset within the slab. For large allocations, we just used the start of the slab as everything was aligned to a power of two. We know need to do - -align(ptr, slab_size) - (offset(ptr) * slab_size) - -Here, offset is stored in the pagemap, and allows us to find the start of the block. We will need to store the offset in every entry of the pagemap for the block as we need to support requests in each offset within the block. - -## Extensions - -This is not to be done in this initial implementation, but we should consider this for possible future extensions, and should not be ruled out by any design. - -### Integrated Decay Range - -We can extend the system by effectively having multiple "Range" RB-trees, and then use multiple ranges to track how long a block has been in the backend. We would always add blocks to the "most recent" Range, and as time passes switch which RB-tree is considered the most recent. The oldest one, can then be passed back to the OS, or alter whether it is MADV_FREE or MADV_DONTNEED. - ---- - -# Implementation plan: Arena phase - -## Scope of this phase - -This plan covers **only** the `Arena` data structure and its standalone -unit tests. The following are explicitly deferred to follow-up plans, each of -which will become its own PLAN.md revision: - -- `LargeArenaRange` — wrapping `Arena` behind snmalloc's Range API. -- Backend integration — replacing `LargeBuddyRange` in - `backend/standard_range.h` and `backend/meta_protected_range.h`. -- Front-end requesting non-power-of-two chunk sizes from the backend. -- Generalising the large size classes to no longer be power-of-two only. 
-- Memcpy protection fix — storing per-chunk `offset` in the pagemap so the - start of a large allocation can be recovered from any address within it. - -The pagemap encoding chosen in this phase **must leave room** for the future -per-entry `offset` field, so the memcpy fix can land later without re-doing -the encoding work. See "Pagemap encoding" below. - -## Design notes - -### Bins: one Bin tree per IDEA servable-set bin - -Per `IDEA.md` and `prototype/skip_analysis.py`, free blocks are classified by -the *servable set* — the set of size classes they can serve, given their size -and alignment. - -For `INTERMEDIATE_BITS=B` the bin count per exponent is `B=1: 2, B=2: 5, -B=3: 13, B=4: 34`. The snmalloc default is `B=2`. The number of exponents in -range is `MAX_SIZE_BITS - MIN_CHUNK_BITS + 1`. - -Each `Arena` instance owns: - -- A flat array of RBTree roots, indexed by bin id (one bin id per - (exponent, servable-set) pair). -- A flat bitmap (`size_t words[NUM_BITMAP_WORDS]`) tracking which bin - RBTrees are non-empty. Word width tracks `bits::BITS` so 32-bit - builds work too. - -Allocation for a request of `n_chunks`: - -1. `bitmap.find_for_request(n_chunks)` returns the bin id of the - smallest serving bin (or `SIZE_MAX` if none). Internally this loads - the per-sc `(start_word, first_mask, second_mask)` triple for - `n_chunks` and applies one AND per word to locate the first set bit - in a serving position. -2. Pop a block from that bin's RBTree (smallest address — `remove_min`). - If the tree empties, `bitmap.clear(bin_id)`. -3. `carve(block, n_chunks)` splits into pre-pad / aligned request of - exactly `n_chunks` chunks / post-pad. SC rounding stays internal: - the SC for `n_chunks` only fixes the alignment of the request and - the minimum block size required; any remainder beyond `n_chunks` - rolls into `post`. Re-add any non-empty pre/post via `add_block` - (which classifies the remainder via `bitmap.add(remainder)`). 
- -The bin classification and per-sc search masks (`start_word`, -`first_mask`, `second_mask`) are precomputed at `constexpr` time -directly from the size-class structure (no runtime tables beyond the -bitmap of non-empty bins). - -### Range: single tree across all blocks (with min-size exception) - -A single RBTree per `Arena` orders all *non-min-size* free blocks by -address. This is the structure used for adjacency lookup during -consolidation: - -- `predecessor(A)`: largest Range-tree entry with address less than A. -- `successor(A)` : smallest Range-tree entry with address greater than A. - -Min-size blocks are **not** in the Range tree (see "Min-size special case" -below); their adjacency is detected via a `find` in the min-size Bin -RBTree. - -### Block size variants and pagemap encoding - -A free block occupies one or more `MIN_CHUNK_SIZE` chunks, with a pagemap -entry per chunk. The first pagemap entry of a free block carries a -**variant tag** that tells `Arena` how to interpret the other -entries in the block: - -| Variant | Value | Block size | Alignment | Pagemap entries used by Arena | -|-------------|-------|----------------|----------------|------------------------------------------------------------------------| -| `Min` | 0 | exactly min | any | 1 entry — both words store the Bin RBTree node (left/right + colour). | -| `TwoMin` | 1 | exactly 2× min | 2-aligned | 2 entries — first stores Bin node, second stores Range node. | -| `Large` | 2 | > 2× min | any | 3 entries — first Bin, second Range, third stores precise block size. | -| `OddTwo` | 3 | exactly 2× min | **not** 2-aligned | 2 entries — first stores Bin node, second stores Range node. | - -#### Unaligned size-2 blocks (`OddTwo`) - -A size-2 block at an odd chunk address (e.g. chunk 3) cannot serve any -size-2 allocation request because all size-2 SCs require 2-chunk -alignment. `bin_index({odd, 2})` correctly places such blocks into a -size-1 bin. 
However, the `Min` variant can only store one pagemap entry, -and a size-2 block occupies two entries and participates in the range -tree. - -The `OddTwo` variant resolves this: it marks a size-2 block that is not -2-aligned. Like `TwoMin`, it uses two pagemap entries and lives in the -range tree. Unlike `TwoMin`, it goes into a size-1 bin (since it can't -serve aligned size-2 requests). - -The consolidation code's `contains_min` check probes bin 0 for -single-chunk neighbours. Since `OddTwo` blocks also land in bin 0 -(both `Min` and `OddTwo` have a size-1 servable set at exponent 0), -`contains_min` must filter by variant: after finding an address in -bin 0, it checks `get_variant(addr) == Min` to confirm the block is -truly single-chunk. `OddTwo` blocks are found via range-tree neighbour -lookup instead, which correctly returns their size as 2. - -Note: only blocks at even chunk addresses can be `TwoMin`. The -`variant_of` function must take both size and chunk address to -distinguish `TwoMin` from `OddTwo`. - -**Tree membership is the source of truth for "is this block free?".** -The variant tag is only meaningful for entries `Arena` reaches via -its own RBTrees; nothing outside the data structure probes the tag. The -tag is therefore not a state machine that needs an explicit -"BackendOwned" / "allocated" value: when a block is removed from its -trees, the tag's bits are simply not consulted again until the same -chunk(s) are re-added. - -This phase needs to allow **arbitrary chunk counts** for `Large` blocks, -not just exact size-class sizes. Carving will produce non-class -remainders (e.g. for `B=2`, a 9-chunk prefix), and those must round-trip -through `add_block` / `remove_block` without any silent rounding. The -`Large` block's precise chunk count is stored in the third pagemap entry; -`Min` and `TwoMin` sizes are implicit in the variant. - -**Bit positions** are an internal detail of the new Rep: - -- The variant tag needs 2 bits. 
They live in the first word of the first - pagemap entry of the block, in bits above `BACKEND_RESERVED_MASK` - (bits 0–7) and the existing `RED_BIT` (bit 8) — e.g. bits 9–10. The - chunk-aligned-address keys leave bits below `MIN_CHUNK_BITS` (=14) - free, so this is comfortably within budget. -- The Rep's `get`/`set` for the first word must preserve **both** - `RED_BIT` and the variant-tag bits, generalising `BuddyChunkRep`'s - current `RED_BIT`-only preservation. - -The exact bit positions are documented only inside the new Rep next to -the accessors. `BuddyChunkRep` and `largebuddyrange.h` are not modified -in this phase. - -This phase does **not** define any storage for the future per-entry -`offset` field that the memcpy fix will need. The plan only claims that -choosing 2 bits above `RED_BIT` does not preclude a sensible future -offset layout; the concrete offset design is the responsibility of the -memcpy-fix follow-up plan. - -### Adjacency lookup - -All adjacency lookups are performed via RB-tree finds in this -`Arena`'s own trees. **No pagemap probing.** The pagemap is shared -across `Arena` instances (e.g. thread-local + global), and reading -entries owned by another instance would be unsafe under concurrent -modification. By restricting reads to RB-tree traversals — which only -follow pointers we wrote, into entries we own — adjacency detection is -race-free without any synchronisation at this layer. - -For an incoming block `A` of size `S` at address `addr_A`: - -- `(P_range, S_range) := Range.neighbours(addr_A)` — one walk yields both - non-min neighbours. - - If `P_range.addr + P_range.size == addr_A`, the non-min left - neighbour is `P_range`; merge. - - If `S_range.addr == addr_A + size_A`, the non-min right neighbour is - `S_range`; merge. -- If no non-min left neighbour was found and `A` is min-eligible at its - boundary: `MinSizeBin.find(addr_A - MIN_CHUNK_SIZE)`; if present, - merge. 
-- If no non-min right neighbour was found: `MinSizeBin.find(addr_A + - size_A)`; if present, merge. - -`MinSizeBin` is the single Bin RBTree that holds all blocks whose -servable set is `{1 chunk}` (bin 0). This includes both `Min` (size-1) -and `OddTwo` (unaligned size-2) blocks. The `contains_min` helper -performs a `find` in bin 0, then checks `get_variant(addr) == Min` to -confirm the block is truly single-chunk — `OddTwo` entries are skipped -so they are handled by the range-tree neighbour lookup instead. - -Min-size adjacency therefore costs at most one Bin-tree `find` per side -per `add_block`. The Range-tree `neighbours(addr_A)` query yields both -non-min neighbours in a single `O(log n)` walk; no additional pagemap -touches are introduced. - -### Consolidation: reusing tree entries when possible - -The user plan calls out that when consolidating `A` with predecessor `P`, -the Range tree node belonging to `P` can be reused for the consolidated -block without any RB-tree mutation: the combined block has the same -starting address as `P`, so its Range tree key is unchanged. Only the Bin -tree is mutated (remove `P` from its bin, insert combined into its new -bin). - -This optimisation applies **only when `P` is non-min**, i.e. when `P` has a -Range tree entry to reuse. When `P` is min-size, `P` has no Range entry, -and the merged block (which is non-min) must be inserted into the Range -tree normally. The same applies to the `P+S` case: reuse `P`'s Range entry -only if `P` is non-min; otherwise insert the merged block into the Range -tree, then remove `S`'s entry. - -When consolidating `A` with successor `S` (and no `P`), the combined block -starts at `addr_A`, not `S.addr`. Two strategies: - -- **Simple (initial)**: remove `S` from the Range tree, insert combined at - `addr_A`. Two RB-tree operations. 
-- **Optimised (deferred)**: walk to `S`'s parent via the path returned by - `find`, redirect that parent's child pointer to the new node at `addr_A`, - and copy `S`'s left/right/colour into the new node's pagemap entry. Zero - rotations. - -This plan implements the simple strategy first and gates the optimised -strategy behind a follow-up step with an A/B test. - -After a merge, the combined block could in principle be adjacent to a -further block. However, if the system was maximally consolidated before -`A` was added, then `P` and `S` were not adjacent to anything else, so a -single merge step suffices. The invariant check verifies this. - -### Min-size special case - -A min-size free block has only one pagemap entry, which fits one RBTree -node. The Bin tree for min-size blocks uses that entry. The Range tree -excludes min-size blocks. Adjacency for min-size neighbours is found via -`MinSizeBin.find(addr)`, never by reading the pagemap directly. - -### Write ordering within add/remove - -Because adjacency lookups are RB-tree-only, a block is "visible to -adjacency" exactly when it is reachable from one of this `Arena`'s -RBTree roots. The ordering rules collapse to two: - -- **add_block**: write the variant tag and any auxiliary data (precise - size for `Large`) into the block's pagemap entries *before* the final - RB-tree insertion that makes the block reachable. Anyone who finds the - new block via a tree walk must see a fully-initialised block. -- **remove_block / consolidation**: remove the block(s) from the - RB-tree(s) *before* reusing or overwriting their pagemap entries. - After unlinking, the block is no longer reachable, and the entries are - free for reuse by the next operation. - -No transient "BackendOwned" marker is needed: a chunk's free-ness is -synonymous with its membership in some RBTree owned by this -`Arena`. - -### Invariants (debug-only, runtime-checked) - -The `Arena::invariant()` method checks: - -1.
**Maximally consolidated**: walking the Range tree in order, no two - adjacent entries have `prev.addr + prev.size == curr.addr`; no min-size - block in `MinSizeBin` is adjacent (at `addr ± MIN_CHUNK_SIZE`) to - anything else in either tree. -2. **Cross-tree consistency**: every non-min block in the Bin trees is in - the Range tree; every Range tree entry is in exactly one Bin tree; - sizes agree between the two views. -3. **Bin classification correctness**: each block is in the bin determined - by its `(addr, size)` servable set; arbitrary chunk counts (not just - exact size-class sizes) are classified correctly. -4. **Bitmap consistency**: the non-empty-bins bitmap is set iff the - corresponding RBTree is non-empty. -5. **Variant-tag consistency**: every block reachable from a tree root - has a variant tag (`Min` / `TwoMin` / `OddTwo` / `Large`) matching its - actual chunk count and, for size-2 blocks, its chunk-address parity. - (No "BackendOwned" tag is needed because tree membership - is the source of truth for freeness.) - -### Backend chunk size classes - -The new bins are indexed in **chunk units** (1 chunk = `MIN_CHUNK_SIZE` -bytes), not bytes, and not the front-end `sizeclass_t` whose large variant -is currently power-of-two only. `arenabins.h` defines the -chunk-unit size-class scheme using the snmalloc size-class formula -`S = 2^e + m · 2^(e − B)` applied at **chunk-count exponents starting -from zero**. Low-exponent special cases (chunk counts 1, 2, 3, …) follow -the same pattern as `bits::from_exp_mant` in -`src/snmalloc/ds_core/sizeclassstatic.h`: at small exponents the mantissa -space is degenerate, handled by enumeration. - -The public API of `ArenaBins` — the integration contract -`Arena` builds on — is intentionally narrow: - -- `struct range_t { size_t base; size_t size; }` — a chunk-count range - used to describe free blocks and carved sub-ranges. -- `struct carve_t { range_t pre, req, post; }` — output of a carving - operation; either of `pre`/`post` may have `size == 0` (absent).
-- `static carve_t carve(range_t block, size_t n_chunks)` — given a - free block and an allocation request, split into pre-pad / aligned - request of exactly `n_chunks` chunks / post-pad. SC rounding stays - inside the carve: the SC for `n_chunks` only fixes alignment and the - servability precondition. Pure function; does not touch the bitmap. -- `max_supported_chunks() -> size_t` — upper bound on legal `n_chunks`; - used for assertions. -- nested `Bitmap` — the routing layer; see below. - -The `Bitmap` is a per-arena non-empty-bins bitmap that owns the -classification of `(base, size)` pairs to bin ids. Its public surface is -exactly three operations: - -- `add(range_t block) -> size_t` — classify `block` into a bin, ensure - the bit for that bin is set, return the `bin_id` so the caller - inserts the block into `bin_trees[bin_id]`. **Idempotent**: callable - on a block already represented in the trees; setting an already-set - bit is a no-op. This is the only public way to learn a bin id for a - given `(base, size)` block, including during consolidation lookups - for neighbours that are already present. -- `find_for_request(size_t n_chunks) -> size_t` — locate the first set - bin satisfying a request for `n_chunks`. Returns `SIZE_MAX` if no bin - in this arena fits. -- `clear(size_t bin_id)` — caller has popped the last element from - `bin_trees[bin_id]`; the bitmap bit is cleared. - -There are deliberately no general bitmap operations (`set`/`has`/ -`empty`/etc.) on the public surface — the bitmap is not a generic data -structure but a routing index whose only meaningful operations are the -three above. The `bitmap_info_t` / `carve_info_t` rodata layouts, the -`bin_index` classifier, and `bitmap_info_for_request` / -`carve_info_for_request` are private (the bitmap and `carve` consume -them internally). 
- -The size-class encoding details — the `bitmap_info_t` / `carve_info_t` -rodata records, the bin-scheme constants (`B`, `MANTISSAS_PER_EXP`, -`BINS_PER_EXP`, `MAX_SC`), `bitmap_info_for_request` / -`carve_info_for_request`, `bin_index`, and the constexpr per-sc -accessors — are private implementation details. They are reachable -only via the friend struct `ArenaBinsTestAccess` (defined in -the test translation unit, see Phase 1) so unit tests can exercise -them directly; code outside this header does not depend on -them. - -**Free blocks may have arbitrary chunk counts**, including non-class -sizes that arise from carving (e.g. a 9-chunk prefix at `B=2`). The -private `bin_index` operates on arbitrary `(address, size)` pairs and -classifies into a bin by the block's servable set; the public `Bitmap` -exposes this only through `add(range_t)`, which returns the bin id. -Exact size classes appear only on the request side; free blocks store -their precise chunk count where needed (`Large` variant). - -### Exponent / bin-count bounds - -`Arena` takes **byte-size -exponent** bounds (mirroring `Buddy`). -`MIN_SIZE_BITS` is the log2 of the unit of allocation; everything inside -the arena is in multiples of `1 << MIN_SIZE_BITS`. The upper bound is -**exclusive**. The total number of bins is -`(MAX_SIZE_BITS - MIN_SIZE_BITS) * BINS_PER_EXP` plus the -degenerate-low-exponent bins. Static assertions encode the exclusive -semantics; tests exercise minimum, just-below-max, and exact-max sizes -(the last triggers overflow back to the parent, mirroring `Buddy`). - -The chunk-unit bin scheme is independent of `sizeclass_t::as_large()` -for now. The "Generalise the large size classes" follow-up plan will -reconcile the front-end large size classes with this scheme. - -### Multiple instances - -All state lives in pagemap-backed nodes and in per-instance roots/bitmaps; -no global state. Multiple `Arena` instances can coexist (thread-local -and global) for the future Range wrapper. 
- -## Phases - -Each phase produces a test gate that must pass before the next phase begins. -A phase that touches the tree itself must also keep all existing tests -(including `redblack.cc`) green. - -### Reviewer protocol (applies to every phase below) - -Each phase ends with **two** gates, both of which must clear before -the next phase starts: - -- **Test gate** — the listed tests pass on a Debug build (per - `.github/skills/building_and_testing.md`). -- **Review gate** — spawn a fresh-context `code-review` subagent on - the diff added in that phase. The reviewer prompt includes: - 1. The plan section for the current phase (treat as spec). - 2. The diff produced by the phase (compared to the previous - phase's tip). - 3. A reminder that this phase's scope is *only* what the plan - section describes; cross-phase concerns are out of scope. - 4. A pointer to `claude.md` for codebase conventions (no raw - compiler attributes, no C++ STL, `SNMALLOC_*` - macros, etc.). - - Address findings, re-spawn a fresh-context reviewer, loop until - a reviewer reports no issues. Disputes with reviewer findings - are escalated to the user, not resolved unilaterally. - -Phases 0 and 6 are exempted from the review gate: Phase 0 adds no -code; Phase 6 is test-only over already-reviewed code. -Phase 7 is the final mandatory review per `claude.md`. - -### Phase 0: Baseline - -Per `claude.md` "Baseline the checkout before starting work": run a clean -Debug build and the full test suite via the testing subagent protocol in -`.github/skills/building_and_testing.md`. Record the results. If the baseline is -broken, stop and report — do not start implementation on a broken base. - -**Test gate**: full ctest run completes; record pass/fail status of each -test for later comparison.
- -### Phase 1: ArenaBins — bin scheme, per-sc tables, and bitmap - -Add `src/snmalloc/backend_helpers/arenabins.h` defining -`ArenaBins`: the chunk-unit size-class -scheme, two per-sc rodata tables, the free-block classifier, and the -nested non-empty-bins bitmap that the allocation fast path scans. - -#### Public surface — the integration contract - -- `struct range_t { size_t base; size_t size; }` — chunk-count range. -- `struct carve_t { range_t pre; range_t req; range_t post; }` — output - of a carving operation; `pre` and/or `post` may have `size == 0`. -- `static SNMALLOC_FAST_PATH carve_t carve(range_t block, size_t n_chunks)` - — split a free `block` into pre-pad, aligned request of exactly - `n_chunks` chunks, and post-pad. SC rounding stays internal: the SC - for `n_chunks` only fixes alignment and the servability precondition; - any rounding remainder absorbs into `post`. Pure. **Preconditions** - (asserted): - `n_chunks >= 1 && n_chunks <= max_supported_chunks()`, - `block.size > 0`, and `block` is servable for `n_chunks` (the caller - has already used `Bitmap::find_for_request`). -- `static constexpr size_t max_supported_chunks()` — upper bound on - legal `n_chunks`; used for assertions. -- nested `class Bitmap` — three methods, all that other code - calls into: - - `size_t add(range_t block)` — classify `block`, ensure the bit - for the resulting bin is set, return the bin id so the caller can - insert `block` into `bin_trees[bin_id]`. **Idempotent**: also the - way to obtain the bin id of an existing neighbour during - consolidation. **Precondition**: `block.size >= 1 && - block.size <= max_supported_chunks()`. - - `size_t find_for_request(size_t n_chunks) const` — smallest set - bin servable for `n_chunks`; `SIZE_MAX` if none. - - `void clear(size_t bin_id)` — caller has popped the last element - from `bin_trees[bin_id]`; clears the bit. 
- - `static constexpr size_t TOTAL_BINS` — strict upper bound on bin - ids; exposed so callers can size `bin_trees`. - -No general bitmap operations and no size-class handles are exposed. -All other members are private; the unit test reaches them through a -friend struct, defined in the test translation unit (see "Test surface" -below). - -#### Bin scheme - -Following `prototype/skip_analysis.py`: - -- `B = INTERMEDIATE_BITS` (mantissa bits, currently restricted to - `{1, 2, 3}`). -- `MANTISSAS_PER_EXP = 1 << B` (4 / 8 mantissa positions; 2 for B=1). -- `BINS_PER_EXP` = 2 / 5 / 13 for `B` = 1 / 2 / 3 — the count of - distinct *servable subsets* of mantissas at each exponent. Each bin - is a single bit in the bitmap and a single RB-tree at the - `Arena` layer; bins are not size classes (multiple size - classes share a bin) and not exponents (each exponent has multiple - bins). -- `MAX_SC = ((bits::BITS - B) << B) + ((1 << B) - 1)` — one past the - largest raw id that `bits::to_exp_mant_const` produces whose - decoded size fits in `size_t`. The architectural max raw id decodes - to `2^bits::BITS`, which overflows; the tables stop one entry short - to keep `from_exp_mant(MAX_SC - 1)` valid. Sizes for `B` = 1 / - 2 / 3 on 64-bit: 127 / 251 / 495. -- `max_supported_chunks() = bits::from_exp_mant(MAX_SC - 1)` — - enormous in practice (far beyond any real arena). - -#### Per-sc rodata tables - -Two power-of-two-sized structs, each indexed by raw sc id with a -single shift+add: - -```cpp -struct alignas(4 * sizeof(size_t)) bitmap_info_t { - size_t start_word, first_mask, second_mask; -}; -struct carve_info_t { size_t size_chunks, align_chunks; }; -``` - -`alignas(4 * sizeof(size_t))` on `bitmap_info_t` rounds its `sizeof` -up to a power of two (C++ requires `sizeof(T)` to be a multiple of -`alignof(T)`), so the table indexes with a shift+add without needing -a named padding member. 
- -Split into two tables — rather than one combined record — because the -two consumers run at different phases of allocation/free: - -- `bitmap_info_t` is read by `Bitmap::find_for_request` (bin-selection - on allocate). -- `carve_info_t` is read by `carve` (post-pop split on allocate) and by - `bin_index`'s cascade-fit predicate (free-side classification). - -The fields of `bitmap_info_t` are **pre-shifted into the bitmap's word -layout** so the search is two ANDs: - -- `start_word`: the bitmap word containing the SC's lowest serving - bin. -- `first_mask`: serve mask pre-shifted into `start_word`. Bit `i` set - iff `words_[start_word]` bit `i` serves this SC. -- `second_mask`: serve mask carried into `start_word + 1`. When - `start_bit` is word-aligned (`shift == 0`) there is no within-exp - carry and every bit in that word is higher-exponent, so - `second_mask = ~size_t(0)`. - -`static_assert` pins both struct sizes (4 words and 2 words) so the -table index lowers to a shift+add. - -#### Tables and classifier — populated at constexpr build time - -A private `BinTable` struct holds (all `ModArray<...>`): - -- `bitmap_info[MAX_SC]`, `carve_info[MAX_SC]` — the per-sc tables - above. -- `exp_first_sc[bits::BITS + 1]` — first raw sc id at each - ArenaBins exponent (sentinel at index `bits::BITS` equals - `MAX_SC`). NOTE: this is not uniform stride — at the bottom of the - encoding the low regime squashes multiple ArenaBins exponents - into encoded-exponent 0. -- `exp_bin_base[bits::BITS + 1]` — `e * BINS_PER_EXP`, precomputed so - `bin_index` does no runtime multiply. -- `cascade_steps[MANTISSAS_PER_EXP][MAX_CASCADE_STEPS]` — per-`m_top` - decision lists for `bin_offset_at`. - -A `static constexpr BinTable table_{}` member of `ArenaBins` -holds the populated instance. Tables sit in `.rodata`; no static -initialiser runs at program start. Combined size at B=3 is on the -order of tens of KB (estimate: 16 B/sc × 495 + 32 B/sc × 495 + small -cascade table ≈ 24 KB). 
- -The constructor populates `bitmap_info[sc]` from the canonical -`bin_subsets` table (single source of truth, matches -`prototype/skip_analysis.py`): - -- `start_bin_offset_for_m(m)`: first within-exp bin offset whose - subset contains mantissa `m`. -- `serve_mask_for_m(m)`: bitmask, relative to `start_bin_offset_for_m`, - of bins that serve `m`. Built **positively** (bit set = "serves") - rather than as a "skip" mask: the hot path AND's this directly - against the bitmap word, no NOT. -- `start_bit = exp_bin_base[e] + start_bin_offset_for_m(m)`, then - `start_word = start_bit / bits::BITS`, - `first_mask = serve_mask << (start_bit & (bits::BITS - 1))`, - `second_mask = (shift == 0) ? ~size_t(0) : ((mask >> (bits::BITS - - shift)) | (~size_t(0) << shift))`. - -For `cascade_steps`: for each `m_top`, the bins whose subset has -`m_top` as max element must form a strict containment chain when -sorted descending by popcount. This invariant is **checked at -constexpr build time** (`throw "..."` in the constexpr ctor surfaces -the violation as a compile error). Given the invariant, each -non-default candidate's discriminator is a single mantissa probe; the -list ends with a `NO_TEST` default. - -Two free-side primitives, both private (used internally by `add` and -by `carve`): - -- `bin_index(range_t block) -> size_t`: returns the bin id of `block`, - operating on arbitrary chunk counts (not just exact SCs). Walks - `m_top` from `MANTISSAS_PER_EXP - 1` down at the natural exponent - `e = prev_pow2_bits(block.size)`. If alignment padding eats every - fit at `e`, drops to `e - 1`; one drop is always sufficient (the - smallest SC at `e - 1` has size and alignment `2^(e-1)`, so worst- - case `size + pad < 2^e <= block.size`). -- `bitmap_info_for_request(n_chunks) -> const bitmap_info_t&`, - `carve_info_for_request(n_chunks) -> const carve_info_t&`: single - table read each. 
Both call `bits::to_exp_mant(n_chunks)` (the - runtime CLZ intrinsic variant) so the encode is fast and is expected - to be CSE'd when both calls appear in the same fast path. - -#### Runtime CLZ on the fast path - -Fast-path calls use the runtime intrinsic, not the -constexpr software fallback: - -- `src/snmalloc/ds_core/bits.h` provides - `template<size_t MANTISSA_BITS, size_t LOW_BITS> inline - SNMALLOC_FAST_PATH size_t to_exp_mant(size_t value)` — body - identical to `to_exp_mant_const` but using `bits::clz` instead of - `clz_const`. `static_assert(MANTISSA_BITS + LOW_BITS > 0, ...)` — - the runtime variant relies on `LEADING_BIT != 0` to guarantee - `clz`'s non-zero precondition. -- Header uses `bits::to_exp_mant(n_chunks)` on the - `bin_index` / `find_for_request` / `carve` paths; - `bits::to_exp_mant_const(...)` is used **only** at table - construction time inside the constexpr `BinTable` constructor (and - in test-only static_asserts — see "Test surface"). - -This is the existing snmalloc convention for paired runtime / -compile-time helpers (`clz` / `clz_const`, `next_pow2` / -`next_pow2_const`). - -#### Nested `Bitmap` - -```cpp -class Bitmap -{ - friend struct ArenaBinsTestAccess; - -public: - static constexpr size_t TOTAL_BINS = BINS_PER_EXP * bits::BITS; - - Bitmap() : words_{} {} - SNMALLOC_FAST_PATH size_t add(range_t block); - SNMALLOC_FAST_PATH void clear(size_t bin_id); - SNMALLOC_FAST_PATH size_t find_for_request(size_t n_chunks) const; - -private: - static constexpr size_t NUM_BITMAP_WORDS = - (TOTAL_BINS + bits::BITS - 1) / bits::BITS; - - size_t words_[NUM_BITMAP_WORDS]; -}; -``` - -- `TOTAL_BINS = BINS_PER_EXP * bits::BITS` is the strict upper bound - on `bin_index` output: `bin_index` returns `e * BINS_PER_EXP + - offset` with `e <= bits::BITS - 1` and `offset < BINS_PER_EXP`, so - the maximum is `BINS_PER_EXP * bits::BITS - 1 < TOTAL_BINS`. Values: - 128 / 320 / 832 for `B` = 1 / 2 / 3 on 64-bit.
-- `NUM_BITMAP_WORDS == BINS_PER_EXP` exactly (2 / 5 / 13 words); on - 32-bit each word is 4 B instead of 8 B, halving storage. -- `words_` is zero-initialised. Word width tracks `bits::BITS` so the - AND with the precomputed masks has no width mismatch; `bits::ctz` - on a `size_t` produces the bit index. - -Friend declarations: `ArenaBins` and its nested `Bitmap` -each carry their own `friend struct ArenaBinsTestAccess<...>;` -(C++ friendship does not transit to nested classes). - -Static asserts on bitmap layout: - -- `TOTAL_BINS == BINS_PER_EXP * bits::BITS`. -- `NUM_BITMAP_WORDS == BINS_PER_EXP`. -- `TOTAL_BINS < SIZE_MAX` — so the `SIZE_MAX` sentinel cannot collide - with a valid bin id. -- `BINS_PER_EXP <= bits::BITS` — `find_for_request` assumes the - within-exp range fits in a single word so the search straddles at - most one word boundary. Holds on 32-bit (W=32) and 64-bit (W=64) - for the current B values. If a future B pushes this above - `bits::BITS`, the two-word body must be generalised. - -`find_for_request` body: - -```cpp -SNMALLOC_FAST_PATH size_t find_for_request(size_t n_chunks) const -{ - const bitmap_info_t& info = bitmap_info_for_request(n_chunks); - SNMALLOC_ASSERT(info.start_word < NUM_BITMAP_WORDS); - - // First word: start bin + any within-exp neighbours in same word. - size_t word = info.start_word; - size_t bits = words_[word] & info.first_mask; - if (bits != 0) return word * bits::BITS + bits::ctz(bits); - if (++word == NUM_BITMAP_WORDS) return SIZE_MAX; - - // Second word: within-exp carry plus any higher-exp bits. - bits = words_[word] & info.second_mask; - if (bits != 0) return word * bits::BITS + bits::ctz(bits); - - // Remaining words: purely higher-exponent, any bit serves. 
- while (++word < NUM_BITMAP_WORDS) - if (words_[word] != 0) return word * bits::BITS + bits::ctz(words_[word]); - return SIZE_MAX; -} -``` - -The two ANDs are the entire bin-selection cost; no shifts, no -`shift == 0` branches at runtime (folded in at construction). - -#### Test surface - -`ArenaBinsTestAccess` is **forward-declared** -in `arenabins.h` (so the friend declarations can refer to it) -and **defined in the test translation unit** -`src/test/func/backend_arena_bins/backend_arena_bins.cc` (inside -`namespace snmalloc`). The header therefore carries no -test-only members. - -What the test access struct exposes (all delegating to private -internals through the friend grant): - -- Re-exports of the public types and methods, for convenience. -- The bin-scheme constants `B`, `MANTISSAS_PER_EXP`, `BINS_PER_EXP`, - `MAX_SC`. -- `using chunk_sc_t = size_t;` — raw sc id as plain `size_t`; the - header does NOT define a `chunk_sc_t` handle type. -- `request(n) -> size_t` — `bits::to_exp_mant(n)` (runtime). -- `size_chunks(sc) -> size_t`, `align_chunks(sc) -> size_t` — direct - reads of `Bins::table_.carve_info[sc]`. -- `bitmap_info(sc) -> const bitmap_info_t&`, `carve_info(sc) -> const - carve_info_t&` — direct table reads. -- `bitmap_info_for_request_const(n)`, - `carve_info_for_request_const(n)` — constexpr variants that use - `bits::to_exp_mant_const(n)`; used only inside - `static_assert`s in the test file. -- `bin_index(block) -> size_t`, `bitmap_info_for_request(n)`, - `carve_info_for_request(n)`, `carve(block, n)`, - `max_supported_chunks()` — passthroughs to the private members. -- The canonical `bin_subsets` table. -- Raw-word access on `Bitmap`: `raw_set(b, bin_id)`, `raw_has(b, - bin_id)`, `raw_empty(b)`, `raw_word(b, i)` — for exhaustive - single-bit and "no other bit changed" tests. 
- -#### Test gate - -New test `src/test/func/backend_arena_bins/backend_arena_bins.cc` -(auto-discovered via `subdirlist` of `src/test/func/`; registered in -`TESTLIB_ONLY_TESTS`). For each `B ∈ {1, 2, 3}`: - -- Compile-time properties via `static_assert` (`BINS_PER_EXP`, - `MAX_SC`, sample sizes/alignments through the `_const` variants). -- Runtime/constexpr CLZ agreement: - `to_exp_mant(n) == to_exp_mant_const(n)` over a - representative range of `n` (1, every power of two and ±1, near - `max_supported_chunks()`, several thousand random values). -- `from_exp_mant` round-trip: - `from_exp_mant(to_exp_mant(n)) >= n` and minimality - (no smaller raw id satisfies the bound). -- Bin-scheme primitives: `size_chunks(sc) >= s` for - `sc = request(s)`; idempotence `request(size_chunks(sc)) == sc`; - monotonicity of `request`; `align_chunks(sc)` is a power of two, - divides `size_chunks(sc)`, and is the largest such. -- `bin_index`: enumerate `(addr_chunks, n_chunks)` over a small grid - (including arbitrary non-class sizes) and check that `bin_index` - matches a brute-force servable-set computation expressed via the - canonical `bin_subsets` table. -- `Bitmap` raw smoke (via friend-struct raw-word access): set / clear - round-trips on individual bin ids; multi-bit states; empty check. -- `find_for_request` on empty bitmap returns `SIZE_MAX` for all - representative request sizes (including `max_supported_chunks()`). -- **Exhaustive single-bit**: for each `bin_id < TOTAL_BINS`, set - exactly that bit (raw access) and verify - `find_for_request(n_chunks)` matches a reference brute-force - scanner over a representative set of request sizes. The reference - predicate "bin b serves request n" is expressed via a - `serves(bin, n)` helper that consults `bin_subsets` directly — - the canonical source from which the precomputed - `start_word`/`first_mask`/`second_mask` are themselves derived, so - any divergence in the derivation chain is caught. 
-- **Multi-bit randomised**: thousands of random arena states - (uniformly random subset of bin ids) cross-checked against the - reference scanner over representative requests. -- **Word-boundary targeted cases**: classify the table entries - `bitmap_info_for_request_const(...)` produces by `start_bit = - start_word * bits::BITS + bits::ctz(first_mask)` (the start bin is - always the lowest set bit of `first_mask` by construction) into - aligned / fits-in-one-word / boundary-straddling. For each - category, exercise: (i) a single set bit in the first word's - considered region; (ii) first word empty + set bit in the second - word's within-exp carry; (iii) first word empty + set bit in the - second word's higher-exp region; (iv) set bits only in word 3 or - beyond. -- **`add` / `find_for_request` single-block integration**: for - representative `(base, size)` blocks, `bin_id = bm.add({base, - size})`, then `find_for_request(n_chunks) == bin_id` iff - `can_serve(base, size, n_chunks)` (the brute-force predicate using - per-class `size_chunks` / `align_chunks` from the friend struct). -- **`add` / `find_for_request` multi-block integration**: insert - several blocks; for each request, the expected result is the - smallest bin id among the added blocks that can serve it (or - `SIZE_MAX`). Pins the "first serving bin" contract. -- **`add` idempotence**: calling `add(block)` twice returns the same - bin id both times and leaves the bitmap unchanged (verified via raw - word access before and after the second call). -- `carve`: for a representative grid of `(block, n_chunks)`, the - output triple has `pre.base = block.base`, - `pre.base + pre.size = req.base`, `req.size = size_chunks(sc)`, - `req.base` is `align_chunks(sc)`-aligned, - `req.base + req.size = post.base`, and - `post.base + post.size = block.base + block.size`. - -`MAX_SC`-related `static_assert`s use `snmalloc::bits::BITS` (not -hard-coded 64) so they hold on both 32-bit and 64-bit builds. 
- -#### Review gate - -Spec slice = the Phase 1 section above. Reviewer checks: - -- Tables match the canonical `bin_subsets` (single source of truth); - `prototype/skip_analysis.py` reproduces the same numbering. -- The in-tree header carries no test-only surface (no `chunk_sc_t` - handle class, no `request`, no `_const` variants, no test-only - per-sc accessors — those live only in - `ArenaBinsTestAccess` in the test cc). -- Fast path uses runtime `bits::to_exp_mant` / `bits::clz` (not the - `_const` variants); the `_const` variants are reachable only from - the constexpr `BinTable` constructor and the test's - `static_assert`s. -- `Bitmap::find_for_request` matches the reference scanner; word- - boundary straddle is correctly handled. -- `SIZE_MAX` sentinel is unambiguous (`TOTAL_BINS < SIZE_MAX`). -- Tables sit in `.rodata` (no program-start initialiser). -- Comments earn their length: cut anything that justifies layout, - restates code, or doesn't carry correctness-relevant information. - -### Phase 2: RBTree neighbours-of-probe helper - -The current `RBTree` exposes `find`, `remove_min`, `remove_path` (taking -an `RBPath`). For Range-tree adjacency lookups we don't need predecessor -and successor as independent operations — we always want **both -neighbours of a probe value** when classifying an incoming block. A -single tree walk for `K` already records exactly that information: every -"go-right" descent passes through a node with key strictly less than `K` -(predecessor candidate); every "go-left" descent passes through a node -with key strictly greater than `K` (successor candidate). The last turn -of each kind is the tight answer. - -Add a single helper: - -- `neighbours(K) -> stl::Pair` — performs one walk for `K` and - returns `(largest entry < K, smallest entry > K)`. Either component - is `Rep::null` when no such neighbour exists. - **Precondition**: `K` is not present in the tree.
This matches the - `Arena` use case (two free blocks cannot share a starting - address, so `add_block` only calls `neighbours` on addresses not - already in the tree); in Debug an assert fires if `K` is encountered - on the descent. - -This replaces two separate `O(log n)` walks per `add_block` with one and -keeps the API surface small. Implement on top of the existing tree -walking primitives (`get_root`, `get_dir`) — no structural changes to -`RBTree` required. - -**Test gate**: extend `src/test/func/redblack/redblack.cc` with a -randomised test of `neighbours(K)` against `std::set::lower_bound` / -`upper_bound` as oracle, over thousands of operations and probe values -drawn from `K` values **not** present in the tree. Existing tests must -remain green. - -**Review gate**: spec slice = the Phase 2 section above. Reviewer -checks: walk correctly records both turn points; behaviour at empty -tree, single-node tree, `K` smaller than all keys, `K` larger than all -keys, and `K` between two consecutive keys all match the oracle; the -"K not in tree" precondition is asserted in Debug; no structural -changes to `RBTree`'s existing invariants. - -### Phase 3+4: Full Arena data structure (atomic) - -Create `src/snmalloc/backend_helpers/arena.h` with: - -- A `BackendArenaRep` concept describing word-level accessors over the - three pagemap entries, the variant tag, and the large-size accessor: - - `get_variant(addr) -> ArenaVariant` / `set_variant` - - `get_word1(addr)` / `set_word1`, `get_word2(addr)` / `set_word2` - (first entry, used by BinRep) - - `get_range_word1(addr)` / `set_range_word1`, - `get_range_word2(addr)` / `set_range_word2` (second entry, used - by RangeRep) - - `get_large_size_chunks(addr)` / `set_large_size_chunks` (third - entry) - - Rep word setters preserve only `BACKEND_RESERVED_MASK` (bits 0–7). - RED_BIT and VARIANT_MASK preservation is handled by the adapters - via read-modify-write. 
- -- Two internal RBRep adapters: - - **BinRep**: tagged `BinHandle` (root-pointer mode or child-slot - mode dispatching to Rep word1/word2). `META_MASK = RED_BIT | - VARIANT_MASK` preserved on `set`. - - **RangeRep**: tagged `RangeHandle` dispatching to Rep - range_word1/range_word2. Same `META_MASK` (paranoid masking - defends against stale variant bits from pagemap reuse). - - Both: `compare(k1, k2) = k1 > k2` so `remove_min` returns the - lowest address. `null = root = 0`. - -- `Arena`: - - `B = 2` hardcoded; `INTERMEDIATE_BITS` wiring deferred. - - `MIN_SIZE_BITS` selects the unit of allocation (= pagemap stride - when used with `PagemapRep`). - - `stl::Array bin_trees` - - `RangeTree range_tree` - - `Bins::Bitmap bitmap` - -- Full `add_block(addr, size_chunks)` with consolidation: - - Uses `range_tree.neighbours(addr)` + `contains_min()` for - adjacency. - - Unlinks merged neighbours from both trees and bitmap. - - Returns overflow `{c_addr, c_size}` when consolidation grows to - arena scale (case (ii)); returns `{0, 0}` on success. - - Asserts `addr != 0`, alignment, and size bounds. - -- Full `remove_block(n_chunks)` with carving: - - `bitmap.find_for_request(n_chunks)` → peek min via Rep → - remove from trees → `Bins::carve` → recursive `add_block` for - remainders. - -- Five-clause `invariant()`: - 1. Maximally consolidated (range-tree adjacency + min-block adjacency) - 2. Cross-tree consistency (forward and reverse membership checks) - 3. Bin classification correctness - 4. Bitmap consistency - 5. Variant-tag consistency - -- `get_root_key()` added to `RBTree` (public method, returns root key - or `Rep::null` when empty). - -- `Bitmap::test(size_t bin_id)` added to `ArenaBins` (read-only - accessor used by `invariant()`). - -Modifications to existing files: -- `src/snmalloc/backend_helpers/arenabins.h`: added - `Bitmap::test()` and made `bin_index` public. -- `src/snmalloc/ds_core/redblacktree.h`: added `get_root_key()`. 
-- `CMakeLists.txt`: added `backend_arena` to `TESTLIB_ONLY_TESTS`. - -**Test gate**: `src/test/func/backend_arena/backend_arena.cc` with -MockRep and 8 test stages (A–H): -- (A) Accessor round-trips -- (B) RBTree smoke via arena -- (C) Empty-state invariant for K ∈ {4, 5, 6} -- (D) add_block without consolidation -- (E) remove_block exact + carving -- (F) Consolidation case matrix (8 cases: all P/S × min/non-min) -- (G) Overflow (interleaved + precise) -- (H) Randomised stress (50 seeds × 500 ops) with Oracle using - `Bins::Bitmap` for exact bin-classification matching - -### Phase 5: `OddTwo` variant for unaligned size-2 blocks - -A size-2 block at an odd chunk address cannot serve size-2 requests -(which require 2-chunk alignment). `bin_index({odd, 2})` correctly -places it in bin 0 (size-1 servable set). But: - -1. `Min` variant uses only 1 pagemap entry; a size-2 block needs 2. -2. `contains_min` probes bin 0 for single-chunk neighbours — finding - a size-2 block there and treating it as size 1 corrupts metadata. - -All changes are in `arena.h` and the test file. - -1. **Add `OddTwo = 3`** to `ArenaVariant` enum. -2. **Change `variant_of`** to take `(size_chunks, chunk_index)`: - - size 1 → `Min` - - size 2, even chunk → `TwoMin` - - size 2, odd chunk → `OddTwo` - - size 3+ → `Large` -3. **Update `range_from_addr`**: `OddTwo` returns `{addr, 2}` (same as - `TwoMin`). -4. **Update `insert_block`**: pass `addr_to_chunk(addr)` to `variant_of`. - The `if (size_chunks >= 2)` range-tree checks already cover `OddTwo`. -5. **Update `contains_min`**: after finding addr in bin 0, check - `Rep::get_variant(addr) == ArenaVariant::Min`. Return false - for `OddTwo` entries. -6. **Update invariant clause 5**: pass chunk address to `variant_of`. -7. **Update invariant clause 1c** ("no two adjacent min blocks"): - skip non-`Min` entries in bin 0 (i.e., `OddTwo` blocks). -8. 
**Add test cases**: - - Odd-address size-2 block: verify variant is `OddTwo`, goes in - correct bin, lives in range tree. - - Consolidation with `OddTwo` predecessor/successor. - - `contains_min` does not match `OddTwo` addresses. - - `remove_block(1)` from an `OddTwo` block: verify carving works - and the remainder becomes `Min`. - -**Test gate**: all existing tests pass + new `OddTwo`-specific tests pass. - -### Phase 6: Consolidation — reuse predecessor's Range entry (optimisation) - -**Deferred.** Self-contained optimisation that saves two RB-tree operations -per predecessor consolidation. Can be added later if profiling shows it -matters. The design is recorded in the "Consolidation: reusing tree -entries when possible" section above. - -### Phase 7: Multi-instance test - -Instantiate two `Arena` over disjoint address ranges in -the same test process, drive workloads against both, verify each -invariant independently. - -**Test gate**: multi-instance test passes; total memory accounted for -via both instances matches expectations. - -### Phase 8: Final review and self-review - -Per `claude.md` mandatory review checkpoints: - -- Run the recursive principle check (self-review). -- Spawn a fresh-context reviewer subagent. Address findings. Loop until - reviewer finds no issues. - -**Test gate**: full ctest run (Debug) passes; reviewer reports no issues. - ---- - -# Implementation plan: LargeArenaRange phase - -## Scope - -Build `LargeArenaRange` — a Range pipeline component that wraps -`Arena` behind snmalloc's Range API, suitable for replacing -`LargeBuddyRange`. This plan covers: - -- Generalising Arena's Rep interface for pagemap compatibility. -- `PagemapRep` — adapting pagemap entries to Arena's Rep concept. -- `LargeArenaRange` — the Range wrapper with refill and overflow handling. -- Boundary-bit support for safe consolidation across PAL allocations. -- Unit tests for all of the above. 
- -The pipeline integration (replacing `LargeBuddyRange` in `standard_range.h` -and `meta_protected_range.h`) is a separate step ("Update backend to use -LargeArenaRange") that follows once this plan is complete. - -## Design - -### Rep generalisation: representation-agnostic data structure - -`Arena` must be representation-agnostic, mirroring how -`Buddy<>` is generic over its node `Rep` (see `buddy.h`). The -existing buddy ecosystem demonstrates the layering: - -- `buddy.h` — pure data structure, no representation. -- `largebuddyrange.h` defines `BuddyChunkRep` — a pagemap-backed Rep - (red bit at bit 8, layout chosen to coexist with the pagemap's - reserved low bits). -- `smallbuddyrange.h` defines `BuddyInplaceRep` — an inline Rep that - stores tree pointers in the free chunk itself (red bit at bit 0). - -`Arena` must support the same two representation paths so it -can eventually replace both `LargeBuddyRange` (pagemap) and -`SmallBuddyRange` (inline) in the standard pipeline. - -#### Rep concept - -`Rep` provides: - -- `using BinRep` — full RBTree Rep for the bin trees. -- `using RangeRep` — full RBTree Rep for the range tree. -- `get_variant(addr)` / `set_variant(addr, v)` — block variant tag. -- `get_large_size_chunks(addr)` / `set_large_size_chunks(addr, n)` — - precise chunk count for `Large` blocks. -- `can_consolidate(higher_addr)` — false at PAL allocation boundaries. - -Each inner `BinRep` / `RangeRep` is a complete RBTree Rep (same shape -as `BuddyChunkRep` / `BuddyInplaceRep`): provides `Handle`, -`Contents`, `null`, `root`, `ref`, `get`, `set`, `is_red`, `set_red`, -`compare`, `equal`, `printable`, `name`. **All bit-packing decisions -(red bit position, mask layout) are private to the Rep** — -`Arena` carries no `RED_BIT` / `VARIANT_MASK` / `META_MASK` -constants of its own. - -`Arena` instantiates `RBTree` and -`RBTree` directly. It never inspects the bit -layout used by the Rep. - -#### PagemapRep - -Lives in `largearenarange.h`. 
Privately owns its bit layout: - -- Bin tree node in pagemap entry at `addr`, Word::One/Two. `BinRep` - packs the red bit at bit 8 and the variant tag at bits 9–10 of - Word::One; bits 0–7 are reserved by the pagemap. -- Range tree node in pagemap entry at `addr + MIN_CHUNK_SIZE`, with - the same layout for `RangeRep`. -- Large-size chunk count stored as `count << 8` in Word::One of the - entry at `addr + 2*MIN_CHUNK_SIZE`. - -#### MockRep (test only) - -Lives in `src/test/func/backend_arena/backend_arena.cc`. Backs its -storage with an array of `mock_entry` and uses the same `RED_BIT = -1 << 8` layout (so it exercises the same code paths the pagemap Rep -will). The mock's `BinRep` and `RangeRep` are inner structs that own -their own ref/get/set/is_red/set_red implementations. - -#### Future inline Rep (not in this phase) - -Mirroring `BuddyInplaceRep`: tree pointers live inside the free -memory itself. `BinRep` / `RangeRep` would use pointer-low-bits for -red and variant tags. This is what enables a future -`Arena`-based replacement for `SmallBuddyRange`. - -### Boundary-bit consolidation check - -On platforms where `CONSOLIDATE_PAL_ALLOCS` is false (CHERI, Windows), -the pagemap sets a boundary bit on the first chunk of each PAL allocation -to prevent consolidation across allocation boundaries -(`BuddyChunkRep::can_consolidate` checks this). - -Arena's `add_block` consolidation must respect the same contract. -A new method on the Rep concept: - -``` -static bool can_consolidate(uintptr_t higher_addr); -``` - -Returns `true` if the block at `higher_addr` may be consolidated with the -block immediately below it. `add_block` checks this before each merge: - -- P ↔ A merge: `Rep::can_consolidate(addr)` (A is the higher address). -- A ↔ S merge: `Rep::can_consolidate(succ_addr)` (S is the higher address). - -MockRep: always returns `true` (no boundaries). -PagemapRep: returns `!get_metaentry_mut(higher_addr).is_boundary()`. 
- -### PagemapRep - -Templated on `Pagemap`, `MIN_SIZE_BITS`, and `MAX_SIZE_BITS` (mirroring -`Buddy`'s shape). `MIN_SIZE_BITS` is the log2 of the pagemap stride -(snmalloc's `MIN_CHUNK_BITS` when wired through `LargeArenaRange`); -`MAX_SIZE_BITS` is needed for the large-size-shift static assertion: - -``` -template< - SNMALLOC_CONCEPT(IsWritablePagemap) Pagemap, - size_t MIN_SIZE_BITS, - size_t MAX_SIZE_BITS> -struct PagemapRep { ... }; -``` - -Each free block uses pagemap entries at three offsets from its base -address (where `UNIT_SIZE = 1 << MIN_SIZE_BITS`): - -- **Unit 0** (`addr`): Word::One / Word::Two → bin-tree node. - Bits 9–10 of Word::One → variant tag. Bit 8 → RED_BIT. All coexist - because `TreeRep::set` preserves `META_MASK` on writes. -- **Unit 1** (`addr + UNIT_SIZE`): Word::One / Word::Two → - range-tree node (only for blocks ≥ 2 units). -- **Unit 2** (`addr + 2 * UNIT_SIZE`): Word::One → large chunk - count (only for blocks ≥ 3 units). Stored as `count << 8` to avoid - the 8 reserved low bits; recovered via `word.get() >> 8`. - -**Static assertions in PagemapRep** (catch configuration errors early): - -- `static_assert((VARIANT_MASK | RED_BIT) < UNIT_SIZE)` — metadata - bits don't collide with address bits. -- `static_assert(MetaEntryBase::is_backend_allowed_value(Word::One, - VARIANT_MASK | RED_BIT))` — all metadata bits are in the backend- - allowed range. -- `static_assert((MAX_SIZE_BITS - MIN_SIZE_BITS) + LARGE_SIZE_SHIFT <= - bits::BITS)` — shifted large size fits in a pagemap word. - -Method mapping: - -| Method | Implementation | -|--------|---------------| -| `ref_word(dir, addr)` | `get_metaentry_mut(addr).get_backend_word(dir ? One : Two)` | -| `ref_range_word(dir, addr)` | `get_metaentry_mut(addr + MCS).get_backend_word(dir ? 
One : Two)` | -| `get_variant(addr)` | `(ref_word(true, addr).get() & VARIANT_MASK) >> 9` | -| `set_variant(addr, v)` | RMW on `ref_word(true, addr)`: clear VARIANT_MASK, OR new value | -| `get_large_size_chunks(addr)` | `get_metaentry_mut(addr + 2*MCS).get_backend_word(One).get() >> 8` | -| `set_large_size_chunks(addr, s)` | `...get_backend_word(One) = s << 8` | -| `can_consolidate(addr)` | `!get_metaentry_mut(addr).is_boundary()` | - -(`MCS = MIN_CHUNK_SIZE`) - -`get_backend_word` auto-calls `claim_for_backend()` on first access to -an unowned entry, so pagemap ownership transitions happen implicitly. -The boundary bit (bit 0 of `meta`) is in the reserved-mask zone and is -preserved by both `claim_for_backend()` and `BackendStateWordRef::operator=`. - -### LargeArenaRange - -Outer template matches `LargeBuddyRange`'s shape so it is a drop-in -replacement in `Pipe<...>` compositions: - -``` -template< - size_t REFILL_SIZE_BITS, - size_t MAX_SIZE_BITS, - SNMALLOC_CONCEPT(IsWritablePagemap) Pagemap, - size_t MIN_REFILL_SIZE_BITS = 0> -class LargeArenaRange -{ -public: - template<typename ParentRange = EmptyRange<>> - class Type : public ContainsParent<ParentRange> - { - using ContainsParent<ParentRange>::parent; - - using PagemapRepT = PagemapRep<Pagemap, MIN_CHUNK_BITS, MAX_SIZE_BITS>; - Arena<PagemapRepT, MIN_CHUNK_BITS, MAX_SIZE_BITS> arena; - size_t requested_total = 0; - - public: - static constexpr bool Aligned = true; - static constexpr bool ConcurrencySafe = false; - using ChunkBounds = capptr::bounds::Arena; - - capptr::Arena<void> alloc_range(size_t size); - void dealloc_range(capptr::Arena<void> base, size_t size); - }; -}; -``` - -**`alloc_range(size)`**: - -1. `SNMALLOC_ASSERT(size >= MIN_CHUNK_SIZE)`. -2. `SNMALLOC_ASSERT((size & (MIN_CHUNK_SIZE - 1)) == 0)` — size must be - a chunk multiple, but no power-of-two restriction. The arena handles - any size in `[MIN_CHUNK_SIZE, 2^MAX_SIZE_BITS)`. -3. `n_chunks = size >> MIN_CHUNK_BITS`. -4.
Oversize bypass: if `n_chunks >= bits::one_at_bit(MAX_SIZE_BITS - MIN_CHUNK_BITS)`, - delegate to `parent.alloc_range(size)` (if `ParentRange::Aligned`), - else return `nullptr`. Same as `LargeBuddyRange`. -5. `auto [addr, actual] = arena.remove_block(n_chunks)`. The arena - carves exactly `n_chunks` chunks via `Bins::carve`; `actual` is - always `n_chunks` on success and is asserted as such. -6. If `addr != 0`, return - `capptr::Arena::unsafe_from(reinterpret_cast(addr))`. -7. If `addr == 0`, call `refill(size)`. - -**`dealloc_range(base, size)`**: - -1. `SNMALLOC_ASSERT(size >= MIN_CHUNK_SIZE)`, - `SNMALLOC_ASSERT((size & (MIN_CHUNK_SIZE - 1)) == 0)` — chunk multiple - only; no power-of-two restriction. -2. Oversize bypass: if `size >= 2^MAX_SIZE_BITS`, delegate to - `parent.dealloc_range(base, size)`. Same SFINAE guard as - `LargeBuddyRange::parent_dealloc_range`. -3. `n_chunks = size >> MIN_CHUNK_BITS`, - `auto [ov_addr, ov_size] = arena.add_block(base.unsafe_uintptr(), n_chunks)`. -4. If overflow (`ov_addr != 0`): call `dealloc_overflow(ov_addr, - ov_size)`. - -**`dealloc_overflow(addr, size_chunks)`**: - -Overflow from `add_block` is forwarded directly to the parent's -`dealloc_range`. The parent does not require power-of-two input — all -non-Buddy ranges accept any chunk-aligned size, and `LargeArenaRange` -itself accepts any chunk-multiple size — so no decomposition is needed. - -``` -void dealloc_overflow(uintptr_t addr, size_t size_chunks) -{ - if constexpr (MAX_SIZE_BITS != (bits::BITS - 1)) - { - auto base = capptr::Arena::unsafe_from( - reinterpret_cast(addr)); - size_t size_bytes = size_chunks << MIN_CHUNK_BITS; - parent.dealloc_range(base, size_bytes); - } - else - { - // Global range: no parent to return to. - SNMALLOC_CHECK(false && "Global range overflow should not happen"); - } -} -``` - -When `MAX_SIZE_BITS == BITS - 1` (global range), the arena covers the -entire address space. 
Overflow would mean all managed memory has -coalesced — this should not happen in normal operation. If it does, -abort (matching `LargeBuddyRange`'s behaviour for the unreachable -case). - -**`refill(size)`** — closely follows `LargeBuddyRange::refill`: - -For `ParentRange::Aligned` (the standard path): - -1. Compute `refill_size = min(REFILL_SIZE, requested_total)`, raised to at - least `max(MIN_REFILL_SIZE, size)`, then rounded up to the next power of two. -2. `auto refill_range = parent.alloc_range(refill_size)`. -3. If `refill_range != nullptr`: - - `requested_total += refill_size`. - - `remainder_size = refill_size - size`. - - If `remainder_size > 0`: - `arena.add_block(refill_range.unsafe_uintptr() + size, - remainder_size >> MIN_CHUNK_BITS)`. - Handle overflow (send to parent). - - Return `refill_range`. -4. If `nullptr`, return `nullptr`. - -The returned portion (`refill_range` to `refill_range + size`) bypasses -the arena entirely — it is not inserted or tracked. The remainder is -added to the arena for future allocations. Since the remainder comes -from a fresh refill and has no neighbours in the arena, `add_block` -performs a simple insertion with no consolidation (boundary bit on the -refill base may prevent consolidation with any pre-existing blocks -below it, which is correct). - -For the unaligned parent path: over-allocate `2 * size` (with overflow -check), add everything to the arena via `add_range`, then call -`alloc_range(size)` recursively. - -**`add_range(base, length)`** trims `(base, length)` to chunk boundaries -on both ends (PalRange returns page-aligned but not chunk-aligned -addresses) and inserts a single block via `add_block` — no power-of-two -decomposition is needed because `add_block` accepts any size in -`[1, 2^CHUNKS_BITS)` chunks. Any overflow from `add_block` is forwarded -to `dealloc_overflow`.
- -Safety guards (both from `LargeBuddyRange`): -- `static_assert((REFILL_SIZE < bits::one_at_bit(MAX_SIZE_BITS)) || - ParentRange::Aligned)` — prevents the unaligned path from adding a - block that violates `add_block`'s `size_chunks < 2^(MAX_SIZE_BITS - MIN_CHUNK_BITS)` - precondition. -- Runtime: `SNMALLOC_ASSERT(refill_size < bits::one_at_bit(MAX_SIZE_BITS))` - — catches the computed `refill_size` (which may be larger than - `REFILL_SIZE` when `needed_size = 2 * size` dominates). - -### Static properties - -- `Aligned = true`: Arena's carving ensures that a request of - size `n` (power-of-two, chunk-aligned) is placed at an `n`-aligned - address within the source block. For non-power-of-two requests, the - bin scheme's alignment rules still hold (alignment matches the - lowest set bit of the size class). -- `ConcurrencySafe = false`: same as `LargeBuddyRange`. -- `ChunkBounds = capptr::bounds::Arena`: same as `LargeBuddyRange`. - -### MAX_SIZE_BITS = BITS - 1 (global range) - -The global `LargeBuddyRange` uses `MAX_SIZE_BITS = BITS - 1`, meaning -the buddy can hold up to half the address space. For LargeArenaRange: -the maximum block size in chunks is `2^(MAX_SIZE_BITS - MIN_CHUNK_BITS)`. -On 64-bit with `MIN_CHUNK_BITS = 14`, this gives a chunk-bit width of -49 — the arena can hold up to 2^49 chunks. The arena's overflow path -returns consolidated blocks that reach this size, handled by -`dealloc_overflow` (see above). - -The `large_size_chunks` field (stored shifted by 8 in a pagemap word) -needs at most 49 bits, which fits in the 56 backend-usable bits of a -64-bit pagemap word. A `static_assert((MAX_SIZE_BITS - MIN_SIZE_BITS) + -LARGE_SIZE_SHIFT <= bits::BITS)` in `PagemapRep` catches configurations -where this would overflow. - -## Phases - -### Phase 9: Rep generalisation + boundary support - -**Status**: implemented; staged (not committed); awaiting review. - -Changes to `arena.h`: - -1. 
Delete the private `WordRef` nested struct, the `TreeRep` - template, and all bit-layout constants - (`RED_BIT`/`VARIANT_MASK`/`META_MASK` and `BACKEND_RESERVED_MASK`). - `Arena` is now representation-agnostic, mirroring how - `buddy.h` is generic over its node `Rep`. -2. Replace the internal `using BinRep = TreeRep` / - `RangeRep = TreeRep` aliases with direct use - of `typename Rep::BinRep` and `typename Rep::RangeRep` — full - RBTree Reps supplied by the user, owning their own bit packing. -3. Update the Rep concept doc to require `BinRep`, `RangeRep`, - `get_variant`/`set_variant`, - `get_large_size_chunks`/`set_large_size_chunks`, and - `can_consolidate`. -4. Add `can_consolidate` calls in `add_block` before each merge - (predecessor and successor) and update the invariant clauses to - tolerate boundary-blocked adjacency. - -Changes to `backend_arena.cc` (test file): - -5. Define `BackendArenaWordRef` (test-only proxy) at the top of the - test file. -6. MockRep grows inner `BinRep` and `RangeRep` structs that each - provide the full RBTree Rep interface (ref/get/set/is_red/etc.) - over the mock-entry array. Each owns its own private bit layout - (red bit at bit 8 to match the PagemapRep layout). -7. MockRep keeps top-level `get_variant`/`set_variant`/large-size - accessors and adds `can_consolidate(uintptr_t) → true`. -8. New test: verify that a MockRep variant with `can_consolidate` - returning false at a specific address prevents consolidation across - that boundary. Test both predecessor and successor merges being - independently blocked. - -**Test gate**: all existing Arena tests pass unchanged; new -boundary test passes. - -### Phase 10: PagemapRep + LargeArenaRange + tests - -**Status**: implemented and tested. Committed in `9c1ca745`. - -> **Note**: the design notes below were written before Phase 10d -> (bytes-throughout). 
The as-built code uses byte sizes everywhere -> at the arena/range API and a unified `parent_dealloc(uintptr_t, -> size_t)` helper in place of the old `dealloc_overflow` / -> `parent_dealloc_range` pair. See the Phase 10d section for the -> current shape. Where the notes below say `size_chunks`, the -> implementation uses bytes; where they say `dealloc_overflow`, the -> implementation uses `parent_dealloc`. - -**Phase 10b refactor (also implemented):** `Arena` and `PagemapRep` -were both retemplated to mirror `Buddy`'s 3-parameter shape: - -- `template<typename Rep, size_t MIN_SIZE_BITS, size_t MAX_SIZE_BITS> class Arena` - — the always-zero `MIN_CHUNKS_BITS` placeholder is gone, and the unit - of allocation is named explicitly via `MIN_SIZE_BITS` instead of being - implicitly tied to snmalloc's global `MIN_CHUNK_BITS`. Internally, - `UNIT_SIZE = 1 << MIN_SIZE_BITS` and `CHUNKS_BITS = MAX_SIZE_BITS - - MIN_SIZE_BITS` replace the old `MIN_CHUNK_SIZE` / `MAX_CHUNKS_BITS` - usages. -- `template<SNMALLOC_CONCEPT(IsWritablePagemap) Pagemap, size_t MIN_SIZE_BITS, size_t MAX_SIZE_BITS> class PagemapRep` - — owns the large-size-shift capacity static_assert - `(MAX_SIZE_BITS - MIN_SIZE_BITS) + LARGE_SIZE_SHIFT <= bits::BITS`; - `LARGE_SIZE_SHIFT` is private. The Rep's pagemap stride is - `UNIT_SIZE = 1 << MIN_SIZE_BITS`. -- `LargeArenaRange::Type` wires snmalloc's `MIN_CHUNK_BITS` as - `MIN_SIZE_BITS` for both PagemapRep and Arena: - `PagemapRep<Pagemap, MIN_CHUNK_BITS, MAX_SIZE_BITS>` and - `Arena<PagemapRepT, MIN_CHUNK_BITS, MAX_SIZE_BITS>`. - -New file: `src/snmalloc/backend_helpers/largearenarange.h` - -1. `PagemapRep<Pagemap, MIN_SIZE_BITS, MAX_SIZE_BITS>` — full Rep - implementation using pagemap entries as described above, with all - static assertions. -2. `LargeArenaRange` — the Range wrapper with `alloc_range`, - `dealloc_range`, `refill`, and `dealloc_overflow`. - -Modified: `src/snmalloc/backend_helpers/backend_helpers.h` - -3. Add `#include "largearenarange.h"` so the new header is - available through the standard include path. - -New file: `src/test/func/backend_arena_range/backend_arena_range.cc` - -4.
Test with snmalloc's `BasicPagemap` (or a test-appropriate pagemap): - - PagemapRep word round-trips (variant, tree words, large size). - - LargeArenaRange `alloc_range` / `dealloc_range` smoke test with - a simple parent range. - - Refill: verify that allocating when the arena is empty triggers a - parent refill and returns memory. - - Overflow: verify that deallocating a block that triggers arena-scale - consolidation forwards the overflow to the parent via - `dealloc_overflow`. - - Non-power-of-two sizes: verify `alloc_range` / `dealloc_range` work - for chunk-multiple but non-power-of-two sizes, including sizes that - are not representable size classes. The arena carves exactly the - requested chunk count internally, so callers see no excess. - - Boundary: verify that a boundary bit in the pagemap prevents - consolidation of adjacent blocks from different refills (when - `CONSOLIDATE_PAL_ALLOCS` is false). - - Test at largest configured `MAX_SIZE_BITS` values, especially - `MAX_SIZE_BITS == bits::BITS - 1` if feasible. - -Modified: `CMakeLists.txt` - -5. Register `backend_arena_range` in `TESTLIB_ONLY_TESTS`. - -**Test gate**: LargeArenaRange tests pass; existing tests unaffected. - -### Phase 11: Final review - -Per `claude.md` mandatory review checkpoints: - -- Spawn a fresh-context reviewer on the full diff (Phases 9–10). -- Address findings, loop until clean. - -**Test gate**: full ctest run passes; reviewer reports no issues. - -### Phase 10d: Bytes throughout (replace chunk-count internal API) - -**Goal**: drop the `size_chunks` / chunk-count internal convention from -`Arena` and `PagemapRep` so byte sizes (multiples of UNIT_SIZE) -flow end-to-end, removing the `<< MIN_CHUNK_BITS` conversion dance at -the LargeArenaRange ↔ Arena boundary and the matching reverse -shifts inside the range wrapper. 
- -**Substep 1 (DONE)**: generalise `ArenaBins` on a new -`MIN_SIZE_BITS` template parameter so its `range_t.size`, carve -arguments, and `max_supported_size()` are byte sizes (multiples of -`UNIT_SIZE = 1 << MIN_SIZE_BITS`). Renames inside Bins: -`size_chunks → size`, `align_chunks → align`, `max_supported_chunks -→ max_supported_size`. Tests cover `MIN_SIZE_BITS ∈ {0, 4, 14}`. - -**Substep 2 (DONE)**: flip `Arena`, `PagemapRep`, and -`LargeArenaRange` to bytes throughout: -- `Arena` now uses - `ArenaBins`; `add_block` / `remove_block` - take/return bytes; `addr_to_chunk` / `chunk_to_addr` / `CHUNKS_BITS` - deleted; `variant_of(size, addr)` works in byte units with - parity from `(addr >> MIN_SIZE_BITS) & 1`. -- `remove_block(size)` returns a scalar `addr_t` (0 = failure). The - size in the returned pair was tautological (always equal to the - requested `size` on success). -- `PagemapRep::get_large_size` / `set_large_size` (renamed from - `*_chunks`) take and return bytes; internal storage still scales - by `MIN_SIZE_BITS` so the shifted field fits a pagemap word. -- `LargeArenaRange::add_range` / `dealloc_range` / - `parent_dealloc` (unified from `parent_dealloc_range` and - `dealloc_overflow`) drop chunk-count conversions; `add_range` - uses `bits::align_up` / `bits::align_down`. -- Test scaffolding (`MockRep`, `BoundaryMockRep`, `Oracle`) - updated; tests introduce `chunk_size(N) = N << MIN_CHUNK_BITS` - helper. - -**Test gate**: `func-backend_arena-check`, `func-backend_arena_bins-check`, -`func-backend_arena_range-check` all pass; full `ninja` build clean. - -**Remaining**: code-review checkpoint for Phase 10d combined diff -before opening a PR; then proceed to Phase 12 (pipeline integration). 
- -*Pipeline integration (replacing `LargeBuddyRange` in `standard_range.h` -and `meta_protected_range.h`) is a separate follow-up plan: "Update -backend to use LargeArenaRange."* - -## Files added / changed (anticipated, this phase) - -- Modified: `src/snmalloc/backend_helpers/arena.h` — - representation-agnostic: delete private `WordRef`, `TreeRep`, and - all bit-layout constants (`RED_BIT`/`VARIANT_MASK`/`META_MASK`/ - reserved); use `Rep::BinRep` and `Rep::RangeRep` directly; - `can_consolidate` check in `add_block`; invariant clauses updated. -- New: `src/snmalloc/backend_helpers/largearenarange.h` — - `PagemapRep` + `LargeArenaRange`. -- Modified: `src/snmalloc/backend_helpers/backend_helpers.h` — include - `largearenarange.h`. -- Modified: `src/test/func/backend_arena/backend_arena.cc` — define - `BackendArenaWordRef` test helper at top of file; MockRep updated - (`BackendArenaWordRef` returns, `can_consolidate`); boundary tests. -- New: `src/test/func/backend_arena_range/backend_arena_range.cc` — - Range wrapper tests. -- Modified: `CMakeLists.txt` — register `backend_arena_range` test. - -## Key design decisions - -1. **Representation-agnostic data structure** — `Arena` - carries no bit-layout constants. All red/variant packing decisions - live in the user-supplied `Rep::BinRep` / `Rep::RangeRep`, matching - how `BuddyChunkRep` and `BuddyInplaceRep` each own their own - layouts. This is what makes a future inline Rep (to replace - `SmallBuddyRange`) possible. - -2. **PagemapRep variant in bin-tree Word::One** — PagemapRep packs - the variant tag at bits 9–10 of Word::One alongside the red bit - (bit 8) and child pointer (bits ≥ MIN_CHUNK_BITS). These are - private constants inside PagemapRep, not exposed by Arena. - -3. **Large size stored shifted** — PagemapRep stores the chunk count - as `count << 8` to avoid the pagemap's reserved low byte; recovered - via `>> 8`. Guarded by `static_assert((MAX_SIZE_BITS - MIN_CHUNK_BITS) + 8 <= bits::BITS)`. - -4. 
**Boundary checks in Arena** — not in LargeArenaRange. - Consolidation decisions happen inside `add_block`, so the boundary - check must be there. The Rep concept cleanly abstracts this via - `can_consolidate`. - -5. **Refill returns prefix directly** — like LargeBuddyRange, the - first `size` bytes of a refill bypass the arena. Only the remainder - enters the arena. This avoids unnecessary tree operations on the - hot path. - -6. **PagemapRep auto-claims entries** — `get_backend_word` calls - `claim_for_backend()` on first access. No explicit ownership - management needed in Arena or LargeArenaRange. - -7. **Overflow forwarding** — `add_block` overflow may produce non- - power-of-two sizes (consolidated blocks from multiple PAL allocs). - `dealloc_overflow` forwards the overflow directly to the parent's - `dealloc_range`; no power-of-two decomposition is needed because - `LargeArenaRange` (which is what replaces `LargeBuddyRange` in - the pipeline) accepts any chunk-multiple size. - -8. **`BackendArenaWordRef` lives in the test file** — the in-tree - `PagemapRep` returns `BackendStateWordRef` directly (mirroring - `BuddyChunkRep` in `largebuddyrange.h`). The test-only - `BackendArenaWordRef` proxy is defined in - `src/test/func/backend_arena/backend_arena.cc` and used only by - MockRep, so the in-tree headers carry no test scaffolding. - -9. **No power-of-two restriction on the public API** — `alloc_range` - and `dealloc_range` accept any chunk-multiple size; the only - restriction is `size >= MIN_CHUNK_SIZE` and `size < 2^MAX_SIZE_BITS`. - The arena's `Bins::carve` delivers exactly the requested chunk - count, rolling any size-class rounding remainder into the post - fragment that is re-inserted internally. SC rounding therefore - stays a private arena detail. This lifts a restriction inherited - from `LargeBuddyRange`. 
- -## Resolved during plan review - -- Overflow handling: `add_block` can return non-power-of-two sizes when - blocks from multiple PAL allocations consolidate. `dealloc_overflow` - forwards the overflow directly to the parent — no decomposition is - required because `LargeArenaRange` itself accepts arbitrary - chunk-multiple sizes and replaces `LargeBuddyRange` in the pipeline. - (Rubber-duck finding #2 superseded by Option B refactor.) -- Handle visibility / layering: original plan promoted bit-layout - constants and a `BackendArenaWordRef` proxy to namespace scope so - the in-tree header and tests could share them. Subsequent review - observed that this broke the Buddy/`BuddyChunkRep`/`BuddyInplaceRep` - layering: the data structure should be representation-agnostic. - Resolved by making `Arena` carry no bit-layout state and - requiring `Rep::BinRep` / `Rep::RangeRep` to own all packing - decisions. `PagemapRep` keeps its layout private; the - test `BackendArenaWordRef` lives in the test file alongside MockRep. - (Rubber-duck finding #1, then revised after layering review.) -- Size shift overflow: `static_assert((MAX_SIZE_BITS - MIN_CHUNK_BITS) + 8 <= BITS)` in - `PagemapRep` prevents shift overflow. (Rubber-duck finding #4.) -- Unaligned refill guard: both static assert AND runtime assert copied - from `LargeBuddyRange` to prevent `add_block` precondition violation. - (Rubber-duck finding #6, strengthened in second review.) -- Pipeline integration (numbered Phase 11 when this finding was raised; - Phase 11 is now the final review, and the Phase 10d notes call pipeline - integration Phase 12) removed from this plan's scope — - separate follow-up plan. (Rubber-duck finding #8.) -- `PagemapRep` templated on `MIN_SIZE_BITS` and `MAX_SIZE_BITS` so the - size-shift static_assert is in scope. (Second review finding #1.) -- `remove_block` exact-size guarantee is scoped to power-of-two - requests only. (Second review finding #4; superseded by key design - decision 9 — `Bins::carve` now delivers exactly the requested size for - any chunk-multiple request.)
- ---- - -## Files added / changed (Arena phase, completed) - -- New: `src/snmalloc/backend_helpers/arenabins.h` — - `range_t`, `carve_t`, `carve`, `max_supported_chunks`, and nested - `Bitmap` with `add` / `find_for_request` / `clear` (public surface); - the size-class encoding (`bitmap_info_t`, `carve_info_t`, constexpr - `BinTable`, `bitmap_info_for_request` / `carve_info_for_request`, - `bin_index`) is private and reachable via - `ArenaBinsTestAccess` (forward-declared in the header, - defined in the test cc) for unit tests. Templated on - `INTERMEDIATE_BITS` for testability. -- New: `src/snmalloc/backend_helpers/arena.h` — the data structure, - templated on a `BackendArenaRep` concept exposing variant-tag and - node/size accessors (no pagemap-probing API). -- New: `src/test/func/backend_arena_bins/backend_arena_bins.cc` — bin - classification tests and `find_for_request` tests for - `B ∈ {1, 2, 3}`, using `bin_subsets` as the canonical "serves" - predicate. -- New: `src/test/func/backend_arena/backend_arena.cc` — data-structure - tests with a mock Rep (array-backed pagemap, modelled on `redblack.cc`). -- Modified: `src/snmalloc/ds_core/redblacktree.h` — `neighbours(K)` - helper on `RBTree` returning `(largest < K, smallest > K)` in one walk. -- Modified: `src/test/func/redblack/redblack.cc` — randomised - `neighbours(K)` tests against `std::set::lower_bound` / - `upper_bound` as oracle. - -No in-tree code path is changed in this phase: the existing -`LargeBuddyRange` continues to be the active large-block allocator. - -## Resolved during plan review - -- One Bin tree per IDEA servable-set bin (not per size class or per - exponent). -- Scope is the Arena data structure + tests only. -- The pagemap encoding carries a 2-bit **variant tag** - (`Min` / `TwoMin` / `Large`) on the first entry of each free block. - Tree membership — not the tag — is the source of truth for "is this - block free?". No transient `BackendOwned` / "claimed" tag is required. 
-- **No pagemap probing.** All adjacency lookups are restricted to this - `Arena`'s own RBTrees: non-min neighbours come from a single - `Range.neighbours(addr_A)` walk that returns both - `(largest < addr_A, smallest > addr_A)`; min-size neighbours come from - `MinSizeBin.find(addr_A ± MIN_CHUNK_SIZE)`. The pagemap is never read - at speculative addresses (concurrency hazard and no defined contract - for pagemap entries the Arena does not own). -- Free blocks may have **arbitrary chunk counts**, not just exact - size-class sizes — carving produces non-class remainders. `bin_index` - operates on `(addr_chunks, size_chunks)` pairs; `Large` blocks store - their precise chunk count in the third pagemap entry. -- Write-ordering rule: when adding a free block, the variant tag and any - auxiliary fields are written before the final RB-tree insertion that - makes the block reachable; when removing, the block is unlinked from - its trees before its pagemap entries are reused. -- Predecessor-Range-entry-reuse only applies when `P` is non-min. -- `add_block` returns `{0, 0}` on success; on overflow it returns the - unabsorbed range, mirroring `Buddy::add_block`'s overflow-return - contract. Oversize inputs (`size_chunks >= 2^(MAX_SIZE_BITS - MIN_CHUNK_BITS)`) bypass - `Arena` entirely — the wrapping `LargeArenaRange` layer - handles them before calling `add_block`, and `add_block` asserts - `size_chunks < 2^(MAX_SIZE_BITS - MIN_CHUNK_BITS)`. The only overflow case is - consolidation growing a coalesced block to exactly - `2^(MAX_SIZE_BITS - MIN_CHUNK_BITS)` (the consolidated range is returned, neighbours - having been removed first). The future `LargeArenaRange` wrapper is - responsible for handling overflow; the standalone `Arena` only - exposes the contract. -- `BackendArenaRep` is a chunk-keyed accessor concept (variant tag plus - word/size accessors for entries 1–3). 
`Arena` builds two - internal `RBTree`-Rep adapters (`BinRep`, `RangeRep`) over it; user - code never sees the adapter shape. -- Backend chunk size classes are a new chunk-unit size-class scheme in - `arenabins.h` (not bytes), independent of the - power-of-two-only large variant of front-end `sizeclass_t`, with - low-exponent special cases handled in the spirit of - `bits::from_exp_mant`. -- `Arena` uses byte-size - exponent bounds with **exclusive max** semantics, matching the existing - `Buddy<..., MIN, MAX>`. -- Multi-`B` testing is via a templated bin-table generator in a single - test binary, not via separate CMake configurations. -- Phase 5 verifies the reuse optimisation via Range-tree insert/remove - *call counters* at the `Arena` layer (no `RBTree` modification). - -## Still open (resolve during implementation) - -- ~~Exact bit positions in the first-word pagemap encoding for the - variant-tag field.~~ **Resolved** (Phase 3+4): bits 9–10 encode - `ArenaVariant` (`VARIANT_MASK = 0x600`); bit 8 is `RED_BIT`; - bits 0–7 are `BACKEND_RESERVED_MASK`. Documented in - `arena.h`. -- ~~Whether Bin tree roots are stored flat - (`Array`) or exponent-keyed.~~ **Resolved** - (Phase 3+4): flat `stl::Array`. -- Whether the future memcpy `offset` field is best placed in the second - word of every pagemap entry, in dedicated entries, or in a side table. - Out of scope for this phase; flagged for the memcpy-fix plan to design. -- Whether `INTERMEDIATE_BITS=4` (34 bins/exp) needs to be tested in this - phase. Currently `B ∈ {1, 2, 3}` only. - ---- - -# Phase 12: Update backend to use LargeArenaRange - -## Status: implementation complete, awaiting commit approval - -Substitution implemented and tested in the working tree (uncommitted on -top of `9c1ca745`). 
`Arena::add_block` had a latent -out-of-region pagemap-probe bug in its successor-min branch that -became reachable once `LargeArenaRange` started serving fixed-region -allocations; fixed in this phase (see "Issue found during Phase 12 -test run" below). Full ctest suite passes (86/86). - -Diff: 6 files, 183/45 +/- (PLAN.md, both pipeline range headers, -`arena.h`, `arenabins.h`, `backend_arena.cc`). - -## Goal - -Replace every `LargeBuddyRange` instantiation in the range -pipelines with `LargeArenaRange`. After this phase, snmalloc uses -the Arena bin-tree allocator instead of the power-of-two buddy -for all large-range management. The `LargeBuddyRange` and -`BuddyChunkRep` classes are **not deleted** — they remain available -for alternative configurations and external embedders. Only the -default pipeline wiring changes. - -## Scope - -- Modify `standard_range.h` — replace all `LargeBuddyRange` with - `LargeArenaRange` (same template parameters). -- Modify `meta_protected_range.h` — replace all `LargeBuddyRange` - with `LargeArenaRange` (same template parameters). -- **No other source files change.** `LargeArenaRange` is already a - drop-in replacement: same template signature, same `Type` - shape, same `alloc_range`/`dealloc_range` API, same `Aligned`, - `ConcurrencySafe`, and `ChunkBounds` constants. - -## Pre-conditions - -- Phase 10 (LargeArenaRange) is committed and all its tests pass - (commit `9c1ca745`). -- Phase 11 (final review of Phases 9–10) was waived by the user; - Phase 12 proceeds without it. -- Baseline: the checkout builds and all tests pass before this change. - Recorded after 9c1ca745: 86/86 ctest passed, no warnings. - -## Analysis of every LargeBuddyRange instantiation - -### `standard_range.h` - -**1. GlobalR** -```cpp -LargeBuddyRange -``` -→ `LargeArenaRange` - -- `MAX_SIZE_BITS = bits::BITS - 1` → global-range mode (no parent - dealloc). `LargeArenaRange` handles this identically. 
-- `MIN_REFILL_SIZE_BITS = MinSizeBits` (Windows: 16, otherwise PAL- - dependent). `LargeArenaRange` passes this through. -- Parent is `Base` (PalRange + PagemapRegisterRange chain). Parent is - **unaligned** on PALs without `AlignedAllocation` (e.g. Linux mmap) - and aligned otherwise. `LargeArenaRange::refill` currently still - carries the aligned/unaligned dual path inherited from - `LargeBuddyRange`; collapsing this into a single path is deferred to - Phase 13. - -**2. LargeObjectRange (local cache)** -```cpp -LargeBuddyRange -``` -→ `LargeArenaRange` - -- `MAX_SIZE_BITS = LocalCacheSizeBits = 21` (2 MiB). Non-global mode. - Overflow goes to parent. -- `LargeArenaRange::parent_dealloc` forwards directly to parent - without decomposition (single block returned by - `Arena::add_block` when consolidation reaches the arena-scale - upper bound). The size is a chunk multiple up to `2^MAX_SIZE_BITS`, - not necessarily power-of-two — the parent must accept arbitrary - chunk-multiple sizes. -- Wrapped in `StaticConditionalRange` — no impact on the substitution. - -### `meta_protected_range.h` - -**3. GlobalR** — identical to standard_range.h #1. - -**4. CentralObjectRange** -```cpp -LargeBuddyRange -``` -→ `LargeArenaRange` - -- `MIN_REFILL_SIZE_BITS = 0` (default). Global-range mode. - -**5. CentralMetaRange** -```cpp -LargeBuddyRange -``` -→ `LargeArenaRange` - -- Global-range mode. - -**6. CentralMetaRange conditional huge-page buddy** -```cpp -stl::conditional_t< - (max_page_chunk_size_bits > MIN_CHUNK_BITS), - LargeBuddyRange< - max_page_chunk_size_bits, max_page_chunk_size_bits, - Pagemap, page_size_bits>, - NopRange> -``` -→ Replace `LargeBuddyRange` with `LargeArenaRange` inside the - `conditional_t`. - -- This is a small local cache for huge-page consolidation. - `MAX_SIZE_BITS = max_page_chunk_size_bits` (typically - `page_size_bits` when page_size_bits > MIN_CHUNK_BITS, e.g. - huge pages at 21 bits). -- Non-global mode. 
Overflow forwarded to parent as one consolidated - chunk-multiple block via `parent_dealloc`. - -**7. ObjectRange (local)** -```cpp -LargeBuddyRange -``` -→ `LargeArenaRange` - -- Same shape as standard_range.h #2. - -**8. MetaRange (local)** -```cpp -LargeBuddyRange -``` -→ `LargeArenaRange` - -- `REFILL_SIZE_BITS = 21 - 6 = 15`. Global-range mode. - `MIN_REFILL_SIZE_BITS = 0`. - -## Implementation - -The change is a mechanical text substitution — replace the string -`LargeBuddyRange` with `LargeArenaRange` in both files. No -template parameters, no API calls, no structural changes. - -### Step 1: Replace LargeBuddyRange → LargeArenaRange - -In `src/snmalloc/backend/standard_range.h`: -- 2 instantiations of `LargeBuddyRange<` (GlobalR, LargeObjectRange). - -In `src/snmalloc/backend/meta_protected_range.h`: -- 6 instantiations of `LargeBuddyRange<` (GlobalR, CentralObjectRange, - CentralMetaRange, the `conditional_t` huge-page cache, - ObjectRange, MetaRange). - -### Step 2: Verify include paths - -Both files include `"../backend/backend.h"` which includes -`"../backend_helpers/backend_helpers.h"` which already includes -`"largearenarange.h"`. **No new includes needed.** - -### Step 3: Build and test - -- Full `ctest` suite must pass. This is the primary validation: - hundreds of functional tests exercise the full allocator pipeline. -- Specific tests to watch: - - `func-memory-fast` — core malloc/free workloads - - `func-pool-fast` — pool allocator - - `func-domestication-fast` — boundary/domestication - - `func-fixed_region-fast` — fixed-region (uses `FixedRangeConfig` - which uses `StandardLocalState`) - - `perf-*` — performance tests (functional correctness only) - -**Test gate**: full `ctest` passes. No new tests needed — the existing -test suite exercises the pipeline end-to-end. 
- -### Issue found during Phase 12 test run: out-of-region pagemap probe - -`func-fixed_region_alloc-check` segfaulted in `PagemapRep::can_consolidate` -when `Arena::add_block` was called with a block whose -`succ_addr = addr + size` sat one chunk past the registered pagemap -range (the last 8 MiB of a 256 MiB FixedRange). The bug shape matches -the `buddy.h:90-93` comment exactly: `can_consolidate` reads the -pagemap entry at `succ_addr`, and that read is only safe once a -tree-membership test has confirmed the address is in our region. - -**Fix.** In `Arena::add_block`, the successor-min branch was -reordered so the tree-membership check (`contains_min(succ_addr)`) -short-circuits before the pagemap probe (`Rep::can_consolidate`). -All other can_consolidate call sites already had their preconditions -established (either `addr` is the input block, or the address was -returned from `range_tree.neighbours()` and is in the tree). - -**Regression coverage.** `MockRep` was extended with a per-chunk -`boundary` flag stored on `mock_entry`. `MockRep::can_consolidate(addr)` -now returns `!mock_store[mock_index(addr)].boundary` — faithful to the -real `PagemapRep::can_consolidate` reading `entry.is_boundary()`. The -`mock_index` bounds assertion fires on any out-of-range probe, so the -unsafe pattern trips in unit tests rather than only as a segfault in -release builds. A new test `test_block_at_arena_top_edge` adds a block -whose `succ_addr` sits one past the arena's pagemap; without the -reorder this test reproduces the original failure. - -This unification also subsumed the previous `BoundaryMockRep` and its -`boundary_addrs` global `std::set`: the four boundary tests -(`test_boundary_blocks_predecessor`, `test_boundary_blocks_successor`, -`test_boundary_partial`, `test_boundary_blocks_min_predecessor`) now -run on `Arena` and set `mock_store[mock_index(addr)].boundary = true` -instead. Net −35 lines in `backend_arena.cc`. 
- -A leftover `throw "..."` in `arenabins.h:807` (used as a -constexpr-failure trick in the `BinTable` constructor) caused a build -failure in `-fno-exceptions` configurations during Phase 12. Replaced -with `SNMALLOC_CHECK(false && "...")`, which is non-constexpr and -fails compile-time evaluation the same way without requiring -exception support. - -### Step 4: Retire the `ParentRange::Aligned` concept - -**Deferred to Phase 13.** Originally listed here but moved out for the -following reasons (rubber-duck review): -- It touches `LargeBuddyRange`, which Phase 12 explicitly keeps - available for alternative configurations / embedders. -- It changes the public range concept (every pass-through range loses - a static field) — a structural change, not a wiring change. -- It would split Phase 12 across an atomic-substitution commit and a - separate concept-cleanup commit anyway; better to make that split - explicit in the plan. - -Phase 12 ends after Step 3 with the test suite green. - -## Investigated and dropped: Retire `ParentRange::Aligned` - -**Status: dropped on review.** Phase 13 was deferred from Phase 12 with -the intent of collapsing `LargeArenaRange::refill`'s two-path -conditional and (optionally) removing `ParentRange::Aligned` from the -range concept. Closer inspection of the existing code found the -conditional is load-bearing, not vestigial: - -- **The two paths give different capabilities, not just different - efficiencies.** The aligned-parent path serves caller sizes up to - `(1 << MAX_SIZE_BITS) - 1`. The unaligned-parent path's - `while (needed_size <= refill_size)` guard caps caller size at - ~`REFILL_SIZE / 2`. Unifying on the unaligned strategy reduces - capability for aligned-parent configs. 
- -- **The aligned-parent path's carve shortcut is precise, not a perf - optimisation.** It hands the caller's `size` bytes back directly - and calls `add_range(refill + size, refill_size - size)` — - passing `refill_size - size` (strictly less than `refill_size`) - to `add_block`, which satisfies `add_block`'s - `size < 2^MAX_SIZE_BITS` precondition even when - `REFILL_SIZE_BITS == MAX_SIZE_BITS` (the `LargeObjectRange` config - in `standard_range.h:52-56`). A unified "add the whole refill then - recurse" path violates that precondition for the same config. - -- **The proposed "fix" for the precondition has real cost.** Either - cut `LocalCacheSizeBits` by 1 (half the per-local cache) or bump - `MAX_SIZE_BITS` by 1 (double the local arena's internal state), - for no behavioural win. - -- **`LargeBuddyRange` would still consume `Aligned`** under the - agreed-minimal (a)+(ii) scope, so the field's footprint in - pass-through ranges doesn't shrink — defeating the only - structural-cleanup motivation. - -The Arena refactor (Phases 1–12) ends with Phase 12. The `Aligned`-retirement work once planned as Phase 13 is not carried out; the Phase 13 number is reused below for the sizeclass-encoding work. - -## Risks - -1. **LargeArenaRange behaviour differences.** The bin-tree allocator - returns blocks with different internal fragmentation characteristics - than the power-of-two buddy. Functionally, the caller always gets - at least the requested size (power-of-two), so correctness is - maintained. The arena may produce different carving patterns, but - `alloc_range` always returns exactly the requested size. - -2. **Overflow behaviour.** `LargeBuddyRange::dealloc_overflow` returns - a single block of exactly `1 << MAX_SIZE_BITS`. - `LargeArenaRange::parent_dealloc` forwards a single block of the - consolidated size directly to the parent. The size can be any - chunk multiple up to `2^MAX_SIZE_BITS`, not just power-of-two, but - the parent (now itself a `LargeArenaRange` or pass-through layer) - accepts arbitrary chunk-multiple sizes. - -3.
**`FixedRangeConfig` uses `StandardLocalState`.** The fixed-region - configuration pushes memory directly into `GlobalR.dealloc_range`. - This works with `LargeArenaRange` because `dealloc_range` has the - same signature and contract. - -4. **Pagemap metadata footprint.** `LargeArenaRange` uses up to - three pagemap entries per free block (`largearenarange.h:12-17`) - — one at the base, one at `base + UNIT_SIZE`, one at - `base + 2*UNIT_SIZE`. `LargeBuddyRange`'s `BuddyChunkRep` only - touched the base entry. Pagemap registration covers every - `MIN_CHUNK_SIZE` stride for the full reserved address range - (`pagemap.h:60-65`), so this is safe in the in-tree pipeline, but - external embedders with custom Pagemap implementations should - verify their pagemap entries cover the per-unit stride. - -## Resolved during plan review - -- `largearenarange.h` was missing `#include "empty_range.h"` for - its `EmptyRange<>` default template parameter. Fixed pre-commit. - (Rubber-duck finding #2.) -- The `conditional_t` huge-page path in `meta_protected_range.h` may - not be instantiated on default builds. CI tests multiple PAL - configurations. Risk acknowledged but no custom build added — the - conditional branch is structurally identical to other - `LargeArenaRange` uses and shares the same template. (Rubber-duck - finding #1.) - -## Out of scope - -- Deleting `LargeBuddyRange` / `BuddyChunkRep` (keep for embedders). -- Modifying `Buddy<>` or `redblacktree.h`. -- Non-power-of-two `alloc_range` requests (deferred to front-end - generalisation phase). -- Performance benchmarking (separate task). -- Any front-end changes. - -# Phase 13: Uniform exp+mantissa sizeclass encoding - -## Goal - -Replace the large-sizeclass encoding `from_large_class(clz(size - 1))` -(which can only represent powers of two) with the same exp+mantissa -scheme small classes already use. 
After this phase, **every** size -class — small or large — is represented as -`bits::from_exp_mant(global_index)`, -where `global_index` is a single continuous index that runs across -small AND large. So a single uniform table accessor works across the -whole range, and the large table is the natural continuation of the -small one. - -Specifically: -- Small class `sc ∈ [0, NUM_SMALL_SIZECLASSES)` corresponds to - `from_exp_mant(sc)` - (unchanged from today's small encoding in `sizeclassstatic.h:62-64`). -- Large class `lc ∈ [0, NUM_LARGE_CLASSES)` corresponds to - `from_exp_mant(NUM_SMALL_SIZECLASSES + lc)`. - -This is the "continuation of the small exp+mantissa", NOT a separate -exp+mantissa space starting at `MAX_SMALL_SIZECLASS_BITS`. Adjacent -classes step by `2^(E - INTERMEDIATE_BITS)` continuously, with no -jump at the small/large boundary. - -No front-end behaviour changes yet: the front-end still calls -`large_size_to_chunk_size(size) = next_pow2(size)` and writes the -pagemap with the corresponding pow2-rounded sizeclass. The non-pow2 -large sizeclasses are **populated in the table** (so the size / -slab_mask metadata is correct should any code path query them) but -are **unreachable** from `size_to_sizeclass_full` and -`large_size_to_chunk_size` until Phase 15. - -That means: -- `size_to_sizeclass_full(non_pow2_large_size)` must continue to - return the pow2-rounded sizeclass (the one whose - `sizeclass_full_to_size` equals `next_pow2(size)`). Phase 15 - changes this to return the exp+mantissa-rounded sizeclass. -- `large_size_to_chunk_size(size)` continues to return - `next_pow2(size)`. Phase 15 changes this to return - `sizeclass_full_to_size(size_to_sizeclass_full(size))`. - -The two functions stay in lock-step: the front-end's reservation -size and the pagemap-recorded sizeclass must agree, or `dealloc_chunk` -gets the wrong size. Phase 15 changes both together. 
- -Phase 13 lays the encoding ground; Phase 14 adds the per-chunk -offset; Phase 15 flips the front-end. - -## Why now - -- The large-class table is currently indexed by leading-zero count, - which has exactly one entry per power-of-two size — fundamentally - pow2-only. -- Switching to exp+mantissa multiplies the large-class count by - `1 << INTERMEDIATE_BITS = 4` (default), taking the default - (`MAX_SMALL_SIZECLASS_BITS=16`, `address_bits=48`) from 32 large - entries to 128. -- Once small and large share the same exp+mantissa scheme, the - small/large tag bit in `sizeclass_t` becomes redundant: the two - ranges can live in a single contiguous index space, and - `is_small()` becomes `value < 1 + NUM_SMALL_SIZECLASSES` instead - of `(value & TAG) != 0`. This drops one bit from - `SIZECLASS_REP_SIZE`, undoing roughly half of the alignment - cascade widening that Phase 13 would otherwise cause. - -(For the default config, `NUM_SMALL_SIZECLASSES = 44`, defined as -`size_to_sizeclass_const(MAX_SMALL_SIZECLASS_SIZE) + 1` in -`sizeclassstatic.h:53-54`. All numeric examples below use the -symbol `NUM_SMALL_SIZECLASSES` for the count, with `44` as the -default-config concrete value.) - -## Uniform (untagged) sizeclass encoding - -Today `sizeclass_t::value` packs `[small: TAG | sc]` and -`[large: large_class]`, with a discriminator bit at position -`TAG_SIZECLASS_BITS`. Width consumed in the pagemap word = -`TAG_SIZECLASS_BITS + 1`. - -After Phase 13 the discriminator is unnecessary because the small -and large ranges are both exp+mantissa-indexed and can sit in a -single contiguous index space. - -### Index 0 is reserved as the unmapped sentinel - -`value == 0` MUST remain the "default / unmapped" sentinel. An -unmapped pagemap entry reads as all-zero, and the size-lookup -machinery relies on `sizeclass == 0 ⇒ size == 0` to safely answer -`malloc_usable_size` / `remaining_bytes` queries on -not-an-allocation pointers without branching on validity. 
- -So the uniform layout reserves index 0 and shifts everything up by -one: - -``` -0 -> unmapped sentinel -[1, 1 + NUM_SMALL_SIZECLASSES) -> small (sc index = value - 1) -[1 + NUM_SMALL_SIZECLASSES, - 1 + NUM_SMALL_SIZECLASSES + NUM_LARGE_CLASSES) -> large (lc index = value - 1 - NUM_SMALL) -``` - -Width consumed = `next_pow2_bits_const(1 + NUM_SMALL_SIZECLASSES + -NUM_LARGE_CLASSES)`. For the default config: -`next_pow2_bits_const(1 + 44 + 128) = next_pow2_bits_const(173) = 8`. -The tagged scheme today (without the same uniform shift) would need -`max(6, 8) + 1 = 9`. **One bit saved** -(`SIZECLASS_REP_SIZE = 256`, `REMOTE_MIN_ALIGN = 512`). - -### Table padding to avoid the subtract - -The shift introduces a `- 1` on the size/metadata-lookup hot path -(`sizeclass_metadata[value - 1]`). We pay that on every dealloc. - -Cheaper option: pad the table by one slot at index 0 (a dummy -"default" entry whose `size` is 0 and whose `slab_mask` is 0). Then -the lookup is `sizeclass_metadata[value]` with no subtract — and -querying the sentinel returns "size 0 / slab_mask 0" naturally, -which is the answer the existing API wants for unmapped pointers. - -Cost: one wasted slot per table indexed by `sizeclass_t::raw()`. -Inspect each such table in `sizeclasstable.h`: -- `sizeclass_metadata` ModArray (`sizeclass_data_fast` / - `sizeclass_data_slow`): pad slot 0 with zeros. Worth it (hot - path). -- `sizeclass_compress_t` reverse lookups, if any: pad similarly. -- `ChunkSizeMetadata` (if indexed by raw value): inspect first. - -The wasted-slot cost is `sizeof(slot) * num_tables` ≈ tens to -hundreds of bytes — negligible compared to the hot-path subtract -saved. - -### `sizeclass_t` accessors - -- `from_small_class(sc) { return {sc + 1}; }` -- `from_large_class(lc) { return {1 + NUM_SMALL_SIZECLASSES + lc}; }` -- `as_small() { return value - 1; }` (asserts `is_small()`). 
-- `as_large() { return value - 1 - NUM_SMALL_SIZECLASSES; }` -- `is_small() { return value < 1 + NUM_SMALL_SIZECLASSES && - value != 0; }` — but in practice - `is_small` is only meaningful for non-sentinel values; the - default-sentinel case is filtered upstream. Simplification: - `is_small() { return value - 1 < NUM_SMALL_SIZECLASSES; }` - (works because for `value == 0`, `value - 1` wraps around to - SIZE_MAX which is ≥ NUM_SMALL_SIZECLASSES, so returns false — - matching today's semantics where `sizeclass_t{}` is not "small"). -- `is_default() { return value == 0; }` — unchanged. -- `raw()` returns the new shifted value — audit all callers (see - Risks). - -## RemoteAllocator alignment chain - -Verified by inspection of `sizeclasstable.h:18-33` and -`metadata.h:16-45`: - -``` -SIZECLASS_BITS = next_pow2_bits_const( - 1 - + NUM_SMALL_SIZECLASSES - + NUM_LARGE_CLASSES) - (uniform encoding; no separate tag bit; - index 0 reserved as the unmapped sentinel) - (= 8 after Phase 13 with - INTERMEDIATE_BITS=2 large-classes) - -> SIZECLASS_REP_SIZE = 1 << SIZECLASS_BITS - (= 256 after Phase 13) - (used as ModArray length for sizeclass_metadata; the - encoding occupies bits 0..SIZECLASS_BITS-1) - -> REMOTE_MIN_ALIGN = max(CACHELINE_SIZE, SIZECLASS_REP_SIZE) << 1 - (= 512 after Phase 13) - -> REMOTE_BACKEND_MARKER currently hard-coded `1 << 7`. After - Phase 13, MUST be derived: - `static constexpr address_t REMOTE_BACKEND_MARKER = - SIZECLASS_REP_SIZE;` - (= bit 8 after Phase 13). - -> BACKEND_RESERVED_MASK = (REMOTE_BACKEND_MARKER << 1) - 1 - (= 0x1FF after Phase 13 — low 9 bits reserved for backend.) -``` - -NOTE: the `+ 1` previously in `SIZECLASS_REP_SIZE = 1 << -(TAG_SIZECLASS_BITS + 1)` is dropped — that `+ 1` was for the -small/large tag bit which uniform encoding eliminates. (Phase 13 also -renames `TAG_SIZECLASS_BITS` to `SIZECLASS_BITS` since the encoding no longer -carries a separate tag.)
The static-assert in `metadata.h:64-67` that -enforces `REMOTE_BACKEND_MARKER == SIZECLASS_REP_SIZE` continues to -hold by construction. - -**Concrete instances of `RemoteAllocator`** (all places where the -alignment cost lands): - -- Inline `RemoteAllocator` inside `CoreAllocator` — one per - allocator (`corealloc.h:155-159`), allocated from the meta range. -- `unused_remote` BSS global (`commonconfig.h:120`) — once, static. -- Stack-local instances in tests `sandbox.cc:162`, - `domestication.cc` — test-only. - -No other code constrains `RemoteAllocator` alignment. - -**Cost of widening `REMOTE_MIN_ALIGN` (256 → 512 in this phase):** at -most ~256 bytes additional padding per CoreAllocator / -`unused_remote` — under 1 KB total, paid once, not per allocation. -Cheap. - -## Cascading bit-layout changes elsewhere - -`BACKEND_RESERVED_MASK` widens from `0xFF` (8 bits) to `0x1FF` (9 -bits). All backend-side `PagemapRep` layouts that sit immediately -above the reserved range must shift up by -`new_marker_pos - old_marker_pos = log2(REMOTE_BACKEND_MARKER) - 7`. - -Verified consumers of `BACKEND_RESERVED_MASK` / bits immediately -above bit 7: - -- `largearenarange.h:42-50`: `RED_BIT_POS = 8`, - `VARIANT_SHIFT = 9`, `LARGE_SIZE_SHIFT = 8`. Today these sit at - bits 8/9-10/8. After Phase 13 they shift to bits 9/10-11/9. The - `static_assert(Entry::is_backend_allowed_value(...))` at - `largearenarange.h:64-66` catches any miss at compile time. -- `backend_helpers/largebuddyrange.h:40-46`: `BuddyChunkRep` - `RED_BIT = 1 << 8`. Same shift required. (The plan previously - said "`backend_helpers/buddy.h`" — corrected. Grep `RED_BIT` to - confirm no other site.) - -The plan does the shift in terms of an existing or new constant -(e.g. `BACKEND_LAYOUT_FIRST_FREE_BIT = log2(REMOTE_BACKEND_MARKER) -+ 1`) rather than hard-coding new bit numbers — so a future widening -auto-propagates. 
- -## Changes - -### `src/snmalloc/ds/sizeclasstable.h` - -- `NUM_LARGE_CLASSES`: redefine as - `(address_bits - MAX_SMALL_SIZECLASS_BITS) << INTERMEDIATE_BITS` - so it tracks the exp+mantissa scheme. Update the comment. -- `SIZECLASS_BITS` (renamed from `TAG_SIZECLASS_BITS` — the encoding - no longer carries a separate tag): redefine as - `next_pow2_bits_const(1 + NUM_SMALL_SIZECLASSES + NUM_LARGE_CLASSES)`. - The `+ 1` reserves index 0 as the unmapped sentinel. -- `SIZECLASS_REP_SIZE`: redefine as `1 << SIZECLASS_BITS` (drop - the `+ 1` that came from the tag bit). -- `sizeclass_t`: rewrite the encoding per "Uniform (untagged) - sizeclass encoding" (small at `value = sc + 1`, large at - `value = 1 + NUM_SMALL_SIZECLASSES + lc`, index 0 is the - default sentinel). Drop the `TAG` constant. Convert all accessors - (`from_small_class`, `from_large_class`, `as_small`, `as_large`, - `is_small`, `index`). Default-construction (`sizeclass_t{}`) and - `is_default()` keep their `value == 0` semantics. -- Audit callers of `sizeclass_t::raw()` — the raw value's meaning - has changed (no tag bit, shifted by 1). Most callers use it as a - `ModArray` index into `sizeclass_metadata`, which still works - with the size-0 padding slot. -- `sizeclass_metadata` ModArray (`sizeclass_data_fast` and - `sizeclass_data_slow`): pad slot 0 with zero-initialised entries - (`size = 0`, `slab_mask = 0`, all other fields zero). A - `static_assert` after construction enforces this. -- `sizeclass_metadata` constructor: rewrite as a single contiguous - loop over `global_index ∈ [0, NUM_SMALL_SIZECLASSES + - NUM_LARGE_CLASSES)`, writing slot `global_index + 1` with the - size derived from - `bits::from_exp_mant(global_index)`. - Small-specific fields populate only for indices < NUM_SMALL; - large-specific fields populate only for indices ≥ NUM_SMALL. 
The - current split between small (lines ~190-225) and large - (lines 226-238) loops collapses into one, eliminating the - pow2-vs-exp+mantissa mismatch the old large loop had. -- `large_size_to_chunk_size(size)`: **semantics unchanged in this - phase** — continues to return `bits::next_pow2(size)`. (Phase 15 - changes the body to - `sizeclass_full_to_size(size_to_sizeclass_full(size))`.) -- `size_to_sizeclass_full(size)`: **semantics unchanged in this - phase** for both branches — still pow2-rounded for large. - *Implementation* of the large branch must change because the - old body assumed `from_large_class` was a leading-zero-count - mapping. Phase 13 redefines `from_large_class(lc)` to mean - `from_exp_mant(NUM_SMALL_SIZECLASSES + lc)`, - so the new body uses `bits::to_exp_mant` (the literal inverse of - `from_exp_mant`) directly: - ``` - size_t pow2 = bits::next_pow2(size); - size_t global = - bits::to_exp_mant(pow2); - return sizeclass_t::from_large_class(global - NUM_SMALL_SIZECLASSES); - ``` - (Pre-Phase-13 there was a separate helper - `large_size_to_chunk_sizeclass(size)` returning an `lc` index. - Post-Phase-13, small and large share a single global exp+mantissa - index space, so the `+NUM_SMALL_SIZECLASSES` / `-NUM_SMALL_SIZECLASSES` - round-trip via `lc` cancels out and the helper is removed. The - large branch of `size_to_sizeclass_full` inlines the - `to_exp_mant` call directly.) - (An earlier draft tried a manual - `(next_pow2_bits - MIN_ALLOC_STEP_BITS) << INTERMEDIATE_BITS` - formula. That is wrong — `to_exp_mant` does not simply place the - exponent at MANTISSA_BITS; it uses a `b` offset that makes - consecutive pow2 inputs differ by exactly `2^INTERMEDIATE_BITS`. - Always use `to_exp_mant`, which is the literal inverse of the - table-build helper.) - Phase 15 replaces `pow2` with `size` (no `next_pow2`); - `to_exp_mant` rounds non-pow2 sizes up to the next exp+mantissa - step. 
-- The non-pow2 large slots in `sizeclass_metadata` are populated - with correct size/slab_mask values in Phase 13 but are - unreachable from `size_to_sizeclass_full` / - `large_size_to_chunk_size` until Phase 15. This keeps Phase 13's - end-to-end behaviour identical to today's. -- `slab_index`, `start_of_object`, `is_start_of_object`: continue - to use `meta.slab_mask`. The metadata table builder sets - `slab_mask = info.align - 1` for large (where - `info.align = size & (~size + 1)`, the natural alignment from - `arenabins.h:741`). For pow2 sizes, `info.align == size`, - so `slab_mask = size - 1` — matching today's value. For - non-pow2 sizes (table-populated but unreachable in Phase 13), - `slab_mask = info.align - 1 < size - 1`. Phase 14 adds the - per-chunk offset that lets recovery work for non-pow2 once - Phase 15 lights them up. -- `round_size(size)` (lines 478-501): large branch left unchanged - in this phase — still rounds to next pow2. Phase 15 updates it - to match the new `large_size_to_chunk_size`. - -### `src/snmalloc/mem/metadata.h` - -- Line 45: change - `static constexpr address_t REMOTE_BACKEND_MARKER = 1 << 7;` to - `static constexpr address_t REMOTE_BACKEND_MARKER = - SIZECLASS_REP_SIZE;` - so the marker tracks the (now-untagged) sizeclass field width. - Adjust the comment to point at `sizeclasstable.h` for the - derivation. -- The existing static-asserts at `metadata.h:64-67` already - enforce the invariant; verify they still pass. -- **Add a public layout constant** on `MetaEntryBase` so backend - code can derive shift positions without violating the protected - access of `REMOTE_BACKEND_MARKER`. Insert into the `public:` - section (after line 113): - ```cpp - /** - * Bit position of the first bit available to backend metadata - * layouts above the reserved region. Used by - * `largearenarange.h` and `largebuddyrange.h` to derive - * RED_BIT_POS, VARIANT_SHIFT, and LARGE_SIZE_SHIFT. 
- */ - static constexpr size_t BACKEND_LAYOUT_FIRST_FREE_BIT = - bits::next_pow2_bits_const(REMOTE_BACKEND_MARKER) + 1; - ``` - The `+1` reserves `REMOTE_BACKEND_MARKER`'s own bit (it lives at - `next_pow2_bits_const(REMOTE_BACKEND_MARKER)`). - -### `src/snmalloc/backend_helpers/largearenarange.h` and `src/snmalloc/backend_helpers/largebuddyrange.h` - -- Replace hard-coded `RED_BIT_POS = 8`, `VARIANT_SHIFT = 9`, - `LARGE_SIZE_SHIFT = 8` in `largearenarange.h` with - derivations from the new public - `MetaEntryBase::BACKEND_LAYOUT_FIRST_FREE_BIT`: - `RED_BIT_POS = MetaEntryBase::BACKEND_LAYOUT_FIRST_FREE_BIT;` - `LARGE_SIZE_SHIFT = MetaEntryBase::BACKEND_LAYOUT_FIRST_FREE_BIT;` - `VARIANT_SHIFT = MetaEntryBase::BACKEND_LAYOUT_FIRST_FREE_BIT + 1;` - (the `+1` reserves the RED bit). -- `largearenarange.h:64-66` `static_assert` continues to enforce - no clash with reserved bits. -- `largebuddyrange.h:40-46`: `BuddyChunkRep::RED_BIT = 1 << 8`. - Replace with `1 << MetaEntryBase::BACKEND_LAYOUT_FIRST_FREE_BIT`. - (The plan previously cited the wrong filename `buddy.h` — - corrected.) -- Grep `RED_BIT` and `1 << 8` / `<< 8` across `backend_helpers/` - to confirm no other site needs the same shift. - -### `src/snmalloc/mem/corealloc.h` - -- Line 1120-1121: replace - `size_t size = bits::one_at_bit(entry_sizeclass);` - with - `size_t size = sizeclass_full_to_size(entry.get_sizeclass());` - so the dealloc-large path reads the precise sizeclass-encoded - size instead of reconstructing it from a leading-zero count. This - is a no-op today (pow2 only); it makes Phase 15's behaviour change - land at a single accessor. -- Grep `corealloc.h` for other `one_at_bit(` calls that derive - large-allocation size from an `as_large()` value and convert all - of them. (Known candidate: `corealloc.h:1576` — verify scope.) - -### `src/snmalloc/global/globalalloc.h` and other consumers - -- Grep for any other consumer of `as_large()` that interprets the - value as a leading-zero count. 
Convert each to - `sizeclass_full_to_size` or to the exp+mantissa accessor. -- Audit any code that uses `sizeclass_t::raw()` directly assuming - the tag-bit-set-means-small invariant. The uniform encoding - changes the meaning of `raw()`. -- Verified candidates from inspection: `globalalloc.h:145-220` - (`remaining_bytes`, `index_in_object`, `external_pointer`) — these - go through `start_of_object`/`slab_index`, so they pick up the - change automatically via `slab_mask`. - -## User-input size bounds (`MAX_LARGE_SIZECLASS_SIZE`) - -Before Phase 13, the largest representable large allocation was -`1 << (address_bits - 1)` (half the address space, derived from -`from_large_class` being a leading-zero-count mapping). The -pre-existing bound check used `size > (size_t(1) << 63)` as a sloppy -upper limit and let anything below through. With the exp+mantissa -encoding, the largest representable size is the exact value of the -top large class — sizes between that and `2^address_bits` no longer -map to any valid sizeclass and must be rejected at the API boundary. - -Define a derived constant alongside the encoding: - -- `ENCODED_ADDRESS_BITS = bits::min(DefaultPal::address_bits, bits::BITS - 1)`. - Caps the encoding range one bit below the native word width so that - `from_exp_mant(NUM_SMALL + NUM_LARGE - 1) = 1 << ENCODED_ADDRESS_BITS` - does not overflow `size_t` on 32-bit (`address_bits == BITS == 32`). - On x86_64 (`address_bits = 48`) this is unchanged. -- `NUM_LARGE_CLASSES = (ENCODED_ADDRESS_BITS - MAX_SMALL_SIZECLASS_BITS) - << INTERMEDIATE_BITS` (use `ENCODED_ADDRESS_BITS`, not `address_bits`). -- `MAX_LARGE_SIZECLASS_SIZE = from_exp_mant(NUM_SMALL_SIZECLASSES + NUM_LARGE_CLASSES - 1)`. -- Add `static_assert(MAX_LARGE_SIZECLASS_SIZE == bits::one_at_bit( - ENCODED_ADDRESS_BITS))` to pin the encoding invariant (a strict - nonzero check would not catch a wrong table-build mantissa offset). 
-- Add `static_assert(ENCODED_ADDRESS_BITS > MAX_SMALL_SIZECLASS_BITS)` - so `NUM_LARGE_CLASSES > 0` is structural, not coincidental. - -Fan out the new bound to every site that accepts a user-supplied size -and feeds it into the size→sizeclass lookup (these were either -unguarded or had the loose `> 2^63` check): - -- `src/snmalloc/mem/corealloc.h` `alloc_not_small`: replace the - `1 << 63` bound with `MAX_LARGE_SIZECLASS_SIZE`. -- `src/snmalloc/ds/sizeclasstable.h` `round_size`: same. -- `src/snmalloc/global/globalalloc.h` `check_size`: early-return via - `snmalloc_check_client` when `size > MAX_LARGE_SIZECLASS_SIZE`. -- `src/snmalloc/override/rust.cc` `rust_realloc`: gate the - equality-fast-path on both aligned sizes being - `<= MAX_LARGE_SIZECLASS_SIZE`. - -Add defensive `SNMALLOC_ASSERT`s in the large branch of -`size_to_sizeclass_full`: `size != 0`, `size <= -MAX_LARGE_SIZECLASS_SIZE`. These document the preconditions at the -function whose behaviour is constrained by them ("document coupling at -the point of breakage") and turn into noisy debug failures if a future -caller path skips the bound check. - -## Test gates - -1. **Build**: clean build of the default config passes. The - `static_assert(Entry::is_backend_allowed_value(...))` checks at - `largearenarange.h:64-66` catch any bit-layout mismatch. -2. **Full ctest suite**: all existing tests pass (no behaviour - regression — front-end still issues pow2 large requests, so - non-pow2 large sizeclasses exist in tables but are unreachable - from the API). -3. **Arena unit tests** (`test_backend_arena`) continue to - pass — they exercise the shifted RED/variant bits in the pagemap - encoding. -4. **Extend `src/test/func/sizeclass/sizeclass.cc`** with a - `uniform_large_sizeclasses` test case: - - For every large `sizeclass_t` index `lc ∈ [0, NUM_LARGE_CLASSES)`, - assert `sizeclass_full_to_size(from_large_class(lc))` is - strictly increasing in `lc`. 
- - For every pow2 size `S` in - `[MAX_SMALL_SIZECLASS_SIZE * 2, 2^(address_bits - 1)]`, assert - `sizeclass_full_to_size(size_to_sizeclass_full(S)) == S` - (round-trip identity on pow2 — still holds in Phase 13 - because `size_to_sizeclass_full` for large still rounds to - next pow2). - - For every non-pow2 size `X` strictly between adjacent pow2 - `[P, 2P)`, assert - `sizeclass_full_to_size(size_to_sizeclass_full(X)) == 2P` - (still pow2-rounded in Phase 13 — Phase 15 changes this). - - Sentinel sanity: `sizeclass_t{}.raw() == 0`; - `sizeclass_t{}.is_default()` is true; - `sizeclass_data_fast[0].size == 0`; - `sizeclass_data_fast[0].slab_mask == 0`; - `is_small(sizeclass_t{})` is false. - - Encoding sanity: `is_small(from_small_class(0))` is true; - `is_small(from_large_class(0))` is false; small range and - large range are disjoint and adjacent in the value space. -5. **Extend `src/test/func/release-rounding/rounding.cc`** to - exercise non-trivial pow2 large sizeclasses. Today this test - covers small only. Add cases that exercise - `start_of_object` / `is_start_of_object` for the pow2 large - sizeclasses materialised end-to-end in Phase 13. (Phase 14 - extends to the per-chunk offset; Phase 15 to non-pow2.) - (The plan previously cited `test/func/sizeclass/rounding.cc`, - which does not exist — corrected.) - -## Risks - -1. **SIZECLASS_BITS widening cascades.** Caught by the existing - `static_assert`s in `metadata.h:64-67` and - `largearenarange.h:64-66`. -2. **Some embedder set REMOTE_MIN_ALIGN tighter than chain allows.** - Would surface as a compile-error on the cacheline-vs-REP_SIZE - max. Address only if it actually fires. -3. **Stale `as_large()` callers.** Mitigation: grep + convert ALL - uses of `as_large()` in this phase. Phase 13 is not done until - the leading-zero-count semantics are retired. -4. **Stale `raw()` callers assuming tag-bit semantics.** The - uniform encoding changes `raw()`'s meaning (no tag bit, shifted - by 1). 
Grep all callers and convert each to the appropriate - accessor (`as_small`, `as_large`, `is_small`, or — if it's a - `ModArray` index — leave alone, relying on the size-0 padding - slot at index 0 to make the no-subtract lookup return the right - sentinel values). -5. **Adding the size-0 padding slot at index 0.** The padding slot - in `sizeclass_metadata` must have `size = 0` and `slab_mask = 0` - (and any other fields zero-initialised) so that - `sizeclass_full_to_size(sizeclass_t{}) == 0` and any accidental - slab-mask arithmetic on the sentinel returns 0 / a no-op. Verify - by reading every field in `sizeclass_data_fast` / - `sizeclass_data_slow`. Add a static-assert that index 0 has - `size == 0` after table init. - -## Out of scope - -- Per-chunk pagemap offset (Phase 14). -- Non-pow2 reservations (Phase 15). -- Changes to the small sizeclass encoding (other than dropping - the tag bit). -- `round_size(size)` for large: still pow2 here; Phase 15 fixes. - -# Phase 14: Per-chunk offset in `ras` + combined-indexed metadata - -## Goal - -Recover the start address of a large allocation from any interior -address, independent of allocation alignment. Stored as a per-chunk -slab-offset in the pagemap entry, packed alongside the sizeclass in -the `ras` (`remote_and_sizeclass`) word so that the same pagemap -word loaded for the sizeclass directly yields the index into the -metadata table that already has the offset-recovery delta -pre-baked. This unlocks Phase 15. - -## Design summary - -- **Layout**: offset bits sit in `ras` directly above the sizeclass - bits and directly below the `REMOTE_BACKEND_MARKER`. Reading the - same `ras` word the sizeclass-extract path already loads, masking - with `COMBINED_MASK` yields the combined sizeclass+offset value - ready to use as a table index — no extra load, no shift, no OR, - no multiply. 
(Default config: 11 bits of combined index; the mask - widens from `SIZECLASS_REP_SIZE - 1` to `COMBINED_REP_SIZE - 1` - but is still a single `and`-with-imm.) -- **Metadata table**: `sizeclass_metadata.fast_` is widened from - `SIZECLASS_REP_SIZE` rows to `COMBINED_REP_SIZE` rows - (= `SIZECLASS_REP_SIZE << OFFSET_BITS`). Each row gains a - pre-computed `offset_bytes` field equal to `offset * slab_size` - for that sizeclass. Recovery is - `alloc_start = (addr & ~slab_mask) - offset_bytes`. -- **Code**: `start_of_object` and friends take a *combined* index - (`size_t`); the wrapper in `globalalloc.h` passes - `entry.get_offset_and_sizeclass()`. No branches, no extra word loads - on the fast path. -- **Backend**: in `alloc_chunk`, the small-and-pow2-large fast path - (`slab_size >= size`) uses the existing `set_metaentry`. The - non-pow2-large (multi-slab-tile) path writes a per-chunk - `ras = encode(remote, sc, slab_index)` via `concretePagemap.set`. - -## Why now - -- Phase 15 introduces non-pow2 reservations. The existing - `addr & ~slab_mask` answer is wrong for non-pow2 sizes/alignments. -- A per-chunk offset is the long-identified mechanism (PLAN.md - intro). Phase 14 implements that mechanism with offset = 0 - semantics matching the existing pow2 path — so it lands without - changing observable behaviour for today's allocations. -- Packing the offset into `ras` (not `meta`) at the time we land - the field avoids a second `meta`-word load on - `__malloc_start_pointer` and avoids a runtime multiply on every - external_pointer query. - -## Design - -### Bit layout of `ras` - -``` -ras = [ RemoteAllocator* | BACKEND_MARKER | offset_bits | sizeclass_bits ] - ↑ - low bits -``` - -Bit positions (low to high): -- bits `[0, SIZECLASS_BITS)`: sizeclass — **unchanged** position. -- bits `[SIZECLASS_BITS, SIZECLASS_BITS + OFFSET_BITS)`: offset - (frontend-owned, non-zero only for non-pow2 large in Phase 15+). 
-- bit `[SIZECLASS_BITS + OFFSET_BITS]`: `REMOTE_BACKEND_MARKER` - (moves up by `OFFSET_BITS` positions). -- bits above: `RemoteAllocator*` payload. - -Constants: - -```cpp -// in sizeclasstable.h (alongside existing SIZECLASS_BITS): -constexpr size_t OFFSET_BITS = INTERMEDIATE_BITS + 1; -constexpr size_t COMBINED_BITS = SIZECLASS_BITS + OFFSET_BITS; -constexpr size_t COMBINED_REP_SIZE = bits::one_at_bit(COMBINED_BITS); -``` - -`REMOTE_BACKEND_MARKER` in `metadata.h` redefines from -`SIZECLASS_REP_SIZE` to `COMBINED_REP_SIZE`. `REMOTE_MIN_ALIGN` -follows: `max(CACHELINE_SIZE, COMBINED_REP_SIZE) << 1`. For the -default config (SIZECLASS_BITS=8, OFFSET_BITS=3): the marker moves -from bit 8 to bit 11, and `REMOTE_MIN_ALIGN` from 512 B to 4096 B. - -Existing `MetaEntryBase::get_sizeclass()` must continue to return -pure sizeclass; with the marker moving up, masking by -`REMOTE_WITH_BACKEND_MARKER_ALIGN - 1` would now include the offset -bits. Define a dedicated `SIZECLASS_MASK = SIZECLASS_REP_SIZE - 1` -(unchanged in value from today's effective mask) and use it -explicitly in `get_sizeclass()`. The new `COMBINED_MASK = -COMBINED_REP_SIZE - 1` is what `get_offset_and_sizeclass()` uses. - -### `OFFSET_BITS` derivation - -With `INTERMEDIATE_BITS = M`, the worst-case non-pow2 large -sizeclass tiles into `2^(M+1)` slabs (e.g., a 7×slab_size class -with M=2: reserve rounds up to 8 slabs, max slab index = 7). So -`OFFSET_BITS = M + 1` gives `2^(M+1)` distinct values, exactly -enough for `[0, 2^(M+1))`. A `static_assert` on -`max_large_slab_index() < (1 << OFFSET_BITS)` (existing helper at -`sizeclasstable.h:273-285`) guards against any sizeclass-table -change. - -### `meta` word stays simple - -The `meta` word goes back to its pre-Phase-14 layout: - -``` -meta = [ SlabMetadata* | META_BOUNDARY_BIT ] -``` - -No offset bits. No `META_FRONTEND_RESERVED_MASK`. No alignas on -`FrontendSlabMetadata`. `get_slab_metadata()` masks just -`META_BOUNDARY_BIT`. 
This removes a load on the pointer-recovery -hot path (no `mov (%rdx),%rcx` to fish offset out of `meta`). - -### Combined-indexed metadata table - -`SizeClassTable::fast_` (`sizeclasstable.h:181`) widens: - -```cpp -struct sizeclass_data_fast { - size_t size; - size_t slab_mask; - size_t div_mult; - size_t mod_zero_mult; - size_t offset_bytes; // NEW: precomputed (combined >> SIZECLASS_BITS) * slab_size -}; - -ModArray<COMBINED_REP_SIZE, sizeclass_data_fast> fast_{}; -``` - -Memory: `COMBINED_REP_SIZE × sizeof(sizeclass_data_fast)`. With -SIZECLASS_BITS=8, OFFSET_BITS=3, sizeof=40: ~80 KB. Fits L2. -(`fast_small`'s today-1KB working set still fits L1 for the -small-only paths because those index `sc.raw()` directly, which -lands in the first `SIZECLASS_REP_SIZE` rows.) - -`slow_` stays sc-indexed at `SIZECLASS_REP_SIZE` rows: it is only -read by slow paths that don't care about offset. - -Table initialization fills every `(sc, offset)` cell: -- Other fields duplicate the `(sc, 0)` row. -- `offset_bytes = offset * sizeclass_full_to_slab_size(sc)`. - -For `offset == 0` rows: `offset_bytes = 0`. The first -`SIZECLASS_REP_SIZE` rows of the new `fast_` are byte-identical to -today's table plus a trailing `offset_bytes = 0`. - -**`fast()` overloads.** Keep the existing -`fast(sizeclass_t sc)` overload (`sizeclasstable.h:186-193`) -unchanged — it forwards to `fast_[sc.raw()]`, which hits the -offset = 0 row, identical to today's behaviour. Add a new -overload `fast(size_t combined)` that does `fast_[combined]`. -Call sites that have a sizeclass_t (most existing code) keep -calling `fast(sc)`; sites that have a combined index from the -pagemap call `fast(combined)`. No source change for the majority -of existing call sites.
- -### Accessors on `MetaEntryBase` / `FrontendMetaEntry` - -Add to `MetaEntryBase`: - -```cpp -// returns the value to use as an index into sizeclass_metadata.fast_ -[[nodiscard]] SNMALLOC_FAST_PATH size_t get_offset_and_sizeclass() const { - return static_cast<size_t>(remote_and_sizeclass) & COMBINED_MASK; -} -``` - -Keep `get_sizeclass()` returning a `sizeclass_t` (pure sizeclass, -low SIZECLASS_BITS only). Add an offset accessor for tests / -diagnostics: - -```cpp -[[nodiscard]] SNMALLOC_FAST_PATH size_t get_offset() const { - return (static_cast<size_t>(remote_and_sizeclass) >> SIZECLASS_BITS) - & ((1 << OFFSET_BITS) - 1); -} -``` - -`encode(RemoteAllocator*, sizeclass_t)` gains an optional `offset` -parameter (defaults to 0 so existing callers compile): - -```cpp -[[nodiscard]] static SNMALLOC_FAST_PATH uintptr_t -encode(RemoteAllocator* remote, sizeclass_t sizeclass, size_t offset = 0) { - return pointer_offset( - reinterpret_cast<uintptr_t>(remote), - sizeclass.raw() | (offset << SIZECLASS_BITS)); -} -``` - -Precondition (`offset` is a runtime value, so this is checked with an -assert rather than at compile time): `offset < (1 << OFFSET_BITS)`. - -### `start_of_object` and friends - -Refactor signatures to take a combined index (`size_t`) instead of -`(sizeclass_t, slab_offset)`. The recovery formula collapses to a -single subtract because `offset_bytes` is precomputed: - -```cpp -SNMALLOC_FAST_PATH constexpr address_t -start_of_object(size_t combined, address_t addr) { - auto meta = sizeclass_metadata.fast(combined); - address_t alloc_start = (addr & ~meta.slab_mask) - meta.offset_bytes; - size_t index = slab_index_via(meta, addr - alloc_start); - return alloc_start + (index * meta.size); -} -``` - -`slab_index_via(meta, addr)` is the existing `slab_index` body -(`sizeclasstable.h:358-383`) refactored to take an already-loaded -`sizeclass_data_fast` instead of doing its own -`sizeclass_metadata.fast(sc)` lookup.
All current behaviour is -preserved: the `offset = addr & meta.slab_mask` mask, the 64-bit -reciprocal-division (`(offset * meta.div_mult) >> DIV_MULT_SHIFT`), -and the 32-bit `offset / size` fallback for `sizeof(size_t) < 8` -platforms with the `size == 0` short-circuit. The original -`slab_index(sizeclass_t sc, address_t addr)` is kept as a -one-line wrapper that resolves `sc` to a row and forwards to -`slab_index_via` so call sites that don't already have the row -(e.g., `globalalloc.h:231,260` — which today pass -`entry.get_sizeclass()`) keep compiling unchanged. - -`index_in_object`, `remaining_bytes`, `is_start_of_object` follow -the same shape, all taking `size_t combined`. Where callers have -only a `sizeclass_t` (e.g., for self-allocations they did -themselves), they pass `sc.raw()` directly — that selects the -offset=0 row, equivalent to today. - -### Backend write in `alloc_chunk` - -For the small / pow2-large (single-slab-tile) case (`slab_size >= -size`), keep `set_metaentry(addr, size, t)` where -`t = Entry(meta, encode(remote, sc))` — encoded with offset=0 -implicitly. - -For multi-slab-tile (Phase 15+, currently dormant): - -```cpp -size_t slab_size = sizeclass_full_to_slab_size(sizeclass); -for (size_t chunk_offset = 0; chunk_offset < size; - chunk_offset += MIN_CHUNK_SIZE) -{ - size_t slab_index = chunk_offset / slab_size; - uintptr_t ras_i = Pagemap::Entry::encode(remote, sizeclass, slab_index); - typename Pagemap::Entry t_i(meta, ras_i); - Pagemap::concretePagemap.set(address_cast(p) + chunk_offset, t_i); -} -``` - -Only the `META_BOUNDARY_BIT` in `meta` is preserved across this -write: `MetaEntryBase::operator=` (`metadata.h:235-242`) -explicitly preserves the target's boundary bit and otherwise -overwrites both `meta` (modulo that bit) and `remote_and_sizeclass` -in full. 
Any prior backend-owned state in the old `ras` is gone -once the frontend claims the chunk (in `claim_for_backend`, -`metadata.h:313-317`, which resets `ras` to -`REMOTE_BACKEND_MARKER`), so the frontend's per-chunk write -overwriting `ras` from that pristine `REMOTE_BACKEND_MARKER`-only -state to the encoded `(remote, sc, offset)` is exactly the -expected ownership transition. - -### Backend bits relocate automatically - -`MetaEntryBase::BACKEND_LAYOUT_FIRST_FREE_BIT` is derived from -`REMOTE_BACKEND_MARKER`; since the marker moves up by `OFFSET_BITS`, -the backend's `RED_BIT`, `VARIANT_SHIFT`, `LARGE_SIZE_SHIFT` -(`largearenarange.h:50-67`) auto-shift up by the same amount. -Verify the existing -`static_assert((MAX_SIZE_BITS - MIN_SIZE_BITS) + LARGE_SIZE_SHIFT -<= bits::BITS, ...)` still holds. For the default config: -- `MAX_SIZE_BITS = bits::BITS - 1 = 63` -- `MIN_CHUNK_BITS = 14`, so the large size field needs - `MAX_SIZE_BITS - MIN_CHUNK_BITS = 49` bits. -- Pre-Phase-14: the marker sits at bit `SIZECLASS_BITS = 8`, so - `BACKEND_LAYOUT_FIRST_FREE_BIT = SIZECLASS_BITS + 1 = 9` (the - `+ 1` skips the marker's own bit) and `LARGE_SIZE_SHIFT = 9` → - `49 + 9 = 58 ≤ 64`. ✓ -- Phase 14: the marker sits at bit `SIZECLASS_BITS + OFFSET_BITS = 11`, - so `BACKEND_LAYOUT_FIRST_FREE_BIT = 12` and `LARGE_SIZE_SHIFT = 12` → - `49 + 12 = 61 ≤ 64`. ✓ (Three bits of headroom remain; OFFSET_BITS = 4 — the - `INTERMEDIATE_BITS = 3` config — would still pass.) - -### Pre-existing pagemap bug (still fixed in prep commit `1144eab4`) - -Same as before: `FlatPagemap::get_mut` double-base-adjust on -`PALNoAlloc`. Fix unrelated to Phase 14 layout choice. - -### Consumers that MUST be updated in Phase 14 - -Phase 14 is incomplete until every caller of the -sizeclass-table-only `start_of_object` / `is_start_of_object` / -`remaining_bytes` on a **user-supplied** potentially-large pointer -is offset-aware.
- -The offset support is pushed into the inner helpers in -`sizeclasstable.h` themselves: `start_of_object`, `index_in_object`, -`remaining_bytes`, and `is_start_of_object` take a mandatory -*combined* `size_t` index parameter (sizeclass + offset packed into -the low `COMBINED_BITS` of `ras`). Callers must explicitly pass -either `sc.raw()` (when local context proves the address is in the -allocation's first slab — offset implicitly 0) or -`entry.get_offset_and_sizeclass()` (from the address's pagemap entry). -Removing default arguments forces every call site to make a -deliberate choice and prevents a future Phase 15 caller from -accidentally inheriting offset = 0 when it should consult the -pagemap. - -Inside each helper, the formula uses a single -`sizeclass_metadata.start(combined)` lookup — the `start_` table is -indexed by `COMBINED_REP_SIZE` rows so the combined index lands -directly in a precomputed row. `offset_bytes` collapses to 0 for the -offset = 0 rows, which today are the only rows reached from -front-end allocation paths. This keeps `globalalloc.h` and -`corealloc.h` branch-free at the call site and avoids duplicating -the slab-mask / slab-size arithmetic across files. - -- `globalalloc.h:138-144` (`remaining_bytes`): reads the metaentry, - then unconditionally calls - `snmalloc::remaining_bytes(entry.get_offset_and_sizeclass(), p)`. - No small/large dispatch. -- `globalalloc.h:158-167` (`index_in_object`): same pattern. -- `bounds_checks.h:101` memcpy gate: calls `remaining_bytes(...)`. - Moves to the offset-aware version transitively via the inner-helper - rewrite — no source change here, and no extra branch on the - bounds-check fast path. 
- -Audit of all `is_start_of_object` call sites (verified against the -post-Phase-13 tree via `grep -rn is_start_of_object src/snmalloc`): - -| File:line | Sizeclass source | Pointer source | Action | -|---|---|---|---| -| `corealloc.h:41` (`DefaultConts::success`) | requested-size→sc | allocator-output base | **Keep** — slab-mask check on the allocator's own freshly-returned base is tight enough; pass `sc.raw()`. | -| `override/new.cc:40` (`handler::Base::success`) | requested-size→sc | allocator-output base | **Keep** — same rationale as above; pass `sc.raw()`. | -| `corealloc.h:536` (`dealloc_local_object_meta`) | `entry.get_sizeclass()` | **user input** | **Update** — pass `entry.get_offset_and_sizeclass()`; the helper folds the offset check internally. | -| `corealloc.h:1084` (`dealloc_local_object`) | `entry.get_sizeclass()` | **user input** | **Update** — same: pass `entry.get_offset_and_sizeclass()`. | -| `corealloc.h:1258` | `from_small_class(...)` | small allocation | **Keep** — small-only path; pass `sc.raw()`. | -| `corealloc.h:1438` | `from_small_class(...)` | small allocation | **Keep** — small-only path; pass `sc.raw()`. | - -Additionally, `slab_index` itself has two call sites outside the -`start_of_object` family: - -- `globalalloc.h:231` (`remaining_bytes` wrapper, large-class - arm): calls `slab_index(entry.get_sizeclass(), address_cast(p))`. -- `globalalloc.h:260` (`index_in_object` wrapper, large-class - arm): same shape. - -After the helper-signature refactor these two wrappers fold into -the new `start_of_object(combined, addr)` path entirely (the -combined-index version of `remaining_bytes`/`index_in_object` -calls `start_of_object` internally, which itself dispatches to -`slab_index_via`). Neither wrapper calls `slab_index` directly -post-refactor. 
- -The "Keep" rows on allocator-output base pointers are safe because -the allocator itself always returns the allocation base, which by -construction is slab-aligned (`(addr & info.slab_mask) == 0`) *and* -allocation-start (offset == 0 in pagemap, so combined == -`sc.raw()`). The old `is_start_of_object(sc, addr)` test reduces to -`(addr & info.slab_mask) == 0`, which holds for all such bases -both today and after Phase 15. - -The dealloc-API consumers (rows 3 and 4) get the offset folded -inside the combined index because for non-pow2 large in Phase 15 -every natural-alignment slab boundary *inside* the allocation would -satisfy the old `slab_mask`-only check; the precomputed -`offset_bytes` in the combined row distinguishes the actual -allocation base. These call sites remain gated by -`snmalloc_check_client(mitigations(sanity_checks), ...)`, so the -additional comparison is dead in release/non-checked builds. - -`slab_index` for large: irrelevant — large allocations are a single -"object" of size `sizeclass_full_to_size(sc)`, not a slab of -multiple objects. The refactored `start_of_object` uses -`addr - alloc_start` (offset within the *allocation*, not the slab) -as the dividend; that offset is always smaller than the allocation -size, so the resulting index is 0 for any in-range large pointer. - -### Backend changes - -- `backend.h:131-156` (alloc_chunk small/large dispatch): replace - the single `set_metaentry(p, size, t)` call with the small/large - dispatch described in "Backend write in `alloc_chunk`" above. - Phase 14 keeps `alloc_chunk`'s `bits::is_pow2(size)` assertion - (Phase 15 relaxes it). This is fine: today only pow2 large - allocations reach this site, so `slab_size == size` and offset - is always 0; the entries written by the new large path are - bit-identical to the entries written by the old uniform path. -- `backend.h:172-196` (dealloc_chunk): constructs - `Entry t(nullptr, 0)`, calls `claim_for_backend()`, then - `set_metaentry(p, size, t)`.
The `Entry(nullptr, 0)` - constructor's `ras = 0` clears both the sizeclass and offset - fields. `claim_for_backend()` (`metadata.h:313-317`) sets `ras` - to `REMOTE_BACKEND_MARKER` and only the boundary bit on `meta` is - preserved. The subsequent `set_metaentry` writes the - cleared-ras `Entry` to every pagemap cell in the range. No - further change is needed: the offset is meaningful only while - the chunk is owned by the frontend. - -### `RemoteAllocator` alignment - -`REMOTE_MIN_ALIGN` bumps from 512 B to 4096 B (default config: -`COMBINED_REP_SIZE = 2048`, doubled for the marker, so -`max(CACHELINE, 2048) << 1 = 4096`). - -`RemoteAllocator` (`remoteallocator.h:292-310`) gets its alignment -from its `FreeListMPSCQ` member (`freelist_queue.h`), -which is declared `alignas(REMOTE_MIN_ALIGN)`. So bumping -`REMOTE_MIN_ALIGN` automatically widens `alignof(RemoteAllocator)` -to 4096 with no source change to `RemoteAllocator` itself. - -Verifications (do during step 2): - -1. `sizeof(RemoteAllocator)` does not blow up. The structure is a - small fixed-size queue head plus padding; rounding up to a - 4096-B alignment unit only consumes extra padding in - surrounding containers (allocators, pool slots), not inside - `RemoteAllocator`. -2. `CommonConfig::unused_remote` (`commonconfig.h:119-120`) — a - static `RemoteAllocator` — inherits the new alignment from - `RemoteAllocator`'s natural alignof. Confirm it still compiles - and the linker honours the alignment (compilers do; some older - linkers cap `.bss` alignment, but 4096 is the page size, so it - is universally supported). -3. Per-allocator-pool storage: the pool allocates `Allocator` - instances; each `Allocator` contains a `RemoteAllocator` - (transitively), and the pool's metadata-allocation path is - already aligned to `alignof(Allocator)` via the backend's - metadata allocator. Confirm via inspection that - `Pool::acquire` honours `alignof(Allocator)` after - the bump. -4. 
`unused_remote_address`-style runtime checks (any assertion that - `((uintptr_t)remote & (REMOTE_MIN_ALIGN - 1)) == 0`) — grep for - `REMOTE_MIN_ALIGN` to find them and confirm they pass with the - bumped value. - -## Implementation steps - -Each step must produce a testable result before moving to the next. -Steps are ordered so that earlier steps' tests don't depend on -later steps' code. - -### Step 0: Revert the current (meta-based) Phase-14 implementation - -The current working tree carries a partial, meta-word-based -Phase 14 (`META_OFFSET_BITS`, `META_OFFSET_SHIFT`, -`META_OFFSET_MASK`, `META_FRONTEND_RESERVED_MASK`, `set_offset` / -`get_offset` on `FrontendMetaEntry`, `alignas(...)` on -`FrontendSlabMetadata`, branchless three-parameter -`start_of_object(sc, addr, slab_offset)` / `index_in_object` / -`remaining_bytes` / `is_start_of_object`, three-parameter wrapper -calls in `globalalloc.h` / `corealloc.h` / `override/new.cc` / -`test/func/release-rounding/rounding.cc`, and the small/large -dispatch in `backend.h::alloc_chunk`). The new design replaces all -of this. Revert these files to the pre-Phase-14 head (commit -`1144eab4`), keeping only: -- The new test scaffolding in `src/test/func/memory/memory.cc` - (`test_large_alloc_pointer_recovery`) and - `src/test/func/large_offset/large_offset.cc` — to be updated for - the combined-index API in steps 4 and 6. - -**Gate**: clean build, full ctest suite passes (this is the -pre-Phase-14 head with two test additions that will be updated -later — the additions either compile and pass or are temporarily -gated out until step 4). - -### Step 1: Constants + table widening (no behaviour change) - -> **Implementation note**: the as-shipped design splits the metadata -> table into `start_` / `align_` / `slab_` rather than widening the -> single `fast_` table described below. See Step 7 Outcome for the -> rationale (perf gate). The constants and `(sc, offset)` -> initialisation described here apply to `start_`.
- -Changes: -- `sizeclasstable.h`: add `OFFSET_BITS`, `COMBINED_BITS`, - `COMBINED_REP_SIZE`. Add `offset_bytes` column to - `sizeclass_data_fast`. Widen `fast_` to `COMBINED_REP_SIZE`. - Initialise every `(sc, offset)` cell — non-zero rows duplicate - the `(sc, 0)` row's fields except `offset_bytes = offset * - slab_size`. Add new overload `fast(size_t combined)`. Keep - `fast(sizeclass_t)` unchanged. -- Add `static_assert(max_large_slab_index() < (1 << - OFFSET_BITS))`. - -**Gate**: clean build. All existing tests still pass — nothing -reads `fast(combined)` yet, and the offset = 0 rows of the widened -table are byte-identical to today's rows for callers that index -via `sc.raw()` (whose value lies in `[0, SIZECLASS_REP_SIZE)`). - -### Step 1.5: Per-word backend-reserved mask + lower BIN/RANGE bit positions - -Motivation: today's `BACKEND_RESERVED_MASK = (REMOTE_BACKEND_MARKER -<< 1) - 1` applies symmetrically to both `meta` (Word::One) and -`ras` (Word::Two). That is overly conservative: in backend mode, -the only invariants are -- `meta` must preserve `META_BOUNDARY_BIT` (bit 0) across the - ownership transition (frontend reads it to detect PAL - boundaries), and -- `ras` must keep `REMOTE_BACKEND_MARKER` set while backend-owns - (frontend reads bit MARKER to detect ownership). - -Everything else on both words is free for the backend. Today's -unified mask forces `RED + VARIANT` (which live on `meta`) up to -`BACKEND_LAYOUT_FIRST_FREE_BIT`, i.e., just above the marker -position. After Step 2 moves the marker from bit 8 to bit 11, -those positions become bits 12, 13, 14 — and bit 14 collides with -the `MIN_CHUNK_BITS = 14` unit-address packing in the backend's -buddy-tree pointer storage, tripping the -`BIN_META_MASK < UNIT_SIZE` assertion in -`largearenarange.h:72`. 
- -Changes: - -- `metadata.h`: - - Replace `BACKEND_RESERVED_MASK` with two per-word constants: - - `BACKEND_RESERVED_MASK_WORD_ONE = META_BOUNDARY_BIT` - - `BACKEND_RESERVED_MASK_WORD_TWO = (REMOTE_BACKEND_MARKER << - 1) - 1` (the old value — unchanged in behaviour for `ras`). - - Make `is_backend_allowed_value(Word w, uintptr_t v)` use the - right mask per `w`. - - Change `BackendStateWordRef` to carry the relevant mask (or - its `Word` identity) so its `get()` and `operator=` use the - correct per-word mask. The simplest mechanical change is to - pass the mask into the `BackendStateWordRef` constructor and - store it as a member; `get_backend_word(Word w)` selects the - right mask at the call site. -- `largearenarange.h`: - - Move `RED_BIT_POS` and `VARIANT_SHIFT` down to start at bit 1 - (just above `META_BOUNDARY_BIT`). `RED_BIT_POS = 1`, - `VARIANT_SHIFT = 2`. `BIN_META_MASK = (1<<1) | (3<<2) = 14`. - - Move `LARGE_SIZE_SHIFT` to bit 1 too (it stores the large - chunk count on `Word::One` of unit 2 — same word, same - relaxed reservation). - - The `is_backend_allowed_value(Word::Two, RED_BIT)` assert at - line 75 — RANGE_META_MASK applied to Word::Two of unit 1 - stores bit 1 in the left-child mask region. Bit 1 ≠ bit - MARKER (= 11 in new layout or 8 today), so the marker bit - is not disturbed. Verify the per-word mask check passes for - Word::Two with bit 1 (it should: the new Word::Two mask still - forbids the backend from writing the marker bit, but bit 1 is - not the marker). - - **Note**: After this step, `Word::Two`'s relaxed mask still - requires the backend not to disturb the marker. Today's - Word::Two mask was bits 0..MARKER, which forbade *any* bits - in that range. The relaxed mask forbids only the marker bit - itself. So the backend can now write low bits of `ras` - (sizeclass/offset positions) — those are zero in backend mode - (cleared by `claim_for_backend()`) and overwritten on - ownership transition, so no real change. 
- -**Gate**: clean build. Full ctest suite passes. The marker has -NOT moved yet (still at SIZECLASS_REP_SIZE), so the layout -change is invisible to allocation behaviour; only the -relaxation of asserts and the lowered bit positions for -RED/VARIANT/LARGE_SIZE_SHIFT differ. Run a focused build to -re-trigger the static_asserts in `largearenarange.h` and -confirm they all pass. - -### Step 2: Marker move + ras encoding (no offset writers yet) - -Changes: -- `metadata.h`: change `REMOTE_BACKEND_MARKER` from - `SIZECLASS_REP_SIZE` to `COMBINED_REP_SIZE`. Define - `SIZECLASS_MASK = SIZECLASS_REP_SIZE - 1` and - `COMBINED_MASK = COMBINED_REP_SIZE - 1`. Update - `get_sizeclass()` to mask with `SIZECLASS_MASK` explicitly. - Add `get_offset_and_sizeclass()`. Extend `encode(remote, sc)` to - `encode(remote, sc, size_t offset = 0)`; assert - `offset < (1 << OFFSET_BITS)`. -- Verify alignment chain (RemoteAllocator alignment section - above). If any check fails, fix before continuing. - -**Gate**: clean build (the size-budget `static_assert` in -`largearenarange.h` is the compile-time guard for the marker -shift). All existing tests still pass — every `ras` write still -encodes with `offset = 0` (the new default arg), so every -combined value still equals `sc.raw()`. - -### Step 3: Refactor `slab_index` into `slab_index_via` - -Changes: -- `sizeclasstable.h`: introduce - `slab_index_via(sizeclass_data_fast const& meta, address_t - addr)` carrying the existing body (mask, 64-bit reciprocal-mul, - 32-bit fallback). Make `slab_index(sizeclass_t, addr)` a - one-line wrapper over `slab_index_via`. - -**Gate**: clean build. All existing tests still pass — pure -refactor. - -### Step 4: Switch helpers to combined index - -Changes: -- `sizeclasstable.h`: change `start_of_object`, `index_in_object`, - `remaining_bytes`, `is_start_of_object` to take a single - `size_t combined` parameter. 
Body uses `fast(combined)` and - reads `meta.offset_bytes`; recovery is - `(addr & ~slab_mask) - offset_bytes`. Mark `index_in_object` - and `remaining_bytes` `SNMALLOC_FAST_PATH`. -- Update all call sites per the audit table above: - - `globalalloc.h:138-144`, `:158-167`: pass - `entry.get_offset_and_sizeclass()`. - - `globalalloc.h:231,260` (`slab_index` direct callers): fold - into the new `start_of_object`-based path (large arm of - `remaining_bytes` / `index_in_object` now goes through - `start_of_object(combined, addr)` and no longer calls - `slab_index` directly). - - `corealloc.h:41`, `override/new.cc:40`, - `corealloc.h:1258`, `corealloc.h:1438`: pass - `sc.raw()`. - - `corealloc.h:536`, `corealloc.h:1084`: pass - `entry.get_offset_and_sizeclass()`. - - `src/test/func/release-rounding/rounding.cc`: pass - `sc.raw()`. - -**Gate**: clean build. Full ctest suite passes. All combined -values are still `sc.raw()` because no offset writer exists yet -(step 5). - -### Step 5: Backend `alloc_chunk` writes per-chunk offset - -Changes: -- `backend.h::alloc_chunk` (~lines 131-156 today): keep the - `slab_size >= size` fast path using `set_metaentry` (offset = 0 - for every chunk). Add the multi-slab-tile branch (currently - dormant — only reached after Phase 15) that loops over chunks - and writes `ras_i = encode(remote, sc, slab_index)` via - `concretePagemap.set`. - -**Gate**: clean build. Full ctest suite passes — every Phase-14 -allocation today is single-slab-tile, so the new branch is -dormant. - -### Step 6: Targeted test for the per-chunk offset write - -Add `src/test/func/large_offset/large_offset.cc` per the -"Targeted test" subsection of "Final acceptance gates" below; -this exercises the multi-slab-tile write path by calling -`Config::Backend::alloc_chunk` directly with a synthetic -non-pow2 sizeclass. - -**Gate**: the new test passes; full suite still passes. 
- -### Step 7: Performance gate - -Run `perf-external_pointer` and `perf-large_alloc` on -`build-rel-base` vs `build-rel-p14`, 10× medians, per -`.github/skills/building_and_testing.md`. Compare against the -baseline-noise band measured pre-Phase-14. - -**Gate**: `perf-external_pointer` and `perf-large_alloc` within -noise of baseline (no statistically significant regression). -Disassemble `__malloc_start_pointer` to confirm: one `ras`-word -load, mask + table lookup with `offset_bytes`, no `meta`-word -load on the recovery path, no `imul`. - -**Outcome**: gate met after splitting the sizeclass metadata table -into three by purpose, plus an offset-aware branch in -`start_of_object`. - -1. Three tables, replacing the previous `fast_`/`slow_` pair: - - `start_` (4 × size_t = 32 B/row, indexed by - `offset_and_sizeclass_t`): `size`, `slab_mask`, `div_mult`, - `offset_bytes`. Power-of-two stride keeps the - `__malloc_start_pointer` index calc to a single `ubfiz #5`, - matching the baseline shape. - - `align_` (2 × size_t = 16 B/row, indexed by `sizeclass_t`): - `slab_mask` (duplicated), `mod_zero_mult`. - `is_start_of_object` reads both fields from one row instead of - straddling two tables; cold in `-fast` builds. - - `slab_` (2 × uint16 = 4 B/row, indexed by `sizeclass_t`): - `capacity`, `waking`. Slab init thresholds; cold. -2. `start_of_object` branches on `osc.offset() == 0` (testable from - bits already loaded in the `ras` word, before any metadata-table - access). The common arm skips the `offset_bytes` field load and - the offset-shift arithmetic; the slow arm handles non-pow2 large - interior chunks. Branch fully predicted on small-allocation - workloads. - -Without these refinements `perf-external_pointer-fast` regressed by -~24% (median ~360 ms vs baseline ~290 ms). With them, median -~290 ms — within noise of baseline. 
`perf-singlethread-check` -(exercises `is_start_of_object` on every dealloc) is also within -noise: identical 9-instruction codegen, now reading from the -narrower `align_` rows (4-per-cache-line vs the baseline's -2-per-cache-line). - -## Final acceptance gates - -1. **Build**: clean build passes. The new `static_assert` in - `sizeclasstable.h` (max large slab index < `1 << OFFSET_BITS`) - guards the OFFSET_BITS choice. The size-budget assert in - `largearenarange.h` (`(MAX_SIZE_BITS - MIN_SIZE_BITS) + - LARGE_SIZE_SHIFT <= bits::BITS`) guards the upward shift of - backend bits. -2. **Full ctest suite**: all existing tests pass. Front-end still - issues pow2 large requests, so for every materialised large - allocation `info.align == size` and offset is always 0 — the - combined index for every entry equals `sc.raw()`, indexing the - offset = 0 row of `start_`, which is bit-identical to the - pre-split row layout. -3. **`src/test/func/release-rounding/rounding.cc`** continues to - pass — small path unchanged; large path uses offset = 0 always. -4. **Extend `src/test/func/memory/memory.cc`** with a - `large_alloc_pointer_recovery` test (public-API path): - - Allocate several large sizes via the public API. For each - allocation `p` of requested size `S_req`, the actual reservation - in Phase 14 is `S_res = bits::next_pow2(S_req)` (front-end is - still pow2-only). For each: - - For every chunk offset `k * MIN_CHUNK_SIZE` for - `k = 0..S_res/MIN_CHUNK_SIZE - 1`, assert - `remaining_bytes(p + k * MIN_CHUNK_SIZE) == S_res - k * - MIN_CHUNK_SIZE`. The public `remaining_bytes` routes through - `index_in_object` and therefore consumes the - combined index from the pagemap entry; any miscalculation - in `offset_bytes` would produce a wrong residual. 
- - For every interior address `q = p + j` with `j ∈ {0, 1, - S_res/2, S_res-1}`, assert - `address_cast(snmalloc::external_pointer( - reinterpret_cast(q))) == p` (offset-aware public API, - which uses `index_in_object` → pagemap entry → combined - index → `offset_bytes` subtraction). -5. **New test or extension** to exercise the non-zero offset write - path directly (Phase 14 is otherwise un-tested with non-zero - offsets, because the front-end is still pow2-only). Path: - - Add a test in `src/test/func/large_offset/large_offset.cc` - that calls `Config::Backend::alloc_chunk` directly. The test - obtains a `LocalState&` from a constructed `snmalloc::Allocator` - via its public `get_backend_local_state()` accessor - (`corealloc.h:378`). - - Sizeclass selection: pick a non-pow2 large `sc` via - `sizeclass_t::from_raw(raw)` for a raw index whose - `sizeclass_metadata` entry has non-pow2 `size` but a smaller - `slab_mask` (= `info.align - 1`). These entries are - table-populated in Phase 13 and unreachable from the public - allocation API, but they are usable here because - `alloc_chunk`'s sizeclass argument is only consulted in the - pagemap write loop (which is what we want to exercise). - - Size argument: `alloc_chunk` asserts `bits::is_pow2(size)` - (`backend.h:95`). Pass `bits::next_pow2(sizeclass_full_to_size(sc))` - so the assert holds. This is *larger* than the sizeclass's - `size`, but the pagemap write loop iterates over the - passed-in pow2 region, computing per-chunk offsets via - `chunk_offset / slab_size` (where `slab_size = - sizeclass_full_to_slab_size(sc) < size`). Non-zero offsets - are therefore written for all chunks past the first slab. - - `ras` argument: construct via - `Config::PagemapEntry::encode(nullptr, sc)` (see - `metadata.h:211-219`), which matches how the front end builds - `ras` in `corealloc.h:723-728`. Avoids hard-coding the bit - layout in the test. 
The per-chunk `alloc_chunk` loop re-encodes - `ras` per chunk with the appropriate offset. - - Capability handling: `alloc_chunk` returns - `capptr::Chunk` (`backend.h:89-93`). Use - `address_cast(chunk)` for pagemap/start-of-object checks. - Before calling `dealloc_chunk`, convert via - `capptr_chunk_is_alloc(capptr_to_user_address_control(chunk))` - to get the `capptr::Alloc` it expects. - - Verify: - - For each chunk in the pow2 region: - - `Config::Backend::get_metaentry(address_cast(chunk) + - k * MIN_CHUNK_SIZE).get_offset_and_sizeclass()` decomposes - as `sc.raw() | (expected_slab_idx << SIZECLASS_BITS)` - where `expected_slab_idx = (k * MIN_CHUNK_SIZE) / - sizeclass_full_to_slab_size(sc)`. - - The same entry's `get_sizeclass()` (low-bits-only mask) - still returns `sc`. - - `address_cast(snmalloc::external_pointer(reinterpret_cast(address_cast(chunk) + - interior_offset))) == address_cast(chunk)` for a sample of - interior addresses spanning multiple slabs (one address - per slab boundary, plus mid-slab). `external_pointer` - routes through `index_in_object` which consults the - pagemap entry's combined index and the precomputed - `offset_bytes`. - - Then `Config::Backend::dealloc_chunk` with the *same* pow2 - size, and verify all chunks' offsets are cleared - (`get_offset_and_sizeclass() == 0`) — the dealloc path - constructs `Entry(nullptr, 0)`, whose `ras = 0` clears the - combined-index field entirely. -6. **Backend-bit-preservation test**: with the synthetic-sizeclass - test from (5) in place, allocate a region whose pow2 size spans - a PAL-allocation boundary so the backend has set bits in `meta` - and (after the move) the upper bits of `ras`. Verify the - boundary bit and other backend-owned bits survive the per-chunk - frontend write loop. 
(This is implicitly already covered by the - existing ctest suite — every multi-PAL-chunk allocation today - already does this, just without per-chunk offset writes — but - the explicit large_offset test makes the guarantee local.) - -## Risks - -1. **`RemoteAllocator` alignment bump.** `REMOTE_MIN_ALIGN` rises - from 512 B to 4096 B. Mitigation: verify the structure size and - pool-storage alignment annotations before changing the - constant; bump pool alignment if needed. Caught at runtime by - the existing `snmalloc_check_client` assertions on `ras` - pointer-bit-extraction, and by misaligned-pointer crashes in - message-passing. -2. **Backend bit budget.** `MAX_SIZE_BITS - MIN_SIZE_BITS + - LARGE_SIZE_SHIFT <= bits::BITS` (the assert in - `largearenarange.h:68-70`). With `LARGE_SIZE_SHIFT` - auto-shifted up by `OFFSET_BITS`, default config goes from ~44 - to ~47 bits used, still ≤ 64. The assert is the gate. -3. **Combined-index table size.** The combined-index `start_` table - holds `1 << OFFSET_BITS` × `sizeof(sizeclass_data_start)` more - rows than the original sizeclass-indexed table. Default: 8 × 32 B - × `SIZECLASS_REP_SIZE` ≈ 64 KB. Acceptable for an L2-resident - metadata table; if `INTERMEDIATE_BITS` is raised to 3 - (`OFFSET_BITS = 4`) the table grows to ~128 KB — also - acceptable. -4. **Encode-time offset overflow.** `encode(remote, sc, offset)` - asserts `offset < (1 << OFFSET_BITS)`. The `alloc_chunk` loop - bounds `slab_index` to `size / slab_size`, which is bounded by - the worst-case slab count for the chosen sizeclass — the same - bound the `static_assert` on `OFFSET_BITS` enforces. Caught at - build time by `static_assert`, at runtime by the encode assert. -5. **Combined-index masks elsewhere.** Anywhere that previously - masked `ras` by `SIZECLASS_REP_SIZE - 1` (or equivalent) to - extract a sizeclass needs an audit: does it want pure sizeclass - (`SIZECLASS_MASK`) or combined (`COMBINED_MASK`)? 
Grep for - `SIZECLASS_REP_SIZE`, `(0xff)` style masks on `ras`, and - `get_sizeclass()` callers. Convert each deliberately. The - primary risk site is the backend's claim/release flow, which is - already gated on the marker bit and so unaffected. - -## Out of scope - -- Front-end requesting non-pow2 large sizes (Phase 15). -- Per-chunk offset for small allocations (small uses slab_mask - recovery, no per-chunk offset needed). -- Configs where `slab_size < MIN_CHUNK_SIZE` (multiple logical - slabs per pagemap entry). The default `INTERMEDIATE_BITS = 2` - config does not hit this. Deferred to a future phase if needed. - -## Performance characterisation - -Goal: the layout-aware design should bring `perf-external_pointer` -back to baseline (or within noise). The two costs the previous, -`meta`-word-based, Phase 14 design carried — - -- one extra 8-byte load of the `meta` word per `external_pointer` - query, just to extract the offset; and -- one `imul` for `offset * slab_size` on the critical path — - -are both eliminated: - -- The combined index is the same `ras` word already loaded for the - sizeclass; masking with `COMBINED_MASK` is a single `and`-with-imm. -- `offset_bytes` is a table column; the subtraction is a load + a - sub, with no multiplication. - -`perf-large_alloc` is unchanged from the prior fix (single-slab-tile -fast path keeps `set_metaentry` as before; the per-chunk loop is -dormant until Phase 15). `perf-singlethread` and `perf-memcpy` were -within noise before and should remain so. - -Measure with five-run medians on `build-rel-base` (commit -`1144eab4`) vs `build-rel-p14` (head + Phase 14 layout-aware), per -the perf workflow in `.github/skills/building_and_testing.md`. If -`perf-external_pointer` is not within noise of baseline, -disassemble the new `__malloc_start_pointer` to confirm the load -count matches baseline (one 8-byte load of the pagemap `ras` word, no -`meta` word load, no `imul`).
- -# Pre-Phase-15: compile-time aligned dealloc overload - -## Goal - -Fix a pre-existing latent bug in the compile-time templated alloc / -dealloc API. This is independent of Phase 15 and is committed as a -sibling commit before Phase 15 begins. - -## The bug - -`globalalloc.h:341-356` `alloc` applies -`aligned_size(align, size)` internally: -``` -constexpr size_t sz = aligned_size(align, size); -… alloc(sz); -``` - -`globalalloc.h:394-399` `dealloc<size>(p)` does not — it passes the -raw `size` to `check_size`: -``` -template <size_t size> -SNMALLOC_FAST_PATH_INLINE void dealloc(void* p) -{ - check_size(p, size); - … -} -``` - -When the alignment-driven upgrade pushes the alloc into a different -sizeclass than `size` itself, `check_size` fires. Concretely today -(pre-Phase-15), with `S = 33 KiB`, `A = 128 KiB`: - -- `alloc<33 KiB, Uninit, 128 KiB>()` → `aligned_size(128 KiB, 33 KiB) - = 128 KiB` → pagemap `sc(128 KiB)`. -- `dealloc<33 KiB>(p)` → `check_size(p, 33 KiB)` → - `size_to_sizeclass_full(33 KiB) = sc(40 KiB)` (sc(64 KiB) once - Phase 15 lands). -- Mismatch — `check_size` fires under `mitigations(sanity_checks)`. - Verified on `main` with a manual reproducer: - `Dealloc rounded size mismatch: 0xa000 != 0x20000`. - -The bug exists in `main` today; it does not require Phase 15. Phase -15 lowers the threshold (more (A, S) pairs cross a sizeclass -boundary) but does not introduce the asymmetry. - -## Fix - -Merge `dealloc` into a single template with `align` defaulted -to 1, so the same body handles both calling forms: -``` -template <size_t size, size_t align = 1> -SNMALLOC_FAST_PATH_INLINE void dealloc(void* p) -{ - constexpr size_t sz = aligned_size(align, size); - check_size(p, sz); - ThreadAlloc::get().dealloc(p); -} -``` -`aligned_size(1, size) == size` for all `size`, so existing -single-argument `dealloc<size>(p)` callers are bit-equivalent to -their previous behaviour.
- -To make `aligned_size` reachable from the test library header (which -deliberately avoids pulling in the full runtime sizeclass tables), -move its definition from `sizeclasstable.h` to `sizeclassstatic.h`. -The function is a pure compile-time-friendly utility — it depends -only on `is_small_sizeclass`, `bits::is_pow2`, and the SNMALLOC_* -macros, all of which are already available in `sizeclassstatic.h`. -Consumers of `aligned_size` previously included via `sizeclasstable.h` -still pick it up transitively through the existing include chain -(`pal.h` → `ds_core.h` → `sizeclassstatic.h`). - -Apply the same merge in the test library: -- `template void dealloc(void* p)` - replaces the previous `template` testlib overload. -- `template void* alloc()` - replaces the previous two-parameter testlib `alloc`. The body - computes `sz = aligned_size(align, size)` and routes to the - small/large path based on `sz`. - -## Test - -`src/test/func/aligned_dealloc/aligned_dealloc.cc`, listed in -`TESTLIB_ONLY_TESTS` so it is compiled once and linked against both -testlib flavours. - -- Includes `test/snmalloc_testlib.h` only — exercises the public - templated `alloc` / `dealloc` - surface through the testlib layering. -- The canonical reproducer `(S = 33 KiB, A = 128 KiB)` fires the bug - on `main` under the `check` flavour. Confirmed by hand before the - fix. -- Additional `(S, A)` pairs cover a small-to-large alignment upgrade, - a wider gap, the `align == size` baseline, and a small natural - alignment case. - -## Gate - -1. Build clean. -2. New test passes under both `fast` and `check`. -3. Full ctest suite green. -4. Pre-commit review loop. -5. Commit approval. - -After this commit lands, Phase 15 begins on top of it. - -# Phase 15: Front-end requests non-pow2 large allocations - -## Goal - -Flip the front-end so that large allocations request exactly the -sizeclass-encoded size (chunk-multiple, exp+mantissa-rounded), -instead of always the next power of two. 
This is the long-running -goal of the refactor: the backend (`LargeArenaRange`) has -supported arbitrary chunk-multiple sizes since Phase 10–12, the -sizeclass encoding has supported non-pow2 large since Phase 13, -and the per-chunk offset machinery has supported pointer recovery -since Phase 14. - -Effect: a request for e.g. 70 KiB on the default config -(`INTERMEDIATE_BITS = 2`) currently reserves 128 KiB (next pow2); -after Phase 15 it reserves 80 KiB (the next exp+mantissa class, -saving ~37.5%). A request for 96 KiB + 1 byte currently reserves -128 KiB; after Phase 15 it reserves 112 KiB. Sizes that already -land on a class boundary (e.g. 80 KiB, 96 KiB) reserve exactly -their requested size where today they reserve the next pow2. Net -effect across workloads is a reduction of large-allocation -footprint up to ~37.5% for sizes that fall mid-exponent. - -## Why now - -Phase 14 added the per-chunk offset write in `Backend::alloc_chunk`, -the three-table sizeclass metadata split (`start_` / `align_` / -`slab_`), and the offset==0 fast-path branch in `start_of_object`. -All of this is dormant on the front-end today because -`large_size_to_chunk_size(size) = next_pow2(size)` means every -materialised large allocation has `offset = 0` in every chunk. The -Phase 14 `large_offset` test reaches the per-chunk path via the -public *backend* API to confirm the dormant code is correct; Phase -15 is what makes the front-end actually exercise it. - -## Pre-flight verification - -Before implementing, confirm these Phase 14 facts (all true today — -listed so reviewers can re-check): - -- `bits::to_exp_mant(v)` - ceil-encodes (`v = v - 1; …`), so passing the raw size (not - `next_pow2(size)`) maps to the smallest enclosing sizeclass. -- `Backend::alloc_chunk` currently asserts - `bits::is_pow2(size)`.
The Phase 14 pagemap loop advances by - `slab_size = sizeclass_full_to_slab_size(sizeclass)`, so the - correct precondition is `size >= slab_size` *and* - `(size & (slab_size - 1)) == 0`. Both already hold by - construction for front-end calls because - `size = sizeclass_full_to_size(sc)` and `slab_size = size & -size` - is the largest pow2 divisor of `size`; the loop terminates - exactly at `size`. We will tighten/relax the assert to match. -- The Phase 14 assert that `ras`'s offset bits are zero on entry - to `alloc_chunk` continues to hold: front-end calls - `PagemapEntry::encode(remote, sc)` with default `offset = 0`. -- `ArenaBins::carve` returns a base aligned to - `info.align = size & -size` (the largest pow2 divisor of size, - set in the bin-table ctor at `arenabins.h:742`). For a - 96 KiB request that is 32 KiB = `slab_size` = - `sizeclass_full_to_slab_size(sc)` — exactly what - `start_of_object`'s `addr & ~slab_mask` requires. -- `globalalloc::remaining_bytes` / `index_in_object` already route - through `entry.get_offset_and_sizeclass()` (committed in Phase - 14's API cleanup), so they will pick up non-zero offsets - automatically once the front-end produces them. - -## Changes - -### `src/snmalloc/ds/sizeclasstable.h` - -- `size_to_sizeclass_full(size)`: large branch calls - `to_exp_mant(size)` - directly. The encoding's ceil semantic selects the smallest - sizeclass whose size is `>= size`. -- `large_size_to_chunk_size` is removed. After the change above it - would just be `sizeclass_full_to_size(size_to_sizeclass_full(size))`, - which is exactly what `round_size` returns on the large branch; the - one in-tree caller (`corealloc.h` large path) is hoisted to use - `sizeclass_full_to_size(sc)` directly with a single `sc` lookup, so - the wrapper carries no remaining work. -- `round_size(size)`: large branch returns - `sizeclass_full_to_size(size_to_sizeclass_full(size))`. 
This is - correctness-critical because `DefaultConts::success` in - `corealloc.h:34-47` uses `round_size` to determine the zeroing - range for `calloc`. Without it `calloc` would zero beyond the - actual reservation. -- `compute_max_large_slab_index` tightens its bound to - `meta.size / slab_size - 1` (the actual worst case the runtime - loop writes). The previous `next_pow2(meta.size) / slab_size - 1` - overestimates now that no caller reserves `next_pow2(size)`. -- Doc-comments on `size_to_sizeclass_full` and `round_size` describe - the exp+mantissa rounding. - -### `src/snmalloc/backend/backend.h` - -- `alloc_chunk` precondition: the slab-tile invariant - ``` - const size_t slab_size = sizeclass_full_to_slab_size(sizeclass); - SNMALLOC_ASSERT(size >= slab_size); - SNMALLOC_ASSERT((size & (slab_size - 1)) == 0); - ``` - matches the pagemap loop's stride exactly and is the minimum - required for the per-chunk write to terminate at `size`. The - previous duplicate `size >= slab_size` assert inside the loop is - consolidated. -- The offset-bits-zero assert on `ras` stays — the front-end uses - `encode(remote, sc)` with default offset 0. -- Loop comment describes `size` as a multiple of `slab_size` with - `size >= slab_size`. - -### `src/snmalloc/global/globalalloc.h` - -No change. The runtime sized-dealloc check is correct because every -legitimate caller pre-applies `aligned_size`: - -- Unaligned `sized_dealloc(p, S)`: alloc was `malloc(S)`, which goes - through `size_to_sizeclass_full(S)`; the dealloc check evaluates - the same function on the same `S`. Same sizeclass. -- Aligned `sized_dealloc(p, S, A)` (line 401): computes - `aligned_size(A, S)` *before* calling `check_size`. -- `rust.cc:33` and `rust.cc:51`: both apply `aligned_size` before - the 2-arg `dealloc(ptr, size)` path. -- `jemalloc_compat::sdallocx`: ignores the size argument. 
- -A 2-arg `sized_dealloc(p, S)` after `aligned_alloc(A, S)` with -`aligned_size(A, S) > S` would mismatch — but that is a client bug: -the client should use the 3-arg form for aligned allocations. - -The compile-time `alloc` / `dealloc` -asymmetry is being fixed in the **pre-Phase-15 sibling commit** -(see the "Pre-Phase-15: compile-time aligned dealloc overload" -section above). Phase 15 does not touch `globalalloc.h`. - -### `src/snmalloc/mem/corealloc.h` - -- Large-alloc handler at lines 723-728 currently invokes - `size_to_sizeclass_full(size)` three times and - `large_size_to_chunk_size(size)` once. Hoist into locals so the - table lookups happen once: - ``` - const auto sc = size_to_sizeclass_full(size); - const size_t chunk_sz = sizeclass_full_to_size(sc); - auto [chunk, meta] = Config::Backend::alloc_chunk( - self->get_backend_local_state(), - chunk_sz, - PagemapEntry::encode(self->public_state(), sc), - sc); - ``` - - Phase 15 still leaves the large path through the same handler; - the hoist removes duplicated work on the large-allocation path - rather than changing any small-allocation hot loop. - -### `src/snmalloc/backend_helpers/smallbuddyrange.h:232` and similar - -- `alloc_range_with_leftover` uses `bits::next_pow2(size)` to size - its parent request. This range serves the *meta-data* allocator, - not the user object range — meta_size is always pow2 (line 203 - of `backend.h` already calls `next_pow2(sizeof(SlabMetadata) + - extra_bytes)`). No change needed; verify by inspection that the - call site is not on the user-large path and note the conclusion - in the commit. - -### Tests - -- The existing `src/test/func/large_offset/large_offset.cc` test - exercises the per-chunk path via the *backend* API. Phase 15 - flips the *front-end* to do the same. The test's header - comment (lines 5-9) currently says "currently only issues pow2 - large requests" and that `alloc_chunk` "asserts pow2"; both - become false after Phase 15.
Update the comment to describe - this test as the *low-level* / *backend-API* counterpart of the - new front-end test. - -- Add a sibling test `src/test/func/large_offset_frontend/` that - exercises a *bounded* set of representative large sizeclasses - (smallest non-pow2 large class, two mid-range classes spanning - different exponents, one near `MAX_LARGE_SIZECLASS_SIZE` only if - the total allocation is well under the available test-time - address budget — cap at a few MiB per allocation). For each - selected sizeclass `sc` where - `sizeclass_full_to_size(sc) != sizeclass_full_to_slab_size(sc)`: - - Call `malloc(sizeclass_full_to_size(sc))`, save `p`. Assert - `is_start_of_object(p)`. - - For every chunk offset `j * MIN_CHUNK_SIZE` with - `j ∈ [1, size_full / MIN_CHUNK_SIZE)`, assert - `external_pointer(p + j * MIN_CHUNK_SIZE) == p` and - `remaining_bytes(p + j * MIN_CHUNK_SIZE) == size_full - j * - MIN_CHUNK_SIZE`. - - Assert `malloc_usable_size(p) == size_full` (the new actual - reservation, not `next_pow2(size_full)`). - - Free, then re-allocate and confirm address re-use behaves - sanely. - - Also allocate a *non-boundary* request between adjacent class - sizes (e.g. `malloc(size_full - 1)` for a non-pow2 class, - `malloc(prev_class + 1)`) and assert `malloc_usable_size(p)` - equals `size_full` — this is what proves the raw request maps - to the smallest enclosing class. - - Pure table-level properties (every large sizeclass round-trips - through `size_to_sizeclass_full` ∘ `sizeclass_full_to_size`) - can be checked without allocating; loop over the full large - range there. - -- `src/test/func/sizeclass/sizeclass.cc` lines 160-175 currently - assert that a non-pow2 large size strictly between adjacent - pow2 rounds to the next pow2. Phase 15 changes this: a non-pow2 - size now rounds to the next exp+mantissa class. Compute the - expected value independently of the function under test — scan - the representable large classes (e.g. iterate sizeclasses 0 .. 
- `NUM_LARGE_CLASSES`) and pick the smallest `sizeclass_full_to_size(sc) >= mid`. - Then assert `size_to_sizeclass_full(mid)` equals that sizeclass - and `sizeclass_full_to_size(size_to_sizeclass_full(mid))` equals - the independently-computed class size. Update the comment - ("pow2 rounding still in force") accordingly. The surrounding - `b == ENCODED_ADDRESS_BITS` bound logic stays. - - **Add a deterministic `round_size` regression gate alongside.** - For each representable large sizeclass `sc` with size `S = - sizeclass_full_to_size(sc)`, and `S_prev` the previous class - size, assert: - - `round_size(S) == S` - - `round_size(S_prev + 1) == S` (i.e. the request is rounded - to the smallest enclosing class, not blown up to the next - pow2). - - `large_size_to_chunk_size(S_prev + 1) == round_size(S_prev + 1)` - (the chunk-size and round-size views agree). - - This is the primary `round_size` gate. If `round_size` is left - as `next_pow2`, these assertions fail deterministically — unlike - the calloc zeroing smoke test below, which may not fault when - `memset` overruns into backend free range. - -- `src/test/func/release-rounding/rounding.cc` lines 86-127 - exercise pow2 large sizes end-to-end via - `index_in_object`/`is_start_of_object`. Phase 15 does not - change behaviour for pow2 sizes (they still round to themselves), - so this loop continues to pass unchanged. Optionally extend - the loop with a non-pow2 case (e.g. `mid = S + (S >> 2)`) to - exercise the new front-end-materialised non-pow2 classes. - -- `src/test/func/malloc/malloc.cc:82-87` uses - `natural_alignment(size)` symbolically. Because - `natural_alignment` derives from `round_size`, the test - auto-tracks Phase 15: a 96 KiB alloc now reports 32 KiB - alignment (today: 128 KiB). No code change in the test, but - cross-check that no test elsewhere hard-codes "pow2 large - alignment". 
- -- `src/test/func/statistics/` (and any other test asserting - per-sizeclass alloc counts): verify the assertion model does - not assume pow2 large counts. Inspection-only first; update - only if tests fail. - -- **Calloc zeroing correctness smoke test.** The existing calloc - tests (`memory.cc::test_calloc_16M`, `test_calloc` loop in - `malloc.cc`) mostly use sizes that round to a pow2 reservation - even today, so they would not catch `round_size` being left as - `next_pow2` after Phase 15. Add a test in - `src/test/func/memory/memory.cc` that calls `calloc(1, S)` for - a non-pow2 large class size `S` and asserts - `malloc_usable_size(p) == S` and that every byte in `[p, p + S)` - is zero. This is a smoke test only — the deterministic gate for - the `round_size` regression lives in `sizeclass.cc` (above) - because a `memset` overshoot into backend free range may not - fault and would not be caught by zeroing the visible range. - -## Test gates - -1. **Build**: clean build passes. The `static_assert` chain from - Phase 14 is unchanged — `compute_max_large_slab_index` in - `sizeclasstable.h:419-437` still uses - `bits::next_pow2_const(meta.size)`, which is *conservative* - under Phase 15 (the front-end now reserves at most that much, - often less), so the budget bound continues to hold. -2. **Full ctest suite**: all 88 existing tests pass after - expectation updates in `sizeclass.cc`. Tests exercising large - allocations now allocate exp+mantissa-rounded chunk sizes; - reservation footprint shrinks; functional results unchanged. -3. **New `large_offset_frontend` test** passes — per-chunk offsets - are now produced by the front-end and recovered by - `external_pointer` / `remaining_bytes`. -4. **`perf-external_pointer-fast`**: median within noise of the - Phase 14 baseline (~290 ms on the dev machine). 
The hot path - for small allocations is unchanged; the only change in - instruction count comes from `__malloc_start_pointer` for - non-pow2 large allocations, which now exercises the slow arm of - the `offset == 0` branch added in Phase 14 — but only for - genuinely non-pow2 allocations, of which the benchmark has - none. -5. **`perf-singlethread-check`**: within noise. -6. **Memory footprint**: a synthetic benchmark allocating - `malloc(96 KiB)` × N reports peak RSS lower by ~25% vs the - pre-Phase-15 baseline. (Optional diagnostic; not a gate.) - -## Risks - -1. **`calloc` zeroing range overshoot**. Mitigated by updating - `round_size` for large. Verify by inspecting - `corealloc.h:34-47` (`DefaultConts::success`) — must zero - exactly the reservation size returned by `round_size`. The new - non-pow2 calloc test in `memory.cc` is the regression gate. -2. **External clients assuming pow2-aligned large allocations.** - `natural_alignment` automatically reports the reduced - alignment, but any external code that hard-codes "large allocs - are pow2-aligned" silently breaks. Document in the commit - message; consider a release note if there is a CHANGELOG. -3. **`aligned_alloc` overflow at extreme sizes.** `aligned_size` - already handles SIZE_MAX overflow; behaviour unchanged. -4. **Performance regression on the front-end alloc path.** - `next_pow2(size)` is replaced by `to_exp_mant(size)` plus a - table lookup. Both are constant-time and small; perf gate - confirms no regression. -5. **External pagemap / fixed-region builds.** The fixed-region - tests (`src/test/func/fixed_region/`, - `src/test/func/external_pagemap/`) construct allocations via - different paths. Re-run them in the full suite. -6. **Statistics counters.** `func-statistics` checks per-sizeclass - counts. Verify the test doesn't hard-code "every large is - pow2". - -## Out of scope - -- Reducing `INTERMEDIATE_BITS` to gain bits in the sizeclass tag - (Phase 13 chose the existing value). 
-- Generalising small allocations (already exp+mantissa). -- Any change to `alloc_range` / `dealloc_range` of arbitrary - byte-multiples — front-end always rounds via the sizeclass - encoding before reaching the backend. -- Removing the offset==0 fast-path branch in `start_of_object`. - After Phase 15 the slow arm is reachable from the front-end, but - the branch is fully predicted on small-allocation workloads - (which dominate the benchmark) and the slow arm's cost is small. - -## Implementation order (every step has a test gate) - -1. **Front-end flip + `alloc_chunk` precondition + frontend test - in a single commit.** This is one atomic refactor: the - precondition cannot be relaxed safely until the front-end has - reasons to call with non-pow2 sizes, and the front-end flip - cannot be exercised end-to-end without the precondition - relaxation. Files touched in this commit: - - `src/snmalloc/ds/sizeclasstable.h`: drop `next_pow2` from - `size_to_sizeclass_full`; rewrite `large_size_to_chunk_size` - and `round_size` per the "Changes" section; update doc - comments. - - `src/snmalloc/backend/backend.h`: replace - `alloc_chunk`'s `is_pow2(size)` precondition with the - slab-tile invariant; rewrite the surrounding comment. - - `src/snmalloc/mem/corealloc.h`: hoist the duplicated - `size_to_sizeclass_full(size)` / `large_size_to_chunk_size` - calls in the large-alloc path (lines 723-728) into locals. - - `src/test/func/large_offset_frontend/`: new test (the - gate). Covers per-chunk pagemap recovery and non-boundary - requests. - - `src/test/func/large_offset/large_offset.cc`: update header - comment now that the backend-API and front-end exercise the - same path. - - `src/test/func/sizeclass/sizeclass.cc`: update the - non-pow2-rounds-to-next-pow2 expectation at lines 160-175. - - `src/test/func/memory/memory.cc`: add non-pow2-large calloc - test (the `round_size` regression gate). 
- Gate: full ctest suite passes including - `large_offset_frontend` and the new calloc test. - -2. **Perf gate** (per the perf-gate protocol from Phase 14): - measure `perf-external_pointer-fast` and - `perf-singlethread-check` against the Phase-14 baseline (~290 - ms / ~580 ms median); 5 runs × 3 reps each; report median + - range. If a regression is found, root-cause via perf annotate - before committing — do not paper over with workarounds. - -3. **Mandatory pre-commit review loop** before the commit. - -# Review plan for Phases 13–15 - -Per claude.md "Mandatory review checkpoints": - -1. After this plan is written (now), run the rubber-duck review - pass on Phases 13–15 — read the plan + existing - `sizeclasstable.h`, `metadata.h`, `corealloc.h`, - `backend.h`, and confirm: - - Assumptions about bit availability in `FrontendMetaEntry` - (especially `alignof(SlabMetadata)`) are correct. - - No phase has a hidden cross-phase dependency that breaks the - "each phase ends with passing tests" invariant. - - The SIZECLASS_BITS widening doesn't break MetaEntry encoding. - - Tests proposed for each phase actually gate the right - invariants. -2. Address findings; present revised plan for explicit user - approval before any code changes. -3. After implementation of each phase, run the build/test subagent - per `.github/skills/building_and_testing.md`. -4. After all three phases land, pre-PR review (mandatory checkpoint). diff --git a/docs/Arena.md b/docs/Arena.md new file mode 100644 index 000000000..522ddf4c2 --- /dev/null +++ b/docs/Arena.md @@ -0,0 +1,156 @@ +# The Arena: A Bitmap-Indexed Coalescing Range + +`Arena` is snmalloc's address-space range that stores free blocks at their +**natural** size — no power-of-two rounding — and serves any request from the +full snmalloc size-class sequence. It sits in the per-thread range pipeline +underneath the slab caches and replaces the historical buddy-based ranges. + +This document is the conceptual introduction. 
For where `Arena` plugs into +the wider range chain, see [`AddressSpace.md`](AddressSpace.md). + +## The problem + +A buddy allocator only stores power-of-two blocks. A request for 5 chunks +must be served from an 8-chunk buddy block, wasting 3 chunks. We wanted a +range that + +* stores blocks at their actual size, +* uses snmalloc's full `(exponent, mantissa)` size-class sequence at the + range level, and +* still answers "find a block that can serve this request" in O(1). + +## The core idea: search upward, mask out exceptions + +Free blocks are binned by the *set of size classes they can serve* — the +**servable set**. To allocate, you walk a per-arena non-empty-bins bitmap +upward through the bins; any larger block can be carved down. This almost +works perfectly. The exception is alignment: some bins hold blocks whose +address alignment is too poor to serve certain smaller, *more* aligned size +classes. Those bins must be excluded from the search for those requests. + +The implementation builds the per-request filter *positively* as a **serve +mask** — bit `k` set means bin `k` can serve this request — and the lookup +is `find_first_set(bitmap & serve_mask, start_word)`. The serve mask +depends only on the requested size class, not on the block, so it is +precomputed at compile time. + +(The original sketch of this design used the equivalent inverse framing of +a "skip mask" with `bitmap & ~skip_mask`; see `arenabins.h` for the +in-tree explanation of why positive is preferred.) + +## Why the exceptions exist + +snmalloc's size classes follow `S = 2^e + m · 2^(e−B)`, where `B` is the +mantissa-bit width (`INTERMEDIATE_BITS`, 2 in production). Each size class +has a natural alignment `align(S) = S & -S`. + +A size class with high alignment needs padding to reach an aligned address +within a block. A block of a *larger* size class with *lower* alignment may +not have room for that padding. 
Concretely: a block of size 5 at address 1 +can serve size 5 (alignment 1) but cannot serve size 4 (alignment 4) — +there is not enough space after padding to the first 4-aligned address. + +Same size block, different address, different servable set. This is why +distinct bins per servable-set are needed. + +## Bin count grows slowly in B + +At each exponent, the distinct servable sets are enumerated exhaustively +by `prototype/skip_analysis.py`: + +| B | Mantissas/exponent | Bins/exponent | Max mask bits | +|---|-------------------:|--------------:|--------------:| +| 1 | 2 | 2 | 0 | +| 2 | 4 | 5 | 1 | +| 3 | 8 | 13 | 4 | +| 4 | 16 | 34 | 11 | + +Most requests need no exceptions at all. Only size classes whose alignment +exceeds the expected alignment for their position in the sequence have any +bits to mask. The whole structure is constant-folded into a few small tables. + +## The two-tree structure + +A bitmap alone is not enough — when a bin is non-empty, the arena still has +to *retrieve* and *coalesce* blocks. Each `Arena` therefore maintains: + +* **One red-black tree per non-empty bin** (the "bin trees"), keyed by + block address, giving O(log n) selection within a bin. The non-empty-bins + bitmap is the index over these trees. + +* **One red-black tree of all free blocks** (the "range tree"), keyed by + address, used to find a block's left/right neighbours for coalescing on + free. + +On allocation: bitmap lookup → choose the bin → pop a block from its +bin tree → `carve` returns pre-pad / aligned request / post-pad → pre and +post (if any) re-enter the arena via the bin and range trees. + +On free: range tree lookup → coalesce with neighbours if their tags allow +→ insert the resulting (possibly merged) block. + +## Two variants over the same Arena + +`Arena` is parameterised by a **Rep** (representation) that decides where +the per-block tree-node state lives. 
Two reps ship today: + +* **`PagemapRep`** — node state lives in the pagemap entry that already + covers the block. Used by **`LargeArenaRange`**, which manages whole + chunks and larger. Node access is a pagemap lookup; no in-band space is + consumed. + +* **`InplaceRep`** — node state lives *in the free block itself*, in the + first units. Used by **`SmallArenaRange`**, which manages sub-chunk + metadata fragments where no pagemap entry exists for the fragment. The + layout packs the bin tree pointers, the range tree pointers, and (for + blocks ≥ 3 units) a large-size word into the leading units of the free + block. Unit size is `next_pow2(2 · sizeof(CapPtr))` — 16 B without + CHERI, 32 B with pure-capability CHERI/Morello — large enough to hold + the two pointers a free block must store. + +Both reps drive the same bin / range tree logic in `arena.h`; the bin +classifier and bitmap in `arenabins.h` are shared. + +## Why this matters for metadata + +Slab metadata typically wants a pow2 client structure (e.g. a 128 B +bitmap) plus a fixed ~32 B header. A buddy-based small range rounds +`160 B → 256 B` (96 B wasted per slab). `SmallArenaRange` rounds to a unit +multiple (`MIN_META_ALIGN`), so the same allocation costs ~160 B. Across +many slabs and large heaps this is real memory. + +## Concrete example (B = 2, in-production) + +At exponent `e = 2` the size classes are 4, 5, 6, 7, and there are 5 bins, +each labeled by the set of sizes it can serve at this exponent: + + Bin 0: serves {4} + Bin 1: serves {5} + Bin 2: serves {4, 5} + Bin 3: serves {4, 5, 6} + Bin 4: serves {4, 5, 6, 7} + +The per-request serve masks (within this exponent — higher exponents +always serve, so their bits are set): + + Request for 7: serve bins {4} + Request for 6: serve bins {3, 4} + Request for 5: serve bins {1, 2, 3, 4} + Request for 4: serve bins {0, 2, 3, 4} — bin 1 holds only {5} blocks + +Only the size-4 request has an exception: bin 1 must not be picked. 
All +other requests get the simple "everything at or above" mask. + +## Where to look in the code + +* `src/snmalloc/backend_helpers/arenabins.h` — bin classification, serve + masks, the non-empty-bins bitmap, the `carve` primitive. +* `src/snmalloc/backend_helpers/arena.h` — bin-tree-per-bin + range-tree + structure, allocation and free / coalesce paths. +* `src/snmalloc/backend_helpers/largearenarange.h` — `Arena<PagemapRep>` + for whole-chunk allocations. +* `src/snmalloc/backend_helpers/smallarenarange.h`, + `inplacerep.h` — `Arena<InplaceRep>` for sub-chunk metadata. + +`prototype/skip_analysis.py` and `prototype/servable_sets.py` enumerate +the bin scheme and verify the serve-mask construction for B ∈ {1, 2, 3}. From 6d9dbcfbd713fde443a4e4838ded2ad2eb03edb6 Mon Sep 17 00:00:00 2001 From: Matthew Parkinson Date: Tue, 9 Jun 2026 15:53:42 +0100 Subject: [PATCH 27/31] Fix release-build errors surfaced by CI * arenabins.h: Bitmap needs a constexpr default ctor for clang's require_constant_initialization on threadalloc.h's default_alloc. * redblacktree.h: print had an uninitialised s_indent; inlining changes from the new neighbours / for_each helpers tipped GCC's -Wmaybe-uninitialized over. * Several test locals are only consumed by SNMALLOC_ASSERT and become unused in -DNDEBUG builds. Wrap with UNUSED(...) to match the convention already in the file. * GCC's -Warray-bounds cannot prove the upper bound of indices read from compile-time tables once asserts are stripped. Add SNMALLOC_ASSUME alongside the existing asserts in ArenaBins::Bitmap::find_for_request and the test's mock_index.
Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- src/snmalloc/backend_helpers/arenabins.h | 3 ++- src/snmalloc/ds_core/redblacktree.h | 2 +- src/test/func/arena/arena.cc | 13 ++++++++++++- src/test/func/largearenarange/largearenarange.cc | 2 ++ 4 files changed, 17 insertions(+), 3 deletions(-) diff --git a/src/snmalloc/backend_helpers/arenabins.h b/src/snmalloc/backend_helpers/arenabins.h index 07a572b45..b9e32d092 100644 --- a/src/snmalloc/backend_helpers/arenabins.h +++ b/src/snmalloc/backend_helpers/arenabins.h @@ -318,7 +318,7 @@ namespace snmalloc /// so callers can size parallel arrays (one RB-tree per bin id). static constexpr size_t TOTAL_BINS = BINS_PER_EXP * bits::BITS; - Bitmap() : words_{} {} + constexpr Bitmap() : words_{} {} /** * Classify `block`, set its bin's bit, return the bin id. @@ -373,6 +373,7 @@ namespace snmalloc { const bitmap_info_t& info = bitmap_info_for_request(n); SNMALLOC_ASSERT(info.start_word < NUM_BITMAP_WORDS); + SNMALLOC_ASSUME(info.start_word < NUM_BITMAP_WORDS); // First word: start bin + any within-exp neighbours in same word. size_t word = info.start_word; diff --git a/src/snmalloc/ds_core/redblacktree.h b/src/snmalloc/ds_core/redblacktree.h index 6009260f1..3fda3b0c9 100644 --- a/src/snmalloc/ds_core/redblacktree.h +++ b/src/snmalloc/ds_core/redblacktree.h @@ -457,7 +457,7 @@ namespace snmalloc // insufficient to accurately display the tree, but it will still be // memory safe as the search code is bounded by the string size. 
static constexpr size_t max_depth = 128; - char s_indent[max_depth]; + char s_indent[max_depth] = {}; size_t end = 0; for (; end < max_depth - 1; end++) { diff --git a/src/test/func/arena/arena.cc b/src/test/func/arena/arena.cc index 7a88e3492..69ac4f02d 100644 --- a/src/test/func/arena/arena.cc +++ b/src/test/func/arena/arena.cc @@ -92,6 +92,7 @@ namespace snmalloc { size_t idx = addr >> MIN_CHUNK_BITS; SNMALLOC_ASSERT(idx < MOCK_ARENA_CHUNKS); + SNMALLOC_ASSUME(idx < MOCK_ARENA_CHUNKS); return idx; } @@ -462,6 +463,7 @@ namespace snmalloc { auto r2 = arena.remove_block(chunk_size(1)); SNMALLOC_ASSERT(r2 != 0); + UNUSED(r2); arena.check_invariant(true); remaining -= chunk_size(1); } @@ -666,6 +668,7 @@ namespace snmalloc auto& bt0 = ArenaTestAccess::get_bin_trees(arena)[0]; auto p3 = bt0.get_root_path(); SNMALLOC_ASSERT(bt0.find(p3, chunk_addr(11))); + UNUSED(p1, p2, p3); size_t total = drain_arena(arena); SNMALLOC_ASSERT(total == 4); @@ -1062,6 +1065,7 @@ namespace snmalloc auto arena_result = arena.remove_block(chunk_size(n)); auto oracle_result = oracle.remove(n); + UNUSED(arena_result); // Both should agree on success/failure. if (oracle_result.second == 0) @@ -1071,7 +1075,6 @@ namespace snmalloc else { SNMALLOC_ASSERT(arena_result != 0); - // Arena should return the address oracle predicts. SNMALLOC_ASSERT( arena_result == chunk_addr(BASE + oracle_result.first)); @@ -1166,6 +1169,7 @@ namespace snmalloc // B should now serve a size-12 request from the consolidated block. uintptr_t r_addr = arena_b.remove_block(chunk_size(12)); SNMALLOC_ASSERT(r_addr == chunk_addr(BASE + 20)); + UNUSED(r_addr); arena_b.check_invariant(true); printf(" Consolidation after migration: OK\n"); @@ -1262,6 +1266,7 @@ namespace snmalloc auto arena_r = arena.remove_block(chunk_size(n)); auto oracle_r = oracle.remove(n); + UNUSED(arena_r); if (oracle_r.second == 0) { @@ -1292,6 +1297,7 @@ namespace snmalloc auto& dst_oracle = from_a ? oracle_b : oracle_a; uint8_t src_id = from_a ? 
1 : 2; uint8_t dst_id = from_a ? 2 : 1; + UNUSED(src_id); size_t n = (rng.next() % 3) + 1; uintptr_t src_r = src.remove_block(chunk_size(n)); @@ -1375,6 +1381,7 @@ namespace snmalloc SNMALLOC_ASSERT(r1_addr == p_addr); auto r2_addr = arena.remove_block(chunk_size(2)); SNMALLOC_ASSERT(r2_addr == a_addr); + UNUSED(r1_addr, r2_addr); printf(" Boundary blocks predecessor merge: OK\n"); } @@ -1401,6 +1408,7 @@ namespace snmalloc SNMALLOC_ASSERT(r1_addr == a_addr); auto r2_addr = arena.remove_block(chunk_size(4)); SNMALLOC_ASSERT(r2_addr == s_addr); + UNUSED(r1_addr, r2_addr); printf(" Boundary blocks successor merge: OK\n"); } @@ -1427,6 +1435,7 @@ namespace snmalloc SNMALLOC_ASSERT(r1_addr == chunk_addr(4)); auto r2_addr = arena.remove_block(chunk_size(2)); SNMALLOC_ASSERT(r2_addr == chunk_addr(8)); + UNUSED(r1_addr, r2_addr); printf(" Boundary partial (P merges, S blocked): OK\n"); } @@ -1454,6 +1463,7 @@ namespace snmalloc auto r1 = arena.remove_block(chunk_size(4)); SNMALLOC_ASSERT(r1 == top_addr); + UNUSED(r1); printf(" Block at arena top edge: OK\n"); } @@ -1480,6 +1490,7 @@ namespace snmalloc SNMALLOC_ASSERT( (r1_addr == p_addr && r2_addr == a_addr) || (r1_addr == a_addr && r2_addr == p_addr)); + UNUSED(r1_addr, r2_addr); printf(" Boundary blocks min predecessor merge: OK\n"); } diff --git a/src/test/func/largearenarange/largearenarange.cc b/src/test/func/largearenarange/largearenarange.cc index 9ce736e65..3a9e8536c 100644 --- a/src/test/func/largearenarange/largearenarange.cc +++ b/src/test/func/largearenarange/largearenarange.cc @@ -190,6 +190,7 @@ namespace uintptr_t addr = p.unsafe_uintptr(); SNMALLOC_ASSERT( (addr & (sizes[i] - 1)) == 0 && "Allocation not properly aligned"); + UNUSED(addr); range.dealloc_range(p, sizes[i]); } @@ -265,6 +266,7 @@ namespace uintptr_t lo_j = ptrs[j].unsafe_uintptr(); uintptr_t hi_j = lo_j + sizes[j]; SNMALLOC_ASSERT(hi_i <= lo_j || hi_j <= lo_i); + UNUSED(hi_i, hi_j); } } From 2b5831837d386fd40ec686f906fb0ae86ee869d3 Mon Sep 17 
00:00:00 2001 From: Matthew Parkinson Date: Tue, 9 Jun 2026 16:26:18 +0100 Subject: [PATCH 28/31] Cross-platform CI fixes: MSVC alignas, gnu++17 op!=, GCC OOB warning, sign-conv * sizeclasstable.h: add operator!= alongside operator==. Apple Clang in gnu++17 mode does not synthesise it from ==, so the release-rounding test fails to build on macOS. * arena.cc test mock_index: GCC's release -Warray-bounds still saw the OOB path at the mock_store[...] read site even with SNMALLOC_ASSUME inside mock_index. Replace the indirect probe in can_consolidate with an explicit in-range guard returning false on out-of-arena addresses (matches PagemapRep semantics: no neighbour outside the arena). The initializer-list size_t loop in test_large_size_roundtrip now uses size_t{} literals to silence -Wsign-conversion under the clang+UBSan+TSan CI configuration. * smallarenarange.cc: MSVC rejects alignas with values as large as MIN_CHUNK_SIZE (16384) on static storage. Oversize both backing buffers by one chunk and align the base up at runtime via base_addr() / pool_base(). Fix the resulting -Wsign-conversion in the pointer-difference index calculation. 
Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- src/snmalloc/ds/sizeclasstable.h | 5 +++ src/test/func/arena/arena.cc | 18 ++++++----- .../func/smallarenarange/smallarenarange.cc | 31 +++++++++++++------ 3 files changed, 38 insertions(+), 16 deletions(-) diff --git a/src/snmalloc/ds/sizeclasstable.h b/src/snmalloc/ds/sizeclasstable.h index 635ff7a45..66005d936 100644 --- a/src/snmalloc/ds/sizeclasstable.h +++ b/src/snmalloc/ds/sizeclasstable.h @@ -136,6 +136,11 @@ namespace snmalloc { return value == other.value; } + + constexpr bool operator!=(sizeclass_t other) + { + return value != other.value; + } }; /** diff --git a/src/test/func/arena/arena.cc b/src/test/func/arena/arena.cc index 69ac4f02d..c310d6e0a 100644 --- a/src/test/func/arena/arena.cc +++ b/src/test/func/arena/arena.cc @@ -202,14 +202,17 @@ namespace snmalloc // Mirrors PagemapRep::can_consolidate, which reads // entry.is_boundary() from the pagemap. The boundary flag lives - // per-chunk in mock_store; mock_index asserts the index is in - // range, so any caller that probes outside the arena trips the - // assertion — this catches accidental out-of-region probes in - // Arena unit tests rather than as a release-build - // segfault. + // per-chunk in mock_store. An out-of-region probe returns false + // (cannot consolidate) — both because that is the right semantic + // (no neighbour exists outside the arena) and because it gives + // GCC's release-mode `-Warray-bounds` analysis a visible guard + // covering the `mock_store[...]` read on this branch. 
static bool can_consolidate(uintptr_t addr) { - return !mock_store[mock_index(addr)].boundary; + size_t idx = addr >> MIN_CHUNK_BITS; + if (idx >= MOCK_ARENA_CHUNKS) + return false; + return !mock_store[idx].boundary; } }; @@ -280,7 +283,8 @@ namespace snmalloc reset_mock_store(); uintptr_t a = chunk_addr(20); - for (size_t s : {3, 7, 15, 63, 255, 1000}) + for (size_t s : + {size_t{3}, size_t{7}, size_t{15}, size_t{63}, size_t{255}, size_t{1000}}) { MockRep::set_large_size(a, s); SNMALLOC_ASSERT(MockRep::get_large_size(a) == s); diff --git a/src/test/func/smallarenarange/smallarenarange.cc b/src/test/func/smallarenarange/smallarenarange.cc index 596bf20b8..7c9bf4606 100644 --- a/src/test/func/smallarenarange/smallarenarange.cc +++ b/src/test/func/smallarenarange/smallarenarange.cc @@ -39,13 +39,18 @@ namespace snmalloc // unit-aligned and the in-band node fields land at the expected // offsets. Sized to comfortably cover the arena's full range plus // a small base offset that keeps block addresses non-zero (zero - // is the tree null sentinel). - alignas(MIN_CHUNK_SIZE) static unsigned char backing[2 * MIN_CHUNK_SIZE]; + // is the tree null sentinel). Oversized by MIN_CHUNK_SIZE so the + // base can be aligned up at runtime — MSVC rejects alignas values + // as large as MIN_CHUNK_SIZE on static storage. + static unsigned char backing[3 * MIN_CHUNK_SIZE]; static uintptr_t base_addr() { - // Offset by MIN_CHUNK_SIZE to keep addresses well clear of zero. - return reinterpret_cast(&backing[MIN_CHUNK_SIZE]); + // Round up to MIN_CHUNK_SIZE, then offset by MIN_CHUNK_SIZE to + // keep addresses well clear of zero. 
+ uintptr_t raw = reinterpret_cast(&backing[0]); + uintptr_t aligned = (raw + MIN_CHUNK_SIZE - 1) & ~(MIN_CHUNK_SIZE - 1); + return aligned + MIN_CHUNK_SIZE; } static void reset_backing() @@ -447,15 +452,23 @@ namespace snmalloc // ================================================================== // Pool of chunk-aligned buffers, handed out as a chunk-granularity - // parent range to SmallArenaRange. + // parent range to SmallArenaRange. Oversized by MIN_CHUNK_SIZE so + // `pool_base()` can align up at runtime — MSVC rejects alignas + // values as large as MIN_CHUNK_SIZE on static storage. static constexpr size_t POOL_CHUNKS = 8; - alignas(MIN_CHUNK_SIZE) static unsigned char pool_storage - [POOL_CHUNKS * MIN_CHUNK_SIZE]; + static unsigned char pool_storage[(POOL_CHUNKS + 1) * MIN_CHUNK_SIZE]; static bool pool_in_use[POOL_CHUNKS]; // Track returns to detect leaks / double-frees. static size_t pool_alloc_count; static size_t pool_dealloc_count; + static unsigned char* pool_base() + { + uintptr_t raw = reinterpret_cast(&pool_storage[0]); + uintptr_t aligned = (raw + MIN_CHUNK_SIZE - 1) & ~(MIN_CHUNK_SIZE - 1); + return reinterpret_cast(aligned); + } + static void reset_pool() { for (size_t i = 0; i < POOL_CHUNKS; i++) @@ -485,7 +498,7 @@ namespace snmalloc pool_in_use[i] = true; pool_alloc_count++; return CapPtr::unsafe_from( - &pool_storage[i * MIN_CHUNK_SIZE]); + pool_base() + i * MIN_CHUNK_SIZE); } } return nullptr; @@ -495,7 +508,7 @@ namespace snmalloc { SNMALLOC_CHECK(size == MIN_CHUNK_SIZE); auto p = static_cast(base.unsafe_ptr()); - auto idx = static_cast((p - pool_storage) / MIN_CHUNK_SIZE); + auto idx = static_cast(p - pool_base()) / MIN_CHUNK_SIZE; SNMALLOC_CHECK(idx < POOL_CHUNKS); SNMALLOC_CHECK(pool_in_use[idx]); pool_in_use[idx] = false; From 1377614b0f217fc06c3c9daeb88f13e395ec8fe4 Mon Sep 17 00:00:00 2001 From: Matthew Parkinson Date: Wed, 10 Jun 2026 10:03:59 +0100 Subject: [PATCH 29/31] Fix foreign-pointer remaining_bytes for memcpy bound check 
Phase 14 changed the sentinel sizeclass slot to be the zero-initialised row 0 with both size = 0 and slab_mask = 0. Pre-Phase 14, the sentinel had slab_mask = SIZE_MAX (from `size - 1` underflow in the large-class init loop). The bounds- checked memcpy shim (`bounds_checks.h::check_bound`) calls `remaining_bytes` unconditionally on every memcpy destination, including foreign (non-snmalloc) heap addresses reached via LD_PRELOAD before snmalloc has seen them. With slab_mask = 0, `start_of_object(addr) = addr`, so `remaining_bytes = 0`, and every memcpy on a foreign pointer fatals. Restore the pre-Phase-14 behaviour by explicitly setting the sentinel slot's slab_mask to ~size_t(0), so `start_of_object` collapses to 0 and `remaining_bytes` underflows to a huge value that trivially passes any reasonable bound check. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- src/snmalloc/ds/sizeclasstable.h | 20 +++++++++++++++++--- 1 file changed, 17 insertions(+), 3 deletions(-) diff --git a/src/snmalloc/ds/sizeclasstable.h b/src/snmalloc/ds/sizeclasstable.h index 66005d936..7d82a8e92 100644 --- a/src/snmalloc/ds/sizeclasstable.h +++ b/src/snmalloc/ds/sizeclasstable.h @@ -318,6 +318,17 @@ namespace snmalloc constexpr SizeClassTable() { + // Sentinel slot (sizeclass_t{} / raw 0) covers any address whose + // pagemap entry is unmapped or owned by the backend — including + // foreign (non-snmalloc) heap addresses reached via the + // bounds-checked memcpy shim before snmalloc has seen them. + // `slab_mask = ~size_t(0)` makes `start_of_object` collapse + // `addr & ~slab_mask` to 0 and `index_in_object` to `addr`, so + // `remaining_bytes = sentinel.size - addr` underflows to a very + // large value and any memcpy bound check trivially passes the + // sentinel through to the destination's native checks. 
+ start_[0].slab_mask = ~size_t(0); + size_t max_capacity = 0; for (smallsizeclass_t sizeclass(0); sizeclass < NUM_SMALL_SIZECLASSES; @@ -409,13 +420,16 @@ namespace snmalloc constexpr SizeClassTable sizeclass_metadata = SizeClassTable(); // Sentinel must remain zero-initialised so fast-path lookups via - // `start(sc)` return zero size and slab_mask without a branch. + // `start(sc)` return zero size without a branch. Slab_mask is + // `~size_t(0)` so foreign-pointer `remaining_bytes` underflows to a + // huge value (see `SizeClassTable::SizeClassTable`). static_assert( sizeclass_metadata.start(sizeclass_t{}).size == 0, "sentinel slot must have size 0"); static_assert( - sizeclass_metadata.start(sizeclass_t{}).slab_mask == 0, - "sentinel slot must have slab_mask 0"); + sizeclass_metadata.start(sizeclass_t{}).slab_mask == ~size_t(0), + "sentinel slot must have slab_mask ~0 for foreign-pointer " + "remaining_bytes underflow"); static_assert( bits::BITS - sizeclass_metadata.DIV_MULT_SHIFT <= MAX_CAPACITY_BITS); From d6c5fabb453db12369e45028d0538bbe6c47c7c7 Mon Sep 17 00:00:00 2001 From: Matthew Parkinson Date: Wed, 10 Jun 2026 10:32:16 +0100 Subject: [PATCH 30/31] Fix two CI failures from BackendArenaRange branch 1. start_of_object: replace inlined 64-bit div_mult fast path with a call to slab_index, which has the correct 32-bit offset / size fallback. The inlined version overflowed size_t on 32-bit arm-linux-gnueabihf, producing wrong remaining_bytes (off by one allocation size) for func-memory-check::test_remaining_bytes. 2. func-largearenarange-check test: pass MinBaseSizeBits() as the MIN_REFILL_SIZE_BITS template parameter so the first parent allocation is at least the PAL's minimum reserve size. Windows VirtualAlloc cannot reserve below 64 KiB allocation granularity, so PalRange returned nullptr for the test's 16 KiB request. Matches what production code (standard_range.h, meta_protected_range.h) does. 
Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- src/snmalloc/ds/sizeclasstable.h | 3 +-- src/test/func/largearenarange/largearenarange.cc | 11 ++++++++--- 2 files changed, 9 insertions(+), 5 deletions(-) diff --git a/src/snmalloc/ds/sizeclasstable.h b/src/snmalloc/ds/sizeclasstable.h index 7d82a8e92..df173bef5 100644 --- a/src/snmalloc/ds/sizeclasstable.h +++ b/src/snmalloc/ds/sizeclasstable.h @@ -555,8 +555,7 @@ namespace snmalloc if (SNMALLOC_LIKELY(osc.offset() == 0)) { address_t slab_base = addr & ~meta.slab_mask; - size_t in_slab = addr - slab_base; - size_t index = (in_slab * meta.div_mult) >> DIV_MULT_SHIFT; + size_t index = slab_index(osc, addr); return slab_base + (index * meta.size); } address_t alloc_start = (addr & ~meta.slab_mask) - meta.offset_bytes; diff --git a/src/test/func/largearenarange/largearenarange.cc b/src/test/func/largearenarange/largearenarange.cc index 3a9e8536c..94a6e360a 100644 --- a/src/test/func/largearenarange/largearenarange.cc +++ b/src/test/func/largearenarange/largearenarange.cc @@ -45,12 +45,17 @@ namespace // LargeArenaRange under test: global range (MAX_SIZE_BITS = BITS - 1). // This means overflow dealloc never goes to parent (matches the global - // range configuration). + // range configuration). MIN_REFILL_BITS = MinBaseSizeBits() so + // the first parent allocation is at least the PAL's minimum reserve + // size — Windows VirtualAlloc cannot reserve below its allocation + // granularity (64 KiB) and PalRange returns nullptr in that case. 
static constexpr size_t REFILL_BITS = 20; static constexpr size_t MAX_BITS = bits::BITS - 1; + static constexpr size_t MIN_REFILL_BITS = MinBaseSizeBits(); - using ArenaRange = - Pipe>; + using ArenaRange = Pipe< + ParentSource, + LargeArenaRange>; // --- Tests --- From e65515c7c9859db6bc30a82ad4e3832a92e5c373 Mon Sep 17 00:00:00 2001 From: Matthew Parkinson Date: Wed, 10 Jun 2026 11:18:22 +0100 Subject: [PATCH 31/31] Apply clang-format-15 fixes from CI Extracted the exact diff from the CI clang-format-15 run on this PR and applied it. 13 files: whitespace, ternary line-breaks, include reordering, friend-decl single-line, init-list one-per-line for size_t{} literals. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- src/snmalloc/backend_helpers/arena.h | 23 +++++++++---------- src/snmalloc/backend_helpers/arenabins.h | 4 +--- .../backend_helpers/backend_helpers.h | 2 +- src/snmalloc/backend_helpers/inplacerep.h | 3 +-- src/snmalloc/override/new.cc | 3 +-- .../func/aligned_dealloc/aligned_dealloc.cc | 2 +- src/test/func/arena/arena.cc | 7 +++++- src/test/func/arenabins/arenabins.cc | 3 +-- .../client_meta_nonpow2.cc | 3 ++- .../large_offset_frontend.cc | 3 ++- src/test/func/memory/memory.cc | 5 ++-- src/test/func/pagemap/pagemap.cc | 3 +-- .../func/smallarenarange/smallarenarange.cc | 2 +- 13 files changed, 31 insertions(+), 32 deletions(-) diff --git a/src/snmalloc/backend_helpers/arena.h b/src/snmalloc/backend_helpers/arena.h index 51dd3cb34..4330637a7 100644 --- a/src/snmalloc/backend_helpers/arena.h +++ b/src/snmalloc/backend_helpers/arena.h @@ -94,9 +94,8 @@ namespace snmalloc if (size == UNIT_SIZE) return ArenaVariant::Min; if (size == TWO_UNITS) - return ((addr >> MIN_SIZE_BITS) & 1) == 0 ? - ArenaVariant::EvenTwo : - ArenaVariant::OddTwo; + return ((addr >> MIN_SIZE_BITS) & 1) == 0 ? 
ArenaVariant::EvenTwo : + ArenaVariant::OddTwo; return ArenaVariant::Large; } @@ -365,15 +364,15 @@ namespace snmalloc for (size_t bin = 0; bin < Bins::Bitmap::TOTAL_BINS; bin++) { - bin_trees[bin].for_each([&](uintptr_t node) { - auto [a, s] = range_from_addr(node); - if (s >= TWO_UNITS) - { - auto path = range_tree.get_root_path(); - SNMALLOC_CHECK(range_tree.find(path, node)); - bin_tree_nonmin_count++; - } - }); + bin_trees[bin].for_each([&](uintptr_t node) { + auto [a, s] = range_from_addr(node); + if (s >= TWO_UNITS) + { + auto path = range_tree.get_root_path(); + SNMALLOC_CHECK(range_tree.find(path, node)); + bin_tree_nonmin_count++; + } + }); } range_tree.for_each([&](uintptr_t node) { diff --git a/src/snmalloc/backend_helpers/arenabins.h b/src/snmalloc/backend_helpers/arenabins.h index b9e32d092..ccfb23ca8 100644 --- a/src/snmalloc/backend_helpers/arenabins.h +++ b/src/snmalloc/backend_helpers/arenabins.h @@ -309,9 +309,7 @@ namespace snmalloc */ class Bitmap { - friend struct ArenaBinsTestAccess< - INTERMEDIATE_BITS, - MIN_SIZE_BITS>; + friend struct ArenaBinsTestAccess; public: /// Strict upper bound on bin ids `bin_index` produces. 
Exposed diff --git a/src/snmalloc/backend_helpers/backend_helpers.h b/src/snmalloc/backend_helpers/backend_helpers.h index 8a388171c..baeeb1cfd 100644 --- a/src/snmalloc/backend_helpers/backend_helpers.h +++ b/src/snmalloc/backend_helpers/backend_helpers.h @@ -2,13 +2,13 @@ #include "../mem/mem.h" #include "authmap.h" -#include "largearenarange.h" #include "commitrange.h" #include "commonconfig.h" #include "defaultpagemapentry.h" #include "empty_range.h" #include "globalrange.h" #include "indirectrange.h" +#include "largearenarange.h" #include "logrange.h" #include "noprange.h" #include "pagemap.h" diff --git a/src/snmalloc/backend_helpers/inplacerep.h b/src/snmalloc/backend_helpers/inplacerep.h index ab0ee9709..3aacfb410 100644 --- a/src/snmalloc/backend_helpers/inplacerep.h +++ b/src/snmalloc/backend_helpers/inplacerep.h @@ -234,8 +234,7 @@ namespace snmalloc static ArenaVariant get_variant(uintptr_t addr) { auto w = unit_at<0>(addr)->word_one; - return static_cast( - (w & VARIANT_MASK) >> VARIANT_SHIFT); + return static_cast((w & VARIANT_MASK) >> VARIANT_SHIFT); } static void set_variant(uintptr_t addr, ArenaVariant v) diff --git a/src/snmalloc/override/new.cc b/src/snmalloc/override/new.cc index a3f8fc6ea..667ca9c45 100644 --- a/src/snmalloc/override/new.cc +++ b/src/snmalloc/override/new.cc @@ -37,8 +37,7 @@ namespace snmalloc SNMALLOC_ASSERT( secondary_allocator || - is_start_of_object( - size_to_sizeclass_full(size), address_cast(p))); + is_start_of_object(size_to_sizeclass_full(size), address_cast(p))); return p; } diff --git a/src/test/func/aligned_dealloc/aligned_dealloc.cc b/src/test/func/aligned_dealloc/aligned_dealloc.cc index 6b1deb80c..51646e39c 100644 --- a/src/test/func/aligned_dealloc/aligned_dealloc.cc +++ b/src/test/func/aligned_dealloc/aligned_dealloc.cc @@ -19,9 +19,9 @@ */ #include "test/setup.h" +#include "test/snmalloc_testlib.h" #include -#include "test/snmalloc_testlib.h" using namespace snmalloc; diff --git a/src/test/func/arena/arena.cc 
b/src/test/func/arena/arena.cc index c310d6e0a..9ccb83099 100644 --- a/src/test/func/arena/arena.cc +++ b/src/test/func/arena/arena.cc @@ -284,7 +284,12 @@ namespace snmalloc uintptr_t a = chunk_addr(20); for (size_t s : - {size_t{3}, size_t{7}, size_t{15}, size_t{63}, size_t{255}, size_t{1000}}) + {size_t{3}, + size_t{7}, + size_t{15}, + size_t{63}, + size_t{255}, + size_t{1000}}) { MockRep::set_large_size(a, s); SNMALLOC_ASSERT(MockRep::get_large_size(a) == s); diff --git a/src/test/func/arenabins/arenabins.cc b/src/test/func/arenabins/arenabins.cc index 8ef495c6b..65e24ba37 100644 --- a/src/test/func/arenabins/arenabins.cc +++ b/src/test/func/arenabins/arenabins.cc @@ -542,8 +542,7 @@ namespace /// (defined directly in terms of `bin_subsets`). template size_t reference_find( - size_t n_chunks, - const typename ArenaBinsTestAccess::Bitmap& bm) + size_t n_chunks, const typename ArenaBinsTestAccess::Bitmap& bm) { using Bins = ArenaBinsTestAccess; using Bitmap = typename Bins::Bitmap; diff --git a/src/test/func/client_meta_nonpow2/client_meta_nonpow2.cc b/src/test/func/client_meta_nonpow2/client_meta_nonpow2.cc index 31cb84ee0..913afff38 100644 --- a/src/test/func/client_meta_nonpow2/client_meta_nonpow2.cc +++ b/src/test/func/client_meta_nonpow2/client_meta_nonpow2.cc @@ -42,7 +42,8 @@ namespace snmalloc } }; - using Config = snmalloc::StandardConfigClientMeta; + using Config = + snmalloc::StandardConfigClientMeta; } // namespace snmalloc #define SNMALLOC_PROVIDE_OWN_CONFIG diff --git a/src/test/func/large_offset_frontend/large_offset_frontend.cc b/src/test/func/large_offset_frontend/large_offset_frontend.cc index f45d6f7e7..4b4fd7948 100644 --- a/src/test/func/large_offset_frontend/large_offset_frontend.cc +++ b/src/test/func/large_offset_frontend/large_offset_frontend.cc @@ -28,9 +28,10 @@ * `remaining_bytes` reports the expected residual. 
*/ +#include "test/setup.h" + #include #include -#include "test/setup.h" #ifdef assert # undef assert diff --git a/src/test/func/memory/memory.cc b/src/test/func/memory/memory.cc index 2d5e6d287..6be2865a8 100644 --- a/src/test/func/memory/memory.cc +++ b/src/test/func/memory/memory.cc @@ -464,9 +464,8 @@ void test_calloc_non_pow2_large() { // All sizeclasses are powers of two in this configuration, so // there is no non-pow2 large request to test. - std::cout - << "INTERMEDIATE_BITS == 0: all sizeclasses pow2; skipping." - << std::endl; + std::cout << "INTERMEDIATE_BITS == 0: all sizeclasses pow2; skipping." + << std::endl; return; } diff --git a/src/test/func/pagemap/pagemap.cc b/src/test/func/pagemap/pagemap.cc index 8e024bf6b..f93f64840 100644 --- a/src/test/func/pagemap/pagemap.cc +++ b/src/test/func/pagemap/pagemap.cc @@ -180,8 +180,7 @@ int main(int argc, char** argv) auto size = bits::one_at_bit(GRANULARITY_BITS + 4); auto* base = NoLazyCommitPal::reserve(size); NoLazyCommitPal::notify_using(base, size); - auto [heap_base, heap_size] = - pagemap_test_bound_no_lazy.init(base, size); + auto [heap_base, heap_size] = pagemap_test_bound_no_lazy.init(base, size); auto low = address_cast(heap_base); pagemap_test_bound_no_lazy.set(low, T(7)); diff --git a/src/test/func/smallarenarange/smallarenarange.cc b/src/test/func/smallarenarange/smallarenarange.cc index 7c9bf4606..47d6b895c 100644 --- a/src/test/func/smallarenarange/smallarenarange.cc +++ b/src/test/func/smallarenarange/smallarenarange.cc @@ -18,8 +18,8 @@ #include #include #include -#include #include +#include #include #include #include