Skip to content
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
197 changes: 179 additions & 18 deletions rust/lance-table/src/rowids/segment.rs
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,13 @@ use std::ops::{Range, RangeInclusive};
use super::{bitmap::Bitmap, encoded_array::EncodedU64Array};
use deepsize::DeepSizeOf;

/// Clamp an estimated serialized byte cost (`u128`) into `usize`.
///
/// Costs larger than [`usize::MAX`] represent encodings that cannot be
/// realized on this target; they saturate to `usize::MAX` so they can
/// never win a `min()` comparison against a feasible encoding.
#[inline]
fn u128_byte_cost_to_usize(v: u128) -> usize {
    // min() against the widened usize::MAX makes the cast lossless.
    v.min(usize::MAX as u128) as usize
}

/// Different ways to represent a sequence of distinct u64s.
///
/// This is designed to be especially efficient for sequences that are sorted,
Expand Down Expand Up @@ -88,13 +95,17 @@ struct SegmentStats {
}

impl SegmentStats {
fn n_holes(&self) -> u64 {
/// Number of missing values ("holes") in the range `[min, max]`.
///
/// Returns `u128` because the total slot count `max - min + 1` can be up
/// to `2^64` (when `min = 0, max = u64::MAX`), which exceeds `u64::MAX`.
fn n_holes(&self) -> u128 {
debug_assert!(self.sorted);
if self.count == 0 {
0
} else {
let total_slots = self.max - self.min + 1;
total_slots - self.count
let total_slots = self.max as u128 - self.min as u128 + 1;
total_slots - self.count as u128
}
}
}
Expand Down Expand Up @@ -149,15 +160,24 @@ impl U64Segment {
}
}

/// Estimate the serialized byte size of each sorted encoding variant.
///
/// All arithmetic is performed in `u128` to avoid overflow when the range
/// span `max - min + 1` approaches or exceeds `2^64`. Infeasible sizes
/// saturate to `usize::MAX` so they always lose the `min()` comparison.
fn sorted_sequence_sizes(stats: &SegmentStats) -> [usize; 3] {
let n_holes = stats.n_holes();
let total_slots = stats.max - stats.min + 1;
let total_slots = stats.max as u128 - stats.min as u128 + 1;

let range_with_holes = 24 + 4 * n_holes as usize;
let range_with_bitmap = 24 + (total_slots as f64 / 8.0).ceil() as usize;
let sorted_array = 24 + 2 * stats.count as usize;
let range_with_holes = 24u128.saturating_add(4u128.saturating_mul(n_holes));
let range_with_bitmap = 24u128.saturating_add(total_slots.div_ceil(8));
let sorted_array = 24u128.saturating_add(2u128.saturating_mul(stats.count as u128));

[range_with_holes, range_with_bitmap, sorted_array]
[
u128_byte_cost_to_usize(range_with_holes),
u128_byte_cost_to_usize(range_with_bitmap),
u128_byte_cost_to_usize(sorted_array),
]
}

fn from_stats_and_sequence(
Expand All @@ -166,38 +186,41 @@ impl U64Segment {
) -> Self {
if stats.sorted {
let n_holes = stats.n_holes();
// Range-backed encodings store an exclusive end as `Range<u64>`,
// which cannot represent `u64::MAX + 1`. Compute the end once and
// gate all range-backed branches on its representability.
let exclusive_end = stats.max.checked_add(1);
if stats.count == 0 {
Self::Range(0..0)
} else if n_holes == 0 {
Self::Range(stats.min..(stats.max + 1))
} else {
} else if n_holes == 0 && exclusive_end.is_some() {
Self::Range(stats.min..exclusive_end.unwrap())
} else if let Some(end) = exclusive_end {
let sizes = Self::sorted_sequence_sizes(&stats);
let min_size = sizes.iter().min().unwrap();
if min_size == &sizes[0] {
let range = stats.min..(stats.max + 1);
let range = stats.min..end;
let mut holes =
Self::holes_in_slice(stats.min..=stats.max, sequence).collect::<Vec<_>>();
holes.sort_unstable();
let holes = EncodedU64Array::from(holes);

Self::RangeWithHoles { range, holes }
} else if min_size == &sizes[1] {
let range = stats.min..(stats.max + 1);
let range = stats.min..end;
let mut bitmap = Bitmap::new_full((stats.max - stats.min) as usize + 1);

for hole in Self::holes_in_slice(stats.min..=stats.max, sequence) {
let offset = (hole - stats.min) as usize;
bitmap.clear(offset);
}

Self::RangeWithBitmap { range, bitmap }
} else {
// Must use array, but at least it's sorted
Self::SortedArray(EncodedU64Array::from_iter(sequence))
}
} else {
// max == u64::MAX: exclusive end is unrepresentable in Range<u64>,
// so no range-backed encoding can be used.
Self::SortedArray(EncodedU64Array::from_iter(sequence))
}
} else {
// Must use array
Self::Array(EncodedU64Array::from_iter(sequence))
}
}
Expand Down Expand Up @@ -707,6 +730,144 @@ mod test {
);
}

#[test]
fn test_segment_overflow_boundary() {
    // Cases that must degrade to SortedArray: either the u128 cost math
    // makes the range-backed encodings lose, or `max == u64::MAX` makes
    // the exclusive range end unrepresentable in `Range<u64>`.
    let sorted_array_cases: [(&str, Vec<u64>); 4] = [
        (
            // Original overflow reproducer: n_holes ≈ 2^63 overflows
            // `4 * n_holes as usize` without u128 arithmetic.
            "sparse range spanning i64::MAX should be SortedArray",
            vec![0, 1, 2, 100, i64::MAX as u64],
        ),
        (
            // total_slots = u64::MAX - 0 + 1 wraps to 0 without u128.
            "full u64 span should be SortedArray",
            vec![0, u64::MAX],
        ),
        (
            // Cost estimation prefers a range encoding here, but Range<u64>
            // cannot hold u64::MAX + 1 as the exclusive end.
            "dense set near u64::MAX should be SortedArray (exclusive end unrepresentable)",
            vec![u64::MAX - 3, u64::MAX - 1, u64::MAX],
        ),
        (
            // Contiguous with zero holes, but the exclusive end overflows.
            "single u64::MAX should be SortedArray",
            vec![u64::MAX],
        ),
    ];
    for (label, values) in sorted_array_cases {
        let segment = U64Segment::from_slice(&values);
        assert!(
            matches!(segment, U64Segment::SortedArray(_)),
            "{}, got {:?}",
            label,
            std::mem::discriminant(&segment)
        );
        assert_eq!(segment.len(), values.len());
        assert_eq!(segment.iter().collect::<Vec<_>>(), values);
    }

    // Contiguous range ending just below u64::MAX — the exclusive end is
    // still representable, so the plain Range encoding survives.
    let values: Vec<u64> = vec![u64::MAX - 3, u64::MAX - 2, u64::MAX - 1];
    let segment = U64Segment::from_slice(&values);
    assert_eq!(segment, U64Segment::Range((u64::MAX - 3)..u64::MAX));
    assert_eq!(segment.len(), 3);
    assert_eq!(segment.iter().collect::<Vec<_>>(), values);

    // Regression: normal dense range with few holes still picks
    // RangeWithHoles (needs total_slots > 32 * n_holes to beat the bitmap).
    let values: Vec<u64> = (100..1100).filter(|&x| x != 500).collect();
    let segment = U64Segment::from_slice(&values);
    assert_eq!(
        segment,
        U64Segment::RangeWithHoles {
            range: 100..1100,
            holes: vec![500].into(),
        }
    );
    assert_eq!(segment.len(), 999);
    assert_eq!(segment.iter().collect::<Vec<_>>(), values);

    // Regression: small dense range with a hole picks RangeWithBitmap.
    let values: Vec<u64> = vec![100, 101, 102, 103, 105];
    let segment = U64Segment::from_slice(&values);
    assert!(
        matches!(segment, U64Segment::RangeWithBitmap { .. }),
        "small dense range with hole should be RangeWithBitmap, got {:?}",
        std::mem::discriminant(&segment)
    );
    assert_eq!(segment.len(), 5);
    assert_eq!(segment.iter().collect::<Vec<_>>(), values);
}

#[test]
fn test_u128_byte_cost_to_usize() {
    // (input, expected) pairs: identity within range, then both
    // saturation boundaries (exactly usize::MAX and far beyond it).
    let cases = [
        (0u128, 0usize),
        (42, 42),
        (usize::MAX as u128, usize::MAX),
        (u128::MAX, usize::MAX),
    ];
    for (input, expected) in cases {
        assert_eq!(super::u128_byte_cost_to_usize(input), expected);
    }
}

#[test]
fn test_sorted_sequence_sizes_sparse_span_saturates_range_with_holes_cost() {
    // Five values scattered across [0, i64::MAX]: ~2^63 holes, so the
    // RangeWithHoles estimate (24 + 4 * n_holes) saturates to usize::MAX
    // and the tiny SortedArray estimate must come in below it.
    let stats = super::SegmentStats {
        count: 5,
        min: 0,
        max: i64::MAX as u64,
        sorted: true,
    };
    let [with_holes, _, sorted_array] = U64Segment::sorted_sequence_sizes(&stats);
    assert_eq!(with_holes, usize::MAX);
    assert!(sorted_array < with_holes);
}

#[test]
fn test_sorted_sequence_sizes_sorted_array_cost_saturates() {
    // Nearly full [0, u64::MAX] with one hole (count = u64::MAX): the
    // SortedArray estimate 24 + 2 * u64::MAX cannot fit in a 64-bit
    // usize, so it must saturate rather than wrap.
    let stats = super::SegmentStats {
        count: u64::MAX,
        min: 0,
        max: u64::MAX,
        sorted: true,
    };
    assert_eq!(U64Segment::sorted_sequence_sizes(&stats)[2], usize::MAX);
}

#[test]
fn test_sorted_sequence_sizes_full_span_bitmap_cost() {
    // Synthetic stats spanning the whole u64 slot space. Exercises the
    // `range_with_bitmap` cost path: 24 + ceil(2^64 / 8) still fits in a
    // 64-bit usize, so it stays finite and undercuts the saturated
    // RangeWithHoles estimate.
    let stats = super::SegmentStats {
        count: 1,
        min: 0,
        max: u64::MAX,
        sorted: true,
    };
    let sizes = U64Segment::sorted_sequence_sizes(&stats);
    assert!(sizes[1] < sizes[0]);
    assert!(sizes[1] < usize::MAX);
}

#[test]
fn test_with_new_high() {
// Test Range: contiguous sequence
Expand Down
Loading