From 2f9a175d2c3e6628787f453132205badffceb1ec Mon Sep 17 00:00:00 2001 From: ivscheianu Date: Tue, 14 Apr 2026 18:18:42 +0300 Subject: [PATCH 1/3] fix: prevent arithmetic overflow in U64Segment encoding selection for sparse/extreme row ID ranges --- rust/lance-table/src/rowids/segment.rs | 137 +++++++++++++++++++++---- 1 file changed, 119 insertions(+), 18 deletions(-) diff --git a/rust/lance-table/src/rowids/segment.rs b/rust/lance-table/src/rowids/segment.rs index 1c494f20f09..74602bbe6d8 100644 --- a/rust/lance-table/src/rowids/segment.rs +++ b/rust/lance-table/src/rowids/segment.rs @@ -88,13 +88,17 @@ struct SegmentStats { } impl SegmentStats { - fn n_holes(&self) -> u64 { + /// Number of missing values ("holes") in the range `[min, max]`. + /// + /// Returns `u128` because the total slot count `max - min + 1` can be up + /// to `2^64` (when `min = 0, max = u64::MAX`), which exceeds `u64::MAX`. + fn n_holes(&self) -> u128 { debug_assert!(self.sorted); if self.count == 0 { 0 } else { - let total_slots = self.max - self.min + 1; - total_slots - self.count + let total_slots = self.max as u128 - self.min as u128 + 1; + total_slots - self.count as u128 } } } @@ -149,15 +153,24 @@ impl U64Segment { } } + /// Estimate the serialized byte size of each sorted encoding variant. + /// + /// All arithmetic is performed in `u128` to avoid overflow when the range + /// span `max - min + 1` approaches or exceeds `2^64`. Infeasible sizes + /// saturate to `usize::MAX` so they always lose the `min()` comparison. 
fn sorted_sequence_sizes(stats: &SegmentStats) -> [usize; 3] { let n_holes = stats.n_holes(); - let total_slots = stats.max - stats.min + 1; + let total_slots = stats.max as u128 - stats.min as u128 + 1; - let range_with_holes = 24 + 4 * n_holes as usize; - let range_with_bitmap = 24 + (total_slots as f64 / 8.0).ceil() as usize; - let sorted_array = 24 + 2 * stats.count as usize; + let range_with_holes = 24u128.saturating_add(4u128.saturating_mul(n_holes)); + let range_with_bitmap = 24u128.saturating_add(total_slots.div_ceil(8)); + let sorted_array = 24u128.saturating_add(2u128.saturating_mul(stats.count as u128)); - [range_with_holes, range_with_bitmap, sorted_array] + [ + usize::try_from(range_with_holes).unwrap_or(usize::MAX), + usize::try_from(range_with_bitmap).unwrap_or(usize::MAX), + usize::try_from(sorted_array).unwrap_or(usize::MAX), + ] } fn from_stats_and_sequence( @@ -166,38 +179,41 @@ impl U64Segment { ) -> Self { if stats.sorted { let n_holes = stats.n_holes(); + // Range-backed encodings store an exclusive end as `Range`, + // which cannot represent `u64::MAX + 1`. Compute the end once and + // gate all range-backed branches on its representability. 
+ let exclusive_end = stats.max.checked_add(1); if stats.count == 0 { Self::Range(0..0) - } else if n_holes == 0 { - Self::Range(stats.min..(stats.max + 1)) - } else { + } else if n_holes == 0 && exclusive_end.is_some() { + Self::Range(stats.min..exclusive_end.unwrap()) + } else if let Some(end) = exclusive_end { let sizes = Self::sorted_sequence_sizes(&stats); let min_size = sizes.iter().min().unwrap(); if min_size == &sizes[0] { - let range = stats.min..(stats.max + 1); + let range = stats.min..end; let mut holes = Self::holes_in_slice(stats.min..=stats.max, sequence).collect::<Vec<_>>(); holes.sort_unstable(); let holes = EncodedU64Array::from(holes); - Self::RangeWithHoles { range, holes } } else if min_size == &sizes[1] { - let range = stats.min..(stats.max + 1); + let range = stats.min..end; let mut bitmap = Bitmap::new_full((stats.max - stats.min) as usize + 1); - for hole in Self::holes_in_slice(stats.min..=stats.max, sequence) { let offset = (hole - stats.min) as usize; bitmap.clear(offset); } - Self::RangeWithBitmap { range, bitmap } } else { - // Must use array, but at least it's sorted Self::SortedArray(EncodedU64Array::from_iter(sequence)) } + } else { + // max == u64::MAX: exclusive end is unrepresentable in Range, + // so no range-backed encoding can be used. + Self::SortedArray(EncodedU64Array::from_iter(sequence)) } } else { - // Must use array Self::Array(EncodedU64Array::from_iter(sequence)) } } @@ -707,6 +723,91 @@ mod test { ); } + #[test] + fn test_segment_overflow_boundary() { + // Sparse range spanning i64::MAX — the original overflow reproducer. + // n_holes ≈ 2^63, which overflows `4 * n_holes as usize` without u128 arithmetic. 
+ let values: Vec<u64> = vec![0, 1, 2, 100, i64::MAX as u64]; + let segment = U64Segment::from_slice(&values); + assert!( + matches!(segment, U64Segment::SortedArray(_)), + "sparse range spanning i64::MAX should be SortedArray, got {:?}", + std::mem::discriminant(&segment) + ); + assert_eq!(segment.len(), 5); + assert_eq!(segment.iter().collect::<Vec<_>>(), values); + + // Two values at u64 extremes — triggers n_holes() total_slots overflow + // (u64::MAX - 0 + 1 wraps to 0 without u128). + let values: Vec<u64> = vec![0, u64::MAX]; + let segment = U64Segment::from_slice(&values); + assert!( + matches!(segment, U64Segment::SortedArray(_)), + "full u64 span should be SortedArray, got {:?}", + std::mem::discriminant(&segment) + ); + assert_eq!(segment.len(), 2); + assert_eq!(segment.iter().collect::<Vec<_>>(), values); + + // Small dense set near u64::MAX — cost estimation correctly prefers a + // range-backed encoding, but Range cannot represent u64::MAX + 1 + // as the exclusive end. Must fall back to SortedArray. + let values: Vec<u64> = vec![u64::MAX - 3, u64::MAX - 1, u64::MAX]; + let segment = U64Segment::from_slice(&values); + assert!( + matches!(segment, U64Segment::SortedArray(_)), + "dense set near u64::MAX should be SortedArray (exclusive end unrepresentable), got {:?}", + std::mem::discriminant(&segment) + ); + assert_eq!(segment.len(), 3); + assert_eq!(segment.iter().collect::<Vec<_>>(), values); + + // Single value at u64::MAX — contiguous range with n_holes == 0, but + // exclusive end u64::MAX + 1 overflows. + let values: Vec<u64> = vec![u64::MAX]; + let segment = U64Segment::from_slice(&values); + assert!( + matches!(segment, U64Segment::SortedArray(_)), + "single u64::MAX should be SortedArray, got {:?}", + std::mem::discriminant(&segment) + ); + assert_eq!(segment.len(), 1); + assert_eq!(segment.iter().collect::<Vec<_>>(), values); + + // Contiguous range ending just below u64::MAX — exclusive end is + // representable, so Range encoding should still be used. 
+ let values: Vec<u64> = vec![u64::MAX - 3, u64::MAX - 2, u64::MAX - 1]; + let segment = U64Segment::from_slice(&values); + assert_eq!(segment, U64Segment::Range((u64::MAX - 3)..u64::MAX)); + assert_eq!(segment.len(), 3); + assert_eq!(segment.iter().collect::<Vec<_>>(), values); + + // Regression: normal dense range with few holes still picks RangeWithHoles. + // Needs total_slots > 32 * n_holes for RangeWithHoles to beat RangeWithBitmap. + let values: Vec<u64> = (100..1100).filter(|&x| x != 500).collect(); + let segment = U64Segment::from_slice(&values); + assert_eq!( + segment, + U64Segment::RangeWithHoles { + range: 100..1100, + holes: vec![500].into(), + } + ); + assert_eq!(segment.len(), 999); + assert_eq!(segment.iter().collect::<Vec<_>>(), values); + + // Regression: small dense range with hole picks RangeWithBitmap. + let values: Vec<u64> = vec![100, 101, 102, 103, 105]; + let segment = U64Segment::from_slice(&values); + assert!( + matches!(segment, U64Segment::RangeWithBitmap { .. }), + "small dense range with hole should be RangeWithBitmap, got {:?}", + std::mem::discriminant(&segment) + ); + assert_eq!(segment.len(), 5); + assert_eq!(segment.iter().collect::<Vec<_>>(), values); + } + #[test] fn test_with_new_high() { // Test Range: contiguous sequence From a5e4cb09ec272d27a6ea04d058d191a3cda70cdc Mon Sep 17 00:00:00 2001 From: ivscheianu Date: Tue, 14 Apr 2026 18:56:12 +0300 Subject: [PATCH 2/3] chore: increased coverage --- rust/lance-table/src/rowids/segment.rs | 63 ++++++++++++++++++++++++-- 1 file changed, 60 insertions(+), 3 deletions(-) diff --git a/rust/lance-table/src/rowids/segment.rs b/rust/lance-table/src/rowids/segment.rs index 74602bbe6d8..0ac15d7117c 100644 --- a/rust/lance-table/src/rowids/segment.rs +++ b/rust/lance-table/src/rowids/segment.rs @@ -6,6 +6,13 @@ use std::ops::{Range, RangeInclusive}; use super::{bitmap::Bitmap, encoded_array::EncodedU64Array}; use deepsize::DeepSizeOf; +/// Convert an estimated serialized byte cost from `u128` to `usize`, saturating +/// at 
[`usize::MAX`] when the value does not fit (infeasible encodings). +#[inline] +fn u128_byte_cost_to_usize(v: u128) -> usize { + usize::try_from(v).unwrap_or(usize::MAX) +} + /// Different ways to represent a sequence of distinct u64s. /// /// This is designed to be especially efficient for sequences that are sorted, @@ -167,9 +174,9 @@ impl U64Segment { let sorted_array = 24u128.saturating_add(2u128.saturating_mul(stats.count as u128)); [ - usize::try_from(range_with_holes).unwrap_or(usize::MAX), - usize::try_from(range_with_bitmap).unwrap_or(usize::MAX), - usize::try_from(sorted_array).unwrap_or(usize::MAX), + u128_byte_cost_to_usize(range_with_holes), + u128_byte_cost_to_usize(range_with_bitmap), + u128_byte_cost_to_usize(sorted_array), ] } @@ -808,6 +815,56 @@ mod test { assert_eq!(segment.iter().collect::>(), values); } + #[test] + fn test_u128_byte_cost_to_usize() { + assert_eq!(super::u128_byte_cost_to_usize(0), 0); + assert_eq!(super::u128_byte_cost_to_usize(42), 42); + assert_eq!(super::u128_byte_cost_to_usize(usize::MAX as u128), usize::MAX); + assert_eq!(super::u128_byte_cost_to_usize(u128::MAX), usize::MAX); + } + + #[test] + fn test_sorted_sequence_sizes_sparse_span_saturates_range_with_holes_cost() { + let stats = super::SegmentStats { + min: 0, + max: i64::MAX as u64, + count: 5, + sorted: true, + }; + let sizes = U64Segment::sorted_sequence_sizes(&stats); + assert_eq!(sizes[0], usize::MAX); + assert!(sizes[2] < sizes[0]); + } + + #[test] + fn test_sorted_sequence_sizes_sorted_array_cost_saturates() { + // Nearly full [0, u64::MAX] with one hole: count = u64::MAX, n_holes = 1. + // SortedArray cost 24 + 2 * u64::MAX does not fit in usize on 64-bit. 
+ let stats = super::SegmentStats { + min: 0, + max: u64::MAX, + count: u64::MAX, + sorted: true, + }; + let sizes = U64Segment::sorted_sequence_sizes(&stats); + assert_eq!(sizes[2], usize::MAX); + } + + #[test] + fn test_sorted_sequence_sizes_full_span_bitmap_cost() { + // Synthetic stats: full [0, u64::MAX] slot space; exercises `range_with_bitmap` + // cost path (always fits in `usize` on 64-bit targets). + let stats = super::SegmentStats { + min: 0, + max: u64::MAX, + count: 1, + sorted: true, + }; + let sizes = U64Segment::sorted_sequence_sizes(&stats); + assert!(sizes[1] < sizes[0]); + assert!(sizes[1] < usize::MAX); + } + #[test] fn test_with_new_high() { // Test Range: contiguous sequence From b563757e24f94457e5fa3d1600250ef9d7feff3c Mon Sep 17 00:00:00 2001 From: ivscheianu Date: Tue, 14 Apr 2026 18:58:04 +0300 Subject: [PATCH 3/3] chore: format --- rust/lance-table/src/rowids/segment.rs | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/rust/lance-table/src/rowids/segment.rs b/rust/lance-table/src/rowids/segment.rs index 0ac15d7117c..061601592d1 100644 --- a/rust/lance-table/src/rowids/segment.rs +++ b/rust/lance-table/src/rowids/segment.rs @@ -819,7 +819,10 @@ mod test { fn test_u128_byte_cost_to_usize() { assert_eq!(super::u128_byte_cost_to_usize(0), 0); assert_eq!(super::u128_byte_cost_to_usize(42), 42); - assert_eq!(super::u128_byte_cost_to_usize(usize::MAX as u128), usize::MAX); + assert_eq!( + super::u128_byte_cost_to_usize(usize::MAX as u128), + usize::MAX + ); assert_eq!(super::u128_byte_cost_to_usize(u128::MAX), usize::MAX); }