From 482ea719feb65f4d7b3722bbbb0e8cc50bdaf5d7 Mon Sep 17 00:00:00 2001 From: tamirms Date: Mon, 25 May 2026 20:58:59 +0200 Subject: [PATCH] fastaggregation: avoid runContainer16.lazyIOR slow path in FastOr MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit (*Bitmap).lazyOR's same-key branch calls c1.lazyIOR(c2). When c1 is a runContainer16, lazyIOR falls back to ior -> inplaceUnion -> Add -> searchRange, which is O(N · logR) per element merged. This makes FastOr catastrophically slow over inputs containing run-encoded blocks: a synthetic benchmark with 15 bitmaps of ~6000-bit runs takes ~335 ms/op before this change. Pre-promote runContainer16 slots to bitmapContainer before the inner lazyIOR call. The bitmapContainer's lazy union is O(1024) regardless of cardinality, so the K-way fan-in stays linear in K. Mirrors the explicit toBitmapContainer pre-promotion in parallel.go ParHeapOr and the Java BitmapContainer.lazyor(RunContainer) path. Issue #81 has been tracking the deeper fix (proper runContainer16.lazyIOR/lazyOR implementations) since 2016; this is the surgical workaround at the single FastOr call site. BenchmarkFastOrRunContainers (added): before: 335 ms/op 12 KB/op 447 allocs/op after: 637 µs/op 335 KB/op 257 allocs/op (~526x) The output container type for slots that started as runContainer16 inputs is now bitmapContainer (or arrayContainer after repairAfterLazy if sparse). repairAfterLazy does not re-encode to runs; callers that want a run-optimised result should call RunOptimize() on the FastOr return value. ParHeapOr already behaves the same way. The full v2 test suite passes. Refs: https://github.com/RoaringBitmap/roaring/issues/81 --- benchmark_test.go | 62 ++++++++++++++++++++++++++++++++++++++++++++++ fastaggregation.go | 7 ++++++ 2 files changed, 69 insertions(+) diff --git a/benchmark_test.go b/benchmark_test.go index 8c85d27e..62153322 100644 --- a/benchmark_test.go +++ b/benchmark_test.go @@ -1356,3 +1356,65 @@ func BenchmarkCardinalityInRange(b *testing.B) { } } } + +// BenchmarkFastOrRunContainers measures FastOr over inputs containing +// runContainer16 slots (the shape AddRange and RunOptimize produce), +// exercising the runContainer16 -> bitmapContainer pre-promotion in +// (*Bitmap).lazyOR. See issue #81. +// +// go test -bench BenchmarkFastOrRunContainers -benchmem -run - -benchtime=2s +func BenchmarkFastOrRunContainers(b *testing.B) { + const numBitmaps = 15 + const blocksPerBitmap = 40 // each block = 1<<16 bits + const runsPerBlock = 8 + + rng := rand.New(rand.NewSource(42)) + bms := make([]*Bitmap, numBitmaps) + for i := 0; i < numBitmaps; i++ { + bm := NewBitmap() + for blk := 0; blk < blocksPerBitmap; blk++ { + base := uint64(blk) << 16 + offset := uint64(rng.Intn(1000)) + for r := 0; r < runsPerBlock; r++ { + start := base + offset + uint64(r)*8000 + uint64(i*37) + end := start + 6000 + blockEnd := base + (1 << 16) + if end > blockEnd { + end = blockEnd + } + if start >= blockEnd { + break + } + bm.AddRange(start, end) + } + } + bm.RunOptimize() + bms[i] = bm + } + + // Sanity-check: the workload must actually contain runContainer16, + // otherwise the bench wouldn't exercise the patched path. + hasRunContainer := false + for _, bm := range bms { + for _, c := range bm.highlowcontainer.containers { + if _, ok := c.(*runContainer16); ok { + hasRunContainer = true + break + } + } + if hasRunContainer { + break + } + } + if !hasRunContainer { + b.Fatalf("workload did not produce any runContainer16; bench would not exercise the patched path") + } + + b.ResetTimer() + for n := 0; n < b.N; n++ { + res := FastOr(bms...) + if res.GetCardinality() == 0 { + b.Fatal("unexpected empty result") + } + } +} diff --git a/fastaggregation.go b/fastaggregation.go index 7d0a92fe..3881b88a 100644 --- a/fastaggregation.go +++ b/fastaggregation.go @@ -81,6 +81,13 @@ main: s2 = x2.highlowcontainer.getKeyAtIndex(pos2) } else { c1 := x1.highlowcontainer.getWritableContainerAtIndex(pos1) + // runContainer16.lazyIOR falls back to a slow ior path + // (O(N log R) per merged element); promote to bitmapContainer + // first, whose lazy union is O(1024) regardless of cardinality. + // See https://github.com/RoaringBitmap/roaring/issues/81. + if rc, ok := c1.(*runContainer16); ok && !rc.isFull() { + c1 = rc.toBitmapContainer() + } x1.highlowcontainer.containers[pos1] = c1.lazyIOR(x2.highlowcontainer.getContainerAtIndex(pos2)) x1.highlowcontainer.needCopyOnWrite[pos1] = false pos1++