From 482ea719feb65f4d7b3722bbbb0e8cc50bdaf5d7 Mon Sep 17 00:00:00 2001
From: tamirms <tamirms@gmail.com>
Date: Mon, 25 May 2026 20:58:59 +0200
Subject: [PATCH] fastaggregation: avoid runContainer16.lazyIOR slow path in
 FastOr
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

(*Bitmap).lazyOR's same-key branch calls c1.lazyIOR(c2). When c1 is
a runContainer16, lazyIOR falls back to ior -> inplaceUnion -> Add ->
searchRange, which is O(N · logR) per element merged. This makes FastOr
catastrophically slow over inputs containing run-encoded blocks: a
synthetic benchmark with 15 bitmaps of ~6000-bit runs takes ~335 ms/op
before this change.

Pre-promote runContainer16 slots to bitmapContainer before the inner
lazyIOR call. The bitmapContainer's lazy union is O(1024) regardless
of cardinality, so the K-way fan-in stays linear in K. Mirrors the
explicit toBitmapContainer pre-promotion in parallel.go ParHeapOr and
the Java BitmapContainer.lazyor(RunContainer) path. Issue #81 has been
tracking the deeper fix (proper runContainer16.lazyIOR/lazyOR
implementations) since 2016; this is the surgical workaround at the
single FastOr call site.

BenchmarkFastOrRunContainers (added):
  before:  335 ms/op    12 KB/op   447 allocs/op
  after:   637 µs/op   335 KB/op   257 allocs/op    (~526x)

The output container type for slots that started as runContainer16
inputs is now bitmapContainer (or arrayContainer after repairAfterLazy
if sparse). repairAfterLazy does not re-encode to runs; callers that
want a run-optimised result should call RunOptimize() on the FastOr
return value. ParHeapOr already behaves the same way.

The full v2 test suite passes.

Refs: https://github.com/RoaringBitmap/roaring/issues/81
---
 benchmark_test.go  | 62 ++++++++++++++++++++++++++++++++++++++++++++++
 fastaggregation.go |  7 ++++++
 2 files changed, 69 insertions(+)

diff --git a/benchmark_test.go b/benchmark_test.go
index 8c85d27e..62153322 100644
--- a/benchmark_test.go
+++ b/benchmark_test.go
@@ -1356,3 +1356,65 @@ func BenchmarkCardinalityInRange(b *testing.B) {
 		}
 	}
 }
+
+// BenchmarkFastOrRunContainers measures FastOr over inputs containing
+// runContainer16 slots (the shape AddRange and RunOptimize produce),
+// exercising the runContainer16 -> bitmapContainer pre-promotion in
+// (*Bitmap).lazyOR. See issue #81.
+//
+// go test -bench BenchmarkFastOrRunContainers -benchmem -run - -benchtime=2s
+func BenchmarkFastOrRunContainers(b *testing.B) {
+	const numBitmaps = 15
+	const blocksPerBitmap = 40 // each block = 1<<16 bits
+	const runsPerBlock = 8
+
+	rng := rand.New(rand.NewSource(42))
+	bms := make([]*Bitmap, numBitmaps)
+	for i := 0; i < numBitmaps; i++ {
+		bm := NewBitmap()
+		for blk := 0; blk < blocksPerBitmap; blk++ {
+			base := uint64(blk) << 16
+			offset := uint64(rng.Intn(1000))
+			for r := 0; r < runsPerBlock; r++ {
+				start := base + offset + uint64(r)*8000 + uint64(i*37)
+				end := start + 6000
+				blockEnd := base + (1 << 16)
+				if end > blockEnd {
+					end = blockEnd
+				}
+				if start >= blockEnd {
+					break
+				}
+				bm.AddRange(start, end)
+			}
+		}
+		bm.RunOptimize()
+		bms[i] = bm
+	}
+
+	// Sanity-check: the workload must actually contain runContainer16,
+	// otherwise the bench wouldn't exercise the patched path.
+	hasRunContainer := false
+	for _, bm := range bms {
+		for _, c := range bm.highlowcontainer.containers {
+			if _, ok := c.(*runContainer16); ok {
+				hasRunContainer = true
+				break
+			}
+		}
+		if hasRunContainer {
+			break
+		}
+	}
+	if !hasRunContainer {
+		b.Fatalf("workload did not produce any runContainer16; bench would not exercise the patched path")
+	}
+
+	b.ResetTimer()
+	for n := 0; n < b.N; n++ {
+		res := FastOr(bms...)
+		if res.GetCardinality() == 0 {
+			b.Fatal("unexpected empty result")
+		}
+	}
+}
diff --git a/fastaggregation.go b/fastaggregation.go
index 7d0a92fe..3881b88a 100644
--- a/fastaggregation.go
+++ b/fastaggregation.go
@@ -81,6 +81,13 @@ main:
 				s2 = x2.highlowcontainer.getKeyAtIndex(pos2)
 			} else {
 				c1 := x1.highlowcontainer.getWritableContainerAtIndex(pos1)
+				// runContainer16.lazyIOR falls back to a slow ior path
+				// (O(N log R) per merged element); promote to bitmapContainer
+				// first, whose lazy union is O(1024) regardless of cardinality.
+				// See https://github.com/RoaringBitmap/roaring/issues/81.
+				if rc, ok := c1.(*runContainer16); ok && !rc.isFull() {
+					c1 = rc.toBitmapContainer()
+				}
 				x1.highlowcontainer.containers[pos1] = c1.lazyIOR(x2.highlowcontainer.getContainerAtIndex(pos2))
 				x1.highlowcontainer.needCopyOnWrite[pos1] = false
 				pos1++