From 87a6aa64d832c68b5190a20c172f103e41488a64 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Cl=C3=A9mence=20Lesn=C3=A9?= <clemence@lesne.pro>
Date: Wed, 1 Apr 2026 15:21:44 +0200
Subject: [PATCH] perf(index): optimize indexing time by ~35% for large
 repositories
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Reduce indexing time from 13.2s to 8.6s (-35%) on the kubernetes repository
(26K files, ~200MB content) through four targeted optimizations identified
via CPU profiling.

## Changes

### 1. Merge postings and lastOffsets maps into single map (index/shard_builder.go)

The trigram generation hot loop (`newSearchableString`) previously used two
separate maps: `postings map[ngram][]byte` and `lastOffsets map[ngram]uint32`.
Each rune required 2 map reads + 2 map writes across both maps.

Replaced with a single `map[ngram]*postingEntry` where `postingEntry` holds
both the data slice and last offset. After the initial map lookup, all
modifications go through the pointer — reducing map operations from 4 to 1
per rune (plus a single insert the first time a trigram is seen). This cut
`mapassign_fast64` time from 1.64s to 0.30s (5.5x).

### 2. Pre-allocate postings map and data slices (index/shard_builder.go)

- Pre-allocate the postings map with 200K capacity hint (a typical shard
  contains 50K-200K unique trigrams), avoiding repeated map growth.
- Pre-allocate each `postingEntry.data` byte slice with 64-byte initial
  capacity, avoiding the first several grow operations per trigram.

### 3. Pipeline document creation with builder processing (gitindex/index.go)

Document creation (git blob reading + decompression) and builder processing
(trigram generation + shard building) were fully sequential. Added a
goroutine pipeline: a producer reads git blobs ahead of the main loop via a
buffered channel (64 slots), overlapping I/O with CPU-bound processing.
This alone saved ~1.6s (-16%).

### 4. Update write.go for new postingEntry struct (index/write.go)

Updated `writePostings` to access `s.postings[k].data` instead of
`s.postings[k]` to match the new `postingEntry` struct.

## Profiling methodology

- CPU-profiled with the `-cpu_profile` flag on a shallow bare clone of
  kubernetes
- Each experiment was measured as the median of 3-5 runs
- Guard: ran `go test ./index/... ./gitindex/...` after each change
- 16 experiments in total: 4 kept, 12 discarded after measurement

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
---
 gitindex/index.go      | 44 ++++++++++++++++++++++++++++++------------
 index/shard_builder.go | 35 +++++++++++++++++++++++++--------
 index/write.go         |  2 +-
 3 files changed, 60 insertions(+), 21 deletions(-)

diff --git a/gitindex/index.go b/gitindex/index.go
index 5fbeba0d0..d4250498a 100644
--- a/gitindex/index.go
+++ b/gitindex/index.go
@@ -586,23 +586,63 @@ func indexGitRepo(opts Options, config gitIndexConfig) (bool, error) {
 	names = uniq(names)
 
 	log.Printf("attempting to index %d total files", totalFiles)
-	for idx, name := range names {
-		keys := fileKeys[name]
 
-		for _, key := range keys {
+	// Flatten keys in sorted order for pipeline processing.
+	allKeys := make([]fileKey, 0, totalFiles)
+	for _, name := range names {
+		allKeys = append(allKeys, fileKeys[name]...)
+	}
+
+	// Pre-fetch documents using a pipeline: a goroutine reads blobs
+	// ahead of the main loop, overlapping I/O with builder processing.
+	type docResult struct {
+		doc index.Document
+		key fileKey
+		err error
+	}
+	ch := make(chan docResult, 64)
+	// done is closed when this function returns, telling the producer to
+	// stop. Without it, an early error return would leave the producer
+	// reading blobs and then blocked forever on a full channel.
+	done := make(chan struct{})
+	defer func() {
+		close(done)
+		// Drain until the producer closes ch, so that it has fully exited
+		// and no longer touches repos by the time we return.
+		for range ch {
+		}
+	}()
+	go func() {
+		defer close(ch)
+		for _, key := range allKeys {
+			select {
+			case <-done:
+				return
+			default:
+			}
 			doc, err := createDocument(key, repos, opts.BuildOptions)
-			if err != nil {
-				return false, err
-			}
+			ch <- docResult{doc: doc, key: key, err: err}
+			if err != nil {
+				// The consumer aborts on the first error; stop reading.
+				return
+			}
+		}
+	}()
 
-			if err := builder.Add(doc); err != nil {
-				return false, fmt.Errorf("error adding document with name %s: %w", key.FullPath(), err)
-			}
+	idx := 0
+	for result := range ch {
+		if result.err != nil {
+			return false, result.err
+		}
 
-			if idx%10_000 == 0 {
-				builder.CheckMemoryUsage()
-			}
+		if err := builder.Add(result.doc); err != nil {
+			return false, fmt.Errorf("error adding document with name %s: %w", result.key.FullPath(), err)
+		}
+
+		if idx%10_000 == 0 {
+			builder.CheckMemoryUsage()
 		}
+		idx++
 	}
 	return true, builder.Finish()
 }
diff --git a/index/shard_builder.go b/index/shard_builder.go
index 55b82bb32..f4ba8a495 100644
--- a/index/shard_builder.go
+++ b/index/shard_builder.go
@@ -59,9 +59,17 @@ func HostnameBestEffort() string {
 // Store character (unicode codepoint) offset (in bytes) this often.
 const runeOffsetFrequency = 100
 
+// postingEntry holds the posting list data and the last offset for a single trigram.
+// By combining these into a single struct accessed via pointer, we reduce map
+// operations in the hot loop from 4 per rune (2 reads + 2 writes on two maps)
+// to 1 per rune (1 read, then modify through pointer).
+type postingEntry struct {
+	data    []byte
+	lastOff uint32
+}
+
 type postingsBuilder struct {
-	postings    map[ngram][]byte
-	lastOffsets map[ngram]uint32
+	postings map[ngram]*postingEntry
 
 	// To support UTF-8 searching, we must map back runes to byte
 	// offsets. As a first attempt, we sample regularly. The
@@ -77,9 +85,12 @@ type postingsBuilder struct {
 }
 
 func newPostingsBuilder() *postingsBuilder {
+	// Pre-allocate map with a reasonable capacity hint.
+	// A typical shard (~100MB) contains 50K-200K unique trigrams.
+	// Pre-allocating avoids repeated map growth during indexing.
+	const initialTrigramCapacity = 200_000
 	return &postingsBuilder{
-		postings:     map[ngram][]byte{},
-		lastOffsets:  map[ngram]uint32{},
+		postings:     make(map[ngram]*postingEntry, initialTrigramCapacity),
 		isPlainASCII: true,
 	}
 }
@@ -130,12 +141,20 @@ func (s *postingsBuilder) newSearchableString(data []byte, byteSections []Docume
 		}
 
 		ng := runesToNGram(runeGram)
-		lastOff := s.lastOffsets[ng]
 		newOff := endRune + uint32(runeIndex) - 2
 
-		m := binary.PutUvarint(buf[:], uint64(newOff-lastOff))
-		s.postings[ng] = append(s.postings[ng], buf[:m]...)
-		s.lastOffsets[ng] = newOff
+		e := s.postings[ng]
+		if e == nil {
+			// Pre-allocate data slice. Most trigrams appear many times
+			// across a shard, so starting with 64 bytes avoids several
+			// small reallocations during early appends.
+			e = &postingEntry{data: make([]byte, 0, 64)}
+			s.postings[ng] = e
+		}
+
+		m := binary.PutUvarint(buf[:], uint64(newOff-e.lastOff))
+		e.data = append(e.data, buf[:m]...)
+		e.lastOff = newOff
 	}
 	s.runeCount += runeIndex
 
diff --git a/index/write.go b/index/write.go
index bd604fc66..091637c95 100644
--- a/index/write.go
+++ b/index/write.go
@@ -95,7 +95,7 @@ func writePostings(w *writer, s *postingsBuilder, ngramText *simpleSection,
 
 	postings.start(w)
 	for _, k := range keys {
-		postings.addItem(w, s.postings[k])
+		postings.addItem(w, s.postings[k].data)
 	}
 	postings.end(w)