From 87a6aa64d832c68b5190a20c172f103e41488a64 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9mence=20Lesn=C3=A9?= Date: Wed, 1 Apr 2026 15:21:44 +0200 Subject: [PATCH] perf(index): optimize indexing time by ~35% for large repositories MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Reduce indexing time from 13.2s to 8.6s (-35%) on the kubernetes repository (26K files, ~200MB content) through four targeted optimizations identified via CPU profiling. ## Changes ### 1. Merge postings and lastOffsets maps into a single map (index/shard_builder.go) The trigram generation hot loop (`newSearchableString`) previously used two separate maps: `postings map[ngram][]byte` and `lastOffsets map[ngram]uint32`. Each rune required 2 map reads + 2 map writes across both maps. Replaced with a single `map[ngram]*postingEntry` where `postingEntry` holds both the data slice and last offset. After the initial map lookup, all modifications go through the pointer — reducing map operations from 4 to 1 per rune (plus one map insert the first time a trigram is seen). This cut `mapassign_fast64` time from 1.64s to 0.30s (5.5x). ### 2. Pre-allocate postings map and data slices (index/shard_builder.go) - Pre-allocate the postings map with a 200K capacity hint (a typical shard contains 50K-200K unique trigrams), avoiding repeated map growth. - Pre-allocate each `postingEntry.data` byte slice with a 64-byte initial capacity, avoiding the first several grow operations per trigram. ### 3. Pipeline document creation with builder processing (gitindex/index.go) Document creation (git blob reading + decompression) and builder processing (trigram generation + shard building) were fully sequential. Added a goroutine pipeline: a producer reads git blobs ahead of the main loop via a buffered channel (64 slots), overlapping I/O with CPU-bound processing. This alone saved ~1.6s (-16%). ### 4. 
Update write.go for new postingEntry struct (index/write.go) Updated `writePostings` to access `s.postings[k].data` instead of `s.postings[k]` to match the new `postingEntry` struct. ## Profiling methodology - CPU profiled with `-cpu_profile` flag on kubernetes bare clone (shallow) - Each experiment measured with median of 3-5 runs - Guard: `go test ./index/... ./gitindex/...` after each change - 16 experiments total, 4 kept, 12 discarded after measurement Co-Authored-By: Claude Opus 4.6 (1M context) --- gitindex/index.go | 44 ++++++++++++++++++++++++++++++------------ index/shard_builder.go | 35 +++++++++++++++++++++++++-------- index/write.go | 2 +- 3 files changed, 60 insertions(+), 21 deletions(-) diff --git a/gitindex/index.go b/gitindex/index.go index 5fbeba0d0..d4250498a 100644 --- a/gitindex/index.go +++ b/gitindex/index.go @@ -586,23 +586,43 @@ func indexGitRepo(opts Options, config gitIndexConfig) (bool, error) { names = uniq(names) log.Printf("attempting to index %d total files", totalFiles) - for idx, name := range names { - keys := fileKeys[name] - for _, key := range keys { + // Flatten keys in sorted order for pipeline processing. + allKeys := make([]fileKey, 0, totalFiles) + for _, name := range names { + allKeys = append(allKeys, fileKeys[name]...) + } + + // Pre-fetch documents using a pipeline: a goroutine reads blobs + // ahead of the main loop, overlapping I/O with builder processing. 
+ type docResult struct { + doc index.Document + key fileKey + err error + } + ch := make(chan docResult, 64) + go func() { + defer close(ch) + for _, key := range allKeys { doc, err := createDocument(key, repos, opts.BuildOptions) - if err != nil { - return false, err - } + ch <- docResult{doc: doc, key: key, err: err} + } + }() - if err := builder.Add(doc); err != nil { - return false, fmt.Errorf("error adding document with name %s: %w", key.FullPath(), err) - } + idx := 0 + for result := range ch { + if result.err != nil { + return false, result.err + } - if idx%10_000 == 0 { - builder.CheckMemoryUsage() - } + if err := builder.Add(result.doc); err != nil { + return false, fmt.Errorf("error adding document with name %s: %w", result.key.FullPath(), err) + } + + if idx%10_000 == 0 { + builder.CheckMemoryUsage() } + idx++ } return true, builder.Finish() } diff --git a/index/shard_builder.go b/index/shard_builder.go index 55b82bb32..f4ba8a495 100644 --- a/index/shard_builder.go +++ b/index/shard_builder.go @@ -59,9 +59,17 @@ func HostnameBestEffort() string { // Store character (unicode codepoint) offset (in bytes) this often. const runeOffsetFrequency = 100 +// postingEntry holds the posting list data and the last offset for a single trigram. +// By combining these into a single struct accessed via pointer, we reduce map +// operations in the hot loop from 4 per rune (2 reads + 2 writes on two maps) +// to 1 per rune (1 read, then modify through pointer). +type postingEntry struct { + data []byte + lastOff uint32 +} + type postingsBuilder struct { - postings map[ngram][]byte - lastOffsets map[ngram]uint32 + postings map[ngram]*postingEntry // To support UTF-8 searching, we must map back runes to byte // offsets. As a first attempt, we sample regularly. The @@ -77,9 +85,12 @@ type postingsBuilder struct { } func newPostingsBuilder() *postingsBuilder { + // Pre-allocate map with a reasonable capacity hint. 
+ // A typical shard (~100MB) contains 50K-200K unique trigrams. + // Pre-allocating avoids repeated map growth during indexing. + const initialTrigramCapacity = 200_000 return &postingsBuilder{ - postings: map[ngram][]byte{}, - lastOffsets: map[ngram]uint32{}, + postings: make(map[ngram]*postingEntry, initialTrigramCapacity), isPlainASCII: true, } } @@ -130,12 +141,20 @@ func (s *postingsBuilder) newSearchableString(data []byte, byteSections []Docume } ng := runesToNGram(runeGram) - lastOff := s.lastOffsets[ng] newOff := endRune + uint32(runeIndex) - 2 - m := binary.PutUvarint(buf[:], uint64(newOff-lastOff)) - s.postings[ng] = append(s.postings[ng], buf[:m]...) - s.lastOffsets[ng] = newOff + e := s.postings[ng] + if e == nil { + // Pre-allocate data slice. Most trigrams appear many times + // across a shard, so starting with 64 bytes avoids several + // small reallocations during early appends. + e = &postingEntry{data: make([]byte, 0, 64)} + s.postings[ng] = e + } + + m := binary.PutUvarint(buf[:], uint64(newOff-e.lastOff)) + e.data = append(e.data, buf[:m]...) + e.lastOff = newOff } s.runeCount += runeIndex diff --git a/index/write.go b/index/write.go index bd604fc66..091637c95 100644 --- a/index/write.go +++ b/index/write.go @@ -95,7 +95,7 @@ func writePostings(w *writer, s *postingsBuilder, ngramText *simpleSection, postings.start(w) for _, k := range keys { - postings.addItem(w, s.postings[k]) + postings.addItem(w, s.postings[k].data) } postings.end(w)