diff --git a/gitindex/index.go b/gitindex/index.go
index 5fbeba0d0..d4250498a 100644
--- a/gitindex/index.go
+++ b/gitindex/index.go
@@ -586,23 +586,53 @@ func indexGitRepo(opts Options, config gitIndexConfig) (bool, error) {
 	names = uniq(names)
 
 	log.Printf("attempting to index %d total files", totalFiles)
-	for idx, name := range names {
-		keys := fileKeys[name]
-		for _, key := range keys {
+	// Flatten keys in sorted order for pipeline processing.
+	allKeys := make([]fileKey, 0, totalFiles)
+	for _, name := range names {
+		allKeys = append(allKeys, fileKeys[name]...)
+	}
+
+	// Pre-fetch documents using a pipeline: a goroutine reads blobs
+	// ahead of the main loop, overlapping I/O with builder processing.
+	type docResult struct {
+		doc index.Document
+		key fileKey
+		err error
+	}
+	ch := make(chan docResult, 64)
+	// stop is closed when this function returns. Without it, an early
+	// error return from the consumer loop below would leave the producer
+	// goroutine blocked forever on a full channel (still holding blob
+	// data) — a goroutine leak.
+	stop := make(chan struct{})
+	defer close(stop)
+	go func() {
+		defer close(ch)
+		for _, key := range allKeys {
 			doc, err := createDocument(key, repos, opts.BuildOptions)
-			if err != nil {
-				return false, err
-			}
+			select {
+			case ch <- docResult{doc: doc, key: key, err: err}:
+			case <-stop:
+				return
+			}
+		}
+	}()
 
-			if err := builder.Add(doc); err != nil {
-				return false, fmt.Errorf("error adding document with name %s: %w", key.FullPath(), err)
-			}
+	idx := 0
+	for result := range ch {
+		if result.err != nil {
+			return false, result.err
+		}
 
-			if idx%10_000 == 0 {
-				builder.CheckMemoryUsage()
-			}
+		if err := builder.Add(result.doc); err != nil {
+			return false, fmt.Errorf("error adding document with name %s: %w", result.key.FullPath(), err)
+		}
 
+		// Same cadence as before, now counted per document rather than
+		// per file name.
+		if idx%10_000 == 0 {
+			builder.CheckMemoryUsage()
 		}
+		idx++
 	}
 
 	return true, builder.Finish()
 }
diff --git a/index/shard_builder.go b/index/shard_builder.go
index 55b82bb32..f4ba8a495 100644
--- a/index/shard_builder.go
+++ b/index/shard_builder.go
@@ -59,9 +59,17 @@ func HostnameBestEffort() string {
 // Store character (unicode codepoint) offset (in bytes) this often.
 const runeOffsetFrequency = 100
 
+// postingEntry holds the posting list data and the last offset for a single trigram.
+// By combining these into a single struct accessed via pointer, we reduce map
+// operations in the hot loop from 4 per rune (2 reads + 2 writes on two maps)
+// to 1 per rune (1 read, then modify through pointer).
+type postingEntry struct {
+	data    []byte
+	lastOff uint32
+}
+
 type postingsBuilder struct {
-	postings    map[ngram][]byte
-	lastOffsets map[ngram]uint32
+	postings map[ngram]*postingEntry
 
 	// To support UTF-8 searching, we must map back runes to byte
 	// offsets. As a first attempt, we sample regularly. The
@@ -77,9 +85,12 @@ type postingsBuilder struct {
 }
 
 func newPostingsBuilder() *postingsBuilder {
+	// Pre-allocate map with a reasonable capacity hint.
+	// A typical shard (~100MB) contains 50K-200K unique trigrams.
+	// Pre-allocating avoids repeated map growth during indexing.
+	const initialTrigramCapacity = 200_000
 	return &postingsBuilder{
-		postings:    map[ngram][]byte{},
-		lastOffsets: map[ngram]uint32{},
+		postings:     make(map[ngram]*postingEntry, initialTrigramCapacity),
 		isPlainASCII: true,
 	}
 }
@@ -130,12 +141,20 @@ func (s *postingsBuilder) newSearchableString(data []byte, byteSections []DocumentSection) (*searchableString, []DocumentSection, error) {
 		if runeIndex < 2 {
 			continue
 		}
 		ng := runesToNGram(runeGram)
-		lastOff := s.lastOffsets[ng]
 		newOff := endRune + uint32(runeIndex) - 2
-		m := binary.PutUvarint(buf[:], uint64(newOff-lastOff))
-		s.postings[ng] = append(s.postings[ng], buf[:m]...)
-		s.lastOffsets[ng] = newOff
+		e := s.postings[ng]
+		if e == nil {
+			// Pre-allocate data slice. Most trigrams appear many times
+			// across a shard, so starting with 64 bytes avoids several
+			// small reallocations during early appends.
+			e = &postingEntry{data: make([]byte, 0, 64)}
+			s.postings[ng] = e
+		}
+
+		m := binary.PutUvarint(buf[:], uint64(newOff-e.lastOff))
+		e.data = append(e.data, buf[:m]...)
+		e.lastOff = newOff
 	}
 
 	s.runeCount += runeIndex
diff --git a/index/write.go b/index/write.go
index bd604fc66..091637c95 100644
--- a/index/write.go
+++ b/index/write.go
@@ -95,7 +95,7 @@ func writePostings(w *writer, s *postingsBuilder, ngramText *simpleSection,
 
 	postings.start(w)
 	for _, k := range keys {
-		postings.addItem(w, s.postings[k])
+		postings.addItem(w, s.postings[k].data)
 	}
 	postings.end(w)
 