From 96e94a55419c33414db1a5f989ac3a2bd09c1c6e Mon Sep 17 00:00:00 2001 From: Keegan Carruthers-Smith Date: Thu, 26 Mar 2026 18:16:21 +0200 Subject: [PATCH] gitindex: set filter for cat-file At Sourcegraph we do a sparse clone which excludes files based on max file size. However, cat-file will hydrate in missing objects. So we pass in the same filter to avoid hydrating in those files. --- gitindex/catfile.go | 49 ++++++---- gitindex/catfile_bench_test.go | 6 +- gitindex/catfile_hardening_test.go | 149 +++++++++++++++-------------- gitindex/catfile_test.go | 91 ++++++++++++++---- gitindex/index.go | 24 ++++- gitindex/index_test.go | 30 ++++++ 6 files changed, 235 insertions(+), 114 deletions(-) diff --git a/gitindex/catfile.go b/gitindex/catfile.go index 0c7c7753c..1933c337c 100644 --- a/gitindex/catfile.go +++ b/gitindex/catfile.go @@ -28,6 +28,10 @@ import ( "github.com/go-git/go-git/v5/plumbing" ) +type catfileReaderOptions struct { + filterSpec string +} + // catfileReader provides streaming access to git blob objects via a pipelined // "git cat-file --batch --buffer" process. A writer goroutine feeds all blob // SHAs to stdin while the caller reads responses one at a time, similar to @@ -39,14 +43,15 @@ import ( // // Usage: // -// cr, err := newCatfileReader(repoDir, ids) +// cr, err := newCatfileReader(repoDir, ids, catfileReaderOptions{}) // if err != nil { ... } // defer cr.Close() // // for { -// size, missing, err := cr.Next() +// size, missing, excluded, err := cr.Next() // if err == io.EOF { break } // if missing { continue } +// if excluded { continue } // if size > maxSize { continue } // unread bytes auto-skipped // content := make([]byte, size) // io.ReadFull(cr, content) @@ -66,9 +71,14 @@ type catfileReader struct { // newCatfileReader starts a "git cat-file --batch --buffer" process and feeds // all ids to its stdin via a background goroutine. The caller must call Close -// when done. -func newCatfileReader(repoDir string, ids []plumbing.Hash) (*catfileReader, error) { - cmd := exec.Command("git", "cat-file", "--batch", "--buffer") +// when done. Pass a zero-value catfileReaderOptions when no options are needed. +func newCatfileReader(repoDir string, ids []plumbing.Hash, opts catfileReaderOptions) (*catfileReader, error) { + args := []string{"cat-file", "--batch", "--buffer"} + if opts.filterSpec != "" { + args = append(args, "--filter="+opts.filterSpec) + } + + cmd := exec.Command("git", args...) cmd.Dir = repoDir stdin, err := cmd.StdinPipe() @@ -114,16 +124,17 @@ func newCatfileReader(repoDir string, ids []plumbing.Hash) (*catfileReader, erro } // Next advances to the next blob entry. It returns the blob's size and whether -// it is missing. Any unread content from the previous entry is automatically -// discarded. Returns io.EOF when all entries have been consumed. +// it is missing or excluded by the configured filter. Any unread content from +// the previous entry is automatically discarded. Returns io.EOF when all +// entries have been consumed. // -// After Next returns successfully with missing=false, call Read to consume the -// blob content, or call Next again to skip it. -func (cr *catfileReader) Next() (size int, missing bool, err error) { +// After Next returns successfully with missing=false and excluded=false, call +// Read to consume the blob content, or call Next again to skip it. +func (cr *catfileReader) Next() (size int, missing bool, excluded bool, err error) { // Discard unread content from the previous entry. if cr.pending > 0 { if _, err := cr.reader.Discard(cr.pending); err != nil { - return 0, false, fmt.Errorf("discard pending bytes: %w", err) + return 0, false, false, fmt.Errorf("discard pending bytes: %w", err) } cr.pending = 0 } @@ -131,29 +142,33 @@ func (cr *catfileReader) Next() (size int, missing bool, err error) { headerBytes, err := cr.reader.ReadBytes('\n') if err != nil { if err == io.EOF { - return 0, false, io.EOF + return 0, false, false, io.EOF } - return 0, false, fmt.Errorf("read header: %w", err) + return 0, false, false, fmt.Errorf("read header: %w", err) } header := headerBytes[:len(headerBytes)-1] // trim \n if bytes.HasSuffix(header, []byte(" missing")) { - return 0, true, nil + return 0, true, false, nil + } + + if bytes.HasSuffix(header, []byte(" excluded")) { + return 0, false, true, nil } // Parse size from " ". lastSpace := bytes.LastIndexByte(header, ' ') if lastSpace == -1 { - return 0, false, fmt.Errorf("unexpected header: %q", header) + return 0, false, false, fmt.Errorf("unexpected header: %q", header) } size, err = strconv.Atoi(string(header[lastSpace+1:])) if err != nil { - return 0, false, fmt.Errorf("parse size from %q: %w", header, err) + return 0, false, false, fmt.Errorf("parse size from %q: %w", header, err) } // Track pending bytes: content + trailing LF. cr.pending = size + 1 - return size, false, nil + return size, false, false, nil } // Read reads from the current blob's content. Implements io.Reader. Returns diff --git a/gitindex/catfile_bench_test.go b/gitindex/catfile_bench_test.go index ec2626a7b..5b8897380 100644 --- a/gitindex/catfile_bench_test.go +++ b/gitindex/catfile_bench_test.go @@ -130,17 +130,17 @@ func BenchmarkBlobRead_CatfileReader(b *testing.B) { var totalBytes int64 for b.Loop() { totalBytes = 0 - cr, err := newCatfileReader(gitDir, subset) + cr, err := newCatfileReader(gitDir, subset, catfileReaderOptions{}) if err != nil { b.Fatalf("newCatfileReader: %v", err) } for range subset { - size, missing, err := cr.Next() + size, missing, excluded, err := cr.Next() if err != nil { cr.Close() b.Fatalf("Next: %v", err) } - if missing { + if missing || excluded { continue } content := make([]byte, size) diff --git a/gitindex/catfile_hardening_test.go b/gitindex/catfile_hardening_test.go index 06a730a25..7409f7fc4 100644 --- a/gitindex/catfile_hardening_test.go +++ b/gitindex/catfile_hardening_test.go @@ -22,13 +22,13 @@ func TestCatfileReader_DoubleClose(t *testing.T) { repoDir, blobs := createTestRepo(t) ids := []plumbing.Hash{blobs["hello.txt"]} - cr, err := newCatfileReader(repoDir, ids) + cr, err := newCatfileReader(repoDir, ids, catfileReaderOptions{}) if err != nil { t.Fatal(err) } // Consume the entry so the process can exit cleanly. - if _, _, err := cr.Next(); err != nil { + if _, _, _, err := cr.Next(); err != nil { t.Fatal(err) } @@ -61,13 +61,13 @@ func TestCatfileReader_ConcurrentClose(t *testing.T) { blobs["binary.bin"], } - cr, err := newCatfileReader(repoDir, ids) + cr, err := newCatfileReader(repoDir, ids, catfileReaderOptions{}) if err != nil { t.Fatal(err) } // Read one entry, leave two unconsumed. - if _, _, err := cr.Next(); err != nil { + if _, _, _, err := cr.Next(); err != nil { t.Fatal(err) } @@ -111,7 +111,7 @@ func TestCatfileReader_CloseWithoutReading(t *testing.T) { blobs["empty.txt"], } - cr, err := newCatfileReader(repoDir, ids) + cr, err := newCatfileReader(repoDir, ids, catfileReaderOptions{}) if err != nil { t.Fatal(err) } @@ -168,13 +168,13 @@ git commit -m "many files" ids = append(ids, plumbing.NewHash(string(out[:len(out)-1]))) } - cr, err := newCatfileReader(repoDir, ids) + cr, err := newCatfileReader(repoDir, ids, catfileReaderOptions{}) if err != nil { t.Fatal(err) } // Read only 1 of 200 entries. - if _, _, err := cr.Next(); err != nil { + if _, _, _, err := cr.Next(); err != nil { t.Fatal(err) } @@ -207,7 +207,7 @@ func TestCatfileReader_ReadWithoutNext(t *testing.T) { repoDir, blobs := createTestRepo(t) ids := []plumbing.Hash{blobs["hello.txt"]} - cr, err := newCatfileReader(repoDir, ids) + cr, err := newCatfileReader(repoDir, ids, catfileReaderOptions{}) if err != nil { t.Fatal(err) } @@ -227,13 +227,13 @@ func TestCatfileReader_ReadAfterFullConsumption(t *testing.T) { repoDir, blobs := createTestRepo(t) ids := []plumbing.Hash{blobs["hello.txt"]} - cr, err := newCatfileReader(repoDir, ids) + cr, err := newCatfileReader(repoDir, ids, catfileReaderOptions{}) if err != nil { t.Fatal(err) } defer cr.Close() - size, _, _ := cr.Next() + size, _, _, _ := cr.Next() content := make([]byte, size) if _, err := io.ReadFull(cr, content); err != nil { t.Fatal(err) @@ -256,13 +256,13 @@ func TestCatfileReader_SmallBufferReads(t *testing.T) { repoDir, blobs := createTestRepo(t) ids := []plumbing.Hash{blobs["hello.txt"]} - cr, err := newCatfileReader(repoDir, ids) + cr, err := newCatfileReader(repoDir, ids, catfileReaderOptions{}) if err != nil { t.Fatal(err) } defer cr.Close() - size, _, _ := cr.Next() + size, _, _, _ := cr.Next() var result []byte buf := make([]byte, 1) @@ -297,14 +297,14 @@ func TestCatfileReader_PartialReadThenNext(t *testing.T) { blobs["binary.bin"], // variable, starts with 0x00 } - cr, err := newCatfileReader(repoDir, ids) + cr, err := newCatfileReader(repoDir, ids, catfileReaderOptions{}) if err != nil { t.Fatal(err) } defer cr.Close() // Read only 5 of 12 bytes from hello.txt. - size, _, _ := cr.Next() + size, _, _, _ := cr.Next() if size != 12 { t.Fatalf("hello.txt size = %d, want 12", size) } @@ -317,7 +317,7 @@ func TestCatfileReader_PartialReadThenNext(t *testing.T) { } // Advance — must discard remaining 7 content bytes + trailing LF. - size, _, err = cr.Next() + size, _, _, err = cr.Next() if err != nil { t.Fatalf("Next binary.bin after partial read: %v", err) } @@ -343,13 +343,13 @@ func TestCatfileReader_PartialReadExactlyOneByteShort(t *testing.T) { blobs["binary.bin"], // starts with 0x00 } - cr, err := newCatfileReader(repoDir, ids) + cr, err := newCatfileReader(repoDir, ids, catfileReaderOptions{}) if err != nil { t.Fatal(err) } defer cr.Close() - size, _, _ := cr.Next() + size, _, _, _ := cr.Next() // Read exactly size-1 bytes — leaves 1 content byte + trailing LF. buf := make([]byte, size-1) if _, err := io.ReadFull(cr, buf); err != nil { @@ -361,11 +361,11 @@ func TestCatfileReader_PartialReadExactlyOneByteShort(t *testing.T) { // Advance — pending should be 2 (1 content byte + 1 LF). The // Discard call must handle this exact boundary correctly. - size, missing, err := cr.Next() + size, missing, excluded, err := cr.Next() if err != nil { t.Fatalf("Next after size-1 partial read: %v", err) } - if missing { + if missing || excluded { t.Fatal("binary.bin unexpectedly missing") } @@ -386,13 +386,13 @@ func TestCatfileReader_PartialReadExactlyOneByteShort(t *testing.T) { func TestCatfileReader_EmptyIds(t *testing.T) { repoDir, _ := createTestRepo(t) - cr, err := newCatfileReader(repoDir, nil) + cr, err := newCatfileReader(repoDir, nil, catfileReaderOptions{}) if err != nil { t.Fatal(err) } defer cr.Close() - _, _, err = cr.Next() + _, _, _, err = cr.Next() if err != io.EOF { t.Fatalf("expected io.EOF for empty ids, got %v", err) } @@ -408,18 +408,18 @@ func TestCatfileReader_MultipleEmptyBlobs(t *testing.T) { emptyID := blobs["empty.txt"] ids := []plumbing.Hash{emptyID, emptyID, emptyID, emptyID, emptyID} - cr, err := newCatfileReader(repoDir, ids) + cr, err := newCatfileReader(repoDir, ids, catfileReaderOptions{}) if err != nil { t.Fatal(err) } defer cr.Close() for i := range ids { - size, missing, err := cr.Next() + size, missing, excluded, err := cr.Next() if err != nil { t.Fatalf("Next #%d: %v", i, err) } - if missing { + if missing || excluded { t.Fatalf("#%d unexpectedly missing", i) } if size != 0 { @@ -428,7 +428,7 @@ func TestCatfileReader_MultipleEmptyBlobs(t *testing.T) { // Don't read — Next should discard the trailing LF for us. } - _, _, err = cr.Next() + _, _, _, err = cr.Next() if err != io.EOF { t.Fatalf("expected EOF after %d empty blobs, got %v", len(ids), err) } @@ -440,17 +440,17 @@ func TestCatfileReader_MultipleEmptyBlobs(t *testing.T) { func TestCatfileReader_EmptyBlobRead(t *testing.T) { repoDir, blobs := createTestRepo(t) ids := []plumbing.Hash{ - blobs["empty.txt"], // 0 bytes - blobs["hello.txt"], // 12 bytes — sentinel + blobs["empty.txt"], // 0 bytes + blobs["hello.txt"], // 12 bytes — sentinel } - cr, err := newCatfileReader(repoDir, ids) + cr, err := newCatfileReader(repoDir, ids, catfileReaderOptions{}) if err != nil { t.Fatal(err) } defer cr.Close() - size, _, _ := cr.Next() + size, _, _, _ := cr.Next() if size != 0 { t.Fatalf("empty.txt size = %d", size) } @@ -464,7 +464,7 @@ func TestCatfileReader_EmptyBlobRead(t *testing.T) { // The trailing LF must have been consumed. Verify by reading the // next entry — if the LF leaked, the header parse would fail. - size, _, err = cr.Next() + size, _, _, err = cr.Next() if err != nil { t.Fatalf("Next hello.txt after empty blob Read: %v", err) } @@ -494,23 +494,26 @@ func TestCatfileReader_AllMissing(t *testing.T) { plumbing.NewHash("2222222222222222222222222222222222222222"), } - cr, err := newCatfileReader(repoDir, ids) + cr, err := newCatfileReader(repoDir, ids, catfileReaderOptions{}) if err != nil { t.Fatal(err) } defer cr.Close() for i, id := range ids { - _, missing, err := cr.Next() + _, missing, excluded, err := cr.Next() if err != nil { t.Fatalf("Next #%d (%s): %v", i, id, err) } + if excluded { + t.Errorf("expected #%d (%s) to be missing, not excluded", i, id) + } if !missing { t.Errorf("expected #%d (%s) to be missing", i, id) } } - _, _, err = cr.Next() + _, _, _, err = cr.Next() if err != io.EOF { t.Fatalf("expected EOF after all missing, got %v", err) } @@ -532,22 +535,22 @@ func TestCatfileReader_AlternatingMissingPresent(t *testing.T) { blobs["binary.bin"], } - cr, err := newCatfileReader(repoDir, ids) + cr, err := newCatfileReader(repoDir, ids, catfileReaderOptions{}) if err != nil { t.Fatal(err) } defer cr.Close() // fake1 — missing - _, missing, err := cr.Next() - if err != nil || !missing { - t.Fatalf("fake1: err=%v missing=%v", err, missing) + _, missing, excluded, err := cr.Next() + if err != nil || !missing || excluded { + t.Fatalf("fake1: err=%v missing=%v excluded=%v", err, missing, excluded) } // hello.txt — present, read it - size, missing, err := cr.Next() - if err != nil || missing { - t.Fatalf("hello.txt: err=%v missing=%v", err, missing) + size, missing, excluded, err := cr.Next() + if err != nil || missing || excluded { + t.Fatalf("hello.txt: err=%v missing=%v excluded=%v", err, missing, excluded) } content := make([]byte, size) if _, err := io.ReadFull(cr, content); err != nil { @@ -558,24 +561,24 @@ func TestCatfileReader_AlternatingMissingPresent(t *testing.T) { } // fake2 — missing - _, missing, err = cr.Next() - if err != nil || !missing { - t.Fatalf("fake2: err=%v missing=%v", err, missing) + _, missing, excluded, err = cr.Next() + if err != nil || !missing || excluded { + t.Fatalf("fake2: err=%v missing=%v excluded=%v", err, missing, excluded) } // empty.txt — present, skip it - size, missing, err = cr.Next() - if err != nil || missing { - t.Fatalf("empty.txt: err=%v missing=%v", err, missing) + size, missing, excluded, err = cr.Next() + if err != nil || missing || excluded { + t.Fatalf("empty.txt: err=%v missing=%v excluded=%v", err, missing, excluded) } if size != 0 { t.Errorf("empty.txt size = %d", size) } // binary.bin — present, read it - size, missing, err = cr.Next() - if err != nil || missing { - t.Fatalf("binary.bin: err=%v missing=%v", err, missing) + size, missing, excluded, err = cr.Next() + if err != nil || missing || excluded { + t.Fatalf("binary.bin: err=%v missing=%v excluded=%v", err, missing, excluded) } binContent := make([]byte, size) if _, err := io.ReadFull(cr, binContent); err != nil { @@ -585,7 +588,7 @@ func TestCatfileReader_AlternatingMissingPresent(t *testing.T) { t.Errorf("binary.bin[0] = 0x%02x, want 0x00", binContent[0]) } - _, _, err = cr.Next() + _, _, _, err = cr.Next() if err != io.EOF { t.Fatalf("expected EOF, got %v", err) } @@ -601,26 +604,26 @@ func TestCatfileReader_MissingThenSkip(t *testing.T) { fake := plumbing.NewHash("deadbeefdeadbeefdeadbeefdeadbeefdeadbeef") ids := []plumbing.Hash{ fake, - blobs["large.bin"], // 64KB — skip without reading - blobs["hello.txt"], // sentinel — read to verify integrity + blobs["large.bin"], // 64KB — skip without reading + blobs["hello.txt"], // sentinel — read to verify integrity } - cr, err := newCatfileReader(repoDir, ids) + cr, err := newCatfileReader(repoDir, ids, catfileReaderOptions{}) if err != nil { t.Fatal(err) } defer cr.Close() // missing - _, missing, _ := cr.Next() - if !missing { + _, missing, excluded, _ := cr.Next() + if !missing || excluded { t.Fatal("expected missing") } // large.bin — skip - size, missing, err := cr.Next() - if err != nil || missing { - t.Fatalf("large.bin: err=%v missing=%v", err, missing) + size, missing, excluded, err := cr.Next() + if err != nil || missing || excluded { + t.Fatalf("large.bin: err=%v missing=%v excluded=%v", err, missing, excluded) } if size != 64*1024 { t.Fatalf("large.bin size = %d", size) @@ -628,9 +631,9 @@ func TestCatfileReader_MissingThenSkip(t *testing.T) { // deliberately don't read // hello.txt — read after missing+skip - size, missing, err = cr.Next() - if err != nil || missing { - t.Fatalf("hello.txt: err=%v missing=%v", err, missing) + size, missing, excluded, err = cr.Next() + if err != nil || missing || excluded { + t.Fatalf("hello.txt: err=%v missing=%v excluded=%v", err, missing, excluded) } content := make([]byte, size) if _, err := io.ReadFull(cr, content); err != nil { @@ -649,26 +652,26 @@ func TestCatfileReader_RepeatedNextAfterEOF(t *testing.T) { repoDir, blobs := createTestRepo(t) ids := []plumbing.Hash{blobs["hello.txt"]} - cr, err := newCatfileReader(repoDir, ids) + cr, err := newCatfileReader(repoDir, ids, catfileReaderOptions{}) if err != nil { t.Fatal(err) } defer cr.Close() // Consume and skip the only entry. - if _, _, err := cr.Next(); err != nil { + if _, _, _, err := cr.Next(); err != nil { t.Fatal(err) } // First EOF. - _, _, err = cr.Next() + _, _, _, err = cr.Next() if err != io.EOF { t.Fatalf("first post-exhaust Next: %v, want io.EOF", err) } // Second and third EOF — must be stable. for i := 0; i < 2; i++ { - _, _, err = cr.Next() + _, _, _, err = cr.Next() if err != io.EOF { t.Fatalf("Next #%d after EOF: %v, want io.EOF", i+2, err) } @@ -684,13 +687,13 @@ func TestCatfileReader_LargeBlobBytePrecision(t *testing.T) { repoDir, blobs := createTestRepo(t) ids := []plumbing.Hash{blobs["large.bin"]} - cr, err := newCatfileReader(repoDir, ids) + cr, err := newCatfileReader(repoDir, ids, catfileReaderOptions{}) if err != nil { t.Fatal(err) } defer cr.Close() - size, _, err := cr.Next() + size, _, _, err := cr.Next() if err != nil { t.Fatal(err) } @@ -732,13 +735,13 @@ func TestCatfileReader_LargeBlobChunkedRead(t *testing.T) { repoDir, blobs := createTestRepo(t) ids := []plumbing.Hash{blobs["large.bin"]} - cr, err := newCatfileReader(repoDir, ids) + cr, err := newCatfileReader(repoDir, ids, catfileReaderOptions{}) if err != nil { t.Fatal(err) } defer cr.Close() - size, _, _ := cr.Next() + size, _, _, _ := cr.Next() if size != 64*1024 { t.Fatalf("size = %d", size) } @@ -780,18 +783,18 @@ func TestCatfileReader_DuplicateSHAs(t *testing.T) { sha := blobs["hello.txt"] ids := []plumbing.Hash{sha, sha, sha} - cr, err := newCatfileReader(repoDir, ids) + cr, err := newCatfileReader(repoDir, ids, catfileReaderOptions{}) if err != nil { t.Fatal(err) } defer cr.Close() for i := 0; i < 3; i++ { - size, missing, err := cr.Next() + size, missing, excluded, err := cr.Next() if err != nil { t.Fatalf("Next #%d: %v", i, err) } - if missing { + if missing || excluded { t.Fatalf("#%d unexpectedly missing", i) } if size != 12 { @@ -806,7 +809,7 @@ func TestCatfileReader_DuplicateSHAs(t *testing.T) { } } - _, _, err = cr.Next() + _, _, _, err = cr.Next() if err != io.EOF { t.Fatalf("expected EOF, got %v", err) } diff --git a/gitindex/catfile_test.go b/gitindex/catfile_test.go index c871bd7d2..489b1e256 100644 --- a/gitindex/catfile_test.go +++ b/gitindex/catfile_test.go @@ -70,18 +70,18 @@ func TestCatfileReader(t *testing.T) { blobs["large.bin"], } - cr, err := newCatfileReader(repoDir, ids) + cr, err := newCatfileReader(repoDir, ids, catfileReaderOptions{}) if err != nil { t.Fatalf("newCatfileReader: %v", err) } defer cr.Close() // hello.txt - size, missing, err := cr.Next() + size, missing, excluded, err := cr.Next() if err != nil { t.Fatalf("Next hello.txt: %v", err) } - if missing { + if missing || excluded { t.Fatal("hello.txt unexpectedly missing") } if size != 12 { @@ -96,19 +96,25 @@ func TestCatfileReader(t *testing.T) { } // empty.txt - size, missing, err = cr.Next() + size, missing, excluded, err = cr.Next() if err != nil { t.Fatalf("Next empty.txt: %v", err) } + if missing || excluded { + t.Fatal("empty.txt unexpectedly missing") + } if size != 0 { t.Errorf("empty.txt size = %d, want 0", size) } // binary.bin — read content and verify binary data survives. - size, missing, err = cr.Next() + size, missing, excluded, err = cr.Next() if err != nil { t.Fatalf("Next binary.bin: %v", err) } + if missing || excluded { + t.Fatal("binary.bin unexpectedly missing") + } binContent := make([]byte, size) if _, err := io.ReadFull(cr, binContent); err != nil { t.Fatalf("ReadFull binary.bin: %v", err) @@ -118,10 +124,13 @@ func TestCatfileReader(t *testing.T) { } // large.bin - size, missing, err = cr.Next() + size, missing, excluded, err = cr.Next() if err != nil { t.Fatalf("Next large.bin: %v", err) } + if missing || excluded { + t.Fatal("large.bin unexpectedly missing") + } if size != 64*1024 { t.Errorf("large.bin size = %d, want %d", size, 64*1024) } @@ -131,7 +140,7 @@ func TestCatfileReader(t *testing.T) { } // EOF after all entries. - _, _, err = cr.Next() + _, _, _, err = cr.Next() if err != io.EOF { t.Errorf("expected io.EOF after last entry, got %v", err) } @@ -146,20 +155,20 @@ func TestCatfileReader_Skip(t *testing.T) { blobs["binary.bin"], } - cr, err := newCatfileReader(repoDir, ids) + cr, err := newCatfileReader(repoDir, ids, catfileReaderOptions{}) if err != nil { t.Fatalf("newCatfileReader: %v", err) } defer cr.Close() // Skip hello.txt by calling Next again without reading. - _, _, err = cr.Next() + _, _, _, err = cr.Next() if err != nil { t.Fatalf("Next hello.txt: %v", err) } // Skip large.bin too. - size, _, err := cr.Next() + size, _, _, err := cr.Next() if err != nil { t.Fatalf("Next large.bin: %v", err) } @@ -168,7 +177,7 @@ func TestCatfileReader_Skip(t *testing.T) { } // Read binary.bin after skipping two entries. - size, _, err = cr.Next() + size, _, _, err = cr.Next() if err != nil { t.Fatalf("Next binary.bin: %v", err) } @@ -191,16 +200,16 @@ func TestCatfileReader_Missing(t *testing.T) { blobs["empty.txt"], } - cr, err := newCatfileReader(repoDir, ids) + cr, err := newCatfileReader(repoDir, ids, catfileReaderOptions{}) if err != nil { t.Fatalf("newCatfileReader: %v", err) } defer cr.Close() // hello.txt — read normally. - size, missing, err := cr.Next() - if err != nil || missing { - t.Fatalf("Next hello.txt: err=%v missing=%v", err, missing) + size, missing, excluded, err := cr.Next() + if err != nil || missing || excluded { + t.Fatalf("Next hello.txt: err=%v missing=%v excluded=%v", err, missing, excluded) } content := make([]byte, size) if _, err := io.ReadFull(cr, content); err != nil { @@ -211,20 +220,64 @@ func TestCatfileReader_Missing(t *testing.T) { } // fakeHash — missing. - _, missing, err = cr.Next() + _, missing, excluded, err = cr.Next() if err != nil { t.Fatalf("Next fakeHash: %v", err) } + if excluded { + t.Error("expected fakeHash to be missing, not excluded") + } if !missing { t.Error("expected fakeHash to be missing") } // empty.txt — still works after missing entry. - size, missing, err = cr.Next() - if err != nil || missing { - t.Fatalf("Next empty.txt: err=%v missing=%v", err, missing) + size, missing, excluded, err = cr.Next() + if err != nil || missing || excluded { + t.Fatalf("Next empty.txt: err=%v missing=%v excluded=%v", err, missing, excluded) } if size != 0 { t.Errorf("empty.txt size = %d, want 0", size) } } + +func TestCatfileReader_Excluded(t *testing.T) { + repoDir, blobs := createTestRepo(t) + + ids := []plumbing.Hash{ + blobs["large.bin"], + blobs["hello.txt"], + } + + cr, err := newCatfileReader(repoDir, ids, catfileReaderOptions{filterSpec: "blob:limit=1k"}) + if err != nil { + t.Fatalf("newCatfileReader: %v", err) + } + defer cr.Close() + + _, missing, excluded, err := cr.Next() + if err != nil { + t.Fatalf("Next large.bin: %v", err) + } + if missing { + t.Fatal("large.bin unexpectedly missing") + } + if !excluded { + t.Fatal("large.bin unexpectedly included") + } + + size, missing, excluded, err := cr.Next() + if err != nil { + t.Fatalf("Next hello.txt: %v", err) + } + if missing || excluded { + t.Fatalf("hello.txt unexpectedly skipped: missing=%v excluded=%v", missing, excluded) + } + content := make([]byte, size) + if _, err := io.ReadFull(cr, content); err != nil { + t.Fatalf("ReadFull hello.txt: %v", err) + } + if string(content) != "hello world\n" { + t.Errorf("hello.txt = %q", content) + } +} diff --git a/gitindex/index.go b/gitindex/index.go index d2f5cb378..ad07e75fd 100644 --- a/gitindex/index.go +++ b/gitindex/index.go @@ -636,7 +636,10 @@ func indexGitRepo(opts Options, config gitIndexConfig) (bool, error) { // Stream main-repo blobs via pipelined cat-file --batch --buffer. // Large blobs are skipped without reading content into memory. if len(mainRepoIDs) > 0 { - cr, err := newCatfileReader(opts.RepoDir, mainRepoIDs) + crOpts := catfileReaderOptions{ + filterSpec: catfileFilterSpec(opts), + } + cr, err := newCatfileReader(opts.RepoDir, mainRepoIDs, crOpts) if err != nil { return false, fmt.Errorf("newCatfileReader: %w", err) } @@ -673,7 +676,7 @@ func indexCatfileBlobs(cr *catfileReader, keys []fileKey, repos map[fileKey]Blob defer cr.Close() for idx, key := range keys { - size, missing, err := cr.Next() + size, missing, excluded, err := cr.Next() if err != nil { return fmt.Errorf("cat-file next for %s: %w", key.FullPath(), err) } @@ -686,6 +689,8 @@ func indexCatfileBlobs(cr *catfileReader, keys []fileKey, repos map[fileKey]Blob // clone, or a race with git gc. Log a warning and skip. log.Printf("warning: blob %s missing for %s", key.ID, key.FullPath()) doc = skippedDoc(key, branches, index.SkipReasonMissing) + } else if excluded { + doc = skippedDoc(key, branches, index.SkipReasonTooLarge) } else { keyFullPath := key.FullPath() if size > opts.BuildOptions.SizeMax && !opts.BuildOptions.IgnoreSizeMax(keyFullPath) { @@ -776,6 +781,21 @@ type noopCloser struct{} func (noopCloser) Close() error { return nil } +func catfileFilterSpec(opts Options) string { + // Can't filter by size if we have large file exceptions + if len(opts.BuildOptions.LargeFiles) > 0 { + return "" + } + + if opts.BuildOptions.SizeMax <= 0 { + return "" + } + + // Git's blob:limit filter excludes blobs whose size is >= the given limit, + // while zoekt indexes files up to and including SizeMax bytes. + return fmt.Sprintf("blob:limit=%d", int64(opts.BuildOptions.SizeMax)+1) +} + func newIgnoreMatcher(tree *object.Tree) (*ignore.Matcher, error) { ignoreFile, err := tree.File(ignore.IgnoreFile) if err == object.ErrFileNotFound { diff --git a/gitindex/index_test.go b/gitindex/index_test.go index 7f76e0338..075bc1dc5 100644 --- a/gitindex/index_test.go +++ b/gitindex/index_test.go @@ -165,6 +165,36 @@ func TestIndexGitRepo_Worktree(t *testing.T) { } } +func TestCatfileFilterSpec(t *testing.T) { + for _, tc := range []struct { + name string + opts Options + want string + }{ + { + name: "size max", + opts: Options{BuildOptions: index.Options{SizeMax: 1 << 20}}, + want: "blob:limit=1048577", + }, + { + name: "large file exception disables filter", + opts: Options{BuildOptions: index.Options{SizeMax: 1 << 20, LargeFiles: []string{"*.bin"}}}, + want: "", + }, + { + name: "zero size max disables filter", + opts: Options{BuildOptions: index.Options{SizeMax: 0}}, + want: "", + }, + } { + t.Run(tc.name, func(t *testing.T) { + if got := catfileFilterSpec(tc.opts); got != tc.want { + t.Fatalf("catfileFilterSpec() = %q, want %q", got, tc.want) + } + }) + } +} + func executeCommand(t *testing.T, dir string, cmd *exec.Cmd) *exec.Cmd { cmd.Dir = dir cmd.Env = []string{