From 1b4dbe5826571c92bccd2670b5cff740b8588f8d Mon Sep 17 00:00:00 2001 From: David Gageot Date: Fri, 5 Jun 2026 11:28:24 +0200 Subject: [PATCH] fix: skip binary files in content search --- pkg/tools/builtin/filesystem/filesystem.go | 54 +++++++++++++++++++ .../builtin/filesystem/filesystem_test.go | 46 ++++++++++++++++ pkg/tools/builtin/filesystem/limit.go | 4 ++ 3 files changed, 104 insertions(+) diff --git a/pkg/tools/builtin/filesystem/filesystem.go b/pkg/tools/builtin/filesystem/filesystem.go index 8b223321a..820777124 100644 --- a/pkg/tools/builtin/filesystem/filesystem.go +++ b/pkg/tools/builtin/filesystem/filesystem.go @@ -1,6 +1,7 @@ package filesystem import ( + "bytes" "context" "encoding/base64" "encoding/json" @@ -9,6 +10,7 @@ import ( "io" "io/fs" "log/slog" + "net/http" "os" "path/filepath" "regexp" @@ -1181,6 +1183,11 @@ func (t *ToolSet) handleSearchFilesContent(_ context.Context, args SearchFilesCo return nil } + binary, err := t.isBinaryFile(path) + if err != nil || binary { + return nil + } + content, err := t.readFile(path) if err != nil { return nil @@ -1265,6 +1272,53 @@ func (t *ToolSet) handleSearchFilesContent(_ context.Context, args SearchFilesCo }, nil } +func (t *ToolSet) readFileHeader(resolved string, size int) ([]byte, error) { + if size <= 0 { + return nil, nil + } + + root, rel, err := t.rootedAccess(resolved) + if err != nil { + return nil, err + } + + var file *os.File + if root != nil { + file, err = root.Open(rel) + } else { + file, err = os.Open(resolved) + } + if err != nil { + return nil, err + } + defer file.Close() + + buf := make([]byte, size) + n, err := io.ReadFull(file, buf) + if err != nil && !errors.Is(err, io.EOF) && !errors.Is(err, io.ErrUnexpectedEOF) { + return nil, err + } + return buf[:n], nil +} + +func (t *ToolSet) isBinaryFile(resolved string) (bool, error) { + header, err := t.readFileHeader(resolved, maxBinarySniffBytes) + if err != nil { + return false, err + } + return isBinaryContent(header), nil +} + +func isBinaryContent(header []byte) bool { + if len(header) == 0 { + return false + } + if bytes.IndexByte(header, 0) >= 0 { + return true + } + return !strings.HasPrefix(http.DetectContentType(header), "text/") +} + func (t *ToolSet) handleWriteFile(ctx context.Context, args WriteFileArgs) (*tools.ToolCallResult, error) { resolvedPath, err := t.resolveAndCheckPath(args.Path) if err != nil { diff --git a/pkg/tools/builtin/filesystem/filesystem_test.go b/pkg/tools/builtin/filesystem/filesystem_test.go index 3d433e394..e6ab524b8 100644 --- a/pkg/tools/builtin/filesystem/filesystem_test.go +++ b/pkg/tools/builtin/filesystem/filesystem_test.go @@ -567,6 +567,52 @@ func TestFilesystemTool_SearchFilesContent_SkipsSymlinkToLargeFile(t *testing.T) assert.NotContains(t, result.Output, "target.bin") } +// TestFilesystemTool_SearchFilesContent_SkipsBinaryFiles verifies that +// small binary files are skipped even when they contain the query bytes. +func TestFilesystemTool_SearchFilesContent_SkipsBinaryFiles(t *testing.T) { + t.Parallel() + tmpDir := t.TempDir() + tool := New(tmpDir) + + require.NoError(t, os.WriteFile(filepath.Join(tmpDir, "text.txt"), []byte("findme\n"), 0o644)) + require.NoError(t, os.WriteFile(filepath.Join(tmpDir, "program.bin"), []byte{0x7f, 'E', 'L', 'F', 0, 'f', 'i', 'n', 'd', 'm', 'e', '\n'}, 0o644)) + require.NoError(t, os.WriteFile(filepath.Join(tmpDir, "doc.pdf"), []byte("%PDF-1.7\nfindme\n"), 0o644)) + + result, err := tool.handleSearchFilesContent(t.Context(), SearchFilesContentArgs{ + Path: ".", + Query: "findme", + }) + require.NoError(t, err) + assert.Contains(t, result.Output, "text.txt") + assert.NotContains(t, result.Output, "program.bin") + assert.NotContains(t, result.Output, "doc.pdf") + assert.Equal(t, SearchFilesContentMeta{MatchCount: 1, FileCount: 1}, result.Meta) +} + +func TestIsBinaryContent(t *testing.T) { + t.Parallel() + + tests := []struct { + name string + header []byte + want bool + }{ + {name: "empty", header: nil, want: false}, + {name: "plain text", header: []byte("package main\nfunc main() {}\n"), want: false}, + {name: "utf8 text", header: []byte("Résumé\n"), want: false}, + {name: "elf", header: []byte{0x7f, 'E', 'L', 'F', 0, 1, 1, 0}, want: true}, + {name: "png", header: []byte{0x89, 'P', 'N', 'G', '\r', '\n', 0x1a, '\n'}, want: true}, + {name: "pdf", header: []byte("%PDF-1.7\nfindme\n"), want: true}, + } + + for _, tc := range tests { + t.Run(tc.name, func(t *testing.T) { + t.Parallel() + assert.Equal(t, tc.want, isBinaryContent(tc.header)) + }) + } +} + func TestFilesystemTool_PostEditCommands(t *testing.T) { t.Parallel() tmpDir := t.TempDir() diff --git a/pkg/tools/builtin/filesystem/limit.go b/pkg/tools/builtin/filesystem/limit.go index eec4b1ff4..393e2bf0e 100644 --- a/pkg/tools/builtin/filesystem/limit.go +++ b/pkg/tools/builtin/filesystem/limit.go @@ -9,6 +9,10 @@ const maxFiles = 100 // Skipping them keeps a recursive search over a large tree bounded. const maxSearchFileSize = 10 << 20 // 10 MiB +// maxBinarySniffBytes is how much of a file header search_files_content +// reads to decide whether the file is binary before loading the rest. +const maxBinarySniffBytes = 512 + // maxSearchOutputBytes caps the total size of the joined match output. // Without it a broad search (e.g. across the home directory) produces a // multi-megabyte result that is then copied into the in-memory message