Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions fuse.go
Original file line number Diff line number Diff line change
Expand Up @@ -529,8 +529,8 @@ func (n *cacheDownloadNode) lookupUnderModule(ctx context.Context, name string,

dotExt := filepath.Ext(name)
switch dotExt {
case ".info", ".mod", ".ziphash":
ext := dotExt[1:] // "info", "mod", "ziphash"
case ".info", ".mod", ".ziphash", ".zip":
ext := dotExt[1:] // "info", "mod", "ziphash", "zip"

sp := n.fs.Stats.StartSpan("get-metafile-" + ext)
defer func() {
Expand Down
1 change: 1 addition & 0 deletions go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,7 @@ require (
github.com/prometheus/common v0.65.0 // indirect
github.com/prometheus/procfs v0.16.1 // indirect
github.com/sergi/go-diff v1.4.0 // indirect
github.com/tomhjp/synthzip v0.1.0 // indirect
golang.org/x/crypto v0.40.0 // indirect
golang.org/x/exp v0.0.0-20250210185358-939b2ce775ac // indirect
google.golang.org/protobuf v1.36.6 // indirect
Expand Down
2 changes: 2 additions & 0 deletions go.sum
Original file line number Diff line number Diff line change
Expand Up @@ -90,6 +90,8 @@ github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+
github.com/stretchr/testify v1.4.0/go.mod h1:j7eGeouHqKxXV5pUuKE4zz7dFj8WfuZ+81PSLYec5m4=
github.com/stretchr/testify v1.10.0 h1:Xv5erBjTwe/5IxqUQTdXv5kgmIvbHo3QQyRwhJsOfJA=
github.com/stretchr/testify v1.10.0/go.mod h1:r2ic/lqez/lEtzL7wO/rwa5dbSLXVDPFyf8C91i36aY=
github.com/tomhjp/synthzip v0.1.0 h1:amKn0GMYLPtJdMgARvjY7nNkjHOc298pY0WdgNdOHwg=
github.com/tomhjp/synthzip v0.1.0/go.mod h1:xzyQ03xr90LexJYX91KIXNa193WoOFiBMwTMgTEMNj0=
github.com/willscott/go-nfs v0.0.3 h1:Z5fHVxMsppgEucdkKBN26Vou19MtEM875NmRwj156RE=
github.com/willscott/go-nfs v0.0.3/go.mod h1:VhNccO67Oug787VNXcyx9JDI3ZoSpqoKMT/lWMhUIDg=
github.com/willscott/go-nfs-client v0.0.0-20251022144359-801f10d98886 h1:DtrBtkgTJk2XGt4T7eKdKVkd9A5NCevN2e4inLXtsqA=
Expand Down
65 changes: 64 additions & 1 deletion gomodfs.go
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@ import (
"github.com/tailscale/gomodfs/internal/lru"
"github.com/tailscale/gomodfs/stats"
"github.com/tailscale/gomodfs/store"
"github.com/tomhjp/synthzip"
"golang.org/x/mod/module"
"golang.org/x/mod/sumdb/dirhash"
"golang.org/x/sync/singleflight"
Expand Down Expand Up @@ -272,6 +273,7 @@ type putFile struct {

func (pf putFile) Path() string { return pf.path }
func (pf putFile) Size() int64 { return int64(pf.zf.UncompressedSize64) }
func (pf putFile) CRC32() uint32 { return pf.zf.CRC32 }
func (pf putFile) Open() (io.ReadCloser, error) { return pf.zf.Open() }
func (pf putFile) Mode() os.FileMode { return pf.zf.Mode() }

Expand Down Expand Up @@ -396,7 +398,7 @@ func (fs *FS) getZipRoot(ctx context.Context, mv store.ModuleVersion) (mh store.
return rooti.(store.ModHandle), nil
}

// ext is one of "mod", "ziphash", "info".
// ext is one of "mod", "ziphash", "info", "zip".
func (fs *FS) getMetaFileByExt(ctx context.Context, mv store.ModuleVersion, ext string) ([]byte, error) {
switch ext {
case "mod":
Expand All @@ -405,6 +407,8 @@ func (fs *FS) getMetaFileByExt(ctx context.Context, mv store.ModuleVersion, ext
return fs.getZiphash(ctx, mv)
case "info":
return fs.getInfoFile(ctx, mv)
case "zip":
return fs.getZipFileData(ctx, mv)
}
return nil, fmt.Errorf("unknown meta file extension %q", ext)
}
Expand Down Expand Up @@ -444,6 +448,64 @@ func (fs *FS) getZiphash(ctx context.Context, mv store.ModuleVersion) (data []by
return fs.Store.GetZipHash(ctx, zr)
}

// getZipArchive returns a synthetic zip archive for the given module version.
// The returned archive is cheap to construct (metadata only) and supports ReadAt
// for lazy content loading.
func (fs *FS) getZipArchive(ctx context.Context, mv store.ModuleVersion) (*synthzip.Archive, error) {
mh, err := fs.getZipRoot(ctx, mv)
if err != nil {
return nil, err
}
entries, err := fs.Store.GetZipFileEntries(ctx, mh)
if err != nil {
return nil, fmt.Errorf("GetZipFileEntries for %v: %w", mv, err)
}

// Module zip files have paths like "module@version/path".
prefix := mv.Module + "@" + mv.Version + "/"

files := make([]synthzip.File, len(entries))
for i, e := range entries {
files[i] = synthzip.File{
Name: prefix + e.Path,
Size: e.Size,
CRC32: e.CRC32,
}
}

return synthzip.New(files, func(name string) (io.ReadCloser, error) {
// Strip the module@version/ prefix to get the store path.
p := strings.TrimPrefix(name, prefix)
data, err := fs.Store.GetFile(ctx, mh, p)
if err != nil {
return nil, err
}
return io.NopCloser(bytes.NewReader(data)), nil
})
}

// getZipFileData returns the full zip file bytes for the given module version.

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Have you seen any pathologically large module zips? I had kind of assumed we'd want to make some interface changes to support streaming rather than reading the whole thing into memory, but I don't have a good feel for the worst-case we see in practice.

func (fs *FS) getZipFileData(ctx context.Context, mv store.ModuleVersion) ([]byte, error) {
archive, err := fs.getZipArchive(ctx, mv)
if err != nil {
return nil, err
}
data := make([]byte, archive.Size())
if _, err := archive.ReadAt(data, 0); err != nil && err != io.EOF {
Comment on lines +493 to +494

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

or io.ReadAll if you prefer

return nil, fmt.Errorf("reading zip archive for %v: %w", mv, err)
}
return data, nil
}

// getZipFileSize returns the size of the synthetic zip file without reading content.
func (fs *FS) getZipFileSize(ctx context.Context, mv store.ModuleVersion) (int64, error) {
archive, err := fs.getZipArchive(ctx, mv)
if err != nil {
return 0, err
}
return archive.Size(), nil
}

func (fs *FS) walkStoreModulePaths(ctx context.Context, mh store.ModHandle) iter.Seq[result.Of[string]] {
return func(yield func(result.Of[string]) bool) {
var doDir func(string) bool // recursive dir walker, called with "" (root) or "path/to/file-or-dir"
Expand Down Expand Up @@ -489,6 +551,7 @@ var (
cdFileInfo = mkWellKnownPathHash("info")
cdFileMod = mkWellKnownPathHash("mod")
cdFileZiphash = mkWellKnownPathHash("ziphash")
cdFileZip = mkWellKnownPathHash("zip")
pathHashTSGo = mkWellKnownPathHash(wkTSGoExtracted)
)

Expand Down
11 changes: 11 additions & 0 deletions nfs.go
Original file line number Diff line number Diff line change
Expand Up @@ -233,6 +233,15 @@ func (b billyFS) Lstat(filename string) (os.FileInfo, error) {
ctx := context.TODO()

if ext := mp.CacheDownloadFileExt; ext != "" {
if ext == "zip" {
// Avoid materializing the entire zip just for stat.
size, err := b.fs.getZipFileSize(ctx, mp.ModVersion)
if err != nil {
b.fs.logf("Failed to get zip size for %v: %v", mp.ModVersion, err)
return nil, syscall.EIO
}
return regFileInfo{name: filepath.Base(filename), size: size}, nil
}
v, err := b.fs.getMetaFileByExt(ctx, mp.ModVersion, ext)
if err != nil {
b.fs.logf("Failed to get %s file for %v: %v", ext, mp.ModVersion, err)
Expand Down Expand Up @@ -437,6 +446,8 @@ func (h *NFSHandler) fromHandle(handle handle) (ret handleTarget, err error) {
return mkTargetFromPath(cdPath(mv, "mod")), nil
case cdFileZiphash:
return mkTargetFromPath(cdPath(mv, "ziphash")), nil
case cdFileZip:
return mkTargetFromPath(cdPath(mv, "zip")), nil
case pathHashTSGo:
if trip, ok := isTSGoModule(mv); ok {
return mkTargetFromPath(tsGoZipRoot(trip) + ".extracted"), nil
Expand Down
74 changes: 74 additions & 0 deletions nfs_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ package gomodfs

import (
"archive/zip"
"bytes"
"context"
"fmt"
"io"
Expand Down Expand Up @@ -252,6 +253,7 @@ func TestNFSHandles(t *testing.T) {
"cache/download/go4.org/mem/@v/v0.0.0-20240501181205-ae6ca9944745.info",
"cache/download/go4.org/mem/@v/v0.0.0-20240501181205-ae6ca9944745.mod",
"cache/download/go4.org/mem/@v/v0.0.0-20240501181205-ae6ca9944745.ziphash",
"cache/download/go4.org/mem/@v/v0.0.0-20240501181205-ae6ca9944745.zip",
"tsgo-linux-amd64/1cd3bf1a6eaf559aa8c00e749289559c884cef09.extracted",
"tsgo-linux-amd64/1cd3bf1a6eaf559aa8c00e749289559c884cef09/fake.bash",
"tsgo-linux-amd64/1cd3bf1a6eaf559aa8c00e749289559c884cef09/bin/gofmt",
Expand Down Expand Up @@ -302,6 +304,78 @@ func TestNFSHandles(t *testing.T) {
}
}

func TestZipFile(t *testing.T) {

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Nice test!

gitCacheDir := testGitDir(t)
h := testNFSHandler(t, gitCacheDir)

// Trigger download of the go4.org/mem module.
wantRegSize(t, h, "go4.org/mem@v0.0.0-20240501181205-ae6ca9944745/LICENSE", 11358)

// Read the synthetic .zip file.
zipPath := "cache/download/go4.org/mem/@v/v0.0.0-20240501181205-ae6ca9944745.zip"
ctx := context.Background()

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

nit: t.Context()?

data, _, err := h.getFileContentsUncached(ctx, zipPath)
if err != nil {
t.Fatalf("getFileContentsUncached(%q): %v", zipPath, err)
}
if len(data) == 0 {
t.Fatal("zip file is empty")
}

// Verify it's a valid zip archive.
zr, err := zip.NewReader(bytes.NewReader(data), int64(len(data)))
if err != nil {
t.Fatalf("zip.NewReader: %v", err)
}

// Verify files are present and match individually-cached content.
const mod = "go4.org/mem"
const ver = "v0.0.0-20240501181205-ae6ca9944745"
prefix := mod + "@" + ver + "/"

for _, zf := range zr.File {
if !strings.HasPrefix(zf.Name, prefix) {
t.Errorf("zip entry %q doesn't start with expected prefix %q", zf.Name, prefix)
continue
}
relPath := strings.TrimPrefix(zf.Name, prefix)

// Read from zip.
rc, err := zf.Open()
if err != nil {
t.Errorf("open zip entry %q: %v", zf.Name, err)
continue
}
zipContent, err := io.ReadAll(rc)
rc.Close()
if err != nil {
t.Errorf("read zip entry %q: %v", zf.Name, err)
continue
}

// Read from store.
storePath := mod + "@" + ver + "/" + relPath
storeData, _, err := h.getFileContentsUncached(ctx, storePath)
if err != nil {
t.Errorf("getFileContentsUncached(%q): %v", storePath, err)
continue
}

if !bytes.Equal(zipContent, storeData) {
t.Errorf("content mismatch for %q: zip has %d bytes, store has %d bytes", relPath, len(zipContent), len(storeData))
}
}

// Verify stat returns the correct size.
fi, err := h.billyFS().Lstat(zipPath)
if err != nil {
t.Fatalf("Lstat(%q): %v", zipPath, err)
}
if fi.Size() != int64(len(data)) {
t.Errorf("Lstat size = %d; want %d", fi.Size(), len(data))
}
}

var (
handleOnce sync.Once
handlePath string
Expand Down
2 changes: 1 addition & 1 deletion path.go
Original file line number Diff line number Diff line change
Expand Up @@ -108,7 +108,7 @@ func parsePath(name string) (ret gmPath) {
return
}
switch ext {
case ".mod", ".ziphash", ".info":
case ".mod", ".ziphash", ".info", ".zip":
ret.CacheDownloadFileExt = ext[1:]
default:
// Not a recognized cache/download file.
Expand Down
7 changes: 7 additions & 0 deletions path_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,13 @@ var pathTests = []struct {
ModVersion: store.ModuleVersion{Module: "go4.org/mem", Version: "v0.0.0-20240501181205-ae6ca9944745"},
},
},
{
path: "cache/download/go4.org/mem/@v/v0.0.0-20240501181205-ae6ca9944745.zip",
want: gmPath{
CacheDownloadFileExt: "zip",
ModVersion: store.ModuleVersion{Module: "go4.org/mem", Version: "v0.0.0-20240501181205-ae6ca9944745"},
},
},
{
path: "cache/download/go4.org/mem/@v/v0.0.0-20240501181205-ae6ca9944745.bogusext",
want: gmPath{NotExist: true},
Expand Down
68 changes: 65 additions & 3 deletions store/gitstore/gitstore.go
Original file line number Diff line number Diff line change
Expand Up @@ -677,9 +677,10 @@ type modHandle struct {

// blobMeta is the metadata for a blob in a tree.
type blobMeta struct {
Blob objRef
Size int64
Mode os.FileMode
Blob objRef
Size int64
Mode os.FileMode
CRC32 uint32 // CRC-32 checksum; 0 if not known (old cache entries)

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If you care to, I think we could avoid the requirement to track CRC-32s with some library updates. It turns out most zip readers (including go's archive/zip) never cross-check the local file header CRC-32 against the CRC-32 stored in the central directory, so we should be able to zero out the CRC-32 in the central directory and only need to lazily calculate it in a trailer per-file as readers iterate through real file contents.

Unfortunately readers need to guess at the range they'll find the central directory marker within, so e.g. archive/zip reads the last 1KiB upfront (if that fails its second guess is the last 65KiB), which will inevitably include some real file content regions, but that's a small enough amount of duplicate file reading that it seems like a good trade-off for a simpler interface.

That said, given the work we already do on the way into the git store, it seems very reasonable to avoid all that complexity and just track the CRC-32s. I'm probably leaning slightly towards tracking them in gomodfs as you've got it already, WDYT?

}

func (s *Storage) GetFile(ctx context.Context, h store.ModHandle, path string) ([]byte, error) {
Expand Down Expand Up @@ -955,9 +956,50 @@ func (s *Storage) newModHandle(mv store.ModuleVersion, modTree objRef) (store.Mo
}
mh.dirEnts[dir] = append(mh.dirEnts[dir], ent)
}

// Try to load CRC32 data from the zipcrc blob.
if crcObj, err := s.getObject(context.TODO(), modTree.String()+":zipcrc", "blob"); err == nil {

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why not context.Background()?

for line := range bytes.Lines(crcObj.content) {
line = bytes.TrimRight(line, "\n")
if len(line) == 0 {
continue
}
hexCRC, filePath, ok := bytes.Cut(line, []byte("\t"))
if !ok || len(hexCRC) != 8 {
continue
}
crc, err := strconv.ParseUint(string(hexCRC), 16, 32)
if err != nil {
continue
}
p := string(filePath)
if bm, ok := mh.blobMeta[p]; ok {
bm.CRC32 = uint32(crc)
mh.blobMeta[p] = bm
}
}
}
// If zipcrc doesn't exist (old cache entries), CRC32 fields stay 0.

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Maybe expand this comment to explain the implications downstream? IIUC we'll then error when the zip gets constructed and end up reporting the zip doesn't exist, so essentially the existing behaviour we have today?


return mh, nil
}

func (s *Storage) GetZipFileEntries(_ context.Context, h store.ModHandle) ([]store.ZipFileEntry, error) {
mh := h.(*modHandle)
entries := make([]store.ZipFileEntry, 0, len(mh.blobMeta))
for p, bm := range mh.blobMeta {
entries = append(entries, store.ZipFileEntry{
Path: p,
Size: bm.Size,
CRC32: bm.CRC32,
})
}
slices.SortFunc(entries, func(a, b store.ZipFileEntry) int {
return cmp.Compare(a.Path, b.Path)
})
return entries, nil
}

func (s *Storage) GetZipHash(ctx context.Context, h store.ModHandle) ([]byte, error) {
tree := h.(*modHandle).modTree

Expand Down Expand Up @@ -995,6 +1037,26 @@ func (s *Storage) PutModule(ctx context.Context, mv store.ModuleVersion, data st
return nil, fmt.Errorf("failed to add %s file: %w", ext, err)
}
}
// Build the zipcrc blob: one line per file, sorted by path.
// Format: "<8-hex-crc32>\t<path>\n"
var hasCRC bool
for _, f := range data.Files {
if f.CRC32() != 0 {

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

As this is accessing data from the original zip file, the only way this could be zero is if the CRC-32 appears in the data descriptor (trailing fields after the file content) instead of the file header. If we get zip files of that form, we'll fail to extract the CRC-32. Maybe we don't care because it's always the same tool creating the zips so we probably won't see zips of that form? But if we want to catch that, we'd have to write our own data descriptor parsing logic because unfortunately the standard library doesn't expose it AFAICT.

However, we are also about to read the entire file contents in tb.addFile down below, so could calculate it on the fly very cheaply and skip any complex zip parsing.

Anyway, all that to say, there are a few alternatives to consider here, and quite a lot of subtlety behind this check. At the least, probably worth documenting the choices in more detail with a comment.

hasCRC = true
break
}
}
if hasCRC {
var crcBuf bytes.Buffer
for _, f := range data.Files {
fmt.Fprintf(&crcBuf, "%08x\t%s\n", f.CRC32(), f.Path())
}
crcData := crcBuf.Bytes()
if err := tb.addFile("zipcrc", static(crcData), 0644); err != nil {

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Would we get better git store deduplication across similar module versions if we had a zipcrc blob per path? Was the decision here intentional to reduce the number of blobs? I can see it would be a lot of extra blobs to do one CRC-32 per blob, but not sure what kind of impact that would have on performance.

return nil, fmt.Errorf("failed to add zipcrc file: %w", err)
}
}

for _, f := range data.Files {
if err := tb.addFile("zip/"+f.Path(), f.Open, f.Mode()); err != nil {
return nil, fmt.Errorf("failed to add zip file %q: %w", f.Path(), err)
Expand Down
Loading
Loading