-
Notifications
You must be signed in to change notification settings - Fork 5
gomodfs: implement synthetic .zip file serving via synthzip #26
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -34,6 +34,7 @@ import ( | |
| "github.com/tailscale/gomodfs/internal/lru" | ||
| "github.com/tailscale/gomodfs/stats" | ||
| "github.com/tailscale/gomodfs/store" | ||
| "github.com/tomhjp/synthzip" | ||
| "golang.org/x/mod/module" | ||
| "golang.org/x/mod/sumdb/dirhash" | ||
| "golang.org/x/sync/singleflight" | ||
|
|
@@ -272,6 +273,7 @@ type putFile struct { | |
|
|
||
| func (pf putFile) Path() string { return pf.path } | ||
| func (pf putFile) Size() int64 { return int64(pf.zf.UncompressedSize64) } | ||
| func (pf putFile) CRC32() uint32 { return pf.zf.CRC32 } | ||
| func (pf putFile) Open() (io.ReadCloser, error) { return pf.zf.Open() } | ||
| func (pf putFile) Mode() os.FileMode { return pf.zf.Mode() } | ||
|
|
||
|
|
@@ -396,7 +398,7 @@ func (fs *FS) getZipRoot(ctx context.Context, mv store.ModuleVersion) (mh store. | |
| return rooti.(store.ModHandle), nil | ||
| } | ||
|
|
||
| // ext is one of "mod", "ziphash", "info". | ||
| // ext is one of "mod", "ziphash", "info", "zip". | ||
| func (fs *FS) getMetaFileByExt(ctx context.Context, mv store.ModuleVersion, ext string) ([]byte, error) { | ||
| switch ext { | ||
| case "mod": | ||
|
|
@@ -405,6 +407,8 @@ func (fs *FS) getMetaFileByExt(ctx context.Context, mv store.ModuleVersion, ext | |
| return fs.getZiphash(ctx, mv) | ||
| case "info": | ||
| return fs.getInfoFile(ctx, mv) | ||
| case "zip": | ||
| return fs.getZipFileData(ctx, mv) | ||
| } | ||
| return nil, fmt.Errorf("unknown meta file extension %q", ext) | ||
| } | ||
|
|
@@ -444,6 +448,64 @@ func (fs *FS) getZiphash(ctx context.Context, mv store.ModuleVersion) (data []by | |
| return fs.Store.GetZipHash(ctx, zr) | ||
| } | ||
|
|
||
| // getZipArchive returns a synthetic zip archive for the given module version. | ||
| // The returned archive is cheap to construct (metadata only) and supports ReadAt | ||
| // for lazy content loading. | ||
| func (fs *FS) getZipArchive(ctx context.Context, mv store.ModuleVersion) (*synthzip.Archive, error) { | ||
| mh, err := fs.getZipRoot(ctx, mv) | ||
| if err != nil { | ||
| return nil, err | ||
| } | ||
| entries, err := fs.Store.GetZipFileEntries(ctx, mh) | ||
| if err != nil { | ||
| return nil, fmt.Errorf("GetZipFileEntries for %v: %w", mv, err) | ||
| } | ||
|
|
||
| // Module zip files have paths like "module@version/path". | ||
| prefix := mv.Module + "@" + mv.Version + "/" | ||
|
|
||
| files := make([]synthzip.File, len(entries)) | ||
| for i, e := range entries { | ||
| files[i] = synthzip.File{ | ||
| Name: prefix + e.Path, | ||
| Size: e.Size, | ||
| CRC32: e.CRC32, | ||
| } | ||
| } | ||
|
|
||
| return synthzip.New(files, func(name string) (io.ReadCloser, error) { | ||
| // Strip the module@version/ prefix to get the store path. | ||
| p := strings.TrimPrefix(name, prefix) | ||
| data, err := fs.Store.GetFile(ctx, mh, p) | ||
| if err != nil { | ||
| return nil, err | ||
| } | ||
| return io.NopCloser(bytes.NewReader(data)), nil | ||
| }) | ||
| } | ||
|
|
||
| // getZipFileData returns the full zip file bytes for the given module version. | ||
| func (fs *FS) getZipFileData(ctx context.Context, mv store.ModuleVersion) ([]byte, error) { | ||
| archive, err := fs.getZipArchive(ctx, mv) | ||
| if err != nil { | ||
| return nil, err | ||
| } | ||
| data := make([]byte, archive.Size()) | ||
| if _, err := archive.ReadAt(data, 0); err != nil && err != io.EOF { | ||
|
Comment on lines
+493
to
+494
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. or |
||
| return nil, fmt.Errorf("reading zip archive for %v: %w", mv, err) | ||
| } | ||
| return data, nil | ||
| } | ||
|
|
||
| // getZipFileSize returns the size of the synthetic zip file without reading content. | ||
| func (fs *FS) getZipFileSize(ctx context.Context, mv store.ModuleVersion) (int64, error) { | ||
| archive, err := fs.getZipArchive(ctx, mv) | ||
| if err != nil { | ||
| return 0, err | ||
| } | ||
| return archive.Size(), nil | ||
| } | ||
|
|
||
| func (fs *FS) walkStoreModulePaths(ctx context.Context, mh store.ModHandle) iter.Seq[result.Of[string]] { | ||
| return func(yield func(result.Of[string]) bool) { | ||
| var doDir func(string) bool // recursive dir walker, called with "" (root) or "path/to/file-or-dir" | ||
|
|
@@ -489,6 +551,7 @@ var ( | |
| cdFileInfo = mkWellKnownPathHash("info") | ||
| cdFileMod = mkWellKnownPathHash("mod") | ||
| cdFileZiphash = mkWellKnownPathHash("ziphash") | ||
| cdFileZip = mkWellKnownPathHash("zip") | ||
| pathHashTSGo = mkWellKnownPathHash(wkTSGoExtracted) | ||
| ) | ||
|
|
||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -5,6 +5,7 @@ package gomodfs | |
|
|
||
| import ( | ||
| "archive/zip" | ||
| "bytes" | ||
| "context" | ||
| "fmt" | ||
| "io" | ||
|
|
@@ -252,6 +253,7 @@ func TestNFSHandles(t *testing.T) { | |
| "cache/download/go4.org/mem/@v/v0.0.0-20240501181205-ae6ca9944745.info", | ||
| "cache/download/go4.org/mem/@v/v0.0.0-20240501181205-ae6ca9944745.mod", | ||
| "cache/download/go4.org/mem/@v/v0.0.0-20240501181205-ae6ca9944745.ziphash", | ||
| "cache/download/go4.org/mem/@v/v0.0.0-20240501181205-ae6ca9944745.zip", | ||
| "tsgo-linux-amd64/1cd3bf1a6eaf559aa8c00e749289559c884cef09.extracted", | ||
| "tsgo-linux-amd64/1cd3bf1a6eaf559aa8c00e749289559c884cef09/fake.bash", | ||
| "tsgo-linux-amd64/1cd3bf1a6eaf559aa8c00e749289559c884cef09/bin/gofmt", | ||
|
|
@@ -302,6 +304,78 @@ func TestNFSHandles(t *testing.T) { | |
| } | ||
| } | ||
|
|
||
| func TestZipFile(t *testing.T) { | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Nice test! |
||
| gitCacheDir := testGitDir(t) | ||
| h := testNFSHandler(t, gitCacheDir) | ||
|
|
||
| // Trigger download of the go4.org/mem module. | ||
| wantRegSize(t, h, "go4.org/mem@v0.0.0-20240501181205-ae6ca9944745/LICENSE", 11358) | ||
|
|
||
| // Read the synthetic .zip file. | ||
| zipPath := "cache/download/go4.org/mem/@v/v0.0.0-20240501181205-ae6ca9944745.zip" | ||
| ctx := context.Background() | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. nit: t.Context()? |
||
| data, _, err := h.getFileContentsUncached(ctx, zipPath) | ||
| if err != nil { | ||
| t.Fatalf("getFileContentsUncached(%q): %v", zipPath, err) | ||
| } | ||
| if len(data) == 0 { | ||
| t.Fatal("zip file is empty") | ||
| } | ||
|
|
||
| // Verify it's a valid zip archive. | ||
| zr, err := zip.NewReader(bytes.NewReader(data), int64(len(data))) | ||
| if err != nil { | ||
| t.Fatalf("zip.NewReader: %v", err) | ||
| } | ||
|
|
||
| // Verify files are present and match individually-cached content. | ||
| const mod = "go4.org/mem" | ||
| const ver = "v0.0.0-20240501181205-ae6ca9944745" | ||
| prefix := mod + "@" + ver + "/" | ||
|
|
||
| for _, zf := range zr.File { | ||
| if !strings.HasPrefix(zf.Name, prefix) { | ||
| t.Errorf("zip entry %q doesn't start with expected prefix %q", zf.Name, prefix) | ||
| continue | ||
| } | ||
| relPath := strings.TrimPrefix(zf.Name, prefix) | ||
|
|
||
| // Read from zip. | ||
| rc, err := zf.Open() | ||
| if err != nil { | ||
| t.Errorf("open zip entry %q: %v", zf.Name, err) | ||
| continue | ||
| } | ||
| zipContent, err := io.ReadAll(rc) | ||
| rc.Close() | ||
| if err != nil { | ||
| t.Errorf("read zip entry %q: %v", zf.Name, err) | ||
| continue | ||
| } | ||
|
|
||
| // Read from store. | ||
| storePath := mod + "@" + ver + "/" + relPath | ||
| storeData, _, err := h.getFileContentsUncached(ctx, storePath) | ||
| if err != nil { | ||
| t.Errorf("getFileContentsUncached(%q): %v", storePath, err) | ||
| continue | ||
| } | ||
|
|
||
| if !bytes.Equal(zipContent, storeData) { | ||
| t.Errorf("content mismatch for %q: zip has %d bytes, store has %d bytes", relPath, len(zipContent), len(storeData)) | ||
| } | ||
| } | ||
|
|
||
| // Verify stat returns the correct size. | ||
| fi, err := h.billyFS().Lstat(zipPath) | ||
| if err != nil { | ||
| t.Fatalf("Lstat(%q): %v", zipPath, err) | ||
| } | ||
| if fi.Size() != int64(len(data)) { | ||
| t.Errorf("Lstat size = %d; want %d", fi.Size(), len(data)) | ||
| } | ||
| } | ||
|
|
||
| var ( | ||
| handleOnce sync.Once | ||
| handlePath string | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -677,9 +677,10 @@ type modHandle struct { | |
|
|
||
| // blobMeta is the metadata for a blob in a tree. | ||
| type blobMeta struct { | ||
| Blob objRef | ||
| Size int64 | ||
| Mode os.FileMode | ||
| Blob objRef | ||
| Size int64 | ||
| Mode os.FileMode | ||
| CRC32 uint32 // CRC-32 checksum; 0 if not known (old cache entries) | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. If you care to, I think we could avoid the requirement to track CRC-32s with some library updates. It turns out most zip readers (including go's archive/zip) never cross-check the local file header CRC-32 against the CRC-32 stored in the central directory, so we should be able to zero out the CRC-32 in the central directory and only need to lazily calculate it in a trailer per-file as readers iterate through real file contents. Unfortunately readers need to guess at the range they'll find the central directory marker within, so e.g. archive/zip reads the last 1KiB upfront (if that fails its second guess is the last 65KiB), which will inevitably include some real file content regions, but that's a small enough amount of duplicate file reading that it seems like a good trade-off for a simpler interface. That said, given the work we already do on the way into the git store, it seems very reasonable to avoid all that complexity and just track the CRC-32s. I'm probably leaning slightly towards tracking them in gomodfs as you've got it already, WDYT? |
||
| } | ||
|
|
||
| func (s *Storage) GetFile(ctx context.Context, h store.ModHandle, path string) ([]byte, error) { | ||
|
|
@@ -955,9 +956,50 @@ func (s *Storage) newModHandle(mv store.ModuleVersion, modTree objRef) (store.Mo | |
| } | ||
| mh.dirEnts[dir] = append(mh.dirEnts[dir], ent) | ||
| } | ||
|
|
||
| // Try to load CRC32 data from the zipcrc blob. | ||
| if crcObj, err := s.getObject(context.TODO(), modTree.String()+":zipcrc", "blob"); err == nil { | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Why not |
||
| for line := range bytes.Lines(crcObj.content) { | ||
| line = bytes.TrimRight(line, "\n") | ||
| if len(line) == 0 { | ||
| continue | ||
| } | ||
| hexCRC, filePath, ok := bytes.Cut(line, []byte("\t")) | ||
| if !ok || len(hexCRC) != 8 { | ||
| continue | ||
| } | ||
| crc, err := strconv.ParseUint(string(hexCRC), 16, 32) | ||
| if err != nil { | ||
| continue | ||
| } | ||
| p := string(filePath) | ||
| if bm, ok := mh.blobMeta[p]; ok { | ||
| bm.CRC32 = uint32(crc) | ||
| mh.blobMeta[p] = bm | ||
| } | ||
| } | ||
| } | ||
| // If zipcrc doesn't exist (old cache entries), CRC32 fields stay 0. | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Maybe expand this comment to explain the implications downstream? IIUC we'll then error when the zip gets constructed and end up reporting the zip doesn't exist, so essentially the existing behaviour we have today? |
||
|
|
||
| return mh, nil | ||
| } | ||
|
|
||
| func (s *Storage) GetZipFileEntries(_ context.Context, h store.ModHandle) ([]store.ZipFileEntry, error) { | ||
| mh := h.(*modHandle) | ||
| entries := make([]store.ZipFileEntry, 0, len(mh.blobMeta)) | ||
| for p, bm := range mh.blobMeta { | ||
| entries = append(entries, store.ZipFileEntry{ | ||
| Path: p, | ||
| Size: bm.Size, | ||
| CRC32: bm.CRC32, | ||
| }) | ||
| } | ||
| slices.SortFunc(entries, func(a, b store.ZipFileEntry) int { | ||
| return cmp.Compare(a.Path, b.Path) | ||
| }) | ||
| return entries, nil | ||
| } | ||
|
|
||
| func (s *Storage) GetZipHash(ctx context.Context, h store.ModHandle) ([]byte, error) { | ||
| tree := h.(*modHandle).modTree | ||
|
|
||
|
|
@@ -995,6 +1037,26 @@ func (s *Storage) PutModule(ctx context.Context, mv store.ModuleVersion, data st | |
| return nil, fmt.Errorf("failed to add %s file: %w", ext, err) | ||
| } | ||
| } | ||
| // Build the zipcrc blob: one line per file, sorted by path. | ||
| // Format: "<8-hex-crc32>\t<path>\n" | ||
| var hasCRC bool | ||
| for _, f := range data.Files { | ||
| if f.CRC32() != 0 { | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. As this is accessing data from the original zip file, the only way this could be zero is if the CRC-32 appears in the data descriptor (trailing fields after the file content) instead of the file header. If we get zip files of that form, we'll fail to extract the CRC-32. Maybe we don't care because it's always the same tool creating the zips so we probably won't see zips of that form? But if we want to catch that, we'd have to write our own data descriptor parsing logic because unfortunately the standard library doesn't expose it AFAICT. However, we are also about to read the entire file contents in Anyway, all that to say, there are a few alternatives to consider here, and quite a lot of subtlety behind this check. At the least, probably worth documenting the choices in more detail with a comment. |
||
| hasCRC = true | ||
| break | ||
| } | ||
| } | ||
| if hasCRC { | ||
| var crcBuf bytes.Buffer | ||
| for _, f := range data.Files { | ||
| fmt.Fprintf(&crcBuf, "%08x\t%s\n", f.CRC32(), f.Path()) | ||
| } | ||
| crcData := crcBuf.Bytes() | ||
| if err := tb.addFile("zipcrc", static(crcData), 0644); err != nil { | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Would we get better git store deduplication across similar module versions if we had a zipcrc blob per path? Was the decision here intentional to reduce the number of blobs? I can see it would be a lot of extra blobs to do one CRC-32 per blob, but not sure what kind of impact that would have on performance. |
||
| return nil, fmt.Errorf("failed to add zipcrc file: %w", err) | ||
| } | ||
| } | ||
|
|
||
| for _, f := range data.Files { | ||
| if err := tb.addFile("zip/"+f.Path(), f.Open, f.Mode()); err != nil { | ||
| return nil, fmt.Errorf("failed to add zip file %q: %w", f.Path(), err) | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Have you seen any pathologically large module zips? I had kind of assumed we'd want to make some interface changes to support streaming rather than reading the whole thing into memory, but I don't have a good feel for the worst-case we see in practice.