diff --git a/internal/project/refcountcache.go b/internal/project/refcountcache.go index 909b5aa4b1b..f93139ee7de 100644 --- a/internal/project/refcountcache.go +++ b/internal/project/refcountcache.go @@ -23,6 +23,12 @@ type RefCountCache[K comparable, V any, AcquireArgs any] struct { entries collections.SyncMap[K, *refCountCacheEntry[V]] parse func(K, AcquireArgs) V + + // refHook, if non-nil, is invoked at the very start of Ref, before the entry + // is loaded from the map. It exists only so tests can deterministically + // interleave a concurrent Deref into the window that produces the + // "cache entry not found" panic. It is always nil in normal operation. + refHook func(identity K) } func NewRefCountCache[K comparable, V any, AcquireArgs any]( @@ -60,6 +66,9 @@ func (c *RefCountCache[K, V, AcquireArgs]) Has(identity K) bool { // Ref increments the reference count for an existing entry. // Panics if the entry does not exist. func (c *RefCountCache[K, V, AcquireArgs]) Ref(identity K) { + if c.refHook != nil { + c.refHook(identity) + } entry, ok := c.entries.Load(identity) if !ok { panic("cache entry not found") diff --git a/internal/project/refcountcache_race_test.go b/internal/project/refcountcache_race_test.go new file mode 100644 index 00000000000..97483a310ac --- /dev/null +++ b/internal/project/refcountcache_race_test.go @@ -0,0 +1,256 @@ +package project + +import ( + "context" + "fmt" + "math/rand/v2" + "strings" + "sync" + "testing" + "time" + + "github.com/microsoft/typescript-go/internal/bundled" + "github.com/microsoft/typescript-go/internal/lsp/lsproto" + "github.com/microsoft/typescript-go/internal/project/logging" + "github.com/microsoft/typescript-go/internal/vfs/vfstest" +) + +// TestParseCacheConcurrentSnapshotRace reproduces a crash observed in telemetry: +// +// panic("cache entry not found") at internal/project/refcountcache.go +// RefCountCache.Ref -> Project.CreateProgram (single-file clone path) +// -> ProjectCollectionBuilder.DidRequestFile -> Snapshot.Clone +// -> Session.getSnapshot -> Session.GetLanguageService +// +// The crash is overwhelmingly Windows, occasionally macOS, never Linux, which +// points at a scheduling-sensitive interleaving rather than a logic bug on a +// single goroutine. +// +// Root-cause analysis +// ------------------- +// session.parseCache is a RefCountCache keyed by ParseCacheKey{ParseOptions, +// ScriptKind, Hash}. The cache itself is internally synchronized (a +// collections.SyncMap plus a per-entry sync.Mutex), so this is NOT a memory +// data race the -race detector can catch. The panic is a *logical* ref-count +// problem: RefCountCache.Ref(K) loads the entry for K and panics if the entry +// has already been deleted (refCount reached 0 and the entry was removed) by +// the time the load happens. +// +// The single Ref site that takes the panicking path is Project.CreateProgram's +// single-file clone branch, which Refs every reused source file AND every entry +// in newProgram.DuplicateSourceFiles(). The Deref/delete side is +// Snapshot.dispose, which Derefs the same keys when a program's last snapshot +// goes away. The foreground program rebuild is serialized by snapshotUpdateMu, +// but snapshot disposal runs unsynchronized with it (e.g. adoptSnapshotChange's +// discard path and other background Derefs run outside snapshotMu). +// +// DuplicateSourceFiles are the highest-contention entries by far: when the same +// package id (name + version) is installed under several node_modules locations, +// package deduplication keeps one primary copy in SourceFiles() and records the +// rest as DuplicateSourceFiles. Because the copies are byte-identical they all +// map to the SAME ParseCacheKey, so a single clone Refs that one key once per +// duplicate (here: many times) and a single dispose Derefs it just as many +// times. That makes the per-key Ref/Deref traffic on one shared entry an order +// of magnitude heavier than ordinary files, which is exactly the kind of hot +// shared entry the production crash needs to delete out from under a concurrent +// Ref. +// +// This test builds such a deduplicated-package workspace and hammers the window +// from many goroutines for many iterations. It is inherently probabilistic: it +// will not fail on every run, but it is designed to fail "at least some of the +// time", especially under `-count` and on Windows/macOS scheduling: +// +// go test -run TestParseCacheConcurrentSnapshotRace -count=50 ./internal/project +// +// It is CI-safe: when the race does not trigger it simply passes. +func TestParseCacheConcurrentSnapshotRace(t *testing.T) { + t.Parallel() + + if !bundled.Embedded { + t.Skip("bundled files are not embedded") + } + + const root = "/user/username/projects/myproject" + + // node10 resolution ("module": "commonjs") so a bare `import "x"` resolves to + // node_modules/x/index.d.ts and picks up the package id from package.json. + files := map[string]any{ + root + "/tsconfig.json": `{"compilerOptions":{"module":"commonjs","strict":true}}`, + } + + // Many wrapper packages, each nesting its OWN copy of x@1.0.0. The copies are + // byte-identical and share package id x@1.0.0, so package deduplication turns + // all but one into DuplicateSourceFiles that all map to the same parse-cache + // key. Editing the open src files Refs that single hot key once per duplicate + // on every clone and Derefs it just as many times on every dispose. + const wrapperCount = 16 + for i := range wrapperCount { + wrap := fmt.Sprintf("/node_modules/wrap%d", i) + files[root+wrap+"/package.json"] = fmt.Sprintf(`{"name":"wrap%d","version":"1.0.0"}`, i) + files[root+wrap+"/index.d.ts"] = "import X from \"x\";\nexport const w: X;\n" + files[root+wrap+"/node_modules/x/package.json"] = `{"name":"x","version":"1.0.0"}` + files[root+wrap+"/node_modules/x/index.d.ts"] = "export default class X {\n private x: number;\n}\n" + } + + // A hub file that imports every wrapper, pulling all x copies into the program + // so the duplicates are formed once for the whole project. + var hub strings.Builder + for i := range wrapperCount { + fmt.Fprintf(&hub, "import { w as w%d } from \"wrap%d\";\n", i, i) + } + hub.WriteString("export const hub = 0;\n") + files[root+"/src/hub.ts"] = hub.String() + + // Several "entry" files that import the hub and are opened/edited concurrently. + // Editing an open entry takes the single-file clone path in CreateProgram, + // which Refs all reused files plus all DuplicateSourceFiles. + const entryCount = 8 + entryContents := make([]string, entryCount) + for e := range entryCount { + content := "import { hub } from './hub';\n" + fmt.Sprintf("export const entry%d = hub;\n", e) + entryContents[e] = content + files[fmt.Sprintf("%s/src/entry%d.ts", root, e)] = content + } + + fs := bundled.WrapFS(vfstest.FromMap(files, false /*useCaseSensitiveFileNames*/)) + session := NewSession(&SessionInit{ + BackgroundCtx: context.Background(), + Logger: logging.NewTestLogger(), + Options: &SessionOptions{ + CurrentDirectory: "/", + DefaultLibraryPath: bundled.LibPath(), + TypingsLocation: "/home/src/Library/Caches/typescript", + PositionEncoding: lsproto.PositionEncodingKindUTF8, + WatchEnabled: false, + LoggingEnabled: false, + }, + FS: fs, + }) + defer session.Close() + + ctx := context.Background() + entryURIs := make([]lsproto.DocumentUri, entryCount) + for e := range entryCount { + entryURIs[e] = lsproto.DocumentUri(fmt.Sprintf("file://%s/src/entry%d.ts", root, e)) + session.DidOpenFile(ctx, entryURIs[e], 1, entryContents[e], lsproto.LanguageKindTypeScript) + } + + // Sanity-check that the workspace actually produced DuplicateSourceFiles; if + // it didn't, the test isn't exercising the implicated code path at all. + if _, err := session.GetLanguageService(ctx, entryURIs[0]); err != nil { + t.Fatalf("GetLanguageService: %v", err) + } + dupCount := 0 + for _, project := range session.Snapshot().ProjectCollection.Projects() { + if project.Program != nil { + dupCount += len(project.Program.DuplicateSourceFiles()) + } + } + if dupCount == 0 { + t.Fatalf("expected DuplicateSourceFiles to be produced by package deduplication, got 0 (workspace setup is not exercising the duplicate path)") + } + t.Logf("DuplicateSourceFiles in initial program: %d", dupCount) + + const iterations = 4000 + + // t.Fatal is not safe from non-test goroutines; collect panics and surface + // them from the main goroutine instead. + var failMu sync.Mutex + var failure any + recordPanic := func() { + if r := recover(); r != nil { + failMu.Lock() + if failure == nil { + failure = r + } + failMu.Unlock() + } + } + + var versionMu sync.Mutex + version := int32(1) + nextVersion := func() int32 { + versionMu.Lock() + defer versionMu.Unlock() + version++ + return version + } + + var wg sync.WaitGroup + + // A few editor goroutines that each edit their own open file with a small + // jitter between edits. The jitter lets the background warmAutoImportCache + // clone (which DidChangeFile cancels) actually make progress and run its + // program-clone Ref loop concurrently with foreground update clones, instead + // of being cancelled immediately by the next edit. + const editors = 3 + for e := range editors { + wg.Go(func() { + defer recordPanic() + uri := entryURIs[e%entryCount] + base := entryContents[e%entryCount] + rng := rand.New(rand.NewPCG(uint64(e), 0x9e3779b9)) + for i := range iterations { + newText := base + fmt.Sprintf("export const edit_%d_%d = %d;\n", e, i, i) + session.DidChangeFile(ctx, uri, nextVersion(), []lsproto.TextDocumentContentChangePartialOrWholeDocument{ + {WholeDocument: &lsproto.TextDocumentContentChangeWholeDocument{Text: newText}}, + }) + if rng.IntN(4) == 0 { + time.Sleep(time.Duration(rng.IntN(200)) * time.Microsecond) + } + } + }) + } + + // Reader goroutines that force foreground snapshot update clones by flushing + // the editors' pending changes. These foreground CreateProgram Ref loops are + // the ones that panic when a concurrent background dispose deletes a shared + // parse-cache entry out from under them. + const readers = 12 + for r := range readers { + wg.Go(func() { + defer recordPanic() + rng := rand.New(rand.NewPCG(uint64(r), 0x1234567)) + for range iterations { + uri := entryURIs[rng.IntN(entryCount)] + if _, err := session.GetLanguageService(ctx, uri); err != nil { + t.Errorf("GetLanguageService: %v", err) + return + } + } + }) + } + + // Auto-import requesters that clone the current snapshot WITHOUT holding + // snapshotMu (the unsynchronized clone path) and let it be adopted or + // discarded on the background queue. The discard path Derefs (and may dispose) + // a snapshot outside snapshotMu, racing the foreground Ref loops. + const autoImporters = 6 + for a := range autoImporters { + wg.Go(func() { + defer recordPanic() + rng := rand.New(rand.NewPCG(uint64(a), 0xdeadbeef)) + for range iterations { + uri := entryURIs[rng.IntN(entryCount)] + base := session.Snapshot() + if !base.tryRef() { + continue + } + if _, err := session.GetLanguageServiceWithAutoImports(ctx, base, uri); err != nil { + base.Deref(session) + continue + } + base.Deref(session) + } + }) + } + + wg.Wait() + session.WaitForBackgroundTasks() + + failMu.Lock() + defer failMu.Unlock() + if failure != nil { + t.Fatalf("snapshot clone panicked under concurrency (the bug): %v", failure) + } +} diff --git a/internal/project/refcountcache_test.go b/internal/project/refcountcache_test.go index 439588cd3e8..ec5d84eeee8 100644 --- a/internal/project/refcountcache_test.go +++ b/internal/project/refcountcache_test.go @@ -3,6 +3,7 @@ package project import ( "context" "strings" + "sync" "testing" "github.com/microsoft/typescript-go/internal/ast" @@ -321,3 +322,59 @@ func TestRefCountingCaches(t *testing.T) { }) }) } + +// TestRefCountCacheRefDeleteBeforeLoad is the deterministic counterpart to the +// probabilistic TestParseCacheConcurrentSnapshotRace stress test. It reproduces +// the production crash +// +// panic("cache entry not found") at RefCountCache.Ref +// +// every single time, by forcing the one interleaving that triggers it. +// +// In the field, a goroutine in Project.CreateProgram's single-file clone path +// calls parseCache.Ref(K) for a source file that a concurrent Snapshot.dispose +// is Deref-ing to zero. Deref drives the entry's refCount to 0 and calls +// entries.Delete(K). RefCountCache.Ref already tolerates ONE ordering of that +// race: if the entry is still found by entries.Load but its refCount has +// already hit 0 (the Delete raced Ref *after* the Load, while Ref was taking +// the per-entry lock), Ref re-stores the entry and returns. But it does NOT +// tolerate the other ordering: if entries.Delete(K) completes *before* Ref's +// entries.Load(K), the load misses and Ref panics. Ref cannot recover locally +// in that case because it has no AcquireArgs with which to re-parse the value, +// so the real fix belongs in the snapshot/program ref-counting that is supposed +// to keep K alive across the clone. This test pins the proximate mechanism. +// +// The refHook fires at the very start of Ref, before the load, and runs the +// racing Deref exactly once, forcing the delete-before-load ordering on every +// run rather than relying on scheduler luck. +func TestRefCountCacheRefDeleteBeforeLoad(t *testing.T) { + t.Parallel() + + cache := NewRefCountCache( + RefCountCacheOptions{}, + func(key string, value string) string { return value }, + ) + // A single owner holds the entry: refCount == 1. + cache.Acquire("k", "value") + + var once sync.Once + cache.refHook = func(identity string) { + once.Do(func() { + // The last other owner releases the entry: refCount 1 -> 0, and the + // entry is deleted from the map, all before Ref's entries.Load runs. + cache.Deref(identity) + }) + } + + defer func() { + r := recover() + if r == nil { + t.Fatal("expected Ref to panic with \"cache entry not found\" when the entry is deleted before the load, but it did not") + } + if msg, _ := r.(string); msg != "cache entry not found" { + t.Fatalf("unexpected panic value: %v", r) + } + }() + + cache.Ref("k") +}