diff --git a/src/utils/glob-oom.test.mts b/src/utils/glob-oom.test.mts new file mode 100644 index 000000000..f1fd8f11f --- /dev/null +++ b/src/utils/glob-oom.test.mts @@ -0,0 +1,66 @@ +import { mkdirSync, mkdtempSync, rmSync, writeFileSync } from 'node:fs' +import { tmpdir } from 'node:os' +import path from 'node:path' + +import { describe, expect, it } from 'vitest' + +import { normalizePath } from '@socketsecurity/registry/lib/path' + +import { globWithGitIgnore } from './glob.mts' + +// Defined at module scope to satisfy linting rules. +function filterJsonFiles(filepath: string): boolean { + return filepath.endsWith('.json') +} + +// This suite lives in its own file, with no mock-fs node_modules preload, so the +// large ignore set it builds is the only significant allocation in the worker. +describe('globWithGitIgnore() large monorepo memory', () => { + // Regression: scanning a large monorepo OOM'd because the whole unioned + // gitignore set was handed to fast-glob, which recompiled it per directory + // scan. The 100k-pattern tree below crashes the pre-fix path; the walk must + // complete with the right manifests. Real fs (mock-fs is too heavy here). + it('does not exhaust memory on a huge nested-.gitignore pattern set', async () => { + const realTmp = mkdtempSync(path.join(tmpdir(), 'socket-glob-oom-')) + try { + // 100 packages * 1000 lines = 100k distinct patterns. The pre-fix code + // (whole set handed to fast-glob, re-compiled per directory scan) exhausts + // a constrained test-worker heap at this count, while the reused `ignore` + // instance stays well within it. + const pkgCount = 100 + const linesPerPkg = 1_000 + // Each line anchors to a distinct local generated dir, so the flat union + // across packages is pkgCount * linesPerPkg distinct patterns. 
+ const lines: string[] = [] + for (let l = 0; l < linesPerPkg; l += 1) { + lines.push(`generated_${l}/`) + } + const gitignoreBody = `${lines.join('\n')}\n` + // The root manifest and one manifest per package must be found. + writeFileSync(path.join(realTmp, 'package.json'), '{}') + const expected = [normalizePath(path.join(realTmp, 'package.json'))] + for (let d = 0; d < pkgCount; d += 1) { + const pkgDir = path.join(realTmp, 'packages', `pkg-${d}`) + const ignoredDir = path.join(pkgDir, 'generated_0') + mkdirSync(ignoredDir, { recursive: true }) + writeFileSync(path.join(pkgDir, '.gitignore'), gitignoreBody) + writeFileSync(path.join(pkgDir, 'package.json'), '{}') + // A manifest inside the package's own ignored generated dir must be + // excluded, proving the gitignore set is still honored. + writeFileSync(path.join(ignoredDir, 'package.json'), '{}') + expected.push(normalizePath(path.join(pkgDir, 'package.json'))) + } + + // Mirror the production call shape: a manifest filter forces the streaming + // branch that getPackageFilesForScan always takes. 
+ const results = await globWithGitIgnore(['**/*'], { + cwd: realTmp, + filter: filterJsonFiles, + }) + + expect(results.map(normalizePath).sort()).toEqual(expected.sort()) + } finally { + rmSync(realTmp, { force: true, recursive: true }) + } + }, 60_000) +}) diff --git a/src/utils/glob.mts b/src/utils/glob.mts index e24cf54c7..a31c03c5d 100644 --- a/src/utils/glob.mts +++ b/src/utils/glob.mts @@ -8,6 +8,7 @@ import { parse as yamlParse } from 'yaml' import { isDirSync, safeReadFile } from '@socketsecurity/registry/lib/fs' import { defaultIgnore } from '@socketsecurity/registry/lib/globs' import { readPackageJson } from '@socketsecurity/registry/lib/packages' +import { normalizePath } from '@socketsecurity/registry/lib/path' import { transform } from '@socketsecurity/registry/lib/streams' import { isNonEmptyString } from '@socketsecurity/registry/lib/strings' @@ -290,22 +291,20 @@ export async function globWithGitIgnore( } } - let hasNegatedPattern = false - for (const p of ignores) { - if (p.charCodeAt(0) === 33 /*'!'*/) { - hasNegatedPattern = true - break - } - } + // Match the high-cardinality gitignore set through one reused `ignore` + // instance, not fast-glob's `ignore` (which recompiles its whole array per + // directory scan and OOMs on tens of thousands of patterns); fast-glob keeps + // only the bounded prune set. `ignorecase` inverts `caseSensitiveMatch`. + const ig = ignore({ + ignorecase: additionalOptions.caseSensitiveMatch === false, + }).add([...ignores]) const globOptions = { __proto__: null, absolute: true, cwd, dot: true, - ignore: hasNegatedPattern - ? 
[...defaultIgnore, ...cliMinimatchIgnores] - : [...ignores, ...cliMinimatchIgnores].map(stripTrailingSlash), + ignore: [...defaultIgnore, ...cliMinimatchIgnores], ...additionalOptions, // Skip directories the running user cannot read rather than aborting the // whole walk on the first `EACCES` (see the .gitignore discovery walk @@ -316,33 +315,26 @@ export async function globWithGitIgnore( suppressErrors: true, } as GlobOptions - // When no filter is provided and no negated patterns exist, use the fast path. - if (!hasNegatedPattern && !filter) { - return await fastGlob.glob(patterns as string[], globOptions) - } - // Add support for negated "ignore" patterns which many globbing libraries, - // including 'fast-glob', 'globby', and 'tinyglobby', lack support for. - // Use streaming to avoid unbounded memory accumulation. - // This is critical for large monorepos with 100k+ files. + // Stream results so memory stays bounded on large monorepos with 100k+ files: + // `ig` applies the gitignore matching per entry and the optional caller filter + // (e.g. manifest files only) drops non-matches before they accumulate, instead + // of collecting every path and filtering afterward. const results: string[] = [] - const ig = hasNegatedPattern ? ignore().add([...ignores]) : null const stream = fastGlob.globStream( patterns as string[], globOptions, ) as AsyncIterable for await (const p of stream) { - // Check gitignore patterns with negation support. - if (ig) { - // Note: the input files must be INSIDE the cwd. If you get strange looking - // relative path errors here, most likely your path is outside the given cwd. - const relPath = globOptions.absolute ? path.relative(cwd, p) : p - if (ig.ignores(relPath)) { - continue - } + // Normalize to POSIX separators: the `ignore` patterns are forward-slash + // anchored (ignoreFileLinesToGlobPatterns), so a Windows backslash path from + // path.relative would never match. 
Input must be inside cwd, else + // path.relative yields a `..`-prefixed path that `ig.ignores` rejects. + const relPath = normalizePath( + globOptions.absolute ? path.relative(cwd, p) : p, + ) + if (ig.ignores(relPath)) { + continue } - // Apply the optional filter to reduce memory usage. - // When scanning large monorepos, this filters early (e.g., to manifest files only) - // instead of accumulating all 100k+ files and filtering later. if (filter && !filter(p)) { continue } diff --git a/src/utils/glob.test.mts b/src/utils/glob.test.mts index f403306cd..e381c5e02 100644 --- a/src/utils/glob.test.mts +++ b/src/utils/glob.test.mts @@ -210,6 +210,28 @@ describe('glob utilities', () => { ]) }) + it('matches gitignore entries case-sensitively, like fast-glob', async () => { + // The `ignore` package defaults to case-insensitive matching, but + // fast-glob (caseSensitiveMatch defaults to true) and git treat the + // ignore set case-sensitively. A `dist/` entry must ignore `dist/` but + // leave a differently-cased `Dist/` sibling alone. + mockTestFs({ + [`${mockFixturePath}/.gitignore`]: 'dist/\n', + [`${mockFixturePath}/package.json`]: '{}', + [`${mockFixturePath}/dist/a.json`]: '{}', + [`${mockFixturePath}/Dist/b.json`]: '{}', + }) + + const results = await globWithGitIgnore(['**/*.json'], { + cwd: mockFixturePath, + }) + + expect(results.map(normalizePath).sort()).toEqual([ + `${mockFixturePath}/Dist/b.json`, + `${mockFixturePath}/package.json`, + ]) + }) + it('keeps additionalIgnores anchored even when a gitignore negation forces the streaming path', async () => { // A bare `tests` pattern means "the entry `tests` at the scan root". 
// The streaming path uses the `ignore` package for gitignore-translated diff --git a/src/utils/path-resolve.mts b/src/utils/path-resolve.mts index 247d81ede..d2c1791ac 100644 --- a/src/utils/path-resolve.mts +++ b/src/utils/path-resolve.mts @@ -144,9 +144,10 @@ export async function getPackageFilesForScan( ...options, } as PackageFilesForScanOptions - // Apply the supported files filter during streaming to avoid accumulating - // all files in memory. This is critical for large monorepos with 100k+ files - // where accumulating all paths before filtering causes OOM errors. + // Apply the supported files filter during streaming so globWithGitIgnore drops + // non-manifest paths as they are walked. This bounds RESULT-path memory; it + // does NOT bound the gitignore ignore-pattern memory (that OOM is handled + // inside globWithGitIgnore via a single reused `ignore` instance). const filter = createSupportedFilesFilter(supportedFiles) const normalizedInputPaths = inputPaths.map(p =>