Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
66 changes: 66 additions & 0 deletions src/utils/glob-oom.test.mts
Original file line number Diff line number Diff line change
@@ -0,0 +1,66 @@
import { mkdirSync, mkdtempSync, rmSync, writeFileSync } from 'node:fs'
import { tmpdir } from 'node:os'
import path from 'node:path'

import { describe, expect, it } from 'vitest'

import { normalizePath } from '@socketsecurity/registry/lib/path'

import { globWithGitIgnore } from './glob.mts'

/**
 * Predicate passed to globWithGitIgnore as its `filter` option: accepts only
 * paths whose final characters are exactly `.json` (case-sensitive).
 *
 * Kept at module scope, rather than inline in the test, to satisfy linting
 * rules.
 */
function filterJsonFiles(filepath: string): boolean {
  const suffix = '.json'
  // A negative slice start takes the trailing `suffix.length` characters, or
  // the whole string when it is shorter than the suffix.
  return filepath.slice(-suffix.length) === suffix
}

// Kept in a dedicated file, without the mock-fs node_modules preload, so that
// the large ignore set built here is the only significant allocation in the
// worker.
describe('globWithGitIgnore() large monorepo memory', () => {
  // Regression guard. Scanning a large monorepo used to OOM because the whole
  // unioned gitignore set was passed to fast-glob, which recompiled it on every
  // directory scan. The 100k-pattern tree built below crashes that pre-fix
  // path; the walk has to finish and return the right manifests. The real
  // filesystem is used because mock-fs is too heavy at this size.
  it('does not exhaust memory on a huge nested-.gitignore pattern set', async () => {
    // 100 packages * 1000 lines = 100k distinct patterns. At this count the
    // pre-fix code exhausts a constrained test-worker heap, while the reused
    // `ignore` instance stays well within it.
    const packageTotal = 100
    const patternsPerPackage = 1_000
    const manifestName = 'package.json'
    const emptyManifest = '{}'

    const scratchRoot = mkdtempSync(path.join(tmpdir(), 'socket-glob-oom-'))
    try {
      // Every line targets a distinct package-local generated dir, so the flat
      // union over all packages holds packageTotal * patternsPerPackage
      // distinct patterns.
      const ignoreFileText = `${Array.from(
        { length: patternsPerPackage },
        (_, n) => `generated_${n}/`,
      ).join('\n')}\n`

      // Expected result: the root manifest plus one manifest per package.
      const rootManifest = path.join(scratchRoot, manifestName)
      writeFileSync(rootManifest, emptyManifest)
      const wanted = [normalizePath(rootManifest)]

      for (let index = 0; index < packageTotal; index += 1) {
        const packageDir = path.join(scratchRoot, 'packages', `pkg-${index}`)
        const generatedDir = path.join(packageDir, 'generated_0')
        const packageManifest = path.join(packageDir, manifestName)

        mkdirSync(generatedDir, { recursive: true })
        writeFileSync(path.join(packageDir, '.gitignore'), ignoreFileText)
        writeFileSync(packageManifest, emptyManifest)
        // This manifest sits inside the package's own ignored generated dir and
        // must NOT be returned, which proves the gitignore set is still applied.
        writeFileSync(path.join(generatedDir, manifestName), emptyManifest)

        wanted.push(normalizePath(packageManifest))
      }

      // Same call shape as production: passing a manifest filter forces the
      // streaming branch that getPackageFilesForScan always takes.
      const found = await globWithGitIgnore(['**/*'], {
        cwd: scratchRoot,
        filter: filterJsonFiles,
      })

      expect(found.map(normalizePath).sort()).toEqual(wanted.sort())
    } finally {
      // Always remove the temp tree, even when an assertion above fails.
      rmSync(scratchRoot, { force: true, recursive: true })
    }
  }, 60_000)
})
52 changes: 22 additions & 30 deletions src/utils/glob.mts
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ import { parse as yamlParse } from 'yaml'
import { isDirSync, safeReadFile } from '@socketsecurity/registry/lib/fs'
import { defaultIgnore } from '@socketsecurity/registry/lib/globs'
import { readPackageJson } from '@socketsecurity/registry/lib/packages'
import { normalizePath } from '@socketsecurity/registry/lib/path'
import { transform } from '@socketsecurity/registry/lib/streams'
import { isNonEmptyString } from '@socketsecurity/registry/lib/strings'

Expand Down Expand Up @@ -290,22 +291,20 @@ export async function globWithGitIgnore(
}
}

let hasNegatedPattern = false
for (const p of ignores) {
if (p.charCodeAt(0) === 33 /*'!'*/) {
hasNegatedPattern = true
break
}
}
// Match the high-cardinality gitignore set through one reused `ignore`
// instance, not fast-glob's `ignore` (which recompiles its whole array per
// directory scan and OOMs on tens of thousands of patterns); fast-glob keeps
// only the bounded prune set. `ignorecase` tracks fast-glob's default.
const ig = ignore({
ignorecase: additionalOptions.caseSensitiveMatch === false,
}).add([...ignores])

const globOptions = {
__proto__: null,
absolute: true,
cwd,
dot: true,
ignore: hasNegatedPattern
? [...defaultIgnore, ...cliMinimatchIgnores]
: [...ignores, ...cliMinimatchIgnores].map(stripTrailingSlash),
ignore: [...defaultIgnore, ...cliMinimatchIgnores],
...additionalOptions,
// Skip directories the running user cannot read rather than aborting the
// whole walk on the first `EACCES` (see the .gitignore discovery walk
Expand All @@ -316,33 +315,26 @@ export async function globWithGitIgnore(
suppressErrors: true,
} as GlobOptions

// When no filter is provided and no negated patterns exist, use the fast path.
if (!hasNegatedPattern && !filter) {
return await fastGlob.glob(patterns as string[], globOptions)
}
// Add support for negated "ignore" patterns which many globbing libraries,
// including 'fast-glob', 'globby', and 'tinyglobby', lack support for.
// Use streaming to avoid unbounded memory accumulation.
// This is critical for large monorepos with 100k+ files.
// Stream results so memory stays bounded on large monorepos with 100k+ files:
// `ig` applies the gitignore matching per entry and the optional caller filter
// (e.g. manifest files only) drops non-matches before they accumulate, instead
// of collecting every path and filtering afterward.
const results: string[] = []
const ig = hasNegatedPattern ? ignore().add([...ignores]) : null
const stream = fastGlob.globStream(
patterns as string[],
globOptions,
) as AsyncIterable<string>
for await (const p of stream) {
// Check gitignore patterns with negation support.
if (ig) {
// Note: the input files must be INSIDE the cwd. If you get strange looking
// relative path errors here, most likely your path is outside the given cwd.
const relPath = globOptions.absolute ? path.relative(cwd, p) : p
if (ig.ignores(relPath)) {
continue
}
// Normalize to POSIX separators: the `ignore` patterns are forward-slash
// anchored (ignoreFileLinesToGlobPatterns), so a Windows backslash path from
// path.relative would never match. Input must be inside cwd, else
// path.relative returns an odd `..`-prefixed relative path.
const relPath = normalizePath(
globOptions.absolute ? path.relative(cwd, p) : p,
)
if (ig.ignores(relPath)) {
continue
}
// Apply the optional filter to reduce memory usage.
// When scanning large monorepos, this filters early (e.g., to manifest files only)
// instead of accumulating all 100k+ files and filtering later.
if (filter && !filter(p)) {
continue
}
Expand Down
22 changes: 22 additions & 0 deletions src/utils/glob.test.mts
Original file line number Diff line number Diff line change
Expand Up @@ -210,6 +210,28 @@ describe('glob utilities', () => {
])
})

it('matches gitignore entries case-sensitively, like fast-glob', async () => {
  // By default the `ignore` package matches case-insensitively, whereas
  // fast-glob (caseSensitiveMatch defaults to true) and git treat the ignore
  // set case-sensitively. So a `dist/` entry has to ignore `dist/` while
  // leaving the differently-cased sibling `Dist/` untouched.
  const fixtureTree = {
    [`${mockFixturePath}/.gitignore`]: 'dist/\n',
    [`${mockFixturePath}/package.json`]: '{}',
    [`${mockFixturePath}/dist/a.json`]: '{}',
    [`${mockFixturePath}/Dist/b.json`]: '{}',
  }
  mockTestFs(fixtureTree)

  const found = await globWithGitIgnore(['**/*.json'], {
    cwd: mockFixturePath,
  })
  const normalizedSorted = found.map(normalizePath).sort()

  // `dist/a.json` is absent; `Dist/b.json` and the root manifest remain.
  expect(normalizedSorted).toEqual([
    `${mockFixturePath}/Dist/b.json`,
    `${mockFixturePath}/package.json`,
  ])
})

it('keeps additionalIgnores anchored even when a gitignore negation forces the streaming path', async () => {
// A bare `tests` pattern means "the entry `tests` at the scan root".
// The streaming path uses the `ignore` package for gitignore-translated
Expand Down
7 changes: 4 additions & 3 deletions src/utils/path-resolve.mts
Original file line number Diff line number Diff line change
Expand Up @@ -144,9 +144,10 @@ export async function getPackageFilesForScan(
...options,
} as PackageFilesForScanOptions

// Apply the supported files filter during streaming to avoid accumulating
// all files in memory. This is critical for large monorepos with 100k+ files
// where accumulating all paths before filtering causes OOM errors.
// Apply the supported files filter during streaming so globWithGitIgnore drops
// non-manifest paths as they are walked. This bounds RESULT-path memory; it
// does NOT bound the gitignore ignore-pattern memory (that OOM is handled
// inside globWithGitIgnore via a single reused `ignore` instance).
const filter = createSupportedFilesFilter(supportedFiles)

const normalizedInputPaths = inputPaths.map(p =>
Expand Down
Loading