From c4655ca5ee83f9dd67d4fcafd37d58e1f97c66d2 Mon Sep 17 00:00:00 2001 From: "Finn (EACG)" Date: Thu, 11 Jun 2026 06:55:32 +0000 Subject: [PATCH 1/2] feat(checkpoint): add append-only scan log for sub-phase resume and dynamic concurrency MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Replaces phase-boundary-only checkpointing with a JSONL append-only scan log (.sandyaa/scan-log-{hash}.jsonl, where {hash} is the first 12 hex characters of the SHA-256 of the resolved target path). Each completed sub-step appends one line; on resume the log is replayed so the scan continues from the last saved step rather than from scratch. Five save points per scan: 1. After AI file prioritization — restores the ranked file list so the expensive LLM call is not repeated. 2. After each chunk's vulnerability detection — raw findings are stored; if interrupted mid-verification the detector is not re-invoked. 3. After each finding's recursive verification — verificationStatus, confidence, needsManualReview, and contradictions are saved so already-verified findings are skipped on resume. 4. After each POC generation attempt — the success/failed/skipped/error result and, when one was generated, the POC code itself are saved so already-generated POCs are restored. 5. After SARIF generation — prevents duplicate report writes on resume. Additional changes: - RecursiveStrategyEngine.apply() now accepts an optional options bag with alreadyVerified (a Map keyed by finding ID) and an onFindingVerified callback. - Orchestrator restores allVulnerabilities from the scan log for completed chunks so the final SARIF includes findings from all runs. - STRATEGY_CONCURRENCY in context-analyzer now scales with os.cpus().length (cpuCount * 2 - 2, clamped to [2, 8]) instead of a hard-coded 4; the SANDYAA_STRATEGY_CONCURRENCY override is still honoured. - Old checkpoint files remain fully backwards-compatible. 
Co-Authored-By: Claude Sonnet 4.6 --- src/analyzer/context-analyzer.ts | 6 +- src/orchestrator/orchestrator.ts | 269 ++++++++++++++++++++-------- src/recursive/recursive-strategy.ts | 38 +++- src/utils/scan-log.ts | 146 +++++++++++++++ 4 files changed, 378 insertions(+), 81 deletions(-) create mode 100644 src/utils/scan-log.ts diff --git a/src/analyzer/context-analyzer.ts b/src/analyzer/context-analyzer.ts index 76c2924..62cd666 100644 --- a/src/analyzer/context-analyzer.ts +++ b/src/analyzer/context-analyzer.ts @@ -9,6 +9,7 @@ import { PathResolver } from '../utils/path-resolver.js'; import { LightweightCodeFilter } from '../utils/code-filter.js'; import * as fs from 'fs/promises'; import { realpathSync } from 'fs'; +import * as os from 'os'; import * as path from 'path'; import { fileURLToPath } from 'url'; import chalk from 'chalk'; @@ -451,7 +452,10 @@ export class ContextAnalyzer { const STRATEGY_CONCURRENCY = (() => { const raw = parseInt(process.env.SANDYAA_STRATEGY_CONCURRENCY || '', 10); if (Number.isFinite(raw) && raw >= 1 && raw <= 8) return raw; - return 4; + // Scale with available CPUs: leave 2 threads for the orchestrator process. + // Clamp to [2, 8] so we stay aggressive on beefy machines but sane elsewhere. 
+ const cpuCount = os.cpus().length; + return Math.max(2, Math.min(8, cpuCount * 2 - 2)); })(); type StrategyOutcome = { diff --git a/src/orchestrator/orchestrator.ts b/src/orchestrator/orchestrator.ts index d738525..731f33c 100644 --- a/src/orchestrator/orchestrator.ts +++ b/src/orchestrator/orchestrator.ts @@ -4,6 +4,7 @@ import { POCGenerator } from '../poc-gen/poc-generator.js'; import { Reporter } from '../reporter/reporter.js'; import { SarifReporter } from '../reporter/sarif-reporter.js'; import { Checkpoint } from '../utils/checkpoint.js'; +import { ScanLog, ScanState } from '../utils/scan-log.js'; import { FileScanner } from '../utils/file-scanner.js'; import { RecursiveStrategyEngine } from '../recursive/recursive-strategy.js'; import { GitHelper } from '../utils/git-helper.js'; @@ -107,6 +108,7 @@ export interface Config { export class Orchestrator { private config: Config; private checkpoint: Checkpoint; + private scanLog: ScanLog; private analyzer: ContextAnalyzer; private detector: VulnerabilityDetector; private pocGen: POCGenerator; @@ -127,8 +129,9 @@ export class Orchestrator { constructor(config: Config) { this.config = config; - // Checkpoint, Reporter, and Detector will be initialized in run() with target-specific path + // Checkpoint, ScanLog, Reporter, and Detector will be initialized in run() with target-specific path this.checkpoint = null as any; // Temporary, will be set in run() + this.scanLog = null as any; // Temporary, will be set in run() this.reporter = null as any; // Temporary, will be set in run() this.detector = null as any; // Temporary, will be set in run() this.analyzer = new ContextAnalyzer(config); @@ -154,14 +157,17 @@ export class Orchestrator { this.dashboard = new DashboardRenderer(); } + private getSandyaaDir(): string { + return path.dirname(this.config.output.checkpoint_file); + } + private getCheckpointFile(targetPath: string): string { // Create unique checkpoint file for each project (based on absolute path hash) 
const hash = crypto.createHash('sha256') .update(path.resolve(targetPath)) .digest('hex') .substring(0, 12); - const checkpointDir = path.dirname(this.config.output.checkpoint_file); - return path.join(checkpointDir, `checkpoint-${hash}.json`); + return path.join(this.getSandyaaDir(), `checkpoint-${hash}.json`); } async run(startFresh: boolean = false, sarif: boolean = false, tsUpload?: string, tsProject?: string): Promise { @@ -205,9 +211,10 @@ export class Orchestrator { executor.setTargetPath(resolvedTarget); } - // Initialize project-specific checkpoint, reporter, and detector (after we know final target path) + // Initialize project-specific checkpoint, scan log, reporter, and detector const checkpointFile = this.getCheckpointFile(targetPath); this.checkpoint = new Checkpoint(checkpointFile); + this.scanLog = new ScanLog(ScanLog.getLogFile(targetPath, this.getSandyaaDir())); this.reporter = new Reporter(this.config, targetPath); this.detector = new VulnerabilityDetector(this.config, targetPath); if (sarif || tsUpload) { @@ -228,12 +235,20 @@ export class Orchestrator { // Check for existing checkpoint and ask user let processedFiles = new Set(); + let scanState: ScanState = { + prioritizedFiles: null, + detectedChunks: new Map(), + verifiedFindings: new Map(), + pocResults: new Map(), + sarifWritten: false, + }; const checkpointData = await this.checkpoint.loadForTarget(targetPath); if (startFresh) { // User explicitly wants fresh start if (checkpointData && checkpointData.processedFiles.length > 0) { await this.checkpoint.clear(); + await this.scanLog.clear(); console.log(chalk.green('Starting fresh analysis (checkpoint cleared)...\n')); } } else if (checkpointData && checkpointData.processedFiles.length > 0) { @@ -263,9 +278,14 @@ export class Orchestrator { if (shouldResume) { processedFiles = new Set(checkpointData.processedFiles); totalBugsFound = checkpointData.totalBugsFound; + scanState = await this.scanLog.loadState(); 
console.log(chalk.green('Resuming from checkpoint...\n')); + if (scanState.detectedChunks.size > 0) { + console.log(chalk.gray(` Scan log: ${scanState.detectedChunks.size} chunks with cached detection, ${scanState.verifiedFindings.size} verified findings, ${scanState.pocResults.size} POCs`)); + } } else { await this.checkpoint.clear(); + await this.scanLog.clear(); console.log(chalk.green('Starting fresh analysis...\n')); } } @@ -297,16 +317,27 @@ export class Orchestrator { let phaseStart = 0; if (filesToProcess.length > 1000 && processedFiles.size === 0) { - const prioritizer = new FilePrioritizer(targetPath, this.config.provider); - const prioritized = await prioritizer.prioritize(filesToProcess, { - phase: 'high-value', - samplingRate: 0.1, - focusAreas: [] - }); + // Re-use saved prioritization from the scan log if available (skips AI call) + if (scanState.prioritizedFiles && scanState.prioritizedFiles.length > 0) { + prioritizedFiles = scanState.prioritizedFiles; + console.log(chalk.gray(` Restored prioritized file list from scan log (${prioritizedFiles.length} files)`)); + } else { + const prioritizer = new FilePrioritizer(targetPath, this.config.provider); + const prioritized = await prioritizer.prioritize(filesToProcess, { + phase: 'high-value', + samplingRate: 0.1, + focusAreas: [] + }); - prioritizedFiles = prioritized - .sort((a: any, b: any) => b.priority - a.priority) - .map((p: any) => p.path); + prioritizedFiles = prioritized + .sort((a: any, b: any) => b.priority - a.priority) + .map((p: any) => p.path); + + await this.scanLog.append({ + step: 'prioritize', + result: { high_priority_count: prioritizedFiles.length, files: prioritizedFiles }, + }); + } // Phase 1: Analyze prioritized targets only filesToProcess = prioritizedFiles; @@ -316,10 +347,35 @@ export class Orchestrator { console.log(chalk.cyan(`Phase 2: Systematic coverage (${filesToProcess.length} files remaining)\n`)); } + // Restore allVulnerabilities for chunks already completed in a 
prior run. + // This ensures the final SARIF report is complete even on a resumed scan. + const allVulnerabilities: any[] = []; + for (const [, chunkData] of scanState.detectedChunks) { + const allFilesProcessed = chunkData.files.every(f => processedFiles.has(f)); + if (!allFilesProcessed) continue; // Will be (re-)processed in the main loop below + for (const finding of chunkData.findings) { + const enriched = { ...finding }; + const verifyResult = scanState.verifiedFindings.get(finding.id); + if (verifyResult) { + enriched.verificationStatus = verifyResult.status; + enriched.confidence = verifyResult.confidence; + enriched.needsManualReview = verifyResult.needsManualReview; + if (verifyResult.contradictions) enriched.contradictions = verifyResult.contradictions; + } + const pocResult = scanState.pocResults.get(finding.id); + if (pocResult?.poc) { + enriched.poc = { ...pocResult.poc }; + } + allVulnerabilities.push(enriched); + } + } + if (allVulnerabilities.length > 0) { + console.log(chalk.gray(` Restored ${allVulnerabilities.length} findings from scan log for prior-run chunks`)); + } + // Process files in dynamic chunks (adapts based on complexity) let iteration = 0; let i = 0; - const allVulnerabilities: any[] = []; while (i < filesToProcess.length) { iteration++; @@ -330,7 +386,7 @@ export class Orchestrator { const { bugsFound, findings } = await this.processChunk( chunk, iteration, phase, targetPath, processedFiles, totalBugsFound, - estimatedChunksRemaining, files.length + estimatedChunksRemaining, files.length, scanState ); totalBugsFound += bugsFound; allVulnerabilities.push(...findings); @@ -375,7 +431,7 @@ export class Orchestrator { const { bugsFound, findings } = await this.processChunk( chunk, iteration, 'systematic', targetPath, processedFiles, totalBugsFound, - estimatedChunksRemaining, files.length + estimatedChunksRemaining, files.length, scanState ); totalBugsFound += bugsFound; allVulnerabilities.push(...findings); @@ -409,8 +465,9 @@ export 
class Orchestrator { } // Generate SARIF report if requested - if (this.sarifReporter) { + if (this.sarifReporter && !scanState.sarifWritten) { await this.sarifReporter.generate(allVulnerabilities); + await this.scanLog.append({ step: 'sarif', result: { written: true } }); // Upload to TrustSource if --ts-upload was given if (tsUpload) { @@ -421,6 +478,8 @@ export class Orchestrator { console.log(chalk.yellow('The local SARIF file is still available in the findings directory.')); } } + } else if (this.sarifReporter && scanState.sarifWritten) { + console.log(chalk.gray('SARIF already written in a prior run — skipping duplicate generation.')); } // Generate summary report @@ -451,7 +510,14 @@ export class Orchestrator { processedFiles: Set, totalBugsFound: number, estimatedChunksRemaining: number, - totalFilesCount: number + totalFilesCount: number, + scanState: ScanState = { + prioritizedFiles: null, + detectedChunks: new Map(), + verifiedFindings: new Map(), + pocResults: new Map(), + sarifWritten: false, + } ): Promise<{ bugsFound: number; findings: any[] }> { console.log(chalk.bold(`\n[${phase}] Chunk ${iteration} (${chunk.length} files | ~${estimatedChunksRemaining} chunks remaining)`)); console.log(chalk.gray(` ${this.dynamicChunker.getExplanation()}`)); @@ -484,40 +550,55 @@ export class Orchestrator { `${contextTokens.toLocaleString()} tokens (${contextWindowPercent}% of ${(getDefaultContextWindow() / 1000).toFixed(0)}k)` )); - // Vulnerability Detection - console.log(chalk.cyan(`\n → Vulnerability detection: correlating findings and analyzing exploitability...`)); - const detectionStartTokens = this.totalTokensUsed; - let vulnerabilities = await this.detector.detect(context); - const detectionTokens = this.totalTokensUsed - detectionStartTokens; + // Vulnerability Detection — use scan log cache if this chunk was already detected + let vulnerabilities: any[] = []; + let detectionTokens = 0; + const chunkKey = ScanLog.chunkKey(chunk); + const cachedDetect = 
scanState.detectedChunks.get(chunkKey); - // Update dashboard with detection results - this.dashboard.update({ - phase: 'vulnerability-detection', - tokensUsed: this.totalTokensUsed, - }); + if (cachedDetect) { + vulnerabilities = cachedDetect.findings; + console.log(chalk.gray(`\n → Vulnerability detection: restored ${vulnerabilities.length} finding(s) from scan log (chunk already detected)`)); + this.dashboard.update({ phase: 'vulnerability-detection', tokensUsed: this.totalTokensUsed }); + } else { + console.log(chalk.cyan(`\n → Vulnerability detection: correlating findings and analyzing exploitability...`)); + const detectionStartTokens = this.totalTokensUsed; + vulnerabilities = await this.detector.detect(context); + detectionTokens = this.totalTokensUsed - detectionStartTokens; + + // Persist detection result to scan log immediately (before recursive verification) + await this.scanLog.append({ + step: 'detect', + chunk: iteration, + files: chunk, + result: { findings: vulnerabilities }, + }); - if (vulnerabilities.length > 0) { - console.log(chalk.green(` ✓ Found ${vulnerabilities.length} potential vulnerabilities | ${detectionTokens.toLocaleString()} tokens`)); + this.dashboard.update({ phase: 'vulnerability-detection', tokensUsed: this.totalTokensUsed }); - // Feed findings to dashboard - for (const v of vulnerabilities) { - const sev = (v.severity?.toLowerCase() || 'low') as 'critical' | 'high' | 'medium' | 'low'; - this.dashboard.addFinding(sev, `${v.type} at ${v.location?.file?.split('/').pop() || 'unknown'}`); - } + if (vulnerabilities.length > 0) { + console.log(chalk.green(` ✓ Found ${vulnerabilities.length} potential vulnerabilities | ${detectionTokens.toLocaleString()} tokens`)); - // Show sample of what was found (Claude decides what's important) - const sample = vulnerabilities.slice(0, 3); - for (const vuln of sample) { - const severity = vuln.severity?.toUpperCase() || 'UNKNOWN'; - const severityColor = ['critical', 
'high'].includes(vuln.severity?.toLowerCase() || '') ? chalk.red : chalk.yellow; - const attackerNote = vuln.attackerControlled?.isControlled ? 'VERIFIED ' : ''; - console.log(severityColor(` ${attackerNote}[${severity}] ${vuln.type} at ${vuln.location.file.split('/').pop()}:${vuln.location.line}`)); - } - if (vulnerabilities.length > 3) { - console.log(chalk.gray(` ... and ${vulnerabilities.length - 3} more`)); + // Feed findings to dashboard + for (const v of vulnerabilities) { + const sev = (v.severity?.toLowerCase() || 'low') as 'critical' | 'high' | 'medium' | 'low'; + this.dashboard.addFinding(sev, `${v.type} at ${v.location?.file?.split('/').pop() || 'unknown'}`); + } + + // Show sample of what was found + const sample = vulnerabilities.slice(0, 3); + for (const vuln of sample) { + const severity = vuln.severity?.toUpperCase() || 'UNKNOWN'; + const severityColor = ['critical', 'high'].includes(vuln.severity?.toLowerCase() || '') ? chalk.red : chalk.yellow; + const attackerNote = vuln.attackerControlled?.isControlled ? 'VERIFIED ' : ''; + console.log(severityColor(` ${attackerNote}[${severity}] ${vuln.type} at ${vuln.location.file.split('/').pop()}:${vuln.location.line}`)); + } + if (vulnerabilities.length > 3) { + console.log(chalk.gray(` ... 
and ${vulnerabilities.length - 3} more`)); + } + } else { + console.log(chalk.gray(` ✓ No vulnerabilities in this chunk | ${detectionTokens.toLocaleString()} tokens`)); } - } else { - console.log(chalk.gray(` ✓ No vulnerabilities in this chunk | ${detectionTokens.toLocaleString()} tokens`)); } // Recursive Analysis (if enabled) @@ -525,7 +606,12 @@ export class Orchestrator { this.dashboard.update({ phase: 'validation' }); console.log(chalk.cyan(`\n → Recursive verification: tracing call chains, checking contradictions...`)); const recursiveStartTokens = this.totalTokensUsed; - const enhanced = await this.recursiveEngine.apply(vulnerabilities, context); + const enhanced = await this.recursiveEngine.apply(vulnerabilities, context, { + alreadyVerified: scanState.verifiedFindings, + onFindingVerified: async (id: string, result: any) => { + await this.scanLog.append({ step: 'verify', finding_id: id, result }); + }, + }); const recursiveTokens = this.totalTokensUsed - recursiveStartTokens; // Count verification statuses instead of filtering @@ -610,42 +696,71 @@ export class Orchestrator { const vuln = vulnerabilities[vi]; if (this.config.poc.generate) { - try { - process.stdout.write(chalk.hex('#FF8C00')(`\r ⚡ POC ${vi + 1}/${vulnerabilities.length}: Generating for ${vuln.id}...`)); - const poc = await this.pocGen.generate(vuln, context); - - // Anti-hallucination: Validate POC actually works - if (this.config.poc.validate) { - process.stdout.write(chalk.hex('#FF8C00')(`\r ⚡ POC ${vi + 1}/${vulnerabilities.length}: Validating ${vuln.id}... 
`)); - const isValid = await this.pocGen.validate(poc); - if (isValid) { - vuln.poc = poc; - vuln.poc.validated = true; - process.stdout.write('\r' + ' '.repeat(100) + '\r'); - console.log(chalk.green(` ✓ ${vuln.id}: POC validated`)); + // Restore POC from scan log if this finding was already handled in a prior run + const savedPoc = scanState.pocResults.get(vuln.id); + if (savedPoc) { + if (savedPoc.poc) { + vuln.poc = savedPoc.poc; + } + if (savedPoc.needsManualReview) { + vuln.needsManualReview = true; + } + process.stdout.write('\r' + ' '.repeat(100) + '\r'); + console.log(chalk.gray(` ${vuln.id}: POC restored from scan log (${savedPoc.status})`)); + } else { + try { + process.stdout.write(chalk.hex('#FF8C00')(`\r ⚡ POC ${vi + 1}/${vulnerabilities.length}: Generating for ${vuln.id}...`)); + const poc = await this.pocGen.generate(vuln, context); + + // Anti-hallucination: Validate POC actually works + if (this.config.poc.validate) { + process.stdout.write(chalk.hex('#FF8C00')(`\r ⚡ POC ${vi + 1}/${vulnerabilities.length}: Validating ${vuln.id}... 
`)); + const isValid = await this.pocGen.validate(poc); + if (isValid) { + vuln.poc = poc; + vuln.poc.validated = true; + process.stdout.write('\r' + ' '.repeat(100) + '\r'); + console.log(chalk.green(` ✓ ${vuln.id}: POC validated`)); + await this.scanLog.append({ + step: 'poc', finding_id: vuln.id, + result: { status: 'success', poc: vuln.poc, validated: true }, + }); + } else { + // POC didn't work - KEEP THE FINDING but mark it + vuln.poc = poc; + vuln.poc.validated = false; + vuln.needsManualReview = true; + if (!vuln.verificationStatus) { + vuln.verificationStatus = 'unverified'; + } + process.stdout.write('\r' + ' '.repeat(100) + '\r'); + console.log(chalk.yellow(` ⚠ ${vuln.id}: POC validation failed - marked for manual review`)); + await this.scanLog.append({ + step: 'poc', finding_id: vuln.id, + result: { status: 'failed', poc: vuln.poc, validated: false, needsManualReview: true }, + }); + } } else { - // POC didn't work - KEEP THE FINDING but mark it + // Validation skipped vuln.poc = poc; vuln.poc.validated = false; - vuln.needsManualReview = true; - if (!vuln.verificationStatus) { - vuln.verificationStatus = 'unverified'; - } process.stdout.write('\r' + ' '.repeat(100) + '\r'); - console.log(chalk.yellow(` ⚠ ${vuln.id}: POC validation failed - marked for manual review`)); + console.log(chalk.gray(` ${vuln.id}: POC generated (validation skipped)`)); + await this.scanLog.append({ + step: 'poc', finding_id: vuln.id, + result: { status: 'skipped', poc: vuln.poc }, + }); } - } else { - // Validation skipped - vuln.poc = poc; - vuln.poc.validated = false; + } catch (error) { + // POC generation failed - STILL KEEP THE FINDING process.stdout.write('\r' + ' '.repeat(100) + '\r'); - console.log(chalk.gray(` ${vuln.id}: POC generated (validation skipped)`)); + console.log(chalk.yellow(` ⚠ ${vuln.id}: POC generation failed, reported without POC`)); + vuln.needsManualReview = true; + await this.scanLog.append({ + step: 'poc', finding_id: vuln.id, + result: { status: 
'error', needsManualReview: true }, + }); } - } catch (error) { - // POC generation failed - STILL KEEP THE FINDING - process.stdout.write('\r' + ' '.repeat(100) + '\r'); - console.log(chalk.yellow(` ⚠ ${vuln.id}: POC generation failed, reported without POC`)); - vuln.needsManualReview = true; } } diff --git a/src/recursive/recursive-strategy.ts b/src/recursive/recursive-strategy.ts index fffed8e..7fb8651 100644 --- a/src/recursive/recursive-strategy.ts +++ b/src/recursive/recursive-strategy.ts @@ -32,7 +32,13 @@ export class RecursiveStrategyEngine { async apply( vulnerabilities: Vulnerability[], - context: CodeContext + context: CodeContext, + options?: { + /** Findings already verified in a prior run — skip re-verification for these. */ + alreadyVerified?: Map; + /** Called after each finding is verified; use to append scan-log entries. */ + onFindingVerified?: (id: string, result: any) => Promise; + } ): Promise { if (!this.config.enabled) { return vulnerabilities.map(v => ({ ...v, recursive: null })); @@ -41,6 +47,21 @@ export class RecursiveStrategyEngine { const enhanced: EnhancedVulnerability[] = []; for (const vuln of vulnerabilities) { + // Restore from scan log if this finding was already verified in a prior run + const savedVerify = options?.alreadyVerified?.get(vuln.id); + if (savedVerify) { + console.log(` Recursive analysis: ${vuln.id} (restored from scan log)`); + enhanced.push({ + ...vuln, + recursive: null, + verificationStatus: savedVerify.status ?? 'verified', + confidence: savedVerify.confidence ?? 'high', + needsManualReview: savedVerify.needsManualReview ?? 
false, + contradictions: savedVerify.contradictions, + }); + continue; + } + console.log(` Recursive analysis: ${vuln.id}`); let recursiveData: RecursiveAnalysis | null = null; @@ -136,14 +157,25 @@ export class RecursiveStrategyEngine { const passedExploitabilityChecks = exploitabilityProof.filter(p => p.startsWith('✓')).length; const totalExploitabilityChecks = 5; // 5 validations + const derivedConfidence = verificationStatus === 'verified' ? 'high' + : verificationStatus === 'uncertain' ? 'medium' : 'low'; + + if (options?.onFindingVerified) { + await options.onFindingVerified(vuln.id, { + status: verificationStatus, + confidence: derivedConfidence, + needsManualReview: verificationStatus !== 'verified', + contradictions: contradictions.length > 0 ? contradictions : undefined, + }); + } + enhanced.push({ ...vuln, poc: refinedPOC, recursive: recursiveData, verificationStatus, contradictions: contradictions.length > 0 ? contradictions : undefined, - confidence: verificationStatus === 'verified' ? 'high' : - verificationStatus === 'uncertain' ? 'medium' : 'low', + confidence: derivedConfidence, needsManualReview: verificationStatus !== 'verified', // GOD-LEVEL: Add recursive exploitability proof recursiveExploitabilityProof: exploitabilityProof.length > 0 ? { diff --git a/src/utils/scan-log.ts b/src/utils/scan-log.ts new file mode 100644 index 0000000..2421454 --- /dev/null +++ b/src/utils/scan-log.ts @@ -0,0 +1,146 @@ +import * as fs from 'fs/promises'; +import * as path from 'path'; +import * as crypto from 'crypto'; + +export type ScanLogStepName = 'prioritize' | 'detect' | 'verify' | 'poc' | 'sarif'; + +export interface ScanLogEntry { + step: ScanLogStepName; + ts: string; + chunk?: number; + files?: string[]; + finding_id?: string; + result: any; +} + +export interface DetectedChunk { + files: string[]; + findings: any[]; +} + +export interface ScanState { + prioritizedFiles: string[] | null; + /** Keyed by ScanLog.chunkKey(files) — order-independent. 
*/ + detectedChunks: Map; + /** finding_id → saved verify result */ + verifiedFindings: Map; + /** finding_id → saved poc result */ + pocResults: Map; + sarifWritten: boolean; +} + +/** + * Append-only JSONL scan log. Each completed sub-step appends one line: + * + * {"step":"prioritize","result":{"files":[...]},"ts":"..."} + * {"step":"detect","chunk":1,"files":[...],"result":{"findings":[...]},"ts":"..."} + * {"step":"verify","finding_id":"CTAE-001","result":{"status":"verified"},"ts":"..."} + * {"step":"poc","finding_id":"CTAE-001","result":{"status":"success","poc":{...}},"ts":"..."} + * + * On resume: read all lines, replay completed steps, continue from the first + * missing one. Partial writes don't corrupt earlier state. + */ +export class ScanLog { + private logFile: string; + + constructor(logFile: string) { + this.logFile = logFile; + } + + static getLogFile(targetPath: string, sandyaaDir: string): string { + const hash = crypto.createHash('sha256') + .update(path.resolve(targetPath)) + .digest('hex') + .substring(0, 12); + return path.join(sandyaaDir, `scan-log-${hash}.jsonl`); + } + + /** Append one step entry. Safe to call concurrently — each line is atomic. */ + async append(entry: Omit): Promise { + try { + const dir = path.dirname(this.logFile); + await fs.mkdir(dir, { recursive: true }); + const line = JSON.stringify({ ...entry, ts: new Date().toISOString() }) + '\n'; + await fs.appendFile(this.logFile, line, 'utf-8'); + } catch (error) { + console.warn('Failed to append to scan log:', error); + } + } + + /** Read all entries and materialise a resume-ready ScanState. 
*/ + async loadState(): Promise { + const state: ScanState = { + prioritizedFiles: null, + detectedChunks: new Map(), + verifiedFindings: new Map(), + pocResults: new Map(), + sarifWritten: false, + }; + + try { + const content = await fs.readFile(this.logFile, 'utf-8'); + const lines = content.split('\n').filter((l: string) => l.trim()); + + for (const line of lines) { + try { + const entry: ScanLogEntry = JSON.parse(line); + switch (entry.step) { + case 'prioritize': + state.prioritizedFiles = entry.result?.files ?? null; + break; + case 'detect': + if (entry.files) { + const key = ScanLog.chunkKey(entry.files); + state.detectedChunks.set(key, { + files: entry.files, + findings: entry.result?.findings ?? [], + }); + } + break; + case 'verify': + if (entry.finding_id) { + state.verifiedFindings.set(entry.finding_id, entry.result); + } + break; + case 'poc': + if (entry.finding_id) { + state.pocResults.set(entry.finding_id, entry.result); + } + break; + case 'sarif': + state.sarifWritten = entry.result?.written === true; + break; + } + } catch { + // Skip malformed lines — partial writes don't corrupt prior entries + } + } + } catch { + // File doesn't exist yet + } + + return state; + } + + /** Stable, order-independent key for a set of file paths. */ + static chunkKey(files: string[]): string { + return [...files].sort().join('\0'); + } + + async exists(): Promise { + try { + await fs.access(this.logFile); + return true; + } catch { + return false; + } + } + + async clear(): Promise { + try { + await fs.unlink(this.logFile); + } catch { + // Ignore + } + } +} From 7b3a0c04b4b345f61fe9cadf79a19ffa984878cd Mon Sep 17 00:00:00 2001 From: "Finn (EACG)" Date: Thu, 11 Jun 2026 07:00:58 +0000 Subject: [PATCH 2/2] fix(checkpoint): address PR review findings on scan-log robustness MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - fix(scan-log): normalise paths in chunkKey() to forward-slash absolute paths so relative vs. 
absolute and cross-OS paths produce the same cache key (silent cache miss on resume fixed) - fix(scan-log): replace silent console.warn in append() with a sticky writeFailed flag; emit console.error once with chalk.red so disk-full and permission errors are visible instead of silently losing durability - fix(scan-log): extract applyToMemState() helper; maintain an in-memory ScanState updated on every append(); expose getState() so future callers avoid re-reading the growing JSONL file - fix(orchestrator): guard restored allVulnerabilities against ghost findings — only apply verifiedFindings/pocResults whose IDs are present in the current chunk's detectedChunks findings set; stale verify/poc entries from prior aborted runs no longer bleed into the final report - fix(orchestrator): remove default value from processChunk's scanState parameter so TypeScript enforces the argument at every call-site - fix(context-analyzer): guard os.cpus().length with Math.max(1, ...) to handle restricted container environments that return an empty array Co-Authored-By: Claude Sonnet 4.6 --- src/analyzer/context-analyzer.ts | 3 +- src/orchestrator/orchestrator.ts | 18 +++--- src/utils/scan-log.ts | 97 +++++++++++++++++++++----------- 3 files changed, 74 insertions(+), 44 deletions(-) diff --git a/src/analyzer/context-analyzer.ts b/src/analyzer/context-analyzer.ts index 62cd666..9b71fb6 100644 --- a/src/analyzer/context-analyzer.ts +++ b/src/analyzer/context-analyzer.ts @@ -454,7 +454,8 @@ export class ContextAnalyzer { if (Number.isFinite(raw) && raw >= 1 && raw <= 8) return raw; // Scale with available CPUs: leave 2 threads for the orchestrator process. // Clamp to [2, 8] so we stay aggressive on beefy machines but sane elsewhere. - const cpuCount = os.cpus().length; + // Guard against os.cpus() returning [] in restricted container environments. 
+ const cpuCount = Math.max(1, os.cpus().length); return Math.max(2, Math.min(8, cpuCount * 2 - 2)); })(); diff --git a/src/orchestrator/orchestrator.ts b/src/orchestrator/orchestrator.ts index 731f33c..3d0acfe 100644 --- a/src/orchestrator/orchestrator.ts +++ b/src/orchestrator/orchestrator.ts @@ -349,21 +349,27 @@ export class Orchestrator { // Restore allVulnerabilities for chunks already completed in a prior run. // This ensures the final SARIF report is complete even on a resumed scan. + // + // Guard: only apply verifiedFindings/pocResults whose finding IDs are present + // in the current chunk's findings. A prior run may have detected finding B, + // but a re-detection after a code change only produced finding A — B's verify/poc + // entries must not bleed into this run's output as ghost findings. const allVulnerabilities: any[] = []; for (const [, chunkData] of scanState.detectedChunks) { const allFilesProcessed = chunkData.files.every(f => processedFiles.has(f)); if (!allFilesProcessed) continue; // Will be (re-)processed in the main loop below + const currentFindingIds = new Set(chunkData.findings.map((f: any) => f.id)); for (const finding of chunkData.findings) { const enriched = { ...finding }; const verifyResult = scanState.verifiedFindings.get(finding.id); - if (verifyResult) { + if (verifyResult && currentFindingIds.has(finding.id)) { enriched.verificationStatus = verifyResult.status; enriched.confidence = verifyResult.confidence; enriched.needsManualReview = verifyResult.needsManualReview; if (verifyResult.contradictions) enriched.contradictions = verifyResult.contradictions; } const pocResult = scanState.pocResults.get(finding.id); - if (pocResult?.poc) { + if (pocResult?.poc && currentFindingIds.has(finding.id)) { enriched.poc = { ...pocResult.poc }; } allVulnerabilities.push(enriched); @@ -511,13 +517,7 @@ export class Orchestrator { totalBugsFound: number, estimatedChunksRemaining: number, totalFilesCount: number, - scanState: ScanState = { - 
prioritizedFiles: null, - detectedChunks: new Map(), - verifiedFindings: new Map(), - pocResults: new Map(), - sarifWritten: false, - } + scanState: ScanState ): Promise<{ bugsFound: number; findings: any[] }> { console.log(chalk.bold(`\n[${phase}] Chunk ${iteration} (${chunk.length} files | ~${estimatedChunksRemaining} chunks remaining)`)); console.log(chalk.gray(` ${this.dynamicChunker.getExplanation()}`)); diff --git a/src/utils/scan-log.ts b/src/utils/scan-log.ts index 2421454..38f89a2 100644 --- a/src/utils/scan-log.ts +++ b/src/utils/scan-log.ts @@ -1,6 +1,7 @@ import * as fs from 'fs/promises'; import * as path from 'path'; import * as crypto from 'crypto'; +import chalk from 'chalk'; export type ScanLogStepName = 'prioritize' | 'detect' | 'verify' | 'poc' | 'sarif'; @@ -42,6 +43,14 @@ export interface ScanState { */ export class ScanLog { private logFile: string; + private writeFailed = false; + private memState: ScanState = { + prioritizedFiles: null, + detectedChunks: new Map(), + verifiedFindings: new Map(), + pocResults: new Map(), + sarifWritten: false, + }; constructor(logFile: string) { this.logFile = logFile; @@ -57,19 +66,30 @@ export class ScanLog { /** Append one step entry. Safe to call concurrently — each line is atomic. */ async append(entry: Omit): Promise { + const full: ScanLogEntry = { ...entry, ts: new Date().toISOString() } as ScanLogEntry; + this.applyToMemState(full); try { const dir = path.dirname(this.logFile); await fs.mkdir(dir, { recursive: true }); - const line = JSON.stringify({ ...entry, ts: new Date().toISOString() }) + '\n'; + const line = JSON.stringify(full) + '\n'; await fs.appendFile(this.logFile, line, 'utf-8'); } catch (error) { - console.warn('Failed to append to scan log:', error); + if (!this.writeFailed) { + this.writeFailed = true; + console.error(chalk.red(`[scan-log] Write failed — resume capability lost for this run: ${error}`)); + } } } - /** Read all entries and materialise a resume-ready ScanState. 
*/ + /** Return the current in-memory state (updated on every append, no disk read). */ + getState(): ScanState { + return this.memState; + } + + /** Read all entries from disk and materialise a resume-ready ScanState. + * Also populates the in-memory state so subsequent getState() calls are free. */ async loadState(): Promise { - const state: ScanState = { + this.memState = { prioritizedFiles: null, detectedChunks: new Map(), verifiedFindings: new Map(), @@ -84,33 +104,7 @@ export class ScanLog { for (const line of lines) { try { const entry: ScanLogEntry = JSON.parse(line); - switch (entry.step) { - case 'prioritize': - state.prioritizedFiles = entry.result?.files ?? null; - break; - case 'detect': - if (entry.files) { - const key = ScanLog.chunkKey(entry.files); - state.detectedChunks.set(key, { - files: entry.files, - findings: entry.result?.findings ?? [], - }); - } - break; - case 'verify': - if (entry.finding_id) { - state.verifiedFindings.set(entry.finding_id, entry.result); - } - break; - case 'poc': - if (entry.finding_id) { - state.pocResults.set(entry.finding_id, entry.result); - } - break; - case 'sarif': - state.sarifWritten = entry.result?.written === true; - break; - } + this.applyToMemState(entry); } catch { // Skip malformed lines — partial writes don't corrupt prior entries } @@ -119,12 +113,47 @@ export class ScanLog { // File doesn't exist yet } - return state; + return this.memState; + } + + private applyToMemState(entry: ScanLogEntry): void { + switch (entry.step) { + case 'prioritize': + this.memState.prioritizedFiles = entry.result?.files ?? null; + break; + case 'detect': + if (entry.files) { + const key = ScanLog.chunkKey(entry.files); + this.memState.detectedChunks.set(key, { + files: entry.files, + findings: entry.result?.findings ?? 
[], + }); + } + break; + case 'verify': + if (entry.finding_id) { + this.memState.verifiedFindings.set(entry.finding_id, entry.result); + } + break; + case 'poc': + if (entry.finding_id) { + this.memState.pocResults.set(entry.finding_id, entry.result); + } + break; + case 'sarif': + this.memState.sarifWritten = entry.result?.written === true; + break; + } } - /** Stable, order-independent key for a set of file paths. */ + /** Stable, order-independent key for a set of file paths. + * Normalises to absolute forward-slash paths so relative vs. absolute + * and Windows vs. POSIX paths all produce the same key. */ static chunkKey(files: string[]): string { - return [...files].sort().join('\0'); + return [...files] + .map(f => path.resolve(f).split(path.sep).join('/')) + .sort() + .join('\0'); } async exists(): Promise {