Skip to content
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
54 changes: 29 additions & 25 deletions scripts/benchmark-comparators.mjs
Original file line number Diff line number Diff line change
Expand Up @@ -14,12 +14,12 @@
import path from 'path';
import { fileURLToPath } from 'url';
import { readFileSync, writeFileSync, mkdirSync, existsSync } from 'fs';
import { execSync, exec } from 'child_process';
import { execSync, execFile } from 'child_process';
import { parseArgs } from 'util';
import { promisify } from 'util';
import { withManagedStdioClientSession } from './lib/managed-mcp-session.mjs';

const execAsync = promisify(exec);
const execFileAsync = promisify(execFile);

const __dirname = path.dirname(fileURLToPath(import.meta.url));
const projectRoot = path.join(__dirname, '..');
Expand Down Expand Up @@ -96,15 +96,14 @@ function estimateTokens(bytes) {
* - searchArgs(task): map frozen task to tool arguments
* - extractPayload(result): extract string payload from MCP tool response
*/
const pythonCmd = process.platform === 'win32' ? 'python' : 'python3';

const COMPARATOR_ADAPTERS = [
{
name: 'codebase-memory-mcp',
checkInstalled() {
try {
// Installed via curl installer to ~/.local/bin or similar; also available via npx
execSync('which codebase-memory-mcp 2>/dev/null || npx --yes codebase-memory-mcp --version 2>/dev/null', {
stdio: 'pipe'
});
execSync('npx --yes codebase-memory-mcp --version', { stdio: 'pipe', timeout: 30000 });
return true;
} catch {
return false;
Expand All @@ -124,7 +123,7 @@ const COMPARATOR_ADAPTERS = [
serverCommand: 'npx',
serverArgs: ['--yes', 'codebase-memory-mcp'],
serverEnv: {},
initTimeout: 5000,
initTimeout: 10000,
indexTool: null, // auto-indexes on first query
searchTool: 'search_code',
searchArgs(task) {
Expand All @@ -141,23 +140,23 @@ const COMPARATOR_ADAPTERS = [
name: 'jCodeMunch',
checkInstalled() {
try {
execSync('python3 -c "import jcodemunch" 2>/dev/null', { stdio: 'pipe' });
execSync(`${pythonCmd} -c "import jcodemunch"`, { stdio: 'pipe' });
return true;
} catch {
return false;
}
},
async install() {
try {
execSync('pip install jcodemunch-mcp', { stdio: 'pipe', timeout: 120000 });
execSync(`${pythonCmd} -m pip install jcodemunch-mcp`, { stdio: 'pipe', timeout: 120000 });
} catch (err) {
throw new Error(`jCodeMunch install failed: ${err.message}`);
}
},
serverCommand: 'python3',
serverCommand: pythonCmd,
serverArgs: ['-m', 'jcodemunch.server'],
serverEnv: {},
initTimeout: 8000,
initTimeout: 15000,
indexTool: 'index_folder',
indexArgs(rootPath) {
return { path: path.resolve(rootPath) };
Expand All @@ -182,7 +181,7 @@ const COMPARATOR_ADAPTERS = [
name: 'GrepAI',
checkInstalled() {
try {
execSync('which grepai 2>/dev/null', { stdio: 'pipe' });
execSync('grepai --version', { stdio: 'pipe' });
return true;
} catch {
return false;
Expand All @@ -191,7 +190,7 @@ const COMPARATOR_ADAPTERS = [
async install() {
// GrepAI requires a Go binary + Ollama embedding provider. Likely setup_failed without Ollama.
try {
execSync('which grepai', { stdio: 'pipe' });
execSync('grepai --version', { stdio: 'pipe' });
} catch {
throw new Error(
'GrepAI requires Go binary installation (Homebrew: brew install yoanbernabeu/tap/grepai) ' +
Expand Down Expand Up @@ -220,23 +219,23 @@ const COMPARATOR_ADAPTERS = [
name: 'CodeGraphContext',
checkInstalled() {
try {
execSync('python3 -c "import codegraphcontext" 2>/dev/null', { stdio: 'pipe' });
execSync(`${pythonCmd} -c "import codegraphcontext"`, { stdio: 'pipe' });
return true;
} catch {
return false;
}
},
async install() {
try {
execSync('pip install codegraphcontext', { stdio: 'pipe', timeout: 120000 });
execSync(`${pythonCmd} -m pip install codegraphcontext`, { stdio: 'pipe', timeout: 120000 });
} catch (err) {
throw new Error(
`CodeGraphContext install failed: ${err.message}. ` +
'Requires Python 3.9+ and either Neo4j or FalkorDB Lite.'
);
}
},
serverCommand: 'python3',
serverCommand: pythonCmd,
serverArgs: ['-m', 'codegraphcontext.server'],
serverEnv: {},
initTimeout: 15000,
Expand All @@ -261,16 +260,15 @@ const COMPARATOR_ADAPTERS = [
name: 'raw Claude Code',
checkInstalled() {
try {
execSync('claude --version 2>/dev/null', { stdio: 'pipe' });
execSync('claude --version', { stdio: 'pipe' });
return true;
} catch {
return false;
}
},
async install() {
throw new Error(
'raw Claude Code baseline requires the Claude Code CLI (claude) to be installed and authenticated. ' +
'This is the manual-log-capture baseline — record as pending_evidence if claude CLI is unavailable.'
'raw Claude Code baseline requires the claude CLI. Install: npm install -g @anthropic-ai/claude-code'
);
},
// raw Claude Code is not an MCP server; handled separately via claude -p
Expand Down Expand Up @@ -411,11 +409,17 @@ async function runRawClaudeCode(rootPath, tasks) {

try {
const prompt = `You are exploring a codebase at ${path.resolve(rootPath)}. Answer this question using only grep, glob, and read file operations: ${task.prompt}`;
const { stdout } = await execAsync(
`claude -p "${prompt.replace(/"/g, '\\"')}" --allowedTools "Read,Grep,Glob"`,
{ timeout: 60000, cwd: path.resolve(rootPath) }
const { stdout } = await execFileAsync(
'claude',
['-p', prompt, '--output-format', 'json', '--allowedTools', 'Read,Grep,Glob'],
{ timeout: 120000, cwd: path.resolve(rootPath), shell: process.platform === 'win32' }
);
Comment on lines +412 to 416
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

P1 execFileAsync won't resolve .cmd wrappers on Windows

child_process.execFile (and its promisified form) does not use a shell, so it cannot locate .cmd files through PATHEXT resolution. When @anthropic-ai/claude-code is installed globally via npm on Windows, npm creates claude.cmd — not claude.exe. Calling execFileAsync('claude', ...) without shell: true will throw ENOENT on Windows, even when the CLI is correctly installed and execSync('claude --version') (which does use cmd.exe internally) returns cleanly in checkInstalled(). The result would be setup_failed for the baseline lane on the very platform this PR targets.

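Use shell: process.platform === 'win32' to get shell-based .cmd resolution on Windows. Caveat: when the shell option is enabled, Node joins the args array into a single command string without escaping it, so the array form alone does not protect the prompt — any spaces, quotes, or shell metacharacters in it must be quoted or escaped on the Windows path: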

const { stdout } = await execFileAsync(
  'claude',
  ['-p', prompt, '--output-format', 'json', '--allowedTools', 'Read,Grep,Glob'],
  { timeout: 120000, cwd: path.resolve(rootPath), shell: process.platform === 'win32' }
);

Or, if Claude Code ships a native .exe on Windows and this never arose during testing, the concern doesn't apply — but it's worth confirming before closing the Windows ticket.

payload = stdout;
try {
const parsed = JSON.parse(stdout);
payload = parsed.result ?? stdout;
} catch {
payload = stdout;
}
} catch (err) {
if (err.code === 'ENOENT' || err.message?.includes('command not found')) {
throw new Error('claude CLI not found');
Expand Down Expand Up @@ -510,8 +514,8 @@ async function runComparator(adapter, repoPaths, allFixtures) {
} catch (err) {
if (err.message.includes('claude CLI not found')) {
return {
status: 'pending_evidence',
reason: 'claude CLI not available. Run manually with: claude -p "<task>" --allowedTools "Read,Grep,Glob"'
status: 'setup_failed',
reason: 'claude CLI not found — required for baseline. Install: npm install -g @anthropic-ai/claude-code'
};
}
return { status: 'setup_failed', reason: err.message };
Expand Down
Loading