Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 7 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -55,3 +55,10 @@ The experiment config will specify:
- Evaluations (evals) that are used to grade the output of the model
- Treatments that specify the different conditions you would like to test (for
example, testing with an MCP server versus without)

## Run output

Experiment runs show progress as treatments start and finish. Command stdout and
stderr from each treatment are saved under that treatment's artifact directory as
`stdout.log` and `stderr.log`. Pass `--show-output` to also mirror run output to
stdout, with each line prefixed by the experiment, treatment, eval, model, and stream.
45 changes: 45 additions & 0 deletions packages/agent-eval/src/cli.ts
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,10 @@ const {values} = parseArgs({
short: 'e',
description: 'The file name of the experiment to run',
},
'show-output': {
type: 'boolean',
description: 'Show stdout and stderr from experiment runs, prefixed with experiment context',
},
},
})

Expand All @@ -41,6 +45,7 @@ const MAX_CONCURRENCY =
Number.isFinite(parsedConcurrency) && Number.isInteger(parsedConcurrency) && parsedConcurrency >= 1
? parsedConcurrency
: 1
// Whether run output should be mirrored to the console; falls back to false when
// the boolean `show-output` option has no parsed value.
const SHOW_OUTPUT = values['show-output'] ?? false
const experimentConfigs: Array<ExperimentConfig> = []

if (!existsSync(ARTIFACTS_DIR)) {
Expand Down Expand Up @@ -181,6 +186,24 @@ function formatNumber(value: number): string {
return new Intl.NumberFormat('en-US').format(value)
}

/**
 * Builds the label that identifies a treatment in console output.
 *
 * @param treatment - The treatment whose experiment name, config name, eval id
 *   and model are combined.
 * @returns The four parts joined with ' | ', in that order.
 */
function getTreatmentPrefix(treatment: Treatment): string {
  const {experiment, config, eval: evaluation, model} = treatment
  const parts = [experiment.name, config.name, evaluation.id, model]
  return parts.join(' | ')
}

/**
 * Writes a chunk of text to stdout with `prefix` prepended to every line.
 *
 * The chunk is split on '\n'. Each resulting segment is prefixed, except an
 * empty final segment (the remainder after a trailing newline, or an empty
 * chunk), which is emitted as-is so no dangling prefix is printed.
 *
 * @param prefix - Text placed in front of each line.
 * @param chunk - Raw text to mirror; may contain zero or more newlines.
 */
function writePrefixedOutput(prefix: string, chunk: string) {
  const segments = chunk.split('\n')
  const lastPosition = segments.length - 1
  const rendered: Array<string> = []

  for (const [position, text] of segments.entries()) {
    // Only the final segment is left bare when empty; empty lines in the
    // middle of the chunk still receive the prefix.
    const isTrailingEmpty = position === lastPosition && text === ''
    rendered.push(isTrailingEmpty ? '' : prefix + text)
  }

  process.stdout.write(rendered.join('\n'))
}

// One table row: string keys mapped to string or number cell values.
type TableRow = Record<string, string | number>

function formatTable(rows: Array<TableRow>, columns: Array<string>): string {
Expand Down Expand Up @@ -387,6 +410,28 @@ for (const config of experimentConfigs) {
artifactsDirectory: ARTIFACTS_DIR,
copilotToken: COPILOT_GITHUB_TOKEN,
maxConcurrency: MAX_CONCURRENCY,
// Routes run events to the console:
// - 'progress' events always print a one-line completion summary.
// - 'log' events at level 'error' always go to console.error; other levels are
//   printed only when SHOW_OUTPUT is set.
// - any other event is treated as a stream chunk and mirrored through
//   writePrefixedOutput only when SHOW_OUTPUT is set.
onEvent(event) {
  switch (event.type) {
    case 'progress': {
      console.log(
        `Progress: ${event.completed}/${event.total} completed, ${event.running} running, ${event.remaining} left`,
      )
      return
    }

    case 'log': {
      // Logs without a treatment are attributed to the experiment config instead.
      const label = event.treatment ? `[${getTreatmentPrefix(event.treatment)}]` : `[${config.name}]`
      if (event.level === 'error') {
        console.error(`${label} ${event.message}`)
        return
      }
      if (SHOW_OUTPUT) {
        console.log(`${label} ${event.message}`)
      }
      return
    }

    default: {
      if (!SHOW_OUTPUT) {
        return
      }
      writePrefixedOutput(`[${getTreatmentPrefix(event.treatment)} | ${event.stream}] `, event.chunk)
    }
  }
},
})
results.push(...runResults)
}
Expand Down
Loading