From ced79850527c1e5cd5db59d1f6842f2f06f1aec4 Mon Sep 17 00:00:00 2001 From: zhuhao Date: Wed, 6 May 2026 20:40:15 +0800 Subject: [PATCH 1/6] feat(eval-search): add executable evaluation harness Move eval-search harness, no-filter pollution semantics, and answer-oriented search guidance onto the larksuite/cli main branch baseline. Change-Id: I399cf5772e3d6e35d42c0aa7f826dc7d2da52787 Migrated-From: code.byted.org/lark_search/larksuite_fork eval-search_codex --- .gitignore | 3 + .harness/plan.example.json | 287 ++++++ package-lock.json | 39 +- package.json | 6 + scripts/harness-runner.ts | 771 +++++++++++++++ skills/dev/SKILL.md | 88 ++ skills/eval-search/RUBRIC.md | 115 +++ skills/eval-search/SKILL.md | 159 ++++ skills/eval-search/prompts/executor.md | 73 ++ skills/eval-search/prompts/judge.md | 97 ++ skills/eval-search/prompts/optimizer.md | 150 +++ skills/eval-search/references/dataset.md | 127 +++ .../references/known-tainted-tokens.md | 75 ++ .../references/open-repo-layout.md | 162 ++++ .../references/pollution-preflight.md | 103 ++ .../eval-search/references/pr-generation.md | 277 ++++++ skills/eval-search/references/run-layout.md | 118 +++ skills/lark-doc/references/lark-doc-search.md | 6 + .../eval-search/eval-search-collect-search.ts | 888 +++++++++++++++++ tests/eval-search/eval-search-run.ts | 891 ++++++++++++++++++ tests/harness/sample-plan.json | 95 ++ tests/harness/self-correct-plan.json | 57 ++ tsconfig.harness.json | 24 + 23 files changed, 4609 insertions(+), 2 deletions(-) create mode 100644 .harness/plan.example.json create mode 100644 scripts/harness-runner.ts create mode 100644 skills/dev/SKILL.md create mode 100644 skills/eval-search/RUBRIC.md create mode 100644 skills/eval-search/SKILL.md create mode 100644 skills/eval-search/prompts/executor.md create mode 100644 skills/eval-search/prompts/judge.md create mode 100644 skills/eval-search/prompts/optimizer.md create mode 100644 skills/eval-search/references/dataset.md create mode 100644 
skills/eval-search/references/known-tainted-tokens.md create mode 100644 skills/eval-search/references/open-repo-layout.md create mode 100644 skills/eval-search/references/pollution-preflight.md create mode 100644 skills/eval-search/references/pr-generation.md create mode 100644 skills/eval-search/references/run-layout.md create mode 100644 tests/eval-search/eval-search-collect-search.ts create mode 100644 tests/eval-search/eval-search-run.ts create mode 100644 tests/harness/sample-plan.json create mode 100644 tests/harness/self-correct-plan.json create mode 100644 tsconfig.harness.json diff --git a/.gitignore b/.gitignore index 90313e480..435af6819 100644 --- a/.gitignore +++ b/.gitignore @@ -34,8 +34,11 @@ tests/mail/reports/ # Generated / test artifacts .hammer/ +.harness/runs/ +.harness_local/ internal/registry/meta_data.json cmd/api/download.bin app.log /sidecar-server-demo /server-demo +tests/eval-search/runs/ diff --git a/.harness/plan.example.json b/.harness/plan.example.json new file mode 100644 index 000000000..0f09132a9 --- /dev/null +++ b/.harness/plan.example.json @@ -0,0 +1,287 @@ +{ + "name": "eval-search-delivery-harness", + "version": 1, + "objective": "make the eval-search workflow executable, reviewable, and reusable while preserving blind search evaluation", + "target": { + "skill": "eval-search", + "outcome": "dataset snapshot -> blind executor evidence -> judge scoring -> optimizer-ready report" + }, + "inputs": [ + { + "id": "loader_profile", + "required": true, + "description": "lark-cli profile that can read the eval Base during dataset setup" + }, + { + "id": "executor_profile", + "required": true, + "description": "dedicated blind lark-cli profile that cannot read the eval Base" + }, + { + "id": "subset_or_dataset_file", + "required": false, + "description": "subset for smoke runs or dataset-file for two-step strict mode" + }, + { + "id": "eval_run_id", + "required": false, + "description": "stable run id under tests/eval-search/runs when 
reproducibility matters" + } + ], + "lifecycle": { + "id": "eval-search", + "goal": "bring lkkcli-style lifecycle control to the existing search evaluation harness", + "stage_order": [ + "prepare", + "understand", + "plan", + "act", + "verify", + "retrospect" + ] + }, + "constraints": { + "enforce_stage_order": true, + "state_root": "tests/eval-search/runs", + "role_isolation": [ + "loader profile may read the eval Base only during dataset setup", + "executor profile receives query/case_id/run_dir only and must not read expected answers", + "judge starts after executor trajectories are complete", + "optimizer receives aggregated verdicts instead of full raw trajectories" + ], + "allowed_write_paths": [ + "skills/eval-search/**", + "tests/eval-search/**", + "shortcuts/**", + ".harness/**" + ] + }, + "env": { + "EVAL_SEARCH_RUN_ROOT": "tests/eval-search/runs" + }, + "artifacts": [ + { + "id": "eval_skill", + "path": "skills/eval-search/SKILL.md", + "required": true + }, + { + "id": "rubric", + "path": "skills/eval-search/RUBRIC.md", + "required": true + }, + { + "id": "executor_prompt", + "path": "skills/eval-search/prompts/executor.md", + "required": true + }, + { + "id": "judge_prompt", + "path": "skills/eval-search/prompts/judge.md", + "required": true + }, + { + "id": "optimizer_prompt", + "path": "skills/eval-search/prompts/optimizer.md", + "required": true + }, + { + "id": "harness_runner_source", + "path": "scripts/harness-runner.ts", + "required": true + }, + { + "id": "setup_runner_source", + "path": "tests/eval-search/eval-search-run.ts", + "required": true + }, + { + "id": "evidence_collector_source", + "path": "tests/eval-search/eval-search-collect-search.ts", + "required": true + } + ], + "stages": [ + { + "id": "prepare", + "objective": "establish repo state and local tool availability before touching eval data", + "steps": [ + { + "id": "git_status", + "command": [ + "git", + "status", + "--short", + "--branch" + ], + "expect": { + "exitCode": 0 + } + 
}, + { + "id": "lark_cli_available", + "required": false, + "command": [ + "lark-cli", + "--version" + ], + "expect": { + "exitCode": 0 + } + } + ] + }, + { + "id": "understand", + "objective": "pin the eval-search contract before executing any case", + "steps": [ + { + "id": "skill_contract_mentions_roles", + "command": [ + "node", + "-e", + "const s=require('fs').readFileSync('skills/eval-search/SKILL.md','utf8'); for (const w of ['Executor','Judge','Optimizer','盲测']) if (!s.includes(w)) process.exit(1);" + ], + "expect": { + "exitCode": 0 + } + }, + { + "id": "rubric_exists", + "command": [ + "test", + "-f", + "skills/eval-search/RUBRIC.md" + ], + "expect": { + "exitCode": 0 + } + } + ] + }, + { + "id": "plan", + "objective": "prove the deterministic setup, blind collector, and scoring prompts are all wired", + "steps": [ + { + "id": "setup_help", + "command": [ + "node", + "--experimental-strip-types", + "tests/eval-search/eval-search-run.ts", + "--help" + ], + "expect": { + "exitCode": 0, + "stdoutIncludes": "--executor-profile" + } + }, + { + "id": "collector_help", + "command": [ + "node", + "--experimental-strip-types", + "tests/eval-search/eval-search-collect-search.ts", + "--help" + ], + "expect": { + "exitCode": 0, + "stdoutIncludes": "--fetch-top" + } + } + ] + }, + { + "id": "act", + "objective": "check the implementation pieces that produce setup and evidence artifacts", + "steps": [ + { + "id": "setup_runner_smoke", + "command": [ + "node", + "--experimental-strip-types", + "tests/eval-search/eval-search-run.ts", + "--help" + ], + "expect": { + "exitCode": 0, + "stdoutIncludes": "--executor-profile" + } + }, + { + "id": "collector_smoke", + "command": [ + "node", + "--experimental-strip-types", + "tests/eval-search/eval-search-collect-search.ts", + "--help" + ], + "expect": { + "exitCode": 0, + "stdoutIncludes": "--fetch-top" + } + } + ] + }, + { + "id": "verify", + "objective": "type-check TS and run local deterministic gates before any live 
eval-search run", + "steps": [ + { + "id": "typescript_check", + "command": [ + "npm", + "run", + "harness:check" + ], + "expect": { + "exitCode": 0 + } + }, + { + "id": "harness_runner_smoke", + "command": [ + "node", + "--experimental-strip-types", + "scripts/harness-runner.ts", + "--help" + ], + "expect": { + "exitCode": 0, + "stdoutIncludes": "--plan" + } + }, + { + "id": "skill_format", + "command": [ + "node", + "scripts/skill-format-check/index.js", + "skills" + ], + "expect": { + "exitCode": 0, + "stdoutIncludes": "Skill format check passed" + } + } + ] + }, + { + "id": "retrospect", + "objective": "capture the final repo state and contract summary for the next run", + "steps": [ + { + "id": "branch_state", + "command": [ + "git", + "status", + "--short", + "--branch" + ], + "expect": { + "exitCode": 0 + } + } + ] + } + ] +} diff --git a/package-lock.json b/package-lock.json index 5c63f1dc9..17691befc 100644 --- a/package-lock.json +++ b/package-lock.json @@ -1,12 +1,12 @@ { "name": "@larksuite/cli", - "version": "1.0.11", + "version": "1.0.23", "lockfileVersion": 3, "requires": true, "packages": { "": { "name": "@larksuite/cli", - "version": "1.0.11", + "version": "1.0.23", "cpu": [ "x64", "arm64" @@ -24,6 +24,10 @@ "bin": { "lark-cli": "scripts/run.js" }, + "devDependencies": { + "@types/node": "^25.6.0", + "typescript": "^6.0.3" + }, "engines": { "node": ">=16" } @@ -50,6 +54,16 @@ "sisteransi": "^1.0.5" } }, + "node_modules/@types/node": { + "version": "25.6.0", + "resolved": "https://registry.npmjs.org/@types/node/-/node-25.6.0.tgz", + "integrity": "sha512-+qIYRKdNYJwY3vRCZMdJbPLJAtGjQBudzZzdzwQYkEPQd+PJGixUL5QfvCLDaULoLv+RhT3LDkwEfKaAkgSmNQ==", + "dev": true, + "license": "MIT", + "dependencies": { + "undici-types": "~7.19.0" + } + }, "node_modules/fast-string-truncated-width": { "version": "1.2.1", "resolved": "https://registry.npmjs.org/fast-string-truncated-width/-/fast-string-truncated-width-1.2.1.tgz", @@ -79,6 +93,27 @@ "resolved": 
"https://registry.npmjs.org/sisteransi/-/sisteransi-1.0.5.tgz", "integrity": "sha512-bLGGlR1QxBcynn2d5YmDX4MGjlZvy2MRBDRNHLJ8VI6l6+9FUiyTFNJ0IveOSP0bcXgVDPRcfGqA0pjaqUpfVg==", "license": "MIT" + }, + "node_modules/typescript": { + "version": "6.0.3", + "resolved": "https://registry.npmjs.org/typescript/-/typescript-6.0.3.tgz", + "integrity": "sha512-y2TvuxSZPDyQakkFRPZHKFm+KKVqIisdg9/CZwm9ftvKXLP8NRWj38/ODjNbr43SsoXqNuAisEf1GdCxqWcdBw==", + "dev": true, + "license": "Apache-2.0", + "bin": { + "tsc": "bin/tsc", + "tsserver": "bin/tsserver" + }, + "engines": { + "node": ">=14.17" + } + }, + "node_modules/undici-types": { + "version": "7.19.2", + "resolved": "https://registry.npmjs.org/undici-types/-/undici-types-7.19.2.tgz", + "integrity": "sha512-qYVnV5OEm2AW8cJMCpdV20CDyaN3g0AjDlOGf1OW4iaDEx8MwdtChUp4zu4H0VP3nDRF/8RKWH+IPp9uW0YGZg==", + "dev": true, + "license": "MIT" } } } diff --git a/package.json b/package.json index 3d7d6310a..fc42df1ec 100644 --- a/package.json +++ b/package.json @@ -6,6 +6,8 @@ "lark-cli": "scripts/run.js" }, "scripts": { + "harness:run": "node --experimental-strip-types scripts/harness-runner.ts", + "harness:check": "tsc -p tsconfig.harness.json --noEmit", "postinstall": "node scripts/install.js" }, "os": [ @@ -34,5 +36,9 @@ ], "dependencies": { "@clack/prompts": "^1.2.0" + }, + "devDependencies": { + "@types/node": "^25.6.0", + "typescript": "^6.0.3" } } diff --git a/scripts/harness-runner.ts b/scripts/harness-runner.ts new file mode 100644 index 000000000..68ec06cc2 --- /dev/null +++ b/scripts/harness-runner.ts @@ -0,0 +1,771 @@ +#!/usr/bin/env node + +const { spawnSync } = require("node:child_process"); +const fs = require("node:fs"); +const path = require("node:path"); + +const DEFAULT_STAGE_ORDER = ["explore", "plan", "act", "verify", "retrospect"]; + +function usage() { + console.log(`Usage: + node --experimental-strip-types scripts/harness-runner.ts --plan [options] + +Options: + --plan JSON plan to execute + --run-id Run id, defaults 
to local timestamp + --run-root Artifact root, defaults to .harness/runs + --cwd Working directory, defaults to repo root/current cwd + --max-corrections Correction rounds per step, defaults to 2 + --format + --dry-run Return step results without executing commands + --help Show this help + +Plan schema: + { + "name": "demo", + "objective": "make eval-search execution reproducible", + "target": { + "skill": "eval-search", + "outcome": "blind search eval with scored summary" + }, + "inputs": [ + { "id": "loader_profile", "required": true }, + { "id": "executor_profile", "required": true } + ], + "lifecycle": { + "id": "eval-search", + "stage_order": ["prepare", "understand", "plan", "act", "verify", "retrospect"] + }, + "constraints": { + "enforce_stage_order": true, + "state_root": "tests/eval-search/runs", + "role_isolation": ["loader", "executor", "judge", "optimizer"] + }, + "artifacts": [ + { "id": "rubric", "path": "skills/eval-search/RUBRIC.md", "required": true } + ], + "stages": [ + { + "id": "prepare", + "steps": [ + { + "id": "git_status", + "command": ["git", "status", "--short", "--branch"], + "expect": { "exitCode": 0 }, + "corrections": [ + { "id": "show_status", "command": ["git", "status", "--short"] } + ] + } + ] + } + ] + } + +Every stage and step writes a structured result to the run directory. 
Failed +steps may run explicit correction steps, then retry themselves.`); +} + +function parseArgs(argv) { + const out: any = { + plan: "", + runId: "", + runRoot: "", + cwd: "", + maxCorrections: 2, + format: "pretty", + dryRun: false, + }; + for (let i = 0; i < argv.length; i += 1) { + const arg = argv[i]; + const next = () => { + if (i + 1 >= argv.length) { + throw new Error(`missing value for ${arg}`); + } + i += 1; + return argv[i]; + }; + if (arg === "--help" || arg === "-h") { + out.help = true; + } else if (arg === "--plan") { + out.plan = next(); + } else if (arg === "--run-id") { + out.runId = next(); + } else if (arg === "--run-root") { + out.runRoot = next(); + } else if (arg === "--cwd") { + out.cwd = next(); + } else if (arg === "--max-corrections") { + out.maxCorrections = Number.parseInt(next(), 10); + if (!Number.isFinite(out.maxCorrections) || out.maxCorrections < 0) { + throw new Error("--max-corrections must be a non-negative integer"); + } + } else if (arg === "--format") { + out.format = next(); + if (!["pretty", "json", "ndjson"].includes(out.format)) { + throw new Error("--format must be pretty, json, or ndjson"); + } + } else if (arg === "--dry-run") { + out.dryRun = true; + } else { + throw new Error(`unknown option ${arg}`); + } + } + if (!out.help && !out.plan) { + throw new Error("--plan is required"); + } + return out; +} + +function timestampId(date = new Date()) { + const tzOffsetMs = date.getTimezoneOffset() * 60 * 1000; + return new Date(date.getTime() - tzOffsetMs) + .toISOString() + .slice(0, 19) + .replace(/:/g, "-"); +} + +function repoRoot(cwd) { + const result = spawnSync("git", ["rev-parse", "--show-toplevel"], { + cwd, + encoding: "utf8", + }); + return result.status === 0 ? 
result.stdout.trim() : cwd; +} + +function ensureDir(dir) { + fs.mkdirSync(dir, { recursive: true }); +} + +function readJson(file) { + return JSON.parse(fs.readFileSync(file, "utf8")); +} + +function writeJson(file, value) { + fs.writeFileSync(file, `${JSON.stringify(value, null, 2)}\n`); +} + +function expandEnvVars(value, env) { + return String(value).replace(/\$\{?([A-Z_][A-Z0-9_]*)\}?/g, (match, key) => + Object.prototype.hasOwnProperty.call(env, key) ? env[key] : match, + ); +} + +function normalizePlan(plan) { + if (!Array.isArray(plan.stages) || plan.stages.length === 0) { + throw new Error("plan.stages must be a non-empty array"); + } + return { + name: plan.name || "harness", + version: plan.version || 1, + objective: plan.objective || "", + target: normalizeObject(plan.target, "target"), + inputs: normalizeInputs(plan.inputs || []), + lifecycle: normalizeLifecycle(plan.lifecycle || {}, plan.objective || ""), + constraints: normalizeConstraints(plan.constraints || {}), + env: normalizeEnv(plan.env || {}), + artifacts: normalizeArtifacts(plan.artifacts || []), + stages: plan.stages.map((stage, index) => { + if (!stage.id) { + throw new Error(`stage at index ${index} is missing id`); + } + if (!Array.isArray(stage.steps) || stage.steps.length === 0) { + throw new Error(`stage ${stage.id} must have at least one step`); + } + return { + id: stage.id, + objective: stage.objective || "", + required: stage.required !== false, + steps: stage.steps.map((step, stepIndex) => normalizeStep(step, stage.id, stepIndex)), + }; + }), + }; +} + +function normalizeObject(value, name) { + if (value === undefined || value === null) { + return {}; + } + if (typeof value !== "object" || Array.isArray(value)) { + throw new Error(`plan.${name} must be an object`); + } + return value; +} + +function normalizeLifecycle(lifecycle, objective) { + const stageOrder = lifecycle.stage_order || lifecycle.stageOrder || DEFAULT_STAGE_ORDER; + if (!Array.isArray(stageOrder) || 
stageOrder.some((stage) => typeof stage !== "string" || !stage)) { + throw new Error("plan.lifecycle.stage_order must be a non-empty string array"); + } + return { + id: lifecycle.id || lifecycle.kind || "dev", + goal: lifecycle.goal || objective || "", + stage_order: stageOrder, + }; +} + +function normalizeConstraints(constraints) { + const out = { ...constraints }; + out.enforce_stage_order = constraints.enforce_stage_order === true || constraints.enforceStageOrder === true; + out.state_root = constraints.state_root || constraints.stateRoot || ""; + out.role_isolation = Array.isArray(constraints.role_isolation) + ? constraints.role_isolation + : Array.isArray(constraints.roleIsolation) + ? constraints.roleIsolation + : []; + out.allowed_write_paths = Array.isArray(constraints.allowed_write_paths) + ? constraints.allowed_write_paths + : Array.isArray(constraints.allowedWritePaths) + ? constraints.allowedWritePaths + : []; + return out; +} + +function normalizeEnv(env) { + if (typeof env !== "object" || env === null || Array.isArray(env)) { + throw new Error("plan.env must be an object"); + } + return Object.fromEntries( + Object.entries(env).map(([key, value]) => { + if (!/^[A-Z_][A-Z0-9_]*$/.test(key)) { + throw new Error(`plan.env key ${key} must be UPPER_SNAKE_CASE`); + } + return [key, String(value)]; + }), + ); +} + +function normalizeArtifacts(artifacts) { + if (!Array.isArray(artifacts)) { + throw new Error("plan.artifacts must be an array"); + } + return artifacts.map((artifact, index) => { + if (!artifact.id) { + throw new Error(`artifact at index ${index} is missing id`); + } + if (!artifact.path) { + throw new Error(`artifact ${artifact.id} is missing path`); + } + return { + id: artifact.id, + path: artifact.path, + required: artifact.required !== false, + description: artifact.description || "", + }; + }); +} + +function normalizeInputs(inputs) { + if (!Array.isArray(inputs)) { + throw new Error("plan.inputs must be an array"); + } + return 
inputs.map((input, index) => { + if (!input.id) { + throw new Error(`input at index ${index} is missing id`); + } + return { + id: input.id, + required: input.required !== false, + description: input.description || "", + source: input.source || "", + }; + }); +} + +function normalizeStep(step, stageId, index) { + if (!step.id) { + throw new Error(`step at ${stageId}[${index}] is missing id`); + } + if (!step.command) { + throw new Error(`step ${stageId}.${step.id} is missing command`); + } + return { + id: step.id, + name: step.name || step.id, + command: step.command, + cwd: step.cwd || "", + timeoutMs: step.timeout_ms || step.timeoutMs || 10 * 60 * 1000, + required: step.required !== false, + expect: step.expect || { exitCode: 0 }, + maxAttempts: step.max_attempts || step.maxAttempts || 1, + corrections: Array.isArray(step.corrections) + ? step.corrections.map((correction, correctionIndex) => + normalizeCorrection(correction, stageId, step.id, correctionIndex), + ) + : [], + }; +} + +function normalizeCorrection(correction, stageId, stepId, index) { + if (!correction.id) { + throw new Error(`correction at ${stageId}.${stepId}[${index}] is missing id`); + } + if (!correction.command) { + throw new Error(`correction ${stageId}.${stepId}.${correction.id} is missing command`); + } + return { + id: correction.id, + name: correction.name || correction.id, + command: correction.command, + cwd: correction.cwd || "", + timeoutMs: correction.timeout_ms || correction.timeoutMs || 10 * 60 * 1000, + expect: correction.expect || { exitCode: 0 }, + }; +} + +function commandText(command) { + return Array.isArray(command) ? command.join(" ") : command; +} + +function tail(text, limit = 4000) { + const value = String(text || ""); + return value.length <= limit ? 
value : value.slice(value.length - limit); +} + +function runCommand(command, opts) { + if (opts.dryRun) { + return { + status: 0, + signal: null, + stdout: "", + stderr: "", + error: null, + dry_run: true, + }; + } + const env = { ...process.env, ...opts.env }; + if (Array.isArray(command)) { + const [cmd, ...args] = command; + const result = spawnSync(cmd, args, { + cwd: opts.cwd, + env, + encoding: "utf8", + timeout: opts.timeoutMs, + maxBuffer: 64 * 1024 * 1024, + }); + return normalizeCommandResult(result); + } + const result = spawnSync(command, { + cwd: opts.cwd, + env, + shell: true, + encoding: "utf8", + timeout: opts.timeoutMs, + maxBuffer: 64 * 1024 * 1024, + }); + return normalizeCommandResult(result); +} + +function normalizeCommandResult(result) { + return { + status: typeof result.status === "number" ? result.status : 1, + signal: result.signal || null, + stdout: result.stdout || "", + stderr: result.stderr || "", + error: result.error ? result.error.message : null, + dry_run: false, + }; +} + +function expectationPassed(result, expect) { + const failures = []; + const exitCode = expect.exitCode === undefined ? 
0 : expect.exitCode; + if (result.status !== exitCode) { + failures.push(`exit code ${result.status}, expected ${exitCode}`); + } + if (expect.stdoutIncludes && !result.stdout.includes(expect.stdoutIncludes)) { + failures.push(`stdout missing ${JSON.stringify(expect.stdoutIncludes)}`); + } + if (expect.stderrIncludes && !result.stderr.includes(expect.stderrIncludes)) { + failures.push(`stderr missing ${JSON.stringify(expect.stderrIncludes)}`); + } + if (expect.stdoutMatches) { + const re = new RegExp(expect.stdoutMatches); + if (!re.test(result.stdout)) { + failures.push(`stdout did not match /${expect.stdoutMatches}/`); + } + } + return failures; +} + +function classifyFailure(step, result, failures) { + const text = `${result.stderr}\n${result.stdout}\n${result.error || ""}`; + const actions = []; + let category = "command_failed"; + if (result.error && /ENOENT/.test(result.error)) { + category = "missing_command"; + actions.push(`Install or put the command on PATH: ${commandText(step.command).split(/\s+/)[0]}`); + } else if (/command not found|not found/i.test(text)) { + category = "missing_command"; + actions.push("Install the missing command or adjust the plan command."); + } else if (/permission denied|not authorized|forbidden/i.test(text)) { + category = "permission"; + actions.push("Refresh auth or request the missing permission, then retry this step."); + } else if (/timed out|ETIMEDOUT|i\/o timeout/i.test(text)) { + category = "timeout"; + actions.push("Retry with a larger timeout or reduce the command scope."); + } else if (/working tree|worktree|uncommitted|dirty/i.test(`${step.id} ${text}`)) { + category = "dirty_worktree"; + actions.push("Inspect git status and decide whether to commit, stash, or narrow the plan."); + } + if (step.corrections.length > 0) { + actions.unshift("Run configured correction steps, then retry the failed step."); + } + if (actions.length === 0) { + actions.push("Inspect stdout/stderr and add a targeted correction step to the 
plan."); + } + return { + category, + failures, + next_actions: actions, + }; +} + +function makeEmitter(format, eventsFile) { + return function emit(event) { + fs.appendFileSync(eventsFile, `${JSON.stringify(event)}\n`); + if (format === "ndjson") { + console.log(JSON.stringify(event)); + } else if (format === "pretty" && event.type === "step_result") { + const mark = event.status === "passed" ? "PASS" : event.status === "corrected" ? "FIXED" : "FAIL"; + const retry = event.attempts > 1 ? ` attempts=${event.attempts}` : ""; + console.log(`[${mark}] ${event.stage_id}.${event.step_id}${retry} (${event.duration_ms}ms)`); + } else if (format === "pretty" && event.type === "stage_result") { + console.log(`[STAGE ${event.status.toUpperCase()}] ${event.stage_id}`); + } + }; +} + +function runCorrection(correction, context) { + const cwd = path.resolve(context.cwd, correction.cwd || "."); + const startedAt = new Date(); + const raw = runCommand(correction.command, { + cwd, + timeoutMs: correction.timeoutMs, + dryRun: context.dryRun, + env: context.env, + }); + const endedAt = new Date(); + const failures = expectationPassed(raw, correction.expect); + return { + id: correction.id, + name: correction.name, + command: commandText(correction.command), + cwd, + status: failures.length === 0 ? 
"passed" : "failed", + started_at: startedAt.toISOString(), + ended_at: endedAt.toISOString(), + duration_ms: endedAt.getTime() - startedAt.getTime(), + exit_code: raw.status, + signal: raw.signal, + stdout_tail: tail(raw.stdout), + stderr_tail: tail(raw.stderr), + error: raw.error, + expectation_failures: failures, + }; +} + +function runStep(stage, step, context) { + const startedAt = new Date(); + const attempts = []; + const correctionResults = []; + const maxAttempts = Math.max(1, step.maxAttempts + context.maxCorrections); + let finalStatus = "failed"; + let selfCorrection = null; + + for (let attempt = 1; attempt <= maxAttempts; attempt += 1) { + const cwd = path.resolve(context.cwd, step.cwd || "."); + const raw = runCommand(step.command, { + cwd, + timeoutMs: step.timeoutMs, + dryRun: context.dryRun, + env: { + ...context.env, + HARNESS_STAGE_ID: stage.id, + HARNESS_STEP_ID: step.id, + HARNESS_ATTEMPT: String(attempt), + }, + }); + const failures = expectationPassed(raw, step.expect); + attempts.push({ + attempt, + command: commandText(step.command), + cwd, + exit_code: raw.status, + signal: raw.signal, + stdout_tail: tail(raw.stdout), + stderr_tail: tail(raw.stderr), + error: raw.error, + expectation_failures: failures, + status: failures.length === 0 ? "passed" : "failed", + }); + if (failures.length === 0) { + finalStatus = attempt === 1 ? 
"passed" : "corrected"; + break; + } + + selfCorrection = classifyFailure(step, raw, failures); + if (attempt >= maxAttempts || step.corrections.length === 0) { + break; + } + for (const correction of step.corrections) { + correctionResults.push(runCorrection(correction, context)); + } + } + + const endedAt = new Date(); + return { + type: "step_result", + stage_id: stage.id, + step_id: step.id, + name: step.name, + required: step.required, + status: finalStatus, + started_at: startedAt.toISOString(), + ended_at: endedAt.toISOString(), + duration_ms: endedAt.getTime() - startedAt.getTime(), + attempts: attempts.length, + command: commandText(step.command), + attempt_results: attempts, + corrections: correctionResults, + self_correction: finalStatus === "passed" ? null : selfCorrection, + }; +} + +function runStage(stage, context) { + const startedAt = new Date(); + const stepResults = []; + let status = "passed"; + for (const step of stage.steps) { + const result = runStep(stage, step, context); + stepResults.push(result); + context.emit(result); + if (result.status === "failed" && step.required) { + status = "failed"; + break; + } + if (result.status === "corrected" && status !== "failed") { + status = "corrected"; + } + } + const endedAt = new Date(); + const stageResult = { + type: "stage_result", + stage_id: stage.id, + objective: stage.objective, + required: stage.required, + status, + started_at: startedAt.toISOString(), + ended_at: endedAt.toISOString(), + duration_ms: endedAt.getTime() - startedAt.getTime(), + steps: stepResults, + }; + context.emit({ + type: "stage_result", + stage_id: stage.id, + status, + duration_ms: stageResult.duration_ms, + failed_steps: stepResults.filter((step) => step.status === "failed").map((step) => step.step_id), + }); + return stageResult; +} + +function summarize(plan, stageResults, context, startedAt) { + const endedAt = new Date(); + const failedStages = stageResults.filter((stage) => stage.status === "failed"); + const 
stageShape = validateStageShape(plan); + const artifactResults = validateArtifacts(plan, context); + const missingRequiredArtifacts = artifactResults.filter((artifact) => artifact.required && !artifact.exists); + const stageOrderFailed = + plan.constraints.enforce_stage_order && + (stageShape.missing.length > 0 || stageShape.unexpected.length > 0 || stageShape.out_of_order.length > 0); + const failedSteps = stageResults.flatMap((stage) => + stage.steps + .filter((step) => step.status === "failed") + .map((step) => ({ + stage_id: stage.stage_id, + step_id: step.step_id, + category: step.self_correction?.category || "unknown", + next_actions: step.self_correction?.next_actions || [], + })), + ); + const correctedSteps = stageResults.flatMap((stage) => + stage.steps + .filter((step) => step.status === "corrected") + .map((step) => ({ stage_id: stage.stage_id, step_id: step.step_id })), + ); + const status = + failedStages.length === 0 && missingRequiredArtifacts.length === 0 && !stageOrderFailed ? "passed" : "failed"; + return { + run_id: context.runId, + plan_name: plan.name, + objective: plan.objective, + target: plan.target, + inputs: plan.inputs, + lifecycle: plan.lifecycle, + status, + started_at: startedAt.toISOString(), + ended_at: endedAt.toISOString(), + duration_ms: endedAt.getTime() - startedAt.getTime(), + run_dir: context.runDir, + stage_shape: stageShape, + artifacts: artifactResults, + contract_failures: [ + ...(stageOrderFailed + ? 
[ + { + category: "stage_order", + missing: stageShape.missing, + unexpected: stageShape.unexpected, + out_of_order: stageShape.out_of_order, + }, + ] + : []), + ...missingRequiredArtifacts.map((artifact) => ({ + category: "missing_artifact", + artifact_id: artifact.id, + path: artifact.path, + })), + ], + stages: stageResults.map((stage) => ({ + stage_id: stage.stage_id, + status: stage.status, + steps: stage.steps.length, + failed_steps: stage.steps.filter((step) => step.status === "failed").length, + corrected_steps: stage.steps.filter((step) => step.status === "corrected").length, + })), + failed_steps: failedSteps, + corrected_steps: correctedSteps, + }; +} + +function validateStageShape(plan) { + const ids = plan.stages.map((stage) => stage.id); + const expected = plan.lifecycle.stage_order; + const missing = expected.filter((stage) => !ids.includes(stage)); + const unexpected = ids.filter((stage) => !expected.includes(stage)); + const outOfOrder = []; + let lastIndex = -1; + for (const id of ids) { + const index = expected.indexOf(id); + if (index < 0) { + continue; + } + if (index < lastIndex) { + outOfOrder.push(id); + } + lastIndex = Math.max(lastIndex, index); + } + return { + lifecycle_id: plan.lifecycle.id, + expected_order: expected, + present: ids, + missing, + unexpected, + out_of_order: outOfOrder, + order_matches: missing.length === 0 && unexpected.length === 0 && outOfOrder.length === 0, + }; +} + +function validateArtifacts(plan, context) { + return plan.artifacts.map((artifact) => { + const artifactPath = expandEnvVars(artifact.path, { ...process.env, ...context.env }); + const resolvedPath = path.isAbsolute(artifactPath) ? artifactPath : path.resolve(context.cwd, artifactPath); + const exists = fs.existsSync(resolvedPath); + return { + id: artifact.id, + path: artifact.path, + resolved_path: resolvedPath, + required: artifact.required, + exists, + status: exists ? "present" : artifact.required ? 
"missing" : "optional_missing", + }; + }); +} + +function main() { + const args = parseArgs(process.argv.slice(2)); + if (args.help) { + usage(); + return; + } + const baseCwd = args.cwd ? path.resolve(args.cwd) : repoRoot(process.cwd()); + const runId = args.runId || timestampId(); + const runRoot = path.resolve(baseCwd, args.runRoot || ".harness/runs"); + const runDir = path.join(runRoot, runId); + const stagesDir = path.join(runDir, "stages"); + ensureDir(stagesDir); + + const planPath = path.resolve(baseCwd, args.plan); + const plan = normalizePlan(readJson(planPath)); + const eventsFile = path.join(runDir, "events.ndjson"); + fs.writeFileSync(eventsFile, ""); + const emit = makeEmitter(args.format, eventsFile); + const startedAt = new Date(); + const context = { + cwd: baseCwd, + runId, + runDir, + dryRun: args.dryRun, + maxCorrections: args.maxCorrections, + emit, + env: { + ...plan.env, + HARNESS_RUN_ID: runId, + HARNESS_RUN_DIR: runDir, + HARNESS_PLAN: plan.name, + }, + }; + + writeJson(path.join(runDir, "plan.json"), plan); + writeJson(path.join(runDir, "stage_shape.json"), validateStageShape(plan)); + writeJson(path.join(runDir, "contract.json"), { + objective: plan.objective, + target: plan.target, + inputs: plan.inputs, + lifecycle: plan.lifecycle, + constraints: plan.constraints, + artifacts: plan.artifacts, + env: plan.env, + }); + emit({ + type: "run_started", + run_id: runId, + plan_name: plan.name, + objective: plan.objective, + target: plan.target, + inputs: plan.inputs, + lifecycle: plan.lifecycle, + cwd: baseCwd, + run_dir: runDir, + dry_run: args.dryRun, + }); + + const stageResults = []; + for (const stage of plan.stages) { + const result = runStage(stage, context); + stageResults.push(result); + writeJson(path.join(stagesDir, `${stage.id}.json`), result); + if (result.status === "failed" && stage.required) { + break; + } + } + + const summary = summarize(plan, stageResults, context, startedAt); + writeJson(path.join(runDir, "summary.json"), 
summary); + emit({ type: "run_finished", ...summary }); + if (args.format === "json") { + console.log(JSON.stringify(summary, null, 2)); + } else if (args.format === "pretty") { + console.log(JSON.stringify(summary, null, 2)); + } + if (summary.status !== "passed") { + process.exitCode = 1; + } +} + +try { + main(); +} catch (err) { + console.error(JSON.stringify({ ok: false, error: err.message }, null, 2)); + process.exitCode = 1; +} diff --git a/skills/dev/SKILL.md b/skills/dev/SKILL.md new file mode 100644 index 000000000..57d94612e --- /dev/null +++ b/skills/dev/SKILL.md @@ -0,0 +1,88 @@ +--- +name: dev +version: 0.3.0 +description: "eval-search 交付 Harness:借鉴 lkkcli /dev 的生命周期约束,把搜索评测目标落成可执行、可复盘、可修正的阶段计划。" +metadata: + requires: + bins: ["node", "git"] +--- + +# dev — eval-search 交付 Harness + +本 skill 只负责把 lkkcli `/dev` 的生命周期控制迁移到本仓库,不改变 `/eval-search` 的目标:评测 `lark-cli` 搜索能力,产出盲测轨迹、Judge 评分、归因和 Optimizer 可消费的报告。 + +## 定位 + +- `/eval-search` 是业务目标层:定义 Executor / Judge / Optimizer 隔离、评分、污染控制和 PR 生成。 +- `scripts/harness-runner.ts` 是状态执行层入口,直接通过 Node 的 TS type stripping 执行。 +- `.harness/plan.example.json` 是本仓库默认计划:用 lkkcli 风格的 `prepare -> understand -> plan -> act -> verify -> retrospect` 包住 eval-search。 + +不要把这个 skill 扩展成通用研发流水线;通用需求、部署、MR 和 CI 编排属于 lkkcli `/dev`。这里的交付标准仍然围绕搜索评测。 + +## 硬约束 + +1. **目标不漂移**:plan 的 `target.skill` 必须是 `eval-search`。 +2. **输入先声明**:loader profile、executor profile、subset/dataset-file、eval run id 必须写进 `inputs`。 +3. **生命周期可检查**:plan 必须声明 `lifecycle.stage_order`,并开启 `constraints.enforce_stage_order`。 +4. **角色隔离保留**:Loader、Executor、Judge、Optimizer 的输入边界必须写进 `constraints.role_isolation`。 +5. **TS 替代 JS**:runner、setup runner、evidence collector 只保留 `.ts`,不要再生成或维护同名 `.js`。 +6. **产物契约显式化**:rubric、Executor/Judge/Optimizer prompt、TS 入口必须列入 `artifacts`。 +7. 
**失败可恢复**:失败 step 必须输出 `self_correction`;能自动 correction 的写进 plan,不能自动处理的给出 next action。 + +## 标准入口 + +先运行本仓库默认计划,确认 eval-search 的静态契约和本地门禁都成立: + +```bash +node --experimental-strip-types scripts/harness-runner.ts --plan .harness/plan.example.json --format json +``` + +运行产物写入: + +```text +.harness/runs// + plan.json + contract.json + stage_shape.json + events.ndjson + stages/.json + summary.json +``` + +只有当 `summary.status == "passed"` 时,才继续执行真实 `/eval-search run` 或 `/eval-search propose-pr`。 + +## 生命周期语义 + +### Prepare + +确认 repo 状态、分支、dirty 文件和本地工具可用性。`lark-cli` 缺失不直接阻断静态门禁,但真实评测前必须补齐。 + +### Understand + +读取并确认 `/eval-search` 的核心契约:盲测、三角色隔离、rubric、污染控制。这个阶段不接触评测集答案。 + +### Plan + +确认 deterministic setup 和 evidence collector 可调用,并明确本轮使用的 loader/executor profile、subset、dataset-file 策略。 + +### Act + +检查或执行会产出 eval-search 运行材料的代码路径:dataset setup、pollution preflight、executor evidence collection。 + +### Verify + +运行递进式门禁:TypeScript check、runner syntax、eval-search 脚本 syntax、skill format。真实评测完成后,还要检查 `tests/eval-search/runs//summary.json` 和 regression 结果。 + +### Retrospect + +沉淀本轮的污染 token、失败归因、泛化改动声明和下一轮 correction。若需要新增经验,优先更新 `skills/eval-search/**` 或 `tests/eval-search/**`,不要散落到临时笔记。 + +## 收尾标准 + +最终回复用户前检查最新 summary: + +```bash +node --experimental-strip-types scripts/harness-runner.ts --plan .harness/plan.example.json --format json +``` + +如果 `summary.status != "passed"`,不能声称完成;必须给出 `summary.contract_failures` 和 `summary.failed_steps[*].next_actions`。 diff --git a/skills/eval-search/RUBRIC.md b/skills/eval-search/RUBRIC.md new file mode 100644 index 000000000..519f7563b --- /dev/null +++ b/skills/eval-search/RUBRIC.md @@ -0,0 +1,115 @@ +# RUBRIC — 4 维度评分细则 + +每个 case 按 4 维打分,每维 0-5 分,单 case 满分 15。总分 = sum(recall + accuracy + completeness)。 + +> 注:`total` 字段只聚合 3 个打分维度。第 4 维 `contamination_penalty` 是修饰项,见下。 + +## 维度定义 + +### recall(召回,0-5) + +"Executor 是否找到 / fetch 过**正确的目标文档**"。对应评测集 `数据源地址` 字段里的 URL / token。 + +| 分 | 判据 | +|----|------| +| 5 | trajectory 里显式 fetch 
过全部 expected source;或 search 结果 top-5 里能看到全部 expected source 的 token | +| 4 | fetch 过一半以上(严格过半) | +| 3 | fetch 过至少 1 个但不到一半;或 top-5 里出现但未 fetch | +| 1-2 | 没 fetch、没在 top-5,但有相关命中(同主题不同文档) | +| 0 | 完全无关的命中 / 空结果 | + +**特例**:`企业内是否有知识 == 否` 的 case,recall 固定 5 分(agent 不该找到任何高置信答案,答"没找到"也算召回正确)。 + +### accuracy(准确性,0-5) + +"Executor 给出的最终答案**在事实层面**对不对"。对照评测集 `预期答复` 的【关键信息】段 + 【打分备注】里的 "可信无误" 说明。 + +| 分 | 判据 | +|----|------| +| 5 | 关键信息全部正确,无事实错误 | +| 4 | 主要信息正确,少量细节偏差(时间、数字小错) | +| 3 | 部分正确部分错 / 含明显可证伪陈述 | +| 1-2 | 大部分错误,但方向对 | +| 0 | 完全错 / 幻觉 / 答非所问 | + +**【打分备注】优先级高于通用判据**。例如某 case 备注 "给到 0.x 折这类可信要扣分",即使答案看起来合理,只要踩到就扣。 + +### completeness(完整性,0-5) + +"Executor 覆盖了多少 expected key points"。对照【关键信息】列出的条目 + 【打分备注】里的 "完整详实" 说明。 + +| 分 | 判据 | +|----|------| +| 5 | 覆盖 ≥80% key points,或满足 `完整详实` 备注的明确阈值(如"答出 5 个及以上不扣分") | +| 4 | 覆盖 60-80% | +| 3 | 覆盖 40-60% | +| 1-2 | 覆盖 20-40% | +| 0 | <20% 或未给答案 | + +### contamination_penalty(污染修饰,-3 ~ 0) + +仅当 pre-flight 标记了 `contamination_risk=true` 且 trajectory 显示 Executor **fetch 过 tainted token** 时触发。 + +| 分 | 判据 | +|----|------| +| 0 | 未命中 tainted token,或命中但未 fetch | +| -1 | fetch 了 tainted token 但最终答案未直接引用其内容 | +| -3 | fetch 了 tainted token 且答案明显抄袭其结构 / 原文 | + +该项**直接从 total 扣**,且在 verdict 里显式标注,避免"刷分嫌疑"。 + +## Verdict JSON schema + +每个 case 一个 verdict,合并写入 `verdicts.json`。 + +```json +{ + "case_id": "case_001", + "query": "...", + "scores": { + "recall": 4, + "accuracy": 5, + "completeness": 3, + "contamination_penalty": 0, + "total": 12 + }, + "rationale": { + "recall": "fetch 了 Es5wwNCyei3eYNkXc8Tcx35nnWe,top-3 里出现 HxnMwM9cyiFW1dkACUBcC7KWnEd 但未 fetch", + "accuracy": "8 个案例全部在参考文档里,无幻觉", + "completeness": "列了 5/10,备注要求 ≥5 不扣分,按备注打 5" + }, + "improvement": { + "tool_capability": [ + "docs +search 返回结果没有 body_preview,agent 必须 fetch 才能判断相关性。建议返回摘要字段减少 fetch 次数" + ], + "search_strategy": [ + "Executor 只用了原词 '华东 Aily 案例',没换 '客户成功故事' / '最佳实践' 等同义词" + ], + "skill_prompts": [ + "lark-doc-search.md 可新增同义词清单小节,含 'case / story / best 
practice' 映射" + ] + }, + "contamination": { + "risk_flagged": false, + "tainted_tokens_fetched": [], + "penalty_applied": 0 + } +} +``` + +## 聚合规则(summary.json) + +Judge 打完所有 case 后,主 agent 按以下规则聚合到 `summary.json`: + +1. **按改动落点文件聚类 improvements**,不按文本相似度: + - 同一条 skill_prompts 建议指向 `skills/lark-doc/SKILL.md` 的,合并成一条 finding + - finding 保留 `driving_cases: [case_003, case_007, ...]` 反向索引 +2. **计算一阶瓶颈**:三桶的建议条数之和,占比最大的那个桶就是 `primary_bottleneck` +3. **统计 contamination**:有多少 case 被 fetch 到 tainted token,若 >2 个输出警告 +4. **汇总每个维度的均值、总分** + +## 校准指引(给 Judge 看的) + +- 优先使用【打分备注】里的 per-case rubric;与通用判据冲突时**以备注为准** +- 宁低勿高:打分是迭代的信号源,乐观打分会让下一轮 optimizer 找不到方向 +- rationale 字段必填,且要引用 trajectory 里的具体命令或 URL。只写"还行""不够完整"等空洞判断会被 Optimizer 识别为低质量 verdict 并丢弃 diff --git a/skills/eval-search/SKILL.md b/skills/eval-search/SKILL.md new file mode 100644 index 000000000..9bd677a13 --- /dev/null +++ b/skills/eval-search/SKILL.md @@ -0,0 +1,159 @@ +--- +name: eval-search +version: 0.1.0 +description: "lark-cli 搜索能力端到端评测 Harness:拉取飞书评测集 → 盲测执行 → 四维打分 → 聚合归因 → 自动生成 PR 草稿。当用户要评测 lark-cli 搜索效果、做 v_n→v_{n+1} 迭代、让新人跑一轮优化闭环时使用。" +metadata: + requires: + bins: ["node", "lark-cli", "jq", "git", "gh"] +--- + +# eval-search — lark-cli 搜索能力评测 Harness + +**CRITICAL — 开始前 MUST 先用 Read 工具读取 [`../lark-shared/SKILL.md`](../lark-shared/SKILL.md)(认证)和 [`RUBRIC.md`](RUBRIC.md)(评分细则)。** + +## 目标 + +给 AI agent 一个自然语言搜索问题,它能否通过 lark-cli 在飞书企业知识库里找到正确答案?当它做不到,定位到: +- **(a) tool_capability** — 工具能力缺口(缺 shortcut / 缺 flag / 输出难解析) +- **(b) search_strategy** — agent 应该但没做的搜索动作 +- **(c) skill_prompts** — 方法论没在 skill 文档里 + +并把归因汇聚成可执行的 PR 草稿。 + +## 适用场景 + +- "跑一轮搜索评测" +- "新人想参与 lark-cli 优化,从哪里开始" +- "对比一下最近改动对搜索效果的影响" +- "看看上一轮评测还有哪些归因没处理" + +## 三个入口命令 + +``` +/eval-search run [--loader-profile NAME] [--executor-profile NAME] [--subset N] + # 跑一轮评测,产出 run-id。默认全量;--subset=3 抽样冒烟 +/eval-search run --snapshot-only # 只把评测集拉成本地 dataset.jsonl,供移除权限后复用 +/eval-search propose-pr # 基于 run 生成 PR 草稿(含 before/after + 泛化声明 + 
regression 告警) +/eval-search report # 读已有 run 的 summary.json +``` + +新人典型流程:`run` → 看 summary → `propose-pr` → review PR → merge。 + +## 状态层(向 lkkcli Harness 对齐) + +本仓库额外提供一个轻量状态层,把 lkkcli `/dev` 的生命周期约束套到 `/eval-search` 上,但不改变搜索评测目标: + +```bash +node --experimental-strip-types scripts/harness-runner.ts --plan .harness/plan.example.json --format json +``` + +这个 plan 的目标必须保持为 `target.skill = eval-search`,生命周期固定为: + +```text +prepare -> understand -> plan -> act -> verify -> retrospect +``` + +它只做四件事: +- 声明本轮 live run 需要的 loader profile、executor profile、subset/dataset-file、run-id +- 明确 Loader / Executor / Judge / Optimizer 的隔离边界 +- 检查 rubric、prompts、TS 入口等必备产物,并直接运行 `.ts` 入口 +- 把每个阶段的命令结果、失败归因、correction 和 contract failure 写入 `.harness/runs//summary.json` + +因此,真实评测仍然按下面的 `/eval-search run` 流程执行;状态层只是先把环境、约束和本地门禁变成可复盘的执行记录。若 `summary.status != "passed"`,不要启动真实评测或声称 PR 可交付。 + +## 三层架构(必须隔离,违反会让结果失真) + +``` +Executor (sub-agent, Task 工具) + 输入: query only 不知道: expected / rubric / source_urls + 工具: 仅 lark-cli + 产出: trajectory + answer + ↓ +Judge (主 agent 切 hat,时序隔离) + 输入: query + answer + expected + rubric + 产出: 4 维打分 + 三桶 improvement + ↓ +Optimizer (sub-agent, Task 工具) + 输入: 全部 verdicts summary + 失败 case 的关键错误片段(不喂 trajectory 全文) + 产出: diff + 泛化声明字段 +``` + +**隔离纪律**: +- Executor prompt 永远只注入 `query`,绝不传 expected/rubric/source_urls(盲测) +- Judge 必须在 Executor 全部跑完之后开始,不得和 Executor 共享 tool-use 窗口 +- Optimizer 只看 Judge 聚合出的 summary,**不喂 trajectory 原文全文**,只喂失败 case 的关键错误行(防过拟合 + 控 context) + +## `/eval-search run` 流程 + +详细步骤见 [`references/run-layout.md`](references/run-layout.md)。概要: + +1. **确定性 setup**:先运行 `node --experimental-strip-types tests/eval-search/eval-search-run.ts --loader-profile --executor-profile [--subset N]`。脚本会生成 run-id,建目录 `tests/eval-search/runs//`,并完成第 2-4 步。若只有一个账号,可先用 `--snapshot-only` 拉本地 `dataset.jsonl`,移除该账号的评测 Base 权限后,再用 `--dataset-file /dataset.jsonl` 继续 +2. 
**拉数据集**:按 [`references/dataset.md`](references/dataset.md) 用 loader profile 从评测 base 拉最新数据 → `dataset.jsonl` +3. **账号隔离**:按 [`references/pollution-preflight.md`](references/pollution-preflight.md) 检查 executor profile 不在 `excluded_user_ids`,并主动探测 executor 不能读取评测 Base;若能读取则阻断 +4. **污染预检**:用 executor profile 对每条 query 跑一次 `docs +search`,命中 [`references/known-tainted-tokens.md`](references/known-tainted-tokens.md) 里的 token 则标记 `contamination_risk`。只标记不阻断;Judge 阶段再决定是否扣分 +5. **Executor 并行**:用 Task 工具启动 sub-agent 按 [`prompts/executor.md`](prompts/executor.md) 跑全部 case。每个 case trajectory 落盘 `trajectories/.json` +6. **Judge 逐 case**:主 agent 按 [`prompts/judge.md`](prompts/judge.md) 打分,写 `verdicts.json` +7. **聚合**:按"改动落点文件"对 improvements 聚类,写 `summary.json`;输出 run-id 给用户 + +## `/eval-search propose-pr` 流程 + +详细见 [`references/pr-generation.md`](references/pr-generation.md)。概要: + +1. **Optimizer 生成 diff**:用 Task 工具启动 sub-agent 按 [`prompts/optimizer.md`](prompts/optimizer.md) 读 summary + 两个仓库代码,产出 **cli diff + open diff(如有)** 和泛化声明 +2. **应用 diff 到两个 worktree**: + - cli 仓库:独立分支 `eval-search/auto-pr/` + - open 仓库(若有改动):独立分支 `eval-search/auto-pr/`,互不污染 main +3. **Quality gate**(当前仅 cli 仓库):`make unit-test` + `golangci-lint run --new-from-rev=origin/main` 必须通过。失败 → Optimizer 最多迭代 2 次,仍失败 → 把触发失败的改动降级为 GitHub issue,不进 PR。open 仓库暂不跑 gate(CI 配置非 harness 可控) +4. **确定性 regression 重跑**:按 diff 之上重跑完整评测(复用 `/eval-search run` 内部流程),产出 after verdicts。**这一步不给 Optimizer 参与** +5. **组装两份 PR description**:按 [`references/pr-generation.md`](references/pr-generation.md) 里的模板,包含 before/after 数值、wins/regressions 逐 case 列表、泛化声明、未处理归因、**对端 PR 互相 link** +6. 
**`gh pr create --draft`**:双 PR 独立提,**独立 review、独立 merge**。不强绑定联动。一个 PR 先 merge 另一个还没 merge 也 OK,在 PR description 里标记 cross-ref + +## 权限边界(v0.1 软约束,迭代中调整) + +### cli 仓库(`larksuite/cli`,当前目录) + +Optimizer 默认允许改: +- `skills/**/*.md` +- 新增 `shortcuts//*.go` 及对应测试 + +Optimizer 不自动改: +- `internal/**`, `extension/**`, `cmd/root.go`, `cmd/service/**` 等基础设施 → 降级为 issue +- 任何旧 shortcut 的删除 / 重命名 / 破坏性改动 + +### open 仓库(`$GOPATH/src/code.byted.org/lark_as/open/`) + +详见 [`references/open-repo-layout.md`](references/open-repo-layout.md)。简要: + +Optimizer 默认允许改: +- `biz/search_open/entity/{name}.go` 的 `BuildDisplayInfo` / `BuildResponseItem` bug fix / `Prune` 及配套 `*_test.go` + +Optimizer 不自动改: +- IDL(在独立的 `lark/idl` 仓库,需要跑 overpass,不属于 PR 范畴) +- `api_meta/**/*.yml`(契约变更,走人工) +- `biz/search_open/handler.go` / `adapter.go` / `pagetoken.go` / `response.go` 等基础设施 +- 任何"新增 OAPI 字段"类需求(跨两个仓库 + 手工步骤,产出 issue 正文即可) + +### 违反白名单的处理 + +Optimizer 把该 finding 写进 PR description 的"未处理归因"段(含建议 issue 正文),由新人创建对应 GitHub issue。**不发**跨仓库 / 超出白名单的 PR。 + +## 关键纪律(不遵守分数会失真) + +1. **盲测纪律**:Executor prompt 只注入 `query`。即使主 agent fallback 接管 Executor,也必须自我约束不读 `dataset.jsonl` 的非 query 字段 +2. **三层隔离**:Judge 不能和 Executor 在同一轮 reasoning;Optimizer 不喂 trajectory 全文 +3. **Regression 软告警**:after 出现 regression 不硬 block,但必须在 PR description 里逐 case 列出;reviewer 判断 +4. **泛化声明必填**:Optimizer 必须区分"针对具体 case 的改动"和"泛化原则性改动"。前者过拟合风险高,reviewer 重点看 +5. 
**污染隔离**:harness 至少使用两个 profile。loader profile 可以读取评测 Base,但只允许用于拉数据集;executor profile 必须是专用测试账号(非 PM 账号、非 dataset owner 账号),且不能读取评测 Base。若 executor profile 的 `userOpenId` 出现在 [`references/known-tainted-tokens.md`](references/known-tainted-tokens.md) 的 `excluded_user_ids` 列表里,或 executor 可以读取评测 Base,拒绝启动 + +## 参考 + +- [`RUBRIC.md`](RUBRIC.md) — 4 维度评分细则 +- [`prompts/executor.md`](prompts/executor.md) — Executor sub-agent 模板 +- [`prompts/judge.md`](prompts/judge.md) — Judge 打分模板 +- [`prompts/optimizer.md`](prompts/optimizer.md) — Optimizer PR 生成模板 +- [`references/dataset.md`](references/dataset.md) — 评测集 schema + 拉取方式 +- [`references/pollution-preflight.md`](references/pollution-preflight.md) — 污染预检规则 +- [`references/known-tainted-tokens.md`](references/known-tainted-tokens.md) — 已知泄露文档标记清单 +- [`references/run-layout.md`](references/run-layout.md) — run 目录结构 + 中间产物约定 +- [`references/pr-generation.md`](references/pr-generation.md) — PR 生成流程 + description 模板(双 PR) +- [`references/open-repo-layout.md`](references/open-repo-layout.md) — `lark_as/open` 仓库允许改动的白名单导航 diff --git a/skills/eval-search/prompts/executor.md b/skills/eval-search/prompts/executor.md new file mode 100644 index 000000000..9f474c107 --- /dev/null +++ b/skills/eval-search/prompts/executor.md @@ -0,0 +1,73 @@ +# Executor sub-agent 模板 + +**使用方式**:主 agent 用 Task 工具启动 sub-agent(`subagent_type: general-purpose`),把本文件内容 + 具体 `query` 拼为 prompt 传入。**禁止在 prompt 里注入 expected / rubric / source_urls / 评测集任何其他字段**。 + +--- + +## SYSTEM(照原样复制到 Task prompt 开头) + +你是 lark-cli 搜索能力评测 harness 的**执行层 sub-agent**,任务是**盲测**:回答一个来自飞书企业知识库的自然语言问题。 + +### 你的约束 + +1. **工具只有 lark-cli**:可以用 `lark-cli` 的任何 shortcut、API、schema 命令。禁止使用 WebFetch / WebSearch / 其他外部工具。 +2. **身份为当前登录的 user**。不要主动切 bot。 +3. **你不知道标准答案**,也不知道答案在哪个文档。你唯一拥有的信息就是 `query`。 +4. **单 case round 预算:12 round**(一个 lark-cli 调用 = 1 round)。超过必须收尾给 best-effort 答案。 +5. 
**Context discipline**: + - 任何 lark-cli 输出 >30 行 → 先 `--format json -q '.data[].title'` 之类精简,或落盘到 `/tmp/case__.txt` 再 grep + - 不要把整篇文档正文贴进 reasoning + - 每一步的内部总结 ≤200 字符 +6. **增量持久化**:每完成 1 round,把 trajectory 追加写入 `/trajectories/.json`。崩溃恢复靠这个文件。 + +### 方法论(**必须先阅读**,不是建议) + +在发出第一条 lark-cli 命令之前,MUST 用 Read 读: +- `skills/lark-shared/SKILL.md` — 认证、全局参数 +- `skills/lark-doc/SKILL.md` + `skills/lark-doc/references/lark-doc-search.md` — 云空间搜索 +(搜索方法论直接在 `lark-doc-search.md` 里:关键词改写 / 失败退出 / 大文档 fallback 都在该文件的决策规则段) +- `skills/lark-wiki/SKILL.md` — wiki 节点是壳的关键概念 + +根据 query 类型可能还要读:`lark-im`、`lark-mail`、`lark-vc`、`lark-minutes`、`lark-contact` 等。 + +### 标准流程 + +1. 阅读 query,拆"实体"(人名 / 时间 / 关键词 / 资源类型) +2. 选择搜索入口(docs / im / mail / vc / minutes / ...) +3. 发起搜索;若返回空或无相关结果,按 `lark-doc-search.md` 的"决策规则 / `--query` 高级语法"换 2-3 轮词(同义词 / `intitle:` / 排除词) +4. 对 top 命中做进一步 fetch / resolve(wiki 节点必须先 `wiki +resolve-node`) +5. 综合信息给出答案;若 3 轮改写仍无结果,给 best-effort 结论并明确说"未找到直接证据" +6. 写 `/trajectories/.json`,结束 + +### 输出格式(最后一条消息,JSON) + +```json +{ + "case_id": "", + "answer": "<自然语言答案,markdown 允许>", + "referenced_urls": ["<从 lark-cli 命中的 URL>", ...], + "rounds_used": , + "gave_up": , + "notes": "<可选,给 Judge 的说明,例如:'时间窗超了,只跑了 8 round 提前收敛'>" +} +``` + +### 反模式(会被 Judge 扣分) + +- ❌ 不读 skill 文档直接 `lark-cli api GET /...` 手拼参数 +- ❌ 把 wiki token 当 doc token 传给 `docs +fetch` +- ❌ 搜不到时只重复同一个关键词 +- ❌ 一次性 `lark-cli ... 
| cat` 把 500 行塞进 reasoning +- ❌ 编造答案(没 fetch 过就说"根据文档 X...") + +--- + +## USER(主 agent 拼接时注入) + +``` +query: <来自 dataset.jsonl 的 query 字段原文> +case_id: +run_dir: > +``` + +**除以上三个字段,不注入任何评测集其他字段**。 diff --git a/skills/eval-search/prompts/judge.md b/skills/eval-search/prompts/judge.md new file mode 100644 index 000000000..b81dc9226 --- /dev/null +++ b/skills/eval-search/prompts/judge.md @@ -0,0 +1,97 @@ +# Judge 打分模板 + +**使用方式**:主 agent 切 hat 执行。Executor 全部跑完后,主 agent 逐 case 读 `trajectory + expected`,按本文件产出 verdict。 + +> **隔离纪律**:不要在 Executor 尚未跑完时开始 Judge(会污染 Executor 所在 reasoning 窗口)。Executor 全部完成、`trajectories/*.json` 落盘后再启动 Judge。 + +--- + +## Judge 每个 case 的输入 + +从磁盘读(**不要复用 Executor 的 reasoning context**): +- `dataset.jsonl` 中该 case 的 `query / expected / source_urls / has_knowledge / rubric_notes` +- `trajectories/.json`(含 rounds 列表 + 最终 answer) +- `preflight.json`(看 `contamination_risk` 和 `tainted_tokens`) +- `skills/eval-search/RUBRIC.md` + +## 每个 case 的打分步骤 + +1. **recall**:扫 trajectory 里的每一条 tool_use,提取被 fetch / resolve 过的 token 和 URL 集合。与 `source_urls` 做交集。按 RUBRIC 打分 +2. **accuracy**:把 `answer` 和 `expected.【关键信息】` 段逐条比对。优先应用 `expected.【打分备注】.可信无误` +3. **completeness**:数 key points 覆盖数。优先应用 `expected.【打分备注】.完整详实` +4. **contamination**:查 trajectory 是否 fetch 过 `preflight.tainted_tokens`;search-only 命中只记录风险,不扣污染分。若有 fetch,按 RUBRIC 给 `contamination_penalty` +5. 
**improvement 三桶**:从 trajectory 里找失败片段,分类写进 `tool_capability / search_strategy / skill_prompts` + +## improvement 填写规则 + +**每条建议必须满足**: +- 指向**具体文件**(skill_prompts)、**具体命令**(tool_capability)或**具体动作**(search_strategy) +- 引用 trajectory 里触发该建议的 round 序号 +- 不写"可以更好"这种无落点的建议;写不出具体落点的建议**丢弃**,不要凑数 + +**示例**: + +✅ 好的: +```json +"skill_prompts": [ + "round 4 Executor 把 wiki URL 直接传给 docs +fetch 导致 param invalid。lark-wiki/SKILL.md 的反模式段应加'wiki 链接必须先走 +resolve-node'的明确警告(当前只在 references 里写了)" +] +``` + +❌ 差的: +```json +"skill_prompts": [ + "搜索不够全面", + "agent 应该更聪明地处理 wiki" +] +``` + +## 合并规则(主 agent 在全部 case 打完后做) + +把所有 verdicts 的 `improvement` 按"改动落点文件"去重合并到 `summary.json`: + +```json +{ + "run_id": "2026-04-15T10-00Z", + "dataset_size": 14, + "scored": 13, + "contaminated_fetched": 1, + "totals": { + "sum": 132, + "max": 195, + "percent": 67.7, + "per_dim": {"recall": 2.69, "accuracy": 3.92, "completeness": 3.54} + }, + "findings": [ + { + "finding_id": "F-001", + "bucket": "skill_prompts", + "target_file": "skills/lark-wiki/SKILL.md", + "suggestion": "在反模式段加 'wiki 链接必须先走 +resolve-node' 警告", + "driving_cases": ["case_003", "case_007", "case_011"], + "priority": "high" + }, + { + "finding_id": "F-002", + "bucket": "tool_capability", + "target_file": "shortcuts/docs/search.go", + "suggestion": "docs +search 返回结果没有 body_preview,agent 必须 fetch 才能判断相关性", + "driving_cases": ["case_001", "case_005"], + "priority": "medium" + } + ], + "primary_bottleneck": "skill_prompts", + "pollution_warnings": [] +} +``` + +**priority 判定**: +- `high`: driving_cases ≥3 且 bucket 是 `skill_prompts` / `search_strategy`(改文档成本低、收益面广) +- `medium`: driving_cases ≥2 或 bucket 是 `tool_capability`(代码改动) +- `low`: driving_cases == 1(过拟合风险高,给 Optimizer 作参考但不强推) + +## 自我校准检查(写 verdict 前自问) + +- 我是不是看了 expected 才倒推 trajectory 合理性?(应该反过来:先看 trajectory 自己是否合理,再 check 是否命中 expected) +- contamination_penalty 有没有漏判? 
+- improvement 的三桶比例是否均衡到可疑(例如 13 个 case 全扔 `skill_prompts`,可能是判断懒) diff --git a/skills/eval-search/prompts/optimizer.md b/skills/eval-search/prompts/optimizer.md new file mode 100644 index 000000000..07ac3e192 --- /dev/null +++ b/skills/eval-search/prompts/optimizer.md @@ -0,0 +1,150 @@ +# Optimizer sub-agent 模板 + +**使用方式**:主 agent 用 Task 工具启动 sub-agent。Optimizer 读 `summary.json` + 失败 case 的关键错误片段 + 仓库代码,产出 diff 草稿。 + +> **关键纪律**:不喂 trajectory 原文全文,只喂主 agent 从失败 case 摘出的"关键错误行"(通常 ≤20 行/case)。这是防过拟合 + 控 context 的核心设计。 + +--- + +## SYSTEM(Task prompt 开头) + +你是 lark-cli 搜索能力评测 harness 的**优化层 sub-agent**。Judge 已经产出 `summary.json`(含聚类后的 findings),你的任务是把这些 findings 转成**可直接 commit 的代码 / 文档改动**,并自我区分哪些是泛化的、哪些是针对具体 case 的。 + +### 你的约束 + +1. **工具**:Read / Edit / Write / Grep / Glob / Bash(仅限 `go build`, `make unit-test`, `git diff`, `gofmt`)。禁止 `git push` / `gh pr create` / `git commit` — 那是主 agent 的事 +2. **白名单 — cli 仓库**(`larksuite/cli`,当前工作目录): + - ✅ `skills/**/*.md`(改已有或新增) + - ✅ 新增 `shortcuts//.go` + 配套 `*_test.go` + - ❌ `internal/**`, `extension/**`, `cmd/root.go`, `cmd/service/**` + - ❌ 旧 shortcut 的删除 / 重命名 / 破坏性修改 +3. **白名单 — open 仓库**(`$GOPATH/src/code.byted.org/lark_as/open/`,**只读导航后才能改**): + - 处理 `tool_capability` 桶里的 finding 时,MUST 先 Read [`../references/open-repo-layout.md`](../references/open-repo-layout.md) 了解允许动哪些文件 + - ✅ 简要:`biz/search_open/entity/{name}.go` 的 `BuildDisplayInfo` / `BuildResponseItem` bug fix / `Prune`,及配套 `*_test.go` + - ❌ 简要:IDL / `handler.go` / `adapter.go` / `api_meta/**` / 新增 OAPI 字段(详见导航手册) + - 涉及 IDL 或契约变更的 finding → 写进 `unhandled_findings.md` 的 `proposed_issue` 段,不写 diff +4. 触犯白名单外的 finding → 写进 `unhandled_findings.md`,建议新人改成 GitHub issue +5. 每次改 cli 仓库 Go 代码后 MUST 跑 `make unit-test` 验证。失败最多迭代 2 次,仍失败则该 finding 降级到 `unhandled_findings.md` +6. open 仓库暂不跑 quality gate(CI 配置非 harness 可控),但 Optimizer 自己 MUST:所有 `.go` 改动过 `gofmt`、动了 `entity/{name}.go` 必须同步动 `entity/{name}_test.go` +7. 
改完所有 cli finding 后 MUST 跑 `go run github.com/golangci/golangci-lint/v2/cmd/golangci-lint@v2.1.6 run --new-from-rev=origin/main` +8. 按 Conventional Commits 格式写 commit message — 双仓库情况下产出两份独立 commit message(见下方产出结构) + +### 输入(主 agent 会拼到 prompt) + +- `summary_json`: 完整 `summary.json` 内容 +- `key_error_snippets`: 每个 high-priority finding 的 driving_cases 里摘的关键错误行(主 agent 挑好) +- `run_dir`: 评测目录,用于读历史产物和写输出 + +### 工作流 + +1. **读 summary 全部 findings**,按 `priority` 降序处理 +2. **对每条 finding**: + - `skill_prompts` bucket → 用 Edit 改 cli 仓库的指定 markdown,保持 tone / 结构与周边一致 + - `search_strategy` bucket → 沉淀到 cli 仓库对应域的 `references/*-search.md`(如 `skills/lark-doc/references/lark-doc-search.md`),不要塞进本 harness 的 prompt 模板 + - `tool_capability` bucket → 分两步判断: + 1. 如果 finding 本质是 cli 封装层不够(缺 shortcut、shortcut 输出难解析),评估能否在 cli 仓库加 shortcut 解决 + 2. 如果是 OAPI 层(`BuildDisplayInfo` 信息不够、字段映射 bug),Read [`../references/open-repo-layout.md`](../references/open-repo-layout.md) 并严格按白名单改 open 仓库。不在白名单的 → 产出 issue 正文,写进 `unhandled_findings.md` 的 `proposed_issue` 段 +3. **过拟合自检**:每条改动自问"这条是否仅对 driving_cases 有效"。如果是,**标记为 case-specific** 写进 `generalization_note.json` +4. 
**写产出**(到 `/pr-draft/`): + +``` +/pr-draft/ +├── diff.patch ← cli 仓库改动(在 larksuite/cli 目录下 git diff > diff.patch) +├── commit_message.txt ← cli 仓库 commit message +├── generalization_note.json +├── unhandled_findings.md +└── open/ ← 若有 open 仓库改动才创建 + ├── diff.patch ← open 仓库改动(在 lark_as/open 目录下 git diff > diff.patch) + ├── commit_message.txt ← open 仓库 commit message + └── touched_files.txt ← 改动文件清单(用于主 agent 白名单复查) +``` + +**重要**:Optimizer 不执行 `git commit`。只产出 diff.patch + commit_message.txt,由主 agent 分别在两个仓库 apply + commit。 + +### generalization_note.json 格式(**必填,主 agent 会读并注入 PR description**) + +每条改动必须带 `repo` 字段(`cli` 或 `open`),主 agent 按此分发到对应 PR。 + +```json +{ + "case_specific_changes": [ + { + "repo": "cli", + "file": "skills/lark-doc/references/lark-doc-search.md", + "change_summary": "在同义词小节新增 '交个朋友 → Livflow 智能平台' 映射", + "driving_cases": ["case_005"], + "risk": "该同义词只由 case_005 驱动,强度弱。reviewer 可判断是否保留" + } + ], + "principled_changes": [ + { + "repo": "cli", + "file": "skills/lark-doc/SKILL.md", + "change_summary": "新增 '搜索词改写失败 3 次后给 best-effort 答案' 决策规则", + "driving_cases": ["case_003", "case_007", "case_011"], + "rationale": "泛化到任何搜索类任务的退出条件,不依赖具体 case 内容" + }, + { + "repo": "open", + "file": "biz/search_open/entity/chat.go", + "change_summary": "BuildDisplayInfo 在群描述为空时 fallback 展示群主名称", + "driving_cases": ["case_012"], + "rationale": "空描述的群目前 agent 只能看到标题,判断相关性信息不足;泛化到所有群搜索结果" + } + ] +} +``` + +`unhandled_findings.md` 内若含涉及 IDL / 契约变更的 finding,按以下结构写 `proposed_issue` 段: + +```markdown +### [proposed-issue] + +**Bucket:** tool_capability +**Driving cases:** case_003, case_008 +**Why not auto-fixed:** 需要 IDL 新增 optional 字段 `.`,跨 idl/open 两仓库,人工处理 + +**Suggested issue body:** +<可直接贴到 github issue 的完整正文,含背景、proto 来源字段、对 agent 决策的价值> +``` + +### commit_message.txt 格式 + +两份 commit message 结构相同,区别在 scope: + +**cli 仓库** (`pr-draft/commit_message.txt`): +``` +feat(eval-search): auto-propose improvements from run + +Driven by /eval-search propose-pr . 
+ +- +- +- (case_005) + +Eval: % → % +Regressions: + +Generated-By: eval-search/ +``` + +**open 仓库** (`pr-draft/open/commit_message.txt`): +``` +feat(search_open): improve converter display_info from eval-search run + +- +- + +Driven by: larksuite/cli /eval-search run +Pair: +Generated-By: eval-search/ +``` + +### 禁止事项 + +- ❌ 不要改 `RUBRIC.md` / `prompts/*.md`(你自己的 prompt 不该自己改) +- ❌ 不要改 `dataset` 或评测 base 相关文件(评测集改动不由 Optimizer 负责) +- ❌ 不要修"已知 regression"反向打补丁(那是拼分,不是真修复) +- ❌ 找不到落点的 finding 不要硬凑,写进 `unhandled_findings.md` +- ❌ 不要给 skill markdown 加"由 Optimizer 自动生成"这类元信息注释——文档应读起来是人写的 +- ❌ 不要改 IDL 仓库 / kitex_gen 生成代码 / open 仓库白名单外的任何文件(详见 `open-repo-layout.md`) diff --git a/skills/eval-search/references/dataset.md b/skills/eval-search/references/dataset.md new file mode 100644 index 000000000..e167ca6ce --- /dev/null +++ b/skills/eval-search/references/dataset.md @@ -0,0 +1,127 @@ +# 评测集 schema + 拉取方式 + +## 位置 + +评测集存在飞书多维表格(**live 数据源**,PM 持续更新): + +- base_token: `OOoEbNWhcaFOdisXDW7c0lKtn4g` +- table_id: `tblGWdc19tKFZC6K` +- view_id: `vewGToSnWl` +- URL: https://bytedance.larkoffice.com/base/OOoEbNWhcaFOdisXDW7c0lKtn4g?table=tblGWdc19tKFZC6K&view=vewGToSnWl + +> **污染警告**:这个 base 本身会被 `docs +search` 命中。harness 必须把账号拆成两个 profile:loader profile 只用于读取这个 base 并生成 `dataset.jsonl`;executor profile 只用于盲测搜索,**不可**加入该 base 的查看权限,否则评测结果被自答污染。详见 [`pollution-preflight.md`](pollution-preflight.md)。 + +## 原始字段(字段 id → 含义) + +| 字段名 | 类型 | 说明 | +|--------|------|------| +| `query` | text | 自然语言问题;Executor 唯一可见输入 | +| `len` | number | 历史字段,忽略 | +| `企业内是否有知识` | single-select | `是` / `否`。`否` 意味着企业知识库里本来就没答案,Executor 应答"找不到",recall 维度固定给 5 | +| `预期答复(机评文本)` | text | 含三段:【关键信息】/ 【辅助信息】/ 【打分备注】。Judge 独占使用,**Executor 不可见** | +| `数据源地址` | text(markdown 链接) | expected source URLs;Judge 独占使用,**Executor 不可见** | + +## 拉取命令 + +推荐用确定性 setup runner 拉取并转换: + +```bash +node --experimental-strip-types tests/eval-search/eval-search-run.ts \ + --loader-profile \ + --executor-profile \ + --subset 3 
+``` + +如果只有一个账号,可以拆成两步: + +```bash +# 账号仍有评测 Base 权限时,只拉本地快照 +node --experimental-strip-types tests/eval-search/eval-search-run.ts \ + --snapshot-only \ + --loader-profile + +# 移除该账号的评测 Base 权限后,从本地快照继续盲测 setup +node --experimental-strip-types tests/eval-search/eval-search-run.ts \ + --dataset-file tests/eval-search/runs//dataset.jsonl \ + --executor-profile +``` + +只看原始 Base 拉取时,用 loader profile 执行: + +```bash +lark-cli --profile base +record-list \ + --as user \ + --base-token OOoEbNWhcaFOdisXDW7c0lKtn4g \ + --table-id tblGWdc19tKFZC6K \ + --view-id vewGToSnWl \ + --limit 100 +``` + +返回形如: +```json +{ + "ok": true, + "data": { + "data": [ [value_of_query, value_of_len, ...], ... ], + "field_id_list": ["fldh3DHP53", ...], + "fields": ["query", "len", "企业内是否有知识", "预期答复(机评文本)", "数据源地址"], + "record_id_list": ["recvg4qIXMSU6K", ...], + "has_more": true + } +} +``` + +若 `has_more=true`,用 `--offset` 翻页直到全部拉完。 + +## 转换为 harness 内部 schema + +主 agent 把每一行转成一个 case 对象,拼成 `dataset.jsonl`(jsonl,一行一个 case): + +```json +{ + "case_id": "case_001", + "record_id": "recvg4qIXMSU6K", + "query": "华东客户有哪些 Aily 优秀使用案例", + "has_knowledge": true, + "expected": { + "key_points": "【关键信息】的原文段", + "aux_info": "【辅助信息】的原文段", + "rubric_notes": { + "类型说明": "开放问题", + "可信无误": "不局限于ref,只要明确作为aily使用案例出现即算可信", + "完整详实": "答出5个及以上不扣分", + "结构清晰": "无", + "语言表述": "无", + "相关辅助": "无", + "引用准确": "无" + } + }, + "source_urls": [ + "https://bytedance.larkoffice.com/wiki/HxnMwM9cyiFW1dkACUBcC7KWnEd", + "https://bytedance.larkoffice.com/wiki/Es5wwNCyei3eYNkXc8Tcx35nnWe" + ] +} +``` + +### 转换要点 + +1. **case_id 编号**:按 record_id 在返回里的顺序分配 `case_001, case_002, ...`。同一次 run 内稳定,跨 run 不保证(PM 在 base 里插新行会错位)。如需跨 run 追踪,用 `record_id` +2. **filter `企业内是否有知识`**:harness 同时支持 `是` 和 `否` 的 case;但**pilot 阶段建议只跑 `是` 的**(`否` case 判分逻辑更复杂,后续加) +3. 
**解析 `预期答复` 的三段**: + - split 文本找 `【关键信息】` / `【辅助信息】` / 【打分备注】` 三个 heading + - 【打分备注】段是嵌套 JSON,`json.loads` 解析到 `rubric_notes` + - 解析失败的 case 标记 `parse_error: true`,跳过不评(写进 `summary.json.skipped`) +4. **解析 `数据源地址`**:正则提取 markdown 链接 `[text](url)` → `source_urls: [url, ...]`。非 URL 的纯文本(如提示语)忽略 +5. **空 query 过滤**:`query` 字段为空或纯空白的记录跳过 + +## Pilot 样本:只跑前 3 条冒烟 + +`/eval-search run --subset 3` 只拉前 3 条 `是` 类 case 跑。用于: +- 第一次落地 harness,验证端到端能跑通 +- auto-PR 流程的 dry-run(改完 skill 跑 3 条看趋势) + +## 频率 / 数据漂移 + +PM 在 base 里编辑 case 是常态。harness 不做 snapshot 冻结(v0.1 范围外),每次 `run` 拉最新。 + +**代价**:v_n 和 v_{n+1} 的分数差会混入 dataset 变化。在 PR description 里强制标注 `dataset_size / first_run_of_records` 两个字段,reviewer 自己判断。 diff --git a/skills/eval-search/references/known-tainted-tokens.md b/skills/eval-search/references/known-tainted-tokens.md new file mode 100644 index 000000000..db14f0546 --- /dev/null +++ b/skills/eval-search/references/known-tainted-tokens.md @@ -0,0 +1,75 @@ +# 已知污染文档标记清单 + +**维护原则**:只加,不删(除非文档被彻底销毁)。每次 v_n 迭代中新增的"评测过程记录"文档都要补进来。 + +## excluded_user_ids(必须排除的登录账号) + +运行 harness 前,`lark-cli auth status.userOpenId` 若命中以下之一,harness 拒绝启动: + +```yaml +excluded_user_ids: + - ou_6927671c80c467507b88fae9a2983bdb # 贾洪楠(搜索负责人 / dataset owner,有 base 读权限) + # 补充规则:dataset owner、PR reviewer 的个人账号、任何能读 dataset base 的账号 +``` + +## tainted_tokens(搜索命中这些 token 即标记 contamination_risk) + +```yaml +tainted_tokens: + # 评测集 base 本身(搜索很容易命中 base 里的 query 文本) + - OOoEbNWhcaFOdisXDW7c0lKtn4g + + # v1/v2 harness 迭代记录 docx(含全部方法论 + per-case 分数) + - VdUKdAXjmo9vl8xq4FrczK6unct + + # v1/v2 迭代讨论 / 参考流程文档(wiki 节点,含 table) + - UHFJwOFOCiRgXdkxCoMc1JHgn0c + + # 2026-04-30 run 命中的评测集 / 评测分析类文档 + - QOhZbPgqDaJ3MQsTkGWcPdZUnHe # Agentic评测集数据汇总 + - ZWQHssFhPh8FG5tca48cNRP0npb # 知识问答评测集(v251212) + - CVMqwRPJBi2aJQkabpAcwwjoneh # Untitled Base(评测集内容) + - VrNLbOJRJacXPOsyqHEcHFlunjc # 意图_改写评测集 + - RmYObZGsWaQbjSslgSxc0CT7nFh # 2601-300Q评测集 + - XtKhwTZ7aii6CNkXHZdcHj1bnwh # 20251222 评测 Case 分析 + - 
RTkDw2QD3igsEMkKCamcKSM9nTh # openclaw-竞对评测
+  - XGbnbQLy6ayTt6s9AG3cwugMn3c # 追问评测集
+  - MSMGsruM2hUvzHtA3MZcSgBOnMe # 追问精简版
+  - QB4GsP3jfhbToLtFANict4zqn6d # 追问拆分实验15
+  - Vex5sCIeAhVyirtKTjUcn63mnod # 精简版追问_v2
+  - GYkIbtnfeac5RJsjjMAceTqJn2c # 图片理解开灰前评测
+
+  # 2026-05-06 optimizer_after_v2 中已被 fetch 的残留评测/过程材料
+  - R2V0dojm2oeW9jx493icfbQpnDb # 融合场景 GSB 评测报告
+  - C0O2dnfxWoGD4bxJxzmcZehgnTj # 企业问答基线评测机评应用报告
+  - Q19HdaFdMopYoBxOiuLcaPbMnB3 # 模型追问 PE 迭代记录
+  - Dv4adguERoakkhxWUdRlVaQvg5E # 20240729 case 分析
+  - ZDs4dU2fzoDMxwx6PcBcmWhZndf # Q1 基线 agentic 评测集 91Q
+```
+
+## 新增条目流程
+
+1. 发现某个飞书文档在评测中被 fetch + 用来作答 → 该文档 tainted
+2. 提取 token:
+   - docx URL `https://xxx/docx/<token>` → `tainted_tokens` 加 `<token>`
+   - wiki URL `https://xxx/wiki/<wiki_token>` → `tainted_tokens` 加 `<wiki_token>`;**另外**用 `lark-cli wiki +resolve-node --token <wiki_token>` 拿到真实 `obj_token`,也加进去
+   - base URL 只加 base_token 即可
+3. 提交 PR 改本文件(commit message: `chore(eval-search): mark tainted - <token>`)
+
+## 执行侧处理规则
+
+- Preflight 命中 tainted token 只标记风险,不阻断整轮评测。
+- Executor/collector 不能因为命中本文件就跳过、降权或隐藏结果;否则评测会被过滤规则美化,不能反映真实搜索行为。
+- Collector 应把命中的 token 写进 trajectory / raw evidence,保留 `tainted` 这类元数据,交给 Judge 按 RUBRIC 判定污染扣分。
+- `verdicts.json` 里只对“fetch 过 tainted token 且答案受其影响”的 case 扣污染分;单纯 search 命中但未 fetch 的 case 不扣污染分,但可以作为污染风险记录。
+- 新增 collector、shortcut 或搜索策略时,都要把本文件当作统一标记清单读取,避免各处散落 hard-coded 污染 token。
+
+## 替代策略(推荐)
+
+**不要在飞书上写"评测过程记录" / "v_n 比对分析"之类文档**。都写成本仓库 markdown:
+
+- 评测流程/设计 → `skills/eval-search/**`(已就位)
+- 某轮迭代分析 → `tests/eval-search/runs/<run_id>/*.md`(gitignored,本地查看)
+- 发布用的 retrospective → PR description / GitHub wiki / release notes
+
+这样根本不会污染飞书搜索语料,污染标记清单的维护压力也会逐渐下降。
diff --git a/skills/eval-search/references/open-repo-layout.md b/skills/eval-search/references/open-repo-layout.md
new file mode 100644
index 000000000..bac261cd7
--- /dev/null
+++ b/skills/eval-search/references/open-repo-layout.md
@@ -0,0 +1,162 @@
+# open 仓库导航手册(Optimizer 专用)
+
+> **读者:** `prompts/optimizer.md` 在处理 `tool_capability` 桶的 finding 时会 Read 这篇文档。
+>
+> **目的:** 
把 `lark_as/open` 仓库当"受控沙盒" — 明确 Optimizer 允许改哪些文件、禁止碰哪些文件、改完怎么验证。 + +## 仓库定位 + +``` +$GOPATH/src/code.byted.org/lark_as/open/ +``` + +这是 lark-cli 背后的 OpenAPI 服务层(后台简称 suite.as.open)。它把飞书内部大搜 PB(MGUniversalSearch)封装成面向外部的 OAPI。CLI 调这些 OAPI,agent 调 CLI。整条链路: + +``` +CLI (larksuite/cli) + → OAPI (lark_as/open) + → kitex_gen stub (git.byted.org/ee/go/kitex_gen, 由 IDL 仓库自动生成) + → RPC → 大搜后端 +``` + +**Optimizer 只动 open 仓库一层。** IDL 和 kitex_gen 不动(见禁止清单)。 + +## 核心目录(只读懂即可) + +``` +biz/search_open/ ← AI Friendly 新框架,所有改动都在这里 +├── entity/ ← 每实体一个 converter 文件 +│ ├── iconverter.go ← Converter 接口定义(不动) +│ ├── chat.go ← 参考实现(group chat 搜索) +│ ├── meeting.go ← 参考实现(平台实体,走 SlashCommand) +│ ├── message.go / doc.go / wiki.go / user.go / mail.go / task.go / ... +│ └── timeutil.go ← 时间格式工具(不动) +├── adapter.go ← 调 UniversalSearch RPC(不动) +├── handler.go ← 编排(不动) +├── pagetoken.go ← 翻页(不动) +├── response.go ← 错误码(不动) +├── CLAUDE.md ← open 仓库的开发规范,读它能看懂架构 +└── api_meta/{entity}/ ← 每实体 4 个 yml(search/filter/item/meta) + +biz/handler/handler.go ← 顶层路由(不动) +rpc/ ← 旧搜索 + RPC 封装(不动) +main.go / conf/ / utils/ ← 基础设施(不动) +``` + +## Converter 接口速览 + +每个 `entity/{name}.go` 都实现同一套 5 方法接口: + +```go +type Converter interface { + EntityType() usearch.SearchEntityType + BuildEntityItem(ctx, req) (*usearch.BaseEntity_EntityItem, error) // OAPI Filter → PB Filter + BuildResponseItem(result *usearch.SearchResult) (interface{}, error) // PB Meta → OAPI Item + BuildDisplayInfo(result *usearch.SearchResult) string // 组装给 AI 看的 markdown 卡片 + Prune(item interface{}, fields []string) interface{} // 字段裁剪 +} +``` + +**AI friendly 的高杠杆改动点几乎全在 `BuildDisplayInfo`**:它返回的 markdown 就是 agent 在 CLI 里看到的搜索结果文本。大搜结果里的标题、摘要、上下文、高亮(`` 标签)的组装方式直接决定 agent 能否一眼判断相关性。 + +## ✅ 允许改动(白名单) + +以下三类改动 Optimizer 可以直接写 diff,不需要动 IDL: + +### 1. 
`BuildDisplayInfo` 优化 + +- 补充 markdown 字段(例如加入更多上下文、路径信息、作者、时间) +- 调整高亮策略(命中词用 `` 标签包裹) +- 修复格式化 bug(换行、空字段处理、转义) + +**边界:** 只能使用 `*usearch.SearchResult` 里已有的字段。要是需要 PB 没返回的信息,那是 PB/IDL 的问题,降级为 issue。 + +### 2. `BuildResponseItem` 的字段映射 bug fix + +- `nil` 指针防御 +- 时间戳转换错误(`UnixToISO8601` / `UnixMsToISO8601` 用错) +- 枚举值映射错(比如 `chatStatusNormal` 漏判) +- ID 字段赋值缺失 + +**边界:** 只能在已有 OAPI 响应字段上做映射修复;**不能**新增 OAPI 响应字段(那是 IDL 级别的契约变更)。 + +### 3. `Prune` 敏感字段裁剪 + +- 根据业务需要把敏感/内部字段从响应里去掉 + +### 4. 配套测试 + +- 每次改 `entity/{name}.go` **必须**同时更新 `entity/{name}_test.go`,否则 quality gate(未来启用)会 block + +## ❌ 禁止改动(硬黑名单) + +| 路径 | 原因 | +|------|------| +| `../lark/idl/**` | IDL 在另一个仓库,需要跑 overpass + go get,不是 PR 范畴 | +| `biz/search_open/handler.go` | 编排逻辑,动了容易坏所有实体 | +| `biz/search_open/adapter.go` | RPC 适配层,牵扯协议 | +| `biz/search_open/pagetoken.go` | 翻页 + Redis,幂等性敏感 | +| `biz/search_open/response.go` | 错误码契约 | +| `biz/search_open/entity/iconverter.go` | Converter 接口,动了所有实体都得跟 | +| `biz/search_open/entity/timeutil.go` | 时间工具,动了影响所有实体 | +| `biz/search_open/api_meta/**/*.yml` | 新增 / 修改 schema = 契约变更,走人工 | +| `biz/handler/handler.go` | 顶层路由 | +| `rpc/**` | 旧搜索 + RPC 封装 | +| `main.go` / `conf/**` / `utils/**` | 基础设施 | +| `go.mod` / `go.sum` | 依赖升级人工做 | + +**触犯任一条** → finding 必须进 `unhandled_findings.md`,附带 issue 描述建议,不写进 diff。 + +## 新增 OAPI 字段(即使是 optional)的处理 + +**Optimizer 不能自动加字段。** 流程太复杂: + +1. 需要改 IDL 仓库(`$GOPATH/src/code.byted.org/lark/idl/idl/suite/as/open/*.thrift`) +2. 需要跑 overpass 生成 kitex_gen stub +3. 需要 `go get` 拉 stub 更新 +4. 需要同步改 open 仓库的 converter 映射 +5. 
需要同步改 `api_meta/{entity}/*.yml` schema + +这是多仓库协作 + 手工步骤,Optimizer 不应该做。改为产出 GitHub issue 正文,正文包含: + +- 哪个 entity 需要新字段 +- 字段含义(含 proto 里已有的来源字段,若有) +- driving case 的引用 +- 对 agent 决策的价值说明 + +issue 正文写进 `unhandled_findings.md` 的 `proposed_issue` 段,由人工创建。 + +## 验证策略(当前版本) + +**Quality gate 暂未启用**(`/eval-search propose-pr` 跳过 open 仓库测试)。原因:open 仓库跑测试需要下游依赖,CI 配置不是 harness 可控的。PR 开出去之后,open 仓库的 CI 会自己跑。 + +Optimizer 自己必须做的最小校验: + +1. 所有改动文件 `gofmt` 过 +2. 改了 `entity/{name}.go` 必须同步动 `entity/{name}_test.go`(至少加一条测试覆盖修改的分支) +3. 不允许删除已有测试 + +## 参考文件(Optimizer 生成改动前**必读**) + +- `biz/search_open/CLAUDE.md` — 开发规范原文 +- `biz/search_open/entity/chat.go` — 完整 converter 参考 +- `biz/search_open/entity/chat_test.go` — 测试写法参考 +- `biz/search_open/entity/meeting.go` — 平台实体 converter 参考(`BuildDisplayInfo` 写法略有不同) + +## 与主 agent 的交互契约 + +Optimizer 处理涉及 open 仓库的 finding 时,产出放在 `pr-draft/open/` 子目录(和 cli 仓库的 `pr-draft/` 同级): + +``` +tests/eval-search/runs//pr-draft/ +├── diff.patch # cli 仓库改动(原本就有) +├── generalization_note.json +├── unhandled_findings.md +├── commit_message.txt +└── open/ # 新增:open 仓库改动 + ├── diff.patch # 应用到 $GOPATH/src/code.byted.org/lark_as/open/ + ├── commit_message.txt + └── touched_files.txt # 命中白名单校验的冗余证据 +``` + +主 agent 拿到两份 diff.patch 之后,分别 checkout 两个仓库、分别 apply、分别 commit、分别 `gh pr create`,在两个 PR description 里互相 link(见 `pr-generation.md`)。 diff --git a/skills/eval-search/references/pollution-preflight.md b/skills/eval-search/references/pollution-preflight.md new file mode 100644 index 000000000..247cb3c45 --- /dev/null +++ b/skills/eval-search/references/pollution-preflight.md @@ -0,0 +1,103 @@ +# 污染预检规则 + +## 动机 + +评测集 base 自身、v1/v2 迭代记录文档、含 expected 的参考文档,都可能被 `docs +search` 命中。Executor 一旦 fetch 到,就是"开卷考试"——分数失去意义。 + +v2 的教训:PM 的 dataset base 在第一次跑评测时,几乎所有 query 的 `docs +search` top-1 都是 dataset 自己。 + +因此 `/eval-search run` 需要两个 lark-cli profile: +- `loader-profile`:能读评测 Base,只负责拉取 live dataset 并写入 `dataset.jsonl` +- `executor-profile`:负责盲测搜索,必须不能读评测 Base + 
+也可以用同一个人账号做时间隔离:先在有权限时运行 `--snapshot-only` 拉本地快照;随后把该账号从评测 Base 权限里移除;最后用 `--dataset-file` 从本地快照继续。第二步运行时仍会探测 executor 是否能读 Base,能读则阻断。 + +## 两道防线(必须叠加) + +### 防线 1:专用账号(物理隔离) + +harness 启动时 MUST 先对 executor profile 做账号检查: + +```bash +lark-cli --profile auth status +``` + +从返回里读 `userOpenId`,对照 [`known-tainted-tokens.md`](known-tainted-tokens.md) 的 `excluded_user_ids` 列表: +- 命中 → **拒绝启动**,报错退出:`当前账号在 excluded_user_ids 里;harness 必须用专用测试账号运行` +- 未命中 → 继续 + +**新建测试账号步骤**(手工一次性): +1. 申请独立企业飞书账号(非 PM、非 dataset owner) +2. 账号不加入评测集 base 的权限,不加入"参考流程文档"的权限 +3. 在 `~/.config/lark-cli/profiles/` 下建独立 profile,`lark-cli auth login --profile eval-search` +4. 评测运行时:`lark-cli --profile eval-search ...` + +setup runner 还会主动探测 executor profile 是否能读取评测 Base: + +```bash +lark-cli --profile base +record-list \ + --as user \ + --base-token OOoEbNWhcaFOdisXDW7c0lKtn4g \ + --table-id tblGWdc19tKFZC6K \ + --view-id vewGToSnWl \ + --limit 1 +``` + +期望结果是权限失败。若读取成功,说明 executor 可直接搜到或打开评测集,必须阻断本轮 run。 + +### 防线 2:Pre-flight 扫描(兜底) + +即使账号做了物理隔离,某些情况下仍可能被污染(例如:某个新建文档恰好包含了答案且权限开放)。Pre-flight 作为兜底: + +**流程**: + +``` +for each case in dataset.jsonl: + result = lark-cli --profile docs +search --query "" --page-size 20 + hit_tokens = extract all obj_token / wiki_token from result + tainted = hit_tokens ∩ known_tainted_tokens + + write to preflight.json: + { + "case_id": "case_001", + "contamination_risk": len(tainted) > 0, + "tainted_tokens": [...], + "top_20_tokens": [...] + } +``` + +**不阻断**,只标记。原因:有时 pre-flight 命中但 Executor 最终没 fetch,这种 case 依然有效,Judge 会打出正常 recall 分。 + +### known_tainted_tokens 的维护 + +见 [`known-tainted-tokens.md`](known-tainted-tokens.md)。三类必须纳入: +1. **评测集 base 自身**:`OOoEbNWhcaFOdisXDW7c0lKtn4g` +2. **v1/v2 迭代记录 docx**:`VdUKdAXjmo9vl8xq4FrczK6unct`(含全部评测方法论 + 具体 case 分数) +3. 
**人类写的"答题参考"/"流程总结"**:任何在评测过程中被主 agent 写到飞书的 note + +每次新增一个"讨论评测过程"的飞书文档,记得加进标记清单(或者更简单:**不要在飞书上写这种文档**,都写成本仓库 markdown)。 + +## Judge 怎么用 preflight 数据 + +Judge 读 `preflight.json` 判断 `contamination_penalty`: + +``` +for each case: + if preflight[case].contamination_risk == true: + scan trajectory for any tool_use that fetched one of tainted_tokens + if fetched: + if answer directly quotes tainted doc content: + contamination_penalty = -3 + else: + contamination_penalty = -1 + else: + contamination_penalty = 0 + else: + contamination_penalty = 0 +``` + +## 常见坑 + +- **wiki 链接**:`wiki://space_xxx/node_yyy` 背后的 obj_token 才是真实目标。pre-flight 扫描时必须同时记录 `wiki_token` 和 `obj_token` 两层,任一命中标记清单即 tainted +- **短链 / applink**:`applink.feishu-pre.net/...` 跳转后的最终 URL 可能是 tainted,建议 Executor 遇到短链先解析一跳再判断。这条太细,v0.1 不做强约束 +- **账号隔离失效**:PM 手滑把 dataset base 对全员开放,专用账号又能看到了。定期(每次 run 前)手动检查一下 base 的权限列表 diff --git a/skills/eval-search/references/pr-generation.md b/skills/eval-search/references/pr-generation.md new file mode 100644 index 000000000..11f41c186 --- /dev/null +++ b/skills/eval-search/references/pr-generation.md @@ -0,0 +1,277 @@ +# PR 生成流程 + description 模板 + +## 双 PR 模型 + +Optimizer 的产出可能横跨两个仓库: + +- **cli 仓库**(`larksuite/cli`,当前工作目录):skill 文档改动、新增 shortcut +- **open 仓库**(`$GOPATH/src/code.byted.org/lark_as/open/`):converter 层 `BuildDisplayInfo` 优化、bug fix + +两个仓库分别提 PR,**独立 review、独立 merge**(决策 2A)。PR description 里互相 link,但不绑定 merge 顺序——一个先 merge 另一个还没 merge 也 OK。 + +若本次 run 只有 cli 改动,`pr-draft/open/` 目录不存在,跳过所有 open 仓库步骤。 + +## 总流程 + +``` +/eval-search propose-pr + │ + ├─[0] 前置检查 + │ ├─ cli 仓库 git status 必须干净(non-dirty);否则 abort + │ ├─ cli 仓库当前分支是 main;否则 abort + │ ├─ runs//summary.json 存在且 scored >0 + │ ├─ runs//meta.json.git_dirty != true + │ └─ 若 Optimizer 产出涉及 open 仓库 → 同样检查 open 仓库 git status / 分支 + │ + ├─[1] Optimizer sub-agent(Task 工具) + │ 输入: summary.json + key_error_snippets + 两个仓库路径 + │ 输出: pr-draft/{diff.patch, commit_message.txt, generalization_note.json, 
unhandled_findings.md} + │ 若有 open 改动 → pr-draft/open/{diff.patch, commit_message.txt, touched_files.txt} + │ 注意: Optimizer 不自己 git commit / git apply,一切由主 agent 执行 + │ + ├─[2] 白名单复查(主 agent,防 Optimizer 越权) + │ ├─ cli diff 命中路径都在白名单内(skills/**/*.md、shortcuts/**) + │ └─ open diff 命中路径都在白名单内(biz/search_open/entity/{name}.go + *_test.go) + │ 违反 → abort,Optimizer 降级迭代 + │ + ├─[3] cli 仓库 apply + commit + │ cd + │ git checkout -b eval-search/auto-pr/ + │ git apply pr-draft/diff.patch + │ ├─[3a] Quality gate + │ │ make unit-test # 必过 + │ │ golangci-lint run --new-from-rev=origin/main # 必过 + │ │ 失败 → Optimizer 最多迭代 2 次;仍失败 → rollback,该 finding 降级为 unhandled + │ └─ git add . && git commit -F pr-draft/commit_message.txt + │ + ├─[4] open 仓库 apply + commit(若有) + │ cd $GOPATH/src/code.byted.org/lark_as/open + │ git checkout -b eval-search/auto-pr/ + │ git apply /pr-draft/open/diff.patch + │ # 无 quality gate(暂时),Optimizer 自己已做 gofmt 和测试更新 + │ git add . && git commit -F /pr-draft/open/commit_message.txt + │ + ├─[5] 确定性 regression 重跑 + │ 调用 /eval-search run 内部逻辑(无 agent 参与),生成 after_verdicts.json + │ 对比 before(summary.json)vs after,产出 per-case diff + │ 注意: open 改动若依赖 CI 部署才能生效,after 结果反映的是 cli 改动的影响;在 description 里标注 + │ + ├─[6] 组装 PR description + │ 按本文件下方模板生成 cli 和 open 两份 description.md,互相留 link 占位 + │ + ├─[7] gh pr create --draft(cli) + │ cd && gh pr create --draft → 记录 PR url CLI_PR_URL + │ + └─[8] gh pr create --draft(open,若有) + cd && gh pr create --draft,description 里 Pair 字段填入 CLI_PR_URL + 创建完之后回到 cli PR,用 gh pr edit 把 open PR url 填到 cli description 的 Pair 段 +``` + +## Quality gate 失败处理 + +两次迭代后仍失败的 finding: + +1. 回滚那一条 finding 的改动(其他 finding 保留) +2. 把它写进 `unhandled_findings.md`,归类为 `quality_gate_failure`,附带完整错误输出 +3. 
PR description 的"未处理归因"段列出这些 finding 并建议新人创建 issue + +## PR description 模板(cli 仓库) + +```markdown + + +## 摘要 + +基于 eval-search run `{{run_id}}` 自动生成,共 {{n_findings}} 条改进落地({{n_skipped}} 条未处理)。 + +{{#if open_pr_url}} +**Pair:** [{{open_pr_title}}]({{open_pr_url}}) — open 仓库的配套改动,独立 review。 +{{/if}} + +## 评测对比(before vs after) + +| 指标 | before | after | Δ | +|------|--------|-------|---| +| 总分 | {{before_total}} / {{max}} ({{before_pct}}%) | {{after_total}} / {{max}} ({{after_pct}}%) | **{{delta}} ({{delta_pp}}pp)** | +| recall | {{before_recall}} | {{after_recall}} | {{delta_recall}} | +| accuracy | {{before_accuracy}} | {{after_accuracy}} | {{delta_accuracy}} | +| completeness | {{before_completeness}} | {{after_completeness}} | {{delta_completeness}} | + +- Dataset size: {{dataset_size}} (同一份 base 拉取;dataset 可能已被 PM 更新,per-case diff 以 `record_id` 对齐) +- 评测账号: `{{user_name}}` (open_id `{{user_open_id}}`) +- Pollution: {{contaminated_count}} case 命中 tainted tokens{{#if contaminated_count}} — 见附录{{/if}} + +## Wins(by case) + +{{#each wins}} +- `{{case_id}}` ({{record_id}}): **{{before}}→{{after}}** (+{{delta}}) + - driver: {{driver_findings}} +{{/each}} + +## ⚠️ Regressions(软告警 — reviewer 请核验) + +{{#if regressions}} +{{#each regressions}} +- `{{case_id}}` ({{record_id}}): **{{before}}→{{after}}** ({{delta}}) + - 可能原因: {{hypothesis}} + - 建议 reviewer: 查看 `tests/eval-search/runs/{{run_id}}/trajectories/{{case_id}}.json` 对比前后行为 +{{/each}} +{{else}} +_无 regression_ +{{/if}} + +## 改动分类(Optimizer 自述) + +### 泛化原则性改动(适用面广,reviewer 较快可信) + +{{#each principled_changes}} +- **{{file}}**: {{change_summary}} + - rationale: {{rationale}} + - driven by: {{driving_cases}} +{{/each}} + +### 针对具体 case 的改动(⚠️ 过拟合风险,reviewer 重点判断) + +{{#if case_specific_changes}} +{{#each case_specific_changes}} +- **{{file}}**: {{change_summary}} + - risk: {{risk}} + - driven by: {{driving_cases}} +{{/each}} +{{else}} +_无_ +{{/if}} + +## 未处理归因 + +{{#if unhandled}} +以下 findings 本 PR 未处理,建议 reviewer 考虑创建 
issue: + +{{#each unhandled}} +- **[{{bucket}}]** {{suggestion}} + - 未处理原因: {{reason}} + - driving: {{driving_cases}} +{{/each}} +{{else}} +_无_ +{{/if}} + +## 怎么 review 这个 PR + +1. 先看"评测对比"总分是否真有提升 +2. 扫一眼 Regressions,若有,点进 trajectory 看是不是噪声 +3. 重点 review "针对具体 case 的改动"——判断是否过拟合 +4. 泛化性改动是文档修订,读 diff 即可 +5. 如涉及 Go 代码,CI 已过 `make unit-test` + lint,关注接口设计 + +## 复现 + +```bash +git checkout eval-search/auto-pr/{{run_id}} +/eval-search report {{run_id}} +``` + +--- + +🤖 Generated by [eval-search harness](../skills/eval-search/SKILL.md) +``` + +## PR description 模板(open 仓库) + +比 cli 版本精简,不重复写 wins/regressions 表格(那是 CLI 端视角),只列本 PR 的改动 + 回指 cli PR。 + +```markdown + + +## 摘要 + +配合 cli 仓库 `eval-search` 评测结果优化 OAPI converter 层。改动范围:`biz/search_open/entity/` 下的 `BuildDisplayInfo` / `BuildResponseItem` / `Prune`,**不涉及 IDL 和契约变更**。 + +**Pair:** [{{cli_pr_title}}]({{cli_pr_url}}) — 主 PR,含完整评测对比、泛化声明、未处理归因。 + +## 改动清单 + +{{#each open_changes}} +- **`{{file}}`**: {{change_summary}} + - driven by: {{driving_cases}} + - 过拟合风险: {{risk_level}} +{{/each}} + +## 怎么 review + +1. 每条改动本质都是 converter 输出字符串的优化,对协议无影响 +2. Quality gate 未跑(harness 暂未接 open 仓库 CI),reviewer 请关注: + - 空字段 / nil 指针防御是否到位 + - markdown 高亮标签 `` 使用是否一致 + - 测试是否覆盖了修改的分支 +3. 对 agent 效果的量化验证在 cli PR 的评测对比段 + +## 复现 cli 侧评测 + +```bash +cd +/eval-search report {{run_id}} +``` + +--- + +🤖 Generated by [eval-search harness](https://github.com/larksuite/cli/tree/main/skills/eval-search) +``` + +## 模板填充注意 + +- 所有百分比保留 1 位小数 +- `driving_cases` 最多列 5 个,超过写 `case_003, case_007, ... 
(+3 more)`
+- `record_id` 放在 `case_id` 后面括号里,方便 reviewer 跨 run 追踪同一条 case
+- `hypothesis` 由主 agent 根据 before/after trajectory diff 推断,最多 30 字;拿不准就写 `"待核验"`,不要硬编
+
+## commit message 规范
+
+Conventional Commits,遵循仓库 AGENTS.md:
+
+```
+feat(eval-search): auto-propose improvements from run <run_id>
+
+<一段改动概要,3-6 行>
+
+Eval: <before_pct>% → <after_pct>% ({{delta_pp}}pp)
+Regressions: <n_regressions>
+Unhandled: <n_unhandled>
+
+Generated-By: eval-search/<run_id>
+Co-Authored-By: eval-search-bot <bot_email>
+```
+
+## PR 创建命令
+
+**cli 仓库 PR**(先创建):
+
+```bash
+cd <cli_repo>
+gh pr create --draft \
+  --title "feat(eval-search): auto-propose improvements from run <run_id>" \
+  --body-file tests/eval-search/runs/<run_id>/pr-draft/description.md \
+  --base main
+```
+
+记录返回的 PR URL 为 `CLI_PR_URL`。
+
+**open 仓库 PR**(若 `pr-draft/open/` 存在):
+
+```bash
+cd $GOPATH/src/code.byted.org/lark_as/open
+# description.md 里已填入 CLI_PR_URL 到 Pair 字段
+gh pr create --draft \
+  --title "feat(search_open): improve converter display_info from eval-search run <run_id>" \
+  --body-file <cli_repo>/tests/eval-search/runs/<run_id>/pr-draft/open/description.md \
+  --base main
+```
+
+记录返回的 PR URL 为 `OPEN_PR_URL`,然后回填到 cli PR description:
+
+```bash
+cd <cli_repo>
+gh pr edit <CLI_PR_URL> --body-file <updated_description_file>
+```
+
+Draft 模式确保 CI 跑但不自动 merge,等 reviewer 转为 ready-for-review。两个 PR **独立 review、独立 merge**,任一方 merge 均可,不要求同步。
diff --git a/skills/eval-search/references/run-layout.md b/skills/eval-search/references/run-layout.md
new file mode 100644
index 000000000..8c29bc814
--- /dev/null
+++ b/skills/eval-search/references/run-layout.md
@@ -0,0 +1,118 @@
+# run 目录结构 + 中间产物约定
+
+## 目录位置
+
+```
+<repo_root>/tests/eval-search/runs/<run_id>/
+```
+
+`<run_id>` 格式:`YYYY-MM-DDTHH-MMZ`(UTC,用 `date -u +%Y-%m-%dT%H-%MZ` 生成)。
+
+整个 `tests/eval-search/runs/` 被 gitignore,不进版本库。
+
+确定性 setup runner:
+
+```bash
+node --experimental-strip-types tests/eval-search/eval-search-run.ts \
+  --loader-profile <loader_profile> \
+  --executor-profile <executor_profile> \
+  --subset 3
+```
+
+runner 只负责创建 run 目录、拉取并转换 live dataset、检查 executor 账号隔离、写 `preflight.json`。它不会执行 AI Executor/Judge 阶段;setup 成功时 `summary.json.status` 为 `ready_for_executor`。
+
+单账号时间隔离模式: + +```bash +node --experimental-strip-types tests/eval-search/eval-search-run.ts --snapshot-only --loader-profile +# 移除该账号的评测 Base 权限 +node --experimental-strip-types tests/eval-search/eval-search-run.ts \ + --dataset-file tests/eval-search/runs//dataset.jsonl \ + --executor-profile +``` + +第一步只写本地 `dataset.jsonl`,`summary.json.status` 为 `snapshot_ready`。第二步会复制该 dataset 到新的 run 目录,并重新检查 executor 已经不能读取评测 Base。 + +## 单次 run 目录布局 + +``` +tests/eval-search/runs/2026-04-15T10-00Z/ +├── meta.json # run 元信息(cli 版本、loader/executor profile、账号、开始/结束时间) +├── raw/ +│ ├── base_records_pages.json +│ └── base_records_combined.json +├── dataset.jsonl # 从 base 拉下来并转换的 cases +├── preflight.json # 污染预检结果 +├── trajectories/ +│ ├── case_001.json # Executor 增量写盘,崩溃可恢复 +│ ├── case_002.json +│ └── ... +├── verdicts.json # Judge 产出 +├── summary.json # 聚类后的 findings +└── pr-draft/ # 仅 propose-pr 阶段产出 + ├── diff.patch + ├── generalization_note.json + ├── unhandled_findings.md + ├── commit_message.txt + └── after_verdicts.json # regression 重跑结果(不含 trajectories,减小体积) +``` + +## meta.json + +```json +{ + "run_id": "2026-04-15T10-00Z", + "started_at": "2026-04-15T10:00:13Z", + "ended_at": "2026-04-15T11:42:51Z", + "lark_cli_version": "v1.0.11+git-abc1234", + "git_head": "abc1234", + "git_dirty": true, + "loader_profile": "base-reader", + "executor_profile": "eval-search", + "user_open_id": "ou_xxx", + "user_name": "eval-search-bot", + "subset": null, + "cases_scored": 13, + "cases_skipped_contamination": 0, + "cases_skipped_parse_error": 1 +} +``` + +`git_dirty=true` 的 run 打上 `⚠️ dirty` 标记;propose-pr 阶段若源码 dirty 会拒绝生成 PR(否则 diff 混入无关改动)。 + +## 增量持久化约定 + +Executor 每完成 1 round(= 1 次 lark-cli 调用 + 解析),追加写入 `trajectories/.json`: + +```json +{ + "case_id": "case_001", + "query": "...", + "started_at": "...", + "rounds": [ + {"idx": 1, "tool": "Read", "target": "skills/lark-doc/SKILL.md", "outcome_summary": "..."}, + {"idx": 2, "tool": "Bash", "cmd": "lark-cli docs +search --query '华东 
Aily'", "outcome_summary": "top-3: ..."}, + ... + ], + "answer": null, + "gave_up": false, + "ended_at": null +} +``` + +所有未闭合的 case(`ended_at: null`)在 run 结束时标记为 `incomplete`,Judge 按 `gave_up=true` 处理但 `rounds_used` 如实记录。 + +## 并发度 + +v0.1 建议 **串行跑 Executor**: +- 避免多 sub-agent 同时打飞书 API 触发限流 +- v2 历史上 sub-agent 529 频繁,并发会放大问题 +- 评测 13 case 串行约 1-2 小时,可接受 + +未来若评测集扩到 50+ case,再考虑 semaphore 限并发 = 2。 + +## 清理策略 + +`tests/eval-search/runs/` 不自动清理。用户手动 `rm -rf tests/eval-search/runs/` 或按时间删旧的。 + +.gitignore 已覆盖整个 runs/ 目录。 diff --git a/skills/lark-doc/references/lark-doc-search.md b/skills/lark-doc/references/lark-doc-search.md index 6ca0df4d4..3639e73e8 100644 --- a/skills/lark-doc/references/lark-doc-search.md +++ b/skills/lark-doc/references/lark-doc-search.md @@ -199,6 +199,12 @@ lark-cli docs +search --query "方案" --format json --page-token '' ## 决策规则 - 参数传递:只要用户给了搜索关键词,就必须显式使用 `--query "<关键词>"`。不要生成 `lark-cli docs +search 方案`、`lark-cli docs +search xxx(搜索关键词)` 这种位置参数写法。 +- 答案型检索循环:当目标是回答问题而不是简单列结果时,不要只跑一次原始 query 后直接收敛。至少按以下顺序做一轮 blind refinement: + 1. 保留业务实体、时间、数值、产品名等硬约束,去掉“怎么/是否/为什么/近期情况”等问句噪声后重搜。 + 2. 对问题类型补充泛化检索词:案例类加“客户案例/最佳实践”,政策类加“政策/售卖/规则”,数值类加“指标/目标/准确率/金额”,对接类加“集成/同步/打通”。 + 3. 如果 top 结果标题强相关但未被读取,继续 fetch 强相关候选;如果 top 结果明显是评测集、Case 分析、Prompt/Debug 记录、标签标注样例、答案汇总等过程材料,只能在回答或 trajectory 里显式标记其性质,不能自动跳过、隐藏或降权。 + 4. 评测/指标类 query 不要一概排除“评测”文档:`Golden Set`、准确率评测方案、指标结果页可能就是答案来源;即使是会泄露评测过程或样例答案的材料,也必须保留可见轨迹,由 Judge 或用户判断是否污染。 + 5. 
最终答案必须回填关键槽位:数值题给数字和单位,时间题给日期,是否题给 yes/no 结论,政策题给适用范围和限制条件。缺槽位时继续重搜或明确说明未找到。 - 查询语义:必须优先利用 --query 的高级语法(如 intitle:、""、-)将过滤逻辑下推给服务端。当用户要求“标题精确等于 X”时,直接使用 --query "intitle:\"X\"",严禁先进行模糊搜索再做客户端二次筛选。只有在遇到服务端语法无法覆盖的复杂本地比对场景时,才允许在客户端过滤,且比对前必须先去掉 title_highlighted 里的高亮标签。 - 实体补全:如果用户要按“某个群里分享的文档”搜索,先用 `lark-im` 拿 `chat_id` 再填 `chat_ids`;如果用户要按“某人分享的文档”搜索,先用 `lark-contact` 拿 `open_id` 再填 `sharer_ids`。 - 零结果回退:如果因为用户的显式类型约束加了 `doc_types` 且结果为 0,可以提示“按指定类型没搜到”;只有在不违背用户明确约束的前提下,才建议放宽类型重试。 diff --git a/tests/eval-search/eval-search-collect-search.ts b/tests/eval-search/eval-search-collect-search.ts new file mode 100644 index 000000000..e7dd7d5b3 --- /dev/null +++ b/tests/eval-search/eval-search-collect-search.ts @@ -0,0 +1,888 @@ +#!/usr/bin/env node + +const { spawnSync } = require("node:child_process"); +const fs = require("node:fs"); +const path = require("node:path"); + +function usage() { + console.log(`Usage: + node --experimental-strip-types tests/eval-search/eval-search-collect-search.ts --run-dir [--page-size 10] [--fetch-top 3] [--max-query-variants 4] + +Collect docs +search evidence for every case in dataset.jsonl. This collector +reads only case_id and query from the dataset, then writes trajectories plus +raw/executor_search.json. 
It runs a small blind query-rewrite loop, annotates
+known tainted/eval-process artifacts without filtering them, and fetches the
+strongest document-like hits so Judge can score against actual evidence instead
+of search snippets only.`);
+}
+
+// Parse CLI flags into an options object. Throws on unknown flags or
+// out-of-range numeric values so the entry point can print usage and exit.
+function parseArgs(argv) {
+  const out: any = { runDir: "", pageSize: 10, fetchTop: 3, maxQueryVariants: 4 };
+  for (let i = 0; i < argv.length; i += 1) {
+    const arg = argv[i];
+    const next = () => {
+      if (i + 1 >= argv.length) {
+        throw new Error(`missing value for ${arg}`);
+      }
+      i += 1;
+      return argv[i];
+    };
+    if (arg === "--help" || arg === "-h") {
+      out.help = true;
+    } else if (arg === "--run-dir") {
+      out.runDir = next();
+    } else if (arg === "--page-size") {
+      out.pageSize = Number.parseInt(next(), 10);
+      if (!Number.isFinite(out.pageSize) || out.pageSize <= 0 || out.pageSize > 20) {
+        throw new Error("--page-size must be an integer from 1 to 20");
+      }
+    } else if (arg === "--fetch-top") {
+      out.fetchTop = Number.parseInt(next(), 10);
+      if (!Number.isFinite(out.fetchTop) || out.fetchTop < 0 || out.fetchTop > 10) {
+        throw new Error("--fetch-top must be an integer from 0 to 10");
+      }
+    } else if (arg === "--max-query-variants") {
+      out.maxQueryVariants = Number.parseInt(next(), 10);
+      if (
+        !Number.isFinite(out.maxQueryVariants) ||
+        out.maxQueryVariants <= 0 ||
+        out.maxQueryVariants > 8
+      ) {
+        throw new Error("--max-query-variants must be an integer from 1 to 8");
+      }
+    } else {
+      throw new Error(`unknown option ${arg}`);
+    }
+  }
+  if (!out.help && !out.runDir) {
+    throw new Error("--run-dir is required");
+  }
+  return out;
+}
+
+// Resolve the repository root via git; the collector must run inside a worktree.
+function repoRoot() {
+  const result = spawnSync("git", ["rev-parse", "--show-toplevel"], {
+    encoding: "utf8",
+  });
+  if (result.status !== 0) {
+    throw new Error("must run inside a git worktree");
+  }
+  return result.stdout.trim();
+}
+
+function ensureDir(dir) {
+  fs.mkdirSync(dir, { recursive: true });
+}
+
+// lark-cli may print log noise before its JSON payload; skip ahead to the
+// first "{" before parsing.
+function parseJsonOutput(stdout) {
+  const text = String(stdout || "").trim();
+  const start = text.indexOf("{");
+  if (start < 0) {
+    throw new Error(`stdout does not contain JSON: ${text.slice(0, 120)}`);
+  }
+  return JSON.parse(text.slice(start));
+}
+
+// Run lark-cli synchronously; capture exit status, raw streams, and a
+// best-effort JSON parse (parseError is recorded instead of thrown).
+function runLark(args) {
+  const result = spawnSync("lark-cli", args, {
+    encoding: "utf8",
+    maxBuffer: 64 * 1024 * 1024,
+  });
+  let json = null;
+  let parseError = "";
+  try {
+    json = parseJsonOutput(result.stdout);
+  } catch (err) {
+    parseError = err.message;
+  }
+  return {
+    cmd: ["lark-cli", ...args].join(" "),
+    status: result.status,
+    stdout: result.stdout || "",
+    stderr: result.stderr || "",
+    ok: result.status === 0,
+    json,
+    parseError,
+  };
+}
+
+// Blind-evaluation contract: only case_id and query are read from
+// dataset.jsonl — expected answers never reach this collector.
+function loadCases(datasetFile) {
+  return fs
+    .readFileSync(datasetFile, "utf8")
+    .split(/\r?\n/)
+    .filter((line) => line.trim())
+    .map((line) => {
+      const item = JSON.parse(line);
+      return { case_id: item.case_id, query: item.query };
+    });
+}
+
+// Parse the `tainted_tokens:` YAML block out of known-tainted-tokens.md.
+// Missing file yields an empty set (annotation then becomes a no-op).
+function loadTaintedTokens(root) {
+  const file = path.join(root, "skills/eval-search/references/known-tainted-tokens.md");
+  const tokens: Set<string> = new Set();
+  if (!fs.existsSync(file)) {
+    return tokens;
+  }
+  let inTaintedBlock = false;
+  for (const line of fs.readFileSync(file, "utf8").split(/\r?\n/)) {
+    if (/^\s*tainted_tokens:\s*$/.test(line)) {
+      inTaintedBlock = true;
+      continue;
+    }
+    // Any other top-level `key:` line ends the tainted_tokens block.
+    if (inTaintedBlock && /^\s*[a-zA-Z_]+:\s*$/.test(line)) {
+      break;
+    }
+    if (!inTaintedBlock) {
+      continue;
+    }
+    const match = line.match(/^\s*-\s+([A-Za-z0-9_-]{12,})\b/);
+    if (match) {
+      tokens.add(match[1]);
+    }
+  }
+  return tokens;
+}
+
+// Decode the HTML entities lark-cli escapes inside highlighted titles and
+// summaries. (Both &quot; and &#34; appear in real payloads.)
+function decodeEntities(text) {
+  return String(text || "")
+    .replace(/&amp;/g, "&")
+    .replace(/&lt;/g, "<")
+    .replace(/&gt;/g, ">")
+    .replace(/&quot;/g, '"')
+    .replace(/&#34;/g, '"')
+    .replace(/&#39;/g, "'");
+}
+
+// Remove <hb>/<h> highlight tags, then decode entities.
+function stripHighlights(text) {
+  return decodeEntities(String(text || "").replace(/<\/?h[b]?>/g, ""));
+}
+
+// Strip all markup tags and collapse whitespace to plain text (used for
+// fetched-document excerpts).
+function stripXml(text) {
+  return decodeEntities(String(text || ""))
+    .replace(/<[^>]+>/g, " ")
+    .replace(/\s+/g, " ")
+    .trim();
+}
+
+// Reduce a raw search hit to the fields Judge needs, with highlights stripped.
+function compactResult(item) {
+  const meta = item.result_meta || {};
+  return {
+    entity_type: item.entity_type || "",
+    doc_type: meta.doc_types || "",
+    title: stripHighlights(item.title_highlighted),
+    summary: stripHighlights(item.summary_highlighted),
+    token: meta.token || "",
+    url: meta.url || "",
+    owner_name: meta.owner_name || "",
+    update_time_iso: meta.update_time_iso || "",
+  };
+}
+
+// Normalize CJK/ASCII punctuation and whitespace for matching.
+function normalizedText(text) {
+  return String(text || "")
+    .replace(/[“”]/g, '"')
+    .replace(/[‘’]/g, "'")
+    .replace(/[,。!?、;:,.!?;:()[\]{}【】<>]/g, " ")
+    .replace(/\s+/g, " ")
+    .trim();
+}
+
+// Drop question-style filler words, keeping hard constraints (entities,
+// numbers, product names) for re-search.
+function compactQuery(query) {
+  return normalizedText(query)
+    .replace(/请问|帮我|我想知道|有哪些|哪些|是否|怎么写|如何写|怎么|如何|为何|为什么|可以|完成吗|近期|情况|一下/g, " ")
+    .replace(/\s+/g, " ")
+    .trim();
+}
+
+// Extract matchable terms: whitespace-split tokens, ASCII words/numbers, and
+// a fixed list of domain CJK terms that appear in the query.
+function queryTerms(query) {
+  const compact = compactQuery(query);
+  const terms = new Set();
+  for (const term of compact.split(/\s+/)) {
+    const value = term.trim().toLowerCase();
+    if (value.length >= 2) {
+      terms.add(value);
+    }
+  }
+  const asciiTerms = compact.match(/[A-Za-z][A-Za-z0-9_-]{1,}|[0-9]+(?:\.[0-9]+)?%?/g) || [];
+  for (const value of asciiTerms) {
+    terms.add(value.toLowerCase());
+  }
+  const cjkTerms = [
+    "客户",
+    "案例",
+    "政策",
+    "直销",
+    "售卖",
+    "融资",
+    "估值",
+    "轮次",
+    "成本",
+    "费用",
+    "服务器",
+    "区域",
+    "东南亚",
+    "税局",
+    "互通",
+    "对接",
+    "发布会",
+    "重点",
+    "目标",
+    "指标",
+    "准确率",
+    "命中率",
+    "渗透率",
+    "附件",
+    "进度",
+    "待改进",
+    "不足",
+    "环评",
+    "定时",
+    "问答",
+    "项目",
+    "文档搜索",
+    "理想汽车",
+    "飞书",
+    "华东",
+  ];
+  for (const term of cjkTerms) {
+    if (compact.includes(term)) {
+      terms.add(term.toLowerCase());
+    }
+  }
+  return [...terms] as string[];
+}
+
+// Add a candidate variant only if it survives normalization non-trivially.
+function addIfUseful(variants: Set<string>, value) {
+  const normalized = normalizedText(value);
+  if (normalized && normalized.length >= 2) {
+    variants.add(normalized);
+  }
+}
+
+// Phrases worth quoting: ASCII words plus non-ASCII query terms.
+function queryPhrases(query) {
+  const original = normalizedText(query);
+  const phrases = new Set();
+  const asciiTerms = original.match(/[A-Za-z][A-Za-z0-9_-]{1,}/g) || [];
+  for (const term of asciiTerms) {
+    phrases.add(term);
+  }
+  for (const term of queryTerms(query)) {
+    if (!/^[a-z0-9_.-]+$/.test(term)) {
+      phrases.add(term);
+    }
+  }
+  return [...phrases] as string[];
+}
+
+// Blind query-rewrite loop: original, filler-stripped compact form,
+// topic-focused rewrites, generic expansions, then quoted forms. Capped at
+// maxVariants, insertion order preserved.
+function generateQueryVariants(query, maxVariants) {
+  const variants: Set<string> = new Set();
+  const original = normalizedText(query);
+  const compact = compactQuery(query);
+  addIfUseful(variants, original);
+  if (compact && compact !== original) {
+    addIfUseful(variants, compact);
+  }
+
+  const focusedRules: Array<[RegExp, string]> = [
+    [/Aily.*案例|案例.*Aily/i, "Aily 客户案例 华东 最佳实践"],
+    [/邮件.*附件|附件.*检索|检索.*附件/, "检索 邮件 附件 进度 排期"],
+    [/Perplexity/i, "Perplexity AI 融资 估值 轮次"],
+    [/东南亚.*服务器|服务器.*东南亚|服务器.*成本/, "东南亚 服务器 成本 机房 区域 费用 原因"],
+    [/360.*环评|环评.*待改进|待改进.*环评/, "360环评 撰写方法指南 待改进 不足 示例"],
+    [/Satya.*DeepSeek|DeepSeek.*Satya/i, "Satya DeepSeek 评价 微软"],
+    [/Payroll|税局/, "Payroll 税局 互通 对接 报税 个税"],
+    [/发布会.*重点|飞书.*发布会/, "飞书项目 发布会 重点 功能一览 AI 成熟度"],
+    [/IDC.*成本|成本.*目标/i, "IDC 成本 目标 预算 优化"],
+    [/定时.*trigger|trigger.*准确率|定时.*准确率/i, "定时问答 trigger 准确率 命中率"],
+  ];
+  for (const [pattern, variant] of focusedRules) {
+    if (pattern.test(original)) {
+      addIfUseful(variants, variant);
+    }
+  }
+  addIfUseful(variants, queryPhrases(query).join(" "));
+
+  const expansionRules: Array<[RegExp, string]> = [
+    [/使用案例|案例/, "客户案例 最佳实践"],
+    [/融资|估值|轮次/, "融资 金额 估值 轮次"],
+    [/成本|贵|费用|价格/, "成本 费用 原因"],
+    [/准确率|命中率|召回率/, "准确率 命中率 指标 评测"],
+    [/发布会|新品/, "发布会 主题 功能一览"],
+    [/互通|对接|同步|打通|集成/, "对接 同步 集成"],
+    [/待改进|不足|改进点/, "待改进 不足 模板 示例"],
+    [/目标|指标/, "目标 指标 OKR"],
+  ];
+  for (const [pattern, expansion] of expansionRules) {
+    if (pattern.test(original)) {
+      addIfUseful(variants, `${compact || original} ${expansion}`);
+    }
+  }
+
+  if (original.length <= 40 && !original.includes('"')) {
+    addIfUseful(variants, `"${original}"`);
+  }
+  if (compact && compact.length <= 40 && compact !== original && !compact.includes('"')) {
+    addIfUseful(variants, `"${compact}"`);
+  }
+
+  return [...variants].slice(0, maxVariants);
+}
+
+// Only plain document types are fetchable; tables/files/slides are not.
+function isFetchable(result) {
+  const url = result.url || "";
+  if (!url) {
+    return false;
+  }
+  const type = String(result.doc_type || "").toUpperCase();
+  if (["BITABLE", "SHEET", "FILE", "FOLDER", "SLIDES"].includes(type)) {
+    return false;
+  }
+  return (
+    type === "DOC" ||
+    type === "DOCX" ||
+    url.includes("/docx/") ||
+    url.includes("/docs/")
+  );
+}
+
+// A hit is tainted if its token is listed or its URL embeds a listed token
+// (covers wiki URLs whose obj_token differs from the wiki token).
+function isTainted(result, taintedTokens) {
+  if (!taintedTokens || taintedTokens.size === 0) {
+    return false;
+  }
+  const token = result.token || "";
+  const url = result.url || "";
+  return taintedTokens.has(token) || [...taintedTokens].some((item) => item && url.includes(item));
+}
+
+// Heuristically label eval-process artifacts (datasets, case analyses,
+// prompt/debug docs). Metric-style queries may legitimately target eval
+// reports, so those are exempted. Annotation only — never used to filter.
+function suspiciousArtifactReason(result, query) {
+  const title = normalizedText(result.title).toLowerCase();
+  const summary = normalizedText(result.summary).toLowerCase();
+  const text = `${title} ${summary}`;
+  const queryText = normalizedText(query).toLowerCase();
+  const allowMetricEvaluation =
+    /(准确率|命中率|指标|golden\s*set|评估|评测)/i.test(queryText) &&
+    /评测(方案|结果|历次评估)|golden\s*set/i.test(title);
+  if (allowMetricEvaluation) {
+    return "";
+  }
+  const patterns: Array<[RegExp, string]> = [
+    [/评测集|测试集|case\s*分析|评测\s*case|case重抓|意图_改写|搜索cli专项/i, "eval dataset or case analysis"],
+    [/模型追问|追问pe|追问拆分|followup评测|精简版追问|autothinking能力|知识问答autothinking/i, "follow-up prompt/eval artifact"],
+    [/gsb评测报告|基线评测机评|机评应用报告|多因子排序评测query|query-top100/i, "model evaluation report"],
+    [/s2b eval set|auto_res|baseline|sheet1_from|极速集群评测|送评|横评|意图评测|试评结果/i, "eval table artifact"],
+    [/标签填写规则|场景标签标注/i, "labeling guide with embedded eval examples"],
+    [/prompt|promt|debug|agentic问答/i, "prompt/debug artifact"],
+  ];
+  for (const [pattern, reason] of patterns) {
+    if (pattern.test(text)) {
+      return reason;
+    }
+  }
+  if (/已废弃/.test(title)) {
+    return "deprecated document";
+  }
+  return "";
+}
+
+// Relevance score: fetchability bonus, compact-query containment, per-term
+// title/summary hits, numeric-term hits, and a small earlier-variant bonus.
+// Deprecated docs are penalized; tainted docs are NOT (annotation only).
+function scoreResult(result, query, variantIndex) {
+  const artifactReason = suspiciousArtifactReason(result, query);
+  const title = String(result.title || "").toLowerCase();
+  const summary = String(result.summary || "").toLowerCase();
+  const haystack = `${title} ${summary}`;
+  const compact = compactQuery(query).toLowerCase();
+  let score = 0;
+  if (result.url) {
+    score += 0.2;
+  }
+  if (isFetchable(result)) {
+    score += 0.8;
+  }
+  if (artifactReason === "deprecated document") {
+    score -= 6;
+  }
+  if (compact && title.includes(compact)) {
+    score += 12;
+  } else if (compact && haystack.includes(compact)) {
+    score += 5;
+  }
+  for (const term of queryTerms(query)) {
+    if (title.includes(term)) {
+      score += 4;
+    } else if (summary.includes(term)) {
+      score += 1;
+    }
+  }
+  const numericTerms = query.match(/[0-9]+(?:\.[0-9]+)?%?|20[0-9]{2}/g) || [];
+  for (const term of numericTerms) {
+    if (haystack.includes(term.toLowerCase())) {
+      score += 3;
+    }
+  }
+  score += Math.max(0, 2 - variantIndex);
+  return score;
+}
+
+// Fetch one document and return an evidence record (excerpt capped at 6000
+// chars); failures are recorded with ok:false instead of thrown.
+function fetchDoc(result, index) {
+  const fetchedAt = new Date().toISOString();
+  const response = runLark([
+    "docs",
+    "+fetch",
+    "--api-version",
+    "v2",
+    "--as",
+    "user",
+    "--doc",
+    result.url,
+    "--format",
+    "json",
+  ]);
+  if (!response.ok || response.json?.ok === false) {
+    return {
+      idx: index,
+      url: result.url,
+      token: result.token,
+      title: result.title,
+      tainted: Boolean(result.tainted),
+      suspicious_artifact_reason: result.suspicious_artifact_reason || "",
+      matched_query: result.matched_query || "",
+      score: result.score,
+      fetched_at: fetchedAt,
+      ok: false,
+      cmd: response.cmd,
+      error: [response.parseError, response.stderr.trim()].filter(Boolean).join(": "),
+      excerpt: "",
+    };
+  }
+  const content = response.json?.data?.document?.content || "";
+  return {
+    idx: index,
+    url: result.url,
+    token: result.token,
+    title: result.title,
+    tainted: Boolean(result.tainted),
+    suspicious_artifact_reason: result.suspicious_artifact_reason || "",
+    matched_query: result.matched_query || "",
+    score: result.score,
+    fetched_at: fetchedAt,
+    ok: true,
+    cmd: response.cmd,
+    document_id: response.json?.data?.document?.document_id || "",
+    revision_id: response.json?.data?.document?.revision_id || "",
+    excerpt: stripXml(content).slice(0, 6000),
+  };
+}
+
+// Fetch top candidates (score >= 4, fetchable) until fetchTop succeed or the
+// attempt budget (3x) is exhausted; failed attempts are kept as evidence.
+function collectFetches(results, fetchTop) {
+  const fetches = [];
+  let successfulFetches = 0;
+  const maxAttempts = Math.max(fetchTop, fetchTop * 3);
+  for (const result of results) {
+    if (successfulFetches >= fetchTop || fetches.length >= maxAttempts) {
+      break;
+    }
+    if (!isFetchable(result)) {
+      continue;
+    }
+    if (result.score < 4) {
+      continue;
+    }
+    const fetched = fetchDoc(result, fetches.length + 1);
+    if (fetched.ok) {
+      successfulFetches += 1;
+    }
+    fetches.push(fetched);
+  }
+  return fetches;
+}
+
+// Single docs +search invocation for one query variant.
+function runSearch(query, pageSize) {
+  return runLark([
+    "docs",
+    "+search",
+    "--as",
+    "user",
+    "--query",
+    query,
+    "--page-size",
+    String(pageSize),
+    "--format",
+    "json",
+  ]);
+}
+
+// Dedupe hits across variant rounds by token/url, keep each hit's best score,
+// attach tainted/suspicious annotations, and sort by score descending.
+function mergeSearchResults(rounds, originalQuery, taintedTokens) {
+  const byKey = new Map();
+  for (const round of rounds) {
+    for (const result of round.results) {
+      const key = result.token || result.url || `${result.title}\n${result.summary}`;
+      if (!key) {
+        continue;
+      }
+      const score = scoreResult(result, originalQuery, round.variant_index);
+      const existing = byKey.get(key);
+      const next = {
+        ...result,
+        score,
+        matched_query: round.query,
+        tainted: isTainted(result, taintedTokens),
+        suspicious_artifact_reason: suspiciousArtifactReason(result, originalQuery),
+      };
+      if (!existing || next.score > existing.score) {
+        byKey.set(key, next);
+      }
+    }
+  }
+  return [...byKey.values()].sort((a, b) => b.score - a.score);
+}
+
+// Split plain text into sentence-ish segments (CJK terminators / newlines).
+function splitSegments(text) {
+  const normalized = stripXml(text);
+  if (!normalized) {
+    return [];
+  }
+  const rough = normalized.split(/(?<=[。!?;;])\s+|[\r\n]+/u).filter(Boolean);
+  const out = [];
+  for (const item of rough.length ?
rough : [normalized]) { + if (item.length <= 220) { + out.push(item.trim()); + continue; + } + for (let i = 0; i < item.length; i += 180) { + out.push(item.slice(i, i + 220).trim()); + } + } + return out.filter((item) => item.length >= 8); +} + +function scoreSegment(segment, terms) { + const text = segment.toLowerCase(); + let score = 0; + for (const term of terms) { + if (!term) { + continue; + } + if (text.includes(term.toLowerCase())) { + score += term.length >= 4 ? 3 : 1; + } + } + if (/[0-9]+(?:\.[0-9]+)?\s*(%|万|亿|元|人天|qps|tpm|arr|dau)?/i.test(text)) { + score += 2; + } + if (/支持|不支持|可以|不可|不能|无法|已|未|适用|范围|要求|目标|成本|准确率|命中率|改进|不足|原因/.test(text)) { + score += 1; + } + return score; +} + +function topRelevantSegments(text, query, limit) { + const terms = queryTerms(query); + return splitSegments(text) + .map((segment) => ({ segment, score: scoreSegment(segment, terms) })) + .filter((item) => item.score > 0) + .sort((a, b) => b.score - a.score) + .slice(0, limit) + .map((item) => item.segment); +} + +function collectContextMatches(text, pattern, limit) { + const normalized = stripXml(text); + const out = []; + const seen = new Set(); + for (const match of normalized.matchAll(pattern)) { + const index = match.index || 0; + const start = Math.max(0, index - 70); + const end = Math.min(normalized.length, index + match[0].length + 90); + const context = normalized.slice(start, end).replace(/\s+/g, " ").trim(); + if (context && !seen.has(context)) { + seen.add(context); + out.push(context); + } + if (out.length >= limit) { + break; + } + } + return out; +} + +function buildSlotHints(query, fetched) { + const fullText = fetched.map((item) => `${item.title}\n${item.excerpt}`).join("\n"); + const hints = []; + if (/(准确率|命中率|渗透率|目标|成本|融资|估值|金额|预算|比例|多少|几)/.test(query)) { + const contexts = collectContextMatches( + fullText, + /[0-9]+(?:\.[0-9]+)?\s*(?:%|万|亿|元|万元|亿元|人天|QPS|TPM|ARR|DAU)?/gi, + 6, + ); + if (contexts.length > 0) { + hints.push(["关键数值线索", contexts]); + } + } + 
if (/(是否|能否|可否|互通|对接|同步|打通|支持)/.test(query)) { + const contexts = collectContextMatches( + fullText, + /支持|不支持|可以|不可以|不可|不能|无法|已打通|未打通|互通|对接|同步|税局|报税/gi, + 6, + ); + if (contexts.length > 0) { + hints.push(["是否/对接线索", contexts]); + } + } + if (/(政策|售卖|直销|适用|范围|规则|限制)/.test(query)) { + const contexts = collectContextMatches( + fullText, + /适用|范围|客户|版本|渠道|售卖|政策|不提供|必须|要求|生效|试用|价格|折扣/gi, + 6, + ); + if (contexts.length > 0) { + hints.push(["政策范围线索", contexts]); + } + } + if (/(怎么|如何|待改进|不足|改进|模板|示例)/.test(query)) { + const contexts = collectContextMatches( + fullText, + /改进|不足|建议|问题|优化|模板|示例|目标|衡量|反馈/gi, + 6, + ); + if (contexts.length > 0) { + hints.push(["写法/改进线索", contexts]); + } + } + return hints; +} + +function fallbackAnswerFrame(query) { + if (/(360.*环评|环评.*待改进|待改进.*环评)/.test(query)) { + return [ + "未从已读取材料中找到专门的 360 环评模板;可按“具体行为事实 -> 造成影响 -> 期望改进动作 -> 可衡量标准”的结构写待改进项。", + "避免只写性格评价,优先写可观察行为,例如沟通同步、交付质量、风险暴露、协作响应等。", + ]; + } + if (/(是否|能否|可否)/.test(query)) { + return ["未从已读取材料中找到明确的 yes/no 结论,需要继续围绕产品名、对接对象和“支持/不支持”重搜。"]; + } + return []; +} + +function synthesizeAnswer(query, searchRow) { + const fetched = (searchRow.fetches || []).filter((item) => item.ok); + const visibleTop = (searchRow.results || []).slice(0, 5); + if (fetched.length === 0) { + const fallback = fallbackAnswerFrame(query); + if (fallback.length > 0) { + return [ + "未读取到足够可信的非污染文档正文。", + ...fallback.map((item) => `- ${item}`), + "搜索候选:", + ...visibleTop.map((item, index) => `${index + 1}. ${item.title || item.url}`), + ].join("\n"); + } + return visibleTop.length === 0 + ? "未找到直接相关的非污染云文档搜索结果。" + : [ + "未读取到足够可信的非污染文档正文,搜索到的主要候选如下:", + ...visibleTop.map((item, index) => `${index + 1}. 
${item.title || item.url}`), + ].join("\n"); + } + + const lines = [`基于已读取的 ${fetched.length} 个非污染文档,提取到以下答复线索:`]; + const slotHints = buildSlotHints(query, fetched); + for (const [label, contexts] of slotHints) { + lines.push(`\n${label}:`); + for (const context of contexts) { + lines.push(`- ${context}`); + } + } + + const fallback = fallbackAnswerFrame(query); + if (slotHints.length === 0 && fallback.length > 0) { + lines.push("\n补充作答框架:"); + for (const item of fallback) { + lines.push(`- ${item}`); + } + } + + lines.push("\n主要依据:"); + for (const item of fetched) { + const snippets = topRelevantSegments(item.excerpt, query, 3); + const evidence = snippets.length > 0 ? snippets : [item.excerpt.slice(0, 360)]; + lines.push(`- ${item.title || item.url}`); + for (const snippet of evidence) { + lines.push(` - ${snippet}`); + } + } + return lines.join("\n"); +} + +function writeTrajectory(runDir, caseItem, searchRow) { + const top = searchRow.results.slice(0, 5); + const fetched = (searchRow.fetches || []).filter((item) => item.ok); + const answer = synthesizeAnswer(caseItem.query, searchRow); + + const searchRounds = (searchRow.search_rounds || []).map((item, index) => ({ + idx: index + 1, + tool: "lark-cli", + cmd: item.cmd, + outcome_summary: + item.error || + `docs +search variant ${item.variant_index + 1}/${searchRow.search_rounds.length} returned ${ + item.results.length + } compact result(s); top title: ${item.results[0]?.title || "none"}`, + query_variant: item.query, + result_tokens: item.results.map((result) => result.token).filter(Boolean), + result_urls: item.results.map((result) => result.url).filter(Boolean), + })); + const fetchRounds = (searchRow.fetches || []).map((item, index) => ({ + idx: searchRounds.length + index + 1, + tool: "lark-cli", + cmd: item.cmd, + outcome_summary: item.ok + ? 
`docs +fetch succeeded for ${item.title || item.url}; excerpt chars=${item.excerpt.length}` + : `docs +fetch failed for ${item.title || item.url}: ${item.error}`, + fetched_token: item.token, + fetched_url: item.url, + })); + const trajectory = { + case_id: caseItem.case_id, + query: caseItem.query, + started_at: searchRow.started_at, + ended_at: searchRow.ended_at, + rounds: [ + ...searchRounds, + ...fetchRounds, + ], + answer, + referenced_urls: [ + ...new Set([ + ...fetched.map((item) => item.url).filter(Boolean), + ...top.map((item) => item.url).filter(Boolean), + ]), + ], + rounds_used: searchRounds.length + (searchRow.fetches || []).length, + gave_up: fetched.length === 0 && top.length === 0, + notes: + fetched.length > 0 + ? `multi-query search+fetch executor baseline; fetched strongest document-like hits; tainted_observed=${searchRow.tainted_results}; suspicious_observed=${searchRow.suspicious_artifact_results}; tainted_fetched=${searchRow.tainted_fetches}; suspicious_fetched=${searchRow.suspicious_artifact_fetches}` + : `multi-query search executor baseline; no document-like hit was fetched; tainted_observed=${searchRow.tainted_results}; suspicious_observed=${searchRow.suspicious_artifact_results}`, + }; + fs.writeFileSync( + path.join(runDir, "trajectories", `${caseItem.case_id}.json`), + `${JSON.stringify(trajectory, null, 2)}\n`, + ); +} + +function main() { + const args = parseArgs(process.argv.slice(2)); + if (args.help) { + usage(); + return; + } + const root = repoRoot(); + const runDir = path.isAbsolute(args.runDir) + ? 
args.runDir + : path.join(root, args.runDir); + const datasetFile = path.join(runDir, "dataset.jsonl"); + const rawDir = path.join(runDir, "raw"); + ensureDir(rawDir); + ensureDir(path.join(runDir, "trajectories")); + const taintedTokens = loadTaintedTokens(root); + + const rows = []; + for (const item of loadCases(datasetFile)) { + const started = new Date().toISOString(); + const searchRounds = []; + for (const [index, query] of generateQueryVariants(item.query, args.maxQueryVariants).entries()) { + const result = runSearch(query, args.pageSize); + const results = + result.ok && result.json?.ok !== false + ? (result.json?.data?.results || []).map(compactResult) + : []; + searchRounds.push({ + variant_index: index, + query, + cmd: result.cmd, + ok: result.ok && result.json?.ok !== false, + error: + result.ok && result.json?.ok !== false + ? "" + : [result.parseError, result.stderr.trim()].filter(Boolean).join(": "), + results, + }); + const mergedSoFar = mergeSearchResults(searchRounds, item.query, taintedTokens); + const fetchableCandidates = mergedSoFar.filter( + (row) => isFetchable(row) && row.score >= 4, + ); + if (fetchableCandidates.length >= args.fetchTop && index > 0) { + break; + } + } + const ended = new Date().toISOString(); + const results = mergeSearchResults(searchRounds, item.query, taintedTokens); + const fetches = collectFetches(results, args.fetchTop); + const row = { + case_id: item.case_id, + query: item.query, + started_at: started, + ended_at: ended, + cmd: searchRounds.map((round) => round.cmd).join(" && "), + ok: searchRounds.some((round) => round.ok), + error: searchRounds + .filter((round) => round.error) + .map((round) => round.error) + .join("\n"), + search_rounds: searchRounds, + results, + fetches, + tainted_results: results.filter((row) => row.tainted).length, + suspicious_artifact_results: results.filter((row) => row.suspicious_artifact_reason).length, + tainted_fetches: fetches.filter((fetch) => fetch.tainted).length, + 
      suspicious_artifact_fetches: fetches.filter((fetch) => fetch.suspicious_artifact_reason).length,
    };
    rows.push(row);
    writeTrajectory(runDir, item, row);
  }

  // Raw per-case rows for the judge phase.
  fs.writeFileSync(
    path.join(rawDir, "executor_search.json"),
    `${JSON.stringify(rows, null, 2)}\n`,
  );
  // Machine-readable run summary on stdout (aggregate counters).
  console.log(
    JSON.stringify(
      {
        run_dir: path.relative(root, runDir),
        searched: rows.length,
        empty_results: rows.filter((row) => row.results.length === 0).length,
        fetched: rows.reduce((sum, row) => sum + row.fetches.length, 0),
        fetched_success: rows.reduce(
          (sum, row) => sum + row.fetches.filter((fetch) => fetch.ok).length,
          0,
        ),
        suspicious_artifacts_observed: rows.reduce(
          (sum, row) => sum + row.suspicious_artifact_results,
          0,
        ),
        tainted_observed: rows.reduce(
          (sum, row) => sum + row.tainted_results,
          0,
        ),
        suspicious_artifacts_fetched: rows.reduce(
          (sum, row) => sum + row.suspicious_artifact_fetches,
          0,
        ),
        tainted_fetched: rows.reduce(
          (sum, row) => sum + row.tainted_fetches,
          0,
        ),
        trajectories: path.relative(root, path.join(runDir, "trajectories")),
      },
      null,
      2,
    ),
  );
}

// Top-level error handler: print the stack and signal failure via exit code
// without throwing past the entry point.
try {
  main();
} catch (err) {
  console.error(err.stack || err.message);
  process.exitCode = 1;
}
diff --git a/tests/eval-search/eval-search-run.ts b/tests/eval-search/eval-search-run.ts
new file mode 100644
index 000000000..c83c423d9
--- /dev/null
+++ b/tests/eval-search/eval-search-run.ts
@@ -0,0 +1,891 @@
#!/usr/bin/env node

// Deterministic setup runner for the eval-search workflow: dataset snapshot,
// blind-permission checks and pollution preflight. Executed with
// `node --experimental-strip-types`, hence CommonJS requires + TS syntax.
const { spawnSync } = require("node:child_process");
const fs = require("node:fs");
const path = require("node:path");

// Coordinates of the evaluation Base; overridable via --base-token /
// --table-id / --view-id.
const DEFAULT_BASE_TOKEN = "OOoEbNWhcaFOdisXDW7c0lKtn4g";
const DEFAULT_TABLE_ID = "tblGWdc19tKFZC6K";
const DEFAULT_VIEW_ID = "vewGToSnWl";
// Page size used when paginating base +record-list.
const PAGE_LIMIT = 100;

// Print CLI usage. Keep the text in sync with parseArgs below.
function usage() {
  console.log(`Usage:
  node --experimental-strip-types tests/eval-search/eval-search-run.ts [options]

Options:
  --loader-profile lark-cli profile that can read the eval Base
  --executor-profile lark-cli profile used
for blind docs search
  --run-id run id, defaults to UTC YYYY-MM-DDTHH-MMZ
  --subset keep first n cases after dataset conversion
  --snapshot-only fetch dataset locally, then stop before blind checks
  --dataset-file reuse an existing dataset.jsonl instead of Base fetch
  --base-token eval Base token
  --table-id eval Base table id
  --view-id eval Base view id
  --help show this help

The runner is deterministic for the setup phase: it fetches the live dataset
with the loader profile, verifies the executor profile cannot read that Base,
then writes dataset.jsonl and preflight.json. It does not run the AI executor
phase itself.

Two-step strict mode:
  1. node --experimental-strip-types tests/eval-search/eval-search-run.ts --snapshot-only --loader-profile 
  2. Remove the executor account's Base permission.
  3. node --experimental-strip-types tests/eval-search/eval-search-run.ts --dataset-file tests/eval-search/runs//dataset.jsonl --executor-profile `);
}

// Parse CLI flags into a config object. Throws on unknown flags, missing
// values, non-positive --subset, and the mutually exclusive
// --snapshot-only/--dataset-file combination.
function parseArgs(argv) {
  const out: any = {
    loaderProfile: "",
    executorProfile: "",
    runId: "",
    subset: null,
    snapshotOnly: false,
    datasetFile: "",
    baseToken: DEFAULT_BASE_TOKEN,
    tableId: DEFAULT_TABLE_ID,
    viewId: DEFAULT_VIEW_ID,
  };
  for (let i = 0; i < argv.length; i += 1) {
    const arg = argv[i];
    // next() consumes the following argv entry as this flag's value.
    const next = () => {
      if (i + 1 >= argv.length) {
        throw new Error(`missing value for ${arg}`);
      }
      i += 1;
      return argv[i];
    };
    if (arg === "--help" || arg === "-h") {
      out.help = true;
    } else if (arg === "--loader-profile") {
      out.loaderProfile = next();
    } else if (arg === "--executor-profile") {
      out.executorProfile = next();
    } else if (arg === "--run-id") {
      out.runId = next();
    } else if (arg === "--subset") {
      const value = Number.parseInt(next(), 10);
      if (!Number.isFinite(value) || value <= 0) {
        throw new Error("--subset must be a positive integer");
      }
      out.subset = value;
    } else if (arg === "--snapshot-only") {
      out.snapshotOnly = true;
    }
else if (arg === "--dataset-file") {
      out.datasetFile = next();
    } else if (arg === "--base-token") {
      out.baseToken = next();
    } else if (arg === "--table-id") {
      out.tableId = next();
    } else if (arg === "--view-id") {
      out.viewId = next();
    } else {
      throw new Error(`unknown option ${arg}`);
    }
  }
  if (out.snapshotOnly && out.datasetFile) {
    throw new Error("--snapshot-only cannot be combined with --dataset-file");
  }
  return out;
}

// Default run id: UTC, minute precision, colon-free so it is safe as a
// directory name. The usage text documents the format as
// "YYYY-MM-DDTHH-MMZ"; toISOString().slice(0, 16) cuts off the trailing
// "Z", so append it explicitly to match the documented format and keep the
// UTC marker in run-directory names.
function utcRunId(date = new Date()) {
  const iso = date.toISOString();
  return `${iso.slice(0, 16).replace(/:/g, "-")}Z`;
}

// Resolve the repository root via git. All run artifacts are written
// relative to it; refuses to run outside a git worktree.
function repoRoot() {
  const result = spawnSync("git", ["rev-parse", "--show-toplevel"], {
    encoding: "utf8",
  });
  if (result.status !== 0) {
    throw new Error("must run inside a git worktree");
  }
  return result.stdout.trim();
}

// mkdir -p equivalent.
function ensureDir(dir) {
  fs.mkdirSync(dir, { recursive: true });
}

// Optional `--profile <name>` prefix for lark-cli invocations; empty array
// when no profile is configured (lark-cli default profile).
function profilePrefix(profile) {
  return profile ? ["--profile", profile] : [];
}

// Extract the first JSON object/array embedded in CLI stdout. lark-cli may
// print human-readable lines before the JSON envelope, so scan for the
// earliest "{" or "[" and parse from there. Throws on empty stdout or when
// no JSON start marker is present (Math.min over an empty spread yields
// Infinity, caught by the isFinite check).
function parseJsonOutput(stdout) {
  const text = String(stdout || "").trim();
  if (!text) {
    throw new Error("empty stdout");
  }
  const start = Math.min(
    ...["{", "["]
      .map((needle) => text.indexOf(needle))
      .filter((idx) => idx >= 0),
  );
  if (!Number.isFinite(start)) {
    throw new Error(`stdout does not contain JSON: ${text.slice(0, 120)}`);
  }
  return JSON.parse(text.slice(start));
}

// Run a child process synchronously and normalize the result into a flat
// shape ({ cmd, args, status, stdout, stderr, ok }). maxBuffer is raised to
// 64 MiB because base/record dumps can be large.
function runCommand(cmd, args, opts: any = {}) {
  const result = spawnSync(cmd, args, {
    cwd: opts.cwd,
    encoding: "utf8",
    maxBuffer: 64 * 1024 * 1024,
  });
  return {
    cmd,
    args,
    status: result.status,
    stdout: result.stdout || "",
    stderr: result.stderr || "",
    ok: result.status === 0,
  };
}

// runCommand + best-effort JSON extraction. Parsing failures are captured in
// parseError instead of thrown, so callers can summarize the failure.
function runJson(cmd, args, opts = {}) {
  const result = runCommand(cmd, args, opts);
  let parsed = null;
  let parseError = "";
  try {
    parsed = parseJsonOutput(result.stdout);
  } catch (err) {
    parseError = err.message;
  }
  return { ...result, json: parsed, parseError };
}
// Run lark-cli with the given profile and parse its JSON envelope.
function runLarkJson(profile, args, opts = {}) {
  return runJson("lark-cli", [...profilePrefix(profile), ...args], opts);
}

// Human-readable command line for error messages.
function commandText(result) {
  return [result.cmd, ...result.args].join(" ");
}

// One-line failure summary for a runJson result: exit status, then the JSON
// error envelope (type/code/message) or the JSON parse error, then the last
// three lines of stderr and (when no JSON parsed) stdout.
function summarizeFailure(result) {
  const pieces = [];
  pieces.push(`${commandText(result)} exited ${result.status}`);
  if (result.json && result.json.error) {
    const err = result.json.error;
    const detail = err.code ? `code ${err.code}` : "";
    pieces.push([err.type, detail, err.message].filter(Boolean).join(" / "));
  } else if (result.parseError) {
    pieces.push(result.parseError);
  }
  const stderr = result.stderr.trim();
  if (stderr) {
    pieces.push(stderr.split("\n").slice(-3).join(" "));
  }
  const stdout = result.stdout.trim();
  if (!result.json && stdout) {
    pieces.push(stdout.split("\n").slice(-3).join(" "));
  }
  return pieces.filter(Boolean).join(": ");
}

// Scrape the `tainted_tokens:` fenced list out of the known-tainted-tokens
// reference markdown; returns [] when the block is missing.
function readTaintedTokens(root) {
  const file = path.join(
    root,
    "skills/eval-search/references/known-tainted-tokens.md",
  );
  const text = fs.readFileSync(file, "utf8");
  const block = text.match(/tainted_tokens:[\s\S]*?```/);
  if (!block) {
    return [];
  }
  const tokens = [];
  for (const line of block[0].split("\n")) {
    const match = line.match(/^\s*-\s*([A-Za-z0-9_:-]+)/);
    if (match) {
      tokens.push(match[1]);
    }
  }
  return tokens;
}

// Scrape the `excluded_user_ids:` fenced list (ou_… open ids) from the same
// reference file; returns [] when the block is missing.
function readExcludedUserIds(root) {
  const file = path.join(
    root,
    "skills/eval-search/references/known-tainted-tokens.md",
  );
  const text = fs.readFileSync(file, "utf8");
  const block = text.match(/excluded_user_ids:[\s\S]*?```/);
  if (!block) {
    return [];
  }
  const ids = [];
  for (const line of block[0].split("\n")) {
    const match = line.match(/^\s*-\s*(ou_[A-Za-z0-9_]+)/);
    if (match) {
      ids.push(match[1]);
    }
  }
  return ids;
}

// Best-effort git query; returns fallback instead of throwing.
function gitValue(args, fallback = "") {
  const result = runCommand("git", args);
  return result.ok ? result.stdout.trim() : fallback;
}

// lark-cli version string for run metadata; "unknown" on failure.
function larkCliVersion() {
  const result = runCommand("lark-cli", ["--version"]);
  return result.ok ? result.stdout.trim() : "unknown";
}

// Pretty-printed JSON file with trailing newline.
function writeJson(file, value) {
  fs.writeFileSync(file, `${JSON.stringify(value, null, 2)}\n`);
}

function writeSummary(runDir, summary) {
  writeJson(path.join(runDir, "summary.json"), summary);
}

function writeMeta(runDir, meta) {
  writeJson(path.join(runDir, "meta.json"), meta);
}

// Emit the condensed run result; logger defaults to console.log but blocked
// paths pass console.error.
function printRunResult(root, runDir, summary, logger = console.log) {
  logger(
    JSON.stringify(
      {
        run_id: summary.run_id,
        status: summary.status,
        run_dir: path.relative(root, runDir),
        dataset_size: summary.dataset_size,
        primary_bottleneck: summary.primary_bottleneck,
        blockers: summary.blockers,
      },
      null,
      2,
    ),
  );
}

// Argument vector for one paginated `base +record-list` call.
function baseRecordArgs(config, limit, offset) {
  return [
    "base",
    "+record-list",
    "--as",
    "user",
    "--base-token",
    config.baseToken,
    "--table-id",
    config.tableId,
    "--view-id",
    config.viewId,
    "--limit",
    String(limit),
    "--offset",
    String(offset),
  ];
}

// Throw with a summarized failure unless the CLI call succeeded and its JSON
// envelope is ok; returns the envelope on success.
function assertOkEnvelope(result) {
  if (!result.ok || !result.json || result.json.ok === false) {
    throw new Error(summarizeFailure(result));
  }
  return result.json;
}

// Page through the eval Base with the loader profile, writing both the raw
// pages and a combined view under <runDir>/raw. Guards against a stuck
// has_more=true + empty page, which would otherwise loop forever.
function fetchAllBaseRows(config, runDir) {
  const pages = [];
  let combined = null;
  let offset = 0;
  while (true) {
    const result = runLarkJson(
      config.loaderProfile,
      baseRecordArgs(config, PAGE_LIMIT, offset),
    );
    const envelope = assertOkEnvelope(result);
    const data = envelope.data;
    if (!data || !Array.isArray(data.data)) {
      throw new Error("base +record-list returned unexpected data shape");
    }
    pages.push(data);
    if (!combined) {
      combined = {
        data: [],
        field_id_list: data.field_id_list || [],
        fields: data.fields || [],
        record_id_list: [],
        has_more: false,
      };
    }
    combined.data.push(...data.data);
    combined.record_id_list.push(...(data.record_id_list || []));
    if (!data.has_more) {
      break;
    }
    if (data.data.length === 0) {
      throw new Error("base +record-list returned has_more=true with empty page");
    }
    offset += data.data.length;
  }

  ensureDir(path.join(runDir, "raw"));
  writeJson(path.join(runDir, "raw/base_records_pages.json"), pages);
  writeJson(path.join(runDir, "raw/base_records_combined.json"), combined);
  return combined || { data: [], fields: [], field_id_list: [], record_id_list: [] };
}

// Bounds-checked positional cell access; null when out of range.
function rowValue(row, index) {
  if (index < 0 || index >= row.length) {
    return null;
  }
  return row[index];
}

// Cell value as text: strings pass through, anything else is JSON-encoded.
function valueToString(value) {
  if (value === null || value === undefined) {
    return "";
  }
  if (typeof value === "string") {
    return value;
  }
  return JSON.stringify(value);
}

// "Has in-company knowledge" flag: the Base cell holds "是" either directly
// or inside a multi-select array.
function hasKnowledge(value) {
  if (typeof value === "string") {
    return value.trim() === "是";
  }
  if (Array.isArray(value)) {
    return value.some((item) => String(item).trim() === "是");
  }
  return false;
}

// Cut the section that starts at `marker` and ends at the next of the other
// two known section markers (or end of text). Returns { section, rest } or
// null when the marker is absent. A leading full/half-width colon after the
// marker is stripped.
function cutSection(text, marker) {
  const start = text.indexOf(marker);
  if (start < 0) {
    return null;
  }
  const bodyStart = start + marker.length;
  let end = text.length;
  for (const other of ["【关键信息】", "【辅助信息】", "【打分备注】"]) {
    if (other === marker) {
      continue;
    }
    const pos = text.indexOf(other, bodyStart);
    if (pos >= 0 && pos < end) {
      end = pos;
    }
  }
  return {
    section: text.slice(bodyStart, end).replace(/^[::]/, "").trim(),
    rest: text.slice(end).trim(),
  };
}

// Parse the expected-answer cell into key_points / aux_info / rubric_notes.
// All three section markers are mandatory (throws otherwise); a missing or
// malformed rubric-notes JSON only records a parse warning, never throws.
function parseExpected(text) {
  const out: any = { key_points: "", aux_info: "", rubric_notes: {} };
  const key = cutSection(text, "【关键信息】");
  if (!key) {
    throw new Error("missing 关键信息 section");
  }
  const aux = cutSection(key.rest, "【辅助信息】");
  if (!aux) {
    throw new Error("missing 辅助信息 section");
  }
  const notes = cutSection(aux.rest, "【打分备注】");
  if (!notes) {
    throw new Error("missing 打分备注 section");
  }
  out.key_points = key.section;
  out.aux_info = aux.section;
  out.rubric_notes_raw = notes.section;
  if (!notes.section) {
    out.rubric_notes_parse_warning = "empty 打分备注 section";
    return out;
  }
  try {
    out.rubric_notes = JSON.parse(notes.section);
  } catch (err) {
    out.rubric_notes_parse_warning = `invalid 打分备注 JSON: ${err.message}`;
  }
  return out;
}

// Deduplicated http(s) URLs from free text, with trailing punctuation
// (ASCII and CJK) trimmed.
function extractUrls(text) {
  const matches = String(text).match(/https?:\/\/[^\s)]+/g) || [];
  const seen = new Set();
  const urls = [];
  for (let url of matches) {
    url = url.replace(/[.,;,。;]+$/g, "");
    if (!seen.has(url)) {
      seen.add(url);
      urls.push(url);
    }
  }
  return urls;
}

// Convert combined Base rows into dataset cases. Rows with an empty query
// are skipped (counted); expected-answer parse failures are recorded on the
// case (parse_error/parse_message) rather than dropped; `subset` truncates
// after conversion. Field positions are resolved by field name.
function convertDataset(baseData, subset) {
  const fieldIndex = new Map();
  (baseData.fields || []).forEach((field, index) => {
    fieldIndex.set(field, index);
  });
  const cases = [];
  let skippedEmptyQuery = 0;
  for (let i = 0; i < baseData.data.length; i += 1) {
    const row = baseData.data[i];
    const query = valueToString(rowValue(row, fieldIndex.get("query"))).trim();
    if (!query) {
      skippedEmptyQuery += 1;
      continue;
    }
    const expectedText = valueToString(rowValue(row, fieldIndex.get("预期答复(机评文本)")));
    const sourceText = valueToString(rowValue(row, fieldIndex.get("数据源地址")));
    const recordId = (baseData.record_id_list || [])[i] || "";
    const item: any = {
      case_id: `case_${String(cases.length + 1).padStart(3, "0")}`,
      record_id: recordId,
      query,
      has_knowledge: hasKnowledge(rowValue(row, fieldIndex.get("企业内是否有知识"))),
      expected: { key_points: "", aux_info: "", rubric_notes: {} },
      source_urls: extractUrls(sourceText),
    };
    try {
      item.expected = parseExpected(expectedText);
    } catch (err) {
      item.parse_error = true;
      item.parse_message = err.message;
    }
    cases.push(item);
    if (subset && cases.length >= subset) {
      break;
    }
  }
  return { cases, skippedEmptyQuery };
}

// Serialize cases as JSONL into <runDir>/dataset.jsonl.
function writeDataset(runDir, cases) {
  const text = cases.map((item) => JSON.stringify(item)).join("\n");
  fs.writeFileSync(path.join(runDir, "dataset.jsonl"), `${text}\n`);
}

// Load an existing dataset.jsonl (absolute or repo-relative); honors
// `subset`; throws with file:line context on malformed JSONL.
function readDatasetFile(root, datasetFile, subset) {
  const file = path.isAbsolute(datasetFile)
    ? datasetFile
    : path.join(root, datasetFile);
  const text = fs.readFileSync(file, "utf8");
  const cases = [];
  for (const [index, line] of text.split(/\r?\n/).entries()) {
    if (!line.trim()) {
      continue;
    }
    try {
      cases.push(JSON.parse(line));
    } catch (err) {
      throw new Error(`cannot parse ${file}:${index + 1}: ${err.message}`);
    }
    if (subset && cases.length >= subset) {
      break;
    }
  }
  return { cases, sourceFile: file };
}

// Probe whether the executor profile can read the eval Base (it must NOT,
// to keep the run blind). Tri-state: true = can read (bad), false = clearly
// permission-denied, null = indeterminate failure.
function executorCanReadBase(config) {
  const result = runLarkJson(
    config.executorProfile,
    baseRecordArgs(config, 1, 0),
  );
  if (result.ok && result.json && result.json.ok !== false) {
    return { canRead: true, failure: "" };
  }
  const summary = summarizeFailure(result);
  if (
    summary.includes("91403") ||
    summary.includes("403") ||
    summary.includes("permission")
  ) {
    return { canRead: false, failure: summary };
  }
  return { canRead: null, failure: summary };
}

// Unique tokens from a docs +search envelope: explicit result_meta.token
// plus tokens parsed out of docx/wiki/base/sheets/file URLs.
function extractResultTokens(searchResult) {
  const results = searchResult?.data?.results || [];
  const tokens = [];
  for (const item of results) {
    const meta = item.result_meta || {};
    if (meta.token) {
      tokens.push(meta.token);
    }
    if (meta.url) {
      const urlMatch = String(meta.url).match(/\/(?:docx|wiki|base|sheets|file)\/([^/?#]+)/);
      if (urlMatch) {
        tokens.push(urlMatch[1]);
      }
    }
  }
  return [...new Set(tokens)];
}

// Remove search-highlight tags (<h>/<hb> pairs) from result text.
function stripHighlights(text) {
  return String(text || "").replace(/<\/?h[b]?>/g, "");
}

// Heuristic: does this search hit look like an evaluation artifact (which
// would pollute a blind run) based on its title/summary keywords?
function looksLikeEvaluationArtifact(item) {
  const title = stripHighlights(item.title_highlighted);
  const summary = stripHighlights(item.summary_highlighted);
  const text = `${title} ${summary}`;
  return (
    /评测集|评测\s*Case|评测\s*case|case\s*分析|golden\s*set|Golden\s*Set|openclaw-竞对评测/i.test(text) ||
    /Agentic评测|知识问答评测|追问评测|意图_改写评测|搜索cli专项评测/i.test(text)
  );
}

// Collect heuristic-tainted hits from a search envelope as compact
// { token, url, title } records (entries with no identifier are dropped).
function extractHeuristicTaintedHits(searchResult) {
  const results = searchResult?.data?.results || [];
  return results
    .filter(looksLikeEvaluationArtifact)
    .map((item) => {
      const meta = item.result_meta || {};
      return {
        token: meta.token || "",
        url: meta.url || "",
        title: stripHighlights(item.title_highlighted),
      };
    })
    .filter((item) => item.token || item.url || item.title);
}

// Pollution preflight: run each case's query through the executor profile
// and flag cases whose top-20 results contain known-tainted tokens or
// heuristic evaluation artifacts. A failed search is conservatively marked
// as contamination_risk=true.
function runPreflight(config, cases, taintedTokens) {
  const tainted = new Set(taintedTokens);
  const rows = [];
  for (const item of cases) {
    const result = runLarkJson(config.executorProfile, [
      "docs",
      "+search",
      "--as",
      "user",
      "--query",
      item.query,
      "--page-size",
      "20",
    ]);
    if (!result.ok || !result.json || result.json.ok === false) {
      rows.push({
        case_id: item.case_id,
        query: item.query,
        contamination_risk: true,
        tainted_tokens: [],
        top_20_tokens: [],
        error: summarizeFailure(result),
      });
      continue;
    }
    const tokens = extractResultTokens(result.json);
    const taintedHits = tokens.filter((token) => tainted.has(token));
    const heuristicHits = extractHeuristicTaintedHits(result.json);
    rows.push({
      case_id: item.case_id,
      query: item.query,
      contamination_risk: taintedHits.length > 0 || heuristicHits.length > 0,
      tainted_tokens: taintedHits,
      heuristic_hits: heuristicHits,
      top_20_tokens: tokens,
    });
  }
  return rows;
}

// Base metadata block shared by every meta.json (environment, profiles,
// identity, run configuration).
function makeBaseMeta(config, auth, startedAt): any {
  return {
    run_id: config.runId,
    started_at: startedAt,
    ended_at: new Date().toISOString(),
    lark_cli_version: larkCliVersion(),
    git_head: gitValue(["rev-parse", "HEAD"]),
    git_dirty: gitValue(["status", "--short"]) !== "",
    loader_profile: config.loaderProfile || "default",
    executor_profile: config.executorProfile || "default",
    user_open_id: auth?.userOpenId || "",
    user_name: auth?.userName || "",
    subset: config.subset,
    snapshot_only: config.snapshotOnly,
    dataset_file: config.datasetFile || "",
  };
}

// Zeroed summary.json for runs that abort before scoring; `primary` names
// the bottleneck (auth / dataset_access / account_isolation / ...).
function blockedSummary(config, primary, blockers, extra: any = {}) {
  return {
    run_id: config.runId,
    status: "blocked",
    dataset_size: extra.datasetSize || 0,
    scored: 0,
    contaminated_skipped: 0,
    parse_error_cases: extra.parseErrorCases || [],
    primary_bottleneck: primary,
    totals: {
      sum: 0,
      max: 0,
      percent: null,
      per_dim: { recall: null, accuracy: null, completeness: null },
    },
    findings: [],
    pollution_warnings: extra.pollutionWarnings || [],
    blockers,
  };
}

// Entry point: snapshot-only mode fetches the dataset and stops; otherwise
// it verifies executor auth/isolation, loads or fetches the dataset, and
// runs the blind-permission and pollution checks.
function main() {
  const config = parseArgs(process.argv.slice(2));
  if (config.help) {
    usage();
    return;
  }
  const root = repoRoot();
  config.runId = config.runId || utcRunId();
  const startedAt = new Date().toISOString();
  const runDir = path.join(root, "tests/eval-search/runs", config.runId);
  ensureDir(runDir);
  ensureDir(path.join(runDir, "trajectories"));

  const excluded = readExcludedUserIds(root);
  const taintedTokens = readTaintedTokens(root);

  if (config.snapshotOnly) {
    const loaderAuthResult = runLarkJson(config.loaderProfile, ["auth", "status"]);
    const loaderAuth = loaderAuthResult.ok && loaderAuthResult.json ?
loaderAuthResult.json : null; + if (!loaderAuth || loaderAuth.ok === false) { + const meta = makeBaseMeta(config, loaderAuth, startedAt); + meta.status = "blocked"; + meta.notes = ["loader auth status failed", summarizeFailure(loaderAuthResult)]; + writeMeta(runDir, meta); + const summary = blockedSummary(config, "auth", meta.notes); + writeSummary(runDir, summary); + printRunResult(root, runDir, summary, console.error); + process.exitCode = 2; + return; + } + + let baseData; + try { + baseData = fetchAllBaseRows(config, runDir); + } catch (err) { + const meta = makeBaseMeta(config, loaderAuth, startedAt); + meta.status = "blocked"; + meta.notes = [ + "live dataset fetch failed before dataset.jsonl could be created", + err.message, + ]; + writeMeta(runDir, meta); + const summary = blockedSummary(config, "dataset_access", [ + `Cannot fetch latest evaluation dataset from Base ${config.baseToken} / table ${config.tableId} / view ${config.viewId}: ${err.message}`, + "Cannot create a local snapshot without Base read permission.", + ]); + writeSummary(runDir, summary); + printRunResult(root, runDir, summary, console.error); + process.exitCode = 2; + return; + } + + const { cases, skippedEmptyQuery } = convertDataset(baseData, config.subset); + writeDataset(runDir, cases); + const parseErrorCases = cases + .filter((item) => item.parse_error) + .map((item) => item.case_id); + const meta = makeBaseMeta(config, loaderAuth, startedAt); + meta.status = "snapshot_ready"; + meta.dataset_size = cases.length; + meta.cases_skipped_parse_error = parseErrorCases.length; + meta.skipped_empty_query = skippedEmptyQuery; + meta.notes = [ + "local dataset snapshot created", + "remove the executor account's Base permission, then rerun with --dataset-file pointing at this dataset.jsonl", + ]; + writeMeta(runDir, meta); + const summary = { + run_id: config.runId, + status: "snapshot_ready", + dataset_size: cases.length, + scored: 0, + contaminated_skipped: 0, + parse_error_cases: 
parseErrorCases, + primary_bottleneck: null, + totals: { + sum: 0, + max: cases.length * 15, + percent: null, + per_dim: { recall: null, accuracy: null, completeness: null }, + }, + findings: [], + pollution_warnings: [], + blockers: [ + "blind setup has not run yet; remove Base permission and rerun with --dataset-file", + ], + }; + writeSummary(runDir, summary); + console.log( + JSON.stringify( + { + run_id: config.runId, + status: "snapshot_ready", + run_dir: path.relative(root, runDir), + dataset_file: path.relative(root, path.join(runDir, "dataset.jsonl")), + dataset_size: cases.length, + parse_errors: parseErrorCases.length, + }, + null, + 2, + ), + ); + return; + } + + const executorAuthResult = runLarkJson(config.executorProfile, ["auth", "status"]); + let executorAuth = null; + if (executorAuthResult.ok && executorAuthResult.json) { + executorAuth = executorAuthResult.json; + } + if (!executorAuth || executorAuth.ok === false) { + const meta = makeBaseMeta(config, executorAuth, startedAt); + meta.status = "blocked"; + meta.notes = ["executor auth status failed", summarizeFailure(executorAuthResult)]; + writeMeta(runDir, meta); + const summary = blockedSummary(config, "auth", meta.notes); + writeSummary(runDir, summary); + printRunResult(root, runDir, summary, console.error); + process.exitCode = 2; + return; + } + + if (excluded.includes(executorAuth.userOpenId)) { + const blocker = `executor userOpenId ${executorAuth.userOpenId} is in excluded_user_ids`; + const meta = makeBaseMeta(config, executorAuth, startedAt); + meta.status = "blocked"; + meta.notes = [blocker]; + writeMeta(runDir, meta); + const summary = blockedSummary(config, "account_isolation", [blocker]); + writeSummary(runDir, summary); + printRunResult(root, runDir, summary, console.error); + process.exitCode = 2; + return; + } + + let cases; + let skippedEmptyQuery = 0; + if (config.datasetFile) { + const loaded = readDatasetFile(root, config.datasetFile, config.subset); + cases = 
loaded.cases; + } else { + let baseData; + try { + baseData = fetchAllBaseRows(config, runDir); + } catch (err) { + const meta = makeBaseMeta(config, executorAuth, startedAt); + meta.status = "blocked"; + meta.notes = [ + "live dataset fetch failed before dataset.jsonl could be created", + err.message, + ]; + writeMeta(runDir, meta); + const summary = blockedSummary(config, "dataset_access", [ + `Cannot fetch latest evaluation dataset from Base ${config.baseToken} / table ${config.tableId} / view ${config.viewId}: ${err.message}`, + "Cannot perform a valid eval-search run without dataset.jsonl from the live Base.", + ]); + writeSummary(runDir, summary); + printRunResult(root, runDir, summary, console.error); + process.exitCode = 2; + return; + } + const converted = convertDataset(baseData, config.subset); + cases = converted.cases; + skippedEmptyQuery = converted.skippedEmptyQuery; + } + writeDataset(runDir, cases); + + const baseProbe = executorCanReadBase(config); + if (baseProbe.canRead !== false) { + const blocker = + baseProbe.canRead === true + ? 
"executor profile can read the evaluation Base; this would contaminate blind search" + : `executor Base access probe failed in an ambiguous way: ${baseProbe.failure}`; + const meta = makeBaseMeta(config, executorAuth, startedAt); + meta.status = "blocked"; + meta.cases_scored = 0; + meta.cases_skipped_parse_error = cases.filter((item) => item.parse_error).length; + meta.notes = [blocker]; + writeMeta(runDir, meta); + const summary = blockedSummary(config, "account_isolation", [blocker], { + datasetSize: cases.length, + parseErrorCases: cases.filter((item) => item.parse_error).map((item) => item.case_id), + }); + writeSummary(runDir, summary); + printRunResult(root, runDir, summary, console.error); + process.exitCode = 2; + return; + } + + const preflight = runPreflight(config, cases, taintedTokens); + writeJson(path.join(runDir, "preflight.json"), preflight); + + const parseErrorCases = cases + .filter((item) => item.parse_error) + .map((item) => item.case_id); + const contaminationCount = preflight.filter((item) => item.contamination_risk).length; + const meta = makeBaseMeta(config, executorAuth, startedAt); + meta.status = "ready_for_executor"; + meta.cases_scored = 0; + meta.cases_skipped_parse_error = parseErrorCases.length; + meta.skipped_empty_query = skippedEmptyQuery; + meta.notes = [ + "deterministic setup completed: dataset.jsonl and preflight.json are ready", + "AI executor and judge phases are intentionally not run by this Node setup runner", + ]; + writeMeta(runDir, meta); + + writeSummary(runDir, { + run_id: config.runId, + status: "ready_for_executor", + dataset_size: cases.length, + scored: 0, + contaminated_skipped: 0, + parse_error_cases: parseErrorCases, + primary_bottleneck: null, + totals: { + sum: 0, + max: cases.length * 15, + percent: null, + per_dim: { recall: null, accuracy: null, completeness: null }, + }, + findings: [], + pollution_warnings: + contaminationCount > 0 + ? 
[`preflight found tainted tokens in ${contaminationCount} case(s)`] + : [], + blockers: [ + "executor and judge phases still require the agent workflow described in skills/eval-search/prompts", + ], + }); + + console.log( + JSON.stringify( + { + run_id: config.runId, + status: "ready_for_executor", + run_dir: path.relative(root, runDir), + dataset_size: cases.length, + parse_errors: parseErrorCases.length, + contamination_risks: contaminationCount, + }, + null, + 2, + ), + ); +} + +try { + main(); +} catch (err) { + console.error(err.stack || err.message); + process.exitCode = 1; +} diff --git a/tests/harness/sample-plan.json b/tests/harness/sample-plan.json new file mode 100644 index 000000000..41123495c --- /dev/null +++ b/tests/harness/sample-plan.json @@ -0,0 +1,95 @@ +{ + "name": "sample-dev-harness", + "version": 1, + "stages": [ + { + "id": "explore", + "objective": "inspect repository state", + "steps": [ + { + "id": "git_status", + "command": [ + "git", + "status", + "--short", + "--branch" + ], + "expect": { + "exitCode": 0 + } + } + ] + }, + { + "id": "plan", + "objective": "confirm stable project instructions exist", + "steps": [ + { + "id": "agents_file_exists", + "command": [ + "test", + "-f", + "AGENTS.md" + ], + "expect": { + "exitCode": 0 + } + } + ] + }, + { + "id": "act", + "objective": "surface current implementation delta", + "steps": [ + { + "id": "diff_stat", + "command": [ + "git", + "diff", + "--stat" + ], + "expect": { + "exitCode": 0 + } + } + ] + }, + { + "id": "verify", + "objective": "verify runner syntax", + "steps": [ + { + "id": "node_smoke_runner", + "command": [ + "node", + "--experimental-strip-types", + "scripts/harness-runner.ts", + "--help" + ], + "expect": { + "exitCode": 0, + "stdoutIncludes": "--plan" + } + } + ] + }, + { + "id": "retrospect", + "objective": "return final branch state", + "steps": [ + { + "id": "branch_state", + "command": [ + "git", + "status", + "--short", + "--branch" + ], + "expect": { + "exitCode": 0 + 
} + } + ] + } + ] +} diff --git a/tests/harness/self-correct-plan.json b/tests/harness/self-correct-plan.json new file mode 100644 index 000000000..b57900c28 --- /dev/null +++ b/tests/harness/self-correct-plan.json @@ -0,0 +1,57 @@ +{ + "name": "self-correct-dev-harness", + "version": 1, + "stages": [ + { + "id": "explore", + "objective": "prove the runner can execute a basic stage", + "steps": [ + { + "id": "node_available", + "command": [ + "node", + "--version" + ], + "expect": { + "exitCode": 0 + } + } + ] + }, + { + "id": "verify", + "objective": "prove correction steps are executed and retried", + "steps": [ + { + "id": "marker_exists_after_correction", + "command": "test -f \"$HARNESS_RUN_DIR/self-correct-marker\"", + "expect": { + "exitCode": 0 + }, + "corrections": [ + { + "id": "create_marker", + "command": "touch \"$HARNESS_RUN_DIR/self-correct-marker\"", + "expect": { + "exitCode": 0 + } + } + ] + } + ] + }, + { + "id": "retrospect", + "objective": "read the generated marker", + "steps": [ + { + "id": "marker_readback", + "command": "test -f \"$HARNESS_RUN_DIR/self-correct-marker\"", + "expect": { + "exitCode": 0 + } + } + ] + } + ] +} diff --git a/tsconfig.harness.json b/tsconfig.harness.json new file mode 100644 index 000000000..795229920 --- /dev/null +++ b/tsconfig.harness.json @@ -0,0 +1,24 @@ +{ + "compilerOptions": { + "target": "ES2020", + "module": "CommonJS", + "lib": [ + "ES2020" + ], + "types": [ + "node" + ], + "allowJs": false, + "checkJs": false, + "noEmit": true, + "strict": false, + "moduleDetection": "force", + "forceConsistentCasingInFileNames": true, + "skipLibCheck": true + }, + "include": [ + "scripts/harness-runner.ts", + "tests/eval-search/eval-search-run.ts", + "tests/eval-search/eval-search-collect-search.ts" + ] +} From cb6c9c931e4f516bb86f063b784fdbf2326fd856 Mon Sep 17 00:00:00 2001 From: zhuhao Date: Wed, 6 May 2026 20:48:25 +0800 Subject: [PATCH 2/6] fix(eval-search): request json for base dataset fetch Change-Id: 
I9a2ad5e1626554da0cd93e702ed906210bb52999 --- tests/eval-search/eval-search-run.ts | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tests/eval-search/eval-search-run.ts b/tests/eval-search/eval-search-run.ts index c83c423d9..c37fcc68b 100644 --- a/tests/eval-search/eval-search-run.ts +++ b/tests/eval-search/eval-search-run.ts @@ -272,6 +272,8 @@ function baseRecordArgs(config, limit, offset) { "+record-list", "--as", "user", + "--format", + "json", "--base-token", config.baseToken, "--table-id", From 7e54fa3ecde3d798b7aed42b93b53eb068902808 Mon Sep 17 00:00:00 2001 From: zhuhao Date: Thu, 7 May 2026 17:30:17 +0800 Subject: [PATCH 3/6] feat(eval-search): add closed-loop harness docs Change-Id: Ie0fa725150d873dbc9943fd14452a3a14c0479c0 --- .gitignore | 3 - .harness/plan.example.json | 287 ------- package-lock.json | 39 +- package.json | 6 - scripts/harness-runner.ts | 771 ------------------ skills/dev/SKILL.md | 88 -- skills/eval-search/RUBRIC.md | 10 +- skills/eval-search/SKILL.md | 42 +- skills/eval-search/prompts/executor.md | 9 +- skills/eval-search/prompts/judge.md | 6 +- skills/eval-search/prompts/optimizer.md | 6 +- skills/eval-search/references/cycle.md | 222 +++++ skills/eval-search/references/dataset.md | 2 +- .../references/known-tainted-tokens.md | 12 +- .../references/pollution-preflight.md | 8 +- skills/eval-search/references/run-layout.md | 7 +- skills/lark-doc/references/lark-doc-search.md | 6 - tests/eval-search/.gitignore | 1 + .../eval-search/eval-search-collect-search.ts | 150 +++- tests/eval-search/eval-search-run.ts | 42 +- tests/harness/sample-plan.json | 95 --- tests/harness/self-correct-plan.json | 57 -- tsconfig.harness.json | 24 - 23 files changed, 454 insertions(+), 1439 deletions(-) delete mode 100644 .harness/plan.example.json delete mode 100644 scripts/harness-runner.ts delete mode 100644 skills/dev/SKILL.md create mode 100644 skills/eval-search/references/cycle.md create mode 100644 tests/eval-search/.gitignore delete mode 100644 
tests/harness/sample-plan.json delete mode 100644 tests/harness/self-correct-plan.json delete mode 100644 tsconfig.harness.json diff --git a/.gitignore b/.gitignore index 435af6819..90313e480 100644 --- a/.gitignore +++ b/.gitignore @@ -34,11 +34,8 @@ tests/mail/reports/ # Generated / test artifacts .hammer/ -.harness/runs/ -.harness_local/ internal/registry/meta_data.json cmd/api/download.bin app.log /sidecar-server-demo /server-demo -tests/eval-search/runs/ diff --git a/.harness/plan.example.json b/.harness/plan.example.json deleted file mode 100644 index 0f09132a9..000000000 --- a/.harness/plan.example.json +++ /dev/null @@ -1,287 +0,0 @@ -{ - "name": "eval-search-delivery-harness", - "version": 1, - "objective": "make the eval-search workflow executable, reviewable, and reusable while preserving blind search evaluation", - "target": { - "skill": "eval-search", - "outcome": "dataset snapshot -> blind executor evidence -> judge scoring -> optimizer-ready report" - }, - "inputs": [ - { - "id": "loader_profile", - "required": true, - "description": "lark-cli profile that can read the eval Base during dataset setup" - }, - { - "id": "executor_profile", - "required": true, - "description": "dedicated blind lark-cli profile that cannot read the eval Base" - }, - { - "id": "subset_or_dataset_file", - "required": false, - "description": "subset for smoke runs or dataset-file for two-step strict mode" - }, - { - "id": "eval_run_id", - "required": false, - "description": "stable run id under tests/eval-search/runs when reproducibility matters" - } - ], - "lifecycle": { - "id": "eval-search", - "goal": "bring lkkcli-style lifecycle control to the existing search evaluation harness", - "stage_order": [ - "prepare", - "understand", - "plan", - "act", - "verify", - "retrospect" - ] - }, - "constraints": { - "enforce_stage_order": true, - "state_root": "tests/eval-search/runs", - "role_isolation": [ - "loader profile may read the eval Base only during dataset setup", - 
"executor profile receives query/case_id/run_dir only and must not read expected answers", - "judge starts after executor trajectories are complete", - "optimizer receives aggregated verdicts instead of full raw trajectories" - ], - "allowed_write_paths": [ - "skills/eval-search/**", - "tests/eval-search/**", - "shortcuts/**", - ".harness/**" - ] - }, - "env": { - "EVAL_SEARCH_RUN_ROOT": "tests/eval-search/runs" - }, - "artifacts": [ - { - "id": "eval_skill", - "path": "skills/eval-search/SKILL.md", - "required": true - }, - { - "id": "rubric", - "path": "skills/eval-search/RUBRIC.md", - "required": true - }, - { - "id": "executor_prompt", - "path": "skills/eval-search/prompts/executor.md", - "required": true - }, - { - "id": "judge_prompt", - "path": "skills/eval-search/prompts/judge.md", - "required": true - }, - { - "id": "optimizer_prompt", - "path": "skills/eval-search/prompts/optimizer.md", - "required": true - }, - { - "id": "harness_runner_source", - "path": "scripts/harness-runner.ts", - "required": true - }, - { - "id": "setup_runner_source", - "path": "tests/eval-search/eval-search-run.ts", - "required": true - }, - { - "id": "evidence_collector_source", - "path": "tests/eval-search/eval-search-collect-search.ts", - "required": true - } - ], - "stages": [ - { - "id": "prepare", - "objective": "establish repo state and local tool availability before touching eval data", - "steps": [ - { - "id": "git_status", - "command": [ - "git", - "status", - "--short", - "--branch" - ], - "expect": { - "exitCode": 0 - } - }, - { - "id": "lark_cli_available", - "required": false, - "command": [ - "lark-cli", - "--version" - ], - "expect": { - "exitCode": 0 - } - } - ] - }, - { - "id": "understand", - "objective": "pin the eval-search contract before executing any case", - "steps": [ - { - "id": "skill_contract_mentions_roles", - "command": [ - "node", - "-e", - "const s=require('fs').readFileSync('skills/eval-search/SKILL.md','utf8'); for (const w of 
['Executor','Judge','Optimizer','盲测']) if (!s.includes(w)) process.exit(1);" - ], - "expect": { - "exitCode": 0 - } - }, - { - "id": "rubric_exists", - "command": [ - "test", - "-f", - "skills/eval-search/RUBRIC.md" - ], - "expect": { - "exitCode": 0 - } - } - ] - }, - { - "id": "plan", - "objective": "prove the deterministic setup, blind collector, and scoring prompts are all wired", - "steps": [ - { - "id": "setup_help", - "command": [ - "node", - "--experimental-strip-types", - "tests/eval-search/eval-search-run.ts", - "--help" - ], - "expect": { - "exitCode": 0, - "stdoutIncludes": "--executor-profile" - } - }, - { - "id": "collector_help", - "command": [ - "node", - "--experimental-strip-types", - "tests/eval-search/eval-search-collect-search.ts", - "--help" - ], - "expect": { - "exitCode": 0, - "stdoutIncludes": "--fetch-top" - } - } - ] - }, - { - "id": "act", - "objective": "check the implementation pieces that produce setup and evidence artifacts", - "steps": [ - { - "id": "setup_runner_smoke", - "command": [ - "node", - "--experimental-strip-types", - "tests/eval-search/eval-search-run.ts", - "--help" - ], - "expect": { - "exitCode": 0, - "stdoutIncludes": "--executor-profile" - } - }, - { - "id": "collector_smoke", - "command": [ - "node", - "--experimental-strip-types", - "tests/eval-search/eval-search-collect-search.ts", - "--help" - ], - "expect": { - "exitCode": 0, - "stdoutIncludes": "--fetch-top" - } - } - ] - }, - { - "id": "verify", - "objective": "type-check TS and run local deterministic gates before any live eval-search run", - "steps": [ - { - "id": "typescript_check", - "command": [ - "npm", - "run", - "harness:check" - ], - "expect": { - "exitCode": 0 - } - }, - { - "id": "harness_runner_smoke", - "command": [ - "node", - "--experimental-strip-types", - "scripts/harness-runner.ts", - "--help" - ], - "expect": { - "exitCode": 0, - "stdoutIncludes": "--plan" - } - }, - { - "id": "skill_format", - "command": [ - "node", - 
"scripts/skill-format-check/index.js", - "skills" - ], - "expect": { - "exitCode": 0, - "stdoutIncludes": "Skill format check passed" - } - } - ] - }, - { - "id": "retrospect", - "objective": "capture the final repo state and contract summary for the next run", - "steps": [ - { - "id": "branch_state", - "command": [ - "git", - "status", - "--short", - "--branch" - ], - "expect": { - "exitCode": 0 - } - } - ] - } - ] -} diff --git a/package-lock.json b/package-lock.json index 17691befc..5c63f1dc9 100644 --- a/package-lock.json +++ b/package-lock.json @@ -1,12 +1,12 @@ { "name": "@larksuite/cli", - "version": "1.0.23", + "version": "1.0.11", "lockfileVersion": 3, "requires": true, "packages": { "": { "name": "@larksuite/cli", - "version": "1.0.23", + "version": "1.0.11", "cpu": [ "x64", "arm64" @@ -24,10 +24,6 @@ "bin": { "lark-cli": "scripts/run.js" }, - "devDependencies": { - "@types/node": "^25.6.0", - "typescript": "^6.0.3" - }, "engines": { "node": ">=16" } @@ -54,16 +50,6 @@ "sisteransi": "^1.0.5" } }, - "node_modules/@types/node": { - "version": "25.6.0", - "resolved": "https://registry.npmjs.org/@types/node/-/node-25.6.0.tgz", - "integrity": "sha512-+qIYRKdNYJwY3vRCZMdJbPLJAtGjQBudzZzdzwQYkEPQd+PJGixUL5QfvCLDaULoLv+RhT3LDkwEfKaAkgSmNQ==", - "dev": true, - "license": "MIT", - "dependencies": { - "undici-types": "~7.19.0" - } - }, "node_modules/fast-string-truncated-width": { "version": "1.2.1", "resolved": "https://registry.npmjs.org/fast-string-truncated-width/-/fast-string-truncated-width-1.2.1.tgz", @@ -93,27 +79,6 @@ "resolved": "https://registry.npmjs.org/sisteransi/-/sisteransi-1.0.5.tgz", "integrity": "sha512-bLGGlR1QxBcynn2d5YmDX4MGjlZvy2MRBDRNHLJ8VI6l6+9FUiyTFNJ0IveOSP0bcXgVDPRcfGqA0pjaqUpfVg==", "license": "MIT" - }, - "node_modules/typescript": { - "version": "6.0.3", - "resolved": "https://registry.npmjs.org/typescript/-/typescript-6.0.3.tgz", - "integrity": 
"sha512-y2TvuxSZPDyQakkFRPZHKFm+KKVqIisdg9/CZwm9ftvKXLP8NRWj38/ODjNbr43SsoXqNuAisEf1GdCxqWcdBw==", - "dev": true, - "license": "Apache-2.0", - "bin": { - "tsc": "bin/tsc", - "tsserver": "bin/tsserver" - }, - "engines": { - "node": ">=14.17" - } - }, - "node_modules/undici-types": { - "version": "7.19.2", - "resolved": "https://registry.npmjs.org/undici-types/-/undici-types-7.19.2.tgz", - "integrity": "sha512-qYVnV5OEm2AW8cJMCpdV20CDyaN3g0AjDlOGf1OW4iaDEx8MwdtChUp4zu4H0VP3nDRF/8RKWH+IPp9uW0YGZg==", - "dev": true, - "license": "MIT" } } } diff --git a/package.json b/package.json index fc42df1ec..3d7d6310a 100644 --- a/package.json +++ b/package.json @@ -6,8 +6,6 @@ "lark-cli": "scripts/run.js" }, "scripts": { - "harness:run": "node --experimental-strip-types scripts/harness-runner.ts", - "harness:check": "tsc -p tsconfig.harness.json --noEmit", "postinstall": "node scripts/install.js" }, "os": [ @@ -36,9 +34,5 @@ ], "dependencies": { "@clack/prompts": "^1.2.0" - }, - "devDependencies": { - "@types/node": "^25.6.0", - "typescript": "^6.0.3" } } diff --git a/scripts/harness-runner.ts b/scripts/harness-runner.ts deleted file mode 100644 index 68ec06cc2..000000000 --- a/scripts/harness-runner.ts +++ /dev/null @@ -1,771 +0,0 @@ -#!/usr/bin/env node - -const { spawnSync } = require("node:child_process"); -const fs = require("node:fs"); -const path = require("node:path"); - -const DEFAULT_STAGE_ORDER = ["explore", "plan", "act", "verify", "retrospect"]; - -function usage() { - console.log(`Usage: - node --experimental-strip-types scripts/harness-runner.ts --plan [options] - -Options: - --plan JSON plan to execute - --run-id Run id, defaults to local timestamp - --run-root Artifact root, defaults to .harness/runs - --cwd Working directory, defaults to repo root/current cwd - --max-corrections Correction rounds per step, defaults to 2 - --format - --dry-run Return step results without executing commands - --help Show this help - -Plan schema: - { - "name": "demo", - 
"objective": "make eval-search execution reproducible", - "target": { - "skill": "eval-search", - "outcome": "blind search eval with scored summary" - }, - "inputs": [ - { "id": "loader_profile", "required": true }, - { "id": "executor_profile", "required": true } - ], - "lifecycle": { - "id": "eval-search", - "stage_order": ["prepare", "understand", "plan", "act", "verify", "retrospect"] - }, - "constraints": { - "enforce_stage_order": true, - "state_root": "tests/eval-search/runs", - "role_isolation": ["loader", "executor", "judge", "optimizer"] - }, - "artifacts": [ - { "id": "rubric", "path": "skills/eval-search/RUBRIC.md", "required": true } - ], - "stages": [ - { - "id": "prepare", - "steps": [ - { - "id": "git_status", - "command": ["git", "status", "--short", "--branch"], - "expect": { "exitCode": 0 }, - "corrections": [ - { "id": "show_status", "command": ["git", "status", "--short"] } - ] - } - ] - } - ] - } - -Every stage and step writes a structured result to the run directory. 
Failed -steps may run explicit correction steps, then retry themselves.`); -} - -function parseArgs(argv) { - const out: any = { - plan: "", - runId: "", - runRoot: "", - cwd: "", - maxCorrections: 2, - format: "pretty", - dryRun: false, - }; - for (let i = 0; i < argv.length; i += 1) { - const arg = argv[i]; - const next = () => { - if (i + 1 >= argv.length) { - throw new Error(`missing value for ${arg}`); - } - i += 1; - return argv[i]; - }; - if (arg === "--help" || arg === "-h") { - out.help = true; - } else if (arg === "--plan") { - out.plan = next(); - } else if (arg === "--run-id") { - out.runId = next(); - } else if (arg === "--run-root") { - out.runRoot = next(); - } else if (arg === "--cwd") { - out.cwd = next(); - } else if (arg === "--max-corrections") { - out.maxCorrections = Number.parseInt(next(), 10); - if (!Number.isFinite(out.maxCorrections) || out.maxCorrections < 0) { - throw new Error("--max-corrections must be a non-negative integer"); - } - } else if (arg === "--format") { - out.format = next(); - if (!["pretty", "json", "ndjson"].includes(out.format)) { - throw new Error("--format must be pretty, json, or ndjson"); - } - } else if (arg === "--dry-run") { - out.dryRun = true; - } else { - throw new Error(`unknown option ${arg}`); - } - } - if (!out.help && !out.plan) { - throw new Error("--plan is required"); - } - return out; -} - -function timestampId(date = new Date()) { - const tzOffsetMs = date.getTimezoneOffset() * 60 * 1000; - return new Date(date.getTime() - tzOffsetMs) - .toISOString() - .slice(0, 19) - .replace(/:/g, "-"); -} - -function repoRoot(cwd) { - const result = spawnSync("git", ["rev-parse", "--show-toplevel"], { - cwd, - encoding: "utf8", - }); - return result.status === 0 ? 
result.stdout.trim() : cwd; -} - -function ensureDir(dir) { - fs.mkdirSync(dir, { recursive: true }); -} - -function readJson(file) { - return JSON.parse(fs.readFileSync(file, "utf8")); -} - -function writeJson(file, value) { - fs.writeFileSync(file, `${JSON.stringify(value, null, 2)}\n`); -} - -function expandEnvVars(value, env) { - return String(value).replace(/\$\{?([A-Z_][A-Z0-9_]*)\}?/g, (match, key) => - Object.prototype.hasOwnProperty.call(env, key) ? env[key] : match, - ); -} - -function normalizePlan(plan) { - if (!Array.isArray(plan.stages) || plan.stages.length === 0) { - throw new Error("plan.stages must be a non-empty array"); - } - return { - name: plan.name || "harness", - version: plan.version || 1, - objective: plan.objective || "", - target: normalizeObject(plan.target, "target"), - inputs: normalizeInputs(plan.inputs || []), - lifecycle: normalizeLifecycle(plan.lifecycle || {}, plan.objective || ""), - constraints: normalizeConstraints(plan.constraints || {}), - env: normalizeEnv(plan.env || {}), - artifacts: normalizeArtifacts(plan.artifacts || []), - stages: plan.stages.map((stage, index) => { - if (!stage.id) { - throw new Error(`stage at index ${index} is missing id`); - } - if (!Array.isArray(stage.steps) || stage.steps.length === 0) { - throw new Error(`stage ${stage.id} must have at least one step`); - } - return { - id: stage.id, - objective: stage.objective || "", - required: stage.required !== false, - steps: stage.steps.map((step, stepIndex) => normalizeStep(step, stage.id, stepIndex)), - }; - }), - }; -} - -function normalizeObject(value, name) { - if (value === undefined || value === null) { - return {}; - } - if (typeof value !== "object" || Array.isArray(value)) { - throw new Error(`plan.${name} must be an object`); - } - return value; -} - -function normalizeLifecycle(lifecycle, objective) { - const stageOrder = lifecycle.stage_order || lifecycle.stageOrder || DEFAULT_STAGE_ORDER; - if (!Array.isArray(stageOrder) || 
stageOrder.some((stage) => typeof stage !== "string" || !stage)) { - throw new Error("plan.lifecycle.stage_order must be a non-empty string array"); - } - return { - id: lifecycle.id || lifecycle.kind || "dev", - goal: lifecycle.goal || objective || "", - stage_order: stageOrder, - }; -} - -function normalizeConstraints(constraints) { - const out = { ...constraints }; - out.enforce_stage_order = constraints.enforce_stage_order === true || constraints.enforceStageOrder === true; - out.state_root = constraints.state_root || constraints.stateRoot || ""; - out.role_isolation = Array.isArray(constraints.role_isolation) - ? constraints.role_isolation - : Array.isArray(constraints.roleIsolation) - ? constraints.roleIsolation - : []; - out.allowed_write_paths = Array.isArray(constraints.allowed_write_paths) - ? constraints.allowed_write_paths - : Array.isArray(constraints.allowedWritePaths) - ? constraints.allowedWritePaths - : []; - return out; -} - -function normalizeEnv(env) { - if (typeof env !== "object" || env === null || Array.isArray(env)) { - throw new Error("plan.env must be an object"); - } - return Object.fromEntries( - Object.entries(env).map(([key, value]) => { - if (!/^[A-Z_][A-Z0-9_]*$/.test(key)) { - throw new Error(`plan.env key ${key} must be UPPER_SNAKE_CASE`); - } - return [key, String(value)]; - }), - ); -} - -function normalizeArtifacts(artifacts) { - if (!Array.isArray(artifacts)) { - throw new Error("plan.artifacts must be an array"); - } - return artifacts.map((artifact, index) => { - if (!artifact.id) { - throw new Error(`artifact at index ${index} is missing id`); - } - if (!artifact.path) { - throw new Error(`artifact ${artifact.id} is missing path`); - } - return { - id: artifact.id, - path: artifact.path, - required: artifact.required !== false, - description: artifact.description || "", - }; - }); -} - -function normalizeInputs(inputs) { - if (!Array.isArray(inputs)) { - throw new Error("plan.inputs must be an array"); - } - return 
inputs.map((input, index) => { - if (!input.id) { - throw new Error(`input at index ${index} is missing id`); - } - return { - id: input.id, - required: input.required !== false, - description: input.description || "", - source: input.source || "", - }; - }); -} - -function normalizeStep(step, stageId, index) { - if (!step.id) { - throw new Error(`step at ${stageId}[${index}] is missing id`); - } - if (!step.command) { - throw new Error(`step ${stageId}.${step.id} is missing command`); - } - return { - id: step.id, - name: step.name || step.id, - command: step.command, - cwd: step.cwd || "", - timeoutMs: step.timeout_ms || step.timeoutMs || 10 * 60 * 1000, - required: step.required !== false, - expect: step.expect || { exitCode: 0 }, - maxAttempts: step.max_attempts || step.maxAttempts || 1, - corrections: Array.isArray(step.corrections) - ? step.corrections.map((correction, correctionIndex) => - normalizeCorrection(correction, stageId, step.id, correctionIndex), - ) - : [], - }; -} - -function normalizeCorrection(correction, stageId, stepId, index) { - if (!correction.id) { - throw new Error(`correction at ${stageId}.${stepId}[${index}] is missing id`); - } - if (!correction.command) { - throw new Error(`correction ${stageId}.${stepId}.${correction.id} is missing command`); - } - return { - id: correction.id, - name: correction.name || correction.id, - command: correction.command, - cwd: correction.cwd || "", - timeoutMs: correction.timeout_ms || correction.timeoutMs || 10 * 60 * 1000, - expect: correction.expect || { exitCode: 0 }, - }; -} - -function commandText(command) { - return Array.isArray(command) ? command.join(" ") : command; -} - -function tail(text, limit = 4000) { - const value = String(text || ""); - return value.length <= limit ? 
value : value.slice(value.length - limit); -} - -function runCommand(command, opts) { - if (opts.dryRun) { - return { - status: 0, - signal: null, - stdout: "", - stderr: "", - error: null, - dry_run: true, - }; - } - const env = { ...process.env, ...opts.env }; - if (Array.isArray(command)) { - const [cmd, ...args] = command; - const result = spawnSync(cmd, args, { - cwd: opts.cwd, - env, - encoding: "utf8", - timeout: opts.timeoutMs, - maxBuffer: 64 * 1024 * 1024, - }); - return normalizeCommandResult(result); - } - const result = spawnSync(command, { - cwd: opts.cwd, - env, - shell: true, - encoding: "utf8", - timeout: opts.timeoutMs, - maxBuffer: 64 * 1024 * 1024, - }); - return normalizeCommandResult(result); -} - -function normalizeCommandResult(result) { - return { - status: typeof result.status === "number" ? result.status : 1, - signal: result.signal || null, - stdout: result.stdout || "", - stderr: result.stderr || "", - error: result.error ? result.error.message : null, - dry_run: false, - }; -} - -function expectationPassed(result, expect) { - const failures = []; - const exitCode = expect.exitCode === undefined ? 
0 : expect.exitCode; - if (result.status !== exitCode) { - failures.push(`exit code ${result.status}, expected ${exitCode}`); - } - if (expect.stdoutIncludes && !result.stdout.includes(expect.stdoutIncludes)) { - failures.push(`stdout missing ${JSON.stringify(expect.stdoutIncludes)}`); - } - if (expect.stderrIncludes && !result.stderr.includes(expect.stderrIncludes)) { - failures.push(`stderr missing ${JSON.stringify(expect.stderrIncludes)}`); - } - if (expect.stdoutMatches) { - const re = new RegExp(expect.stdoutMatches); - if (!re.test(result.stdout)) { - failures.push(`stdout did not match /${expect.stdoutMatches}/`); - } - } - return failures; -} - -function classifyFailure(step, result, failures) { - const text = `${result.stderr}\n${result.stdout}\n${result.error || ""}`; - const actions = []; - let category = "command_failed"; - if (result.error && /ENOENT/.test(result.error)) { - category = "missing_command"; - actions.push(`Install or put the command on PATH: ${commandText(step.command).split(/\s+/)[0]}`); - } else if (/command not found|not found/i.test(text)) { - category = "missing_command"; - actions.push("Install the missing command or adjust the plan command."); - } else if (/permission denied|not authorized|forbidden/i.test(text)) { - category = "permission"; - actions.push("Refresh auth or request the missing permission, then retry this step."); - } else if (/timed out|ETIMEDOUT|i\/o timeout/i.test(text)) { - category = "timeout"; - actions.push("Retry with a larger timeout or reduce the command scope."); - } else if (/working tree|worktree|uncommitted|dirty/i.test(`${step.id} ${text}`)) { - category = "dirty_worktree"; - actions.push("Inspect git status and decide whether to commit, stash, or narrow the plan."); - } - if (step.corrections.length > 0) { - actions.unshift("Run configured correction steps, then retry the failed step."); - } - if (actions.length === 0) { - actions.push("Inspect stdout/stderr and add a targeted correction step to the 
plan."); - } - return { - category, - failures, - next_actions: actions, - }; -} - -function makeEmitter(format, eventsFile) { - return function emit(event) { - fs.appendFileSync(eventsFile, `${JSON.stringify(event)}\n`); - if (format === "ndjson") { - console.log(JSON.stringify(event)); - } else if (format === "pretty" && event.type === "step_result") { - const mark = event.status === "passed" ? "PASS" : event.status === "corrected" ? "FIXED" : "FAIL"; - const retry = event.attempts > 1 ? ` attempts=${event.attempts}` : ""; - console.log(`[${mark}] ${event.stage_id}.${event.step_id}${retry} (${event.duration_ms}ms)`); - } else if (format === "pretty" && event.type === "stage_result") { - console.log(`[STAGE ${event.status.toUpperCase()}] ${event.stage_id}`); - } - }; -} - -function runCorrection(correction, context) { - const cwd = path.resolve(context.cwd, correction.cwd || "."); - const startedAt = new Date(); - const raw = runCommand(correction.command, { - cwd, - timeoutMs: correction.timeoutMs, - dryRun: context.dryRun, - env: context.env, - }); - const endedAt = new Date(); - const failures = expectationPassed(raw, correction.expect); - return { - id: correction.id, - name: correction.name, - command: commandText(correction.command), - cwd, - status: failures.length === 0 ? 
"passed" : "failed", - started_at: startedAt.toISOString(), - ended_at: endedAt.toISOString(), - duration_ms: endedAt.getTime() - startedAt.getTime(), - exit_code: raw.status, - signal: raw.signal, - stdout_tail: tail(raw.stdout), - stderr_tail: tail(raw.stderr), - error: raw.error, - expectation_failures: failures, - }; -} - -function runStep(stage, step, context) { - const startedAt = new Date(); - const attempts = []; - const correctionResults = []; - const maxAttempts = Math.max(1, step.maxAttempts + context.maxCorrections); - let finalStatus = "failed"; - let selfCorrection = null; - - for (let attempt = 1; attempt <= maxAttempts; attempt += 1) { - const cwd = path.resolve(context.cwd, step.cwd || "."); - const raw = runCommand(step.command, { - cwd, - timeoutMs: step.timeoutMs, - dryRun: context.dryRun, - env: { - ...context.env, - HARNESS_STAGE_ID: stage.id, - HARNESS_STEP_ID: step.id, - HARNESS_ATTEMPT: String(attempt), - }, - }); - const failures = expectationPassed(raw, step.expect); - attempts.push({ - attempt, - command: commandText(step.command), - cwd, - exit_code: raw.status, - signal: raw.signal, - stdout_tail: tail(raw.stdout), - stderr_tail: tail(raw.stderr), - error: raw.error, - expectation_failures: failures, - status: failures.length === 0 ? "passed" : "failed", - }); - if (failures.length === 0) { - finalStatus = attempt === 1 ? 
"passed" : "corrected"; - break; - } - - selfCorrection = classifyFailure(step, raw, failures); - if (attempt >= maxAttempts || step.corrections.length === 0) { - break; - } - for (const correction of step.corrections) { - correctionResults.push(runCorrection(correction, context)); - } - } - - const endedAt = new Date(); - return { - type: "step_result", - stage_id: stage.id, - step_id: step.id, - name: step.name, - required: step.required, - status: finalStatus, - started_at: startedAt.toISOString(), - ended_at: endedAt.toISOString(), - duration_ms: endedAt.getTime() - startedAt.getTime(), - attempts: attempts.length, - command: commandText(step.command), - attempt_results: attempts, - corrections: correctionResults, - self_correction: finalStatus === "passed" ? null : selfCorrection, - }; -} - -function runStage(stage, context) { - const startedAt = new Date(); - const stepResults = []; - let status = "passed"; - for (const step of stage.steps) { - const result = runStep(stage, step, context); - stepResults.push(result); - context.emit(result); - if (result.status === "failed" && step.required) { - status = "failed"; - break; - } - if (result.status === "corrected" && status !== "failed") { - status = "corrected"; - } - } - const endedAt = new Date(); - const stageResult = { - type: "stage_result", - stage_id: stage.id, - objective: stage.objective, - required: stage.required, - status, - started_at: startedAt.toISOString(), - ended_at: endedAt.toISOString(), - duration_ms: endedAt.getTime() - startedAt.getTime(), - steps: stepResults, - }; - context.emit({ - type: "stage_result", - stage_id: stage.id, - status, - duration_ms: stageResult.duration_ms, - failed_steps: stepResults.filter((step) => step.status === "failed").map((step) => step.step_id), - }); - return stageResult; -} - -function summarize(plan, stageResults, context, startedAt) { - const endedAt = new Date(); - const failedStages = stageResults.filter((stage) => stage.status === "failed"); - const 
stageShape = validateStageShape(plan); - const artifactResults = validateArtifacts(plan, context); - const missingRequiredArtifacts = artifactResults.filter((artifact) => artifact.required && !artifact.exists); - const stageOrderFailed = - plan.constraints.enforce_stage_order && - (stageShape.missing.length > 0 || stageShape.unexpected.length > 0 || stageShape.out_of_order.length > 0); - const failedSteps = stageResults.flatMap((stage) => - stage.steps - .filter((step) => step.status === "failed") - .map((step) => ({ - stage_id: stage.stage_id, - step_id: step.step_id, - category: step.self_correction?.category || "unknown", - next_actions: step.self_correction?.next_actions || [], - })), - ); - const correctedSteps = stageResults.flatMap((stage) => - stage.steps - .filter((step) => step.status === "corrected") - .map((step) => ({ stage_id: stage.stage_id, step_id: step.step_id })), - ); - const status = - failedStages.length === 0 && missingRequiredArtifacts.length === 0 && !stageOrderFailed ? "passed" : "failed"; - return { - run_id: context.runId, - plan_name: plan.name, - objective: plan.objective, - target: plan.target, - inputs: plan.inputs, - lifecycle: plan.lifecycle, - status, - started_at: startedAt.toISOString(), - ended_at: endedAt.toISOString(), - duration_ms: endedAt.getTime() - startedAt.getTime(), - run_dir: context.runDir, - stage_shape: stageShape, - artifacts: artifactResults, - contract_failures: [ - ...(stageOrderFailed - ? 
[ - { - category: "stage_order", - missing: stageShape.missing, - unexpected: stageShape.unexpected, - out_of_order: stageShape.out_of_order, - }, - ] - : []), - ...missingRequiredArtifacts.map((artifact) => ({ - category: "missing_artifact", - artifact_id: artifact.id, - path: artifact.path, - })), - ], - stages: stageResults.map((stage) => ({ - stage_id: stage.stage_id, - status: stage.status, - steps: stage.steps.length, - failed_steps: stage.steps.filter((step) => step.status === "failed").length, - corrected_steps: stage.steps.filter((step) => step.status === "corrected").length, - })), - failed_steps: failedSteps, - corrected_steps: correctedSteps, - }; -} - -function validateStageShape(plan) { - const ids = plan.stages.map((stage) => stage.id); - const expected = plan.lifecycle.stage_order; - const missing = expected.filter((stage) => !ids.includes(stage)); - const unexpected = ids.filter((stage) => !expected.includes(stage)); - const outOfOrder = []; - let lastIndex = -1; - for (const id of ids) { - const index = expected.indexOf(id); - if (index < 0) { - continue; - } - if (index < lastIndex) { - outOfOrder.push(id); - } - lastIndex = Math.max(lastIndex, index); - } - return { - lifecycle_id: plan.lifecycle.id, - expected_order: expected, - present: ids, - missing, - unexpected, - out_of_order: outOfOrder, - order_matches: missing.length === 0 && unexpected.length === 0 && outOfOrder.length === 0, - }; -} - -function validateArtifacts(plan, context) { - return plan.artifacts.map((artifact) => { - const artifactPath = expandEnvVars(artifact.path, { ...process.env, ...context.env }); - const resolvedPath = path.isAbsolute(artifactPath) ? artifactPath : path.resolve(context.cwd, artifactPath); - const exists = fs.existsSync(resolvedPath); - return { - id: artifact.id, - path: artifact.path, - resolved_path: resolvedPath, - required: artifact.required, - exists, - status: exists ? "present" : artifact.required ? 
"missing" : "optional_missing", - }; - }); -} - -function main() { - const args = parseArgs(process.argv.slice(2)); - if (args.help) { - usage(); - return; - } - const baseCwd = args.cwd ? path.resolve(args.cwd) : repoRoot(process.cwd()); - const runId = args.runId || timestampId(); - const runRoot = path.resolve(baseCwd, args.runRoot || ".harness/runs"); - const runDir = path.join(runRoot, runId); - const stagesDir = path.join(runDir, "stages"); - ensureDir(stagesDir); - - const planPath = path.resolve(baseCwd, args.plan); - const plan = normalizePlan(readJson(planPath)); - const eventsFile = path.join(runDir, "events.ndjson"); - fs.writeFileSync(eventsFile, ""); - const emit = makeEmitter(args.format, eventsFile); - const startedAt = new Date(); - const context = { - cwd: baseCwd, - runId, - runDir, - dryRun: args.dryRun, - maxCorrections: args.maxCorrections, - emit, - env: { - ...plan.env, - HARNESS_RUN_ID: runId, - HARNESS_RUN_DIR: runDir, - HARNESS_PLAN: plan.name, - }, - }; - - writeJson(path.join(runDir, "plan.json"), plan); - writeJson(path.join(runDir, "stage_shape.json"), validateStageShape(plan)); - writeJson(path.join(runDir, "contract.json"), { - objective: plan.objective, - target: plan.target, - inputs: plan.inputs, - lifecycle: plan.lifecycle, - constraints: plan.constraints, - artifacts: plan.artifacts, - env: plan.env, - }); - emit({ - type: "run_started", - run_id: runId, - plan_name: plan.name, - objective: plan.objective, - target: plan.target, - inputs: plan.inputs, - lifecycle: plan.lifecycle, - cwd: baseCwd, - run_dir: runDir, - dry_run: args.dryRun, - }); - - const stageResults = []; - for (const stage of plan.stages) { - const result = runStage(stage, context); - stageResults.push(result); - writeJson(path.join(stagesDir, `${stage.id}.json`), result); - if (result.status === "failed" && stage.required) { - break; - } - } - - const summary = summarize(plan, stageResults, context, startedAt); - writeJson(path.join(runDir, "summary.json"), 
summary); - emit({ type: "run_finished", ...summary }); - if (args.format === "json") { - console.log(JSON.stringify(summary, null, 2)); - } else if (args.format === "pretty") { - console.log(JSON.stringify(summary, null, 2)); - } - if (summary.status !== "passed") { - process.exitCode = 1; - } -} - -try { - main(); -} catch (err) { - console.error(JSON.stringify({ ok: false, error: err.message }, null, 2)); - process.exitCode = 1; -} diff --git a/skills/dev/SKILL.md b/skills/dev/SKILL.md deleted file mode 100644 index 57d94612e..000000000 --- a/skills/dev/SKILL.md +++ /dev/null @@ -1,88 +0,0 @@ ---- -name: dev -version: 0.3.0 -description: "eval-search 交付 Harness:借鉴 lkkcli /dev 的生命周期约束,把搜索评测目标落成可执行、可复盘、可修正的阶段计划。" -metadata: - requires: - bins: ["node", "git"] ---- - -# dev — eval-search 交付 Harness - -本 skill 只负责把 lkkcli `/dev` 的生命周期控制迁移到本仓库,不改变 `/eval-search` 的目标:评测 `lark-cli` 搜索能力,产出盲测轨迹、Judge 评分、归因和 Optimizer 可消费的报告。 - -## 定位 - -- `/eval-search` 是业务目标层:定义 Executor / Judge / Optimizer 隔离、评分、污染控制和 PR 生成。 -- `scripts/harness-runner.ts` 是状态执行层入口,直接通过 Node 的 TS type stripping 执行。 -- `.harness/plan.example.json` 是本仓库默认计划:用 lkkcli 风格的 `prepare -> understand -> plan -> act -> verify -> retrospect` 包住 eval-search。 - -不要把这个 skill 扩展成通用研发流水线;通用需求、部署、MR 和 CI 编排属于 lkkcli `/dev`。这里的交付标准仍然围绕搜索评测。 - -## 硬约束 - -1. **目标不漂移**:plan 的 `target.skill` 必须是 `eval-search`。 -2. **输入先声明**:loader profile、executor profile、subset/dataset-file、eval run id 必须写进 `inputs`。 -3. **生命周期可检查**:plan 必须声明 `lifecycle.stage_order`,并开启 `constraints.enforce_stage_order`。 -4. **角色隔离保留**:Loader、Executor、Judge、Optimizer 的输入边界必须写进 `constraints.role_isolation`。 -5. **TS 替代 JS**:runner、setup runner、evidence collector 只保留 `.ts`,不要再生成或维护同名 `.js`。 -6. **产物契约显式化**:rubric、Executor/Judge/Optimizer prompt、TS 入口必须列入 `artifacts`。 -7. 
**失败可恢复**:失败 step 必须输出 `self_correction`;能自动 correction 的写进 plan,不能自动处理的给出 next action。 - -## 标准入口 - -先运行本仓库默认计划,确认 eval-search 的静态契约和本地门禁都成立: - -```bash -node --experimental-strip-types scripts/harness-runner.ts --plan .harness/plan.example.json --format json -``` - -运行产物写入: - -```text -.harness/runs// - plan.json - contract.json - stage_shape.json - events.ndjson - stages/.json - summary.json -``` - -只有当 `summary.status == "passed"` 时,才继续执行真实 `/eval-search run` 或 `/eval-search propose-pr`。 - -## 生命周期语义 - -### Prepare - -确认 repo 状态、分支、dirty 文件和本地工具可用性。`lark-cli` 缺失不直接阻断静态门禁,但真实评测前必须补齐。 - -### Understand - -读取并确认 `/eval-search` 的核心契约:盲测、三角色隔离、rubric、污染控制。这个阶段不接触评测集答案。 - -### Plan - -确认 deterministic setup 和 evidence collector 可调用,并明确本轮使用的 loader/executor profile、subset、dataset-file 策略。 - -### Act - -检查或执行会产出 eval-search 运行材料的代码路径:dataset setup、pollution preflight、executor evidence collection。 - -### Verify - -运行递进式门禁:TypeScript check、runner syntax、eval-search 脚本 syntax、skill format。真实评测完成后,还要检查 `tests/eval-search/runs//summary.json` 和 regression 结果。 - -### Retrospect - -沉淀本轮的污染 token、失败归因、泛化改动声明和下一轮 correction。若需要新增经验,优先更新 `skills/eval-search/**` 或 `tests/eval-search/**`,不要散落到临时笔记。 - -## 收尾标准 - -最终回复用户前检查最新 summary: - -```bash -node --experimental-strip-types scripts/harness-runner.ts --plan .harness/plan.example.json --format json -``` - -如果 `summary.status != "passed"`,不能声称完成;必须给出 `summary.contract_failures` 和 `summary.failed_steps[*].next_actions`。 diff --git a/skills/eval-search/RUBRIC.md b/skills/eval-search/RUBRIC.md index 519f7563b..21cffa5ff 100644 --- a/skills/eval-search/RUBRIC.md +++ b/skills/eval-search/RUBRIC.md @@ -20,6 +20,8 @@ **特例**:`企业内是否有知识 == 否` 的 case,recall 固定 5 分(agent 不该找到任何高置信答案,答"没找到"也算召回正确)。 +**污染结果不计入 recall**:trajectory 里标记为 `tainted=true` 或 `evidence_excluded=true` 的搜索结果是可观测污染信号,但不是答案证据。即使 expected source token 只出现在这些污染结果里,也不能按 top-5 命中给 recall 分;只有非污染 `evidence_top_results` 或非污染 fetch 才能作为 recall 依据。 + ### accuracy(准确性,0-5) "Executor 
给出的最终答案**在事实层面**对不对"。对照评测集 `预期答复` 的【关键信息】段 + 【打分备注】里的 "可信无误" 说明。 @@ -58,6 +60,8 @@ 该项**直接从 total 扣**,且在 verdict 里显式标注,避免"刷分嫌疑"。 +collector / Executor 可以把 tainted 搜索结果写进 trajectory,但必须把它们标为 `evidence_excluded=true`,且不能作为答案合成、fetch 选择或 recall top-5 的证据。简言之:**tainted results are observable but non-evidential**。 + ## Verdict JSON schema 每个 case 一个 verdict,合并写入 `verdicts.json`。 @@ -80,13 +84,13 @@ }, "improvement": { "tool_capability": [ - "docs +search 返回结果没有 body_preview,agent 必须 fetch 才能判断相关性。建议返回摘要字段减少 fetch 次数" + "drive +search 返回结果没有 body_preview,agent 必须 fetch 才能判断相关性。建议返回摘要字段减少 fetch 次数" ], "search_strategy": [ "Executor 只用了原词 '华东 Aily 案例',没换 '客户成功故事' / '最佳实践' 等同义词" ], "skill_prompts": [ - "lark-doc-search.md 可新增同义词清单小节,含 'case / story / best practice' 映射" + "lark-drive-search.md 可新增同义词清单小节,含 'case / story / best practice' 映射" ] }, "contamination": { @@ -105,7 +109,7 @@ Judge 打完所有 case 后,主 agent 按以下规则聚合到 `summary.json` - 同一条 skill_prompts 建议指向 `skills/lark-doc/SKILL.md` 的,合并成一条 finding - finding 保留 `driving_cases: [case_003, case_007, ...]` 反向索引 2. **计算一阶瓶颈**:三桶的建议条数之和,占比最大的那个桶就是 `primary_bottleneck` -3. **统计 contamination**:有多少 case 被 fetch 到 tainted token,若 >2 个输出警告 +3. **统计 contamination**:分别统计 search-only 观测到 tainted token 的 case 数、被 fetch 到 tainted token 的 case 数;fetch 数 >2 时输出警告 4. 
**汇总每个维度的均值、总分** ## 校准指引(给 Judge 看的) diff --git a/skills/eval-search/SKILL.md b/skills/eval-search/SKILL.md index 9bd677a13..7a2dc6ce5 100644 --- a/skills/eval-search/SKILL.md +++ b/skills/eval-search/SKILL.md @@ -27,9 +27,11 @@ metadata: - "对比一下最近改动对搜索效果的影响" - "看看上一轮评测还有哪些归因没处理" -## 三个入口命令 +## 四个入口命令 ``` +/eval-search cycle [--loader-profile NAME] [--executor-profile NAME] [--subset N] [--report-doc URL] + # 一键闭环:run → 打分/report → propose-pr,并把阶段进展写入云文档 /eval-search run [--loader-profile NAME] [--executor-profile NAME] [--subset N] # 跑一轮评测,产出 run-id。默认全量;--subset=3 抽样冒烟 /eval-search run --snapshot-only # 只把评测集拉成本地 dataset.jsonl,供移除权限后复用 @@ -37,29 +39,17 @@ metadata: /eval-search report # 读已有 run 的 summary.json ``` -新人典型流程:`run` → 看 summary → `propose-pr` → review PR → merge。 +新人典型流程优先使用 `cycle`,只有调试单个阶段时才手动执行 `run` / `report` / `propose-pr`。 -## 状态层(向 lkkcli Harness 对齐) +## `/eval-search cycle` 上层闭环 -本仓库额外提供一个轻量状态层,把 lkkcli `/dev` 的生命周期约束套到 `/eval-search` 上,但不改变搜索评测目标: +详细步骤见 [`references/cycle.md`](references/cycle.md)。概要: -```bash -node --experimental-strip-types scripts/harness-runner.ts --plan .harness/plan.example.json --format json -``` - -这个 plan 的目标必须保持为 `target.skill = eval-search`,生命周期固定为: - -```text -prepare -> understand -> plan -> act -> verify -> retrospect -``` - -它只做四件事: -- 声明本轮 live run 需要的 loader profile、executor profile、subset/dataset-file、run-id -- 明确 Loader / Executor / Judge / Optimizer 的隔离边界 -- 检查 rubric、prompts、TS 入口等必备产物,并直接运行 `.ts` 入口 -- 把每个阶段的命令结果、失败归因、correction 和 contract failure 写入 `.harness/runs//summary.json` - -因此,真实评测仍然按下面的 `/eval-search run` 流程执行;状态层只是先把环境、约束和本地门禁变成可复盘的执行记录。若 `summary.status != "passed"`,不要启动真实评测或声称 PR 可交付。 +1. **初始化 cycle**:生成 `cycle-id` / `run-id`,创建 `tests/eval-search/runs//cycle.json` +2. **创建或绑定云文档**:若未传 `--report-doc`,用 `lark-cli docs +create --api-version v2 --doc-format markdown` 创建报告文档;若已传文档,则直接追加本轮章节 +3. **阶段化执行并记录**:内部串联 `run → score/report → propose-pr`,每个阶段开始、成功、失败都先写本地 `cycle.json`,再追加到云文档 +4. 
**产物归档**:云文档只写阶段状态、分数摘要、finding 摘要、PR URL、失败原因和本地产物路径;不得写标准答案、完整 trajectory、source_urls 或 key_error_snippets +5. **污染控制**:cycle 生成或使用的云文档默认是评测过程材料,必须记录为 tainted/process material;未来持久 blocklist 变更需要单独 PR,不得混入搜索效果优化 PR ## 三层架构(必须隔离,违反会让结果失真) @@ -90,7 +80,7 @@ Optimizer (sub-agent, Task 工具) 1. **确定性 setup**:先运行 `node --experimental-strip-types tests/eval-search/eval-search-run.ts --loader-profile --executor-profile [--subset N]`。脚本会生成 run-id,建目录 `tests/eval-search/runs//`,并完成第 2-4 步。若只有一个账号,可先用 `--snapshot-only` 拉本地 `dataset.jsonl`,移除该账号的评测 Base 权限后,再用 `--dataset-file /dataset.jsonl` 继续 2. **拉数据集**:按 [`references/dataset.md`](references/dataset.md) 用 loader profile 从评测 base 拉最新数据 → `dataset.jsonl` 3. **账号隔离**:按 [`references/pollution-preflight.md`](references/pollution-preflight.md) 检查 executor profile 不在 `excluded_user_ids`,并主动探测 executor 不能读取评测 Base;若能读取则阻断 -4. **污染预检**:用 executor profile 对每条 query 跑一次 `docs +search`,命中 [`references/known-tainted-tokens.md`](references/known-tainted-tokens.md) 里的 token 则标记 `contamination_risk`。只标记不阻断;Judge 阶段再决定是否扣分 +4. **污染预检**:用 executor profile 对每条 query 跑一次 `drive +search`,命中 [`references/known-tainted-tokens.md`](references/known-tainted-tokens.md) 里的 token 则标记 `contamination_risk`。只标记不阻断;Judge 阶段再决定是否扣分 5. **Executor 并行**:用 Task 工具启动 sub-agent 按 [`prompts/executor.md`](prompts/executor.md) 跑全部 case。每个 case trajectory 落盘 `trajectories/.json` 6. **Judge 逐 case**:主 agent 按 [`prompts/judge.md`](prompts/judge.md) 打分,写 `verdicts.json` 7. 
**聚合**:按"改动落点文件"对 improvements 聚类,写 `summary.json`;输出 run-id 给用户 @@ -110,6 +100,13 @@ Optimizer (sub-agent, Task 工具) ## 权限边界(v0.1 软约束,迭代中调整) +### PR 颗粒度 + +每个 `/eval-search propose-pr` 只能落一个主归因桶 / 一个改动主题。主 agent 在 apply diff 前必须复查 touched files,并按以下规则拆分: +- `search_strategy` / `skill_prompts`:只能提交搜索策略或 skill 文档优化 PR,例如 `skills/lark-drive/references/*-search.md` 或当前主搜索入口对应文档。不得混入 harness、runner、package、评测集、打分脚本或基础设施改动;不要给已进入维护期的 `docs +search` 新增策略依赖。 +- `tool_capability`:只能提交 CLI shortcut / open converter 能力 PR。不得混入搜索策略文档,除非同一能力改动必须同步更新对应使用说明。 +- `eval_harness` / 评测流程自身:必须独立 PR,不能和任何搜索效果优化 PR 混在一起。 + ### cli 仓库(`larksuite/cli`,当前目录) Optimizer 默认允许改: @@ -151,6 +148,7 @@ Optimizer 把该 finding 写进 PR description 的"未处理归因"段(含建 - [`prompts/executor.md`](prompts/executor.md) — Executor sub-agent 模板 - [`prompts/judge.md`](prompts/judge.md) — Judge 打分模板 - [`prompts/optimizer.md`](prompts/optimizer.md) — Optimizer PR 生成模板 +- [`references/cycle.md`](references/cycle.md) — 一键闭环 + 云文档阶段日志 - [`references/dataset.md`](references/dataset.md) — 评测集 schema + 拉取方式 - [`references/pollution-preflight.md`](references/pollution-preflight.md) — 污染预检规则 - [`references/known-tainted-tokens.md`](references/known-tainted-tokens.md) — 已知泄露文档标记清单 diff --git a/skills/eval-search/prompts/executor.md b/skills/eval-search/prompts/executor.md index 9f474c107..9ab9e15e0 100644 --- a/skills/eval-search/prompts/executor.md +++ b/skills/eval-search/prompts/executor.md @@ -24,8 +24,9 @@ 在发出第一条 lark-cli 命令之前,MUST 用 Read 读: - `skills/lark-shared/SKILL.md` — 认证、全局参数 -- `skills/lark-doc/SKILL.md` + `skills/lark-doc/references/lark-doc-search.md` — 云空间搜索 -(搜索方法论直接在 `lark-doc-search.md` 里:关键词改写 / 失败退出 / 大文档 fallback 都在该文件的决策规则段) +- `skills/lark-drive/SKILL.md` + `skills/lark-drive/references/lark-drive-search.md` — 云空间资源发现;优先使用 `drive +search`,不要新增依赖已进入维护期的 `docs +search` +- `skills/lark-doc/SKILL.md` — 命中文档后的 fetch / 内容读取 +(搜索方法论直接在 `lark-drive-search.md` 里:关键词改写 / 失败退出 / 答案型检索循环都在该文件的决策规则段) - 
`skills/lark-wiki/SKILL.md` — wiki 节点是壳的关键概念 根据 query 类型可能还要读:`lark-im`、`lark-mail`、`lark-vc`、`lark-minutes`、`lark-contact` 等。 @@ -33,8 +34,8 @@ ### 标准流程 1. 阅读 query,拆"实体"(人名 / 时间 / 关键词 / 资源类型) -2. 选择搜索入口(docs / im / mail / vc / minutes / ...) -3. 发起搜索;若返回空或无相关结果,按 `lark-doc-search.md` 的"决策规则 / `--query` 高级语法"换 2-3 轮词(同义词 / `intitle:` / 排除词) +2. 选择搜索入口(drive / im / mail / vc / minutes / ...) +3. 发起搜索;若返回空或无相关结果,按 `lark-drive-search.md` 的"决策规则 / `--query` 高级语法"换 2-3 轮词(同义词 / `intitle:` / 排除词) 4. 对 top 命中做进一步 fetch / resolve(wiki 节点必须先 `wiki +resolve-node`) 5. 综合信息给出答案;若 3 轮改写仍无结果,给 best-effort 结论并明确说"未找到直接证据" 6. 写 `/trajectories/.json`,结束 diff --git a/skills/eval-search/prompts/judge.md b/skills/eval-search/prompts/judge.md index b81dc9226..9238093c6 100644 --- a/skills/eval-search/prompts/judge.md +++ b/skills/eval-search/prompts/judge.md @@ -16,10 +16,10 @@ ## 每个 case 的打分步骤 -1. **recall**:扫 trajectory 里的每一条 tool_use,提取被 fetch / resolve 过的 token 和 URL 集合。与 `source_urls` 做交集。按 RUBRIC 打分 +1. **recall**:扫 trajectory 里的每一条 tool_use,提取被 fetch / resolve 过的 token 和 URL 集合,并读取 `evidence_top_results` / search round 里的非污染 evidence tokens。与 `source_urls` 做交集。标记为 `tainted=true` 或 `evidence_excluded=true` 的 search 结果只能算污染观测,不能算 recall top-5 命中。按 RUBRIC 打分 2. **accuracy**:把 `answer` 和 `expected.【关键信息】` 段逐条比对。优先应用 `expected.【打分备注】.可信无误` 3. **completeness**:数 key points 覆盖数。优先应用 `expected.【打分备注】.完整详实` -4. **contamination**:查 trajectory 是否 fetch 过 `preflight.tainted_tokens`;search-only 命中只记录风险,不扣污染分。若有 fetch,按 RUBRIC 给 `contamination_penalty` +4. **contamination**:查 trajectory 是否 fetch 过 `preflight.tainted_tokens`;search-only 命中只记录风险,不扣污染分,也不作为 recall/accuracy/completeness 的证据。若有 fetch,按 RUBRIC 给 `contamination_penalty` 5. 
**improvement 三桶**:从 trajectory 里找失败片段,分类写进 `tool_capability / search_strategy / skill_prompts` ## improvement 填写规则 @@ -75,7 +75,7 @@ "finding_id": "F-002", "bucket": "tool_capability", "target_file": "shortcuts/docs/search.go", - "suggestion": "docs +search 返回结果没有 body_preview,agent 必须 fetch 才能判断相关性", + "suggestion": "drive +search 返回结果没有 body_preview,agent 必须 fetch 才能判断相关性", "driving_cases": ["case_001", "case_005"], "priority": "medium" } diff --git a/skills/eval-search/prompts/optimizer.md b/skills/eval-search/prompts/optimizer.md index 07ac3e192..786e1f699 100644 --- a/skills/eval-search/prompts/optimizer.md +++ b/skills/eval-search/prompts/optimizer.md @@ -40,7 +40,7 @@ 1. **读 summary 全部 findings**,按 `priority` 降序处理 2. **对每条 finding**: - `skill_prompts` bucket → 用 Edit 改 cli 仓库的指定 markdown,保持 tone / 结构与周边一致 - - `search_strategy` bucket → 沉淀到 cli 仓库对应域的 `references/*-search.md`(如 `skills/lark-doc/references/lark-doc-search.md`),不要塞进本 harness 的 prompt 模板 + - `search_strategy` bucket → 沉淀到 cli 仓库对应域的 `references/*-search.md`(如 `skills/lark-drive/references/lark-drive-search.md`),不要塞进本 harness 的 prompt 模板 - `tool_capability` bucket → 分两步判断: 1. 如果 finding 本质是 cli 封装层不够(缺 shortcut、shortcut 输出难解析),评估能否在 cli 仓库加 shortcut 解决 2. 
如果是 OAPI 层(`BuildDisplayInfo` 信息不够、字段映射 bug),Read [`../references/open-repo-layout.md`](../references/open-repo-layout.md) 并严格按白名单改 open 仓库。不在白名单的 → 产出 issue 正文,写进 `unhandled_findings.md` 的 `proposed_issue` 段 @@ -70,7 +70,7 @@ "case_specific_changes": [ { "repo": "cli", - "file": "skills/lark-doc/references/lark-doc-search.md", + "file": "skills/lark-drive/references/lark-drive-search.md", "change_summary": "在同义词小节新增 '交个朋友 → Livflow 智能平台' 映射", "driving_cases": ["case_005"], "risk": "该同义词只由 case_005 驱动,强度弱。reviewer 可判断是否保留" @@ -79,7 +79,7 @@ "principled_changes": [ { "repo": "cli", - "file": "skills/lark-doc/SKILL.md", + "file": "skills/lark-drive/SKILL.md", "change_summary": "新增 '搜索词改写失败 3 次后给 best-effort 答案' 决策规则", "driving_cases": ["case_003", "case_007", "case_011"], "rationale": "泛化到任何搜索类任务的退出条件,不依赖具体 case 内容" diff --git a/skills/eval-search/references/cycle.md b/skills/eval-search/references/cycle.md new file mode 100644 index 000000000..2f7a34afb --- /dev/null +++ b/skills/eval-search/references/cycle.md @@ -0,0 +1,222 @@ +# cycle 上层闭环 + 云文档阶段日志 + +`/eval-search cycle` 是 `/eval-search run`、`/eval-search report`、`/eval-search propose-pr` 的上层编排入口。用户只触发一次,主 agent 负责按阶段推进、记录状态、遇到失败时停止并给出可恢复位置。 + +## 入口 + +```text +/eval-search cycle [--subset N] + [--loader-profile ] + [--executor-profile ] + [--report-doc ] + [--create-report-doc] + [--report-parent-token ] + [--skip-pr] +``` + +- `--report-doc`:把本轮阶段日志追加到已有云文档。 +- `--create-report-doc`:未传 `--report-doc` 时创建新云文档;默认创建到当前用户个人空间,可选 `--report-parent-token`。 +- `--skip-pr`:只跑到打分/report,不进入 optimizer 和 PR 创建。 +- 未指定云文档参数时,默认创建新报告文档。除非用户明确禁止云文档记录,否则 cycle 不走纯本地日志模式。 + +## 状态文件 + +cycle 必须先创建本地状态,再调用任何飞书或 GitHub 写操作: + +```text +tests/eval-search/runs// +├── cycle.json +└── cloud-doc/ + ├── 00-created.md + ├── 10-run-started.md + ├── 20-run-finished.md + ├── 30-score-finished.md + ├── 40-pr-finished.md + └── tainted_tokens.json +``` + +`cycle.json` 结构: + +```json +{ + "cycle_id": "2026-05-07T03-30Z", + "run_id": 
"2026-05-07T03-30Z", + "status": "running", + "started_at": "2026-05-07T03:30:00Z", + "ended_at": null, + "cloud_doc": { + "url": "", + "token": "", + "created_by_cycle": true, + "tainted": true + }, + "stages": [], + "pr_urls": [] +} +``` + +每次阶段状态变化都按顺序执行: + +1. 更新 `cycle.json` +2. 渲染一个 `cloud-doc/-.md` +3. 追加到云文档 +4. 只有云文档追加成功后才进入下一个阶段 + +若云文档追加失败,重试一次;仍失败则停止 cycle,把失败写入 `cycle.json`,不要继续提 PR。 + +## 云文档创建 / 追加 + +创建新文档: + +```bash +lark-cli docs +create --api-version v2 --as user \ + --doc-format markdown \ + --content @tests/eval-search/runs//cloud-doc/00-created.md \ + --jq '.data.document.url' +``` + +创建到指定目录: + +```bash +lark-cli docs +create --api-version v2 --as user \ + --parent-token '' \ + --doc-format markdown \ + --content @tests/eval-search/runs//cloud-doc/00-created.md \ + --jq '.data.document.url' +``` + +追加阶段日志: + +```bash +lark-cli docs +update --api-version v2 --as user \ + --doc '' \ + --command append \ + --doc-format markdown \ + --content @tests/eval-search/runs//cloud-doc/20-run-finished.md +``` + +Markdown 文件必须使用 `@file` 传参,避免 shell 转义破坏表格、链接或代码块。 + +## 云文档内容边界 + +云文档是给人看进度和 review 结果的,不是评测原始数据仓库。允许写: + +- cycle-id / run-id / git head / 分支 / 账号类型 +- stage 状态、开始结束时间、失败原因 +- dataset 数量、preflight 污染数量、executor 完成数量 +- 总分、各维度均值、finding 聚类摘要、PR URL +- 本地产物路径,例如 `tests/eval-search/runs//summary.json` + +禁止写: + +- `dataset.jsonl` 全量内容 +- 标准答案、source URLs、rubric 的 per-case 原文 +- 完整 trajectory、完整 verdict rationale、key_error_snippets +- 任何 access token、app secret、cookie、GitHub token + +per-case 信息只允许写 `case_id`、分数、桶归因和一句不含标准答案的摘要。 + +## 阶段编排 + +### 0. setup + +- 确认 repo 路径和分支 +- 确认 `lark-cli auth status`、`gh auth status` +- 生成 `run-id` +- 创建 `cycle.json` +- 创建或绑定云文档 +- 把云文档 token 写入 `cloud-doc/tainted_tokens.json` + +setup 文档段落必须包含醒目的污染声明: + +```markdown +# eval-search cycle + +> This document is eval-search process material. It may contain benchmark summaries and must be treated as tainted for future search evaluations. 
+ +| Field | Value | +|---|---| +| Run ID | `` | +| Status | `setup started` | +``` + +### 1. run + +内部执行 `/eval-search run` 的流程:拉数据集、污染预检、Executor、Judge、聚合。 + +阶段日志至少追加两次: + +- `run started`:记录 run-id、subset、loader/executor profile、run 目录 +- `run finished`:记录 dataset size、scored count、skipped count、trajectory 数、summary 路径 + +### 2. score/report + +读取 `summary.json` 和 `verdicts.json`,形成面向人的摘要。该阶段不重新打分,只消费 run 阶段已经产出的 Judge 结果。 + +必须记录: + +- 总分 / 满分 / 百分比 +- recall / accuracy / completeness / contamination_penalty 的总和与均值 +- top findings,最多 10 条 +- tainted fetch cases 数量和 case_id 列表 + +### 3. propose-pr + +未传 `--skip-pr` 时进入该阶段。内部执行 `/eval-search propose-pr `: + +- Optimizer 生成 diff +- 主 agent 复查 PR 颗粒度和白名单 +- 质量门禁 +- regression 重跑 +- 创建 draft PR + +云文档记录: + +- PR URL / state / draft 状态 +- touched files +- quality gate 结果 +- before/after 分数摘要 +- 未处理归因 + +如果没有可提交改动,记录 `no-op`,不创建空 PR。 + +### 4. final + +更新 `cycle.json.status`: + +- `completed`:所有启用阶段完成 +- `completed_without_pr`:`--skip-pr` 或 no-op +- `failed`:任一必需阶段失败 + +最后追加一段总览,包含下一步建议和恢复命令: + +```markdown +## Final + +| Field | Value | +|---|---| +| Status | completed | +| Run ID | `` | +| Summary | `tests/eval-search/runs//summary.json` | +| PR | `` | +``` + +## 污染控制 + +cycle 生成或更新的云文档默认是 tainted/process material。规则: + +1. 创建或绑定文档后,立刻提取 doc token,写入 `cloud-doc/tainted_tokens.json` +2. 本 cycle 的 regression / after-run 必须把该 token 作为额外 tainted token +3. 未来持久 blocklist 需要单独处理: + - 单独开 `chore(eval-search): blocklist cycle report ` PR;或 + - 在云文档无法被 executor 账号搜索到的前提下,在本轮报告中说明未持久化 blocklist +4. 
不得把 blocklist 更新混入 `search_strategy`、`skill_prompts` 或 `tool_capability` 优化 PR + +## 恢复策略 + +- `setup` 失败:修复认证或文档权限后,重新执行 cycle +- `run` 失败:保留 `cycle.json`,从已有 `run-id` 的本地 artifact 判断是否能补跑缺失 case;不能补跑则新 cycle +- `score/report` 失败:不重跑 Executor,只重新读取 `summary.json` / `verdicts.json` 并追加云文档 +- `propose-pr` 失败:修复 git/gh/quality gate 后,从同一 `run-id` 重新执行 propose-pr 阶段,并追加恢复记录 + +任何恢复都必须追加云文档段落,不得静默覆盖既有记录。 diff --git a/skills/eval-search/references/dataset.md b/skills/eval-search/references/dataset.md index e167ca6ce..6af493683 100644 --- a/skills/eval-search/references/dataset.md +++ b/skills/eval-search/references/dataset.md @@ -9,7 +9,7 @@ - view_id: `vewGToSnWl` - URL: https://bytedance.larkoffice.com/base/OOoEbNWhcaFOdisXDW7c0lKtn4g?table=tblGWdc19tKFZC6K&view=vewGToSnWl -> **污染警告**:这个 base 本身会被 `docs +search` 命中。harness 必须把账号拆成两个 profile:loader profile 只用于读取这个 base 并生成 `dataset.jsonl`;executor profile 只用于盲测搜索,**不可**加入该 base 的查看权限,否则评测结果被自答污染。详见 [`pollution-preflight.md`](pollution-preflight.md)。 +> **污染警告**:这个 base 本身会被 `drive +search` 命中。harness 必须把账号拆成两个 profile:loader profile 只用于读取这个 base 并生成 `dataset.jsonl`;executor profile 只用于盲测搜索,**不可**加入该 base 的查看权限,否则评测结果被自答污染。详见 [`pollution-preflight.md`](pollution-preflight.md)。 ## 原始字段(字段 id → 含义) diff --git a/skills/eval-search/references/known-tainted-tokens.md b/skills/eval-search/references/known-tainted-tokens.md index db14f0546..55faea9f7 100644 --- a/skills/eval-search/references/known-tainted-tokens.md +++ b/skills/eval-search/references/known-tainted-tokens.md @@ -59,7 +59,8 @@ tainted_tokens: ## 执行侧处理规则 - Preflight 命中 tainted token 只标记风险,不阻断整轮评测。 -- Executor/collector 不能因为命中本文件就跳过、降权或隐藏结果;否则评测会被过滤规则美化,不能反映真实搜索行为。 +- Executor/collector 不能因为命中本文件就隐藏结果;否则评测会被过滤规则美化,不能反映真实搜索行为。 +- Executor/collector 必须把命中本文件的结果标为 `tainted=true` / `evidence_excluded=true`。这些结果可以出现在 observed search results 中,但不能进入 evidence candidates、fetch 队列、答案合成或 recall top-5 证据。 - Collector 应把命中的 token 写进 trajectory / raw evidence,保留 `tainted` 
这类元数据,交给 Judge 按 RUBRIC 判定污染扣分。 - `verdicts.json` 里只对“fetch 过 tainted token 且答案受其影响”的 case 扣污染分;单纯 search 命中但未 fetch 的 case 不扣污染分,但可以作为污染风险记录。 - 新增 collector、shortcut 或搜索策略时,都要把本文件当作统一标记清单读取,避免各处散落 hard-coded 污染 token。 @@ -73,3 +74,12 @@ tainted_tokens: - 发布用的 retrospective → PR description / GitHub wiki / release notes 这样根本不会污染飞书搜索语料,污染标记清单的维护压力也会逐渐下降。 + +## `/eval-search cycle` 的例外 + +如果用户明确要求把中间结果记录到云文档,允许使用 [`cycle.md`](cycle.md) 的云文档日志,但必须遵守: + +1. 云文档只写阶段状态、分数摘要、finding 摘要、PR URL 和本地产物路径;不写标准答案、完整 trajectory、source_urls 或 key_error_snippets +2. 创建或绑定报告文档后,立刻把 doc token 写入本轮 `tests/eval-search/runs//cloud-doc/tainted_tokens.json` +3. 本 cycle 的 regression / after-run 必须把该 token 作为额外污染 token +4. 需要持久 blocklist 时,单独开 `chore(eval-search): blocklist cycle report ` PR;不得混进搜索策略或能力优化 PR diff --git a/skills/eval-search/references/pollution-preflight.md b/skills/eval-search/references/pollution-preflight.md index 247cb3c45..4ff88fcc7 100644 --- a/skills/eval-search/references/pollution-preflight.md +++ b/skills/eval-search/references/pollution-preflight.md @@ -2,9 +2,9 @@ ## 动机 -评测集 base 自身、v1/v2 迭代记录文档、含 expected 的参考文档,都可能被 `docs +search` 命中。Executor 一旦 fetch 到,就是"开卷考试"——分数失去意义。 +评测集 base 自身、v1/v2 迭代记录文档、含 expected 的参考文档,都可能被 `drive +search` 命中。Executor 一旦 fetch 到,就是"开卷考试"——分数失去意义。 -v2 的教训:PM 的 dataset base 在第一次跑评测时,几乎所有 query 的 `docs +search` top-1 都是 dataset 自己。 +v2 的教训:PM 的 dataset base 在第一次跑评测时,几乎所有 query 的搜索 top-1 都是 dataset 自己。 因此 `/eval-search run` 需要两个 lark-cli profile: - `loader-profile`:能读评测 Base,只负责拉取 live dataset 并写入 `dataset.jsonl` @@ -53,7 +53,7 @@ lark-cli --profile base +record-list \ ``` for each case in dataset.jsonl: - result = lark-cli --profile docs +search --query "" --page-size 20 + result = lark-cli --profile drive +search --query "" --page-size 20 hit_tokens = extract all obj_token / wiki_token from result tainted = hit_tokens ∩ known_tainted_tokens @@ -66,6 +66,8 @@ for each case in dataset.jsonl: } ``` +实际执行时,`known_tainted_tokens` 由持久清单 
[`known-tainted-tokens.md`](known-tainted-tokens.md) 和本轮 `cloud-doc/tainted_tokens.json` 合并得到。后者用于 `/eval-search cycle` 生成的临时报告文档,避免还没进入持久 blocklist 的过程材料影响本轮 after-run。 + **不阻断**,只标记。原因:有时 pre-flight 命中但 Executor 最终没 fetch,这种 case 依然有效,Judge 会打出正常 recall 分。 ### known_tainted_tokens 的维护 diff --git a/skills/eval-search/references/run-layout.md b/skills/eval-search/references/run-layout.md index 8c29bc814..5bbac1618 100644 --- a/skills/eval-search/references/run-layout.md +++ b/skills/eval-search/references/run-layout.md @@ -37,6 +37,11 @@ node --experimental-strip-types tests/eval-search/eval-search-run.ts \ ``` tests/eval-search/runs/2026-04-15T10-00Z/ +├── cycle.json # 仅 /eval-search cycle 阶段编排使用;记录云文档、阶段状态、PR URL +├── cloud-doc/ # 仅 /eval-search cycle 使用;每次追加云文档前生成的 markdown 片段 +│ ├── 00-created.md +│ ├── 20-run-finished.md +│ └── tainted_tokens.json ├── meta.json # run 元信息(cli 版本、loader/executor profile、账号、开始/结束时间) ├── raw/ │ ├── base_records_pages.json @@ -91,7 +96,7 @@ Executor 每完成 1 round(= 1 次 lark-cli 调用 + 解析),追加写入 "started_at": "...", "rounds": [ {"idx": 1, "tool": "Read", "target": "skills/lark-doc/SKILL.md", "outcome_summary": "..."}, - {"idx": 2, "tool": "Bash", "cmd": "lark-cli docs +search --query '华东 Aily'", "outcome_summary": "top-3: ..."}, + {"idx": 2, "tool": "Bash", "cmd": "lark-cli drive +search --query '华东 Aily'", "outcome_summary": "top-3: ..."}, ... ], "answer": null, diff --git a/skills/lark-doc/references/lark-doc-search.md b/skills/lark-doc/references/lark-doc-search.md index 3639e73e8..6ca0df4d4 100644 --- a/skills/lark-doc/references/lark-doc-search.md +++ b/skills/lark-doc/references/lark-doc-search.md @@ -199,12 +199,6 @@ lark-cli docs +search --query "方案" --format json --page-token '' ## 决策规则 - 参数传递:只要用户给了搜索关键词,就必须显式使用 `--query "<关键词>"`。不要生成 `lark-cli docs +search 方案`、`lark-cli docs +search xxx(搜索关键词)` 这种位置参数写法。 -- 答案型检索循环:当目标是回答问题而不是简单列结果时,不要只跑一次原始 query 后直接收敛。至少按以下顺序做一轮 blind refinement: - 1. 
保留业务实体、时间、数值、产品名等硬约束,去掉“怎么/是否/为什么/近期情况”等问句噪声后重搜。 - 2. 对问题类型补充泛化检索词:案例类加“客户案例/最佳实践”,政策类加“政策/售卖/规则”,数值类加“指标/目标/准确率/金额”,对接类加“集成/同步/打通”。 - 3. 如果 top 结果标题强相关但未被读取,继续 fetch 强相关候选;如果 top 结果明显是评测集、Case 分析、Prompt/Debug 记录、标签标注样例、答案汇总等过程材料,只能在回答或 trajectory 里显式标记其性质,不能自动跳过、隐藏或降权。 - 4. 评测/指标类 query 不要一概排除“评测”文档:`Golden Set`、准确率评测方案、指标结果页可能就是答案来源;即使是会泄露评测过程或样例答案的材料,也必须保留可见轨迹,由 Judge 或用户判断是否污染。 - 5. 最终答案必须回填关键槽位:数值题给数字和单位,时间题给日期,是否题给 yes/no 结论,政策题给适用范围和限制条件。缺槽位时继续重搜或明确说明未找到。 - 查询语义:必须优先利用 --query 的高级语法(如 intitle:、""、-)将过滤逻辑下推给服务端。当用户要求“标题精确等于 X”时,直接使用 --query "intitle:\"X\"",严禁先进行模糊搜索再做客户端二次筛选。只有在遇到服务端语法无法覆盖的复杂本地比对场景时,才允许在客户端过滤,且比对前必须先去掉 title_highlighted 里的高亮标签。 - 实体补全:如果用户要按“某个群里分享的文档”搜索,先用 `lark-im` 拿 `chat_id` 再填 `chat_ids`;如果用户要按“某人分享的文档”搜索,先用 `lark-contact` 拿 `open_id` 再填 `sharer_ids`。 - 零结果回退:如果因为用户的显式类型约束加了 `doc_types` 且结果为 0,可以提示“按指定类型没搜到”;只有在不违背用户明确约束的前提下,才建议放宽类型重试。 diff --git a/tests/eval-search/.gitignore b/tests/eval-search/.gitignore new file mode 100644 index 000000000..a1e03960f --- /dev/null +++ b/tests/eval-search/.gitignore @@ -0,0 +1 @@ +runs/ diff --git a/tests/eval-search/eval-search-collect-search.ts b/tests/eval-search/eval-search-collect-search.ts index e7dd7d5b3..c528042c6 100644 --- a/tests/eval-search/eval-search-collect-search.ts +++ b/tests/eval-search/eval-search-collect-search.ts @@ -8,7 +8,7 @@ function usage() { console.log(`Usage: node --experimental-strip-types tests/eval-search/eval-search-collect-search.ts --run-dir [--page-size 10] [--fetch-top 3] [--max-query-variants 4] -Collect docs +search evidence for every case in dataset.jsonl. This collector +Collect drive +search evidence for every case in dataset.jsonl. This collector reads only case_id and query from the dataset, then writes trajectories plus raw/executor_search.json. 
It runs a small blind query-rewrite loop, annotates known tainted/eval-process artifacts without filtering them, and fetches the @@ -117,7 +117,28 @@ function loadCases(datasetFile) { }); } -function loadTaintedTokens(root) { +function addTokensFromValue(value, tokens) { + if (Array.isArray(value)) { + for (const item of value) { + addTokensFromValue(item, tokens); + } + return; + } + if (value && typeof value === "object") { + for (const item of Object.values(value)) { + addTokensFromValue(item, tokens); + } + return; + } + if (typeof value !== "string") { + return; + } + for (const match of value.match(/[A-Za-z0-9_-]{12,}/g) || []) { + tokens.add(match); + } +} + +function loadTaintedTokens(root, runDir = "") { const file = path.join(root, "skills/eval-search/references/known-tainted-tokens.md"); const tokens: Set = new Set(); if (!fs.existsSync(file)) { @@ -140,6 +161,10 @@ function loadTaintedTokens(root) { tokens.add(match[1]); } } + const localFile = runDir ? path.join(runDir, "cloud-doc", "tainted_tokens.json") : ""; + if (localFile && fs.existsSync(localFile)) { + addTokensFromValue(JSON.parse(fs.readFileSync(localFile, "utf8")), tokens); + } return tokens; } @@ -422,6 +447,21 @@ function scoreResult(result, query, variantIndex) { return score; } +function annotateEvidenceStatus(result, query, taintedTokens) { + const tainted = isTainted(result, taintedTokens); + return { + ...result, + tainted, + suspicious_artifact_reason: suspiciousArtifactReason(result, query), + evidence_excluded: tainted, + evidence_excluded_reason: tainted ? 
"known_tainted_token" : "", + }; +} + +function isEvidenceCandidate(result) { + return !result.evidence_excluded && !result.tainted; +} + function fetchDoc(result, index) { const fetchedAt = new Date().toISOString(); const response = runLark([ @@ -483,6 +523,9 @@ function collectFetches(results, fetchTop) { if (!isFetchable(result)) { continue; } + if (!isEvidenceCandidate(result)) { + continue; + } if (result.score < 4) { continue; } @@ -497,7 +540,7 @@ function collectFetches(results, fetchTop) { function runSearch(query, pageSize) { return runLark([ - "docs", + "drive", "+search", "--as", "user", @@ -527,6 +570,8 @@ function mergeSearchResults(rounds, originalQuery, taintedTokens) { tainted: isTainted(result, taintedTokens), suspicious_artifact_reason: suspiciousArtifactReason(result, originalQuery), }; + next.evidence_excluded = Boolean(next.tainted); + next.evidence_excluded_reason = next.tainted ? "known_tainted_token" : ""; if (!existing || next.score > existing.score) { byKey.set(key, next); } @@ -665,23 +710,38 @@ function fallbackAnswerFrame(query) { function synthesizeAnswer(query, searchRow) { const fetched = (searchRow.fetches || []).filter((item) => item.ok); - const visibleTop = (searchRow.results || []).slice(0, 5); + const evidenceTop = (searchRow.evidence_results || []).slice(0, 5); + const excludedTop = (searchRow.non_evidential_results || []).slice(0, 3); if (fetched.length === 0) { const fallback = fallbackAnswerFrame(query); if (fallback.length > 0) { return [ "未读取到足够可信的非污染文档正文。", ...fallback.map((item) => `- ${item}`), - "搜索候选:", - ...visibleTop.map((item, index) => `${index + 1}. ${item.title || item.url}`), + "非污染搜索候选:", + ...(evidenceTop.length > 0 + ? evidenceTop.map((item, index) => `${index + 1}. ${item.title || item.url}`) + : ["无"]), + ...(excludedTop.length > 0 + ? [ + "已观测但不作为证据的污染候选:", + ...excludedTop.map((item, index) => `${index + 1}. ${item.title || item.url}`), + ] + : []), ].join("\n"); } - return visibleTop.length === 0 - ? 
"未找到直接相关的非污染云文档搜索结果。" - : [ - "未读取到足够可信的非污染文档正文,搜索到的主要候选如下:", - ...visibleTop.map((item, index) => `${index + 1}. ${item.title || item.url}`), - ].join("\n"); + if (evidenceTop.length === 0) { + return excludedTop.length === 0 + ? "未找到直接相关的非污染云文档搜索结果。" + : [ + "搜索命中了已知污染材料,但没有找到可作为答案证据的非污染云文档。", + ...excludedTop.map((item, index) => `${index + 1}. ${item.title || item.url}`), + ].join("\n"); + } + return [ + "未读取到足够可信的非污染文档正文,搜索到的主要候选如下:", + ...evidenceTop.map((item, index) => `${index + 1}. ${item.title || item.url}`), + ].join("\n"); } const lines = [`基于已读取的 ${fetched.length} 个非污染文档,提取到以下答复线索:`]; @@ -714,7 +774,9 @@ function synthesizeAnswer(query, searchRow) { } function writeTrajectory(runDir, caseItem, searchRow) { - const top = searchRow.results.slice(0, 5); + const topObserved = searchRow.results.slice(0, 5); + const topEvidence = (searchRow.evidence_results || []).slice(0, 5); + const nonEvidential = (searchRow.non_evidential_results || []).slice(0, 10); const fetched = (searchRow.fetches || []).filter((item) => item.ok); const answer = synthesizeAnswer(caseItem.query, searchRow); @@ -724,12 +786,20 @@ function writeTrajectory(runDir, caseItem, searchRow) { cmd: item.cmd, outcome_summary: item.error || - `docs +search variant ${item.variant_index + 1}/${searchRow.search_rounds.length} returned ${ + `drive +search variant ${item.variant_index + 1}/${searchRow.search_rounds.length} returned ${ item.results.length } compact result(s); top title: ${item.results[0]?.title || "none"}`, query_variant: item.query, result_tokens: item.results.map((result) => result.token).filter(Boolean), result_urls: item.results.map((result) => result.url).filter(Boolean), + tainted_tokens_observed: item.results + .filter((result) => result.tainted) + .map((result) => result.token) + .filter(Boolean), + evidence_tokens: item.results + .filter((result) => isEvidenceCandidate(result)) + .map((result) => result.token) + .filter(Boolean), })); const fetchRounds = (searchRow.fetches || 
[]).map((item, index) => ({ idx: searchRounds.length + index + 1, @@ -751,18 +821,39 @@ function writeTrajectory(runDir, caseItem, searchRow) { ...fetchRounds, ], answer, + observed_top_results: topObserved.map((item) => ({ + title: item.title, + url: item.url, + token: item.token, + tainted: Boolean(item.tainted), + evidence_excluded: Boolean(item.evidence_excluded), + evidence_excluded_reason: item.evidence_excluded_reason || "", + suspicious_artifact_reason: item.suspicious_artifact_reason || "", + })), + evidence_top_results: topEvidence.map((item) => ({ + title: item.title, + url: item.url, + token: item.token, + suspicious_artifact_reason: item.suspicious_artifact_reason || "", + })), + non_evidential_results: nonEvidential.map((item) => ({ + title: item.title, + url: item.url, + token: item.token, + reason: item.evidence_excluded_reason || "unknown", + })), referenced_urls: [ ...new Set([ ...fetched.map((item) => item.url).filter(Boolean), - ...top.map((item) => item.url).filter(Boolean), + ...topEvidence.map((item) => item.url).filter(Boolean), ]), ], rounds_used: searchRounds.length + (searchRow.fetches || []).length, - gave_up: fetched.length === 0 && top.length === 0, + gave_up: fetched.length === 0 && topEvidence.length === 0, notes: fetched.length > 0 - ? `multi-query search+fetch executor baseline; fetched strongest document-like hits; tainted_observed=${searchRow.tainted_results}; suspicious_observed=${searchRow.suspicious_artifact_results}; tainted_fetched=${searchRow.tainted_fetches}; suspicious_fetched=${searchRow.suspicious_artifact_fetches}` - : `multi-query search executor baseline; no document-like hit was fetched; tainted_observed=${searchRow.tainted_results}; suspicious_observed=${searchRow.suspicious_artifact_results}`, + ? 
`multi-query search+fetch executor baseline; fetched strongest non-tainted document-like hits; evidence_candidates=${searchRow.evidence_results_count}; tainted_observed=${searchRow.tainted_results}; non_evidential_observed=${searchRow.non_evidential_results_count}; suspicious_observed=${searchRow.suspicious_artifact_results}; tainted_fetched=${searchRow.tainted_fetches}; suspicious_fetched=${searchRow.suspicious_artifact_fetches}` + : `multi-query search executor baseline; no non-tainted document-like hit was fetched; evidence_candidates=${searchRow.evidence_results_count}; tainted_observed=${searchRow.tainted_results}; non_evidential_observed=${searchRow.non_evidential_results_count}; suspicious_observed=${searchRow.suspicious_artifact_results}`, }; fs.writeFileSync( path.join(runDir, "trajectories", `${caseItem.case_id}.json`), @@ -784,7 +875,7 @@ function main() { const rawDir = path.join(runDir, "raw"); ensureDir(rawDir); ensureDir(path.join(runDir, "trajectories")); - const taintedTokens = loadTaintedTokens(root); + const taintedTokens = loadTaintedTokens(root, runDir); const rows = []; for (const item of loadCases(datasetFile)) { @@ -794,7 +885,9 @@ function main() { const result = runSearch(query, args.pageSize); const results = result.ok && result.json?.ok !== false - ? (result.json?.data?.results || []).map(compactResult) + ? 
(result.json?.data?.results || []) + .map(compactResult) + .map((item) => annotateEvidenceStatus(item, item.query || query, taintedTokens)) : []; searchRounds.push({ variant_index: index, @@ -809,7 +902,7 @@ function main() { }); const mergedSoFar = mergeSearchResults(searchRounds, item.query, taintedTokens); const fetchableCandidates = mergedSoFar.filter( - (row) => isFetchable(row) && row.score >= 4, + (row) => isEvidenceCandidate(row) && isFetchable(row) && row.score >= 4, ); if (fetchableCandidates.length >= args.fetchTop && index > 0) { break; @@ -817,6 +910,8 @@ function main() { } const ended = new Date().toISOString(); const results = mergeSearchResults(searchRounds, item.query, taintedTokens); + const evidenceResults = results.filter((row) => isEvidenceCandidate(row)); + const nonEvidentialResults = results.filter((row) => row.evidence_excluded); const fetches = collectFetches(results, args.fetchTop); const row = { case_id: item.case_id, @@ -831,7 +926,11 @@ function main() { .join("\n"), search_rounds: searchRounds, results, + evidence_results: evidenceResults, + non_evidential_results: nonEvidentialResults, fetches, + evidence_results_count: evidenceResults.length, + non_evidential_results_count: nonEvidentialResults.length, tainted_results: results.filter((row) => row.tainted).length, suspicious_artifact_results: results.filter((row) => row.suspicious_artifact_reason).length, tainted_fetches: fetches.filter((fetch) => fetch.tainted).length, @@ -851,6 +950,7 @@ function main() { run_dir: path.relative(root, runDir), searched: rows.length, empty_results: rows.filter((row) => row.results.length === 0).length, + empty_evidence_results: rows.filter((row) => row.evidence_results.length === 0).length, fetched: rows.reduce((sum, row) => sum + row.fetches.length, 0), fetched_success: rows.reduce( (sum, row) => sum + row.fetches.filter((fetch) => fetch.ok).length, @@ -864,6 +964,14 @@ function main() { (sum, row) => sum + row.tainted_results, 0, ), + 
non_evidential_observed: rows.reduce( + (sum, row) => sum + row.non_evidential_results_count, + 0, + ), + evidence_candidates: rows.reduce( + (sum, row) => sum + row.evidence_results_count, + 0, + ), suspicious_artifacts_fetched: rows.reduce( (sum, row) => sum + row.suspicious_artifact_fetches, 0, diff --git a/tests/eval-search/eval-search-run.ts b/tests/eval-search/eval-search-run.ts index c37fcc68b..3a2fb5777 100644 --- a/tests/eval-search/eval-search-run.ts +++ b/tests/eval-search/eval-search-run.ts @@ -15,7 +15,7 @@ function usage() { Options: --loader-profile lark-cli profile that can read the eval Base - --executor-profile lark-cli profile used for blind docs search + --executor-profile lark-cli profile used for blind drive search --run-id run id, defaults to UTC YYYY-MM-DDTHH-MMZ --subset keep first n cases after dataset conversion --snapshot-only fetch dataset locally, then stop before blind checks @@ -207,6 +207,37 @@ function readTaintedTokens(root) { return tokens; } +function addTokensFromValue(value, tokens) { + if (Array.isArray(value)) { + for (const item of value) { + addTokensFromValue(item, tokens); + } + return; + } + if (value && typeof value === "object") { + for (const item of Object.values(value)) { + addTokensFromValue(item, tokens); + } + return; + } + if (typeof value !== "string") { + return; + } + for (const match of value.match(/[A-Za-z0-9_-]{12,}/g) || []) { + tokens.add(match); + } +} + +function readRunTaintedTokens(runDir) { + const file = path.join(runDir, "cloud-doc", "tainted_tokens.json"); + if (!fs.existsSync(file)) { + return []; + } + const tokens = new Set(); + addTokensFromValue(JSON.parse(fs.readFileSync(file, "utf8")), tokens); + return [...tokens]; +} + function readExcludedUserIds(root) { const file = path.join( root, @@ -564,7 +595,7 @@ function runPreflight(config, cases, taintedTokens) { const rows = []; for (const item of cases) { const result = runLarkJson(config.executorProfile, [ - "docs", + "drive", "+search", 
"--as", "user", @@ -652,7 +683,12 @@ function main() { ensureDir(path.join(runDir, "trajectories")); const excluded = readExcludedUserIds(root); - const taintedTokens = readTaintedTokens(root); + const taintedTokens = [ + ...new Set([ + ...readTaintedTokens(root), + ...readRunTaintedTokens(runDir), + ]), + ]; if (config.snapshotOnly) { const loaderAuthResult = runLarkJson(config.loaderProfile, ["auth", "status"]); diff --git a/tests/harness/sample-plan.json b/tests/harness/sample-plan.json deleted file mode 100644 index 41123495c..000000000 --- a/tests/harness/sample-plan.json +++ /dev/null @@ -1,95 +0,0 @@ -{ - "name": "sample-dev-harness", - "version": 1, - "stages": [ - { - "id": "explore", - "objective": "inspect repository state", - "steps": [ - { - "id": "git_status", - "command": [ - "git", - "status", - "--short", - "--branch" - ], - "expect": { - "exitCode": 0 - } - } - ] - }, - { - "id": "plan", - "objective": "confirm stable project instructions exist", - "steps": [ - { - "id": "agents_file_exists", - "command": [ - "test", - "-f", - "AGENTS.md" - ], - "expect": { - "exitCode": 0 - } - } - ] - }, - { - "id": "act", - "objective": "surface current implementation delta", - "steps": [ - { - "id": "diff_stat", - "command": [ - "git", - "diff", - "--stat" - ], - "expect": { - "exitCode": 0 - } - } - ] - }, - { - "id": "verify", - "objective": "verify runner syntax", - "steps": [ - { - "id": "node_smoke_runner", - "command": [ - "node", - "--experimental-strip-types", - "scripts/harness-runner.ts", - "--help" - ], - "expect": { - "exitCode": 0, - "stdoutIncludes": "--plan" - } - } - ] - }, - { - "id": "retrospect", - "objective": "return final branch state", - "steps": [ - { - "id": "branch_state", - "command": [ - "git", - "status", - "--short", - "--branch" - ], - "expect": { - "exitCode": 0 - } - } - ] - } - ] -} diff --git a/tests/harness/self-correct-plan.json b/tests/harness/self-correct-plan.json deleted file mode 100644 index b57900c28..000000000 --- 
a/tests/harness/self-correct-plan.json +++ /dev/null @@ -1,57 +0,0 @@ -{ - "name": "self-correct-dev-harness", - "version": 1, - "stages": [ - { - "id": "explore", - "objective": "prove the runner can execute a basic stage", - "steps": [ - { - "id": "node_available", - "command": [ - "node", - "--version" - ], - "expect": { - "exitCode": 0 - } - } - ] - }, - { - "id": "verify", - "objective": "prove correction steps are executed and retried", - "steps": [ - { - "id": "marker_exists_after_correction", - "command": "test -f \"$HARNESS_RUN_DIR/self-correct-marker\"", - "expect": { - "exitCode": 0 - }, - "corrections": [ - { - "id": "create_marker", - "command": "touch \"$HARNESS_RUN_DIR/self-correct-marker\"", - "expect": { - "exitCode": 0 - } - } - ] - } - ] - }, - { - "id": "retrospect", - "objective": "read the generated marker", - "steps": [ - { - "id": "marker_readback", - "command": "test -f \"$HARNESS_RUN_DIR/self-correct-marker\"", - "expect": { - "exitCode": 0 - } - } - ] - } - ] -} diff --git a/tsconfig.harness.json b/tsconfig.harness.json deleted file mode 100644 index 795229920..000000000 --- a/tsconfig.harness.json +++ /dev/null @@ -1,24 +0,0 @@ -{ - "compilerOptions": { - "target": "ES2020", - "module": "CommonJS", - "lib": [ - "ES2020" - ], - "types": [ - "node" - ], - "allowJs": false, - "checkJs": false, - "noEmit": true, - "strict": false, - "moduleDetection": "force", - "forceConsistentCasingInFileNames": true, - "skipLibCheck": true - }, - "include": [ - "scripts/harness-runner.ts", - "tests/eval-search/eval-search-run.ts", - "tests/eval-search/eval-search-collect-search.ts" - ] -} From a7d6ab1c0d000a221b198922ce1cbeed603cd2c1 Mon Sep 17 00:00:00 2001 From: zhuhao Date: Thu, 7 May 2026 17:39:33 +0800 Subject: [PATCH 4/6] docs(eval-search): fix cycle completion contract Change-Id: I2aa5c7dc3a93b915a051f3ae7bdf2bacaa17a21e --- skills/eval-search/SKILL.md | 1 + skills/eval-search/references/cycle.md | 30 ++++++++++++++++++- 
.../eval-search/references/pr-generation.md | 16 ++++++++++ 3 files changed, 46 insertions(+), 1 deletion(-) diff --git a/skills/eval-search/SKILL.md b/skills/eval-search/SKILL.md index 7a2dc6ce5..3489422fd 100644 --- a/skills/eval-search/SKILL.md +++ b/skills/eval-search/SKILL.md @@ -50,6 +50,7 @@ metadata: 3. **阶段化执行并记录**:内部串联 `run → score/report → propose-pr`,每个阶段开始、成功、失败都先写本地 `cycle.json`,再追加到云文档 4. **产物归档**:云文档只写阶段状态、分数摘要、finding 摘要、PR URL、失败原因和本地产物路径;不得写标准答案、完整 trajectory、source_urls 或 key_error_snippets 5. **污染控制**:cycle 生成或使用的云文档默认是评测过程材料,必须记录为 tainted/process material;未来持久 blocklist 变更需要单独 PR,不得混入搜索效果优化 PR +6. **完成定义**:未传 `--skip-pr` 时,最终回复必须同时给出 Cloud report URL 和 Draft PR URL;任一链接缺失都不能视为完成 ## 三层架构(必须隔离,违反会让结果失真) diff --git a/skills/eval-search/references/cycle.md b/skills/eval-search/references/cycle.md index 2f7a34afb..7323ed5b9 100644 --- a/skills/eval-search/references/cycle.md +++ b/skills/eval-search/references/cycle.md @@ -64,6 +64,20 @@ tests/eval-search/runs// 若云文档追加失败,重试一次;仍失败则停止 cycle,把失败写入 `cycle.json`,不要继续提 PR。 +## 完成定义(必须满足) + +`/eval-search cycle` 不是本地脚本跑完就结束。未传 `--skip-pr` 时,必须同时交付: + +- `summary.json` / `verdicts.json` 已写入本地 run 目录 +- 云文档已创建或追加成功,且文档 URL 写入 `cycle.json.cloud_doc.url` +- 云文档 token 已写入 `cloud-doc/tainted_tokens.json` +- draft PR 已创建,PR URL 写入 `cycle.json.pr_urls` +- PR description 包含云文档 URL、run-id、分数摘要、污染摘要和未处理 finding +- PR URL 已追加回云文档的 final 段 +- 最终回复用户时同时给出云文档 URL 和 PR URL + +任一必需链接缺失时,cycle 状态只能是 `failed` 或 `blocked`,不能回复“已完成”。 + ## 云文档创建 / 追加 创建新文档: @@ -168,7 +182,10 @@ setup 文档段落必须包含醒目的污染声明: - 主 agent 复查 PR 颗粒度和白名单 - 质量门禁 - regression 重跑 -- 创建 draft PR +- 生成 PR description,并把云文档 URL 写入 description +- 创建 draft PR,记录返回的 PR URL +- 立刻把 PR URL 回写到 `cycle.json.pr_urls` +- 追加 `40-pr-finished.md` 到云文档,包含 PR URL 云文档记录: @@ -180,6 +197,8 @@ setup 文档段落必须包含醒目的污染声明: 如果没有可提交改动,记录 `no-op`,不创建空 PR。 +PR 创建失败时,必须把失败原因、当前分支、commit sha、可恢复命令写入云文档;不得只在本地终端输出错误。 + ### 4. 
final 更新 `cycle.json.status`: @@ -199,6 +218,15 @@ setup 文档段落必须包含醒目的污染声明: | Run ID | `` | | Summary | `tests/eval-search/runs//summary.json` | | PR | `` | +| Report Doc | `` | +``` + +最终回复必须包含: + +```text +PR: +Cloud report: +Run ID: ``` ## 污染控制 diff --git a/skills/eval-search/references/pr-generation.md b/skills/eval-search/references/pr-generation.md index 11f41c186..14046afd6 100644 --- a/skills/eval-search/references/pr-generation.md +++ b/skills/eval-search/references/pr-generation.md @@ -58,15 +58,28 @@ Optimizer 的产出可能横跨两个仓库: │ ├─[6] 组装 PR description │ 按本文件下方模板生成 cli 和 open 两份 description.md,互相留 link 占位 + │ 若由 /eval-search cycle 调用,description 必须包含 cloud report URL │ ├─[7] gh pr create --draft(cli) │ cd && gh pr create --draft → 记录 PR url CLI_PR_URL + │ 若由 /eval-search cycle 调用,立刻回写 cycle.json.pr_urls 并追加云文档 │ └─[8] gh pr create --draft(open,若有) cd && gh pr create --draft,description 里 Pair 字段填入 CLI_PR_URL 创建完之后回到 cli PR,用 gh pr edit 把 open PR url 填到 cli description 的 Pair 段 ``` +## PR URL 交付契约 + +`gh pr create --draft` 的返回 URL 是 `/eval-search propose-pr` 的主产物,必须持久化到: + +- `tests/eval-search/runs//pr-draft/pr-url.txt` +- `tests/eval-search/runs//summary.json` 的 `pr_urls` 字段(若已有 summary) +- `cycle.json.pr_urls`(仅 `/eval-search cycle`) +- 云文档 final / pr-finished 段(仅 `/eval-search cycle`) + +最终回复用户时必须直接贴出 PR URL。若 PR 创建失败,回复中必须说明失败阶段、失败命令和当前可恢复分支,不得只说“已提交”。 + ## Quality gate 失败处理 两次迭代后仍失败的 finding: @@ -100,6 +113,9 @@ Optimizer 的产出可能横跨两个仓库: - Dataset size: {{dataset_size}} (同一份 base 拉取;dataset 可能已被 PM 更新,per-case diff 以 `record_id` 对齐) - 评测账号: `{{user_name}}` (open_id `{{user_open_id}}`) - Pollution: {{contaminated_count}} case 命中 tainted tokens{{#if contaminated_count}} — 见附录{{/if}} +{{#if cloud_report_url}} +- Cloud report: {{cloud_report_url}} +{{/if}} ## Wins(by case) From 4bb94352ee1f728ae6d19e68800d1de6c5652b1c Mon Sep 17 00:00:00 2001 From: zhuhao Date: Thu, 7 May 2026 17:45:17 +0800 Subject: [PATCH 5/6] chore(eval-search): keep submitted pr out of 
skills Change-Id: I8c9a079d565fdb8397357fbdd6789b4f0b85a296 --- skills/eval-search/RUBRIC.md | 119 ------- skills/eval-search/SKILL.md | 158 ---------- skills/eval-search/prompts/executor.md | 74 ----- skills/eval-search/prompts/judge.md | 97 ------ skills/eval-search/prompts/optimizer.md | 150 --------- skills/eval-search/references/cycle.md | 250 --------------- skills/eval-search/references/dataset.md | 127 -------- .../references/open-repo-layout.md | 162 ---------- .../references/pollution-preflight.md | 105 ------- .../eval-search/references/pr-generation.md | 293 ------------------ skills/eval-search/references/run-layout.md | 123 -------- .../eval-search/eval-search-collect-search.ts | 2 +- tests/eval-search/eval-search-run.ts | 6 +- .../references/known-tainted-tokens.md | 2 +- 14 files changed, 5 insertions(+), 1663 deletions(-) delete mode 100644 skills/eval-search/RUBRIC.md delete mode 100644 skills/eval-search/SKILL.md delete mode 100644 skills/eval-search/prompts/executor.md delete mode 100644 skills/eval-search/prompts/judge.md delete mode 100644 skills/eval-search/prompts/optimizer.md delete mode 100644 skills/eval-search/references/cycle.md delete mode 100644 skills/eval-search/references/dataset.md delete mode 100644 skills/eval-search/references/open-repo-layout.md delete mode 100644 skills/eval-search/references/pollution-preflight.md delete mode 100644 skills/eval-search/references/pr-generation.md delete mode 100644 skills/eval-search/references/run-layout.md rename {skills => tests}/eval-search/references/known-tainted-tokens.md (98%) diff --git a/skills/eval-search/RUBRIC.md b/skills/eval-search/RUBRIC.md deleted file mode 100644 index 21cffa5ff..000000000 --- a/skills/eval-search/RUBRIC.md +++ /dev/null @@ -1,119 +0,0 @@ -# RUBRIC — 4 维度评分细则 - -每个 case 按 4 维打分,每维 0-5 分,单 case 满分 15。总分 = sum(recall + accuracy + completeness)。 - -> 注:`total` 字段只聚合 3 个打分维度。第 4 维 `contamination_penalty` 是修饰项,见下。 - -## 维度定义 - -### recall(召回,0-5) - -"Executor 
是否找到 / fetch 过**正确的目标文档**"。对应评测集 `数据源地址` 字段里的 URL / token。 - -| 分 | 判据 | -|----|------| -| 5 | trajectory 里显式 fetch 过全部 expected source;或 search 结果 top-5 里能看到全部 expected source 的 token | -| 4 | fetch 过一半以上(严格过半) | -| 3 | fetch 过至少 1 个但不到一半;或 top-5 里出现但未 fetch | -| 1-2 | 没 fetch、没在 top-5,但有相关命中(同主题不同文档) | -| 0 | 完全无关的命中 / 空结果 | - -**特例**:`企业内是否有知识 == 否` 的 case,recall 固定 5 分(agent 不该找到任何高置信答案,答"没找到"也算召回正确)。 - -**污染结果不计入 recall**:trajectory 里标记为 `tainted=true` 或 `evidence_excluded=true` 的搜索结果是可观测污染信号,但不是答案证据。即使 expected source token 只出现在这些污染结果里,也不能按 top-5 命中给 recall 分;只有非污染 `evidence_top_results` 或非污染 fetch 才能作为 recall 依据。 - -### accuracy(准确性,0-5) - -"Executor 给出的最终答案**在事实层面**对不对"。对照评测集 `预期答复` 的【关键信息】段 + 【打分备注】里的 "可信无误" 说明。 - -| 分 | 判据 | -|----|------| -| 5 | 关键信息全部正确,无事实错误 | -| 4 | 主要信息正确,少量细节偏差(时间、数字小错) | -| 3 | 部分正确部分错 / 含明显可证伪陈述 | -| 1-2 | 大部分错误,但方向对 | -| 0 | 完全错 / 幻觉 / 答非所问 | - -**【打分备注】优先级高于通用判据**。例如某 case 备注 "给到 0.x 折这类可信要扣分",即使答案看起来合理,只要踩到就扣。 - -### completeness(完整性,0-5) - -"Executor 覆盖了多少 expected key points"。对照【关键信息】列出的条目 + 【打分备注】里的 "完整详实" 说明。 - -| 分 | 判据 | -|----|------| -| 5 | 覆盖 ≥80% key points,或满足 `完整详实` 备注的明确阈值(如"答出 5 个及以上不扣分") | -| 4 | 覆盖 60-80% | -| 3 | 覆盖 40-60% | -| 1-2 | 覆盖 20-40% | -| 0 | <20% 或未给答案 | - -### contamination_penalty(污染修饰,-3 ~ 0) - -仅当 pre-flight 标记了 `contamination_risk=true` 且 trajectory 显示 Executor **fetch 过 tainted token** 时触发。 - -| 分 | 判据 | -|----|------| -| 0 | 未命中 tainted token,或命中但未 fetch | -| -1 | fetch 了 tainted token 但最终答案未直接引用其内容 | -| -3 | fetch 了 tainted token 且答案明显抄袭其结构 / 原文 | - -该项**直接从 total 扣**,且在 verdict 里显式标注,避免"刷分嫌疑"。 - -collector / Executor 可以把 tainted 搜索结果写进 trajectory,但必须把它们标为 `evidence_excluded=true`,且不能作为答案合成、fetch 选择或 recall top-5 的证据。简言之:**tainted results are observable but non-evidential**。 - -## Verdict JSON schema - -每个 case 一个 verdict,合并写入 `verdicts.json`。 - -```json -{ - "case_id": "case_001", - "query": "...", - "scores": { - "recall": 4, - "accuracy": 5, - "completeness": 3, - "contamination_penalty": 
0, - "total": 12 - }, - "rationale": { - "recall": "fetch 了 Es5wwNCyei3eYNkXc8Tcx35nnWe,top-3 里出现 HxnMwM9cyiFW1dkACUBcC7KWnEd 但未 fetch", - "accuracy": "8 个案例全部在参考文档里,无幻觉", - "completeness": "列了 5/10,备注要求 ≥5 不扣分,按备注打 5" - }, - "improvement": { - "tool_capability": [ - "drive +search 返回结果没有 body_preview,agent 必须 fetch 才能判断相关性。建议返回摘要字段减少 fetch 次数" - ], - "search_strategy": [ - "Executor 只用了原词 '华东 Aily 案例',没换 '客户成功故事' / '最佳实践' 等同义词" - ], - "skill_prompts": [ - "lark-drive-search.md 可新增同义词清单小节,含 'case / story / best practice' 映射" - ] - }, - "contamination": { - "risk_flagged": false, - "tainted_tokens_fetched": [], - "penalty_applied": 0 - } -} -``` - -## 聚合规则(summary.json) - -Judge 打完所有 case 后,主 agent 按以下规则聚合到 `summary.json`: - -1. **按改动落点文件聚类 improvements**,不按文本相似度: - - 同一条 skill_prompts 建议指向 `skills/lark-doc/SKILL.md` 的,合并成一条 finding - - finding 保留 `driving_cases: [case_003, case_007, ...]` 反向索引 -2. **计算一阶瓶颈**:三桶的建议条数之和,占比最大的那个桶就是 `primary_bottleneck` -3. **统计 contamination**:分别统计 search-only 观测到 tainted token 的 case 数、被 fetch 到 tainted token 的 case 数;fetch 数 >2 时输出警告 -4. 
**汇总每个维度的均值、总分** - -## 校准指引(给 Judge 看的) - -- 优先使用【打分备注】里的 per-case rubric;与通用判据冲突时**以备注为准** -- 宁低勿高:打分是迭代的信号源,乐观打分会让下一轮 optimizer 找不到方向 -- rationale 字段必填,且要引用 trajectory 里的具体命令或 URL。只写"还行""不够完整"等空洞判断会被 Optimizer 识别为低质量 verdict 并丢弃 diff --git a/skills/eval-search/SKILL.md b/skills/eval-search/SKILL.md deleted file mode 100644 index 3489422fd..000000000 --- a/skills/eval-search/SKILL.md +++ /dev/null @@ -1,158 +0,0 @@ ---- -name: eval-search -version: 0.1.0 -description: "lark-cli 搜索能力端到端评测 Harness:拉取飞书评测集 → 盲测执行 → 四维打分 → 聚合归因 → 自动生成 PR 草稿。当用户要评测 lark-cli 搜索效果、做 v_n→v_{n+1} 迭代、让新人跑一轮优化闭环时使用。" -metadata: - requires: - bins: ["node", "lark-cli", "jq", "git", "gh"] ---- - -# eval-search — lark-cli 搜索能力评测 Harness - -**CRITICAL — 开始前 MUST 先用 Read 工具读取 [`../lark-shared/SKILL.md`](../lark-shared/SKILL.md)(认证)和 [`RUBRIC.md`](RUBRIC.md)(评分细则)。** - -## 目标 - -给 AI agent 一个自然语言搜索问题,它能否通过 lark-cli 在飞书企业知识库里找到正确答案?当它做不到,定位到: -- **(a) tool_capability** — 工具能力缺口(缺 shortcut / 缺 flag / 输出难解析) -- **(b) search_strategy** — agent 应该但没做的搜索动作 -- **(c) skill_prompts** — 方法论没在 skill 文档里 - -并把归因汇聚成可执行的 PR 草稿。 - -## 适用场景 - -- "跑一轮搜索评测" -- "新人想参与 lark-cli 优化,从哪里开始" -- "对比一下最近改动对搜索效果的影响" -- "看看上一轮评测还有哪些归因没处理" - -## 四个入口命令 - -``` -/eval-search cycle [--loader-profile NAME] [--executor-profile NAME] [--subset N] [--report-doc URL] - # 一键闭环:run → 打分/report → propose-pr,并把阶段进展写入云文档 -/eval-search run [--loader-profile NAME] [--executor-profile NAME] [--subset N] - # 跑一轮评测,产出 run-id。默认全量;--subset=3 抽样冒烟 -/eval-search run --snapshot-only # 只把评测集拉成本地 dataset.jsonl,供移除权限后复用 -/eval-search propose-pr # 基于 run 生成 PR 草稿(含 before/after + 泛化声明 + regression 告警) -/eval-search report # 读已有 run 的 summary.json -``` - -新人典型流程优先使用 `cycle`,只有调试单个阶段时才手动执行 `run` / `report` / `propose-pr`。 - -## `/eval-search cycle` 上层闭环 - -详细步骤见 [`references/cycle.md`](references/cycle.md)。概要: - -1. **初始化 cycle**:生成 `cycle-id` / `run-id`,创建 `tests/eval-search/runs//cycle.json` -2. 
**创建或绑定云文档**:若未传 `--report-doc`,用 `lark-cli docs +create --api-version v2 --doc-format markdown` 创建报告文档;若已传文档,则直接追加本轮章节 -3. **阶段化执行并记录**:内部串联 `run → score/report → propose-pr`,每个阶段开始、成功、失败都先写本地 `cycle.json`,再追加到云文档 -4. **产物归档**:云文档只写阶段状态、分数摘要、finding 摘要、PR URL、失败原因和本地产物路径;不得写标准答案、完整 trajectory、source_urls 或 key_error_snippets -5. **污染控制**:cycle 生成或使用的云文档默认是评测过程材料,必须记录为 tainted/process material;未来持久 blocklist 变更需要单独 PR,不得混入搜索效果优化 PR -6. **完成定义**:未传 `--skip-pr` 时,最终回复必须同时给出 Cloud report URL 和 Draft PR URL;任一链接缺失都不能视为完成 - -## 三层架构(必须隔离,违反会让结果失真) - -``` -Executor (sub-agent, Task 工具) - 输入: query only 不知道: expected / rubric / source_urls - 工具: 仅 lark-cli - 产出: trajectory + answer - ↓ -Judge (主 agent 切 hat,时序隔离) - 输入: query + answer + expected + rubric - 产出: 4 维打分 + 三桶 improvement - ↓ -Optimizer (sub-agent, Task 工具) - 输入: 全部 verdicts summary + 失败 case 的关键错误片段(不喂 trajectory 全文) - 产出: diff + 泛化声明字段 -``` - -**隔离纪律**: -- Executor prompt 永远只注入 `query`,绝不传 expected/rubric/source_urls(盲测) -- Judge 必须在 Executor 全部跑完之后开始,不得和 Executor 共享 tool-use 窗口 -- Optimizer 只看 Judge 聚合出的 summary,**不喂 trajectory 原文全文**,只喂失败 case 的关键错误行(防过拟合 + 控 context) - -## `/eval-search run` 流程 - -详细步骤见 [`references/run-layout.md`](references/run-layout.md)。概要: - -1. **确定性 setup**:先运行 `node --experimental-strip-types tests/eval-search/eval-search-run.ts --loader-profile --executor-profile [--subset N]`。脚本会生成 run-id,建目录 `tests/eval-search/runs//`,并完成第 2-4 步。若只有一个账号,可先用 `--snapshot-only` 拉本地 `dataset.jsonl`,移除该账号的评测 Base 权限后,再用 `--dataset-file /dataset.jsonl` 继续 -2. **拉数据集**:按 [`references/dataset.md`](references/dataset.md) 用 loader profile 从评测 base 拉最新数据 → `dataset.jsonl` -3. **账号隔离**:按 [`references/pollution-preflight.md`](references/pollution-preflight.md) 检查 executor profile 不在 `excluded_user_ids`,并主动探测 executor 不能读取评测 Base;若能读取则阻断 -4. 
**污染预检**:用 executor profile 对每条 query 跑一次 `drive +search`,命中 [`references/known-tainted-tokens.md`](references/known-tainted-tokens.md) 里的 token 则标记 `contamination_risk`。只标记不阻断;Judge 阶段再决定是否扣分 -5. **Executor 并行**:用 Task 工具启动 sub-agent 按 [`prompts/executor.md`](prompts/executor.md) 跑全部 case。每个 case trajectory 落盘 `trajectories/.json` -6. **Judge 逐 case**:主 agent 按 [`prompts/judge.md`](prompts/judge.md) 打分,写 `verdicts.json` -7. **聚合**:按"改动落点文件"对 improvements 聚类,写 `summary.json`;输出 run-id 给用户 - -## `/eval-search propose-pr` 流程 - -详细见 [`references/pr-generation.md`](references/pr-generation.md)。概要: - -1. **Optimizer 生成 diff**:用 Task 工具启动 sub-agent 按 [`prompts/optimizer.md`](prompts/optimizer.md) 读 summary + 两个仓库代码,产出 **cli diff + open diff(如有)** 和泛化声明 -2. **应用 diff 到两个 worktree**: - - cli 仓库:独立分支 `eval-search/auto-pr/` - - open 仓库(若有改动):独立分支 `eval-search/auto-pr/`,互不污染 main -3. **Quality gate**(当前仅 cli 仓库):`make unit-test` + `golangci-lint run --new-from-rev=origin/main` 必须通过。失败 → Optimizer 最多迭代 2 次,仍失败 → 把触发失败的改动降级为 GitHub issue,不进 PR。open 仓库暂不跑 gate(CI 配置非 harness 可控) -4. **确定性 regression 重跑**:按 diff 之上重跑完整评测(复用 `/eval-search run` 内部流程),产出 after verdicts。**这一步不给 Optimizer 参与** -5. **组装两份 PR description**:按 [`references/pr-generation.md`](references/pr-generation.md) 里的模板,包含 before/after 数值、wins/regressions 逐 case 列表、泛化声明、未处理归因、**对端 PR 互相 link** -6. 
**`gh pr create --draft`**:双 PR 独立提,**独立 review、独立 merge**。不强绑定联动。一个 PR 先 merge 另一个还没 merge 也 OK,在 PR description 里标记 cross-ref - -## 权限边界(v0.1 软约束,迭代中调整) - -### PR 颗粒度 - -每个 `/eval-search propose-pr` 只能落一个主归因桶 / 一个改动主题。主 agent 在 apply diff 前必须复查 touched files,并按以下规则拆分: -- `search_strategy` / `skill_prompts`:只能提交搜索策略或 skill 文档优化 PR,例如 `skills/lark-drive/references/*-search.md` 或当前主搜索入口对应文档。不得混入 harness、runner、package、评测集、打分脚本或基础设施改动;不要给已进入维护期的 `docs +search` 新增策略依赖。 -- `tool_capability`:只能提交 CLI shortcut / open converter 能力 PR。不得混入搜索策略文档,除非同一能力改动必须同步更新对应使用说明。 -- `eval_harness` / 评测流程自身:必须独立 PR,不能和任何搜索效果优化 PR 混在一起。 - -### cli 仓库(`larksuite/cli`,当前目录) - -Optimizer 默认允许改: -- `skills/**/*.md` -- 新增 `shortcuts//*.go` 及对应测试 - -Optimizer 不自动改: -- `internal/**`, `extension/**`, `cmd/root.go`, `cmd/service/**` 等基础设施 → 降级为 issue -- 任何旧 shortcut 的删除 / 重命名 / 破坏性改动 - -### open 仓库(`$GOPATH/src/code.byted.org/lark_as/open/`) - -详见 [`references/open-repo-layout.md`](references/open-repo-layout.md)。简要: - -Optimizer 默认允许改: -- `biz/search_open/entity/{name}.go` 的 `BuildDisplayInfo` / `BuildResponseItem` bug fix / `Prune` 及配套 `*_test.go` - -Optimizer 不自动改: -- IDL(在独立的 `lark/idl` 仓库,需要跑 overpass,不属于 PR 范畴) -- `api_meta/**/*.yml`(契约变更,走人工) -- `biz/search_open/handler.go` / `adapter.go` / `pagetoken.go` / `response.go` 等基础设施 -- 任何"新增 OAPI 字段"类需求(跨两个仓库 + 手工步骤,产出 issue 正文即可) - -### 违反白名单的处理 - -Optimizer 把该 finding 写进 PR description 的"未处理归因"段(含建议 issue 正文),由新人创建对应 GitHub issue。**不发**跨仓库 / 超出白名单的 PR。 - -## 关键纪律(不遵守分数会失真) - -1. **盲测纪律**:Executor prompt 只注入 `query`。即使主 agent fallback 接管 Executor,也必须自我约束不读 `dataset.jsonl` 的非 query 字段 -2. **三层隔离**:Judge 不能和 Executor 在同一轮 reasoning;Optimizer 不喂 trajectory 全文 -3. **Regression 软告警**:after 出现 regression 不硬 block,但必须在 PR description 里逐 case 列出;reviewer 判断 -4. **泛化声明必填**:Optimizer 必须区分"针对具体 case 的改动"和"泛化原则性改动"。前者过拟合风险高,reviewer 重点看 -5. 
**污染隔离**:harness 至少使用两个 profile。loader profile 可以读取评测 Base,但只允许用于拉数据集;executor profile 必须是专用测试账号(非 PM 账号、非 dataset owner 账号),且不能读取评测 Base。若 executor profile 的 `userOpenId` 出现在 [`references/known-tainted-tokens.md`](references/known-tainted-tokens.md) 的 `excluded_user_ids` 列表里,或 executor 可以读取评测 Base,拒绝启动 - -## 参考 - -- [`RUBRIC.md`](RUBRIC.md) — 4 维度评分细则 -- [`prompts/executor.md`](prompts/executor.md) — Executor sub-agent 模板 -- [`prompts/judge.md`](prompts/judge.md) — Judge 打分模板 -- [`prompts/optimizer.md`](prompts/optimizer.md) — Optimizer PR 生成模板 -- [`references/cycle.md`](references/cycle.md) — 一键闭环 + 云文档阶段日志 -- [`references/dataset.md`](references/dataset.md) — 评测集 schema + 拉取方式 -- [`references/pollution-preflight.md`](references/pollution-preflight.md) — 污染预检规则 -- [`references/known-tainted-tokens.md`](references/known-tainted-tokens.md) — 已知泄露文档标记清单 -- [`references/run-layout.md`](references/run-layout.md) — run 目录结构 + 中间产物约定 -- [`references/pr-generation.md`](references/pr-generation.md) — PR 生成流程 + description 模板(双 PR) -- [`references/open-repo-layout.md`](references/open-repo-layout.md) — `lark_as/open` 仓库允许改动的白名单导航 diff --git a/skills/eval-search/prompts/executor.md b/skills/eval-search/prompts/executor.md deleted file mode 100644 index 9ab9e15e0..000000000 --- a/skills/eval-search/prompts/executor.md +++ /dev/null @@ -1,74 +0,0 @@ -# Executor sub-agent 模板 - -**使用方式**:主 agent 用 Task 工具启动 sub-agent(`subagent_type: general-purpose`),把本文件内容 + 具体 `query` 拼为 prompt 传入。**禁止在 prompt 里注入 expected / rubric / source_urls / 评测集任何其他字段**。 - ---- - -## SYSTEM(照原样复制到 Task prompt 开头) - -你是 lark-cli 搜索能力评测 harness 的**执行层 sub-agent**,任务是**盲测**:回答一个来自飞书企业知识库的自然语言问题。 - -### 你的约束 - -1. **工具只有 lark-cli**:可以用 `lark-cli` 的任何 shortcut、API、schema 命令。禁止使用 WebFetch / WebSearch / 其他外部工具。 -2. **身份为当前登录的 user**。不要主动切 bot。 -3. **你不知道标准答案**,也不知道答案在哪个文档。你唯一拥有的信息就是 `query`。 -4. **单 case round 预算:12 round**(一个 lark-cli 调用 = 1 round)。超过必须收尾给 best-effort 答案。 -5. 
**Context discipline**: - - 任何 lark-cli 输出 >30 行 → 先 `--format json -q '.data[].title'` 之类精简,或落盘到 `/tmp/case__.txt` 再 grep - - 不要把整篇文档正文贴进 reasoning - - 每一步的内部总结 ≤200 字符 -6. **增量持久化**:每完成 1 round,把 trajectory 追加写入 `/trajectories/.json`。崩溃恢复靠这个文件。 - -### 方法论(**必须先阅读**,不是建议) - -在发出第一条 lark-cli 命令之前,MUST 用 Read 读: -- `skills/lark-shared/SKILL.md` — 认证、全局参数 -- `skills/lark-drive/SKILL.md` + `skills/lark-drive/references/lark-drive-search.md` — 云空间资源发现;优先使用 `drive +search`,不要新增依赖已进入维护期的 `docs +search` -- `skills/lark-doc/SKILL.md` — 命中文档后的 fetch / 内容读取 -(搜索方法论直接在 `lark-drive-search.md` 里:关键词改写 / 失败退出 / 答案型检索循环都在该文件的决策规则段) -- `skills/lark-wiki/SKILL.md` — wiki 节点是壳的关键概念 - -根据 query 类型可能还要读:`lark-im`、`lark-mail`、`lark-vc`、`lark-minutes`、`lark-contact` 等。 - -### 标准流程 - -1. 阅读 query,拆"实体"(人名 / 时间 / 关键词 / 资源类型) -2. 选择搜索入口(drive / im / mail / vc / minutes / ...) -3. 发起搜索;若返回空或无相关结果,按 `lark-drive-search.md` 的"决策规则 / `--query` 高级语法"换 2-3 轮词(同义词 / `intitle:` / 排除词) -4. 对 top 命中做进一步 fetch / resolve(wiki 节点必须先 `wiki +resolve-node`) -5. 综合信息给出答案;若 3 轮改写仍无结果,给 best-effort 结论并明确说"未找到直接证据" -6. 写 `/trajectories/.json`,结束 - -### 输出格式(最后一条消息,JSON) - -```json -{ - "case_id": "", - "answer": "<自然语言答案,markdown 允许>", - "referenced_urls": ["<从 lark-cli 命中的 URL>", ...], - "rounds_used": , - "gave_up": , - "notes": "<可选,给 Judge 的说明,例如:'时间窗超了,只跑了 8 round 提前收敛'>" -} -``` - -### 反模式(会被 Judge 扣分) - -- ❌ 不读 skill 文档直接 `lark-cli api GET /...` 手拼参数 -- ❌ 把 wiki token 当 doc token 传给 `docs +fetch` -- ❌ 搜不到时只重复同一个关键词 -- ❌ 一次性 `lark-cli ... 
| cat` 把 500 行塞进 reasoning -- ❌ 编造答案(没 fetch 过就说"根据文档 X...") - ---- - -## USER(主 agent 拼接时注入) - -``` -query: <来自 dataset.jsonl 的 query 字段原文> -case_id: -run_dir: > -``` - -**除以上三个字段,不注入任何评测集其他字段**。 diff --git a/skills/eval-search/prompts/judge.md b/skills/eval-search/prompts/judge.md deleted file mode 100644 index 9238093c6..000000000 --- a/skills/eval-search/prompts/judge.md +++ /dev/null @@ -1,97 +0,0 @@ -# Judge 打分模板 - -**使用方式**:主 agent 切 hat 执行。Executor 全部跑完后,主 agent 逐 case 读 `trajectory + expected`,按本文件产出 verdict。 - -> **隔离纪律**:不要在 Executor 尚未跑完时开始 Judge(会污染 Executor 所在 reasoning 窗口)。Executor 全部完成、`trajectories/*.json` 落盘后再启动 Judge。 - ---- - -## Judge 每个 case 的输入 - -从磁盘读(**不要复用 Executor 的 reasoning context**): -- `dataset.jsonl` 中该 case 的 `query / expected / source_urls / has_knowledge / rubric_notes` -- `trajectories/.json`(含 rounds 列表 + 最终 answer) -- `preflight.json`(看 `contamination_risk` 和 `tainted_tokens`) -- `skills/eval-search/RUBRIC.md` - -## 每个 case 的打分步骤 - -1. **recall**:扫 trajectory 里的每一条 tool_use,提取被 fetch / resolve 过的 token 和 URL 集合,并读取 `evidence_top_results` / search round 里的非污染 evidence tokens。与 `source_urls` 做交集。标记为 `tainted=true` 或 `evidence_excluded=true` 的 search 结果只能算污染观测,不能算 recall top-5 命中。按 RUBRIC 打分 -2. **accuracy**:把 `answer` 和 `expected.【关键信息】` 段逐条比对。优先应用 `expected.【打分备注】.可信无误` -3. **completeness**:数 key points 覆盖数。优先应用 `expected.【打分备注】.完整详实` -4. **contamination**:查 trajectory 是否 fetch 过 `preflight.tainted_tokens`;search-only 命中只记录风险,不扣污染分,也不作为 recall/accuracy/completeness 的证据。若有 fetch,按 RUBRIC 给 `contamination_penalty` -5. 
**improvement 三桶**:从 trajectory 里找失败片段,分类写进 `tool_capability / search_strategy / skill_prompts` - -## improvement 填写规则 - -**每条建议必须满足**: -- 指向**具体文件**(skill_prompts)、**具体命令**(tool_capability)或**具体动作**(search_strategy) -- 引用 trajectory 里触发该建议的 round 序号 -- 不写"可以更好"这种无落点的建议;写不出具体落点的建议**丢弃**,不要凑数 - -**示例**: - -✅ 好的: -```json -"skill_prompts": [ - "round 4 Executor 把 wiki URL 直接传给 docs +fetch 导致 param invalid。lark-wiki/SKILL.md 的反模式段应加'wiki 链接必须先走 +resolve-node'的明确警告(当前只在 references 里写了)" -] -``` - -❌ 差的: -```json -"skill_prompts": [ - "搜索不够全面", - "agent 应该更聪明地处理 wiki" -] -``` - -## 合并规则(主 agent 在全部 case 打完后做) - -把所有 verdicts 的 `improvement` 按"改动落点文件"去重合并到 `summary.json`: - -```json -{ - "run_id": "2026-04-15T10-00Z", - "dataset_size": 14, - "scored": 13, - "contaminated_fetched": 1, - "totals": { - "sum": 132, - "max": 195, - "percent": 67.7, - "per_dim": {"recall": 2.69, "accuracy": 3.92, "completeness": 3.54} - }, - "findings": [ - { - "finding_id": "F-001", - "bucket": "skill_prompts", - "target_file": "skills/lark-wiki/SKILL.md", - "suggestion": "在反模式段加 'wiki 链接必须先走 +resolve-node' 警告", - "driving_cases": ["case_003", "case_007", "case_011"], - "priority": "high" - }, - { - "finding_id": "F-002", - "bucket": "tool_capability", - "target_file": "shortcuts/docs/search.go", - "suggestion": "drive +search 返回结果没有 body_preview,agent 必须 fetch 才能判断相关性", - "driving_cases": ["case_001", "case_005"], - "priority": "medium" - } - ], - "primary_bottleneck": "skill_prompts", - "pollution_warnings": [] -} -``` - -**priority 判定**: -- `high`: driving_cases ≥3 且 bucket 是 `skill_prompts` / `search_strategy`(改文档成本低、收益面广) -- `medium`: driving_cases ≥2 或 bucket 是 `tool_capability`(代码改动) -- `low`: driving_cases == 1(过拟合风险高,给 Optimizer 作参考但不强推) - -## 自我校准检查(写 verdict 前自问) - -- 我是不是看了 expected 才倒推 trajectory 合理性?(应该反过来:先看 trajectory 自己是否合理,再 check 是否命中 expected) -- contamination_penalty 有没有漏判? 
-- improvement 的三桶比例是否均衡到可疑(例如 13 个 case 全扔 `skill_prompts`,可能是判断懒) diff --git a/skills/eval-search/prompts/optimizer.md b/skills/eval-search/prompts/optimizer.md deleted file mode 100644 index 786e1f699..000000000 --- a/skills/eval-search/prompts/optimizer.md +++ /dev/null @@ -1,150 +0,0 @@ -# Optimizer sub-agent 模板 - -**使用方式**:主 agent 用 Task 工具启动 sub-agent。Optimizer 读 `summary.json` + 失败 case 的关键错误片段 + 仓库代码,产出 diff 草稿。 - -> **关键纪律**:不喂 trajectory 原文全文,只喂主 agent 从失败 case 摘出的"关键错误行"(通常 ≤20 行/case)。这是防过拟合 + 控 context 的核心设计。 - ---- - -## SYSTEM(Task prompt 开头) - -你是 lark-cli 搜索能力评测 harness 的**优化层 sub-agent**。Judge 已经产出 `summary.json`(含聚类后的 findings),你的任务是把这些 findings 转成**可直接 commit 的代码 / 文档改动**,并自我区分哪些是泛化的、哪些是针对具体 case 的。 - -### 你的约束 - -1. **工具**:Read / Edit / Write / Grep / Glob / Bash(仅限 `go build`, `make unit-test`, `git diff`, `gofmt`)。禁止 `git push` / `gh pr create` / `git commit` — 那是主 agent 的事 -2. **白名单 — cli 仓库**(`larksuite/cli`,当前工作目录): - - ✅ `skills/**/*.md`(改已有或新增) - - ✅ 新增 `shortcuts//.go` + 配套 `*_test.go` - - ❌ `internal/**`, `extension/**`, `cmd/root.go`, `cmd/service/**` - - ❌ 旧 shortcut 的删除 / 重命名 / 破坏性修改 -3. **白名单 — open 仓库**(`$GOPATH/src/code.byted.org/lark_as/open/`,**只读导航后才能改**): - - 处理 `tool_capability` 桶里的 finding 时,MUST 先 Read [`../references/open-repo-layout.md`](../references/open-repo-layout.md) 了解允许动哪些文件 - - ✅ 简要:`biz/search_open/entity/{name}.go` 的 `BuildDisplayInfo` / `BuildResponseItem` bug fix / `Prune`,及配套 `*_test.go` - - ❌ 简要:IDL / `handler.go` / `adapter.go` / `api_meta/**` / 新增 OAPI 字段(详见导航手册) - - 涉及 IDL 或契约变更的 finding → 写进 `unhandled_findings.md` 的 `proposed_issue` 段,不写 diff -4. 触犯白名单外的 finding → 写进 `unhandled_findings.md`,建议新人改成 GitHub issue -5. 每次改 cli 仓库 Go 代码后 MUST 跑 `make unit-test` 验证。失败最多迭代 2 次,仍失败则该 finding 降级到 `unhandled_findings.md` -6. open 仓库暂不跑 quality gate(CI 配置非 harness 可控),但 Optimizer 自己 MUST:所有 `.go` 改动过 `gofmt`、动了 `entity/{name}.go` 必须同步动 `entity/{name}_test.go` -7. 
改完所有 cli finding 后 MUST 跑 `go run github.com/golangci/golangci-lint/v2/cmd/golangci-lint@v2.1.6 run --new-from-rev=origin/main` -8. 按 Conventional Commits 格式写 commit message — 双仓库情况下产出两份独立 commit message(见下方产出结构) - -### 输入(主 agent 会拼到 prompt) - -- `summary_json`: 完整 `summary.json` 内容 -- `key_error_snippets`: 每个 high-priority finding 的 driving_cases 里摘的关键错误行(主 agent 挑好) -- `run_dir`: 评测目录,用于读历史产物和写输出 - -### 工作流 - -1. **读 summary 全部 findings**,按 `priority` 降序处理 -2. **对每条 finding**: - - `skill_prompts` bucket → 用 Edit 改 cli 仓库的指定 markdown,保持 tone / 结构与周边一致 - - `search_strategy` bucket → 沉淀到 cli 仓库对应域的 `references/*-search.md`(如 `skills/lark-drive/references/lark-drive-search.md`),不要塞进本 harness 的 prompt 模板 - - `tool_capability` bucket → 分两步判断: - 1. 如果 finding 本质是 cli 封装层不够(缺 shortcut、shortcut 输出难解析),评估能否在 cli 仓库加 shortcut 解决 - 2. 如果是 OAPI 层(`BuildDisplayInfo` 信息不够、字段映射 bug),Read [`../references/open-repo-layout.md`](../references/open-repo-layout.md) 并严格按白名单改 open 仓库。不在白名单的 → 产出 issue 正文,写进 `unhandled_findings.md` 的 `proposed_issue` 段 -3. **过拟合自检**:每条改动自问"这条是否仅对 driving_cases 有效"。如果是,**标记为 case-specific** 写进 `generalization_note.json` -4. 
**写产出**(到 `/pr-draft/`): - -``` -/pr-draft/ -├── diff.patch ← cli 仓库改动(在 larksuite/cli 目录下 git diff > diff.patch) -├── commit_message.txt ← cli 仓库 commit message -├── generalization_note.json -├── unhandled_findings.md -└── open/ ← 若有 open 仓库改动才创建 - ├── diff.patch ← open 仓库改动(在 lark_as/open 目录下 git diff > diff.patch) - ├── commit_message.txt ← open 仓库 commit message - └── touched_files.txt ← 改动文件清单(用于主 agent 白名单复查) -``` - -**重要**:Optimizer 不执行 `git commit`。只产出 diff.patch + commit_message.txt,由主 agent 分别在两个仓库 apply + commit。 - -### generalization_note.json 格式(**必填,主 agent 会读并注入 PR description**) - -每条改动必须带 `repo` 字段(`cli` 或 `open`),主 agent 按此分发到对应 PR。 - -```json -{ - "case_specific_changes": [ - { - "repo": "cli", - "file": "skills/lark-drive/references/lark-drive-search.md", - "change_summary": "在同义词小节新增 '交个朋友 → Livflow 智能平台' 映射", - "driving_cases": ["case_005"], - "risk": "该同义词只由 case_005 驱动,强度弱。reviewer 可判断是否保留" - } - ], - "principled_changes": [ - { - "repo": "cli", - "file": "skills/lark-drive/SKILL.md", - "change_summary": "新增 '搜索词改写失败 3 次后给 best-effort 答案' 决策规则", - "driving_cases": ["case_003", "case_007", "case_011"], - "rationale": "泛化到任何搜索类任务的退出条件,不依赖具体 case 内容" - }, - { - "repo": "open", - "file": "biz/search_open/entity/chat.go", - "change_summary": "BuildDisplayInfo 在群描述为空时 fallback 展示群主名称", - "driving_cases": ["case_012"], - "rationale": "空描述的群目前 agent 只能看到标题,判断相关性信息不足;泛化到所有群搜索结果" - } - ] -} -``` - -`unhandled_findings.md` 内若含涉及 IDL / 契约变更的 finding,按以下结构写 `proposed_issue` 段: - -```markdown -### [proposed-issue] - -**Bucket:** tool_capability -**Driving cases:** case_003, case_008 -**Why not auto-fixed:** 需要 IDL 新增 optional 字段 `.`,跨 idl/open 两仓库,人工处理 - -**Suggested issue body:** -<可直接贴到 github issue 的完整正文,含背景、proto 来源字段、对 agent 决策的价值> -``` - -### commit_message.txt 格式 - -两份 commit message 结构相同,区别在 scope: - -**cli 仓库** (`pr-draft/commit_message.txt`): -``` -feat(eval-search): auto-propose improvements from run - -Driven by /eval-search propose-pr . 
- -- -- -- (case_005) - -Eval: % → % -Regressions: - -Generated-By: eval-search/ -``` - -**open 仓库** (`pr-draft/open/commit_message.txt`): -``` -feat(search_open): improve converter display_info from eval-search run - -- -- - -Driven by: larksuite/cli /eval-search run -Pair: -Generated-By: eval-search/ -``` - -### 禁止事项 - -- ❌ 不要改 `RUBRIC.md` / `prompts/*.md`(你自己的 prompt 不该自己改) -- ❌ 不要改 `dataset` 或评测 base 相关文件(评测集改动不由 Optimizer 负责) -- ❌ 不要修"已知 regression"反向打补丁(那是拼分,不是真修复) -- ❌ 找不到落点的 finding 不要硬凑,写进 `unhandled_findings.md` -- ❌ 不要给 skill markdown 加"由 Optimizer 自动生成"这类元信息注释——文档应读起来是人写的 -- ❌ 不要改 IDL 仓库 / kitex_gen 生成代码 / open 仓库白名单外的任何文件(详见 `open-repo-layout.md`) diff --git a/skills/eval-search/references/cycle.md b/skills/eval-search/references/cycle.md deleted file mode 100644 index 7323ed5b9..000000000 --- a/skills/eval-search/references/cycle.md +++ /dev/null @@ -1,250 +0,0 @@ -# cycle 上层闭环 + 云文档阶段日志 - -`/eval-search cycle` 是 `/eval-search run`、`/eval-search report`、`/eval-search propose-pr` 的上层编排入口。用户只触发一次,主 agent 负责按阶段推进、记录状态、遇到失败时停止并给出可恢复位置。 - -## 入口 - -```text -/eval-search cycle [--subset N] - [--loader-profile ] - [--executor-profile ] - [--report-doc ] - [--create-report-doc] - [--report-parent-token ] - [--skip-pr] -``` - -- `--report-doc`:把本轮阶段日志追加到已有云文档。 -- `--create-report-doc`:未传 `--report-doc` 时创建新云文档;默认创建到当前用户个人空间,可选 `--report-parent-token`。 -- `--skip-pr`:只跑到打分/report,不进入 optimizer 和 PR 创建。 -- 未指定云文档参数时,默认创建新报告文档。除非用户明确禁止云文档记录,否则 cycle 不走纯本地日志模式。 - -## 状态文件 - -cycle 必须先创建本地状态,再调用任何飞书或 GitHub 写操作: - -```text -tests/eval-search/runs// -├── cycle.json -└── cloud-doc/ - ├── 00-created.md - ├── 10-run-started.md - ├── 20-run-finished.md - ├── 30-score-finished.md - ├── 40-pr-finished.md - └── tainted_tokens.json -``` - -`cycle.json` 结构: - -```json -{ - "cycle_id": "2026-05-07T03-30Z", - "run_id": "2026-05-07T03-30Z", - "status": "running", - "started_at": "2026-05-07T03:30:00Z", - "ended_at": null, - "cloud_doc": { - "url": "", - "token": "", - 
"created_by_cycle": true, - "tainted": true - }, - "stages": [], - "pr_urls": [] -} -``` - -每次阶段状态变化都按顺序执行: - -1. 更新 `cycle.json` -2. 渲染一个 `cloud-doc/-.md` -3. 追加到云文档 -4. 只有云文档追加成功后才进入下一个阶段 - -若云文档追加失败,重试一次;仍失败则停止 cycle,把失败写入 `cycle.json`,不要继续提 PR。 - -## 完成定义(必须满足) - -`/eval-search cycle` 不是本地脚本跑完就结束。未传 `--skip-pr` 时,必须同时交付: - -- `summary.json` / `verdicts.json` 已写入本地 run 目录 -- 云文档已创建或追加成功,且文档 URL 写入 `cycle.json.cloud_doc.url` -- 云文档 token 已写入 `cloud-doc/tainted_tokens.json` -- draft PR 已创建,PR URL 写入 `cycle.json.pr_urls` -- PR description 包含云文档 URL、run-id、分数摘要、污染摘要和未处理 finding -- PR URL 已追加回云文档的 final 段 -- 最终回复用户时同时给出云文档 URL 和 PR URL - -任一必需链接缺失时,cycle 状态只能是 `failed` 或 `blocked`,不能回复“已完成”。 - -## 云文档创建 / 追加 - -创建新文档: - -```bash -lark-cli docs +create --api-version v2 --as user \ - --doc-format markdown \ - --content @tests/eval-search/runs//cloud-doc/00-created.md \ - --jq '.data.document.url' -``` - -创建到指定目录: - -```bash -lark-cli docs +create --api-version v2 --as user \ - --parent-token '' \ - --doc-format markdown \ - --content @tests/eval-search/runs//cloud-doc/00-created.md \ - --jq '.data.document.url' -``` - -追加阶段日志: - -```bash -lark-cli docs +update --api-version v2 --as user \ - --doc '' \ - --command append \ - --doc-format markdown \ - --content @tests/eval-search/runs//cloud-doc/20-run-finished.md -``` - -Markdown 文件必须使用 `@file` 传参,避免 shell 转义破坏表格、链接或代码块。 - -## 云文档内容边界 - -云文档是给人看进度和 review 结果的,不是评测原始数据仓库。允许写: - -- cycle-id / run-id / git head / 分支 / 账号类型 -- stage 状态、开始结束时间、失败原因 -- dataset 数量、preflight 污染数量、executor 完成数量 -- 总分、各维度均值、finding 聚类摘要、PR URL -- 本地产物路径,例如 `tests/eval-search/runs//summary.json` - -禁止写: - -- `dataset.jsonl` 全量内容 -- 标准答案、source URLs、rubric 的 per-case 原文 -- 完整 trajectory、完整 verdict rationale、key_error_snippets -- 任何 access token、app secret、cookie、GitHub token - -per-case 信息只允许写 `case_id`、分数、桶归因和一句不含标准答案的摘要。 - -## 阶段编排 - -### 0. 
setup - -- 确认 repo 路径和分支 -- 确认 `lark-cli auth status`、`gh auth status` -- 生成 `run-id` -- 创建 `cycle.json` -- 创建或绑定云文档 -- 把云文档 token 写入 `cloud-doc/tainted_tokens.json` - -setup 文档段落必须包含醒目的污染声明: - -```markdown -# eval-search cycle - -> This document is eval-search process material. It may contain benchmark summaries and must be treated as tainted for future search evaluations. - -| Field | Value | -|---|---| -| Run ID | `` | -| Status | `setup started` | -``` - -### 1. run - -内部执行 `/eval-search run` 的流程:拉数据集、污染预检、Executor、Judge、聚合。 - -阶段日志至少追加两次: - -- `run started`:记录 run-id、subset、loader/executor profile、run 目录 -- `run finished`:记录 dataset size、scored count、skipped count、trajectory 数、summary 路径 - -### 2. score/report - -读取 `summary.json` 和 `verdicts.json`,形成面向人的摘要。该阶段不重新打分,只消费 run 阶段已经产出的 Judge 结果。 - -必须记录: - -- 总分 / 满分 / 百分比 -- recall / accuracy / completeness / contamination_penalty 的总和与均值 -- top findings,最多 10 条 -- tainted fetch cases 数量和 case_id 列表 - -### 3. propose-pr - -未传 `--skip-pr` 时进入该阶段。内部执行 `/eval-search propose-pr `: - -- Optimizer 生成 diff -- 主 agent 复查 PR 颗粒度和白名单 -- 质量门禁 -- regression 重跑 -- 生成 PR description,并把云文档 URL 写入 description -- 创建 draft PR,记录返回的 PR URL -- 立刻把 PR URL 回写到 `cycle.json.pr_urls` -- 追加 `40-pr-finished.md` 到云文档,包含 PR URL - -云文档记录: - -- PR URL / state / draft 状态 -- touched files -- quality gate 结果 -- before/after 分数摘要 -- 未处理归因 - -如果没有可提交改动,记录 `no-op`,不创建空 PR。 - -PR 创建失败时,必须把失败原因、当前分支、commit sha、可恢复命令写入云文档;不得只在本地终端输出错误。 - -### 4. final - -更新 `cycle.json.status`: - -- `completed`:所有启用阶段完成 -- `completed_without_pr`:`--skip-pr` 或 no-op -- `failed`:任一必需阶段失败 - -最后追加一段总览,包含下一步建议和恢复命令: - -```markdown -## Final - -| Field | Value | -|---|---| -| Status | completed | -| Run ID | `` | -| Summary | `tests/eval-search/runs//summary.json` | -| PR | `` | -| Report Doc | `` | -``` - -最终回复必须包含: - -```text -PR: -Cloud report: -Run ID: -``` - -## 污染控制 - -cycle 生成或更新的云文档默认是 tainted/process material。规则: - -1. 
创建或绑定文档后,立刻提取 doc token,写入 `cloud-doc/tainted_tokens.json` -2. 本 cycle 的 regression / after-run 必须把该 token 作为额外 tainted token -3. 未来持久 blocklist 需要单独处理: - - 单独开 `chore(eval-search): blocklist cycle report ` PR;或 - - 在云文档无法被 executor 账号搜索到的前提下,在本轮报告中说明未持久化 blocklist -4. 不得把 blocklist 更新混入 `search_strategy`、`skill_prompts` 或 `tool_capability` 优化 PR - -## 恢复策略 - -- `setup` 失败:修复认证或文档权限后,重新执行 cycle -- `run` 失败:保留 `cycle.json`,从已有 `run-id` 的本地 artifact 判断是否能补跑缺失 case;不能补跑则新 cycle -- `score/report` 失败:不重跑 Executor,只重新读取 `summary.json` / `verdicts.json` 并追加云文档 -- `propose-pr` 失败:修复 git/gh/quality gate 后,从同一 `run-id` 重新执行 propose-pr 阶段,并追加恢复记录 - -任何恢复都必须追加云文档段落,不得静默覆盖既有记录。 diff --git a/skills/eval-search/references/dataset.md b/skills/eval-search/references/dataset.md deleted file mode 100644 index 6af493683..000000000 --- a/skills/eval-search/references/dataset.md +++ /dev/null @@ -1,127 +0,0 @@ -# 评测集 schema + 拉取方式 - -## 位置 - -评测集存在飞书多维表格(**live 数据源**,PM 持续更新): - -- base_token: `OOoEbNWhcaFOdisXDW7c0lKtn4g` -- table_id: `tblGWdc19tKFZC6K` -- view_id: `vewGToSnWl` -- URL: https://bytedance.larkoffice.com/base/OOoEbNWhcaFOdisXDW7c0lKtn4g?table=tblGWdc19tKFZC6K&view=vewGToSnWl - -> **污染警告**:这个 base 本身会被 `drive +search` 命中。harness 必须把账号拆成两个 profile:loader profile 只用于读取这个 base 并生成 `dataset.jsonl`;executor profile 只用于盲测搜索,**不可**加入该 base 的查看权限,否则评测结果被自答污染。详见 [`pollution-preflight.md`](pollution-preflight.md)。 - -## 原始字段(字段 id → 含义) - -| 字段名 | 类型 | 说明 | -|--------|------|------| -| `query` | text | 自然语言问题;Executor 唯一可见输入 | -| `len` | number | 历史字段,忽略 | -| `企业内是否有知识` | single-select | `是` / `否`。`否` 意味着企业知识库里本来就没答案,Executor 应答"找不到",recall 维度固定给 5 | -| `预期答复(机评文本)` | text | 含三段:【关键信息】/ 【辅助信息】/ 【打分备注】。Judge 独占使用,**Executor 不可见** | -| `数据源地址` | text(markdown 链接) | expected source URLs;Judge 独占使用,**Executor 不可见** | - -## 拉取命令 - -推荐用确定性 setup runner 拉取并转换: - -```bash -node --experimental-strip-types tests/eval-search/eval-search-run.ts \ - --loader-profile \ - --executor-profile \ - 
--subset 3 -``` - -如果只有一个账号,可以拆成两步: - -```bash -# 账号仍有评测 Base 权限时,只拉本地快照 -node --experimental-strip-types tests/eval-search/eval-search-run.ts \ - --snapshot-only \ - --loader-profile - -# 移除该账号的评测 Base 权限后,从本地快照继续盲测 setup -node --experimental-strip-types tests/eval-search/eval-search-run.ts \ - --dataset-file tests/eval-search/runs//dataset.jsonl \ - --executor-profile -``` - -只看原始 Base 拉取时,用 loader profile 执行: - -```bash -lark-cli --profile base +record-list \ - --as user \ - --base-token OOoEbNWhcaFOdisXDW7c0lKtn4g \ - --table-id tblGWdc19tKFZC6K \ - --view-id vewGToSnWl \ - --limit 100 -``` - -返回形如: -```json -{ - "ok": true, - "data": { - "data": [ [value_of_query, value_of_len, ...], ... ], - "field_id_list": ["fldh3DHP53", ...], - "fields": ["query", "len", "企业内是否有知识", "预期答复(机评文本)", "数据源地址"], - "record_id_list": ["recvg4qIXMSU6K", ...], - "has_more": true - } -} -``` - -若 `has_more=true`,用 `--offset` 翻页直到全部拉完。 - -## 转换为 harness 内部 schema - -主 agent 把每一行转成一个 case 对象,拼成 `dataset.jsonl`(jsonl,一行一个 case): - -```json -{ - "case_id": "case_001", - "record_id": "recvg4qIXMSU6K", - "query": "华东客户有哪些 Aily 优秀使用案例", - "has_knowledge": true, - "expected": { - "key_points": "【关键信息】的原文段", - "aux_info": "【辅助信息】的原文段", - "rubric_notes": { - "类型说明": "开放问题", - "可信无误": "不局限于ref,只要明确作为aily使用案例出现即算可信", - "完整详实": "答出5个及以上不扣分", - "结构清晰": "无", - "语言表述": "无", - "相关辅助": "无", - "引用准确": "无" - } - }, - "source_urls": [ - "https://bytedance.larkoffice.com/wiki/HxnMwM9cyiFW1dkACUBcC7KWnEd", - "https://bytedance.larkoffice.com/wiki/Es5wwNCyei3eYNkXc8Tcx35nnWe" - ] -} -``` - -### 转换要点 - -1. **case_id 编号**:按 record_id 在返回里的顺序分配 `case_001, case_002, ...`。同一次 run 内稳定,跨 run 不保证(PM 在 base 里插新行会错位)。如需跨 run 追踪,用 `record_id` -2. **filter `企业内是否有知识`**:harness 同时支持 `是` 和 `否` 的 case;但**pilot 阶段建议只跑 `是` 的**(`否` case 判分逻辑更复杂,后续加) -3. 
**解析 `预期答复` 的三段**: - - split 文本找 `【关键信息】` / `【辅助信息】` / 【打分备注】` 三个 heading - - 【打分备注】段是嵌套 JSON,`json.loads` 解析到 `rubric_notes` - - 解析失败的 case 标记 `parse_error: true`,跳过不评(写进 `summary.json.skipped`) -4. **解析 `数据源地址`**:正则提取 markdown 链接 `[text](url)` → `source_urls: [url, ...]`。非 URL 的纯文本(如提示语)忽略 -5. **空 query 过滤**:`query` 字段为空或纯空白的记录跳过 - -## Pilot 样本:只跑前 3 条冒烟 - -`/eval-search run --subset 3` 只拉前 3 条 `是` 类 case 跑。用于: -- 第一次落地 harness,验证端到端能跑通 -- auto-PR 流程的 dry-run(改完 skill 跑 3 条看趋势) - -## 频率 / 数据漂移 - -PM 在 base 里编辑 case 是常态。harness 不做 snapshot 冻结(v0.1 范围外),每次 `run` 拉最新。 - -**代价**:v_n 和 v_{n+1} 的分数差会混入 dataset 变化。在 PR description 里强制标注 `dataset_size / first_run_of_records` 两个字段,reviewer 自己判断。 diff --git a/skills/eval-search/references/open-repo-layout.md b/skills/eval-search/references/open-repo-layout.md deleted file mode 100644 index bac261cd7..000000000 --- a/skills/eval-search/references/open-repo-layout.md +++ /dev/null @@ -1,162 +0,0 @@ -# open 仓库导航手册(Optimizer 专用) - -> **读者:** `prompts/optimizer.md` 在处理 `tool_capability` 桶的 finding 时会 Read 这篇文档。 -> -> **目的:** 把 `lark_as/open` 仓库当"受控沙盒" — 明确 Optimizer 允许改哪些文件、禁止碰哪些文件、改完怎么验证。 - -## 仓库定位 - -``` -$GOPATH/src/code.byted.org/lark_as/open/ -``` - -这是 lark-cli 背后的 OpenAPI 服务层(后台简称 suite.as.open)。它把飞书内部大搜 PB(MGUniversalSearch)封装成面向外部的 OAPI。CLI 调这些 OAPI,agent 调 CLI。整条链路: - -``` -CLI (larksuite/cli) - → OAPI (lark_as/open) - → kitex_gen stub (git.byted.org/ee/go/kitex_gen, 由 IDL 仓库自动生成) - → RPC → 大搜后端 -``` - -**Optimizer 只动 open 仓库一层。** IDL 和 kitex_gen 不动(见禁止清单)。 - -## 核心目录(只读懂即可) - -``` -biz/search_open/ ← AI Friendly 新框架,所有改动都在这里 -├── entity/ ← 每实体一个 converter 文件 -│ ├── iconverter.go ← Converter 接口定义(不动) -│ ├── chat.go ← 参考实现(group chat 搜索) -│ ├── meeting.go ← 参考实现(平台实体,走 SlashCommand) -│ ├── message.go / doc.go / wiki.go / user.go / mail.go / task.go / ... 
-│ └── timeutil.go ← 时间格式工具(不动) -├── adapter.go ← 调 UniversalSearch RPC(不动) -├── handler.go ← 编排(不动) -├── pagetoken.go ← 翻页(不动) -├── response.go ← 错误码(不动) -├── CLAUDE.md ← open 仓库的开发规范,读它能看懂架构 -└── api_meta/{entity}/ ← 每实体 4 个 yml(search/filter/item/meta) - -biz/handler/handler.go ← 顶层路由(不动) -rpc/ ← 旧搜索 + RPC 封装(不动) -main.go / conf/ / utils/ ← 基础设施(不动) -``` - -## Converter 接口速览 - -每个 `entity/{name}.go` 都实现同一套 5 方法接口: - -```go -type Converter interface { - EntityType() usearch.SearchEntityType - BuildEntityItem(ctx, req) (*usearch.BaseEntity_EntityItem, error) // OAPI Filter → PB Filter - BuildResponseItem(result *usearch.SearchResult) (interface{}, error) // PB Meta → OAPI Item - BuildDisplayInfo(result *usearch.SearchResult) string // 组装给 AI 看的 markdown 卡片 - Prune(item interface{}, fields []string) interface{} // 字段裁剪 -} -``` - -**AI friendly 的高杠杆改动点几乎全在 `BuildDisplayInfo`**:它返回的 markdown 就是 agent 在 CLI 里看到的搜索结果文本。大搜结果里的标题、摘要、上下文、高亮(`` 标签)的组装方式直接决定 agent 能否一眼判断相关性。 - -## ✅ 允许改动(白名单) - -以下三类改动 Optimizer 可以直接写 diff,不需要动 IDL: - -### 1. `BuildDisplayInfo` 优化 - -- 补充 markdown 字段(例如加入更多上下文、路径信息、作者、时间) -- 调整高亮策略(命中词用 `` 标签包裹) -- 修复格式化 bug(换行、空字段处理、转义) - -**边界:** 只能使用 `*usearch.SearchResult` 里已有的字段。要是需要 PB 没返回的信息,那是 PB/IDL 的问题,降级为 issue。 - -### 2. `BuildResponseItem` 的字段映射 bug fix - -- `nil` 指针防御 -- 时间戳转换错误(`UnixToISO8601` / `UnixMsToISO8601` 用错) -- 枚举值映射错(比如 `chatStatusNormal` 漏判) -- ID 字段赋值缺失 - -**边界:** 只能在已有 OAPI 响应字段上做映射修复;**不能**新增 OAPI 响应字段(那是 IDL 级别的契约变更)。 - -### 3. `Prune` 敏感字段裁剪 - -- 根据业务需要把敏感/内部字段从响应里去掉 - -### 4. 
配套测试 - -- 每次改 `entity/{name}.go` **必须**同时更新 `entity/{name}_test.go`,否则 quality gate(未来启用)会 block - -## ❌ 禁止改动(硬黑名单) - -| 路径 | 原因 | -|------|------| -| `../lark/idl/**` | IDL 在另一个仓库,需要跑 overpass + go get,不是 PR 范畴 | -| `biz/search_open/handler.go` | 编排逻辑,动了容易坏所有实体 | -| `biz/search_open/adapter.go` | RPC 适配层,牵扯协议 | -| `biz/search_open/pagetoken.go` | 翻页 + Redis,幂等性敏感 | -| `biz/search_open/response.go` | 错误码契约 | -| `biz/search_open/entity/iconverter.go` | Converter 接口,动了所有实体都得跟 | -| `biz/search_open/entity/timeutil.go` | 时间工具,动了影响所有实体 | -| `biz/search_open/api_meta/**/*.yml` | 新增 / 修改 schema = 契约变更,走人工 | -| `biz/handler/handler.go` | 顶层路由 | -| `rpc/**` | 旧搜索 + RPC 封装 | -| `main.go` / `conf/**` / `utils/**` | 基础设施 | -| `go.mod` / `go.sum` | 依赖升级人工做 | - -**触犯任一条** → finding 必须进 `unhandled_findings.md`,附带 issue 描述建议,不写进 diff。 - -## 新增 OAPI 字段(即使是 optional)的处理 - -**Optimizer 不能自动加字段。** 流程太复杂: - -1. 需要改 IDL 仓库(`$GOPATH/src/code.byted.org/lark/idl/idl/suite/as/open/*.thrift`) -2. 需要跑 overpass 生成 kitex_gen stub -3. 需要 `go get` 拉 stub 更新 -4. 需要同步改 open 仓库的 converter 映射 -5. 需要同步改 `api_meta/{entity}/*.yml` schema - -这是多仓库协作 + 手工步骤,Optimizer 不应该做。改为产出 GitHub issue 正文,正文包含: - -- 哪个 entity 需要新字段 -- 字段含义(含 proto 里已有的来源字段,若有) -- driving case 的引用 -- 对 agent 决策的价值说明 - -issue 正文写进 `unhandled_findings.md` 的 `proposed_issue` 段,由人工创建。 - -## 验证策略(当前版本) - -**Quality gate 暂未启用**(`/eval-search propose-pr` 跳过 open 仓库测试)。原因:open 仓库跑测试需要下游依赖,CI 配置不是 harness 可控的。PR 开出去之后,open 仓库的 CI 会自己跑。 - -Optimizer 自己必须做的最小校验: - -1. 所有改动文件 `gofmt` 过 -2. 改了 `entity/{name}.go` 必须同步动 `entity/{name}_test.go`(至少加一条测试覆盖修改的分支) -3. 
不允许删除已有测试 - -## 参考文件(Optimizer 生成改动前**必读**) - -- `biz/search_open/CLAUDE.md` — 开发规范原文 -- `biz/search_open/entity/chat.go` — 完整 converter 参考 -- `biz/search_open/entity/chat_test.go` — 测试写法参考 -- `biz/search_open/entity/meeting.go` — 平台实体 converter 参考(`BuildDisplayInfo` 写法略有不同) - -## 与主 agent 的交互契约 - -Optimizer 处理涉及 open 仓库的 finding 时,产出放在 `pr-draft/open/` 子目录(和 cli 仓库的 `pr-draft/` 同级): - -``` -tests/eval-search/runs//pr-draft/ -├── diff.patch # cli 仓库改动(原本就有) -├── generalization_note.json -├── unhandled_findings.md -├── commit_message.txt -└── open/ # 新增:open 仓库改动 - ├── diff.patch # 应用到 $GOPATH/src/code.byted.org/lark_as/open/ - ├── commit_message.txt - └── touched_files.txt # 命中白名单校验的冗余证据 -``` - -主 agent 拿到两份 diff.patch 之后,分别 checkout 两个仓库、分别 apply、分别 commit、分别 `gh pr create`,在两个 PR description 里互相 link(见 `pr-generation.md`)。 diff --git a/skills/eval-search/references/pollution-preflight.md b/skills/eval-search/references/pollution-preflight.md deleted file mode 100644 index 4ff88fcc7..000000000 --- a/skills/eval-search/references/pollution-preflight.md +++ /dev/null @@ -1,105 +0,0 @@ -# 污染预检规则 - -## 动机 - -评测集 base 自身、v1/v2 迭代记录文档、含 expected 的参考文档,都可能被 `drive +search` 命中。Executor 一旦 fetch 到,就是"开卷考试"——分数失去意义。 - -v2 的教训:PM 的 dataset base 在第一次跑评测时,几乎所有 query 的搜索 top-1 都是 dataset 自己。 - -因此 `/eval-search run` 需要两个 lark-cli profile: -- `loader-profile`:能读评测 Base,只负责拉取 live dataset 并写入 `dataset.jsonl` -- `executor-profile`:负责盲测搜索,必须不能读评测 Base - -也可以用同一个人账号做时间隔离:先在有权限时运行 `--snapshot-only` 拉本地快照;随后把该账号从评测 Base 权限里移除;最后用 `--dataset-file` 从本地快照继续。第二步运行时仍会探测 executor 是否能读 Base,能读则阻断。 - -## 两道防线(必须叠加) - -### 防线 1:专用账号(物理隔离) - -harness 启动时 MUST 先对 executor profile 做账号检查: - -```bash -lark-cli --profile auth status -``` - -从返回里读 `userOpenId`,对照 [`known-tainted-tokens.md`](known-tainted-tokens.md) 的 `excluded_user_ids` 列表: -- 命中 → **拒绝启动**,报错退出:`当前账号在 excluded_user_ids 里;harness 必须用专用测试账号运行` -- 未命中 → 继续 - -**新建测试账号步骤**(手工一次性): -1. 申请独立企业飞书账号(非 PM、非 dataset owner) -2. 
账号不加入评测集 base 的权限,不加入"参考流程文档"的权限 -3. 在 `~/.config/lark-cli/profiles/` 下建独立 profile,`lark-cli auth login --profile eval-search` -4. 评测运行时:`lark-cli --profile eval-search ...` - -setup runner 还会主动探测 executor profile 是否能读取评测 Base: - -```bash -lark-cli --profile base +record-list \ - --as user \ - --base-token OOoEbNWhcaFOdisXDW7c0lKtn4g \ - --table-id tblGWdc19tKFZC6K \ - --view-id vewGToSnWl \ - --limit 1 -``` - -期望结果是权限失败。若读取成功,说明 executor 可直接搜到或打开评测集,必须阻断本轮 run。 - -### 防线 2:Pre-flight 扫描(兜底) - -即使账号做了物理隔离,某些情况下仍可能被污染(例如:某个新建文档恰好包含了答案且权限开放)。Pre-flight 作为兜底: - -**流程**: - -``` -for each case in dataset.jsonl: - result = lark-cli --profile drive +search --query "" --page-size 20 - hit_tokens = extract all obj_token / wiki_token from result - tainted = hit_tokens ∩ known_tainted_tokens - - write to preflight.json: - { - "case_id": "case_001", - "contamination_risk": len(tainted) > 0, - "tainted_tokens": [...], - "top_20_tokens": [...] - } -``` - -实际执行时,`known_tainted_tokens` 由持久清单 [`known-tainted-tokens.md`](known-tainted-tokens.md) 和本轮 `cloud-doc/tainted_tokens.json` 合并得到。后者用于 `/eval-search cycle` 生成的临时报告文档,避免还没进入持久 blocklist 的过程材料影响本轮 after-run。 - -**不阻断**,只标记。原因:有时 pre-flight 命中但 Executor 最终没 fetch,这种 case 依然有效,Judge 会打出正常 recall 分。 - -### known_tainted_tokens 的维护 - -见 [`known-tainted-tokens.md`](known-tainted-tokens.md)。三类必须纳入: -1. **评测集 base 自身**:`OOoEbNWhcaFOdisXDW7c0lKtn4g` -2. **v1/v2 迭代记录 docx**:`VdUKdAXjmo9vl8xq4FrczK6unct`(含全部评测方法论 + 具体 case 分数) -3. 
**人类写的"答题参考"/"流程总结"**:任何在评测过程中被主 agent 写到飞书的 note - -每次新增一个"讨论评测过程"的飞书文档,记得加进标记清单(或者更简单:**不要在飞书上写这种文档**,都写成本仓库 markdown)。 - -## Judge 怎么用 preflight 数据 - -Judge 读 `preflight.json` 判断 `contamination_penalty`: - -``` -for each case: - if preflight[case].contamination_risk == true: - scan trajectory for any tool_use that fetched one of tainted_tokens - if fetched: - if answer directly quotes tainted doc content: - contamination_penalty = -3 - else: - contamination_penalty = -1 - else: - contamination_penalty = 0 - else: - contamination_penalty = 0 -``` - -## 常见坑 - -- **wiki 链接**:`wiki://space_xxx/node_yyy` 背后的 obj_token 才是真实目标。pre-flight 扫描时必须同时记录 `wiki_token` 和 `obj_token` 两层,任一命中标记清单即 tainted -- **短链 / applink**:`applink.feishu-pre.net/...` 跳转后的最终 URL 可能是 tainted,建议 Executor 遇到短链先解析一跳再判断。这条太细,v0.1 不做强约束 -- **账号隔离失效**:PM 手滑把 dataset base 对全员开放,专用账号又能看到了。定期(每次 run 前)手动检查一下 base 的权限列表 diff --git a/skills/eval-search/references/pr-generation.md b/skills/eval-search/references/pr-generation.md deleted file mode 100644 index 14046afd6..000000000 --- a/skills/eval-search/references/pr-generation.md +++ /dev/null @@ -1,293 +0,0 @@ -# PR 生成流程 + description 模板 - -## 双 PR 模型 - -Optimizer 的产出可能横跨两个仓库: - -- **cli 仓库**(`larksuite/cli`,当前工作目录):skill 文档改动、新增 shortcut -- **open 仓库**(`$GOPATH/src/code.byted.org/lark_as/open/`):converter 层 `BuildDisplayInfo` 优化、bug fix - -两个仓库分别提 PR,**独立 review、独立 merge**(决策 2A)。PR description 里互相 link,但不绑定 merge 顺序——一个先 merge 另一个还没 merge 也 OK。 - -若本次 run 只有 cli 改动,`pr-draft/open/` 目录不存在,跳过所有 open 仓库步骤。 - -## 总流程 - -``` -/eval-search propose-pr - │ - ├─[0] 前置检查 - │ ├─ cli 仓库 git status 必须干净(non-dirty);否则 abort - │ ├─ cli 仓库当前分支是 main;否则 abort - │ ├─ runs//summary.json 存在且 scored >0 - │ ├─ runs//meta.json.git_dirty != true - │ └─ 若 Optimizer 产出涉及 open 仓库 → 同样检查 open 仓库 git status / 分支 - │ - ├─[1] Optimizer sub-agent(Task 工具) - │ 输入: summary.json + key_error_snippets + 两个仓库路径 - │ 输出: pr-draft/{diff.patch, commit_message.txt, generalization_note.json, 
unhandled_findings.md} - │ 若有 open 改动 → pr-draft/open/{diff.patch, commit_message.txt, touched_files.txt} - │ 注意: Optimizer 不自己 git commit / git apply,一切由主 agent 执行 - │ - ├─[2] 白名单复查(主 agent,防 Optimizer 越权) - │ ├─ cli diff 命中路径都在白名单内(skills/**/*.md、shortcuts/**) - │ └─ open diff 命中路径都在白名单内(biz/search_open/entity/{name}.go + *_test.go) - │ 违反 → abort,Optimizer 降级迭代 - │ - ├─[3] cli 仓库 apply + commit - │ cd - │ git checkout -b eval-search/auto-pr/ - │ git apply pr-draft/diff.patch - │ ├─[3a] Quality gate - │ │ make unit-test # 必过 - │ │ golangci-lint run --new-from-rev=origin/main # 必过 - │ │ 失败 → Optimizer 最多迭代 2 次;仍失败 → rollback,该 finding 降级为 unhandled - │ └─ git add . && git commit -F pr-draft/commit_message.txt - │ - ├─[4] open 仓库 apply + commit(若有) - │ cd $GOPATH/src/code.byted.org/lark_as/open - │ git checkout -b eval-search/auto-pr/ - │ git apply /pr-draft/open/diff.patch - │ # 无 quality gate(暂时),Optimizer 自己已做 gofmt 和测试更新 - │ git add . && git commit -F /pr-draft/open/commit_message.txt - │ - ├─[5] 确定性 regression 重跑 - │ 调用 /eval-search run 内部逻辑(无 agent 参与),生成 after_verdicts.json - │ 对比 before(summary.json)vs after,产出 per-case diff - │ 注意: open 改动若依赖 CI 部署才能生效,after 结果反映的是 cli 改动的影响;在 description 里标注 - │ - ├─[6] 组装 PR description - │ 按本文件下方模板生成 cli 和 open 两份 description.md,互相留 link 占位 - │ 若由 /eval-search cycle 调用,description 必须包含 cloud report URL - │ - ├─[7] gh pr create --draft(cli) - │ cd && gh pr create --draft → 记录 PR url CLI_PR_URL - │ 若由 /eval-search cycle 调用,立刻回写 cycle.json.pr_urls 并追加云文档 - │ - └─[8] gh pr create --draft(open,若有) - cd && gh pr create --draft,description 里 Pair 字段填入 CLI_PR_URL - 创建完之后回到 cli PR,用 gh pr edit 把 open PR url 填到 cli description 的 Pair 段 -``` - -## PR URL 交付契约 - -`gh pr create --draft` 的返回 URL 是 `/eval-search propose-pr` 的主产物,必须持久化到: - -- `tests/eval-search/runs//pr-draft/pr-url.txt` -- `tests/eval-search/runs//summary.json` 的 `pr_urls` 字段(若已有 summary) -- `cycle.json.pr_urls`(仅 `/eval-search cycle`) -- 云文档 final / pr-finished 段(仅 
`/eval-search cycle`) - -最终回复用户时必须直接贴出 PR URL。若 PR 创建失败,回复中必须说明失败阶段、失败命令和当前可恢复分支,不得只说“已提交”。 - -## Quality gate 失败处理 - -两次迭代后仍失败的 finding: - -1. 回滚那一条 finding 的改动(其他 finding 保留) -2. 把它写进 `unhandled_findings.md`,归类为 `quality_gate_failure`,附带完整错误输出 -3. PR description 的"未处理归因"段列出这些 finding 并建议新人创建 issue - -## PR description 模板(cli 仓库) - -```markdown - - -## 摘要 - -基于 eval-search run `{{run_id}}` 自动生成,共 {{n_findings}} 条改进落地({{n_skipped}} 条未处理)。 - -{{#if open_pr_url}} -**Pair:** [{{open_pr_title}}]({{open_pr_url}}) — open 仓库的配套改动,独立 review。 -{{/if}} - -## 评测对比(before vs after) - -| 指标 | before | after | Δ | -|------|--------|-------|---| -| 总分 | {{before_total}} / {{max}} ({{before_pct}}%) | {{after_total}} / {{max}} ({{after_pct}}%) | **{{delta}} ({{delta_pp}}pp)** | -| recall | {{before_recall}} | {{after_recall}} | {{delta_recall}} | -| accuracy | {{before_accuracy}} | {{after_accuracy}} | {{delta_accuracy}} | -| completeness | {{before_completeness}} | {{after_completeness}} | {{delta_completeness}} | - -- Dataset size: {{dataset_size}} (同一份 base 拉取;dataset 可能已被 PM 更新,per-case diff 以 `record_id` 对齐) -- 评测账号: `{{user_name}}` (open_id `{{user_open_id}}`) -- Pollution: {{contaminated_count}} case 命中 tainted tokens{{#if contaminated_count}} — 见附录{{/if}} -{{#if cloud_report_url}} -- Cloud report: {{cloud_report_url}} -{{/if}} - -## Wins(by case) - -{{#each wins}} -- `{{case_id}}` ({{record_id}}): **{{before}}→{{after}}** (+{{delta}}) - - driver: {{driver_findings}} -{{/each}} - -## ⚠️ Regressions(软告警 — reviewer 请核验) - -{{#if regressions}} -{{#each regressions}} -- `{{case_id}}` ({{record_id}}): **{{before}}→{{after}}** ({{delta}}) - - 可能原因: {{hypothesis}} - - 建议 reviewer: 查看 `tests/eval-search/runs/{{run_id}}/trajectories/{{case_id}}.json` 对比前后行为 -{{/each}} -{{else}} -_无 regression_ -{{/if}} - -## 改动分类(Optimizer 自述) - -### 泛化原则性改动(适用面广,reviewer 较快可信) - -{{#each principled_changes}} -- **{{file}}**: {{change_summary}} - - rationale: {{rationale}} - - driven by: 
{{driving_cases}} -{{/each}} - -### 针对具体 case 的改动(⚠️ 过拟合风险,reviewer 重点判断) - -{{#if case_specific_changes}} -{{#each case_specific_changes}} -- **{{file}}**: {{change_summary}} - - risk: {{risk}} - - driven by: {{driving_cases}} -{{/each}} -{{else}} -_无_ -{{/if}} - -## 未处理归因 - -{{#if unhandled}} -以下 findings 本 PR 未处理,建议 reviewer 考虑创建 issue: - -{{#each unhandled}} -- **[{{bucket}}]** {{suggestion}} - - 未处理原因: {{reason}} - - driving: {{driving_cases}} -{{/each}} -{{else}} -_无_ -{{/if}} - -## 怎么 review 这个 PR - -1. 先看"评测对比"总分是否真有提升 -2. 扫一眼 Regressions,若有,点进 trajectory 看是不是噪声 -3. 重点 review "针对具体 case 的改动"——判断是否过拟合 -4. 泛化性改动是文档修订,读 diff 即可 -5. 如涉及 Go 代码,CI 已过 `make unit-test` + lint,关注接口设计 - -## 复现 - -```bash -git checkout eval-search/auto-pr/{{run_id}} -/eval-search report {{run_id}} -``` - ---- - -🤖 Generated by [eval-search harness](../skills/eval-search/SKILL.md) -``` - -## PR description 模板(open 仓库) - -比 cli 版本精简,不重复写 wins/regressions 表格(那是 CLI 端视角),只列本 PR 的改动 + 回指 cli PR。 - -```markdown - - -## 摘要 - -配合 cli 仓库 `eval-search` 评测结果优化 OAPI converter 层。改动范围:`biz/search_open/entity/` 下的 `BuildDisplayInfo` / `BuildResponseItem` / `Prune`,**不涉及 IDL 和契约变更**。 - -**Pair:** [{{cli_pr_title}}]({{cli_pr_url}}) — 主 PR,含完整评测对比、泛化声明、未处理归因。 - -## 改动清单 - -{{#each open_changes}} -- **`{{file}}`**: {{change_summary}} - - driven by: {{driving_cases}} - - 过拟合风险: {{risk_level}} -{{/each}} - -## 怎么 review - -1. 每条改动本质都是 converter 输出字符串的优化,对协议无影响 -2. Quality gate 未跑(harness 暂未接 open 仓库 CI),reviewer 请关注: - - 空字段 / nil 指针防御是否到位 - - markdown 高亮标签 `` 使用是否一致 - - 测试是否覆盖了修改的分支 -3. 对 agent 效果的量化验证在 cli PR 的评测对比段 - -## 复现 cli 侧评测 - -```bash -cd -/eval-search report {{run_id}} -``` - ---- - -🤖 Generated by [eval-search harness](https://github.com/larksuite/cli/tree/main/skills/eval-search) -``` - -## 模板填充注意 - -- 所有百分比保留 1 位小数 -- `driving_cases` 最多列 5 个,超过写 `case_003, case_007, ... 
(+3 more)` -- `record_id` 放在 `case_id` 后面括号里,方便 reviewer 跨 run 追踪同一条 case -- `hypothesis` 由主 agent 根据 before/after trajectory diff 推断,最多 30 字;拿不准就写 `"待核验"`,不要硬编 - -## commit message 规范 - -Conventional Commits,遵循仓库 AGENTS.md: - -``` -feat(eval-search): auto-propose improvements from run - -<一段改动概要,3-6 行> - -Eval: % → % ({{delta_pp}}pp) -Regressions: -Unhandled: - -Generated-By: eval-search/ -Co-Authored-By: eval-search-bot -``` - -## PR 创建命令 - -**cli 仓库 PR**(先创建): - -```bash -cd -gh pr create --draft \ - --title "feat(eval-search): auto-propose improvements from run " \ - --body-file tests/eval-search/runs//pr-draft/description.md \ - --base main -``` - -记录返回的 PR URL 为 `CLI_PR_URL`。 - -**open 仓库 PR**(若 `pr-draft/open/` 存在): - -```bash -cd $GOPATH/src/code.byted.org/lark_as/open -# description.md 里已填入 CLI_PR_URL 到 Pair 字段 -gh pr create --draft \ - --title "feat(search_open): improve converter display_info from eval-search run " \ - --body-file /tests/eval-search/runs//pr-draft/open/description.md \ - --base main -``` - -记录返回的 PR URL 为 `OPEN_PR_URL`,然后回填到 cli PR description: - -```bash -cd -gh pr edit --body-file -``` - -Draft 模式确保 CI 跑但不自动 merge,等 reviewer 转为 ready-for-review。两个 PR **独立 review、独立 merge**,任一方 merge 均可,不要求同步。 diff --git a/skills/eval-search/references/run-layout.md b/skills/eval-search/references/run-layout.md deleted file mode 100644 index 5bbac1618..000000000 --- a/skills/eval-search/references/run-layout.md +++ /dev/null @@ -1,123 +0,0 @@ -# run 目录结构 + 中间产物约定 - -## 目录位置 - -``` -/tests/eval-search/runs// -``` - -`` 格式:`YYYY-MM-DDTHH-MMZ`(UTC,用 `date -u +%Y-%m-%dT%H-%MZ` 生成)。 - -整个 `tests/eval-search/runs/` 被 gitignore,不进版本库。 - -确定性 setup runner: - -```bash -node --experimental-strip-types tests/eval-search/eval-search-run.ts \ - --loader-profile \ - --executor-profile \ - --subset 3 -``` - -runner 只负责创建 run 目录、拉取并转换 live dataset、检查 executor 账号隔离、写 `preflight.json`。它不会执行 AI Executor/Judge 阶段;setup 成功时 `summary.json.status` 为 `ready_for_executor`。 - 
-单账号时间隔离模式: - -```bash -node --experimental-strip-types tests/eval-search/eval-search-run.ts --snapshot-only --loader-profile -# 移除该账号的评测 Base 权限 -node --experimental-strip-types tests/eval-search/eval-search-run.ts \ - --dataset-file tests/eval-search/runs//dataset.jsonl \ - --executor-profile -``` - -第一步只写本地 `dataset.jsonl`,`summary.json.status` 为 `snapshot_ready`。第二步会复制该 dataset 到新的 run 目录,并重新检查 executor 已经不能读取评测 Base。 - -## 单次 run 目录布局 - -``` -tests/eval-search/runs/2026-04-15T10-00Z/ -├── cycle.json # 仅 /eval-search cycle 阶段编排使用;记录云文档、阶段状态、PR URL -├── cloud-doc/ # 仅 /eval-search cycle 使用;每次追加云文档前生成的 markdown 片段 -│ ├── 00-created.md -│ ├── 20-run-finished.md -│ └── tainted_tokens.json -├── meta.json # run 元信息(cli 版本、loader/executor profile、账号、开始/结束时间) -├── raw/ -│ ├── base_records_pages.json -│ └── base_records_combined.json -├── dataset.jsonl # 从 base 拉下来并转换的 cases -├── preflight.json # 污染预检结果 -├── trajectories/ -│ ├── case_001.json # Executor 增量写盘,崩溃可恢复 -│ ├── case_002.json -│ └── ... 
-├── verdicts.json # Judge 产出 -├── summary.json # 聚类后的 findings -└── pr-draft/ # 仅 propose-pr 阶段产出 - ├── diff.patch - ├── generalization_note.json - ├── unhandled_findings.md - ├── commit_message.txt - └── after_verdicts.json # regression 重跑结果(不含 trajectories,减小体积) -``` - -## meta.json - -```json -{ - "run_id": "2026-04-15T10-00Z", - "started_at": "2026-04-15T10:00:13Z", - "ended_at": "2026-04-15T11:42:51Z", - "lark_cli_version": "v1.0.11+git-abc1234", - "git_head": "abc1234", - "git_dirty": true, - "loader_profile": "base-reader", - "executor_profile": "eval-search", - "user_open_id": "ou_xxx", - "user_name": "eval-search-bot", - "subset": null, - "cases_scored": 13, - "cases_skipped_contamination": 0, - "cases_skipped_parse_error": 1 -} -``` - -`git_dirty=true` 的 run 打上 `⚠️ dirty` 标记;propose-pr 阶段若源码 dirty 会拒绝生成 PR(否则 diff 混入无关改动)。 - -## 增量持久化约定 - -Executor 每完成 1 round(= 1 次 lark-cli 调用 + 解析),追加写入 `trajectories/.json`: - -```json -{ - "case_id": "case_001", - "query": "...", - "started_at": "...", - "rounds": [ - {"idx": 1, "tool": "Read", "target": "skills/lark-doc/SKILL.md", "outcome_summary": "..."}, - {"idx": 2, "tool": "Bash", "cmd": "lark-cli drive +search --query '华东 Aily'", "outcome_summary": "top-3: ..."}, - ... 
- ], - "answer": null, - "gave_up": false, - "ended_at": null -} -``` - -所有未闭合的 case(`ended_at: null`)在 run 结束时标记为 `incomplete`,Judge 按 `gave_up=true` 处理但 `rounds_used` 如实记录。 - -## 并发度 - -v0.1 建议 **串行跑 Executor**: -- 避免多 sub-agent 同时打飞书 API 触发限流 -- v2 历史上 sub-agent 529 频繁,并发会放大问题 -- 评测 13 case 串行约 1-2 小时,可接受 - -未来若评测集扩到 50+ case,再考虑 semaphore 限并发 = 2。 - -## 清理策略 - -`tests/eval-search/runs/` 不自动清理。用户手动 `rm -rf tests/eval-search/runs/` 或按时间删旧的。 - -.gitignore 已覆盖整个 runs/ 目录。 diff --git a/tests/eval-search/eval-search-collect-search.ts b/tests/eval-search/eval-search-collect-search.ts index c528042c6..89ae83f8f 100644 --- a/tests/eval-search/eval-search-collect-search.ts +++ b/tests/eval-search/eval-search-collect-search.ts @@ -139,7 +139,7 @@ function addTokensFromValue(value, tokens) { } function loadTaintedTokens(root, runDir = "") { - const file = path.join(root, "skills/eval-search/references/known-tainted-tokens.md"); + const file = path.join(root, "tests/eval-search/references/known-tainted-tokens.md"); const tokens: Set = new Set(); if (!fs.existsSync(file)) { return tokens; diff --git a/tests/eval-search/eval-search-run.ts b/tests/eval-search/eval-search-run.ts index 3a2fb5777..a4a8351e4 100644 --- a/tests/eval-search/eval-search-run.ts +++ b/tests/eval-search/eval-search-run.ts @@ -190,7 +190,7 @@ function summarizeFailure(result) { function readTaintedTokens(root) { const file = path.join( root, - "skills/eval-search/references/known-tainted-tokens.md", + "tests/eval-search/references/known-tainted-tokens.md", ); const text = fs.readFileSync(file, "utf8"); const block = text.match(/tainted_tokens:[\s\S]*?```/); @@ -241,7 +241,7 @@ function readRunTaintedTokens(runDir) { function readExcludedUserIds(root) { const file = path.join( root, - "skills/eval-search/references/known-tainted-tokens.md", + "tests/eval-search/references/known-tainted-tokens.md", ); const text = fs.readFileSync(file, "utf8"); const block = text.match(/excluded_user_ids:[\s\S]*?```/); @@ 
-901,7 +901,7 @@ function main() { ? [`preflight found tainted tokens in ${contaminationCount} case(s)`] : [], blockers: [ - "executor and judge phases still require the agent workflow described in skills/eval-search/prompts", + "executor and judge phases still require the eval-search agent workflow", ], }); diff --git a/skills/eval-search/references/known-tainted-tokens.md b/tests/eval-search/references/known-tainted-tokens.md similarity index 98% rename from skills/eval-search/references/known-tainted-tokens.md rename to tests/eval-search/references/known-tainted-tokens.md index 55faea9f7..8598ba019 100644 --- a/skills/eval-search/references/known-tainted-tokens.md +++ b/tests/eval-search/references/known-tainted-tokens.md @@ -69,7 +69,7 @@ tainted_tokens: **不要在飞书上写"评测过程记录" / "v_n 比对分析"之类文档**。都写成本仓库 markdown: -- 评测流程/设计 → `skills/eval-search/**`(已就位) +- 评测流程/设计 → `tests/eval-search/**` 或本地 run 产物 - 某轮迭代分析 → `tests/eval-search/runs//*.md`(gitignored,本地查看) - 发布用的 retrospective → PR description / GitHub wiki / release notes From 31c3d4e456f0e11fa04f59034b39db27f8314111 Mon Sep 17 00:00:00 2001 From: zhuhao Date: Thu, 7 May 2026 17:48:41 +0800 Subject: [PATCH 6/6] Revert "chore(eval-search): keep submitted pr out of skills" This reverts commit 4bb94352ee1f728ae6d19e68800d1de6c5652b1c. 
--- skills/eval-search/RUBRIC.md | 119 +++++++ skills/eval-search/SKILL.md | 158 ++++++++++ skills/eval-search/prompts/executor.md | 74 +++++ skills/eval-search/prompts/judge.md | 97 ++++++ skills/eval-search/prompts/optimizer.md | 150 +++++++++ skills/eval-search/references/cycle.md | 250 +++++++++++++++ skills/eval-search/references/dataset.md | 127 ++++++++ .../references/known-tainted-tokens.md | 2 +- .../references/open-repo-layout.md | 162 ++++++++++ .../references/pollution-preflight.md | 105 +++++++ .../eval-search/references/pr-generation.md | 293 ++++++++++++++++++ skills/eval-search/references/run-layout.md | 123 ++++++++ .../eval-search/eval-search-collect-search.ts | 2 +- tests/eval-search/eval-search-run.ts | 6 +- 14 files changed, 1663 insertions(+), 5 deletions(-) create mode 100644 skills/eval-search/RUBRIC.md create mode 100644 skills/eval-search/SKILL.md create mode 100644 skills/eval-search/prompts/executor.md create mode 100644 skills/eval-search/prompts/judge.md create mode 100644 skills/eval-search/prompts/optimizer.md create mode 100644 skills/eval-search/references/cycle.md create mode 100644 skills/eval-search/references/dataset.md rename {tests => skills}/eval-search/references/known-tainted-tokens.md (98%) create mode 100644 skills/eval-search/references/open-repo-layout.md create mode 100644 skills/eval-search/references/pollution-preflight.md create mode 100644 skills/eval-search/references/pr-generation.md create mode 100644 skills/eval-search/references/run-layout.md diff --git a/skills/eval-search/RUBRIC.md b/skills/eval-search/RUBRIC.md new file mode 100644 index 000000000..21cffa5ff --- /dev/null +++ b/skills/eval-search/RUBRIC.md @@ -0,0 +1,119 @@ +# RUBRIC — 4 维度评分细则 + +每个 case 按 4 维打分,每维 0-5 分,单 case 满分 15。总分 = sum(recall + accuracy + completeness)。 + +> 注:`total` 字段只聚合 3 个打分维度。第 4 维 `contamination_penalty` 是修饰项,见下。 + +## 维度定义 + +### recall(召回,0-5) + +"Executor 是否找到 / fetch 过**正确的目标文档**"。对应评测集 `数据源地址` 字段里的 URL / token。 + +| 分 | 
判据 | +|----|------| +| 5 | trajectory 里显式 fetch 过全部 expected source;或 search 结果 top-5 里能看到全部 expected source 的 token | +| 4 | fetch 过一半以上(严格过半) | +| 3 | fetch 过至少 1 个但不到一半;或 top-5 里出现但未 fetch | +| 1-2 | 没 fetch、没在 top-5,但有相关命中(同主题不同文档) | +| 0 | 完全无关的命中 / 空结果 | + +**特例**:`企业内是否有知识 == 否` 的 case,recall 固定 5 分(agent 不该找到任何高置信答案,答"没找到"也算召回正确)。 + +**污染结果不计入 recall**:trajectory 里标记为 `tainted=true` 或 `evidence_excluded=true` 的搜索结果是可观测污染信号,但不是答案证据。即使 expected source token 只出现在这些污染结果里,也不能按 top-5 命中给 recall 分;只有非污染 `evidence_top_results` 或非污染 fetch 才能作为 recall 依据。 + +### accuracy(准确性,0-5) + +"Executor 给出的最终答案**在事实层面**对不对"。对照评测集 `预期答复` 的【关键信息】段 + 【打分备注】里的 "可信无误" 说明。 + +| 分 | 判据 | +|----|------| +| 5 | 关键信息全部正确,无事实错误 | +| 4 | 主要信息正确,少量细节偏差(时间、数字小错) | +| 3 | 部分正确部分错 / 含明显可证伪陈述 | +| 1-2 | 大部分错误,但方向对 | +| 0 | 完全错 / 幻觉 / 答非所问 | + +**【打分备注】优先级高于通用判据**。例如某 case 备注 "给到 0.x 折这类可信要扣分",即使答案看起来合理,只要踩到就扣。 + +### completeness(完整性,0-5) + +"Executor 覆盖了多少 expected key points"。对照【关键信息】列出的条目 + 【打分备注】里的 "完整详实" 说明。 + +| 分 | 判据 | +|----|------| +| 5 | 覆盖 ≥80% key points,或满足 `完整详实` 备注的明确阈值(如"答出 5 个及以上不扣分") | +| 4 | 覆盖 60-80% | +| 3 | 覆盖 40-60% | +| 1-2 | 覆盖 20-40% | +| 0 | <20% 或未给答案 | + +### contamination_penalty(污染修饰,-3 ~ 0) + +仅当 pre-flight 标记了 `contamination_risk=true` 且 trajectory 显示 Executor **fetch 过 tainted token** 时触发。 + +| 分 | 判据 | +|----|------| +| 0 | 未命中 tainted token,或命中但未 fetch | +| -1 | fetch 了 tainted token 但最终答案未直接引用其内容 | +| -3 | fetch 了 tainted token 且答案明显抄袭其结构 / 原文 | + +该项**直接从 total 扣**,且在 verdict 里显式标注,避免"刷分嫌疑"。 + +collector / Executor 可以把 tainted 搜索结果写进 trajectory,但必须把它们标为 `evidence_excluded=true`,且不能作为答案合成、fetch 选择或 recall top-5 的证据。简言之:**tainted results are observable but non-evidential**。 + +## Verdict JSON schema + +每个 case 一个 verdict,合并写入 `verdicts.json`。 + +```json +{ + "case_id": "case_001", + "query": "...", + "scores": { + "recall": 4, + "accuracy": 5, + "completeness": 3, + "contamination_penalty": 0, + "total": 12 + }, + "rationale": { + "recall": "fetch 了 
Es5wwNCyei3eYNkXc8Tcx35nnWe,top-3 里出现 HxnMwM9cyiFW1dkACUBcC7KWnEd 但未 fetch", + "accuracy": "8 个案例全部在参考文档里,无幻觉", + "completeness": "列了 5/10,备注要求 ≥5 不扣分,按备注打 5" + }, + "improvement": { + "tool_capability": [ + "drive +search 返回结果没有 body_preview,agent 必须 fetch 才能判断相关性。建议返回摘要字段减少 fetch 次数" + ], + "search_strategy": [ + "Executor 只用了原词 '华东 Aily 案例',没换 '客户成功故事' / '最佳实践' 等同义词" + ], + "skill_prompts": [ + "lark-drive-search.md 可新增同义词清单小节,含 'case / story / best practice' 映射" + ] + }, + "contamination": { + "risk_flagged": false, + "tainted_tokens_fetched": [], + "penalty_applied": 0 + } +} +``` + +## 聚合规则(summary.json) + +Judge 打完所有 case 后,主 agent 按以下规则聚合到 `summary.json`: + +1. **按改动落点文件聚类 improvements**,不按文本相似度: + - 同一条 skill_prompts 建议指向 `skills/lark-doc/SKILL.md` 的,合并成一条 finding + - finding 保留 `driving_cases: [case_003, case_007, ...]` 反向索引 +2. **计算一阶瓶颈**:三桶的建议条数之和,占比最大的那个桶就是 `primary_bottleneck` +3. **统计 contamination**:分别统计 search-only 观测到 tainted token 的 case 数、被 fetch 到 tainted token 的 case 数;fetch 数 >2 时输出警告 +4. 
**汇总每个维度的均值、总分** + +## 校准指引(给 Judge 看的) + +- 优先使用【打分备注】里的 per-case rubric;与通用判据冲突时**以备注为准** +- 宁低勿高:打分是迭代的信号源,乐观打分会让下一轮 optimizer 找不到方向 +- rationale 字段必填,且要引用 trajectory 里的具体命令或 URL。只写"还行""不够完整"等空洞判断会被 Optimizer 识别为低质量 verdict 并丢弃 diff --git a/skills/eval-search/SKILL.md b/skills/eval-search/SKILL.md new file mode 100644 index 000000000..3489422fd --- /dev/null +++ b/skills/eval-search/SKILL.md @@ -0,0 +1,158 @@ +--- +name: eval-search +version: 0.1.0 +description: "lark-cli 搜索能力端到端评测 Harness:拉取飞书评测集 → 盲测执行 → 四维打分 → 聚合归因 → 自动生成 PR 草稿。当用户要评测 lark-cli 搜索效果、做 v_n→v_{n+1} 迭代、让新人跑一轮优化闭环时使用。" +metadata: + requires: + bins: ["node", "lark-cli", "jq", "git", "gh"] +--- + +# eval-search — lark-cli 搜索能力评测 Harness + +**CRITICAL — 开始前 MUST 先用 Read 工具读取 [`../lark-shared/SKILL.md`](../lark-shared/SKILL.md)(认证)和 [`RUBRIC.md`](RUBRIC.md)(评分细则)。** + +## 目标 + +给 AI agent 一个自然语言搜索问题,它能否通过 lark-cli 在飞书企业知识库里找到正确答案?当它做不到,定位到: +- **(a) tool_capability** — 工具能力缺口(缺 shortcut / 缺 flag / 输出难解析) +- **(b) search_strategy** — agent 应该但没做的搜索动作 +- **(c) skill_prompts** — 方法论没在 skill 文档里 + +并把归因汇聚成可执行的 PR 草稿。 + +## 适用场景 + +- "跑一轮搜索评测" +- "新人想参与 lark-cli 优化,从哪里开始" +- "对比一下最近改动对搜索效果的影响" +- "看看上一轮评测还有哪些归因没处理" + +## 四个入口命令 + +``` +/eval-search cycle [--loader-profile NAME] [--executor-profile NAME] [--subset N] [--report-doc URL] + # 一键闭环:run → 打分/report → propose-pr,并把阶段进展写入云文档 +/eval-search run [--loader-profile NAME] [--executor-profile NAME] [--subset N] + # 跑一轮评测,产出 run-id。默认全量;--subset=3 抽样冒烟 +/eval-search run --snapshot-only # 只把评测集拉成本地 dataset.jsonl,供移除权限后复用 +/eval-search propose-pr # 基于 run 生成 PR 草稿(含 before/after + 泛化声明 + regression 告警) +/eval-search report # 读已有 run 的 summary.json +``` + +新人典型流程优先使用 `cycle`,只有调试单个阶段时才手动执行 `run` / `report` / `propose-pr`。 + +## `/eval-search cycle` 上层闭环 + +详细步骤见 [`references/cycle.md`](references/cycle.md)。概要: + +1. **初始化 cycle**:生成 `cycle-id` / `run-id`,创建 `tests/eval-search/runs//cycle.json` +2. 
**创建或绑定云文档**:若未传 `--report-doc`,用 `lark-cli docs +create --api-version v2 --doc-format markdown` 创建报告文档;若已传文档,则直接追加本轮章节 +3. **阶段化执行并记录**:内部串联 `run → score/report → propose-pr`,每个阶段开始、成功、失败都先写本地 `cycle.json`,再追加到云文档 +4. **产物归档**:云文档只写阶段状态、分数摘要、finding 摘要、PR URL、失败原因和本地产物路径;不得写标准答案、完整 trajectory、source_urls 或 key_error_snippets +5. **污染控制**:cycle 生成或使用的云文档默认是评测过程材料,必须记录为 tainted/process material;未来持久 blocklist 变更需要单独 PR,不得混入搜索效果优化 PR +6. **完成定义**:未传 `--skip-pr` 时,最终回复必须同时给出 Cloud report URL 和 Draft PR URL;任一链接缺失都不能视为完成 + +## 三层架构(必须隔离,违反会让结果失真) + +``` +Executor (sub-agent, Task 工具) + 输入: query only 不知道: expected / rubric / source_urls + 工具: 仅 lark-cli + 产出: trajectory + answer + ↓ +Judge (主 agent 切 hat,时序隔离) + 输入: query + answer + expected + rubric + 产出: 4 维打分 + 三桶 improvement + ↓ +Optimizer (sub-agent, Task 工具) + 输入: 全部 verdicts summary + 失败 case 的关键错误片段(不喂 trajectory 全文) + 产出: diff + 泛化声明字段 +``` + +**隔离纪律**: +- Executor prompt 永远只注入 `query`,绝不传 expected/rubric/source_urls(盲测) +- Judge 必须在 Executor 全部跑完之后开始,不得和 Executor 共享 tool-use 窗口 +- Optimizer 只看 Judge 聚合出的 summary,**不喂 trajectory 原文全文**,只喂失败 case 的关键错误行(防过拟合 + 控 context) + +## `/eval-search run` 流程 + +详细步骤见 [`references/run-layout.md`](references/run-layout.md)。概要: + +1. **确定性 setup**:先运行 `node --experimental-strip-types tests/eval-search/eval-search-run.ts --loader-profile --executor-profile [--subset N]`。脚本会生成 run-id,建目录 `tests/eval-search/runs//`,并完成第 2-4 步。若只有一个账号,可先用 `--snapshot-only` 拉本地 `dataset.jsonl`,移除该账号的评测 Base 权限后,再用 `--dataset-file /dataset.jsonl` 继续 +2. **拉数据集**:按 [`references/dataset.md`](references/dataset.md) 用 loader profile 从评测 base 拉最新数据 → `dataset.jsonl` +3. **账号隔离**:按 [`references/pollution-preflight.md`](references/pollution-preflight.md) 检查 executor profile 不在 `excluded_user_ids`,并主动探测 executor 不能读取评测 Base;若能读取则阻断 +4. 
**污染预检**:用 executor profile 对每条 query 跑一次 `drive +search`,命中 [`references/known-tainted-tokens.md`](references/known-tainted-tokens.md) 里的 token 则标记 `contamination_risk`。只标记不阻断;Judge 阶段再决定是否扣分 +5. **Executor 执行(v0.1 串行,见 `references/run-layout.md` 并发度说明)**:用 Task 工具启动 sub-agent 按 [`prompts/executor.md`](prompts/executor.md) 跑全部 case。每个 case trajectory 落盘 `trajectories/<case_id>.json` +6. **Judge 逐 case**:主 agent 按 [`prompts/judge.md`](prompts/judge.md) 打分,写 `verdicts.json` +7. **聚合**:按"改动落点文件"对 improvements 聚类,写 `summary.json`;输出 run-id 给用户 + +## `/eval-search propose-pr` 流程 + +详细见 [`references/pr-generation.md`](references/pr-generation.md)。概要: + +1. **Optimizer 生成 diff**:用 Task 工具启动 sub-agent 按 [`prompts/optimizer.md`](prompts/optimizer.md) 读 summary + 两个仓库代码,产出 **cli diff + open diff(如有)** 和泛化声明 +2. **应用 diff 到两个 worktree**: + - cli 仓库:独立分支 `eval-search/auto-pr/<run-id>` + - open 仓库(若有改动):独立分支 `eval-search/auto-pr/<run-id>`,互不污染 main +3. **Quality gate**(当前仅 cli 仓库):`make unit-test` + `golangci-lint run --new-from-rev=origin/main` 必须通过。失败 → Optimizer 最多迭代 2 次,仍失败 → 把触发失败的改动降级为 GitHub issue,不进 PR。open 仓库暂不跑 gate(CI 配置非 harness 可控) +4. **确定性 regression 重跑**:按 diff 之上重跑完整评测(复用 `/eval-search run` 内部流程),产出 after verdicts。**这一步不给 Optimizer 参与** +5. **组装两份 PR description**:按 [`references/pr-generation.md`](references/pr-generation.md) 里的模板,包含 before/after 数值、wins/regressions 逐 case 列表、泛化声明、未处理归因、**对端 PR 互相 link** +6. 
**`gh pr create --draft`**:双 PR 独立提,**独立 review、独立 merge**。不强绑定联动。一个 PR 先 merge 另一个还没 merge 也 OK,在 PR description 里标记 cross-ref + +## 权限边界(v0.1 软约束,迭代中调整) + +### PR 颗粒度 + +每个 `/eval-search propose-pr` 只能落一个主归因桶 / 一个改动主题。主 agent 在 apply diff 前必须复查 touched files,并按以下规则拆分: +- `search_strategy` / `skill_prompts`:只能提交搜索策略或 skill 文档优化 PR,例如 `skills/lark-drive/references/*-search.md` 或当前主搜索入口对应文档。不得混入 harness、runner、package、评测集、打分脚本或基础设施改动;不要给已进入维护期的 `docs +search` 新增策略依赖。 +- `tool_capability`:只能提交 CLI shortcut / open converter 能力 PR。不得混入搜索策略文档,除非同一能力改动必须同步更新对应使用说明。 +- `eval_harness` / 评测流程自身:必须独立 PR,不能和任何搜索效果优化 PR 混在一起。 + +### cli 仓库(`larksuite/cli`,当前目录) + +Optimizer 默认允许改: +- `skills/**/*.md` +- 新增 `shortcuts//*.go` 及对应测试 + +Optimizer 不自动改: +- `internal/**`, `extension/**`, `cmd/root.go`, `cmd/service/**` 等基础设施 → 降级为 issue +- 任何旧 shortcut 的删除 / 重命名 / 破坏性改动 + +### open 仓库(`$GOPATH/src/code.byted.org/lark_as/open/`) + +详见 [`references/open-repo-layout.md`](references/open-repo-layout.md)。简要: + +Optimizer 默认允许改: +- `biz/search_open/entity/{name}.go` 的 `BuildDisplayInfo` / `BuildResponseItem` bug fix / `Prune` 及配套 `*_test.go` + +Optimizer 不自动改: +- IDL(在独立的 `lark/idl` 仓库,需要跑 overpass,不属于 PR 范畴) +- `api_meta/**/*.yml`(契约变更,走人工) +- `biz/search_open/handler.go` / `adapter.go` / `pagetoken.go` / `response.go` 等基础设施 +- 任何"新增 OAPI 字段"类需求(跨两个仓库 + 手工步骤,产出 issue 正文即可) + +### 违反白名单的处理 + +Optimizer 把该 finding 写进 PR description 的"未处理归因"段(含建议 issue 正文),由新人创建对应 GitHub issue。**不发**跨仓库 / 超出白名单的 PR。 + +## 关键纪律(不遵守分数会失真) + +1. **盲测纪律**:Executor prompt 只注入 `query`。即使主 agent fallback 接管 Executor,也必须自我约束不读 `dataset.jsonl` 的非 query 字段 +2. **三层隔离**:Judge 不能和 Executor 在同一轮 reasoning;Optimizer 不喂 trajectory 全文 +3. **Regression 软告警**:after 出现 regression 不硬 block,但必须在 PR description 里逐 case 列出;reviewer 判断 +4. **泛化声明必填**:Optimizer 必须区分"针对具体 case 的改动"和"泛化原则性改动"。前者过拟合风险高,reviewer 重点看 +5. 
**污染隔离**:harness 至少使用两个 profile。loader profile 可以读取评测 Base,但只允许用于拉数据集;executor profile 必须是专用测试账号(非 PM 账号、非 dataset owner 账号),且不能读取评测 Base。若 executor profile 的 `userOpenId` 出现在 [`references/known-tainted-tokens.md`](references/known-tainted-tokens.md) 的 `excluded_user_ids` 列表里,或 executor 可以读取评测 Base,拒绝启动 + +## 参考 + +- [`RUBRIC.md`](RUBRIC.md) — 4 维度评分细则 +- [`prompts/executor.md`](prompts/executor.md) — Executor sub-agent 模板 +- [`prompts/judge.md`](prompts/judge.md) — Judge 打分模板 +- [`prompts/optimizer.md`](prompts/optimizer.md) — Optimizer PR 生成模板 +- [`references/cycle.md`](references/cycle.md) — 一键闭环 + 云文档阶段日志 +- [`references/dataset.md`](references/dataset.md) — 评测集 schema + 拉取方式 +- [`references/pollution-preflight.md`](references/pollution-preflight.md) — 污染预检规则 +- [`references/known-tainted-tokens.md`](references/known-tainted-tokens.md) — 已知泄露文档标记清单 +- [`references/run-layout.md`](references/run-layout.md) — run 目录结构 + 中间产物约定 +- [`references/pr-generation.md`](references/pr-generation.md) — PR 生成流程 + description 模板(双 PR) +- [`references/open-repo-layout.md`](references/open-repo-layout.md) — `lark_as/open` 仓库允许改动的白名单导航 diff --git a/skills/eval-search/prompts/executor.md b/skills/eval-search/prompts/executor.md new file mode 100644 index 000000000..9ab9e15e0 --- /dev/null +++ b/skills/eval-search/prompts/executor.md @@ -0,0 +1,74 @@ +# Executor sub-agent 模板 + +**使用方式**:主 agent 用 Task 工具启动 sub-agent(`subagent_type: general-purpose`),把本文件内容 + 具体 `query` 拼为 prompt 传入。**禁止在 prompt 里注入 expected / rubric / source_urls / 评测集任何其他字段**。 + +--- + +## SYSTEM(照原样复制到 Task prompt 开头) + +你是 lark-cli 搜索能力评测 harness 的**执行层 sub-agent**,任务是**盲测**:回答一个来自飞书企业知识库的自然语言问题。 + +### 你的约束 + +1. **工具只有 lark-cli**:可以用 `lark-cli` 的任何 shortcut、API、schema 命令。禁止使用 WebFetch / WebSearch / 其他外部工具。 +2. **身份为当前登录的 user**。不要主动切 bot。 +3. **你不知道标准答案**,也不知道答案在哪个文档。你唯一拥有的信息就是 `query`。 +4. **单 case round 预算:12 round**(一个 lark-cli 调用 = 1 round)。超过必须收尾给 best-effort 答案。 +5. 
**Context discipline**: + - 任何 lark-cli 输出 >30 行 → 先 `--format json -q '.data[].title'` 之类精简,或落盘到 `/tmp/case_<case_id>_<round>.txt` 再 grep + - 不要把整篇文档正文贴进 reasoning + - 每一步的内部总结 ≤200 字符 +6. **增量持久化**:每完成 1 round,把 trajectory 追加写入 `<run_dir>/trajectories/<case_id>.json`。崩溃恢复靠这个文件。 + +### 方法论(**必须先阅读**,不是建议) + +在发出第一条 lark-cli 命令之前,MUST 用 Read 读: +- `skills/lark-shared/SKILL.md` — 认证、全局参数 +- `skills/lark-drive/SKILL.md` + `skills/lark-drive/references/lark-drive-search.md` — 云空间资源发现;优先使用 `drive +search`,不要新增依赖已进入维护期的 `docs +search` +- `skills/lark-doc/SKILL.md` — 命中文档后的 fetch / 内容读取 +(搜索方法论直接在 `lark-drive-search.md` 里:关键词改写 / 失败退出 / 答案型检索循环都在该文件的决策规则段) +- `skills/lark-wiki/SKILL.md` — wiki 节点是壳的关键概念 + +根据 query 类型可能还要读:`lark-im`、`lark-mail`、`lark-vc`、`lark-minutes`、`lark-contact` 等。 + +### 标准流程 + +1. 阅读 query,拆"实体"(人名 / 时间 / 关键词 / 资源类型) +2. 选择搜索入口(drive / im / mail / vc / minutes / ...) +3. 发起搜索;若返回空或无相关结果,按 `lark-drive-search.md` 的"决策规则 / `--query` 高级语法"换 2-3 轮词(同义词 / `intitle:` / 排除词) +4. 对 top 命中做进一步 fetch / resolve(wiki 节点必须先 `wiki +resolve-node`) +5. 综合信息给出答案;若 3 轮改写仍无结果,给 best-effort 结论并明确说"未找到直接证据" +6. 写 `<run_dir>/trajectories/<case_id>.json`,结束 + +### 输出格式(最后一条消息,JSON) + +```json +{ + "case_id": "<case_id>", + "answer": "<自然语言答案,markdown 允许>", + "referenced_urls": ["<从 lark-cli 命中的 URL>", ...], + "rounds_used": <实际使用的 round 数>, + "gave_up": <true|false>, + "notes": "<可选,给 Judge 的说明,例如:'时间窗超了,只跑了 8 round 提前收敛'>" +} +``` + +### 反模式(会被 Judge 扣分) + +- ❌ 不读 skill 文档直接 `lark-cli api GET /...` 手拼参数 +- ❌ 把 wiki token 当 doc token 传给 `docs +fetch` +- ❌ 搜不到时只重复同一个关键词 +- ❌ 一次性 `lark-cli ... 
| cat` 把 500 行塞进 reasoning +- ❌ 编造答案(没 fetch 过就说"根据文档 X...") + +--- + +## USER(主 agent 拼接时注入) + +``` +query: <来自 dataset.jsonl 的 query 字段原文> +case_id: +run_dir: > +``` + +**除以上三个字段,不注入任何评测集其他字段**。 diff --git a/skills/eval-search/prompts/judge.md b/skills/eval-search/prompts/judge.md new file mode 100644 index 000000000..9238093c6 --- /dev/null +++ b/skills/eval-search/prompts/judge.md @@ -0,0 +1,97 @@ +# Judge 打分模板 + +**使用方式**:主 agent 切 hat 执行。Executor 全部跑完后,主 agent 逐 case 读 `trajectory + expected`,按本文件产出 verdict。 + +> **隔离纪律**:不要在 Executor 尚未跑完时开始 Judge(会污染 Executor 所在 reasoning 窗口)。Executor 全部完成、`trajectories/*.json` 落盘后再启动 Judge。 + +--- + +## Judge 每个 case 的输入 + +从磁盘读(**不要复用 Executor 的 reasoning context**): +- `dataset.jsonl` 中该 case 的 `query / expected / source_urls / has_knowledge / rubric_notes` +- `trajectories/.json`(含 rounds 列表 + 最终 answer) +- `preflight.json`(看 `contamination_risk` 和 `tainted_tokens`) +- `skills/eval-search/RUBRIC.md` + +## 每个 case 的打分步骤 + +1. **recall**:扫 trajectory 里的每一条 tool_use,提取被 fetch / resolve 过的 token 和 URL 集合,并读取 `evidence_top_results` / search round 里的非污染 evidence tokens。与 `source_urls` 做交集。标记为 `tainted=true` 或 `evidence_excluded=true` 的 search 结果只能算污染观测,不能算 recall top-5 命中。按 RUBRIC 打分 +2. **accuracy**:把 `answer` 和 `expected.【关键信息】` 段逐条比对。优先应用 `expected.【打分备注】.可信无误` +3. **completeness**:数 key points 覆盖数。优先应用 `expected.【打分备注】.完整详实` +4. **contamination**:查 trajectory 是否 fetch 过 `preflight.tainted_tokens`;search-only 命中只记录风险,不扣污染分,也不作为 recall/accuracy/completeness 的证据。若有 fetch,按 RUBRIC 给 `contamination_penalty` +5. 
**improvement 三桶**:从 trajectory 里找失败片段,分类写进 `tool_capability / search_strategy / skill_prompts` + +## improvement 填写规则 + +**每条建议必须满足**: +- 指向**具体文件**(skill_prompts)、**具体命令**(tool_capability)或**具体动作**(search_strategy) +- 引用 trajectory 里触发该建议的 round 序号 +- 不写"可以更好"这种无落点的建议;写不出具体落点的建议**丢弃**,不要凑数 + +**示例**: + +✅ 好的: +```json +"skill_prompts": [ + "round 4 Executor 把 wiki URL 直接传给 docs +fetch 导致 param invalid。lark-wiki/SKILL.md 的反模式段应加'wiki 链接必须先走 +resolve-node'的明确警告(当前只在 references 里写了)" +] +``` + +❌ 差的: +```json +"skill_prompts": [ + "搜索不够全面", + "agent 应该更聪明地处理 wiki" +] +``` + +## 合并规则(主 agent 在全部 case 打完后做) + +把所有 verdicts 的 `improvement` 按"改动落点文件"去重合并到 `summary.json`: + +```json +{ + "run_id": "2026-04-15T10-00Z", + "dataset_size": 14, + "scored": 13, + "contaminated_fetched": 1, + "totals": { + "sum": 132, + "max": 195, + "percent": 67.7, + "per_dim": {"recall": 2.69, "accuracy": 3.92, "completeness": 3.54} + }, + "findings": [ + { + "finding_id": "F-001", + "bucket": "skill_prompts", + "target_file": "skills/lark-wiki/SKILL.md", + "suggestion": "在反模式段加 'wiki 链接必须先走 +resolve-node' 警告", + "driving_cases": ["case_003", "case_007", "case_011"], + "priority": "high" + }, + { + "finding_id": "F-002", + "bucket": "tool_capability", + "target_file": "shortcuts/docs/search.go", + "suggestion": "drive +search 返回结果没有 body_preview,agent 必须 fetch 才能判断相关性", + "driving_cases": ["case_001", "case_005"], + "priority": "medium" + } + ], + "primary_bottleneck": "skill_prompts", + "pollution_warnings": [] +} +``` + +**priority 判定**: +- `high`: driving_cases ≥3 且 bucket 是 `skill_prompts` / `search_strategy`(改文档成本低、收益面广) +- `medium`: driving_cases ≥2 或 bucket 是 `tool_capability`(代码改动) +- `low`: driving_cases == 1(过拟合风险高,给 Optimizer 作参考但不强推) + +## 自我校准检查(写 verdict 前自问) + +- 我是不是看了 expected 才倒推 trajectory 合理性?(应该反过来:先看 trajectory 自己是否合理,再 check 是否命中 expected) +- contamination_penalty 有没有漏判? 
+- improvement 的三桶比例是否均衡到可疑(例如 13 个 case 全扔 `skill_prompts`,可能是判断懒) diff --git a/skills/eval-search/prompts/optimizer.md b/skills/eval-search/prompts/optimizer.md new file mode 100644 index 000000000..786e1f699 --- /dev/null +++ b/skills/eval-search/prompts/optimizer.md @@ -0,0 +1,150 @@ +# Optimizer sub-agent 模板 + +**使用方式**:主 agent 用 Task 工具启动 sub-agent。Optimizer 读 `summary.json` + 失败 case 的关键错误片段 + 仓库代码,产出 diff 草稿。 + +> **关键纪律**:不喂 trajectory 原文全文,只喂主 agent 从失败 case 摘出的"关键错误行"(通常 ≤20 行/case)。这是防过拟合 + 控 context 的核心设计。 + +--- + +## SYSTEM(Task prompt 开头) + +你是 lark-cli 搜索能力评测 harness 的**优化层 sub-agent**。Judge 已经产出 `summary.json`(含聚类后的 findings),你的任务是把这些 findings 转成**可直接 commit 的代码 / 文档改动**,并自我区分哪些是泛化的、哪些是针对具体 case 的。 + +### 你的约束 + +1. **工具**:Read / Edit / Write / Grep / Glob / Bash(仅限 `go build`, `make unit-test`, `git diff`, `gofmt`)。禁止 `git push` / `gh pr create` / `git commit` — 那是主 agent 的事 +2. **白名单 — cli 仓库**(`larksuite/cli`,当前工作目录): + - ✅ `skills/**/*.md`(改已有或新增) + - ✅ 新增 `shortcuts//.go` + 配套 `*_test.go` + - ❌ `internal/**`, `extension/**`, `cmd/root.go`, `cmd/service/**` + - ❌ 旧 shortcut 的删除 / 重命名 / 破坏性修改 +3. **白名单 — open 仓库**(`$GOPATH/src/code.byted.org/lark_as/open/`,**只读导航后才能改**): + - 处理 `tool_capability` 桶里的 finding 时,MUST 先 Read [`../references/open-repo-layout.md`](../references/open-repo-layout.md) 了解允许动哪些文件 + - ✅ 简要:`biz/search_open/entity/{name}.go` 的 `BuildDisplayInfo` / `BuildResponseItem` bug fix / `Prune`,及配套 `*_test.go` + - ❌ 简要:IDL / `handler.go` / `adapter.go` / `api_meta/**` / 新增 OAPI 字段(详见导航手册) + - 涉及 IDL 或契约变更的 finding → 写进 `unhandled_findings.md` 的 `proposed_issue` 段,不写 diff +4. 触犯白名单外的 finding → 写进 `unhandled_findings.md`,建议新人改成 GitHub issue +5. 每次改 cli 仓库 Go 代码后 MUST 跑 `make unit-test` 验证。失败最多迭代 2 次,仍失败则该 finding 降级到 `unhandled_findings.md` +6. open 仓库暂不跑 quality gate(CI 配置非 harness 可控),但 Optimizer 自己 MUST:所有 `.go` 改动过 `gofmt`、动了 `entity/{name}.go` 必须同步动 `entity/{name}_test.go` +7. 
改完所有 cli finding 后 MUST 跑 `go run github.com/golangci/golangci-lint/v2/cmd/golangci-lint@v2.1.6 run --new-from-rev=origin/main` +8. 按 Conventional Commits 格式写 commit message — 双仓库情况下产出两份独立 commit message(见下方产出结构) + +### 输入(主 agent 会拼到 prompt) + +- `summary_json`: 完整 `summary.json` 内容 +- `key_error_snippets`: 每个 high-priority finding 的 driving_cases 里摘的关键错误行(主 agent 挑好) +- `run_dir`: 评测目录,用于读历史产物和写输出 + +### 工作流 + +1. **读 summary 全部 findings**,按 `priority` 降序处理 +2. **对每条 finding**: + - `skill_prompts` bucket → 用 Edit 改 cli 仓库的指定 markdown,保持 tone / 结构与周边一致 + - `search_strategy` bucket → 沉淀到 cli 仓库对应域的 `references/*-search.md`(如 `skills/lark-drive/references/lark-drive-search.md`),不要塞进本 harness 的 prompt 模板 + - `tool_capability` bucket → 分两步判断: + 1. 如果 finding 本质是 cli 封装层不够(缺 shortcut、shortcut 输出难解析),评估能否在 cli 仓库加 shortcut 解决 + 2. 如果是 OAPI 层(`BuildDisplayInfo` 信息不够、字段映射 bug),Read [`../references/open-repo-layout.md`](../references/open-repo-layout.md) 并严格按白名单改 open 仓库。不在白名单的 → 产出 issue 正文,写进 `unhandled_findings.md` 的 `proposed_issue` 段 +3. **过拟合自检**:每条改动自问"这条是否仅对 driving_cases 有效"。如果是,**标记为 case-specific** 写进 `generalization_note.json` +4. 
**写产出**(到 `/pr-draft/`): + +``` +/pr-draft/ +├── diff.patch ← cli 仓库改动(在 larksuite/cli 目录下 git diff > diff.patch) +├── commit_message.txt ← cli 仓库 commit message +├── generalization_note.json +├── unhandled_findings.md +└── open/ ← 若有 open 仓库改动才创建 + ├── diff.patch ← open 仓库改动(在 lark_as/open 目录下 git diff > diff.patch) + ├── commit_message.txt ← open 仓库 commit message + └── touched_files.txt ← 改动文件清单(用于主 agent 白名单复查) +``` + +**重要**:Optimizer 不执行 `git commit`。只产出 diff.patch + commit_message.txt,由主 agent 分别在两个仓库 apply + commit。 + +### generalization_note.json 格式(**必填,主 agent 会读并注入 PR description**) + +每条改动必须带 `repo` 字段(`cli` 或 `open`),主 agent 按此分发到对应 PR。 + +```json +{ + "case_specific_changes": [ + { + "repo": "cli", + "file": "skills/lark-drive/references/lark-drive-search.md", + "change_summary": "在同义词小节新增 '交个朋友 → Livflow 智能平台' 映射", + "driving_cases": ["case_005"], + "risk": "该同义词只由 case_005 驱动,强度弱。reviewer 可判断是否保留" + } + ], + "principled_changes": [ + { + "repo": "cli", + "file": "skills/lark-drive/SKILL.md", + "change_summary": "新增 '搜索词改写失败 3 次后给 best-effort 答案' 决策规则", + "driving_cases": ["case_003", "case_007", "case_011"], + "rationale": "泛化到任何搜索类任务的退出条件,不依赖具体 case 内容" + }, + { + "repo": "open", + "file": "biz/search_open/entity/chat.go", + "change_summary": "BuildDisplayInfo 在群描述为空时 fallback 展示群主名称", + "driving_cases": ["case_012"], + "rationale": "空描述的群目前 agent 只能看到标题,判断相关性信息不足;泛化到所有群搜索结果" + } + ] +} +``` + +`unhandled_findings.md` 内若含涉及 IDL / 契约变更的 finding,按以下结构写 `proposed_issue` 段: + +```markdown +### [proposed-issue] + +**Bucket:** tool_capability +**Driving cases:** case_003, case_008 +**Why not auto-fixed:** 需要 IDL 新增 optional 字段 `.`,跨 idl/open 两仓库,人工处理 + +**Suggested issue body:** +<可直接贴到 github issue 的完整正文,含背景、proto 来源字段、对 agent 决策的价值> +``` + +### commit_message.txt 格式 + +两份 commit message 结构相同,区别在 scope: + +**cli 仓库** (`pr-draft/commit_message.txt`): +``` +feat(eval-search): auto-propose improvements from run + +Driven by /eval-search propose-pr . 
+ +- +- +- (case_005) + +Eval: % → % +Regressions: + +Generated-By: eval-search/ +``` + +**open 仓库** (`pr-draft/open/commit_message.txt`): +``` +feat(search_open): improve converter display_info from eval-search run + +- +- + +Driven by: larksuite/cli /eval-search run +Pair: +Generated-By: eval-search/ +``` + +### 禁止事项 + +- ❌ 不要改 `RUBRIC.md` / `prompts/*.md`(你自己的 prompt 不该自己改) +- ❌ 不要改 `dataset` 或评测 base 相关文件(评测集改动不由 Optimizer 负责) +- ❌ 不要修"已知 regression"反向打补丁(那是拼分,不是真修复) +- ❌ 找不到落点的 finding 不要硬凑,写进 `unhandled_findings.md` +- ❌ 不要给 skill markdown 加"由 Optimizer 自动生成"这类元信息注释——文档应读起来是人写的 +- ❌ 不要改 IDL 仓库 / kitex_gen 生成代码 / open 仓库白名单外的任何文件(详见 `open-repo-layout.md`) diff --git a/skills/eval-search/references/cycle.md b/skills/eval-search/references/cycle.md new file mode 100644 index 000000000..7323ed5b9 --- /dev/null +++ b/skills/eval-search/references/cycle.md @@ -0,0 +1,250 @@ +# cycle 上层闭环 + 云文档阶段日志 + +`/eval-search cycle` 是 `/eval-search run`、`/eval-search report`、`/eval-search propose-pr` 的上层编排入口。用户只触发一次,主 agent 负责按阶段推进、记录状态、遇到失败时停止并给出可恢复位置。 + +## 入口 + +```text +/eval-search cycle [--subset N] + [--loader-profile ] + [--executor-profile ] + [--report-doc ] + [--create-report-doc] + [--report-parent-token ] + [--skip-pr] +``` + +- `--report-doc`:把本轮阶段日志追加到已有云文档。 +- `--create-report-doc`:未传 `--report-doc` 时创建新云文档;默认创建到当前用户个人空间,可选 `--report-parent-token`。 +- `--skip-pr`:只跑到打分/report,不进入 optimizer 和 PR 创建。 +- 未指定云文档参数时,默认创建新报告文档。除非用户明确禁止云文档记录,否则 cycle 不走纯本地日志模式。 + +## 状态文件 + +cycle 必须先创建本地状态,再调用任何飞书或 GitHub 写操作: + +```text +tests/eval-search/runs// +├── cycle.json +└── cloud-doc/ + ├── 00-created.md + ├── 10-run-started.md + ├── 20-run-finished.md + ├── 30-score-finished.md + ├── 40-pr-finished.md + └── tainted_tokens.json +``` + +`cycle.json` 结构: + +```json +{ + "cycle_id": "2026-05-07T03-30Z", + "run_id": "2026-05-07T03-30Z", + "status": "running", + "started_at": "2026-05-07T03:30:00Z", + "ended_at": null, + "cloud_doc": { + "url": "", + "token": "", + 
"created_by_cycle": true, + "tainted": true + }, + "stages": [], + "pr_urls": [] +} +``` + +每次阶段状态变化都按顺序执行: + +1. 更新 `cycle.json` +2. 渲染一个 `cloud-doc/-.md` +3. 追加到云文档 +4. 只有云文档追加成功后才进入下一个阶段 + +若云文档追加失败,重试一次;仍失败则停止 cycle,把失败写入 `cycle.json`,不要继续提 PR。 + +## 完成定义(必须满足) + +`/eval-search cycle` 不是本地脚本跑完就结束。未传 `--skip-pr` 时,必须同时交付: + +- `summary.json` / `verdicts.json` 已写入本地 run 目录 +- 云文档已创建或追加成功,且文档 URL 写入 `cycle.json.cloud_doc.url` +- 云文档 token 已写入 `cloud-doc/tainted_tokens.json` +- draft PR 已创建,PR URL 写入 `cycle.json.pr_urls` +- PR description 包含云文档 URL、run-id、分数摘要、污染摘要和未处理 finding +- PR URL 已追加回云文档的 final 段 +- 最终回复用户时同时给出云文档 URL 和 PR URL + +任一必需链接缺失时,cycle 状态只能是 `failed` 或 `blocked`,不能回复“已完成”。 + +## 云文档创建 / 追加 + +创建新文档: + +```bash +lark-cli docs +create --api-version v2 --as user \ + --doc-format markdown \ + --content @tests/eval-search/runs//cloud-doc/00-created.md \ + --jq '.data.document.url' +``` + +创建到指定目录: + +```bash +lark-cli docs +create --api-version v2 --as user \ + --parent-token '' \ + --doc-format markdown \ + --content @tests/eval-search/runs//cloud-doc/00-created.md \ + --jq '.data.document.url' +``` + +追加阶段日志: + +```bash +lark-cli docs +update --api-version v2 --as user \ + --doc '' \ + --command append \ + --doc-format markdown \ + --content @tests/eval-search/runs//cloud-doc/20-run-finished.md +``` + +Markdown 文件必须使用 `@file` 传参,避免 shell 转义破坏表格、链接或代码块。 + +## 云文档内容边界 + +云文档是给人看进度和 review 结果的,不是评测原始数据仓库。允许写: + +- cycle-id / run-id / git head / 分支 / 账号类型 +- stage 状态、开始结束时间、失败原因 +- dataset 数量、preflight 污染数量、executor 完成数量 +- 总分、各维度均值、finding 聚类摘要、PR URL +- 本地产物路径,例如 `tests/eval-search/runs//summary.json` + +禁止写: + +- `dataset.jsonl` 全量内容 +- 标准答案、source URLs、rubric 的 per-case 原文 +- 完整 trajectory、完整 verdict rationale、key_error_snippets +- 任何 access token、app secret、cookie、GitHub token + +per-case 信息只允许写 `case_id`、分数、桶归因和一句不含标准答案的摘要。 + +## 阶段编排 + +### 0. 
setup + +- 确认 repo 路径和分支 +- 确认 `lark-cli auth status`、`gh auth status` +- 生成 `run-id` +- 创建 `cycle.json` +- 创建或绑定云文档 +- 把云文档 token 写入 `cloud-doc/tainted_tokens.json` + +setup 文档段落必须包含醒目的污染声明: + +```markdown +# eval-search cycle + +> This document is eval-search process material. It may contain benchmark summaries and must be treated as tainted for future search evaluations. + +| Field | Value | +|---|---| +| Run ID | `` | +| Status | `setup started` | +``` + +### 1. run + +内部执行 `/eval-search run` 的流程:拉数据集、污染预检、Executor、Judge、聚合。 + +阶段日志至少追加两次: + +- `run started`:记录 run-id、subset、loader/executor profile、run 目录 +- `run finished`:记录 dataset size、scored count、skipped count、trajectory 数、summary 路径 + +### 2. score/report + +读取 `summary.json` 和 `verdicts.json`,形成面向人的摘要。该阶段不重新打分,只消费 run 阶段已经产出的 Judge 结果。 + +必须记录: + +- 总分 / 满分 / 百分比 +- recall / accuracy / completeness / contamination_penalty 的总和与均值 +- top findings,最多 10 条 +- tainted fetch cases 数量和 case_id 列表 + +### 3. propose-pr + +未传 `--skip-pr` 时进入该阶段。内部执行 `/eval-search propose-pr `: + +- Optimizer 生成 diff +- 主 agent 复查 PR 颗粒度和白名单 +- 质量门禁 +- regression 重跑 +- 生成 PR description,并把云文档 URL 写入 description +- 创建 draft PR,记录返回的 PR URL +- 立刻把 PR URL 回写到 `cycle.json.pr_urls` +- 追加 `40-pr-finished.md` 到云文档,包含 PR URL + +云文档记录: + +- PR URL / state / draft 状态 +- touched files +- quality gate 结果 +- before/after 分数摘要 +- 未处理归因 + +如果没有可提交改动,记录 `no-op`,不创建空 PR。 + +PR 创建失败时,必须把失败原因、当前分支、commit sha、可恢复命令写入云文档;不得只在本地终端输出错误。 + +### 4. final + +更新 `cycle.json.status`: + +- `completed`:所有启用阶段完成 +- `completed_without_pr`:`--skip-pr` 或 no-op +- `failed`:任一必需阶段失败 + +最后追加一段总览,包含下一步建议和恢复命令: + +```markdown +## Final + +| Field | Value | +|---|---| +| Status | completed | +| Run ID | `` | +| Summary | `tests/eval-search/runs//summary.json` | +| PR | `` | +| Report Doc | `` | +``` + +最终回复必须包含: + +```text +PR: +Cloud report: +Run ID: +``` + +## 污染控制 + +cycle 生成或更新的云文档默认是 tainted/process material。规则: + +1. 
创建或绑定文档后,立刻提取 doc token,写入 `cloud-doc/tainted_tokens.json` +2. 本 cycle 的 regression / after-run 必须把该 token 作为额外 tainted token +3. 未来持久 blocklist 需要单独处理: + - 单独开 `chore(eval-search): blocklist cycle report ` PR;或 + - 在云文档无法被 executor 账号搜索到的前提下,在本轮报告中说明未持久化 blocklist +4. 不得把 blocklist 更新混入 `search_strategy`、`skill_prompts` 或 `tool_capability` 优化 PR + +## 恢复策略 + +- `setup` 失败:修复认证或文档权限后,重新执行 cycle +- `run` 失败:保留 `cycle.json`,从已有 `run-id` 的本地 artifact 判断是否能补跑缺失 case;不能补跑则新 cycle +- `score/report` 失败:不重跑 Executor,只重新读取 `summary.json` / `verdicts.json` 并追加云文档 +- `propose-pr` 失败:修复 git/gh/quality gate 后,从同一 `run-id` 重新执行 propose-pr 阶段,并追加恢复记录 + +任何恢复都必须追加云文档段落,不得静默覆盖既有记录。 diff --git a/skills/eval-search/references/dataset.md b/skills/eval-search/references/dataset.md new file mode 100644 index 000000000..6af493683 --- /dev/null +++ b/skills/eval-search/references/dataset.md @@ -0,0 +1,127 @@ +# 评测集 schema + 拉取方式 + +## 位置 + +评测集存在飞书多维表格(**live 数据源**,PM 持续更新): + +- base_token: `OOoEbNWhcaFOdisXDW7c0lKtn4g` +- table_id: `tblGWdc19tKFZC6K` +- view_id: `vewGToSnWl` +- URL: https://bytedance.larkoffice.com/base/OOoEbNWhcaFOdisXDW7c0lKtn4g?table=tblGWdc19tKFZC6K&view=vewGToSnWl + +> **污染警告**:这个 base 本身会被 `drive +search` 命中。harness 必须把账号拆成两个 profile:loader profile 只用于读取这个 base 并生成 `dataset.jsonl`;executor profile 只用于盲测搜索,**不可**加入该 base 的查看权限,否则评测结果被自答污染。详见 [`pollution-preflight.md`](pollution-preflight.md)。 + +## 原始字段(字段 id → 含义) + +| 字段名 | 类型 | 说明 | +|--------|------|------| +| `query` | text | 自然语言问题;Executor 唯一可见输入 | +| `len` | number | 历史字段,忽略 | +| `企业内是否有知识` | single-select | `是` / `否`。`否` 意味着企业知识库里本来就没答案,Executor 应答"找不到",recall 维度固定给 5 | +| `预期答复(机评文本)` | text | 含三段:【关键信息】/ 【辅助信息】/ 【打分备注】。Judge 独占使用,**Executor 不可见** | +| `数据源地址` | text(markdown 链接) | expected source URLs;Judge 独占使用,**Executor 不可见** | + +## 拉取命令 + +推荐用确定性 setup runner 拉取并转换: + +```bash +node --experimental-strip-types tests/eval-search/eval-search-run.ts \ + --loader-profile \ + --executor-profile \ + 
--subset 3 +``` + +如果只有一个账号,可以拆成两步: + +```bash +# 账号仍有评测 Base 权限时,只拉本地快照 +node --experimental-strip-types tests/eval-search/eval-search-run.ts \ + --snapshot-only \ + --loader-profile + +# 移除该账号的评测 Base 权限后,从本地快照继续盲测 setup +node --experimental-strip-types tests/eval-search/eval-search-run.ts \ + --dataset-file tests/eval-search/runs//dataset.jsonl \ + --executor-profile +``` + +只看原始 Base 拉取时,用 loader profile 执行: + +```bash +lark-cli --profile base +record-list \ + --as user \ + --base-token OOoEbNWhcaFOdisXDW7c0lKtn4g \ + --table-id tblGWdc19tKFZC6K \ + --view-id vewGToSnWl \ + --limit 100 +``` + +返回形如: +```json +{ + "ok": true, + "data": { + "data": [ [value_of_query, value_of_len, ...], ... ], + "field_id_list": ["fldh3DHP53", ...], + "fields": ["query", "len", "企业内是否有知识", "预期答复(机评文本)", "数据源地址"], + "record_id_list": ["recvg4qIXMSU6K", ...], + "has_more": true + } +} +``` + +若 `has_more=true`,用 `--offset` 翻页直到全部拉完。 + +## 转换为 harness 内部 schema + +主 agent 把每一行转成一个 case 对象,拼成 `dataset.jsonl`(jsonl,一行一个 case): + +```json +{ + "case_id": "case_001", + "record_id": "recvg4qIXMSU6K", + "query": "华东客户有哪些 Aily 优秀使用案例", + "has_knowledge": true, + "expected": { + "key_points": "【关键信息】的原文段", + "aux_info": "【辅助信息】的原文段", + "rubric_notes": { + "类型说明": "开放问题", + "可信无误": "不局限于ref,只要明确作为aily使用案例出现即算可信", + "完整详实": "答出5个及以上不扣分", + "结构清晰": "无", + "语言表述": "无", + "相关辅助": "无", + "引用准确": "无" + } + }, + "source_urls": [ + "https://bytedance.larkoffice.com/wiki/HxnMwM9cyiFW1dkACUBcC7KWnEd", + "https://bytedance.larkoffice.com/wiki/Es5wwNCyei3eYNkXc8Tcx35nnWe" + ] +} +``` + +### 转换要点 + +1. **case_id 编号**:按 record_id 在返回里的顺序分配 `case_001, case_002, ...`。同一次 run 内稳定,跨 run 不保证(PM 在 base 里插新行会错位)。如需跨 run 追踪,用 `record_id` +2. **filter `企业内是否有知识`**:harness 同时支持 `是` 和 `否` 的 case;但**pilot 阶段建议只跑 `是` 的**(`否` case 判分逻辑更复杂,后续加) +3. 
**解析 `预期答复` 的三段**: + - split 文本找 `【关键信息】` / `【辅助信息】` / `【打分备注】` 三个 heading + - 【打分备注】段是嵌套 JSON,`json.loads` 解析到 `rubric_notes` + - 解析失败的 case 标记 `parse_error: true`,跳过不评(写进 `summary.json.skipped`) +4. **解析 `数据源地址`**:正则提取 markdown 链接 `[text](url)` → `source_urls: [url, ...]`。非 URL 的纯文本(如提示语)忽略 +5. **空 query 过滤**:`query` 字段为空或纯空白的记录跳过 + +## Pilot 样本:只跑前 3 条冒烟 + +`/eval-search run --subset 3` 只拉前 3 条 `是` 类 case 跑。用于: +- 第一次落地 harness,验证端到端能跑通 +- auto-PR 流程的 dry-run(改完 skill 跑 3 条看趋势) + +## 频率 / 数据漂移 + +PM 在 base 里编辑 case 是常态。harness 不做 snapshot 冻结(v0.1 范围外),每次 `run` 拉最新。 + +**代价**:v_n 和 v_{n+1} 的分数差会混入 dataset 变化。在 PR description 里强制标注 `dataset_size / first_run_of_records` 两个字段,reviewer 自己判断。 diff --git a/tests/eval-search/references/known-tainted-tokens.md b/skills/eval-search/references/known-tainted-tokens.md similarity index 98% rename from tests/eval-search/references/known-tainted-tokens.md rename to skills/eval-search/references/known-tainted-tokens.md index 8598ba019..55faea9f7 100644 --- a/tests/eval-search/references/known-tainted-tokens.md +++ b/skills/eval-search/references/known-tainted-tokens.md @@ -69,7 +69,7 @@ tainted_tokens: **不要在飞书上写"评测过程记录" / "v_n 比对分析"之类文档**。都写成本仓库 markdown: -- 评测流程/设计 → `tests/eval-search/**` 或本地 run 产物 +- 评测流程/设计 → `skills/eval-search/**`(已就位) - 某轮迭代分析 → `tests/eval-search/runs/<run-id>/*.md`(gitignored,本地查看) - 发布用的 retrospective → PR description / GitHub wiki / release notes diff --git a/skills/eval-search/references/open-repo-layout.md b/skills/eval-search/references/open-repo-layout.md new file mode 100644 index 000000000..bac261cd7 --- /dev/null +++ b/skills/eval-search/references/open-repo-layout.md @@ -0,0 +1,162 @@ +# open 仓库导航手册(Optimizer 专用) + +> **读者:** `prompts/optimizer.md` 在处理 `tool_capability` 桶的 finding 时会 Read 这篇文档。 +> +> **目的:** 把 `lark_as/open` 仓库当"受控沙盒" — 明确 Optimizer 允许改哪些文件、禁止碰哪些文件、改完怎么验证。 + +## 仓库定位 + +``` +$GOPATH/src/code.byted.org/lark_as/open/ +``` + +这是 lark-cli 背后的 OpenAPI 服务层(后台简称 suite.as.open)。它把飞书内部大搜
PB(MGUniversalSearch)封装成面向外部的 OAPI。CLI 调这些 OAPI,agent 调 CLI。整条链路: + +``` +CLI (larksuite/cli) + → OAPI (lark_as/open) + → kitex_gen stub (git.byted.org/ee/go/kitex_gen, 由 IDL 仓库自动生成) + → RPC → 大搜后端 +``` + +**Optimizer 只动 open 仓库一层。** IDL 和 kitex_gen 不动(见禁止清单)。 + +## 核心目录(只读懂即可) + +``` +biz/search_open/ ← AI Friendly 新框架,所有改动都在这里 +├── entity/ ← 每实体一个 converter 文件 +│ ├── iconverter.go ← Converter 接口定义(不动) +│ ├── chat.go ← 参考实现(group chat 搜索) +│ ├── meeting.go ← 参考实现(平台实体,走 SlashCommand) +│ ├── message.go / doc.go / wiki.go / user.go / mail.go / task.go / ... +│ └── timeutil.go ← 时间格式工具(不动) +├── adapter.go ← 调 UniversalSearch RPC(不动) +├── handler.go ← 编排(不动) +├── pagetoken.go ← 翻页(不动) +├── response.go ← 错误码(不动) +├── CLAUDE.md ← open 仓库的开发规范,读它能看懂架构 +└── api_meta/{entity}/ ← 每实体 4 个 yml(search/filter/item/meta) + +biz/handler/handler.go ← 顶层路由(不动) +rpc/ ← 旧搜索 + RPC 封装(不动) +main.go / conf/ / utils/ ← 基础设施(不动) +``` + +## Converter 接口速览 + +每个 `entity/{name}.go` 都实现同一套 5 方法接口: + +```go +type Converter interface { + EntityType() usearch.SearchEntityType + BuildEntityItem(ctx, req) (*usearch.BaseEntity_EntityItem, error) // OAPI Filter → PB Filter + BuildResponseItem(result *usearch.SearchResult) (interface{}, error) // PB Meta → OAPI Item + BuildDisplayInfo(result *usearch.SearchResult) string // 组装给 AI 看的 markdown 卡片 + Prune(item interface{}, fields []string) interface{} // 字段裁剪 +} +``` + +**AI friendly 的高杠杆改动点几乎全在 `BuildDisplayInfo`**:它返回的 markdown 就是 agent 在 CLI 里看到的搜索结果文本。大搜结果里的标题、摘要、上下文、高亮(`` 标签)的组装方式直接决定 agent 能否一眼判断相关性。 + +## ✅ 允许改动(白名单) + +以下三类改动 Optimizer 可以直接写 diff,不需要动 IDL: + +### 1. `BuildDisplayInfo` 优化 + +- 补充 markdown 字段(例如加入更多上下文、路径信息、作者、时间) +- 调整高亮策略(命中词用 `` 标签包裹) +- 修复格式化 bug(换行、空字段处理、转义) + +**边界:** 只能使用 `*usearch.SearchResult` 里已有的字段。要是需要 PB 没返回的信息,那是 PB/IDL 的问题,降级为 issue。 + +### 2. 
`BuildResponseItem` 的字段映射 bug fix + +- `nil` 指针防御 +- 时间戳转换错误(`UnixToISO8601` / `UnixMsToISO8601` 用错) +- 枚举值映射错(比如 `chatStatusNormal` 漏判) +- ID 字段赋值缺失 + +**边界:** 只能在已有 OAPI 响应字段上做映射修复;**不能**新增 OAPI 响应字段(那是 IDL 级别的契约变更)。 + +### 3. `Prune` 敏感字段裁剪 + +- 根据业务需要把敏感/内部字段从响应里去掉 + +### 4. 配套测试 + +- 每次改 `entity/{name}.go` **必须**同时更新 `entity/{name}_test.go`,否则 quality gate(未来启用)会 block + +## ❌ 禁止改动(硬黑名单) + +| 路径 | 原因 | +|------|------| +| `../lark/idl/**` | IDL 在另一个仓库,需要跑 overpass + go get,不是 PR 范畴 | +| `biz/search_open/handler.go` | 编排逻辑,动了容易坏所有实体 | +| `biz/search_open/adapter.go` | RPC 适配层,牵扯协议 | +| `biz/search_open/pagetoken.go` | 翻页 + Redis,幂等性敏感 | +| `biz/search_open/response.go` | 错误码契约 | +| `biz/search_open/entity/iconverter.go` | Converter 接口,动了所有实体都得跟 | +| `biz/search_open/entity/timeutil.go` | 时间工具,动了影响所有实体 | +| `biz/search_open/api_meta/**/*.yml` | 新增 / 修改 schema = 契约变更,走人工 | +| `biz/handler/handler.go` | 顶层路由 | +| `rpc/**` | 旧搜索 + RPC 封装 | +| `main.go` / `conf/**` / `utils/**` | 基础设施 | +| `go.mod` / `go.sum` | 依赖升级人工做 | + +**触犯任一条** → finding 必须进 `unhandled_findings.md`,附带 issue 描述建议,不写进 diff。 + +## 新增 OAPI 字段(即使是 optional)的处理 + +**Optimizer 不能自动加字段。** 流程太复杂: + +1. 需要改 IDL 仓库(`$GOPATH/src/code.byted.org/lark/idl/idl/suite/as/open/*.thrift`) +2. 需要跑 overpass 生成 kitex_gen stub +3. 需要 `go get` 拉 stub 更新 +4. 需要同步改 open 仓库的 converter 映射 +5. 需要同步改 `api_meta/{entity}/*.yml` schema + +这是多仓库协作 + 手工步骤,Optimizer 不应该做。改为产出 GitHub issue 正文,正文包含: + +- 哪个 entity 需要新字段 +- 字段含义(含 proto 里已有的来源字段,若有) +- driving case 的引用 +- 对 agent 决策的价值说明 + +issue 正文写进 `unhandled_findings.md` 的 `proposed_issue` 段,由人工创建。 + +## 验证策略(当前版本) + +**Quality gate 暂未启用**(`/eval-search propose-pr` 跳过 open 仓库测试)。原因:open 仓库跑测试需要下游依赖,CI 配置不是 harness 可控的。PR 开出去之后,open 仓库的 CI 会自己跑。 + +Optimizer 自己必须做的最小校验: + +1. 所有改动文件 `gofmt` 过 +2. 改了 `entity/{name}.go` 必须同步动 `entity/{name}_test.go`(至少加一条测试覆盖修改的分支) +3. 
不允许删除已有测试 + +## 参考文件(Optimizer 生成改动前**必读**) + +- `biz/search_open/CLAUDE.md` — 开发规范原文 +- `biz/search_open/entity/chat.go` — 完整 converter 参考 +- `biz/search_open/entity/chat_test.go` — 测试写法参考 +- `biz/search_open/entity/meeting.go` — 平台实体 converter 参考(`BuildDisplayInfo` 写法略有不同) + +## 与主 agent 的交互契约 + +Optimizer 处理涉及 open 仓库的 finding 时,产出放在 `pr-draft/open/` 子目录(和 cli 仓库的 `pr-draft/` 同级): + +``` +tests/eval-search/runs//pr-draft/ +├── diff.patch # cli 仓库改动(原本就有) +├── generalization_note.json +├── unhandled_findings.md +├── commit_message.txt +└── open/ # 新增:open 仓库改动 + ├── diff.patch # 应用到 $GOPATH/src/code.byted.org/lark_as/open/ + ├── commit_message.txt + └── touched_files.txt # 命中白名单校验的冗余证据 +``` + +主 agent 拿到两份 diff.patch 之后,分别 checkout 两个仓库、分别 apply、分别 commit、分别 `gh pr create`,在两个 PR description 里互相 link(见 `pr-generation.md`)。 diff --git a/skills/eval-search/references/pollution-preflight.md b/skills/eval-search/references/pollution-preflight.md new file mode 100644 index 000000000..4ff88fcc7 --- /dev/null +++ b/skills/eval-search/references/pollution-preflight.md @@ -0,0 +1,105 @@ +# 污染预检规则 + +## 动机 + +评测集 base 自身、v1/v2 迭代记录文档、含 expected 的参考文档,都可能被 `drive +search` 命中。Executor 一旦 fetch 到,就是"开卷考试"——分数失去意义。 + +v2 的教训:PM 的 dataset base 在第一次跑评测时,几乎所有 query 的搜索 top-1 都是 dataset 自己。 + +因此 `/eval-search run` 需要两个 lark-cli profile: +- `loader-profile`:能读评测 Base,只负责拉取 live dataset 并写入 `dataset.jsonl` +- `executor-profile`:负责盲测搜索,必须不能读评测 Base + +也可以用同一个人账号做时间隔离:先在有权限时运行 `--snapshot-only` 拉本地快照;随后把该账号从评测 Base 权限里移除;最后用 `--dataset-file` 从本地快照继续。第二步运行时仍会探测 executor 是否能读 Base,能读则阻断。 + +## 两道防线(必须叠加) + +### 防线 1:专用账号(物理隔离) + +harness 启动时 MUST 先对 executor profile 做账号检查: + +```bash +lark-cli --profile auth status +``` + +从返回里读 `userOpenId`,对照 [`known-tainted-tokens.md`](known-tainted-tokens.md) 的 `excluded_user_ids` 列表: +- 命中 → **拒绝启动**,报错退出:`当前账号在 excluded_user_ids 里;harness 必须用专用测试账号运行` +- 未命中 → 继续 + +**新建测试账号步骤**(手工一次性): +1. 申请独立企业飞书账号(非 PM、非 dataset owner) +2. 
账号不加入评测集 base 的权限,不加入"参考流程文档"的权限 +3. 在 `~/.config/lark-cli/profiles/` 下建独立 profile,`lark-cli auth login --profile eval-search` +4. 评测运行时:`lark-cli --profile eval-search ...` + +setup runner 还会主动探测 executor profile 是否能读取评测 Base: + +```bash +lark-cli --profile base +record-list \ + --as user \ + --base-token OOoEbNWhcaFOdisXDW7c0lKtn4g \ + --table-id tblGWdc19tKFZC6K \ + --view-id vewGToSnWl \ + --limit 1 +``` + +期望结果是权限失败。若读取成功,说明 executor 可直接搜到或打开评测集,必须阻断本轮 run。 + +### 防线 2:Pre-flight 扫描(兜底) + +即使账号做了物理隔离,某些情况下仍可能被污染(例如:某个新建文档恰好包含了答案且权限开放)。Pre-flight 作为兜底: + +**流程**: + +``` +for each case in dataset.jsonl: + result = lark-cli --profile drive +search --query "" --page-size 20 + hit_tokens = extract all obj_token / wiki_token from result + tainted = hit_tokens ∩ known_tainted_tokens + + write to preflight.json: + { + "case_id": "case_001", + "contamination_risk": len(tainted) > 0, + "tainted_tokens": [...], + "top_20_tokens": [...] + } +``` + +实际执行时,`known_tainted_tokens` 由持久清单 [`known-tainted-tokens.md`](known-tainted-tokens.md) 和本轮 `cloud-doc/tainted_tokens.json` 合并得到。后者用于 `/eval-search cycle` 生成的临时报告文档,避免还没进入持久 blocklist 的过程材料影响本轮 after-run。 + +**不阻断**,只标记。原因:有时 pre-flight 命中但 Executor 最终没 fetch,这种 case 依然有效,Judge 会打出正常 recall 分。 + +### known_tainted_tokens 的维护 + +见 [`known-tainted-tokens.md`](known-tainted-tokens.md)。三类必须纳入: +1. **评测集 base 自身**:`OOoEbNWhcaFOdisXDW7c0lKtn4g` +2. **v1/v2 迭代记录 docx**:`VdUKdAXjmo9vl8xq4FrczK6unct`(含全部评测方法论 + 具体 case 分数) +3. 
**人类写的"答题参考"/"流程总结"**:任何在评测过程中被主 agent 写到飞书的 note + +每次新增一个"讨论评测过程"的飞书文档,记得加进标记清单(或者更简单:**不要在飞书上写这种文档**,都写成本仓库 markdown)。 + +## Judge 怎么用 preflight 数据 + +Judge 读 `preflight.json` 判断 `contamination_penalty`: + +``` +for each case: + if preflight[case].contamination_risk == true: + scan trajectory for any tool_use that fetched one of tainted_tokens + if fetched: + if answer directly quotes tainted doc content: + contamination_penalty = -3 + else: + contamination_penalty = -1 + else: + contamination_penalty = 0 + else: + contamination_penalty = 0 +``` + +## 常见坑 + +- **wiki 链接**:`wiki://space_xxx/node_yyy` 背后的 obj_token 才是真实目标。pre-flight 扫描时必须同时记录 `wiki_token` 和 `obj_token` 两层,任一命中标记清单即 tainted +- **短链 / applink**:`applink.feishu-pre.net/...` 跳转后的最终 URL 可能是 tainted,建议 Executor 遇到短链先解析一跳再判断。这条太细,v0.1 不做强约束 +- **账号隔离失效**:PM 手滑把 dataset base 对全员开放,专用账号又能看到了。定期(每次 run 前)手动检查一下 base 的权限列表 diff --git a/skills/eval-search/references/pr-generation.md b/skills/eval-search/references/pr-generation.md new file mode 100644 index 000000000..14046afd6 --- /dev/null +++ b/skills/eval-search/references/pr-generation.md @@ -0,0 +1,293 @@ +# PR 生成流程 + description 模板 + +## 双 PR 模型 + +Optimizer 的产出可能横跨两个仓库: + +- **cli 仓库**(`larksuite/cli`,当前工作目录):skill 文档改动、新增 shortcut +- **open 仓库**(`$GOPATH/src/code.byted.org/lark_as/open/`):converter 层 `BuildDisplayInfo` 优化、bug fix + +两个仓库分别提 PR,**独立 review、独立 merge**(决策 2A)。PR description 里互相 link,但不绑定 merge 顺序——一个先 merge 另一个还没 merge 也 OK。 + +若本次 run 只有 cli 改动,`pr-draft/open/` 目录不存在,跳过所有 open 仓库步骤。 + +## 总流程 + +``` +/eval-search propose-pr + │ + ├─[0] 前置检查 + │ ├─ cli 仓库 git status 必须干净(non-dirty);否则 abort + │ ├─ cli 仓库当前分支是 main;否则 abort + │ ├─ runs//summary.json 存在且 scored >0 + │ ├─ runs//meta.json.git_dirty != true + │ └─ 若 Optimizer 产出涉及 open 仓库 → 同样检查 open 仓库 git status / 分支 + │ + ├─[1] Optimizer sub-agent(Task 工具) + │ 输入: summary.json + key_error_snippets + 两个仓库路径 + │ 输出: pr-draft/{diff.patch, commit_message.txt, generalization_note.json, 
unhandled_findings.md} + │ 若有 open 改动 → pr-draft/open/{diff.patch, commit_message.txt, touched_files.txt} + │ 注意: Optimizer 不自己 git commit / git apply,一切由主 agent 执行 + │ + ├─[2] 白名单复查(主 agent,防 Optimizer 越权) + │ ├─ cli diff 命中路径都在白名单内(skills/**/*.md、shortcuts/**) + │ └─ open diff 命中路径都在白名单内(biz/search_open/entity/{name}.go + *_test.go) + │ 违反 → abort,Optimizer 降级迭代 + │ + ├─[3] cli 仓库 apply + commit + │ cd + │ git checkout -b eval-search/auto-pr/ + │ git apply pr-draft/diff.patch + │ ├─[3a] Quality gate + │ │ make unit-test # 必过 + │ │ golangci-lint run --new-from-rev=origin/main # 必过 + │ │ 失败 → Optimizer 最多迭代 2 次;仍失败 → rollback,该 finding 降级为 unhandled + │ └─ git add . && git commit -F pr-draft/commit_message.txt + │ + ├─[4] open 仓库 apply + commit(若有) + │ cd $GOPATH/src/code.byted.org/lark_as/open + │ git checkout -b eval-search/auto-pr/ + │ git apply /pr-draft/open/diff.patch + │ # 无 quality gate(暂时),Optimizer 自己已做 gofmt 和测试更新 + │ git add . && git commit -F /pr-draft/open/commit_message.txt + │ + ├─[5] 确定性 regression 重跑 + │ 调用 /eval-search run 内部逻辑(无 agent 参与),生成 after_verdicts.json + │ 对比 before(summary.json)vs after,产出 per-case diff + │ 注意: open 改动若依赖 CI 部署才能生效,after 结果反映的是 cli 改动的影响;在 description 里标注 + │ + ├─[6] 组装 PR description + │ 按本文件下方模板生成 cli 和 open 两份 description.md,互相留 link 占位 + │ 若由 /eval-search cycle 调用,description 必须包含 cloud report URL + │ + ├─[7] gh pr create --draft(cli) + │ cd && gh pr create --draft → 记录 PR url CLI_PR_URL + │ 若由 /eval-search cycle 调用,立刻回写 cycle.json.pr_urls 并追加云文档 + │ + └─[8] gh pr create --draft(open,若有) + cd && gh pr create --draft,description 里 Pair 字段填入 CLI_PR_URL + 创建完之后回到 cli PR,用 gh pr edit 把 open PR url 填到 cli description 的 Pair 段 +``` + +## PR URL 交付契约 + +`gh pr create --draft` 的返回 URL 是 `/eval-search propose-pr` 的主产物,必须持久化到: + +- `tests/eval-search/runs//pr-draft/pr-url.txt` +- `tests/eval-search/runs//summary.json` 的 `pr_urls` 字段(若已有 summary) +- `cycle.json.pr_urls`(仅 `/eval-search cycle`) +- 云文档 final / pr-finished 段(仅 
`/eval-search cycle`) + +最终回复用户时必须直接贴出 PR URL。若 PR 创建失败,回复中必须说明失败阶段、失败命令和当前可恢复分支,不得只说“已提交”。 + +## Quality gate 失败处理 + +两次迭代后仍失败的 finding: + +1. 回滚那一条 finding 的改动(其他 finding 保留) +2. 把它写进 `unhandled_findings.md`,归类为 `quality_gate_failure`,附带完整错误输出 +3. PR description 的"未处理归因"段列出这些 finding 并建议新人创建 issue + +## PR description 模板(cli 仓库) + +```markdown + + +## 摘要 + +基于 eval-search run `{{run_id}}` 自动生成,共 {{n_findings}} 条改进落地({{n_skipped}} 条未处理)。 + +{{#if open_pr_url}} +**Pair:** [{{open_pr_title}}]({{open_pr_url}}) — open 仓库的配套改动,独立 review。 +{{/if}} + +## 评测对比(before vs after) + +| 指标 | before | after | Δ | +|------|--------|-------|---| +| 总分 | {{before_total}} / {{max}} ({{before_pct}}%) | {{after_total}} / {{max}} ({{after_pct}}%) | **{{delta}} ({{delta_pp}}pp)** | +| recall | {{before_recall}} | {{after_recall}} | {{delta_recall}} | +| accuracy | {{before_accuracy}} | {{after_accuracy}} | {{delta_accuracy}} | +| completeness | {{before_completeness}} | {{after_completeness}} | {{delta_completeness}} | + +- Dataset size: {{dataset_size}} (同一份 base 拉取;dataset 可能已被 PM 更新,per-case diff 以 `record_id` 对齐) +- 评测账号: `{{user_name}}` (open_id `{{user_open_id}}`) +- Pollution: {{contaminated_count}} case 命中 tainted tokens{{#if contaminated_count}} — 见附录{{/if}} +{{#if cloud_report_url}} +- Cloud report: {{cloud_report_url}} +{{/if}} + +## Wins(by case) + +{{#each wins}} +- `{{case_id}}` ({{record_id}}): **{{before}}→{{after}}** (+{{delta}}) + - driver: {{driver_findings}} +{{/each}} + +## ⚠️ Regressions(软告警 — reviewer 请核验) + +{{#if regressions}} +{{#each regressions}} +- `{{case_id}}` ({{record_id}}): **{{before}}→{{after}}** ({{delta}}) + - 可能原因: {{hypothesis}} + - 建议 reviewer: 查看 `tests/eval-search/runs/{{run_id}}/trajectories/{{case_id}}.json` 对比前后行为 +{{/each}} +{{else}} +_无 regression_ +{{/if}} + +## 改动分类(Optimizer 自述) + +### 泛化原则性改动(适用面广,reviewer 较快可信) + +{{#each principled_changes}} +- **{{file}}**: {{change_summary}} + - rationale: {{rationale}} + - driven by: 
{{driving_cases}} +{{/each}} + +### 针对具体 case 的改动(⚠️ 过拟合风险,reviewer 重点判断) + +{{#if case_specific_changes}} +{{#each case_specific_changes}} +- **{{file}}**: {{change_summary}} + - risk: {{risk}} + - driven by: {{driving_cases}} +{{/each}} +{{else}} +_无_ +{{/if}} + +## 未处理归因 + +{{#if unhandled}} +以下 findings 本 PR 未处理,建议 reviewer 考虑创建 issue: + +{{#each unhandled}} +- **[{{bucket}}]** {{suggestion}} + - 未处理原因: {{reason}} + - driving: {{driving_cases}} +{{/each}} +{{else}} +_无_ +{{/if}} + +## 怎么 review 这个 PR + +1. 先看"评测对比"总分是否真有提升 +2. 扫一眼 Regressions,若有,点进 trajectory 看是不是噪声 +3. 重点 review "针对具体 case 的改动"——判断是否过拟合 +4. 泛化性改动是文档修订,读 diff 即可 +5. 如涉及 Go 代码,CI 已过 `make unit-test` + lint,关注接口设计 + +## 复现 + +```bash +git checkout eval-search/auto-pr/{{run_id}} +/eval-search report {{run_id}} +``` + +--- + +🤖 Generated by [eval-search harness](../skills/eval-search/SKILL.md) +``` + +## PR description 模板(open 仓库) + +比 cli 版本精简,不重复写 wins/regressions 表格(那是 CLI 端视角),只列本 PR 的改动 + 回指 cli PR。 + +```markdown + + +## 摘要 + +配合 cli 仓库 `eval-search` 评测结果优化 OAPI converter 层。改动范围:`biz/search_open/entity/` 下的 `BuildDisplayInfo` / `BuildResponseItem` / `Prune`,**不涉及 IDL 和契约变更**。 + +**Pair:** [{{cli_pr_title}}]({{cli_pr_url}}) — 主 PR,含完整评测对比、泛化声明、未处理归因。 + +## 改动清单 + +{{#each open_changes}} +- **`{{file}}`**: {{change_summary}} + - driven by: {{driving_cases}} + - 过拟合风险: {{risk_level}} +{{/each}} + +## 怎么 review + +1. 每条改动本质都是 converter 输出字符串的优化,对协议无影响 +2. Quality gate 未跑(harness 暂未接 open 仓库 CI),reviewer 请关注: + - 空字段 / nil 指针防御是否到位 + - markdown 高亮标签 `` 使用是否一致 + - 测试是否覆盖了修改的分支 +3. 对 agent 效果的量化验证在 cli PR 的评测对比段 + +## 复现 cli 侧评测 + +```bash +cd +/eval-search report {{run_id}} +``` + +--- + +🤖 Generated by [eval-search harness](https://github.com/larksuite/cli/tree/main/skills/eval-search) +``` + +## 模板填充注意 + +- 所有百分比保留 1 位小数 +- `driving_cases` 最多列 5 个,超过写 `case_003, case_007, ... 
(+3 more)`
+- `record_id` 放在 `case_id` 后面括号里,方便 reviewer 跨 run 追踪同一条 case
+- `hypothesis` 由主 agent 根据 before/after trajectory diff 推断,最多 30 字;拿不准就写 `"待核验"`,不要硬编
+
+## commit message 规范
+
+Conventional Commits,遵循仓库 AGENTS.md:
+
+```
+feat(eval-search): auto-propose improvements from run <run_id>
+
+<一段改动概要,3-6 行>
+
+Eval: <before_pct>% → <after_pct>% ({{delta_pp}}pp)
+Regressions: <n_regressions>
+Unhandled: <n_unhandled>
+
+Generated-By: eval-search/<run_id>
+Co-Authored-By: eval-search-bot <bot-email>
+```
+
+## PR 创建命令
+
+**cli 仓库 PR**(先创建):
+
+```bash
+cd <cli_repo>
+gh pr create --draft \
+  --title "feat(eval-search): auto-propose improvements from run <run_id>" \
+  --body-file tests/eval-search/runs/<run_id>/pr-draft/description.md \
+  --base main
+```
+
+记录返回的 PR URL 为 `CLI_PR_URL`。
+
+**open 仓库 PR**(若 `pr-draft/open/` 存在):
+
+```bash
+cd $GOPATH/src/code.byted.org/lark_as/open
+# description.md 里已填入 CLI_PR_URL 到 Pair 字段
+gh pr create --draft \
+  --title "feat(search_open): improve converter display_info from eval-search run <run_id>" \
+  --body-file <cli_repo>/tests/eval-search/runs/<run_id>/pr-draft/open/description.md \
+  --base main
+```
+
+记录返回的 PR URL 为 `OPEN_PR_URL`,然后回填到 cli PR description:
+
+```bash
+cd <cli_repo>
+gh pr edit <cli_pr_number> --body-file <updated_description_file>
+```
+
+Draft 模式确保 CI 跑但不自动 merge,等 reviewer 转为 ready-for-review。两个 PR **独立 review、独立 merge**,任一方 merge 均可,不要求同步。
diff --git a/skills/eval-search/references/run-layout.md b/skills/eval-search/references/run-layout.md
new file mode 100644
index 000000000..5bbac1618
--- /dev/null
+++ b/skills/eval-search/references/run-layout.md
@@ -0,0 +1,123 @@
+# run 目录结构 + 中间产物约定
+
+## 目录位置
+
+```
+<cli_repo>/tests/eval-search/runs/<run_id>/
+```
+
+`<run_id>` 格式:`YYYY-MM-DDTHH-MMZ`(UTC,用 `date -u +%Y-%m-%dT%H-%MZ` 生成)。
+
+整个 `tests/eval-search/runs/` 被 gitignore,不进版本库。
+
+确定性 setup runner:
+
+```bash
+node --experimental-strip-types tests/eval-search/eval-search-run.ts \
+  --loader-profile <loader_profile> \
+  --executor-profile <executor_profile> \
+  --subset 3
+```
+
+runner 只负责创建 run 目录、拉取并转换 live dataset、检查 executor 账号隔离、写 `preflight.json`。它不会执行 AI Executor/Judge 阶段;setup 成功时 `summary.json.status` 为 `ready_for_executor`。
+
+单账号时间隔离模式:
+
+```bash
+node --experimental-strip-types tests/eval-search/eval-search-run.ts --snapshot-only --loader-profile <loader_profile>
+# 移除该账号的评测 Base 权限
+node --experimental-strip-types tests/eval-search/eval-search-run.ts \
+  --dataset-file tests/eval-search/runs/<run_id>/dataset.jsonl \
+  --executor-profile <executor_profile>
+```
+
+第一步只写本地 `dataset.jsonl`,`summary.json.status` 为 `snapshot_ready`。第二步会复制该 dataset 到新的 run 目录,并重新检查 executor 已经不能读取评测 Base。
+
+## 单次 run 目录布局
+
+```
+tests/eval-search/runs/2026-04-15T10-00Z/
+├── cycle.json # 仅 /eval-search cycle 阶段编排使用;记录云文档、阶段状态、PR URL
+├── cloud-doc/ # 仅 /eval-search cycle 使用;每次追加云文档前生成的 markdown 片段
+│   ├── 00-created.md
+│   ├── 20-run-finished.md
+│   └── tainted_tokens.json
+├── meta.json # run 元信息(cli 版本、loader/executor profile、账号、开始/结束时间)
+├── raw/
+│   ├── base_records_pages.json
+│   └── base_records_combined.json
+├── dataset.jsonl # 从 base 拉下来并转换的 cases
+├── preflight.json # 污染预检结果
+├── trajectories/
+│   ├── case_001.json # Executor 增量写盘,崩溃可恢复
+│   ├── case_002.json
+│   └── ... 
+├── verdicts.json # Judge 产出
+├── summary.json # 聚类后的 findings
+└── pr-draft/ # 仅 propose-pr 阶段产出
+    ├── diff.patch
+    ├── generalization_note.json
+    ├── unhandled_findings.md
+    ├── commit_message.txt
+    └── after_verdicts.json # regression 重跑结果(不含 trajectories,减小体积)
+```
+
+## meta.json
+
+```json
+{
+  "run_id": "2026-04-15T10-00Z",
+  "started_at": "2026-04-15T10:00:13Z",
+  "ended_at": "2026-04-15T11:42:51Z",
+  "lark_cli_version": "v1.0.11+git-abc1234",
+  "git_head": "abc1234",
+  "git_dirty": true,
+  "loader_profile": "base-reader",
+  "executor_profile": "eval-search",
+  "user_open_id": "ou_xxx",
+  "user_name": "eval-search-bot",
+  "subset": null,
+  "cases_scored": 13,
+  "cases_skipped_contamination": 0,
+  "cases_skipped_parse_error": 1
+}
+```
+
+`git_dirty=true` 的 run 打上 `⚠️ dirty` 标记;propose-pr 阶段若源码 dirty 会拒绝生成 PR(否则 diff 混入无关改动)。
+
+## 增量持久化约定
+
+Executor 每完成 1 round(= 1 次 lark-cli 调用 + 解析),追加写入 `trajectories/<case_id>.json`:
+
+```json
+{
+  "case_id": "case_001",
+  "query": "...",
+  "started_at": "...",
+  "rounds": [
+    {"idx": 1, "tool": "Read", "target": "skills/lark-doc/SKILL.md", "outcome_summary": "..."},
+    {"idx": 2, "tool": "Bash", "cmd": "lark-cli drive search --query '华东 Aily'", "outcome_summary": "top-3: ..."},
+    ... 
+  ],
+  "answer": null,
+  "gave_up": false,
+  "ended_at": null
+}
+```
+
+所有未闭合的 case(`ended_at: null`)在 run 结束时标记为 `incomplete`,Judge 按 `gave_up=true` 处理但 `rounds_used` 如实记录。
+
+## 并发度
+
+v0.1 建议 **串行跑 Executor**:
+- 避免多 sub-agent 同时打飞书 API 触发限流
+- v2 历史上 sub-agent 529 频繁,并发会放大问题
+- 评测 13 case 串行约 1-2 小时,可接受
+
+未来若评测集扩到 50+ case,再考虑 semaphore 限并发 = 2。
+
+## 清理策略
+
+`tests/eval-search/runs/` 不自动清理。用户手动 `rm -rf tests/eval-search/runs/<run_id>` 或按时间删旧的。
+
+.gitignore 已覆盖整个 runs/ 目录。
diff --git a/tests/eval-search/eval-search-collect-search.ts b/tests/eval-search/eval-search-collect-search.ts
index 89ae83f8f..c528042c6 100644
--- a/tests/eval-search/eval-search-collect-search.ts
+++ b/tests/eval-search/eval-search-collect-search.ts
@@ -139,7 +139,7 @@ function addTokensFromValue(value, tokens) {
 }
 
 function loadTaintedTokens(root, runDir = "") {
-  const file = path.join(root, "tests/eval-search/references/known-tainted-tokens.md");
+  const file = path.join(root, "skills/eval-search/references/known-tainted-tokens.md");
   const tokens: Set<string> = new Set();
   if (!fs.existsSync(file)) {
     return tokens;
diff --git a/tests/eval-search/eval-search-run.ts b/tests/eval-search/eval-search-run.ts
index a4a8351e4..3a2fb5777 100644
--- a/tests/eval-search/eval-search-run.ts
+++ b/tests/eval-search/eval-search-run.ts
@@ -190,7 +190,7 @@ function summarizeFailure(result) {
 function readTaintedTokens(root) {
   const file = path.join(
     root,
-    "tests/eval-search/references/known-tainted-tokens.md",
+    "skills/eval-search/references/known-tainted-tokens.md",
   );
   const text = fs.readFileSync(file, "utf8");
   const block = text.match(/tainted_tokens:[\s\S]*?```/);
@@ -241,7 +241,7 @@ function readRunTaintedTokens(runDir) {
 function readExcludedUserIds(root) {
   const file = path.join(
     root,
    "tests/eval-search/references/known-tainted-tokens.md",
+    "skills/eval-search/references/known-tainted-tokens.md",
   );
   const text = fs.readFileSync(file, "utf8");
   const block = text.match(/excluded_user_ids:[\s\S]*?```/);
@@ 
-901,7 +901,7 @@ function main() { ? [`preflight found tainted tokens in ${contaminationCount} case(s)`] : [], blockers: [ - "executor and judge phases still require the eval-search agent workflow", + "executor and judge phases still require the agent workflow described in skills/eval-search/prompts", ], });