diff --git a/src/skillify/skill-proposer.ts b/src/skillify/skill-proposer.ts index 1032b1bd..9158c284 100644 --- a/src/skillify/skill-proposer.ts +++ b/src/skillify/skill-proposer.ts @@ -26,36 +26,65 @@ export interface ProposeConfig { } const SYSTEM = - "You improve an engineering SKILL document that has been producing repeated, " + - "confirmed failures. Diagnose the SINGLE recurring weakness behind the failures " + - "and propose a SMALL set of structured edits that fix it. Do NOT rewrite the " + - `whole doc, and do NOT touch anything between ${SU_START} and ${SU_END}. Reply ` + - 'with ONLY a JSON array of edits, each: {"op":"append|insert_after|replace|' + - 'delete","target":"","content":""}. Prefer the smallest change that fixes the weakness.'; + "You improve an engineering SKILL document that produced repeated, confirmed " + + "failures. Work in two steps: (1) diagnose the SINGLE recurring weakness — the " + + "specific thing the doc told the agent to do, or failed to tell it, that caused " + + "the pushback; (2) propose a SMALL set of structured edits that fix exactly that.\n" + + "Every edit MUST be:\n" + + "- CONCRETE and OPERATIONAL: a specific rule, check, command, or step the agent " + + "can follow — never vague advice ('be careful', 'communicate clearly').\n" + + "- PLACED WHERE IT FIRES: anchor it to the RELEVANT existing section " + + "(insert_after that heading), or REPLACE the weak existing instruction that " + + "allowed the failure. Do NOT just append a paragraph at the end of a long doc — " + + "that gets skipped.\n" + + "- TRACEABLE: each edit must directly prevent a specific failure listed below.\n" + + "Prefer strengthening/replacing an existing instruction over adding new text. Do " + + `NOT rewrite the whole doc, and do NOT touch anything between ${SU_START} and ` + + `${SU_END}. Reply in TWO parts: first ONE line naming the single recurring ` + + "weakness, then a JSON array of edits (the array is what gets parsed), each: " + + '{"op":"append|insert_after|replace|delete","target":"","content":""}. Prefer the smallest change that fixes the weakness.'; function buildUserPrompt(body: string, failures: string[], priorEdits: string[]): string { const cases = failures.slice(0, 8).map((f, i) => `${i + 1}. ${f}`).join("\n"); const prior = priorEdits.length ? `\n\nALREADY TRIED for this skill on earlier runs (do NOT repeat these — propose something different, or nothing):\n${priorEdits.slice(0, 12).map((p) => `- ${p}`).join("\n")}` : ""; - return `CURRENT SKILL:\n${body}\n\nCONFIRMED FAILURES it produced (user pushed back AND a judge confirmed the task was not accomplished):\n${cases}${prior}\n\nPropose the bounded edits. JSON array only.`; + return `CURRENT SKILL:\n${body}\n\nCONFIRMED FAILURES it produced (user pushed back AND a judge confirmed the task was not accomplished):\n${cases}${prior}\n\nFirst name the single recurring weakness in one line, then output the JSON array of edits that anchor the fix into the relevant existing section.`; } const OPS = new Set(["append", "insert_after", "replace", "delete"]); +/** + * Extract the JSON array from noisy model output. Robust to a leading prose line + * (the "weakness" the proposer is asked to state first) even if it contains + * brackets: scan back from the last `]` to its balanced `[`, and fall back to + * first-`[`..last-`]` if that doesn't parse. + */ +function extractArray(s: string): unknown[] | null { + const b = s.lastIndexOf("]"); + if (b === -1) return null; + let depth = 0; + for (let i = b; i >= 0; i--) { + if (s[i] === "]") depth++; + else if (s[i] === "[" && --depth === 0) { + try { const a = JSON.parse(s.slice(i, b + 1)); if (Array.isArray(a)) return a; } catch { /* fall through */ } + break; + } + } + const first = s.indexOf("["); + if (first !== -1 && b > first) { try { const a = JSON.parse(s.slice(first, b + 1)); if (Array.isArray(a)) return a; } catch { /* none */ } } + return null; +} + /** Tolerant parse of a JSON array of edits (handles ```fences / surrounding prose). */ export function parseEdits(raw: string): Edit[] { let s = raw.trim(); const fence = s.match(/```(?:json)?\s*([\s\S]*?)```/); if (fence) s = fence[1].trim(); - const a = s.indexOf("["); - const b = s.lastIndexOf("]"); - if (a === -1 || b <= a) return []; - let arr: unknown; - try { arr = JSON.parse(s.slice(a, b + 1)); } catch { return []; } - if (!Array.isArray(arr)) return []; + const arr = extractArray(s); + if (!arr) return []; const out: Edit[] = []; for (const e of arr) { if (!e || typeof e !== "object") continue; diff --git a/tests/shared/skill-proposer.test.ts b/tests/shared/skill-proposer.test.ts index cb1a1c00..97a704c8 100644 --- a/tests/shared/skill-proposer.test.ts +++ b/tests/shared/skill-proposer.test.ts @@ -18,6 +18,16 @@ describe("parseEdits", () => { it("returns [] when there's no array", () => { expect(parseEdits("the model refused")).toEqual([]); }); + it("extracts the edits array past a weakness line that itself contains brackets", () => { + const raw = 'Weakness: the doc never says to verify [the real client] before reporting.\n' + + '[{"op":"append","content":"verify via API"}]'; + expect(parseEdits(raw)).toEqual([{ op: "append", content: "verify via API" }]); + }); + it("falls back to first-[..last-] when content holds an unbalanced bracket", () => { + // the inner "]" defeats the balanced back-scan; the fallback still parses it + const raw = '[{"op":"append","content":"use arr] carefully"}]'; + expect(parseEdits(raw)).toEqual([{ op: "append", content: "use arr] carefully" }]); + }); }); describe("proposeSkillEdit", () => { @@ -35,6 +45,26 @@ describe("proposeSkillEdit", () => { expect(model.mock.calls[0][1]).toContain("CONFIRMED FAILURES"); }); + it("steers toward concrete, anchored edits — not append-at-end", async () => { + const model = vi.fn(async (_s: string, _u: string) => + '[{"op":"insert_after","target":"## Rules","content":"0. verify on the real client first"}]'); + const p = await proposeSkillEdit(body, failures, { model }); + expect(p.changed).toBe(true); + const system = model.mock.calls[0][0]; + expect(system).toContain("anchor it to the RELEVANT existing section"); + expect(system).toContain("CONCRETE and OPERATIONAL"); + expect(system).toContain("REPLACE the weak existing instruction"); + expect(model.mock.calls[0][1]).toContain("First name the single recurring weakness in one line"); + }); + + it("includes prior edits so the proposer doesn't repeat them", async () => { + const model = vi.fn(async (_s: string, _u: string) => "[]"); + await proposeSkillEdit(body, failures, { model, priorEdits: ["append: verify via API"] }); + const user = model.mock.calls[0][1]; + expect(user).toContain("ALREADY TRIED"); + expect(user).toContain("append: verify via API"); + }); + it("enforces the edit budget", async () => { const model = vi.fn(async (_s: string, _u: string) => '[{"op":"append","content":"a"},{"op":"append","content":"b"},{"op":"append","content":"c"}]');