From 11021a3806b4ccd39249cc0174fa38fa3f87e43d Mon Sep 17 00:00:00 2001 From: khustup2 Date: Sun, 7 Jun 2026 06:16:59 +0000 Subject: [PATCH 1/4] feat(skillopt): proposer targets root-cause + anchors edits, not append-at-end The proposer appended generic guidance to the end of long skill docs, which diluted rather than fixed (and lost the A/B against the current body). It now diagnoses the single recurring weakness first, then requires each edit to be concrete/operational and anchored into the relevant existing section (or to replace the weak existing instruction). On a 4-skill benchmark of real failing org skills, an independent judge prefers the proposed edit over the current body 3/4, up from 2/4 before. --- src/skillify/skill-proposer.ts | 26 ++++++++++++++++++-------- 1 file changed, 18 insertions(+), 8 deletions(-) diff --git a/src/skillify/skill-proposer.ts b/src/skillify/skill-proposer.ts index 1032b1bd..99b4d951 100644 --- a/src/skillify/skill-proposer.ts +++ b/src/skillify/skill-proposer.ts @@ -26,13 +26,23 @@ export interface ProposeConfig { } const SYSTEM = - "You improve an engineering SKILL document that has been producing repeated, " + - "confirmed failures. Diagnose the SINGLE recurring weakness behind the failures " + - "and propose a SMALL set of structured edits that fix it. Do NOT rewrite the " + - `whole doc, and do NOT touch anything between ${SU_START} and ${SU_END}. Reply ` + - 'with ONLY a JSON array of edits, each: {"op":"append|insert_after|replace|' + - 'delete","target":"","content":"","content":""}. Prefer the smallest change that fixes the weakness.'; function buildUserPrompt(body: string, failures: string[], priorEdits: string[]): string { @@ -40,7 +50,7 @@ function buildUserPrompt(body: string, failures: string[], priorEdits: string[]) const prior = priorEdits.length ? `\n\nALREADY TRIED for this skill on earlier runs (do NOT repeat these — propose something different, or nothing):\n${priorEdits.slice(0, 12).map((p) => `- ${p}`).join("\n")}` : ""; - return `CURRENT SKILL:\n${body}\n\nCONFIRMED FAILURES it produced (user pushed back AND a judge confirmed the task was not accomplished):\n${cases}${prior}\n\nPropose the bounded edits. JSON array only.`; + return `CURRENT SKILL:\n${body}\n\nCONFIRMED FAILURES it produced (user pushed back AND a judge confirmed the task was not accomplished):\n${cases}${prior}\n\nFirst name the single recurring weakness in one line, then output the edits that anchor the fix into the relevant existing section. JSON array only.`; } const OPS = new Set(["append", "insert_after", "replace", "delete"]); From 2840a1c0955b46047ab1cc377c5fde26e76b2e12 Mon Sep 17 00:00:00 2001 From: khustup2 Date: Sun, 7 Jun 2026 07:07:37 +0000 Subject: [PATCH 2/4] test(skillopt): cover proposer steering toward concrete, anchored edits --- tests/shared/skill-proposer.test.ts | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/tests/shared/skill-proposer.test.ts b/tests/shared/skill-proposer.test.ts index cb1a1c00..a1506c73 100644 --- a/tests/shared/skill-proposer.test.ts +++ b/tests/shared/skill-proposer.test.ts @@ -35,6 +35,18 @@ describe("proposeSkillEdit", () => { expect(model.mock.calls[0][1]).toContain("CONFIRMED FAILURES"); }); + it("steers toward concrete, anchored edits — not append-at-end", async () => { + const model = vi.fn(async (_s: string, _u: string) => + '[{"op":"insert_after","target":"## Rules","content":"0. verify on the real client first"}]'); + const p = await proposeSkillEdit(body, failures, { model }); + expect(p.changed).toBe(true); + const system = model.mock.calls[0][0]; + expect(system).toMatch(/anchor/i); // place the fix where it fires + expect(system).toMatch(/concrete|operational/i); + expect(system).toMatch(/replace the weak/i); // strengthen, don't only append + expect(model.mock.calls[0][1]).toMatch(/name the single recurring weakness/i); + }); + it("enforces the edit budget", async () => { const model = vi.fn(async (_s: string, _u: string) => '[{"op":"append","content":"a"},{"op":"append","content":"b"},{"op":"append","content":"c"}]'); From 1e5f58d64c3983c93999cdc22d4fee9edc75d34f Mon Sep 17 00:00:00 2001 From: khustup2 Date: Sun, 7 Jun 2026 15:33:41 +0000 Subject: [PATCH 3/4] fix(skillopt): reconcile proposer output contract (JSON-only) + tighten tests/coverage - Resolve SYSTEM/user-prompt contradiction (CodeRabbit): output stays JSON-only; diagnosis is an internal step, not emitted (also avoids a bracketed weakness line breaking parseEdits). - Assert exact prompt fragments instead of broad regexes. - Cover the priorEdits branch. --- src/skillify/skill-proposer.ts | 2 +- tests/shared/skill-proposer.test.ts | 16 ++++++++++++---- 2 files changed, 13 insertions(+), 5 deletions(-) diff --git a/src/skillify/skill-proposer.ts b/src/skillify/skill-proposer.ts index 99b4d951..3cf4c68f 100644 --- a/src/skillify/skill-proposer.ts +++ b/src/skillify/skill-proposer.ts @@ -50,7 +50,7 @@ function buildUserPrompt(body: string, failures: string[], priorEdits: string[]) const prior = priorEdits.length ? `\n\nALREADY TRIED for this skill on earlier runs (do NOT repeat these — propose something different, or nothing):\n${priorEdits.slice(0, 12).map((p) => `- ${p}`).join("\n")}` : ""; - return `CURRENT SKILL:\n${body}\n\nCONFIRMED FAILURES it produced (user pushed back AND a judge confirmed the task was not accomplished):\n${cases}${prior}\n\nFirst name the single recurring weakness in one line, then output the edits that anchor the fix into the relevant existing section. JSON array only.`; + return `CURRENT SKILL:\n${body}\n\nCONFIRMED FAILURES it produced (user pushed back AND a judge confirmed the task was not accomplished):\n${cases}${prior}\n\nDiagnose the single recurring weakness, then output the edits that anchor the fix into the relevant existing section. JSON array only.`; } const OPS = new Set(["append", "insert_after", "replace", "delete"]); diff --git a/tests/shared/skill-proposer.test.ts b/tests/shared/skill-proposer.test.ts index a1506c73..01f54a65 100644 --- a/tests/shared/skill-proposer.test.ts +++ b/tests/shared/skill-proposer.test.ts @@ -41,10 +41,18 @@ describe("proposeSkillEdit", () => { const p = await proposeSkillEdit(body, failures, { model }); expect(p.changed).toBe(true); const system = model.mock.calls[0][0]; - expect(system).toMatch(/anchor/i); // place the fix where it fires - expect(system).toMatch(/concrete|operational/i); - expect(system).toMatch(/replace the weak/i); // strengthen, don't only append - expect(model.mock.calls[0][1]).toMatch(/name the single recurring weakness/i); + expect(system).toContain("anchor it to the RELEVANT existing section"); + expect(system).toContain("CONCRETE and OPERATIONAL"); + expect(system).toContain("REPLACE the weak existing instruction"); + expect(model.mock.calls[0][1]).toContain("Diagnose the single recurring weakness"); + }); + + it("includes prior edits so the proposer doesn't repeat them", async () => { + const model = vi.fn(async (_s: string, _u: string) => "[]"); + await proposeSkillEdit(body, failures, { model, priorEdits: ["append: verify via API"] }); + const user = model.mock.calls[0][1]; + expect(user).toContain("ALREADY TRIED"); + expect(user).toContain("append: verify via API"); }); it("enforces the edit budget", async () => { From a6055ca7b53e75ccf624d7c3253475edaea1fa1a Mon Sep 17 00:00:00 2001 From: khustup2 Date: Sun, 7 Jun 2026 15:38:00 +0000 Subject: [PATCH 4/4] fix(skillopt): consistent two-part proposer contract + robust array parse MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The JSON-only reconcile removed the 'name the weakness first' step, which was carrying the benchmark lift. Restore it as an explicit TWO-part contract (weakness line, then JSON array) — SYSTEM and user prompt now agree — and harden parseEdits to extract the array via a balanced back-scan (robust to a bracketed weakness line), with a first-[..last-] fallback. Tests cover both paths. --- src/skillify/skill-proposer.ts | 41 +++++++++++++++++++++-------- tests/shared/skill-proposer.test.ts | 12 ++++++++- 2 files changed, 41 insertions(+), 12 deletions(-) diff --git a/src/skillify/skill-proposer.ts b/src/skillify/skill-proposer.ts index 3cf4c68f..9158c284 100644 --- a/src/skillify/skill-proposer.ts +++ b/src/skillify/skill-proposer.ts @@ -40,32 +40,51 @@ const SYSTEM = "- TRACEABLE: each edit must directly prevent a specific failure listed below.\n" + "Prefer strengthening/replacing an existing instruction over adding new text. Do " + `NOT rewrite the whole doc, and do NOT touch anything between ${SU_START} and ` + - `${SU_END}. Reply with ONLY a JSON array of edits, each: {"op":"append|` + - 'insert_after|replace|delete","target":"","content":""}. Prefer the smallest change that fixes the weakness.'; + `${SU_END}. Reply in TWO parts: first ONE line naming the single recurring ` + + "weakness, then a JSON array of edits (the array is what gets parsed), each: " + + '{"op":"append|insert_after|replace|delete","target":"","content":""}. Prefer the smallest change that fixes the weakness.'; function buildUserPrompt(body: string, failures: string[], priorEdits: string[]): string { const cases = failures.slice(0, 8).map((f, i) => `${i + 1}. ${f}`).join("\n"); const prior = priorEdits.length ? `\n\nALREADY TRIED for this skill on earlier runs (do NOT repeat these — propose something different, or nothing):\n${priorEdits.slice(0, 12).map((p) => `- ${p}`).join("\n")}` : ""; - return `CURRENT SKILL:\n${body}\n\nCONFIRMED FAILURES it produced (user pushed back AND a judge confirmed the task was not accomplished):\n${cases}${prior}\n\nDiagnose the single recurring weakness, then output the edits that anchor the fix into the relevant existing section. JSON array only.`; + return `CURRENT SKILL:\n${body}\n\nCONFIRMED FAILURES it produced (user pushed back AND a judge confirmed the task was not accomplished):\n${cases}${prior}\n\nFirst name the single recurring weakness in one line, then output the JSON array of edits that anchor the fix into the relevant existing section.`; } const OPS = new Set(["append", "insert_after", "replace", "delete"]); +/** + * Extract the JSON array from noisy model output. Robust to a leading prose line + * (the "weakness" the proposer is asked to state first) even if it contains + * brackets: scan back from the last `]` to its balanced `[`, and fall back to + * first-`[`..last-`]` if that doesn't parse. + */ +function extractArray(s: string): unknown[] | null { + const b = s.lastIndexOf("]"); + if (b === -1) return null; + let depth = 0; + for (let i = b; i >= 0; i--) { + if (s[i] === "]") depth++; + else if (s[i] === "[" && --depth === 0) { + try { const a = JSON.parse(s.slice(i, b + 1)); if (Array.isArray(a)) return a; } catch { /* fall through */ } + break; + } + } + const first = s.indexOf("["); + if (first !== -1 && b > first) { try { const a = JSON.parse(s.slice(first, b + 1)); if (Array.isArray(a)) return a; } catch { /* none */ } } + return null; +} + /** Tolerant parse of a JSON array of edits (handles ```fences / surrounding prose). */ export function parseEdits(raw: string): Edit[] { let s = raw.trim(); const fence = s.match(/```(?:json)?\s*([\s\S]*?)```/); if (fence) s = fence[1].trim(); - const a = s.indexOf("["); - const b = s.lastIndexOf("]"); - if (a === -1 || b <= a) return []; - let arr: unknown; - try { arr = JSON.parse(s.slice(a, b + 1)); } catch { return []; } - if (!Array.isArray(arr)) return []; + const arr = extractArray(s); + if (!arr) return []; const out: Edit[] = []; for (const e of arr) { if (!e || typeof e !== "object") continue; diff --git a/tests/shared/skill-proposer.test.ts b/tests/shared/skill-proposer.test.ts index 01f54a65..97a704c8 100644 --- a/tests/shared/skill-proposer.test.ts +++ b/tests/shared/skill-proposer.test.ts @@ -18,6 +18,16 @@ describe("parseEdits", () => { it("returns [] when there's no array", () => { expect(parseEdits("the model refused")).toEqual([]); }); + it("extracts the edits array past a weakness line that itself contains brackets", () => { + const raw = 'Weakness: the doc never says to verify [the real client] before reporting.\n' + + '[{"op":"append","content":"verify via API"}]'; + expect(parseEdits(raw)).toEqual([{ op: "append", content: "verify via API" }]); + }); + it("falls back to first-[..last-] when content holds an unbalanced bracket", () => { + // the inner "]" defeats the balanced back-scan; the fallback still parses it + const raw = '[{"op":"append","content":"use arr] carefully"}]'; + expect(parseEdits(raw)).toEqual([{ op: "append", content: "use arr] carefully" }]); + }); }); describe("proposeSkillEdit", () => { @@ -44,7 +54,7 @@ describe("proposeSkillEdit", () => { expect(system).toContain("anchor it to the RELEVANT existing section"); expect(system).toContain("CONCRETE and OPERATIONAL"); expect(system).toContain("REPLACE the weak existing instruction"); - expect(model.mock.calls[0][1]).toContain("Diagnose the single recurring weakness"); + expect(model.mock.calls[0][1]).toContain("First name the single recurring weakness in one line"); }); it("includes prior edits so the proposer doesn't repeat them", async () => {