From 11021a3806b4ccd39249cc0174fa38fa3f87e43d Mon Sep 17 00:00:00 2001
From: khustup2 <sasun@activeloop.ai>
Date: Sun, 7 Jun 2026 06:16:59 +0000
Subject: [PATCH 1/4] feat(skillopt): proposer targets root-cause + anchors
 edits, not append-at-end

The proposer appended generic guidance to the end of long skill docs, which
diluted rather than fixed (and lost the A/B against the current body). It now
diagnoses the single recurring weakness first, then requires each edit to be
concrete/operational and anchored into the relevant existing section (or to
replace the weak existing instruction).

On a 4-skill benchmark of real failing org skills, an independent judge prefers
the proposed edit over the current body 3/4, up from 2/4 before.
---
 src/skillify/skill-proposer.ts | 26 ++++++++++++++++++--------
 1 file changed, 18 insertions(+), 8 deletions(-)
diff --git a/src/skillify/skill-proposer.ts b/src/skillify/skill-proposer.ts
index 1032b1bd..99b4d951 100644
--- a/src/skillify/skill-proposer.ts
+++ b/src/skillify/skill-proposer.ts
@@ -26,13 +26,23 @@ export interface ProposeConfig {
 }
 
 const SYSTEM =
-  "You improve an engineering SKILL document that has been producing repeated, " +
-  "confirmed failures. Diagnose the SINGLE recurring weakness behind the failures " +
-  "and propose a SMALL set of structured edits that fix it. Do NOT rewrite the " +
-  `whole doc, and do NOT touch anything between ${SU_START} and ${SU_END}. Reply ` +
-  'with ONLY a JSON array of edits, each: {"op":"append|insert_after|replace|' +
-  'delete","target":"<exact existing text to anchor on; required for ' +
-  'insert_after/replace/delete>","content":"<new text; required for ' +
+  "You improve an engineering SKILL document that produced repeated, confirmed " +
+  "failures. Work in two steps: (1) diagnose the SINGLE recurring weakness — the " +
+  "specific thing the doc told the agent to do, or failed to tell it, that caused " +
+  "the pushback; (2) propose a SMALL set of structured edits that fix exactly that.\n" +
+  "Every edit MUST be:\n" +
+  "- CONCRETE and OPERATIONAL: a specific rule, check, command, or step the agent " +
+  "can follow — never vague advice ('be careful', 'communicate clearly').\n" +
+  "- PLACED WHERE IT FIRES: anchor it to the RELEVANT existing section " +
+  "(insert_after that heading), or REPLACE the weak existing instruction that " +
+  "allowed the failure. Do NOT just append a paragraph at the end of a long doc — " +
+  "that gets skipped.\n" +
+  "- TRACEABLE: each edit must directly prevent a specific failure listed below.\n" +
+  "Prefer strengthening/replacing an existing instruction over adding new text. Do " +
+  `NOT rewrite the whole doc, and do NOT touch anything between ${SU_START} and ` +
+  `${SU_END}. Reply with ONLY a JSON array of edits, each: {"op":"append|` +
+  'insert_after|replace|delete","target":"<exact existing text to anchor on; ' +
+  'required for insert_after/replace/delete>","content":"<new text; required for ' +
   'append/insert_after/replace>"}. Prefer the smallest change that fixes the weakness.';
 
 function buildUserPrompt(body: string, failures: string[], priorEdits: string[]): string {
@@ -40,7 +50,7 @@ function buildUserPrompt(body: string, failures: string[], priorEdits: string[])
   const prior = priorEdits.length
     ? `\n\nALREADY TRIED for this skill on earlier runs (do NOT repeat these — propose something different, or nothing):\n${priorEdits.slice(0, 12).map((p) => `- ${p}`).join("\n")}`
     : "";
-  return `CURRENT SKILL:\n${body}\n\nCONFIRMED FAILURES it produced (user pushed back AND a judge confirmed the task was not accomplished):\n${cases}${prior}\n\nPropose the bounded edits. JSON array only.`;
+  return `CURRENT SKILL:\n${body}\n\nCONFIRMED FAILURES it produced (user pushed back AND a judge confirmed the task was not accomplished):\n${cases}${prior}\n\nFirst name the single recurring weakness in one line, then output the edits that anchor the fix into the relevant existing section. JSON array only.`;
 }
 
 const OPS = new Set<EditOp>(["append", "insert_after", "replace", "delete"]);

From 2840a1c0955b46047ab1cc377c5fde26e76b2e12 Mon Sep 17 00:00:00 2001
From: khustup2 <sasun@activeloop.ai>
Date: Sun, 7 Jun 2026 07:07:37 +0000
Subject: [PATCH 2/4] test(skillopt): cover proposer steering toward concrete,
 anchored edits

---
 tests/shared/skill-proposer.test.ts | 12 ++++++++++++
 1 file changed, 12 insertions(+)

diff --git a/tests/shared/skill-proposer.test.ts b/tests/shared/skill-proposer.test.ts
index cb1a1c00..a1506c73 100644
--- a/tests/shared/skill-proposer.test.ts
+++ b/tests/shared/skill-proposer.test.ts
@@ -35,6 +35,18 @@ describe("proposeSkillEdit", () => {
     expect(model.mock.calls[0][1]).toContain("CONFIRMED FAILURES");
   });
 
+  it("steers toward concrete, anchored edits — not append-at-end", async () => {
+    const model = vi.fn(async (_s: string, _u: string) =>
+      '[{"op":"insert_after","target":"## Rules","content":"0. verify on the real client first"}]');
+    const p = await proposeSkillEdit(body, failures, { model });
+    expect(p.changed).toBe(true);
+    const system = model.mock.calls[0][0];
+    expect(system).toMatch(/anchor/i);            // place the fix where it fires
+    expect(system).toMatch(/concrete|operational/i);
+    expect(system).toMatch(/replace the weak/i);  // strengthen, don't only append
+    expect(model.mock.calls[0][1]).toMatch(/name the single recurring weakness/i);
+  });
+
   it("enforces the edit budget", async () => {
     const model = vi.fn(async (_s: string, _u: string) =>
       '[{"op":"append","content":"a"},{"op":"append","content":"b"},{"op":"append","content":"c"}]');

From 1e5f58d64c3983c93999cdc22d4fee9edc75d34f Mon Sep 17 00:00:00 2001
From: khustup2 <sasun@activeloop.ai>
Date: Sun, 7 Jun 2026 15:33:41 +0000
Subject: [PATCH 3/4] fix(skillopt): reconcile proposer output contract
 (JSON-only) + tighten tests/coverage

- Resolve SYSTEM/user-prompt contradiction (CodeRabbit): output stays JSON-only;
  diagnosis is an internal step, not emitted (also avoids a bracketed weakness
  line breaking parseEdits).
- Assert exact prompt fragments instead of broad regexes.
- Cover the priorEdits branch.
---
 src/skillify/skill-proposer.ts      |  2 +-
 tests/shared/skill-proposer.test.ts | 16 ++++++++++++----
 2 files changed, 13 insertions(+), 5 deletions(-)

diff --git a/src/skillify/skill-proposer.ts b/src/skillify/skill-proposer.ts
index 99b4d951..3cf4c68f 100644
--- a/src/skillify/skill-proposer.ts
+++ b/src/skillify/skill-proposer.ts
@@ -50,7 +50,7 @@ function buildUserPrompt(body: string, failures: string[], priorEdits: string[])
   const prior = priorEdits.length
     ? `\n\nALREADY TRIED for this skill on earlier runs (do NOT repeat these — propose something different, or nothing):\n${priorEdits.slice(0, 12).map((p) => `- ${p}`).join("\n")}`
     : "";
-  return `CURRENT SKILL:\n${body}\n\nCONFIRMED FAILURES it produced (user pushed back AND a judge confirmed the task was not accomplished):\n${cases}${prior}\n\nFirst name the single recurring weakness in one line, then output the edits that anchor the fix into the relevant existing section. JSON array only.`;
+  return `CURRENT SKILL:\n${body}\n\nCONFIRMED FAILURES it produced (user pushed back AND a judge confirmed the task was not accomplished):\n${cases}${prior}\n\nDiagnose the single recurring weakness, then output the edits that anchor the fix into the relevant existing section. JSON array only.`;
 }
 
 const OPS = new Set<EditOp>(["append", "insert_after", "replace", "delete"]);
diff --git a/tests/shared/skill-proposer.test.ts b/tests/shared/skill-proposer.test.ts
index a1506c73..01f54a65 100644
--- a/tests/shared/skill-proposer.test.ts
+++ b/tests/shared/skill-proposer.test.ts
@@ -41,10 +41,18 @@ describe("proposeSkillEdit", () => {
     const p = await proposeSkillEdit(body, failures, { model });
     expect(p.changed).toBe(true);
     const system = model.mock.calls[0][0];
-    expect(system).toMatch(/anchor/i);            // place the fix where it fires
-    expect(system).toMatch(/concrete|operational/i);
-    expect(system).toMatch(/replace the weak/i);  // strengthen, don't only append
-    expect(model.mock.calls[0][1]).toMatch(/name the single recurring weakness/i);
+    expect(system).toContain("anchor it to the RELEVANT existing section");
+    expect(system).toContain("CONCRETE and OPERATIONAL");
+    expect(system).toContain("REPLACE the weak existing instruction");
+    expect(model.mock.calls[0][1]).toContain("Diagnose the single recurring weakness");
+  });
+
+  it("includes prior edits so the proposer doesn't repeat them", async () => {
+    const model = vi.fn(async (_s: string, _u: string) => "[]");
+    await proposeSkillEdit(body, failures, { model, priorEdits: ["append: verify via API"] });
+    const user = model.mock.calls[0][1];
+    expect(user).toContain("ALREADY TRIED");
+    expect(user).toContain("append: verify via API");
   });
 
   it("enforces the edit budget", async () => {

From a6055ca7b53e75ccf624d7c3253475edaea1fa1a Mon Sep 17 00:00:00 2001
From: khustup2 <sasun@activeloop.ai>
Date: Sun, 7 Jun 2026 15:38:00 +0000
Subject: [PATCH 4/4] fix(skillopt): consistent two-part proposer contract +
 robust array parse
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The JSON-only reconcile removed the 'name the weakness first' step, which was
carrying the benchmark lift. Restore it as an explicit TWO-part contract (weakness
line, then JSON array) — SYSTEM and user prompt now agree — and harden parseEdits
to extract the array via a balanced back-scan (robust to a bracketed weakness
line), with a first-[..last-] fallback. Tests cover both paths.
---
 src/skillify/skill-proposer.ts      | 41 +++++++++++++++++++++--------
 tests/shared/skill-proposer.test.ts | 12 ++++++++-
 2 files changed, 41 insertions(+), 12 deletions(-)

diff --git a/src/skillify/skill-proposer.ts b/src/skillify/skill-proposer.ts
index 3cf4c68f..9158c284 100644
--- a/src/skillify/skill-proposer.ts
+++ b/src/skillify/skill-proposer.ts
@@ -40,32 +40,51 @@ const SYSTEM =
   "- TRACEABLE: each edit must directly prevent a specific failure listed below.\n" +
   "Prefer strengthening/replacing an existing instruction over adding new text. Do " +
   `NOT rewrite the whole doc, and do NOT touch anything between ${SU_START} and ` +
-  `${SU_END}. Reply with ONLY a JSON array of edits, each: {"op":"append|` +
-  'insert_after|replace|delete","target":"<exact existing text to anchor on; ' +
-  'required for insert_after/replace/delete>","content":"<new text; required for ' +
-  'append/insert_after/replace>"}. Prefer the smallest change that fixes the weakness.';
+  `${SU_END}. Reply in TWO parts: first ONE line naming the single recurring ` +
+  "weakness, then a JSON array of edits (the array is what gets parsed), each: " +
+  '{"op":"append|insert_after|replace|delete","target":"<exact existing text to ' +
+  'anchor on; required for insert_after/replace/delete>","content":"<new text; ' +
+  'required for append/insert_after/replace>"}. Prefer the smallest change that fixes the weakness.';
 
 function buildUserPrompt(body: string, failures: string[], priorEdits: string[]): string {
   const cases = failures.slice(0, 8).map((f, i) => `${i + 1}. ${f}`).join("\n");
   const prior = priorEdits.length
     ? `\n\nALREADY TRIED for this skill on earlier runs (do NOT repeat these — propose something different, or nothing):\n${priorEdits.slice(0, 12).map((p) => `- ${p}`).join("\n")}`
     : "";
-  return `CURRENT SKILL:\n${body}\n\nCONFIRMED FAILURES it produced (user pushed back AND a judge confirmed the task was not accomplished):\n${cases}${prior}\n\nDiagnose the single recurring weakness, then output the edits that anchor the fix into the relevant existing section. JSON array only.`;
+  return `CURRENT SKILL:\n${body}\n\nCONFIRMED FAILURES it produced (user pushed back AND a judge confirmed the task was not accomplished):\n${cases}${prior}\n\nFirst name the single recurring weakness in one line, then output the JSON array of edits that anchor the fix into the relevant existing section.`;
 }
 
 const OPS = new Set<EditOp>(["append", "insert_after", "replace", "delete"]);
 
+/**
+ * Extract the JSON array from noisy model output. Robust to a leading prose line
+ * (the "weakness" the proposer is asked to state first) even if it contains
+ * brackets: scan back from the last `]` to its balanced `[`, and fall back to
+ * first-`[`..last-`]` if that doesn't parse.
+ */
+function extractArray(s: string): unknown[] | null {
+  const b = s.lastIndexOf("]");
+  if (b === -1) return null;
+  let depth = 0;
+  for (let i = b; i >= 0; i--) {
+    if (s[i] === "]") depth++;
+    else if (s[i] === "[" && --depth === 0) {
+      try { const a = JSON.parse(s.slice(i, b + 1)); if (Array.isArray(a)) return a; } catch { /* fall through */ }
+      break;
+    }
+  }
+  const first = s.indexOf("[");
+  if (first !== -1 && b > first) { try { const a = JSON.parse(s.slice(first, b + 1)); if (Array.isArray(a)) return a; } catch { /* none */ } }
+  return null;
+}
+
 /** Tolerant parse of a JSON array of edits (handles ```fences / surrounding prose). */
 export function parseEdits(raw: string): Edit[] {
   let s = raw.trim();
   const fence = s.match(/```(?:json)?\s*([\s\S]*?)```/);
   if (fence) s = fence[1].trim();
-  const a = s.indexOf("[");
-  const b = s.lastIndexOf("]");
-  if (a === -1 || b <= a) return [];
-  let arr: unknown;
-  try { arr = JSON.parse(s.slice(a, b + 1)); } catch { return []; }
-  if (!Array.isArray(arr)) return [];
+  const arr = extractArray(s);
+  if (!arr) return [];
   const out: Edit[] = [];
   for (const e of arr) {
     if (!e || typeof e !== "object") continue;
diff --git a/tests/shared/skill-proposer.test.ts b/tests/shared/skill-proposer.test.ts
index 01f54a65..97a704c8 100644
--- a/tests/shared/skill-proposer.test.ts
+++ b/tests/shared/skill-proposer.test.ts
@@ -18,6 +18,16 @@ describe("parseEdits", () => {
   it("returns [] when there's no array", () => {
     expect(parseEdits("the model refused")).toEqual([]);
   });
+  it("extracts the edits array past a weakness line that itself contains brackets", () => {
+    const raw = 'Weakness: the doc never says to verify [the real client] before reporting.\n' +
+      '[{"op":"append","content":"verify via API"}]';
+    expect(parseEdits(raw)).toEqual([{ op: "append", content: "verify via API" }]);
+  });
+  it("falls back to first-[..last-] when content holds an unbalanced bracket", () => {
+    // the inner "]" defeats the balanced back-scan; the fallback still parses it
+    const raw = '[{"op":"append","content":"use arr] carefully"}]';
+    expect(parseEdits(raw)).toEqual([{ op: "append", content: "use arr] carefully" }]);
+  });
 });
 
 describe("proposeSkillEdit", () => {
@@ -44,7 +54,7 @@ describe("proposeSkillEdit", () => {
     expect(system).toContain("anchor it to the RELEVANT existing section");
     expect(system).toContain("CONCRETE and OPERATIONAL");
     expect(system).toContain("REPLACE the weak existing instruction");
-    expect(model.mock.calls[0][1]).toContain("Diagnose the single recurring weakness");
+    expect(model.mock.calls[0][1]).toContain("First name the single recurring weakness in one line");
   });
 
   it("includes prior edits so the proposer doesn't repeat them", async () => {