diff --git a/bench/baselines/current.json b/bench/baselines/current.json index 77bfa0d..2211d13 100644 --- a/bench/baselines/current.json +++ b/bench/baselines/current.json @@ -1,23 +1,23 @@ { "version": "1.0.0", - "generated": "2026-02-26T05:31:42.406Z", + "generated": "2026-03-10T00:15:20.299Z", "results": { "basic": { "Coding assistant": { - "ratio": 1.6812907904278462, - "tokenRatio": 1.6729559748427674, + "ratio": 1.9385451505016722, + "tokenRatio": 1.9275362318840579, "compressed": 5, "preserved": 8 }, "Long Q&A": { - "ratio": 6.158536585365853, - "tokenRatio": 6.114164904862579, + "ratio": 4.902912621359223, + "tokenRatio": 4.87689713322091, "compressed": 4, "preserved": 6 }, "Tool-heavy": { - "ratio": 1.2991563919532771, - "tokenRatio": 1.2946428571428572, + "ratio": 1.4128440366972477, + "tokenRatio": 1.4043583535108959, "compressed": 2, "preserved": 16 }, @@ -28,8 +28,8 @@ "preserved": 7 }, "Deep conversation": { - "ratio": 2.124913733609386, - "tokenRatio": 2.1241305510968433, + "ratio": 2.5041568769202964, + "tokenRatio": 2.4905897114178166, "compressed": 50, "preserved": 1 }, @@ -40,21 +40,21 @@ "preserved": 11 }, "Structured content": { - "ratio": 1.9338990620812864, - "tokenRatio": 1.9241486068111455, + "ratio": 1.8559794256322333, + "tokenRatio": 1.8469539375928679, "compressed": 2, "preserved": 10 }, "Agentic coding session": { - "ratio": 1.428351309707242, - "tokenRatio": 1.4258962011771001, + "ratio": 1.4768201370081249, + "tokenRatio": 1.4740044247787611, "compressed": 2, "preserved": 31 } }, "tokenBudget": { "Deep conversation|dedup=false": { - "tokenCount": 3738, + "tokenCount": 3188, "fits": false, "recencyWindow": 0, "compressed": 50, @@ -62,7 +62,7 @@ "deduped": 0 }, "Deep conversation|dedup=true": { - "tokenCount": 3738, + "tokenCount": 3188, "fits": false, "recencyWindow": 0, "compressed": 50, @@ -70,7 +70,7 @@ "deduped": 0 }, "Agentic coding session|dedup=false": { - "tokenCount": 2345, + "tokenCount": 2223, "fits": false, "recencyWindow": 0, "compressed": 4, @@ -78,7 +78,7 @@ "deduped": 0 }, "Agentic coding session|dedup=true": { - "tokenCount": 1957, + "tokenCount": 1900, "fits": true, "recencyWindow": 9, "compressed": 1, @@ -88,24 +88,24 @@ }, "dedup": { "Coding assistant": { - "rw0Base": 1.6812907904278462, - "rw0Dup": 1.6812907904278462, - "rw4Base": 1.5104234527687297, - "rw4Dup": 1.5104234527687297, + "rw0Base": 1.9385451505016722, + "rw0Dup": 1.9385451505016722, + "rw4Base": 1.6061655697956356, + "rw4Dup": 1.6061655697956356, "deduped": 0 }, "Long Q&A": { - "rw0Base": 5.139949109414759, - "rw0Dup": 6.158536585365853, - "rw4Base": 1.9024298361273309, - "rw4Dup": 2.0264847512038524, + "rw0Base": 4, + "rw0Dup": 4.902912621359223, + "rw4Base": 1.76296037702915, + "rw4Dup": 1.918693009118541, "deduped": 1 }, "Tool-heavy": { - "rw0Base": 1.2991563919532771, - "rw0Dup": 1.2991563919532771, - "rw4Base": 1.2991563919532771, - "rw4Dup": 1.2991563919532771, + "rw0Base": 1.4128440366972477, + "rw0Dup": 1.4128440366972477, + "rw4Base": 1.4128440366972477, + "rw4Dup": 1.4128440366972477, "deduped": 0 }, "Short conversation": { @@ -116,10 +116,10 @@ "deduped": 0 }, "Deep conversation": { - "rw0Base": 2.124913733609386, - "rw0Dup": 2.124913733609386, - "rw4Base": 1.9527165104643789, - "rw4Dup": 1.9527165104643789, + "rw0Base": 2.5041568769202964, + "rw0Dup": 2.5041568769202964, + "rw4Base": 2.2394536932277354, + "rw4Dup": 2.2394536932277354, "deduped": 0 }, "Technical explanation": { @@ -130,17 +130,17 @@ "deduped": 0 }, "Structured content": { - "rw0Base": 1.9338990620812864, - "rw0Dup": 1.9338990620812864, - "rw4Base": 1.373730964467005, - "rw4Dup": 1.373730964467005, + "rw0Base": 1.8559794256322333, + "rw0Dup": 1.8559794256322333, + "rw4Base": 1.3339494762784967, + "rw4Dup": 1.3339494762784967, "deduped": 0 }, "Agentic coding session": { - "rw0Base": 1.1374233128834357, - "rw0Dup": 1.428351309707242, - "rw4Base": 1.1374233128834357, - "rw4Dup": 1.428351309707242, + "rw0Base": 1.2001553599171413, + "rw0Dup": 1.4768201370081249, + "rw4Base": 1.2001553599171413, + "rw4Dup": 1.4768201370081249, "deduped": 4 } }, @@ -148,17 +148,17 @@ "Coding assistant": { "exact": 0, "fuzzy": 0, - "ratio": 1.6812907904278462 + "ratio": 1.9385451505016722 }, "Long Q&A": { "exact": 1, "fuzzy": 0, - "ratio": 6.158536585365853 + "ratio": 4.902912621359223 }, "Tool-heavy": { "exact": 0, "fuzzy": 0, - "ratio": 1.2991563919532771 + "ratio": 1.4128440366972477 }, "Short conversation": { "exact": 0, @@ -168,7 +168,7 @@ "Deep conversation": { "exact": 0, "fuzzy": 0, - "ratio": 2.124913733609386 + "ratio": 2.5041568769202964 }, "Technical explanation": { "exact": 0, @@ -178,22 +178,22 @@ "Structured content": { "exact": 0, "fuzzy": 0, - "ratio": 1.9338990620812864 + "ratio": 1.8559794256322333 }, "Agentic coding session": { "exact": 4, "fuzzy": 2, - "ratio": 2.229973538609574 + "ratio": 2.3504056795131847 } }, "bundleSize": { "classify.js": { - "bytes": 7724, - "gzipBytes": 3250 + "bytes": 8074, + "gzipBytes": 3443 }, "compress.js": { - "bytes": 33941, - "gzipBytes": 8721 + "bytes": 34466, + "gzipBytes": 8914 }, "dedup.js": { "bytes": 10260, @@ -216,8 +216,8 @@ "gzipBytes": 31 }, "total": { - "bytes": 57498, - "gzipBytes": 16952 + "bytes": 58373, + "gzipBytes": 17338 } } } diff --git a/bench/baselines/history/v1.0.0.json b/bench/baselines/history/v1.0.0.json index 77bfa0d..2211d13 100644 --- a/bench/baselines/history/v1.0.0.json +++ b/bench/baselines/history/v1.0.0.json @@ -1,23 +1,23 @@ { "version": "1.0.0", - "generated": "2026-02-26T05:31:42.406Z", + "generated": "2026-03-10T00:15:20.299Z", "results": { "basic": { "Coding assistant": { - "ratio": 1.6812907904278462, - "tokenRatio": 1.6729559748427674, + "ratio": 1.9385451505016722, + "tokenRatio": 1.9275362318840579, "compressed": 5, "preserved": 8 }, "Long Q&A": { - "ratio": 6.158536585365853, - "tokenRatio": 6.114164904862579, + "ratio": 4.902912621359223, + "tokenRatio": 4.87689713322091, "compressed": 4, "preserved": 6 }, "Tool-heavy": { - "ratio": 1.2991563919532771, - "tokenRatio": 1.2946428571428572, + "ratio": 1.4128440366972477, + "tokenRatio": 1.4043583535108959, "compressed": 2, "preserved": 16 }, @@ -28,8 +28,8 @@ "preserved": 7 }, "Deep conversation": { - "ratio": 2.124913733609386, - "tokenRatio": 2.1241305510968433, + "ratio": 2.5041568769202964, + "tokenRatio": 2.4905897114178166, "compressed": 50, "preserved": 1 }, @@ -40,21 +40,21 @@ "preserved": 11 }, "Structured content": { - "ratio": 1.9338990620812864, - "tokenRatio": 1.9241486068111455, + "ratio": 1.8559794256322333, + "tokenRatio": 1.8469539375928679, "compressed": 2, "preserved": 10 }, "Agentic coding session": { - "ratio": 1.428351309707242, - "tokenRatio": 1.4258962011771001, + "ratio": 1.4768201370081249, + "tokenRatio": 1.4740044247787611, "compressed": 2, "preserved": 31 } }, "tokenBudget": { "Deep conversation|dedup=false": { - "tokenCount": 3738, + "tokenCount": 3188, "fits": false, "recencyWindow": 0, "compressed": 50, @@ -62,7 +62,7 @@ "deduped": 0 }, "Deep conversation|dedup=true": { - "tokenCount": 3738, + "tokenCount": 3188, "fits": false, "recencyWindow": 0, "compressed": 50, @@ -70,7 +70,7 @@ "deduped": 0 }, "Agentic coding session|dedup=false": { - "tokenCount": 2345, + "tokenCount": 2223, "fits": false, "recencyWindow": 0, "compressed": 4, @@ -78,7 +78,7 @@ "deduped": 0 }, "Agentic coding session|dedup=true": { - "tokenCount": 1957, + "tokenCount": 1900, "fits": true, "recencyWindow": 9, "compressed": 1, @@ -88,24 +88,24 @@ }, "dedup": { "Coding assistant": { - "rw0Base": 1.6812907904278462, - "rw0Dup": 1.6812907904278462, - "rw4Base": 1.5104234527687297, - "rw4Dup": 1.5104234527687297, + "rw0Base": 1.9385451505016722, + "rw0Dup": 1.9385451505016722, + "rw4Base": 1.6061655697956356, + "rw4Dup": 1.6061655697956356, "deduped": 0 }, "Long Q&A": { - "rw0Base": 5.139949109414759, - "rw0Dup": 6.158536585365853, - "rw4Base": 1.9024298361273309, - "rw4Dup": 2.0264847512038524, + "rw0Base": 4, + "rw0Dup": 4.902912621359223, + "rw4Base": 1.76296037702915, + "rw4Dup": 1.918693009118541, "deduped": 1 }, "Tool-heavy": { - "rw0Base": 1.2991563919532771, - "rw0Dup": 1.2991563919532771, - "rw4Base": 1.2991563919532771, - "rw4Dup": 1.2991563919532771, + "rw0Base": 1.4128440366972477, + "rw0Dup": 1.4128440366972477, + "rw4Base": 1.4128440366972477, + "rw4Dup": 1.4128440366972477, "deduped": 0 }, "Short conversation": { @@ -116,10 +116,10 @@ "deduped": 0 }, "Deep conversation": { - "rw0Base": 2.124913733609386, - "rw0Dup": 2.124913733609386, - "rw4Base": 1.9527165104643789, - "rw4Dup": 1.9527165104643789, + "rw0Base": 2.5041568769202964, + "rw0Dup": 2.5041568769202964, + "rw4Base": 2.2394536932277354, + "rw4Dup": 2.2394536932277354, "deduped": 0 }, "Technical explanation": { @@ -130,17 +130,17 @@ "deduped": 0 }, "Structured content": { - "rw0Base": 1.9338990620812864, - "rw0Dup": 1.9338990620812864, - "rw4Base": 1.373730964467005, - "rw4Dup": 1.373730964467005, + "rw0Base": 1.8559794256322333, + "rw0Dup": 1.8559794256322333, + "rw4Base": 1.3339494762784967, + "rw4Dup": 1.3339494762784967, "deduped": 0 }, "Agentic coding session": { - "rw0Base": 1.1374233128834357, - "rw0Dup": 1.428351309707242, - "rw4Base": 1.1374233128834357, - "rw4Dup": 1.428351309707242, + "rw0Base": 1.2001553599171413, + "rw0Dup": 1.4768201370081249, + "rw4Base": 1.2001553599171413, + "rw4Dup": 1.4768201370081249, "deduped": 4 } }, @@ -148,17 +148,17 @@ "Coding assistant": { "exact": 0, "fuzzy": 0, - "ratio": 1.6812907904278462 + "ratio": 1.9385451505016722 }, "Long Q&A": { "exact": 1, "fuzzy": 0, - "ratio": 6.158536585365853 + "ratio": 4.902912621359223 }, "Tool-heavy": { "exact": 0, "fuzzy": 0, - "ratio": 1.2991563919532771 + "ratio": 1.4128440366972477 }, "Short conversation": { "exact": 0, @@ -168,7 +168,7 @@ "Deep conversation": { "exact": 0, "fuzzy": 0, - "ratio": 2.124913733609386 + "ratio": 2.5041568769202964 }, "Technical explanation": { "exact": 0, @@ -178,22 +178,22 @@ "Structured content": { "exact": 0, "fuzzy": 0, - "ratio": 1.9338990620812864 + "ratio": 1.8559794256322333 }, "Agentic coding session": { "exact": 4, "fuzzy": 2, - "ratio": 2.229973538609574 + "ratio": 2.3504056795131847 } }, "bundleSize": { "classify.js": { - "bytes": 7724, - "gzipBytes": 3250 + "bytes": 8074, + "gzipBytes": 3443 }, "compress.js": { - "bytes": 33941, - "gzipBytes": 8721 + "bytes": 34466, + "gzipBytes": 8914 }, "dedup.js": { "bytes": 10260, @@ -216,8 +216,8 @@ "gzipBytes": 31 }, "total": { - "bytes": 57498, - "gzipBytes": 16952 + "bytes": 58373, + "gzipBytes": 17338 } } } diff --git a/docs/benchmark-results.md b/docs/benchmark-results.md index 458513a..f9d1d66 100644 --- a/docs/benchmark-results.md +++ b/docs/benchmark-results.md @@ -4,17 +4,17 @@ _Auto-generated by `npm run bench:save`. Do not edit manually._ -**v1.0.0** · Generated: 2026-02-26 +**v1.0.0** · Generated: 2026-03-10 -![avg ratio](https://img.shields.io/badge/avg%20ratio-2.08x-blue) ![best](https://img.shields.io/badge/best-6.16x-blue) ![scenarios](https://img.shields.io/badge/scenarios-8-blue) ![round-trip](https://img.shields.io/badge/round--trip-all_PASS-brightgreen) ![gzip](https://img.shields.io/badge/gzip-16.6%20KB-blue) +![avg ratio](https://img.shields.io/badge/avg%20ratio-2.01x-blue) ![best](https://img.shields.io/badge/best-4.90x-blue) ![scenarios](https://img.shields.io/badge/scenarios-8-blue) ![round-trip](https://img.shields.io/badge/round--trip-all_PASS-brightgreen) ![gzip](https://img.shields.io/badge/gzip-16.9%20KB-blue) ## Summary | Metric | Value | | -------------------- | -------- | | Scenarios | 8 | -| Average compression | 2.08x | -| Best compression | 6.16x | +| Average compression | 2.01x | +| Best compression | 4.90x | | Round-trip integrity | all PASS | ```mermaid @@ -25,26 +25,26 @@ pie title "Message Outcomes" ## Compression by Scenario -> **8 scenarios** · **2.08x** avg ratio · **1.00x** – **6.16x** range · all round-trips PASS +> **8 scenarios** · **2.01x** avg ratio · **1.00x** – **4.90x** range · all round-trips PASS ```mermaid xychart-beta title "Compression Ratio by Scenario" x-axis ["Coding", "Long Q&A", "Tool-heavy", "Short", "Deep", "Technical", "Structured", "Agentic"] y-axis "Char Ratio" - bar [1.68, 6.16, 1.30, 1.00, 2.12, 1.00, 1.93, 1.43] + bar [1.94, 4.90, 1.41, 1.00, 2.50, 1.00, 1.86, 1.48] ``` | Scenario | Ratio | Reduction | Token Ratio | Messages | Compressed | Preserved | | ---------------------- | ----: | --------: | ----------: | -------: | ---------: | --------: | -| Coding assistant | 1.68 | 41% | 1.67 | 13 | 5 | 8 | -| Long Q&A | 6.16 | 84% | 6.11 | 10 | 4 | 6 | -| Tool-heavy | 1.30 | 23% | 1.29 | 18 | 2 | 16 | +| Coding assistant | 1.94 | 48% | 1.93 | 13 | 5 | 8 | +| Long Q&A | 4.90 | 80% | 4.88 | 10 | 4 | 6 | +| Tool-heavy | 1.41 | 29% | 1.40 | 18 | 2 | 16 | | Short conversation | 1.00 | 0% | 1.00 | 7 | 0 | 7 | -| Deep conversation | 2.12 | 53% | 2.12 | 51 | 50 | 1 | +| Deep conversation | 2.50 | 60% | 2.49 | 51 | 50 | 1 | | Technical explanation | 1.00 | 0% | 1.00 | 11 | 0 | 11 | -| Structured content | 1.93 | 48% | 1.92 | 12 | 2 | 10 | -| Agentic coding session | 1.43 | 30% | 1.43 | 33 | 2 | 31 | +| Structured content | 1.86 | 46% | 1.85 | 12 | 2 | 10 | +| Agentic coding session | 1.48 | 32% | 1.47 | 33 | 2 | 31 | ## Deduplication Impact @@ -53,35 +53,35 @@ xychart-beta title "Deduplication Impact (recencyWindow=0)" x-axis ["Long Q&A", "Agentic"] y-axis "Char Ratio" - bar [5.14, 1.14] - bar [6.16, 1.43] + bar [4.00, 1.20] + bar [4.90, 1.48] ``` _First bar: no dedup · Second bar: with dedup_ | Scenario | No Dedup (rw=0) | Dedup (rw=0) | No Dedup (rw=4) | Dedup (rw=4) | Deduped | | ---------------------- | --------------: | -----------: | --------------: | -----------: | ------: | -| Coding assistant | 1.68 | 1.68 | 1.51 | 1.51 | 0 | -| Long Q&A | 5.14 | 6.16 | 1.90 | 2.03 | 1 | -| Tool-heavy | 1.30 | 1.30 | 1.30 | 1.30 | 0 | +| Coding assistant | 1.94 | 1.94 | 1.61 | 1.61 | 0 | +| Long Q&A | 4.00 | 4.90 | 1.76 | 1.92 | 1 | +| Tool-heavy | 1.41 | 1.41 | 1.41 | 1.41 | 0 | | Short conversation | 1.00 | 1.00 | 1.00 | 1.00 | 0 | -| Deep conversation | 2.12 | 2.12 | 1.95 | 1.95 | 0 | +| Deep conversation | 2.50 | 2.50 | 2.24 | 2.24 | 0 | | Technical explanation | 1.00 | 1.00 | 1.00 | 1.00 | 0 | -| Structured content | 1.93 | 1.93 | 1.37 | 1.37 | 0 | -| Agentic coding session | 1.14 | 1.43 | 1.14 | 1.43 | 4 | +| Structured content | 1.86 | 1.86 | 1.33 | 1.33 | 0 | +| Agentic coding session | 1.20 | 1.48 | 1.20 | 1.48 | 4 | ### Fuzzy Dedup | Scenario | Exact Deduped | Fuzzy Deduped | Ratio | vs Base | | ---------------------- | ------------: | ------------: | ----: | ------: | -| Coding assistant | 0 | 0 | 1.68 | - | -| Long Q&A | 1 | 0 | 6.16 | - | -| Tool-heavy | 0 | 0 | 1.30 | - | +| Coding assistant | 0 | 0 | 1.94 | - | +| Long Q&A | 1 | 0 | 4.90 | - | +| Tool-heavy | 0 | 0 | 1.41 | - | | Short conversation | 0 | 0 | 1.00 | - | -| Deep conversation | 0 | 0 | 2.12 | - | +| Deep conversation | 0 | 0 | 2.50 | - | | Technical explanation | 0 | 0 | 1.00 | - | -| Structured content | 0 | 0 | 1.93 | - | -| Agentic coding session | 4 | 2 | 2.23 | +56% | +| Structured content | 0 | 0 | 1.86 | - | +| Agentic coding session | 4 | 2 | 2.35 | +59% | ## Token Budget @@ -89,10 +89,10 @@ Target: **2000 tokens** · 1/4 fit | Scenario | Dedup | Tokens | Fits | recencyWindow | Compressed | Preserved | Deduped | | ---------------------- | ----- | -----: | ---- | ------------: | ---------: | --------: | ------: | -| Deep conversation | no | 3738 | no | 0 | 50 | 1 | 0 | -| Deep conversation | yes | 3738 | no | 0 | 50 | 1 | 0 | -| Agentic coding session | no | 2345 | no | 0 | 4 | 33 | 0 | -| Agentic coding session | yes | 1957 | yes | 9 | 1 | 32 | 4 | +| Deep conversation | no | 3188 | no | 0 | 50 | 1 | 0 | +| Deep conversation | yes | 3188 | no | 0 | 50 | 1 | 0 | +| Agentic coding session | no | 2223 | no | 0 | 4 | 33 | 0 | +| Agentic coding session | yes | 1900 | yes | 9 | 1 | 32 | 4 | ## Bundle Size @@ -100,14 +100,14 @@ Target: **2000 tokens** · 1/4 fit | File | Size | Gzip | | ------------- | ------: | ------: | -| classify.js | 7.5 KB | 3.2 KB | -| compress.js | 33.1 KB | 8.5 KB | +| classify.js | 7.9 KB | 3.4 KB | +| compress.js | 33.7 KB | 8.7 KB | | dedup.js | 10.0 KB | 2.8 KB | | expand.js | 2.7 KB | 934 B | | index.js | 225 B | 159 B | | summarizer.js | 2.5 KB | 993 B | | types.js | 11 B | 31 B | -| **total** | 56.2 KB | 16.6 KB | +| **total** | 57.0 KB | 16.9 KB | ## LLM vs Deterministic @@ -116,26 +116,26 @@ Target: **2000 tokens** · 1/4 fit ``` Deterministic vs ollama/llama3.2 -Coding assistant Det ████████░░░░░░░░░░░░░░░░░░░░░░ 1.68x - LLM ████████░░░░░░░░░░░░░░░░░░░░░░ 1.55x +Coding assistant Det ████████████░░░░░░░░░░░░░░░░░░ 1.94x + LLM █████████░░░░░░░░░░░░░░░░░░░░░ 1.55x -Long Q&A Det ██████████████████████████████ 6.16x - LLM ██████████████████████░░░░░░░░ 4.49x +Long Q&A Det ██████████████████████████████ 4.90x + LLM ███████████████████████████░░░ 4.49x -Tool-heavy Det ██████░░░░░░░░░░░░░░░░░░░░░░░░ 1.30x - LLM ██████░░░░░░░░░░░░░░░░░░░░░░░░ 1.28x +Tool-heavy Det █████████░░░░░░░░░░░░░░░░░░░░░ 1.41x + LLM ████████░░░░░░░░░░░░░░░░░░░░░░ 1.28x -Deep conversation Det ██████████░░░░░░░░░░░░░░░░░░░░ 2.12x - LLM ████████████████░░░░░░░░░░░░░░ 3.28x ★ +Deep conversation Det ███████████████░░░░░░░░░░░░░░░ 2.50x + LLM ████████████████████░░░░░░░░░░ 3.28x ★ -Technical explanation Det █████░░░░░░░░░░░░░░░░░░░░░░░░░ 1.00x - LLM █████░░░░░░░░░░░░░░░░░░░░░░░░░ 1.00x +Technical explanation Det ██████░░░░░░░░░░░░░░░░░░░░░░░░ 1.00x + LLM ██████░░░░░░░░░░░░░░░░░░░░░░░░ 1.00x -Structured content Det █████████░░░░░░░░░░░░░░░░░░░░░ 1.93x - LLM ███████░░░░░░░░░░░░░░░░░░░░░░░ 1.46x +Structured content Det ███████████░░░░░░░░░░░░░░░░░░░ 1.86x + LLM █████████░░░░░░░░░░░░░░░░░░░░░ 1.46x -Agentic coding session Det ███████░░░░░░░░░░░░░░░░░░░░░░░ 1.43x - LLM ███████░░░░░░░░░░░░░░░░░░░░░░░ 1.40x +Agentic coding session Det █████████░░░░░░░░░░░░░░░░░░░░░ 1.48x + LLM █████████░░░░░░░░░░░░░░░░░░░░░ 1.40x ★ = LLM wins ``` @@ -143,26 +143,26 @@ Agentic coding session Det ███████░░░░░░░░░░ ``` Deterministic vs openai/gpt-4.1-mini -Coding assistant Det ████████░░░░░░░░░░░░░░░░░░░░░░ 1.68x - LLM ████████░░░░░░░░░░░░░░░░░░░░░░ 1.64x +Coding assistant Det ███████████░░░░░░░░░░░░░░░░░░░ 1.94x + LLM █████████░░░░░░░░░░░░░░░░░░░░░ 1.64x -Long Q&A Det ██████████████████████████████ 6.16x - LLM ██████████████████████████░░░░ 5.37x +Long Q&A Det ███████████████████████████░░░ 4.90x + LLM ██████████████████████████████ 5.37x ★ -Tool-heavy Det ██████░░░░░░░░░░░░░░░░░░░░░░░░ 1.30x - LLM █████░░░░░░░░░░░░░░░░░░░░░░░░░ 1.12x +Tool-heavy Det ████████░░░░░░░░░░░░░░░░░░░░░░ 1.41x + LLM ██████░░░░░░░░░░░░░░░░░░░░░░░░ 1.12x -Deep conversation Det ██████████░░░░░░░░░░░░░░░░░░░░ 2.12x - LLM ████████████░░░░░░░░░░░░░░░░░░ 2.37x ★ +Deep conversation Det ██████████████░░░░░░░░░░░░░░░░ 2.50x + LLM █████████████░░░░░░░░░░░░░░░░░ 2.37x -Technical explanation Det █████░░░░░░░░░░░░░░░░░░░░░░░░░ 1.00x - LLM █████░░░░░░░░░░░░░░░░░░░░░░░░░ 1.00x +Technical explanation Det ██████░░░░░░░░░░░░░░░░░░░░░░░░ 1.00x + LLM ██████░░░░░░░░░░░░░░░░░░░░░░░░ 1.00x -Structured content Det █████████░░░░░░░░░░░░░░░░░░░░░ 1.93x - LLM ██████░░░░░░░░░░░░░░░░░░░░░░░░ 1.29x +Structured content Det ██████████░░░░░░░░░░░░░░░░░░░░ 1.86x + LLM ███████░░░░░░░░░░░░░░░░░░░░░░░ 1.29x -Agentic coding session Det ███████░░░░░░░░░░░░░░░░░░░░░░░ 1.43x - LLM ███████░░░░░░░░░░░░░░░░░░░░░░░ 1.43x +Agentic coding session Det ████████░░░░░░░░░░░░░░░░░░░░░░ 1.48x + LLM ████████░░░░░░░░░░░░░░░░░░░░░░ 1.43x ★ = LLM wins ``` diff --git a/docs/compression-pipeline.md b/docs/compression-pipeline.md index f894cd4..da9c5ec 100644 --- a/docs/compression-pipeline.md +++ b/docs/compression-pipeline.md @@ -100,7 +100,7 @@ The `summarize` function uses sentence scoring: 5. Re-sort selected sentences by original position to preserve reading order 6. Join with `...` separator -Budget: 200 chars if input < 600 chars, 400 chars otherwise. +Budget scales adaptively: max(200, min(round(length × 0.3), 600)). Short content gets 200 chars, long content up to 600. ### Entity extraction @@ -111,14 +111,14 @@ After summarizing, `extractEntities` pulls out key identifiers from the original - Vowelless abbreviations - Numbers with units/context -Up to 10 entities are appended as `| entities: foo, bar, baz`. +Entities scale with content length (3–15) and are appended as `| entities: foo, bar, baz`. ### Code-split processing Messages containing code fences with significant prose (>= 80 chars) get split: 1. `splitCodeAndProse` extracts code fences and surrounding prose separately -2. Prose is summarized (budget: 200 if < 600 chars, else 400) +2. Prose is summarized (budget scales adaptively with prose length) 3. Code fences are preserved verbatim 4. Result: `[summary: ...]\n\n```code here```` diff --git a/docs/preservation-rules.md b/docs/preservation-rules.md index 1060e07..bba9bdf 100644 --- a/docs/preservation-rules.md +++ b/docs/preservation-rules.md @@ -68,11 +68,11 @@ Soft T0 content is still compressible because the entity extraction step capture ### T2 — Short prose -Prose under 20 words. Currently treated the same as T3 in the compression pipeline. +Prose under 20 words. Treated identically to T3 in the current deterministic pipeline — the distinction is preserved for future LLM classifier integration, which can apply lighter compression to short prose. ### T3 — Long prose -Prose of 20+ words. The primary target for summarization. +Prose of 20+ words. The primary target for summarization. Treated identically to T2 in the current pipeline; the LLM classifier will use the T2/T3 distinction for tier-specific strategies. ## API key detection @@ -103,7 +103,7 @@ SQL detection uses a tiered anchor system to avoid false positives on English pr Messages with code fences and significant prose (>= 80 chars) are split: 1. Code fences are extracted verbatim -2. Surrounding prose is summarized (budget: 200 chars if < 600 chars, 400 otherwise) +2. Surrounding prose is summarized (budget scales adaptively: 200–600 chars based on prose length) 3. Result: summary + preserved code fences If the total prose is < 80 chars, the entire message is preserved (not enough prose to justify splitting). diff --git a/src/classify.ts b/src/classify.ts index 6e5f5cd..68794fa 100644 --- a/src/classify.ts +++ b/src/classify.ts @@ -1,5 +1,18 @@ export type ClassifyResult = { decision: 'T0' | 'T2' | 'T3'; + /** + * Classification confidence (0–1). Higher values indicate stronger signal. + * + * For T0: starts at 0.70, increases by 0.05 per additional structural reason + * (capped at 0.95). Multiple overlapping signals → higher confidence. + * For T2/T3: fixed at 0.65 (pure prose heuristic, no structural anchors). + * + * The deterministic pipeline does not route on confidence — it uses the + * hard/soft T0 distinction instead. Consumers can use confidence for custom + * routing (e.g. only compress below a threshold), monitoring dashboards, + * or LLM classifier fallback decisions (cf. Amazon Science "Label with + * Confidence" for confidence-weighted routing patterns). + */ confidence: number; reasons: string[]; }; @@ -189,6 +202,14 @@ function detectContentTypes(text: string): { // -- Tier heuristic for clean prose -- +/** + * Assign T2 (short prose, < 20 words) or T3 (long prose, >= 20 words). + * + * Both tiers are compressed identically in the current deterministic pipeline. + * The distinction exists so a future LLM classifier can apply different + * strategies per tier — e.g. lighter summarization for T2 or aggressive + * compression for verbose T3 content. + */ function inferProseTier(text: string): 'T2' | 'T3' { const words = text.split(/\s+/).length; if (words < 20) return 'T2'; diff --git a/src/compress.ts b/src/compress.ts index b77b72c..6c09c03 100644 --- a/src/compress.ts +++ b/src/compress.ts @@ -330,6 +330,10 @@ const COMMON_STARTERS = new Set([ 'Into', ]); +function computeBudget(contentLength: number): number { + return Math.max(200, Math.min(Math.round(contentLength * 0.3), 600)); +} + function extractEntities(text: string): string[] { const entities = new Set(); @@ -376,8 +380,8 @@ function extractEntities(text: string): string[] { for (const n of numbersCtx) entities.add(n.trim()); } - // Cap at 10 - return Array.from(entities).slice(0, 10); + const maxEntities = Math.max(3, Math.min(Math.round(text.length / 200), 15)); + return Array.from(entities).slice(0, maxEntities); } function splitCodeAndProse(text: string): Array<{ type: 'prose' | 'code'; content: string }> { @@ -572,6 +576,10 @@ function classifyAll( } return { msg, preserved: true }; } + // T2 (short prose) and T3 (long prose) are intentionally treated identically + // in the current pipeline — both go through the same summarization path. + // The distinction is preserved for future LLM classifier integration, which + // can apply different strategies per tier (e.g. lighter compression for T2). if (content) { const cls = classifyMessage(content); if (cls.decision === 'T0') { @@ -735,7 +743,7 @@ function* compressGen( .map((s) => s.content) .join(' '); const codeFences = segments.filter((s) => s.type === 'code').map((s) => s.content); - const proseBudget = proseText.length < 600 ? 200 : 400; + const proseBudget = computeBudget(proseText.length); const summaryText: string = yield { text: proseText, budget: proseBudget }; const embeddedId = options.embedSummaryId ? makeSummaryId([msg.id]) : undefined; const compressed = `${formatSummary(summaryText, proseText, undefined, true, embeddedId)}\n\n${codeFences.join('\n\n')}`; @@ -762,7 +770,7 @@ function* compressGen( const allContent = group .map((g) => (typeof g.msg.content === 'string' ? g.msg.content : '')) .join(' '); - const contentBudget = allContent.length < 600 ? 200 : 400; + const contentBudget = computeBudget(allContent.length); const summaryText = isStructuredOutput(allContent) ? summarizeStructured(allContent, contentBudget) : yield { text: allContent, budget: contentBudget }; diff --git a/tests/compress.test.ts b/tests/compress.test.ts index 822cccc..6b5512a 100644 --- a/tests/compress.test.ts +++ b/tests/compress.test.ts @@ -684,14 +684,13 @@ describe('compress', () => { expect(content).toContain('Express'); }); - it('caps at 400 chars when no punctuation', () => { - const noPunct = 'word '.repeat(200); // 1000 chars, no sentence-ending punctuation + it('caps at adaptive budget when no punctuation', () => { + const noPunct = 'word '.repeat(200); // 1000 chars → computeBudget = 300 const messages: Message[] = [msg({ id: '1', index: 0, role: 'user', content: noPunct })]; const result = compress(messages, { recencyWindow: 0 }); - // The summary text (between [summary: and the suffix) should not exceed 400 chars const match = result.messages[0].content!.match(/\[summary: (.*?)(?:\s*\(|\s*\||\])/); expect(match).toBeTruthy(); - expect(match![1].length).toBeLessThanOrEqual(400); + expect(match![1].length).toBeLessThanOrEqual(300); }); it('includes first substantive + last sentence', () => { @@ -719,19 +718,20 @@ describe('compress', () => { expect(content).toContain('Sure thing'); }); - it('hard caps overall summary at 400 chars', () => { + it('hard caps overall summary at adaptive budget', () => { // Use non-hex chars to avoid triggering hash_or_sha T0 detection const longSentence = 'Wor '.repeat(50) + 'is the architecture we chose for this particular deployment. '; const text = longSentence + 'The last sentence describes the final outcome of this deployment strategy.'; + // ~1675 chars → computeBudget = 503 const messages: Message[] = [ msg({ id: '1', index: 0, role: 'user', content: text.repeat(5) }), ]; const result = compress(messages, { recencyWindow: 0 }); const match = result.messages[0].content!.match(/\[summary: (.*?)(?:\s*\(|\s*\||\])/); expect(match).toBeTruthy(); - expect(match![1].length).toBeLessThanOrEqual(400); + expect(match![1].length).toBeLessThanOrEqual(503); }); it('extracts content from multiple paragraphs', () => { @@ -760,7 +760,7 @@ describe('compress', () => { expect(content).toContain('authentication module'); }); - it('budget ceiling at 400 chars', () => { + it('adaptive budget ceiling scales with content length', () => { const sentences = Array.from( { length: 20 }, (_, i) => `Sentence number ${i + 1} provides additional context about the deployment.`, @@ -771,7 +771,8 @@ describe('compress', () => { const result = compress(messages, { recencyWindow: 0 }); const match = result.messages[0].content!.match(/\[summary: (.*?)(?:\s*\(|\s*\||\])/); expect(match).toBeTruthy(); - expect(match![1].length).toBeLessThanOrEqual(400); + // ~3900 chars content → computeBudget = 600 + expect(match![1].length).toBeLessThanOrEqual(600); }); it('weights PASS/FAIL/ERROR status words higher', () => { @@ -878,7 +879,7 @@ describe('compress', () => { expect(content).toContain('grpc'); }); - it('caps entities at 10', () => { + it('caps entities proportionally to content length', () => { const text = 'Alice Bob Charlie Dave Eve Frank Grace Heidi Ivan Judy Karl Liam Mallory spoke about getUserData fetchItems parseConfig with user_id auth_token db_name cache_key log_level queue_size worker_count and 5 retries and 10 seconds. '.repeat( 3, @@ -889,7 +890,57 @@ describe('compress', () => { const entitiesMatch = content.match(/entities: ([^\]]+)/); expect(entitiesMatch).toBeTruthy(); const entityList = entitiesMatch![1].split(', '); - expect(entityList.length).toBeLessThanOrEqual(10); + // ~684 chars → cap = max(3, min(round(684/200), 15)) = 3 + expect(entityList.length).toBeLessThanOrEqual(3); + }); + + it('allows more entities for longer content', () => { + const text = + 'Alice Bob Charlie Dave Eve Frank Grace Heidi Ivan Judy Karl Liam Mallory spoke about getUserData fetchItems parseConfig with user_id auth_token db_name cache_key log_level queue_size worker_count and 5 retries and 10 seconds. '.repeat( + 12, + ); + const messages: Message[] = [msg({ id: '1', index: 0, role: 'user', content: text })]; + const result = compress(messages, { recencyWindow: 0 }); + const content = result.messages[0].content!; + const entitiesMatch = content.match(/entities: ([^\]]+)/); + expect(entitiesMatch).toBeTruthy(); + const entityList = entitiesMatch![1].split(', '); + // ~2736 chars → cap = max(3, min(round(2736/200), 15)) = 14 + expect(entityList.length).toBeGreaterThan(3); + expect(entityList.length).toBeLessThanOrEqual(15); + }); + }); + + describe('adaptive budget scaling', () => { + it('short content gets a small budget (≤ 200 chars)', () => { + // ~500 chars of prose → computeBudget(500) = 200 + const text = + 'The deployment process starts by pulling the latest Docker image from the registry and running pre-flight checks. '.repeat( + 4, + ); + expect(text.length).toBeLessThan(667); + const messages: Message[] = [msg({ id: '1', index: 0, role: 'user', content: text })]; + const result = compress(messages, { recencyWindow: 0 }); + const match = result.messages[0].content!.match(/\[summary: (.*?)(?:\s*\(|\s*\||\])/); + expect(match).toBeTruthy(); + expect(match![1].length).toBeLessThanOrEqual(200); + }); + + it('long content gets a larger budget (≤ 600 and > 200 chars)', () => { + // ~2400 chars of diverse prose → computeBudget(2400) = 600 + const sentences = Array.from( + { length: 30 }, + (_, i) => + `Step ${i + 1} in the deployment pipeline involves running integration tests against the staging environment.`, + ).join(' '); + expect(sentences.length).toBeGreaterThan(2000); + const messages: Message[] = [msg({ id: '1', index: 0, role: 'user', content: sentences })]; + const result = compress(messages, { recencyWindow: 0 }); + const match = result.messages[0].content!.match(/\[summary: (.*?)(?:\s*\(|\s*\||\])/); + expect(match).toBeTruthy(); + expect(match![1].length).toBeLessThanOrEqual(600); + // Budget is 600 so the summarizer has room for > 200 chars + expect(match![1].length).toBeGreaterThan(200); }); });