diff --git a/docs/api-reference.md b/docs/api-reference.md
index 9f5973b..4c27fe0 100644
--- a/docs/api-reference.md
+++ b/docs/api-reference.md
@@ -53,37 +53,39 @@ function compress(
 
 ### CompressOptions
 
-| Option             | Type                       | Default               | Description |
-| ------------------ | -------------------------- | --------------------- | ------------------------------------------------------------------------------------------------------------------ |
-| `preserve`         | `string[]`                 | `['system']`          | Roles to never compress |
-| `recencyWindow`    | `number`                   | `4`                   | Protect the last N messages from compression |
-| `sourceVersion`    | `number`                   | `0`                   | Version tag for [provenance tracking](provenance.md) |
-| `summarizer`       | `Summarizer`               | -                     | LLM-powered summarizer. When provided, `compress()` returns a `Promise`. See [LLM integration](llm-integration.md) |
-| `tokenBudget`      | `number`                   | -                     | Target token count. Binary-searches `recencyWindow` to fit. See [Token budget](token-budget.md) |
-| `minRecencyWindow` | `number`                   | `0`                   | Floor for `recencyWindow` when using `tokenBudget` |
-| `dedup`            | `boolean`                  | `true`                | Replace earlier exact-duplicate messages with a compact reference. See [Deduplication](deduplication.md) |
-| `fuzzyDedup`       | `boolean`                  | `false`               | Detect near-duplicate messages using line-level similarity. See [Deduplication](deduplication.md) |
-| `fuzzyThreshold`   | `number`                   | `0.85`                | Similarity threshold for fuzzy dedup (0-1) |
-| `embedSummaryId`   | `boolean`                  | `false`               | Embed `summary_id` in compressed content for downstream reference. See [Provenance](provenance.md) |
-| `forceConverge`    | `boolean`                  | `false`               | Hard-truncate non-recency messages when binary search bottoms out. See [Token budget](token-budget.md) |
-| `tokenCounter`     | `(msg: Message) => number` | `defaultTokenCounter` | Custom token counter per message. See [Token budget](token-budget.md) |
+| Option             | Type                                   | Default               | Description |
+| ------------------ | -------------------------------------- | --------------------- | ------------------------------------------------------------------------------------------------------------------ |
+| `preserve`         | `string[]`                             | `['system']`          | Roles to never compress |
+| `recencyWindow`    | `number`                               | `4`                   | Protect the last N messages from compression |
+| `sourceVersion`    | `number`                               | `0`                   | Version tag for [provenance tracking](provenance.md) |
+| `summarizer`       | `Summarizer`                           | -                     | LLM-powered summarizer. When provided, `compress()` returns a `Promise`. See [LLM integration](llm-integration.md) |
+| `tokenBudget`      | `number`                               | -                     | Target token count. Binary-searches `recencyWindow` to fit. See [Token budget](token-budget.md) |
+| `minRecencyWindow` | `number`                               | `0`                   | Floor for `recencyWindow` when using `tokenBudget` |
+| `dedup`            | `boolean`                              | `true`                | Replace earlier exact-duplicate messages with a compact reference. See [Deduplication](deduplication.md) |
+| `fuzzyDedup`       | `boolean`                              | `false`               | Detect near-duplicate messages using line-level similarity. See [Deduplication](deduplication.md) |
+| `fuzzyThreshold`   | `number`                               | `0.85`                | Similarity threshold for fuzzy dedup (0-1) |
+| `embedSummaryId`   | `boolean`                              | `false`               | Embed `summary_id` in compressed content for downstream reference. See [Provenance](provenance.md) |
+| `forceConverge`    | `boolean`                              | `false`               | Hard-truncate non-recency messages when binary search bottoms out. See [Token budget](token-budget.md) |
+| `preservePatterns` | `Array<{ re: RegExp; label: string }>` | -                     | Custom regex patterns that force hard T0 preservation. See [Preservation rules](preservation-rules.md) |
+| `tokenCounter`     | `(msg: Message) => number`             | `defaultTokenCounter` | Custom token counter per message. See [Token budget](token-budget.md) |
 
 ### CompressResult
 
-| Field                                | Type                   | Description |
-| ------------------------------------ | ---------------------- | ----------------------------------------------------------------------------------- |
-| `messages`                           | `Message[]`            | Compressed message array |
-| `verbatim`                           | `VerbatimMap`          | Original messages keyed by ID. Must be persisted atomically with `messages` |
-| `compression.original_version`       | `number`               | Mirrors `sourceVersion` |
-| `compression.ratio`                  | `number`               | Character-based compression ratio. >1 means savings |
-| `compression.token_ratio`            | `number`               | Token-based compression ratio. >1 means savings |
-| `compression.messages_compressed`    | `number`               | Messages that were compressed |
-| `compression.messages_preserved`     | `number`               | Messages kept as-is |
-| `compression.messages_deduped`       | `number \| undefined`  | Exact duplicates replaced (when `dedup: true`) |
-| `compression.messages_fuzzy_deduped` | `number \| undefined`  | Near-duplicates replaced (when `fuzzyDedup: true`) |
-| `fits`                               | `boolean \| undefined` | Whether result fits within `tokenBudget`. Present when `tokenBudget` is set |
-| `tokenCount`                         | `number \| undefined`  | Estimated token count. Present when `tokenBudget` is set |
-| `recencyWindow`                      | `number \| undefined`  | The `recencyWindow` the binary search settled on. Present when `tokenBudget` is set |
+| Field                                    | Type                   | Description |
+| ---------------------------------------- | ---------------------- | ----------------------------------------------------------------------------------- |
+| `messages`                               | `Message[]`            | Compressed message array |
+| `verbatim`                               | `VerbatimMap`          | Original messages keyed by ID. Must be persisted atomically with `messages` |
+| `compression.original_version`           | `number`               | Mirrors `sourceVersion` |
+| `compression.ratio`                      | `number`               | Character-based compression ratio. >1 means savings |
+| `compression.token_ratio`                | `number`               | Token-based compression ratio. >1 means savings |
+| `compression.messages_compressed`        | `number`               | Messages that were compressed |
+| `compression.messages_preserved`         | `number`               | Messages kept as-is |
+| `compression.messages_deduped`           | `number \| undefined`  | Exact duplicates replaced (when `dedup: true`) |
+| `compression.messages_fuzzy_deduped`     | `number \| undefined`  | Near-duplicates replaced (when `fuzzyDedup: true`) |
+| `compression.messages_pattern_preserved` | `number \| undefined`  | Messages preserved by `preservePatterns` (when patterns are provided) |
+| `fits`                                   | `boolean \| undefined` | Whether result fits within `tokenBudget`. Present when `tokenBudget` is set |
+| `tokenCount`                             | `number \| undefined`  | Estimated token count. Present when `tokenBudget` is set |
+| `recencyWindow`                          | `number \| undefined`  | The `recencyWindow` the binary search settled on. Present when `tokenBudget` is set |
 
 ### Example
diff --git a/docs/preservation-rules.md b/docs/preservation-rules.md
index bba9bdf..64edee2 100644
--- a/docs/preservation-rules.md
+++ b/docs/preservation-rules.md
@@ -19,8 +19,9 @@ Messages are evaluated in this order. The **first matching rule** determines the
 | 7   | Code fences + prose >= 80 chars | Code-split path |
 | 8   | Code fences + prose < 80 chars  | Preserved       |
 | 9   | Hard T0 classification          | Preserved       |
-| 10  | Valid JSON                      | Preserved       |
-| 11  | Everything else                 | Compressed      |
+| 10  | Custom `preservePatterns` match | Preserved       |
+| 11  | Valid JSON                      | Preserved       |
+| 12  | Everything else                 | Compressed      |
 
 Soft T0 classifications (file paths, URLs, version numbers, etc.) do **not** prevent compression — entities capture the important references, and the prose is still compressible.
 
@@ -143,6 +144,55 @@ compress(messages, { recencyWindow: 10 }); // protect last 10
 compress(messages, { recencyWindow: 0 }); // no recency protection
 ```
 
+### `preservePatterns` option
+
+Force preservation of messages matching domain-specific regex patterns. Each pattern is a hard T0 — the message is preserved verbatim, no summarization. Patterns are checked after the built-in heuristic classifier but before JSON detection.
+
+```ts
+compress(messages, {
+  preservePatterns: [
+    { re: /§\s*\d+/, label: 'section_ref' },
+    { re: /\d+\s*mg\b/i, label: 'dosage' },
+  ],
+});
+```
+
+**Domain examples:**
+
+**Legal** — preserve clause references, case citations, regulatory references:
+
+```ts
+preservePatterns: [
+  { re: /§\s*\d+/, label: 'section_ref' },
+  { re: /\b\d+\s+U\.S\.C\.\s*§/, label: 'usc_cite' },
+  { re: /\bArticle\s+[IVX]+\b/, label: 'article_ref' },
+  { re: /\bGDPR\s+Art\.\s*\d+/, label: 'gdpr_ref' },
+];
+```
+
+**Medical** — preserve dosages, diagnostic codes, lab values:
+
+```ts
+preservePatterns: [
+  { re: /\d+\s*mg\b/i, label: 'dosage' },
+  { re: /\bICD-10:\s*[A-Z]\d+/i, label: 'icd_code' },
+  { re: /\bCPT\s+\d{5}/, label: 'cpt_code' },
+  { re: /\bBP\s+\d+\/\d+/, label: 'vital_sign' },
+];
+```
+
+**Academic** — preserve DOIs, citation markers, theorem references:
+
+```ts
+preservePatterns: [
+  { re: /\bdoi:\s*10\.\d{4,}/, label: 'doi' },
+  { re: /\[(\d+(?:,\s*\d+)*)\]/, label: 'citation_marker' },
+  { re: /\bTheorem\s+\d+/i, label: 'theorem_ref' },
+];
+```
+
+The stat `compression.messages_pattern_preserved` reports how many messages were preserved by custom patterns.
+
 ---
 
 ## See also
diff --git a/src/compress.ts b/src/compress.ts
index 6c09c03..becbb83 100644
--- a/src/compress.ts
+++ b/src/compress.ts
@@ -445,6 +445,7 @@ type Classified = {
   preserved: boolean;
   codeSplit?: boolean;
   dedup?: DedupAnnotation;
+  patternPreserved?: boolean;
 };
 
 /** Build a compressed message with _cce_original provenance metadata. */
@@ -538,6 +539,7 @@ function classifyAll(
   preserveRoles: Set<string>,
   recencyWindow: number,
   dedupAnnotations?: Map<string, DedupAnnotation>,
+  preservePatterns?: Array<{ re: RegExp; label: string }>,
 ): Classified[] {
   const recencyStart = Math.max(0, messages.length - recencyWindow);
 
@@ -590,6 +592,11 @@
       // Soft T0 only — allow compression, entities will capture references
     }
   }
+  if (preservePatterns && preservePatterns.length > 0 && content) {
+    if (preservePatterns.some((p) => p.re.test(content))) {
+      return { msg, preserved: true, patternPreserved: true };
+    }
+  }
   if (content && isValidJson(content)) {
     return { msg, preserved: true };
   }
@@ -607,6 +614,7 @@ function computeStats(
   counter: (msg: Message) => number,
   messagesDeduped?: number,
   messagesFuzzyDeduped?: number,
+  messagesPatternPreserved?: number,
 ): CompressResult['compression'] {
   const originalTotalChars = originalMessages.reduce((sum, m) => sum + contentLength(m), 0);
   const compressedTotalChars = resultMessages.reduce((sum, m) => sum + contentLength(m), 0);
@@ -627,6 +635,9 @@
     ...(messagesFuzzyDeduped && messagesFuzzyDeduped > 0
       ? { messages_fuzzy_deduped: messagesFuzzyDeduped }
       : {}),
+    ...(messagesPatternPreserved && messagesPatternPreserved > 0
+      ? { messages_pattern_preserved: messagesPatternPreserved }
+      : {}),
   };
 }
 
@@ -696,7 +707,13 @@ function* compressGen(
     }
   }
 
-  const classified = classifyAll(messages, preserveRoles, recencyWindow, dedupAnnotations);
+  const classified = classifyAll(
+    messages,
+    preserveRoles,
+    recencyWindow,
+    dedupAnnotations,
+    options.preservePatterns,
+  );
 
   const result: Message[] = [];
   const verbatim: Record<string, Message> = {};
@@ -704,6 +721,7 @@
   let messagesCompressed = 0;
   let messagesPreserved = 0;
   let messagesDeduped = 0;
  let messagesFuzzyDeduped = 0;
+  let messagesPatternPreserved = 0;
   let i = 0;
   while (i < classified.length) {
@@ -712,6 +730,7 @@
     if (preserved) {
       result.push(msg);
       messagesPreserved++;
+      if (classified[i].patternPreserved) messagesPatternPreserved++;
       i++;
       continue;
     }
@@ -829,6 +848,7 @@
       counter,
       messagesDeduped,
       messagesFuzzyDeduped,
+      messagesPatternPreserved,
     ),
     verbatim,
   };
diff --git a/src/types.ts b/src/types.ts
index 16e4fd3..ebe3df9 100644
--- a/src/types.ts
+++ b/src/types.ts
@@ -32,6 +32,10 @@ export type CompressOptions = {
   embedSummaryId?: boolean;
   /** Hard-truncate non-recency messages when binary search bottoms out and budget still exceeded. Default: false. */
   forceConverge?: boolean;
+  /** Custom patterns that force preservation (hard T0). Matched against message content.
+   * Each pattern needs a regex and a label used in classification reasons.
+   * Example: `[{ re: /§\s*\d+/, label: 'section_ref' }]` */
+  preservePatterns?: Array<{ re: RegExp; label: string }>;
   /** Custom token counter per message. Default: ceil(content.length / 3.5) — see defaultTokenCounter for rationale. */
   tokenCounter?: (msg: Message) => number;
 };
@@ -63,6 +67,7 @@ export type CompressResult = {
     messages_preserved: number;
     messages_deduped?: number;
     messages_fuzzy_deduped?: number;
+    messages_pattern_preserved?: number;
   };
   /**
    * Original verbatim messages keyed by ID — every compressed message's
diff --git a/tests/compress.test.ts b/tests/compress.test.ts
index 6b5512a..d3ecbc9 100644
--- a/tests/compress.test.ts
+++ b/tests/compress.test.ts
@@ -2468,3 +2468,121 @@ describe('compress with custom tokenCounter', () => {
     expect(withDefault.fits).toBe(withExplicit.fits);
   });
 });
+
+// ---------------------------------------------------------------------------
+// preservePatterns
+// ---------------------------------------------------------------------------
+
+describe('preservePatterns', () => {
+  const LONG_PROSE =
+    'This is a long user message that talks about many things and goes on for a while to exceed the threshold and get compressed normally. '.repeat(
+      5,
+    );
+
+  it('pattern-matched message is preserved even when it would normally compress', () => {
+    const content = `Pursuant to § 42 of the agreement, the parties agree. ${LONG_PROSE}`;
+    const messages: Message[] = [msg({ id: '1', index: 0, content })];
+    const result = compress(messages, {
+      recencyWindow: 0,
+      preservePatterns: [{ re: /§\s*\d+/, label: 'section_ref' }],
+    });
+    expect(result.messages[0].content).toBe(content);
+    expect(result.compression.messages_preserved).toBe(1);
+    expect(result.compression.messages_compressed).toBe(0);
+    expect(result.compression.messages_pattern_preserved).toBe(1);
+  });
+
+  it('non-matching messages still compress normally', () => {
+    const messages: Message[] = [msg({ id: '1', index: 0, content: LONG_PROSE })];
+    const result = compress(messages, {
+      recencyWindow: 0,
+      preservePatterns: [{ re: /§\s*\d+/, label: 'section_ref' }],
+    });
+    expect(result.messages[0].content).toMatch(/^\[summary:/);
+    expect(result.compression.messages_compressed).toBe(1);
+    expect(result.compression.messages_pattern_preserved).toBeUndefined();
+  });
+
+  it('multiple patterns — any match preserves', () => {
+    const content = `Patient prescribed Metformin 500mg bid. ${LONG_PROSE}`;
+    const messages: Message[] = [msg({ id: '1', index: 0, content })];
+    const result = compress(messages, {
+      recencyWindow: 0,
+      preservePatterns: [
+        { re: /§\s*\d+/, label: 'section_ref' },
+        { re: /\d+\s*mg\b/i, label: 'dosage' },
+      ],
+    });
+    expect(result.messages[0].content).toBe(content);
+    expect(result.compression.messages_pattern_preserved).toBe(1);
+  });
+
+  it('empty preservePatterns array has no effect', () => {
+    const messages: Message[] = [msg({ id: '1', index: 0, content: LONG_PROSE })];
+    const withEmpty = compress(messages, { recencyWindow: 0, preservePatterns: [] });
+    const without = compress(messages, { recencyWindow: 0 });
+    expect(withEmpty.compression.messages_compressed).toBe(without.compression.messages_compressed);
+    expect(withEmpty.compression.messages_pattern_preserved).toBeUndefined();
+  });
+
+  it('code-split check runs before pattern check — code-split messages are not affected', () => {
+    const proseWithPattern = `Section § 12 discussion. ${LONG_PROSE}`;
+    const codeContent = `${proseWithPattern}\n\n\`\`\`ts\nconst x = 1;\n\`\`\``;
+    const messages: Message[] = [
+      msg({ id: '1', index: 0, role: 'assistant', content: codeContent }),
+    ];
+    const result = compress(messages, {
+      recencyWindow: 0,
+      preservePatterns: [{ re: /§\s*\d+/, label: 'section_ref' }],
+    });
+    // Code-split path takes precedence: prose is compressed, code fence preserved
+    expect(result.messages[0].content).toContain('```');
+    expect(result.compression.messages_compressed).toBe(1);
+    expect(result.compression.messages_pattern_preserved).toBeUndefined();
+  });
+
+  it('dedup runs before patterns — deduped message stays deduped', () => {
+    const content = `Reference to § 42 in this document. ${LONG_PROSE}`;
+    const messages: Message[] = [
+      msg({ id: '1', index: 0, content }),
+      msg({ id: '2', index: 1, content }),
+    ];
+    const result = compress(messages, {
+      recencyWindow: 0,
+      dedup: true,
+      preservePatterns: [{ re: /§\s*\d+/, label: 'section_ref' }],
+    });
+    // First message is deduped (earlier duplicate), second is pattern-preserved
+    expect(result.messages[0].content).toMatch(/^\[cce:dup/);
+    expect(result.messages[1].content).toBe(content);
+    expect(result.compression.messages_deduped).toBe(1);
+    expect(result.compression.messages_pattern_preserved).toBe(1);
+  });
+
+  it('pattern-preserved messages survive tokenBudget binary search', () => {
+    const matchContent = `Legal clause § 7 reference. ${LONG_PROSE}`;
+    const plainContent = LONG_PROSE;
+    const messages: Message[] = [
+      msg({ id: '0', index: 0, content: matchContent }),
+      msg({ id: '1', index: 1, content: plainContent }),
+      msg({ id: '2', index: 2, content: matchContent }),
+      msg({ id: '3', index: 3, content: plainContent }),
+      msg({ id: '4', index: 4, content: matchContent }),
+      msg({ id: '5', index: 5, content: 'recent' }),
+    ];
+    // Budget tight enough to trigger binary search (not fast-path)
+    const perMsg = (m: Message) => (typeof m.content === 'string' ? m.content.length : 0);
+    const totalTokens = messages.reduce((s, m) => s + perMsg(m), 0);
+    const result = compress(messages, {
+      tokenBudget: Math.floor(totalTokens * 0.8),
+      tokenCounter: perMsg,
+      dedup: false,
+      preservePatterns: [{ re: /§\s*\d+/, label: 'section_ref' }],
+    });
+    // Pattern-matched messages should be preserved even under budget pressure
+    expect(result.messages[0].content).toBe(matchContent);
+    expect(result.messages[2].content).toBe(matchContent);
+    // Plain prose messages should be compressed to fit budget
+    expect(result.compression.messages_compressed).toBeGreaterThan(0);
+  });
+});