Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
58 changes: 30 additions & 28 deletions docs/api-reference.md
Original file line number Diff line number Diff line change
Expand Up @@ -53,37 +53,39 @@ function compress(

### CompressOptions

| Option | Type | Default | Description |
| ------------------ | -------------------------- | --------------------- | ------------------------------------------------------------------------------------------------------------------ |
| `preserve` | `string[]` | `['system']` | Roles to never compress |
| `recencyWindow` | `number` | `4` | Protect the last N messages from compression |
| `sourceVersion` | `number` | `0` | Version tag for [provenance tracking](provenance.md) |
| `summarizer` | `Summarizer` | - | LLM-powered summarizer. When provided, `compress()` returns a `Promise`. See [LLM integration](llm-integration.md) |
| `tokenBudget` | `number` | - | Target token count. Binary-searches `recencyWindow` to fit. See [Token budget](token-budget.md) |
| `minRecencyWindow` | `number` | `0` | Floor for `recencyWindow` when using `tokenBudget` |
| `dedup` | `boolean` | `true` | Replace earlier exact-duplicate messages with a compact reference. See [Deduplication](deduplication.md) |
| `fuzzyDedup` | `boolean` | `false` | Detect near-duplicate messages using line-level similarity. See [Deduplication](deduplication.md) |
| `fuzzyThreshold` | `number` | `0.85` | Similarity threshold for fuzzy dedup (0-1) |
| `embedSummaryId` | `boolean` | `false` | Embed `summary_id` in compressed content for downstream reference. See [Provenance](provenance.md) |
| `forceConverge` | `boolean` | `false` | Hard-truncate non-recency messages when binary search bottoms out. See [Token budget](token-budget.md) |
| `tokenCounter` | `(msg: Message) => number` | `defaultTokenCounter` | Custom token counter per message. See [Token budget](token-budget.md) |
| Option | Type | Default | Description |
| ------------------ | -------------------------------------- | --------------------- | ------------------------------------------------------------------------------------------------------------------ |
| `preserve` | `string[]` | `['system']` | Roles to never compress |
| `recencyWindow` | `number` | `4` | Protect the last N messages from compression |
| `sourceVersion` | `number` | `0` | Version tag for [provenance tracking](provenance.md) |
| `summarizer` | `Summarizer` | - | LLM-powered summarizer. When provided, `compress()` returns a `Promise`. See [LLM integration](llm-integration.md) |
| `tokenBudget` | `number` | - | Target token count. Binary-searches `recencyWindow` to fit. See [Token budget](token-budget.md) |
| `minRecencyWindow` | `number` | `0` | Floor for `recencyWindow` when using `tokenBudget` |
| `dedup` | `boolean` | `true` | Replace earlier exact-duplicate messages with a compact reference. See [Deduplication](deduplication.md) |
| `fuzzyDedup` | `boolean` | `false` | Detect near-duplicate messages using line-level similarity. See [Deduplication](deduplication.md) |
| `fuzzyThreshold` | `number` | `0.85` | Similarity threshold for fuzzy dedup (0-1) |
| `embedSummaryId` | `boolean` | `false` | Embed `summary_id` in compressed content for downstream reference. See [Provenance](provenance.md) |
| `forceConverge` | `boolean` | `false` | Hard-truncate non-recency messages when binary search bottoms out. See [Token budget](token-budget.md) |
| `preservePatterns` | `Array<{ re: RegExp; label: string }>` | - | Custom regex patterns that force hard T0 preservation. See [Preservation rules](preservation-rules.md) |
| `tokenCounter` | `(msg: Message) => number` | `defaultTokenCounter` | Custom token counter per message. See [Token budget](token-budget.md) |

### CompressResult

| Field | Type | Description |
| ------------------------------------ | ---------------------- | ----------------------------------------------------------------------------------- |
| `messages` | `Message[]` | Compressed message array |
| `verbatim` | `VerbatimMap` | Original messages keyed by ID. Must be persisted atomically with `messages` |
| `compression.original_version` | `number` | Mirrors `sourceVersion` |
| `compression.ratio` | `number` | Character-based compression ratio. >1 means savings |
| `compression.token_ratio` | `number` | Token-based compression ratio. >1 means savings |
| `compression.messages_compressed` | `number` | Messages that were compressed |
| `compression.messages_preserved` | `number` | Messages kept as-is |
| `compression.messages_deduped` | `number \| undefined` | Exact duplicates replaced (when `dedup: true`) |
| `compression.messages_fuzzy_deduped` | `number \| undefined` | Near-duplicates replaced (when `fuzzyDedup: true`) |
| `fits` | `boolean \| undefined` | Whether result fits within `tokenBudget`. Present when `tokenBudget` is set |
| `tokenCount` | `number \| undefined` | Estimated token count. Present when `tokenBudget` is set |
| `recencyWindow` | `number \| undefined` | The `recencyWindow` the binary search settled on. Present when `tokenBudget` is set |
| Field | Type | Description |
| ---------------------------------------- | ---------------------- | ----------------------------------------------------------------------------------- |
| `messages` | `Message[]` | Compressed message array |
| `verbatim` | `VerbatimMap` | Original messages keyed by ID. Must be persisted atomically with `messages` |
| `compression.original_version` | `number` | Mirrors `sourceVersion` |
| `compression.ratio` | `number` | Character-based compression ratio. >1 means savings |
| `compression.token_ratio` | `number` | Token-based compression ratio. >1 means savings |
| `compression.messages_compressed` | `number` | Messages that were compressed |
| `compression.messages_preserved` | `number` | Messages kept as-is |
| `compression.messages_deduped` | `number \| undefined` | Exact duplicates replaced (when `dedup: true`) |
| `compression.messages_fuzzy_deduped` | `number \| undefined` | Near-duplicates replaced (when `fuzzyDedup: true`) |
| `compression.messages_pattern_preserved` | `number \| undefined`  | Messages preserved by `preservePatterns` (present when at least one pattern matched)  |
| `fits` | `boolean \| undefined` | Whether result fits within `tokenBudget`. Present when `tokenBudget` is set |
| `tokenCount` | `number \| undefined` | Estimated token count. Present when `tokenBudget` is set |
| `recencyWindow` | `number \| undefined` | The `recencyWindow` the binary search settled on. Present when `tokenBudget` is set |

### Example

Expand Down
54 changes: 52 additions & 2 deletions docs/preservation-rules.md
Original file line number Diff line number Diff line change
Expand Up @@ -19,8 +19,9 @@ Messages are evaluated in this order. The **first matching rule** determines the
| 7 | Code fences + prose >= 80 chars | Code-split path |
| 8 | Code fences + prose < 80 chars | Preserved |
| 9 | Hard T0 classification | Preserved |
| 10 | Valid JSON | Preserved |
| 11 | Everything else | Compressed |
| 10 | Custom `preservePatterns` match | Preserved |
| 11 | Valid JSON | Preserved |
| 12 | Everything else | Compressed |

Soft T0 classifications (file paths, URLs, version numbers, etc.) do **not** prevent compression — entities capture the important references, and the prose is still compressible.

Expand Down Expand Up @@ -143,6 +144,55 @@ compress(messages, { recencyWindow: 10 }); // protect last 10
compress(messages, { recencyWindow: 0 }); // no recency protection
```

### `preservePatterns` option

Force preservation of messages matching domain-specific regex patterns. Each pattern is a hard T0 — the message is preserved verbatim, no summarization. Patterns are checked after the built-in heuristic classifier but before JSON detection.

```ts
compress(messages, {
preservePatterns: [
{ re: /§\s*\d+/, label: 'section_ref' },
{ re: /\d+\s*mg\b/i, label: 'dosage' },
],
});
```

**Domain examples:**

**Legal** — preserve clause references, case citations, regulatory references:

```ts
preservePatterns: [
{ re: /§\s*\d+/, label: 'section_ref' },
{ re: /\b\d+\s+U\.S\.C\.\s*§/, label: 'usc_cite' },
{ re: /\bArticle\s+[IVX]+\b/, label: 'article_ref' },
{ re: /\bGDPR\s+Art\.\s*\d+/, label: 'gdpr_ref' },
];
```

**Medical** — preserve dosages, diagnostic codes, lab values:

```ts
preservePatterns: [
{ re: /\d+\s*mg\b/i, label: 'dosage' },
{ re: /\bICD-10:\s*[A-Z]\d+/i, label: 'icd_code' },
{ re: /\bCPT\s+\d{5}/, label: 'cpt_code' },
{ re: /\bBP\s+\d+\/\d+/, label: 'vital_sign' },
];
```

**Academic** — preserve DOIs, citation markers, theorem references:

```ts
preservePatterns: [
{ re: /\bdoi:\s*10\.\d{4,}/, label: 'doi' },
{ re: /\[(\d+(?:,\s*\d+)*)\]/, label: 'citation_marker' },
{ re: /\bTheorem\s+\d+/i, label: 'theorem_ref' },
];
```

The stat `compression.messages_pattern_preserved` reports how many messages were preserved by custom patterns; it is omitted from the result when no message matched any pattern.

---

## See also
Expand Down
22 changes: 21 additions & 1 deletion src/compress.ts
Original file line number Diff line number Diff line change
Expand Up @@ -445,6 +445,7 @@ type Classified = {
preserved: boolean;
codeSplit?: boolean;
dedup?: DedupAnnotation;
patternPreserved?: boolean;
};

/** Build a compressed message with _cce_original provenance metadata. */
Expand Down Expand Up @@ -538,6 +539,7 @@ function classifyAll(
preserveRoles: Set<string>,
recencyWindow: number,
dedupAnnotations?: Map<number, DedupAnnotation>,
preservePatterns?: Array<{ re: RegExp; label: string }>,
): Classified[] {
const recencyStart = Math.max(0, messages.length - recencyWindow);

Expand Down Expand Up @@ -590,6 +592,11 @@ function classifyAll(
// Soft T0 only — allow compression, entities will capture references
}
}
if (preservePatterns && preservePatterns.length > 0 && content) {
if (preservePatterns.some((p) => p.re.test(content))) {
return { msg, preserved: true, patternPreserved: true };
}
}
if (content && isValidJson(content)) {
return { msg, preserved: true };
}
Expand All @@ -607,6 +614,7 @@ function computeStats(
counter: (msg: Message) => number,
messagesDeduped?: number,
messagesFuzzyDeduped?: number,
messagesPatternPreserved?: number,
): CompressResult['compression'] {
const originalTotalChars = originalMessages.reduce((sum, m) => sum + contentLength(m), 0);
const compressedTotalChars = resultMessages.reduce((sum, m) => sum + contentLength(m), 0);
Expand All @@ -627,6 +635,9 @@ function computeStats(
...(messagesFuzzyDeduped && messagesFuzzyDeduped > 0
? { messages_fuzzy_deduped: messagesFuzzyDeduped }
: {}),
...(messagesPatternPreserved && messagesPatternPreserved > 0
? { messages_pattern_preserved: messagesPatternPreserved }
: {}),
};
}

Expand Down Expand Up @@ -696,14 +707,21 @@ function* compressGen(
}
}

const classified = classifyAll(messages, preserveRoles, recencyWindow, dedupAnnotations);
const classified = classifyAll(
messages,
preserveRoles,
recencyWindow,
dedupAnnotations,
options.preservePatterns,
);

const result: Message[] = [];
const verbatim: Record<string, Message> = {};
let messagesCompressed = 0;
let messagesPreserved = 0;
let messagesDeduped = 0;
let messagesFuzzyDeduped = 0;
let messagesPatternPreserved = 0;
let i = 0;

while (i < classified.length) {
Expand All @@ -712,6 +730,7 @@ function* compressGen(
if (preserved) {
result.push(msg);
messagesPreserved++;
if (classified[i].patternPreserved) messagesPatternPreserved++;
i++;
continue;
}
Expand Down Expand Up @@ -829,6 +848,7 @@ function* compressGen(
counter,
messagesDeduped,
messagesFuzzyDeduped,
messagesPatternPreserved,
),
verbatim,
};
Expand Down
5 changes: 5 additions & 0 deletions src/types.ts
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,10 @@ export type CompressOptions = {
embedSummaryId?: boolean;
/** Hard-truncate non-recency messages when binary search bottoms out and budget still exceeded. Default: false. */
forceConverge?: boolean;
/** Custom patterns that force preservation (hard T0). Matched against message content.
* Each pattern needs a regex and a label used in classification reasons.
* Example: `[{ re: /§\s*\d+/, label: 'section_ref' }]` */
preservePatterns?: Array<{ re: RegExp; label: string }>;
/** Custom token counter per message. Default: ceil(content.length / 3.5) — see defaultTokenCounter for rationale. */
tokenCounter?: (msg: Message) => number;
};
Expand Down Expand Up @@ -63,6 +67,7 @@ export type CompressResult = {
messages_preserved: number;
messages_deduped?: number;
messages_fuzzy_deduped?: number;
messages_pattern_preserved?: number;
};
/**
* Original verbatim messages keyed by ID — every compressed message's
Expand Down
Loading
Loading